xref: /freebsd/sys/contrib/openzfs/module/zfs/vdev_raidz.c (revision bb2d13b686e3ccf6c3ccb36209dfb7dcc108b182)
1eda14cbcSMatt Macy /*
2eda14cbcSMatt Macy  * CDDL HEADER START
3eda14cbcSMatt Macy  *
4eda14cbcSMatt Macy  * The contents of this file are subject to the terms of the
5eda14cbcSMatt Macy  * Common Development and Distribution License (the "License").
6eda14cbcSMatt Macy  * You may not use this file except in compliance with the License.
7eda14cbcSMatt Macy  *
8eda14cbcSMatt Macy  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9271171e0SMartin Matuska  * or https://opensource.org/licenses/CDDL-1.0.
10eda14cbcSMatt Macy  * See the License for the specific language governing permissions
11eda14cbcSMatt Macy  * and limitations under the License.
12eda14cbcSMatt Macy  *
13eda14cbcSMatt Macy  * When distributing Covered Code, include this CDDL HEADER in each
14eda14cbcSMatt Macy  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15eda14cbcSMatt Macy  * If applicable, add the following below this CDDL HEADER, with the
16eda14cbcSMatt Macy  * fields enclosed by brackets "[]" replaced with your own identifying
17eda14cbcSMatt Macy  * information: Portions Copyright [yyyy] [name of copyright owner]
18eda14cbcSMatt Macy  *
19eda14cbcSMatt Macy  * CDDL HEADER END
20eda14cbcSMatt Macy  */
21eda14cbcSMatt Macy 
22eda14cbcSMatt Macy /*
23eda14cbcSMatt Macy  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
242c48331dSMatt Macy  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
25eda14cbcSMatt Macy  * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
26eda14cbcSMatt Macy  */
27eda14cbcSMatt Macy 
28eda14cbcSMatt Macy #include <sys/zfs_context.h>
29eda14cbcSMatt Macy #include <sys/spa.h>
30eda14cbcSMatt Macy #include <sys/vdev_impl.h>
31eda14cbcSMatt Macy #include <sys/zio.h>
32eda14cbcSMatt Macy #include <sys/zio_checksum.h>
33eda14cbcSMatt Macy #include <sys/abd.h>
34eda14cbcSMatt Macy #include <sys/fs/zfs.h>
35eda14cbcSMatt Macy #include <sys/fm/fs/zfs.h>
36eda14cbcSMatt Macy #include <sys/vdev_raidz.h>
37eda14cbcSMatt Macy #include <sys/vdev_raidz_impl.h>
387877fdebSMatt Macy #include <sys/vdev_draid.h>
39eda14cbcSMatt Macy 
40eda14cbcSMatt Macy #ifdef ZFS_DEBUG
41eda14cbcSMatt Macy #include <sys/vdev.h>	/* For vdev_xlate() in vdev_raidz_io_verify() */
42eda14cbcSMatt Macy #endif
43eda14cbcSMatt Macy 
44eda14cbcSMatt Macy /*
45eda14cbcSMatt Macy  * Virtual device vector for RAID-Z.
46eda14cbcSMatt Macy  *
47eda14cbcSMatt Macy  * This vdev supports single, double, and triple parity. For single parity,
48eda14cbcSMatt Macy  * we use a simple XOR of all the data columns. For double or triple parity,
49eda14cbcSMatt Macy  * we use a special case of Reed-Solomon coding. This extends the
50eda14cbcSMatt Macy  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
51eda14cbcSMatt Macy  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
52eda14cbcSMatt Macy  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
53eda14cbcSMatt Macy  * former is also based. The latter is designed to provide higher performance
54eda14cbcSMatt Macy  * for writes.
55eda14cbcSMatt Macy  *
56eda14cbcSMatt Macy  * Note that the Plank paper claimed to support arbitrary N+M, but was then
57eda14cbcSMatt Macy  * amended six years later identifying a critical flaw that invalidates its
58eda14cbcSMatt Macy  * claims. Nevertheless, the technique can be adapted to work for up to
59eda14cbcSMatt Macy  * triple parity. For additional parity, the amendment "Note: Correction to
60eda14cbcSMatt Macy  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
61eda14cbcSMatt Macy  * is viable, but the additional complexity means that write performance will
62eda14cbcSMatt Macy  * suffer.
63eda14cbcSMatt Macy  *
64eda14cbcSMatt Macy  * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
66eda14cbcSMatt Macy  * can be expressed with a single byte. Briefly, the operations on the
67eda14cbcSMatt Macy  * field are defined as follows:
68eda14cbcSMatt Macy  *
69eda14cbcSMatt Macy  *   o addition (+) is represented by a bitwise XOR
70eda14cbcSMatt Macy  *   o subtraction (-) is therefore identical to addition: A + B = A - B
71eda14cbcSMatt Macy  *   o multiplication of A by 2 is defined by the following bitwise expression:
72eda14cbcSMatt Macy  *
73eda14cbcSMatt Macy  *	(A * 2)_7 = A_6
74eda14cbcSMatt Macy  *	(A * 2)_6 = A_5
75eda14cbcSMatt Macy  *	(A * 2)_5 = A_4
76eda14cbcSMatt Macy  *	(A * 2)_4 = A_3 + A_7
77eda14cbcSMatt Macy  *	(A * 2)_3 = A_2 + A_7
78eda14cbcSMatt Macy  *	(A * 2)_2 = A_1 + A_7
79eda14cbcSMatt Macy  *	(A * 2)_1 = A_0
80eda14cbcSMatt Macy  *	(A * 2)_0 = A_7
81eda14cbcSMatt Macy  *
82eda14cbcSMatt Macy  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
83eda14cbcSMatt Macy  * As an aside, this multiplication is derived from the error correcting
84eda14cbcSMatt Macy  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
85eda14cbcSMatt Macy  *
86eda14cbcSMatt Macy  * Observe that any number in the field (except for 0) can be expressed as a
87eda14cbcSMatt Macy  * power of 2 -- a generator for the field. We store a table of the powers of
88eda14cbcSMatt Macy  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
89eda14cbcSMatt Macy  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
90eda14cbcSMatt Macy  * than field addition). The inverse of a field element A (A^-1) is therefore
91eda14cbcSMatt Macy  * A ^ (255 - 1) = A^254.
92eda14cbcSMatt Macy  *
93eda14cbcSMatt Macy  * The up-to-three parity columns, P, Q, R over several data columns,
94eda14cbcSMatt Macy  * D_0, ... D_n-1, can be expressed by field operations:
95eda14cbcSMatt Macy  *
96eda14cbcSMatt Macy  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
97eda14cbcSMatt Macy  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
98eda14cbcSMatt Macy  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
99eda14cbcSMatt Macy  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
100eda14cbcSMatt Macy  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
101eda14cbcSMatt Macy  *
102eda14cbcSMatt Macy  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
103eda14cbcSMatt Macy  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
104eda14cbcSMatt Macy  * independent coefficients. (There are no additional coefficients that have
105eda14cbcSMatt Macy  * this property which is why the uncorrected Plank method breaks down.)
106eda14cbcSMatt Macy  *
107eda14cbcSMatt Macy  * See the reconstruction code below for how P, Q and R can used individually
108eda14cbcSMatt Macy  * or in concert to recover missing data columns.
109eda14cbcSMatt Macy  */
110eda14cbcSMatt Macy 
/* Indices of the parity columns within a row (always the first columns). */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Multiply a single GF(2^8) element by 2 (respectively 4): shift left and,
 * if the high bit was set, reduce by the primitive polynomial (0x1d).
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
117eda14cbcSMatt Macy 
/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 *
 * The do { } while (0) wrapper makes each invocation parse as a single
 * statement, so the macros stay safe in unbraced if/else bodies; behavior
 * at all existing call sites is unchanged.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
do { \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
} while (0)

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
do { \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
} while (0)
137eda14cbcSMatt Macy 
1387877fdebSMatt Macy static void
1397877fdebSMatt Macy vdev_raidz_row_free(raidz_row_t *rr)
140eda14cbcSMatt Macy {
141184c1b94SMartin Matuska 	for (int c = 0; c < rr->rr_cols; c++) {
142184c1b94SMartin Matuska 		raidz_col_t *rc = &rr->rr_col[c];
143eda14cbcSMatt Macy 
144184c1b94SMartin Matuska 		if (rc->rc_size != 0)
145184c1b94SMartin Matuska 			abd_free(rc->rc_abd);
146184c1b94SMartin Matuska 		if (rc->rc_orig_data != NULL)
147f9693befSMartin Matuska 			abd_free(rc->rc_orig_data);
148eda14cbcSMatt Macy 	}
149eda14cbcSMatt Macy 
1507877fdebSMatt Macy 	if (rr->rr_abd_empty != NULL)
1517877fdebSMatt Macy 		abd_free(rr->rr_abd_empty);
152eda14cbcSMatt Macy 
1537877fdebSMatt Macy 	kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols]));
1547877fdebSMatt Macy }
1557877fdebSMatt Macy 
1567877fdebSMatt Macy void
1577877fdebSMatt Macy vdev_raidz_map_free(raidz_map_t *rm)
1587877fdebSMatt Macy {
1597877fdebSMatt Macy 	for (int i = 0; i < rm->rm_nrows; i++)
1607877fdebSMatt Macy 		vdev_raidz_row_free(rm->rm_row[i]);
1617877fdebSMatt Macy 
1627877fdebSMatt Macy 	kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
163eda14cbcSMatt Macy }
164eda14cbcSMatt Macy 
165eda14cbcSMatt Macy static void
166eda14cbcSMatt Macy vdev_raidz_map_free_vsd(zio_t *zio)
167eda14cbcSMatt Macy {
168eda14cbcSMatt Macy 	raidz_map_t *rm = zio->io_vsd;
169eda14cbcSMatt Macy 
170eda14cbcSMatt Macy 	vdev_raidz_map_free(rm);
171eda14cbcSMatt Macy }
172eda14cbcSMatt Macy 
/*
 * vdev-specific-data ops for raidz zios: only a destructor is required;
 * the map is attached to zio->io_vsd and freed when the zio completes.
 */
const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	.vsd_free = vdev_raidz_map_free_vsd,
};
176eda14cbcSMatt Macy 
/*
 * Allocate per-column ABDs for a single-row write map.  Parity columns get
 * linear buffers (parity generation writes into them directly); columns
 * that are followed by a skip sector have that sector attached -- as extra
 * zeroed padding for parity columns, as a gang ABD for data columns -- so
 * the skip sector is issued in the same child I/O as the column itself.
 */
static void
vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift)
{
	int c;
	/* Number of skip sectors that wrap around onto parity columns. */
	int nwrapped = 0;
	uint64_t off = 0;
	raidz_row_t *rr = rm->rm_row[0];

	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
	ASSERT3U(rm->rm_nrows, ==, 1);

	/*
	 * Pad any parity columns with additional space to account for skip
	 * sectors.
	 */
	if (rm->rm_skipstart < rr->rr_firstdatacol) {
		ASSERT0(rm->rm_skipstart);
		nwrapped = rm->rm_nskip;
	} else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) {
		/* The run of skip sectors wraps past the last column. */
		nwrapped =
		    (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols;
	}

	/*
	 * Optional single skip sectors (rc_size == 0) will be handled in
	 * vdev_raidz_io_start_write().
	 */
	int skipped = rr->rr_scols - rr->rr_cols;

	/* Allocate buffers for the parity columns */
	for (c = 0; c < rr->rr_firstdatacol; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		/*
		 * Parity columns will pad out a linear ABD to account for
		 * the skip sector. A linear ABD is used here because
		 * parity calculations use the ABD buffer directly to calculate
		 * parity. This avoids doing a memcpy back to the ABD after the
		 * parity has been calculated. By issuing the parity column
		 * with the skip sector we can reduce contention on the child
		 * VDEV queue locks (vq_lock).
		 */
		if (c < nwrapped) {
			rc->rc_abd = abd_alloc_linear(
			    rc->rc_size + (1ULL << ashift), B_FALSE);
			abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift);
			skipped++;
		} else {
			rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
		}
	}

	/* Map data columns (c continues past the parity columns). */
	for (off = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct,
		    zio->io_abd, off, rc->rc_size);

		/*
		 * Generate I/O for skip sectors to improve aggregation
		 * continuity. We will use gang ABD's to reduce contention
		 * on the child VDEV queue locks (vq_lock) by issuing
		 * a single I/O that contains the data and skip sector.
		 *
		 * It is important to make sure that rc_size is not updated
		 * even though we are adding a skip sector to the ABD. When
		 * calculating the parity in vdev_raidz_generate_parity_row()
		 * the rc_size is used to iterate through the ABD's. We can
		 * not have zero'd out skip sectors used for calculating
		 * parity for raidz, because those same sectors are not used
		 * during reconstruction.
		 */
		if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) {
			rc->rc_abd = abd_alloc_gang();
			abd_gang_add(rc->rc_abd, abd, B_TRUE);
			abd_gang_add(rc->rc_abd,
			    abd_get_zeros(1ULL << ashift), B_TRUE);
			skipped++;
		} else {
			rc->rc_abd = abd;
		}
		off += rc->rc_size;
	}

	/* All of the zio's data must have been consumed exactly. */
	ASSERT3U(off, ==, zio->io_size);
	ASSERT3S(skipped, ==, rm->rm_nskip);
}
26381b22a98SMartin Matuska 
26481b22a98SMartin Matuska static void
26581b22a98SMartin Matuska vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm)
26681b22a98SMartin Matuska {
26781b22a98SMartin Matuska 	int c;
26881b22a98SMartin Matuska 	raidz_row_t *rr = rm->rm_row[0];
26981b22a98SMartin Matuska 
27081b22a98SMartin Matuska 	ASSERT3U(rm->rm_nrows, ==, 1);
27181b22a98SMartin Matuska 
27281b22a98SMartin Matuska 	/* Allocate buffers for the parity columns */
27381b22a98SMartin Matuska 	for (c = 0; c < rr->rr_firstdatacol; c++)
27481b22a98SMartin Matuska 		rr->rr_col[c].rc_abd =
27581b22a98SMartin Matuska 		    abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);
27681b22a98SMartin Matuska 
27781b22a98SMartin Matuska 	for (uint64_t off = 0; c < rr->rr_cols; c++) {
27881b22a98SMartin Matuska 		raidz_col_t *rc = &rr->rr_col[c];
27981b22a98SMartin Matuska 		rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
28081b22a98SMartin Matuska 		    zio->io_abd, off, rc->rc_size);
28181b22a98SMartin Matuska 		off += rc->rc_size;
28281b22a98SMartin Matuska 	}
28381b22a98SMartin Matuska }
28481b22a98SMartin Matuska 
285eda14cbcSMatt Macy /*
286eda14cbcSMatt Macy  * Divides the IO evenly across all child vdevs; usually, dcols is
287eda14cbcSMatt Macy  * the number of children in the target vdev.
288eda14cbcSMatt Macy  *
289eda14cbcSMatt Macy  * Avoid inlining the function to keep vdev_raidz_io_start(), which
290eda14cbcSMatt Macy  * is this functions only caller, as small as possible on the stack.
291eda14cbcSMatt Macy  */
noinline raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
    uint64_t nparity)
{
	raidz_row_t *rr;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = zio->io_offset >> ashift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = zio->io_size >> ashift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << ashift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/* This path always builds a single-row map (cf. dRAID). */
	raidz_map_t *rm =
	    kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
	rm->rm_nrows = 1;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/*
	 * acols: The columns that will be accessed.
	 * scols: The columns that will be accessed or skipped.
	 */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rr = kmem_alloc(offsetof(raidz_row_t, rr_col[scols]), KM_SLEEP);
	rm->rm_row[0] = rr;

	rr->rr_cols = acols;
	rr->rr_scols = scols;
	rr->rr_bigcols = bc;
	rr->rr_missingdata = 0;
	rr->rr_missingparity = 0;
	rr->rr_firstdatacol = nparity;
	rr->rr_abd_empty = NULL;
	rr->rr_nempty = 0;
#ifdef ZFS_DEBUG
	rr->rr_offset = zio->io_offset;
	rr->rr_size = zio->io_size;
#endif

	asize = 0;

	/*
	 * Assign each column its child vdev index, byte offset on that
	 * child, and size.  Columns past the wrap point land one sector
	 * deeper on their child vdev.
	 */
	for (c = 0; c < scols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << ashift;
		}
		rc->rc_devidx = col;
		rc->rc_offset = coff;
		rc->rc_abd = NULL;
		rc->rc_orig_data = NULL;
		rc->rc_error = 0;
		rc->rc_tried = 0;
		rc->rc_skipped = 0;
		rc->rc_force_repair = 0;
		rc->rc_allow_repair = 1;
		rc->rc_need_orig_restore = B_FALSE;

		/* Skipped, "big", and regular columns respectively. */
		if (c >= acols)
			rc->rc_size = 0;
		else if (c < bc)
			rc->rc_size = (q + 1) << ashift;
		else
			rc->rc_size = q << ashift;

		asize += rc->rc_size;
	}

	ASSERT3U(asize, ==, tot << ashift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	rm->rm_skipstart = bc;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rr->rr_cols >= 2);
	ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);

	if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
		/* Swap the P parity column with the first data column. */
		devidx = rr->rr_col[0].rc_devidx;
		o = rr->rr_col[0].rc_offset;
		rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
		rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
		rr->rr_col[1].rc_devidx = devidx;
		rr->rr_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	/* Attach ABDs to the columns; layout differs for reads and writes. */
	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_map_alloc_write(zio, rm, ashift);
	} else {
		vdev_raidz_map_alloc_read(zio, rm);
	}

	/* init RAIDZ parity ops */
	rm->rm_ops = vdev_raidz_math_get_ops();

	return (rm);
}
444eda14cbcSMatt Macy 
/*
 * Cursor state for the parity-generation iterators below: the current
 * write positions within the P, Q, and R parity buffers.  A NULL member
 * means that parity column is not being generated on this pass.
 */
struct pqr_struct {
	uint64_t *p;
	uint64_t *q;
	uint64_t *r;
};
450eda14cbcSMatt Macy 
451eda14cbcSMatt Macy static int
452eda14cbcSMatt Macy vdev_raidz_p_func(void *buf, size_t size, void *private)
453eda14cbcSMatt Macy {
454eda14cbcSMatt Macy 	struct pqr_struct *pqr = private;
455eda14cbcSMatt Macy 	const uint64_t *src = buf;
456eda14cbcSMatt Macy 	int i, cnt = size / sizeof (src[0]);
457eda14cbcSMatt Macy 
458eda14cbcSMatt Macy 	ASSERT(pqr->p && !pqr->q && !pqr->r);
459eda14cbcSMatt Macy 
460eda14cbcSMatt Macy 	for (i = 0; i < cnt; i++, src++, pqr->p++)
461eda14cbcSMatt Macy 		*pqr->p ^= *src;
462eda14cbcSMatt Macy 
463eda14cbcSMatt Macy 	return (0);
464eda14cbcSMatt Macy }
465eda14cbcSMatt Macy 
466eda14cbcSMatt Macy static int
467eda14cbcSMatt Macy vdev_raidz_pq_func(void *buf, size_t size, void *private)
468eda14cbcSMatt Macy {
469eda14cbcSMatt Macy 	struct pqr_struct *pqr = private;
470eda14cbcSMatt Macy 	const uint64_t *src = buf;
471eda14cbcSMatt Macy 	uint64_t mask;
472eda14cbcSMatt Macy 	int i, cnt = size / sizeof (src[0]);
473eda14cbcSMatt Macy 
474eda14cbcSMatt Macy 	ASSERT(pqr->p && pqr->q && !pqr->r);
475eda14cbcSMatt Macy 
476eda14cbcSMatt Macy 	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
477eda14cbcSMatt Macy 		*pqr->p ^= *src;
478eda14cbcSMatt Macy 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
479eda14cbcSMatt Macy 		*pqr->q ^= *src;
480eda14cbcSMatt Macy 	}
481eda14cbcSMatt Macy 
482eda14cbcSMatt Macy 	return (0);
483eda14cbcSMatt Macy }
484eda14cbcSMatt Macy 
485eda14cbcSMatt Macy static int
486eda14cbcSMatt Macy vdev_raidz_pqr_func(void *buf, size_t size, void *private)
487eda14cbcSMatt Macy {
488eda14cbcSMatt Macy 	struct pqr_struct *pqr = private;
489eda14cbcSMatt Macy 	const uint64_t *src = buf;
490eda14cbcSMatt Macy 	uint64_t mask;
491eda14cbcSMatt Macy 	int i, cnt = size / sizeof (src[0]);
492eda14cbcSMatt Macy 
493eda14cbcSMatt Macy 	ASSERT(pqr->p && pqr->q && pqr->r);
494eda14cbcSMatt Macy 
495eda14cbcSMatt Macy 	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
496eda14cbcSMatt Macy 		*pqr->p ^= *src;
497eda14cbcSMatt Macy 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
498eda14cbcSMatt Macy 		*pqr->q ^= *src;
499eda14cbcSMatt Macy 		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
500eda14cbcSMatt Macy 		*pqr->r ^= *src;
501eda14cbcSMatt Macy 	}
502eda14cbcSMatt Macy 
503eda14cbcSMatt Macy 	return (0);
504eda14cbcSMatt Macy }
505eda14cbcSMatt Macy 
506eda14cbcSMatt Macy static void
5077877fdebSMatt Macy vdev_raidz_generate_parity_p(raidz_row_t *rr)
508eda14cbcSMatt Macy {
5097877fdebSMatt Macy 	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
510eda14cbcSMatt Macy 
5117877fdebSMatt Macy 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
5127877fdebSMatt Macy 		abd_t *src = rr->rr_col[c].rc_abd;
513eda14cbcSMatt Macy 
5147877fdebSMatt Macy 		if (c == rr->rr_firstdatacol) {
5157877fdebSMatt Macy 			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
516eda14cbcSMatt Macy 		} else {
517eda14cbcSMatt Macy 			struct pqr_struct pqr = { p, NULL, NULL };
5187877fdebSMatt Macy 			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
519eda14cbcSMatt Macy 			    vdev_raidz_p_func, &pqr);
520eda14cbcSMatt Macy 		}
521eda14cbcSMatt Macy 	}
522eda14cbcSMatt Macy }
523eda14cbcSMatt Macy 
/*
 * Compute P (XOR) and Q (GF(2^8) multiply-accumulate by 2) parity for one
 * row using the portable scalar path.  Data columns shorter than the
 * parity columns are treated as if zero-filled to full parity length.
 */
static void
vdev_raidz_generate_parity_pq(raidz_row_t *rr)
{
	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
	uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
	uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
	    rr->rr_col[VDEV_RAIDZ_Q].rc_size);

	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
		abd_t *src = rr->rr_col[c].rc_abd;

		uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);

		if (c == rr->rr_firstdatacol) {
			/* The first data column seeds both parity buffers. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
			(void) memcpy(q, p, rr->rr_col[c].rc_size);

			/* Zero any tail this column does not cover. */
			for (uint64_t i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
			}
		} else {
			struct pqr_struct pqr = { p, q, NULL };

			ASSERT(ccnt <= pcnt);
			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
			    vdev_raidz_pq_func, &pqr);

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			uint64_t mask;
			for (uint64_t i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
			}
		}
	}
}
565eda14cbcSMatt Macy 
/*
 * Compute P, Q, and R parity for one row using the portable scalar path.
 * Q accumulates GF(2^8) powers of 2 and R powers of 4.  Data columns
 * shorter than the parity columns are treated as if zero-filled.
 */
static void
vdev_raidz_generate_parity_pqr(raidz_row_t *rr)
{
	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
	uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
	uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd);
	uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
	    rr->rr_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
	    rr->rr_col[VDEV_RAIDZ_R].rc_size);

	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
		abd_t *src = rr->rr_col[c].rc_abd;

		uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);

		if (c == rr->rr_firstdatacol) {
			/* First data column seeds all three parity buffers. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
			(void) memcpy(q, p, rr->rr_col[c].rc_size);
			(void) memcpy(r, p, rr->rr_col[c].rc_size);

			/* Zero any tail this column does not cover. */
			for (uint64_t i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
				r[i] = 0;
			}
		} else {
			struct pqr_struct pqr = { p, q, r };

			ASSERT(ccnt <= pcnt);
			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
			    vdev_raidz_pqr_func, &pqr);

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			uint64_t mask;
			for (uint64_t i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
				VDEV_RAIDZ_64MUL_4(r[i], mask);
			}
		}
	}
}
613eda14cbcSMatt Macy 
614eda14cbcSMatt Macy /*
615eda14cbcSMatt Macy  * Generate RAID parity in the first virtual columns according to the number of
616eda14cbcSMatt Macy  * parity columns available.
617eda14cbcSMatt Macy  */
618eda14cbcSMatt Macy void
6197877fdebSMatt Macy vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr)
620eda14cbcSMatt Macy {
6217877fdebSMatt Macy 	ASSERT3U(rr->rr_cols, !=, 0);
6227877fdebSMatt Macy 
623eda14cbcSMatt Macy 	/* Generate using the new math implementation */
6247877fdebSMatt Macy 	if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL)
625eda14cbcSMatt Macy 		return;
626eda14cbcSMatt Macy 
6277877fdebSMatt Macy 	switch (rr->rr_firstdatacol) {
628eda14cbcSMatt Macy 	case 1:
6297877fdebSMatt Macy 		vdev_raidz_generate_parity_p(rr);
630eda14cbcSMatt Macy 		break;
631eda14cbcSMatt Macy 	case 2:
6327877fdebSMatt Macy 		vdev_raidz_generate_parity_pq(rr);
633eda14cbcSMatt Macy 		break;
634eda14cbcSMatt Macy 	case 3:
6357877fdebSMatt Macy 		vdev_raidz_generate_parity_pqr(rr);
636eda14cbcSMatt Macy 		break;
637eda14cbcSMatt Macy 	default:
638eda14cbcSMatt Macy 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
639eda14cbcSMatt Macy 	}
640eda14cbcSMatt Macy }
641eda14cbcSMatt Macy 
6427877fdebSMatt Macy void
6437877fdebSMatt Macy vdev_raidz_generate_parity(raidz_map_t *rm)
6447877fdebSMatt Macy {
6457877fdebSMatt Macy 	for (int i = 0; i < rm->rm_nrows; i++) {
6467877fdebSMatt Macy 		raidz_row_t *rr = rm->rm_row[i];
6477877fdebSMatt Macy 		vdev_raidz_generate_parity_row(rm, rr);
6487877fdebSMatt Macy 	}
6497877fdebSMatt Macy }
6507877fdebSMatt Macy 
651eda14cbcSMatt Macy static int
652eda14cbcSMatt Macy vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
653eda14cbcSMatt Macy {
654e92ffd9bSMartin Matuska 	(void) private;
655eda14cbcSMatt Macy 	uint64_t *dst = dbuf;
656eda14cbcSMatt Macy 	uint64_t *src = sbuf;
657eda14cbcSMatt Macy 	int cnt = size / sizeof (src[0]);
658eda14cbcSMatt Macy 
659eda14cbcSMatt Macy 	for (int i = 0; i < cnt; i++) {
660eda14cbcSMatt Macy 		dst[i] ^= src[i];
661eda14cbcSMatt Macy 	}
662eda14cbcSMatt Macy 
663eda14cbcSMatt Macy 	return (0);
664eda14cbcSMatt Macy }
665eda14cbcSMatt Macy 
666eda14cbcSMatt Macy static int
667eda14cbcSMatt Macy vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
668eda14cbcSMatt Macy     void *private)
669eda14cbcSMatt Macy {
670e92ffd9bSMartin Matuska 	(void) private;
671eda14cbcSMatt Macy 	uint64_t *dst = dbuf;
672eda14cbcSMatt Macy 	uint64_t *src = sbuf;
673eda14cbcSMatt Macy 	uint64_t mask;
674eda14cbcSMatt Macy 	int cnt = size / sizeof (dst[0]);
675eda14cbcSMatt Macy 
676eda14cbcSMatt Macy 	for (int i = 0; i < cnt; i++, dst++, src++) {
677eda14cbcSMatt Macy 		VDEV_RAIDZ_64MUL_2(*dst, mask);
678eda14cbcSMatt Macy 		*dst ^= *src;
679eda14cbcSMatt Macy 	}
680eda14cbcSMatt Macy 
681eda14cbcSMatt Macy 	return (0);
682eda14cbcSMatt Macy }
683eda14cbcSMatt Macy 
684eda14cbcSMatt Macy static int
685eda14cbcSMatt Macy vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
686eda14cbcSMatt Macy {
687e92ffd9bSMartin Matuska 	(void) private;
688eda14cbcSMatt Macy 	uint64_t *dst = buf;
689eda14cbcSMatt Macy 	uint64_t mask;
690eda14cbcSMatt Macy 	int cnt = size / sizeof (dst[0]);
691eda14cbcSMatt Macy 
692eda14cbcSMatt Macy 	for (int i = 0; i < cnt; i++, dst++) {
693eda14cbcSMatt Macy 		/* same operation as vdev_raidz_reconst_q_pre_func() on dst */
694eda14cbcSMatt Macy 		VDEV_RAIDZ_64MUL_2(*dst, mask);
695eda14cbcSMatt Macy 	}
696eda14cbcSMatt Macy 
697eda14cbcSMatt Macy 	return (0);
698eda14cbcSMatt Macy }
699eda14cbcSMatt Macy 
700eda14cbcSMatt Macy struct reconst_q_struct {
701eda14cbcSMatt Macy 	uint64_t *q;
702eda14cbcSMatt Macy 	int exp;
703eda14cbcSMatt Macy };
704eda14cbcSMatt Macy 
705eda14cbcSMatt Macy static int
706eda14cbcSMatt Macy vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
707eda14cbcSMatt Macy {
708eda14cbcSMatt Macy 	struct reconst_q_struct *rq = private;
709eda14cbcSMatt Macy 	uint64_t *dst = buf;
710eda14cbcSMatt Macy 	int cnt = size / sizeof (dst[0]);
711eda14cbcSMatt Macy 
712eda14cbcSMatt Macy 	for (int i = 0; i < cnt; i++, dst++, rq->q++) {
713eda14cbcSMatt Macy 		int j;
714eda14cbcSMatt Macy 		uint8_t *b;
715eda14cbcSMatt Macy 
716eda14cbcSMatt Macy 		*dst ^= *rq->q;
717eda14cbcSMatt Macy 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
718eda14cbcSMatt Macy 			*b = vdev_raidz_exp2(*b, rq->exp);
719eda14cbcSMatt Macy 		}
720eda14cbcSMatt Macy 	}
721eda14cbcSMatt Macy 
722eda14cbcSMatt Macy 	return (0);
723eda14cbcSMatt Macy }
724eda14cbcSMatt Macy 
725eda14cbcSMatt Macy struct reconst_pq_struct {
726eda14cbcSMatt Macy 	uint8_t *p;
727eda14cbcSMatt Macy 	uint8_t *q;
728eda14cbcSMatt Macy 	uint8_t *pxy;
729eda14cbcSMatt Macy 	uint8_t *qxy;
730eda14cbcSMatt Macy 	int aexp;
731eda14cbcSMatt Macy 	int bexp;
732eda14cbcSMatt Macy };
733eda14cbcSMatt Macy 
734eda14cbcSMatt Macy static int
735eda14cbcSMatt Macy vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
736eda14cbcSMatt Macy {
737eda14cbcSMatt Macy 	struct reconst_pq_struct *rpq = private;
738eda14cbcSMatt Macy 	uint8_t *xd = xbuf;
739eda14cbcSMatt Macy 	uint8_t *yd = ybuf;
740eda14cbcSMatt Macy 
741eda14cbcSMatt Macy 	for (int i = 0; i < size;
742eda14cbcSMatt Macy 	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
743eda14cbcSMatt Macy 		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
744eda14cbcSMatt Macy 		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
745eda14cbcSMatt Macy 		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
746eda14cbcSMatt Macy 	}
747eda14cbcSMatt Macy 
748eda14cbcSMatt Macy 	return (0);
749eda14cbcSMatt Macy }
750eda14cbcSMatt Macy 
751eda14cbcSMatt Macy static int
752eda14cbcSMatt Macy vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
753eda14cbcSMatt Macy {
754eda14cbcSMatt Macy 	struct reconst_pq_struct *rpq = private;
755eda14cbcSMatt Macy 	uint8_t *xd = xbuf;
756eda14cbcSMatt Macy 
757eda14cbcSMatt Macy 	for (int i = 0; i < size;
758eda14cbcSMatt Macy 	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
759eda14cbcSMatt Macy 		/* same operation as vdev_raidz_reconst_pq_func() on xd */
760eda14cbcSMatt Macy 		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
761eda14cbcSMatt Macy 		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
762eda14cbcSMatt Macy 	}
763eda14cbcSMatt Macy 
764eda14cbcSMatt Macy 	return (0);
765eda14cbcSMatt Macy }
766eda14cbcSMatt Macy 
767f9693befSMartin Matuska static void
7687877fdebSMatt Macy vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts)
769eda14cbcSMatt Macy {
770eda14cbcSMatt Macy 	int x = tgts[0];
771eda14cbcSMatt Macy 	abd_t *dst, *src;
772eda14cbcSMatt Macy 
7737877fdebSMatt Macy 	ASSERT3U(ntgts, ==, 1);
7747877fdebSMatt Macy 	ASSERT3U(x, >=, rr->rr_firstdatacol);
7757877fdebSMatt Macy 	ASSERT3U(x, <, rr->rr_cols);
776eda14cbcSMatt Macy 
7777877fdebSMatt Macy 	ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size);
778eda14cbcSMatt Macy 
7797877fdebSMatt Macy 	src = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
7807877fdebSMatt Macy 	dst = rr->rr_col[x].rc_abd;
781eda14cbcSMatt Macy 
7827877fdebSMatt Macy 	abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size);
783eda14cbcSMatt Macy 
7847877fdebSMatt Macy 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
7857877fdebSMatt Macy 		uint64_t size = MIN(rr->rr_col[x].rc_size,
7867877fdebSMatt Macy 		    rr->rr_col[c].rc_size);
787eda14cbcSMatt Macy 
7887877fdebSMatt Macy 		src = rr->rr_col[c].rc_abd;
789eda14cbcSMatt Macy 
790eda14cbcSMatt Macy 		if (c == x)
791eda14cbcSMatt Macy 			continue;
792eda14cbcSMatt Macy 
793eda14cbcSMatt Macy 		(void) abd_iterate_func2(dst, src, 0, 0, size,
794eda14cbcSMatt Macy 		    vdev_raidz_reconst_p_func, NULL);
795eda14cbcSMatt Macy 	}
796eda14cbcSMatt Macy }
797eda14cbcSMatt Macy 
798f9693befSMartin Matuska static void
7997877fdebSMatt Macy vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
800eda14cbcSMatt Macy {
801eda14cbcSMatt Macy 	int x = tgts[0];
802eda14cbcSMatt Macy 	int c, exp;
803eda14cbcSMatt Macy 	abd_t *dst, *src;
804eda14cbcSMatt Macy 
805eda14cbcSMatt Macy 	ASSERT(ntgts == 1);
806eda14cbcSMatt Macy 
8077877fdebSMatt Macy 	ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size);
808eda14cbcSMatt Macy 
8097877fdebSMatt Macy 	for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
8107877fdebSMatt Macy 		uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size,
8117877fdebSMatt Macy 		    rr->rr_col[c].rc_size);
812eda14cbcSMatt Macy 
8137877fdebSMatt Macy 		src = rr->rr_col[c].rc_abd;
8147877fdebSMatt Macy 		dst = rr->rr_col[x].rc_abd;
815eda14cbcSMatt Macy 
8167877fdebSMatt Macy 		if (c == rr->rr_firstdatacol) {
817eda14cbcSMatt Macy 			abd_copy(dst, src, size);
8187877fdebSMatt Macy 			if (rr->rr_col[x].rc_size > size) {
819eda14cbcSMatt Macy 				abd_zero_off(dst, size,
8207877fdebSMatt Macy 				    rr->rr_col[x].rc_size - size);
8217877fdebSMatt Macy 			}
822eda14cbcSMatt Macy 		} else {
8237877fdebSMatt Macy 			ASSERT3U(size, <=, rr->rr_col[x].rc_size);
824eda14cbcSMatt Macy 			(void) abd_iterate_func2(dst, src, 0, 0, size,
825eda14cbcSMatt Macy 			    vdev_raidz_reconst_q_pre_func, NULL);
826eda14cbcSMatt Macy 			(void) abd_iterate_func(dst,
8277877fdebSMatt Macy 			    size, rr->rr_col[x].rc_size - size,
828eda14cbcSMatt Macy 			    vdev_raidz_reconst_q_pre_tail_func, NULL);
829eda14cbcSMatt Macy 		}
830eda14cbcSMatt Macy 	}
831eda14cbcSMatt Macy 
8327877fdebSMatt Macy 	src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
8337877fdebSMatt Macy 	dst = rr->rr_col[x].rc_abd;
8347877fdebSMatt Macy 	exp = 255 - (rr->rr_cols - 1 - x);
835eda14cbcSMatt Macy 
836eda14cbcSMatt Macy 	struct reconst_q_struct rq = { abd_to_buf(src), exp };
8377877fdebSMatt Macy 	(void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size,
838eda14cbcSMatt Macy 	    vdev_raidz_reconst_q_post_func, &rq);
839eda14cbcSMatt Macy }
840eda14cbcSMatt Macy 
841f9693befSMartin Matuska static void
8427877fdebSMatt Macy vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
843eda14cbcSMatt Macy {
844eda14cbcSMatt Macy 	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
845eda14cbcSMatt Macy 	abd_t *pdata, *qdata;
846eda14cbcSMatt Macy 	uint64_t xsize, ysize;
847eda14cbcSMatt Macy 	int x = tgts[0];
848eda14cbcSMatt Macy 	int y = tgts[1];
849eda14cbcSMatt Macy 	abd_t *xd, *yd;
850eda14cbcSMatt Macy 
851eda14cbcSMatt Macy 	ASSERT(ntgts == 2);
852eda14cbcSMatt Macy 	ASSERT(x < y);
8537877fdebSMatt Macy 	ASSERT(x >= rr->rr_firstdatacol);
8547877fdebSMatt Macy 	ASSERT(y < rr->rr_cols);
855eda14cbcSMatt Macy 
8567877fdebSMatt Macy 	ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size);
857eda14cbcSMatt Macy 
858eda14cbcSMatt Macy 	/*
859eda14cbcSMatt Macy 	 * Move the parity data aside -- we're going to compute parity as
860eda14cbcSMatt Macy 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
861eda14cbcSMatt Macy 	 * reuse the parity generation mechanism without trashing the actual
862eda14cbcSMatt Macy 	 * parity so we make those columns appear to be full of zeros by
863eda14cbcSMatt Macy 	 * setting their lengths to zero.
864eda14cbcSMatt Macy 	 */
8657877fdebSMatt Macy 	pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
8667877fdebSMatt Macy 	qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
8677877fdebSMatt Macy 	xsize = rr->rr_col[x].rc_size;
8687877fdebSMatt Macy 	ysize = rr->rr_col[y].rc_size;
869eda14cbcSMatt Macy 
8707877fdebSMatt Macy 	rr->rr_col[VDEV_RAIDZ_P].rc_abd =
8717877fdebSMatt Macy 	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
8727877fdebSMatt Macy 	rr->rr_col[VDEV_RAIDZ_Q].rc_abd =
8737877fdebSMatt Macy 	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
8747877fdebSMatt Macy 	rr->rr_col[x].rc_size = 0;
8757877fdebSMatt Macy 	rr->rr_col[y].rc_size = 0;
876eda14cbcSMatt Macy 
8777877fdebSMatt Macy 	vdev_raidz_generate_parity_pq(rr);
878eda14cbcSMatt Macy 
8797877fdebSMatt Macy 	rr->rr_col[x].rc_size = xsize;
8807877fdebSMatt Macy 	rr->rr_col[y].rc_size = ysize;
881eda14cbcSMatt Macy 
882eda14cbcSMatt Macy 	p = abd_to_buf(pdata);
883eda14cbcSMatt Macy 	q = abd_to_buf(qdata);
8847877fdebSMatt Macy 	pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
8857877fdebSMatt Macy 	qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
8867877fdebSMatt Macy 	xd = rr->rr_col[x].rc_abd;
8877877fdebSMatt Macy 	yd = rr->rr_col[y].rc_abd;
888eda14cbcSMatt Macy 
889eda14cbcSMatt Macy 	/*
890eda14cbcSMatt Macy 	 * We now have:
891eda14cbcSMatt Macy 	 *	Pxy = P + D_x + D_y
892eda14cbcSMatt Macy 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
893eda14cbcSMatt Macy 	 *
894eda14cbcSMatt Macy 	 * We can then solve for D_x:
895eda14cbcSMatt Macy 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
896eda14cbcSMatt Macy 	 * where
897eda14cbcSMatt Macy 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
898eda14cbcSMatt Macy 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
899eda14cbcSMatt Macy 	 *
900eda14cbcSMatt Macy 	 * With D_x in hand, we can easily solve for D_y:
901eda14cbcSMatt Macy 	 *	D_y = P + Pxy + D_x
902eda14cbcSMatt Macy 	 */
903eda14cbcSMatt Macy 
904eda14cbcSMatt Macy 	a = vdev_raidz_pow2[255 + x - y];
9057877fdebSMatt Macy 	b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)];
906eda14cbcSMatt Macy 	tmp = 255 - vdev_raidz_log2[a ^ 1];
907eda14cbcSMatt Macy 
908eda14cbcSMatt Macy 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
909eda14cbcSMatt Macy 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
910eda14cbcSMatt Macy 
911eda14cbcSMatt Macy 	ASSERT3U(xsize, >=, ysize);
912eda14cbcSMatt Macy 	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
913eda14cbcSMatt Macy 
914eda14cbcSMatt Macy 	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
915eda14cbcSMatt Macy 	    vdev_raidz_reconst_pq_func, &rpq);
916eda14cbcSMatt Macy 	(void) abd_iterate_func(xd, ysize, xsize - ysize,
917eda14cbcSMatt Macy 	    vdev_raidz_reconst_pq_tail_func, &rpq);
918eda14cbcSMatt Macy 
9197877fdebSMatt Macy 	abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
9207877fdebSMatt Macy 	abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
921eda14cbcSMatt Macy 
922eda14cbcSMatt Macy 	/*
923eda14cbcSMatt Macy 	 * Restore the saved parity data.
924eda14cbcSMatt Macy 	 */
9257877fdebSMatt Macy 	rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata;
9267877fdebSMatt Macy 	rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;
927eda14cbcSMatt Macy }
928eda14cbcSMatt Macy 
929eda14cbcSMatt Macy /*
930eda14cbcSMatt Macy  * In the general case of reconstruction, we must solve the system of linear
931eda14cbcSMatt Macy  * equations defined by the coefficients used to generate parity as well as
932eda14cbcSMatt Macy  * the contents of the data and parity disks. This can be expressed with
933eda14cbcSMatt Macy  * vectors for the original data (D) and the actual data (d) and parity (p)
934eda14cbcSMatt Macy  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
935eda14cbcSMatt Macy  *
936eda14cbcSMatt Macy  *            __   __                     __     __
937eda14cbcSMatt Macy  *            |     |         __     __   |  p_0  |
938eda14cbcSMatt Macy  *            |  V  |         |  D_0  |   | p_m-1 |
939eda14cbcSMatt Macy  *            |     |    x    |   :   | = |  d_0  |
940eda14cbcSMatt Macy  *            |  I  |         | D_n-1 |   |   :   |
941eda14cbcSMatt Macy  *            |     |         ~~     ~~   | d_n-1 |
942eda14cbcSMatt Macy  *            ~~   ~~                     ~~     ~~
943eda14cbcSMatt Macy  *
944eda14cbcSMatt Macy  * I is simply a square identity matrix of size n, and V is a vandermonde
945eda14cbcSMatt Macy  * matrix defined by the coefficients we chose for the various parity columns
946eda14cbcSMatt Macy  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
947eda14cbcSMatt Macy  * computation as well as linear separability.
948eda14cbcSMatt Macy  *
949eda14cbcSMatt Macy  *      __               __               __     __
950eda14cbcSMatt Macy  *      |   1   ..  1 1 1 |               |  p_0  |
951eda14cbcSMatt Macy  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
952eda14cbcSMatt Macy  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
953eda14cbcSMatt Macy  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
954eda14cbcSMatt Macy  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
955eda14cbcSMatt Macy  *      |   :       : : : |   |   :   |   |  d_2  |
956eda14cbcSMatt Macy  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
957eda14cbcSMatt Macy  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
958eda14cbcSMatt Macy  *      |   0   ..  0 0 1 |               | d_n-1 |
959eda14cbcSMatt Macy  *      ~~               ~~               ~~     ~~
960eda14cbcSMatt Macy  *
961eda14cbcSMatt Macy  * Note that I, V, d, and p are known. To compute D, we must invert the
962eda14cbcSMatt Macy  * matrix and use the known data and parity values to reconstruct the unknown
963eda14cbcSMatt Macy  * data values. We begin by removing the rows in V|I and d|p that correspond
964eda14cbcSMatt Macy  * to failed or missing columns; we then make V|I square (n x n) and d|p
965eda14cbcSMatt Macy  * sized n by removing rows corresponding to unused parity from the bottom up
966eda14cbcSMatt Macy  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
967eda14cbcSMatt Macy  * using Gauss-Jordan elimination. In the example below we use m=3 parity
968eda14cbcSMatt Macy  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
969eda14cbcSMatt Macy  *           __                               __
970eda14cbcSMatt Macy  *           |  1   1   1   1   1   1   1   1  |
971eda14cbcSMatt Macy  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
972eda14cbcSMatt Macy  *           |  19 205 116  29  64  16  4   1  |      / /
973eda14cbcSMatt Macy  *           |  1   0   0   0   0   0   0   0  |     / /
974eda14cbcSMatt Macy  *           |  0   1   0   0   0   0   0   0  | <--' /
975eda14cbcSMatt Macy  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
976eda14cbcSMatt Macy  *           |  0   0   0   1   0   0   0   0  |
977eda14cbcSMatt Macy  *           |  0   0   0   0   1   0   0   0  |
978eda14cbcSMatt Macy  *           |  0   0   0   0   0   1   0   0  |
979eda14cbcSMatt Macy  *           |  0   0   0   0   0   0   1   0  |
980eda14cbcSMatt Macy  *           |  0   0   0   0   0   0   0   1  |
981eda14cbcSMatt Macy  *           ~~                               ~~
982eda14cbcSMatt Macy  *           __                               __
983eda14cbcSMatt Macy  *           |  1   1   1   1   1   1   1   1  |
984eda14cbcSMatt Macy  *           | 128  64  32  16  8   4   2   1  |
985eda14cbcSMatt Macy  *           |  19 205 116  29  64  16  4   1  |
986eda14cbcSMatt Macy  *           |  1   0   0   0   0   0   0   0  |
987eda14cbcSMatt Macy  *           |  0   1   0   0   0   0   0   0  |
988eda14cbcSMatt Macy  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
989eda14cbcSMatt Macy  *           |  0   0   0   1   0   0   0   0  |
990eda14cbcSMatt Macy  *           |  0   0   0   0   1   0   0   0  |
991eda14cbcSMatt Macy  *           |  0   0   0   0   0   1   0   0  |
992eda14cbcSMatt Macy  *           |  0   0   0   0   0   0   1   0  |
993eda14cbcSMatt Macy  *           |  0   0   0   0   0   0   0   1  |
994eda14cbcSMatt Macy  *           ~~                               ~~
995eda14cbcSMatt Macy  *
996eda14cbcSMatt Macy  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
997eda14cbcSMatt Macy  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
998eda14cbcSMatt Macy  * matrix is not singular.
999eda14cbcSMatt Macy  * __                                                                 __
1000eda14cbcSMatt Macy  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1001eda14cbcSMatt Macy  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1002eda14cbcSMatt Macy  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1003eda14cbcSMatt Macy  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1004eda14cbcSMatt Macy  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1005eda14cbcSMatt Macy  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1006eda14cbcSMatt Macy  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1007eda14cbcSMatt Macy  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1008eda14cbcSMatt Macy  * ~~                                                                 ~~
1009eda14cbcSMatt Macy  * __                                                                 __
1010eda14cbcSMatt Macy  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1011eda14cbcSMatt Macy  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1012eda14cbcSMatt Macy  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1013eda14cbcSMatt Macy  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1014eda14cbcSMatt Macy  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1015eda14cbcSMatt Macy  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1016eda14cbcSMatt Macy  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1017eda14cbcSMatt Macy  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1018eda14cbcSMatt Macy  * ~~                                                                 ~~
1019eda14cbcSMatt Macy  * __                                                                 __
1020eda14cbcSMatt Macy  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1021eda14cbcSMatt Macy  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1022eda14cbcSMatt Macy  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1023eda14cbcSMatt Macy  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1024eda14cbcSMatt Macy  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1025eda14cbcSMatt Macy  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1026eda14cbcSMatt Macy  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1027eda14cbcSMatt Macy  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1028eda14cbcSMatt Macy  * ~~                                                                 ~~
1029eda14cbcSMatt Macy  * __                                                                 __
1030eda14cbcSMatt Macy  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1031eda14cbcSMatt Macy  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1032eda14cbcSMatt Macy  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1033eda14cbcSMatt Macy  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1034eda14cbcSMatt Macy  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1035eda14cbcSMatt Macy  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1036eda14cbcSMatt Macy  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1037eda14cbcSMatt Macy  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1038eda14cbcSMatt Macy  * ~~                                                                 ~~
1039eda14cbcSMatt Macy  * __                                                                 __
1040eda14cbcSMatt Macy  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1041eda14cbcSMatt Macy  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1042eda14cbcSMatt Macy  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1043eda14cbcSMatt Macy  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1044eda14cbcSMatt Macy  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1045eda14cbcSMatt Macy  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1046eda14cbcSMatt Macy  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1047eda14cbcSMatt Macy  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1048eda14cbcSMatt Macy  * ~~                                                                 ~~
1049eda14cbcSMatt Macy  * __                                                                 __
1050eda14cbcSMatt Macy  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1051eda14cbcSMatt Macy  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1052eda14cbcSMatt Macy  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1053eda14cbcSMatt Macy  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1054eda14cbcSMatt Macy  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1055eda14cbcSMatt Macy  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1056eda14cbcSMatt Macy  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1057eda14cbcSMatt Macy  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1058eda14cbcSMatt Macy  * ~~                                                                 ~~
1059eda14cbcSMatt Macy  *                   __                               __
1060eda14cbcSMatt Macy  *                   |  0   0   1   0   0   0   0   0  |
1061eda14cbcSMatt Macy  *                   | 167 100  5   41 159 169 217 208 |
1062eda14cbcSMatt Macy  *                   | 166 100  4   40 158 168 216 209 |
1063eda14cbcSMatt Macy  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1064eda14cbcSMatt Macy  *                   |  0   0   0   0   1   0   0   0  |
1065eda14cbcSMatt Macy  *                   |  0   0   0   0   0   1   0   0  |
1066eda14cbcSMatt Macy  *                   |  0   0   0   0   0   0   1   0  |
1067eda14cbcSMatt Macy  *                   |  0   0   0   0   0   0   0   1  |
1068eda14cbcSMatt Macy  *                   ~~                               ~~
1069eda14cbcSMatt Macy  *
1070eda14cbcSMatt Macy  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1071eda14cbcSMatt Macy  * of the missing data.
1072eda14cbcSMatt Macy  *
1073eda14cbcSMatt Macy  * As is apparent from the example above, the only non-trivial rows in the
1074eda14cbcSMatt Macy  * inverse matrix correspond to the data disks that we're trying to
1075eda14cbcSMatt Macy  * reconstruct. Indeed, those are the only rows we need as the others would
1076eda14cbcSMatt Macy  * only be useful for reconstructing data known or assumed to be valid. For
1077eda14cbcSMatt Macy  * that reason, we only build the coefficients in the rows that correspond to
1078eda14cbcSMatt Macy  * targeted columns.
1079eda14cbcSMatt Macy  */
1080eda14cbcSMatt Macy 
1081eda14cbcSMatt Macy static void
10827877fdebSMatt Macy vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map,
1083eda14cbcSMatt Macy     uint8_t **rows)
1084eda14cbcSMatt Macy {
1085eda14cbcSMatt Macy 	int i, j;
1086eda14cbcSMatt Macy 	int pow;
1087eda14cbcSMatt Macy 
10887877fdebSMatt Macy 	ASSERT(n == rr->rr_cols - rr->rr_firstdatacol);
1089eda14cbcSMatt Macy 
1090eda14cbcSMatt Macy 	/*
1091eda14cbcSMatt Macy 	 * Fill in the missing rows of interest.
1092eda14cbcSMatt Macy 	 */
1093eda14cbcSMatt Macy 	for (i = 0; i < nmap; i++) {
1094eda14cbcSMatt Macy 		ASSERT3S(0, <=, map[i]);
1095eda14cbcSMatt Macy 		ASSERT3S(map[i], <=, 2);
1096eda14cbcSMatt Macy 
1097eda14cbcSMatt Macy 		pow = map[i] * n;
1098eda14cbcSMatt Macy 		if (pow > 255)
1099eda14cbcSMatt Macy 			pow -= 255;
1100eda14cbcSMatt Macy 		ASSERT(pow <= 255);
1101eda14cbcSMatt Macy 
1102eda14cbcSMatt Macy 		for (j = 0; j < n; j++) {
1103eda14cbcSMatt Macy 			pow -= map[i];
1104eda14cbcSMatt Macy 			if (pow < 0)
1105eda14cbcSMatt Macy 				pow += 255;
1106eda14cbcSMatt Macy 			rows[i][j] = vdev_raidz_pow2[pow];
1107eda14cbcSMatt Macy 		}
1108eda14cbcSMatt Macy 	}
1109eda14cbcSMatt Macy }
1110eda14cbcSMatt Macy 
1111eda14cbcSMatt Macy static void
11127877fdebSMatt Macy vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing,
1113eda14cbcSMatt Macy     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1114eda14cbcSMatt Macy {
1115eda14cbcSMatt Macy 	int i, j, ii, jj;
1116eda14cbcSMatt Macy 	uint8_t log;
1117eda14cbcSMatt Macy 
1118eda14cbcSMatt Macy 	/*
1119eda14cbcSMatt Macy 	 * Assert that the first nmissing entries from the array of used
1120eda14cbcSMatt Macy 	 * columns correspond to parity columns and that subsequent entries
1121eda14cbcSMatt Macy 	 * correspond to data columns.
1122eda14cbcSMatt Macy 	 */
1123eda14cbcSMatt Macy 	for (i = 0; i < nmissing; i++) {
11247877fdebSMatt Macy 		ASSERT3S(used[i], <, rr->rr_firstdatacol);
1125eda14cbcSMatt Macy 	}
1126eda14cbcSMatt Macy 	for (; i < n; i++) {
11277877fdebSMatt Macy 		ASSERT3S(used[i], >=, rr->rr_firstdatacol);
1128eda14cbcSMatt Macy 	}
1129eda14cbcSMatt Macy 
1130eda14cbcSMatt Macy 	/*
1131eda14cbcSMatt Macy 	 * First initialize the storage where we'll compute the inverse rows.
1132eda14cbcSMatt Macy 	 */
1133eda14cbcSMatt Macy 	for (i = 0; i < nmissing; i++) {
1134eda14cbcSMatt Macy 		for (j = 0; j < n; j++) {
1135eda14cbcSMatt Macy 			invrows[i][j] = (i == j) ? 1 : 0;
1136eda14cbcSMatt Macy 		}
1137eda14cbcSMatt Macy 	}
1138eda14cbcSMatt Macy 
1139eda14cbcSMatt Macy 	/*
1140eda14cbcSMatt Macy 	 * Subtract all trivial rows from the rows of consequence.
1141eda14cbcSMatt Macy 	 */
1142eda14cbcSMatt Macy 	for (i = 0; i < nmissing; i++) {
1143eda14cbcSMatt Macy 		for (j = nmissing; j < n; j++) {
11447877fdebSMatt Macy 			ASSERT3U(used[j], >=, rr->rr_firstdatacol);
11457877fdebSMatt Macy 			jj = used[j] - rr->rr_firstdatacol;
1146eda14cbcSMatt Macy 			ASSERT3S(jj, <, n);
1147eda14cbcSMatt Macy 			invrows[i][j] = rows[i][jj];
1148eda14cbcSMatt Macy 			rows[i][jj] = 0;
1149eda14cbcSMatt Macy 		}
1150eda14cbcSMatt Macy 	}
1151eda14cbcSMatt Macy 
1152eda14cbcSMatt Macy 	/*
1153eda14cbcSMatt Macy 	 * For each of the rows of interest, we must normalize it and subtract
1154eda14cbcSMatt Macy 	 * a multiple of it from the other rows.
1155eda14cbcSMatt Macy 	 */
1156eda14cbcSMatt Macy 	for (i = 0; i < nmissing; i++) {
1157eda14cbcSMatt Macy 		for (j = 0; j < missing[i]; j++) {
1158eda14cbcSMatt Macy 			ASSERT0(rows[i][j]);
1159eda14cbcSMatt Macy 		}
1160eda14cbcSMatt Macy 		ASSERT3U(rows[i][missing[i]], !=, 0);
1161eda14cbcSMatt Macy 
1162eda14cbcSMatt Macy 		/*
1163eda14cbcSMatt Macy 		 * Compute the inverse of the first element and multiply each
1164eda14cbcSMatt Macy 		 * element in the row by that value.
1165eda14cbcSMatt Macy 		 */
1166eda14cbcSMatt Macy 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1167eda14cbcSMatt Macy 
1168eda14cbcSMatt Macy 		for (j = 0; j < n; j++) {
1169eda14cbcSMatt Macy 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1170eda14cbcSMatt Macy 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1171eda14cbcSMatt Macy 		}
1172eda14cbcSMatt Macy 
1173eda14cbcSMatt Macy 		for (ii = 0; ii < nmissing; ii++) {
1174eda14cbcSMatt Macy 			if (i == ii)
1175eda14cbcSMatt Macy 				continue;
1176eda14cbcSMatt Macy 
1177eda14cbcSMatt Macy 			ASSERT3U(rows[ii][missing[i]], !=, 0);
1178eda14cbcSMatt Macy 
1179eda14cbcSMatt Macy 			log = vdev_raidz_log2[rows[ii][missing[i]]];
1180eda14cbcSMatt Macy 
1181eda14cbcSMatt Macy 			for (j = 0; j < n; j++) {
1182eda14cbcSMatt Macy 				rows[ii][j] ^=
1183eda14cbcSMatt Macy 				    vdev_raidz_exp2(rows[i][j], log);
1184eda14cbcSMatt Macy 				invrows[ii][j] ^=
1185eda14cbcSMatt Macy 				    vdev_raidz_exp2(invrows[i][j], log);
1186eda14cbcSMatt Macy 			}
1187eda14cbcSMatt Macy 		}
1188eda14cbcSMatt Macy 	}
1189eda14cbcSMatt Macy 
1190eda14cbcSMatt Macy 	/*
1191eda14cbcSMatt Macy 	 * Verify that the data that is left in the rows are properly part of
1192eda14cbcSMatt Macy 	 * an identity matrix.
1193eda14cbcSMatt Macy 	 */
1194eda14cbcSMatt Macy 	for (i = 0; i < nmissing; i++) {
1195eda14cbcSMatt Macy 		for (j = 0; j < n; j++) {
1196eda14cbcSMatt Macy 			if (j == missing[i]) {
1197eda14cbcSMatt Macy 				ASSERT3U(rows[i][j], ==, 1);
1198eda14cbcSMatt Macy 			} else {
1199eda14cbcSMatt Macy 				ASSERT0(rows[i][j]);
1200eda14cbcSMatt Macy 			}
1201eda14cbcSMatt Macy 		}
1202eda14cbcSMatt Macy 	}
1203eda14cbcSMatt Macy }
1204eda14cbcSMatt Macy 
/*
 * Multiply the surviving columns through the inverted matrix to regenerate
 * the missing data columns in place.
 *
 * rr       - the row whose columns are being rebuilt
 * n        - number of data columns in the row
 * nmissing - number of missing data columns
 * missing  - indices (relative to rr_firstdatacol) of the missing columns
 * invrows  - the nmissing x n inverted rows produced by matrix inversion
 * used     - the n surviving column indices feeding the reconstruction
 */
static void
vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL };
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 };
	uint8_t log = 0;
	uint8_t val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	/* Scratch area holding the log (base 2) of each inverse entry. */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	/* Carve the flat allocation into nmissing rows of n entries each. */
	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	/*
	 * Pre-compute the logs of the inverse matrix entries so the hot
	 * per-byte loop below can do GF(2^8) multiplication as a log add
	 * plus one pow2 lookup.  Entries must be non-zero for log2[] to
	 * be defined.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	/* Accumulate each surviving column's contribution into dst[]. */
	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rr->rr_cols);

		ccount = rr->rr_col[c].rc_size;
		ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0);
		if (ccount == 0)
			continue;
		src = abd_to_buf(rr->rr_col[c].rc_abd);
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rr->rr_firstdatacol;
			ASSERT3U(cc, >=, rr->rr_firstdatacol);
			ASSERT3U(cc, <, rr->rr_cols);
			ASSERT3U(cc, !=, c);

			dcount[j] = rr->rr_col[cc].rc_size;
			if (dcount[j] != 0)
				dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd);
		}

		for (x = 0; x < ccount; x++, src++) {
			/* log2[0] is undefined; only update for non-zero. */
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				/* Shorter destination columns end early. */
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					/* GF multiply: add logs mod 255. */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/*
				 * First contribution initializes the byte;
				 * later ones XOR (GF addition) into it.
				 */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}
1282eda14cbcSMatt Macy 
/*
 * General-case reconstruction: rebuild any combination of up to nparity
 * missing columns by building, inverting, and applying a matrix over
 * GF(2^8).  Used when the optimized P/Q/PQ paths don't apply.
 *
 * rr    - the row to reconstruct
 * tgts  - sorted array of the column indices to reconstruct
 * ntgts - number of entries in tgts
 */
static void
vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;
	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	abd_t **bufs = NULL;

	/*
	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
	 * temporary linear ABDs if any non-linear ABDs are found.
	 */
	for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) {
		if (!abd_is_linear(rr->rr_col[i].rc_abd)) {
			bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *),
			    KM_PUSHPAGE);

			/* Stash originals and substitute linear copies. */
			for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
				raidz_col_t *col = &rr->rr_col[c];

				bufs[c] = col->rc_abd;
				if (bufs[c] != NULL) {
					col->rc_abd = abd_alloc_linear(
					    col->rc_size, B_TRUE);
					abd_copy(col->rc_abd, bufs[c],
					    col->rc_size);
				}
			}

			break;
		}
	}

	/* Number of data columns in the row. */
	n = rr->rr_cols - rr->rr_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rr->rr_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rr->rr_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rr->rr_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		parity_map[i] = c;
		i++;
	}

	/* One flat allocation holds rows, invrows, and the used[] vector. */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	/* used[] starts with the chosen parity columns... */
	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	/* ...followed by every surviving (non-missing) data column. */
	for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rr->rr_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	/*
	 * copy back from temporary linear abds and free them
	 */
	if (bufs) {
		for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
			raidz_col_t *col = &rr->rr_col[c];

			if (bufs[c] != NULL) {
				abd_copy(bufs[c], col->rc_abd, col->rc_size);
				abd_free(col->rc_abd);
			}
			col->rc_abd = bufs[c];
		}
		kmem_free(bufs, rr->rr_cols * sizeof (abd_t *));
	}
}
1419eda14cbcSMatt Macy 
/*
 * Reconstruct the requested columns of a single row, combining the
 * caller-specified targets t[0..nt-1] with any columns that already
 * recorded errors.  Dispatches to the vectorized math implementation
 * first, then to the optimized P/Q/PQ routines, and finally to the
 * general matrix path.
 */
static void
vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
    const int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c, ret;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * Start by assuming all parity and data columns are bad, then
	 * subtract as good columns are found in the scan below.
	 */
	nbadparity = rr->rr_firstdatacol;
	nbaddata = rr->rr_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rr->rr_cols; c++) {
		if (c < rr->rr_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			/* Explicitly targeted by the caller. */
			tgts[ntgts++] = c;
			i++;
		} else if (rr->rr_col[c].rc_error != 0) {
			/* Already known bad; reconstruct it too. */
			tgts[ntgts++] = c;
		} else if (c >= rr->rr_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	/* Data targets follow the parity targets in the sorted tgts[]. */
	dt = &tgts[nbadparity];

	/* Reconstruct using the new math implementation */
	ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata);
	if (ret != RAIDZ_ORIGINAL_IMPL)
		return;

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	switch (nbaddata) {
	case 1:
		if (parity_valid[VDEV_RAIDZ_P]) {
			vdev_raidz_reconstruct_p(rr, dt, 1);
			return;
		}

		ASSERT(rr->rr_firstdatacol > 1);

		if (parity_valid[VDEV_RAIDZ_Q]) {
			vdev_raidz_reconstruct_q(rr, dt, 1);
			return;
		}

		ASSERT(rr->rr_firstdatacol > 2);
		break;

	case 2:
		ASSERT(rr->rr_firstdatacol > 1);

		if (parity_valid[VDEV_RAIDZ_P] &&
		    parity_valid[VDEV_RAIDZ_Q]) {
			vdev_raidz_reconstruct_pq(rr, dt, 2);
			return;
		}

		ASSERT(rr->rr_firstdatacol > 2);

		break;
	}

	/* Fall back to the general matrix-based reconstruction. */
	vdev_raidz_reconstruct_general(rr, tgts, ntgts);
}
1497eda14cbcSMatt Macy 
/*
 * Open a RAID-Z vdev: open all children and derive the aggregate asize,
 * max_asize, and ashift values from them.  Succeeds as long as no more
 * than nparity children fail to open.
 */
static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	vdev_raidz_t *vdrz = vd->vdev_tsd;
	uint64_t nparity = vdrz->vd_nparity;
	int c;
	int lasterror = 0;
	int numerrors = 0;

	ASSERT(nparity > 0);

	/* A raidz vdev needs at least nparity + 1 children to be usable. */
	if (nparity > VDEV_RAIDZ_MAXPARITY ||
	    vd->vdev_children < nparity + 1) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	vdev_open_children(vd);

	/*
	 * First pass: size the vdev by the smallest healthy child and
	 * take the largest logical ashift.
	 */
	for (c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error != 0) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}

		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
	}
	/*
	 * Second pass: pick the physical ashift, which depends on the
	 * final logical ashift computed above.
	 */
	for (c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error != 0)
			continue;
		*physical_ashift = vdev_best_ashift(*logical_ashift,
		    *physical_ashift, cvd->vdev_physical_ashift);
	}

	*asize *= vd->vdev_children;
	*max_asize *= vd->vdev_children;

	/* More failed children than parity means the vdev can't operate. */
	if (numerrors > nparity) {
		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}
1550eda14cbcSMatt Macy 
1551eda14cbcSMatt Macy static void
1552eda14cbcSMatt Macy vdev_raidz_close(vdev_t *vd)
1553eda14cbcSMatt Macy {
15547877fdebSMatt Macy 	for (int c = 0; c < vd->vdev_children; c++) {
15557877fdebSMatt Macy 		if (vd->vdev_child[c] != NULL)
1556eda14cbcSMatt Macy 			vdev_close(vd->vdev_child[c]);
1557eda14cbcSMatt Macy 	}
15587877fdebSMatt Macy }
1559eda14cbcSMatt Macy 
1560eda14cbcSMatt Macy static uint64_t
1561eda14cbcSMatt Macy vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1562eda14cbcSMatt Macy {
15637877fdebSMatt Macy 	vdev_raidz_t *vdrz = vd->vdev_tsd;
1564eda14cbcSMatt Macy 	uint64_t asize;
1565eda14cbcSMatt Macy 	uint64_t ashift = vd->vdev_top->vdev_ashift;
15667877fdebSMatt Macy 	uint64_t cols = vdrz->vd_logical_width;
15677877fdebSMatt Macy 	uint64_t nparity = vdrz->vd_nparity;
1568eda14cbcSMatt Macy 
1569eda14cbcSMatt Macy 	asize = ((psize - 1) >> ashift) + 1;
1570eda14cbcSMatt Macy 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1571eda14cbcSMatt Macy 	asize = roundup(asize, nparity + 1) << ashift;
1572eda14cbcSMatt Macy 
1573eda14cbcSMatt Macy 	return (asize);
1574eda14cbcSMatt Macy }
1575eda14cbcSMatt Macy 
15767877fdebSMatt Macy /*
15777877fdebSMatt Macy  * The allocatable space for a raidz vdev is N * sizeof(smallest child)
15787877fdebSMatt Macy  * so each child must provide at least 1/Nth of its asize.
15797877fdebSMatt Macy  */
15807877fdebSMatt Macy static uint64_t
15817877fdebSMatt Macy vdev_raidz_min_asize(vdev_t *vd)
15827877fdebSMatt Macy {
15837877fdebSMatt Macy 	return ((vd->vdev_min_asize + vd->vdev_children - 1) /
15847877fdebSMatt Macy 	    vd->vdev_children);
15857877fdebSMatt Macy }
15867877fdebSMatt Macy 
15877877fdebSMatt Macy void
1588eda14cbcSMatt Macy vdev_raidz_child_done(zio_t *zio)
1589eda14cbcSMatt Macy {
1590eda14cbcSMatt Macy 	raidz_col_t *rc = zio->io_private;
1591eda14cbcSMatt Macy 
159281b22a98SMartin Matuska 	ASSERT3P(rc->rc_abd, !=, NULL);
1593eda14cbcSMatt Macy 	rc->rc_error = zio->io_error;
1594eda14cbcSMatt Macy 	rc->rc_tried = 1;
1595eda14cbcSMatt Macy 	rc->rc_skipped = 0;
1596eda14cbcSMatt Macy }
1597eda14cbcSMatt Macy 
/*
 * Debug-only sanity check: translate the row's logical range through the
 * child vdev and assert that it matches the offset/size recorded in the
 * column.  Compiled out unless ZFS_DEBUG is defined.
 */
static void
vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col)
{
#ifdef ZFS_DEBUG
	vdev_t *tvd = vd->vdev_top;

	range_seg64_t logical_rs, physical_rs, remain_rs;
	logical_rs.rs_start = rr->rr_offset;
	logical_rs.rs_end = logical_rs.rs_start +
	    vdev_raidz_asize(vd, rr->rr_size);

	raidz_col_t *rc = &rr->rr_col[col];
	vdev_t *cvd = vd->vdev_child[rc->rc_devidx];

	vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
	/* The whole logical range must translate in a single segment. */
	ASSERT(vdev_xlate_is_empty(&remain_rs));
	ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
	ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
	/*
	 * It would be nice to assert that rs_end is equal
	 * to rc_offset + rc_size but there might be an
	 * optional I/O at the end that is not accounted in
	 * rc_size.
	 */
	if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
		    rc->rc_size + (1 << tvd->vdev_ashift));
	} else {
		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
	}
#endif
}
1630eda14cbcSMatt Macy 
/*
 * Issue the child writes for one row: generate the parity columns, then
 * write every column (including all rr_scols allocated columns, not just
 * those holding data) to its child vdev.
 */
static void
vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift)
{
	vdev_t *vd = zio->io_vd;
	raidz_map_t *rm = zio->io_vsd;

	vdev_raidz_generate_parity_row(rm, rr);

	for (int c = 0; c < rr->rr_scols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];

		/* Verify physical to logical translation */
		vdev_raidz_io_verify(vd, rr, c);

		if (rc->rc_size > 0) {
			/* A real data or parity column; write its buffer. */
			ASSERT3P(rc->rc_abd, !=, NULL);
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd,
			    abd_get_size(rc->rc_abd), zio->io_type,
			    zio->io_priority, 0, vdev_raidz_child_done, rc));
		} else {
			/*
			 * Generate optional write for skip sector to improve
			 * aggregation contiguity.
			 */
			ASSERT3P(rc->rc_abd, ==, NULL);
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, NULL, 1ULL << ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL,
			    NULL));
		}
	}
}
16667877fdebSMatt Macy 
/*
 * Issue the child reads for one row.  Unreadable or stale children are
 * marked with an error instead of being read; parity columns are only
 * read when data is missing or the I/O is a scrub/resilver.
 */
static void
vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr)
{
	vdev_t *vd = zio->io_vd;

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (int c = rr->rr_cols - 1; c >= 0; c--) {
		raidz_col_t *rc = &rr->rr_col[c];
		if (rc->rc_size == 0)
			continue;
		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
		if (!vdev_readable(cvd)) {
			/* Child can't be read at all; count it as missing. */
			if (c >= rr->rr_firstdatacol)
				rr->rr_missingdata++;
			else
				rr->rr_missingparity++;
			rc->rc_error = SET_ERROR(ENXIO);
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			/* Child's DTL says this txg may be missing on it. */
			if (c >= rr->rr_firstdatacol)
				rr->rr_missingdata++;
			else
				rr->rr_missingparity++;
			rc->rc_error = SET_ERROR(ESTALE);
			rc->rc_skipped = 1;
			continue;
		}
		/*
		 * Read data columns always; parity columns only when data
		 * is already known missing or we're scrubbing/resilvering.
		 */
		if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}
}
17097877fdebSMatt Macy 
/*
 * Start an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Generate the parity data
 *   2. Create child zio write operations to each column's vdev, for both
 *      data and parity.
 *   3. If the column skips any sectors for padding, create optional dummy
 *      write zio children for those areas to improve aggregation continuity.
 * - For read operations:
 *   1. Create child zio read operations to each data column's vdev to read
 *      the range of data required for zio.
 *   2. If this is a scrub or resilver operation, or if any of the data
 *      vdevs have had errors, then create zio read operations to the parity
 *      columns' VDevs as well.
 */
static void
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_raidz_t *vdrz = vd->vdev_tsd;

	/* Map the logical I/O onto the columns of this raidz geometry. */
	raidz_map_t *rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift,
	    vdrz->vd_logical_width, vdrz->vd_nparity);
	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;

	/*
	 * Until raidz expansion is implemented all maps for a raidz vdev
	 * contain a single row.
	 */
	ASSERT3U(rm->rm_nrows, ==, 1);
	raidz_row_t *rr = rm->rm_row[0];

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_io_start_write(zio, rr, tvd->vdev_ashift);
	} else {
		ASSERT(zio->io_type == ZIO_TYPE_READ);
		vdev_raidz_io_start_read(zio, rr);
	}

	zio_execute(zio);
}
1755eda14cbcSMatt Macy 
/*
 * Report a checksum error for a child of a RAID-Z device.
 *
 * Bumps the child's checksum-error count and posts an ereport carrying
 * both the data we read (rc_abd) and the expected data (bad_data).
 */
void
vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
{
	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];

	/* Speculative and rebuild I/Os don't generate error reports. */
	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) &&
	    zio->io_priority != ZIO_PRIORITY_REBUILD) {
		zio_bad_cksum_t zbc;
		raidz_map_t *rm = zio->io_vsd;

		zbc.zbc_has_cksum = 0;
		zbc.zbc_injected = rm->rm_ecksuminjected;

		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_checksum_errors++;
		mutex_exit(&vd->vdev_stat_lock);
		(void) zfs_ereport_post_checksum(zio->io_spa, vd,
		    &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
		    rc->rc_abd, bad_data, &zbc);
	}
}
1780eda14cbcSMatt Macy 
1781eda14cbcSMatt Macy /*
1782eda14cbcSMatt Macy  * We keep track of whether or not there were any injected errors, so that
1783eda14cbcSMatt Macy  * any ereports we generate can note it.
1784eda14cbcSMatt Macy  */
1785eda14cbcSMatt Macy static int
1786eda14cbcSMatt Macy raidz_checksum_verify(zio_t *zio)
1787eda14cbcSMatt Macy {
1788da5137abSMartin Matuska 	zio_bad_cksum_t zbc = {{{0}}};
1789eda14cbcSMatt Macy 	raidz_map_t *rm = zio->io_vsd;
1790eda14cbcSMatt Macy 
1791eda14cbcSMatt Macy 	int ret = zio_checksum_error(zio, &zbc);
1792eda14cbcSMatt Macy 	if (ret != 0 && zbc.zbc_injected != 0)
1793eda14cbcSMatt Macy 		rm->rm_ecksuminjected = 1;
1794eda14cbcSMatt Macy 
1795eda14cbcSMatt Macy 	return (ret);
1796eda14cbcSMatt Macy }
1797eda14cbcSMatt Macy 
/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
{
	abd_t *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;

	blkptr_t *bp = zio->io_bp;
	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));

	/* Nothing to verify when the block carries no parity checksum. */
	if (checksum == ZIO_CHECKSUM_NOPARITY)
		return (ret);

	/*
	 * Save each successfully-read parity buffer aside and give the
	 * column a fresh buffer to regenerate parity into.  orig[c] is
	 * only populated for columns that pass the tried/error filter,
	 * and the same filter guards its use below.
	 */
	for (c = 0; c < rr->rr_firstdatacol; c++) {
		rc = &rr->rr_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;

		orig[c] = rc->rc_abd;
		ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size);
		rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
	}

	/*
	 * Verify any empty sectors are zero filled to ensure the parity
	 * is calculated correctly even if these non-data sectors are damaged.
	 */
	if (rr->rr_nempty && rr->rr_abd_empty != NULL)
		ret += vdev_draid_map_verify_empty(zio, rr);

	/*
	 * Regenerates parity even for !tried||rc_error!=0 columns.  This
	 * isn't harmful but it does have the side effect of fixing stuff
	 * we didn't realize was necessary (i.e. even if we return 0).
	 */
	vdev_raidz_generate_parity_row(rm, rr);

	/* Compare regenerated parity against what was read from disk. */
	for (c = 0; c < rr->rr_firstdatacol; c++) {
		rc = &rr->rr_col[c];

		if (!rc->rc_tried || rc->rc_error != 0)
			continue;

		if (abd_cmp(orig[c], rc->rc_abd) != 0) {
			vdev_raidz_checksum_error(zio, rc, orig[c]);
			rc->rc_error = SET_ERROR(ECKSUM);
			ret++;
		}
		abd_free(orig[c]);
	}

	return (ret);
}
1859eda14cbcSMatt Macy 
1860eda14cbcSMatt Macy static int
18617877fdebSMatt Macy vdev_raidz_worst_error(raidz_row_t *rr)
1862eda14cbcSMatt Macy {
1863eda14cbcSMatt Macy 	int error = 0;
1864eda14cbcSMatt Macy 
18657877fdebSMatt Macy 	for (int c = 0; c < rr->rr_cols; c++)
18667877fdebSMatt Macy 		error = zio_worst_error(error, rr->rr_col[c].rc_error);
1867eda14cbcSMatt Macy 
1868eda14cbcSMatt Macy 	return (error);
1869eda14cbcSMatt Macy }
1870eda14cbcSMatt Macy 
/*
 * Post-verification pass for a READ row whose block checksum matched
 * (or which is being resilvered): tally the per-column errors, verify
 * any parity columns that were read but not needed for reconstruction,
 * and issue asynchronous repair writes to damaged children when the
 * pool is writeable.
 */
static void
vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
{
	int unexpected_errors = 0;	/* errors on columns not rc_skipped */
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;

	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);

	/* Classify each column: parity vs. data error, untried parity. */
	for (int c = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		if (rc->rc_error) {
			if (c < rr->rr_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;
		} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}

		/* A column flagged for forced repair counts as unexpected. */
		if (rc->rc_force_repair)
			unexpected_errors++;
	}

	/*
	 * If we read more parity disks than were used for
	 * reconstruction, confirm that the other parity disks produced
	 * correct data.
	 *
	 * Note that we also regenerate parity when resilvering so we
	 * can write it out to failed devices later.
	 */
	if (parity_errors + parity_untried <
	    rr->rr_firstdatacol - data_errors ||
	    (zio->io_flags & ZIO_FLAG_RESILVER)) {
		int n = raidz_parity_verify(zio, rr);
		unexpected_errors += n;
	}

	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (int c = 0; c < rr->rr_cols; c++) {
			raidz_col_t *rc = &rr->rr_col[c];
			vdev_t *vd = zio->io_vd;
			vdev_t *cvd = vd->vdev_child[rc->rc_devidx];

			/*
			 * Skip columns where repair is disallowed, and,
			 * unless a repair is being forced, columns that
			 * are error-free or empty.
			 */
			if (!rc->rc_allow_repair) {
				continue;
			} else if (!rc->rc_force_repair &&
			    (rc->rc_error == 0 || rc->rc_size == 0)) {
				continue;
			}

			/*
			 * Fire-and-forget repair write; preserve REBUILD
			 * priority so sequential resilver I/O stays ordered.
			 */
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    ZIO_TYPE_WRITE,
			    zio->io_priority == ZIO_PRIORITY_REBUILD ?
			    ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}
19427877fdebSMatt Macy 
19437877fdebSMatt Macy static void
19447877fdebSMatt Macy raidz_restore_orig_data(raidz_map_t *rm)
19457877fdebSMatt Macy {
19467877fdebSMatt Macy 	for (int i = 0; i < rm->rm_nrows; i++) {
19477877fdebSMatt Macy 		raidz_row_t *rr = rm->rm_row[i];
19487877fdebSMatt Macy 		for (int c = 0; c < rr->rr_cols; c++) {
19497877fdebSMatt Macy 			raidz_col_t *rc = &rr->rr_col[c];
19507877fdebSMatt Macy 			if (rc->rc_need_orig_restore) {
1951f9693befSMartin Matuska 				abd_copy(rc->rc_abd,
19527877fdebSMatt Macy 				    rc->rc_orig_data, rc->rc_size);
19537877fdebSMatt Macy 				rc->rc_need_orig_restore = B_FALSE;
19547877fdebSMatt Macy 			}
19557877fdebSMatt Macy 		}
19567877fdebSMatt Macy 	}
19577877fdebSMatt Macy }
19587877fdebSMatt Macy 
/*
 * Attempt reconstruction of this block assuming the children listed in
 * ltgts[0..ntgts-1] (logical child IDs) returned bad data, then verify
 * the result against the block's checksum.
 *
 * returns EINVAL if reconstruction of the block will not be possible
 * returns ECKSUM if this specific reconstruction failed
 * returns 0 on successful reconstruction
 */
static int
raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
{
	raidz_map_t *rm = zio->io_vsd;

	/* Reconstruct each row */
	for (int r = 0; r < rm->rm_nrows; r++) {
		raidz_row_t *rr = rm->rm_row[r];
		int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */
		int t = 0;
		int dead = 0;
		int dead_data = 0;

		/*
		 * Count columns that are dead (known errors, plus any
		 * column belonging to a targeted child) and remember the
		 * targeted columns in my_tgts[].
		 */
		for (int c = 0; c < rr->rr_cols; c++) {
			raidz_col_t *rc = &rr->rr_col[c];
			ASSERT0(rc->rc_need_orig_restore);
			if (rc->rc_error != 0) {
				dead++;
				if (c >= nparity)
					dead_data++;
				continue;
			}
			if (rc->rc_size == 0)
				continue;
			for (int lt = 0; lt < ntgts; lt++) {
				if (rc->rc_devidx == ltgts[lt]) {
					/*
					 * Save the original contents so they
					 * can be restored if this attempt
					 * fails (see
					 * raidz_restore_orig_data()).
					 */
					if (rc->rc_orig_data == NULL) {
						rc->rc_orig_data =
						    abd_alloc_linear(
						    rc->rc_size, B_TRUE);
						abd_copy(rc->rc_orig_data,
						    rc->rc_abd, rc->rc_size);
					}
					rc->rc_need_orig_restore = B_TRUE;

					dead++;
					if (c >= nparity)
						dead_data++;
					my_tgts[t++] = c;
					break;
				}
			}
		}
		if (dead > nparity) {
			/* reconstruction not possible */
			raidz_restore_orig_data(rm);
			return (EINVAL);
		}
		if (dead_data > 0)
			vdev_raidz_reconstruct_row(rm, rr, my_tgts, t);
	}

	/* Check for success */
	if (raidz_checksum_verify(zio) == 0) {

		/* Reconstruction succeeded - report errors */
		for (int i = 0; i < rm->rm_nrows; i++) {
			raidz_row_t *rr = rm->rm_row[i];

			for (int c = 0; c < rr->rr_cols; c++) {
				raidz_col_t *rc = &rr->rr_col[c];
				if (rc->rc_need_orig_restore) {
					/*
					 * Note: if this is a parity column,
					 * we don't really know if it's wrong.
					 * We need to let
					 * vdev_raidz_io_done_verified() check
					 * it, and if we set rc_error, it will
					 * think that it is a "known" error
					 * that doesn't need to be checked
					 * or corrected.
					 */
					if (rc->rc_error == 0 &&
					    c >= rr->rr_firstdatacol) {
						vdev_raidz_checksum_error(zio,
						    rc, rc->rc_orig_data);
						rc->rc_error =
						    SET_ERROR(ECKSUM);
					}
					rc->rc_need_orig_restore = B_FALSE;
				}
			}

			vdev_raidz_io_done_verified(zio, rr);
		}

		zio_checksum_verified(zio);

		return (0);
	}

	/* Reconstruction failed - restore original data */
	raidz_restore_orig_data(rm);
	return (ECKSUM);
}
20597877fdebSMatt Macy 
/*
 * Iterate over all combinations of N bad vdevs and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 *
 * The order that we find the various possible combinations of failed
 * disks is dictated by these rules:
 * - Examine each "slot" (the "i" in tgts[i])
 *   - Try to increment this slot (tgts[i] = tgts[i] + 1)
 *   - if we can't increment because it runs into the next slot,
 *     reset our slot to the minimum, and examine the next slot
 *
 *  For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
 *  3 columns to reconstruct), we will generate the following sequence:
 *
 *  STATE        ACTION
 *  0 1 2        special case: skip since these are all parity
 *  0 1   3      first slot: reset to 0; middle slot: increment to 2
 *  0   2 3      first slot: increment to 1
 *    1 2 3      first: reset to 0; middle: reset to 1; last: increment to 4
 *  0 1     4    first: reset to 0; middle: increment to 2
 *  0   2   4    first: increment to 1
 *    1 2   4    first: reset to 0; middle: increment to 3
 *  0     3 4    first: increment to 1
 *    1   3 4    first: increment to 2
 *      2 3 4    first: reset to 0; middle: reset to 1; last: increment to 5
 *  0 1       5  first: reset to 0; middle: increment to 2
 *  0   2     5  first: increment to 1
 *    1 2     5  first: reset to 0; middle: increment to 3
 *  0     3   5  first: increment to 1
 *    1   3   5  first: increment to 2
 *      2 3   5  first: reset to 0; middle: increment to 4
 *  0       4 5  first: increment to 1
 *    1     4 5  first: increment to 2
 *      2   4 5  first: increment to 3
 *        3 4 5  done
 *
 * This strategy works for dRAID but is less efficient when there are a large
 * number of child vdevs and therefore permutations to check. Furthermore,
 * since the raidz_map_t rows likely do not overlap reconstruction would be
 * possible as long as there are no more than nparity data errors per row.
 * These additional permutations are not currently checked but could be as
 * a future improvement.
 */
static int
vdev_raidz_combrec(zio_t *zio)
{
	int nparity = vdev_get_nparity(zio->io_vd);
	raidz_map_t *rm = zio->io_vsd;

	/* Check if there's enough data to attempt reconstruction. */
	for (int i = 0; i < rm->rm_nrows; i++) {
		raidz_row_t *rr = rm->rm_row[i];
		int total_errors = 0;

		for (int c = 0; c < rr->rr_cols; c++) {
			if (rr->rr_col[c].rc_error)
				total_errors++;
		}

		/* More errors than parity in any row: unrecoverable. */
		if (total_errors > nparity)
			return (vdev_raidz_worst_error(rr));
	}

	for (int num_failures = 1; num_failures <= nparity; num_failures++) {
		int tstore[VDEV_RAIDZ_MAXPARITY + 2];
		int *ltgts = &tstore[1]; /* value is logical child ID */

		/* Determine number of logical children, n */
		int n = zio->io_vd->vdev_children;

		ASSERT3U(num_failures, <=, nparity);
		ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);

		/*
		 * Handle corner cases in combrec logic: the sentinel at
		 * ltgts[-1] lets slot 0 reset via ltgts[t - 1] + 1, and
		 * the sentinel at ltgts[num_failures] (== n) bounds the
		 * last slot's increment.
		 */
		ltgts[-1] = -1;
		for (int i = 0; i < num_failures; i++) {
			ltgts[i] = i;
		}
		ltgts[num_failures] = n;

		for (;;) {
			int err = raidz_reconstruct(zio, ltgts, num_failures,
			    nparity);
			if (err == EINVAL) {
				/*
				 * Reconstruction not possible with this #
				 * failures; try more failures.
				 */
				break;
			} else if (err == 0)
				return (0);

			/* Compute next targets to try */
			for (int t = 0; ; t++) {
				ASSERT3U(t, <, num_failures);
				ltgts[t]++;
				if (ltgts[t] == n) {
					/* try more failures */
					ASSERT3U(t, ==, num_failures - 1);
					break;
				}

				ASSERT3U(ltgts[t], <, n);
				ASSERT3U(ltgts[t], <=, ltgts[t + 1]);

				/*
				 * If that spot is available, we're done here.
				 * Try the next combination.
				 */
				if (ltgts[t] != ltgts[t + 1])
					break;

				/*
				 * Otherwise, reset this tgt to the minimum,
				 * and move on to the next tgt.
				 */
				ltgts[t] = ltgts[t - 1] + 1;
				ASSERT3U(ltgts[t], ==, t);
			}

			/* Increase the number of failures and keep trying. */
			if (ltgts[num_failures - 1] == n)
				break;
		}
	}

	return (ECKSUM);
}
21927877fdebSMatt Macy 
21937877fdebSMatt Macy void
21947877fdebSMatt Macy vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
21957877fdebSMatt Macy {
21967877fdebSMatt Macy 	for (uint64_t row = 0; row < rm->rm_nrows; row++) {
21977877fdebSMatt Macy 		raidz_row_t *rr = rm->rm_row[row];
21987877fdebSMatt Macy 		vdev_raidz_reconstruct_row(rm, rr, t, nt);
21997877fdebSMatt Macy 	}
22007877fdebSMatt Macy }
22017877fdebSMatt Macy 
22027877fdebSMatt Macy /*
22037877fdebSMatt Macy  * Complete a write IO operation on a RAIDZ VDev
22047877fdebSMatt Macy  *
22057877fdebSMatt Macy  * Outline:
22067877fdebSMatt Macy  *   1. Check for errors on the child IOs.
22077877fdebSMatt Macy  *   2. Return, setting an error code if too few child VDevs were written
22087877fdebSMatt Macy  *      to reconstruct the data later.  Note that partial writes are
22097877fdebSMatt Macy  *      considered successful if they can be reconstructed at all.
22107877fdebSMatt Macy  */
22117877fdebSMatt Macy static void
22127877fdebSMatt Macy vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
22137877fdebSMatt Macy {
22147877fdebSMatt Macy 	int total_errors = 0;
22157877fdebSMatt Macy 
22167877fdebSMatt Macy 	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
22177877fdebSMatt Macy 	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
22187877fdebSMatt Macy 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
22197877fdebSMatt Macy 
22207877fdebSMatt Macy 	for (int c = 0; c < rr->rr_cols; c++) {
22217877fdebSMatt Macy 		raidz_col_t *rc = &rr->rr_col[c];
22227877fdebSMatt Macy 
22237877fdebSMatt Macy 		if (rc->rc_error) {
22247877fdebSMatt Macy 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
22257877fdebSMatt Macy 
22267877fdebSMatt Macy 			total_errors++;
22277877fdebSMatt Macy 		}
22287877fdebSMatt Macy 	}
22297877fdebSMatt Macy 
22307877fdebSMatt Macy 	/*
22317877fdebSMatt Macy 	 * Treat partial writes as a success. If we couldn't write enough
22327877fdebSMatt Macy 	 * columns to reconstruct the data, the I/O failed.  Otherwise,
22337877fdebSMatt Macy 	 * good enough.
2234eda14cbcSMatt Macy 	 *
2235eda14cbcSMatt Macy 	 * Now that we support write reallocation, it would be better
2236eda14cbcSMatt Macy 	 * to treat partial failure as real failure unless there are
2237eda14cbcSMatt Macy 	 * no non-degraded top-level vdevs left, and not update DTLs
2238eda14cbcSMatt Macy 	 * if we intend to reallocate.
2239eda14cbcSMatt Macy 	 */
22407877fdebSMatt Macy 	if (total_errors > rr->rr_firstdatacol) {
22417877fdebSMatt Macy 		zio->io_error = zio_worst_error(zio->io_error,
22427877fdebSMatt Macy 		    vdev_raidz_worst_error(rr));
22437877fdebSMatt Macy 	}
2244eda14cbcSMatt Macy }
2245eda14cbcSMatt Macy 
/*
 * First-pass READ completion: reconstruct data columns whose errors are
 * already known (reported by child I/Os), when the number of errors is
 * within what the parity read can correct.
 */
static void
vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
    raidz_row_t *rr)
{
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;

	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);

	for (int c = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		/*
		 * If scrubbing and a replacing/sparing child vdev determined
		 * that not all of its children have an identical copy of the
		 * data, then clear the error so the column is treated like
		 * any other read and force a repair to correct the damage.
		 */
		if (rc->rc_error == ECKSUM) {
			ASSERT(zio->io_flags & ZIO_FLAG_SCRUB);
			vdev_raidz_checksum_error(zio, rc, rc->rc_abd);
			rc->rc_force_repair = 1;
			rc->rc_error = 0;
		}

		if (rc->rc_error) {
			if (c < rr->rr_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			total_errors++;
		} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * If there were data errors and the number of errors we saw was
	 * correctable -- less than or equal to the number of parity disks read
	 * -- reconstruct based on the missing data.
	 */
	if (data_errors != 0 &&
	    total_errors <= rr->rr_firstdatacol - parity_untried) {
		/*
		 * We either attempt to read all the parity columns or
		 * none of them. If we didn't try to read parity, we
		 * wouldn't be here in the correctable case. There must
		 * also have been fewer parity errors than parity
		 * columns or, again, we wouldn't be in this code path.
		 */
		ASSERT(parity_untried == 0);
		ASSERT(parity_errors < rr->rr_firstdatacol);

		/*
		 * Identify the data columns that reported an error.
		 */
		int n = 0;
		int tgts[VDEV_RAIDZ_MAXPARITY];
		for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
			raidz_col_t *rc = &rr->rr_col[c];
			if (rc->rc_error != 0) {
				ASSERT(n < VDEV_RAIDZ_MAXPARITY);
				tgts[n++] = c;
			}
		}

		ASSERT(rr->rr_firstdatacol >= n);

		vdev_raidz_reconstruct_row(rm, rr, tgts, n);
	}
}
2322eda14cbcSMatt Macy 
/*
 * Issue reads for every untried, non-empty column in the row so that a
 * later combinatorial reconstruction has all data and parity available.
 * Return the number of reads issued.
 */
static int
vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
{
	vdev_t *vd = zio->io_vd;
	int nread = 0;

	rr->rr_missingdata = 0;
	rr->rr_missingparity = 0;

	/*
	 * If this row contains empty sectors which are not required
	 * for a normal read then allocate an ABD for them now so they
	 * may be read, verified, and any needed repairs performed.
	 */
	if (rr->rr_nempty && rr->rr_abd_empty == NULL)
		vdev_draid_map_alloc_empty(zio, rr);

	for (int c = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		if (rc->rc_tried || rc->rc_size == 0)
			continue;

		/* Async child read; vdev_raidz_child_done records status. */
		zio_nowait(zio_vdev_child_io(zio, NULL,
		    vd->vdev_child[rc->rc_devidx],
		    rc->rc_offset, rc->rc_abd, rc->rc_size,
		    zio->io_type, zio->io_priority, 0,
		    vdev_raidz_child_done, rc));
		nread++;
	}
	return (nread);
}
2357eda14cbcSMatt Macy 
/*
 * We're here because either there were too many errors to even attempt
 * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec()
 * failed. In either case, there is enough bad data to prevent reconstruction.
 * Start checksum ereports for all children which haven't failed.
 */
static void
vdev_raidz_io_done_unrecoverable(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	for (int i = 0; i < rm->rm_nrows; i++) {
		raidz_row_t *rr = rm->rm_row[i];

		for (int c = 0; c < rr->rr_cols; c++) {
			raidz_col_t *rc = &rr->rr_col[c];
			vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];

			/* Children with known errors already reported them. */
			if (rc->rc_error != 0)
				continue;

			/* No checksum available at this level. */
			zio_bad_cksum_t zbc;
			zbc.zbc_has_cksum = 0;
			zbc.zbc_injected = rm->rm_ecksuminjected;

			/* Bump the child's checksum-error stat, then file. */
			mutex_enter(&cvd->vdev_stat_lock);
			cvd->vdev_stat.vs_checksum_errors++;
			mutex_exit(&cvd->vdev_stat_lock);
			(void) zfs_ereport_start_checksum(zio->io_spa,
			    cvd, &zio->io_bookmark, zio, rc->rc_offset,
			    rc->rc_size, &zbc);
		}
	}
}
2392eda14cbcSMatt Macy 
/*
 * Completion callback for RAIDZ I/O.  Writes are checked for too many
 * child failures; reads attempt known-error reconstruction, checksum
 * verification, a full re-read of all columns, and finally
 * combinatorial reconstruction before declaring the block unrecoverable.
 */
void
vdev_raidz_io_done(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	if (zio->io_type == ZIO_TYPE_WRITE) {
		for (int i = 0; i < rm->rm_nrows; i++) {
			vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
		}
	} else {
		/* Repair columns whose errors were reported by children. */
		for (int i = 0; i < rm->rm_nrows; i++) {
			raidz_row_t *rr = rm->rm_row[i];
			vdev_raidz_io_done_reconstruct_known_missing(zio,
			    rm, rr);
		}

		if (raidz_checksum_verify(zio) == 0) {
			for (int i = 0; i < rm->rm_nrows; i++) {
				raidz_row_t *rr = rm->rm_row[i];
				vdev_raidz_io_done_verified(zio, rr);
			}
			zio_checksum_verified(zio);
		} else {
			/*
			 * A sequential resilver has no checksum which makes
			 * combinatorial reconstruction impossible. This code
			 * path is unreachable since raidz_checksum_verify()
			 * has no checksum to verify and must succeed.
			 */
			ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD);

			/*
			 * This isn't a typical situation -- either we got a
			 * read error or a child silently returned bad data.
			 * Read every block so we can try again with as much
			 * data and parity as we can track down. If we've
			 * already been through once before, all children will
			 * be marked as tried so we'll proceed to combinatorial
			 * reconstruction.
			 */
			int nread = 0;
			for (int i = 0; i < rm->rm_nrows; i++) {
				nread += vdev_raidz_read_all(zio,
				    rm->rm_row[i]);
			}
			if (nread != 0) {
				/*
				 * Normally our stage is VDEV_IO_DONE, but if
				 * we've already called redone(), it will have
				 * changed to VDEV_IO_START, in which case we
				 * don't want to call redone() again.
				 */
				if (zio->io_stage != ZIO_STAGE_VDEV_IO_START)
					zio_vdev_io_redone(zio);
				return;
			}

			/* All columns tried; exhaustively search targets. */
			zio->io_error = vdev_raidz_combrec(zio);
			if (zio->io_error == ECKSUM &&
			    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
				vdev_raidz_io_done_unrecoverable(zio);
			}
		}
	}
}
2458eda14cbcSMatt Macy 
2459eda14cbcSMatt Macy static void
2460eda14cbcSMatt Macy vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2461eda14cbcSMatt Macy {
24627877fdebSMatt Macy 	vdev_raidz_t *vdrz = vd->vdev_tsd;
24637877fdebSMatt Macy 	if (faulted > vdrz->vd_nparity)
2464eda14cbcSMatt Macy 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2465eda14cbcSMatt Macy 		    VDEV_AUX_NO_REPLICAS);
2466eda14cbcSMatt Macy 	else if (degraded + faulted != 0)
2467eda14cbcSMatt Macy 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2468eda14cbcSMatt Macy 	else
2469eda14cbcSMatt Macy 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2470eda14cbcSMatt Macy }
2471eda14cbcSMatt Macy 
/*
 * Determine if any portion of the provided block resides on a child vdev
 * with a dirty DTL and therefore needs to be resilvered.  The function
 * assumes that at least one DTL is dirty which implies that full stripe
 * width blocks must be resilvered.
 */
static boolean_t
vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
    uint64_t phys_birth)
{
	vdev_raidz_t *vdrz = vd->vdev_tsd;
	uint64_t dcols = vd->vdev_children;
	uint64_t nparity = vdrz->vd_nparity;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = DVA_GET_OFFSET(dva) >> ashift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = ((psize - 1) >> ashift) + 1;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;

	/* Unreachable by sequential resilver. */
	ASSERT3U(phys_birth, !=, TXG_UNKNOWN);

	if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
		return (B_FALSE);

	/* Block spans all children, so some dirty child must hold it. */
	if (s + nparity >= dcols)
		return (B_TRUE);

	/* Otherwise check only the children this stripe touches. */
	for (uint64_t c = 0; c < s + nparity; c++) {
		uint64_t devidx = (f + c) % dcols;
		vdev_t *cvd = vd->vdev_child[devidx];

		/*
		 * dsl_scan_need_resilver() already checked vd with
		 * vdev_dtl_contains(). So here just check cvd with
		 * vdev_dtl_empty(), cheaper and a good approximation.
		 */
		if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
			return (B_TRUE);
	}

	return (B_FALSE);
}
2517eda14cbcSMatt Macy 
2518eda14cbcSMatt Macy static void
25197877fdebSMatt Macy vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs,
25207877fdebSMatt Macy     range_seg64_t *physical_rs, range_seg64_t *remain_rs)
2521eda14cbcSMatt Macy {
2522e92ffd9bSMartin Matuska 	(void) remain_rs;
2523e92ffd9bSMartin Matuska 
2524eda14cbcSMatt Macy 	vdev_t *raidvd = cvd->vdev_parent;
2525eda14cbcSMatt Macy 	ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
2526eda14cbcSMatt Macy 
2527eda14cbcSMatt Macy 	uint64_t width = raidvd->vdev_children;
2528eda14cbcSMatt Macy 	uint64_t tgt_col = cvd->vdev_id;
2529eda14cbcSMatt Macy 	uint64_t ashift = raidvd->vdev_top->vdev_ashift;
2530eda14cbcSMatt Macy 
2531eda14cbcSMatt Macy 	/* make sure the offsets are block-aligned */
25327877fdebSMatt Macy 	ASSERT0(logical_rs->rs_start % (1 << ashift));
25337877fdebSMatt Macy 	ASSERT0(logical_rs->rs_end % (1 << ashift));
25347877fdebSMatt Macy 	uint64_t b_start = logical_rs->rs_start >> ashift;
25357877fdebSMatt Macy 	uint64_t b_end = logical_rs->rs_end >> ashift;
2536eda14cbcSMatt Macy 
2537eda14cbcSMatt Macy 	uint64_t start_row = 0;
2538eda14cbcSMatt Macy 	if (b_start > tgt_col) /* avoid underflow */
2539eda14cbcSMatt Macy 		start_row = ((b_start - tgt_col - 1) / width) + 1;
2540eda14cbcSMatt Macy 
2541eda14cbcSMatt Macy 	uint64_t end_row = 0;
2542eda14cbcSMatt Macy 	if (b_end > tgt_col)
2543eda14cbcSMatt Macy 		end_row = ((b_end - tgt_col - 1) / width) + 1;
2544eda14cbcSMatt Macy 
25457877fdebSMatt Macy 	physical_rs->rs_start = start_row << ashift;
25467877fdebSMatt Macy 	physical_rs->rs_end = end_row << ashift;
2547eda14cbcSMatt Macy 
25487877fdebSMatt Macy 	ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start);
25497877fdebSMatt Macy 	ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
25507877fdebSMatt Macy 	    logical_rs->rs_end - logical_rs->rs_start);
25517877fdebSMatt Macy }
25527877fdebSMatt Macy 
25537877fdebSMatt Macy /*
25547877fdebSMatt Macy  * Initialize private RAIDZ specific fields from the nvlist.
25557877fdebSMatt Macy  */
25567877fdebSMatt Macy static int
25577877fdebSMatt Macy vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
25587877fdebSMatt Macy {
25597877fdebSMatt Macy 	vdev_raidz_t *vdrz;
25607877fdebSMatt Macy 	uint64_t nparity;
25617877fdebSMatt Macy 
25627877fdebSMatt Macy 	uint_t children;
25637877fdebSMatt Macy 	nvlist_t **child;
25647877fdebSMatt Macy 	int error = nvlist_lookup_nvlist_array(nv,
25657877fdebSMatt Macy 	    ZPOOL_CONFIG_CHILDREN, &child, &children);
25667877fdebSMatt Macy 	if (error != 0)
25677877fdebSMatt Macy 		return (SET_ERROR(EINVAL));
25687877fdebSMatt Macy 
25697877fdebSMatt Macy 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) {
25707877fdebSMatt Macy 		if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
25717877fdebSMatt Macy 			return (SET_ERROR(EINVAL));
25727877fdebSMatt Macy 
25737877fdebSMatt Macy 		/*
25747877fdebSMatt Macy 		 * Previous versions could only support 1 or 2 parity
25757877fdebSMatt Macy 		 * device.
25767877fdebSMatt Macy 		 */
25777877fdebSMatt Macy 		if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2)
25787877fdebSMatt Macy 			return (SET_ERROR(EINVAL));
25797877fdebSMatt Macy 		else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3)
25807877fdebSMatt Macy 			return (SET_ERROR(EINVAL));
25817877fdebSMatt Macy 	} else {
25827877fdebSMatt Macy 		/*
25837877fdebSMatt Macy 		 * We require the parity to be specified for SPAs that
25847877fdebSMatt Macy 		 * support multiple parity levels.
25857877fdebSMatt Macy 		 */
25867877fdebSMatt Macy 		if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
25877877fdebSMatt Macy 			return (SET_ERROR(EINVAL));
25887877fdebSMatt Macy 
25897877fdebSMatt Macy 		/*
25907877fdebSMatt Macy 		 * Otherwise, we default to 1 parity device for RAID-Z.
25917877fdebSMatt Macy 		 */
25927877fdebSMatt Macy 		nparity = 1;
25937877fdebSMatt Macy 	}
25947877fdebSMatt Macy 
25957877fdebSMatt Macy 	vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
25967877fdebSMatt Macy 	vdrz->vd_logical_width = children;
25977877fdebSMatt Macy 	vdrz->vd_nparity = nparity;
25987877fdebSMatt Macy 
25997877fdebSMatt Macy 	*tsd = vdrz;
26007877fdebSMatt Macy 
26017877fdebSMatt Macy 	return (0);
26027877fdebSMatt Macy }
26037877fdebSMatt Macy 
26047877fdebSMatt Macy static void
26057877fdebSMatt Macy vdev_raidz_fini(vdev_t *vd)
26067877fdebSMatt Macy {
26077877fdebSMatt Macy 	kmem_free(vd->vdev_tsd, sizeof (vdev_raidz_t));
26087877fdebSMatt Macy }
26097877fdebSMatt Macy 
26107877fdebSMatt Macy /*
26117877fdebSMatt Macy  * Add RAIDZ specific fields to the config nvlist.
26127877fdebSMatt Macy  */
26137877fdebSMatt Macy static void
26147877fdebSMatt Macy vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
26157877fdebSMatt Macy {
26167877fdebSMatt Macy 	ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
26177877fdebSMatt Macy 	vdev_raidz_t *vdrz = vd->vdev_tsd;
26187877fdebSMatt Macy 
26197877fdebSMatt Macy 	/*
26207877fdebSMatt Macy 	 * Make sure someone hasn't managed to sneak a fancy new vdev
26217877fdebSMatt Macy 	 * into a crufty old storage pool.
26227877fdebSMatt Macy 	 */
26237877fdebSMatt Macy 	ASSERT(vdrz->vd_nparity == 1 ||
26247877fdebSMatt Macy 	    (vdrz->vd_nparity <= 2 &&
26257877fdebSMatt Macy 	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) ||
26267877fdebSMatt Macy 	    (vdrz->vd_nparity <= 3 &&
26277877fdebSMatt Macy 	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3));
26287877fdebSMatt Macy 
26297877fdebSMatt Macy 	/*
26307877fdebSMatt Macy 	 * Note that we'll add these even on storage pools where they
26317877fdebSMatt Macy 	 * aren't strictly required -- older software will just ignore
26327877fdebSMatt Macy 	 * it.
26337877fdebSMatt Macy 	 */
26347877fdebSMatt Macy 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);
26357877fdebSMatt Macy }
26367877fdebSMatt Macy 
26377877fdebSMatt Macy static uint64_t
26387877fdebSMatt Macy vdev_raidz_nparity(vdev_t *vd)
26397877fdebSMatt Macy {
26407877fdebSMatt Macy 	vdev_raidz_t *vdrz = vd->vdev_tsd;
26417877fdebSMatt Macy 	return (vdrz->vd_nparity);
26427877fdebSMatt Macy }
26437877fdebSMatt Macy 
26447877fdebSMatt Macy static uint64_t
26457877fdebSMatt Macy vdev_raidz_ndisks(vdev_t *vd)
26467877fdebSMatt Macy {
26477877fdebSMatt Macy 	return (vd->vdev_children);
2648eda14cbcSMatt Macy }
2649eda14cbcSMatt Macy 
/*
 * Operations table for a (non-expanded) RAID-Z top-level vdev.  Entries
 * left NULL are operations this vdev type does not provide; callers are
 * presumably expected to check for NULL before invoking them -- confirm
 * against vdev.c dispatch sites.
 */
vdev_ops_t vdev_raidz_ops = {
	.vdev_op_init = vdev_raidz_init,
	.vdev_op_fini = vdev_raidz_fini,
	.vdev_op_open = vdev_raidz_open,
	.vdev_op_close = vdev_raidz_close,
	.vdev_op_asize = vdev_raidz_asize,
	.vdev_op_min_asize = vdev_raidz_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_raidz_io_start,
	.vdev_op_io_done = vdev_raidz_io_done,
	.vdev_op_state_change = vdev_raidz_state_change,
	.vdev_op_need_resilver = vdev_raidz_need_resilver,
	.vdev_op_hold = NULL,
	.vdev_op_rele = NULL,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_raidz_xlate,
	.vdev_op_rebuild_asize = NULL,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = vdev_raidz_config_generate,
	.vdev_op_nparity = vdev_raidz_nparity,
	.vdev_op_ndisks = vdev_raidz_ndisks,
	.vdev_op_type = VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
};
2674