xref: /freebsd/sys/contrib/openzfs/module/zfs/vdev_raidz.c (revision 2c48331d28f16c0efce5a72a81e7d71668c4a158)
1eda14cbcSMatt Macy /*
2eda14cbcSMatt Macy  * CDDL HEADER START
3eda14cbcSMatt Macy  *
4eda14cbcSMatt Macy  * The contents of this file are subject to the terms of the
5eda14cbcSMatt Macy  * Common Development and Distribution License (the "License").
6eda14cbcSMatt Macy  * You may not use this file except in compliance with the License.
7eda14cbcSMatt Macy  *
8eda14cbcSMatt Macy  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9eda14cbcSMatt Macy  * or http://www.opensolaris.org/os/licensing.
10eda14cbcSMatt Macy  * See the License for the specific language governing permissions
11eda14cbcSMatt Macy  * and limitations under the License.
12eda14cbcSMatt Macy  *
13eda14cbcSMatt Macy  * When distributing Covered Code, include this CDDL HEADER in each
14eda14cbcSMatt Macy  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15eda14cbcSMatt Macy  * If applicable, add the following below this CDDL HEADER, with the
16eda14cbcSMatt Macy  * fields enclosed by brackets "[]" replaced with your own identifying
17eda14cbcSMatt Macy  * information: Portions Copyright [yyyy] [name of copyright owner]
18eda14cbcSMatt Macy  *
19eda14cbcSMatt Macy  * CDDL HEADER END
20eda14cbcSMatt Macy  */
21eda14cbcSMatt Macy 
22eda14cbcSMatt Macy /*
23eda14cbcSMatt Macy  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24*2c48331dSMatt Macy  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
25eda14cbcSMatt Macy  * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
26eda14cbcSMatt Macy  */
27eda14cbcSMatt Macy 
28eda14cbcSMatt Macy #include <sys/zfs_context.h>
29eda14cbcSMatt Macy #include <sys/spa.h>
30eda14cbcSMatt Macy #include <sys/vdev_impl.h>
31eda14cbcSMatt Macy #include <sys/zio.h>
32eda14cbcSMatt Macy #include <sys/zio_checksum.h>
33eda14cbcSMatt Macy #include <sys/abd.h>
34eda14cbcSMatt Macy #include <sys/fs/zfs.h>
35eda14cbcSMatt Macy #include <sys/fm/fs/zfs.h>
36eda14cbcSMatt Macy #include <sys/vdev_raidz.h>
37eda14cbcSMatt Macy #include <sys/vdev_raidz_impl.h>
38eda14cbcSMatt Macy 
39eda14cbcSMatt Macy #ifdef ZFS_DEBUG
40eda14cbcSMatt Macy #include <sys/vdev.h>	/* For vdev_xlate() in vdev_raidz_io_verify() */
41eda14cbcSMatt Macy #endif
42eda14cbcSMatt Macy 
43eda14cbcSMatt Macy /*
44eda14cbcSMatt Macy  * Virtual device vector for RAID-Z.
45eda14cbcSMatt Macy  *
46eda14cbcSMatt Macy  * This vdev supports single, double, and triple parity. For single parity,
47eda14cbcSMatt Macy  * we use a simple XOR of all the data columns. For double or triple parity,
48eda14cbcSMatt Macy  * we use a special case of Reed-Solomon coding. This extends the
49eda14cbcSMatt Macy  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
50eda14cbcSMatt Macy  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
51eda14cbcSMatt Macy  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
52eda14cbcSMatt Macy  * former is also based. The latter is designed to provide higher performance
53eda14cbcSMatt Macy  * for writes.
54eda14cbcSMatt Macy  *
55eda14cbcSMatt Macy  * Note that the Plank paper claimed to support arbitrary N+M, but was then
56eda14cbcSMatt Macy  * amended six years later identifying a critical flaw that invalidates its
57eda14cbcSMatt Macy  * claims. Nevertheless, the technique can be adapted to work for up to
58eda14cbcSMatt Macy  * triple parity. For additional parity, the amendment "Note: Correction to
59eda14cbcSMatt Macy  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
60eda14cbcSMatt Macy  * is viable, but the additional complexity means that write performance will
61eda14cbcSMatt Macy  * suffer.
62eda14cbcSMatt Macy  *
63eda14cbcSMatt Macy  * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
65eda14cbcSMatt Macy  * can be expressed with a single byte. Briefly, the operations on the
66eda14cbcSMatt Macy  * field are defined as follows:
67eda14cbcSMatt Macy  *
68eda14cbcSMatt Macy  *   o addition (+) is represented by a bitwise XOR
69eda14cbcSMatt Macy  *   o subtraction (-) is therefore identical to addition: A + B = A - B
70eda14cbcSMatt Macy  *   o multiplication of A by 2 is defined by the following bitwise expression:
71eda14cbcSMatt Macy  *
72eda14cbcSMatt Macy  *	(A * 2)_7 = A_6
73eda14cbcSMatt Macy  *	(A * 2)_6 = A_5
74eda14cbcSMatt Macy  *	(A * 2)_5 = A_4
75eda14cbcSMatt Macy  *	(A * 2)_4 = A_3 + A_7
76eda14cbcSMatt Macy  *	(A * 2)_3 = A_2 + A_7
77eda14cbcSMatt Macy  *	(A * 2)_2 = A_1 + A_7
78eda14cbcSMatt Macy  *	(A * 2)_1 = A_0
79eda14cbcSMatt Macy  *	(A * 2)_0 = A_7
80eda14cbcSMatt Macy  *
81eda14cbcSMatt Macy  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
82eda14cbcSMatt Macy  * As an aside, this multiplication is derived from the error correcting
83eda14cbcSMatt Macy  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
84eda14cbcSMatt Macy  *
85eda14cbcSMatt Macy  * Observe that any number in the field (except for 0) can be expressed as a
86eda14cbcSMatt Macy  * power of 2 -- a generator for the field. We store a table of the powers of
87eda14cbcSMatt Macy  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
88eda14cbcSMatt Macy  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
89eda14cbcSMatt Macy  * than field addition). The inverse of a field element A (A^-1) is therefore
90eda14cbcSMatt Macy  * A ^ (255 - 1) = A^254.
91eda14cbcSMatt Macy  *
92eda14cbcSMatt Macy  * The up-to-three parity columns, P, Q, R over several data columns,
93eda14cbcSMatt Macy  * D_0, ... D_n-1, can be expressed by field operations:
94eda14cbcSMatt Macy  *
95eda14cbcSMatt Macy  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
96eda14cbcSMatt Macy  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
97eda14cbcSMatt Macy  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
98eda14cbcSMatt Macy  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
99eda14cbcSMatt Macy  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
100eda14cbcSMatt Macy  *
101eda14cbcSMatt Macy  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
102eda14cbcSMatt Macy  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
103eda14cbcSMatt Macy  * independent coefficients. (There are no additional coefficients that have
104eda14cbcSMatt Macy  * this property which is why the uncorrected Plank method breaks down.)
105eda14cbcSMatt Macy  *
106eda14cbcSMatt Macy  * See the reconstruction code below for how P, Q and R can used individually
107eda14cbcSMatt Macy  * or in concert to recover missing data columns.
108eda14cbcSMatt Macy  */
109eda14cbcSMatt Macy 
/* Indexes of the parity columns, stored first in each raidz_map_t. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Multiply a single byte (a GF(2^8) element) by 2 or 4: shift left and,
 * whenever the high bit was set, XOR in 0x1d -- the low byte of the
 * primitive polynomial described in the block comment above.
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
136eda14cbcSMatt Macy 
137eda14cbcSMatt Macy void
138eda14cbcSMatt Macy vdev_raidz_map_free(raidz_map_t *rm)
139eda14cbcSMatt Macy {
140eda14cbcSMatt Macy 	int c;
141eda14cbcSMatt Macy 
142eda14cbcSMatt Macy 	for (c = 0; c < rm->rm_firstdatacol; c++) {
143eda14cbcSMatt Macy 		abd_free(rm->rm_col[c].rc_abd);
144eda14cbcSMatt Macy 
145eda14cbcSMatt Macy 		if (rm->rm_col[c].rc_gdata != NULL)
146eda14cbcSMatt Macy 			abd_free(rm->rm_col[c].rc_gdata);
147eda14cbcSMatt Macy 	}
148eda14cbcSMatt Macy 
149eda14cbcSMatt Macy 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
150eda14cbcSMatt Macy 		abd_put(rm->rm_col[c].rc_abd);
151eda14cbcSMatt Macy 
152eda14cbcSMatt Macy 	if (rm->rm_abd_copy != NULL)
153eda14cbcSMatt Macy 		abd_free(rm->rm_abd_copy);
154eda14cbcSMatt Macy 
155eda14cbcSMatt Macy 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
156eda14cbcSMatt Macy }
157eda14cbcSMatt Macy 
158eda14cbcSMatt Macy static void
159eda14cbcSMatt Macy vdev_raidz_map_free_vsd(zio_t *zio)
160eda14cbcSMatt Macy {
161eda14cbcSMatt Macy 	raidz_map_t *rm = zio->io_vsd;
162eda14cbcSMatt Macy 
163eda14cbcSMatt Macy 	ASSERT0(rm->rm_freed);
164eda14cbcSMatt Macy 	rm->rm_freed = 1;
165eda14cbcSMatt Macy 
166eda14cbcSMatt Macy 	if (rm->rm_reports == 0)
167eda14cbcSMatt Macy 		vdev_raidz_map_free(rm);
168eda14cbcSMatt Macy }
169eda14cbcSMatt Macy 
170eda14cbcSMatt Macy /*ARGSUSED*/
171eda14cbcSMatt Macy static void
172eda14cbcSMatt Macy vdev_raidz_cksum_free(void *arg, size_t ignored)
173eda14cbcSMatt Macy {
174eda14cbcSMatt Macy 	raidz_map_t *rm = arg;
175eda14cbcSMatt Macy 
176eda14cbcSMatt Macy 	ASSERT3U(rm->rm_reports, >, 0);
177eda14cbcSMatt Macy 
178eda14cbcSMatt Macy 	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
179eda14cbcSMatt Macy 		vdev_raidz_map_free(rm);
180eda14cbcSMatt Macy }
181eda14cbcSMatt Macy 
/*
 * Checksum-report finish callback.  Given the known-good block contents
 * (good_data), produce the "good" version of column zcr_cbinfo so it can
 * be compared with the bad copy kept in that column's rc_abd.
 *
 * For a data column, "good" is simply the matching slice of good_data.
 * For a parity column, we regenerate all parity from good_data once per
 * raidz_map_t and cache the results in each parity column's rc_gdata.
 */
static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	const size_t c = zcr->zcr_cbinfo;	/* column that was reported bad */
	size_t x, offset;

	const abd_t *good = NULL;
	const abd_t *bad = rm->rm_col[c].rc_abd;

	/* Without good data we can only finish the report empty-handed. */
	if (good_data == NULL) {
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_abd;
				rm->rm_col[x].rc_abd =
				    rm->rm_col[x].rc_gdata =
				    abd_alloc_sametype(rm->rm_col[x].rc_abd,
				    rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			offset = 0;
			for (; x < rm->rm_cols; x++) {
				abd_put(rm->rm_col[x].rc_abd);

				rm->rm_col[x].rc_abd =
				    abd_get_offset_size((abd_t *)good_data,
				    offset, rm->rm_col[x].rc_size);
				offset += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++)
				rm->rm_col[x].rc_abd = bad_parity[x];

			/*
			 * Re-point the data columns at the copy of what was
			 * actually read from disk (rm_abd_copy, captured by
			 * vdev_raidz_cksum_report()).
			 */
			offset = 0;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				abd_put(rm->rm_col[x].rc_abd);
				rm->rm_col[x].rc_abd = abd_get_offset_size(
				    rm->rm_abd_copy, offset,
				    rm->rm_col[x].rc_size);
				offset += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = abd_get_offset_size(rm->rm_col[c].rc_gdata, 0,
		    rm->rm_col[c].rc_size);
	} else {
		/* adjust good_data to point at the start of our column */
		offset = 0;
		for (x = rm->rm_firstdatacol; x < c; x++)
			offset += rm->rm_col[x].rc_size;

		good = abd_get_offset_size((abd_t *)good_data, offset,
		    rm->rm_col[c].rc_size);
	}

	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
	abd_put((abd_t *)good);
}
266eda14cbcSMatt Macy 
/*
 * Invoked indirectly by zfs_ereport_start_checksum(), called
 * below when our read operation fails completely.  The main point
 * is to keep a copy of everything we read from disk, so that at
 * vdev_raidz_cksum_finish() time we can compare it with the good data.
 */
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
	size_t c = (size_t)(uintptr_t)arg;	/* column being reported on */
	size_t offset;

	raidz_map_t *rm = zio->io_vsd;
	size_t size;

	/* set up the report and bump the refcount  */
	zcr->zcr_cbdata = rm;
	zcr->zcr_cbinfo = c;
	zcr->zcr_finish = vdev_raidz_cksum_finish;
	zcr->zcr_free = vdev_raidz_cksum_free;

	/* The refcount keeps rm alive until the last report is freed. */
	rm->rm_reports++;
	ASSERT3U(rm->rm_reports, >, 0);

	/* A previous report already copied the data aside; nothing to do. */
	if (rm->rm_abd_copy != NULL)
		return;

	/*
	 * It's the first time we're called for this raidz_map_t, so we need
	 * to copy the data aside; there's no guarantee that our zio's buffer
	 * won't be re-used for something else.
	 *
	 * Our parity data is already in separate buffers, so there's no need
	 * to copy them.
	 */

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	rm->rm_abd_copy = abd_alloc_for_io(size, B_FALSE);

	/*
	 * Copy each data column into its slice of the aggregate copy and
	 * re-point the column at that slice.
	 */
	for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		raidz_col_t *col = &rm->rm_col[c];
		abd_t *tmp = abd_get_offset_size(rm->rm_abd_copy, offset,
		    col->rc_size);

		abd_copy(tmp, col->rc_abd, col->rc_size);

		abd_put(col->rc_abd);
		col->rc_abd = tmp;

		offset += col->rc_size;
	}
	ASSERT3U(offset, ==, size);
}
323eda14cbcSMatt Macy 
/* vsd callbacks installed on every raidz zio (see vdev_raidz_map_alloc). */
static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	.vsd_free = vdev_raidz_map_free_vsd,
	.vsd_cksum_report = vdev_raidz_cksum_report
};
328eda14cbcSMatt Macy 
/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 *
 * Avoid inlining the function to keep vdev_raidz_io_start(), which
 * is this functions only caller, as small as possible on the stack.
 */
noinline raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
    uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = zio->io_offset >> ashift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = zio->io_size >> ashift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << ashift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
	uint64_t off = 0;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	/* Allocate the map sized for scols columns (trailing flex array). */
	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_abd_copy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	/* Compute each column's child vdev, offset, and size. */
	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			/* Wrapped around: next row on the child vdevs. */
			col -= dcols;
			coff += 1ULL << ashift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_abd = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << ashift;
		else
			rm->rm_col[c].rc_size = q << ashift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << ashift);
	rm->rm_asize = roundup(asize, (nparity + 1) << ashift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << ashift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	/* Parity columns get private linear buffers. */
	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_abd =
		    abd_alloc_linear(rm->rm_col[c].rc_size, B_FALSE);

	/*
	 * Data columns reference consecutive slices of the zio's buffer.
	 * Here c == rm_firstdatacol after the loop above.
	 */
	rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, 0,
	    rm->rm_col[c].rc_size);
	off = rm->rm_col[c].rc_size;

	for (c = c + 1; c < acols; c++) {
		rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, off,
		    rm->rm_col[c].rc_size);
		off += rm->rm_col[c].rc_size;
	}

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
		/* Swap the parity column with the first data column. */
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;

	/* init RAIDZ parity ops */
	rm->rm_ops = vdev_raidz_math_get_ops();

	return (rm);
}
490eda14cbcSMatt Macy 
/*
 * Iterator state for the parity callbacks below: current write positions
 * in the P, Q, and R parity buffers.  Unused levels are NULL.
 */
struct pqr_struct {
	uint64_t *p;
	uint64_t *q;
	uint64_t *r;
};
496eda14cbcSMatt Macy 
497eda14cbcSMatt Macy static int
498eda14cbcSMatt Macy vdev_raidz_p_func(void *buf, size_t size, void *private)
499eda14cbcSMatt Macy {
500eda14cbcSMatt Macy 	struct pqr_struct *pqr = private;
501eda14cbcSMatt Macy 	const uint64_t *src = buf;
502eda14cbcSMatt Macy 	int i, cnt = size / sizeof (src[0]);
503eda14cbcSMatt Macy 
504eda14cbcSMatt Macy 	ASSERT(pqr->p && !pqr->q && !pqr->r);
505eda14cbcSMatt Macy 
506eda14cbcSMatt Macy 	for (i = 0; i < cnt; i++, src++, pqr->p++)
507eda14cbcSMatt Macy 		*pqr->p ^= *src;
508eda14cbcSMatt Macy 
509eda14cbcSMatt Macy 	return (0);
510eda14cbcSMatt Macy }
511eda14cbcSMatt Macy 
512eda14cbcSMatt Macy static int
513eda14cbcSMatt Macy vdev_raidz_pq_func(void *buf, size_t size, void *private)
514eda14cbcSMatt Macy {
515eda14cbcSMatt Macy 	struct pqr_struct *pqr = private;
516eda14cbcSMatt Macy 	const uint64_t *src = buf;
517eda14cbcSMatt Macy 	uint64_t mask;
518eda14cbcSMatt Macy 	int i, cnt = size / sizeof (src[0]);
519eda14cbcSMatt Macy 
520eda14cbcSMatt Macy 	ASSERT(pqr->p && pqr->q && !pqr->r);
521eda14cbcSMatt Macy 
522eda14cbcSMatt Macy 	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
523eda14cbcSMatt Macy 		*pqr->p ^= *src;
524eda14cbcSMatt Macy 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
525eda14cbcSMatt Macy 		*pqr->q ^= *src;
526eda14cbcSMatt Macy 	}
527eda14cbcSMatt Macy 
528eda14cbcSMatt Macy 	return (0);
529eda14cbcSMatt Macy }
530eda14cbcSMatt Macy 
531eda14cbcSMatt Macy static int
532eda14cbcSMatt Macy vdev_raidz_pqr_func(void *buf, size_t size, void *private)
533eda14cbcSMatt Macy {
534eda14cbcSMatt Macy 	struct pqr_struct *pqr = private;
535eda14cbcSMatt Macy 	const uint64_t *src = buf;
536eda14cbcSMatt Macy 	uint64_t mask;
537eda14cbcSMatt Macy 	int i, cnt = size / sizeof (src[0]);
538eda14cbcSMatt Macy 
539eda14cbcSMatt Macy 	ASSERT(pqr->p && pqr->q && pqr->r);
540eda14cbcSMatt Macy 
541eda14cbcSMatt Macy 	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
542eda14cbcSMatt Macy 		*pqr->p ^= *src;
543eda14cbcSMatt Macy 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
544eda14cbcSMatt Macy 		*pqr->q ^= *src;
545eda14cbcSMatt Macy 		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
546eda14cbcSMatt Macy 		*pqr->r ^= *src;
547eda14cbcSMatt Macy 	}
548eda14cbcSMatt Macy 
549eda14cbcSMatt Macy 	return (0);
550eda14cbcSMatt Macy }
551eda14cbcSMatt Macy 
552eda14cbcSMatt Macy static void
553eda14cbcSMatt Macy vdev_raidz_generate_parity_p(raidz_map_t *rm)
554eda14cbcSMatt Macy {
555eda14cbcSMatt Macy 	uint64_t *p;
556eda14cbcSMatt Macy 	int c;
557eda14cbcSMatt Macy 	abd_t *src;
558eda14cbcSMatt Macy 
559eda14cbcSMatt Macy 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
560eda14cbcSMatt Macy 		src = rm->rm_col[c].rc_abd;
561eda14cbcSMatt Macy 		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
562eda14cbcSMatt Macy 
563eda14cbcSMatt Macy 		if (c == rm->rm_firstdatacol) {
564eda14cbcSMatt Macy 			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
565eda14cbcSMatt Macy 		} else {
566eda14cbcSMatt Macy 			struct pqr_struct pqr = { p, NULL, NULL };
567eda14cbcSMatt Macy 			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
568eda14cbcSMatt Macy 			    vdev_raidz_p_func, &pqr);
569eda14cbcSMatt Macy 		}
570eda14cbcSMatt Macy 	}
571eda14cbcSMatt Macy }
572eda14cbcSMatt Macy 
573eda14cbcSMatt Macy static void
574eda14cbcSMatt Macy vdev_raidz_generate_parity_pq(raidz_map_t *rm)
575eda14cbcSMatt Macy {
576eda14cbcSMatt Macy 	uint64_t *p, *q, pcnt, ccnt, mask, i;
577eda14cbcSMatt Macy 	int c;
578eda14cbcSMatt Macy 	abd_t *src;
579eda14cbcSMatt Macy 
580eda14cbcSMatt Macy 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
581eda14cbcSMatt Macy 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
582eda14cbcSMatt Macy 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
583eda14cbcSMatt Macy 
584eda14cbcSMatt Macy 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
585eda14cbcSMatt Macy 		src = rm->rm_col[c].rc_abd;
586eda14cbcSMatt Macy 		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
587eda14cbcSMatt Macy 		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
588eda14cbcSMatt Macy 
589eda14cbcSMatt Macy 		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
590eda14cbcSMatt Macy 
591eda14cbcSMatt Macy 		if (c == rm->rm_firstdatacol) {
592eda14cbcSMatt Macy 			ASSERT(ccnt == pcnt || ccnt == 0);
593eda14cbcSMatt Macy 			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
594eda14cbcSMatt Macy 			(void) memcpy(q, p, rm->rm_col[c].rc_size);
595eda14cbcSMatt Macy 
596eda14cbcSMatt Macy 			for (i = ccnt; i < pcnt; i++) {
597eda14cbcSMatt Macy 				p[i] = 0;
598eda14cbcSMatt Macy 				q[i] = 0;
599eda14cbcSMatt Macy 			}
600eda14cbcSMatt Macy 		} else {
601eda14cbcSMatt Macy 			struct pqr_struct pqr = { p, q, NULL };
602eda14cbcSMatt Macy 
603eda14cbcSMatt Macy 			ASSERT(ccnt <= pcnt);
604eda14cbcSMatt Macy 			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
605eda14cbcSMatt Macy 			    vdev_raidz_pq_func, &pqr);
606eda14cbcSMatt Macy 
607eda14cbcSMatt Macy 			/*
608eda14cbcSMatt Macy 			 * Treat short columns as though they are full of 0s.
609eda14cbcSMatt Macy 			 * Note that there's therefore nothing needed for P.
610eda14cbcSMatt Macy 			 */
611eda14cbcSMatt Macy 			for (i = ccnt; i < pcnt; i++) {
612eda14cbcSMatt Macy 				VDEV_RAIDZ_64MUL_2(q[i], mask);
613eda14cbcSMatt Macy 			}
614eda14cbcSMatt Macy 		}
615eda14cbcSMatt Macy 	}
616eda14cbcSMatt Macy }
617eda14cbcSMatt Macy 
618eda14cbcSMatt Macy static void
619eda14cbcSMatt Macy vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
620eda14cbcSMatt Macy {
621eda14cbcSMatt Macy 	uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
622eda14cbcSMatt Macy 	int c;
623eda14cbcSMatt Macy 	abd_t *src;
624eda14cbcSMatt Macy 
625eda14cbcSMatt Macy 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
626eda14cbcSMatt Macy 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
627eda14cbcSMatt Macy 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
628eda14cbcSMatt Macy 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
629eda14cbcSMatt Macy 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
630eda14cbcSMatt Macy 
631eda14cbcSMatt Macy 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
632eda14cbcSMatt Macy 		src = rm->rm_col[c].rc_abd;
633eda14cbcSMatt Macy 		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
634eda14cbcSMatt Macy 		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
635eda14cbcSMatt Macy 		r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);
636eda14cbcSMatt Macy 
637eda14cbcSMatt Macy 		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
638eda14cbcSMatt Macy 
639eda14cbcSMatt Macy 		if (c == rm->rm_firstdatacol) {
640eda14cbcSMatt Macy 			ASSERT(ccnt == pcnt || ccnt == 0);
641eda14cbcSMatt Macy 			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
642eda14cbcSMatt Macy 			(void) memcpy(q, p, rm->rm_col[c].rc_size);
643eda14cbcSMatt Macy 			(void) memcpy(r, p, rm->rm_col[c].rc_size);
644eda14cbcSMatt Macy 
645eda14cbcSMatt Macy 			for (i = ccnt; i < pcnt; i++) {
646eda14cbcSMatt Macy 				p[i] = 0;
647eda14cbcSMatt Macy 				q[i] = 0;
648eda14cbcSMatt Macy 				r[i] = 0;
649eda14cbcSMatt Macy 			}
650eda14cbcSMatt Macy 		} else {
651eda14cbcSMatt Macy 			struct pqr_struct pqr = { p, q, r };
652eda14cbcSMatt Macy 
653eda14cbcSMatt Macy 			ASSERT(ccnt <= pcnt);
654eda14cbcSMatt Macy 			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
655eda14cbcSMatt Macy 			    vdev_raidz_pqr_func, &pqr);
656eda14cbcSMatt Macy 
657eda14cbcSMatt Macy 			/*
658eda14cbcSMatt Macy 			 * Treat short columns as though they are full of 0s.
659eda14cbcSMatt Macy 			 * Note that there's therefore nothing needed for P.
660eda14cbcSMatt Macy 			 */
661eda14cbcSMatt Macy 			for (i = ccnt; i < pcnt; i++) {
662eda14cbcSMatt Macy 				VDEV_RAIDZ_64MUL_2(q[i], mask);
663eda14cbcSMatt Macy 				VDEV_RAIDZ_64MUL_4(r[i], mask);
664eda14cbcSMatt Macy 			}
665eda14cbcSMatt Macy 		}
666eda14cbcSMatt Macy 	}
667eda14cbcSMatt Macy }
668eda14cbcSMatt Macy 
669eda14cbcSMatt Macy /*
670eda14cbcSMatt Macy  * Generate RAID parity in the first virtual columns according to the number of
671eda14cbcSMatt Macy  * parity columns available.
672eda14cbcSMatt Macy  */
673eda14cbcSMatt Macy void
674eda14cbcSMatt Macy vdev_raidz_generate_parity(raidz_map_t *rm)
675eda14cbcSMatt Macy {
676eda14cbcSMatt Macy 	/* Generate using the new math implementation */
677eda14cbcSMatt Macy 	if (vdev_raidz_math_generate(rm) != RAIDZ_ORIGINAL_IMPL)
678eda14cbcSMatt Macy 		return;
679eda14cbcSMatt Macy 
680eda14cbcSMatt Macy 	switch (rm->rm_firstdatacol) {
681eda14cbcSMatt Macy 	case 1:
682eda14cbcSMatt Macy 		vdev_raidz_generate_parity_p(rm);
683eda14cbcSMatt Macy 		break;
684eda14cbcSMatt Macy 	case 2:
685eda14cbcSMatt Macy 		vdev_raidz_generate_parity_pq(rm);
686eda14cbcSMatt Macy 		break;
687eda14cbcSMatt Macy 	case 3:
688eda14cbcSMatt Macy 		vdev_raidz_generate_parity_pqr(rm);
689eda14cbcSMatt Macy 		break;
690eda14cbcSMatt Macy 	default:
691eda14cbcSMatt Macy 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
692eda14cbcSMatt Macy 	}
693eda14cbcSMatt Macy }
694eda14cbcSMatt Macy 
695eda14cbcSMatt Macy /* ARGSUSED */
696eda14cbcSMatt Macy static int
697eda14cbcSMatt Macy vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
698eda14cbcSMatt Macy {
699eda14cbcSMatt Macy 	uint64_t *dst = dbuf;
700eda14cbcSMatt Macy 	uint64_t *src = sbuf;
701eda14cbcSMatt Macy 	int cnt = size / sizeof (src[0]);
702eda14cbcSMatt Macy 
703eda14cbcSMatt Macy 	for (int i = 0; i < cnt; i++) {
704eda14cbcSMatt Macy 		dst[i] ^= src[i];
705eda14cbcSMatt Macy 	}
706eda14cbcSMatt Macy 
707eda14cbcSMatt Macy 	return (0);
708eda14cbcSMatt Macy }
709eda14cbcSMatt Macy 
710eda14cbcSMatt Macy /* ARGSUSED */
711eda14cbcSMatt Macy static int
712eda14cbcSMatt Macy vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
713eda14cbcSMatt Macy     void *private)
714eda14cbcSMatt Macy {
715eda14cbcSMatt Macy 	uint64_t *dst = dbuf;
716eda14cbcSMatt Macy 	uint64_t *src = sbuf;
717eda14cbcSMatt Macy 	uint64_t mask;
718eda14cbcSMatt Macy 	int cnt = size / sizeof (dst[0]);
719eda14cbcSMatt Macy 
720eda14cbcSMatt Macy 	for (int i = 0; i < cnt; i++, dst++, src++) {
721eda14cbcSMatt Macy 		VDEV_RAIDZ_64MUL_2(*dst, mask);
722eda14cbcSMatt Macy 		*dst ^= *src;
723eda14cbcSMatt Macy 	}
724eda14cbcSMatt Macy 
725eda14cbcSMatt Macy 	return (0);
726eda14cbcSMatt Macy }
727eda14cbcSMatt Macy 
728eda14cbcSMatt Macy /* ARGSUSED */
729eda14cbcSMatt Macy static int
730eda14cbcSMatt Macy vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
731eda14cbcSMatt Macy {
732eda14cbcSMatt Macy 	uint64_t *dst = buf;
733eda14cbcSMatt Macy 	uint64_t mask;
734eda14cbcSMatt Macy 	int cnt = size / sizeof (dst[0]);
735eda14cbcSMatt Macy 
736eda14cbcSMatt Macy 	for (int i = 0; i < cnt; i++, dst++) {
737eda14cbcSMatt Macy 		/* same operation as vdev_raidz_reconst_q_pre_func() on dst */
738eda14cbcSMatt Macy 		VDEV_RAIDZ_64MUL_2(*dst, mask);
739eda14cbcSMatt Macy 	}
740eda14cbcSMatt Macy 
741eda14cbcSMatt Macy 	return (0);
742eda14cbcSMatt Macy }
743eda14cbcSMatt Macy 
744eda14cbcSMatt Macy struct reconst_q_struct {
745eda14cbcSMatt Macy 	uint64_t *q;
746eda14cbcSMatt Macy 	int exp;
747eda14cbcSMatt Macy };
748eda14cbcSMatt Macy 
749eda14cbcSMatt Macy static int
750eda14cbcSMatt Macy vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
751eda14cbcSMatt Macy {
752eda14cbcSMatt Macy 	struct reconst_q_struct *rq = private;
753eda14cbcSMatt Macy 	uint64_t *dst = buf;
754eda14cbcSMatt Macy 	int cnt = size / sizeof (dst[0]);
755eda14cbcSMatt Macy 
756eda14cbcSMatt Macy 	for (int i = 0; i < cnt; i++, dst++, rq->q++) {
757eda14cbcSMatt Macy 		int j;
758eda14cbcSMatt Macy 		uint8_t *b;
759eda14cbcSMatt Macy 
760eda14cbcSMatt Macy 		*dst ^= *rq->q;
761eda14cbcSMatt Macy 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
762eda14cbcSMatt Macy 			*b = vdev_raidz_exp2(*b, rq->exp);
763eda14cbcSMatt Macy 		}
764eda14cbcSMatt Macy 	}
765eda14cbcSMatt Macy 
766eda14cbcSMatt Macy 	return (0);
767eda14cbcSMatt Macy }
768eda14cbcSMatt Macy 
769eda14cbcSMatt Macy struct reconst_pq_struct {
770eda14cbcSMatt Macy 	uint8_t *p;
771eda14cbcSMatt Macy 	uint8_t *q;
772eda14cbcSMatt Macy 	uint8_t *pxy;
773eda14cbcSMatt Macy 	uint8_t *qxy;
774eda14cbcSMatt Macy 	int aexp;
775eda14cbcSMatt Macy 	int bexp;
776eda14cbcSMatt Macy };
777eda14cbcSMatt Macy 
778eda14cbcSMatt Macy static int
779eda14cbcSMatt Macy vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
780eda14cbcSMatt Macy {
781eda14cbcSMatt Macy 	struct reconst_pq_struct *rpq = private;
782eda14cbcSMatt Macy 	uint8_t *xd = xbuf;
783eda14cbcSMatt Macy 	uint8_t *yd = ybuf;
784eda14cbcSMatt Macy 
785eda14cbcSMatt Macy 	for (int i = 0; i < size;
786eda14cbcSMatt Macy 	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
787eda14cbcSMatt Macy 		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
788eda14cbcSMatt Macy 		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
789eda14cbcSMatt Macy 		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
790eda14cbcSMatt Macy 	}
791eda14cbcSMatt Macy 
792eda14cbcSMatt Macy 	return (0);
793eda14cbcSMatt Macy }
794eda14cbcSMatt Macy 
795eda14cbcSMatt Macy static int
796eda14cbcSMatt Macy vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
797eda14cbcSMatt Macy {
798eda14cbcSMatt Macy 	struct reconst_pq_struct *rpq = private;
799eda14cbcSMatt Macy 	uint8_t *xd = xbuf;
800eda14cbcSMatt Macy 
801eda14cbcSMatt Macy 	for (int i = 0; i < size;
802eda14cbcSMatt Macy 	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
803eda14cbcSMatt Macy 		/* same operation as vdev_raidz_reconst_pq_func() on xd */
804eda14cbcSMatt Macy 		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
805eda14cbcSMatt Macy 		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
806eda14cbcSMatt Macy 	}
807eda14cbcSMatt Macy 
808eda14cbcSMatt Macy 	return (0);
809eda14cbcSMatt Macy }
810eda14cbcSMatt Macy 
811eda14cbcSMatt Macy static int
812eda14cbcSMatt Macy vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
813eda14cbcSMatt Macy {
814eda14cbcSMatt Macy 	int x = tgts[0];
815eda14cbcSMatt Macy 	int c;
816eda14cbcSMatt Macy 	abd_t *dst, *src;
817eda14cbcSMatt Macy 
818eda14cbcSMatt Macy 	ASSERT(ntgts == 1);
819eda14cbcSMatt Macy 	ASSERT(x >= rm->rm_firstdatacol);
820eda14cbcSMatt Macy 	ASSERT(x < rm->rm_cols);
821eda14cbcSMatt Macy 
822eda14cbcSMatt Macy 	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
823eda14cbcSMatt Macy 	ASSERT(rm->rm_col[x].rc_size > 0);
824eda14cbcSMatt Macy 
825eda14cbcSMatt Macy 	src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
826eda14cbcSMatt Macy 	dst = rm->rm_col[x].rc_abd;
827eda14cbcSMatt Macy 
828eda14cbcSMatt Macy 	abd_copy_from_buf(dst, abd_to_buf(src), rm->rm_col[x].rc_size);
829eda14cbcSMatt Macy 
830eda14cbcSMatt Macy 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
831eda14cbcSMatt Macy 		uint64_t size = MIN(rm->rm_col[x].rc_size,
832eda14cbcSMatt Macy 		    rm->rm_col[c].rc_size);
833eda14cbcSMatt Macy 
834eda14cbcSMatt Macy 		src = rm->rm_col[c].rc_abd;
835eda14cbcSMatt Macy 		dst = rm->rm_col[x].rc_abd;
836eda14cbcSMatt Macy 
837eda14cbcSMatt Macy 		if (c == x)
838eda14cbcSMatt Macy 			continue;
839eda14cbcSMatt Macy 
840eda14cbcSMatt Macy 		(void) abd_iterate_func2(dst, src, 0, 0, size,
841eda14cbcSMatt Macy 		    vdev_raidz_reconst_p_func, NULL);
842eda14cbcSMatt Macy 	}
843eda14cbcSMatt Macy 
844eda14cbcSMatt Macy 	return (1 << VDEV_RAIDZ_P);
845eda14cbcSMatt Macy }
846eda14cbcSMatt Macy 
847eda14cbcSMatt Macy static int
848eda14cbcSMatt Macy vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
849eda14cbcSMatt Macy {
850eda14cbcSMatt Macy 	int x = tgts[0];
851eda14cbcSMatt Macy 	int c, exp;
852eda14cbcSMatt Macy 	abd_t *dst, *src;
853eda14cbcSMatt Macy 
854eda14cbcSMatt Macy 	ASSERT(ntgts == 1);
855eda14cbcSMatt Macy 
856eda14cbcSMatt Macy 	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);
857eda14cbcSMatt Macy 
858eda14cbcSMatt Macy 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
859eda14cbcSMatt Macy 		uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
860eda14cbcSMatt Macy 		    rm->rm_col[c].rc_size);
861eda14cbcSMatt Macy 
862eda14cbcSMatt Macy 		src = rm->rm_col[c].rc_abd;
863eda14cbcSMatt Macy 		dst = rm->rm_col[x].rc_abd;
864eda14cbcSMatt Macy 
865eda14cbcSMatt Macy 		if (c == rm->rm_firstdatacol) {
866eda14cbcSMatt Macy 			abd_copy(dst, src, size);
867eda14cbcSMatt Macy 			if (rm->rm_col[x].rc_size > size)
868eda14cbcSMatt Macy 				abd_zero_off(dst, size,
869eda14cbcSMatt Macy 				    rm->rm_col[x].rc_size - size);
870eda14cbcSMatt Macy 
871eda14cbcSMatt Macy 		} else {
872eda14cbcSMatt Macy 			ASSERT3U(size, <=, rm->rm_col[x].rc_size);
873eda14cbcSMatt Macy 			(void) abd_iterate_func2(dst, src, 0, 0, size,
874eda14cbcSMatt Macy 			    vdev_raidz_reconst_q_pre_func, NULL);
875eda14cbcSMatt Macy 			(void) abd_iterate_func(dst,
876eda14cbcSMatt Macy 			    size, rm->rm_col[x].rc_size - size,
877eda14cbcSMatt Macy 			    vdev_raidz_reconst_q_pre_tail_func, NULL);
878eda14cbcSMatt Macy 		}
879eda14cbcSMatt Macy 	}
880eda14cbcSMatt Macy 
881eda14cbcSMatt Macy 	src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
882eda14cbcSMatt Macy 	dst = rm->rm_col[x].rc_abd;
883eda14cbcSMatt Macy 	exp = 255 - (rm->rm_cols - 1 - x);
884eda14cbcSMatt Macy 
885eda14cbcSMatt Macy 	struct reconst_q_struct rq = { abd_to_buf(src), exp };
886eda14cbcSMatt Macy 	(void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
887eda14cbcSMatt Macy 	    vdev_raidz_reconst_q_post_func, &rq);
888eda14cbcSMatt Macy 
889eda14cbcSMatt Macy 	return (1 << VDEV_RAIDZ_Q);
890eda14cbcSMatt Macy }
891eda14cbcSMatt Macy 
892eda14cbcSMatt Macy static int
893eda14cbcSMatt Macy vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
894eda14cbcSMatt Macy {
895eda14cbcSMatt Macy 	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
896eda14cbcSMatt Macy 	abd_t *pdata, *qdata;
897eda14cbcSMatt Macy 	uint64_t xsize, ysize;
898eda14cbcSMatt Macy 	int x = tgts[0];
899eda14cbcSMatt Macy 	int y = tgts[1];
900eda14cbcSMatt Macy 	abd_t *xd, *yd;
901eda14cbcSMatt Macy 
902eda14cbcSMatt Macy 	ASSERT(ntgts == 2);
903eda14cbcSMatt Macy 	ASSERT(x < y);
904eda14cbcSMatt Macy 	ASSERT(x >= rm->rm_firstdatacol);
905eda14cbcSMatt Macy 	ASSERT(y < rm->rm_cols);
906eda14cbcSMatt Macy 
907eda14cbcSMatt Macy 	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
908eda14cbcSMatt Macy 
909eda14cbcSMatt Macy 	/*
910eda14cbcSMatt Macy 	 * Move the parity data aside -- we're going to compute parity as
911eda14cbcSMatt Macy 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
912eda14cbcSMatt Macy 	 * reuse the parity generation mechanism without trashing the actual
913eda14cbcSMatt Macy 	 * parity so we make those columns appear to be full of zeros by
914eda14cbcSMatt Macy 	 * setting their lengths to zero.
915eda14cbcSMatt Macy 	 */
916eda14cbcSMatt Macy 	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
917eda14cbcSMatt Macy 	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
918eda14cbcSMatt Macy 	xsize = rm->rm_col[x].rc_size;
919eda14cbcSMatt Macy 	ysize = rm->rm_col[y].rc_size;
920eda14cbcSMatt Macy 
921eda14cbcSMatt Macy 	rm->rm_col[VDEV_RAIDZ_P].rc_abd =
922eda14cbcSMatt Macy 	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
923eda14cbcSMatt Macy 	rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
924eda14cbcSMatt Macy 	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
925eda14cbcSMatt Macy 	rm->rm_col[x].rc_size = 0;
926eda14cbcSMatt Macy 	rm->rm_col[y].rc_size = 0;
927eda14cbcSMatt Macy 
928eda14cbcSMatt Macy 	vdev_raidz_generate_parity_pq(rm);
929eda14cbcSMatt Macy 
930eda14cbcSMatt Macy 	rm->rm_col[x].rc_size = xsize;
931eda14cbcSMatt Macy 	rm->rm_col[y].rc_size = ysize;
932eda14cbcSMatt Macy 
933eda14cbcSMatt Macy 	p = abd_to_buf(pdata);
934eda14cbcSMatt Macy 	q = abd_to_buf(qdata);
935eda14cbcSMatt Macy 	pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
936eda14cbcSMatt Macy 	qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
937eda14cbcSMatt Macy 	xd = rm->rm_col[x].rc_abd;
938eda14cbcSMatt Macy 	yd = rm->rm_col[y].rc_abd;
939eda14cbcSMatt Macy 
940eda14cbcSMatt Macy 	/*
941eda14cbcSMatt Macy 	 * We now have:
942eda14cbcSMatt Macy 	 *	Pxy = P + D_x + D_y
943eda14cbcSMatt Macy 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
944eda14cbcSMatt Macy 	 *
945eda14cbcSMatt Macy 	 * We can then solve for D_x:
946eda14cbcSMatt Macy 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
947eda14cbcSMatt Macy 	 * where
948eda14cbcSMatt Macy 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
949eda14cbcSMatt Macy 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
950eda14cbcSMatt Macy 	 *
951eda14cbcSMatt Macy 	 * With D_x in hand, we can easily solve for D_y:
952eda14cbcSMatt Macy 	 *	D_y = P + Pxy + D_x
953eda14cbcSMatt Macy 	 */
954eda14cbcSMatt Macy 
955eda14cbcSMatt Macy 	a = vdev_raidz_pow2[255 + x - y];
956eda14cbcSMatt Macy 	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
957eda14cbcSMatt Macy 	tmp = 255 - vdev_raidz_log2[a ^ 1];
958eda14cbcSMatt Macy 
959eda14cbcSMatt Macy 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
960eda14cbcSMatt Macy 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
961eda14cbcSMatt Macy 
962eda14cbcSMatt Macy 	ASSERT3U(xsize, >=, ysize);
963eda14cbcSMatt Macy 	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
964eda14cbcSMatt Macy 
965eda14cbcSMatt Macy 	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
966eda14cbcSMatt Macy 	    vdev_raidz_reconst_pq_func, &rpq);
967eda14cbcSMatt Macy 	(void) abd_iterate_func(xd, ysize, xsize - ysize,
968eda14cbcSMatt Macy 	    vdev_raidz_reconst_pq_tail_func, &rpq);
969eda14cbcSMatt Macy 
970eda14cbcSMatt Macy 	abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
971eda14cbcSMatt Macy 	abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
972eda14cbcSMatt Macy 
973eda14cbcSMatt Macy 	/*
974eda14cbcSMatt Macy 	 * Restore the saved parity data.
975eda14cbcSMatt Macy 	 */
976eda14cbcSMatt Macy 	rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
977eda14cbcSMatt Macy 	rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;
978eda14cbcSMatt Macy 
979eda14cbcSMatt Macy 	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
980eda14cbcSMatt Macy }
981eda14cbcSMatt Macy 
982eda14cbcSMatt Macy /* BEGIN CSTYLED */
983eda14cbcSMatt Macy /*
984eda14cbcSMatt Macy  * In the general case of reconstruction, we must solve the system of linear
985eda14cbcSMatt Macy  * equations defined by the coefficients used to generate parity as well as
986eda14cbcSMatt Macy  * the contents of the data and parity disks. This can be expressed with
987eda14cbcSMatt Macy  * vectors for the original data (D) and the actual data (d) and parity (p)
988eda14cbcSMatt Macy  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
989eda14cbcSMatt Macy  *
990eda14cbcSMatt Macy  *            __   __                     __     __
991eda14cbcSMatt Macy  *            |     |         __     __   |  p_0  |
992eda14cbcSMatt Macy  *            |  V  |         |  D_0  |   | p_m-1 |
993eda14cbcSMatt Macy  *            |     |    x    |   :   | = |  d_0  |
994eda14cbcSMatt Macy  *            |  I  |         | D_n-1 |   |   :   |
995eda14cbcSMatt Macy  *            |     |         ~~     ~~   | d_n-1 |
996eda14cbcSMatt Macy  *            ~~   ~~                     ~~     ~~
997eda14cbcSMatt Macy  *
998eda14cbcSMatt Macy  * I is simply a square identity matrix of size n, and V is a vandermonde
999eda14cbcSMatt Macy  * matrix defined by the coefficients we chose for the various parity columns
1000eda14cbcSMatt Macy  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
1001eda14cbcSMatt Macy  * computation as well as linear separability.
1002eda14cbcSMatt Macy  *
1003eda14cbcSMatt Macy  *      __               __               __     __
1004eda14cbcSMatt Macy  *      |   1   ..  1 1 1 |               |  p_0  |
1005eda14cbcSMatt Macy  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
1006eda14cbcSMatt Macy  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
1007eda14cbcSMatt Macy  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
1008eda14cbcSMatt Macy  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
1009eda14cbcSMatt Macy  *      |   :       : : : |   |   :   |   |  d_2  |
1010eda14cbcSMatt Macy  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
1011eda14cbcSMatt Macy  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
1012eda14cbcSMatt Macy  *      |   0   ..  0 0 1 |               | d_n-1 |
1013eda14cbcSMatt Macy  *      ~~               ~~               ~~     ~~
1014eda14cbcSMatt Macy  *
1015eda14cbcSMatt Macy  * Note that I, V, d, and p are known. To compute D, we must invert the
1016eda14cbcSMatt Macy  * matrix and use the known data and parity values to reconstruct the unknown
1017eda14cbcSMatt Macy  * data values. We begin by removing the rows in V|I and d|p that correspond
1018eda14cbcSMatt Macy  * to failed or missing columns; we then make V|I square (n x n) and d|p
1019eda14cbcSMatt Macy  * sized n by removing rows corresponding to unused parity from the bottom up
1020eda14cbcSMatt Macy  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
1021eda14cbcSMatt Macy  * using Gauss-Jordan elimination. In the example below we use m=3 parity
1022eda14cbcSMatt Macy  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
1023eda14cbcSMatt Macy  *           __                               __
1024eda14cbcSMatt Macy  *           |  1   1   1   1   1   1   1   1  |
1025eda14cbcSMatt Macy  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
1026eda14cbcSMatt Macy  *           |  19 205 116  29  64  16  4   1  |      / /
1027eda14cbcSMatt Macy  *           |  1   0   0   0   0   0   0   0  |     / /
1028eda14cbcSMatt Macy  *           |  0   1   0   0   0   0   0   0  | <--' /
1029eda14cbcSMatt Macy  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
1030eda14cbcSMatt Macy  *           |  0   0   0   1   0   0   0   0  |
1031eda14cbcSMatt Macy  *           |  0   0   0   0   1   0   0   0  |
1032eda14cbcSMatt Macy  *           |  0   0   0   0   0   1   0   0  |
1033eda14cbcSMatt Macy  *           |  0   0   0   0   0   0   1   0  |
1034eda14cbcSMatt Macy  *           |  0   0   0   0   0   0   0   1  |
1035eda14cbcSMatt Macy  *           ~~                               ~~
1036eda14cbcSMatt Macy  *           __                               __
1037eda14cbcSMatt Macy  *           |  1   1   1   1   1   1   1   1  |
1038eda14cbcSMatt Macy  *           | 128  64  32  16  8   4   2   1  |
1039eda14cbcSMatt Macy  *           |  19 205 116  29  64  16  4   1  |
1040eda14cbcSMatt Macy  *           |  1   0   0   0   0   0   0   0  |
1041eda14cbcSMatt Macy  *           |  0   1   0   0   0   0   0   0  |
1042eda14cbcSMatt Macy  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
1043eda14cbcSMatt Macy  *           |  0   0   0   1   0   0   0   0  |
1044eda14cbcSMatt Macy  *           |  0   0   0   0   1   0   0   0  |
1045eda14cbcSMatt Macy  *           |  0   0   0   0   0   1   0   0  |
1046eda14cbcSMatt Macy  *           |  0   0   0   0   0   0   1   0  |
1047eda14cbcSMatt Macy  *           |  0   0   0   0   0   0   0   1  |
1048eda14cbcSMatt Macy  *           ~~                               ~~
1049eda14cbcSMatt Macy  *
1050eda14cbcSMatt Macy  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1051eda14cbcSMatt Macy  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1052eda14cbcSMatt Macy  * matrix is not singular.
1053eda14cbcSMatt Macy  * __                                                                 __
1054eda14cbcSMatt Macy  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1055eda14cbcSMatt Macy  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1056eda14cbcSMatt Macy  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1057eda14cbcSMatt Macy  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1058eda14cbcSMatt Macy  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1059eda14cbcSMatt Macy  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1060eda14cbcSMatt Macy  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1061eda14cbcSMatt Macy  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1062eda14cbcSMatt Macy  * ~~                                                                 ~~
1063eda14cbcSMatt Macy  * __                                                                 __
1064eda14cbcSMatt Macy  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1065eda14cbcSMatt Macy  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1066eda14cbcSMatt Macy  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1067eda14cbcSMatt Macy  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1068eda14cbcSMatt Macy  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1069eda14cbcSMatt Macy  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1070eda14cbcSMatt Macy  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1071eda14cbcSMatt Macy  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1072eda14cbcSMatt Macy  * ~~                                                                 ~~
1073eda14cbcSMatt Macy  * __                                                                 __
1074eda14cbcSMatt Macy  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1075eda14cbcSMatt Macy  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1076eda14cbcSMatt Macy  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1077eda14cbcSMatt Macy  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1078eda14cbcSMatt Macy  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1079eda14cbcSMatt Macy  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1080eda14cbcSMatt Macy  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1081eda14cbcSMatt Macy  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1082eda14cbcSMatt Macy  * ~~                                                                 ~~
1083eda14cbcSMatt Macy  * __                                                                 __
1084eda14cbcSMatt Macy  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1085eda14cbcSMatt Macy  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1086eda14cbcSMatt Macy  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1087eda14cbcSMatt Macy  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1088eda14cbcSMatt Macy  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1089eda14cbcSMatt Macy  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1090eda14cbcSMatt Macy  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1091eda14cbcSMatt Macy  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1092eda14cbcSMatt Macy  * ~~                                                                 ~~
1093eda14cbcSMatt Macy  * __                                                                 __
1094eda14cbcSMatt Macy  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1095eda14cbcSMatt Macy  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1096eda14cbcSMatt Macy  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1097eda14cbcSMatt Macy  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1098eda14cbcSMatt Macy  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1099eda14cbcSMatt Macy  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1100eda14cbcSMatt Macy  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1101eda14cbcSMatt Macy  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1102eda14cbcSMatt Macy  * ~~                                                                 ~~
1103eda14cbcSMatt Macy  * __                                                                 __
1104eda14cbcSMatt Macy  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1105eda14cbcSMatt Macy  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1106eda14cbcSMatt Macy  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1107eda14cbcSMatt Macy  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1108eda14cbcSMatt Macy  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1109eda14cbcSMatt Macy  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1110eda14cbcSMatt Macy  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1111eda14cbcSMatt Macy  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1112eda14cbcSMatt Macy  * ~~                                                                 ~~
1113eda14cbcSMatt Macy  *                   __                               __
1114eda14cbcSMatt Macy  *                   |  0   0   1   0   0   0   0   0  |
1115eda14cbcSMatt Macy  *                   | 167 100  5   41 159 169 217 208 |
1116eda14cbcSMatt Macy  *                   | 166 100  4   40 158 168 216 209 |
1117eda14cbcSMatt Macy  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1118eda14cbcSMatt Macy  *                   |  0   0   0   0   1   0   0   0  |
1119eda14cbcSMatt Macy  *                   |  0   0   0   0   0   1   0   0  |
1120eda14cbcSMatt Macy  *                   |  0   0   0   0   0   0   1   0  |
1121eda14cbcSMatt Macy  *                   |  0   0   0   0   0   0   0   1  |
1122eda14cbcSMatt Macy  *                   ~~                               ~~
1123eda14cbcSMatt Macy  *
1124eda14cbcSMatt Macy  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1125eda14cbcSMatt Macy  * of the missing data.
1126eda14cbcSMatt Macy  *
1127eda14cbcSMatt Macy  * As is apparent from the example above, the only non-trivial rows in the
1128eda14cbcSMatt Macy  * inverse matrix correspond to the data disks that we're trying to
1129eda14cbcSMatt Macy  * reconstruct. Indeed, those are the only rows we need as the others would
1130eda14cbcSMatt Macy  * only be useful for reconstructing data known or assumed to be valid. For
1131eda14cbcSMatt Macy  * that reason, we only build the coefficients in the rows that correspond to
1132eda14cbcSMatt Macy  * targeted columns.
1133eda14cbcSMatt Macy  */
1134eda14cbcSMatt Macy /* END CSTYLED */
1135eda14cbcSMatt Macy 
1136eda14cbcSMatt Macy static void
1137eda14cbcSMatt Macy vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1138eda14cbcSMatt Macy     uint8_t **rows)
1139eda14cbcSMatt Macy {
1140eda14cbcSMatt Macy 	int i, j;
1141eda14cbcSMatt Macy 	int pow;
1142eda14cbcSMatt Macy 
1143eda14cbcSMatt Macy 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1144eda14cbcSMatt Macy 
1145eda14cbcSMatt Macy 	/*
1146eda14cbcSMatt Macy 	 * Fill in the missing rows of interest.
1147eda14cbcSMatt Macy 	 */
1148eda14cbcSMatt Macy 	for (i = 0; i < nmap; i++) {
1149eda14cbcSMatt Macy 		ASSERT3S(0, <=, map[i]);
1150eda14cbcSMatt Macy 		ASSERT3S(map[i], <=, 2);
1151eda14cbcSMatt Macy 
1152eda14cbcSMatt Macy 		pow = map[i] * n;
1153eda14cbcSMatt Macy 		if (pow > 255)
1154eda14cbcSMatt Macy 			pow -= 255;
1155eda14cbcSMatt Macy 		ASSERT(pow <= 255);
1156eda14cbcSMatt Macy 
1157eda14cbcSMatt Macy 		for (j = 0; j < n; j++) {
1158eda14cbcSMatt Macy 			pow -= map[i];
1159eda14cbcSMatt Macy 			if (pow < 0)
1160eda14cbcSMatt Macy 				pow += 255;
1161eda14cbcSMatt Macy 			rows[i][j] = vdev_raidz_pow2[pow];
1162eda14cbcSMatt Macy 		}
1163eda14cbcSMatt Macy 	}
1164eda14cbcSMatt Macy }
1165eda14cbcSMatt Macy 
/*
 * Invert the reconstruction matrix in place using Gauss-Jordan elimination
 * over GF(2^8).  On entry, rows[] holds the nmissing rows (each of width n)
 * of the parity-generation matrix selected by the caller; on return,
 * invrows[] holds the corresponding rows of its inverse, which express each
 * missing data column as a GF(2^8) linear combination of the n intact
 * columns listed in used[].
 *
 * missing[] gives the indices (relative to the first data column) of the
 * data columns being reconstructed.  Field arithmetic uses the log/exp
 * tables: multiplication is addition of logs mod 255, addition is XOR.
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 * Start from the identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 * An intact data column contributes an identity row, so eliminating
	 * it simply moves its coefficient into the inverse and zeroes it in
	 * rows[].
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		/* Eliminate this pivot column from every other row. */
		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				/* GF(2^8) subtraction is XOR. */
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}
1259eda14cbcSMatt Macy 
/*
 * Regenerate the missing data columns using the inverted matrix computed by
 * vdev_raidz_matrix_invert().  For each of the n intact columns listed in
 * used[], every byte is multiplied (in GF(2^8), via the log/exp tables) by
 * the matching coefficient from each inverse row and XORed into the
 * corresponding missing column's buffer.
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL };
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 };
	uint8_t log = 0;
	uint8_t val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	/*
	 * Precompute the logs of the inverse-matrix coefficients so the
	 * inner loop only needs a table lookup and an addition mod 255.
	 */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = abd_to_buf(rm->rm_col[c].rc_abd);
		ccount = rm->rm_col[c].rc_size;
		/* Locate the destination buffer for each missing column. */
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				/* Columns may be shorter than the source. */
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					/* Multiply: add logs mod 255. */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/*
				 * The first used column initializes the
				 * result; subsequent columns accumulate
				 * into it via XOR.
				 */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}
1335eda14cbcSMatt Macy 
/*
 * Reconstruct an arbitrary combination of missing data and parity columns
 * by solving a system of linear equations over GF(2^8): select one intact
 * parity column for each missing data column, build the matching rows of
 * the parity-generation matrix, invert them, and apply the inverse to the
 * surviving columns to regenerate the missing data.
 *
 * tgts[] lists the (sorted) column indices to reconstruct.  Returns a
 * bitmap with bit c set for each parity column that was used, which
 * identifies the reconstruction performed.
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	abd_t **bufs = NULL;

	int code = 0;

	/*
	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
	 * temporary linear ABDs.
	 */
	if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
		bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);

		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
			raidz_col_t *col = &rm->rm_col[c];

			bufs[c] = col->rc_abd;
			col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
			abd_copy(col->rc_abd, bufs[c], col->rc_size);
		}
	}

	/* Matrix width: one entry per data column in the map. */
	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/*
	 * One scratch allocation holds the matrix rows, the inverse rows,
	 * and the used[] column list, carved out in that order below.
	 */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	/*
	 * used[] lists the intact columns to read from: first the selected
	 * parity columns, then every surviving data column.
	 */
	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	/*
	 * copy back from temporary linear abds and free them
	 */
	if (bufs) {
		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
			raidz_col_t *col = &rm->rm_col[c];

			abd_copy(bufs[c], col->rc_abd, col->rc_size);
			abd_free(col->rc_abd);
			col->rc_abd = bufs[c];
		}
		kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
	}

	return (code);
}
1472eda14cbcSMatt Macy 
/*
 * Reconstruct the columns whose indices are listed (sorted ascending) in
 * t[], plus any other columns that already have errors recorded in the
 * map.  Tries the vectorized math implementation first, then the
 * hand-coded P/Q/PQ routines, and finally the general matrix
 * reconstruction.
 *
 * Returns a code identifying the reconstruction performed (for the
 * original implementation, a bitmap of the parity columns used).
 */
int
vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c, ret;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	/*
	 * Merge the caller's targets with any columns that have recorded
	 * errors, counting good and bad parity/data columns as we go.
	 */
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	/* tgts[] is sorted, so the data targets follow the parity targets. */
	dt = &tgts[nbadparity];

	/* Reconstruct using the new math implementation */
	ret = vdev_raidz_math_reconstruct(rm, parity_valid, dt, nbaddata);
	if (ret != RAIDZ_ORIGINAL_IMPL)
		return (ret);

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	switch (nbaddata) {
	case 1:
		if (parity_valid[VDEV_RAIDZ_P])
			return (vdev_raidz_reconstruct_p(rm, dt, 1));

		ASSERT(rm->rm_firstdatacol > 1);

		if (parity_valid[VDEV_RAIDZ_Q])
			return (vdev_raidz_reconstruct_q(rm, dt, 1));

		ASSERT(rm->rm_firstdatacol > 2);
		break;

	case 2:
		ASSERT(rm->rm_firstdatacol > 1);

		if (parity_valid[VDEV_RAIDZ_P] &&
		    parity_valid[VDEV_RAIDZ_Q])
			return (vdev_raidz_reconstruct_pq(rm, dt, 2));

		ASSERT(rm->rm_firstdatacol > 2);

		break;
	}

	/* Fall back to the general matrix-based reconstruction. */
	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}
1554eda14cbcSMatt Macy 
/*
 * Open a RAID-Z vdev: validate the parity configuration, open all child
 * vdevs, and derive the aggregate size and ashift values from the children.
 *
 * Returns 0 on success, EINVAL for a bad parity/child-count configuration,
 * or the last child open error when more children failed than parity can
 * cover.
 */
static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	vdev_t *cvd;
	uint64_t nparity = vd->vdev_nparity;
	int c;
	int lasterror = 0;
	int numerrors = 0;

	ASSERT(nparity > 0);

	/* Need at least one data disk beyond the parity disks. */
	if (nparity > VDEV_RAIDZ_MAXPARITY ||
	    vd->vdev_children < nparity + 1) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	vdev_open_children(vd);

	for (c = 0; c < vd->vdev_children; c++) {
		cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error != 0) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}

		/* The smallest child bounds the usable per-column size. */
		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
		*physical_ashift = MAX(*physical_ashift,
		    cvd->vdev_physical_ashift);
	}

	*asize *= vd->vdev_children;
	*max_asize *= vd->vdev_children;

	/* More failed children than parity columns: no redundancy left. */
	if (numerrors > nparity) {
		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}
1601eda14cbcSMatt Macy 
1602eda14cbcSMatt Macy static void
1603eda14cbcSMatt Macy vdev_raidz_close(vdev_t *vd)
1604eda14cbcSMatt Macy {
1605eda14cbcSMatt Macy 	int c;
1606eda14cbcSMatt Macy 
1607eda14cbcSMatt Macy 	for (c = 0; c < vd->vdev_children; c++)
1608eda14cbcSMatt Macy 		vdev_close(vd->vdev_child[c]);
1609eda14cbcSMatt Macy }
1610eda14cbcSMatt Macy 
1611eda14cbcSMatt Macy static uint64_t
1612eda14cbcSMatt Macy vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1613eda14cbcSMatt Macy {
1614eda14cbcSMatt Macy 	uint64_t asize;
1615eda14cbcSMatt Macy 	uint64_t ashift = vd->vdev_top->vdev_ashift;
1616eda14cbcSMatt Macy 	uint64_t cols = vd->vdev_children;
1617eda14cbcSMatt Macy 	uint64_t nparity = vd->vdev_nparity;
1618eda14cbcSMatt Macy 
1619eda14cbcSMatt Macy 	asize = ((psize - 1) >> ashift) + 1;
1620eda14cbcSMatt Macy 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1621eda14cbcSMatt Macy 	asize = roundup(asize, nparity + 1) << ashift;
1622eda14cbcSMatt Macy 
1623eda14cbcSMatt Macy 	return (asize);
1624eda14cbcSMatt Macy }
1625eda14cbcSMatt Macy 
1626eda14cbcSMatt Macy static void
1627eda14cbcSMatt Macy vdev_raidz_child_done(zio_t *zio)
1628eda14cbcSMatt Macy {
1629eda14cbcSMatt Macy 	raidz_col_t *rc = zio->io_private;
1630eda14cbcSMatt Macy 
1631eda14cbcSMatt Macy 	rc->rc_error = zio->io_error;
1632eda14cbcSMatt Macy 	rc->rc_tried = 1;
1633eda14cbcSMatt Macy 	rc->rc_skipped = 0;
1634eda14cbcSMatt Macy }
1635eda14cbcSMatt Macy 
/*
 * Debug-only sanity check (compiled only under ZFS_DEBUG): verify that the
 * column's recorded physical offset and size agree with vdev_xlate()'s
 * translation of the zio's logical range onto the child vdev.
 */
static void
vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col)
{
#ifdef ZFS_DEBUG
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;

	range_seg64_t logical_rs, physical_rs;
	logical_rs.rs_start = zio->io_offset;
	logical_rs.rs_end = logical_rs.rs_start +
	    vdev_raidz_asize(zio->io_vd, zio->io_size);

	raidz_col_t *rc = &rm->rm_col[col];
	vdev_t *cvd = vd->vdev_child[rc->rc_devidx];

	vdev_xlate(cvd, &logical_rs, &physical_rs);
	ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
	ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
	/*
	 * It would be nice to assert that rs_end is equal
	 * to rc_offset + rc_size but there might be an
	 * optional I/O at the end that is not accounted in
	 * rc_size.
	 */
	if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
		/* The optional I/O is exactly one sector. */
		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
		    rc->rc_size + (1 << tvd->vdev_ashift));
	} else {
		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
	}
#endif
}
1668eda14cbcSMatt Macy 
1669eda14cbcSMatt Macy /*
1670eda14cbcSMatt Macy  * Start an IO operation on a RAIDZ VDev
1671eda14cbcSMatt Macy  *
1672eda14cbcSMatt Macy  * Outline:
1673eda14cbcSMatt Macy  * - For write operations:
1674eda14cbcSMatt Macy  *   1. Generate the parity data
1675eda14cbcSMatt Macy  *   2. Create child zio write operations to each column's vdev, for both
1676eda14cbcSMatt Macy  *      data and parity.
1677eda14cbcSMatt Macy  *   3. If the column skips any sectors for padding, create optional dummy
1678eda14cbcSMatt Macy  *      write zio children for those areas to improve aggregation continuity.
1679eda14cbcSMatt Macy  * - For read operations:
1680eda14cbcSMatt Macy  *   1. Create child zio read operations to each data column's vdev to read
1681eda14cbcSMatt Macy  *      the range of data required for zio.
1682eda14cbcSMatt Macy  *   2. If this is a scrub or resilver operation, or if any of the data
1683eda14cbcSMatt Macy  *      vdevs have had errors, then create zio read operations to the parity
1684eda14cbcSMatt Macy  *      columns' VDevs as well.
1685eda14cbcSMatt Macy  */
static void
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, i;

	/* Lay the logical I/O out across the columns of the raidz group. */
	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_generate_parity(rm);

		/* Issue one child write per column, parity and data alike. */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			/*
			 * Verify physical to logical translation.
			 */
			vdev_raidz_io_verify(zio, rm, c);

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		/*
		 * Generate optional I/Os for any skipped sectors to improve
		 * aggregation contiguity.
		 */
		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
			ASSERT(c <= rm->rm_scols);
			/* Skip sectors wrap around to the first column. */
			if (c == rm->rm_scols)
				c = 0;
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset + rc->rc_size, NULL,
			    1 << tvd->vdev_ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
		}

		zio_execute(zio);
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		/* Unreadable child: record the error, don't issue I/O. */
		if (!vdev_readable(cvd)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ENXIO);
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		/* Child is known to be missing data for this txg. */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ESTALE);
			rc->rc_skipped = 1;
			continue;
		}
		/*
		 * Always read the data columns; read parity as well when
		 * data is already known missing or this is a scrub or
		 * resilver.
		 */
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	zio_execute(zio);
}
1779eda14cbcSMatt Macy 
1780eda14cbcSMatt Macy 
1781eda14cbcSMatt Macy /*
1782eda14cbcSMatt Macy  * Report a checksum error for a child of a RAID-Z device.
1783eda14cbcSMatt Macy  */
static void
raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
{
	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];

	/* Speculative reads don't generate ereports or bump counters. */
	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		zio_bad_cksum_t zbc;
		raidz_map_t *rm = zio->io_vsd;

		zbc.zbc_has_cksum = 0;
		zbc.zbc_injected = rm->rm_ecksuminjected;

		int ret = zfs_ereport_post_checksum(zio->io_spa, vd,
		    &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
		    rc->rc_abd, bad_data, &zbc);
		/*
		 * Only count the error when the ereport was actually posted;
		 * EALREADY means it was suppressed (NOTE(review): presumably
		 * a duplicate/rate-limited report -- confirm against
		 * zfs_ereport_post_checksum()).
		 */
		if (ret != EALREADY) {
			mutex_enter(&vd->vdev_stat_lock);
			vd->vdev_stat.vs_checksum_errors++;
			mutex_exit(&vd->vdev_stat_lock);
		}
	}
}
1806eda14cbcSMatt Macy 
1807eda14cbcSMatt Macy /*
1808eda14cbcSMatt Macy  * We keep track of whether or not there were any injected errors, so that
1809eda14cbcSMatt Macy  * any ereports we generate can note it.
1810eda14cbcSMatt Macy  */
1811eda14cbcSMatt Macy static int
1812eda14cbcSMatt Macy raidz_checksum_verify(zio_t *zio)
1813eda14cbcSMatt Macy {
1814eda14cbcSMatt Macy 	zio_bad_cksum_t zbc;
1815eda14cbcSMatt Macy 	raidz_map_t *rm = zio->io_vsd;
1816eda14cbcSMatt Macy 
1817eda14cbcSMatt Macy 	bzero(&zbc, sizeof (zio_bad_cksum_t));
1818eda14cbcSMatt Macy 
1819eda14cbcSMatt Macy 	int ret = zio_checksum_error(zio, &zbc);
1820eda14cbcSMatt Macy 	if (ret != 0 && zbc.zbc_injected != 0)
1821eda14cbcSMatt Macy 		rm->rm_ecksuminjected = 1;
1822eda14cbcSMatt Macy 
1823eda14cbcSMatt Macy 	return (ret);
1824eda14cbcSMatt Macy }
1825eda14cbcSMatt Macy 
1826eda14cbcSMatt Macy /*
1827eda14cbcSMatt Macy  * Generate the parity from the data columns. If we tried and were able to
1828eda14cbcSMatt Macy  * read the parity without error, verify that the generated parity matches the
1829eda14cbcSMatt Macy  * data we read. If it doesn't, we fire off a checksum error. Return the
1830eda14cbcSMatt Macy  * number such failures.
1831eda14cbcSMatt Macy  */
static int
raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
{
	abd_t *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	blkptr_t *bp = zio->io_bp;
	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));

	/* Blocks written without parity have nothing to verify. */
	if (checksum == ZIO_CHECKSUM_NOPARITY)
		return (ret);

	/*
	 * Save a copy of each parity column that was read successfully.
	 * orig[c] is allocated only when (rc_tried && rc_error == 0); the
	 * identical condition below guards its use and its free.
	 */
	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;

		orig[c] = abd_alloc_sametype(rc->rc_abd, rc->rc_size);
		abd_copy(orig[c], rc->rc_abd, rc->rc_size);
	}

	/* Regenerate parity from the data columns, in place. */
	vdev_raidz_generate_parity(rm);

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		/* Mismatch means the parity we read from disk was bad. */
		if (abd_cmp(orig[c], rc->rc_abd) != 0) {
			raidz_checksum_error(zio, rc, orig[c]);
			rc->rc_error = SET_ERROR(ECKSUM);
			ret++;
		}
		abd_free(orig[c]);
	}

	return (ret);
}
1871eda14cbcSMatt Macy 
1872eda14cbcSMatt Macy static int
1873eda14cbcSMatt Macy vdev_raidz_worst_error(raidz_map_t *rm)
1874eda14cbcSMatt Macy {
1875eda14cbcSMatt Macy 	int error = 0;
1876eda14cbcSMatt Macy 
1877eda14cbcSMatt Macy 	for (int c = 0; c < rm->rm_cols; c++)
1878eda14cbcSMatt Macy 		error = zio_worst_error(error, rm->rm_col[c].rc_error);
1879eda14cbcSMatt Macy 
1880eda14cbcSMatt Macy 	return (error);
1881eda14cbcSMatt Macy }
1882eda14cbcSMatt Macy 
1883eda14cbcSMatt Macy /*
1884eda14cbcSMatt Macy  * Iterate over all combinations of bad data and attempt a reconstruction.
1885eda14cbcSMatt Macy  * Note that the algorithm below is non-optimal because it doesn't take into
1886eda14cbcSMatt Macy  * account how reconstruction is actually performed. For example, with
1887eda14cbcSMatt Macy  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1888eda14cbcSMatt Macy  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1889eda14cbcSMatt Macy  * cases we'd only use parity information in column 0.
1890eda14cbcSMatt Macy  */
1891eda14cbcSMatt Macy static int
1892eda14cbcSMatt Macy vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1893eda14cbcSMatt Macy {
1894eda14cbcSMatt Macy 	raidz_map_t *rm = zio->io_vsd;
1895eda14cbcSMatt Macy 	raidz_col_t *rc;
1896eda14cbcSMatt Macy 	abd_t *orig[VDEV_RAIDZ_MAXPARITY];
1897eda14cbcSMatt Macy 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1898eda14cbcSMatt Macy 	int *tgts = &tstore[1];
1899eda14cbcSMatt Macy 	int curr, next, i, c, n;
1900eda14cbcSMatt Macy 	int code, ret = 0;
1901eda14cbcSMatt Macy 
1902eda14cbcSMatt Macy 	ASSERT(total_errors < rm->rm_firstdatacol);
1903eda14cbcSMatt Macy 
1904eda14cbcSMatt Macy 	/*
1905eda14cbcSMatt Macy 	 * This simplifies one edge condition.
1906eda14cbcSMatt Macy 	 */
1907eda14cbcSMatt Macy 	tgts[-1] = -1;
1908eda14cbcSMatt Macy 
1909eda14cbcSMatt Macy 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1910eda14cbcSMatt Macy 		/*
1911eda14cbcSMatt Macy 		 * Initialize the targets array by finding the first n columns
1912eda14cbcSMatt Macy 		 * that contain no error.
1913eda14cbcSMatt Macy 		 *
1914eda14cbcSMatt Macy 		 * If there were no data errors, we need to ensure that we're
1915eda14cbcSMatt Macy 		 * always explicitly attempting to reconstruct at least one
1916eda14cbcSMatt Macy 		 * data column. To do this, we simply push the highest target
1917eda14cbcSMatt Macy 		 * up into the data columns.
1918eda14cbcSMatt Macy 		 */
1919eda14cbcSMatt Macy 		for (c = 0, i = 0; i < n; i++) {
1920eda14cbcSMatt Macy 			if (i == n - 1 && data_errors == 0 &&
1921eda14cbcSMatt Macy 			    c < rm->rm_firstdatacol) {
1922eda14cbcSMatt Macy 				c = rm->rm_firstdatacol;
1923eda14cbcSMatt Macy 			}
1924eda14cbcSMatt Macy 
1925eda14cbcSMatt Macy 			while (rm->rm_col[c].rc_error != 0) {
1926eda14cbcSMatt Macy 				c++;
1927eda14cbcSMatt Macy 				ASSERT3S(c, <, rm->rm_cols);
1928eda14cbcSMatt Macy 			}
1929eda14cbcSMatt Macy 
1930eda14cbcSMatt Macy 			tgts[i] = c++;
1931eda14cbcSMatt Macy 		}
1932eda14cbcSMatt Macy 
1933eda14cbcSMatt Macy 		/*
1934eda14cbcSMatt Macy 		 * Setting tgts[n] simplifies the other edge condition.
1935eda14cbcSMatt Macy 		 */
1936eda14cbcSMatt Macy 		tgts[n] = rm->rm_cols;
1937eda14cbcSMatt Macy 
1938eda14cbcSMatt Macy 		/*
1939eda14cbcSMatt Macy 		 * These buffers were allocated in previous iterations.
1940eda14cbcSMatt Macy 		 */
1941eda14cbcSMatt Macy 		for (i = 0; i < n - 1; i++) {
1942eda14cbcSMatt Macy 			ASSERT(orig[i] != NULL);
1943eda14cbcSMatt Macy 		}
1944eda14cbcSMatt Macy 
1945eda14cbcSMatt Macy 		orig[n - 1] = abd_alloc_sametype(rm->rm_col[0].rc_abd,
1946eda14cbcSMatt Macy 		    rm->rm_col[0].rc_size);
1947eda14cbcSMatt Macy 
1948eda14cbcSMatt Macy 		curr = 0;
1949eda14cbcSMatt Macy 		next = tgts[curr];
1950eda14cbcSMatt Macy 
1951eda14cbcSMatt Macy 		while (curr != n) {
1952eda14cbcSMatt Macy 			tgts[curr] = next;
1953eda14cbcSMatt Macy 			curr = 0;
1954eda14cbcSMatt Macy 
1955eda14cbcSMatt Macy 			/*
1956eda14cbcSMatt Macy 			 * Save off the original data that we're going to
1957eda14cbcSMatt Macy 			 * attempt to reconstruct.
1958eda14cbcSMatt Macy 			 */
1959eda14cbcSMatt Macy 			for (i = 0; i < n; i++) {
1960eda14cbcSMatt Macy 				ASSERT(orig[i] != NULL);
1961eda14cbcSMatt Macy 				c = tgts[i];
1962eda14cbcSMatt Macy 				ASSERT3S(c, >=, 0);
1963eda14cbcSMatt Macy 				ASSERT3S(c, <, rm->rm_cols);
1964eda14cbcSMatt Macy 				rc = &rm->rm_col[c];
1965eda14cbcSMatt Macy 				abd_copy(orig[i], rc->rc_abd, rc->rc_size);
1966eda14cbcSMatt Macy 			}
1967eda14cbcSMatt Macy 
1968eda14cbcSMatt Macy 			/*
1969eda14cbcSMatt Macy 			 * Attempt a reconstruction and exit the outer loop on
1970eda14cbcSMatt Macy 			 * success.
1971eda14cbcSMatt Macy 			 */
1972eda14cbcSMatt Macy 			code = vdev_raidz_reconstruct(rm, tgts, n);
1973eda14cbcSMatt Macy 			if (raidz_checksum_verify(zio) == 0) {
1974eda14cbcSMatt Macy 
1975eda14cbcSMatt Macy 				for (i = 0; i < n; i++) {
1976eda14cbcSMatt Macy 					c = tgts[i];
1977eda14cbcSMatt Macy 					rc = &rm->rm_col[c];
1978eda14cbcSMatt Macy 					ASSERT(rc->rc_error == 0);
1979eda14cbcSMatt Macy 					if (rc->rc_tried)
1980eda14cbcSMatt Macy 						raidz_checksum_error(zio, rc,
1981eda14cbcSMatt Macy 						    orig[i]);
1982eda14cbcSMatt Macy 					rc->rc_error = SET_ERROR(ECKSUM);
1983eda14cbcSMatt Macy 				}
1984eda14cbcSMatt Macy 
1985eda14cbcSMatt Macy 				ret = code;
1986eda14cbcSMatt Macy 				goto done;
1987eda14cbcSMatt Macy 			}
1988eda14cbcSMatt Macy 
1989eda14cbcSMatt Macy 			/*
1990eda14cbcSMatt Macy 			 * Restore the original data.
1991eda14cbcSMatt Macy 			 */
1992eda14cbcSMatt Macy 			for (i = 0; i < n; i++) {
1993eda14cbcSMatt Macy 				c = tgts[i];
1994eda14cbcSMatt Macy 				rc = &rm->rm_col[c];
1995eda14cbcSMatt Macy 				abd_copy(rc->rc_abd, orig[i], rc->rc_size);
1996eda14cbcSMatt Macy 			}
1997eda14cbcSMatt Macy 
1998eda14cbcSMatt Macy 			do {
1999eda14cbcSMatt Macy 				/*
2000eda14cbcSMatt Macy 				 * Find the next valid column after the curr
2001eda14cbcSMatt Macy 				 * position..
2002eda14cbcSMatt Macy 				 */
2003eda14cbcSMatt Macy 				for (next = tgts[curr] + 1;
2004eda14cbcSMatt Macy 				    next < rm->rm_cols &&
2005eda14cbcSMatt Macy 				    rm->rm_col[next].rc_error != 0; next++)
2006eda14cbcSMatt Macy 					continue;
2007eda14cbcSMatt Macy 
2008eda14cbcSMatt Macy 				ASSERT(next <= tgts[curr + 1]);
2009eda14cbcSMatt Macy 
2010eda14cbcSMatt Macy 				/*
2011eda14cbcSMatt Macy 				 * If that spot is available, we're done here.
2012eda14cbcSMatt Macy 				 */
2013eda14cbcSMatt Macy 				if (next != tgts[curr + 1])
2014eda14cbcSMatt Macy 					break;
2015eda14cbcSMatt Macy 
2016eda14cbcSMatt Macy 				/*
2017eda14cbcSMatt Macy 				 * Otherwise, find the next valid column after
2018eda14cbcSMatt Macy 				 * the previous position.
2019eda14cbcSMatt Macy 				 */
2020eda14cbcSMatt Macy 				for (c = tgts[curr - 1] + 1;
2021eda14cbcSMatt Macy 				    rm->rm_col[c].rc_error != 0; c++)
2022eda14cbcSMatt Macy 					continue;
2023eda14cbcSMatt Macy 
2024eda14cbcSMatt Macy 				tgts[curr] = c;
2025eda14cbcSMatt Macy 				curr++;
2026eda14cbcSMatt Macy 
2027eda14cbcSMatt Macy 			} while (curr != n);
2028eda14cbcSMatt Macy 		}
2029eda14cbcSMatt Macy 	}
2030eda14cbcSMatt Macy 	n--;
2031eda14cbcSMatt Macy done:
2032eda14cbcSMatt Macy 	for (i = 0; i < n; i++)
2033eda14cbcSMatt Macy 		abd_free(orig[i]);
2034eda14cbcSMatt Macy 
2035eda14cbcSMatt Macy 	return (ret);
2036eda14cbcSMatt Macy }
2037eda14cbcSMatt Macy 
2038eda14cbcSMatt Macy /*
2039eda14cbcSMatt Macy  * Complete an IO operation on a RAIDZ VDev
2040eda14cbcSMatt Macy  *
2041eda14cbcSMatt Macy  * Outline:
2042eda14cbcSMatt Macy  * - For write operations:
2043eda14cbcSMatt Macy  *   1. Check for errors on the child IOs.
2044eda14cbcSMatt Macy  *   2. Return, setting an error code if too few child VDevs were written
2045eda14cbcSMatt Macy  *      to reconstruct the data later.  Note that partial writes are
2046eda14cbcSMatt Macy  *      considered successful if they can be reconstructed at all.
2047eda14cbcSMatt Macy  * - For read operations:
2048eda14cbcSMatt Macy  *   1. Check for errors on the child IOs.
2049eda14cbcSMatt Macy  *   2. If data errors occurred:
2050eda14cbcSMatt Macy  *      a. Try to reassemble the data from the parity available.
2051eda14cbcSMatt Macy  *      b. If we haven't yet read the parity drives, read them now.
2052eda14cbcSMatt Macy  *      c. If all parity drives have been read but the data still doesn't
2053eda14cbcSMatt Macy  *         reassemble with a correct checksum, then try combinatorial
2054eda14cbcSMatt Macy  *         reconstruction.
2055eda14cbcSMatt Macy  *      d. If that doesn't work, return an error.
2056eda14cbcSMatt Macy  *   3. If there were unexpected errors or this is a resilver operation,
2057eda14cbcSMatt Macy  *      rewrite the vdevs that had errors.
2058eda14cbcSMatt Macy  */
2059eda14cbcSMatt Macy static void
2060eda14cbcSMatt Macy vdev_raidz_io_done(zio_t *zio)
2061eda14cbcSMatt Macy {
2062eda14cbcSMatt Macy 	vdev_t *vd = zio->io_vd;
2063eda14cbcSMatt Macy 	vdev_t *cvd;
2064eda14cbcSMatt Macy 	raidz_map_t *rm = zio->io_vsd;
2065eda14cbcSMatt Macy 	raidz_col_t *rc = NULL;
2066eda14cbcSMatt Macy 	int unexpected_errors = 0;
2067eda14cbcSMatt Macy 	int parity_errors = 0;
2068eda14cbcSMatt Macy 	int parity_untried = 0;
2069eda14cbcSMatt Macy 	int data_errors = 0;
2070eda14cbcSMatt Macy 	int total_errors = 0;
2071eda14cbcSMatt Macy 	int n, c;
2072eda14cbcSMatt Macy 	int tgts[VDEV_RAIDZ_MAXPARITY];
2073eda14cbcSMatt Macy 	int code;
2074eda14cbcSMatt Macy 
2075eda14cbcSMatt Macy 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
2076eda14cbcSMatt Macy 
2077eda14cbcSMatt Macy 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
2078eda14cbcSMatt Macy 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
2079eda14cbcSMatt Macy 
2080eda14cbcSMatt Macy 	for (c = 0; c < rm->rm_cols; c++) {
2081eda14cbcSMatt Macy 		rc = &rm->rm_col[c];
2082eda14cbcSMatt Macy 
2083eda14cbcSMatt Macy 		if (rc->rc_error) {
2084eda14cbcSMatt Macy 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
2085eda14cbcSMatt Macy 
2086eda14cbcSMatt Macy 			if (c < rm->rm_firstdatacol)
2087eda14cbcSMatt Macy 				parity_errors++;
2088eda14cbcSMatt Macy 			else
2089eda14cbcSMatt Macy 				data_errors++;
2090eda14cbcSMatt Macy 
2091eda14cbcSMatt Macy 			if (!rc->rc_skipped)
2092eda14cbcSMatt Macy 				unexpected_errors++;
2093eda14cbcSMatt Macy 
2094eda14cbcSMatt Macy 			total_errors++;
2095eda14cbcSMatt Macy 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
2096eda14cbcSMatt Macy 			parity_untried++;
2097eda14cbcSMatt Macy 		}
2098eda14cbcSMatt Macy 	}
2099eda14cbcSMatt Macy 
2100eda14cbcSMatt Macy 	if (zio->io_type == ZIO_TYPE_WRITE) {
2101eda14cbcSMatt Macy 		/*
2102eda14cbcSMatt Macy 		 * XXX -- for now, treat partial writes as a success.
2103eda14cbcSMatt Macy 		 * (If we couldn't write enough columns to reconstruct
2104eda14cbcSMatt Macy 		 * the data, the I/O failed.  Otherwise, good enough.)
2105eda14cbcSMatt Macy 		 *
2106eda14cbcSMatt Macy 		 * Now that we support write reallocation, it would be better
2107eda14cbcSMatt Macy 		 * to treat partial failure as real failure unless there are
2108eda14cbcSMatt Macy 		 * no non-degraded top-level vdevs left, and not update DTLs
2109eda14cbcSMatt Macy 		 * if we intend to reallocate.
2110eda14cbcSMatt Macy 		 */
2111eda14cbcSMatt Macy 		/* XXPOLICY */
2112eda14cbcSMatt Macy 		if (total_errors > rm->rm_firstdatacol)
2113eda14cbcSMatt Macy 			zio->io_error = vdev_raidz_worst_error(rm);
2114eda14cbcSMatt Macy 
2115eda14cbcSMatt Macy 		return;
2116eda14cbcSMatt Macy 	}
2117eda14cbcSMatt Macy 
2118eda14cbcSMatt Macy 	ASSERT(zio->io_type == ZIO_TYPE_READ);
2119eda14cbcSMatt Macy 	/*
2120eda14cbcSMatt Macy 	 * There are three potential phases for a read:
2121eda14cbcSMatt Macy 	 *	1. produce valid data from the columns read
2122eda14cbcSMatt Macy 	 *	2. read all disks and try again
2123eda14cbcSMatt Macy 	 *	3. perform combinatorial reconstruction
2124eda14cbcSMatt Macy 	 *
2125eda14cbcSMatt Macy 	 * Each phase is progressively both more expensive and less likely to
2126eda14cbcSMatt Macy 	 * occur. If we encounter more errors than we can repair or all phases
2127eda14cbcSMatt Macy 	 * fail, we have no choice but to return an error.
2128eda14cbcSMatt Macy 	 */
2129eda14cbcSMatt Macy 
2130eda14cbcSMatt Macy 	/*
2131eda14cbcSMatt Macy 	 * If the number of errors we saw was correctable -- less than or equal
2132eda14cbcSMatt Macy 	 * to the number of parity disks read -- attempt to produce data that
2133eda14cbcSMatt Macy 	 * has a valid checksum. Naturally, this case applies in the absence of
2134eda14cbcSMatt Macy 	 * any errors.
2135eda14cbcSMatt Macy 	 */
2136eda14cbcSMatt Macy 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
2137eda14cbcSMatt Macy 		if (data_errors == 0) {
2138eda14cbcSMatt Macy 			if (raidz_checksum_verify(zio) == 0) {
2139eda14cbcSMatt Macy 				/*
2140eda14cbcSMatt Macy 				 * If we read parity information (unnecessarily
2141eda14cbcSMatt Macy 				 * as it happens since no reconstruction was
2142eda14cbcSMatt Macy 				 * needed) regenerate and verify the parity.
2143eda14cbcSMatt Macy 				 * We also regenerate parity when resilvering
2144eda14cbcSMatt Macy 				 * so we can write it out to the failed device
2145eda14cbcSMatt Macy 				 * later.
2146eda14cbcSMatt Macy 				 */
2147eda14cbcSMatt Macy 				if (parity_errors + parity_untried <
2148eda14cbcSMatt Macy 				    rm->rm_firstdatacol ||
2149eda14cbcSMatt Macy 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2150eda14cbcSMatt Macy 					n = raidz_parity_verify(zio, rm);
2151eda14cbcSMatt Macy 					unexpected_errors += n;
2152eda14cbcSMatt Macy 					ASSERT(parity_errors + n <=
2153eda14cbcSMatt Macy 					    rm->rm_firstdatacol);
2154eda14cbcSMatt Macy 				}
2155eda14cbcSMatt Macy 				goto done;
2156eda14cbcSMatt Macy 			}
2157eda14cbcSMatt Macy 		} else {
2158eda14cbcSMatt Macy 			/*
2159eda14cbcSMatt Macy 			 * We either attempt to read all the parity columns or
2160eda14cbcSMatt Macy 			 * none of them. If we didn't try to read parity, we
2161eda14cbcSMatt Macy 			 * wouldn't be here in the correctable case. There must
2162eda14cbcSMatt Macy 			 * also have been fewer parity errors than parity
2163eda14cbcSMatt Macy 			 * columns or, again, we wouldn't be in this code path.
2164eda14cbcSMatt Macy 			 */
2165eda14cbcSMatt Macy 			ASSERT(parity_untried == 0);
2166eda14cbcSMatt Macy 			ASSERT(parity_errors < rm->rm_firstdatacol);
2167eda14cbcSMatt Macy 
2168eda14cbcSMatt Macy 			/*
2169eda14cbcSMatt Macy 			 * Identify the data columns that reported an error.
2170eda14cbcSMatt Macy 			 */
2171eda14cbcSMatt Macy 			n = 0;
2172eda14cbcSMatt Macy 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
2173eda14cbcSMatt Macy 				rc = &rm->rm_col[c];
2174eda14cbcSMatt Macy 				if (rc->rc_error != 0) {
2175eda14cbcSMatt Macy 					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2176eda14cbcSMatt Macy 					tgts[n++] = c;
2177eda14cbcSMatt Macy 				}
2178eda14cbcSMatt Macy 			}
2179eda14cbcSMatt Macy 
2180eda14cbcSMatt Macy 			ASSERT(rm->rm_firstdatacol >= n);
2181eda14cbcSMatt Macy 
2182eda14cbcSMatt Macy 			code = vdev_raidz_reconstruct(rm, tgts, n);
2183eda14cbcSMatt Macy 
2184eda14cbcSMatt Macy 			if (raidz_checksum_verify(zio) == 0) {
2185eda14cbcSMatt Macy 				/*
2186eda14cbcSMatt Macy 				 * If we read more parity disks than were used
2187eda14cbcSMatt Macy 				 * for reconstruction, confirm that the other
2188eda14cbcSMatt Macy 				 * parity disks produced correct data. This
2189eda14cbcSMatt Macy 				 * routine is suboptimal in that it regenerates
2190eda14cbcSMatt Macy 				 * the parity that we already used in addition
2191eda14cbcSMatt Macy 				 * to the parity that we're attempting to
2192eda14cbcSMatt Macy 				 * verify, but this should be a relatively
2193eda14cbcSMatt Macy 				 * uncommon case, and can be optimized if it
2194eda14cbcSMatt Macy 				 * becomes a problem. Note that we regenerate
2195eda14cbcSMatt Macy 				 * parity when resilvering so we can write it
2196eda14cbcSMatt Macy 				 * out to failed devices later.
2197eda14cbcSMatt Macy 				 */
2198eda14cbcSMatt Macy 				if (parity_errors < rm->rm_firstdatacol - n ||
2199eda14cbcSMatt Macy 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2200eda14cbcSMatt Macy 					n = raidz_parity_verify(zio, rm);
2201eda14cbcSMatt Macy 					unexpected_errors += n;
2202eda14cbcSMatt Macy 					ASSERT(parity_errors + n <=
2203eda14cbcSMatt Macy 					    rm->rm_firstdatacol);
2204eda14cbcSMatt Macy 				}
2205eda14cbcSMatt Macy 
2206eda14cbcSMatt Macy 				goto done;
2207eda14cbcSMatt Macy 			}
2208eda14cbcSMatt Macy 		}
2209eda14cbcSMatt Macy 	}
2210eda14cbcSMatt Macy 
2211eda14cbcSMatt Macy 	/*
2212eda14cbcSMatt Macy 	 * This isn't a typical situation -- either we got a read error or
2213eda14cbcSMatt Macy 	 * a child silently returned bad data. Read every block so we can
2214eda14cbcSMatt Macy 	 * try again with as much data and parity as we can track down. If
2215eda14cbcSMatt Macy 	 * we've already been through once before, all children will be marked
2216eda14cbcSMatt Macy 	 * as tried so we'll proceed to combinatorial reconstruction.
2217eda14cbcSMatt Macy 	 */
2218eda14cbcSMatt Macy 	unexpected_errors = 1;
2219eda14cbcSMatt Macy 	rm->rm_missingdata = 0;
2220eda14cbcSMatt Macy 	rm->rm_missingparity = 0;
2221eda14cbcSMatt Macy 
2222eda14cbcSMatt Macy 	for (c = 0; c < rm->rm_cols; c++) {
2223eda14cbcSMatt Macy 		if (rm->rm_col[c].rc_tried)
2224eda14cbcSMatt Macy 			continue;
2225eda14cbcSMatt Macy 
2226eda14cbcSMatt Macy 		zio_vdev_io_redone(zio);
2227eda14cbcSMatt Macy 		do {
2228eda14cbcSMatt Macy 			rc = &rm->rm_col[c];
2229eda14cbcSMatt Macy 			if (rc->rc_tried)
2230eda14cbcSMatt Macy 				continue;
2231eda14cbcSMatt Macy 			zio_nowait(zio_vdev_child_io(zio, NULL,
2232eda14cbcSMatt Macy 			    vd->vdev_child[rc->rc_devidx],
2233eda14cbcSMatt Macy 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
2234eda14cbcSMatt Macy 			    zio->io_type, zio->io_priority, 0,
2235eda14cbcSMatt Macy 			    vdev_raidz_child_done, rc));
2236eda14cbcSMatt Macy 		} while (++c < rm->rm_cols);
2237eda14cbcSMatt Macy 
2238eda14cbcSMatt Macy 		return;
2239eda14cbcSMatt Macy 	}
2240eda14cbcSMatt Macy 
2241eda14cbcSMatt Macy 	/*
2242eda14cbcSMatt Macy 	 * At this point we've attempted to reconstruct the data given the
2243eda14cbcSMatt Macy 	 * errors we detected, and we've attempted to read all columns. There
2244eda14cbcSMatt Macy 	 * must, therefore, be one or more additional problems -- silent errors
2245eda14cbcSMatt Macy 	 * resulting in invalid data rather than explicit I/O errors resulting
2246eda14cbcSMatt Macy 	 * in absent data. We check if there is enough additional data to
2247eda14cbcSMatt Macy 	 * possibly reconstruct the data and then perform combinatorial
2248eda14cbcSMatt Macy 	 * reconstruction over all possible combinations. If that fails,
2249eda14cbcSMatt Macy 	 * we're cooked.
2250eda14cbcSMatt Macy 	 */
2251eda14cbcSMatt Macy 	if (total_errors > rm->rm_firstdatacol) {
2252eda14cbcSMatt Macy 		zio->io_error = vdev_raidz_worst_error(rm);
2253eda14cbcSMatt Macy 
2254eda14cbcSMatt Macy 	} else if (total_errors < rm->rm_firstdatacol &&
2255eda14cbcSMatt Macy 	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2256eda14cbcSMatt Macy 		/*
2257eda14cbcSMatt Macy 		 * If we didn't use all the available parity for the
2258eda14cbcSMatt Macy 		 * combinatorial reconstruction, verify that the remaining
2259eda14cbcSMatt Macy 		 * parity is correct.
2260eda14cbcSMatt Macy 		 */
2261eda14cbcSMatt Macy 		if (code != (1 << rm->rm_firstdatacol) - 1)
2262eda14cbcSMatt Macy 			(void) raidz_parity_verify(zio, rm);
2263eda14cbcSMatt Macy 	} else {
2264eda14cbcSMatt Macy 		/*
2265eda14cbcSMatt Macy 		 * We're here because either:
2266eda14cbcSMatt Macy 		 *
2267eda14cbcSMatt Macy 		 *	total_errors == rm_first_datacol, or
2268eda14cbcSMatt Macy 		 *	vdev_raidz_combrec() failed
2269eda14cbcSMatt Macy 		 *
2270eda14cbcSMatt Macy 		 * In either case, there is enough bad data to prevent
2271eda14cbcSMatt Macy 		 * reconstruction.
2272eda14cbcSMatt Macy 		 *
2273eda14cbcSMatt Macy 		 * Start checksum ereports for all children which haven't
2274eda14cbcSMatt Macy 		 * failed, and the IO wasn't speculative.
2275eda14cbcSMatt Macy 		 */
2276eda14cbcSMatt Macy 		zio->io_error = SET_ERROR(ECKSUM);
2277eda14cbcSMatt Macy 
2278eda14cbcSMatt Macy 		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2279eda14cbcSMatt Macy 			for (c = 0; c < rm->rm_cols; c++) {
2280eda14cbcSMatt Macy 				vdev_t *cvd;
2281eda14cbcSMatt Macy 				rc = &rm->rm_col[c];
2282eda14cbcSMatt Macy 				cvd = vd->vdev_child[rc->rc_devidx];
2283*2c48331dSMatt Macy 				if (rc->rc_error != 0)
2284*2c48331dSMatt Macy 					continue;
2285*2c48331dSMatt Macy 
2286eda14cbcSMatt Macy 				zio_bad_cksum_t zbc;
2287eda14cbcSMatt Macy 				zbc.zbc_has_cksum = 0;
2288*2c48331dSMatt Macy 				zbc.zbc_injected = rm->rm_ecksuminjected;
2289eda14cbcSMatt Macy 
2290*2c48331dSMatt Macy 				int ret = zfs_ereport_start_checksum(
2291*2c48331dSMatt Macy 				    zio->io_spa, cvd, &zio->io_bookmark, zio,
2292*2c48331dSMatt Macy 				    rc->rc_offset, rc->rc_size,
2293*2c48331dSMatt Macy 				    (void *)(uintptr_t)c, &zbc);
2294*2c48331dSMatt Macy 				if (ret != EALREADY) {
2295eda14cbcSMatt Macy 					mutex_enter(&cvd->vdev_stat_lock);
2296eda14cbcSMatt Macy 					cvd->vdev_stat.vs_checksum_errors++;
2297eda14cbcSMatt Macy 					mutex_exit(&cvd->vdev_stat_lock);
2298eda14cbcSMatt Macy 				}
2299eda14cbcSMatt Macy 			}
2300eda14cbcSMatt Macy 		}
2301eda14cbcSMatt Macy 	}
2302eda14cbcSMatt Macy 
2303eda14cbcSMatt Macy done:
2304eda14cbcSMatt Macy 	zio_checksum_verified(zio);
2305eda14cbcSMatt Macy 
2306eda14cbcSMatt Macy 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2307eda14cbcSMatt Macy 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2308eda14cbcSMatt Macy 		/*
2309eda14cbcSMatt Macy 		 * Use the good data we have in hand to repair damaged children.
2310eda14cbcSMatt Macy 		 */
2311eda14cbcSMatt Macy 		for (c = 0; c < rm->rm_cols; c++) {
2312eda14cbcSMatt Macy 			rc = &rm->rm_col[c];
2313eda14cbcSMatt Macy 			cvd = vd->vdev_child[rc->rc_devidx];
2314eda14cbcSMatt Macy 
2315eda14cbcSMatt Macy 			if (rc->rc_error == 0)
2316eda14cbcSMatt Macy 				continue;
2317eda14cbcSMatt Macy 
2318eda14cbcSMatt Macy 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2319eda14cbcSMatt Macy 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
2320eda14cbcSMatt Macy 			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2321eda14cbcSMatt Macy 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2322eda14cbcSMatt Macy 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2323eda14cbcSMatt Macy 		}
2324eda14cbcSMatt Macy 	}
2325eda14cbcSMatt Macy }
2326eda14cbcSMatt Macy 
2327eda14cbcSMatt Macy static void
2328eda14cbcSMatt Macy vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2329eda14cbcSMatt Macy {
2330eda14cbcSMatt Macy 	if (faulted > vd->vdev_nparity)
2331eda14cbcSMatt Macy 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2332eda14cbcSMatt Macy 		    VDEV_AUX_NO_REPLICAS);
2333eda14cbcSMatt Macy 	else if (degraded + faulted != 0)
2334eda14cbcSMatt Macy 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2335eda14cbcSMatt Macy 	else
2336eda14cbcSMatt Macy 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2337eda14cbcSMatt Macy }
2338eda14cbcSMatt Macy 
2339eda14cbcSMatt Macy /*
2340eda14cbcSMatt Macy  * Determine if any portion of the provided block resides on a child vdev
2341eda14cbcSMatt Macy  * with a dirty DTL and therefore needs to be resilvered.  The function
2342eda14cbcSMatt Macy  * assumes that at least one DTL is dirty which implies that full stripe
2343eda14cbcSMatt Macy  * width blocks must be resilvered.
2344eda14cbcSMatt Macy  */
2345eda14cbcSMatt Macy static boolean_t
2346eda14cbcSMatt Macy vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
2347eda14cbcSMatt Macy {
2348eda14cbcSMatt Macy 	uint64_t dcols = vd->vdev_children;
2349eda14cbcSMatt Macy 	uint64_t nparity = vd->vdev_nparity;
2350eda14cbcSMatt Macy 	uint64_t ashift = vd->vdev_top->vdev_ashift;
2351eda14cbcSMatt Macy 	/* The starting RAIDZ (parent) vdev sector of the block. */
2352eda14cbcSMatt Macy 	uint64_t b = offset >> ashift;
2353eda14cbcSMatt Macy 	/* The zio's size in units of the vdev's minimum sector size. */
2354eda14cbcSMatt Macy 	uint64_t s = ((psize - 1) >> ashift) + 1;
2355eda14cbcSMatt Macy 	/* The first column for this stripe. */
2356eda14cbcSMatt Macy 	uint64_t f = b % dcols;
2357eda14cbcSMatt Macy 
2358eda14cbcSMatt Macy 	if (s + nparity >= dcols)
2359eda14cbcSMatt Macy 		return (B_TRUE);
2360eda14cbcSMatt Macy 
2361eda14cbcSMatt Macy 	for (uint64_t c = 0; c < s + nparity; c++) {
2362eda14cbcSMatt Macy 		uint64_t devidx = (f + c) % dcols;
2363eda14cbcSMatt Macy 		vdev_t *cvd = vd->vdev_child[devidx];
2364eda14cbcSMatt Macy 
2365eda14cbcSMatt Macy 		/*
2366eda14cbcSMatt Macy 		 * dsl_scan_need_resilver() already checked vd with
2367eda14cbcSMatt Macy 		 * vdev_dtl_contains(). So here just check cvd with
2368eda14cbcSMatt Macy 		 * vdev_dtl_empty(), cheaper and a good approximation.
2369eda14cbcSMatt Macy 		 */
2370eda14cbcSMatt Macy 		if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
2371eda14cbcSMatt Macy 			return (B_TRUE);
2372eda14cbcSMatt Macy 	}
2373eda14cbcSMatt Macy 
2374eda14cbcSMatt Macy 	return (B_FALSE);
2375eda14cbcSMatt Macy }
2376eda14cbcSMatt Macy 
2377eda14cbcSMatt Macy static void
2378eda14cbcSMatt Macy vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *in, range_seg64_t *res)
2379eda14cbcSMatt Macy {
2380eda14cbcSMatt Macy 	vdev_t *raidvd = cvd->vdev_parent;
2381eda14cbcSMatt Macy 	ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
2382eda14cbcSMatt Macy 
2383eda14cbcSMatt Macy 	uint64_t width = raidvd->vdev_children;
2384eda14cbcSMatt Macy 	uint64_t tgt_col = cvd->vdev_id;
2385eda14cbcSMatt Macy 	uint64_t ashift = raidvd->vdev_top->vdev_ashift;
2386eda14cbcSMatt Macy 
2387eda14cbcSMatt Macy 	/* make sure the offsets are block-aligned */
2388eda14cbcSMatt Macy 	ASSERT0(in->rs_start % (1 << ashift));
2389eda14cbcSMatt Macy 	ASSERT0(in->rs_end % (1 << ashift));
2390eda14cbcSMatt Macy 	uint64_t b_start = in->rs_start >> ashift;
2391eda14cbcSMatt Macy 	uint64_t b_end = in->rs_end >> ashift;
2392eda14cbcSMatt Macy 
2393eda14cbcSMatt Macy 	uint64_t start_row = 0;
2394eda14cbcSMatt Macy 	if (b_start > tgt_col) /* avoid underflow */
2395eda14cbcSMatt Macy 		start_row = ((b_start - tgt_col - 1) / width) + 1;
2396eda14cbcSMatt Macy 
2397eda14cbcSMatt Macy 	uint64_t end_row = 0;
2398eda14cbcSMatt Macy 	if (b_end > tgt_col)
2399eda14cbcSMatt Macy 		end_row = ((b_end - tgt_col - 1) / width) + 1;
2400eda14cbcSMatt Macy 
2401eda14cbcSMatt Macy 	res->rs_start = start_row << ashift;
2402eda14cbcSMatt Macy 	res->rs_end = end_row << ashift;
2403eda14cbcSMatt Macy 
2404eda14cbcSMatt Macy 	ASSERT3U(res->rs_start, <=, in->rs_start);
2405eda14cbcSMatt Macy 	ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start);
2406eda14cbcSMatt Macy }
2407eda14cbcSMatt Macy 
2408eda14cbcSMatt Macy vdev_ops_t vdev_raidz_ops = {
2409eda14cbcSMatt Macy 	.vdev_op_open = vdev_raidz_open,
2410eda14cbcSMatt Macy 	.vdev_op_close = vdev_raidz_close,
2411eda14cbcSMatt Macy 	.vdev_op_asize = vdev_raidz_asize,
2412eda14cbcSMatt Macy 	.vdev_op_io_start = vdev_raidz_io_start,
2413eda14cbcSMatt Macy 	.vdev_op_io_done = vdev_raidz_io_done,
2414eda14cbcSMatt Macy 	.vdev_op_state_change = vdev_raidz_state_change,
2415eda14cbcSMatt Macy 	.vdev_op_need_resilver = vdev_raidz_need_resilver,
2416eda14cbcSMatt Macy 	.vdev_op_hold = NULL,
2417eda14cbcSMatt Macy 	.vdev_op_rele = NULL,
2418eda14cbcSMatt Macy 	.vdev_op_remap = NULL,
2419eda14cbcSMatt Macy 	.vdev_op_xlate = vdev_raidz_xlate,
2420eda14cbcSMatt Macy 	.vdev_op_type = VDEV_TYPE_RAIDZ,	/* name of this vdev type */
2421eda14cbcSMatt Macy 	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
2422eda14cbcSMatt Macy };
2423