xref: /titanic_52/usr/src/uts/common/fs/zfs/vdev_raidz.c (revision 4b7f25f92fce04a513df62afed73561f9216a4fd)
1fa9e4066Sahrens /*
2fa9e4066Sahrens  * CDDL HEADER START
3fa9e4066Sahrens  *
4fa9e4066Sahrens  * The contents of this file are subject to the terms of the
5ea8dc4b6Seschrock  * Common Development and Distribution License (the "License").
6ea8dc4b6Seschrock  * You may not use this file except in compliance with the License.
7fa9e4066Sahrens  *
8fa9e4066Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e4066Sahrens  * or http://www.opensolaris.org/os/licensing.
10fa9e4066Sahrens  * See the License for the specific language governing permissions
11fa9e4066Sahrens  * and limitations under the License.
12fa9e4066Sahrens  *
13fa9e4066Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
14fa9e4066Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e4066Sahrens  * If applicable, add the following below this CDDL HEADER, with the
16fa9e4066Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e4066Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e4066Sahrens  *
19fa9e4066Sahrens  * CDDL HEADER END
20fa9e4066Sahrens  */
2199653d4eSeschrock 
22fa9e4066Sahrens /*
233f9d6ad7SLin Ling  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24738f37bcSGeorge Wilson  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
25810e43b2SBill Pijewski  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
26c3d26abcSMatthew Ahrens  * Copyright (c) 2014 Integros [integros.com]
27fa9e4066Sahrens  */
28fa9e4066Sahrens 
29fa9e4066Sahrens #include <sys/zfs_context.h>
30fa9e4066Sahrens #include <sys/spa.h>
31fa9e4066Sahrens #include <sys/vdev_impl.h>
32810e43b2SBill Pijewski #include <sys/vdev_disk.h>
33810e43b2SBill Pijewski #include <sys/vdev_file.h>
34810e43b2SBill Pijewski #include <sys/vdev_raidz.h>
35fa9e4066Sahrens #include <sys/zio.h>
36fa9e4066Sahrens #include <sys/zio_checksum.h>
37fa9e4066Sahrens #include <sys/fs/zfs.h>
38ea8dc4b6Seschrock #include <sys/fm/fs/zfs.h>
39fa9e4066Sahrens 
40fa9e4066Sahrens /*
41fa9e4066Sahrens  * Virtual device vector for RAID-Z.
4299653d4eSeschrock  *
43f94275ceSAdam Leventhal  * This vdev supports single, double, and triple parity. For single parity,
44f94275ceSAdam Leventhal  * we use a simple XOR of all the data columns. For double or triple parity,
45f94275ceSAdam Leventhal  * we use a special case of Reed-Solomon coding. This extends the
46f94275ceSAdam Leventhal  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
47f94275ceSAdam Leventhal  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
48f94275ceSAdam Leventhal  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
49f94275ceSAdam Leventhal  * former is also based. The latter is designed to provide higher performance
50f94275ceSAdam Leventhal  * for writes.
51f94275ceSAdam Leventhal  *
52f94275ceSAdam Leventhal  * Note that the Plank paper claimed to support arbitrary N+M, but was then
53f94275ceSAdam Leventhal  * amended six years later identifying a critical flaw that invalidates its
54f94275ceSAdam Leventhal  * claims. Nevertheless, the technique can be adapted to work for up to
55f94275ceSAdam Leventhal  * triple parity. For additional parity, the amendment "Note: Correction to
56f94275ceSAdam Leventhal  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
57f94275ceSAdam Leventhal  * is viable, but the additional complexity means that write performance will
58f94275ceSAdam Leventhal  * suffer.
59f94275ceSAdam Leventhal  *
60f94275ceSAdam Leventhal  * All of the methods above operate on a Galois field, defined over the
61f94275ceSAdam Leventhal  * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
62f94275ceSAdam Leventhal  * can be expressed with a single byte. Briefly, the operations on the
63f94275ceSAdam Leventhal  * field are defined as follows:
6499653d4eSeschrock  *
6599653d4eSeschrock  *   o addition (+) is represented by a bitwise XOR
6699653d4eSeschrock  *   o subtraction (-) is therefore identical to addition: A + B = A - B
6799653d4eSeschrock  *   o multiplication of A by 2 is defined by the following bitwise expression:
68f7170741SWill Andrews  *
6999653d4eSeschrock  *	(A * 2)_7 = A_6
7099653d4eSeschrock  *	(A * 2)_6 = A_5
7199653d4eSeschrock  *	(A * 2)_5 = A_4
7299653d4eSeschrock  *	(A * 2)_4 = A_3 + A_7
7399653d4eSeschrock  *	(A * 2)_3 = A_2 + A_7
7499653d4eSeschrock  *	(A * 2)_2 = A_1 + A_7
7599653d4eSeschrock  *	(A * 2)_1 = A_0
7699653d4eSeschrock  *	(A * 2)_0 = A_7
7799653d4eSeschrock  *
7899653d4eSeschrock  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
79f94275ceSAdam Leventhal  * As an aside, this multiplication is derived from the error correcting
80f94275ceSAdam Leventhal  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
8199653d4eSeschrock  *
8299653d4eSeschrock  * Observe that any number in the field (except for 0) can be expressed as a
8399653d4eSeschrock  * power of 2 -- a generator for the field. We store a table of the powers of
8499653d4eSeschrock  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
8599653d4eSeschrock  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
86f94275ceSAdam Leventhal  * than field addition). The inverse of a field element A (A^-1) is therefore
87f94275ceSAdam Leventhal  * A ^ (255 - 1) = A^254.
8899653d4eSeschrock  *
89f94275ceSAdam Leventhal  * The up-to-three parity columns, P, Q, R over several data columns,
90f94275ceSAdam Leventhal  * D_0, ... D_n-1, can be expressed by field operations:
9199653d4eSeschrock  *
9299653d4eSeschrock  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
9399653d4eSeschrock  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
9499653d4eSeschrock  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
95f94275ceSAdam Leventhal  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
96f94275ceSAdam Leventhal  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
9799653d4eSeschrock  *
98f94275ceSAdam Leventhal  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
99f94275ceSAdam Leventhal  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
100f94275ceSAdam Leventhal  * independent coefficients. (There are no additional coefficients that have
101f94275ceSAdam Leventhal  * this property which is why the uncorrected Plank method breaks down.)
102f94275ceSAdam Leventhal  *
103f94275ceSAdam Leventhal  * See the reconstruction code below for how P, Q and R can be used individually
104f94275ceSAdam Leventhal  * or in concert to recover missing data columns.
105fa9e4066Sahrens  */
106fa9e4066Sahrens 
/*
 * State for a single column of a RAID-Z I/O: the child device it maps to,
 * where on that child it lives, the buffers involved, and the outcome of
 * the I/O issued for it.
 */
typedef struct raidz_col {
	/* child device index for I/O */
	uint64_t rc_devidx;
	/* device offset */
	uint64_t rc_offset;
	/* I/O size */
	uint64_t rc_size;
	/* I/O data */
	void *rc_data;
	/* used to store the "good" version */
	void *rc_gdata;
	/* I/O error for this device */
	int rc_error;
	/* Did we attempt this I/O column? */
	uint8_t rc_tried;
	/* Did we skip this I/O column? */
	uint8_t rc_skipped;
} raidz_col_t;
117fa9e4066Sahrens 
118fa9e4066Sahrens typedef struct raidz_map {
119f94275ceSAdam Leventhal 	uint64_t rm_cols;		/* Regular column count */
120f94275ceSAdam Leventhal 	uint64_t rm_scols;		/* Count including skipped columns */
12199653d4eSeschrock 	uint64_t rm_bigcols;		/* Number of oversized columns */
12299653d4eSeschrock 	uint64_t rm_asize;		/* Actual total I/O size */
12399653d4eSeschrock 	uint64_t rm_missingdata;	/* Count of missing data devices */
12499653d4eSeschrock 	uint64_t rm_missingparity;	/* Count of missing parity devices */
12599653d4eSeschrock 	uint64_t rm_firstdatacol;	/* First data column/parity count */
1262fbc121fSAdam Leventhal 	uint64_t rm_nskip;		/* Skipped sectors for padding */
1272fbc121fSAdam Leventhal 	uint64_t rm_skipstart;		/* Column index of padding start */
12822fe2c88SJonathan Adams 	void *rm_datacopy;		/* rm_asize-buffer of copied data */
12922fe2c88SJonathan Adams 	uintptr_t rm_reports;		/* # of referencing checksum reports */
13022fe2c88SJonathan Adams 	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
13122fe2c88SJonathan Adams 	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
13299653d4eSeschrock 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
133fa9e4066Sahrens } raidz_map_t;
134fa9e4066Sahrens 
/*
 * Indices of the parity columns within a raidz map's rm_col[] array:
 * P first, then Q, then R.
 */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2
13899653d4eSeschrock 
/*
 * Byte-wise field multiplication by 2 and by 4.  The value is shifted left
 * and 0x1d is folded in when bit 7 of the operand was set.  The result is
 * not truncated here: only its low eight bits are the field element, so
 * callers are expected to store it into a byte.  The argument is evaluated
 * more than once, so it must be free of side effects.
 */
#define	VDEV_RAIDZ_MUL_2(x)	((((x) & 0x80) ? 0x1d : 0) ^ ((x) << 1))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
14199653d4eSeschrock 
/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 *
 * x	64-bit lvalue holding eight field elements; updated in place.
 * mask	64-bit scratch lvalue; clobbered.
 *
 * Step by step: isolate the top bit of every byte, widen each set top bit
 * into a full 0xff byte ((m << 1) - (m >> 7)), then shift every byte left by
 * one (dropping the bit that would spill into its neighbour) and XOR 0x1d
 * into exactly those bytes whose top bit was set.
 *
 * Both macros expand to a brace-enclosed block and evaluate their arguments
 * several times: pass side-effect-free lvalues, and do not use them as the
 * unbraced body of an if that is followed by an else.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

/* Multiply all eight bytes of x by 4: two successive multiplications by 2. */
#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
161f94275ceSAdam Leventhal 
/*
 * Translate an offset by adding VDEV_LABEL_START_SIZE to it.  The argument
 * is parenthesized so that a compound expression (e.g. "a & b" or "c ? d : e")
 * is added as a whole rather than re-associating with the '+'.
 */
#define	VDEV_LABEL_OFFSET(x)	((x) + VDEV_LABEL_START_SIZE)
163810e43b2SBill Pijewski 
/*
 * Tunable: when set, force reconstruction to use the general purpose method.
 * Being a file-scope object without an initializer, it starts out as zero.
 */
int vdev_raidz_default_to_general;
16899653d4eSeschrock 
/*
 * Powers of 2 in the Galois field defined above: vdev_raidz_pow2[i] == 2^i.
 * The trailing comment on each row is the exponent of the row's first entry.
 * The sequence has period 255, so entry 255 repeats entry 0.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,	/* 0x00 */
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,	/* 0x08 */
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,	/* 0x10 */
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,	/* 0x18 */
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,	/* 0x20 */
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,	/* 0x28 */
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,	/* 0x30 */
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,	/* 0x38 */
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,	/* 0x40 */
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,	/* 0x48 */
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,	/* 0x50 */
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,	/* 0x58 */
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,	/* 0x60 */
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,	/* 0x68 */
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,	/* 0x70 */
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,	/* 0x78 */
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,	/* 0x80 */
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,	/* 0x88 */
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,	/* 0x90 */
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,	/* 0x98 */
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,	/* 0xa0 */
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,	/* 0xa8 */
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,	/* 0xb0 */
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,	/* 0xb8 */
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,	/* 0xc0 */
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,	/* 0xc8 */
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,	/* 0xd0 */
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,	/* 0xd8 */
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,	/* 0xe0 */
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,	/* 0xe8 */
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,	/* 0xf0 */
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01	/* 0xf8 */
};
/*
 * Logs base 2 in the Galois field defined above: for a != 0,
 * 2^vdev_raidz_log2[a] == a.  Zero has no logarithm; its slot holds 0.
 * The trailing comment on each row is the value of 'a' for the row's
 * first entry.
 */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,	/* 0x00 */
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,	/* 0x08 */
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,	/* 0x10 */
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,	/* 0x18 */
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,	/* 0x20 */
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,	/* 0x28 */
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,	/* 0x30 */
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,	/* 0x38 */
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,	/* 0x40 */
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,	/* 0x48 */
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,	/* 0x50 */
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,	/* 0x58 */
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,	/* 0x60 */
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,	/* 0x68 */
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,	/* 0x70 */
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,	/* 0x78 */
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,	/* 0x80 */
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,	/* 0x88 */
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,	/* 0x90 */
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,	/* 0x98 */
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,	/* 0xa0 */
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,	/* 0xa8 */
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,	/* 0xb0 */
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,	/* 0xb8 */
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,	/* 0xc0 */
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,	/* 0xc8 */
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,	/* 0xd0 */
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,	/* 0xd8 */
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,	/* 0xe0 */
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,	/* 0xe8 */
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,	/* 0xf0 */
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,	/* 0xf8 */
};
23999653d4eSeschrock 
24022fe2c88SJonathan Adams static void vdev_raidz_generate_parity(raidz_map_t *rm);
24122fe2c88SJonathan Adams 
24299653d4eSeschrock /*
24399653d4eSeschrock  * Multiply a given number by 2 raised to the given power.
24499653d4eSeschrock  */
24599653d4eSeschrock static uint8_t
24699653d4eSeschrock vdev_raidz_exp2(uint_t a, int exp)
24799653d4eSeschrock {
24899653d4eSeschrock 	if (a == 0)
24999653d4eSeschrock 		return (0);
25099653d4eSeschrock 
25199653d4eSeschrock 	ASSERT(exp >= 0);
25299653d4eSeschrock 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
25399653d4eSeschrock 
25499653d4eSeschrock 	exp += vdev_raidz_log2[a];
25599653d4eSeschrock 	if (exp > 255)
25699653d4eSeschrock 		exp -= 255;
25799653d4eSeschrock 
25899653d4eSeschrock 	return (vdev_raidz_pow2[exp]);
25999653d4eSeschrock }
26099653d4eSeschrock 
261e14bb325SJeff Bonwick static void
26222fe2c88SJonathan Adams vdev_raidz_map_free(raidz_map_t *rm)
263e14bb325SJeff Bonwick {
264e14bb325SJeff Bonwick 	int c;
265baa7389eSJonathan Adams 	size_t size;
266e14bb325SJeff Bonwick 
26722fe2c88SJonathan Adams 	for (c = 0; c < rm->rm_firstdatacol; c++) {
268e14bb325SJeff Bonwick 		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
269e14bb325SJeff Bonwick 
27022fe2c88SJonathan Adams 		if (rm->rm_col[c].rc_gdata != NULL)
27122fe2c88SJonathan Adams 			zio_buf_free(rm->rm_col[c].rc_gdata,
27222fe2c88SJonathan Adams 			    rm->rm_col[c].rc_size);
27322fe2c88SJonathan Adams 	}
27422fe2c88SJonathan Adams 
275baa7389eSJonathan Adams 	size = 0;
276baa7389eSJonathan Adams 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
277baa7389eSJonathan Adams 		size += rm->rm_col[c].rc_size;
278baa7389eSJonathan Adams 
27922fe2c88SJonathan Adams 	if (rm->rm_datacopy != NULL)
28022fe2c88SJonathan Adams 		zio_buf_free(rm->rm_datacopy, size);
28122fe2c88SJonathan Adams 
282f94275ceSAdam Leventhal 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
283e14bb325SJeff Bonwick }
284e14bb325SJeff Bonwick 
28522fe2c88SJonathan Adams static void
28622fe2c88SJonathan Adams vdev_raidz_map_free_vsd(zio_t *zio)
28722fe2c88SJonathan Adams {
28822fe2c88SJonathan Adams 	raidz_map_t *rm = zio->io_vsd;
28922fe2c88SJonathan Adams 
290fb09f5aaSMadhav Suresh 	ASSERT0(rm->rm_freed);
29122fe2c88SJonathan Adams 	rm->rm_freed = 1;
29222fe2c88SJonathan Adams 
29322fe2c88SJonathan Adams 	if (rm->rm_reports == 0)
29422fe2c88SJonathan Adams 		vdev_raidz_map_free(rm);
29522fe2c88SJonathan Adams }
29622fe2c88SJonathan Adams 
29722fe2c88SJonathan Adams /*ARGSUSED*/
29822fe2c88SJonathan Adams static void
29922fe2c88SJonathan Adams vdev_raidz_cksum_free(void *arg, size_t ignored)
30022fe2c88SJonathan Adams {
30122fe2c88SJonathan Adams 	raidz_map_t *rm = arg;
30222fe2c88SJonathan Adams 
30322fe2c88SJonathan Adams 	ASSERT3U(rm->rm_reports, >, 0);
30422fe2c88SJonathan Adams 
305baa7389eSJonathan Adams 	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
30622fe2c88SJonathan Adams 		vdev_raidz_map_free(rm);
30722fe2c88SJonathan Adams }
30822fe2c88SJonathan Adams 
30922fe2c88SJonathan Adams static void
31022fe2c88SJonathan Adams vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
31122fe2c88SJonathan Adams {
31222fe2c88SJonathan Adams 	raidz_map_t *rm = zcr->zcr_cbdata;
31322fe2c88SJonathan Adams 	size_t c = zcr->zcr_cbinfo;
31422fe2c88SJonathan Adams 	size_t x;
31522fe2c88SJonathan Adams 
31622fe2c88SJonathan Adams 	const char *good = NULL;
31722fe2c88SJonathan Adams 	const char *bad = rm->rm_col[c].rc_data;
31822fe2c88SJonathan Adams 
31922fe2c88SJonathan Adams 	if (good_data == NULL) {
32022fe2c88SJonathan Adams 		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
32122fe2c88SJonathan Adams 		return;
32222fe2c88SJonathan Adams 	}
32322fe2c88SJonathan Adams 
32422fe2c88SJonathan Adams 	if (c < rm->rm_firstdatacol) {
32522fe2c88SJonathan Adams 		/*
32622fe2c88SJonathan Adams 		 * The first time through, calculate the parity blocks for
32722fe2c88SJonathan Adams 		 * the good data (this relies on the fact that the good
32822fe2c88SJonathan Adams 		 * data never changes for a given logical ZIO)
32922fe2c88SJonathan Adams 		 */
33022fe2c88SJonathan Adams 		if (rm->rm_col[0].rc_gdata == NULL) {
33122fe2c88SJonathan Adams 			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
33222fe2c88SJonathan Adams 			char *buf;
33322fe2c88SJonathan Adams 
33422fe2c88SJonathan Adams 			/*
33522fe2c88SJonathan Adams 			 * Set up the rm_col[]s to generate the parity for
33622fe2c88SJonathan Adams 			 * good_data, first saving the parity bufs and
33722fe2c88SJonathan Adams 			 * replacing them with buffers to hold the result.
33822fe2c88SJonathan Adams 			 */
33922fe2c88SJonathan Adams 			for (x = 0; x < rm->rm_firstdatacol; x++) {
34022fe2c88SJonathan Adams 				bad_parity[x] = rm->rm_col[x].rc_data;
34122fe2c88SJonathan Adams 				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
34222fe2c88SJonathan Adams 				    zio_buf_alloc(rm->rm_col[x].rc_size);
34322fe2c88SJonathan Adams 			}
34422fe2c88SJonathan Adams 
34522fe2c88SJonathan Adams 			/* fill in the data columns from good_data */
34622fe2c88SJonathan Adams 			buf = (char *)good_data;
34722fe2c88SJonathan Adams 			for (; x < rm->rm_cols; x++) {
34822fe2c88SJonathan Adams 				rm->rm_col[x].rc_data = buf;
34922fe2c88SJonathan Adams 				buf += rm->rm_col[x].rc_size;
35022fe2c88SJonathan Adams 			}
35122fe2c88SJonathan Adams 
35222fe2c88SJonathan Adams 			/*
35322fe2c88SJonathan Adams 			 * Construct the parity from the good data.
35422fe2c88SJonathan Adams 			 */
35522fe2c88SJonathan Adams 			vdev_raidz_generate_parity(rm);
35622fe2c88SJonathan Adams 
35722fe2c88SJonathan Adams 			/* restore everything back to its original state */
35822fe2c88SJonathan Adams 			for (x = 0; x < rm->rm_firstdatacol; x++)
35922fe2c88SJonathan Adams 				rm->rm_col[x].rc_data = bad_parity[x];
36022fe2c88SJonathan Adams 
36122fe2c88SJonathan Adams 			buf = rm->rm_datacopy;
36222fe2c88SJonathan Adams 			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
36322fe2c88SJonathan Adams 				rm->rm_col[x].rc_data = buf;
36422fe2c88SJonathan Adams 				buf += rm->rm_col[x].rc_size;
36522fe2c88SJonathan Adams 			}
36622fe2c88SJonathan Adams 		}
36722fe2c88SJonathan Adams 
36822fe2c88SJonathan Adams 		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
36922fe2c88SJonathan Adams 		good = rm->rm_col[c].rc_gdata;
37022fe2c88SJonathan Adams 	} else {
37122fe2c88SJonathan Adams 		/* adjust good_data to point at the start of our column */
37222fe2c88SJonathan Adams 		good = good_data;
37322fe2c88SJonathan Adams 
37422fe2c88SJonathan Adams 		for (x = rm->rm_firstdatacol; x < c; x++)
37522fe2c88SJonathan Adams 			good += rm->rm_col[x].rc_size;
37622fe2c88SJonathan Adams 	}
37722fe2c88SJonathan Adams 
37822fe2c88SJonathan Adams 	/* we drop the ereport if it ends up that the data was good */
37922fe2c88SJonathan Adams 	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
38022fe2c88SJonathan Adams }
38122fe2c88SJonathan Adams 
38222fe2c88SJonathan Adams /*
38322fe2c88SJonathan Adams  * Invoked indirectly by zfs_ereport_start_checksum(), called
38422fe2c88SJonathan Adams  * below when our read operation fails completely.  The main point
38522fe2c88SJonathan Adams  * is to keep a copy of everything we read from disk, so that at
38622fe2c88SJonathan Adams  * vdev_raidz_cksum_finish() time we can compare it with the good data.
38722fe2c88SJonathan Adams  */
38822fe2c88SJonathan Adams static void
38922fe2c88SJonathan Adams vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
39022fe2c88SJonathan Adams {
39122fe2c88SJonathan Adams 	size_t c = (size_t)(uintptr_t)arg;
39222fe2c88SJonathan Adams 	caddr_t buf;
39322fe2c88SJonathan Adams 
39422fe2c88SJonathan Adams 	raidz_map_t *rm = zio->io_vsd;
39522fe2c88SJonathan Adams 	size_t size;
39622fe2c88SJonathan Adams 
39722fe2c88SJonathan Adams 	/* set up the report and bump the refcount  */
39822fe2c88SJonathan Adams 	zcr->zcr_cbdata = rm;
39922fe2c88SJonathan Adams 	zcr->zcr_cbinfo = c;
40022fe2c88SJonathan Adams 	zcr->zcr_finish = vdev_raidz_cksum_finish;
40122fe2c88SJonathan Adams 	zcr->zcr_free = vdev_raidz_cksum_free;
40222fe2c88SJonathan Adams 
40322fe2c88SJonathan Adams 	rm->rm_reports++;
40422fe2c88SJonathan Adams 	ASSERT3U(rm->rm_reports, >, 0);
40522fe2c88SJonathan Adams 
406baa7389eSJonathan Adams 	if (rm->rm_datacopy != NULL)
40722fe2c88SJonathan Adams 		return;
40822fe2c88SJonathan Adams 
40922fe2c88SJonathan Adams 	/*
410baa7389eSJonathan Adams 	 * It's the first time we're called for this raidz_map_t, so we need
411baa7389eSJonathan Adams 	 * to copy the data aside; there's no guarantee that our zio's buffer
412baa7389eSJonathan Adams 	 * won't be re-used for something else.
41322fe2c88SJonathan Adams 	 *
414baa7389eSJonathan Adams 	 * Our parity data is already in separate buffers, so there's no need
41522fe2c88SJonathan Adams 	 * to copy them.
41622fe2c88SJonathan Adams 	 */
41722fe2c88SJonathan Adams 
418baa7389eSJonathan Adams 	size = 0;
419baa7389eSJonathan Adams 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
420baa7389eSJonathan Adams 		size += rm->rm_col[c].rc_size;
42122fe2c88SJonathan Adams 
42222fe2c88SJonathan Adams 	buf = rm->rm_datacopy = zio_buf_alloc(size);
423baa7389eSJonathan Adams 
424baa7389eSJonathan Adams 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
42522fe2c88SJonathan Adams 		raidz_col_t *col = &rm->rm_col[c];
42622fe2c88SJonathan Adams 
42722fe2c88SJonathan Adams 		bcopy(col->rc_data, buf, col->rc_size);
42822fe2c88SJonathan Adams 		col->rc_data = buf;
42922fe2c88SJonathan Adams 
43022fe2c88SJonathan Adams 		buf += col->rc_size;
43122fe2c88SJonathan Adams 	}
43222fe2c88SJonathan Adams 	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
43322fe2c88SJonathan Adams }
43422fe2c88SJonathan Adams 
43522fe2c88SJonathan Adams static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
43622fe2c88SJonathan Adams 	vdev_raidz_map_free_vsd,
43722fe2c88SJonathan Adams 	vdev_raidz_cksum_report
43822fe2c88SJonathan Adams };
43922fe2c88SJonathan Adams 
4403e30c24aSWill Andrews /*
4413e30c24aSWill Andrews  * Divides the IO evenly across all child vdevs; usually, dcols is
4423e30c24aSWill Andrews  * the number of children in the target vdev.
4433e30c24aSWill Andrews  */
444fa9e4066Sahrens static raidz_map_t *
445810e43b2SBill Pijewski vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset,
446810e43b2SBill Pijewski     uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
447fa9e4066Sahrens {
448fa9e4066Sahrens 	raidz_map_t *rm;
4493e30c24aSWill Andrews 	/* The starting RAIDZ (parent) vdev sector of the block. */
450810e43b2SBill Pijewski 	uint64_t b = offset >> unit_shift;
4513e30c24aSWill Andrews 	/* The zio's size in units of the vdev's minimum sector size. */
452810e43b2SBill Pijewski 	uint64_t s = size >> unit_shift;
4533e30c24aSWill Andrews 	/* The first column for this stripe. */
454fa9e4066Sahrens 	uint64_t f = b % dcols;
4553e30c24aSWill Andrews 	/* The starting byte offset on each child vdev. */
456fa9e4066Sahrens 	uint64_t o = (b / dcols) << unit_shift;
457f94275ceSAdam Leventhal 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
458fa9e4066Sahrens 
4593e30c24aSWill Andrews 	/*
4603e30c24aSWill Andrews 	 * "Quotient": The number of data sectors for this stripe on all but
4613e30c24aSWill Andrews 	 * the "big column" child vdevs that also contain "remainder" data.
4623e30c24aSWill Andrews 	 */
46399653d4eSeschrock 	q = s / (dcols - nparity);
4643e30c24aSWill Andrews 
4653e30c24aSWill Andrews 	/*
4663e30c24aSWill Andrews 	 * "Remainder": The number of partial stripe data sectors in this I/O.
4673e30c24aSWill Andrews 	 * This will add a sector to some, but not all, child vdevs.
4683e30c24aSWill Andrews 	 */
46999653d4eSeschrock 	r = s - q * (dcols - nparity);
4703e30c24aSWill Andrews 
4713e30c24aSWill Andrews 	/* The number of "big columns" - those which contain remainder data. */
47299653d4eSeschrock 	bc = (r == 0 ? 0 : r + nparity);
4733e30c24aSWill Andrews 
4743e30c24aSWill Andrews 	/*
4753e30c24aSWill Andrews 	 * The total number of data and parity sectors associated with
4763e30c24aSWill Andrews 	 * this I/O.
4773e30c24aSWill Andrews 	 */
478f94275ceSAdam Leventhal 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
479fa9e4066Sahrens 
4803e30c24aSWill Andrews 	/* acols: The columns that will be accessed. */
4813e30c24aSWill Andrews 	/* scols: The columns that will be accessed or skipped. */
482f94275ceSAdam Leventhal 	if (q == 0) {
4833e30c24aSWill Andrews 		/* Our I/O request doesn't span all child vdevs. */
484f94275ceSAdam Leventhal 		acols = bc;
485f94275ceSAdam Leventhal 		scols = MIN(dcols, roundup(bc, nparity + 1));
486f94275ceSAdam Leventhal 	} else {
487f94275ceSAdam Leventhal 		acols = dcols;
488f94275ceSAdam Leventhal 		scols = dcols;
489f94275ceSAdam Leventhal 	}
490fa9e4066Sahrens 
491f94275ceSAdam Leventhal 	ASSERT3U(acols, <=, scols);
492f94275ceSAdam Leventhal 
493f94275ceSAdam Leventhal 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
494fa9e4066Sahrens 
495fa9e4066Sahrens 	rm->rm_cols = acols;
496f94275ceSAdam Leventhal 	rm->rm_scols = scols;
497fa9e4066Sahrens 	rm->rm_bigcols = bc;
4982fbc121fSAdam Leventhal 	rm->rm_skipstart = bc;
49999653d4eSeschrock 	rm->rm_missingdata = 0;
50099653d4eSeschrock 	rm->rm_missingparity = 0;
50199653d4eSeschrock 	rm->rm_firstdatacol = nparity;
50222fe2c88SJonathan Adams 	rm->rm_datacopy = NULL;
50322fe2c88SJonathan Adams 	rm->rm_reports = 0;
50422fe2c88SJonathan Adams 	rm->rm_freed = 0;
50522fe2c88SJonathan Adams 	rm->rm_ecksuminjected = 0;
506fa9e4066Sahrens 
507f94275ceSAdam Leventhal 	asize = 0;
508f94275ceSAdam Leventhal 
509f94275ceSAdam Leventhal 	for (c = 0; c < scols; c++) {
510fa9e4066Sahrens 		col = f + c;
511fa9e4066Sahrens 		coff = o;
512fa9e4066Sahrens 		if (col >= dcols) {
513fa9e4066Sahrens 			col -= dcols;
514fa9e4066Sahrens 			coff += 1ULL << unit_shift;
515fa9e4066Sahrens 		}
51699653d4eSeschrock 		rm->rm_col[c].rc_devidx = col;
517fa9e4066Sahrens 		rm->rm_col[c].rc_offset = coff;
518fa9e4066Sahrens 		rm->rm_col[c].rc_data = NULL;
51922fe2c88SJonathan Adams 		rm->rm_col[c].rc_gdata = NULL;
520fa9e4066Sahrens 		rm->rm_col[c].rc_error = 0;
521fa9e4066Sahrens 		rm->rm_col[c].rc_tried = 0;
522fa9e4066Sahrens 		rm->rm_col[c].rc_skipped = 0;
523f94275ceSAdam Leventhal 
524f94275ceSAdam Leventhal 		if (c >= acols)
525f94275ceSAdam Leventhal 			rm->rm_col[c].rc_size = 0;
526f94275ceSAdam Leventhal 		else if (c < bc)
527f94275ceSAdam Leventhal 			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
528f94275ceSAdam Leventhal 		else
529f94275ceSAdam Leventhal 			rm->rm_col[c].rc_size = q << unit_shift;
530f94275ceSAdam Leventhal 
531f94275ceSAdam Leventhal 		asize += rm->rm_col[c].rc_size;
532fa9e4066Sahrens 	}
533fa9e4066Sahrens 
534f94275ceSAdam Leventhal 	ASSERT3U(asize, ==, tot << unit_shift);
535f94275ceSAdam Leventhal 	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
5362fbc121fSAdam Leventhal 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
5372fbc121fSAdam Leventhal 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
5382fbc121fSAdam Leventhal 	ASSERT3U(rm->rm_nskip, <=, nparity);
539fa9e4066Sahrens 
540fa9e4066Sahrens 	for (c = 0; c < rm->rm_firstdatacol; c++)
541fa9e4066Sahrens 		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
542fa9e4066Sahrens 
543810e43b2SBill Pijewski 	rm->rm_col[c].rc_data = data;
544fa9e4066Sahrens 
545fa9e4066Sahrens 	for (c = c + 1; c < acols; c++)
546fa9e4066Sahrens 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
547fa9e4066Sahrens 		    rm->rm_col[c - 1].rc_size;
548fa9e4066Sahrens 
549fa9e4066Sahrens 	/*
55099653d4eSeschrock 	 * If all data stored spans all columns, there's a danger that parity
55199653d4eSeschrock 	 * will always be on the same device and, since parity isn't read
55299653d4eSeschrock 	 * during normal operation, that that device's I/O bandwidth won't be
55399653d4eSeschrock 	 * used effectively. We therefore switch the parity every 1MB.
55499653d4eSeschrock 	 *
55599653d4eSeschrock 	 * ... at least that was, ostensibly, the theory. As a practical
55699653d4eSeschrock 	 * matter unless we juggle the parity between all devices evenly, we
55799653d4eSeschrock 	 * won't see any benefit. Further, occasional writes that aren't a
55899653d4eSeschrock 	 * multiple of the LCM of the number of children and the minimum
55999653d4eSeschrock 	 * stripe width are sufficient to avoid pessimal behavior.
56099653d4eSeschrock 	 * Unfortunately, this decision created an implicit on-disk format
561c7a40cc4Sahl 	 * requirement that we need to support for all eternity, but only
562c7a40cc4Sahl 	 * for single-parity RAID-Z.
5632fbc121fSAdam Leventhal 	 *
5642fbc121fSAdam Leventhal 	 * If we intend to skip a sector in the zeroth column for padding
5652fbc121fSAdam Leventhal 	 * we must make sure to note this swap. We will never intend to
5662fbc121fSAdam Leventhal 	 * skip the first column since at least one data and one parity
5672fbc121fSAdam Leventhal 	 * column must appear in each row.
568fa9e4066Sahrens 	 */
569fa9e4066Sahrens 	ASSERT(rm->rm_cols >= 2);
570fa9e4066Sahrens 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
571fa9e4066Sahrens 
572810e43b2SBill Pijewski 	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
57399653d4eSeschrock 		devidx = rm->rm_col[0].rc_devidx;
574fa9e4066Sahrens 		o = rm->rm_col[0].rc_offset;
57599653d4eSeschrock 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
576fa9e4066Sahrens 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
57799653d4eSeschrock 		rm->rm_col[1].rc_devidx = devidx;
578fa9e4066Sahrens 		rm->rm_col[1].rc_offset = o;
5792fbc121fSAdam Leventhal 
5802fbc121fSAdam Leventhal 		if (rm->rm_skipstart == 0)
5812fbc121fSAdam Leventhal 			rm->rm_skipstart = 1;
582fa9e4066Sahrens 	}
583fa9e4066Sahrens 
584fa9e4066Sahrens 	return (rm);
585fa9e4066Sahrens }
586fa9e4066Sahrens 
587fa9e4066Sahrens static void
58899653d4eSeschrock vdev_raidz_generate_parity_p(raidz_map_t *rm)
589fa9e4066Sahrens {
59099653d4eSeschrock 	uint64_t *p, *src, pcount, ccount, i;
59199653d4eSeschrock 	int c;
592fa9e4066Sahrens 
59399653d4eSeschrock 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
59499653d4eSeschrock 
59599653d4eSeschrock 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
59699653d4eSeschrock 		src = rm->rm_col[c].rc_data;
59799653d4eSeschrock 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
59899653d4eSeschrock 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
59999653d4eSeschrock 
60099653d4eSeschrock 		if (c == rm->rm_firstdatacol) {
60199653d4eSeschrock 			ASSERT(ccount == pcount);
602f94275ceSAdam Leventhal 			for (i = 0; i < ccount; i++, src++, p++) {
60399653d4eSeschrock 				*p = *src;
60499653d4eSeschrock 			}
60599653d4eSeschrock 		} else {
60699653d4eSeschrock 			ASSERT(ccount <= pcount);
607f94275ceSAdam Leventhal 			for (i = 0; i < ccount; i++, src++, p++) {
60899653d4eSeschrock 				*p ^= *src;
60999653d4eSeschrock 			}
61099653d4eSeschrock 		}
61199653d4eSeschrock 	}
61299653d4eSeschrock }
61399653d4eSeschrock 
61499653d4eSeschrock static void
61599653d4eSeschrock vdev_raidz_generate_parity_pq(raidz_map_t *rm)
61699653d4eSeschrock {
617f94275ceSAdam Leventhal 	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
61899653d4eSeschrock 	int c;
61999653d4eSeschrock 
620f94275ceSAdam Leventhal 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
62199653d4eSeschrock 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
62299653d4eSeschrock 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
62399653d4eSeschrock 
62499653d4eSeschrock 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
62599653d4eSeschrock 		src = rm->rm_col[c].rc_data;
62699653d4eSeschrock 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
62799653d4eSeschrock 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
628f94275ceSAdam Leventhal 
629f94275ceSAdam Leventhal 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
63099653d4eSeschrock 
63199653d4eSeschrock 		if (c == rm->rm_firstdatacol) {
632f94275ceSAdam Leventhal 			ASSERT(ccnt == pcnt || ccnt == 0);
633f94275ceSAdam Leventhal 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
63499653d4eSeschrock 				*p = *src;
635f94275ceSAdam Leventhal 				*q = *src;
63699653d4eSeschrock 			}
637f94275ceSAdam Leventhal 			for (; i < pcnt; i++, src++, p++, q++) {
63899653d4eSeschrock 				*p = 0;
639f94275ceSAdam Leventhal 				*q = 0;
64099653d4eSeschrock 			}
64199653d4eSeschrock 		} else {
642f94275ceSAdam Leventhal 			ASSERT(ccnt <= pcnt);
64399653d4eSeschrock 
64499653d4eSeschrock 			/*
645f94275ceSAdam Leventhal 			 * Apply the algorithm described above by multiplying
646f94275ceSAdam Leventhal 			 * the previous result and adding in the new value.
64799653d4eSeschrock 			 */
648f94275ceSAdam Leventhal 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
64999653d4eSeschrock 				*p ^= *src;
650f94275ceSAdam Leventhal 
651f94275ceSAdam Leventhal 				VDEV_RAIDZ_64MUL_2(*q, mask);
652f94275ceSAdam Leventhal 				*q ^= *src;
65399653d4eSeschrock 			}
65499653d4eSeschrock 
65599653d4eSeschrock 			/*
65699653d4eSeschrock 			 * Treat short columns as though they are full of 0s.
657f94275ceSAdam Leventhal 			 * Note that there's therefore nothing needed for P.
65899653d4eSeschrock 			 */
659f94275ceSAdam Leventhal 			for (; i < pcnt; i++, q++) {
660f94275ceSAdam Leventhal 				VDEV_RAIDZ_64MUL_2(*q, mask);
66199653d4eSeschrock 			}
66299653d4eSeschrock 		}
66399653d4eSeschrock 	}
66499653d4eSeschrock }
66599653d4eSeschrock 
66699653d4eSeschrock static void
667f94275ceSAdam Leventhal vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
668f94275ceSAdam Leventhal {
669f94275ceSAdam Leventhal 	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
670f94275ceSAdam Leventhal 	int c;
671f94275ceSAdam Leventhal 
672f94275ceSAdam Leventhal 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
673f94275ceSAdam Leventhal 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
674f94275ceSAdam Leventhal 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
675f94275ceSAdam Leventhal 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
676f94275ceSAdam Leventhal 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
677f94275ceSAdam Leventhal 
678f94275ceSAdam Leventhal 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
679f94275ceSAdam Leventhal 		src = rm->rm_col[c].rc_data;
680f94275ceSAdam Leventhal 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
681f94275ceSAdam Leventhal 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
682f94275ceSAdam Leventhal 		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
683f94275ceSAdam Leventhal 
684f94275ceSAdam Leventhal 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
685f94275ceSAdam Leventhal 
686f94275ceSAdam Leventhal 		if (c == rm->rm_firstdatacol) {
687f94275ceSAdam Leventhal 			ASSERT(ccnt == pcnt || ccnt == 0);
688f94275ceSAdam Leventhal 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
689f94275ceSAdam Leventhal 				*p = *src;
690f94275ceSAdam Leventhal 				*q = *src;
691f94275ceSAdam Leventhal 				*r = *src;
692f94275ceSAdam Leventhal 			}
693f94275ceSAdam Leventhal 			for (; i < pcnt; i++, src++, p++, q++, r++) {
694f94275ceSAdam Leventhal 				*p = 0;
695f94275ceSAdam Leventhal 				*q = 0;
696f94275ceSAdam Leventhal 				*r = 0;
697f94275ceSAdam Leventhal 			}
698f94275ceSAdam Leventhal 		} else {
699f94275ceSAdam Leventhal 			ASSERT(ccnt <= pcnt);
700f94275ceSAdam Leventhal 
701f94275ceSAdam Leventhal 			/*
702f94275ceSAdam Leventhal 			 * Apply the algorithm described above by multiplying
703f94275ceSAdam Leventhal 			 * the previous result and adding in the new value.
704f94275ceSAdam Leventhal 			 */
705f94275ceSAdam Leventhal 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
706f94275ceSAdam Leventhal 				*p ^= *src;
707f94275ceSAdam Leventhal 
708f94275ceSAdam Leventhal 				VDEV_RAIDZ_64MUL_2(*q, mask);
709f94275ceSAdam Leventhal 				*q ^= *src;
710f94275ceSAdam Leventhal 
711f94275ceSAdam Leventhal 				VDEV_RAIDZ_64MUL_4(*r, mask);
712f94275ceSAdam Leventhal 				*r ^= *src;
713f94275ceSAdam Leventhal 			}
714f94275ceSAdam Leventhal 
715f94275ceSAdam Leventhal 			/*
716f94275ceSAdam Leventhal 			 * Treat short columns as though they are full of 0s.
717f94275ceSAdam Leventhal 			 * Note that there's therefore nothing needed for P.
718f94275ceSAdam Leventhal 			 */
719f94275ceSAdam Leventhal 			for (; i < pcnt; i++, q++, r++) {
720f94275ceSAdam Leventhal 				VDEV_RAIDZ_64MUL_2(*q, mask);
721f94275ceSAdam Leventhal 				VDEV_RAIDZ_64MUL_4(*r, mask);
722f94275ceSAdam Leventhal 			}
723f94275ceSAdam Leventhal 		}
724f94275ceSAdam Leventhal 	}
725f94275ceSAdam Leventhal }
726f94275ceSAdam Leventhal 
727f94275ceSAdam Leventhal /*
728f94275ceSAdam Leventhal  * Generate RAID parity in the first virtual columns according to the number of
729f94275ceSAdam Leventhal  * parity columns available.
730f94275ceSAdam Leventhal  */
731f94275ceSAdam Leventhal static void
732f94275ceSAdam Leventhal vdev_raidz_generate_parity(raidz_map_t *rm)
733f94275ceSAdam Leventhal {
734f94275ceSAdam Leventhal 	switch (rm->rm_firstdatacol) {
735f94275ceSAdam Leventhal 	case 1:
736f94275ceSAdam Leventhal 		vdev_raidz_generate_parity_p(rm);
737f94275ceSAdam Leventhal 		break;
738f94275ceSAdam Leventhal 	case 2:
739f94275ceSAdam Leventhal 		vdev_raidz_generate_parity_pq(rm);
740f94275ceSAdam Leventhal 		break;
741f94275ceSAdam Leventhal 	case 3:
742f94275ceSAdam Leventhal 		vdev_raidz_generate_parity_pqr(rm);
743f94275ceSAdam Leventhal 		break;
744f94275ceSAdam Leventhal 	default:
745f94275ceSAdam Leventhal 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
746f94275ceSAdam Leventhal 	}
747f94275ceSAdam Leventhal }
748f94275ceSAdam Leventhal 
749f94275ceSAdam Leventhal static int
750f94275ceSAdam Leventhal vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
75199653d4eSeschrock {
75299653d4eSeschrock 	uint64_t *dst, *src, xcount, ccount, count, i;
753f94275ceSAdam Leventhal 	int x = tgts[0];
75499653d4eSeschrock 	int c;
75599653d4eSeschrock 
756f94275ceSAdam Leventhal 	ASSERT(ntgts == 1);
757f94275ceSAdam Leventhal 	ASSERT(x >= rm->rm_firstdatacol);
758f94275ceSAdam Leventhal 	ASSERT(x < rm->rm_cols);
759f94275ceSAdam Leventhal 
76099653d4eSeschrock 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
76199653d4eSeschrock 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
76299653d4eSeschrock 	ASSERT(xcount > 0);
76399653d4eSeschrock 
76499653d4eSeschrock 	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
76599653d4eSeschrock 	dst = rm->rm_col[x].rc_data;
76699653d4eSeschrock 	for (i = 0; i < xcount; i++, dst++, src++) {
76799653d4eSeschrock 		*dst = *src;
76899653d4eSeschrock 	}
76999653d4eSeschrock 
77099653d4eSeschrock 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
771fa9e4066Sahrens 		src = rm->rm_col[c].rc_data;
772fa9e4066Sahrens 		dst = rm->rm_col[x].rc_data;
77399653d4eSeschrock 
77499653d4eSeschrock 		if (c == x)
77599653d4eSeschrock 			continue;
77699653d4eSeschrock 
77799653d4eSeschrock 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
77899653d4eSeschrock 		count = MIN(ccount, xcount);
77999653d4eSeschrock 
78099653d4eSeschrock 		for (i = 0; i < count; i++, dst++, src++) {
78199653d4eSeschrock 			*dst ^= *src;
78299653d4eSeschrock 		}
78399653d4eSeschrock 	}
784f94275ceSAdam Leventhal 
785f94275ceSAdam Leventhal 	return (1 << VDEV_RAIDZ_P);
78699653d4eSeschrock }
78799653d4eSeschrock 
788f94275ceSAdam Leventhal static int
789f94275ceSAdam Leventhal vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
79099653d4eSeschrock {
79199653d4eSeschrock 	uint64_t *dst, *src, xcount, ccount, count, mask, i;
79299653d4eSeschrock 	uint8_t *b;
793f94275ceSAdam Leventhal 	int x = tgts[0];
79499653d4eSeschrock 	int c, j, exp;
79599653d4eSeschrock 
796f94275ceSAdam Leventhal 	ASSERT(ntgts == 1);
797f94275ceSAdam Leventhal 
79899653d4eSeschrock 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
79999653d4eSeschrock 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
80099653d4eSeschrock 
80199653d4eSeschrock 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
80299653d4eSeschrock 		src = rm->rm_col[c].rc_data;
80399653d4eSeschrock 		dst = rm->rm_col[x].rc_data;
80499653d4eSeschrock 
80599653d4eSeschrock 		if (c == x)
80699653d4eSeschrock 			ccount = 0;
80799653d4eSeschrock 		else
80899653d4eSeschrock 			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
80999653d4eSeschrock 
81099653d4eSeschrock 		count = MIN(ccount, xcount);
81199653d4eSeschrock 
81299653d4eSeschrock 		if (c == rm->rm_firstdatacol) {
81399653d4eSeschrock 			for (i = 0; i < count; i++, dst++, src++) {
81499653d4eSeschrock 				*dst = *src;
81599653d4eSeschrock 			}
81699653d4eSeschrock 			for (; i < xcount; i++, dst++) {
81799653d4eSeschrock 				*dst = 0;
81899653d4eSeschrock 			}
81999653d4eSeschrock 
820fa9e4066Sahrens 		} else {
82199653d4eSeschrock 			for (i = 0; i < count; i++, dst++, src++) {
822f94275ceSAdam Leventhal 				VDEV_RAIDZ_64MUL_2(*dst, mask);
82399653d4eSeschrock 				*dst ^= *src;
82499653d4eSeschrock 			}
82599653d4eSeschrock 
82699653d4eSeschrock 			for (; i < xcount; i++, dst++) {
827f94275ceSAdam Leventhal 				VDEV_RAIDZ_64MUL_2(*dst, mask);
828fa9e4066Sahrens 			}
829fa9e4066Sahrens 		}
830fa9e4066Sahrens 	}
831fa9e4066Sahrens 
83299653d4eSeschrock 	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
83399653d4eSeschrock 	dst = rm->rm_col[x].rc_data;
83499653d4eSeschrock 	exp = 255 - (rm->rm_cols - 1 - x);
83599653d4eSeschrock 
83699653d4eSeschrock 	for (i = 0; i < xcount; i++, dst++, src++) {
83799653d4eSeschrock 		*dst ^= *src;
83899653d4eSeschrock 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
83999653d4eSeschrock 			*b = vdev_raidz_exp2(*b, exp);
84099653d4eSeschrock 		}
84199653d4eSeschrock 	}
842f94275ceSAdam Leventhal 
843f94275ceSAdam Leventhal 	return (1 << VDEV_RAIDZ_Q);
84499653d4eSeschrock }
84599653d4eSeschrock 
846f94275ceSAdam Leventhal static int
847f94275ceSAdam Leventhal vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
84899653d4eSeschrock {
84999653d4eSeschrock 	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
85099653d4eSeschrock 	void *pdata, *qdata;
85199653d4eSeschrock 	uint64_t xsize, ysize, i;
852f94275ceSAdam Leventhal 	int x = tgts[0];
853f94275ceSAdam Leventhal 	int y = tgts[1];
85499653d4eSeschrock 
855f94275ceSAdam Leventhal 	ASSERT(ntgts == 2);
85699653d4eSeschrock 	ASSERT(x < y);
85799653d4eSeschrock 	ASSERT(x >= rm->rm_firstdatacol);
85899653d4eSeschrock 	ASSERT(y < rm->rm_cols);
85999653d4eSeschrock 
86099653d4eSeschrock 	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
86199653d4eSeschrock 
86299653d4eSeschrock 	/*
86399653d4eSeschrock 	 * Move the parity data aside -- we're going to compute parity as
86499653d4eSeschrock 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
86599653d4eSeschrock 	 * reuse the parity generation mechanism without trashing the actual
86699653d4eSeschrock 	 * parity so we make those columns appear to be full of zeros by
86799653d4eSeschrock 	 * setting their lengths to zero.
86899653d4eSeschrock 	 */
86999653d4eSeschrock 	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
87099653d4eSeschrock 	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
87199653d4eSeschrock 	xsize = rm->rm_col[x].rc_size;
87299653d4eSeschrock 	ysize = rm->rm_col[y].rc_size;
87399653d4eSeschrock 
87499653d4eSeschrock 	rm->rm_col[VDEV_RAIDZ_P].rc_data =
87599653d4eSeschrock 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
87699653d4eSeschrock 	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
87799653d4eSeschrock 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
87899653d4eSeschrock 	rm->rm_col[x].rc_size = 0;
87999653d4eSeschrock 	rm->rm_col[y].rc_size = 0;
88099653d4eSeschrock 
88199653d4eSeschrock 	vdev_raidz_generate_parity_pq(rm);
88299653d4eSeschrock 
88399653d4eSeschrock 	rm->rm_col[x].rc_size = xsize;
88499653d4eSeschrock 	rm->rm_col[y].rc_size = ysize;
88599653d4eSeschrock 
88699653d4eSeschrock 	p = pdata;
88799653d4eSeschrock 	q = qdata;
88899653d4eSeschrock 	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
88999653d4eSeschrock 	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
89099653d4eSeschrock 	xd = rm->rm_col[x].rc_data;
89199653d4eSeschrock 	yd = rm->rm_col[y].rc_data;
89299653d4eSeschrock 
89399653d4eSeschrock 	/*
89499653d4eSeschrock 	 * We now have:
89599653d4eSeschrock 	 *	Pxy = P + D_x + D_y
89699653d4eSeschrock 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
89799653d4eSeschrock 	 *
89899653d4eSeschrock 	 * We can then solve for D_x:
89999653d4eSeschrock 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
90099653d4eSeschrock 	 * where
90199653d4eSeschrock 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
90299653d4eSeschrock 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
90399653d4eSeschrock 	 *
90499653d4eSeschrock 	 * With D_x in hand, we can easily solve for D_y:
90599653d4eSeschrock 	 *	D_y = P + Pxy + D_x
90699653d4eSeschrock 	 */
90799653d4eSeschrock 
90899653d4eSeschrock 	a = vdev_raidz_pow2[255 + x - y];
90999653d4eSeschrock 	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
91099653d4eSeschrock 	tmp = 255 - vdev_raidz_log2[a ^ 1];
91199653d4eSeschrock 
91299653d4eSeschrock 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
91399653d4eSeschrock 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
91499653d4eSeschrock 
91599653d4eSeschrock 	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
91699653d4eSeschrock 		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
91799653d4eSeschrock 		    vdev_raidz_exp2(*q ^ *qxy, bexp);
91899653d4eSeschrock 
91999653d4eSeschrock 		if (i < ysize)
92099653d4eSeschrock 			*yd = *p ^ *pxy ^ *xd;
92199653d4eSeschrock 	}
92299653d4eSeschrock 
92399653d4eSeschrock 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
92499653d4eSeschrock 	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
92599653d4eSeschrock 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
92699653d4eSeschrock 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
92799653d4eSeschrock 
92899653d4eSeschrock 	/*
92999653d4eSeschrock 	 * Restore the saved parity data.
93099653d4eSeschrock 	 */
93199653d4eSeschrock 	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
93299653d4eSeschrock 	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
933f94275ceSAdam Leventhal 
934f94275ceSAdam Leventhal 	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
93599653d4eSeschrock }
93699653d4eSeschrock 
937f94275ceSAdam Leventhal /* BEGIN CSTYLED */
938f94275ceSAdam Leventhal /*
939f94275ceSAdam Leventhal  * In the general case of reconstruction, we must solve the system of linear
940f94275ceSAdam Leventhal  * equations defined by the coefficients used to generate parity as well as
941f94275ceSAdam Leventhal  * the contents of the data and parity disks. This can be expressed with
942f94275ceSAdam Leventhal  * vectors for the original data (D) and the actual data (d) and parity (p)
943f94275ceSAdam Leventhal  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
944f94275ceSAdam Leventhal  *
945f94275ceSAdam Leventhal  *            __   __                     __     __
946f94275ceSAdam Leventhal  *            |     |         __     __   |  p_0  |
947f94275ceSAdam Leventhal  *            |  V  |         |  D_0  |   | p_m-1 |
948f94275ceSAdam Leventhal  *            |     |    x    |   :   | = |  d_0  |
949f94275ceSAdam Leventhal  *            |  I  |         | D_n-1 |   |   :   |
950f94275ceSAdam Leventhal  *            |     |         ~~     ~~   | d_n-1 |
951f94275ceSAdam Leventhal  *            ~~   ~~                     ~~     ~~
952f94275ceSAdam Leventhal  *
953f94275ceSAdam Leventhal  * I is simply a square identity matrix of size n, and V is a vandermonde
954f94275ceSAdam Leventhal  * matrix defined by the coefficients we chose for the various parity columns
955f94275ceSAdam Leventhal  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
956f94275ceSAdam Leventhal  * computation as well as linear separability.
957f94275ceSAdam Leventhal  *
958f94275ceSAdam Leventhal  *      __               __               __     __
959f94275ceSAdam Leventhal  *      |   1   ..  1 1 1 |               |  p_0  |
960f94275ceSAdam Leventhal  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
961f94275ceSAdam Leventhal  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
962f94275ceSAdam Leventhal  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
963f94275ceSAdam Leventhal  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
964f94275ceSAdam Leventhal  *      |   :       : : : |   |   :   |   |  d_2  |
965f94275ceSAdam Leventhal  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
966f94275ceSAdam Leventhal  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
967f94275ceSAdam Leventhal  *      |   0   ..  0 0 1 |               | d_n-1 |
968f94275ceSAdam Leventhal  *      ~~               ~~               ~~     ~~
969f94275ceSAdam Leventhal  *
970f94275ceSAdam Leventhal  * Note that I, V, d, and p are known. To compute D, we must invert the
971f94275ceSAdam Leventhal  * matrix and use the known data and parity values to reconstruct the unknown
972f94275ceSAdam Leventhal  * data values. We begin by removing the rows in V|I and d|p that correspond
973f94275ceSAdam Leventhal  * to failed or missing columns; we then make V|I square (n x n) and d|p
974f94275ceSAdam Leventhal  * sized n by removing rows corresponding to unused parity from the bottom up
975f94275ceSAdam Leventhal  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
976f94275ceSAdam Leventhal  * using Gauss-Jordan elimination. In the example below we use m=3 parity
977f94275ceSAdam Leventhal  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
978f94275ceSAdam Leventhal  *           __                               __
979f94275ceSAdam Leventhal  *           |  1   1   1   1   1   1   1   1  |
980f94275ceSAdam Leventhal  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
981f94275ceSAdam Leventhal  *           |  19 205 116  29  64  16  4   1  |      / /
982f94275ceSAdam Leventhal  *           |  1   0   0   0   0   0   0   0  |     / /
983f94275ceSAdam Leventhal  *           |  0   1   0   0   0   0   0   0  | <--' /
984f94275ceSAdam Leventhal  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
985f94275ceSAdam Leventhal  *           |  0   0   0   1   0   0   0   0  |
986f94275ceSAdam Leventhal  *           |  0   0   0   0   1   0   0   0  |
987f94275ceSAdam Leventhal  *           |  0   0   0   0   0   1   0   0  |
988f94275ceSAdam Leventhal  *           |  0   0   0   0   0   0   1   0  |
989f94275ceSAdam Leventhal  *           |  0   0   0   0   0   0   0   1  |
990f94275ceSAdam Leventhal  *           ~~                               ~~
991f94275ceSAdam Leventhal  *           __                               __
992f94275ceSAdam Leventhal  *           |  1   1   1   1   1   1   1   1  |
993f94275ceSAdam Leventhal  *           |  19 205 116  29  64  16  4   1  |
994f94275ceSAdam Leventhal  *           |  1   0   0   0   0   0   0   0  |
995810e43b2SBill Pijewski  *  (V|I)' = |  0   0   0   1   0   0   0   0  |
996f94275ceSAdam Leventhal  *           |  0   0   0   0   1   0   0   0  |
997f94275ceSAdam Leventhal  *           |  0   0   0   0   0   1   0   0  |
998f94275ceSAdam Leventhal  *           |  0   0   0   0   0   0   1   0  |
999f94275ceSAdam Leventhal  *           |  0   0   0   0   0   0   0   1  |
1000f94275ceSAdam Leventhal  *           ~~                               ~~
1001f94275ceSAdam Leventhal  *
1002f94275ceSAdam Leventhal  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1003f94275ceSAdam Leventhal  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1004f94275ceSAdam Leventhal  * matrix is not singular.
1005f94275ceSAdam Leventhal  * __                                                                 __
1006f94275ceSAdam Leventhal  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1007f94275ceSAdam Leventhal  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1008f94275ceSAdam Leventhal  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1009f94275ceSAdam Leventhal  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1010f94275ceSAdam Leventhal  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1011f94275ceSAdam Leventhal  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1012f94275ceSAdam Leventhal  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1013f94275ceSAdam Leventhal  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1014f94275ceSAdam Leventhal  * ~~                                                                 ~~
1015f94275ceSAdam Leventhal  * __                                                                 __
1016f94275ceSAdam Leventhal  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1017f94275ceSAdam Leventhal  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1018f94275ceSAdam Leventhal  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1019f94275ceSAdam Leventhal  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1020f94275ceSAdam Leventhal  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1021f94275ceSAdam Leventhal  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1022f94275ceSAdam Leventhal  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1023f94275ceSAdam Leventhal  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1024f94275ceSAdam Leventhal  * ~~                                                                 ~~
1025f94275ceSAdam Leventhal  * __                                                                 __
1026f94275ceSAdam Leventhal  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1027f94275ceSAdam Leventhal  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1028f94275ceSAdam Leventhal  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1029f94275ceSAdam Leventhal  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1030f94275ceSAdam Leventhal  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1031f94275ceSAdam Leventhal  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1032f94275ceSAdam Leventhal  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1033f94275ceSAdam Leventhal  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1034f94275ceSAdam Leventhal  * ~~                                                                 ~~
1035f94275ceSAdam Leventhal  * __                                                                 __
1036f94275ceSAdam Leventhal  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1037f94275ceSAdam Leventhal  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1038f94275ceSAdam Leventhal  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1039f94275ceSAdam Leventhal  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1040f94275ceSAdam Leventhal  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1041f94275ceSAdam Leventhal  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1042f94275ceSAdam Leventhal  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1043f94275ceSAdam Leventhal  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1044f94275ceSAdam Leventhal  * ~~                                                                 ~~
1045f94275ceSAdam Leventhal  * __                                                                 __
1046f94275ceSAdam Leventhal  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1047f94275ceSAdam Leventhal  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1048f94275ceSAdam Leventhal  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1049f94275ceSAdam Leventhal  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1050f94275ceSAdam Leventhal  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1051f94275ceSAdam Leventhal  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1052f94275ceSAdam Leventhal  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1053f94275ceSAdam Leventhal  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1054f94275ceSAdam Leventhal  * ~~                                                                 ~~
1055f94275ceSAdam Leventhal  * __                                                                 __
1056f94275ceSAdam Leventhal  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1057f94275ceSAdam Leventhal  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1058f94275ceSAdam Leventhal  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1059f94275ceSAdam Leventhal  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1060f94275ceSAdam Leventhal  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1061f94275ceSAdam Leventhal  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1062f94275ceSAdam Leventhal  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1063f94275ceSAdam Leventhal  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1064f94275ceSAdam Leventhal  * ~~                                                                 ~~
1065f94275ceSAdam Leventhal  *                   __                               __
1066f94275ceSAdam Leventhal  *                   |  0   0   1   0   0   0   0   0  |
1067f94275ceSAdam Leventhal  *                   | 167 100  5   41 159 169 217 208 |
1068f94275ceSAdam Leventhal  *                   | 166 100  4   40 158 168 216 209 |
1069f94275ceSAdam Leventhal  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1070f94275ceSAdam Leventhal  *                   |  0   0   0   0   1   0   0   0  |
1071f94275ceSAdam Leventhal  *                   |  0   0   0   0   0   1   0   0  |
1072f94275ceSAdam Leventhal  *                   |  0   0   0   0   0   0   1   0  |
1073f94275ceSAdam Leventhal  *                   |  0   0   0   0   0   0   0   1  |
1074f94275ceSAdam Leventhal  *                   ~~                               ~~
1075f94275ceSAdam Leventhal  *
1076f94275ceSAdam Leventhal  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1077f94275ceSAdam Leventhal  * of the missing data.
1078f94275ceSAdam Leventhal  *
1079f94275ceSAdam Leventhal  * As is apparent from the example above, the only non-trivial rows in the
1080f94275ceSAdam Leventhal  * inverse matrix correspond to the data disks that we're trying to
1081f94275ceSAdam Leventhal  * reconstruct. Indeed, those are the only rows we need as the others would
1082f94275ceSAdam Leventhal  * only be useful for reconstructing data known or assumed to be valid. For
1083f94275ceSAdam Leventhal  * that reason, we only build the coefficients in the rows that correspond to
1084f94275ceSAdam Leventhal  * targeted columns.
1085f94275ceSAdam Leventhal  */
1086f94275ceSAdam Leventhal /* END CSTYLED */
1087f94275ceSAdam Leventhal 
1088f94275ceSAdam Leventhal static void
1089f94275ceSAdam Leventhal vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1090f94275ceSAdam Leventhal     uint8_t **rows)
1091f94275ceSAdam Leventhal {
1092f94275ceSAdam Leventhal 	int i, j;
1093f94275ceSAdam Leventhal 	int pow;
1094f94275ceSAdam Leventhal 
1095f94275ceSAdam Leventhal 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1096f94275ceSAdam Leventhal 
1097f94275ceSAdam Leventhal 	/*
1098f94275ceSAdam Leventhal 	 * Fill in the missing rows of interest.
1099f94275ceSAdam Leventhal 	 */
1100f94275ceSAdam Leventhal 	for (i = 0; i < nmap; i++) {
1101f94275ceSAdam Leventhal 		ASSERT3S(0, <=, map[i]);
1102f94275ceSAdam Leventhal 		ASSERT3S(map[i], <=, 2);
1103f94275ceSAdam Leventhal 
1104f94275ceSAdam Leventhal 		pow = map[i] * n;
1105f94275ceSAdam Leventhal 		if (pow > 255)
1106f94275ceSAdam Leventhal 			pow -= 255;
1107f94275ceSAdam Leventhal 		ASSERT(pow <= 255);
1108f94275ceSAdam Leventhal 
1109f94275ceSAdam Leventhal 		for (j = 0; j < n; j++) {
1110f94275ceSAdam Leventhal 			pow -= map[i];
1111f94275ceSAdam Leventhal 			if (pow < 0)
1112f94275ceSAdam Leventhal 				pow += 255;
1113f94275ceSAdam Leventhal 			rows[i][j] = vdev_raidz_pow2[pow];
1114f94275ceSAdam Leventhal 		}
1115f94275ceSAdam Leventhal 	}
1116f94275ceSAdam Leventhal }
1117f94275ceSAdam Leventhal 
1118f94275ceSAdam Leventhal static void
1119f94275ceSAdam Leventhal vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1120f94275ceSAdam Leventhal     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1121f94275ceSAdam Leventhal {
1122f94275ceSAdam Leventhal 	int i, j, ii, jj;
1123f94275ceSAdam Leventhal 	uint8_t log;
1124f94275ceSAdam Leventhal 
1125f94275ceSAdam Leventhal 	/*
1126f94275ceSAdam Leventhal 	 * Assert that the first nmissing entries from the array of used
1127f94275ceSAdam Leventhal 	 * columns correspond to parity columns and that subsequent entries
1128f94275ceSAdam Leventhal 	 * correspond to data columns.
1129f94275ceSAdam Leventhal 	 */
1130f94275ceSAdam Leventhal 	for (i = 0; i < nmissing; i++) {
1131f94275ceSAdam Leventhal 		ASSERT3S(used[i], <, rm->rm_firstdatacol);
1132f94275ceSAdam Leventhal 	}
1133f94275ceSAdam Leventhal 	for (; i < n; i++) {
1134f94275ceSAdam Leventhal 		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1135f94275ceSAdam Leventhal 	}
1136f94275ceSAdam Leventhal 
1137f94275ceSAdam Leventhal 	/*
1138f94275ceSAdam Leventhal 	 * First initialize the storage where we'll compute the inverse rows.
1139f94275ceSAdam Leventhal 	 */
1140f94275ceSAdam Leventhal 	for (i = 0; i < nmissing; i++) {
1141f94275ceSAdam Leventhal 		for (j = 0; j < n; j++) {
1142f94275ceSAdam Leventhal 			invrows[i][j] = (i == j) ? 1 : 0;
1143f94275ceSAdam Leventhal 		}
1144f94275ceSAdam Leventhal 	}
1145f94275ceSAdam Leventhal 
1146f94275ceSAdam Leventhal 	/*
1147f94275ceSAdam Leventhal 	 * Subtract all trivial rows from the rows of consequence.
1148f94275ceSAdam Leventhal 	 */
1149f94275ceSAdam Leventhal 	for (i = 0; i < nmissing; i++) {
1150f94275ceSAdam Leventhal 		for (j = nmissing; j < n; j++) {
1151f94275ceSAdam Leventhal 			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1152f94275ceSAdam Leventhal 			jj = used[j] - rm->rm_firstdatacol;
1153f94275ceSAdam Leventhal 			ASSERT3S(jj, <, n);
1154f94275ceSAdam Leventhal 			invrows[i][j] = rows[i][jj];
1155f94275ceSAdam Leventhal 			rows[i][jj] = 0;
1156f94275ceSAdam Leventhal 		}
1157f94275ceSAdam Leventhal 	}
1158f94275ceSAdam Leventhal 
1159f94275ceSAdam Leventhal 	/*
1160f94275ceSAdam Leventhal 	 * For each of the rows of interest, we must normalize it and subtract
1161f94275ceSAdam Leventhal 	 * a multiple of it from the other rows.
1162f94275ceSAdam Leventhal 	 */
1163f94275ceSAdam Leventhal 	for (i = 0; i < nmissing; i++) {
1164f94275ceSAdam Leventhal 		for (j = 0; j < missing[i]; j++) {
1165fb09f5aaSMadhav Suresh 			ASSERT0(rows[i][j]);
1166f94275ceSAdam Leventhal 		}
1167f94275ceSAdam Leventhal 		ASSERT3U(rows[i][missing[i]], !=, 0);
1168f94275ceSAdam Leventhal 
1169f94275ceSAdam Leventhal 		/*
1170f94275ceSAdam Leventhal 		 * Compute the inverse of the first element and multiply each
1171f94275ceSAdam Leventhal 		 * element in the row by that value.
1172f94275ceSAdam Leventhal 		 */
1173f94275ceSAdam Leventhal 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1174f94275ceSAdam Leventhal 
1175f94275ceSAdam Leventhal 		for (j = 0; j < n; j++) {
1176f94275ceSAdam Leventhal 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1177f94275ceSAdam Leventhal 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1178f94275ceSAdam Leventhal 		}
1179f94275ceSAdam Leventhal 
1180f94275ceSAdam Leventhal 		for (ii = 0; ii < nmissing; ii++) {
1181f94275ceSAdam Leventhal 			if (i == ii)
1182f94275ceSAdam Leventhal 				continue;
1183f94275ceSAdam Leventhal 
1184f94275ceSAdam Leventhal 			ASSERT3U(rows[ii][missing[i]], !=, 0);
1185f94275ceSAdam Leventhal 
1186f94275ceSAdam Leventhal 			log = vdev_raidz_log2[rows[ii][missing[i]]];
1187f94275ceSAdam Leventhal 
1188f94275ceSAdam Leventhal 			for (j = 0; j < n; j++) {
1189f94275ceSAdam Leventhal 				rows[ii][j] ^=
1190f94275ceSAdam Leventhal 				    vdev_raidz_exp2(rows[i][j], log);
1191f94275ceSAdam Leventhal 				invrows[ii][j] ^=
1192f94275ceSAdam Leventhal 				    vdev_raidz_exp2(invrows[i][j], log);
1193f94275ceSAdam Leventhal 			}
1194f94275ceSAdam Leventhal 		}
1195f94275ceSAdam Leventhal 	}
1196f94275ceSAdam Leventhal 
1197f94275ceSAdam Leventhal 	/*
1198f94275ceSAdam Leventhal 	 * Verify that the data that is left in the rows are properly part of
1199f94275ceSAdam Leventhal 	 * an identity matrix.
1200f94275ceSAdam Leventhal 	 */
1201f94275ceSAdam Leventhal 	for (i = 0; i < nmissing; i++) {
1202f94275ceSAdam Leventhal 		for (j = 0; j < n; j++) {
1203f94275ceSAdam Leventhal 			if (j == missing[i]) {
1204f94275ceSAdam Leventhal 				ASSERT3U(rows[i][j], ==, 1);
1205f94275ceSAdam Leventhal 			} else {
1206fb09f5aaSMadhav Suresh 				ASSERT0(rows[i][j]);
1207f94275ceSAdam Leventhal 			}
1208f94275ceSAdam Leventhal 		}
1209f94275ceSAdam Leventhal 	}
1210f94275ceSAdam Leventhal }
1211f94275ceSAdam Leventhal 
1212f94275ceSAdam Leventhal static void
1213f94275ceSAdam Leventhal vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1214f94275ceSAdam Leventhal     int *missing, uint8_t **invrows, const uint8_t *used)
1215f94275ceSAdam Leventhal {
1216f94275ceSAdam Leventhal 	int i, j, x, cc, c;
1217f94275ceSAdam Leventhal 	uint8_t *src;
1218f94275ceSAdam Leventhal 	uint64_t ccount;
1219f94275ceSAdam Leventhal 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1220f94275ceSAdam Leventhal 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1221d5285caeSGeorge Wilson 	uint8_t log = 0;
1222d5285caeSGeorge Wilson 	uint8_t val;
1223f94275ceSAdam Leventhal 	int ll;
1224f94275ceSAdam Leventhal 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1225f94275ceSAdam Leventhal 	uint8_t *p, *pp;
1226f94275ceSAdam Leventhal 	size_t psize;
1227f94275ceSAdam Leventhal 
1228f94275ceSAdam Leventhal 	psize = sizeof (invlog[0][0]) * n * nmissing;
1229f94275ceSAdam Leventhal 	p = kmem_alloc(psize, KM_SLEEP);
1230f94275ceSAdam Leventhal 
1231f94275ceSAdam Leventhal 	for (pp = p, i = 0; i < nmissing; i++) {
1232f94275ceSAdam Leventhal 		invlog[i] = pp;
1233f94275ceSAdam Leventhal 		pp += n;
1234f94275ceSAdam Leventhal 	}
1235f94275ceSAdam Leventhal 
1236f94275ceSAdam Leventhal 	for (i = 0; i < nmissing; i++) {
1237f94275ceSAdam Leventhal 		for (j = 0; j < n; j++) {
1238f94275ceSAdam Leventhal 			ASSERT3U(invrows[i][j], !=, 0);
1239f94275ceSAdam Leventhal 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1240f94275ceSAdam Leventhal 		}
1241f94275ceSAdam Leventhal 	}
1242f94275ceSAdam Leventhal 
1243f94275ceSAdam Leventhal 	for (i = 0; i < n; i++) {
1244f94275ceSAdam Leventhal 		c = used[i];
1245f94275ceSAdam Leventhal 		ASSERT3U(c, <, rm->rm_cols);
1246f94275ceSAdam Leventhal 
1247f94275ceSAdam Leventhal 		src = rm->rm_col[c].rc_data;
1248f94275ceSAdam Leventhal 		ccount = rm->rm_col[c].rc_size;
1249f94275ceSAdam Leventhal 		for (j = 0; j < nmissing; j++) {
1250f94275ceSAdam Leventhal 			cc = missing[j] + rm->rm_firstdatacol;
1251f94275ceSAdam Leventhal 			ASSERT3U(cc, >=, rm->rm_firstdatacol);
1252f94275ceSAdam Leventhal 			ASSERT3U(cc, <, rm->rm_cols);
1253f94275ceSAdam Leventhal 			ASSERT3U(cc, !=, c);
1254f94275ceSAdam Leventhal 
1255f94275ceSAdam Leventhal 			dst[j] = rm->rm_col[cc].rc_data;
1256f94275ceSAdam Leventhal 			dcount[j] = rm->rm_col[cc].rc_size;
1257f94275ceSAdam Leventhal 		}
1258f94275ceSAdam Leventhal 
1259f94275ceSAdam Leventhal 		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1260f94275ceSAdam Leventhal 
1261f94275ceSAdam Leventhal 		for (x = 0; x < ccount; x++, src++) {
1262f94275ceSAdam Leventhal 			if (*src != 0)
1263f94275ceSAdam Leventhal 				log = vdev_raidz_log2[*src];
1264f94275ceSAdam Leventhal 
1265f94275ceSAdam Leventhal 			for (cc = 0; cc < nmissing; cc++) {
1266f94275ceSAdam Leventhal 				if (x >= dcount[cc])
1267f94275ceSAdam Leventhal 					continue;
1268f94275ceSAdam Leventhal 
1269f94275ceSAdam Leventhal 				if (*src == 0) {
1270f94275ceSAdam Leventhal 					val = 0;
1271f94275ceSAdam Leventhal 				} else {
1272f94275ceSAdam Leventhal 					if ((ll = log + invlog[cc][i]) >= 255)
1273f94275ceSAdam Leventhal 						ll -= 255;
1274f94275ceSAdam Leventhal 					val = vdev_raidz_pow2[ll];
1275f94275ceSAdam Leventhal 				}
1276f94275ceSAdam Leventhal 
1277f94275ceSAdam Leventhal 				if (i == 0)
1278f94275ceSAdam Leventhal 					dst[cc][x] = val;
1279f94275ceSAdam Leventhal 				else
1280f94275ceSAdam Leventhal 					dst[cc][x] ^= val;
1281f94275ceSAdam Leventhal 			}
1282f94275ceSAdam Leventhal 		}
1283f94275ceSAdam Leventhal 	}
1284f94275ceSAdam Leventhal 
1285f94275ceSAdam Leventhal 	kmem_free(p, psize);
1286f94275ceSAdam Leventhal }
1287f94275ceSAdam Leventhal 
1288f94275ceSAdam Leventhal static int
1289f94275ceSAdam Leventhal vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1290f94275ceSAdam Leventhal {
1291f94275ceSAdam Leventhal 	int n, i, c, t, tt;
1292f94275ceSAdam Leventhal 	int nmissing_rows;
1293f94275ceSAdam Leventhal 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1294f94275ceSAdam Leventhal 	int parity_map[VDEV_RAIDZ_MAXPARITY];
1295f94275ceSAdam Leventhal 
1296f94275ceSAdam Leventhal 	uint8_t *p, *pp;
1297f94275ceSAdam Leventhal 	size_t psize;
1298f94275ceSAdam Leventhal 
1299f94275ceSAdam Leventhal 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1300f94275ceSAdam Leventhal 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1301f94275ceSAdam Leventhal 	uint8_t *used;
1302f94275ceSAdam Leventhal 
1303f94275ceSAdam Leventhal 	int code = 0;
1304f94275ceSAdam Leventhal 
1305f94275ceSAdam Leventhal 
1306f94275ceSAdam Leventhal 	n = rm->rm_cols - rm->rm_firstdatacol;
1307f94275ceSAdam Leventhal 
1308f94275ceSAdam Leventhal 	/*
1309f94275ceSAdam Leventhal 	 * Figure out which data columns are missing.
1310f94275ceSAdam Leventhal 	 */
1311f94275ceSAdam Leventhal 	nmissing_rows = 0;
1312f94275ceSAdam Leventhal 	for (t = 0; t < ntgts; t++) {
1313f94275ceSAdam Leventhal 		if (tgts[t] >= rm->rm_firstdatacol) {
1314f94275ceSAdam Leventhal 			missing_rows[nmissing_rows++] =
1315f94275ceSAdam Leventhal 			    tgts[t] - rm->rm_firstdatacol;
1316f94275ceSAdam Leventhal 		}
1317f94275ceSAdam Leventhal 	}
1318f94275ceSAdam Leventhal 
1319f94275ceSAdam Leventhal 	/*
1320f94275ceSAdam Leventhal 	 * Figure out which parity columns to use to help generate the missing
1321f94275ceSAdam Leventhal 	 * data columns.
1322f94275ceSAdam Leventhal 	 */
1323f94275ceSAdam Leventhal 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1324f94275ceSAdam Leventhal 		ASSERT(tt < ntgts);
1325f94275ceSAdam Leventhal 		ASSERT(c < rm->rm_firstdatacol);
1326f94275ceSAdam Leventhal 
1327f94275ceSAdam Leventhal 		/*
1328f94275ceSAdam Leventhal 		 * Skip any targeted parity columns.
1329f94275ceSAdam Leventhal 		 */
1330f94275ceSAdam Leventhal 		if (c == tgts[tt]) {
1331f94275ceSAdam Leventhal 			tt++;
1332f94275ceSAdam Leventhal 			continue;
1333f94275ceSAdam Leventhal 		}
1334f94275ceSAdam Leventhal 
1335f94275ceSAdam Leventhal 		code |= 1 << c;
1336f94275ceSAdam Leventhal 
1337f94275ceSAdam Leventhal 		parity_map[i] = c;
1338f94275ceSAdam Leventhal 		i++;
1339f94275ceSAdam Leventhal 	}
1340f94275ceSAdam Leventhal 
1341f94275ceSAdam Leventhal 	ASSERT(code != 0);
1342f94275ceSAdam Leventhal 	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1343f94275ceSAdam Leventhal 
1344f94275ceSAdam Leventhal 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1345f94275ceSAdam Leventhal 	    nmissing_rows * n + sizeof (used[0]) * n;
1346f94275ceSAdam Leventhal 	p = kmem_alloc(psize, KM_SLEEP);
1347f94275ceSAdam Leventhal 
1348f94275ceSAdam Leventhal 	for (pp = p, i = 0; i < nmissing_rows; i++) {
1349f94275ceSAdam Leventhal 		rows[i] = pp;
1350f94275ceSAdam Leventhal 		pp += n;
1351f94275ceSAdam Leventhal 		invrows[i] = pp;
1352f94275ceSAdam Leventhal 		pp += n;
1353f94275ceSAdam Leventhal 	}
1354f94275ceSAdam Leventhal 	used = pp;
1355f94275ceSAdam Leventhal 
1356f94275ceSAdam Leventhal 	for (i = 0; i < nmissing_rows; i++) {
1357f94275ceSAdam Leventhal 		used[i] = parity_map[i];
1358f94275ceSAdam Leventhal 	}
1359f94275ceSAdam Leventhal 
1360f94275ceSAdam Leventhal 	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1361f94275ceSAdam Leventhal 		if (tt < nmissing_rows &&
1362f94275ceSAdam Leventhal 		    c == missing_rows[tt] + rm->rm_firstdatacol) {
1363f94275ceSAdam Leventhal 			tt++;
1364f94275ceSAdam Leventhal 			continue;
1365f94275ceSAdam Leventhal 		}
1366f94275ceSAdam Leventhal 
1367f94275ceSAdam Leventhal 		ASSERT3S(i, <, n);
1368f94275ceSAdam Leventhal 		used[i] = c;
1369f94275ceSAdam Leventhal 		i++;
1370f94275ceSAdam Leventhal 	}
1371f94275ceSAdam Leventhal 
1372f94275ceSAdam Leventhal 	/*
1373f94275ceSAdam Leventhal 	 * Initialize the interesting rows of the matrix.
1374f94275ceSAdam Leventhal 	 */
1375f94275ceSAdam Leventhal 	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1376f94275ceSAdam Leventhal 
1377f94275ceSAdam Leventhal 	/*
1378f94275ceSAdam Leventhal 	 * Invert the matrix.
1379f94275ceSAdam Leventhal 	 */
1380f94275ceSAdam Leventhal 	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1381f94275ceSAdam Leventhal 	    invrows, used);
1382f94275ceSAdam Leventhal 
1383f94275ceSAdam Leventhal 	/*
1384f94275ceSAdam Leventhal 	 * Reconstruct the missing data using the generated matrix.
1385f94275ceSAdam Leventhal 	 */
1386f94275ceSAdam Leventhal 	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1387f94275ceSAdam Leventhal 	    invrows, used);
1388f94275ceSAdam Leventhal 
1389f94275ceSAdam Leventhal 	kmem_free(p, psize);
1390f94275ceSAdam Leventhal 
1391f94275ceSAdam Leventhal 	return (code);
1392f94275ceSAdam Leventhal }
1393f94275ceSAdam Leventhal 
1394f94275ceSAdam Leventhal static int
1395f94275ceSAdam Leventhal vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1396f94275ceSAdam Leventhal {
1397f94275ceSAdam Leventhal 	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1398f94275ceSAdam Leventhal 	int ntgts;
1399f94275ceSAdam Leventhal 	int i, c;
1400f94275ceSAdam Leventhal 	int code;
1401f94275ceSAdam Leventhal 	int nbadparity, nbaddata;
1402f94275ceSAdam Leventhal 	int parity_valid[VDEV_RAIDZ_MAXPARITY];
1403f94275ceSAdam Leventhal 
1404f94275ceSAdam Leventhal 	/*
1405f94275ceSAdam Leventhal 	 * The tgts list must already be sorted.
1406f94275ceSAdam Leventhal 	 */
1407f94275ceSAdam Leventhal 	for (i = 1; i < nt; i++) {
1408f94275ceSAdam Leventhal 		ASSERT(t[i] > t[i - 1]);
1409f94275ceSAdam Leventhal 	}
1410f94275ceSAdam Leventhal 
1411f94275ceSAdam Leventhal 	nbadparity = rm->rm_firstdatacol;
1412f94275ceSAdam Leventhal 	nbaddata = rm->rm_cols - nbadparity;
1413f94275ceSAdam Leventhal 	ntgts = 0;
1414f94275ceSAdam Leventhal 	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1415f94275ceSAdam Leventhal 		if (c < rm->rm_firstdatacol)
1416f94275ceSAdam Leventhal 			parity_valid[c] = B_FALSE;
1417f94275ceSAdam Leventhal 
1418f94275ceSAdam Leventhal 		if (i < nt && c == t[i]) {
1419f94275ceSAdam Leventhal 			tgts[ntgts++] = c;
1420f94275ceSAdam Leventhal 			i++;
1421f94275ceSAdam Leventhal 		} else if (rm->rm_col[c].rc_error != 0) {
1422f94275ceSAdam Leventhal 			tgts[ntgts++] = c;
1423f94275ceSAdam Leventhal 		} else if (c >= rm->rm_firstdatacol) {
1424f94275ceSAdam Leventhal 			nbaddata--;
1425f94275ceSAdam Leventhal 		} else {
1426f94275ceSAdam Leventhal 			parity_valid[c] = B_TRUE;
1427f94275ceSAdam Leventhal 			nbadparity--;
1428f94275ceSAdam Leventhal 		}
1429f94275ceSAdam Leventhal 	}
1430f94275ceSAdam Leventhal 
1431f94275ceSAdam Leventhal 	ASSERT(ntgts >= nt);
1432f94275ceSAdam Leventhal 	ASSERT(nbaddata >= 0);
1433f94275ceSAdam Leventhal 	ASSERT(nbaddata + nbadparity == ntgts);
1434f94275ceSAdam Leventhal 
1435f94275ceSAdam Leventhal 	dt = &tgts[nbadparity];
1436f94275ceSAdam Leventhal 
1437f94275ceSAdam Leventhal 	/*
1438f94275ceSAdam Leventhal 	 * See if we can use any of our optimized reconstruction routines.
1439f94275ceSAdam Leventhal 	 */
1440f94275ceSAdam Leventhal 	if (!vdev_raidz_default_to_general) {
1441f94275ceSAdam Leventhal 		switch (nbaddata) {
1442f94275ceSAdam Leventhal 		case 1:
1443f94275ceSAdam Leventhal 			if (parity_valid[VDEV_RAIDZ_P])
1444f94275ceSAdam Leventhal 				return (vdev_raidz_reconstruct_p(rm, dt, 1));
1445f94275ceSAdam Leventhal 
1446f94275ceSAdam Leventhal 			ASSERT(rm->rm_firstdatacol > 1);
1447f94275ceSAdam Leventhal 
1448f94275ceSAdam Leventhal 			if (parity_valid[VDEV_RAIDZ_Q])
1449f94275ceSAdam Leventhal 				return (vdev_raidz_reconstruct_q(rm, dt, 1));
1450f94275ceSAdam Leventhal 
1451f94275ceSAdam Leventhal 			ASSERT(rm->rm_firstdatacol > 2);
1452f94275ceSAdam Leventhal 			break;
1453f94275ceSAdam Leventhal 
1454f94275ceSAdam Leventhal 		case 2:
1455f94275ceSAdam Leventhal 			ASSERT(rm->rm_firstdatacol > 1);
1456f94275ceSAdam Leventhal 
1457f94275ceSAdam Leventhal 			if (parity_valid[VDEV_RAIDZ_P] &&
1458f94275ceSAdam Leventhal 			    parity_valid[VDEV_RAIDZ_Q])
1459f94275ceSAdam Leventhal 				return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1460f94275ceSAdam Leventhal 
1461f94275ceSAdam Leventhal 			ASSERT(rm->rm_firstdatacol > 2);
1462f94275ceSAdam Leventhal 
1463f94275ceSAdam Leventhal 			break;
1464f94275ceSAdam Leventhal 		}
1465f94275ceSAdam Leventhal 	}
1466f94275ceSAdam Leventhal 
1467f94275ceSAdam Leventhal 	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1468f94275ceSAdam Leventhal 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1469f94275ceSAdam Leventhal 	ASSERT(code > 0);
1470f94275ceSAdam Leventhal 	return (code);
1471f94275ceSAdam Leventhal }
147299653d4eSeschrock 
1473fa9e4066Sahrens static int
14744263d13fSGeorge Wilson vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
14754263d13fSGeorge Wilson     uint64_t *ashift)
1476fa9e4066Sahrens {
1477f94275ceSAdam Leventhal 	vdev_t *cvd;
147899653d4eSeschrock 	uint64_t nparity = vd->vdev_nparity;
1479f94275ceSAdam Leventhal 	int c;
1480fa9e4066Sahrens 	int lasterror = 0;
1481fa9e4066Sahrens 	int numerrors = 0;
1482fa9e4066Sahrens 
148399653d4eSeschrock 	ASSERT(nparity > 0);
148499653d4eSeschrock 
148599653d4eSeschrock 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
148699653d4eSeschrock 	    vd->vdev_children < nparity + 1) {
1487fa9e4066Sahrens 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1488be6fd75aSMatthew Ahrens 		return (SET_ERROR(EINVAL));
1489fa9e4066Sahrens 	}
1490fa9e4066Sahrens 
1491f64c0e34SEric Taylor 	vdev_open_children(vd);
1492fa9e4066Sahrens 
1493f94275ceSAdam Leventhal 	for (c = 0; c < vd->vdev_children; c++) {
1494f94275ceSAdam Leventhal 		cvd = vd->vdev_child[c];
1495f64c0e34SEric Taylor 
1496f94275ceSAdam Leventhal 		if (cvd->vdev_open_error != 0) {
1497f64c0e34SEric Taylor 			lasterror = cvd->vdev_open_error;
1498fa9e4066Sahrens 			numerrors++;
1499fa9e4066Sahrens 			continue;
1500fa9e4066Sahrens 		}
1501fa9e4066Sahrens 
1502fa9e4066Sahrens 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
15034263d13fSGeorge Wilson 		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1504ecc2d604Sbonwick 		*ashift = MAX(*ashift, cvd->vdev_ashift);
1505fa9e4066Sahrens 	}
1506fa9e4066Sahrens 
1507fa9e4066Sahrens 	*asize *= vd->vdev_children;
15084263d13fSGeorge Wilson 	*max_asize *= vd->vdev_children;
1509fa9e4066Sahrens 
151099653d4eSeschrock 	if (numerrors > nparity) {
1511fa9e4066Sahrens 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1512fa9e4066Sahrens 		return (lasterror);
1513fa9e4066Sahrens 	}
1514fa9e4066Sahrens 
1515fa9e4066Sahrens 	return (0);
1516fa9e4066Sahrens }
1517fa9e4066Sahrens 
1518fa9e4066Sahrens static void
1519fa9e4066Sahrens vdev_raidz_close(vdev_t *vd)
1520fa9e4066Sahrens {
1521f94275ceSAdam Leventhal 	int c;
1522f94275ceSAdam Leventhal 
1523f94275ceSAdam Leventhal 	for (c = 0; c < vd->vdev_children; c++)
1524fa9e4066Sahrens 		vdev_close(vd->vdev_child[c]);
1525fa9e4066Sahrens }
1526fa9e4066Sahrens 
1527810e43b2SBill Pijewski /*
1528810e43b2SBill Pijewski  * Handle a read or write I/O to a RAID-Z dump device.
1529810e43b2SBill Pijewski  *
1530810e43b2SBill Pijewski  * The dump device is in a unique situation compared to other ZFS datasets:
1531810e43b2SBill Pijewski  * writing to this device should be as simple and fast as possible.  In
1532810e43b2SBill Pijewski  * addition, durability matters much less since the dump will be extracted
1533810e43b2SBill Pijewski  * once the machine reboots.  For that reason, this function eschews parity for
1534810e43b2SBill Pijewski  * performance and simplicity.  The dump device uses the checksum setting
1535810e43b2SBill Pijewski  * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1536810e43b2SBill Pijewski  * dataset.
1537810e43b2SBill Pijewski  *
1538810e43b2SBill Pijewski  * Blocks of size 128 KB have been preallocated for this volume.  I/Os less than
1539810e43b2SBill Pijewski  * 128 KB will not fill an entire block; in addition, they may not be properly
1540810e43b2SBill Pijewski  * aligned.  In that case, this function uses the preallocated 128 KB block and
1541810e43b2SBill Pijewski  * omits reading or writing any "empty" portions of that block, as opposed to
1542810e43b2SBill Pijewski  * allocating a fresh appropriately-sized block.
1543810e43b2SBill Pijewski  *
1544810e43b2SBill Pijewski  * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1545810e43b2SBill Pijewski  *
1546810e43b2SBill Pijewski  *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1547810e43b2SBill Pijewski  *
1548810e43b2SBill Pijewski  * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1549810e43b2SBill Pijewski  * allocated which spans all five child vdevs.  8 KB of data would be written to
1550810e43b2SBill Pijewski  * each of four vdevs, with the fifth containing the parity bits.
1551810e43b2SBill Pijewski  *
1552810e43b2SBill Pijewski  *       parity    data     data     data     data
1553810e43b2SBill Pijewski  *     |   PP   |   XX   |   XX   |   XX   |   XX   |
1554810e43b2SBill Pijewski  *         ^        ^        ^        ^        ^
1555810e43b2SBill Pijewski  *         |        |        |        |        |
1556810e43b2SBill Pijewski  *   8 KB parity    ------8 KB data blocks------
1557810e43b2SBill Pijewski  *
1558810e43b2SBill Pijewski  * However, when writing to the dump device, the behavior is different:
1559810e43b2SBill Pijewski  *
1560810e43b2SBill Pijewski  *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1561810e43b2SBill Pijewski  *
1562810e43b2SBill Pijewski  * Unlike the normal RAID-Z case in which the block is allocated based on the
1563810e43b2SBill Pijewski  * I/O size, reads and writes here always use a 128 KB logical I/O size.  If the
1564810e43b2SBill Pijewski  * I/O size is less than 128 KB, only the actual portions of data are written.
1565810e43b2SBill Pijewski  * In this example the data is written to the third data vdev since that vdev
1566810e43b2SBill Pijewski  * contains the offset [64 KB, 96 KB).
1567810e43b2SBill Pijewski  *
1568810e43b2SBill Pijewski  *       parity    data     data     data     data
1569810e43b2SBill Pijewski  *     |        |        |        |   XX   |        |
1570810e43b2SBill Pijewski  *                                    ^
1571810e43b2SBill Pijewski  *                                    |
1572810e43b2SBill Pijewski  *                             32 KB data block
1573810e43b2SBill Pijewski  *
1574810e43b2SBill Pijewski  * As a result, an individual I/O may not span all child vdevs; moreover, a
1575810e43b2SBill Pijewski  * small I/O may only operate on a single child vdev.
1576810e43b2SBill Pijewski  *
1577810e43b2SBill Pijewski  * Note that since there are no parity bits calculated or written, this format
1578810e43b2SBill Pijewski  * remains the same no matter how many parity bits are used in a normal RAID-Z
1579810e43b2SBill Pijewski  * stripe.  On a RAID-Z3 configuration with seven child vdevs, the example above
1580810e43b2SBill Pijewski  * would look like:
1581810e43b2SBill Pijewski  *
1582810e43b2SBill Pijewski  *       parity   parity   parity    data     data     data     data
1583810e43b2SBill Pijewski  *     |        |        |        |        |        |   XX   |        |
1584810e43b2SBill Pijewski  *                                                      ^
1585810e43b2SBill Pijewski  *                                                      |
1586810e43b2SBill Pijewski  *                                               32 KB data block
1587810e43b2SBill Pijewski  */
1588810e43b2SBill Pijewski int
1589810e43b2SBill Pijewski vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
1590810e43b2SBill Pijewski     uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
1591810e43b2SBill Pijewski {
1592810e43b2SBill Pijewski 	vdev_t *tvd = vd->vdev_top;
1593810e43b2SBill Pijewski 	vdev_t *cvd;
1594810e43b2SBill Pijewski 	raidz_map_t *rm;
1595810e43b2SBill Pijewski 	raidz_col_t *rc;
1596810e43b2SBill Pijewski 	int c, err = 0;
1597810e43b2SBill Pijewski 
1598810e43b2SBill Pijewski 	uint64_t start, end, colstart, colend;
1599810e43b2SBill Pijewski 	uint64_t coloffset, colsize, colskip;
1600810e43b2SBill Pijewski 
1601810e43b2SBill Pijewski 	int flags = doread ? B_READ : B_WRITE;
1602810e43b2SBill Pijewski 
1603810e43b2SBill Pijewski #ifdef	_KERNEL
1604810e43b2SBill Pijewski 
1605810e43b2SBill Pijewski 	/*
1606810e43b2SBill Pijewski 	 * Don't write past the end of the block
1607810e43b2SBill Pijewski 	 */
1608b5152584SMatthew Ahrens 	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
1609810e43b2SBill Pijewski 
1610810e43b2SBill Pijewski 	start = offset;
1611810e43b2SBill Pijewski 	end = start + size;
1612810e43b2SBill Pijewski 
1613810e43b2SBill Pijewski 	/*
1614810e43b2SBill Pijewski 	 * Allocate a RAID-Z map for this block.  Note that this block starts
1615810e43b2SBill Pijewski 	 * from the "original" offset, this is, the offset of the extent which
1616810e43b2SBill Pijewski 	 * contains the requisite offset of the data being read or written.
1617810e43b2SBill Pijewski 	 *
1618810e43b2SBill Pijewski 	 * Even if this I/O operation doesn't span the full block size, let's
1619810e43b2SBill Pijewski 	 * treat the on-disk format as if the only blocks are the complete 128
1620810e43b2SBill Pijewski 	 * KB size.
1621810e43b2SBill Pijewski 	 */
1622810e43b2SBill Pijewski 	rm = vdev_raidz_map_alloc(data - (offset - origoffset),
1623b5152584SMatthew Ahrens 	    SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
1624b5152584SMatthew Ahrens 	    vd->vdev_children, vd->vdev_nparity);
1625810e43b2SBill Pijewski 
1626810e43b2SBill Pijewski 	coloffset = origoffset;
1627810e43b2SBill Pijewski 
1628810e43b2SBill Pijewski 	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
1629810e43b2SBill Pijewski 	    c++, coloffset += rc->rc_size) {
1630810e43b2SBill Pijewski 		rc = &rm->rm_col[c];
1631810e43b2SBill Pijewski 		cvd = vd->vdev_child[rc->rc_devidx];
1632810e43b2SBill Pijewski 
1633810e43b2SBill Pijewski 		/*
1634810e43b2SBill Pijewski 		 * Find the start and end of this column in the RAID-Z map,
1635810e43b2SBill Pijewski 		 * keeping in mind that the stated size and offset of the
1636810e43b2SBill Pijewski 		 * operation may not fill the entire column for this vdev.
1637810e43b2SBill Pijewski 		 *
1638810e43b2SBill Pijewski 		 * If any portion of the data spans this column, issue the
1639810e43b2SBill Pijewski 		 * appropriate operation to the vdev.
1640810e43b2SBill Pijewski 		 */
1641810e43b2SBill Pijewski 		if (coloffset + rc->rc_size <= start)
1642810e43b2SBill Pijewski 			continue;
1643810e43b2SBill Pijewski 		if (coloffset >= end)
1644810e43b2SBill Pijewski 			continue;
1645810e43b2SBill Pijewski 
1646810e43b2SBill Pijewski 		colstart = MAX(coloffset, start);
1647810e43b2SBill Pijewski 		colend = MIN(end, coloffset + rc->rc_size);
1648810e43b2SBill Pijewski 		colsize = colend - colstart;
1649810e43b2SBill Pijewski 		colskip = colstart - coloffset;
1650810e43b2SBill Pijewski 
1651810e43b2SBill Pijewski 		VERIFY3U(colsize, <=, rc->rc_size);
1652810e43b2SBill Pijewski 		VERIFY3U(colskip, <=, rc->rc_size);
1653810e43b2SBill Pijewski 
1654810e43b2SBill Pijewski 		/*
1655810e43b2SBill Pijewski 		 * Note that the child vdev will have a vdev label at the start
1656810e43b2SBill Pijewski 		 * of its range of offsets, hence the need for
1657810e43b2SBill Pijewski 		 * VDEV_LABEL_OFFSET().  See zio_vdev_child_io() for another
1658810e43b2SBill Pijewski 		 * example of why this calculation is needed.
1659810e43b2SBill Pijewski 		 */
1660810e43b2SBill Pijewski 		if ((err = vdev_disk_physio(cvd,
1661810e43b2SBill Pijewski 		    ((char *)rc->rc_data) + colskip, colsize,
1662810e43b2SBill Pijewski 		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
1663810e43b2SBill Pijewski 		    flags, isdump)) != 0)
1664810e43b2SBill Pijewski 			break;
1665810e43b2SBill Pijewski 	}
1666810e43b2SBill Pijewski 
1667810e43b2SBill Pijewski 	vdev_raidz_map_free(rm);
1668810e43b2SBill Pijewski #endif	/* KERNEL */
1669810e43b2SBill Pijewski 
1670810e43b2SBill Pijewski 	return (err);
1671810e43b2SBill Pijewski }
1672810e43b2SBill Pijewski 
1673fa9e4066Sahrens static uint64_t
1674fa9e4066Sahrens vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1675fa9e4066Sahrens {
1676fa9e4066Sahrens 	uint64_t asize;
1677ecc2d604Sbonwick 	uint64_t ashift = vd->vdev_top->vdev_ashift;
1678fa9e4066Sahrens 	uint64_t cols = vd->vdev_children;
167999653d4eSeschrock 	uint64_t nparity = vd->vdev_nparity;
1680fa9e4066Sahrens 
1681ecc2d604Sbonwick 	asize = ((psize - 1) >> ashift) + 1;
168299653d4eSeschrock 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
168399653d4eSeschrock 	asize = roundup(asize, nparity + 1) << ashift;
1684fa9e4066Sahrens 
1685fa9e4066Sahrens 	return (asize);
1686fa9e4066Sahrens }
1687fa9e4066Sahrens 
1688fa9e4066Sahrens static void
1689fa9e4066Sahrens vdev_raidz_child_done(zio_t *zio)
1690fa9e4066Sahrens {
1691fa9e4066Sahrens 	raidz_col_t *rc = zio->io_private;
1692fa9e4066Sahrens 
1693fa9e4066Sahrens 	rc->rc_error = zio->io_error;
1694fa9e4066Sahrens 	rc->rc_tried = 1;
1695fa9e4066Sahrens 	rc->rc_skipped = 0;
1696fa9e4066Sahrens }
1697fa9e4066Sahrens 
16983e30c24aSWill Andrews /*
16993e30c24aSWill Andrews  * Start an IO operation on a RAIDZ VDev
17003e30c24aSWill Andrews  *
17013e30c24aSWill Andrews  * Outline:
17023e30c24aSWill Andrews  * - For write operations:
17033e30c24aSWill Andrews  *   1. Generate the parity data
17043e30c24aSWill Andrews  *   2. Create child zio write operations to each column's vdev, for both
17053e30c24aSWill Andrews  *      data and parity.
17063e30c24aSWill Andrews  *   3. If the column skips any sectors for padding, create optional dummy
17073e30c24aSWill Andrews  *      write zio children for those areas to improve aggregation continuity.
17083e30c24aSWill Andrews  * - For read operations:
17093e30c24aSWill Andrews  *   1. Create child zio read operations to each data column's vdev to read
17103e30c24aSWill Andrews  *      the range of data required for zio.
17113e30c24aSWill Andrews  *   2. If this is a scrub or resilver operation, or if any of the data
17123e30c24aSWill Andrews  *      vdevs have had errors, then create zio read operations to the parity
17133e30c24aSWill Andrews  *      columns' VDevs as well.
17143e30c24aSWill Andrews  */
1715738f37bcSGeorge Wilson static void
1716fa9e4066Sahrens vdev_raidz_io_start(zio_t *zio)
1717fa9e4066Sahrens {
1718fa9e4066Sahrens 	vdev_t *vd = zio->io_vd;
1719ecc2d604Sbonwick 	vdev_t *tvd = vd->vdev_top;
1720fa9e4066Sahrens 	vdev_t *cvd;
1721fa9e4066Sahrens 	raidz_map_t *rm;
1722fa9e4066Sahrens 	raidz_col_t *rc;
1723f94275ceSAdam Leventhal 	int c, i;
1724fa9e4066Sahrens 
1725810e43b2SBill Pijewski 	rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
1726810e43b2SBill Pijewski 	    tvd->vdev_ashift, vd->vdev_children,
172799653d4eSeschrock 	    vd->vdev_nparity);
1728fa9e4066Sahrens 
1729810e43b2SBill Pijewski 	zio->io_vsd = rm;
1730810e43b2SBill Pijewski 	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
1731810e43b2SBill Pijewski 
173244cd46caSbillm 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1733fa9e4066Sahrens 
1734fa9e4066Sahrens 	if (zio->io_type == ZIO_TYPE_WRITE) {
1735f94275ceSAdam Leventhal 		vdev_raidz_generate_parity(rm);
1736fa9e4066Sahrens 
1737fa9e4066Sahrens 		for (c = 0; c < rm->rm_cols; c++) {
1738fa9e4066Sahrens 			rc = &rm->rm_col[c];
173999653d4eSeschrock 			cvd = vd->vdev_child[rc->rc_devidx];
1740fa9e4066Sahrens 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1741fa9e4066Sahrens 			    rc->rc_offset, rc->rc_data, rc->rc_size,
1742e14bb325SJeff Bonwick 			    zio->io_type, zio->io_priority, 0,
1743fa9e4066Sahrens 			    vdev_raidz_child_done, rc));
1744fa9e4066Sahrens 		}
1745e05725b1Sbonwick 
1746f94275ceSAdam Leventhal 		/*
1747f94275ceSAdam Leventhal 		 * Generate optional I/Os for any skipped sectors to improve
1748f94275ceSAdam Leventhal 		 * aggregation contiguity.
1749f94275ceSAdam Leventhal 		 */
17502fbc121fSAdam Leventhal 		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1751f94275ceSAdam Leventhal 			ASSERT(c <= rm->rm_scols);
1752f94275ceSAdam Leventhal 			if (c == rm->rm_scols)
1753f94275ceSAdam Leventhal 				c = 0;
1754f94275ceSAdam Leventhal 			rc = &rm->rm_col[c];
1755f94275ceSAdam Leventhal 			cvd = vd->vdev_child[rc->rc_devidx];
1756f94275ceSAdam Leventhal 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1757f94275ceSAdam Leventhal 			    rc->rc_offset + rc->rc_size, NULL,
1758f94275ceSAdam Leventhal 			    1 << tvd->vdev_ashift,
1759f94275ceSAdam Leventhal 			    zio->io_type, zio->io_priority,
1760f94275ceSAdam Leventhal 			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1761f94275ceSAdam Leventhal 		}
1762f94275ceSAdam Leventhal 
1763738f37bcSGeorge Wilson 		zio_execute(zio);
1764738f37bcSGeorge Wilson 		return;
1765fa9e4066Sahrens 	}
1766fa9e4066Sahrens 
1767fa9e4066Sahrens 	ASSERT(zio->io_type == ZIO_TYPE_READ);
1768fa9e4066Sahrens 
176999653d4eSeschrock 	/*
177099653d4eSeschrock 	 * Iterate over the columns in reverse order so that we hit the parity
1771f94275ceSAdam Leventhal 	 * last -- any errors along the way will force us to read the parity.
177299653d4eSeschrock 	 */
1773fa9e4066Sahrens 	for (c = rm->rm_cols - 1; c >= 0; c--) {
1774fa9e4066Sahrens 		rc = &rm->rm_col[c];
177599653d4eSeschrock 		cvd = vd->vdev_child[rc->rc_devidx];
1776*4b7f25f9SArne Jansen 		if (cvd->vdev_avoid_read) {
1777*4b7f25f9SArne Jansen 			if (c >= rm->rm_firstdatacol)
1778*4b7f25f9SArne Jansen 				rm->rm_missingdata++;
1779*4b7f25f9SArne Jansen 			else
1780*4b7f25f9SArne Jansen 				rm->rm_missingparity++;
1781*4b7f25f9SArne Jansen 			rc->rc_error = SET_ERROR(ENXIO);
1782*4b7f25f9SArne Jansen 			rc->rc_skipped = 1;	/* only try if necessary */
1783*4b7f25f9SArne Jansen 			continue;
1784*4b7f25f9SArne Jansen 		}
17850a4e9518Sgw25295 		if (!vdev_readable(cvd)) {
178699653d4eSeschrock 			if (c >= rm->rm_firstdatacol)
178799653d4eSeschrock 				rm->rm_missingdata++;
178899653d4eSeschrock 			else
178999653d4eSeschrock 				rm->rm_missingparity++;
1790be6fd75aSMatthew Ahrens 			rc->rc_error = SET_ERROR(ENXIO);
1791fa9e4066Sahrens 			rc->rc_tried = 1;	/* don't even try */
1792fa9e4066Sahrens 			rc->rc_skipped = 1;
1793fa9e4066Sahrens 			continue;
1794fa9e4066Sahrens 		}
1795b24ab676SJeff Bonwick 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
179699653d4eSeschrock 			if (c >= rm->rm_firstdatacol)
179799653d4eSeschrock 				rm->rm_missingdata++;
179899653d4eSeschrock 			else
179999653d4eSeschrock 				rm->rm_missingparity++;
1800be6fd75aSMatthew Ahrens 			rc->rc_error = SET_ERROR(ESTALE);
1801fa9e4066Sahrens 			rc->rc_skipped = 1;
1802fa9e4066Sahrens 			continue;
1803fa9e4066Sahrens 		}
180499653d4eSeschrock 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1805dfd80e3eSMark J Musante 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1806fa9e4066Sahrens 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1807fa9e4066Sahrens 			    rc->rc_offset, rc->rc_data, rc->rc_size,
1808e14bb325SJeff Bonwick 			    zio->io_type, zio->io_priority, 0,
1809fa9e4066Sahrens 			    vdev_raidz_child_done, rc));
1810fa9e4066Sahrens 		}
1811fa9e4066Sahrens 	}
1812fa9e4066Sahrens 
1813738f37bcSGeorge Wilson 	zio_execute(zio);
1814fa9e4066Sahrens }
1815fa9e4066Sahrens 
18163f9d6ad7SLin Ling 
1817ea8dc4b6Seschrock /*
1818ea8dc4b6Seschrock  * Report a checksum error for a child of a RAID-Z device.
1819ea8dc4b6Seschrock  */
1820ea8dc4b6Seschrock static void
182122fe2c88SJonathan Adams raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1822ea8dc4b6Seschrock {
182399653d4eSeschrock 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1824ea8dc4b6Seschrock 
1825ea8dc4b6Seschrock 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
182622fe2c88SJonathan Adams 		zio_bad_cksum_t zbc;
182722fe2c88SJonathan Adams 		raidz_map_t *rm = zio->io_vsd;
182822fe2c88SJonathan Adams 
1829ea8dc4b6Seschrock 		mutex_enter(&vd->vdev_stat_lock);
1830ea8dc4b6Seschrock 		vd->vdev_stat.vs_checksum_errors++;
1831ea8dc4b6Seschrock 		mutex_exit(&vd->vdev_stat_lock);
183222fe2c88SJonathan Adams 
183322fe2c88SJonathan Adams 		zbc.zbc_has_cksum = 0;
183422fe2c88SJonathan Adams 		zbc.zbc_injected = rm->rm_ecksuminjected;
183522fe2c88SJonathan Adams 
183622fe2c88SJonathan Adams 		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
183722fe2c88SJonathan Adams 		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
183822fe2c88SJonathan Adams 		    &zbc);
183922fe2c88SJonathan Adams 	}
1840ea8dc4b6Seschrock }
1841ea8dc4b6Seschrock 
184222fe2c88SJonathan Adams /*
184322fe2c88SJonathan Adams  * We keep track of whether or not there were any injected errors, so that
184422fe2c88SJonathan Adams  * any ereports we generate can note it.
184522fe2c88SJonathan Adams  */
184622fe2c88SJonathan Adams static int
184722fe2c88SJonathan Adams raidz_checksum_verify(zio_t *zio)
184822fe2c88SJonathan Adams {
184922fe2c88SJonathan Adams 	zio_bad_cksum_t zbc;
185022fe2c88SJonathan Adams 	raidz_map_t *rm = zio->io_vsd;
185122fe2c88SJonathan Adams 
185222fe2c88SJonathan Adams 	int ret = zio_checksum_error(zio, &zbc);
185322fe2c88SJonathan Adams 	if (ret != 0 && zbc.zbc_injected != 0)
185422fe2c88SJonathan Adams 		rm->rm_ecksuminjected = 1;
185522fe2c88SJonathan Adams 
185622fe2c88SJonathan Adams 	return (ret);
1857ea8dc4b6Seschrock }
1858ea8dc4b6Seschrock 
185999653d4eSeschrock /*
186099653d4eSeschrock  * Generate the parity from the data columns. If we tried and were able to
186199653d4eSeschrock  * read the parity without error, verify that the generated parity matches the
186299653d4eSeschrock  * data we read. If it doesn't, we fire off a checksum error. Return the
186399653d4eSeschrock  * number such failures.
186499653d4eSeschrock  */
186599653d4eSeschrock static int
186699653d4eSeschrock raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
186799653d4eSeschrock {
186899653d4eSeschrock 	void *orig[VDEV_RAIDZ_MAXPARITY];
186999653d4eSeschrock 	int c, ret = 0;
187099653d4eSeschrock 	raidz_col_t *rc;
187199653d4eSeschrock 
1872810e43b2SBill Pijewski 	blkptr_t *bp = zio->io_bp;
1873810e43b2SBill Pijewski 	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
1874810e43b2SBill Pijewski 	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
1875810e43b2SBill Pijewski 
1876810e43b2SBill Pijewski 	if (checksum == ZIO_CHECKSUM_NOPARITY)
1877810e43b2SBill Pijewski 		return (ret);
1878810e43b2SBill Pijewski 
187999653d4eSeschrock 	for (c = 0; c < rm->rm_firstdatacol; c++) {
188099653d4eSeschrock 		rc = &rm->rm_col[c];
188199653d4eSeschrock 		if (!rc->rc_tried || rc->rc_error != 0)
188299653d4eSeschrock 			continue;
188399653d4eSeschrock 		orig[c] = zio_buf_alloc(rc->rc_size);
188499653d4eSeschrock 		bcopy(rc->rc_data, orig[c], rc->rc_size);
188599653d4eSeschrock 	}
188699653d4eSeschrock 
1887f94275ceSAdam Leventhal 	vdev_raidz_generate_parity(rm);
188899653d4eSeschrock 
188999653d4eSeschrock 	for (c = 0; c < rm->rm_firstdatacol; c++) {
189099653d4eSeschrock 		rc = &rm->rm_col[c];
189199653d4eSeschrock 		if (!rc->rc_tried || rc->rc_error != 0)
189299653d4eSeschrock 			continue;
189399653d4eSeschrock 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
189422fe2c88SJonathan Adams 			raidz_checksum_error(zio, rc, orig[c]);
1895be6fd75aSMatthew Ahrens 			rc->rc_error = SET_ERROR(ECKSUM);
189699653d4eSeschrock 			ret++;
189799653d4eSeschrock 		}
189899653d4eSeschrock 		zio_buf_free(orig[c], rc->rc_size);
189999653d4eSeschrock 	}
190099653d4eSeschrock 
190199653d4eSeschrock 	return (ret);
190299653d4eSeschrock }
190399653d4eSeschrock 
1904f94275ceSAdam Leventhal /*
1905f94275ceSAdam Leventhal  * Keep statistics on all the ways that we used parity to correct data.
1906f94275ceSAdam Leventhal  */
1907f94275ceSAdam Leventhal static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1908ea8dc4b6Seschrock 
1909e05725b1Sbonwick static int
1910e14bb325SJeff Bonwick vdev_raidz_worst_error(raidz_map_t *rm)
1911e14bb325SJeff Bonwick {
1912e14bb325SJeff Bonwick 	int error = 0;
1913e14bb325SJeff Bonwick 
1914e14bb325SJeff Bonwick 	for (int c = 0; c < rm->rm_cols; c++)
1915e14bb325SJeff Bonwick 		error = zio_worst_error(error, rm->rm_col[c].rc_error);
1916e14bb325SJeff Bonwick 
1917e14bb325SJeff Bonwick 	return (error);
1918e14bb325SJeff Bonwick }
1919e14bb325SJeff Bonwick 
1920f94275ceSAdam Leventhal /*
1921f94275ceSAdam Leventhal  * Iterate over all combinations of bad data and attempt a reconstruction.
1922f94275ceSAdam Leventhal  * Note that the algorithm below is non-optimal because it doesn't take into
1923f94275ceSAdam Leventhal  * account how reconstruction is actually performed. For example, with
1924f94275ceSAdam Leventhal  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1925f94275ceSAdam Leventhal  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1926f94275ceSAdam Leventhal  * cases we'd only use parity information in column 0.
1927f94275ceSAdam Leventhal  */
1928f94275ceSAdam Leventhal static int
1929f94275ceSAdam Leventhal vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1930f94275ceSAdam Leventhal {
1931f94275ceSAdam Leventhal 	raidz_map_t *rm = zio->io_vsd;
1932f94275ceSAdam Leventhal 	raidz_col_t *rc;
1933f94275ceSAdam Leventhal 	void *orig[VDEV_RAIDZ_MAXPARITY];
1934f94275ceSAdam Leventhal 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1935f94275ceSAdam Leventhal 	int *tgts = &tstore[1];
1936f94275ceSAdam Leventhal 	int current, next, i, c, n;
1937f94275ceSAdam Leventhal 	int code, ret = 0;
1938f94275ceSAdam Leventhal 
1939f94275ceSAdam Leventhal 	ASSERT(total_errors < rm->rm_firstdatacol);
1940f94275ceSAdam Leventhal 
1941f94275ceSAdam Leventhal 	/*
1942f94275ceSAdam Leventhal 	 * This simplifies one edge condition.
1943f94275ceSAdam Leventhal 	 */
1944f94275ceSAdam Leventhal 	tgts[-1] = -1;
1945f94275ceSAdam Leventhal 
1946f94275ceSAdam Leventhal 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1947f94275ceSAdam Leventhal 		/*
1948f94275ceSAdam Leventhal 		 * Initialize the targets array by finding the first n columns
1949f94275ceSAdam Leventhal 		 * that contain no error.
1950f94275ceSAdam Leventhal 		 *
1951f94275ceSAdam Leventhal 		 * If there were no data errors, we need to ensure that we're
1952f94275ceSAdam Leventhal 		 * always explicitly attempting to reconstruct at least one
1953f94275ceSAdam Leventhal 		 * data column. To do this, we simply push the highest target
1954f94275ceSAdam Leventhal 		 * up into the data columns.
1955f94275ceSAdam Leventhal 		 */
1956f94275ceSAdam Leventhal 		for (c = 0, i = 0; i < n; i++) {
1957f94275ceSAdam Leventhal 			if (i == n - 1 && data_errors == 0 &&
1958f94275ceSAdam Leventhal 			    c < rm->rm_firstdatacol) {
1959f94275ceSAdam Leventhal 				c = rm->rm_firstdatacol;
1960f94275ceSAdam Leventhal 			}
1961f94275ceSAdam Leventhal 
1962f94275ceSAdam Leventhal 			while (rm->rm_col[c].rc_error != 0) {
1963f94275ceSAdam Leventhal 				c++;
1964f94275ceSAdam Leventhal 				ASSERT3S(c, <, rm->rm_cols);
1965f94275ceSAdam Leventhal 			}
1966f94275ceSAdam Leventhal 
1967f94275ceSAdam Leventhal 			tgts[i] = c++;
1968f94275ceSAdam Leventhal 		}
1969f94275ceSAdam Leventhal 
1970f94275ceSAdam Leventhal 		/*
1971f94275ceSAdam Leventhal 		 * Setting tgts[n] simplifies the other edge condition.
1972f94275ceSAdam Leventhal 		 */
1973f94275ceSAdam Leventhal 		tgts[n] = rm->rm_cols;
1974f94275ceSAdam Leventhal 
1975f94275ceSAdam Leventhal 		/*
1976f94275ceSAdam Leventhal 		 * These buffers were allocated in previous iterations.
1977f94275ceSAdam Leventhal 		 */
1978f94275ceSAdam Leventhal 		for (i = 0; i < n - 1; i++) {
1979f94275ceSAdam Leventhal 			ASSERT(orig[i] != NULL);
1980f94275ceSAdam Leventhal 		}
1981f94275ceSAdam Leventhal 
1982f94275ceSAdam Leventhal 		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
1983f94275ceSAdam Leventhal 
1984f94275ceSAdam Leventhal 		current = 0;
1985f94275ceSAdam Leventhal 		next = tgts[current];
1986f94275ceSAdam Leventhal 
1987f94275ceSAdam Leventhal 		while (current != n) {
1988f94275ceSAdam Leventhal 			tgts[current] = next;
1989f94275ceSAdam Leventhal 			current = 0;
1990f94275ceSAdam Leventhal 
1991f94275ceSAdam Leventhal 			/*
1992f94275ceSAdam Leventhal 			 * Save off the original data that we're going to
1993f94275ceSAdam Leventhal 			 * attempt to reconstruct.
1994f94275ceSAdam Leventhal 			 */
1995f94275ceSAdam Leventhal 			for (i = 0; i < n; i++) {
1996f94275ceSAdam Leventhal 				ASSERT(orig[i] != NULL);
1997f94275ceSAdam Leventhal 				c = tgts[i];
1998f94275ceSAdam Leventhal 				ASSERT3S(c, >=, 0);
1999f94275ceSAdam Leventhal 				ASSERT3S(c, <, rm->rm_cols);
2000f94275ceSAdam Leventhal 				rc = &rm->rm_col[c];
2001f94275ceSAdam Leventhal 				bcopy(rc->rc_data, orig[i], rc->rc_size);
2002f94275ceSAdam Leventhal 			}
2003f94275ceSAdam Leventhal 
2004f94275ceSAdam Leventhal 			/*
2005f94275ceSAdam Leventhal 			 * Attempt a reconstruction and exit the outer loop on
2006f94275ceSAdam Leventhal 			 * success.
2007f94275ceSAdam Leventhal 			 */
2008f94275ceSAdam Leventhal 			code = vdev_raidz_reconstruct(rm, tgts, n);
200922fe2c88SJonathan Adams 			if (raidz_checksum_verify(zio) == 0) {
2010f94275ceSAdam Leventhal 				atomic_inc_64(&raidz_corrected[code]);
2011f94275ceSAdam Leventhal 
2012f94275ceSAdam Leventhal 				for (i = 0; i < n; i++) {
2013f94275ceSAdam Leventhal 					c = tgts[i];
2014f94275ceSAdam Leventhal 					rc = &rm->rm_col[c];
2015f94275ceSAdam Leventhal 					ASSERT(rc->rc_error == 0);
201622fe2c88SJonathan Adams 					if (rc->rc_tried)
201722fe2c88SJonathan Adams 						raidz_checksum_error(zio, rc,
201822fe2c88SJonathan Adams 						    orig[i]);
2019be6fd75aSMatthew Ahrens 					rc->rc_error = SET_ERROR(ECKSUM);
2020f94275ceSAdam Leventhal 				}
2021f94275ceSAdam Leventhal 
2022f94275ceSAdam Leventhal 				ret = code;
2023f94275ceSAdam Leventhal 				goto done;
2024f94275ceSAdam Leventhal 			}
2025f94275ceSAdam Leventhal 
2026f94275ceSAdam Leventhal 			/*
2027f94275ceSAdam Leventhal 			 * Restore the original data.
2028f94275ceSAdam Leventhal 			 */
2029f94275ceSAdam Leventhal 			for (i = 0; i < n; i++) {
2030f94275ceSAdam Leventhal 				c = tgts[i];
2031f94275ceSAdam Leventhal 				rc = &rm->rm_col[c];
2032f94275ceSAdam Leventhal 				bcopy(orig[i], rc->rc_data, rc->rc_size);
2033f94275ceSAdam Leventhal 			}
2034f94275ceSAdam Leventhal 
2035f94275ceSAdam Leventhal 			do {
2036f94275ceSAdam Leventhal 				/*
2037f94275ceSAdam Leventhal 				 * Find the next valid column after the current
2038f94275ceSAdam Leventhal 				 * position..
2039f94275ceSAdam Leventhal 				 */
2040f94275ceSAdam Leventhal 				for (next = tgts[current] + 1;
2041f94275ceSAdam Leventhal 				    next < rm->rm_cols &&
2042f94275ceSAdam Leventhal 				    rm->rm_col[next].rc_error != 0; next++)
2043f94275ceSAdam Leventhal 					continue;
2044f94275ceSAdam Leventhal 
2045f94275ceSAdam Leventhal 				ASSERT(next <= tgts[current + 1]);
2046f94275ceSAdam Leventhal 
2047f94275ceSAdam Leventhal 				/*
2048f94275ceSAdam Leventhal 				 * If that spot is available, we're done here.
2049f94275ceSAdam Leventhal 				 */
2050f94275ceSAdam Leventhal 				if (next != tgts[current + 1])
2051f94275ceSAdam Leventhal 					break;
2052f94275ceSAdam Leventhal 
2053f94275ceSAdam Leventhal 				/*
2054f94275ceSAdam Leventhal 				 * Otherwise, find the next valid column after
2055f94275ceSAdam Leventhal 				 * the previous position.
2056f94275ceSAdam Leventhal 				 */
2057f94275ceSAdam Leventhal 				for (c = tgts[current - 1] + 1;
2058f94275ceSAdam Leventhal 				    rm->rm_col[c].rc_error != 0; c++)
2059f94275ceSAdam Leventhal 					continue;
2060f94275ceSAdam Leventhal 
2061f94275ceSAdam Leventhal 				tgts[current] = c;
2062f94275ceSAdam Leventhal 				current++;
2063f94275ceSAdam Leventhal 
2064f94275ceSAdam Leventhal 			} while (current != n);
2065f94275ceSAdam Leventhal 		}
2066f94275ceSAdam Leventhal 	}
2067f94275ceSAdam Leventhal 	n--;
2068f94275ceSAdam Leventhal done:
2069f94275ceSAdam Leventhal 	for (i = 0; i < n; i++) {
2070f94275ceSAdam Leventhal 		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
2071f94275ceSAdam Leventhal 	}
2072f94275ceSAdam Leventhal 
2073f94275ceSAdam Leventhal 	return (ret);
2074f94275ceSAdam Leventhal }
2075f94275ceSAdam Leventhal 
20763e30c24aSWill Andrews /*
20773e30c24aSWill Andrews  * Complete an IO operation on a RAIDZ VDev
20783e30c24aSWill Andrews  *
20793e30c24aSWill Andrews  * Outline:
20803e30c24aSWill Andrews  * - For write operations:
20813e30c24aSWill Andrews  *   1. Check for errors on the child IOs.
20823e30c24aSWill Andrews  *   2. Return, setting an error code if too few child VDevs were written
20833e30c24aSWill Andrews  *      to reconstruct the data later.  Note that partial writes are
20843e30c24aSWill Andrews  *      considered successful if they can be reconstructed at all.
20853e30c24aSWill Andrews  * - For read operations:
20863e30c24aSWill Andrews  *   1. Check for errors on the child IOs.
20873e30c24aSWill Andrews  *   2. If data errors occurred:
20883e30c24aSWill Andrews  *      a. Try to reassemble the data from the parity available.
20893e30c24aSWill Andrews  *      b. If we haven't yet read the parity drives, read them now.
20903e30c24aSWill Andrews  *      c. If all parity drives have been read but the data still doesn't
20913e30c24aSWill Andrews  *         reassemble with a correct checksum, then try combinatorial
20923e30c24aSWill Andrews  *         reconstruction.
20933e30c24aSWill Andrews  *      d. If that doesn't work, return an error.
20943e30c24aSWill Andrews  *   3. If there were unexpected errors or this is a resilver operation,
20953e30c24aSWill Andrews  *      rewrite the vdevs that had errors.
20963e30c24aSWill Andrews  */
2097e14bb325SJeff Bonwick static void
2098fa9e4066Sahrens vdev_raidz_io_done(zio_t *zio)
2099fa9e4066Sahrens {
2100fa9e4066Sahrens 	vdev_t *vd = zio->io_vd;
2101fa9e4066Sahrens 	vdev_t *cvd;
2102fa9e4066Sahrens 	raidz_map_t *rm = zio->io_vsd;
2103f94275ceSAdam Leventhal 	raidz_col_t *rc;
2104fa9e4066Sahrens 	int unexpected_errors = 0;
210599653d4eSeschrock 	int parity_errors = 0;
2106c7a40cc4Sahl 	int parity_untried = 0;
210799653d4eSeschrock 	int data_errors = 0;
2108e14bb325SJeff Bonwick 	int total_errors = 0;
2109f94275ceSAdam Leventhal 	int n, c;
2110f94275ceSAdam Leventhal 	int tgts[VDEV_RAIDZ_MAXPARITY];
2111f94275ceSAdam Leventhal 	int code;
2112fa9e4066Sahrens 
211344cd46caSbillm 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
2114fa9e4066Sahrens 
211599653d4eSeschrock 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
211699653d4eSeschrock 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
211799653d4eSeschrock 
2118fa9e4066Sahrens 	for (c = 0; c < rm->rm_cols; c++) {
2119fa9e4066Sahrens 		rc = &rm->rm_col[c];
2120fa9e4066Sahrens 
2121fa9e4066Sahrens 		if (rc->rc_error) {
2122e14bb325SJeff Bonwick 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
212399653d4eSeschrock 
212499653d4eSeschrock 			if (c < rm->rm_firstdatacol)
212599653d4eSeschrock 				parity_errors++;
212699653d4eSeschrock 			else
212799653d4eSeschrock 				data_errors++;
212899653d4eSeschrock 
2129fa9e4066Sahrens 			if (!rc->rc_skipped)
2130fa9e4066Sahrens 				unexpected_errors++;
213199653d4eSeschrock 
2132e14bb325SJeff Bonwick 			total_errors++;
2133c7a40cc4Sahl 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
2134c7a40cc4Sahl 			parity_untried++;
2135fa9e4066Sahrens 		}
2136fa9e4066Sahrens 	}
2137fa9e4066Sahrens 
2138fa9e4066Sahrens 	if (zio->io_type == ZIO_TYPE_WRITE) {
2139fa9e4066Sahrens 		/*
2140e14bb325SJeff Bonwick 		 * XXX -- for now, treat partial writes as a success.
2141e14bb325SJeff Bonwick 		 * (If we couldn't write enough columns to reconstruct
2142e14bb325SJeff Bonwick 		 * the data, the I/O failed.  Otherwise, good enough.)
2143e14bb325SJeff Bonwick 		 *
2144e14bb325SJeff Bonwick 		 * Now that we support write reallocation, it would be better
2145e14bb325SJeff Bonwick 		 * to treat partial failure as real failure unless there are
2146e14bb325SJeff Bonwick 		 * no non-degraded top-level vdevs left, and not update DTLs
2147e14bb325SJeff Bonwick 		 * if we intend to reallocate.
2148fa9e4066Sahrens 		 */
2149fa9e4066Sahrens 		/* XXPOLICY */
2150e14bb325SJeff Bonwick 		if (total_errors > rm->rm_firstdatacol)
2151e14bb325SJeff Bonwick 			zio->io_error = vdev_raidz_worst_error(rm);
2152fa9e4066Sahrens 
2153e14bb325SJeff Bonwick 		return;
2154fa9e4066Sahrens 	}
2155fa9e4066Sahrens 
2156fa9e4066Sahrens 	ASSERT(zio->io_type == ZIO_TYPE_READ);
215799653d4eSeschrock 	/*
215899653d4eSeschrock 	 * There are three potential phases for a read:
215999653d4eSeschrock 	 *	1. produce valid data from the columns read
216099653d4eSeschrock 	 *	2. read all disks and try again
216199653d4eSeschrock 	 *	3. perform combinatorial reconstruction
216299653d4eSeschrock 	 *
216399653d4eSeschrock 	 * Each phase is progressively both more expensive and less likely to
216499653d4eSeschrock 	 * occur. If we encounter more errors than we can repair or all phases
216599653d4eSeschrock 	 * fail, we have no choice but to return an error.
216699653d4eSeschrock 	 */
2167fa9e4066Sahrens 
2168fa9e4066Sahrens 	/*
216999653d4eSeschrock 	 * If the number of errors we saw was correctable -- less than or equal
2170c7a40cc4Sahl 	 * to the number of parity disks read -- attempt to produce data that
2171c7a40cc4Sahl 	 * has a valid checksum. Naturally, this case applies in the absence of
2172c7a40cc4Sahl 	 * any errors.
2173fa9e4066Sahrens 	 */
2174e14bb325SJeff Bonwick 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
2175f94275ceSAdam Leventhal 		if (data_errors == 0) {
217622fe2c88SJonathan Adams 			if (raidz_checksum_verify(zio) == 0) {
2177d427dcb0Sahl 				/*
2178d427dcb0Sahl 				 * If we read parity information (unnecessarily
2179d427dcb0Sahl 				 * as it happens since no reconstruction was
2180d427dcb0Sahl 				 * needed) regenerate and verify the parity.
2181d427dcb0Sahl 				 * We also regenerate parity when resilvering
2182d427dcb0Sahl 				 * so we can write it out to the failed device
2183d427dcb0Sahl 				 * later.
2184d427dcb0Sahl 				 */
2185c7a40cc4Sahl 				if (parity_errors + parity_untried <
2186d427dcb0Sahl 				    rm->rm_firstdatacol ||
2187d427dcb0Sahl 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
218899653d4eSeschrock 					n = raidz_parity_verify(zio, rm);
218999653d4eSeschrock 					unexpected_errors += n;
219099653d4eSeschrock 					ASSERT(parity_errors + n <=
219199653d4eSeschrock 					    rm->rm_firstdatacol);
2192c7a40cc4Sahl 				}
2193fa9e4066Sahrens 				goto done;
2194fa9e4066Sahrens 			}
2195f94275ceSAdam Leventhal 		} else {
2196c7a40cc4Sahl 			/*
2197c7a40cc4Sahl 			 * We either attempt to read all the parity columns or
2198c7a40cc4Sahl 			 * none of them. If we didn't try to read parity, we
2199c7a40cc4Sahl 			 * wouldn't be here in the correctable case. There must
2200c7a40cc4Sahl 			 * also have been fewer parity errors than parity
2201c7a40cc4Sahl 			 * columns or, again, we wouldn't be in this code path.
2202c7a40cc4Sahl 			 */
2203c7a40cc4Sahl 			ASSERT(parity_untried == 0);
220499653d4eSeschrock 			ASSERT(parity_errors < rm->rm_firstdatacol);
220599653d4eSeschrock 
220699653d4eSeschrock 			/*
2207f94275ceSAdam Leventhal 			 * Identify the data columns that reported an error.
220899653d4eSeschrock 			 */
2209f94275ceSAdam Leventhal 			n = 0;
221099653d4eSeschrock 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
221199653d4eSeschrock 				rc = &rm->rm_col[c];
2212f94275ceSAdam Leventhal 				if (rc->rc_error != 0) {
2213f94275ceSAdam Leventhal 					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2214f94275ceSAdam Leventhal 					tgts[n++] = c;
221599653d4eSeschrock 				}
2216f94275ceSAdam Leventhal 			}
221799653d4eSeschrock 
2218f94275ceSAdam Leventhal 			ASSERT(rm->rm_firstdatacol >= n);
2219f94275ceSAdam Leventhal 
2220f94275ceSAdam Leventhal 			code = vdev_raidz_reconstruct(rm, tgts, n);
222199653d4eSeschrock 
222222fe2c88SJonathan Adams 			if (raidz_checksum_verify(zio) == 0) {
2223f94275ceSAdam Leventhal 				atomic_inc_64(&raidz_corrected[code]);
222499653d4eSeschrock 
222599653d4eSeschrock 				/*
2226f94275ceSAdam Leventhal 				 * If we read more parity disks than were used
2227f94275ceSAdam Leventhal 				 * for reconstruction, confirm that the other
2228f94275ceSAdam Leventhal 				 * parity disks produced correct data. This
2229f94275ceSAdam Leventhal 				 * routine is suboptimal in that it regenerates
2230f94275ceSAdam Leventhal 				 * the parity that we already used in addition
2231f94275ceSAdam Leventhal 				 * to the parity that we're attempting to
2232f94275ceSAdam Leventhal 				 * verify, but this should be a relatively
2233f94275ceSAdam Leventhal 				 * uncommon case, and can be optimized if it
2234f94275ceSAdam Leventhal 				 * becomes a problem. Note that we regenerate
2235f94275ceSAdam Leventhal 				 * parity when resilvering so we can write it
2236f94275ceSAdam Leventhal 				 * out to failed devices later.
223799653d4eSeschrock 				 */
2238f94275ceSAdam Leventhal 				if (parity_errors < rm->rm_firstdatacol - n ||
2239d427dcb0Sahl 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
224099653d4eSeschrock 					n = raidz_parity_verify(zio, rm);
224199653d4eSeschrock 					unexpected_errors += n;
224299653d4eSeschrock 					ASSERT(parity_errors + n <=
224399653d4eSeschrock 					    rm->rm_firstdatacol);
224499653d4eSeschrock 				}
224599653d4eSeschrock 
224699653d4eSeschrock 				goto done;
224799653d4eSeschrock 			}
224899653d4eSeschrock 		}
2249fa9e4066Sahrens 	}
2250fa9e4066Sahrens 
2251fa9e4066Sahrens 	/*
225299653d4eSeschrock 	 * This isn't a typical situation -- either we got a read error or
225399653d4eSeschrock 	 * a child silently returned bad data. Read every block so we can
225499653d4eSeschrock 	 * try again with as much data and parity as we can track down. If
225599653d4eSeschrock 	 * we've already been through once before, all children will be marked
225699653d4eSeschrock 	 * as tried so we'll proceed to combinatorial reconstruction.
2257fa9e4066Sahrens 	 */
2258fa9e4066Sahrens 	unexpected_errors = 1;
225999653d4eSeschrock 	rm->rm_missingdata = 0;
226099653d4eSeschrock 	rm->rm_missingparity = 0;
2261fa9e4066Sahrens 
226299653d4eSeschrock 	for (c = 0; c < rm->rm_cols; c++) {
226399653d4eSeschrock 		if (rm->rm_col[c].rc_tried)
226499653d4eSeschrock 			continue;
2265fa9e4066Sahrens 
2266fa9e4066Sahrens 		zio_vdev_io_redone(zio);
226799653d4eSeschrock 		do {
2268fa9e4066Sahrens 			rc = &rm->rm_col[c];
2269fa9e4066Sahrens 			if (rc->rc_tried)
2270fa9e4066Sahrens 				continue;
2271fa9e4066Sahrens 			zio_nowait(zio_vdev_child_io(zio, NULL,
227299653d4eSeschrock 			    vd->vdev_child[rc->rc_devidx],
2273fa9e4066Sahrens 			    rc->rc_offset, rc->rc_data, rc->rc_size,
2274e14bb325SJeff Bonwick 			    zio->io_type, zio->io_priority, 0,
2275fa9e4066Sahrens 			    vdev_raidz_child_done, rc));
227699653d4eSeschrock 		} while (++c < rm->rm_cols);
2277e05725b1Sbonwick 
2278e14bb325SJeff Bonwick 		return;
2279fa9e4066Sahrens 	}
2280fa9e4066Sahrens 
2281fa9e4066Sahrens 	/*
228299653d4eSeschrock 	 * At this point we've attempted to reconstruct the data given the
228399653d4eSeschrock 	 * errors we detected, and we've attempted to read all columns. There
228499653d4eSeschrock 	 * must, therefore, be one or more additional problems -- silent errors
228599653d4eSeschrock 	 * resulting in invalid data rather than explicit I/O errors resulting
2286f94275ceSAdam Leventhal 	 * in absent data. We check if there is enough additional data to
2287f94275ceSAdam Leventhal 	 * possibly reconstruct the data and then perform combinatorial
2288f94275ceSAdam Leventhal 	 * reconstruction over all possible combinations. If that fails,
2289f94275ceSAdam Leventhal 	 * we're cooked.
2290fa9e4066Sahrens 	 */
229122fe2c88SJonathan Adams 	if (total_errors > rm->rm_firstdatacol) {
2292e14bb325SJeff Bonwick 		zio->io_error = vdev_raidz_worst_error(rm);
2293fa9e4066Sahrens 
229422fe2c88SJonathan Adams 	} else if (total_errors < rm->rm_firstdatacol &&
229522fe2c88SJonathan Adams 	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2296fa9e4066Sahrens 		/*
2297f94275ceSAdam Leventhal 		 * If we didn't use all the available parity for the
2298f94275ceSAdam Leventhal 		 * combinatorial reconstruction, verify that the remaining
2299f94275ceSAdam Leventhal 		 * parity is correct.
2300fa9e4066Sahrens 		 */
2301f94275ceSAdam Leventhal 		if (code != (1 << rm->rm_firstdatacol) - 1)
2302f94275ceSAdam Leventhal 			(void) raidz_parity_verify(zio, rm);
2303f94275ceSAdam Leventhal 	} else {
2304fa9e4066Sahrens 		/*
230522fe2c88SJonathan Adams 		 * We're here because either:
230622fe2c88SJonathan Adams 		 *
230722fe2c88SJonathan Adams 		 *	total_errors == rm_first_datacol, or
230822fe2c88SJonathan Adams 		 *	vdev_raidz_combrec() failed
230922fe2c88SJonathan Adams 		 *
231022fe2c88SJonathan Adams 		 * In either case, there is enough bad data to prevent
231122fe2c88SJonathan Adams 		 * reconstruction.
231222fe2c88SJonathan Adams 		 *
231322fe2c88SJonathan Adams 		 * Start checksum ereports for all children which haven't
23146e1f5caaSNeil Perrin 		 * failed, and the IO wasn't speculative.
2315fa9e4066Sahrens 		 */
2316be6fd75aSMatthew Ahrens 		zio->io_error = SET_ERROR(ECKSUM);
2317e14bb325SJeff Bonwick 
23186e1f5caaSNeil Perrin 		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2319ea8dc4b6Seschrock 			for (c = 0; c < rm->rm_cols; c++) {
2320ea8dc4b6Seschrock 				rc = &rm->rm_col[c];
232122fe2c88SJonathan Adams 				if (rc->rc_error == 0) {
232222fe2c88SJonathan Adams 					zio_bad_cksum_t zbc;
232322fe2c88SJonathan Adams 					zbc.zbc_has_cksum = 0;
23246e1f5caaSNeil Perrin 					zbc.zbc_injected =
23256e1f5caaSNeil Perrin 					    rm->rm_ecksuminjected;
232622fe2c88SJonathan Adams 
232722fe2c88SJonathan Adams 					zfs_ereport_start_checksum(
23286e1f5caaSNeil Perrin 					    zio->io_spa,
23296e1f5caaSNeil Perrin 					    vd->vdev_child[rc->rc_devidx],
233022fe2c88SJonathan Adams 					    zio, rc->rc_offset, rc->rc_size,
233122fe2c88SJonathan Adams 					    (void *)(uintptr_t)c, &zbc);
2332f94275ceSAdam Leventhal 				}
2333ea8dc4b6Seschrock 			}
2334ea8dc4b6Seschrock 		}
23356e1f5caaSNeil Perrin 	}
2336fa9e4066Sahrens 
2337fa9e4066Sahrens done:
2338fa9e4066Sahrens 	zio_checksum_verified(zio);
2339fa9e4066Sahrens 
23408ad4d6ddSJeff Bonwick 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2341fa9e4066Sahrens 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2342fa9e4066Sahrens 		/*
2343fa9e4066Sahrens 		 * Use the good data we have in hand to repair damaged children.
2344fa9e4066Sahrens 		 */
2345fa9e4066Sahrens 		for (c = 0; c < rm->rm_cols; c++) {
2346fa9e4066Sahrens 			rc = &rm->rm_col[c];
234799653d4eSeschrock 			cvd = vd->vdev_child[rc->rc_devidx];
2348fa9e4066Sahrens 
2349ecc2d604Sbonwick 			if (rc->rc_error == 0)
2350ecc2d604Sbonwick 				continue;
2351fa9e4066Sahrens 
2352e14bb325SJeff Bonwick 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2353ecc2d604Sbonwick 			    rc->rc_offset, rc->rc_data, rc->rc_size,
235469962b56SMatthew Ahrens 			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
23558ad4d6ddSJeff Bonwick 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
23568ad4d6ddSJeff Bonwick 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2357fa9e4066Sahrens 		}
2358fa9e4066Sahrens 	}
2359fa9e4066Sahrens }
2360fa9e4066Sahrens 
2361fa9e4066Sahrens static void
2362fa9e4066Sahrens vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2363fa9e4066Sahrens {
236499653d4eSeschrock 	if (faulted > vd->vdev_nparity)
2365ea8dc4b6Seschrock 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2366ea8dc4b6Seschrock 		    VDEV_AUX_NO_REPLICAS);
2367fa9e4066Sahrens 	else if (degraded + faulted != 0)
2368ea8dc4b6Seschrock 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2369fa9e4066Sahrens 	else
2370ea8dc4b6Seschrock 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2371fa9e4066Sahrens }
2372fa9e4066Sahrens 
2373fa9e4066Sahrens vdev_ops_t vdev_raidz_ops = {
2374fa9e4066Sahrens 	vdev_raidz_open,
2375fa9e4066Sahrens 	vdev_raidz_close,
2376fa9e4066Sahrens 	vdev_raidz_asize,
2377fa9e4066Sahrens 	vdev_raidz_io_start,
2378fa9e4066Sahrens 	vdev_raidz_io_done,
2379fa9e4066Sahrens 	vdev_raidz_state_change,
2380dcba9f3fSGeorge Wilson 	NULL,
2381dcba9f3fSGeorge Wilson 	NULL,
2382fa9e4066Sahrens 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
2383fa9e4066Sahrens 	B_FALSE			/* not a leaf vdev */
2384fa9e4066Sahrens };
2385