xref: /titanic_41/usr/src/uts/common/fs/zfs/vdev_raidz.c (revision c3a1418d1a4c19d574cfbf275daf91a0d44b7340)
1fa9e4066Sahrens /*
2fa9e4066Sahrens  * CDDL HEADER START
3fa9e4066Sahrens  *
4fa9e4066Sahrens  * The contents of this file are subject to the terms of the
5ea8dc4b6Seschrock  * Common Development and Distribution License (the "License").
6ea8dc4b6Seschrock  * You may not use this file except in compliance with the License.
7fa9e4066Sahrens  *
8fa9e4066Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e4066Sahrens  * or http://www.opensolaris.org/os/licensing.
10fa9e4066Sahrens  * See the License for the specific language governing permissions
11fa9e4066Sahrens  * and limitations under the License.
12fa9e4066Sahrens  *
13fa9e4066Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
14fa9e4066Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e4066Sahrens  * If applicable, add the following below this CDDL HEADER, with the
16fa9e4066Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e4066Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e4066Sahrens  *
19fa9e4066Sahrens  * CDDL HEADER END
20fa9e4066Sahrens  */
2199653d4eSeschrock 
22fa9e4066Sahrens /*
233f9d6ad7SLin Ling  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24efe6bf49SGeorge Wilson  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
25810e43b2SBill Pijewski  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
26fa9e4066Sahrens  */
27fa9e4066Sahrens 
28fa9e4066Sahrens #include <sys/zfs_context.h>
29fa9e4066Sahrens #include <sys/spa.h>
30fa9e4066Sahrens #include <sys/vdev_impl.h>
31810e43b2SBill Pijewski #include <sys/vdev_disk.h>
32810e43b2SBill Pijewski #include <sys/vdev_file.h>
33810e43b2SBill Pijewski #include <sys/vdev_raidz.h>
34fa9e4066Sahrens #include <sys/zio.h>
35fa9e4066Sahrens #include <sys/zio_checksum.h>
36fa9e4066Sahrens #include <sys/fs/zfs.h>
37ea8dc4b6Seschrock #include <sys/fm/fs/zfs.h>
38fa9e4066Sahrens 
39fa9e4066Sahrens /*
40fa9e4066Sahrens  * Virtual device vector for RAID-Z.
4199653d4eSeschrock  *
42f94275ceSAdam Leventhal  * This vdev supports single, double, and triple parity. For single parity,
43f94275ceSAdam Leventhal  * we use a simple XOR of all the data columns. For double or triple parity,
44f94275ceSAdam Leventhal  * we use a special case of Reed-Solomon coding. This extends the
45f94275ceSAdam Leventhal  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
46f94275ceSAdam Leventhal  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
47f94275ceSAdam Leventhal  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
48f94275ceSAdam Leventhal  * former is also based. The latter is designed to provide higher performance
49f94275ceSAdam Leventhal  * for writes.
50f94275ceSAdam Leventhal  *
51f94275ceSAdam Leventhal  * Note that the Plank paper claimed to support arbitrary N+M, but was then
52f94275ceSAdam Leventhal  * amended six years later identifying a critical flaw that invalidates its
53f94275ceSAdam Leventhal  * claims. Nevertheless, the technique can be adapted to work for up to
54f94275ceSAdam Leventhal  * triple parity. For additional parity, the amendment "Note: Correction to
55f94275ceSAdam Leventhal  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
56f94275ceSAdam Leventhal  * is viable, but the additional complexity means that write performance will
57f94275ceSAdam Leventhal  * suffer.
58f94275ceSAdam Leventhal  *
59f94275ceSAdam Leventhal  * All of the methods above operate on a Galois field, defined over the
60f94275ceSAdam Leventhal  * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
61f94275ceSAdam Leventhal  * can be expressed with a single byte. Briefly, the operations on the
62f94275ceSAdam Leventhal  * field are defined as follows:
6399653d4eSeschrock  *
6499653d4eSeschrock  *   o addition (+) is represented by a bitwise XOR
6599653d4eSeschrock  *   o subtraction (-) is therefore identical to addition: A + B = A - B
6699653d4eSeschrock  *   o multiplication of A by 2 is defined by the following bitwise expression:
67f7170741SWill Andrews  *
6899653d4eSeschrock  *	(A * 2)_7 = A_6
6999653d4eSeschrock  *	(A * 2)_6 = A_5
7099653d4eSeschrock  *	(A * 2)_5 = A_4
7199653d4eSeschrock  *	(A * 2)_4 = A_3 + A_7
7299653d4eSeschrock  *	(A * 2)_3 = A_2 + A_7
7399653d4eSeschrock  *	(A * 2)_2 = A_1 + A_7
7499653d4eSeschrock  *	(A * 2)_1 = A_0
7599653d4eSeschrock  *	(A * 2)_0 = A_7
7699653d4eSeschrock  *
7799653d4eSeschrock  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
78f94275ceSAdam Leventhal  * As an aside, this multiplication is derived from the error correcting
79f94275ceSAdam Leventhal  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
8099653d4eSeschrock  *
8199653d4eSeschrock  * Observe that any number in the field (except for 0) can be expressed as a
8299653d4eSeschrock  * power of 2 -- a generator for the field. We store a table of the powers of
8399653d4eSeschrock  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
8499653d4eSeschrock  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
85f94275ceSAdam Leventhal  * than field addition). The inverse of a field element A (A^-1) is therefore
86f94275ceSAdam Leventhal  * A ^ (255 - 1) = A^254.
8799653d4eSeschrock  *
88f94275ceSAdam Leventhal  * The up-to-three parity columns, P, Q, R over several data columns,
89f94275ceSAdam Leventhal  * D_0, ... D_n-1, can be expressed by field operations:
9099653d4eSeschrock  *
9199653d4eSeschrock  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
9299653d4eSeschrock  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
9399653d4eSeschrock  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
94f94275ceSAdam Leventhal  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
95f94275ceSAdam Leventhal  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
9699653d4eSeschrock  *
97f94275ceSAdam Leventhal  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
98f94275ceSAdam Leventhal  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
99f94275ceSAdam Leventhal  * independent coefficients. (There are no additional coefficients that have
100f94275ceSAdam Leventhal  * this property which is why the uncorrected Plank method breaks down.)
101f94275ceSAdam Leventhal  *
102f94275ceSAdam Leventhal  * See the reconstruction code below for how P, Q and R can be used individually
103f94275ceSAdam Leventhal  * or in concert to recover missing data columns.
104fa9e4066Sahrens  */
105fa9e4066Sahrens 
/*
 * State kept for a single column (one child device) of a RAID-Z I/O.
 */
typedef struct raidz_col {
	uint64_t	rc_devidx;	/* index of the child device to use */
	uint64_t	rc_offset;	/* offset on that device */
	uint64_t	rc_size;	/* size of this column's I/O */
	void		*rc_data;	/* buffer holding the I/O data */
	void		*rc_gdata;	/* buffer for the "good" version */
	int		rc_error;	/* I/O error seen on this device */
	uint8_t		rc_tried;	/* was this column's I/O attempted? */
	uint8_t		rc_skipped;	/* was this column's I/O skipped? */
} raidz_col_t;
116fa9e4066Sahrens 
117fa9e4066Sahrens typedef struct raidz_map {
118f94275ceSAdam Leventhal 	uint64_t rm_cols;		/* Regular column count */
119f94275ceSAdam Leventhal 	uint64_t rm_scols;		/* Count including skipped columns */
12099653d4eSeschrock 	uint64_t rm_bigcols;		/* Number of oversized columns */
12199653d4eSeschrock 	uint64_t rm_asize;		/* Actual total I/O size */
12299653d4eSeschrock 	uint64_t rm_missingdata;	/* Count of missing data devices */
12399653d4eSeschrock 	uint64_t rm_missingparity;	/* Count of missing parity devices */
12499653d4eSeschrock 	uint64_t rm_firstdatacol;	/* First data column/parity count */
1252fbc121fSAdam Leventhal 	uint64_t rm_nskip;		/* Skipped sectors for padding */
1262fbc121fSAdam Leventhal 	uint64_t rm_skipstart;		/* Column index of padding start */
12722fe2c88SJonathan Adams 	void *rm_datacopy;		/* rm_asize-buffer of copied data */
12822fe2c88SJonathan Adams 	uintptr_t rm_reports;		/* # of referencing checksum reports */
12922fe2c88SJonathan Adams 	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
13022fe2c88SJonathan Adams 	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
13199653d4eSeschrock 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
132fa9e4066Sahrens } raidz_map_t;
133fa9e4066Sahrens 
/*
 * Symbolic names for the P, Q and R parity columns.
 * NOTE(review): presumably these index rm_col[]; their users are outside
 * this part of the file -- confirm.
 */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Multiply a single field element (one byte) by 2 or by 4 in the Galois
 * field described at the top of this file.
 *
 * The shift can carry into bit 8 (e.g. 0x80 << 1 == 0x100), so the result
 * is masked back down to eight bits. Without the mask the value is only a
 * valid field element once the caller truncates it by storing it in a
 * uint8_t; with it, the macro is also safe to use directly in comparisons
 * or wider expressions. The low eight bits are unchanged.
 */
#define	VDEV_RAIDZ_MUL_2(x)	\
	((((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) & 0xff)
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
14099653d4eSeschrock 
/*
 * Field multiplication by 2 (or 4) applied to all eight bytes of a 64-bit
 * value at once instead of one byte at a time.
 *
 *   1. Isolate the top bit of every byte in 'mask'.
 *   2. Turn each set top bit into a full 0xff byte: shifting the bit left
 *      by one and subtracting the same bit shifted down to the bottom of
 *      its byte leaves 0xff in exactly those bytes.
 *   3. Shift every byte left by one (dropping bits that crossed a byte
 *      boundary) and XOR 0x1d into the bytes selected by the mask.
 *
 * Both 'x' and 'mask' are modified; 'mask' is scratch space supplied by
 * the caller. The expansion is a bare brace block, as it always has been.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

/* Multiply by 4 by multiplying by 2 twice. */
#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
160f94275ceSAdam Leventhal 
/*
 * Add VDEV_LABEL_START_SIZE to an offset. The argument is parenthesized so
 * that expressions built from lower-precedence operators (e.g. "a & b" or
 * "c ? d : e") are offset as a whole rather than being re-associated with
 * the addition.
 */
#define	VDEV_LABEL_OFFSET(x)	((x) + VDEV_LABEL_START_SIZE)
162810e43b2SBill Pijewski 
/*
 * When set, reconstruction is forced to use the general purpose method.
 * Left without an initializer, so it starts out as zero.
 */
int vdev_raidz_default_to_general;
16799653d4eSeschrock 
/*
 * Powers of 2 in the Galois field defined above: vdev_raidz_pow2[i] is
 * 2^i. Each row is labelled with the exponent of its first entry. The last
 * entry repeats 0x01 because 2^255 == 2^0 in this field.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	/* 0x00 */ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	/* 0x08 */ 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	/* 0x10 */ 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	/* 0x18 */ 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	/* 0x20 */ 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	/* 0x28 */ 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	/* 0x30 */ 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	/* 0x38 */ 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	/* 0x40 */ 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	/* 0x48 */ 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	/* 0x50 */ 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	/* 0x58 */ 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	/* 0x60 */ 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	/* 0x68 */ 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	/* 0x70 */ 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	/* 0x78 */ 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	/* 0x80 */ 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	/* 0x88 */ 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	/* 0x90 */ 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	/* 0x98 */ 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	/* 0xa0 */ 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	/* 0xa8 */ 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	/* 0xb0 */ 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	/* 0xb8 */ 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	/* 0xc0 */ 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	/* 0xc8 */ 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	/* 0xd0 */ 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	/* 0xd8 */ 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	/* 0xe0 */ 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	/* 0xe8 */ 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	/* 0xf0 */ 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	/* 0xf8 */ 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * Logs base 2 in the Galois field defined above: vdev_raidz_log2[a] is the
 * exponent i for which 2^i == a. Each row is labelled with the value of
 * its first index. Zero has no logarithm; its slot simply holds 0x00.
 */
static const uint8_t vdev_raidz_log2[256] = {
	/* 0x00 */ 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	/* 0x08 */ 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	/* 0x10 */ 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	/* 0x18 */ 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	/* 0x20 */ 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	/* 0x28 */ 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	/* 0x30 */ 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	/* 0x38 */ 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	/* 0x40 */ 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	/* 0x48 */ 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	/* 0x50 */ 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	/* 0x58 */ 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	/* 0x60 */ 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	/* 0x68 */ 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	/* 0x70 */ 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	/* 0x78 */ 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	/* 0x80 */ 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	/* 0x88 */ 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	/* 0x90 */ 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	/* 0x98 */ 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	/* 0xa0 */ 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	/* 0xa8 */ 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	/* 0xb0 */ 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	/* 0xb8 */ 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	/* 0xc0 */ 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	/* 0xc8 */ 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	/* 0xd0 */ 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	/* 0xd8 */ 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	/* 0xe0 */ 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	/* 0xe8 */ 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	/* 0xf0 */ 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	/* 0xf8 */ 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
23899653d4eSeschrock 
23922fe2c88SJonathan Adams static void vdev_raidz_generate_parity(raidz_map_t *rm);
24022fe2c88SJonathan Adams 
24199653d4eSeschrock /*
24299653d4eSeschrock  * Multiply a given number by 2 raised to the given power.
24399653d4eSeschrock  */
24499653d4eSeschrock static uint8_t
vdev_raidz_exp2(uint_t a,int exp)24599653d4eSeschrock vdev_raidz_exp2(uint_t a, int exp)
24699653d4eSeschrock {
24799653d4eSeschrock 	if (a == 0)
24899653d4eSeschrock 		return (0);
24999653d4eSeschrock 
25099653d4eSeschrock 	ASSERT(exp >= 0);
25199653d4eSeschrock 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
25299653d4eSeschrock 
25399653d4eSeschrock 	exp += vdev_raidz_log2[a];
25499653d4eSeschrock 	if (exp > 255)
25599653d4eSeschrock 		exp -= 255;
25699653d4eSeschrock 
25799653d4eSeschrock 	return (vdev_raidz_pow2[exp]);
25899653d4eSeschrock }
25999653d4eSeschrock 
260e14bb325SJeff Bonwick static void
vdev_raidz_map_free(raidz_map_t * rm)26122fe2c88SJonathan Adams vdev_raidz_map_free(raidz_map_t *rm)
262e14bb325SJeff Bonwick {
263e14bb325SJeff Bonwick 	int c;
264baa7389eSJonathan Adams 	size_t size;
265e14bb325SJeff Bonwick 
26622fe2c88SJonathan Adams 	for (c = 0; c < rm->rm_firstdatacol; c++) {
267e14bb325SJeff Bonwick 		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
268e14bb325SJeff Bonwick 
26922fe2c88SJonathan Adams 		if (rm->rm_col[c].rc_gdata != NULL)
27022fe2c88SJonathan Adams 			zio_buf_free(rm->rm_col[c].rc_gdata,
27122fe2c88SJonathan Adams 			    rm->rm_col[c].rc_size);
27222fe2c88SJonathan Adams 	}
27322fe2c88SJonathan Adams 
274baa7389eSJonathan Adams 	size = 0;
275baa7389eSJonathan Adams 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
276baa7389eSJonathan Adams 		size += rm->rm_col[c].rc_size;
277baa7389eSJonathan Adams 
27822fe2c88SJonathan Adams 	if (rm->rm_datacopy != NULL)
27922fe2c88SJonathan Adams 		zio_buf_free(rm->rm_datacopy, size);
28022fe2c88SJonathan Adams 
281f94275ceSAdam Leventhal 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
282e14bb325SJeff Bonwick }
283e14bb325SJeff Bonwick 
28422fe2c88SJonathan Adams static void
vdev_raidz_map_free_vsd(zio_t * zio)28522fe2c88SJonathan Adams vdev_raidz_map_free_vsd(zio_t *zio)
28622fe2c88SJonathan Adams {
28722fe2c88SJonathan Adams 	raidz_map_t *rm = zio->io_vsd;
28822fe2c88SJonathan Adams 
289fb09f5aaSMadhav Suresh 	ASSERT0(rm->rm_freed);
29022fe2c88SJonathan Adams 	rm->rm_freed = 1;
29122fe2c88SJonathan Adams 
29222fe2c88SJonathan Adams 	if (rm->rm_reports == 0)
29322fe2c88SJonathan Adams 		vdev_raidz_map_free(rm);
29422fe2c88SJonathan Adams }
29522fe2c88SJonathan Adams 
29622fe2c88SJonathan Adams /*ARGSUSED*/
29722fe2c88SJonathan Adams static void
vdev_raidz_cksum_free(void * arg,size_t ignored)29822fe2c88SJonathan Adams vdev_raidz_cksum_free(void *arg, size_t ignored)
29922fe2c88SJonathan Adams {
30022fe2c88SJonathan Adams 	raidz_map_t *rm = arg;
30122fe2c88SJonathan Adams 
30222fe2c88SJonathan Adams 	ASSERT3U(rm->rm_reports, >, 0);
30322fe2c88SJonathan Adams 
304baa7389eSJonathan Adams 	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
30522fe2c88SJonathan Adams 		vdev_raidz_map_free(rm);
30622fe2c88SJonathan Adams }
30722fe2c88SJonathan Adams 
30822fe2c88SJonathan Adams static void
vdev_raidz_cksum_finish(zio_cksum_report_t * zcr,const void * good_data)30922fe2c88SJonathan Adams vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
31022fe2c88SJonathan Adams {
31122fe2c88SJonathan Adams 	raidz_map_t *rm = zcr->zcr_cbdata;
31222fe2c88SJonathan Adams 	size_t c = zcr->zcr_cbinfo;
31322fe2c88SJonathan Adams 	size_t x;
31422fe2c88SJonathan Adams 
31522fe2c88SJonathan Adams 	const char *good = NULL;
31622fe2c88SJonathan Adams 	const char *bad = rm->rm_col[c].rc_data;
31722fe2c88SJonathan Adams 
31822fe2c88SJonathan Adams 	if (good_data == NULL) {
31922fe2c88SJonathan Adams 		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
32022fe2c88SJonathan Adams 		return;
32122fe2c88SJonathan Adams 	}
32222fe2c88SJonathan Adams 
32322fe2c88SJonathan Adams 	if (c < rm->rm_firstdatacol) {
32422fe2c88SJonathan Adams 		/*
32522fe2c88SJonathan Adams 		 * The first time through, calculate the parity blocks for
32622fe2c88SJonathan Adams 		 * the good data (this relies on the fact that the good
32722fe2c88SJonathan Adams 		 * data never changes for a given logical ZIO)
32822fe2c88SJonathan Adams 		 */
32922fe2c88SJonathan Adams 		if (rm->rm_col[0].rc_gdata == NULL) {
33022fe2c88SJonathan Adams 			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
33122fe2c88SJonathan Adams 			char *buf;
33222fe2c88SJonathan Adams 
33322fe2c88SJonathan Adams 			/*
33422fe2c88SJonathan Adams 			 * Set up the rm_col[]s to generate the parity for
33522fe2c88SJonathan Adams 			 * good_data, first saving the parity bufs and
33622fe2c88SJonathan Adams 			 * replacing them with buffers to hold the result.
33722fe2c88SJonathan Adams 			 */
33822fe2c88SJonathan Adams 			for (x = 0; x < rm->rm_firstdatacol; x++) {
33922fe2c88SJonathan Adams 				bad_parity[x] = rm->rm_col[x].rc_data;
34022fe2c88SJonathan Adams 				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
34122fe2c88SJonathan Adams 				    zio_buf_alloc(rm->rm_col[x].rc_size);
34222fe2c88SJonathan Adams 			}
34322fe2c88SJonathan Adams 
34422fe2c88SJonathan Adams 			/* fill in the data columns from good_data */
34522fe2c88SJonathan Adams 			buf = (char *)good_data;
34622fe2c88SJonathan Adams 			for (; x < rm->rm_cols; x++) {
34722fe2c88SJonathan Adams 				rm->rm_col[x].rc_data = buf;
34822fe2c88SJonathan Adams 				buf += rm->rm_col[x].rc_size;
34922fe2c88SJonathan Adams 			}
35022fe2c88SJonathan Adams 
35122fe2c88SJonathan Adams 			/*
35222fe2c88SJonathan Adams 			 * Construct the parity from the good data.
35322fe2c88SJonathan Adams 			 */
35422fe2c88SJonathan Adams 			vdev_raidz_generate_parity(rm);
35522fe2c88SJonathan Adams 
35622fe2c88SJonathan Adams 			/* restore everything back to its original state */
35722fe2c88SJonathan Adams 			for (x = 0; x < rm->rm_firstdatacol; x++)
35822fe2c88SJonathan Adams 				rm->rm_col[x].rc_data = bad_parity[x];
35922fe2c88SJonathan Adams 
36022fe2c88SJonathan Adams 			buf = rm->rm_datacopy;
36122fe2c88SJonathan Adams 			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
36222fe2c88SJonathan Adams 				rm->rm_col[x].rc_data = buf;
36322fe2c88SJonathan Adams 				buf += rm->rm_col[x].rc_size;
36422fe2c88SJonathan Adams 			}
36522fe2c88SJonathan Adams 		}
36622fe2c88SJonathan Adams 
36722fe2c88SJonathan Adams 		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
36822fe2c88SJonathan Adams 		good = rm->rm_col[c].rc_gdata;
36922fe2c88SJonathan Adams 	} else {
37022fe2c88SJonathan Adams 		/* adjust good_data to point at the start of our column */
37122fe2c88SJonathan Adams 		good = good_data;
37222fe2c88SJonathan Adams 
37322fe2c88SJonathan Adams 		for (x = rm->rm_firstdatacol; x < c; x++)
37422fe2c88SJonathan Adams 			good += rm->rm_col[x].rc_size;
37522fe2c88SJonathan Adams 	}
37622fe2c88SJonathan Adams 
37722fe2c88SJonathan Adams 	/* we drop the ereport if it ends up that the data was good */
37822fe2c88SJonathan Adams 	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
37922fe2c88SJonathan Adams }
38022fe2c88SJonathan Adams 
38122fe2c88SJonathan Adams /*
38222fe2c88SJonathan Adams  * Invoked indirectly by zfs_ereport_start_checksum(), called
38322fe2c88SJonathan Adams  * below when our read operation fails completely.  The main point
38422fe2c88SJonathan Adams  * is to keep a copy of everything we read from disk, so that at
38522fe2c88SJonathan Adams  * vdev_raidz_cksum_finish() time we can compare it with the good data.
38622fe2c88SJonathan Adams  */
38722fe2c88SJonathan Adams static void
vdev_raidz_cksum_report(zio_t * zio,zio_cksum_report_t * zcr,void * arg)38822fe2c88SJonathan Adams vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
38922fe2c88SJonathan Adams {
39022fe2c88SJonathan Adams 	size_t c = (size_t)(uintptr_t)arg;
39122fe2c88SJonathan Adams 	caddr_t buf;
39222fe2c88SJonathan Adams 
39322fe2c88SJonathan Adams 	raidz_map_t *rm = zio->io_vsd;
39422fe2c88SJonathan Adams 	size_t size;
39522fe2c88SJonathan Adams 
39622fe2c88SJonathan Adams 	/* set up the report and bump the refcount  */
39722fe2c88SJonathan Adams 	zcr->zcr_cbdata = rm;
39822fe2c88SJonathan Adams 	zcr->zcr_cbinfo = c;
39922fe2c88SJonathan Adams 	zcr->zcr_finish = vdev_raidz_cksum_finish;
40022fe2c88SJonathan Adams 	zcr->zcr_free = vdev_raidz_cksum_free;
40122fe2c88SJonathan Adams 
40222fe2c88SJonathan Adams 	rm->rm_reports++;
40322fe2c88SJonathan Adams 	ASSERT3U(rm->rm_reports, >, 0);
40422fe2c88SJonathan Adams 
405baa7389eSJonathan Adams 	if (rm->rm_datacopy != NULL)
40622fe2c88SJonathan Adams 		return;
40722fe2c88SJonathan Adams 
40822fe2c88SJonathan Adams 	/*
409baa7389eSJonathan Adams 	 * It's the first time we're called for this raidz_map_t, so we need
410baa7389eSJonathan Adams 	 * to copy the data aside; there's no guarantee that our zio's buffer
411baa7389eSJonathan Adams 	 * won't be re-used for something else.
41222fe2c88SJonathan Adams 	 *
413baa7389eSJonathan Adams 	 * Our parity data is already in separate buffers, so there's no need
41422fe2c88SJonathan Adams 	 * to copy them.
41522fe2c88SJonathan Adams 	 */
41622fe2c88SJonathan Adams 
417baa7389eSJonathan Adams 	size = 0;
418baa7389eSJonathan Adams 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
419baa7389eSJonathan Adams 		size += rm->rm_col[c].rc_size;
42022fe2c88SJonathan Adams 
42122fe2c88SJonathan Adams 	buf = rm->rm_datacopy = zio_buf_alloc(size);
422baa7389eSJonathan Adams 
423baa7389eSJonathan Adams 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
42422fe2c88SJonathan Adams 		raidz_col_t *col = &rm->rm_col[c];
42522fe2c88SJonathan Adams 
42622fe2c88SJonathan Adams 		bcopy(col->rc_data, buf, col->rc_size);
42722fe2c88SJonathan Adams 		col->rc_data = buf;
42822fe2c88SJonathan Adams 
42922fe2c88SJonathan Adams 		buf += col->rc_size;
43022fe2c88SJonathan Adams 	}
43122fe2c88SJonathan Adams 	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
43222fe2c88SJonathan Adams }
43322fe2c88SJonathan Adams 
43422fe2c88SJonathan Adams static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
43522fe2c88SJonathan Adams 	vdev_raidz_map_free_vsd,
43622fe2c88SJonathan Adams 	vdev_raidz_cksum_report
43722fe2c88SJonathan Adams };
43822fe2c88SJonathan Adams 
4393e30c24aSWill Andrews /*
4403e30c24aSWill Andrews  * Divides the IO evenly across all child vdevs; usually, dcols is
4413e30c24aSWill Andrews  * the number of children in the target vdev.
4423e30c24aSWill Andrews  */
443fa9e4066Sahrens static raidz_map_t *
vdev_raidz_map_alloc(caddr_t data,uint64_t size,uint64_t offset,uint64_t unit_shift,uint64_t dcols,uint64_t nparity)444810e43b2SBill Pijewski vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset,
445810e43b2SBill Pijewski     uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
446fa9e4066Sahrens {
447fa9e4066Sahrens 	raidz_map_t *rm;
4483e30c24aSWill Andrews 	/* The starting RAIDZ (parent) vdev sector of the block. */
449810e43b2SBill Pijewski 	uint64_t b = offset >> unit_shift;
4503e30c24aSWill Andrews 	/* The zio's size in units of the vdev's minimum sector size. */
451810e43b2SBill Pijewski 	uint64_t s = size >> unit_shift;
4523e30c24aSWill Andrews 	/* The first column for this stripe. */
453fa9e4066Sahrens 	uint64_t f = b % dcols;
4543e30c24aSWill Andrews 	/* The starting byte offset on each child vdev. */
455fa9e4066Sahrens 	uint64_t o = (b / dcols) << unit_shift;
456f94275ceSAdam Leventhal 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
457fa9e4066Sahrens 
4583e30c24aSWill Andrews 	/*
4593e30c24aSWill Andrews 	 * "Quotient": The number of data sectors for this stripe on all but
4603e30c24aSWill Andrews 	 * the "big column" child vdevs that also contain "remainder" data.
4613e30c24aSWill Andrews 	 */
46299653d4eSeschrock 	q = s / (dcols - nparity);
4633e30c24aSWill Andrews 
4643e30c24aSWill Andrews 	/*
4653e30c24aSWill Andrews 	 * "Remainder": The number of partial stripe data sectors in this I/O.
4663e30c24aSWill Andrews 	 * This will add a sector to some, but not all, child vdevs.
4673e30c24aSWill Andrews 	 */
46899653d4eSeschrock 	r = s - q * (dcols - nparity);
4693e30c24aSWill Andrews 
4703e30c24aSWill Andrews 	/* The number of "big columns" - those which contain remainder data. */
47199653d4eSeschrock 	bc = (r == 0 ? 0 : r + nparity);
4723e30c24aSWill Andrews 
4733e30c24aSWill Andrews 	/*
4743e30c24aSWill Andrews 	 * The total number of data and parity sectors associated with
4753e30c24aSWill Andrews 	 * this I/O.
4763e30c24aSWill Andrews 	 */
477f94275ceSAdam Leventhal 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
478fa9e4066Sahrens 
4793e30c24aSWill Andrews 	/* acols: The columns that will be accessed. */
4803e30c24aSWill Andrews 	/* scols: The columns that will be accessed or skipped. */
481f94275ceSAdam Leventhal 	if (q == 0) {
4823e30c24aSWill Andrews 		/* Our I/O request doesn't span all child vdevs. */
483f94275ceSAdam Leventhal 		acols = bc;
484f94275ceSAdam Leventhal 		scols = MIN(dcols, roundup(bc, nparity + 1));
485f94275ceSAdam Leventhal 	} else {
486f94275ceSAdam Leventhal 		acols = dcols;
487f94275ceSAdam Leventhal 		scols = dcols;
488f94275ceSAdam Leventhal 	}
489fa9e4066Sahrens 
490f94275ceSAdam Leventhal 	ASSERT3U(acols, <=, scols);
491f94275ceSAdam Leventhal 
492f94275ceSAdam Leventhal 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
493fa9e4066Sahrens 
494fa9e4066Sahrens 	rm->rm_cols = acols;
495f94275ceSAdam Leventhal 	rm->rm_scols = scols;
496fa9e4066Sahrens 	rm->rm_bigcols = bc;
4972fbc121fSAdam Leventhal 	rm->rm_skipstart = bc;
49899653d4eSeschrock 	rm->rm_missingdata = 0;
49999653d4eSeschrock 	rm->rm_missingparity = 0;
50099653d4eSeschrock 	rm->rm_firstdatacol = nparity;
50122fe2c88SJonathan Adams 	rm->rm_datacopy = NULL;
50222fe2c88SJonathan Adams 	rm->rm_reports = 0;
50322fe2c88SJonathan Adams 	rm->rm_freed = 0;
50422fe2c88SJonathan Adams 	rm->rm_ecksuminjected = 0;
505fa9e4066Sahrens 
506f94275ceSAdam Leventhal 	asize = 0;
507f94275ceSAdam Leventhal 
508f94275ceSAdam Leventhal 	for (c = 0; c < scols; c++) {
509fa9e4066Sahrens 		col = f + c;
510fa9e4066Sahrens 		coff = o;
511fa9e4066Sahrens 		if (col >= dcols) {
512fa9e4066Sahrens 			col -= dcols;
513fa9e4066Sahrens 			coff += 1ULL << unit_shift;
514fa9e4066Sahrens 		}
51599653d4eSeschrock 		rm->rm_col[c].rc_devidx = col;
516fa9e4066Sahrens 		rm->rm_col[c].rc_offset = coff;
517fa9e4066Sahrens 		rm->rm_col[c].rc_data = NULL;
51822fe2c88SJonathan Adams 		rm->rm_col[c].rc_gdata = NULL;
519fa9e4066Sahrens 		rm->rm_col[c].rc_error = 0;
520fa9e4066Sahrens 		rm->rm_col[c].rc_tried = 0;
521fa9e4066Sahrens 		rm->rm_col[c].rc_skipped = 0;
522f94275ceSAdam Leventhal 
523f94275ceSAdam Leventhal 		if (c >= acols)
524f94275ceSAdam Leventhal 			rm->rm_col[c].rc_size = 0;
525f94275ceSAdam Leventhal 		else if (c < bc)
526f94275ceSAdam Leventhal 			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
527f94275ceSAdam Leventhal 		else
528f94275ceSAdam Leventhal 			rm->rm_col[c].rc_size = q << unit_shift;
529f94275ceSAdam Leventhal 
530f94275ceSAdam Leventhal 		asize += rm->rm_col[c].rc_size;
531fa9e4066Sahrens 	}
532fa9e4066Sahrens 
533f94275ceSAdam Leventhal 	ASSERT3U(asize, ==, tot << unit_shift);
534f94275ceSAdam Leventhal 	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
5352fbc121fSAdam Leventhal 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
5362fbc121fSAdam Leventhal 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
5372fbc121fSAdam Leventhal 	ASSERT3U(rm->rm_nskip, <=, nparity);
538fa9e4066Sahrens 
539fa9e4066Sahrens 	for (c = 0; c < rm->rm_firstdatacol; c++)
540fa9e4066Sahrens 		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
541fa9e4066Sahrens 
542810e43b2SBill Pijewski 	rm->rm_col[c].rc_data = data;
543fa9e4066Sahrens 
544fa9e4066Sahrens 	for (c = c + 1; c < acols; c++)
545fa9e4066Sahrens 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
546fa9e4066Sahrens 		    rm->rm_col[c - 1].rc_size;
547fa9e4066Sahrens 
548fa9e4066Sahrens 	/*
54999653d4eSeschrock 	 * If all data stored spans all columns, there's a danger that parity
55099653d4eSeschrock 	 * will always be on the same device and, since parity isn't read
55199653d4eSeschrock 	 * during normal operation, that that device's I/O bandwidth won't be
55299653d4eSeschrock 	 * used effectively. We therefore switch the parity every 1MB.
55399653d4eSeschrock 	 *
55499653d4eSeschrock 	 * ... at least that was, ostensibly, the theory. As a practical
55599653d4eSeschrock 	 * matter unless we juggle the parity between all devices evenly, we
55699653d4eSeschrock 	 * won't see any benefit. Further, occasional writes that aren't a
55799653d4eSeschrock 	 * multiple of the LCM of the number of children and the minimum
55899653d4eSeschrock 	 * stripe width are sufficient to avoid pessimal behavior.
55999653d4eSeschrock 	 * Unfortunately, this decision created an implicit on-disk format
560c7a40cc4Sahl 	 * requirement that we need to support for all eternity, but only
561c7a40cc4Sahl 	 * for single-parity RAID-Z.
5622fbc121fSAdam Leventhal 	 *
5632fbc121fSAdam Leventhal 	 * If we intend to skip a sector in the zeroth column for padding
5642fbc121fSAdam Leventhal 	 * we must make sure to note this swap. We will never intend to
5652fbc121fSAdam Leventhal 	 * skip the first column since at least one data and one parity
5662fbc121fSAdam Leventhal 	 * column must appear in each row.
567fa9e4066Sahrens 	 */
568fa9e4066Sahrens 	ASSERT(rm->rm_cols >= 2);
569fa9e4066Sahrens 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
570fa9e4066Sahrens 
571810e43b2SBill Pijewski 	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
57299653d4eSeschrock 		devidx = rm->rm_col[0].rc_devidx;
573fa9e4066Sahrens 		o = rm->rm_col[0].rc_offset;
57499653d4eSeschrock 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
575fa9e4066Sahrens 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
57699653d4eSeschrock 		rm->rm_col[1].rc_devidx = devidx;
577fa9e4066Sahrens 		rm->rm_col[1].rc_offset = o;
5782fbc121fSAdam Leventhal 
5792fbc121fSAdam Leventhal 		if (rm->rm_skipstart == 0)
5802fbc121fSAdam Leventhal 			rm->rm_skipstart = 1;
581fa9e4066Sahrens 	}
582fa9e4066Sahrens 
583fa9e4066Sahrens 	return (rm);
584fa9e4066Sahrens }
585fa9e4066Sahrens 
586fa9e4066Sahrens static void
vdev_raidz_generate_parity_p(raidz_map_t * rm)58799653d4eSeschrock vdev_raidz_generate_parity_p(raidz_map_t *rm)
588fa9e4066Sahrens {
58999653d4eSeschrock 	uint64_t *p, *src, pcount, ccount, i;
59099653d4eSeschrock 	int c;
591fa9e4066Sahrens 
59299653d4eSeschrock 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
59399653d4eSeschrock 
59499653d4eSeschrock 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
59599653d4eSeschrock 		src = rm->rm_col[c].rc_data;
59699653d4eSeschrock 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
59799653d4eSeschrock 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
59899653d4eSeschrock 
59999653d4eSeschrock 		if (c == rm->rm_firstdatacol) {
60099653d4eSeschrock 			ASSERT(ccount == pcount);
601f94275ceSAdam Leventhal 			for (i = 0; i < ccount; i++, src++, p++) {
60299653d4eSeschrock 				*p = *src;
60399653d4eSeschrock 			}
60499653d4eSeschrock 		} else {
60599653d4eSeschrock 			ASSERT(ccount <= pcount);
606f94275ceSAdam Leventhal 			for (i = 0; i < ccount; i++, src++, p++) {
60799653d4eSeschrock 				*p ^= *src;
60899653d4eSeschrock 			}
60999653d4eSeschrock 		}
61099653d4eSeschrock 	}
61199653d4eSeschrock }
61299653d4eSeschrock 
61399653d4eSeschrock static void
vdev_raidz_generate_parity_pq(raidz_map_t * rm)61499653d4eSeschrock vdev_raidz_generate_parity_pq(raidz_map_t *rm)
61599653d4eSeschrock {
616f94275ceSAdam Leventhal 	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
61799653d4eSeschrock 	int c;
61899653d4eSeschrock 
619f94275ceSAdam Leventhal 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
62099653d4eSeschrock 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
62199653d4eSeschrock 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
62299653d4eSeschrock 
62399653d4eSeschrock 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
62499653d4eSeschrock 		src = rm->rm_col[c].rc_data;
62599653d4eSeschrock 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
62699653d4eSeschrock 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
627f94275ceSAdam Leventhal 
628f94275ceSAdam Leventhal 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
62999653d4eSeschrock 
63099653d4eSeschrock 		if (c == rm->rm_firstdatacol) {
631f94275ceSAdam Leventhal 			ASSERT(ccnt == pcnt || ccnt == 0);
632f94275ceSAdam Leventhal 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
63399653d4eSeschrock 				*p = *src;
634f94275ceSAdam Leventhal 				*q = *src;
63599653d4eSeschrock 			}
636f94275ceSAdam Leventhal 			for (; i < pcnt; i++, src++, p++, q++) {
63799653d4eSeschrock 				*p = 0;
638f94275ceSAdam Leventhal 				*q = 0;
63999653d4eSeschrock 			}
64099653d4eSeschrock 		} else {
641f94275ceSAdam Leventhal 			ASSERT(ccnt <= pcnt);
64299653d4eSeschrock 
64399653d4eSeschrock 			/*
644f94275ceSAdam Leventhal 			 * Apply the algorithm described above by multiplying
645f94275ceSAdam Leventhal 			 * the previous result and adding in the new value.
64699653d4eSeschrock 			 */
647f94275ceSAdam Leventhal 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
64899653d4eSeschrock 				*p ^= *src;
649f94275ceSAdam Leventhal 
650f94275ceSAdam Leventhal 				VDEV_RAIDZ_64MUL_2(*q, mask);
651f94275ceSAdam Leventhal 				*q ^= *src;
65299653d4eSeschrock 			}
65399653d4eSeschrock 
65499653d4eSeschrock 			/*
65599653d4eSeschrock 			 * Treat short columns as though they are full of 0s.
656f94275ceSAdam Leventhal 			 * Note that there's therefore nothing needed for P.
65799653d4eSeschrock 			 */
658f94275ceSAdam Leventhal 			for (; i < pcnt; i++, q++) {
659f94275ceSAdam Leventhal 				VDEV_RAIDZ_64MUL_2(*q, mask);
66099653d4eSeschrock 			}
66199653d4eSeschrock 		}
66299653d4eSeschrock 	}
66399653d4eSeschrock }
66499653d4eSeschrock 
66599653d4eSeschrock static void
vdev_raidz_generate_parity_pqr(raidz_map_t * rm)666f94275ceSAdam Leventhal vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
667f94275ceSAdam Leventhal {
668f94275ceSAdam Leventhal 	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
669f94275ceSAdam Leventhal 	int c;
670f94275ceSAdam Leventhal 
671f94275ceSAdam Leventhal 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
672f94275ceSAdam Leventhal 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
673f94275ceSAdam Leventhal 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
674f94275ceSAdam Leventhal 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
675f94275ceSAdam Leventhal 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
676f94275ceSAdam Leventhal 
677f94275ceSAdam Leventhal 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
678f94275ceSAdam Leventhal 		src = rm->rm_col[c].rc_data;
679f94275ceSAdam Leventhal 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
680f94275ceSAdam Leventhal 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
681f94275ceSAdam Leventhal 		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
682f94275ceSAdam Leventhal 
683f94275ceSAdam Leventhal 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
684f94275ceSAdam Leventhal 
685f94275ceSAdam Leventhal 		if (c == rm->rm_firstdatacol) {
686f94275ceSAdam Leventhal 			ASSERT(ccnt == pcnt || ccnt == 0);
687f94275ceSAdam Leventhal 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
688f94275ceSAdam Leventhal 				*p = *src;
689f94275ceSAdam Leventhal 				*q = *src;
690f94275ceSAdam Leventhal 				*r = *src;
691f94275ceSAdam Leventhal 			}
692f94275ceSAdam Leventhal 			for (; i < pcnt; i++, src++, p++, q++, r++) {
693f94275ceSAdam Leventhal 				*p = 0;
694f94275ceSAdam Leventhal 				*q = 0;
695f94275ceSAdam Leventhal 				*r = 0;
696f94275ceSAdam Leventhal 			}
697f94275ceSAdam Leventhal 		} else {
698f94275ceSAdam Leventhal 			ASSERT(ccnt <= pcnt);
699f94275ceSAdam Leventhal 
700f94275ceSAdam Leventhal 			/*
701f94275ceSAdam Leventhal 			 * Apply the algorithm described above by multiplying
702f94275ceSAdam Leventhal 			 * the previous result and adding in the new value.
703f94275ceSAdam Leventhal 			 */
704f94275ceSAdam Leventhal 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
705f94275ceSAdam Leventhal 				*p ^= *src;
706f94275ceSAdam Leventhal 
707f94275ceSAdam Leventhal 				VDEV_RAIDZ_64MUL_2(*q, mask);
708f94275ceSAdam Leventhal 				*q ^= *src;
709f94275ceSAdam Leventhal 
710f94275ceSAdam Leventhal 				VDEV_RAIDZ_64MUL_4(*r, mask);
711f94275ceSAdam Leventhal 				*r ^= *src;
712f94275ceSAdam Leventhal 			}
713f94275ceSAdam Leventhal 
714f94275ceSAdam Leventhal 			/*
715f94275ceSAdam Leventhal 			 * Treat short columns as though they are full of 0s.
716f94275ceSAdam Leventhal 			 * Note that there's therefore nothing needed for P.
717f94275ceSAdam Leventhal 			 */
718f94275ceSAdam Leventhal 			for (; i < pcnt; i++, q++, r++) {
719f94275ceSAdam Leventhal 				VDEV_RAIDZ_64MUL_2(*q, mask);
720f94275ceSAdam Leventhal 				VDEV_RAIDZ_64MUL_4(*r, mask);
721f94275ceSAdam Leventhal 			}
722f94275ceSAdam Leventhal 		}
723f94275ceSAdam Leventhal 	}
724f94275ceSAdam Leventhal }
725f94275ceSAdam Leventhal 
726f94275ceSAdam Leventhal /*
727f94275ceSAdam Leventhal  * Generate RAID parity in the first virtual columns according to the number of
728f94275ceSAdam Leventhal  * parity columns available.
729f94275ceSAdam Leventhal  */
730f94275ceSAdam Leventhal static void
vdev_raidz_generate_parity(raidz_map_t * rm)731f94275ceSAdam Leventhal vdev_raidz_generate_parity(raidz_map_t *rm)
732f94275ceSAdam Leventhal {
733f94275ceSAdam Leventhal 	switch (rm->rm_firstdatacol) {
734f94275ceSAdam Leventhal 	case 1:
735f94275ceSAdam Leventhal 		vdev_raidz_generate_parity_p(rm);
736f94275ceSAdam Leventhal 		break;
737f94275ceSAdam Leventhal 	case 2:
738f94275ceSAdam Leventhal 		vdev_raidz_generate_parity_pq(rm);
739f94275ceSAdam Leventhal 		break;
740f94275ceSAdam Leventhal 	case 3:
741f94275ceSAdam Leventhal 		vdev_raidz_generate_parity_pqr(rm);
742f94275ceSAdam Leventhal 		break;
743f94275ceSAdam Leventhal 	default:
744f94275ceSAdam Leventhal 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
745f94275ceSAdam Leventhal 	}
746f94275ceSAdam Leventhal }
747f94275ceSAdam Leventhal 
748f94275ceSAdam Leventhal static int
vdev_raidz_reconstruct_p(raidz_map_t * rm,int * tgts,int ntgts)749f94275ceSAdam Leventhal vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
75099653d4eSeschrock {
75199653d4eSeschrock 	uint64_t *dst, *src, xcount, ccount, count, i;
752f94275ceSAdam Leventhal 	int x = tgts[0];
75399653d4eSeschrock 	int c;
75499653d4eSeschrock 
755f94275ceSAdam Leventhal 	ASSERT(ntgts == 1);
756f94275ceSAdam Leventhal 	ASSERT(x >= rm->rm_firstdatacol);
757f94275ceSAdam Leventhal 	ASSERT(x < rm->rm_cols);
758f94275ceSAdam Leventhal 
75999653d4eSeschrock 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
76099653d4eSeschrock 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
76199653d4eSeschrock 	ASSERT(xcount > 0);
76299653d4eSeschrock 
76399653d4eSeschrock 	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
76499653d4eSeschrock 	dst = rm->rm_col[x].rc_data;
76599653d4eSeschrock 	for (i = 0; i < xcount; i++, dst++, src++) {
76699653d4eSeschrock 		*dst = *src;
76799653d4eSeschrock 	}
76899653d4eSeschrock 
76999653d4eSeschrock 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
770fa9e4066Sahrens 		src = rm->rm_col[c].rc_data;
771fa9e4066Sahrens 		dst = rm->rm_col[x].rc_data;
77299653d4eSeschrock 
77399653d4eSeschrock 		if (c == x)
77499653d4eSeschrock 			continue;
77599653d4eSeschrock 
77699653d4eSeschrock 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
77799653d4eSeschrock 		count = MIN(ccount, xcount);
77899653d4eSeschrock 
77999653d4eSeschrock 		for (i = 0; i < count; i++, dst++, src++) {
78099653d4eSeschrock 			*dst ^= *src;
78199653d4eSeschrock 		}
78299653d4eSeschrock 	}
783f94275ceSAdam Leventhal 
784f94275ceSAdam Leventhal 	return (1 << VDEV_RAIDZ_P);
78599653d4eSeschrock }
78699653d4eSeschrock 
787f94275ceSAdam Leventhal static int
vdev_raidz_reconstruct_q(raidz_map_t * rm,int * tgts,int ntgts)788f94275ceSAdam Leventhal vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
78999653d4eSeschrock {
79099653d4eSeschrock 	uint64_t *dst, *src, xcount, ccount, count, mask, i;
79199653d4eSeschrock 	uint8_t *b;
792f94275ceSAdam Leventhal 	int x = tgts[0];
79399653d4eSeschrock 	int c, j, exp;
79499653d4eSeschrock 
795f94275ceSAdam Leventhal 	ASSERT(ntgts == 1);
796f94275ceSAdam Leventhal 
79799653d4eSeschrock 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
79899653d4eSeschrock 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
79999653d4eSeschrock 
80099653d4eSeschrock 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
80199653d4eSeschrock 		src = rm->rm_col[c].rc_data;
80299653d4eSeschrock 		dst = rm->rm_col[x].rc_data;
80399653d4eSeschrock 
80499653d4eSeschrock 		if (c == x)
80599653d4eSeschrock 			ccount = 0;
80699653d4eSeschrock 		else
80799653d4eSeschrock 			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
80899653d4eSeschrock 
80999653d4eSeschrock 		count = MIN(ccount, xcount);
81099653d4eSeschrock 
81199653d4eSeschrock 		if (c == rm->rm_firstdatacol) {
81299653d4eSeschrock 			for (i = 0; i < count; i++, dst++, src++) {
81399653d4eSeschrock 				*dst = *src;
81499653d4eSeschrock 			}
81599653d4eSeschrock 			for (; i < xcount; i++, dst++) {
81699653d4eSeschrock 				*dst = 0;
81799653d4eSeschrock 			}
81899653d4eSeschrock 
819fa9e4066Sahrens 		} else {
82099653d4eSeschrock 			for (i = 0; i < count; i++, dst++, src++) {
821f94275ceSAdam Leventhal 				VDEV_RAIDZ_64MUL_2(*dst, mask);
82299653d4eSeschrock 				*dst ^= *src;
82399653d4eSeschrock 			}
82499653d4eSeschrock 
82599653d4eSeschrock 			for (; i < xcount; i++, dst++) {
826f94275ceSAdam Leventhal 				VDEV_RAIDZ_64MUL_2(*dst, mask);
827fa9e4066Sahrens 			}
828fa9e4066Sahrens 		}
829fa9e4066Sahrens 	}
830fa9e4066Sahrens 
83199653d4eSeschrock 	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
83299653d4eSeschrock 	dst = rm->rm_col[x].rc_data;
83399653d4eSeschrock 	exp = 255 - (rm->rm_cols - 1 - x);
83499653d4eSeschrock 
83599653d4eSeschrock 	for (i = 0; i < xcount; i++, dst++, src++) {
83699653d4eSeschrock 		*dst ^= *src;
83799653d4eSeschrock 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
83899653d4eSeschrock 			*b = vdev_raidz_exp2(*b, exp);
83999653d4eSeschrock 		}
84099653d4eSeschrock 	}
841f94275ceSAdam Leventhal 
842f94275ceSAdam Leventhal 	return (1 << VDEV_RAIDZ_Q);
84399653d4eSeschrock }
84499653d4eSeschrock 
845f94275ceSAdam Leventhal static int
vdev_raidz_reconstruct_pq(raidz_map_t * rm,int * tgts,int ntgts)846f94275ceSAdam Leventhal vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
84799653d4eSeschrock {
84899653d4eSeschrock 	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
84999653d4eSeschrock 	void *pdata, *qdata;
85099653d4eSeschrock 	uint64_t xsize, ysize, i;
851f94275ceSAdam Leventhal 	int x = tgts[0];
852f94275ceSAdam Leventhal 	int y = tgts[1];
85399653d4eSeschrock 
854f94275ceSAdam Leventhal 	ASSERT(ntgts == 2);
85599653d4eSeschrock 	ASSERT(x < y);
85699653d4eSeschrock 	ASSERT(x >= rm->rm_firstdatacol);
85799653d4eSeschrock 	ASSERT(y < rm->rm_cols);
85899653d4eSeschrock 
85999653d4eSeschrock 	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
86099653d4eSeschrock 
86199653d4eSeschrock 	/*
86299653d4eSeschrock 	 * Move the parity data aside -- we're going to compute parity as
86399653d4eSeschrock 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
86499653d4eSeschrock 	 * reuse the parity generation mechanism without trashing the actual
86599653d4eSeschrock 	 * parity so we make those columns appear to be full of zeros by
86699653d4eSeschrock 	 * setting their lengths to zero.
86799653d4eSeschrock 	 */
86899653d4eSeschrock 	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
86999653d4eSeschrock 	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
87099653d4eSeschrock 	xsize = rm->rm_col[x].rc_size;
87199653d4eSeschrock 	ysize = rm->rm_col[y].rc_size;
87299653d4eSeschrock 
87399653d4eSeschrock 	rm->rm_col[VDEV_RAIDZ_P].rc_data =
87499653d4eSeschrock 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
87599653d4eSeschrock 	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
87699653d4eSeschrock 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
87799653d4eSeschrock 	rm->rm_col[x].rc_size = 0;
87899653d4eSeschrock 	rm->rm_col[y].rc_size = 0;
87999653d4eSeschrock 
88099653d4eSeschrock 	vdev_raidz_generate_parity_pq(rm);
88199653d4eSeschrock 
88299653d4eSeschrock 	rm->rm_col[x].rc_size = xsize;
88399653d4eSeschrock 	rm->rm_col[y].rc_size = ysize;
88499653d4eSeschrock 
88599653d4eSeschrock 	p = pdata;
88699653d4eSeschrock 	q = qdata;
88799653d4eSeschrock 	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
88899653d4eSeschrock 	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
88999653d4eSeschrock 	xd = rm->rm_col[x].rc_data;
89099653d4eSeschrock 	yd = rm->rm_col[y].rc_data;
89199653d4eSeschrock 
89299653d4eSeschrock 	/*
89399653d4eSeschrock 	 * We now have:
89499653d4eSeschrock 	 *	Pxy = P + D_x + D_y
89599653d4eSeschrock 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
89699653d4eSeschrock 	 *
89799653d4eSeschrock 	 * We can then solve for D_x:
89899653d4eSeschrock 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
89999653d4eSeschrock 	 * where
90099653d4eSeschrock 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
90199653d4eSeschrock 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
90299653d4eSeschrock 	 *
90399653d4eSeschrock 	 * With D_x in hand, we can easily solve for D_y:
90499653d4eSeschrock 	 *	D_y = P + Pxy + D_x
90599653d4eSeschrock 	 */
90699653d4eSeschrock 
90799653d4eSeschrock 	a = vdev_raidz_pow2[255 + x - y];
90899653d4eSeschrock 	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
90999653d4eSeschrock 	tmp = 255 - vdev_raidz_log2[a ^ 1];
91099653d4eSeschrock 
91199653d4eSeschrock 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
91299653d4eSeschrock 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
91399653d4eSeschrock 
91499653d4eSeschrock 	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
91599653d4eSeschrock 		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
91699653d4eSeschrock 		    vdev_raidz_exp2(*q ^ *qxy, bexp);
91799653d4eSeschrock 
91899653d4eSeschrock 		if (i < ysize)
91999653d4eSeschrock 			*yd = *p ^ *pxy ^ *xd;
92099653d4eSeschrock 	}
92199653d4eSeschrock 
92299653d4eSeschrock 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
92399653d4eSeschrock 	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
92499653d4eSeschrock 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
92599653d4eSeschrock 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
92699653d4eSeschrock 
92799653d4eSeschrock 	/*
92899653d4eSeschrock 	 * Restore the saved parity data.
92999653d4eSeschrock 	 */
93099653d4eSeschrock 	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
93199653d4eSeschrock 	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
932f94275ceSAdam Leventhal 
933f94275ceSAdam Leventhal 	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
93499653d4eSeschrock }
93599653d4eSeschrock 
936f94275ceSAdam Leventhal /* BEGIN CSTYLED */
937f94275ceSAdam Leventhal /*
938f94275ceSAdam Leventhal  * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
940f94275ceSAdam Leventhal  * the contents of the data and parity disks. This can be expressed with
941f94275ceSAdam Leventhal  * vectors for the original data (D) and the actual data (d) and parity (p)
942f94275ceSAdam Leventhal  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
943f94275ceSAdam Leventhal  *
944f94275ceSAdam Leventhal  *            __   __                     __     __
945f94275ceSAdam Leventhal  *            |     |         __     __   |  p_0  |
946f94275ceSAdam Leventhal  *            |  V  |         |  D_0  |   | p_m-1 |
947f94275ceSAdam Leventhal  *            |     |    x    |   :   | = |  d_0  |
948f94275ceSAdam Leventhal  *            |  I  |         | D_n-1 |   |   :   |
949f94275ceSAdam Leventhal  *            |     |         ~~     ~~   | d_n-1 |
950f94275ceSAdam Leventhal  *            ~~   ~~                     ~~     ~~
951f94275ceSAdam Leventhal  *
952f94275ceSAdam Leventhal  * I is simply a square identity matrix of size n, and V is a vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity, speedy
 * computation, and linear separability.
956f94275ceSAdam Leventhal  *
957f94275ceSAdam Leventhal  *      __               __               __     __
958f94275ceSAdam Leventhal  *      |   1   ..  1 1 1 |               |  p_0  |
959f94275ceSAdam Leventhal  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
960f94275ceSAdam Leventhal  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
961f94275ceSAdam Leventhal  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
962f94275ceSAdam Leventhal  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
963f94275ceSAdam Leventhal  *      |   :       : : : |   |   :   |   |  d_2  |
964f94275ceSAdam Leventhal  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
965f94275ceSAdam Leventhal  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
966f94275ceSAdam Leventhal  *      |   0   ..  0 0 1 |               | d_n-1 |
967f94275ceSAdam Leventhal  *      ~~               ~~               ~~     ~~
968f94275ceSAdam Leventhal  *
969f94275ceSAdam Leventhal  * Note that I, V, d, and p are known. To compute D, we must invert the
970f94275ceSAdam Leventhal  * matrix and use the known data and parity values to reconstruct the unknown
971f94275ceSAdam Leventhal  * data values. We begin by removing the rows in V|I and d|p that correspond
972f94275ceSAdam Leventhal  * to failed or missing columns; we then make V|I square (n x n) and d|p
973f94275ceSAdam Leventhal  * sized n by removing rows corresponding to unused parity from the bottom up
974f94275ceSAdam Leventhal  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
975f94275ceSAdam Leventhal  * using Gauss-Jordan elimination. In the example below we use m=3 parity
976f94275ceSAdam Leventhal  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
977f94275ceSAdam Leventhal  *           __                               __
978f94275ceSAdam Leventhal  *           |  1   1   1   1   1   1   1   1  |
979f94275ceSAdam Leventhal  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
980f94275ceSAdam Leventhal  *           |  19 205 116  29  64  16  4   1  |      / /
981f94275ceSAdam Leventhal  *           |  1   0   0   0   0   0   0   0  |     / /
982f94275ceSAdam Leventhal  *           |  0   1   0   0   0   0   0   0  | <--' /
983f94275ceSAdam Leventhal  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
984f94275ceSAdam Leventhal  *           |  0   0   0   1   0   0   0   0  |
985f94275ceSAdam Leventhal  *           |  0   0   0   0   1   0   0   0  |
986f94275ceSAdam Leventhal  *           |  0   0   0   0   0   1   0   0  |
987f94275ceSAdam Leventhal  *           |  0   0   0   0   0   0   1   0  |
988f94275ceSAdam Leventhal  *           |  0   0   0   0   0   0   0   1  |
989f94275ceSAdam Leventhal  *           ~~                               ~~
990f94275ceSAdam Leventhal  *           __                               __
991f94275ceSAdam Leventhal  *           |  1   1   1   1   1   1   1   1  |
992f94275ceSAdam Leventhal  *           |  19 205 116  29  64  16  4   1  |
993f94275ceSAdam Leventhal  *           |  1   0   0   0   0   0   0   0  |
994810e43b2SBill Pijewski  *  (V|I)' = |  0   0   0   1   0   0   0   0  |
995f94275ceSAdam Leventhal  *           |  0   0   0   0   1   0   0   0  |
996f94275ceSAdam Leventhal  *           |  0   0   0   0   0   1   0   0  |
997f94275ceSAdam Leventhal  *           |  0   0   0   0   0   0   1   0  |
998f94275ceSAdam Leventhal  *           |  0   0   0   0   0   0   0   1  |
999f94275ceSAdam Leventhal  *           ~~                               ~~
1000f94275ceSAdam Leventhal  *
1001f94275ceSAdam Leventhal  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1002f94275ceSAdam Leventhal  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1003f94275ceSAdam Leventhal  * matrix is not singular.
1004f94275ceSAdam Leventhal  * __                                                                 __
1005f94275ceSAdam Leventhal  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1006f94275ceSAdam Leventhal  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1007f94275ceSAdam Leventhal  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1008f94275ceSAdam Leventhal  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1009f94275ceSAdam Leventhal  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1010f94275ceSAdam Leventhal  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1011f94275ceSAdam Leventhal  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1012f94275ceSAdam Leventhal  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1013f94275ceSAdam Leventhal  * ~~                                                                 ~~
1014f94275ceSAdam Leventhal  * __                                                                 __
1015f94275ceSAdam Leventhal  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1016f94275ceSAdam Leventhal  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1017f94275ceSAdam Leventhal  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1018f94275ceSAdam Leventhal  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1019f94275ceSAdam Leventhal  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1020f94275ceSAdam Leventhal  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1021f94275ceSAdam Leventhal  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1022f94275ceSAdam Leventhal  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1023f94275ceSAdam Leventhal  * ~~                                                                 ~~
1024f94275ceSAdam Leventhal  * __                                                                 __
1025f94275ceSAdam Leventhal  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1026f94275ceSAdam Leventhal  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1027f94275ceSAdam Leventhal  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1028f94275ceSAdam Leventhal  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1029f94275ceSAdam Leventhal  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1030f94275ceSAdam Leventhal  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1031f94275ceSAdam Leventhal  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1032f94275ceSAdam Leventhal  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1033f94275ceSAdam Leventhal  * ~~                                                                 ~~
1034f94275ceSAdam Leventhal  * __                                                                 __
1035f94275ceSAdam Leventhal  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1036f94275ceSAdam Leventhal  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1037f94275ceSAdam Leventhal  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1038f94275ceSAdam Leventhal  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1039f94275ceSAdam Leventhal  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1040f94275ceSAdam Leventhal  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1041f94275ceSAdam Leventhal  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1042f94275ceSAdam Leventhal  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1043f94275ceSAdam Leventhal  * ~~                                                                 ~~
1044f94275ceSAdam Leventhal  * __                                                                 __
1045f94275ceSAdam Leventhal  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1046f94275ceSAdam Leventhal  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1047f94275ceSAdam Leventhal  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1048f94275ceSAdam Leventhal  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1049f94275ceSAdam Leventhal  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1050f94275ceSAdam Leventhal  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1051f94275ceSAdam Leventhal  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1052f94275ceSAdam Leventhal  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1053f94275ceSAdam Leventhal  * ~~                                                                 ~~
1054f94275ceSAdam Leventhal  * __                                                                 __
1055f94275ceSAdam Leventhal  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1056f94275ceSAdam Leventhal  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1057f94275ceSAdam Leventhal  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1058f94275ceSAdam Leventhal  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1059f94275ceSAdam Leventhal  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1060f94275ceSAdam Leventhal  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1061f94275ceSAdam Leventhal  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1062f94275ceSAdam Leventhal  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1063f94275ceSAdam Leventhal  * ~~                                                                 ~~
1064f94275ceSAdam Leventhal  *                   __                               __
1065f94275ceSAdam Leventhal  *                   |  0   0   1   0   0   0   0   0  |
1066f94275ceSAdam Leventhal  *                   | 167 100  5   41 159 169 217 208 |
1067f94275ceSAdam Leventhal  *                   | 166 100  4   40 158 168 216 209 |
1068f94275ceSAdam Leventhal  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1069f94275ceSAdam Leventhal  *                   |  0   0   0   0   1   0   0   0  |
1070f94275ceSAdam Leventhal  *                   |  0   0   0   0   0   1   0   0  |
1071f94275ceSAdam Leventhal  *                   |  0   0   0   0   0   0   1   0  |
1072f94275ceSAdam Leventhal  *                   |  0   0   0   0   0   0   0   1  |
1073f94275ceSAdam Leventhal  *                   ~~                               ~~
1074f94275ceSAdam Leventhal  *
1075f94275ceSAdam Leventhal  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1076f94275ceSAdam Leventhal  * of the missing data.
1077f94275ceSAdam Leventhal  *
1078f94275ceSAdam Leventhal  * As is apparent from the example above, the only non-trivial rows in the
1079f94275ceSAdam Leventhal  * inverse matrix correspond to the data disks that we're trying to
1080f94275ceSAdam Leventhal  * reconstruct. Indeed, those are the only rows we need as the others would
1081f94275ceSAdam Leventhal  * only be useful for reconstructing data known or assumed to be valid. For
1082f94275ceSAdam Leventhal  * that reason, we only build the coefficients in the rows that correspond to
1083f94275ceSAdam Leventhal  * targeted columns.
1084f94275ceSAdam Leventhal  */
1085f94275ceSAdam Leventhal /* END CSTYLED */
1086f94275ceSAdam Leventhal 
1087f94275ceSAdam Leventhal static void
vdev_raidz_matrix_init(raidz_map_t * rm,int n,int nmap,int * map,uint8_t ** rows)1088f94275ceSAdam Leventhal vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1089f94275ceSAdam Leventhal     uint8_t **rows)
1090f94275ceSAdam Leventhal {
1091f94275ceSAdam Leventhal 	int i, j;
1092f94275ceSAdam Leventhal 	int pow;
1093f94275ceSAdam Leventhal 
1094f94275ceSAdam Leventhal 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1095f94275ceSAdam Leventhal 
1096f94275ceSAdam Leventhal 	/*
1097f94275ceSAdam Leventhal 	 * Fill in the missing rows of interest.
1098f94275ceSAdam Leventhal 	 */
1099f94275ceSAdam Leventhal 	for (i = 0; i < nmap; i++) {
1100f94275ceSAdam Leventhal 		ASSERT3S(0, <=, map[i]);
1101f94275ceSAdam Leventhal 		ASSERT3S(map[i], <=, 2);
1102f94275ceSAdam Leventhal 
1103f94275ceSAdam Leventhal 		pow = map[i] * n;
1104f94275ceSAdam Leventhal 		if (pow > 255)
1105f94275ceSAdam Leventhal 			pow -= 255;
1106f94275ceSAdam Leventhal 		ASSERT(pow <= 255);
1107f94275ceSAdam Leventhal 
1108f94275ceSAdam Leventhal 		for (j = 0; j < n; j++) {
1109f94275ceSAdam Leventhal 			pow -= map[i];
1110f94275ceSAdam Leventhal 			if (pow < 0)
1111f94275ceSAdam Leventhal 				pow += 255;
1112f94275ceSAdam Leventhal 			rows[i][j] = vdev_raidz_pow2[pow];
1113f94275ceSAdam Leventhal 		}
1114f94275ceSAdam Leventhal 	}
1115f94275ceSAdam Leventhal }
1116f94275ceSAdam Leventhal 
1117f94275ceSAdam Leventhal static void
vdev_raidz_matrix_invert(raidz_map_t * rm,int n,int nmissing,int * missing,uint8_t ** rows,uint8_t ** invrows,const uint8_t * used)1118f94275ceSAdam Leventhal vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1119f94275ceSAdam Leventhal     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1120f94275ceSAdam Leventhal {
1121f94275ceSAdam Leventhal 	int i, j, ii, jj;
1122f94275ceSAdam Leventhal 	uint8_t log;
1123f94275ceSAdam Leventhal 
1124f94275ceSAdam Leventhal 	/*
1125f94275ceSAdam Leventhal 	 * Assert that the first nmissing entries from the array of used
1126f94275ceSAdam Leventhal 	 * columns correspond to parity columns and that subsequent entries
1127f94275ceSAdam Leventhal 	 * correspond to data columns.
1128f94275ceSAdam Leventhal 	 */
1129f94275ceSAdam Leventhal 	for (i = 0; i < nmissing; i++) {
1130f94275ceSAdam Leventhal 		ASSERT3S(used[i], <, rm->rm_firstdatacol);
1131f94275ceSAdam Leventhal 	}
1132f94275ceSAdam Leventhal 	for (; i < n; i++) {
1133f94275ceSAdam Leventhal 		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1134f94275ceSAdam Leventhal 	}
1135f94275ceSAdam Leventhal 
1136f94275ceSAdam Leventhal 	/*
1137f94275ceSAdam Leventhal 	 * First initialize the storage where we'll compute the inverse rows.
1138f94275ceSAdam Leventhal 	 */
1139f94275ceSAdam Leventhal 	for (i = 0; i < nmissing; i++) {
1140f94275ceSAdam Leventhal 		for (j = 0; j < n; j++) {
1141f94275ceSAdam Leventhal 			invrows[i][j] = (i == j) ? 1 : 0;
1142f94275ceSAdam Leventhal 		}
1143f94275ceSAdam Leventhal 	}
1144f94275ceSAdam Leventhal 
1145f94275ceSAdam Leventhal 	/*
1146f94275ceSAdam Leventhal 	 * Subtract all trivial rows from the rows of consequence.
1147f94275ceSAdam Leventhal 	 */
1148f94275ceSAdam Leventhal 	for (i = 0; i < nmissing; i++) {
1149f94275ceSAdam Leventhal 		for (j = nmissing; j < n; j++) {
1150f94275ceSAdam Leventhal 			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1151f94275ceSAdam Leventhal 			jj = used[j] - rm->rm_firstdatacol;
1152f94275ceSAdam Leventhal 			ASSERT3S(jj, <, n);
1153f94275ceSAdam Leventhal 			invrows[i][j] = rows[i][jj];
1154f94275ceSAdam Leventhal 			rows[i][jj] = 0;
1155f94275ceSAdam Leventhal 		}
1156f94275ceSAdam Leventhal 	}
1157f94275ceSAdam Leventhal 
1158f94275ceSAdam Leventhal 	/*
1159f94275ceSAdam Leventhal 	 * For each of the rows of interest, we must normalize it and subtract
1160f94275ceSAdam Leventhal 	 * a multiple of it from the other rows.
1161f94275ceSAdam Leventhal 	 */
1162f94275ceSAdam Leventhal 	for (i = 0; i < nmissing; i++) {
1163f94275ceSAdam Leventhal 		for (j = 0; j < missing[i]; j++) {
1164fb09f5aaSMadhav Suresh 			ASSERT0(rows[i][j]);
1165f94275ceSAdam Leventhal 		}
1166f94275ceSAdam Leventhal 		ASSERT3U(rows[i][missing[i]], !=, 0);
1167f94275ceSAdam Leventhal 
1168f94275ceSAdam Leventhal 		/*
1169f94275ceSAdam Leventhal 		 * Compute the inverse of the first element and multiply each
1170f94275ceSAdam Leventhal 		 * element in the row by that value.
1171f94275ceSAdam Leventhal 		 */
1172f94275ceSAdam Leventhal 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1173f94275ceSAdam Leventhal 
1174f94275ceSAdam Leventhal 		for (j = 0; j < n; j++) {
1175f94275ceSAdam Leventhal 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1176f94275ceSAdam Leventhal 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1177f94275ceSAdam Leventhal 		}
1178f94275ceSAdam Leventhal 
1179f94275ceSAdam Leventhal 		for (ii = 0; ii < nmissing; ii++) {
1180f94275ceSAdam Leventhal 			if (i == ii)
1181f94275ceSAdam Leventhal 				continue;
1182f94275ceSAdam Leventhal 
1183f94275ceSAdam Leventhal 			ASSERT3U(rows[ii][missing[i]], !=, 0);
1184f94275ceSAdam Leventhal 
1185f94275ceSAdam Leventhal 			log = vdev_raidz_log2[rows[ii][missing[i]]];
1186f94275ceSAdam Leventhal 
1187f94275ceSAdam Leventhal 			for (j = 0; j < n; j++) {
1188f94275ceSAdam Leventhal 				rows[ii][j] ^=
1189f94275ceSAdam Leventhal 				    vdev_raidz_exp2(rows[i][j], log);
1190f94275ceSAdam Leventhal 				invrows[ii][j] ^=
1191f94275ceSAdam Leventhal 				    vdev_raidz_exp2(invrows[i][j], log);
1192f94275ceSAdam Leventhal 			}
1193f94275ceSAdam Leventhal 		}
1194f94275ceSAdam Leventhal 	}
1195f94275ceSAdam Leventhal 
1196f94275ceSAdam Leventhal 	/*
1197f94275ceSAdam Leventhal 	 * Verify that the data that is left in the rows are properly part of
1198f94275ceSAdam Leventhal 	 * an identity matrix.
1199f94275ceSAdam Leventhal 	 */
1200f94275ceSAdam Leventhal 	for (i = 0; i < nmissing; i++) {
1201f94275ceSAdam Leventhal 		for (j = 0; j < n; j++) {
1202f94275ceSAdam Leventhal 			if (j == missing[i]) {
1203f94275ceSAdam Leventhal 				ASSERT3U(rows[i][j], ==, 1);
1204f94275ceSAdam Leventhal 			} else {
1205fb09f5aaSMadhav Suresh 				ASSERT0(rows[i][j]);
1206f94275ceSAdam Leventhal 			}
1207f94275ceSAdam Leventhal 		}
1208f94275ceSAdam Leventhal 	}
1209f94275ceSAdam Leventhal }
1210f94275ceSAdam Leventhal 
1211f94275ceSAdam Leventhal static void
vdev_raidz_matrix_reconstruct(raidz_map_t * rm,int n,int nmissing,int * missing,uint8_t ** invrows,const uint8_t * used)1212f94275ceSAdam Leventhal vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1213f94275ceSAdam Leventhal     int *missing, uint8_t **invrows, const uint8_t *used)
1214f94275ceSAdam Leventhal {
1215f94275ceSAdam Leventhal 	int i, j, x, cc, c;
1216f94275ceSAdam Leventhal 	uint8_t *src;
1217f94275ceSAdam Leventhal 	uint64_t ccount;
1218f94275ceSAdam Leventhal 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1219f94275ceSAdam Leventhal 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1220d5285caeSGeorge Wilson 	uint8_t log = 0;
1221d5285caeSGeorge Wilson 	uint8_t val;
1222f94275ceSAdam Leventhal 	int ll;
1223f94275ceSAdam Leventhal 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1224f94275ceSAdam Leventhal 	uint8_t *p, *pp;
1225f94275ceSAdam Leventhal 	size_t psize;
1226f94275ceSAdam Leventhal 
1227f94275ceSAdam Leventhal 	psize = sizeof (invlog[0][0]) * n * nmissing;
1228f94275ceSAdam Leventhal 	p = kmem_alloc(psize, KM_SLEEP);
1229f94275ceSAdam Leventhal 
1230f94275ceSAdam Leventhal 	for (pp = p, i = 0; i < nmissing; i++) {
1231f94275ceSAdam Leventhal 		invlog[i] = pp;
1232f94275ceSAdam Leventhal 		pp += n;
1233f94275ceSAdam Leventhal 	}
1234f94275ceSAdam Leventhal 
1235f94275ceSAdam Leventhal 	for (i = 0; i < nmissing; i++) {
1236f94275ceSAdam Leventhal 		for (j = 0; j < n; j++) {
1237f94275ceSAdam Leventhal 			ASSERT3U(invrows[i][j], !=, 0);
1238f94275ceSAdam Leventhal 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1239f94275ceSAdam Leventhal 		}
1240f94275ceSAdam Leventhal 	}
1241f94275ceSAdam Leventhal 
1242f94275ceSAdam Leventhal 	for (i = 0; i < n; i++) {
1243f94275ceSAdam Leventhal 		c = used[i];
1244f94275ceSAdam Leventhal 		ASSERT3U(c, <, rm->rm_cols);
1245f94275ceSAdam Leventhal 
1246f94275ceSAdam Leventhal 		src = rm->rm_col[c].rc_data;
1247f94275ceSAdam Leventhal 		ccount = rm->rm_col[c].rc_size;
1248f94275ceSAdam Leventhal 		for (j = 0; j < nmissing; j++) {
1249f94275ceSAdam Leventhal 			cc = missing[j] + rm->rm_firstdatacol;
1250f94275ceSAdam Leventhal 			ASSERT3U(cc, >=, rm->rm_firstdatacol);
1251f94275ceSAdam Leventhal 			ASSERT3U(cc, <, rm->rm_cols);
1252f94275ceSAdam Leventhal 			ASSERT3U(cc, !=, c);
1253f94275ceSAdam Leventhal 
1254f94275ceSAdam Leventhal 			dst[j] = rm->rm_col[cc].rc_data;
1255f94275ceSAdam Leventhal 			dcount[j] = rm->rm_col[cc].rc_size;
1256f94275ceSAdam Leventhal 		}
1257f94275ceSAdam Leventhal 
1258f94275ceSAdam Leventhal 		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1259f94275ceSAdam Leventhal 
1260f94275ceSAdam Leventhal 		for (x = 0; x < ccount; x++, src++) {
1261f94275ceSAdam Leventhal 			if (*src != 0)
1262f94275ceSAdam Leventhal 				log = vdev_raidz_log2[*src];
1263f94275ceSAdam Leventhal 
1264f94275ceSAdam Leventhal 			for (cc = 0; cc < nmissing; cc++) {
1265f94275ceSAdam Leventhal 				if (x >= dcount[cc])
1266f94275ceSAdam Leventhal 					continue;
1267f94275ceSAdam Leventhal 
1268f94275ceSAdam Leventhal 				if (*src == 0) {
1269f94275ceSAdam Leventhal 					val = 0;
1270f94275ceSAdam Leventhal 				} else {
1271f94275ceSAdam Leventhal 					if ((ll = log + invlog[cc][i]) >= 255)
1272f94275ceSAdam Leventhal 						ll -= 255;
1273f94275ceSAdam Leventhal 					val = vdev_raidz_pow2[ll];
1274f94275ceSAdam Leventhal 				}
1275f94275ceSAdam Leventhal 
1276f94275ceSAdam Leventhal 				if (i == 0)
1277f94275ceSAdam Leventhal 					dst[cc][x] = val;
1278f94275ceSAdam Leventhal 				else
1279f94275ceSAdam Leventhal 					dst[cc][x] ^= val;
1280f94275ceSAdam Leventhal 			}
1281f94275ceSAdam Leventhal 		}
1282f94275ceSAdam Leventhal 	}
1283f94275ceSAdam Leventhal 
1284f94275ceSAdam Leventhal 	kmem_free(p, psize);
1285f94275ceSAdam Leventhal }
1286f94275ceSAdam Leventhal 
1287f94275ceSAdam Leventhal static int
vdev_raidz_reconstruct_general(raidz_map_t * rm,int * tgts,int ntgts)1288f94275ceSAdam Leventhal vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1289f94275ceSAdam Leventhal {
1290f94275ceSAdam Leventhal 	int n, i, c, t, tt;
1291f94275ceSAdam Leventhal 	int nmissing_rows;
1292f94275ceSAdam Leventhal 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1293f94275ceSAdam Leventhal 	int parity_map[VDEV_RAIDZ_MAXPARITY];
1294f94275ceSAdam Leventhal 
1295f94275ceSAdam Leventhal 	uint8_t *p, *pp;
1296f94275ceSAdam Leventhal 	size_t psize;
1297f94275ceSAdam Leventhal 
1298f94275ceSAdam Leventhal 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1299f94275ceSAdam Leventhal 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1300f94275ceSAdam Leventhal 	uint8_t *used;
1301f94275ceSAdam Leventhal 
1302f94275ceSAdam Leventhal 	int code = 0;
1303f94275ceSAdam Leventhal 
1304f94275ceSAdam Leventhal 
1305f94275ceSAdam Leventhal 	n = rm->rm_cols - rm->rm_firstdatacol;
1306f94275ceSAdam Leventhal 
1307f94275ceSAdam Leventhal 	/*
1308f94275ceSAdam Leventhal 	 * Figure out which data columns are missing.
1309f94275ceSAdam Leventhal 	 */
1310f94275ceSAdam Leventhal 	nmissing_rows = 0;
1311f94275ceSAdam Leventhal 	for (t = 0; t < ntgts; t++) {
1312f94275ceSAdam Leventhal 		if (tgts[t] >= rm->rm_firstdatacol) {
1313f94275ceSAdam Leventhal 			missing_rows[nmissing_rows++] =
1314f94275ceSAdam Leventhal 			    tgts[t] - rm->rm_firstdatacol;
1315f94275ceSAdam Leventhal 		}
1316f94275ceSAdam Leventhal 	}
1317f94275ceSAdam Leventhal 
1318f94275ceSAdam Leventhal 	/*
1319f94275ceSAdam Leventhal 	 * Figure out which parity columns to use to help generate the missing
1320f94275ceSAdam Leventhal 	 * data columns.
1321f94275ceSAdam Leventhal 	 */
1322f94275ceSAdam Leventhal 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1323f94275ceSAdam Leventhal 		ASSERT(tt < ntgts);
1324f94275ceSAdam Leventhal 		ASSERT(c < rm->rm_firstdatacol);
1325f94275ceSAdam Leventhal 
1326f94275ceSAdam Leventhal 		/*
1327f94275ceSAdam Leventhal 		 * Skip any targeted parity columns.
1328f94275ceSAdam Leventhal 		 */
1329f94275ceSAdam Leventhal 		if (c == tgts[tt]) {
1330f94275ceSAdam Leventhal 			tt++;
1331f94275ceSAdam Leventhal 			continue;
1332f94275ceSAdam Leventhal 		}
1333f94275ceSAdam Leventhal 
1334f94275ceSAdam Leventhal 		code |= 1 << c;
1335f94275ceSAdam Leventhal 
1336f94275ceSAdam Leventhal 		parity_map[i] = c;
1337f94275ceSAdam Leventhal 		i++;
1338f94275ceSAdam Leventhal 	}
1339f94275ceSAdam Leventhal 
1340f94275ceSAdam Leventhal 	ASSERT(code != 0);
1341f94275ceSAdam Leventhal 	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1342f94275ceSAdam Leventhal 
1343f94275ceSAdam Leventhal 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1344f94275ceSAdam Leventhal 	    nmissing_rows * n + sizeof (used[0]) * n;
1345f94275ceSAdam Leventhal 	p = kmem_alloc(psize, KM_SLEEP);
1346f94275ceSAdam Leventhal 
1347f94275ceSAdam Leventhal 	for (pp = p, i = 0; i < nmissing_rows; i++) {
1348f94275ceSAdam Leventhal 		rows[i] = pp;
1349f94275ceSAdam Leventhal 		pp += n;
1350f94275ceSAdam Leventhal 		invrows[i] = pp;
1351f94275ceSAdam Leventhal 		pp += n;
1352f94275ceSAdam Leventhal 	}
1353f94275ceSAdam Leventhal 	used = pp;
1354f94275ceSAdam Leventhal 
1355f94275ceSAdam Leventhal 	for (i = 0; i < nmissing_rows; i++) {
1356f94275ceSAdam Leventhal 		used[i] = parity_map[i];
1357f94275ceSAdam Leventhal 	}
1358f94275ceSAdam Leventhal 
1359f94275ceSAdam Leventhal 	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1360f94275ceSAdam Leventhal 		if (tt < nmissing_rows &&
1361f94275ceSAdam Leventhal 		    c == missing_rows[tt] + rm->rm_firstdatacol) {
1362f94275ceSAdam Leventhal 			tt++;
1363f94275ceSAdam Leventhal 			continue;
1364f94275ceSAdam Leventhal 		}
1365f94275ceSAdam Leventhal 
1366f94275ceSAdam Leventhal 		ASSERT3S(i, <, n);
1367f94275ceSAdam Leventhal 		used[i] = c;
1368f94275ceSAdam Leventhal 		i++;
1369f94275ceSAdam Leventhal 	}
1370f94275ceSAdam Leventhal 
1371f94275ceSAdam Leventhal 	/*
1372f94275ceSAdam Leventhal 	 * Initialize the interesting rows of the matrix.
1373f94275ceSAdam Leventhal 	 */
1374f94275ceSAdam Leventhal 	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1375f94275ceSAdam Leventhal 
1376f94275ceSAdam Leventhal 	/*
1377f94275ceSAdam Leventhal 	 * Invert the matrix.
1378f94275ceSAdam Leventhal 	 */
1379f94275ceSAdam Leventhal 	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1380f94275ceSAdam Leventhal 	    invrows, used);
1381f94275ceSAdam Leventhal 
1382f94275ceSAdam Leventhal 	/*
1383f94275ceSAdam Leventhal 	 * Reconstruct the missing data using the generated matrix.
1384f94275ceSAdam Leventhal 	 */
1385f94275ceSAdam Leventhal 	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1386f94275ceSAdam Leventhal 	    invrows, used);
1387f94275ceSAdam Leventhal 
1388f94275ceSAdam Leventhal 	kmem_free(p, psize);
1389f94275ceSAdam Leventhal 
1390f94275ceSAdam Leventhal 	return (code);
1391f94275ceSAdam Leventhal }
1392f94275ceSAdam Leventhal 
1393f94275ceSAdam Leventhal static int
vdev_raidz_reconstruct(raidz_map_t * rm,int * t,int nt)1394f94275ceSAdam Leventhal vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1395f94275ceSAdam Leventhal {
1396f94275ceSAdam Leventhal 	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1397f94275ceSAdam Leventhal 	int ntgts;
1398f94275ceSAdam Leventhal 	int i, c;
1399f94275ceSAdam Leventhal 	int code;
1400f94275ceSAdam Leventhal 	int nbadparity, nbaddata;
1401f94275ceSAdam Leventhal 	int parity_valid[VDEV_RAIDZ_MAXPARITY];
1402f94275ceSAdam Leventhal 
1403f94275ceSAdam Leventhal 	/*
1404f94275ceSAdam Leventhal 	 * The tgts list must already be sorted.
1405f94275ceSAdam Leventhal 	 */
1406f94275ceSAdam Leventhal 	for (i = 1; i < nt; i++) {
1407f94275ceSAdam Leventhal 		ASSERT(t[i] > t[i - 1]);
1408f94275ceSAdam Leventhal 	}
1409f94275ceSAdam Leventhal 
1410f94275ceSAdam Leventhal 	nbadparity = rm->rm_firstdatacol;
1411f94275ceSAdam Leventhal 	nbaddata = rm->rm_cols - nbadparity;
1412f94275ceSAdam Leventhal 	ntgts = 0;
1413f94275ceSAdam Leventhal 	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1414f94275ceSAdam Leventhal 		if (c < rm->rm_firstdatacol)
1415f94275ceSAdam Leventhal 			parity_valid[c] = B_FALSE;
1416f94275ceSAdam Leventhal 
1417f94275ceSAdam Leventhal 		if (i < nt && c == t[i]) {
1418f94275ceSAdam Leventhal 			tgts[ntgts++] = c;
1419f94275ceSAdam Leventhal 			i++;
1420f94275ceSAdam Leventhal 		} else if (rm->rm_col[c].rc_error != 0) {
1421f94275ceSAdam Leventhal 			tgts[ntgts++] = c;
1422f94275ceSAdam Leventhal 		} else if (c >= rm->rm_firstdatacol) {
1423f94275ceSAdam Leventhal 			nbaddata--;
1424f94275ceSAdam Leventhal 		} else {
1425f94275ceSAdam Leventhal 			parity_valid[c] = B_TRUE;
1426f94275ceSAdam Leventhal 			nbadparity--;
1427f94275ceSAdam Leventhal 		}
1428f94275ceSAdam Leventhal 	}
1429f94275ceSAdam Leventhal 
1430f94275ceSAdam Leventhal 	ASSERT(ntgts >= nt);
1431f94275ceSAdam Leventhal 	ASSERT(nbaddata >= 0);
1432f94275ceSAdam Leventhal 	ASSERT(nbaddata + nbadparity == ntgts);
1433f94275ceSAdam Leventhal 
1434f94275ceSAdam Leventhal 	dt = &tgts[nbadparity];
1435f94275ceSAdam Leventhal 
1436f94275ceSAdam Leventhal 	/*
1437f94275ceSAdam Leventhal 	 * See if we can use any of our optimized reconstruction routines.
1438f94275ceSAdam Leventhal 	 */
1439f94275ceSAdam Leventhal 	if (!vdev_raidz_default_to_general) {
1440f94275ceSAdam Leventhal 		switch (nbaddata) {
1441f94275ceSAdam Leventhal 		case 1:
1442f94275ceSAdam Leventhal 			if (parity_valid[VDEV_RAIDZ_P])
1443f94275ceSAdam Leventhal 				return (vdev_raidz_reconstruct_p(rm, dt, 1));
1444f94275ceSAdam Leventhal 
1445f94275ceSAdam Leventhal 			ASSERT(rm->rm_firstdatacol > 1);
1446f94275ceSAdam Leventhal 
1447f94275ceSAdam Leventhal 			if (parity_valid[VDEV_RAIDZ_Q])
1448f94275ceSAdam Leventhal 				return (vdev_raidz_reconstruct_q(rm, dt, 1));
1449f94275ceSAdam Leventhal 
1450f94275ceSAdam Leventhal 			ASSERT(rm->rm_firstdatacol > 2);
1451f94275ceSAdam Leventhal 			break;
1452f94275ceSAdam Leventhal 
1453f94275ceSAdam Leventhal 		case 2:
1454f94275ceSAdam Leventhal 			ASSERT(rm->rm_firstdatacol > 1);
1455f94275ceSAdam Leventhal 
1456f94275ceSAdam Leventhal 			if (parity_valid[VDEV_RAIDZ_P] &&
1457f94275ceSAdam Leventhal 			    parity_valid[VDEV_RAIDZ_Q])
1458f94275ceSAdam Leventhal 				return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1459f94275ceSAdam Leventhal 
1460f94275ceSAdam Leventhal 			ASSERT(rm->rm_firstdatacol > 2);
1461f94275ceSAdam Leventhal 
1462f94275ceSAdam Leventhal 			break;
1463f94275ceSAdam Leventhal 		}
1464f94275ceSAdam Leventhal 	}
1465f94275ceSAdam Leventhal 
1466f94275ceSAdam Leventhal 	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1467f94275ceSAdam Leventhal 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1468f94275ceSAdam Leventhal 	ASSERT(code > 0);
1469f94275ceSAdam Leventhal 	return (code);
1470f94275ceSAdam Leventhal }
147199653d4eSeschrock 
1472fa9e4066Sahrens static int
vdev_raidz_open(vdev_t * vd,uint64_t * asize,uint64_t * max_asize,uint64_t * ashift)14734263d13fSGeorge Wilson vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
14744263d13fSGeorge Wilson     uint64_t *ashift)
1475fa9e4066Sahrens {
1476f94275ceSAdam Leventhal 	vdev_t *cvd;
147799653d4eSeschrock 	uint64_t nparity = vd->vdev_nparity;
1478f94275ceSAdam Leventhal 	int c;
1479fa9e4066Sahrens 	int lasterror = 0;
1480fa9e4066Sahrens 	int numerrors = 0;
1481fa9e4066Sahrens 
148299653d4eSeschrock 	ASSERT(nparity > 0);
148399653d4eSeschrock 
148499653d4eSeschrock 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
148599653d4eSeschrock 	    vd->vdev_children < nparity + 1) {
1486fa9e4066Sahrens 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1487be6fd75aSMatthew Ahrens 		return (SET_ERROR(EINVAL));
1488fa9e4066Sahrens 	}
1489fa9e4066Sahrens 
1490f64c0e34SEric Taylor 	vdev_open_children(vd);
1491fa9e4066Sahrens 
1492f94275ceSAdam Leventhal 	for (c = 0; c < vd->vdev_children; c++) {
1493f94275ceSAdam Leventhal 		cvd = vd->vdev_child[c];
1494f64c0e34SEric Taylor 
1495f94275ceSAdam Leventhal 		if (cvd->vdev_open_error != 0) {
1496f64c0e34SEric Taylor 			lasterror = cvd->vdev_open_error;
1497fa9e4066Sahrens 			numerrors++;
1498fa9e4066Sahrens 			continue;
1499fa9e4066Sahrens 		}
1500fa9e4066Sahrens 
1501fa9e4066Sahrens 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
15024263d13fSGeorge Wilson 		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1503ecc2d604Sbonwick 		*ashift = MAX(*ashift, cvd->vdev_ashift);
1504fa9e4066Sahrens 	}
1505fa9e4066Sahrens 
1506fa9e4066Sahrens 	*asize *= vd->vdev_children;
15074263d13fSGeorge Wilson 	*max_asize *= vd->vdev_children;
1508fa9e4066Sahrens 
150999653d4eSeschrock 	if (numerrors > nparity) {
1510fa9e4066Sahrens 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1511fa9e4066Sahrens 		return (lasterror);
1512fa9e4066Sahrens 	}
1513fa9e4066Sahrens 
1514fa9e4066Sahrens 	return (0);
1515fa9e4066Sahrens }
1516fa9e4066Sahrens 
1517fa9e4066Sahrens static void
vdev_raidz_close(vdev_t * vd)1518fa9e4066Sahrens vdev_raidz_close(vdev_t *vd)
1519fa9e4066Sahrens {
1520f94275ceSAdam Leventhal 	int c;
1521f94275ceSAdam Leventhal 
1522f94275ceSAdam Leventhal 	for (c = 0; c < vd->vdev_children; c++)
1523fa9e4066Sahrens 		vdev_close(vd->vdev_child[c]);
1524fa9e4066Sahrens }
1525fa9e4066Sahrens 
1526810e43b2SBill Pijewski /*
1527810e43b2SBill Pijewski  * Handle a read or write I/O to a RAID-Z dump device.
1528810e43b2SBill Pijewski  *
1529810e43b2SBill Pijewski  * The dump device is in a unique situation compared to other ZFS datasets:
1530810e43b2SBill Pijewski  * writing to this device should be as simple and fast as possible.  In
1531810e43b2SBill Pijewski  * addition, durability matters much less since the dump will be extracted
1532810e43b2SBill Pijewski  * once the machine reboots.  For that reason, this function eschews parity for
1533810e43b2SBill Pijewski  * performance and simplicity.  The dump device uses the checksum setting
1534810e43b2SBill Pijewski  * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1535810e43b2SBill Pijewski  * dataset.
1536810e43b2SBill Pijewski  *
1537810e43b2SBill Pijewski  * Blocks of size 128 KB have been preallocated for this volume.  I/Os less than
1538810e43b2SBill Pijewski  * 128 KB will not fill an entire block; in addition, they may not be properly
1539810e43b2SBill Pijewski  * aligned.  In that case, this function uses the preallocated 128 KB block and
1540810e43b2SBill Pijewski  * omits reading or writing any "empty" portions of that block, as opposed to
1541810e43b2SBill Pijewski  * allocating a fresh appropriately-sized block.
1542810e43b2SBill Pijewski  *
1543810e43b2SBill Pijewski  * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1544810e43b2SBill Pijewski  *
1545810e43b2SBill Pijewski  *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1546810e43b2SBill Pijewski  *
1547810e43b2SBill Pijewski  * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1548810e43b2SBill Pijewski  * allocated which spans all five child vdevs.  8 KB of data would be written to
1549810e43b2SBill Pijewski  * each of four vdevs, with the fifth containing the parity bits.
1550810e43b2SBill Pijewski  *
1551810e43b2SBill Pijewski  *       parity    data     data     data     data
1552810e43b2SBill Pijewski  *     |   PP   |   XX   |   XX   |   XX   |   XX   |
1553810e43b2SBill Pijewski  *         ^        ^        ^        ^        ^
1554810e43b2SBill Pijewski  *         |        |        |        |        |
1555810e43b2SBill Pijewski  *   8 KB parity    ------8 KB data blocks------
1556810e43b2SBill Pijewski  *
1557810e43b2SBill Pijewski  * However, when writing to the dump device, the behavior is different:
1558810e43b2SBill Pijewski  *
1559810e43b2SBill Pijewski  *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1560810e43b2SBill Pijewski  *
1561810e43b2SBill Pijewski  * Unlike the normal RAID-Z case in which the block is allocated based on the
1562810e43b2SBill Pijewski  * I/O size, reads and writes here always use a 128 KB logical I/O size.  If the
1563810e43b2SBill Pijewski  * I/O size is less than 128 KB, only the actual portions of data are written.
1564810e43b2SBill Pijewski  * In this example the data is written to the third data vdev since that vdev
1565810e43b2SBill Pijewski  * contains the offset [64 KB, 96 KB).
1566810e43b2SBill Pijewski  *
1567810e43b2SBill Pijewski  *       parity    data     data     data     data
1568810e43b2SBill Pijewski  *     |        |        |        |   XX   |        |
1569810e43b2SBill Pijewski  *                                    ^
1570810e43b2SBill Pijewski  *                                    |
1571810e43b2SBill Pijewski  *                             32 KB data block
1572810e43b2SBill Pijewski  *
1573810e43b2SBill Pijewski  * As a result, an individual I/O may not span all child vdevs; moreover, a
1574810e43b2SBill Pijewski  * small I/O may only operate on a single child vdev.
1575810e43b2SBill Pijewski  *
1576810e43b2SBill Pijewski  * Note that since there are no parity bits calculated or written, this format
1577810e43b2SBill Pijewski  * remains the same no matter how many parity bits are used in a normal RAID-Z
1578810e43b2SBill Pijewski  * stripe.  On a RAID-Z3 configuration with seven child vdevs, the example above
1579810e43b2SBill Pijewski  * would look like:
1580810e43b2SBill Pijewski  *
1581810e43b2SBill Pijewski  *       parity   parity   parity    data     data     data     data
1582810e43b2SBill Pijewski  *     |        |        |        |        |        |   XX   |        |
1583810e43b2SBill Pijewski  *                                                      ^
1584810e43b2SBill Pijewski  *                                                      |
1585810e43b2SBill Pijewski  *                                               32 KB data block
1586810e43b2SBill Pijewski  */
1587810e43b2SBill Pijewski int
vdev_raidz_physio(vdev_t * vd,caddr_t data,size_t size,uint64_t offset,uint64_t origoffset,boolean_t doread,boolean_t isdump)1588810e43b2SBill Pijewski vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
1589810e43b2SBill Pijewski     uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
1590810e43b2SBill Pijewski {
1591810e43b2SBill Pijewski 	vdev_t *tvd = vd->vdev_top;
1592810e43b2SBill Pijewski 	vdev_t *cvd;
1593810e43b2SBill Pijewski 	raidz_map_t *rm;
1594810e43b2SBill Pijewski 	raidz_col_t *rc;
1595810e43b2SBill Pijewski 	int c, err = 0;
1596810e43b2SBill Pijewski 
1597810e43b2SBill Pijewski 	uint64_t start, end, colstart, colend;
1598810e43b2SBill Pijewski 	uint64_t coloffset, colsize, colskip;
1599810e43b2SBill Pijewski 
1600810e43b2SBill Pijewski 	int flags = doread ? B_READ : B_WRITE;
1601810e43b2SBill Pijewski 
1602810e43b2SBill Pijewski #ifdef	_KERNEL
1603810e43b2SBill Pijewski 
1604810e43b2SBill Pijewski 	/*
1605810e43b2SBill Pijewski 	 * Don't write past the end of the block
1606810e43b2SBill Pijewski 	 */
1607d1a98260SMatthew Ahrens 	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
1608810e43b2SBill Pijewski 
1609810e43b2SBill Pijewski 	start = offset;
1610810e43b2SBill Pijewski 	end = start + size;
1611810e43b2SBill Pijewski 
1612810e43b2SBill Pijewski 	/*
1613810e43b2SBill Pijewski 	 * Allocate a RAID-Z map for this block.  Note that this block starts
1614810e43b2SBill Pijewski 	 * from the "original" offset, this is, the offset of the extent which
1615810e43b2SBill Pijewski 	 * contains the requisite offset of the data being read or written.
1616810e43b2SBill Pijewski 	 *
1617810e43b2SBill Pijewski 	 * Even if this I/O operation doesn't span the full block size, let's
1618810e43b2SBill Pijewski 	 * treat the on-disk format as if the only blocks are the complete 128
1619810e43b2SBill Pijewski 	 * KB size.
1620810e43b2SBill Pijewski 	 */
1621810e43b2SBill Pijewski 	rm = vdev_raidz_map_alloc(data - (offset - origoffset),
1622d1a98260SMatthew Ahrens 	    SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
1623d1a98260SMatthew Ahrens 	    vd->vdev_children, vd->vdev_nparity);
1624810e43b2SBill Pijewski 
1625810e43b2SBill Pijewski 	coloffset = origoffset;
1626810e43b2SBill Pijewski 
1627810e43b2SBill Pijewski 	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
1628810e43b2SBill Pijewski 	    c++, coloffset += rc->rc_size) {
1629810e43b2SBill Pijewski 		rc = &rm->rm_col[c];
1630810e43b2SBill Pijewski 		cvd = vd->vdev_child[rc->rc_devidx];
1631810e43b2SBill Pijewski 
1632810e43b2SBill Pijewski 		/*
1633810e43b2SBill Pijewski 		 * Find the start and end of this column in the RAID-Z map,
1634810e43b2SBill Pijewski 		 * keeping in mind that the stated size and offset of the
1635810e43b2SBill Pijewski 		 * operation may not fill the entire column for this vdev.
1636810e43b2SBill Pijewski 		 *
1637810e43b2SBill Pijewski 		 * If any portion of the data spans this column, issue the
1638810e43b2SBill Pijewski 		 * appropriate operation to the vdev.
1639810e43b2SBill Pijewski 		 */
1640810e43b2SBill Pijewski 		if (coloffset + rc->rc_size <= start)
1641810e43b2SBill Pijewski 			continue;
1642810e43b2SBill Pijewski 		if (coloffset >= end)
1643810e43b2SBill Pijewski 			continue;
1644810e43b2SBill Pijewski 
1645810e43b2SBill Pijewski 		colstart = MAX(coloffset, start);
1646810e43b2SBill Pijewski 		colend = MIN(end, coloffset + rc->rc_size);
1647810e43b2SBill Pijewski 		colsize = colend - colstart;
1648810e43b2SBill Pijewski 		colskip = colstart - coloffset;
1649810e43b2SBill Pijewski 
1650810e43b2SBill Pijewski 		VERIFY3U(colsize, <=, rc->rc_size);
1651810e43b2SBill Pijewski 		VERIFY3U(colskip, <=, rc->rc_size);
1652810e43b2SBill Pijewski 
1653810e43b2SBill Pijewski 		/*
1654810e43b2SBill Pijewski 		 * Note that the child vdev will have a vdev label at the start
1655810e43b2SBill Pijewski 		 * of its range of offsets, hence the need for
1656810e43b2SBill Pijewski 		 * VDEV_LABEL_OFFSET().  See zio_vdev_child_io() for another
1657810e43b2SBill Pijewski 		 * example of why this calculation is needed.
1658810e43b2SBill Pijewski 		 */
1659810e43b2SBill Pijewski 		if ((err = vdev_disk_physio(cvd,
1660810e43b2SBill Pijewski 		    ((char *)rc->rc_data) + colskip, colsize,
1661810e43b2SBill Pijewski 		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
1662810e43b2SBill Pijewski 		    flags, isdump)) != 0)
1663810e43b2SBill Pijewski 			break;
1664810e43b2SBill Pijewski 	}
1665810e43b2SBill Pijewski 
1666810e43b2SBill Pijewski 	vdev_raidz_map_free(rm);
1667810e43b2SBill Pijewski #endif	/* KERNEL */
1668810e43b2SBill Pijewski 
1669810e43b2SBill Pijewski 	return (err);
1670810e43b2SBill Pijewski }
1671810e43b2SBill Pijewski 
1672fa9e4066Sahrens static uint64_t
vdev_raidz_asize(vdev_t * vd,uint64_t psize)1673fa9e4066Sahrens vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1674fa9e4066Sahrens {
1675fa9e4066Sahrens 	uint64_t asize;
1676ecc2d604Sbonwick 	uint64_t ashift = vd->vdev_top->vdev_ashift;
1677fa9e4066Sahrens 	uint64_t cols = vd->vdev_children;
167899653d4eSeschrock 	uint64_t nparity = vd->vdev_nparity;
1679fa9e4066Sahrens 
1680ecc2d604Sbonwick 	asize = ((psize - 1) >> ashift) + 1;
168199653d4eSeschrock 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
168299653d4eSeschrock 	asize = roundup(asize, nparity + 1) << ashift;
1683fa9e4066Sahrens 
1684fa9e4066Sahrens 	return (asize);
1685fa9e4066Sahrens }
1686fa9e4066Sahrens 
1687fa9e4066Sahrens static void
vdev_raidz_child_done(zio_t * zio)1688fa9e4066Sahrens vdev_raidz_child_done(zio_t *zio)
1689fa9e4066Sahrens {
1690fa9e4066Sahrens 	raidz_col_t *rc = zio->io_private;
1691fa9e4066Sahrens 
1692fa9e4066Sahrens 	rc->rc_error = zio->io_error;
1693fa9e4066Sahrens 	rc->rc_tried = 1;
1694fa9e4066Sahrens 	rc->rc_skipped = 0;
1695fa9e4066Sahrens }
1696fa9e4066Sahrens 
16973e30c24aSWill Andrews /*
16983e30c24aSWill Andrews  * Start an IO operation on a RAIDZ VDev
16993e30c24aSWill Andrews  *
17003e30c24aSWill Andrews  * Outline:
17013e30c24aSWill Andrews  * - For write operations:
17023e30c24aSWill Andrews  *   1. Generate the parity data
17033e30c24aSWill Andrews  *   2. Create child zio write operations to each column's vdev, for both
17043e30c24aSWill Andrews  *      data and parity.
17053e30c24aSWill Andrews  *   3. If the column skips any sectors for padding, create optional dummy
17063e30c24aSWill Andrews  *      write zio children for those areas to improve aggregation continuity.
17073e30c24aSWill Andrews  * - For read operations:
17083e30c24aSWill Andrews  *   1. Create child zio read operations to each data column's vdev to read
17093e30c24aSWill Andrews  *      the range of data required for zio.
17103e30c24aSWill Andrews  *   2. If this is a scrub or resilver operation, or if any of the data
17113e30c24aSWill Andrews  *      vdevs have had errors, then create zio read operations to the parity
17123e30c24aSWill Andrews  *      columns' VDevs as well.
17133e30c24aSWill Andrews  */
1714efe6bf49SGeorge Wilson static void
vdev_raidz_io_start(zio_t * zio)1715fa9e4066Sahrens vdev_raidz_io_start(zio_t *zio)
1716fa9e4066Sahrens {
1717fa9e4066Sahrens 	vdev_t *vd = zio->io_vd;
1718ecc2d604Sbonwick 	vdev_t *tvd = vd->vdev_top;
1719fa9e4066Sahrens 	vdev_t *cvd;
1720fa9e4066Sahrens 	raidz_map_t *rm;
1721fa9e4066Sahrens 	raidz_col_t *rc;
1722f94275ceSAdam Leventhal 	int c, i;
1723fa9e4066Sahrens 
1724810e43b2SBill Pijewski 	rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
1725810e43b2SBill Pijewski 	    tvd->vdev_ashift, vd->vdev_children,
172699653d4eSeschrock 	    vd->vdev_nparity);
1727fa9e4066Sahrens 
1728810e43b2SBill Pijewski 	zio->io_vsd = rm;
1729810e43b2SBill Pijewski 	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
1730810e43b2SBill Pijewski 
173144cd46caSbillm 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1732fa9e4066Sahrens 
1733fa9e4066Sahrens 	if (zio->io_type == ZIO_TYPE_WRITE) {
1734f94275ceSAdam Leventhal 		vdev_raidz_generate_parity(rm);
1735fa9e4066Sahrens 
1736fa9e4066Sahrens 		for (c = 0; c < rm->rm_cols; c++) {
1737fa9e4066Sahrens 			rc = &rm->rm_col[c];
173899653d4eSeschrock 			cvd = vd->vdev_child[rc->rc_devidx];
1739fa9e4066Sahrens 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1740fa9e4066Sahrens 			    rc->rc_offset, rc->rc_data, rc->rc_size,
1741e14bb325SJeff Bonwick 			    zio->io_type, zio->io_priority, 0,
1742fa9e4066Sahrens 			    vdev_raidz_child_done, rc));
1743fa9e4066Sahrens 		}
1744e05725b1Sbonwick 
1745f94275ceSAdam Leventhal 		/*
1746f94275ceSAdam Leventhal 		 * Generate optional I/Os for any skipped sectors to improve
1747f94275ceSAdam Leventhal 		 * aggregation contiguity.
1748f94275ceSAdam Leventhal 		 */
17492fbc121fSAdam Leventhal 		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1750f94275ceSAdam Leventhal 			ASSERT(c <= rm->rm_scols);
1751f94275ceSAdam Leventhal 			if (c == rm->rm_scols)
1752f94275ceSAdam Leventhal 				c = 0;
1753f94275ceSAdam Leventhal 			rc = &rm->rm_col[c];
1754f94275ceSAdam Leventhal 			cvd = vd->vdev_child[rc->rc_devidx];
1755f94275ceSAdam Leventhal 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1756f94275ceSAdam Leventhal 			    rc->rc_offset + rc->rc_size, NULL,
1757f94275ceSAdam Leventhal 			    1 << tvd->vdev_ashift,
1758f94275ceSAdam Leventhal 			    zio->io_type, zio->io_priority,
1759f94275ceSAdam Leventhal 			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1760f94275ceSAdam Leventhal 		}
1761f94275ceSAdam Leventhal 
1762efe6bf49SGeorge Wilson 		zio_execute(zio);
1763efe6bf49SGeorge Wilson 		return;
1764fa9e4066Sahrens 	}
1765fa9e4066Sahrens 
1766fa9e4066Sahrens 	ASSERT(zio->io_type == ZIO_TYPE_READ);
1767fa9e4066Sahrens 
176899653d4eSeschrock 	/*
176999653d4eSeschrock 	 * Iterate over the columns in reverse order so that we hit the parity
1770f94275ceSAdam Leventhal 	 * last -- any errors along the way will force us to read the parity.
177199653d4eSeschrock 	 */
1772fa9e4066Sahrens 	for (c = rm->rm_cols - 1; c >= 0; c--) {
1773fa9e4066Sahrens 		rc = &rm->rm_col[c];
177499653d4eSeschrock 		cvd = vd->vdev_child[rc->rc_devidx];
1775*c3a1418dSArne Jansen 		if (cvd->vdev_avoid_read) {
1776*c3a1418dSArne Jansen 			if (c >= rm->rm_firstdatacol)
1777*c3a1418dSArne Jansen 				rm->rm_missingdata++;
1778*c3a1418dSArne Jansen 			else
1779*c3a1418dSArne Jansen 				rm->rm_missingparity++;
1780*c3a1418dSArne Jansen 			rc->rc_error = SET_ERROR(ENXIO);
1781*c3a1418dSArne Jansen 			rc->rc_skipped = 1;	/* only try if necessary */
1782*c3a1418dSArne Jansen 			continue;
1783*c3a1418dSArne Jansen 		}
17840a4e9518Sgw25295 		if (!vdev_readable(cvd)) {
178599653d4eSeschrock 			if (c >= rm->rm_firstdatacol)
178699653d4eSeschrock 				rm->rm_missingdata++;
178799653d4eSeschrock 			else
178899653d4eSeschrock 				rm->rm_missingparity++;
1789be6fd75aSMatthew Ahrens 			rc->rc_error = SET_ERROR(ENXIO);
1790fa9e4066Sahrens 			rc->rc_tried = 1;	/* don't even try */
1791fa9e4066Sahrens 			rc->rc_skipped = 1;
1792fa9e4066Sahrens 			continue;
1793fa9e4066Sahrens 		}
1794b24ab676SJeff Bonwick 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
179599653d4eSeschrock 			if (c >= rm->rm_firstdatacol)
179699653d4eSeschrock 				rm->rm_missingdata++;
179799653d4eSeschrock 			else
179899653d4eSeschrock 				rm->rm_missingparity++;
1799be6fd75aSMatthew Ahrens 			rc->rc_error = SET_ERROR(ESTALE);
1800fa9e4066Sahrens 			rc->rc_skipped = 1;
1801fa9e4066Sahrens 			continue;
1802fa9e4066Sahrens 		}
180399653d4eSeschrock 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1804dfd80e3eSMark J Musante 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1805fa9e4066Sahrens 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1806fa9e4066Sahrens 			    rc->rc_offset, rc->rc_data, rc->rc_size,
1807e14bb325SJeff Bonwick 			    zio->io_type, zio->io_priority, 0,
1808fa9e4066Sahrens 			    vdev_raidz_child_done, rc));
1809fa9e4066Sahrens 		}
1810fa9e4066Sahrens 	}
1811fa9e4066Sahrens 
1812efe6bf49SGeorge Wilson 	zio_execute(zio);
1813fa9e4066Sahrens }
1814fa9e4066Sahrens 
18153f9d6ad7SLin Ling 
1816ea8dc4b6Seschrock /*
1817ea8dc4b6Seschrock  * Report a checksum error for a child of a RAID-Z device.
1818ea8dc4b6Seschrock  */
1819ea8dc4b6Seschrock static void
raidz_checksum_error(zio_t * zio,raidz_col_t * rc,void * bad_data)182022fe2c88SJonathan Adams raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1821ea8dc4b6Seschrock {
182299653d4eSeschrock 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1823ea8dc4b6Seschrock 
1824ea8dc4b6Seschrock 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
182522fe2c88SJonathan Adams 		zio_bad_cksum_t zbc;
182622fe2c88SJonathan Adams 		raidz_map_t *rm = zio->io_vsd;
182722fe2c88SJonathan Adams 
1828ea8dc4b6Seschrock 		mutex_enter(&vd->vdev_stat_lock);
1829ea8dc4b6Seschrock 		vd->vdev_stat.vs_checksum_errors++;
1830ea8dc4b6Seschrock 		mutex_exit(&vd->vdev_stat_lock);
183122fe2c88SJonathan Adams 
183222fe2c88SJonathan Adams 		zbc.zbc_has_cksum = 0;
183322fe2c88SJonathan Adams 		zbc.zbc_injected = rm->rm_ecksuminjected;
183422fe2c88SJonathan Adams 
183522fe2c88SJonathan Adams 		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
183622fe2c88SJonathan Adams 		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
183722fe2c88SJonathan Adams 		    &zbc);
183822fe2c88SJonathan Adams 	}
1839ea8dc4b6Seschrock }
1840ea8dc4b6Seschrock 
184122fe2c88SJonathan Adams /*
184222fe2c88SJonathan Adams  * We keep track of whether or not there were any injected errors, so that
184322fe2c88SJonathan Adams  * any ereports we generate can note it.
184422fe2c88SJonathan Adams  */
184522fe2c88SJonathan Adams static int
raidz_checksum_verify(zio_t * zio)184622fe2c88SJonathan Adams raidz_checksum_verify(zio_t *zio)
184722fe2c88SJonathan Adams {
184822fe2c88SJonathan Adams 	zio_bad_cksum_t zbc;
184922fe2c88SJonathan Adams 	raidz_map_t *rm = zio->io_vsd;
185022fe2c88SJonathan Adams 
185122fe2c88SJonathan Adams 	int ret = zio_checksum_error(zio, &zbc);
185222fe2c88SJonathan Adams 	if (ret != 0 && zbc.zbc_injected != 0)
185322fe2c88SJonathan Adams 		rm->rm_ecksuminjected = 1;
185422fe2c88SJonathan Adams 
185522fe2c88SJonathan Adams 	return (ret);
1856ea8dc4b6Seschrock }
1857ea8dc4b6Seschrock 
185899653d4eSeschrock /*
185999653d4eSeschrock  * Generate the parity from the data columns. If we tried and were able to
186099653d4eSeschrock  * read the parity without error, verify that the generated parity matches the
186199653d4eSeschrock  * data we read. If it doesn't, we fire off a checksum error. Return the
186299653d4eSeschrock  * number such failures.
186399653d4eSeschrock  */
186499653d4eSeschrock static int
raidz_parity_verify(zio_t * zio,raidz_map_t * rm)186599653d4eSeschrock raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
186699653d4eSeschrock {
186799653d4eSeschrock 	void *orig[VDEV_RAIDZ_MAXPARITY];
186899653d4eSeschrock 	int c, ret = 0;
186999653d4eSeschrock 	raidz_col_t *rc;
187099653d4eSeschrock 
1871810e43b2SBill Pijewski 	blkptr_t *bp = zio->io_bp;
1872810e43b2SBill Pijewski 	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
1873810e43b2SBill Pijewski 	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
1874810e43b2SBill Pijewski 
1875810e43b2SBill Pijewski 	if (checksum == ZIO_CHECKSUM_NOPARITY)
1876810e43b2SBill Pijewski 		return (ret);
1877810e43b2SBill Pijewski 
187899653d4eSeschrock 	for (c = 0; c < rm->rm_firstdatacol; c++) {
187999653d4eSeschrock 		rc = &rm->rm_col[c];
188099653d4eSeschrock 		if (!rc->rc_tried || rc->rc_error != 0)
188199653d4eSeschrock 			continue;
188299653d4eSeschrock 		orig[c] = zio_buf_alloc(rc->rc_size);
188399653d4eSeschrock 		bcopy(rc->rc_data, orig[c], rc->rc_size);
188499653d4eSeschrock 	}
188599653d4eSeschrock 
1886f94275ceSAdam Leventhal 	vdev_raidz_generate_parity(rm);
188799653d4eSeschrock 
188899653d4eSeschrock 	for (c = 0; c < rm->rm_firstdatacol; c++) {
188999653d4eSeschrock 		rc = &rm->rm_col[c];
189099653d4eSeschrock 		if (!rc->rc_tried || rc->rc_error != 0)
189199653d4eSeschrock 			continue;
189299653d4eSeschrock 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
189322fe2c88SJonathan Adams 			raidz_checksum_error(zio, rc, orig[c]);
1894be6fd75aSMatthew Ahrens 			rc->rc_error = SET_ERROR(ECKSUM);
189599653d4eSeschrock 			ret++;
189699653d4eSeschrock 		}
189799653d4eSeschrock 		zio_buf_free(orig[c], rc->rc_size);
189899653d4eSeschrock 	}
189999653d4eSeschrock 
190099653d4eSeschrock 	return (ret);
190199653d4eSeschrock }
190299653d4eSeschrock 
1903f94275ceSAdam Leventhal /*
1904f94275ceSAdam Leventhal  * Keep statistics on all the ways that we used parity to correct data.
1905f94275ceSAdam Leventhal  */
1906f94275ceSAdam Leventhal static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1907ea8dc4b6Seschrock 
1908e05725b1Sbonwick static int
vdev_raidz_worst_error(raidz_map_t * rm)1909e14bb325SJeff Bonwick vdev_raidz_worst_error(raidz_map_t *rm)
1910e14bb325SJeff Bonwick {
1911e14bb325SJeff Bonwick 	int error = 0;
1912e14bb325SJeff Bonwick 
1913e14bb325SJeff Bonwick 	for (int c = 0; c < rm->rm_cols; c++)
1914e14bb325SJeff Bonwick 		error = zio_worst_error(error, rm->rm_col[c].rc_error);
1915e14bb325SJeff Bonwick 
1916e14bb325SJeff Bonwick 	return (error);
1917e14bb325SJeff Bonwick }
1918e14bb325SJeff Bonwick 
1919f94275ceSAdam Leventhal /*
1920f94275ceSAdam Leventhal  * Iterate over all combinations of bad data and attempt a reconstruction.
1921f94275ceSAdam Leventhal  * Note that the algorithm below is non-optimal because it doesn't take into
1922f94275ceSAdam Leventhal  * account how reconstruction is actually performed. For example, with
1923f94275ceSAdam Leventhal  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1924f94275ceSAdam Leventhal  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1925f94275ceSAdam Leventhal  * cases we'd only use parity information in column 0.
1926f94275ceSAdam Leventhal  */
1927f94275ceSAdam Leventhal static int
vdev_raidz_combrec(zio_t * zio,int total_errors,int data_errors)1928f94275ceSAdam Leventhal vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1929f94275ceSAdam Leventhal {
1930f94275ceSAdam Leventhal 	raidz_map_t *rm = zio->io_vsd;
1931f94275ceSAdam Leventhal 	raidz_col_t *rc;
1932f94275ceSAdam Leventhal 	void *orig[VDEV_RAIDZ_MAXPARITY];
1933f94275ceSAdam Leventhal 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1934f94275ceSAdam Leventhal 	int *tgts = &tstore[1];
1935f94275ceSAdam Leventhal 	int current, next, i, c, n;
1936f94275ceSAdam Leventhal 	int code, ret = 0;
1937f94275ceSAdam Leventhal 
1938f94275ceSAdam Leventhal 	ASSERT(total_errors < rm->rm_firstdatacol);
1939f94275ceSAdam Leventhal 
1940f94275ceSAdam Leventhal 	/*
1941f94275ceSAdam Leventhal 	 * This simplifies one edge condition.
1942f94275ceSAdam Leventhal 	 */
1943f94275ceSAdam Leventhal 	tgts[-1] = -1;
1944f94275ceSAdam Leventhal 
1945f94275ceSAdam Leventhal 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1946f94275ceSAdam Leventhal 		/*
1947f94275ceSAdam Leventhal 		 * Initialize the targets array by finding the first n columns
1948f94275ceSAdam Leventhal 		 * that contain no error.
1949f94275ceSAdam Leventhal 		 *
1950f94275ceSAdam Leventhal 		 * If there were no data errors, we need to ensure that we're
1951f94275ceSAdam Leventhal 		 * always explicitly attempting to reconstruct at least one
1952f94275ceSAdam Leventhal 		 * data column. To do this, we simply push the highest target
1953f94275ceSAdam Leventhal 		 * up into the data columns.
1954f94275ceSAdam Leventhal 		 */
1955f94275ceSAdam Leventhal 		for (c = 0, i = 0; i < n; i++) {
1956f94275ceSAdam Leventhal 			if (i == n - 1 && data_errors == 0 &&
1957f94275ceSAdam Leventhal 			    c < rm->rm_firstdatacol) {
1958f94275ceSAdam Leventhal 				c = rm->rm_firstdatacol;
1959f94275ceSAdam Leventhal 			}
1960f94275ceSAdam Leventhal 
1961f94275ceSAdam Leventhal 			while (rm->rm_col[c].rc_error != 0) {
1962f94275ceSAdam Leventhal 				c++;
1963f94275ceSAdam Leventhal 				ASSERT3S(c, <, rm->rm_cols);
1964f94275ceSAdam Leventhal 			}
1965f94275ceSAdam Leventhal 
1966f94275ceSAdam Leventhal 			tgts[i] = c++;
1967f94275ceSAdam Leventhal 		}
1968f94275ceSAdam Leventhal 
1969f94275ceSAdam Leventhal 		/*
1970f94275ceSAdam Leventhal 		 * Setting tgts[n] simplifies the other edge condition.
1971f94275ceSAdam Leventhal 		 */
1972f94275ceSAdam Leventhal 		tgts[n] = rm->rm_cols;
1973f94275ceSAdam Leventhal 
1974f94275ceSAdam Leventhal 		/*
1975f94275ceSAdam Leventhal 		 * These buffers were allocated in previous iterations.
1976f94275ceSAdam Leventhal 		 */
1977f94275ceSAdam Leventhal 		for (i = 0; i < n - 1; i++) {
1978f94275ceSAdam Leventhal 			ASSERT(orig[i] != NULL);
1979f94275ceSAdam Leventhal 		}
1980f94275ceSAdam Leventhal 
1981f94275ceSAdam Leventhal 		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
1982f94275ceSAdam Leventhal 
1983f94275ceSAdam Leventhal 		current = 0;
1984f94275ceSAdam Leventhal 		next = tgts[current];
1985f94275ceSAdam Leventhal 
1986f94275ceSAdam Leventhal 		while (current != n) {
1987f94275ceSAdam Leventhal 			tgts[current] = next;
1988f94275ceSAdam Leventhal 			current = 0;
1989f94275ceSAdam Leventhal 
1990f94275ceSAdam Leventhal 			/*
1991f94275ceSAdam Leventhal 			 * Save off the original data that we're going to
1992f94275ceSAdam Leventhal 			 * attempt to reconstruct.
1993f94275ceSAdam Leventhal 			 */
1994f94275ceSAdam Leventhal 			for (i = 0; i < n; i++) {
1995f94275ceSAdam Leventhal 				ASSERT(orig[i] != NULL);
1996f94275ceSAdam Leventhal 				c = tgts[i];
1997f94275ceSAdam Leventhal 				ASSERT3S(c, >=, 0);
1998f94275ceSAdam Leventhal 				ASSERT3S(c, <, rm->rm_cols);
1999f94275ceSAdam Leventhal 				rc = &rm->rm_col[c];
2000f94275ceSAdam Leventhal 				bcopy(rc->rc_data, orig[i], rc->rc_size);
2001f94275ceSAdam Leventhal 			}
2002f94275ceSAdam Leventhal 
2003f94275ceSAdam Leventhal 			/*
2004f94275ceSAdam Leventhal 			 * Attempt a reconstruction and exit the outer loop on
2005f94275ceSAdam Leventhal 			 * success.
2006f94275ceSAdam Leventhal 			 */
2007f94275ceSAdam Leventhal 			code = vdev_raidz_reconstruct(rm, tgts, n);
200822fe2c88SJonathan Adams 			if (raidz_checksum_verify(zio) == 0) {
2009f94275ceSAdam Leventhal 				atomic_inc_64(&raidz_corrected[code]);
2010f94275ceSAdam Leventhal 
2011f94275ceSAdam Leventhal 				for (i = 0; i < n; i++) {
2012f94275ceSAdam Leventhal 					c = tgts[i];
2013f94275ceSAdam Leventhal 					rc = &rm->rm_col[c];
2014f94275ceSAdam Leventhal 					ASSERT(rc->rc_error == 0);
201522fe2c88SJonathan Adams 					if (rc->rc_tried)
201622fe2c88SJonathan Adams 						raidz_checksum_error(zio, rc,
201722fe2c88SJonathan Adams 						    orig[i]);
2018be6fd75aSMatthew Ahrens 					rc->rc_error = SET_ERROR(ECKSUM);
2019f94275ceSAdam Leventhal 				}
2020f94275ceSAdam Leventhal 
2021f94275ceSAdam Leventhal 				ret = code;
2022f94275ceSAdam Leventhal 				goto done;
2023f94275ceSAdam Leventhal 			}
2024f94275ceSAdam Leventhal 
2025f94275ceSAdam Leventhal 			/*
2026f94275ceSAdam Leventhal 			 * Restore the original data.
2027f94275ceSAdam Leventhal 			 */
2028f94275ceSAdam Leventhal 			for (i = 0; i < n; i++) {
2029f94275ceSAdam Leventhal 				c = tgts[i];
2030f94275ceSAdam Leventhal 				rc = &rm->rm_col[c];
2031f94275ceSAdam Leventhal 				bcopy(orig[i], rc->rc_data, rc->rc_size);
2032f94275ceSAdam Leventhal 			}
2033f94275ceSAdam Leventhal 
2034f94275ceSAdam Leventhal 			do {
2035f94275ceSAdam Leventhal 				/*
2036f94275ceSAdam Leventhal 				 * Find the next valid column after the current
2037f94275ceSAdam Leventhal 				 * position..
2038f94275ceSAdam Leventhal 				 */
2039f94275ceSAdam Leventhal 				for (next = tgts[current] + 1;
2040f94275ceSAdam Leventhal 				    next < rm->rm_cols &&
2041f94275ceSAdam Leventhal 				    rm->rm_col[next].rc_error != 0; next++)
2042f94275ceSAdam Leventhal 					continue;
2043f94275ceSAdam Leventhal 
2044f94275ceSAdam Leventhal 				ASSERT(next <= tgts[current + 1]);
2045f94275ceSAdam Leventhal 
2046f94275ceSAdam Leventhal 				/*
2047f94275ceSAdam Leventhal 				 * If that spot is available, we're done here.
2048f94275ceSAdam Leventhal 				 */
2049f94275ceSAdam Leventhal 				if (next != tgts[current + 1])
2050f94275ceSAdam Leventhal 					break;
2051f94275ceSAdam Leventhal 
2052f94275ceSAdam Leventhal 				/*
2053f94275ceSAdam Leventhal 				 * Otherwise, find the next valid column after
2054f94275ceSAdam Leventhal 				 * the previous position.
2055f94275ceSAdam Leventhal 				 */
2056f94275ceSAdam Leventhal 				for (c = tgts[current - 1] + 1;
2057f94275ceSAdam Leventhal 				    rm->rm_col[c].rc_error != 0; c++)
2058f94275ceSAdam Leventhal 					continue;
2059f94275ceSAdam Leventhal 
2060f94275ceSAdam Leventhal 				tgts[current] = c;
2061f94275ceSAdam Leventhal 				current++;
2062f94275ceSAdam Leventhal 
2063f94275ceSAdam Leventhal 			} while (current != n);
2064f94275ceSAdam Leventhal 		}
2065f94275ceSAdam Leventhal 	}
2066f94275ceSAdam Leventhal 	n--;
2067f94275ceSAdam Leventhal done:
2068f94275ceSAdam Leventhal 	for (i = 0; i < n; i++) {
2069f94275ceSAdam Leventhal 		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
2070f94275ceSAdam Leventhal 	}
2071f94275ceSAdam Leventhal 
2072f94275ceSAdam Leventhal 	return (ret);
2073f94275ceSAdam Leventhal }
2074f94275ceSAdam Leventhal 
20753e30c24aSWill Andrews /*
20763e30c24aSWill Andrews  * Complete an IO operation on a RAIDZ VDev
20773e30c24aSWill Andrews  *
20783e30c24aSWill Andrews  * Outline:
20793e30c24aSWill Andrews  * - For write operations:
20803e30c24aSWill Andrews  *   1. Check for errors on the child IOs.
20813e30c24aSWill Andrews  *   2. Return, setting an error code if too few child VDevs were written
20823e30c24aSWill Andrews  *      to reconstruct the data later.  Note that partial writes are
20833e30c24aSWill Andrews  *      considered successful if they can be reconstructed at all.
20843e30c24aSWill Andrews  * - For read operations:
20853e30c24aSWill Andrews  *   1. Check for errors on the child IOs.
20863e30c24aSWill Andrews  *   2. If data errors occurred:
20873e30c24aSWill Andrews  *      a. Try to reassemble the data from the parity available.
20883e30c24aSWill Andrews  *      b. If we haven't yet read the parity drives, read them now.
20893e30c24aSWill Andrews  *      c. If all parity drives have been read but the data still doesn't
20903e30c24aSWill Andrews  *         reassemble with a correct checksum, then try combinatorial
20913e30c24aSWill Andrews  *         reconstruction.
20923e30c24aSWill Andrews  *      d. If that doesn't work, return an error.
20933e30c24aSWill Andrews  *   3. If there were unexpected errors or this is a resilver operation,
20943e30c24aSWill Andrews  *      rewrite the vdevs that had errors.
20953e30c24aSWill Andrews  */
2096e14bb325SJeff Bonwick static void
vdev_raidz_io_done(zio_t * zio)2097fa9e4066Sahrens vdev_raidz_io_done(zio_t *zio)
2098fa9e4066Sahrens {
2099fa9e4066Sahrens 	vdev_t *vd = zio->io_vd;
2100fa9e4066Sahrens 	vdev_t *cvd;
2101fa9e4066Sahrens 	raidz_map_t *rm = zio->io_vsd;
2102f94275ceSAdam Leventhal 	raidz_col_t *rc;
2103fa9e4066Sahrens 	int unexpected_errors = 0;
210499653d4eSeschrock 	int parity_errors = 0;
2105c7a40cc4Sahl 	int parity_untried = 0;
210699653d4eSeschrock 	int data_errors = 0;
2107e14bb325SJeff Bonwick 	int total_errors = 0;
2108f94275ceSAdam Leventhal 	int n, c;
2109f94275ceSAdam Leventhal 	int tgts[VDEV_RAIDZ_MAXPARITY];
2110f94275ceSAdam Leventhal 	int code;
2111fa9e4066Sahrens 
211244cd46caSbillm 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
2113fa9e4066Sahrens 
211499653d4eSeschrock 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
211599653d4eSeschrock 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
211699653d4eSeschrock 
2117fa9e4066Sahrens 	for (c = 0; c < rm->rm_cols; c++) {
2118fa9e4066Sahrens 		rc = &rm->rm_col[c];
2119fa9e4066Sahrens 
2120fa9e4066Sahrens 		if (rc->rc_error) {
2121e14bb325SJeff Bonwick 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
212299653d4eSeschrock 
212399653d4eSeschrock 			if (c < rm->rm_firstdatacol)
212499653d4eSeschrock 				parity_errors++;
212599653d4eSeschrock 			else
212699653d4eSeschrock 				data_errors++;
212799653d4eSeschrock 
2128fa9e4066Sahrens 			if (!rc->rc_skipped)
2129fa9e4066Sahrens 				unexpected_errors++;
213099653d4eSeschrock 
2131e14bb325SJeff Bonwick 			total_errors++;
2132c7a40cc4Sahl 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
2133c7a40cc4Sahl 			parity_untried++;
2134fa9e4066Sahrens 		}
2135fa9e4066Sahrens 	}
2136fa9e4066Sahrens 
2137fa9e4066Sahrens 	if (zio->io_type == ZIO_TYPE_WRITE) {
2138fa9e4066Sahrens 		/*
2139e14bb325SJeff Bonwick 		 * XXX -- for now, treat partial writes as a success.
2140e14bb325SJeff Bonwick 		 * (If we couldn't write enough columns to reconstruct
2141e14bb325SJeff Bonwick 		 * the data, the I/O failed.  Otherwise, good enough.)
2142e14bb325SJeff Bonwick 		 *
2143e14bb325SJeff Bonwick 		 * Now that we support write reallocation, it would be better
2144e14bb325SJeff Bonwick 		 * to treat partial failure as real failure unless there are
2145e14bb325SJeff Bonwick 		 * no non-degraded top-level vdevs left, and not update DTLs
2146e14bb325SJeff Bonwick 		 * if we intend to reallocate.
2147fa9e4066Sahrens 		 */
2148fa9e4066Sahrens 		/* XXPOLICY */
2149e14bb325SJeff Bonwick 		if (total_errors > rm->rm_firstdatacol)
2150e14bb325SJeff Bonwick 			zio->io_error = vdev_raidz_worst_error(rm);
2151fa9e4066Sahrens 
2152e14bb325SJeff Bonwick 		return;
2153fa9e4066Sahrens 	}
2154fa9e4066Sahrens 
2155fa9e4066Sahrens 	ASSERT(zio->io_type == ZIO_TYPE_READ);
215699653d4eSeschrock 	/*
215799653d4eSeschrock 	 * There are three potential phases for a read:
215899653d4eSeschrock 	 *	1. produce valid data from the columns read
215999653d4eSeschrock 	 *	2. read all disks and try again
216099653d4eSeschrock 	 *	3. perform combinatorial reconstruction
216199653d4eSeschrock 	 *
216299653d4eSeschrock 	 * Each phase is progressively both more expensive and less likely to
216399653d4eSeschrock 	 * occur. If we encounter more errors than we can repair or all phases
216499653d4eSeschrock 	 * fail, we have no choice but to return an error.
216599653d4eSeschrock 	 */
2166fa9e4066Sahrens 
2167fa9e4066Sahrens 	/*
216899653d4eSeschrock 	 * If the number of errors we saw was correctable -- less than or equal
2169c7a40cc4Sahl 	 * to the number of parity disks read -- attempt to produce data that
2170c7a40cc4Sahl 	 * has a valid checksum. Naturally, this case applies in the absence of
2171c7a40cc4Sahl 	 * any errors.
2172fa9e4066Sahrens 	 */
2173e14bb325SJeff Bonwick 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
2174f94275ceSAdam Leventhal 		if (data_errors == 0) {
217522fe2c88SJonathan Adams 			if (raidz_checksum_verify(zio) == 0) {
2176d427dcb0Sahl 				/*
2177d427dcb0Sahl 				 * If we read parity information (unnecessarily
2178d427dcb0Sahl 				 * as it happens since no reconstruction was
2179d427dcb0Sahl 				 * needed) regenerate and verify the parity.
2180d427dcb0Sahl 				 * We also regenerate parity when resilvering
2181d427dcb0Sahl 				 * so we can write it out to the failed device
2182d427dcb0Sahl 				 * later.
2183d427dcb0Sahl 				 */
2184c7a40cc4Sahl 				if (parity_errors + parity_untried <
2185d427dcb0Sahl 				    rm->rm_firstdatacol ||
2186d427dcb0Sahl 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
218799653d4eSeschrock 					n = raidz_parity_verify(zio, rm);
218899653d4eSeschrock 					unexpected_errors += n;
218999653d4eSeschrock 					ASSERT(parity_errors + n <=
219099653d4eSeschrock 					    rm->rm_firstdatacol);
2191c7a40cc4Sahl 				}
2192fa9e4066Sahrens 				goto done;
2193fa9e4066Sahrens 			}
2194f94275ceSAdam Leventhal 		} else {
2195c7a40cc4Sahl 			/*
2196c7a40cc4Sahl 			 * We either attempt to read all the parity columns or
2197c7a40cc4Sahl 			 * none of them. If we didn't try to read parity, we
2198c7a40cc4Sahl 			 * wouldn't be here in the correctable case. There must
2199c7a40cc4Sahl 			 * also have been fewer parity errors than parity
2200c7a40cc4Sahl 			 * columns or, again, we wouldn't be in this code path.
2201c7a40cc4Sahl 			 */
2202c7a40cc4Sahl 			ASSERT(parity_untried == 0);
220399653d4eSeschrock 			ASSERT(parity_errors < rm->rm_firstdatacol);
220499653d4eSeschrock 
220599653d4eSeschrock 			/*
2206f94275ceSAdam Leventhal 			 * Identify the data columns that reported an error.
220799653d4eSeschrock 			 */
2208f94275ceSAdam Leventhal 			n = 0;
220999653d4eSeschrock 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
221099653d4eSeschrock 				rc = &rm->rm_col[c];
2211f94275ceSAdam Leventhal 				if (rc->rc_error != 0) {
2212f94275ceSAdam Leventhal 					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2213f94275ceSAdam Leventhal 					tgts[n++] = c;
221499653d4eSeschrock 				}
2215f94275ceSAdam Leventhal 			}
221699653d4eSeschrock 
2217f94275ceSAdam Leventhal 			ASSERT(rm->rm_firstdatacol >= n);
2218f94275ceSAdam Leventhal 
2219f94275ceSAdam Leventhal 			code = vdev_raidz_reconstruct(rm, tgts, n);
222099653d4eSeschrock 
222122fe2c88SJonathan Adams 			if (raidz_checksum_verify(zio) == 0) {
2222f94275ceSAdam Leventhal 				atomic_inc_64(&raidz_corrected[code]);
222399653d4eSeschrock 
222499653d4eSeschrock 				/*
2225f94275ceSAdam Leventhal 				 * If we read more parity disks than were used
2226f94275ceSAdam Leventhal 				 * for reconstruction, confirm that the other
2227f94275ceSAdam Leventhal 				 * parity disks produced correct data. This
2228f94275ceSAdam Leventhal 				 * routine is suboptimal in that it regenerates
2229f94275ceSAdam Leventhal 				 * the parity that we already used in addition
2230f94275ceSAdam Leventhal 				 * to the parity that we're attempting to
2231f94275ceSAdam Leventhal 				 * verify, but this should be a relatively
2232f94275ceSAdam Leventhal 				 * uncommon case, and can be optimized if it
2233f94275ceSAdam Leventhal 				 * becomes a problem. Note that we regenerate
2234f94275ceSAdam Leventhal 				 * parity when resilvering so we can write it
2235f94275ceSAdam Leventhal 				 * out to failed devices later.
223699653d4eSeschrock 				 */
2237f94275ceSAdam Leventhal 				if (parity_errors < rm->rm_firstdatacol - n ||
2238d427dcb0Sahl 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
223999653d4eSeschrock 					n = raidz_parity_verify(zio, rm);
224099653d4eSeschrock 					unexpected_errors += n;
224199653d4eSeschrock 					ASSERT(parity_errors + n <=
224299653d4eSeschrock 					    rm->rm_firstdatacol);
224399653d4eSeschrock 				}
224499653d4eSeschrock 
224599653d4eSeschrock 				goto done;
224699653d4eSeschrock 			}
224799653d4eSeschrock 		}
2248fa9e4066Sahrens 	}
2249fa9e4066Sahrens 
2250fa9e4066Sahrens 	/*
225199653d4eSeschrock 	 * This isn't a typical situation -- either we got a read error or
225299653d4eSeschrock 	 * a child silently returned bad data. Read every block so we can
225399653d4eSeschrock 	 * try again with as much data and parity as we can track down. If
225499653d4eSeschrock 	 * we've already been through once before, all children will be marked
225599653d4eSeschrock 	 * as tried so we'll proceed to combinatorial reconstruction.
2256fa9e4066Sahrens 	 */
2257fa9e4066Sahrens 	unexpected_errors = 1;
225899653d4eSeschrock 	rm->rm_missingdata = 0;
225999653d4eSeschrock 	rm->rm_missingparity = 0;
2260fa9e4066Sahrens 
226199653d4eSeschrock 	for (c = 0; c < rm->rm_cols; c++) {
226299653d4eSeschrock 		if (rm->rm_col[c].rc_tried)
226399653d4eSeschrock 			continue;
2264fa9e4066Sahrens 
2265fa9e4066Sahrens 		zio_vdev_io_redone(zio);
226699653d4eSeschrock 		do {
2267fa9e4066Sahrens 			rc = &rm->rm_col[c];
2268fa9e4066Sahrens 			if (rc->rc_tried)
2269fa9e4066Sahrens 				continue;
2270fa9e4066Sahrens 			zio_nowait(zio_vdev_child_io(zio, NULL,
227199653d4eSeschrock 			    vd->vdev_child[rc->rc_devidx],
2272fa9e4066Sahrens 			    rc->rc_offset, rc->rc_data, rc->rc_size,
2273e14bb325SJeff Bonwick 			    zio->io_type, zio->io_priority, 0,
2274fa9e4066Sahrens 			    vdev_raidz_child_done, rc));
227599653d4eSeschrock 		} while (++c < rm->rm_cols);
2276e05725b1Sbonwick 
2277e14bb325SJeff Bonwick 		return;
2278fa9e4066Sahrens 	}
2279fa9e4066Sahrens 
2280fa9e4066Sahrens 	/*
228199653d4eSeschrock 	 * At this point we've attempted to reconstruct the data given the
228299653d4eSeschrock 	 * errors we detected, and we've attempted to read all columns. There
228399653d4eSeschrock 	 * must, therefore, be one or more additional problems -- silent errors
228499653d4eSeschrock 	 * resulting in invalid data rather than explicit I/O errors resulting
2285f94275ceSAdam Leventhal 	 * in absent data. We check if there is enough additional data to
2286f94275ceSAdam Leventhal 	 * possibly reconstruct the data and then perform combinatorial
2287f94275ceSAdam Leventhal 	 * reconstruction over all possible combinations. If that fails,
2288f94275ceSAdam Leventhal 	 * we're cooked.
2289fa9e4066Sahrens 	 */
229022fe2c88SJonathan Adams 	if (total_errors > rm->rm_firstdatacol) {
2291e14bb325SJeff Bonwick 		zio->io_error = vdev_raidz_worst_error(rm);
2292fa9e4066Sahrens 
229322fe2c88SJonathan Adams 	} else if (total_errors < rm->rm_firstdatacol &&
229422fe2c88SJonathan Adams 	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2295fa9e4066Sahrens 		/*
2296f94275ceSAdam Leventhal 		 * If we didn't use all the available parity for the
2297f94275ceSAdam Leventhal 		 * combinatorial reconstruction, verify that the remaining
2298f94275ceSAdam Leventhal 		 * parity is correct.
2299fa9e4066Sahrens 		 */
2300f94275ceSAdam Leventhal 		if (code != (1 << rm->rm_firstdatacol) - 1)
2301f94275ceSAdam Leventhal 			(void) raidz_parity_verify(zio, rm);
2302f94275ceSAdam Leventhal 	} else {
2303fa9e4066Sahrens 		/*
230422fe2c88SJonathan Adams 		 * We're here because either:
230522fe2c88SJonathan Adams 		 *
230622fe2c88SJonathan Adams 		 *	total_errors == rm_first_datacol, or
230722fe2c88SJonathan Adams 		 *	vdev_raidz_combrec() failed
230822fe2c88SJonathan Adams 		 *
230922fe2c88SJonathan Adams 		 * In either case, there is enough bad data to prevent
231022fe2c88SJonathan Adams 		 * reconstruction.
231122fe2c88SJonathan Adams 		 *
231222fe2c88SJonathan Adams 		 * Start checksum ereports for all children which haven't
23136e1f5caaSNeil Perrin 		 * failed, provided the IO wasn't speculative.
2314fa9e4066Sahrens 		 */
2315be6fd75aSMatthew Ahrens 		zio->io_error = SET_ERROR(ECKSUM);
2316e14bb325SJeff Bonwick 
23176e1f5caaSNeil Perrin 		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2318ea8dc4b6Seschrock 			for (c = 0; c < rm->rm_cols; c++) {
2319ea8dc4b6Seschrock 				rc = &rm->rm_col[c];
232022fe2c88SJonathan Adams 				if (rc->rc_error == 0) {
232122fe2c88SJonathan Adams 					zio_bad_cksum_t zbc;
232222fe2c88SJonathan Adams 					zbc.zbc_has_cksum = 0;
23236e1f5caaSNeil Perrin 					zbc.zbc_injected =
23246e1f5caaSNeil Perrin 					    rm->rm_ecksuminjected;
232522fe2c88SJonathan Adams 
232622fe2c88SJonathan Adams 					zfs_ereport_start_checksum(
23276e1f5caaSNeil Perrin 					    zio->io_spa,
23286e1f5caaSNeil Perrin 					    vd->vdev_child[rc->rc_devidx],
232922fe2c88SJonathan Adams 					    zio, rc->rc_offset, rc->rc_size,
233022fe2c88SJonathan Adams 					    (void *)(uintptr_t)c, &zbc);
2331f94275ceSAdam Leventhal 				}
2332ea8dc4b6Seschrock 			}
2333ea8dc4b6Seschrock 		}
23346e1f5caaSNeil Perrin 	}
2335fa9e4066Sahrens 
2336fa9e4066Sahrens done:
2337fa9e4066Sahrens 	zio_checksum_verified(zio);
2338fa9e4066Sahrens 
23398ad4d6ddSJeff Bonwick 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2340fa9e4066Sahrens 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2341fa9e4066Sahrens 		/*
2342fa9e4066Sahrens 		 * Use the good data we have in hand to repair damaged children.
2343fa9e4066Sahrens 		 */
2344fa9e4066Sahrens 		for (c = 0; c < rm->rm_cols; c++) {
2345fa9e4066Sahrens 			rc = &rm->rm_col[c];
234699653d4eSeschrock 			cvd = vd->vdev_child[rc->rc_devidx];
2347fa9e4066Sahrens 
2348ecc2d604Sbonwick 			if (rc->rc_error == 0)
2349ecc2d604Sbonwick 				continue;
2350fa9e4066Sahrens 
2351e14bb325SJeff Bonwick 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2352ecc2d604Sbonwick 			    rc->rc_offset, rc->rc_data, rc->rc_size,
235369962b56SMatthew Ahrens 			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
23548ad4d6ddSJeff Bonwick 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
23558ad4d6ddSJeff Bonwick 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2356fa9e4066Sahrens 		}
2357fa9e4066Sahrens 	}
2358fa9e4066Sahrens }
2359fa9e4066Sahrens 
2360fa9e4066Sahrens static void
vdev_raidz_state_change(vdev_t * vd,int faulted,int degraded)2361fa9e4066Sahrens vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2362fa9e4066Sahrens {
236399653d4eSeschrock 	if (faulted > vd->vdev_nparity)
2364ea8dc4b6Seschrock 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2365ea8dc4b6Seschrock 		    VDEV_AUX_NO_REPLICAS);
2366fa9e4066Sahrens 	else if (degraded + faulted != 0)
2367ea8dc4b6Seschrock 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2368fa9e4066Sahrens 	else
2369ea8dc4b6Seschrock 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2370fa9e4066Sahrens }
2371fa9e4066Sahrens 
2372fa9e4066Sahrens vdev_ops_t vdev_raidz_ops = {
2373fa9e4066Sahrens 	vdev_raidz_open,
2374fa9e4066Sahrens 	vdev_raidz_close,
2375fa9e4066Sahrens 	vdev_raidz_asize,
2376fa9e4066Sahrens 	vdev_raidz_io_start,
2377fa9e4066Sahrens 	vdev_raidz_io_done,
2378fa9e4066Sahrens 	vdev_raidz_state_change,
2379dcba9f3fSGeorge Wilson 	NULL,
2380dcba9f3fSGeorge Wilson 	NULL,
2381fa9e4066Sahrens 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
2382fa9e4066Sahrens 	B_FALSE			/* not a leaf vdev */
2383fa9e4066Sahrens };
2384