1fa9e4066Sahrens /*
2fa9e4066Sahrens * CDDL HEADER START
3fa9e4066Sahrens *
4fa9e4066Sahrens * The contents of this file are subject to the terms of the
5ea8dc4b6Seschrock * Common Development and Distribution License (the "License").
6ea8dc4b6Seschrock * You may not use this file except in compliance with the License.
7fa9e4066Sahrens *
8fa9e4066Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e4066Sahrens * or http://www.opensolaris.org/os/licensing.
10fa9e4066Sahrens * See the License for the specific language governing permissions
11fa9e4066Sahrens * and limitations under the License.
12fa9e4066Sahrens *
13fa9e4066Sahrens * When distributing Covered Code, include this CDDL HEADER in each
14fa9e4066Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e4066Sahrens * If applicable, add the following below this CDDL HEADER, with the
16fa9e4066Sahrens * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e4066Sahrens * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e4066Sahrens *
19fa9e4066Sahrens * CDDL HEADER END
20fa9e4066Sahrens */
2199653d4eSeschrock
22fa9e4066Sahrens /*
233f9d6ad7SLin Ling * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24efe6bf49SGeorge Wilson * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
25810e43b2SBill Pijewski * Copyright (c) 2013, Joyent, Inc. All rights reserved.
26fa9e4066Sahrens */
27fa9e4066Sahrens
28fa9e4066Sahrens #include <sys/zfs_context.h>
29fa9e4066Sahrens #include <sys/spa.h>
30fa9e4066Sahrens #include <sys/vdev_impl.h>
31810e43b2SBill Pijewski #include <sys/vdev_disk.h>
32810e43b2SBill Pijewski #include <sys/vdev_file.h>
33810e43b2SBill Pijewski #include <sys/vdev_raidz.h>
34fa9e4066Sahrens #include <sys/zio.h>
35fa9e4066Sahrens #include <sys/zio_checksum.h>
36fa9e4066Sahrens #include <sys/fs/zfs.h>
37ea8dc4b6Seschrock #include <sys/fm/fs/zfs.h>
38fa9e4066Sahrens
39fa9e4066Sahrens /*
40fa9e4066Sahrens * Virtual device vector for RAID-Z.
4199653d4eSeschrock *
42f94275ceSAdam Leventhal * This vdev supports single, double, and triple parity. For single parity,
43f94275ceSAdam Leventhal * we use a simple XOR of all the data columns. For double or triple parity,
44f94275ceSAdam Leventhal * we use a special case of Reed-Solomon coding. This extends the
45f94275ceSAdam Leventhal * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
46f94275ceSAdam Leventhal * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
47f94275ceSAdam Leventhal * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
48f94275ceSAdam Leventhal * former is also based. The latter is designed to provide higher performance
49f94275ceSAdam Leventhal * for writes.
50f94275ceSAdam Leventhal *
51f94275ceSAdam Leventhal * Note that the Plank paper claimed to support arbitrary N+M, but was then
52f94275ceSAdam Leventhal * amended six years later identifying a critical flaw that invalidates its
53f94275ceSAdam Leventhal * claims. Nevertheless, the technique can be adapted to work for up to
54f94275ceSAdam Leventhal * triple parity. For additional parity, the amendment "Note: Correction to
55f94275ceSAdam Leventhal * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
56f94275ceSAdam Leventhal * is viable, but the additional complexity means that write performance will
57f94275ceSAdam Leventhal * suffer.
58f94275ceSAdam Leventhal *
 * All of the methods above operate on a Galois field with 2^N elements,
 * GF(2^N). In our case we choose N=8, i.e. GF(2^8), so that all elements
 * can be expressed with a single byte. Briefly, the operations on the
 * field are defined as follows:
6399653d4eSeschrock *
6499653d4eSeschrock * o addition (+) is represented by a bitwise XOR
6599653d4eSeschrock * o subtraction (-) is therefore identical to addition: A + B = A - B
6699653d4eSeschrock * o multiplication of A by 2 is defined by the following bitwise expression:
67f7170741SWill Andrews *
6899653d4eSeschrock * (A * 2)_7 = A_6
6999653d4eSeschrock * (A * 2)_6 = A_5
7099653d4eSeschrock * (A * 2)_5 = A_4
7199653d4eSeschrock * (A * 2)_4 = A_3 + A_7
7299653d4eSeschrock * (A * 2)_3 = A_2 + A_7
7399653d4eSeschrock * (A * 2)_2 = A_1 + A_7
7499653d4eSeschrock * (A * 2)_1 = A_0
7599653d4eSeschrock * (A * 2)_0 = A_7
7699653d4eSeschrock *
7799653d4eSeschrock * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
78f94275ceSAdam Leventhal * As an aside, this multiplication is derived from the error correcting
79f94275ceSAdam Leventhal * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
8099653d4eSeschrock *
8199653d4eSeschrock * Observe that any number in the field (except for 0) can be expressed as a
8299653d4eSeschrock * power of 2 -- a generator for the field. We store a table of the powers of
8399653d4eSeschrock * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
8499653d4eSeschrock * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
85f94275ceSAdam Leventhal * than field addition). The inverse of a field element A (A^-1) is therefore
86f94275ceSAdam Leventhal * A ^ (255 - 1) = A^254.
8799653d4eSeschrock *
88f94275ceSAdam Leventhal * The up-to-three parity columns, P, Q, R over several data columns,
89f94275ceSAdam Leventhal * D_0, ... D_n-1, can be expressed by field operations:
9099653d4eSeschrock *
9199653d4eSeschrock * P = D_0 + D_1 + ... + D_n-2 + D_n-1
9299653d4eSeschrock * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
9399653d4eSeschrock * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
94f94275ceSAdam Leventhal * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
95f94275ceSAdam Leventhal * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
9699653d4eSeschrock *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property which is why the uncorrected Plank method breaks down.)
101f94275ceSAdam Leventhal *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
104fa9e4066Sahrens */
105fa9e4066Sahrens
/*
 * State for a single column of a RAID-Z I/O: the child device it maps to,
 * where on that device it lives, its buffers, and the outcome of the I/O.
 */
struct raidz_col {
	uint64_t rc_devidx;		/* index of the child device used for I/O */
	uint64_t rc_offset;		/* offset on that device */
	uint64_t rc_size;		/* size of the I/O, in bytes */
	void *rc_data;			/* buffer holding the I/O data */
	void *rc_gdata;			/* buffer holding the "good" version */
	int rc_error;			/* I/O error recorded for this device */
	uint8_t rc_tried;		/* nonzero if this column's I/O was tried */
	uint8_t rc_skipped;		/* nonzero if this column's I/O was skipped */
};
typedef struct raidz_col raidz_col_t;
116fa9e4066Sahrens
117fa9e4066Sahrens typedef struct raidz_map {
118f94275ceSAdam Leventhal uint64_t rm_cols; /* Regular column count */
119f94275ceSAdam Leventhal uint64_t rm_scols; /* Count including skipped columns */
12099653d4eSeschrock uint64_t rm_bigcols; /* Number of oversized columns */
12199653d4eSeschrock uint64_t rm_asize; /* Actual total I/O size */
12299653d4eSeschrock uint64_t rm_missingdata; /* Count of missing data devices */
12399653d4eSeschrock uint64_t rm_missingparity; /* Count of missing parity devices */
12499653d4eSeschrock uint64_t rm_firstdatacol; /* First data column/parity count */
1252fbc121fSAdam Leventhal uint64_t rm_nskip; /* Skipped sectors for padding */
1262fbc121fSAdam Leventhal uint64_t rm_skipstart; /* Column index of padding start */
12722fe2c88SJonathan Adams void *rm_datacopy; /* rm_asize-buffer of copied data */
12822fe2c88SJonathan Adams uintptr_t rm_reports; /* # of referencing checksum reports */
12922fe2c88SJonathan Adams uint8_t rm_freed; /* map no longer has referencing ZIO */
13022fe2c88SJonathan Adams uint8_t rm_ecksuminjected; /* checksum error was injected */
13199653d4eSeschrock raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
132fa9e4066Sahrens } raidz_map_t;
133fa9e4066Sahrens
/* Indices of the P, Q and R parity columns within rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Multiply a single byte by 2 (or by 4) in the Galois field.  The result is
 * not truncated: bit 8 may be set, so it is only meaningful once stored in
 * (or cast to) an 8-bit quantity.  The argument is evaluated more than once.
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 *
 * Both x and mask must be 64-bit lvalues; x is updated in place and mask is
 * clobbered.  The expansion is a braced compound statement.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), (mask)); \
	VDEV_RAIDZ_64MUL_2((x), (mask)); \
}

/*
 * Add VDEV_LABEL_START_SIZE to an offset.  The argument is parenthesized so
 * that expressions using operators of lower precedence than '+' are offset
 * as a whole.
 */
#define	VDEV_LABEL_OFFSET(x)	((x) + VDEV_LABEL_START_SIZE)

/*
 * Force reconstruction to use the general purpose method.
 */
int vdev_raidz_default_to_general;
16799653d4eSeschrock
/*
 * Powers of 2 in the Galois field defined above: entry i holds 2^i.
 * The comment at the start of each row is the index of its first entry.
 * Entry 255 repeats entry 0 (both are 0x01).
 */
static const uint8_t vdev_raidz_pow2[256] = {
	/* 0x00 */ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	/* 0x08 */ 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	/* 0x10 */ 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	/* 0x18 */ 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	/* 0x20 */ 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	/* 0x28 */ 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	/* 0x30 */ 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	/* 0x38 */ 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	/* 0x40 */ 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	/* 0x48 */ 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	/* 0x50 */ 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	/* 0x58 */ 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	/* 0x60 */ 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	/* 0x68 */ 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	/* 0x70 */ 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	/* 0x78 */ 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	/* 0x80 */ 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	/* 0x88 */ 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	/* 0x90 */ 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	/* 0x98 */ 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	/* 0xa0 */ 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	/* 0xa8 */ 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	/* 0xb0 */ 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	/* 0xb8 */ 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	/* 0xc0 */ 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	/* 0xc8 */ 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	/* 0xd0 */ 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	/* 0xd8 */ 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	/* 0xe0 */ 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	/* 0xe8 */ 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	/* 0xf0 */ 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	/* 0xf8 */ 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * Logarithms base 2 in the Galois field defined above: for a nonzero x,
 * entry x holds the exponent i for which 2^i == x.  Zero has no logarithm;
 * entry 0 is a placeholder.  The comment at the start of each row is the
 * index of its first entry.
 */
static const uint8_t vdev_raidz_log2[256] = {
	/* 0x00 */ 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	/* 0x08 */ 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	/* 0x10 */ 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	/* 0x18 */ 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	/* 0x20 */ 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	/* 0x28 */ 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	/* 0x30 */ 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	/* 0x38 */ 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	/* 0x40 */ 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	/* 0x48 */ 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	/* 0x50 */ 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	/* 0x58 */ 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	/* 0x60 */ 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	/* 0x68 */ 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	/* 0x70 */ 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	/* 0x78 */ 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	/* 0x80 */ 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	/* 0x88 */ 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	/* 0x90 */ 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	/* 0x98 */ 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	/* 0xa0 */ 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	/* 0xa8 */ 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	/* 0xb0 */ 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	/* 0xb8 */ 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	/* 0xc0 */ 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	/* 0xc8 */ 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	/* 0xd0 */ 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	/* 0xd8 */ 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	/* 0xe0 */ 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	/* 0xe8 */ 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	/* 0xf0 */ 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	/* 0xf8 */ 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
23899653d4eSeschrock
23922fe2c88SJonathan Adams static void vdev_raidz_generate_parity(raidz_map_t *rm);
24022fe2c88SJonathan Adams
24199653d4eSeschrock /*
24299653d4eSeschrock * Multiply a given number by 2 raised to the given power.
24399653d4eSeschrock */
24499653d4eSeschrock static uint8_t
vdev_raidz_exp2(uint_t a,int exp)24599653d4eSeschrock vdev_raidz_exp2(uint_t a, int exp)
24699653d4eSeschrock {
24799653d4eSeschrock if (a == 0)
24899653d4eSeschrock return (0);
24999653d4eSeschrock
25099653d4eSeschrock ASSERT(exp >= 0);
25199653d4eSeschrock ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
25299653d4eSeschrock
25399653d4eSeschrock exp += vdev_raidz_log2[a];
25499653d4eSeschrock if (exp > 255)
25599653d4eSeschrock exp -= 255;
25699653d4eSeschrock
25799653d4eSeschrock return (vdev_raidz_pow2[exp]);
25899653d4eSeschrock }
25999653d4eSeschrock
260e14bb325SJeff Bonwick static void
vdev_raidz_map_free(raidz_map_t * rm)26122fe2c88SJonathan Adams vdev_raidz_map_free(raidz_map_t *rm)
262e14bb325SJeff Bonwick {
263e14bb325SJeff Bonwick int c;
264baa7389eSJonathan Adams size_t size;
265e14bb325SJeff Bonwick
26622fe2c88SJonathan Adams for (c = 0; c < rm->rm_firstdatacol; c++) {
267e14bb325SJeff Bonwick zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
268e14bb325SJeff Bonwick
26922fe2c88SJonathan Adams if (rm->rm_col[c].rc_gdata != NULL)
27022fe2c88SJonathan Adams zio_buf_free(rm->rm_col[c].rc_gdata,
27122fe2c88SJonathan Adams rm->rm_col[c].rc_size);
27222fe2c88SJonathan Adams }
27322fe2c88SJonathan Adams
274baa7389eSJonathan Adams size = 0;
275baa7389eSJonathan Adams for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
276baa7389eSJonathan Adams size += rm->rm_col[c].rc_size;
277baa7389eSJonathan Adams
27822fe2c88SJonathan Adams if (rm->rm_datacopy != NULL)
27922fe2c88SJonathan Adams zio_buf_free(rm->rm_datacopy, size);
28022fe2c88SJonathan Adams
281f94275ceSAdam Leventhal kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
282e14bb325SJeff Bonwick }
283e14bb325SJeff Bonwick
28422fe2c88SJonathan Adams static void
vdev_raidz_map_free_vsd(zio_t * zio)28522fe2c88SJonathan Adams vdev_raidz_map_free_vsd(zio_t *zio)
28622fe2c88SJonathan Adams {
28722fe2c88SJonathan Adams raidz_map_t *rm = zio->io_vsd;
28822fe2c88SJonathan Adams
289fb09f5aaSMadhav Suresh ASSERT0(rm->rm_freed);
29022fe2c88SJonathan Adams rm->rm_freed = 1;
29122fe2c88SJonathan Adams
29222fe2c88SJonathan Adams if (rm->rm_reports == 0)
29322fe2c88SJonathan Adams vdev_raidz_map_free(rm);
29422fe2c88SJonathan Adams }
29522fe2c88SJonathan Adams
29622fe2c88SJonathan Adams /*ARGSUSED*/
29722fe2c88SJonathan Adams static void
vdev_raidz_cksum_free(void * arg,size_t ignored)29822fe2c88SJonathan Adams vdev_raidz_cksum_free(void *arg, size_t ignored)
29922fe2c88SJonathan Adams {
30022fe2c88SJonathan Adams raidz_map_t *rm = arg;
30122fe2c88SJonathan Adams
30222fe2c88SJonathan Adams ASSERT3U(rm->rm_reports, >, 0);
30322fe2c88SJonathan Adams
304baa7389eSJonathan Adams if (--rm->rm_reports == 0 && rm->rm_freed != 0)
30522fe2c88SJonathan Adams vdev_raidz_map_free(rm);
30622fe2c88SJonathan Adams }
30722fe2c88SJonathan Adams
30822fe2c88SJonathan Adams static void
vdev_raidz_cksum_finish(zio_cksum_report_t * zcr,const void * good_data)30922fe2c88SJonathan Adams vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
31022fe2c88SJonathan Adams {
31122fe2c88SJonathan Adams raidz_map_t *rm = zcr->zcr_cbdata;
31222fe2c88SJonathan Adams size_t c = zcr->zcr_cbinfo;
31322fe2c88SJonathan Adams size_t x;
31422fe2c88SJonathan Adams
31522fe2c88SJonathan Adams const char *good = NULL;
31622fe2c88SJonathan Adams const char *bad = rm->rm_col[c].rc_data;
31722fe2c88SJonathan Adams
31822fe2c88SJonathan Adams if (good_data == NULL) {
31922fe2c88SJonathan Adams zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
32022fe2c88SJonathan Adams return;
32122fe2c88SJonathan Adams }
32222fe2c88SJonathan Adams
32322fe2c88SJonathan Adams if (c < rm->rm_firstdatacol) {
32422fe2c88SJonathan Adams /*
32522fe2c88SJonathan Adams * The first time through, calculate the parity blocks for
32622fe2c88SJonathan Adams * the good data (this relies on the fact that the good
32722fe2c88SJonathan Adams * data never changes for a given logical ZIO)
32822fe2c88SJonathan Adams */
32922fe2c88SJonathan Adams if (rm->rm_col[0].rc_gdata == NULL) {
33022fe2c88SJonathan Adams char *bad_parity[VDEV_RAIDZ_MAXPARITY];
33122fe2c88SJonathan Adams char *buf;
33222fe2c88SJonathan Adams
33322fe2c88SJonathan Adams /*
33422fe2c88SJonathan Adams * Set up the rm_col[]s to generate the parity for
33522fe2c88SJonathan Adams * good_data, first saving the parity bufs and
33622fe2c88SJonathan Adams * replacing them with buffers to hold the result.
33722fe2c88SJonathan Adams */
33822fe2c88SJonathan Adams for (x = 0; x < rm->rm_firstdatacol; x++) {
33922fe2c88SJonathan Adams bad_parity[x] = rm->rm_col[x].rc_data;
34022fe2c88SJonathan Adams rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
34122fe2c88SJonathan Adams zio_buf_alloc(rm->rm_col[x].rc_size);
34222fe2c88SJonathan Adams }
34322fe2c88SJonathan Adams
34422fe2c88SJonathan Adams /* fill in the data columns from good_data */
34522fe2c88SJonathan Adams buf = (char *)good_data;
34622fe2c88SJonathan Adams for (; x < rm->rm_cols; x++) {
34722fe2c88SJonathan Adams rm->rm_col[x].rc_data = buf;
34822fe2c88SJonathan Adams buf += rm->rm_col[x].rc_size;
34922fe2c88SJonathan Adams }
35022fe2c88SJonathan Adams
35122fe2c88SJonathan Adams /*
35222fe2c88SJonathan Adams * Construct the parity from the good data.
35322fe2c88SJonathan Adams */
35422fe2c88SJonathan Adams vdev_raidz_generate_parity(rm);
35522fe2c88SJonathan Adams
35622fe2c88SJonathan Adams /* restore everything back to its original state */
35722fe2c88SJonathan Adams for (x = 0; x < rm->rm_firstdatacol; x++)
35822fe2c88SJonathan Adams rm->rm_col[x].rc_data = bad_parity[x];
35922fe2c88SJonathan Adams
36022fe2c88SJonathan Adams buf = rm->rm_datacopy;
36122fe2c88SJonathan Adams for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
36222fe2c88SJonathan Adams rm->rm_col[x].rc_data = buf;
36322fe2c88SJonathan Adams buf += rm->rm_col[x].rc_size;
36422fe2c88SJonathan Adams }
36522fe2c88SJonathan Adams }
36622fe2c88SJonathan Adams
36722fe2c88SJonathan Adams ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
36822fe2c88SJonathan Adams good = rm->rm_col[c].rc_gdata;
36922fe2c88SJonathan Adams } else {
37022fe2c88SJonathan Adams /* adjust good_data to point at the start of our column */
37122fe2c88SJonathan Adams good = good_data;
37222fe2c88SJonathan Adams
37322fe2c88SJonathan Adams for (x = rm->rm_firstdatacol; x < c; x++)
37422fe2c88SJonathan Adams good += rm->rm_col[x].rc_size;
37522fe2c88SJonathan Adams }
37622fe2c88SJonathan Adams
37722fe2c88SJonathan Adams /* we drop the ereport if it ends up that the data was good */
37822fe2c88SJonathan Adams zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
37922fe2c88SJonathan Adams }
38022fe2c88SJonathan Adams
38122fe2c88SJonathan Adams /*
38222fe2c88SJonathan Adams * Invoked indirectly by zfs_ereport_start_checksum(), called
38322fe2c88SJonathan Adams * below when our read operation fails completely. The main point
38422fe2c88SJonathan Adams * is to keep a copy of everything we read from disk, so that at
38522fe2c88SJonathan Adams * vdev_raidz_cksum_finish() time we can compare it with the good data.
38622fe2c88SJonathan Adams */
38722fe2c88SJonathan Adams static void
vdev_raidz_cksum_report(zio_t * zio,zio_cksum_report_t * zcr,void * arg)38822fe2c88SJonathan Adams vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
38922fe2c88SJonathan Adams {
39022fe2c88SJonathan Adams size_t c = (size_t)(uintptr_t)arg;
39122fe2c88SJonathan Adams caddr_t buf;
39222fe2c88SJonathan Adams
39322fe2c88SJonathan Adams raidz_map_t *rm = zio->io_vsd;
39422fe2c88SJonathan Adams size_t size;
39522fe2c88SJonathan Adams
39622fe2c88SJonathan Adams /* set up the report and bump the refcount */
39722fe2c88SJonathan Adams zcr->zcr_cbdata = rm;
39822fe2c88SJonathan Adams zcr->zcr_cbinfo = c;
39922fe2c88SJonathan Adams zcr->zcr_finish = vdev_raidz_cksum_finish;
40022fe2c88SJonathan Adams zcr->zcr_free = vdev_raidz_cksum_free;
40122fe2c88SJonathan Adams
40222fe2c88SJonathan Adams rm->rm_reports++;
40322fe2c88SJonathan Adams ASSERT3U(rm->rm_reports, >, 0);
40422fe2c88SJonathan Adams
405baa7389eSJonathan Adams if (rm->rm_datacopy != NULL)
40622fe2c88SJonathan Adams return;
40722fe2c88SJonathan Adams
40822fe2c88SJonathan Adams /*
409baa7389eSJonathan Adams * It's the first time we're called for this raidz_map_t, so we need
410baa7389eSJonathan Adams * to copy the data aside; there's no guarantee that our zio's buffer
411baa7389eSJonathan Adams * won't be re-used for something else.
41222fe2c88SJonathan Adams *
413baa7389eSJonathan Adams * Our parity data is already in separate buffers, so there's no need
41422fe2c88SJonathan Adams * to copy them.
41522fe2c88SJonathan Adams */
41622fe2c88SJonathan Adams
417baa7389eSJonathan Adams size = 0;
418baa7389eSJonathan Adams for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
419baa7389eSJonathan Adams size += rm->rm_col[c].rc_size;
42022fe2c88SJonathan Adams
42122fe2c88SJonathan Adams buf = rm->rm_datacopy = zio_buf_alloc(size);
422baa7389eSJonathan Adams
423baa7389eSJonathan Adams for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
42422fe2c88SJonathan Adams raidz_col_t *col = &rm->rm_col[c];
42522fe2c88SJonathan Adams
42622fe2c88SJonathan Adams bcopy(col->rc_data, buf, col->rc_size);
42722fe2c88SJonathan Adams col->rc_data = buf;
42822fe2c88SJonathan Adams
42922fe2c88SJonathan Adams buf += col->rc_size;
43022fe2c88SJonathan Adams }
43122fe2c88SJonathan Adams ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
43222fe2c88SJonathan Adams }
43322fe2c88SJonathan Adams
43422fe2c88SJonathan Adams static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
43522fe2c88SJonathan Adams vdev_raidz_map_free_vsd,
43622fe2c88SJonathan Adams vdev_raidz_cksum_report
43722fe2c88SJonathan Adams };
43822fe2c88SJonathan Adams
4393e30c24aSWill Andrews /*
4403e30c24aSWill Andrews * Divides the IO evenly across all child vdevs; usually, dcols is
4413e30c24aSWill Andrews * the number of children in the target vdev.
4423e30c24aSWill Andrews */
443fa9e4066Sahrens static raidz_map_t *
vdev_raidz_map_alloc(caddr_t data,uint64_t size,uint64_t offset,uint64_t unit_shift,uint64_t dcols,uint64_t nparity)444810e43b2SBill Pijewski vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset,
445810e43b2SBill Pijewski uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
446fa9e4066Sahrens {
447fa9e4066Sahrens raidz_map_t *rm;
4483e30c24aSWill Andrews /* The starting RAIDZ (parent) vdev sector of the block. */
449810e43b2SBill Pijewski uint64_t b = offset >> unit_shift;
4503e30c24aSWill Andrews /* The zio's size in units of the vdev's minimum sector size. */
451810e43b2SBill Pijewski uint64_t s = size >> unit_shift;
4523e30c24aSWill Andrews /* The first column for this stripe. */
453fa9e4066Sahrens uint64_t f = b % dcols;
4543e30c24aSWill Andrews /* The starting byte offset on each child vdev. */
455fa9e4066Sahrens uint64_t o = (b / dcols) << unit_shift;
456f94275ceSAdam Leventhal uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
457fa9e4066Sahrens
4583e30c24aSWill Andrews /*
4593e30c24aSWill Andrews * "Quotient": The number of data sectors for this stripe on all but
4603e30c24aSWill Andrews * the "big column" child vdevs that also contain "remainder" data.
4613e30c24aSWill Andrews */
46299653d4eSeschrock q = s / (dcols - nparity);
4633e30c24aSWill Andrews
4643e30c24aSWill Andrews /*
4653e30c24aSWill Andrews * "Remainder": The number of partial stripe data sectors in this I/O.
4663e30c24aSWill Andrews * This will add a sector to some, but not all, child vdevs.
4673e30c24aSWill Andrews */
46899653d4eSeschrock r = s - q * (dcols - nparity);
4693e30c24aSWill Andrews
4703e30c24aSWill Andrews /* The number of "big columns" - those which contain remainder data. */
47199653d4eSeschrock bc = (r == 0 ? 0 : r + nparity);
4723e30c24aSWill Andrews
4733e30c24aSWill Andrews /*
4743e30c24aSWill Andrews * The total number of data and parity sectors associated with
4753e30c24aSWill Andrews * this I/O.
4763e30c24aSWill Andrews */
477f94275ceSAdam Leventhal tot = s + nparity * (q + (r == 0 ? 0 : 1));
478fa9e4066Sahrens
4793e30c24aSWill Andrews /* acols: The columns that will be accessed. */
4803e30c24aSWill Andrews /* scols: The columns that will be accessed or skipped. */
481f94275ceSAdam Leventhal if (q == 0) {
4823e30c24aSWill Andrews /* Our I/O request doesn't span all child vdevs. */
483f94275ceSAdam Leventhal acols = bc;
484f94275ceSAdam Leventhal scols = MIN(dcols, roundup(bc, nparity + 1));
485f94275ceSAdam Leventhal } else {
486f94275ceSAdam Leventhal acols = dcols;
487f94275ceSAdam Leventhal scols = dcols;
488f94275ceSAdam Leventhal }
489fa9e4066Sahrens
490f94275ceSAdam Leventhal ASSERT3U(acols, <=, scols);
491f94275ceSAdam Leventhal
492f94275ceSAdam Leventhal rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
493fa9e4066Sahrens
494fa9e4066Sahrens rm->rm_cols = acols;
495f94275ceSAdam Leventhal rm->rm_scols = scols;
496fa9e4066Sahrens rm->rm_bigcols = bc;
4972fbc121fSAdam Leventhal rm->rm_skipstart = bc;
49899653d4eSeschrock rm->rm_missingdata = 0;
49999653d4eSeschrock rm->rm_missingparity = 0;
50099653d4eSeschrock rm->rm_firstdatacol = nparity;
50122fe2c88SJonathan Adams rm->rm_datacopy = NULL;
50222fe2c88SJonathan Adams rm->rm_reports = 0;
50322fe2c88SJonathan Adams rm->rm_freed = 0;
50422fe2c88SJonathan Adams rm->rm_ecksuminjected = 0;
505fa9e4066Sahrens
506f94275ceSAdam Leventhal asize = 0;
507f94275ceSAdam Leventhal
508f94275ceSAdam Leventhal for (c = 0; c < scols; c++) {
509fa9e4066Sahrens col = f + c;
510fa9e4066Sahrens coff = o;
511fa9e4066Sahrens if (col >= dcols) {
512fa9e4066Sahrens col -= dcols;
513fa9e4066Sahrens coff += 1ULL << unit_shift;
514fa9e4066Sahrens }
51599653d4eSeschrock rm->rm_col[c].rc_devidx = col;
516fa9e4066Sahrens rm->rm_col[c].rc_offset = coff;
517fa9e4066Sahrens rm->rm_col[c].rc_data = NULL;
51822fe2c88SJonathan Adams rm->rm_col[c].rc_gdata = NULL;
519fa9e4066Sahrens rm->rm_col[c].rc_error = 0;
520fa9e4066Sahrens rm->rm_col[c].rc_tried = 0;
521fa9e4066Sahrens rm->rm_col[c].rc_skipped = 0;
522f94275ceSAdam Leventhal
523f94275ceSAdam Leventhal if (c >= acols)
524f94275ceSAdam Leventhal rm->rm_col[c].rc_size = 0;
525f94275ceSAdam Leventhal else if (c < bc)
526f94275ceSAdam Leventhal rm->rm_col[c].rc_size = (q + 1) << unit_shift;
527f94275ceSAdam Leventhal else
528f94275ceSAdam Leventhal rm->rm_col[c].rc_size = q << unit_shift;
529f94275ceSAdam Leventhal
530f94275ceSAdam Leventhal asize += rm->rm_col[c].rc_size;
531fa9e4066Sahrens }
532fa9e4066Sahrens
533f94275ceSAdam Leventhal ASSERT3U(asize, ==, tot << unit_shift);
534f94275ceSAdam Leventhal rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
5352fbc121fSAdam Leventhal rm->rm_nskip = roundup(tot, nparity + 1) - tot;
5362fbc121fSAdam Leventhal ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
5372fbc121fSAdam Leventhal ASSERT3U(rm->rm_nskip, <=, nparity);
538fa9e4066Sahrens
539fa9e4066Sahrens for (c = 0; c < rm->rm_firstdatacol; c++)
540fa9e4066Sahrens rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
541fa9e4066Sahrens
542810e43b2SBill Pijewski rm->rm_col[c].rc_data = data;
543fa9e4066Sahrens
544fa9e4066Sahrens for (c = c + 1; c < acols; c++)
545fa9e4066Sahrens rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
546fa9e4066Sahrens rm->rm_col[c - 1].rc_size;
547fa9e4066Sahrens
548fa9e4066Sahrens /*
54999653d4eSeschrock * If all data stored spans all columns, there's a danger that parity
55099653d4eSeschrock * will always be on the same device and, since parity isn't read
55199653d4eSeschrock * during normal operation, that that device's I/O bandwidth won't be
55299653d4eSeschrock * used effectively. We therefore switch the parity every 1MB.
55399653d4eSeschrock *
55499653d4eSeschrock * ... at least that was, ostensibly, the theory. As a practical
55599653d4eSeschrock * matter unless we juggle the parity between all devices evenly, we
55699653d4eSeschrock * won't see any benefit. Further, occasional writes that aren't a
55799653d4eSeschrock * multiple of the LCM of the number of children and the minimum
55899653d4eSeschrock * stripe width are sufficient to avoid pessimal behavior.
55999653d4eSeschrock * Unfortunately, this decision created an implicit on-disk format
560c7a40cc4Sahl * requirement that we need to support for all eternity, but only
561c7a40cc4Sahl * for single-parity RAID-Z.
5622fbc121fSAdam Leventhal *
5632fbc121fSAdam Leventhal * If we intend to skip a sector in the zeroth column for padding
5642fbc121fSAdam Leventhal * we must make sure to note this swap. We will never intend to
5652fbc121fSAdam Leventhal * skip the first column since at least one data and one parity
5662fbc121fSAdam Leventhal * column must appear in each row.
567fa9e4066Sahrens */
568fa9e4066Sahrens ASSERT(rm->rm_cols >= 2);
569fa9e4066Sahrens ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
570fa9e4066Sahrens
571810e43b2SBill Pijewski if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
57299653d4eSeschrock devidx = rm->rm_col[0].rc_devidx;
573fa9e4066Sahrens o = rm->rm_col[0].rc_offset;
57499653d4eSeschrock rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
575fa9e4066Sahrens rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
57699653d4eSeschrock rm->rm_col[1].rc_devidx = devidx;
577fa9e4066Sahrens rm->rm_col[1].rc_offset = o;
5782fbc121fSAdam Leventhal
5792fbc121fSAdam Leventhal if (rm->rm_skipstart == 0)
5802fbc121fSAdam Leventhal rm->rm_skipstart = 1;
581fa9e4066Sahrens }
582fa9e4066Sahrens
583fa9e4066Sahrens return (rm);
584fa9e4066Sahrens }
585fa9e4066Sahrens
586fa9e4066Sahrens static void
vdev_raidz_generate_parity_p(raidz_map_t * rm)58799653d4eSeschrock vdev_raidz_generate_parity_p(raidz_map_t *rm)
588fa9e4066Sahrens {
58999653d4eSeschrock uint64_t *p, *src, pcount, ccount, i;
59099653d4eSeschrock int c;
591fa9e4066Sahrens
59299653d4eSeschrock pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
59399653d4eSeschrock
59499653d4eSeschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
59599653d4eSeschrock src = rm->rm_col[c].rc_data;
59699653d4eSeschrock p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
59799653d4eSeschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
59899653d4eSeschrock
59999653d4eSeschrock if (c == rm->rm_firstdatacol) {
60099653d4eSeschrock ASSERT(ccount == pcount);
601f94275ceSAdam Leventhal for (i = 0; i < ccount; i++, src++, p++) {
60299653d4eSeschrock *p = *src;
60399653d4eSeschrock }
60499653d4eSeschrock } else {
60599653d4eSeschrock ASSERT(ccount <= pcount);
606f94275ceSAdam Leventhal for (i = 0; i < ccount; i++, src++, p++) {
60799653d4eSeschrock *p ^= *src;
60899653d4eSeschrock }
60999653d4eSeschrock }
61099653d4eSeschrock }
61199653d4eSeschrock }
61299653d4eSeschrock
61399653d4eSeschrock static void
vdev_raidz_generate_parity_pq(raidz_map_t * rm)61499653d4eSeschrock vdev_raidz_generate_parity_pq(raidz_map_t *rm)
61599653d4eSeschrock {
616f94275ceSAdam Leventhal uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
61799653d4eSeschrock int c;
61899653d4eSeschrock
619f94275ceSAdam Leventhal pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
62099653d4eSeschrock ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
62199653d4eSeschrock rm->rm_col[VDEV_RAIDZ_Q].rc_size);
62299653d4eSeschrock
62399653d4eSeschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
62499653d4eSeschrock src = rm->rm_col[c].rc_data;
62599653d4eSeschrock p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
62699653d4eSeschrock q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
627f94275ceSAdam Leventhal
628f94275ceSAdam Leventhal ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
62999653d4eSeschrock
63099653d4eSeschrock if (c == rm->rm_firstdatacol) {
631f94275ceSAdam Leventhal ASSERT(ccnt == pcnt || ccnt == 0);
632f94275ceSAdam Leventhal for (i = 0; i < ccnt; i++, src++, p++, q++) {
63399653d4eSeschrock *p = *src;
634f94275ceSAdam Leventhal *q = *src;
63599653d4eSeschrock }
636f94275ceSAdam Leventhal for (; i < pcnt; i++, src++, p++, q++) {
63799653d4eSeschrock *p = 0;
638f94275ceSAdam Leventhal *q = 0;
63999653d4eSeschrock }
64099653d4eSeschrock } else {
641f94275ceSAdam Leventhal ASSERT(ccnt <= pcnt);
64299653d4eSeschrock
64399653d4eSeschrock /*
644f94275ceSAdam Leventhal * Apply the algorithm described above by multiplying
645f94275ceSAdam Leventhal * the previous result and adding in the new value.
64699653d4eSeschrock */
647f94275ceSAdam Leventhal for (i = 0; i < ccnt; i++, src++, p++, q++) {
64899653d4eSeschrock *p ^= *src;
649f94275ceSAdam Leventhal
650f94275ceSAdam Leventhal VDEV_RAIDZ_64MUL_2(*q, mask);
651f94275ceSAdam Leventhal *q ^= *src;
65299653d4eSeschrock }
65399653d4eSeschrock
65499653d4eSeschrock /*
65599653d4eSeschrock * Treat short columns as though they are full of 0s.
656f94275ceSAdam Leventhal * Note that there's therefore nothing needed for P.
65799653d4eSeschrock */
658f94275ceSAdam Leventhal for (; i < pcnt; i++, q++) {
659f94275ceSAdam Leventhal VDEV_RAIDZ_64MUL_2(*q, mask);
66099653d4eSeschrock }
66199653d4eSeschrock }
66299653d4eSeschrock }
66399653d4eSeschrock }
66499653d4eSeschrock
66599653d4eSeschrock static void
vdev_raidz_generate_parity_pqr(raidz_map_t * rm)666f94275ceSAdam Leventhal vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
667f94275ceSAdam Leventhal {
668f94275ceSAdam Leventhal uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
669f94275ceSAdam Leventhal int c;
670f94275ceSAdam Leventhal
671f94275ceSAdam Leventhal pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
672f94275ceSAdam Leventhal ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
673f94275ceSAdam Leventhal rm->rm_col[VDEV_RAIDZ_Q].rc_size);
674f94275ceSAdam Leventhal ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
675f94275ceSAdam Leventhal rm->rm_col[VDEV_RAIDZ_R].rc_size);
676f94275ceSAdam Leventhal
677f94275ceSAdam Leventhal for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
678f94275ceSAdam Leventhal src = rm->rm_col[c].rc_data;
679f94275ceSAdam Leventhal p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
680f94275ceSAdam Leventhal q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
681f94275ceSAdam Leventhal r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
682f94275ceSAdam Leventhal
683f94275ceSAdam Leventhal ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
684f94275ceSAdam Leventhal
685f94275ceSAdam Leventhal if (c == rm->rm_firstdatacol) {
686f94275ceSAdam Leventhal ASSERT(ccnt == pcnt || ccnt == 0);
687f94275ceSAdam Leventhal for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
688f94275ceSAdam Leventhal *p = *src;
689f94275ceSAdam Leventhal *q = *src;
690f94275ceSAdam Leventhal *r = *src;
691f94275ceSAdam Leventhal }
692f94275ceSAdam Leventhal for (; i < pcnt; i++, src++, p++, q++, r++) {
693f94275ceSAdam Leventhal *p = 0;
694f94275ceSAdam Leventhal *q = 0;
695f94275ceSAdam Leventhal *r = 0;
696f94275ceSAdam Leventhal }
697f94275ceSAdam Leventhal } else {
698f94275ceSAdam Leventhal ASSERT(ccnt <= pcnt);
699f94275ceSAdam Leventhal
700f94275ceSAdam Leventhal /*
701f94275ceSAdam Leventhal * Apply the algorithm described above by multiplying
702f94275ceSAdam Leventhal * the previous result and adding in the new value.
703f94275ceSAdam Leventhal */
704f94275ceSAdam Leventhal for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
705f94275ceSAdam Leventhal *p ^= *src;
706f94275ceSAdam Leventhal
707f94275ceSAdam Leventhal VDEV_RAIDZ_64MUL_2(*q, mask);
708f94275ceSAdam Leventhal *q ^= *src;
709f94275ceSAdam Leventhal
710f94275ceSAdam Leventhal VDEV_RAIDZ_64MUL_4(*r, mask);
711f94275ceSAdam Leventhal *r ^= *src;
712f94275ceSAdam Leventhal }
713f94275ceSAdam Leventhal
714f94275ceSAdam Leventhal /*
715f94275ceSAdam Leventhal * Treat short columns as though they are full of 0s.
716f94275ceSAdam Leventhal * Note that there's therefore nothing needed for P.
717f94275ceSAdam Leventhal */
718f94275ceSAdam Leventhal for (; i < pcnt; i++, q++, r++) {
719f94275ceSAdam Leventhal VDEV_RAIDZ_64MUL_2(*q, mask);
720f94275ceSAdam Leventhal VDEV_RAIDZ_64MUL_4(*r, mask);
721f94275ceSAdam Leventhal }
722f94275ceSAdam Leventhal }
723f94275ceSAdam Leventhal }
724f94275ceSAdam Leventhal }
725f94275ceSAdam Leventhal
726f94275ceSAdam Leventhal /*
727f94275ceSAdam Leventhal * Generate RAID parity in the first virtual columns according to the number of
728f94275ceSAdam Leventhal * parity columns available.
729f94275ceSAdam Leventhal */
730f94275ceSAdam Leventhal static void
vdev_raidz_generate_parity(raidz_map_t * rm)731f94275ceSAdam Leventhal vdev_raidz_generate_parity(raidz_map_t *rm)
732f94275ceSAdam Leventhal {
733f94275ceSAdam Leventhal switch (rm->rm_firstdatacol) {
734f94275ceSAdam Leventhal case 1:
735f94275ceSAdam Leventhal vdev_raidz_generate_parity_p(rm);
736f94275ceSAdam Leventhal break;
737f94275ceSAdam Leventhal case 2:
738f94275ceSAdam Leventhal vdev_raidz_generate_parity_pq(rm);
739f94275ceSAdam Leventhal break;
740f94275ceSAdam Leventhal case 3:
741f94275ceSAdam Leventhal vdev_raidz_generate_parity_pqr(rm);
742f94275ceSAdam Leventhal break;
743f94275ceSAdam Leventhal default:
744f94275ceSAdam Leventhal cmn_err(CE_PANIC, "invalid RAID-Z configuration");
745f94275ceSAdam Leventhal }
746f94275ceSAdam Leventhal }
747f94275ceSAdam Leventhal
748f94275ceSAdam Leventhal static int
vdev_raidz_reconstruct_p(raidz_map_t * rm,int * tgts,int ntgts)749f94275ceSAdam Leventhal vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
75099653d4eSeschrock {
75199653d4eSeschrock uint64_t *dst, *src, xcount, ccount, count, i;
752f94275ceSAdam Leventhal int x = tgts[0];
75399653d4eSeschrock int c;
75499653d4eSeschrock
755f94275ceSAdam Leventhal ASSERT(ntgts == 1);
756f94275ceSAdam Leventhal ASSERT(x >= rm->rm_firstdatacol);
757f94275ceSAdam Leventhal ASSERT(x < rm->rm_cols);
758f94275ceSAdam Leventhal
75999653d4eSeschrock xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
76099653d4eSeschrock ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
76199653d4eSeschrock ASSERT(xcount > 0);
76299653d4eSeschrock
76399653d4eSeschrock src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
76499653d4eSeschrock dst = rm->rm_col[x].rc_data;
76599653d4eSeschrock for (i = 0; i < xcount; i++, dst++, src++) {
76699653d4eSeschrock *dst = *src;
76799653d4eSeschrock }
76899653d4eSeschrock
76999653d4eSeschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
770fa9e4066Sahrens src = rm->rm_col[c].rc_data;
771fa9e4066Sahrens dst = rm->rm_col[x].rc_data;
77299653d4eSeschrock
77399653d4eSeschrock if (c == x)
77499653d4eSeschrock continue;
77599653d4eSeschrock
77699653d4eSeschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
77799653d4eSeschrock count = MIN(ccount, xcount);
77899653d4eSeschrock
77999653d4eSeschrock for (i = 0; i < count; i++, dst++, src++) {
78099653d4eSeschrock *dst ^= *src;
78199653d4eSeschrock }
78299653d4eSeschrock }
783f94275ceSAdam Leventhal
784f94275ceSAdam Leventhal return (1 << VDEV_RAIDZ_P);
78599653d4eSeschrock }
78699653d4eSeschrock
787f94275ceSAdam Leventhal static int
vdev_raidz_reconstruct_q(raidz_map_t * rm,int * tgts,int ntgts)788f94275ceSAdam Leventhal vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
78999653d4eSeschrock {
79099653d4eSeschrock uint64_t *dst, *src, xcount, ccount, count, mask, i;
79199653d4eSeschrock uint8_t *b;
792f94275ceSAdam Leventhal int x = tgts[0];
79399653d4eSeschrock int c, j, exp;
79499653d4eSeschrock
795f94275ceSAdam Leventhal ASSERT(ntgts == 1);
796f94275ceSAdam Leventhal
79799653d4eSeschrock xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
79899653d4eSeschrock ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
79999653d4eSeschrock
80099653d4eSeschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
80199653d4eSeschrock src = rm->rm_col[c].rc_data;
80299653d4eSeschrock dst = rm->rm_col[x].rc_data;
80399653d4eSeschrock
80499653d4eSeschrock if (c == x)
80599653d4eSeschrock ccount = 0;
80699653d4eSeschrock else
80799653d4eSeschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
80899653d4eSeschrock
80999653d4eSeschrock count = MIN(ccount, xcount);
81099653d4eSeschrock
81199653d4eSeschrock if (c == rm->rm_firstdatacol) {
81299653d4eSeschrock for (i = 0; i < count; i++, dst++, src++) {
81399653d4eSeschrock *dst = *src;
81499653d4eSeschrock }
81599653d4eSeschrock for (; i < xcount; i++, dst++) {
81699653d4eSeschrock *dst = 0;
81799653d4eSeschrock }
81899653d4eSeschrock
819fa9e4066Sahrens } else {
82099653d4eSeschrock for (i = 0; i < count; i++, dst++, src++) {
821f94275ceSAdam Leventhal VDEV_RAIDZ_64MUL_2(*dst, mask);
82299653d4eSeschrock *dst ^= *src;
82399653d4eSeschrock }
82499653d4eSeschrock
82599653d4eSeschrock for (; i < xcount; i++, dst++) {
826f94275ceSAdam Leventhal VDEV_RAIDZ_64MUL_2(*dst, mask);
827fa9e4066Sahrens }
828fa9e4066Sahrens }
829fa9e4066Sahrens }
830fa9e4066Sahrens
83199653d4eSeschrock src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
83299653d4eSeschrock dst = rm->rm_col[x].rc_data;
83399653d4eSeschrock exp = 255 - (rm->rm_cols - 1 - x);
83499653d4eSeschrock
83599653d4eSeschrock for (i = 0; i < xcount; i++, dst++, src++) {
83699653d4eSeschrock *dst ^= *src;
83799653d4eSeschrock for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
83899653d4eSeschrock *b = vdev_raidz_exp2(*b, exp);
83999653d4eSeschrock }
84099653d4eSeschrock }
841f94275ceSAdam Leventhal
842f94275ceSAdam Leventhal return (1 << VDEV_RAIDZ_Q);
84399653d4eSeschrock }
84499653d4eSeschrock
845f94275ceSAdam Leventhal static int
vdev_raidz_reconstruct_pq(raidz_map_t * rm,int * tgts,int ntgts)846f94275ceSAdam Leventhal vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
84799653d4eSeschrock {
84899653d4eSeschrock uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
84999653d4eSeschrock void *pdata, *qdata;
85099653d4eSeschrock uint64_t xsize, ysize, i;
851f94275ceSAdam Leventhal int x = tgts[0];
852f94275ceSAdam Leventhal int y = tgts[1];
85399653d4eSeschrock
854f94275ceSAdam Leventhal ASSERT(ntgts == 2);
85599653d4eSeschrock ASSERT(x < y);
85699653d4eSeschrock ASSERT(x >= rm->rm_firstdatacol);
85799653d4eSeschrock ASSERT(y < rm->rm_cols);
85899653d4eSeschrock
85999653d4eSeschrock ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
86099653d4eSeschrock
86199653d4eSeschrock /*
86299653d4eSeschrock * Move the parity data aside -- we're going to compute parity as
86399653d4eSeschrock * though columns x and y were full of zeros -- Pxy and Qxy. We want to
86499653d4eSeschrock * reuse the parity generation mechanism without trashing the actual
86599653d4eSeschrock * parity so we make those columns appear to be full of zeros by
86699653d4eSeschrock * setting their lengths to zero.
86799653d4eSeschrock */
86899653d4eSeschrock pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
86999653d4eSeschrock qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
87099653d4eSeschrock xsize = rm->rm_col[x].rc_size;
87199653d4eSeschrock ysize = rm->rm_col[y].rc_size;
87299653d4eSeschrock
87399653d4eSeschrock rm->rm_col[VDEV_RAIDZ_P].rc_data =
87499653d4eSeschrock zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
87599653d4eSeschrock rm->rm_col[VDEV_RAIDZ_Q].rc_data =
87699653d4eSeschrock zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
87799653d4eSeschrock rm->rm_col[x].rc_size = 0;
87899653d4eSeschrock rm->rm_col[y].rc_size = 0;
87999653d4eSeschrock
88099653d4eSeschrock vdev_raidz_generate_parity_pq(rm);
88199653d4eSeschrock
88299653d4eSeschrock rm->rm_col[x].rc_size = xsize;
88399653d4eSeschrock rm->rm_col[y].rc_size = ysize;
88499653d4eSeschrock
88599653d4eSeschrock p = pdata;
88699653d4eSeschrock q = qdata;
88799653d4eSeschrock pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
88899653d4eSeschrock qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
88999653d4eSeschrock xd = rm->rm_col[x].rc_data;
89099653d4eSeschrock yd = rm->rm_col[y].rc_data;
89199653d4eSeschrock
89299653d4eSeschrock /*
89399653d4eSeschrock * We now have:
89499653d4eSeschrock * Pxy = P + D_x + D_y
89599653d4eSeschrock * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
89699653d4eSeschrock *
89799653d4eSeschrock * We can then solve for D_x:
89899653d4eSeschrock * D_x = A * (P + Pxy) + B * (Q + Qxy)
89999653d4eSeschrock * where
90099653d4eSeschrock * A = 2^(x - y) * (2^(x - y) + 1)^-1
90199653d4eSeschrock * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
90299653d4eSeschrock *
90399653d4eSeschrock * With D_x in hand, we can easily solve for D_y:
90499653d4eSeschrock * D_y = P + Pxy + D_x
90599653d4eSeschrock */
90699653d4eSeschrock
90799653d4eSeschrock a = vdev_raidz_pow2[255 + x - y];
90899653d4eSeschrock b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
90999653d4eSeschrock tmp = 255 - vdev_raidz_log2[a ^ 1];
91099653d4eSeschrock
91199653d4eSeschrock aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
91299653d4eSeschrock bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
91399653d4eSeschrock
91499653d4eSeschrock for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
91599653d4eSeschrock *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
91699653d4eSeschrock vdev_raidz_exp2(*q ^ *qxy, bexp);
91799653d4eSeschrock
91899653d4eSeschrock if (i < ysize)
91999653d4eSeschrock *yd = *p ^ *pxy ^ *xd;
92099653d4eSeschrock }
92199653d4eSeschrock
92299653d4eSeschrock zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
92399653d4eSeschrock rm->rm_col[VDEV_RAIDZ_P].rc_size);
92499653d4eSeschrock zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
92599653d4eSeschrock rm->rm_col[VDEV_RAIDZ_Q].rc_size);
92699653d4eSeschrock
92799653d4eSeschrock /*
92899653d4eSeschrock * Restore the saved parity data.
92999653d4eSeschrock */
93099653d4eSeschrock rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
93199653d4eSeschrock rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
932f94275ceSAdam Leventhal
933f94275ceSAdam Leventhal return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
93499653d4eSeschrock }
93599653d4eSeschrock
936f94275ceSAdam Leventhal /* BEGIN CSTYLED */
937f94275ceSAdam Leventhal /*
938f94275ceSAdam Leventhal * In the general case of reconstruction, we must solve the system of linear
939f94275ceSAdam Leventhal * equations defined by the coefficients used to generate parity as well as
940f94275ceSAdam Leventhal * the contents of the data and parity disks. This can be expressed with
941f94275ceSAdam Leventhal * vectors for the original data (D) and the actual data (d) and parity (p)
942f94275ceSAdam Leventhal * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
943f94275ceSAdam Leventhal *
944f94275ceSAdam Leventhal * __ __ __ __
945f94275ceSAdam Leventhal * | | __ __ | p_0 |
946f94275ceSAdam Leventhal * | V | | D_0 | | p_m-1 |
947f94275ceSAdam Leventhal * | | x | : | = | d_0 |
948f94275ceSAdam Leventhal * | I | | D_n-1 | | : |
949f94275ceSAdam Leventhal * | | ~~ ~~ | d_n-1 |
950f94275ceSAdam Leventhal * ~~ ~~ ~~ ~~
951f94275ceSAdam Leventhal *
952f94275ceSAdam Leventhal * I is simply a square identity matrix of size n, and V is a vandermonde
953f94275ceSAdam Leventhal * matrix defined by the coefficients we chose for the various parity columns
954f94275ceSAdam Leventhal * (1, 2, 4). Note that these values were chosen for simplicity, speedy
955f94275ceSAdam Leventhal * computation, and linear separability.
956f94275ceSAdam Leventhal *
957f94275ceSAdam Leventhal * __ __ __ __
958f94275ceSAdam Leventhal * | 1 .. 1 1 1 | | p_0 |
959f94275ceSAdam Leventhal * | 2^n-1 .. 4 2 1 | __ __ | : |
960f94275ceSAdam Leventhal * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
961f94275ceSAdam Leventhal * | 1 .. 0 0 0 | | D_1 | | d_0 |
962f94275ceSAdam Leventhal * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
963f94275ceSAdam Leventhal * | : : : : | | : | | d_2 |
964f94275ceSAdam Leventhal * | 0 .. 1 0 0 | | D_n-1 | | : |
965f94275ceSAdam Leventhal * | 0 .. 0 1 0 | ~~ ~~ | : |
966f94275ceSAdam Leventhal * | 0 .. 0 0 1 | | d_n-1 |
967f94275ceSAdam Leventhal * ~~ ~~ ~~ ~~
968f94275ceSAdam Leventhal *
969f94275ceSAdam Leventhal * Note that I, V, d, and p are known. To compute D, we must invert the
970f94275ceSAdam Leventhal * matrix and use the known data and parity values to reconstruct the unknown
971f94275ceSAdam Leventhal * data values. We begin by removing the rows in V|I and d|p that correspond
972f94275ceSAdam Leventhal * to failed or missing columns; we then make V|I square (n x n) and d|p
973f94275ceSAdam Leventhal * sized n by removing rows corresponding to unused parity from the bottom up
974f94275ceSAdam Leventhal * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
975f94275ceSAdam Leventhal * using Gauss-Jordan elimination. In the example below we use m=3 parity
976f94275ceSAdam Leventhal * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
977f94275ceSAdam Leventhal * __ __
978f94275ceSAdam Leventhal * | 1 1 1 1 1 1 1 1 |
979f94275ceSAdam Leventhal * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
980f94275ceSAdam Leventhal * | 19 205 116 29 64 16 4 1 | / /
981f94275ceSAdam Leventhal * | 1 0 0 0 0 0 0 0 | / /
982f94275ceSAdam Leventhal * | 0 1 0 0 0 0 0 0 | <--' /
983f94275ceSAdam Leventhal * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
984f94275ceSAdam Leventhal * | 0 0 0 1 0 0 0 0 |
985f94275ceSAdam Leventhal * | 0 0 0 0 1 0 0 0 |
986f94275ceSAdam Leventhal * | 0 0 0 0 0 1 0 0 |
987f94275ceSAdam Leventhal * | 0 0 0 0 0 0 1 0 |
988f94275ceSAdam Leventhal * | 0 0 0 0 0 0 0 1 |
989f94275ceSAdam Leventhal * ~~ ~~
990f94275ceSAdam Leventhal * __ __
991f94275ceSAdam Leventhal * | 1 1 1 1 1 1 1 1 |
992f94275ceSAdam Leventhal * | 19 205 116 29 64 16 4 1 |
993f94275ceSAdam Leventhal * | 1 0 0 0 0 0 0 0 |
994810e43b2SBill Pijewski * (V|I)' = | 0 0 0 1 0 0 0 0 |
995f94275ceSAdam Leventhal * | 0 0 0 0 1 0 0 0 |
996f94275ceSAdam Leventhal * | 0 0 0 0 0 1 0 0 |
997f94275ceSAdam Leventhal * | 0 0 0 0 0 0 1 0 |
998f94275ceSAdam Leventhal * | 0 0 0 0 0 0 0 1 |
999f94275ceSAdam Leventhal * ~~ ~~
1000f94275ceSAdam Leventhal *
1001f94275ceSAdam Leventhal * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1002f94275ceSAdam Leventhal * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1003f94275ceSAdam Leventhal * matrix is not singular.
1004f94275ceSAdam Leventhal * __ __
1005f94275ceSAdam Leventhal * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1006f94275ceSAdam Leventhal * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1007f94275ceSAdam Leventhal * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1008f94275ceSAdam Leventhal * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1009f94275ceSAdam Leventhal * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1010f94275ceSAdam Leventhal * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1011f94275ceSAdam Leventhal * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1012f94275ceSAdam Leventhal * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1013f94275ceSAdam Leventhal * ~~ ~~
1014f94275ceSAdam Leventhal * __ __
1015f94275ceSAdam Leventhal * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1016f94275ceSAdam Leventhal * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1017f94275ceSAdam Leventhal * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1018f94275ceSAdam Leventhal * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1019f94275ceSAdam Leventhal * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1020f94275ceSAdam Leventhal * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1021f94275ceSAdam Leventhal * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1022f94275ceSAdam Leventhal * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1023f94275ceSAdam Leventhal * ~~ ~~
1024f94275ceSAdam Leventhal * __ __
1025f94275ceSAdam Leventhal * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1026f94275ceSAdam Leventhal * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1027f94275ceSAdam Leventhal * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
1028f94275ceSAdam Leventhal * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1029f94275ceSAdam Leventhal * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1030f94275ceSAdam Leventhal * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1031f94275ceSAdam Leventhal * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1032f94275ceSAdam Leventhal * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1033f94275ceSAdam Leventhal * ~~ ~~
1034f94275ceSAdam Leventhal * __ __
1035f94275ceSAdam Leventhal * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1036f94275ceSAdam Leventhal * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1037f94275ceSAdam Leventhal * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
1038f94275ceSAdam Leventhal * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1039f94275ceSAdam Leventhal * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1040f94275ceSAdam Leventhal * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1041f94275ceSAdam Leventhal * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1042f94275ceSAdam Leventhal * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1043f94275ceSAdam Leventhal * ~~ ~~
1044f94275ceSAdam Leventhal * __ __
1045f94275ceSAdam Leventhal * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1046f94275ceSAdam Leventhal * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1047f94275ceSAdam Leventhal * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1048f94275ceSAdam Leventhal * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1049f94275ceSAdam Leventhal * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1050f94275ceSAdam Leventhal * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1051f94275ceSAdam Leventhal * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1052f94275ceSAdam Leventhal * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1053f94275ceSAdam Leventhal * ~~ ~~
1054f94275ceSAdam Leventhal * __ __
1055f94275ceSAdam Leventhal * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1056f94275ceSAdam Leventhal * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
1057f94275ceSAdam Leventhal * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1058f94275ceSAdam Leventhal * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1059f94275ceSAdam Leventhal * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1060f94275ceSAdam Leventhal * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1061f94275ceSAdam Leventhal * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1062f94275ceSAdam Leventhal * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1063f94275ceSAdam Leventhal * ~~ ~~
1064f94275ceSAdam Leventhal * __ __
1065f94275ceSAdam Leventhal * | 0 0 1 0 0 0 0 0 |
1066f94275ceSAdam Leventhal * | 167 100 5 41 159 169 217 208 |
1067f94275ceSAdam Leventhal * | 166 100 4 40 158 168 216 209 |
1068f94275ceSAdam Leventhal * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
1069f94275ceSAdam Leventhal * | 0 0 0 0 1 0 0 0 |
1070f94275ceSAdam Leventhal * | 0 0 0 0 0 1 0 0 |
1071f94275ceSAdam Leventhal * | 0 0 0 0 0 0 1 0 |
1072f94275ceSAdam Leventhal * | 0 0 0 0 0 0 0 1 |
1073f94275ceSAdam Leventhal * ~~ ~~
1074f94275ceSAdam Leventhal *
1075f94275ceSAdam Leventhal * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1076f94275ceSAdam Leventhal * of the missing data.
1077f94275ceSAdam Leventhal *
1078f94275ceSAdam Leventhal * As is apparent from the example above, the only non-trivial rows in the
1079f94275ceSAdam Leventhal * inverse matrix correspond to the data disks that we're trying to
1080f94275ceSAdam Leventhal * reconstruct. Indeed, those are the only rows we need as the others would
1081f94275ceSAdam Leventhal * only be useful for reconstructing data known or assumed to be valid. For
1082f94275ceSAdam Leventhal * that reason, we only build the coefficients in the rows that correspond to
1083f94275ceSAdam Leventhal * targeted columns.
1084f94275ceSAdam Leventhal */
1085f94275ceSAdam Leventhal /* END CSTYLED */
1086f94275ceSAdam Leventhal
1087f94275ceSAdam Leventhal static void
vdev_raidz_matrix_init(raidz_map_t * rm,int n,int nmap,int * map,uint8_t ** rows)1088f94275ceSAdam Leventhal vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1089f94275ceSAdam Leventhal uint8_t **rows)
1090f94275ceSAdam Leventhal {
1091f94275ceSAdam Leventhal int i, j;
1092f94275ceSAdam Leventhal int pow;
1093f94275ceSAdam Leventhal
1094f94275ceSAdam Leventhal ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1095f94275ceSAdam Leventhal
1096f94275ceSAdam Leventhal /*
1097f94275ceSAdam Leventhal * Fill in the missing rows of interest.
1098f94275ceSAdam Leventhal */
1099f94275ceSAdam Leventhal for (i = 0; i < nmap; i++) {
1100f94275ceSAdam Leventhal ASSERT3S(0, <=, map[i]);
1101f94275ceSAdam Leventhal ASSERT3S(map[i], <=, 2);
1102f94275ceSAdam Leventhal
1103f94275ceSAdam Leventhal pow = map[i] * n;
1104f94275ceSAdam Leventhal if (pow > 255)
1105f94275ceSAdam Leventhal pow -= 255;
1106f94275ceSAdam Leventhal ASSERT(pow <= 255);
1107f94275ceSAdam Leventhal
1108f94275ceSAdam Leventhal for (j = 0; j < n; j++) {
1109f94275ceSAdam Leventhal pow -= map[i];
1110f94275ceSAdam Leventhal if (pow < 0)
1111f94275ceSAdam Leventhal pow += 255;
1112f94275ceSAdam Leventhal rows[i][j] = vdev_raidz_pow2[pow];
1113f94275ceSAdam Leventhal }
1114f94275ceSAdam Leventhal }
1115f94275ceSAdam Leventhal }
1116f94275ceSAdam Leventhal
1117f94275ceSAdam Leventhal static void
vdev_raidz_matrix_invert(raidz_map_t * rm,int n,int nmissing,int * missing,uint8_t ** rows,uint8_t ** invrows,const uint8_t * used)1118f94275ceSAdam Leventhal vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1119f94275ceSAdam Leventhal uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1120f94275ceSAdam Leventhal {
1121f94275ceSAdam Leventhal int i, j, ii, jj;
1122f94275ceSAdam Leventhal uint8_t log;
1123f94275ceSAdam Leventhal
1124f94275ceSAdam Leventhal /*
1125f94275ceSAdam Leventhal * Assert that the first nmissing entries from the array of used
1126f94275ceSAdam Leventhal * columns correspond to parity columns and that subsequent entries
1127f94275ceSAdam Leventhal * correspond to data columns.
1128f94275ceSAdam Leventhal */
1129f94275ceSAdam Leventhal for (i = 0; i < nmissing; i++) {
1130f94275ceSAdam Leventhal ASSERT3S(used[i], <, rm->rm_firstdatacol);
1131f94275ceSAdam Leventhal }
1132f94275ceSAdam Leventhal for (; i < n; i++) {
1133f94275ceSAdam Leventhal ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1134f94275ceSAdam Leventhal }
1135f94275ceSAdam Leventhal
1136f94275ceSAdam Leventhal /*
1137f94275ceSAdam Leventhal * First initialize the storage where we'll compute the inverse rows.
1138f94275ceSAdam Leventhal */
1139f94275ceSAdam Leventhal for (i = 0; i < nmissing; i++) {
1140f94275ceSAdam Leventhal for (j = 0; j < n; j++) {
1141f94275ceSAdam Leventhal invrows[i][j] = (i == j) ? 1 : 0;
1142f94275ceSAdam Leventhal }
1143f94275ceSAdam Leventhal }
1144f94275ceSAdam Leventhal
1145f94275ceSAdam Leventhal /*
1146f94275ceSAdam Leventhal * Subtract all trivial rows from the rows of consequence.
1147f94275ceSAdam Leventhal */
1148f94275ceSAdam Leventhal for (i = 0; i < nmissing; i++) {
1149f94275ceSAdam Leventhal for (j = nmissing; j < n; j++) {
1150f94275ceSAdam Leventhal ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1151f94275ceSAdam Leventhal jj = used[j] - rm->rm_firstdatacol;
1152f94275ceSAdam Leventhal ASSERT3S(jj, <, n);
1153f94275ceSAdam Leventhal invrows[i][j] = rows[i][jj];
1154f94275ceSAdam Leventhal rows[i][jj] = 0;
1155f94275ceSAdam Leventhal }
1156f94275ceSAdam Leventhal }
1157f94275ceSAdam Leventhal
1158f94275ceSAdam Leventhal /*
1159f94275ceSAdam Leventhal * For each of the rows of interest, we must normalize it and subtract
1160f94275ceSAdam Leventhal * a multiple of it from the other rows.
1161f94275ceSAdam Leventhal */
1162f94275ceSAdam Leventhal for (i = 0; i < nmissing; i++) {
1163f94275ceSAdam Leventhal for (j = 0; j < missing[i]; j++) {
1164fb09f5aaSMadhav Suresh ASSERT0(rows[i][j]);
1165f94275ceSAdam Leventhal }
1166f94275ceSAdam Leventhal ASSERT3U(rows[i][missing[i]], !=, 0);
1167f94275ceSAdam Leventhal
1168f94275ceSAdam Leventhal /*
1169f94275ceSAdam Leventhal * Compute the inverse of the first element and multiply each
1170f94275ceSAdam Leventhal * element in the row by that value.
1171f94275ceSAdam Leventhal */
1172f94275ceSAdam Leventhal log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1173f94275ceSAdam Leventhal
1174f94275ceSAdam Leventhal for (j = 0; j < n; j++) {
1175f94275ceSAdam Leventhal rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1176f94275ceSAdam Leventhal invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1177f94275ceSAdam Leventhal }
1178f94275ceSAdam Leventhal
1179f94275ceSAdam Leventhal for (ii = 0; ii < nmissing; ii++) {
1180f94275ceSAdam Leventhal if (i == ii)
1181f94275ceSAdam Leventhal continue;
1182f94275ceSAdam Leventhal
1183f94275ceSAdam Leventhal ASSERT3U(rows[ii][missing[i]], !=, 0);
1184f94275ceSAdam Leventhal
1185f94275ceSAdam Leventhal log = vdev_raidz_log2[rows[ii][missing[i]]];
1186f94275ceSAdam Leventhal
1187f94275ceSAdam Leventhal for (j = 0; j < n; j++) {
1188f94275ceSAdam Leventhal rows[ii][j] ^=
1189f94275ceSAdam Leventhal vdev_raidz_exp2(rows[i][j], log);
1190f94275ceSAdam Leventhal invrows[ii][j] ^=
1191f94275ceSAdam Leventhal vdev_raidz_exp2(invrows[i][j], log);
1192f94275ceSAdam Leventhal }
1193f94275ceSAdam Leventhal }
1194f94275ceSAdam Leventhal }
1195f94275ceSAdam Leventhal
1196f94275ceSAdam Leventhal /*
1197f94275ceSAdam Leventhal * Verify that the data that is left in the rows are properly part of
1198f94275ceSAdam Leventhal * an identity matrix.
1199f94275ceSAdam Leventhal */
1200f94275ceSAdam Leventhal for (i = 0; i < nmissing; i++) {
1201f94275ceSAdam Leventhal for (j = 0; j < n; j++) {
1202f94275ceSAdam Leventhal if (j == missing[i]) {
1203f94275ceSAdam Leventhal ASSERT3U(rows[i][j], ==, 1);
1204f94275ceSAdam Leventhal } else {
1205fb09f5aaSMadhav Suresh ASSERT0(rows[i][j]);
1206f94275ceSAdam Leventhal }
1207f94275ceSAdam Leventhal }
1208f94275ceSAdam Leventhal }
1209f94275ceSAdam Leventhal }
1210f94275ceSAdam Leventhal
1211f94275ceSAdam Leventhal static void
vdev_raidz_matrix_reconstruct(raidz_map_t * rm,int n,int nmissing,int * missing,uint8_t ** invrows,const uint8_t * used)1212f94275ceSAdam Leventhal vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1213f94275ceSAdam Leventhal int *missing, uint8_t **invrows, const uint8_t *used)
1214f94275ceSAdam Leventhal {
1215f94275ceSAdam Leventhal int i, j, x, cc, c;
1216f94275ceSAdam Leventhal uint8_t *src;
1217f94275ceSAdam Leventhal uint64_t ccount;
1218f94275ceSAdam Leventhal uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1219f94275ceSAdam Leventhal uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1220d5285caeSGeorge Wilson uint8_t log = 0;
1221d5285caeSGeorge Wilson uint8_t val;
1222f94275ceSAdam Leventhal int ll;
1223f94275ceSAdam Leventhal uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1224f94275ceSAdam Leventhal uint8_t *p, *pp;
1225f94275ceSAdam Leventhal size_t psize;
1226f94275ceSAdam Leventhal
1227f94275ceSAdam Leventhal psize = sizeof (invlog[0][0]) * n * nmissing;
1228f94275ceSAdam Leventhal p = kmem_alloc(psize, KM_SLEEP);
1229f94275ceSAdam Leventhal
1230f94275ceSAdam Leventhal for (pp = p, i = 0; i < nmissing; i++) {
1231f94275ceSAdam Leventhal invlog[i] = pp;
1232f94275ceSAdam Leventhal pp += n;
1233f94275ceSAdam Leventhal }
1234f94275ceSAdam Leventhal
1235f94275ceSAdam Leventhal for (i = 0; i < nmissing; i++) {
1236f94275ceSAdam Leventhal for (j = 0; j < n; j++) {
1237f94275ceSAdam Leventhal ASSERT3U(invrows[i][j], !=, 0);
1238f94275ceSAdam Leventhal invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1239f94275ceSAdam Leventhal }
1240f94275ceSAdam Leventhal }
1241f94275ceSAdam Leventhal
1242f94275ceSAdam Leventhal for (i = 0; i < n; i++) {
1243f94275ceSAdam Leventhal c = used[i];
1244f94275ceSAdam Leventhal ASSERT3U(c, <, rm->rm_cols);
1245f94275ceSAdam Leventhal
1246f94275ceSAdam Leventhal src = rm->rm_col[c].rc_data;
1247f94275ceSAdam Leventhal ccount = rm->rm_col[c].rc_size;
1248f94275ceSAdam Leventhal for (j = 0; j < nmissing; j++) {
1249f94275ceSAdam Leventhal cc = missing[j] + rm->rm_firstdatacol;
1250f94275ceSAdam Leventhal ASSERT3U(cc, >=, rm->rm_firstdatacol);
1251f94275ceSAdam Leventhal ASSERT3U(cc, <, rm->rm_cols);
1252f94275ceSAdam Leventhal ASSERT3U(cc, !=, c);
1253f94275ceSAdam Leventhal
1254f94275ceSAdam Leventhal dst[j] = rm->rm_col[cc].rc_data;
1255f94275ceSAdam Leventhal dcount[j] = rm->rm_col[cc].rc_size;
1256f94275ceSAdam Leventhal }
1257f94275ceSAdam Leventhal
1258f94275ceSAdam Leventhal ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1259f94275ceSAdam Leventhal
1260f94275ceSAdam Leventhal for (x = 0; x < ccount; x++, src++) {
1261f94275ceSAdam Leventhal if (*src != 0)
1262f94275ceSAdam Leventhal log = vdev_raidz_log2[*src];
1263f94275ceSAdam Leventhal
1264f94275ceSAdam Leventhal for (cc = 0; cc < nmissing; cc++) {
1265f94275ceSAdam Leventhal if (x >= dcount[cc])
1266f94275ceSAdam Leventhal continue;
1267f94275ceSAdam Leventhal
1268f94275ceSAdam Leventhal if (*src == 0) {
1269f94275ceSAdam Leventhal val = 0;
1270f94275ceSAdam Leventhal } else {
1271f94275ceSAdam Leventhal if ((ll = log + invlog[cc][i]) >= 255)
1272f94275ceSAdam Leventhal ll -= 255;
1273f94275ceSAdam Leventhal val = vdev_raidz_pow2[ll];
1274f94275ceSAdam Leventhal }
1275f94275ceSAdam Leventhal
1276f94275ceSAdam Leventhal if (i == 0)
1277f94275ceSAdam Leventhal dst[cc][x] = val;
1278f94275ceSAdam Leventhal else
1279f94275ceSAdam Leventhal dst[cc][x] ^= val;
1280f94275ceSAdam Leventhal }
1281f94275ceSAdam Leventhal }
1282f94275ceSAdam Leventhal }
1283f94275ceSAdam Leventhal
1284f94275ceSAdam Leventhal kmem_free(p, psize);
1285f94275ceSAdam Leventhal }
1286f94275ceSAdam Leventhal
1287f94275ceSAdam Leventhal static int
vdev_raidz_reconstruct_general(raidz_map_t * rm,int * tgts,int ntgts)1288f94275ceSAdam Leventhal vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1289f94275ceSAdam Leventhal {
1290f94275ceSAdam Leventhal int n, i, c, t, tt;
1291f94275ceSAdam Leventhal int nmissing_rows;
1292f94275ceSAdam Leventhal int missing_rows[VDEV_RAIDZ_MAXPARITY];
1293f94275ceSAdam Leventhal int parity_map[VDEV_RAIDZ_MAXPARITY];
1294f94275ceSAdam Leventhal
1295f94275ceSAdam Leventhal uint8_t *p, *pp;
1296f94275ceSAdam Leventhal size_t psize;
1297f94275ceSAdam Leventhal
1298f94275ceSAdam Leventhal uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1299f94275ceSAdam Leventhal uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1300f94275ceSAdam Leventhal uint8_t *used;
1301f94275ceSAdam Leventhal
1302f94275ceSAdam Leventhal int code = 0;
1303f94275ceSAdam Leventhal
1304f94275ceSAdam Leventhal
1305f94275ceSAdam Leventhal n = rm->rm_cols - rm->rm_firstdatacol;
1306f94275ceSAdam Leventhal
1307f94275ceSAdam Leventhal /*
1308f94275ceSAdam Leventhal * Figure out which data columns are missing.
1309f94275ceSAdam Leventhal */
1310f94275ceSAdam Leventhal nmissing_rows = 0;
1311f94275ceSAdam Leventhal for (t = 0; t < ntgts; t++) {
1312f94275ceSAdam Leventhal if (tgts[t] >= rm->rm_firstdatacol) {
1313f94275ceSAdam Leventhal missing_rows[nmissing_rows++] =
1314f94275ceSAdam Leventhal tgts[t] - rm->rm_firstdatacol;
1315f94275ceSAdam Leventhal }
1316f94275ceSAdam Leventhal }
1317f94275ceSAdam Leventhal
1318f94275ceSAdam Leventhal /*
1319f94275ceSAdam Leventhal * Figure out which parity columns to use to help generate the missing
1320f94275ceSAdam Leventhal * data columns.
1321f94275ceSAdam Leventhal */
1322f94275ceSAdam Leventhal for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1323f94275ceSAdam Leventhal ASSERT(tt < ntgts);
1324f94275ceSAdam Leventhal ASSERT(c < rm->rm_firstdatacol);
1325f94275ceSAdam Leventhal
1326f94275ceSAdam Leventhal /*
1327f94275ceSAdam Leventhal * Skip any targeted parity columns.
1328f94275ceSAdam Leventhal */
1329f94275ceSAdam Leventhal if (c == tgts[tt]) {
1330f94275ceSAdam Leventhal tt++;
1331f94275ceSAdam Leventhal continue;
1332f94275ceSAdam Leventhal }
1333f94275ceSAdam Leventhal
1334f94275ceSAdam Leventhal code |= 1 << c;
1335f94275ceSAdam Leventhal
1336f94275ceSAdam Leventhal parity_map[i] = c;
1337f94275ceSAdam Leventhal i++;
1338f94275ceSAdam Leventhal }
1339f94275ceSAdam Leventhal
1340f94275ceSAdam Leventhal ASSERT(code != 0);
1341f94275ceSAdam Leventhal ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1342f94275ceSAdam Leventhal
1343f94275ceSAdam Leventhal psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1344f94275ceSAdam Leventhal nmissing_rows * n + sizeof (used[0]) * n;
1345f94275ceSAdam Leventhal p = kmem_alloc(psize, KM_SLEEP);
1346f94275ceSAdam Leventhal
1347f94275ceSAdam Leventhal for (pp = p, i = 0; i < nmissing_rows; i++) {
1348f94275ceSAdam Leventhal rows[i] = pp;
1349f94275ceSAdam Leventhal pp += n;
1350f94275ceSAdam Leventhal invrows[i] = pp;
1351f94275ceSAdam Leventhal pp += n;
1352f94275ceSAdam Leventhal }
1353f94275ceSAdam Leventhal used = pp;
1354f94275ceSAdam Leventhal
1355f94275ceSAdam Leventhal for (i = 0; i < nmissing_rows; i++) {
1356f94275ceSAdam Leventhal used[i] = parity_map[i];
1357f94275ceSAdam Leventhal }
1358f94275ceSAdam Leventhal
1359f94275ceSAdam Leventhal for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1360f94275ceSAdam Leventhal if (tt < nmissing_rows &&
1361f94275ceSAdam Leventhal c == missing_rows[tt] + rm->rm_firstdatacol) {
1362f94275ceSAdam Leventhal tt++;
1363f94275ceSAdam Leventhal continue;
1364f94275ceSAdam Leventhal }
1365f94275ceSAdam Leventhal
1366f94275ceSAdam Leventhal ASSERT3S(i, <, n);
1367f94275ceSAdam Leventhal used[i] = c;
1368f94275ceSAdam Leventhal i++;
1369f94275ceSAdam Leventhal }
1370f94275ceSAdam Leventhal
1371f94275ceSAdam Leventhal /*
1372f94275ceSAdam Leventhal * Initialize the interesting rows of the matrix.
1373f94275ceSAdam Leventhal */
1374f94275ceSAdam Leventhal vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1375f94275ceSAdam Leventhal
1376f94275ceSAdam Leventhal /*
1377f94275ceSAdam Leventhal * Invert the matrix.
1378f94275ceSAdam Leventhal */
1379f94275ceSAdam Leventhal vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1380f94275ceSAdam Leventhal invrows, used);
1381f94275ceSAdam Leventhal
1382f94275ceSAdam Leventhal /*
1383f94275ceSAdam Leventhal * Reconstruct the missing data using the generated matrix.
1384f94275ceSAdam Leventhal */
1385f94275ceSAdam Leventhal vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1386f94275ceSAdam Leventhal invrows, used);
1387f94275ceSAdam Leventhal
1388f94275ceSAdam Leventhal kmem_free(p, psize);
1389f94275ceSAdam Leventhal
1390f94275ceSAdam Leventhal return (code);
1391f94275ceSAdam Leventhal }
1392f94275ceSAdam Leventhal
1393f94275ceSAdam Leventhal static int
vdev_raidz_reconstruct(raidz_map_t * rm,int * t,int nt)1394f94275ceSAdam Leventhal vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1395f94275ceSAdam Leventhal {
1396f94275ceSAdam Leventhal int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1397f94275ceSAdam Leventhal int ntgts;
1398f94275ceSAdam Leventhal int i, c;
1399f94275ceSAdam Leventhal int code;
1400f94275ceSAdam Leventhal int nbadparity, nbaddata;
1401f94275ceSAdam Leventhal int parity_valid[VDEV_RAIDZ_MAXPARITY];
1402f94275ceSAdam Leventhal
1403f94275ceSAdam Leventhal /*
1404f94275ceSAdam Leventhal * The tgts list must already be sorted.
1405f94275ceSAdam Leventhal */
1406f94275ceSAdam Leventhal for (i = 1; i < nt; i++) {
1407f94275ceSAdam Leventhal ASSERT(t[i] > t[i - 1]);
1408f94275ceSAdam Leventhal }
1409f94275ceSAdam Leventhal
1410f94275ceSAdam Leventhal nbadparity = rm->rm_firstdatacol;
1411f94275ceSAdam Leventhal nbaddata = rm->rm_cols - nbadparity;
1412f94275ceSAdam Leventhal ntgts = 0;
1413f94275ceSAdam Leventhal for (i = 0, c = 0; c < rm->rm_cols; c++) {
1414f94275ceSAdam Leventhal if (c < rm->rm_firstdatacol)
1415f94275ceSAdam Leventhal parity_valid[c] = B_FALSE;
1416f94275ceSAdam Leventhal
1417f94275ceSAdam Leventhal if (i < nt && c == t[i]) {
1418f94275ceSAdam Leventhal tgts[ntgts++] = c;
1419f94275ceSAdam Leventhal i++;
1420f94275ceSAdam Leventhal } else if (rm->rm_col[c].rc_error != 0) {
1421f94275ceSAdam Leventhal tgts[ntgts++] = c;
1422f94275ceSAdam Leventhal } else if (c >= rm->rm_firstdatacol) {
1423f94275ceSAdam Leventhal nbaddata--;
1424f94275ceSAdam Leventhal } else {
1425f94275ceSAdam Leventhal parity_valid[c] = B_TRUE;
1426f94275ceSAdam Leventhal nbadparity--;
1427f94275ceSAdam Leventhal }
1428f94275ceSAdam Leventhal }
1429f94275ceSAdam Leventhal
1430f94275ceSAdam Leventhal ASSERT(ntgts >= nt);
1431f94275ceSAdam Leventhal ASSERT(nbaddata >= 0);
1432f94275ceSAdam Leventhal ASSERT(nbaddata + nbadparity == ntgts);
1433f94275ceSAdam Leventhal
1434f94275ceSAdam Leventhal dt = &tgts[nbadparity];
1435f94275ceSAdam Leventhal
1436f94275ceSAdam Leventhal /*
1437f94275ceSAdam Leventhal * See if we can use any of our optimized reconstruction routines.
1438f94275ceSAdam Leventhal */
1439f94275ceSAdam Leventhal if (!vdev_raidz_default_to_general) {
1440f94275ceSAdam Leventhal switch (nbaddata) {
1441f94275ceSAdam Leventhal case 1:
1442f94275ceSAdam Leventhal if (parity_valid[VDEV_RAIDZ_P])
1443f94275ceSAdam Leventhal return (vdev_raidz_reconstruct_p(rm, dt, 1));
1444f94275ceSAdam Leventhal
1445f94275ceSAdam Leventhal ASSERT(rm->rm_firstdatacol > 1);
1446f94275ceSAdam Leventhal
1447f94275ceSAdam Leventhal if (parity_valid[VDEV_RAIDZ_Q])
1448f94275ceSAdam Leventhal return (vdev_raidz_reconstruct_q(rm, dt, 1));
1449f94275ceSAdam Leventhal
1450f94275ceSAdam Leventhal ASSERT(rm->rm_firstdatacol > 2);
1451f94275ceSAdam Leventhal break;
1452f94275ceSAdam Leventhal
1453f94275ceSAdam Leventhal case 2:
1454f94275ceSAdam Leventhal ASSERT(rm->rm_firstdatacol > 1);
1455f94275ceSAdam Leventhal
1456f94275ceSAdam Leventhal if (parity_valid[VDEV_RAIDZ_P] &&
1457f94275ceSAdam Leventhal parity_valid[VDEV_RAIDZ_Q])
1458f94275ceSAdam Leventhal return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1459f94275ceSAdam Leventhal
1460f94275ceSAdam Leventhal ASSERT(rm->rm_firstdatacol > 2);
1461f94275ceSAdam Leventhal
1462f94275ceSAdam Leventhal break;
1463f94275ceSAdam Leventhal }
1464f94275ceSAdam Leventhal }
1465f94275ceSAdam Leventhal
1466f94275ceSAdam Leventhal code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1467f94275ceSAdam Leventhal ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1468f94275ceSAdam Leventhal ASSERT(code > 0);
1469f94275ceSAdam Leventhal return (code);
1470f94275ceSAdam Leventhal }
147199653d4eSeschrock
1472fa9e4066Sahrens static int
vdev_raidz_open(vdev_t * vd,uint64_t * asize,uint64_t * max_asize,uint64_t * ashift)14734263d13fSGeorge Wilson vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
14744263d13fSGeorge Wilson uint64_t *ashift)
1475fa9e4066Sahrens {
1476f94275ceSAdam Leventhal vdev_t *cvd;
147799653d4eSeschrock uint64_t nparity = vd->vdev_nparity;
1478f94275ceSAdam Leventhal int c;
1479fa9e4066Sahrens int lasterror = 0;
1480fa9e4066Sahrens int numerrors = 0;
1481fa9e4066Sahrens
148299653d4eSeschrock ASSERT(nparity > 0);
148399653d4eSeschrock
148499653d4eSeschrock if (nparity > VDEV_RAIDZ_MAXPARITY ||
148599653d4eSeschrock vd->vdev_children < nparity + 1) {
1486fa9e4066Sahrens vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1487be6fd75aSMatthew Ahrens return (SET_ERROR(EINVAL));
1488fa9e4066Sahrens }
1489fa9e4066Sahrens
1490f64c0e34SEric Taylor vdev_open_children(vd);
1491fa9e4066Sahrens
1492f94275ceSAdam Leventhal for (c = 0; c < vd->vdev_children; c++) {
1493f94275ceSAdam Leventhal cvd = vd->vdev_child[c];
1494f64c0e34SEric Taylor
1495f94275ceSAdam Leventhal if (cvd->vdev_open_error != 0) {
1496f64c0e34SEric Taylor lasterror = cvd->vdev_open_error;
1497fa9e4066Sahrens numerrors++;
1498fa9e4066Sahrens continue;
1499fa9e4066Sahrens }
1500fa9e4066Sahrens
1501fa9e4066Sahrens *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
15024263d13fSGeorge Wilson *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1503ecc2d604Sbonwick *ashift = MAX(*ashift, cvd->vdev_ashift);
1504fa9e4066Sahrens }
1505fa9e4066Sahrens
1506fa9e4066Sahrens *asize *= vd->vdev_children;
15074263d13fSGeorge Wilson *max_asize *= vd->vdev_children;
1508fa9e4066Sahrens
150999653d4eSeschrock if (numerrors > nparity) {
1510fa9e4066Sahrens vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1511fa9e4066Sahrens return (lasterror);
1512fa9e4066Sahrens }
1513fa9e4066Sahrens
1514fa9e4066Sahrens return (0);
1515fa9e4066Sahrens }
1516fa9e4066Sahrens
1517fa9e4066Sahrens static void
vdev_raidz_close(vdev_t * vd)1518fa9e4066Sahrens vdev_raidz_close(vdev_t *vd)
1519fa9e4066Sahrens {
1520f94275ceSAdam Leventhal int c;
1521f94275ceSAdam Leventhal
1522f94275ceSAdam Leventhal for (c = 0; c < vd->vdev_children; c++)
1523fa9e4066Sahrens vdev_close(vd->vdev_child[c]);
1524fa9e4066Sahrens }
1525fa9e4066Sahrens
/*
 * Handle a read or write I/O to a RAID-Z dump device.
 *
 * The dump device is in a unique situation compared to other ZFS datasets:
 * writing to this device should be as simple and fast as possible. In
 * addition, durability matters much less since the dump will be extracted
 * once the machine reboots. For that reason, this function eschews parity for
 * performance and simplicity. The dump device uses the checksum setting
 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
 * dataset.
 *
 * Blocks of size 128 KB have been preallocated for this volume. I/Os less than
 * 128 KB will not fill an entire block; in addition, they may not be properly
 * aligned. In that case, this function uses the preallocated 128 KB block and
 * omits reading or writing any "empty" portions of that block, as opposed to
 * allocating a fresh appropriately-sized block.
 *
 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
 *
 *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
 *
 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
 * allocated which spans all five child vdevs. 8 KB of data would be written to
 * each of four vdevs, with the fifth containing the parity bits.
 *
 *       parity    data     data     data     data
 *     |   PP   |   XX   |   XX   |   XX   |   XX   |
 *         ^        ^        ^        ^        ^
 *         |        |        |        |        |
 *   8 KB parity    ------8 KB data blocks------
 *
 * However, when writing to the dump device, the behavior is different:
 *
 *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
 *
 * Unlike the normal RAID-Z case in which the block is allocated based on the
 * I/O size, reads and writes here always use a 128 KB logical I/O size. If the
 * I/O size is less than 128 KB, only the actual portions of data are written.
 * In this example the data is written to the third data vdev since that vdev
 * contains the offset [64 KB, 96 KB).
 *
 *       parity    data     data     data     data
 *     |        |        |        |   XX   |        |
 *                                    ^
 *                                    |
 *                             32 KB data block
 *
 * As a result, an individual I/O may not span all child vdevs; moreover, a
 * small I/O may only operate on a single child vdev.
 *
 * Note that since there are no parity bits calculated or written, this format
 * remains the same no matter how many parity bits are used in a normal RAID-Z
 * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above
 * would look like:
 *
 *       parity   parity   parity    data     data     data     data
 *     |        |        |        |        |        |   XX   |        |
 *                                                      ^
 *                                                      |
 *                                               32 KB data block
 */
1587810e43b2SBill Pijewski int
vdev_raidz_physio(vdev_t * vd,caddr_t data,size_t size,uint64_t offset,uint64_t origoffset,boolean_t doread,boolean_t isdump)1588810e43b2SBill Pijewski vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
1589810e43b2SBill Pijewski uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
1590810e43b2SBill Pijewski {
1591810e43b2SBill Pijewski vdev_t *tvd = vd->vdev_top;
1592810e43b2SBill Pijewski vdev_t *cvd;
1593810e43b2SBill Pijewski raidz_map_t *rm;
1594810e43b2SBill Pijewski raidz_col_t *rc;
1595810e43b2SBill Pijewski int c, err = 0;
1596810e43b2SBill Pijewski
1597810e43b2SBill Pijewski uint64_t start, end, colstart, colend;
1598810e43b2SBill Pijewski uint64_t coloffset, colsize, colskip;
1599810e43b2SBill Pijewski
1600810e43b2SBill Pijewski int flags = doread ? B_READ : B_WRITE;
1601810e43b2SBill Pijewski
1602810e43b2SBill Pijewski #ifdef _KERNEL
1603810e43b2SBill Pijewski
1604810e43b2SBill Pijewski /*
1605810e43b2SBill Pijewski * Don't write past the end of the block
1606810e43b2SBill Pijewski */
1607d1a98260SMatthew Ahrens VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
1608810e43b2SBill Pijewski
1609810e43b2SBill Pijewski start = offset;
1610810e43b2SBill Pijewski end = start + size;
1611810e43b2SBill Pijewski
1612810e43b2SBill Pijewski /*
1613810e43b2SBill Pijewski * Allocate a RAID-Z map for this block. Note that this block starts
1614810e43b2SBill Pijewski * from the "original" offset, this is, the offset of the extent which
1615810e43b2SBill Pijewski * contains the requisite offset of the data being read or written.
1616810e43b2SBill Pijewski *
1617810e43b2SBill Pijewski * Even if this I/O operation doesn't span the full block size, let's
1618810e43b2SBill Pijewski * treat the on-disk format as if the only blocks are the complete 128
1619810e43b2SBill Pijewski * KB size.
1620810e43b2SBill Pijewski */
1621810e43b2SBill Pijewski rm = vdev_raidz_map_alloc(data - (offset - origoffset),
1622d1a98260SMatthew Ahrens SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
1623d1a98260SMatthew Ahrens vd->vdev_children, vd->vdev_nparity);
1624810e43b2SBill Pijewski
1625810e43b2SBill Pijewski coloffset = origoffset;
1626810e43b2SBill Pijewski
1627810e43b2SBill Pijewski for (c = rm->rm_firstdatacol; c < rm->rm_cols;
1628810e43b2SBill Pijewski c++, coloffset += rc->rc_size) {
1629810e43b2SBill Pijewski rc = &rm->rm_col[c];
1630810e43b2SBill Pijewski cvd = vd->vdev_child[rc->rc_devidx];
1631810e43b2SBill Pijewski
1632810e43b2SBill Pijewski /*
1633810e43b2SBill Pijewski * Find the start and end of this column in the RAID-Z map,
1634810e43b2SBill Pijewski * keeping in mind that the stated size and offset of the
1635810e43b2SBill Pijewski * operation may not fill the entire column for this vdev.
1636810e43b2SBill Pijewski *
1637810e43b2SBill Pijewski * If any portion of the data spans this column, issue the
1638810e43b2SBill Pijewski * appropriate operation to the vdev.
1639810e43b2SBill Pijewski */
1640810e43b2SBill Pijewski if (coloffset + rc->rc_size <= start)
1641810e43b2SBill Pijewski continue;
1642810e43b2SBill Pijewski if (coloffset >= end)
1643810e43b2SBill Pijewski continue;
1644810e43b2SBill Pijewski
1645810e43b2SBill Pijewski colstart = MAX(coloffset, start);
1646810e43b2SBill Pijewski colend = MIN(end, coloffset + rc->rc_size);
1647810e43b2SBill Pijewski colsize = colend - colstart;
1648810e43b2SBill Pijewski colskip = colstart - coloffset;
1649810e43b2SBill Pijewski
1650810e43b2SBill Pijewski VERIFY3U(colsize, <=, rc->rc_size);
1651810e43b2SBill Pijewski VERIFY3U(colskip, <=, rc->rc_size);
1652810e43b2SBill Pijewski
1653810e43b2SBill Pijewski /*
1654810e43b2SBill Pijewski * Note that the child vdev will have a vdev label at the start
1655810e43b2SBill Pijewski * of its range of offsets, hence the need for
1656810e43b2SBill Pijewski * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another
1657810e43b2SBill Pijewski * example of why this calculation is needed.
1658810e43b2SBill Pijewski */
1659810e43b2SBill Pijewski if ((err = vdev_disk_physio(cvd,
1660810e43b2SBill Pijewski ((char *)rc->rc_data) + colskip, colsize,
1661810e43b2SBill Pijewski VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
1662810e43b2SBill Pijewski flags, isdump)) != 0)
1663810e43b2SBill Pijewski break;
1664810e43b2SBill Pijewski }
1665810e43b2SBill Pijewski
1666810e43b2SBill Pijewski vdev_raidz_map_free(rm);
1667810e43b2SBill Pijewski #endif /* KERNEL */
1668810e43b2SBill Pijewski
1669810e43b2SBill Pijewski return (err);
1670810e43b2SBill Pijewski }
1671810e43b2SBill Pijewski
1672fa9e4066Sahrens static uint64_t
vdev_raidz_asize(vdev_t * vd,uint64_t psize)1673fa9e4066Sahrens vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1674fa9e4066Sahrens {
1675fa9e4066Sahrens uint64_t asize;
1676ecc2d604Sbonwick uint64_t ashift = vd->vdev_top->vdev_ashift;
1677fa9e4066Sahrens uint64_t cols = vd->vdev_children;
167899653d4eSeschrock uint64_t nparity = vd->vdev_nparity;
1679fa9e4066Sahrens
1680ecc2d604Sbonwick asize = ((psize - 1) >> ashift) + 1;
168199653d4eSeschrock asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
168299653d4eSeschrock asize = roundup(asize, nparity + 1) << ashift;
1683fa9e4066Sahrens
1684fa9e4066Sahrens return (asize);
1685fa9e4066Sahrens }
1686fa9e4066Sahrens
1687fa9e4066Sahrens static void
vdev_raidz_child_done(zio_t * zio)1688fa9e4066Sahrens vdev_raidz_child_done(zio_t *zio)
1689fa9e4066Sahrens {
1690fa9e4066Sahrens raidz_col_t *rc = zio->io_private;
1691fa9e4066Sahrens
1692fa9e4066Sahrens rc->rc_error = zio->io_error;
1693fa9e4066Sahrens rc->rc_tried = 1;
1694fa9e4066Sahrens rc->rc_skipped = 0;
1695fa9e4066Sahrens }
1696fa9e4066Sahrens
16973e30c24aSWill Andrews /*
16983e30c24aSWill Andrews * Start an IO operation on a RAIDZ VDev
16993e30c24aSWill Andrews *
17003e30c24aSWill Andrews * Outline:
17013e30c24aSWill Andrews * - For write operations:
17023e30c24aSWill Andrews * 1. Generate the parity data
17033e30c24aSWill Andrews * 2. Create child zio write operations to each column's vdev, for both
17043e30c24aSWill Andrews * data and parity.
17053e30c24aSWill Andrews * 3. If the column skips any sectors for padding, create optional dummy
17063e30c24aSWill Andrews * write zio children for those areas to improve aggregation continuity.
17073e30c24aSWill Andrews * - For read operations:
17083e30c24aSWill Andrews * 1. Create child zio read operations to each data column's vdev to read
17093e30c24aSWill Andrews * the range of data required for zio.
17103e30c24aSWill Andrews * 2. If this is a scrub or resilver operation, or if any of the data
17113e30c24aSWill Andrews * vdevs have had errors, then create zio read operations to the parity
17123e30c24aSWill Andrews * columns' VDevs as well.
17133e30c24aSWill Andrews */
1714efe6bf49SGeorge Wilson static void
vdev_raidz_io_start(zio_t * zio)1715fa9e4066Sahrens vdev_raidz_io_start(zio_t *zio)
1716fa9e4066Sahrens {
1717fa9e4066Sahrens vdev_t *vd = zio->io_vd;
1718ecc2d604Sbonwick vdev_t *tvd = vd->vdev_top;
1719fa9e4066Sahrens vdev_t *cvd;
1720fa9e4066Sahrens raidz_map_t *rm;
1721fa9e4066Sahrens raidz_col_t *rc;
1722f94275ceSAdam Leventhal int c, i;
1723fa9e4066Sahrens
1724810e43b2SBill Pijewski rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
1725810e43b2SBill Pijewski tvd->vdev_ashift, vd->vdev_children,
172699653d4eSeschrock vd->vdev_nparity);
1727fa9e4066Sahrens
1728810e43b2SBill Pijewski zio->io_vsd = rm;
1729810e43b2SBill Pijewski zio->io_vsd_ops = &vdev_raidz_vsd_ops;
1730810e43b2SBill Pijewski
173144cd46caSbillm ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1732fa9e4066Sahrens
1733fa9e4066Sahrens if (zio->io_type == ZIO_TYPE_WRITE) {
1734f94275ceSAdam Leventhal vdev_raidz_generate_parity(rm);
1735fa9e4066Sahrens
1736fa9e4066Sahrens for (c = 0; c < rm->rm_cols; c++) {
1737fa9e4066Sahrens rc = &rm->rm_col[c];
173899653d4eSeschrock cvd = vd->vdev_child[rc->rc_devidx];
1739fa9e4066Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1740fa9e4066Sahrens rc->rc_offset, rc->rc_data, rc->rc_size,
1741e14bb325SJeff Bonwick zio->io_type, zio->io_priority, 0,
1742fa9e4066Sahrens vdev_raidz_child_done, rc));
1743fa9e4066Sahrens }
1744e05725b1Sbonwick
1745f94275ceSAdam Leventhal /*
1746f94275ceSAdam Leventhal * Generate optional I/Os for any skipped sectors to improve
1747f94275ceSAdam Leventhal * aggregation contiguity.
1748f94275ceSAdam Leventhal */
17492fbc121fSAdam Leventhal for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1750f94275ceSAdam Leventhal ASSERT(c <= rm->rm_scols);
1751f94275ceSAdam Leventhal if (c == rm->rm_scols)
1752f94275ceSAdam Leventhal c = 0;
1753f94275ceSAdam Leventhal rc = &rm->rm_col[c];
1754f94275ceSAdam Leventhal cvd = vd->vdev_child[rc->rc_devidx];
1755f94275ceSAdam Leventhal zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1756f94275ceSAdam Leventhal rc->rc_offset + rc->rc_size, NULL,
1757f94275ceSAdam Leventhal 1 << tvd->vdev_ashift,
1758f94275ceSAdam Leventhal zio->io_type, zio->io_priority,
1759f94275ceSAdam Leventhal ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1760f94275ceSAdam Leventhal }
1761f94275ceSAdam Leventhal
1762efe6bf49SGeorge Wilson zio_execute(zio);
1763efe6bf49SGeorge Wilson return;
1764fa9e4066Sahrens }
1765fa9e4066Sahrens
1766fa9e4066Sahrens ASSERT(zio->io_type == ZIO_TYPE_READ);
1767fa9e4066Sahrens
176899653d4eSeschrock /*
176999653d4eSeschrock * Iterate over the columns in reverse order so that we hit the parity
1770f94275ceSAdam Leventhal * last -- any errors along the way will force us to read the parity.
177199653d4eSeschrock */
1772fa9e4066Sahrens for (c = rm->rm_cols - 1; c >= 0; c--) {
1773fa9e4066Sahrens rc = &rm->rm_col[c];
177499653d4eSeschrock cvd = vd->vdev_child[rc->rc_devidx];
1775*c3a1418dSArne Jansen if (cvd->vdev_avoid_read) {
1776*c3a1418dSArne Jansen if (c >= rm->rm_firstdatacol)
1777*c3a1418dSArne Jansen rm->rm_missingdata++;
1778*c3a1418dSArne Jansen else
1779*c3a1418dSArne Jansen rm->rm_missingparity++;
1780*c3a1418dSArne Jansen rc->rc_error = SET_ERROR(ENXIO);
1781*c3a1418dSArne Jansen rc->rc_skipped = 1; /* only try if necessary */
1782*c3a1418dSArne Jansen continue;
1783*c3a1418dSArne Jansen }
17840a4e9518Sgw25295 if (!vdev_readable(cvd)) {
178599653d4eSeschrock if (c >= rm->rm_firstdatacol)
178699653d4eSeschrock rm->rm_missingdata++;
178799653d4eSeschrock else
178899653d4eSeschrock rm->rm_missingparity++;
1789be6fd75aSMatthew Ahrens rc->rc_error = SET_ERROR(ENXIO);
1790fa9e4066Sahrens rc->rc_tried = 1; /* don't even try */
1791fa9e4066Sahrens rc->rc_skipped = 1;
1792fa9e4066Sahrens continue;
1793fa9e4066Sahrens }
1794b24ab676SJeff Bonwick if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
179599653d4eSeschrock if (c >= rm->rm_firstdatacol)
179699653d4eSeschrock rm->rm_missingdata++;
179799653d4eSeschrock else
179899653d4eSeschrock rm->rm_missingparity++;
1799be6fd75aSMatthew Ahrens rc->rc_error = SET_ERROR(ESTALE);
1800fa9e4066Sahrens rc->rc_skipped = 1;
1801fa9e4066Sahrens continue;
1802fa9e4066Sahrens }
180399653d4eSeschrock if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1804dfd80e3eSMark J Musante (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1805fa9e4066Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1806fa9e4066Sahrens rc->rc_offset, rc->rc_data, rc->rc_size,
1807e14bb325SJeff Bonwick zio->io_type, zio->io_priority, 0,
1808fa9e4066Sahrens vdev_raidz_child_done, rc));
1809fa9e4066Sahrens }
1810fa9e4066Sahrens }
1811fa9e4066Sahrens
1812efe6bf49SGeorge Wilson zio_execute(zio);
1813fa9e4066Sahrens }
1814fa9e4066Sahrens
18153f9d6ad7SLin Ling
1816ea8dc4b6Seschrock /*
1817ea8dc4b6Seschrock * Report a checksum error for a child of a RAID-Z device.
1818ea8dc4b6Seschrock */
1819ea8dc4b6Seschrock static void
raidz_checksum_error(zio_t * zio,raidz_col_t * rc,void * bad_data)182022fe2c88SJonathan Adams raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1821ea8dc4b6Seschrock {
182299653d4eSeschrock vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1823ea8dc4b6Seschrock
1824ea8dc4b6Seschrock if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
182522fe2c88SJonathan Adams zio_bad_cksum_t zbc;
182622fe2c88SJonathan Adams raidz_map_t *rm = zio->io_vsd;
182722fe2c88SJonathan Adams
1828ea8dc4b6Seschrock mutex_enter(&vd->vdev_stat_lock);
1829ea8dc4b6Seschrock vd->vdev_stat.vs_checksum_errors++;
1830ea8dc4b6Seschrock mutex_exit(&vd->vdev_stat_lock);
183122fe2c88SJonathan Adams
183222fe2c88SJonathan Adams zbc.zbc_has_cksum = 0;
183322fe2c88SJonathan Adams zbc.zbc_injected = rm->rm_ecksuminjected;
183422fe2c88SJonathan Adams
183522fe2c88SJonathan Adams zfs_ereport_post_checksum(zio->io_spa, vd, zio,
183622fe2c88SJonathan Adams rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
183722fe2c88SJonathan Adams &zbc);
183822fe2c88SJonathan Adams }
1839ea8dc4b6Seschrock }
1840ea8dc4b6Seschrock
184122fe2c88SJonathan Adams /*
184222fe2c88SJonathan Adams * We keep track of whether or not there were any injected errors, so that
184322fe2c88SJonathan Adams * any ereports we generate can note it.
184422fe2c88SJonathan Adams */
184522fe2c88SJonathan Adams static int
raidz_checksum_verify(zio_t * zio)184622fe2c88SJonathan Adams raidz_checksum_verify(zio_t *zio)
184722fe2c88SJonathan Adams {
184822fe2c88SJonathan Adams zio_bad_cksum_t zbc;
184922fe2c88SJonathan Adams raidz_map_t *rm = zio->io_vsd;
185022fe2c88SJonathan Adams
185122fe2c88SJonathan Adams int ret = zio_checksum_error(zio, &zbc);
185222fe2c88SJonathan Adams if (ret != 0 && zbc.zbc_injected != 0)
185322fe2c88SJonathan Adams rm->rm_ecksuminjected = 1;
185422fe2c88SJonathan Adams
185522fe2c88SJonathan Adams return (ret);
1856ea8dc4b6Seschrock }
1857ea8dc4b6Seschrock
185899653d4eSeschrock /*
185999653d4eSeschrock * Generate the parity from the data columns. If we tried and were able to
186099653d4eSeschrock * read the parity without error, verify that the generated parity matches the
186199653d4eSeschrock * data we read. If it doesn't, we fire off a checksum error. Return the
186299653d4eSeschrock * number such failures.
186399653d4eSeschrock */
186499653d4eSeschrock static int
raidz_parity_verify(zio_t * zio,raidz_map_t * rm)186599653d4eSeschrock raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
186699653d4eSeschrock {
186799653d4eSeschrock void *orig[VDEV_RAIDZ_MAXPARITY];
186899653d4eSeschrock int c, ret = 0;
186999653d4eSeschrock raidz_col_t *rc;
187099653d4eSeschrock
1871810e43b2SBill Pijewski blkptr_t *bp = zio->io_bp;
1872810e43b2SBill Pijewski enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
1873810e43b2SBill Pijewski (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
1874810e43b2SBill Pijewski
1875810e43b2SBill Pijewski if (checksum == ZIO_CHECKSUM_NOPARITY)
1876810e43b2SBill Pijewski return (ret);
1877810e43b2SBill Pijewski
187899653d4eSeschrock for (c = 0; c < rm->rm_firstdatacol; c++) {
187999653d4eSeschrock rc = &rm->rm_col[c];
188099653d4eSeschrock if (!rc->rc_tried || rc->rc_error != 0)
188199653d4eSeschrock continue;
188299653d4eSeschrock orig[c] = zio_buf_alloc(rc->rc_size);
188399653d4eSeschrock bcopy(rc->rc_data, orig[c], rc->rc_size);
188499653d4eSeschrock }
188599653d4eSeschrock
1886f94275ceSAdam Leventhal vdev_raidz_generate_parity(rm);
188799653d4eSeschrock
188899653d4eSeschrock for (c = 0; c < rm->rm_firstdatacol; c++) {
188999653d4eSeschrock rc = &rm->rm_col[c];
189099653d4eSeschrock if (!rc->rc_tried || rc->rc_error != 0)
189199653d4eSeschrock continue;
189299653d4eSeschrock if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
189322fe2c88SJonathan Adams raidz_checksum_error(zio, rc, orig[c]);
1894be6fd75aSMatthew Ahrens rc->rc_error = SET_ERROR(ECKSUM);
189599653d4eSeschrock ret++;
189699653d4eSeschrock }
189799653d4eSeschrock zio_buf_free(orig[c], rc->rc_size);
189899653d4eSeschrock }
189999653d4eSeschrock
190099653d4eSeschrock return (ret);
190199653d4eSeschrock }
190299653d4eSeschrock
1903f94275ceSAdam Leventhal /*
1904f94275ceSAdam Leventhal * Keep statistics on all the ways that we used parity to correct data.
1905f94275ceSAdam Leventhal */
1906f94275ceSAdam Leventhal static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1907ea8dc4b6Seschrock
1908e05725b1Sbonwick static int
vdev_raidz_worst_error(raidz_map_t * rm)1909e14bb325SJeff Bonwick vdev_raidz_worst_error(raidz_map_t *rm)
1910e14bb325SJeff Bonwick {
1911e14bb325SJeff Bonwick int error = 0;
1912e14bb325SJeff Bonwick
1913e14bb325SJeff Bonwick for (int c = 0; c < rm->rm_cols; c++)
1914e14bb325SJeff Bonwick error = zio_worst_error(error, rm->rm_col[c].rc_error);
1915e14bb325SJeff Bonwick
1916e14bb325SJeff Bonwick return (error);
1917e14bb325SJeff Bonwick }
1918e14bb325SJeff Bonwick
1919f94275ceSAdam Leventhal /*
1920f94275ceSAdam Leventhal * Iterate over all combinations of bad data and attempt a reconstruction.
1921f94275ceSAdam Leventhal * Note that the algorithm below is non-optimal because it doesn't take into
1922f94275ceSAdam Leventhal * account how reconstruction is actually performed. For example, with
1923f94275ceSAdam Leventhal * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1924f94275ceSAdam Leventhal * is targeted as invalid as if columns 1 and 4 are targeted since in both
1925f94275ceSAdam Leventhal * cases we'd only use parity information in column 0.
1926f94275ceSAdam Leventhal */
1927f94275ceSAdam Leventhal static int
vdev_raidz_combrec(zio_t * zio,int total_errors,int data_errors)1928f94275ceSAdam Leventhal vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1929f94275ceSAdam Leventhal {
1930f94275ceSAdam Leventhal raidz_map_t *rm = zio->io_vsd;
1931f94275ceSAdam Leventhal raidz_col_t *rc;
1932f94275ceSAdam Leventhal void *orig[VDEV_RAIDZ_MAXPARITY];
1933f94275ceSAdam Leventhal int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1934f94275ceSAdam Leventhal int *tgts = &tstore[1];
1935f94275ceSAdam Leventhal int current, next, i, c, n;
1936f94275ceSAdam Leventhal int code, ret = 0;
1937f94275ceSAdam Leventhal
1938f94275ceSAdam Leventhal ASSERT(total_errors < rm->rm_firstdatacol);
1939f94275ceSAdam Leventhal
1940f94275ceSAdam Leventhal /*
1941f94275ceSAdam Leventhal * This simplifies one edge condition.
1942f94275ceSAdam Leventhal */
1943f94275ceSAdam Leventhal tgts[-1] = -1;
1944f94275ceSAdam Leventhal
1945f94275ceSAdam Leventhal for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1946f94275ceSAdam Leventhal /*
1947f94275ceSAdam Leventhal * Initialize the targets array by finding the first n columns
1948f94275ceSAdam Leventhal * that contain no error.
1949f94275ceSAdam Leventhal *
1950f94275ceSAdam Leventhal * If there were no data errors, we need to ensure that we're
1951f94275ceSAdam Leventhal * always explicitly attempting to reconstruct at least one
1952f94275ceSAdam Leventhal * data column. To do this, we simply push the highest target
1953f94275ceSAdam Leventhal * up into the data columns.
1954f94275ceSAdam Leventhal */
1955f94275ceSAdam Leventhal for (c = 0, i = 0; i < n; i++) {
1956f94275ceSAdam Leventhal if (i == n - 1 && data_errors == 0 &&
1957f94275ceSAdam Leventhal c < rm->rm_firstdatacol) {
1958f94275ceSAdam Leventhal c = rm->rm_firstdatacol;
1959f94275ceSAdam Leventhal }
1960f94275ceSAdam Leventhal
1961f94275ceSAdam Leventhal while (rm->rm_col[c].rc_error != 0) {
1962f94275ceSAdam Leventhal c++;
1963f94275ceSAdam Leventhal ASSERT3S(c, <, rm->rm_cols);
1964f94275ceSAdam Leventhal }
1965f94275ceSAdam Leventhal
1966f94275ceSAdam Leventhal tgts[i] = c++;
1967f94275ceSAdam Leventhal }
1968f94275ceSAdam Leventhal
1969f94275ceSAdam Leventhal /*
1970f94275ceSAdam Leventhal * Setting tgts[n] simplifies the other edge condition.
1971f94275ceSAdam Leventhal */
1972f94275ceSAdam Leventhal tgts[n] = rm->rm_cols;
1973f94275ceSAdam Leventhal
1974f94275ceSAdam Leventhal /*
1975f94275ceSAdam Leventhal * These buffers were allocated in previous iterations.
1976f94275ceSAdam Leventhal */
1977f94275ceSAdam Leventhal for (i = 0; i < n - 1; i++) {
1978f94275ceSAdam Leventhal ASSERT(orig[i] != NULL);
1979f94275ceSAdam Leventhal }
1980f94275ceSAdam Leventhal
1981f94275ceSAdam Leventhal orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
1982f94275ceSAdam Leventhal
1983f94275ceSAdam Leventhal current = 0;
1984f94275ceSAdam Leventhal next = tgts[current];
1985f94275ceSAdam Leventhal
1986f94275ceSAdam Leventhal while (current != n) {
1987f94275ceSAdam Leventhal tgts[current] = next;
1988f94275ceSAdam Leventhal current = 0;
1989f94275ceSAdam Leventhal
1990f94275ceSAdam Leventhal /*
1991f94275ceSAdam Leventhal * Save off the original data that we're going to
1992f94275ceSAdam Leventhal * attempt to reconstruct.
1993f94275ceSAdam Leventhal */
1994f94275ceSAdam Leventhal for (i = 0; i < n; i++) {
1995f94275ceSAdam Leventhal ASSERT(orig[i] != NULL);
1996f94275ceSAdam Leventhal c = tgts[i];
1997f94275ceSAdam Leventhal ASSERT3S(c, >=, 0);
1998f94275ceSAdam Leventhal ASSERT3S(c, <, rm->rm_cols);
1999f94275ceSAdam Leventhal rc = &rm->rm_col[c];
2000f94275ceSAdam Leventhal bcopy(rc->rc_data, orig[i], rc->rc_size);
2001f94275ceSAdam Leventhal }
2002f94275ceSAdam Leventhal
2003f94275ceSAdam Leventhal /*
2004f94275ceSAdam Leventhal * Attempt a reconstruction and exit the outer loop on
2005f94275ceSAdam Leventhal * success.
2006f94275ceSAdam Leventhal */
2007f94275ceSAdam Leventhal code = vdev_raidz_reconstruct(rm, tgts, n);
200822fe2c88SJonathan Adams if (raidz_checksum_verify(zio) == 0) {
2009f94275ceSAdam Leventhal atomic_inc_64(&raidz_corrected[code]);
2010f94275ceSAdam Leventhal
2011f94275ceSAdam Leventhal for (i = 0; i < n; i++) {
2012f94275ceSAdam Leventhal c = tgts[i];
2013f94275ceSAdam Leventhal rc = &rm->rm_col[c];
2014f94275ceSAdam Leventhal ASSERT(rc->rc_error == 0);
201522fe2c88SJonathan Adams if (rc->rc_tried)
201622fe2c88SJonathan Adams raidz_checksum_error(zio, rc,
201722fe2c88SJonathan Adams orig[i]);
2018be6fd75aSMatthew Ahrens rc->rc_error = SET_ERROR(ECKSUM);
2019f94275ceSAdam Leventhal }
2020f94275ceSAdam Leventhal
2021f94275ceSAdam Leventhal ret = code;
2022f94275ceSAdam Leventhal goto done;
2023f94275ceSAdam Leventhal }
2024f94275ceSAdam Leventhal
2025f94275ceSAdam Leventhal /*
2026f94275ceSAdam Leventhal * Restore the original data.
2027f94275ceSAdam Leventhal */
2028f94275ceSAdam Leventhal for (i = 0; i < n; i++) {
2029f94275ceSAdam Leventhal c = tgts[i];
2030f94275ceSAdam Leventhal rc = &rm->rm_col[c];
2031f94275ceSAdam Leventhal bcopy(orig[i], rc->rc_data, rc->rc_size);
2032f94275ceSAdam Leventhal }
2033f94275ceSAdam Leventhal
2034f94275ceSAdam Leventhal do {
2035f94275ceSAdam Leventhal /*
2036f94275ceSAdam Leventhal * Find the next valid column after the current
2037f94275ceSAdam Leventhal * position..
2038f94275ceSAdam Leventhal */
2039f94275ceSAdam Leventhal for (next = tgts[current] + 1;
2040f94275ceSAdam Leventhal next < rm->rm_cols &&
2041f94275ceSAdam Leventhal rm->rm_col[next].rc_error != 0; next++)
2042f94275ceSAdam Leventhal continue;
2043f94275ceSAdam Leventhal
2044f94275ceSAdam Leventhal ASSERT(next <= tgts[current + 1]);
2045f94275ceSAdam Leventhal
2046f94275ceSAdam Leventhal /*
2047f94275ceSAdam Leventhal * If that spot is available, we're done here.
2048f94275ceSAdam Leventhal */
2049f94275ceSAdam Leventhal if (next != tgts[current + 1])
2050f94275ceSAdam Leventhal break;
2051f94275ceSAdam Leventhal
2052f94275ceSAdam Leventhal /*
2053f94275ceSAdam Leventhal * Otherwise, find the next valid column after
2054f94275ceSAdam Leventhal * the previous position.
2055f94275ceSAdam Leventhal */
2056f94275ceSAdam Leventhal for (c = tgts[current - 1] + 1;
2057f94275ceSAdam Leventhal rm->rm_col[c].rc_error != 0; c++)
2058f94275ceSAdam Leventhal continue;
2059f94275ceSAdam Leventhal
2060f94275ceSAdam Leventhal tgts[current] = c;
2061f94275ceSAdam Leventhal current++;
2062f94275ceSAdam Leventhal
2063f94275ceSAdam Leventhal } while (current != n);
2064f94275ceSAdam Leventhal }
2065f94275ceSAdam Leventhal }
2066f94275ceSAdam Leventhal n--;
2067f94275ceSAdam Leventhal done:
2068f94275ceSAdam Leventhal for (i = 0; i < n; i++) {
2069f94275ceSAdam Leventhal zio_buf_free(orig[i], rm->rm_col[0].rc_size);
2070f94275ceSAdam Leventhal }
2071f94275ceSAdam Leventhal
2072f94275ceSAdam Leventhal return (ret);
2073f94275ceSAdam Leventhal }
2074f94275ceSAdam Leventhal
20753e30c24aSWill Andrews /*
20763e30c24aSWill Andrews * Complete an IO operation on a RAIDZ VDev
20773e30c24aSWill Andrews *
20783e30c24aSWill Andrews * Outline:
20793e30c24aSWill Andrews * - For write operations:
20803e30c24aSWill Andrews * 1. Check for errors on the child IOs.
20813e30c24aSWill Andrews * 2. Return, setting an error code if too few child VDevs were written
20823e30c24aSWill Andrews * to reconstruct the data later. Note that partial writes are
20833e30c24aSWill Andrews * considered successful if they can be reconstructed at all.
20843e30c24aSWill Andrews * - For read operations:
20853e30c24aSWill Andrews * 1. Check for errors on the child IOs.
20863e30c24aSWill Andrews * 2. If data errors occurred:
20873e30c24aSWill Andrews * a. Try to reassemble the data from the parity available.
20883e30c24aSWill Andrews * b. If we haven't yet read the parity drives, read them now.
20893e30c24aSWill Andrews * c. If all parity drives have been read but the data still doesn't
20903e30c24aSWill Andrews * reassemble with a correct checksum, then try combinatorial
20913e30c24aSWill Andrews * reconstruction.
20923e30c24aSWill Andrews * d. If that doesn't work, return an error.
20933e30c24aSWill Andrews * 3. If there were unexpected errors or this is a resilver operation,
20943e30c24aSWill Andrews * rewrite the vdevs that had errors.
20953e30c24aSWill Andrews */
2096e14bb325SJeff Bonwick static void
vdev_raidz_io_done(zio_t * zio)2097fa9e4066Sahrens vdev_raidz_io_done(zio_t *zio)
2098fa9e4066Sahrens {
2099fa9e4066Sahrens vdev_t *vd = zio->io_vd;
2100fa9e4066Sahrens vdev_t *cvd;
2101fa9e4066Sahrens raidz_map_t *rm = zio->io_vsd;
2102f94275ceSAdam Leventhal raidz_col_t *rc;
2103fa9e4066Sahrens int unexpected_errors = 0;
210499653d4eSeschrock int parity_errors = 0;
2105c7a40cc4Sahl int parity_untried = 0;
210699653d4eSeschrock int data_errors = 0;
2107e14bb325SJeff Bonwick int total_errors = 0;
2108f94275ceSAdam Leventhal int n, c;
2109f94275ceSAdam Leventhal int tgts[VDEV_RAIDZ_MAXPARITY];
2110f94275ceSAdam Leventhal int code;
2111fa9e4066Sahrens
211244cd46caSbillm ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
2113fa9e4066Sahrens
211499653d4eSeschrock ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
211599653d4eSeschrock ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
211699653d4eSeschrock
2117fa9e4066Sahrens for (c = 0; c < rm->rm_cols; c++) {
2118fa9e4066Sahrens rc = &rm->rm_col[c];
2119fa9e4066Sahrens
2120fa9e4066Sahrens if (rc->rc_error) {
2121e14bb325SJeff Bonwick ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
212299653d4eSeschrock
212399653d4eSeschrock if (c < rm->rm_firstdatacol)
212499653d4eSeschrock parity_errors++;
212599653d4eSeschrock else
212699653d4eSeschrock data_errors++;
212799653d4eSeschrock
2128fa9e4066Sahrens if (!rc->rc_skipped)
2129fa9e4066Sahrens unexpected_errors++;
213099653d4eSeschrock
2131e14bb325SJeff Bonwick total_errors++;
2132c7a40cc4Sahl } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
2133c7a40cc4Sahl parity_untried++;
2134fa9e4066Sahrens }
2135fa9e4066Sahrens }
2136fa9e4066Sahrens
2137fa9e4066Sahrens if (zio->io_type == ZIO_TYPE_WRITE) {
2138fa9e4066Sahrens /*
2139e14bb325SJeff Bonwick * XXX -- for now, treat partial writes as a success.
2140e14bb325SJeff Bonwick * (If we couldn't write enough columns to reconstruct
2141e14bb325SJeff Bonwick * the data, the I/O failed. Otherwise, good enough.)
2142e14bb325SJeff Bonwick *
2143e14bb325SJeff Bonwick * Now that we support write reallocation, it would be better
2144e14bb325SJeff Bonwick * to treat partial failure as real failure unless there are
2145e14bb325SJeff Bonwick * no non-degraded top-level vdevs left, and not update DTLs
2146e14bb325SJeff Bonwick * if we intend to reallocate.
2147fa9e4066Sahrens */
2148fa9e4066Sahrens /* XXPOLICY */
2149e14bb325SJeff Bonwick if (total_errors > rm->rm_firstdatacol)
2150e14bb325SJeff Bonwick zio->io_error = vdev_raidz_worst_error(rm);
2151fa9e4066Sahrens
2152e14bb325SJeff Bonwick return;
2153fa9e4066Sahrens }
2154fa9e4066Sahrens
2155fa9e4066Sahrens ASSERT(zio->io_type == ZIO_TYPE_READ);
215699653d4eSeschrock /*
215799653d4eSeschrock * There are three potential phases for a read:
215899653d4eSeschrock * 1. produce valid data from the columns read
215999653d4eSeschrock * 2. read all disks and try again
216099653d4eSeschrock * 3. perform combinatorial reconstruction
216199653d4eSeschrock *
216299653d4eSeschrock * Each phase is progressively both more expensive and less likely to
216399653d4eSeschrock * occur. If we encounter more errors than we can repair or all phases
216499653d4eSeschrock * fail, we have no choice but to return an error.
216599653d4eSeschrock */
2166fa9e4066Sahrens
2167fa9e4066Sahrens /*
216899653d4eSeschrock * If the number of errors we saw was correctable -- less than or equal
2169c7a40cc4Sahl * to the number of parity disks read -- attempt to produce data that
2170c7a40cc4Sahl * has a valid checksum. Naturally, this case applies in the absence of
2171c7a40cc4Sahl * any errors.
2172fa9e4066Sahrens */
2173e14bb325SJeff Bonwick if (total_errors <= rm->rm_firstdatacol - parity_untried) {
2174f94275ceSAdam Leventhal if (data_errors == 0) {
217522fe2c88SJonathan Adams if (raidz_checksum_verify(zio) == 0) {
2176d427dcb0Sahl /*
2177d427dcb0Sahl * If we read parity information (unnecessarily
2178d427dcb0Sahl * as it happens since no reconstruction was
2179d427dcb0Sahl * needed) regenerate and verify the parity.
2180d427dcb0Sahl * We also regenerate parity when resilvering
2181d427dcb0Sahl * so we can write it out to the failed device
2182d427dcb0Sahl * later.
2183d427dcb0Sahl */
2184c7a40cc4Sahl if (parity_errors + parity_untried <
2185d427dcb0Sahl rm->rm_firstdatacol ||
2186d427dcb0Sahl (zio->io_flags & ZIO_FLAG_RESILVER)) {
218799653d4eSeschrock n = raidz_parity_verify(zio, rm);
218899653d4eSeschrock unexpected_errors += n;
218999653d4eSeschrock ASSERT(parity_errors + n <=
219099653d4eSeschrock rm->rm_firstdatacol);
2191c7a40cc4Sahl }
2192fa9e4066Sahrens goto done;
2193fa9e4066Sahrens }
2194f94275ceSAdam Leventhal } else {
2195c7a40cc4Sahl /*
2196c7a40cc4Sahl * We either attempt to read all the parity columns or
2197c7a40cc4Sahl * none of them. If we didn't try to read parity, we
2198c7a40cc4Sahl * wouldn't be here in the correctable case. There must
2199c7a40cc4Sahl * also have been fewer parity errors than parity
2200c7a40cc4Sahl * columns or, again, we wouldn't be in this code path.
2201c7a40cc4Sahl */
2202c7a40cc4Sahl ASSERT(parity_untried == 0);
220399653d4eSeschrock ASSERT(parity_errors < rm->rm_firstdatacol);
220499653d4eSeschrock
220599653d4eSeschrock /*
2206f94275ceSAdam Leventhal * Identify the data columns that reported an error.
220799653d4eSeschrock */
2208f94275ceSAdam Leventhal n = 0;
220999653d4eSeschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
221099653d4eSeschrock rc = &rm->rm_col[c];
2211f94275ceSAdam Leventhal if (rc->rc_error != 0) {
2212f94275ceSAdam Leventhal ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2213f94275ceSAdam Leventhal tgts[n++] = c;
221499653d4eSeschrock }
2215f94275ceSAdam Leventhal }
221699653d4eSeschrock
2217f94275ceSAdam Leventhal ASSERT(rm->rm_firstdatacol >= n);
2218f94275ceSAdam Leventhal
2219f94275ceSAdam Leventhal code = vdev_raidz_reconstruct(rm, tgts, n);
222099653d4eSeschrock
222122fe2c88SJonathan Adams if (raidz_checksum_verify(zio) == 0) {
2222f94275ceSAdam Leventhal atomic_inc_64(&raidz_corrected[code]);
222399653d4eSeschrock
222499653d4eSeschrock /*
2225f94275ceSAdam Leventhal * If we read more parity disks than were used
2226f94275ceSAdam Leventhal * for reconstruction, confirm that the other
2227f94275ceSAdam Leventhal * parity disks produced correct data. This
2228f94275ceSAdam Leventhal * routine is suboptimal in that it regenerates
2229f94275ceSAdam Leventhal * the parity that we already used in addition
2230f94275ceSAdam Leventhal * to the parity that we're attempting to
2231f94275ceSAdam Leventhal * verify, but this should be a relatively
2232f94275ceSAdam Leventhal * uncommon case, and can be optimized if it
2233f94275ceSAdam Leventhal * becomes a problem. Note that we regenerate
2234f94275ceSAdam Leventhal * parity when resilvering so we can write it
2235f94275ceSAdam Leventhal * out to failed devices later.
223699653d4eSeschrock */
2237f94275ceSAdam Leventhal if (parity_errors < rm->rm_firstdatacol - n ||
2238d427dcb0Sahl (zio->io_flags & ZIO_FLAG_RESILVER)) {
223999653d4eSeschrock n = raidz_parity_verify(zio, rm);
224099653d4eSeschrock unexpected_errors += n;
224199653d4eSeschrock ASSERT(parity_errors + n <=
224299653d4eSeschrock rm->rm_firstdatacol);
224399653d4eSeschrock }
224499653d4eSeschrock
224599653d4eSeschrock goto done;
224699653d4eSeschrock }
224799653d4eSeschrock }
2248fa9e4066Sahrens }
2249fa9e4066Sahrens
2250fa9e4066Sahrens /*
225199653d4eSeschrock * This isn't a typical situation -- either we got a read error or
225299653d4eSeschrock * a child silently returned bad data. Read every block so we can
225399653d4eSeschrock * try again with as much data and parity as we can track down. If
225499653d4eSeschrock * we've already been through once before, all children will be marked
225599653d4eSeschrock * as tried so we'll proceed to combinatorial reconstruction.
2256fa9e4066Sahrens */
2257fa9e4066Sahrens unexpected_errors = 1;
225899653d4eSeschrock rm->rm_missingdata = 0;
225999653d4eSeschrock rm->rm_missingparity = 0;
2260fa9e4066Sahrens
226199653d4eSeschrock for (c = 0; c < rm->rm_cols; c++) {
226299653d4eSeschrock if (rm->rm_col[c].rc_tried)
226399653d4eSeschrock continue;
2264fa9e4066Sahrens
2265fa9e4066Sahrens zio_vdev_io_redone(zio);
226699653d4eSeschrock do {
2267fa9e4066Sahrens rc = &rm->rm_col[c];
2268fa9e4066Sahrens if (rc->rc_tried)
2269fa9e4066Sahrens continue;
2270fa9e4066Sahrens zio_nowait(zio_vdev_child_io(zio, NULL,
227199653d4eSeschrock vd->vdev_child[rc->rc_devidx],
2272fa9e4066Sahrens rc->rc_offset, rc->rc_data, rc->rc_size,
2273e14bb325SJeff Bonwick zio->io_type, zio->io_priority, 0,
2274fa9e4066Sahrens vdev_raidz_child_done, rc));
227599653d4eSeschrock } while (++c < rm->rm_cols);
2276e05725b1Sbonwick
2277e14bb325SJeff Bonwick return;
2278fa9e4066Sahrens }
2279fa9e4066Sahrens
2280fa9e4066Sahrens /*
228199653d4eSeschrock * At this point we've attempted to reconstruct the data given the
228299653d4eSeschrock * errors we detected, and we've attempted to read all columns. There
228399653d4eSeschrock * must, therefore, be one or more additional problems -- silent errors
228499653d4eSeschrock * resulting in invalid data rather than explicit I/O errors resulting
2285f94275ceSAdam Leventhal * in absent data. We check if there is enough additional data to
2286f94275ceSAdam Leventhal * possibly reconstruct the data and then perform combinatorial
2287f94275ceSAdam Leventhal * reconstruction over all possible combinations. If that fails,
2288f94275ceSAdam Leventhal * we're cooked.
2289fa9e4066Sahrens */
229022fe2c88SJonathan Adams if (total_errors > rm->rm_firstdatacol) {
2291e14bb325SJeff Bonwick zio->io_error = vdev_raidz_worst_error(rm);
2292fa9e4066Sahrens
229322fe2c88SJonathan Adams } else if (total_errors < rm->rm_firstdatacol &&
229422fe2c88SJonathan Adams (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2295fa9e4066Sahrens /*
2296f94275ceSAdam Leventhal * If we didn't use all the available parity for the
2297f94275ceSAdam Leventhal * combinatorial reconstruction, verify that the remaining
2298f94275ceSAdam Leventhal * parity is correct.
2299fa9e4066Sahrens */
2300f94275ceSAdam Leventhal if (code != (1 << rm->rm_firstdatacol) - 1)
2301f94275ceSAdam Leventhal (void) raidz_parity_verify(zio, rm);
2302f94275ceSAdam Leventhal } else {
2303fa9e4066Sahrens /*
230422fe2c88SJonathan Adams * We're here because either:
230522fe2c88SJonathan Adams *
230622fe2c88SJonathan Adams * total_errors == rm_first_datacol, or
230722fe2c88SJonathan Adams * vdev_raidz_combrec() failed
230822fe2c88SJonathan Adams *
230922fe2c88SJonathan Adams * In either case, there is enough bad data to prevent
231022fe2c88SJonathan Adams * reconstruction.
231122fe2c88SJonathan Adams *
231222fe2c88SJonathan Adams * Start checksum ereports for all children which haven't
23136e1f5caaSNeil Perrin * failed, and the IO wasn't speculative.
2314fa9e4066Sahrens */
2315be6fd75aSMatthew Ahrens zio->io_error = SET_ERROR(ECKSUM);
2316e14bb325SJeff Bonwick
23176e1f5caaSNeil Perrin if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2318ea8dc4b6Seschrock for (c = 0; c < rm->rm_cols; c++) {
2319ea8dc4b6Seschrock rc = &rm->rm_col[c];
232022fe2c88SJonathan Adams if (rc->rc_error == 0) {
232122fe2c88SJonathan Adams zio_bad_cksum_t zbc;
232222fe2c88SJonathan Adams zbc.zbc_has_cksum = 0;
23236e1f5caaSNeil Perrin zbc.zbc_injected =
23246e1f5caaSNeil Perrin rm->rm_ecksuminjected;
232522fe2c88SJonathan Adams
232622fe2c88SJonathan Adams zfs_ereport_start_checksum(
23276e1f5caaSNeil Perrin zio->io_spa,
23286e1f5caaSNeil Perrin vd->vdev_child[rc->rc_devidx],
232922fe2c88SJonathan Adams zio, rc->rc_offset, rc->rc_size,
233022fe2c88SJonathan Adams (void *)(uintptr_t)c, &zbc);
2331f94275ceSAdam Leventhal }
2332ea8dc4b6Seschrock }
2333ea8dc4b6Seschrock }
23346e1f5caaSNeil Perrin }
2335fa9e4066Sahrens
2336fa9e4066Sahrens done:
2337fa9e4066Sahrens zio_checksum_verified(zio);
2338fa9e4066Sahrens
23398ad4d6ddSJeff Bonwick if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2340fa9e4066Sahrens (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2341fa9e4066Sahrens /*
2342fa9e4066Sahrens * Use the good data we have in hand to repair damaged children.
2343fa9e4066Sahrens */
2344fa9e4066Sahrens for (c = 0; c < rm->rm_cols; c++) {
2345fa9e4066Sahrens rc = &rm->rm_col[c];
234699653d4eSeschrock cvd = vd->vdev_child[rc->rc_devidx];
2347fa9e4066Sahrens
2348ecc2d604Sbonwick if (rc->rc_error == 0)
2349ecc2d604Sbonwick continue;
2350fa9e4066Sahrens
2351e14bb325SJeff Bonwick zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2352ecc2d604Sbonwick rc->rc_offset, rc->rc_data, rc->rc_size,
235369962b56SMatthew Ahrens ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
23548ad4d6ddSJeff Bonwick ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
23558ad4d6ddSJeff Bonwick ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2356fa9e4066Sahrens }
2357fa9e4066Sahrens }
2358fa9e4066Sahrens }
2359fa9e4066Sahrens
2360fa9e4066Sahrens static void
vdev_raidz_state_change(vdev_t * vd,int faulted,int degraded)2361fa9e4066Sahrens vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2362fa9e4066Sahrens {
236399653d4eSeschrock if (faulted > vd->vdev_nparity)
2364ea8dc4b6Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2365ea8dc4b6Seschrock VDEV_AUX_NO_REPLICAS);
2366fa9e4066Sahrens else if (degraded + faulted != 0)
2367ea8dc4b6Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2368fa9e4066Sahrens else
2369ea8dc4b6Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2370fa9e4066Sahrens }
2371fa9e4066Sahrens
2372fa9e4066Sahrens vdev_ops_t vdev_raidz_ops = {
2373fa9e4066Sahrens vdev_raidz_open,
2374fa9e4066Sahrens vdev_raidz_close,
2375fa9e4066Sahrens vdev_raidz_asize,
2376fa9e4066Sahrens vdev_raidz_io_start,
2377fa9e4066Sahrens vdev_raidz_io_done,
2378fa9e4066Sahrens vdev_raidz_state_change,
2379dcba9f3fSGeorge Wilson NULL,
2380dcba9f3fSGeorge Wilson NULL,
2381fa9e4066Sahrens VDEV_TYPE_RAIDZ, /* name of this vdev type */
2382fa9e4066Sahrens B_FALSE /* not a leaf vdev */
2383fa9e4066Sahrens };
2384