1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
25 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
26 */
27
28 #include <sys/zfs_context.h>
29 #include <sys/spa.h>
30 #include <sys/vdev_impl.h>
31 #include <sys/vdev_disk.h>
32 #include <sys/vdev_file.h>
33 #include <sys/vdev_raidz.h>
34 #include <sys/zio.h>
35 #include <sys/zio_checksum.h>
36 #include <sys/fs/zfs.h>
37 #include <sys/fm/fs/zfs.h>
38
39 /*
40 * Virtual device vector for RAID-Z.
41 *
42 * This vdev supports single, double, and triple parity. For single parity,
43 * we use a simple XOR of all the data columns. For double or triple parity,
44 * we use a special case of Reed-Solomon coding. This extends the
45 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
46 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
47 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
48 * former is also based. The latter is designed to provide higher performance
49 * for writes.
50 *
51 * Note that the Plank paper claimed to support arbitrary N+M, but was then
52 * amended six years later identifying a critical flaw that invalidates its
53 * claims. Nevertheless, the technique can be adapted to work for up to
54 * triple parity. For additional parity, the amendment "Note: Correction to
55 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
56 * is viable, but the additional complexity means that write performance will
57 * suffer.
58 *
59  * All of the methods above operate on a Galois field of 2^N elements. In
60  * our case we choose N=8, i.e. GF(2^8), so that every element can be
61  * expressed with a single byte. Briefly, the operations on the
62 * field are defined as follows:
63 *
64 * o addition (+) is represented by a bitwise XOR
65 * o subtraction (-) is therefore identical to addition: A + B = A - B
66 * o multiplication of A by 2 is defined by the following bitwise expression:
67 *
68 * (A * 2)_7 = A_6
69 * (A * 2)_6 = A_5
70 * (A * 2)_5 = A_4
71 * (A * 2)_4 = A_3 + A_7
72 * (A * 2)_3 = A_2 + A_7
73 * (A * 2)_2 = A_1 + A_7
74 * (A * 2)_1 = A_0
75 * (A * 2)_0 = A_7
76 *
77 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
78 * As an aside, this multiplication is derived from the error correcting
79 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
80 *
81 * Observe that any number in the field (except for 0) can be expressed as a
82 * power of 2 -- a generator for the field. We store a table of the powers of
83 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
84 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
85  * than field addition). Since A^255 = 1 for any non-zero A, the inverse of
86  * a field element A (A^-1) is A^(255 - 1) = A^254.
87 *
88  * The up-to-three parity columns, P, Q, R, over several data columns,
89  * D_0, ..., D_(n-1), can be expressed by field operations:
90  *
91  *	P = D_0 + D_1 + ... + D_(n-2) + D_(n-1)
92  *	Q = 2^(n-1) * D_0 + 2^(n-2) * D_1 + ... + 2^1 * D_(n-2) + 2^0 * D_(n-1)
93  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_(n-2)) * 2 + D_(n-1)
94  *	R = 4^(n-1) * D_0 + 4^(n-2) * D_1 + ... + 4^1 * D_(n-2) + 4^0 * D_(n-1)
95  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_(n-2)) * 4 + D_(n-1)
96 *
97  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
98  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
99  * independent coefficients. (There are no additional coefficients that have
100  * this property, which is why the uncorrected Plank method breaks down.)
101 *
102  * See the reconstruction code below for how P, Q and R can be used
103  * individually or in concert to recover missing data columns.
104 */
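/*
 * To make the table-driven arithmetic concrete, here is a standalone
 * sketch (illustrative only -- the driver uses the precomputed
 * vdev_raidz_pow2[] and vdev_raidz_log2[] tables below) that builds the
 * powers-of-2 table from the multiply-by-2 expression:
 *
 *	uint8_t pow2[256], log2[256];
 *	uint8_t v = 1;
 *	int i;
 *
 *	for (i = 0; i < 255; i++) {
 *		pow2[i] = v;
 *		log2[v] = i;
 *		v = (v << 1) ^ ((v & 0x80) ? 0x1d : 0);
 *	}
 *	pow2[255] = pow2[0];
 *
 * (The last assignment works because 2^255 = 2^0 = 1.) Multiplication of
 * non-zero A and B is then pow2[(log2[A] + log2[B]) % 255], and the
 * inverse of A is pow2[(255 - log2[A]) % 255], which is the A^254 noted
 * above.
 */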
105
106 typedef struct raidz_col {
107 uint64_t rc_devidx; /* child device index for I/O */
108 uint64_t rc_offset; /* device offset */
109 uint64_t rc_size; /* I/O size */
110 void *rc_data; /* I/O data */
111 void *rc_gdata; /* used to store the "good" version */
112 int rc_error; /* I/O error for this device */
113 uint8_t rc_tried; /* Did we attempt this I/O column? */
114 uint8_t rc_skipped; /* Did we skip this I/O column? */
115 } raidz_col_t;
116
117 typedef struct raidz_map {
118 uint64_t rm_cols; /* Regular column count */
119 uint64_t rm_scols; /* Count including skipped columns */
120 uint64_t rm_bigcols; /* Number of oversized columns */
121 uint64_t rm_asize; /* Actual total I/O size */
122 uint64_t rm_missingdata; /* Count of missing data devices */
123 uint64_t rm_missingparity; /* Count of missing parity devices */
124 uint64_t rm_firstdatacol; /* First data column/parity count */
125 uint64_t rm_nskip; /* Skipped sectors for padding */
126 uint64_t rm_skipstart; /* Column index of padding start */
127 void *rm_datacopy; /* rm_asize-buffer of copied data */
128 uintptr_t rm_reports; /* # of referencing checksum reports */
129 uint8_t rm_freed; /* map no longer has referencing ZIO */
130 uint8_t rm_ecksuminjected; /* checksum error was injected */
131 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
132 } raidz_map_t;
133
134 #define VDEV_RAIDZ_P 0
135 #define VDEV_RAIDZ_Q 1
136 #define VDEV_RAIDZ_R 2
137
138 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
139 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
140
141 /*
142 * We provide a mechanism to perform the field multiplication operation on a
143 * 64-bit value all at once rather than a byte at a time. This works by
144 * creating a mask from the top bit in each byte and using that to
145 * conditionally apply the XOR of 0x1d.
146 */
147 #define VDEV_RAIDZ_64MUL_2(x, mask) \
148 { \
149 (mask) = (x) & 0x8080808080808080ULL; \
150 (mask) = ((mask) << 1) - ((mask) >> 7); \
151 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
152 ((mask) & 0x1d1d1d1d1d1d1d1d); \
153 }
154
155 #define VDEV_RAIDZ_64MUL_4(x, mask) \
156 { \
157 VDEV_RAIDZ_64MUL_2((x), mask); \
158 VDEV_RAIDZ_64MUL_2((x), mask); \
159 }
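/*
 * A minimal sketch (hypothetical test code, not part of the driver)
 * showing that the 64-bit form agrees with the byte-at-a-time definition:
 *
 *	uint64_t x = 0x0102030480ff7f1dULL, mask;
 *	uint8_t b[8], expect[8];
 *	int i;
 *
 *	bcopy(&x, b, 8);
 *	for (i = 0; i < 8; i++)
 *		expect[i] = VDEV_RAIDZ_MUL_2(b[i]);
 *	VDEV_RAIDZ_64MUL_2(x, mask);
 *	bcopy(&x, b, 8);
 *	for (i = 0; i < 8; i++)
 *		ASSERT3U(b[i], ==, expect[i]);
 *
 * Each byte lane whose top bit is set contributes 0xff to the mask --
 * within that lane, ((mask) << 1) - ((mask) >> 7) computes 0x100 - 0x01 --
 * so the XOR with 0x1d is applied to exactly the lanes that overflowed.
 */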
160
161 #define	VDEV_LABEL_OFFSET(x)	((x) + VDEV_LABEL_START_SIZE)
162
163 /*
164 * Force reconstruction to use the general purpose method.
165 */
166 int vdev_raidz_default_to_general;
167
168 /* Powers of 2 in the Galois field defined above. */
169 static const uint8_t vdev_raidz_pow2[256] = {
170 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
171 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
172 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
173 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
174 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
175 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
176 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
177 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
178 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
179 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
180 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
181 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
182 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
183 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
184 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
185 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
186 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
187 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
188 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
189 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
190 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
191 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
192 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
193 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
194 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
195 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
196 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
197 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
198 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
199 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
200 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
201 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
202 };
203 /* Logs of 2 in the Galois field defined above. */
204 static const uint8_t vdev_raidz_log2[256] = {
205 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
206 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
207 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
208 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
209 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
210 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
211 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
212 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
213 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
214 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
215 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
216 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
217 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
218 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
219 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
220 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
221 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
222 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
223 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
224 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
225 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
226 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
227 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
228 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
229 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
230 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
231 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
232 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
233 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
234 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
235 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
236 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
237 };
238
239 static void vdev_raidz_generate_parity(raidz_map_t *rm);
240
241 /*
242 * Multiply a given number by 2 raised to the given power.
243 */
244 static uint8_t
245 vdev_raidz_exp2(uint_t a, int exp)
246 {
247 if (a == 0)
248 return (0);
249
250 ASSERT(exp >= 0);
251 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
252
253 exp += vdev_raidz_log2[a];
254 if (exp > 255)
255 exp -= 255;
256
257 return (vdev_raidz_pow2[exp]);
258 }
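/*
 * Usage note: a general multiply falls out of this primitive, since
 * A * B = A * 2^(log_2 B). A hedged illustration (this helper does not
 * exist in the driver; callers below inline the same idea):
 *
 *	static uint8_t
 *	gf_mul(uint8_t a, uint8_t b)
 *	{
 *		if (b == 0)
 *			return (0);
 *		return (vdev_raidz_exp2(a, vdev_raidz_log2[b]));
 *	}
 */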
259
260 static void
261 vdev_raidz_map_free(raidz_map_t *rm)
262 {
263 int c;
264 size_t size;
265
266 for (c = 0; c < rm->rm_firstdatacol; c++) {
267 zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
268
269 if (rm->rm_col[c].rc_gdata != NULL)
270 zio_buf_free(rm->rm_col[c].rc_gdata,
271 rm->rm_col[c].rc_size);
272 }
273
274 size = 0;
275 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
276 size += rm->rm_col[c].rc_size;
277
278 if (rm->rm_datacopy != NULL)
279 zio_buf_free(rm->rm_datacopy, size);
280
281 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
282 }
283
284 static void
285 vdev_raidz_map_free_vsd(zio_t *zio)
286 {
287 raidz_map_t *rm = zio->io_vsd;
288
289 ASSERT0(rm->rm_freed);
290 rm->rm_freed = 1;
291
292 if (rm->rm_reports == 0)
293 vdev_raidz_map_free(rm);
294 }
295
296 /*ARGSUSED*/
297 static void
298 vdev_raidz_cksum_free(void *arg, size_t ignored)
299 {
300 raidz_map_t *rm = arg;
301
302 ASSERT3U(rm->rm_reports, >, 0);
303
304 if (--rm->rm_reports == 0 && rm->rm_freed != 0)
305 vdev_raidz_map_free(rm);
306 }
307
308 static void
309 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
310 {
311 raidz_map_t *rm = zcr->zcr_cbdata;
312 size_t c = zcr->zcr_cbinfo;
313 size_t x;
314
315 const char *good = NULL;
316 const char *bad = rm->rm_col[c].rc_data;
317
318 if (good_data == NULL) {
319 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
320 return;
321 }
322
323 if (c < rm->rm_firstdatacol) {
324 /*
325 * The first time through, calculate the parity blocks for
326 * the good data (this relies on the fact that the good
327 * data never changes for a given logical ZIO)
328 */
329 if (rm->rm_col[0].rc_gdata == NULL) {
330 char *bad_parity[VDEV_RAIDZ_MAXPARITY];
331 char *buf;
332
333 /*
334 * Set up the rm_col[]s to generate the parity for
335 * good_data, first saving the parity bufs and
336 * replacing them with buffers to hold the result.
337 */
338 for (x = 0; x < rm->rm_firstdatacol; x++) {
339 bad_parity[x] = rm->rm_col[x].rc_data;
340 rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
341 zio_buf_alloc(rm->rm_col[x].rc_size);
342 }
343
344 /* fill in the data columns from good_data */
345 buf = (char *)good_data;
346 for (; x < rm->rm_cols; x++) {
347 rm->rm_col[x].rc_data = buf;
348 buf += rm->rm_col[x].rc_size;
349 }
350
351 /*
352 * Construct the parity from the good data.
353 */
354 vdev_raidz_generate_parity(rm);
355
356 /* restore everything back to its original state */
357 for (x = 0; x < rm->rm_firstdatacol; x++)
358 rm->rm_col[x].rc_data = bad_parity[x];
359
360 buf = rm->rm_datacopy;
361 for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
362 rm->rm_col[x].rc_data = buf;
363 buf += rm->rm_col[x].rc_size;
364 }
365 }
366
367 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
368 good = rm->rm_col[c].rc_gdata;
369 } else {
370 /* adjust good_data to point at the start of our column */
371 good = good_data;
372
373 for (x = rm->rm_firstdatacol; x < c; x++)
374 good += rm->rm_col[x].rc_size;
375 }
376
377 /* we drop the ereport if it ends up that the data was good */
378 zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
379 }
380
381 /*
382 * Invoked indirectly by zfs_ereport_start_checksum(), called
383 * below when our read operation fails completely. The main point
384 * is to keep a copy of everything we read from disk, so that at
385 * vdev_raidz_cksum_finish() time we can compare it with the good data.
386 */
387 static void
388 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
389 {
390 size_t c = (size_t)(uintptr_t)arg;
391 caddr_t buf;
392
393 raidz_map_t *rm = zio->io_vsd;
394 size_t size;
395
396 /* set up the report and bump the refcount */
397 zcr->zcr_cbdata = rm;
398 zcr->zcr_cbinfo = c;
399 zcr->zcr_finish = vdev_raidz_cksum_finish;
400 zcr->zcr_free = vdev_raidz_cksum_free;
401
402 rm->rm_reports++;
403 ASSERT3U(rm->rm_reports, >, 0);
404
405 if (rm->rm_datacopy != NULL)
406 return;
407
408 /*
409 * It's the first time we're called for this raidz_map_t, so we need
410 * to copy the data aside; there's no guarantee that our zio's buffer
411 * won't be re-used for something else.
412 *
413 * Our parity data is already in separate buffers, so there's no need
414 * to copy them.
415 */
416
417 size = 0;
418 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
419 size += rm->rm_col[c].rc_size;
420
421 buf = rm->rm_datacopy = zio_buf_alloc(size);
422
423 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
424 raidz_col_t *col = &rm->rm_col[c];
425
426 bcopy(col->rc_data, buf, col->rc_size);
427 col->rc_data = buf;
428
429 buf += col->rc_size;
430 }
431 ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
432 }
433
434 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
435 vdev_raidz_map_free_vsd,
436 vdev_raidz_cksum_report
437 };
438
439 /*
440 * Divides the IO evenly across all child vdevs; usually, dcols is
441 * the number of children in the target vdev.
442 */
443 static raidz_map_t *
444 vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset,
445 uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
446 {
447 raidz_map_t *rm;
448 /* The starting RAIDZ (parent) vdev sector of the block. */
449 uint64_t b = offset >> unit_shift;
450 /* The zio's size in units of the vdev's minimum sector size. */
451 uint64_t s = size >> unit_shift;
452 /* The first column for this stripe. */
453 uint64_t f = b % dcols;
454 /* The starting byte offset on each child vdev. */
455 uint64_t o = (b / dcols) << unit_shift;
456 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
457
458 /*
459 * "Quotient": The number of data sectors for this stripe on all but
460 * the "big column" child vdevs that also contain "remainder" data.
461 */
462 q = s / (dcols - nparity);
463
464 /*
465 * "Remainder": The number of partial stripe data sectors in this I/O.
466 * This will add a sector to some, but not all, child vdevs.
467 */
468 r = s - q * (dcols - nparity);
469
470 /* The number of "big columns" - those which contain remainder data. */
471 bc = (r == 0 ? 0 : r + nparity);
472
473 /*
474 * The total number of data and parity sectors associated with
475 * this I/O.
476 */
477 tot = s + nparity * (q + (r == 0 ? 0 : 1));
478
479 /* acols: The columns that will be accessed. */
480 /* scols: The columns that will be accessed or skipped. */
481 if (q == 0) {
482 /* Our I/O request doesn't span all child vdevs. */
483 acols = bc;
484 scols = MIN(dcols, roundup(bc, nparity + 1));
485 } else {
486 acols = dcols;
487 scols = dcols;
488 }
489
490 ASSERT3U(acols, <=, scols);
491
492 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
493
494 rm->rm_cols = acols;
495 rm->rm_scols = scols;
496 rm->rm_bigcols = bc;
497 rm->rm_skipstart = bc;
498 rm->rm_missingdata = 0;
499 rm->rm_missingparity = 0;
500 rm->rm_firstdatacol = nparity;
501 rm->rm_datacopy = NULL;
502 rm->rm_reports = 0;
503 rm->rm_freed = 0;
504 rm->rm_ecksuminjected = 0;
505
506 asize = 0;
507
508 for (c = 0; c < scols; c++) {
509 col = f + c;
510 coff = o;
511 if (col >= dcols) {
512 col -= dcols;
513 coff += 1ULL << unit_shift;
514 }
515 rm->rm_col[c].rc_devidx = col;
516 rm->rm_col[c].rc_offset = coff;
517 rm->rm_col[c].rc_data = NULL;
518 rm->rm_col[c].rc_gdata = NULL;
519 rm->rm_col[c].rc_error = 0;
520 rm->rm_col[c].rc_tried = 0;
521 rm->rm_col[c].rc_skipped = 0;
522
523 if (c >= acols)
524 rm->rm_col[c].rc_size = 0;
525 else if (c < bc)
526 rm->rm_col[c].rc_size = (q + 1) << unit_shift;
527 else
528 rm->rm_col[c].rc_size = q << unit_shift;
529
530 asize += rm->rm_col[c].rc_size;
531 }
532
533 ASSERT3U(asize, ==, tot << unit_shift);
534 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
535 rm->rm_nskip = roundup(tot, nparity + 1) - tot;
536 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
537 ASSERT3U(rm->rm_nskip, <=, nparity);
538
539 for (c = 0; c < rm->rm_firstdatacol; c++)
540 rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
541
542 rm->rm_col[c].rc_data = data;
543
544 for (c = c + 1; c < acols; c++)
545 rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
546 rm->rm_col[c - 1].rc_size;
547
548 /*
549 * If all data stored spans all columns, there's a danger that parity
550 * will always be on the same device and, since parity isn't read
551 * during normal operation, that that device's I/O bandwidth won't be
552 * used effectively. We therefore switch the parity every 1MB.
553 *
554 * ... at least that was, ostensibly, the theory. As a practical
555 * matter unless we juggle the parity between all devices evenly, we
556 * won't see any benefit. Further, occasional writes that aren't a
557 * multiple of the LCM of the number of children and the minimum
558 * stripe width are sufficient to avoid pessimal behavior.
559 * Unfortunately, this decision created an implicit on-disk format
560 * requirement that we need to support for all eternity, but only
561 * for single-parity RAID-Z.
562 *
563 * If we intend to skip a sector in the zeroth column for padding
564 * we must make sure to note this swap. We will never intend to
565 * skip the first column since at least one data and one parity
566 * column must appear in each row.
567 */
568 ASSERT(rm->rm_cols >= 2);
569 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
570
571 if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
572 devidx = rm->rm_col[0].rc_devidx;
573 o = rm->rm_col[0].rc_offset;
574 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
575 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
576 rm->rm_col[1].rc_devidx = devidx;
577 rm->rm_col[1].rc_offset = o;
578
579 if (rm->rm_skipstart == 0)
580 rm->rm_skipstart = 1;
581 }
582
583 return (rm);
584 }
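/*
 * A worked example of the geometry above (illustrative numbers only): for
 * a raidz1 vdev with dcols = 5 children, unit_shift = 9 (512-byte
 * sectors), and an 8 KB write at offset 0:
 *
 *	s = 8192 >> 9 = 16 data sectors
 *	q = 16 / (5 - 1) = 4 full rows, r = 0, so bc = 0
 *	tot = 16 + 1 * 4 = 20 sectors, acols = scols = 5
 *
 * Each of the five columns (one parity, four data) gets q = 4 sectors and
 * rm_asize = 20 << 9 = 10240 bytes with rm_nskip = 0. A 9 KB write
 * instead gives s = 18, q = 4, r = 2, bc = 3, tot = 18 + 1 * 5 = 23, and
 * rm_nskip = roundup(23, 2) - 23 = 1 padding sector.
 */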
585
586 static void
587 vdev_raidz_generate_parity_p(raidz_map_t *rm)
588 {
589 uint64_t *p, *src, pcount, ccount, i;
590 int c;
591
592 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
593
594 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
595 src = rm->rm_col[c].rc_data;
596 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
597 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
598
599 if (c == rm->rm_firstdatacol) {
600 ASSERT(ccount == pcount);
601 for (i = 0; i < ccount; i++, src++, p++) {
602 *p = *src;
603 }
604 } else {
605 ASSERT(ccount <= pcount);
606 for (i = 0; i < ccount; i++, src++, p++) {
607 *p ^= *src;
608 }
609 }
610 }
611 }
612
613 static void
614 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
615 {
616 uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
617 int c;
618
619 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
620 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
621 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
622
623 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
624 src = rm->rm_col[c].rc_data;
625 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
626 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
627
628 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
629
630 if (c == rm->rm_firstdatacol) {
631 ASSERT(ccnt == pcnt || ccnt == 0);
632 for (i = 0; i < ccnt; i++, src++, p++, q++) {
633 *p = *src;
634 *q = *src;
635 }
636 for (; i < pcnt; i++, src++, p++, q++) {
637 *p = 0;
638 *q = 0;
639 }
640 } else {
641 ASSERT(ccnt <= pcnt);
642
643 /*
644 * Apply the algorithm described above by multiplying
645 * the previous result and adding in the new value.
646 */
647 for (i = 0; i < ccnt; i++, src++, p++, q++) {
648 *p ^= *src;
649
650 VDEV_RAIDZ_64MUL_2(*q, mask);
651 *q ^= *src;
652 }
653
654 /*
655 * Treat short columns as though they are full of 0s.
656 * Note that there's therefore nothing needed for P.
657 */
658 for (; i < pcnt; i++, q++) {
659 VDEV_RAIDZ_64MUL_2(*q, mask);
660 }
661 }
662 }
663 }
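/*
 * The Q update above is Horner's rule applied one column at a time: after
 * processing columns D_0 .. D_k, each byte of q holds
 * ((D_0 * 2 + D_1) * 2 + ...) * 2 + D_k. A byte-wise sketch of the same
 * recurrence (hypothetical, for clarity only; the real loop works on
 * 64-bit words), where d[] holds one byte from each of the ndata data
 * columns:
 *
 *	uint8_t q = 0;
 *	int c;
 *
 *	for (c = 0; c < ndata; c++)
 *		q = VDEV_RAIDZ_MUL_2(q) ^ d[c];
 *
 * which matches the closed form Q = 2^(n-1) * D_0 + ... + 2^0 * D_(n-1)
 * given at the top of this file.
 */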
664
665 static void
666 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
667 {
668 uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
669 int c;
670
671 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
672 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
673 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
674 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
675 rm->rm_col[VDEV_RAIDZ_R].rc_size);
676
677 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
678 src = rm->rm_col[c].rc_data;
679 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
680 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
681 r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
682
683 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
684
685 if (c == rm->rm_firstdatacol) {
686 ASSERT(ccnt == pcnt || ccnt == 0);
687 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
688 *p = *src;
689 *q = *src;
690 *r = *src;
691 }
692 for (; i < pcnt; i++, src++, p++, q++, r++) {
693 *p = 0;
694 *q = 0;
695 *r = 0;
696 }
697 } else {
698 ASSERT(ccnt <= pcnt);
699
700 /*
701 * Apply the algorithm described above by multiplying
702 * the previous result and adding in the new value.
703 */
704 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
705 *p ^= *src;
706
707 VDEV_RAIDZ_64MUL_2(*q, mask);
708 *q ^= *src;
709
710 VDEV_RAIDZ_64MUL_4(*r, mask);
711 *r ^= *src;
712 }
713
714 /*
715 * Treat short columns as though they are full of 0s.
716 * Note that there's therefore nothing needed for P.
717 */
718 for (; i < pcnt; i++, q++, r++) {
719 VDEV_RAIDZ_64MUL_2(*q, mask);
720 VDEV_RAIDZ_64MUL_4(*r, mask);
721 }
722 }
723 }
724 }
725
726 /*
727 * Generate RAID parity in the first virtual columns according to the number of
728 * parity columns available.
729 */
730 static void
731 vdev_raidz_generate_parity(raidz_map_t *rm)
732 {
733 switch (rm->rm_firstdatacol) {
734 case 1:
735 vdev_raidz_generate_parity_p(rm);
736 break;
737 case 2:
738 vdev_raidz_generate_parity_pq(rm);
739 break;
740 case 3:
741 vdev_raidz_generate_parity_pqr(rm);
742 break;
743 default:
744 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
745 }
746 }
747
748 static int
749 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
750 {
751 uint64_t *dst, *src, xcount, ccount, count, i;
752 int x = tgts[0];
753 int c;
754
755 ASSERT(ntgts == 1);
756 ASSERT(x >= rm->rm_firstdatacol);
757 ASSERT(x < rm->rm_cols);
758
759 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
760 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
761 ASSERT(xcount > 0);
762
763 src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
764 dst = rm->rm_col[x].rc_data;
765 for (i = 0; i < xcount; i++, dst++, src++) {
766 *dst = *src;
767 }
768
769 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
770 src = rm->rm_col[c].rc_data;
771 dst = rm->rm_col[x].rc_data;
772
773 if (c == x)
774 continue;
775
776 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
777 count = MIN(ccount, xcount);
778
779 for (i = 0; i < count; i++, dst++, src++) {
780 *dst ^= *src;
781 }
782 }
783
784 return (1 << VDEV_RAIDZ_P);
785 }
786
787 static int
788 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
789 {
790 uint64_t *dst, *src, xcount, ccount, count, mask, i;
791 uint8_t *b;
792 int x = tgts[0];
793 int c, j, exp;
794
795 ASSERT(ntgts == 1);
796
797 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
798 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
799
800 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
801 src = rm->rm_col[c].rc_data;
802 dst = rm->rm_col[x].rc_data;
803
804 if (c == x)
805 ccount = 0;
806 else
807 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
808
809 count = MIN(ccount, xcount);
810
811 if (c == rm->rm_firstdatacol) {
812 for (i = 0; i < count; i++, dst++, src++) {
813 *dst = *src;
814 }
815 for (; i < xcount; i++, dst++) {
816 *dst = 0;
817 }
818
819 } else {
820 for (i = 0; i < count; i++, dst++, src++) {
821 VDEV_RAIDZ_64MUL_2(*dst, mask);
822 *dst ^= *src;
823 }
824
825 for (; i < xcount; i++, dst++) {
826 VDEV_RAIDZ_64MUL_2(*dst, mask);
827 }
828 }
829 }
830
831 src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
832 dst = rm->rm_col[x].rc_data;
833 exp = 255 - (rm->rm_cols - 1 - x);
834
835 for (i = 0; i < xcount; i++, dst++, src++) {
836 *dst ^= *src;
837 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
838 *b = vdev_raidz_exp2(*b, exp);
839 }
840 }
841
842 return (1 << VDEV_RAIDZ_Q);
843 }
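/*
 * In other words, the function above computes, in the notation of the
 * block comment at the top of the file:
 *
 *	Q + Q' = 2^(ndevs - 1 - x) * D_x
 *	D_x = (Q + Q') * 2^-(ndevs - 1 - x)
 *	    = (Q + Q') * 2^(255 - (ndevs - 1 - x))
 *
 * where Q' is the partial parity accumulated over the surviving data
 * columns with column x treated as zero. The first loop builds Q', and
 * the final loop adds Q and performs the multiply byte by byte via
 * vdev_raidz_exp2(), using exp = 255 - (ndevs - 1 - x) exactly as in the
 * code above.
 */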
844
845 static int
846 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
847 {
848 uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
849 void *pdata, *qdata;
850 uint64_t xsize, ysize, i;
851 int x = tgts[0];
852 int y = tgts[1];
853
854 ASSERT(ntgts == 2);
855 ASSERT(x < y);
856 ASSERT(x >= rm->rm_firstdatacol);
857 ASSERT(y < rm->rm_cols);
858
859 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
860
861 /*
862 * Move the parity data aside -- we're going to compute parity as
863 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
864 * reuse the parity generation mechanism without trashing the actual
865 * parity so we make those columns appear to be full of zeros by
866 * setting their lengths to zero.
867 */
868 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
869 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
870 xsize = rm->rm_col[x].rc_size;
871 ysize = rm->rm_col[y].rc_size;
872
873 rm->rm_col[VDEV_RAIDZ_P].rc_data =
874 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
875 rm->rm_col[VDEV_RAIDZ_Q].rc_data =
876 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
877 rm->rm_col[x].rc_size = 0;
878 rm->rm_col[y].rc_size = 0;
879
880 vdev_raidz_generate_parity_pq(rm);
881
882 rm->rm_col[x].rc_size = xsize;
883 rm->rm_col[y].rc_size = ysize;
884
885 p = pdata;
886 q = qdata;
887 pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
888 qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
889 xd = rm->rm_col[x].rc_data;
890 yd = rm->rm_col[y].rc_data;
891
892 /*
893 * We now have:
894 * Pxy = P + D_x + D_y
895 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
896 *
897 * We can then solve for D_x:
898 * D_x = A * (P + Pxy) + B * (Q + Qxy)
899 * where
900 * A = 2^(x - y) * (2^(x - y) + 1)^-1
901  *	B = 2^-(ndevs - 1 - x) * (2^(x - y) + 1)^-1
902 *
903 * With D_x in hand, we can easily solve for D_y:
904 * D_y = P + Pxy + D_x
905 */
906
907 a = vdev_raidz_pow2[255 + x - y];
908 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
909 tmp = 255 - vdev_raidz_log2[a ^ 1];
910
911 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
912 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
913
914 for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
915 *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
916 vdev_raidz_exp2(*q ^ *qxy, bexp);
917
918 if (i < ysize)
919 *yd = *p ^ *pxy ^ *xd;
920 }
921
922 zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
923 rm->rm_col[VDEV_RAIDZ_P].rc_size);
924 zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
925 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
926
927 /*
928 * Restore the saved parity data.
929 */
930 rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
931 rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
932
933 return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
934 }
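/*
 * For reference, the algebra behind A and B (a sketch in the notation of
 * the comment above): with Cx = 2^(ndevs - 1 - x) and
 * Cy = 2^(ndevs - 1 - y), adding the two equations gives
 *
 *	(Q + Qxy) + Cy * (P + Pxy) = (Cx + Cy) * D_x
 *	D_x = (Cx + Cy)^-1 * (Q + Qxy) + Cy * (Cx + Cy)^-1 * (P + Pxy)
 *
 * and factoring Cx + Cy = Cx * (2^(x - y) + 1) recovers
 * A = Cy * Cx^-1 * (2^(x - y) + 1)^-1 = 2^(x - y) * (2^(x - y) + 1)^-1
 * and B = Cx^-1 * (2^(x - y) + 1)^-1, exactly the values computed from
 * vdev_raidz_pow2[] and vdev_raidz_log2[] in the function above.
 */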
935
936 /* BEGIN CSTYLED */
937 /*
938  * In the general case of reconstruction, we must solve the system of linear
939  * equations defined by the coefficients used to generate parity as well as
940  * the contents of the data and parity disks. This can be expressed with
941 * vectors for the original data (D) and the actual data (d) and parity (p)
942 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
943 *
944 * __ __ __ __
945 * | | __ __ | p_0 |
946 * | V | | D_0 | | p_m-1 |
947 * | | x | : | = | d_0 |
948 * | I | | D_n-1 | | : |
949 * | | ~~ ~~ | d_n-1 |
950 * ~~ ~~ ~~ ~~
951 *
952  * I is simply a square identity matrix of size n, and V is a Vandermonde
953  * matrix defined by the coefficients we chose for the various parity columns
954  * (1, 2, 4). Note that these values were chosen for simplicity and speedy
955  * computation, as well as for linear separability.
956 *
957 * __ __ __ __
958 * | 1 .. 1 1 1 | | p_0 |
959 * | 2^n-1 .. 4 2 1 | __ __ | : |
960 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
961 * | 1 .. 0 0 0 | | D_1 | | d_0 |
962 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
963 * | : : : : | | : | | d_2 |
964 * | 0 .. 1 0 0 | | D_n-1 | | : |
965 * | 0 .. 0 1 0 | ~~ ~~ | : |
966 * | 0 .. 0 0 1 | | d_n-1 |
967 * ~~ ~~ ~~ ~~
968 *
969 * Note that I, V, d, and p are known. To compute D, we must invert the
970 * matrix and use the known data and parity values to reconstruct the unknown
971 * data values. We begin by removing the rows in V|I and d|p that correspond
972 * to failed or missing columns; we then make V|I square (n x n) and d|p
973 * sized n by removing rows corresponding to unused parity from the bottom up
974 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
975 * using Gauss-Jordan elimination. In the example below we use m=3 parity
976 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
977 * __ __
978 * | 1 1 1 1 1 1 1 1 |
979 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
980 * | 19 205 116 29 64 16 4 1 | / /
981 * | 1 0 0 0 0 0 0 0 | / /
982 * | 0 1 0 0 0 0 0 0 | <--' /
983 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
984 * | 0 0 0 1 0 0 0 0 |
985 * | 0 0 0 0 1 0 0 0 |
986 * | 0 0 0 0 0 1 0 0 |
987 * | 0 0 0 0 0 0 1 0 |
988 * | 0 0 0 0 0 0 0 1 |
989 * ~~ ~~
990 * __ __
991 * | 1 1 1 1 1 1 1 1 |
992 * | 19 205 116 29 64 16 4 1 |
993 * | 1 0 0 0 0 0 0 0 |
994 * (V|I)' = | 0 0 0 1 0 0 0 0 |
995 * | 0 0 0 0 1 0 0 0 |
996 * | 0 0 0 0 0 1 0 0 |
997 * | 0 0 0 0 0 0 1 0 |
998 * | 0 0 0 0 0 0 0 1 |
999 * ~~ ~~
1000 *
1001 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1002 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1003 * matrix is not singular.
1004 * __ __
1005 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1006 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1007 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1008 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1009 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1010 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1011 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1012 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1013 * ~~ ~~
1014 * __ __
1015 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1016 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1017 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1018 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1019 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1020 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1021 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1022 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1023 * ~~ ~~
1024 * __ __
1025 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1026 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1027 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
1028 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1029 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1030 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1031 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1032 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1033 * ~~ ~~
1034 * __ __
1035 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1036 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1037 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
1038 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1039 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1040 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1041 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1042 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1043 * ~~ ~~
1044 * __ __
1045 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1046 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1047 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1048 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1049 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1050 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1051 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1052 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1053 * ~~ ~~
1054 * __ __
1055 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1056 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
1057 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1058 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1059 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1060 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1061 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1062 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1063 * ~~ ~~
1064 * __ __
1065 * | 0 0 1 0 0 0 0 0 |
1066 * | 167 100 5 41 159 169 217 208 |
1067 * | 166 100 4 40 158 168 216 209 |
1068 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
1069 * | 0 0 0 0 1 0 0 0 |
1070 * | 0 0 0 0 0 1 0 0 |
1071 * | 0 0 0 0 0 0 1 0 |
1072 * | 0 0 0 0 0 0 0 1 |
1073 * ~~ ~~
1074 *
1075 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1076 * of the missing data.
1077 *
1078 * As is apparent from the example above, the only non-trivial rows in the
1079 * inverse matrix correspond to the data disks that we're trying to
1080 * reconstruct. Indeed, those are the only rows we need as the others would
1081 * only be useful for reconstructing data known or assumed to be valid. For
1082 * that reason, we only build the coefficients in the rows that correspond to
1083 * targeted columns.
1084 */
1085 /* END CSTYLED */
1086
1087 static void
1088 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1089 uint8_t **rows)
1090 {
1091 int i, j;
1092 int pow;
1093
1094 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1095
1096 /*
1097 * Fill in the missing rows of interest.
1098 */
1099 for (i = 0; i < nmap; i++) {
1100 ASSERT3S(0, <=, map[i]);
1101 ASSERT3S(map[i], <=, 2);
1102
1103 pow = map[i] * n;
1104 if (pow > 255)
1105 pow -= 255;
1106 ASSERT(pow <= 255);
1107
1108 for (j = 0; j < n; j++) {
1109 pow -= map[i];
1110 if (pow < 0)
1111 pow += 255;
1112 rows[i][j] = vdev_raidz_pow2[pow];
1113 }
1114 }
1115 }
1116
1117 static void
1118 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1119 uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1120 {
1121 int i, j, ii, jj;
1122 uint8_t log;
1123
1124 /*
1125 * Assert that the first nmissing entries from the array of used
1126 * columns correspond to parity columns and that subsequent entries
1127 * correspond to data columns.
1128 */
1129 for (i = 0; i < nmissing; i++) {
1130 ASSERT3S(used[i], <, rm->rm_firstdatacol);
1131 }
1132 for (; i < n; i++) {
1133 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1134 }
1135
1136 /*
1137 * First initialize the storage where we'll compute the inverse rows.
1138 */
1139 for (i = 0; i < nmissing; i++) {
1140 for (j = 0; j < n; j++) {
1141 invrows[i][j] = (i == j) ? 1 : 0;
1142 }
1143 }
1144
1145 /*
1146 * Subtract all trivial rows from the rows of consequence.
1147 */
1148 for (i = 0; i < nmissing; i++) {
1149 for (j = nmissing; j < n; j++) {
1150 ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1151 jj = used[j] - rm->rm_firstdatacol;
1152 ASSERT3S(jj, <, n);
1153 invrows[i][j] = rows[i][jj];
1154 rows[i][jj] = 0;
1155 }
1156 }
1157
1158 /*
1159 * For each of the rows of interest, we must normalize it and subtract
1160 * a multiple of it from the other rows.
1161 */
1162 for (i = 0; i < nmissing; i++) {
1163 for (j = 0; j < missing[i]; j++) {
1164 ASSERT0(rows[i][j]);
1165 }
1166 ASSERT3U(rows[i][missing[i]], !=, 0);
1167
1168 /*
1169 * Compute the inverse of the first element and multiply each
1170 * element in the row by that value.
1171 */
1172 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1173
1174 for (j = 0; j < n; j++) {
1175 rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1176 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1177 }
1178
1179 for (ii = 0; ii < nmissing; ii++) {
1180 if (i == ii)
1181 continue;
1182
1183 ASSERT3U(rows[ii][missing[i]], !=, 0);
1184
1185 log = vdev_raidz_log2[rows[ii][missing[i]]];
1186
1187 for (j = 0; j < n; j++) {
1188 rows[ii][j] ^=
1189 vdev_raidz_exp2(rows[i][j], log);
1190 invrows[ii][j] ^=
1191 vdev_raidz_exp2(invrows[i][j], log);
1192 }
1193 }
1194 }
1195
1196 /*
1197  * Verify that the data left in the rows is properly part of
1198  * an identity matrix.
1199 */
1200 for (i = 0; i < nmissing; i++) {
1201 for (j = 0; j < n; j++) {
1202 if (j == missing[i]) {
1203 ASSERT3U(rows[i][j], ==, 1);
1204 } else {
1205 ASSERT0(rows[i][j]);
1206 }
1207 }
1208 }
1209 }
1210
1211 static void
1212 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1213 int *missing, uint8_t **invrows, const uint8_t *used)
1214 {
1215 int i, j, x, cc, c;
1216 uint8_t *src;
1217 uint64_t ccount;
1218 uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1219 uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1220 uint8_t log = 0;
1221 uint8_t val;
1222 int ll;
1223 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1224 uint8_t *p, *pp;
1225 size_t psize;
1226
1227 psize = sizeof (invlog[0][0]) * n * nmissing;
1228 p = kmem_alloc(psize, KM_SLEEP);
1229
1230 for (pp = p, i = 0; i < nmissing; i++) {
1231 invlog[i] = pp;
1232 pp += n;
1233 }
1234
1235 for (i = 0; i < nmissing; i++) {
1236 for (j = 0; j < n; j++) {
1237 ASSERT3U(invrows[i][j], !=, 0);
1238 invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1239 }
1240 }
1241
1242 for (i = 0; i < n; i++) {
1243 c = used[i];
1244 ASSERT3U(c, <, rm->rm_cols);
1245
1246 src = rm->rm_col[c].rc_data;
1247 ccount = rm->rm_col[c].rc_size;
1248 for (j = 0; j < nmissing; j++) {
1249 cc = missing[j] + rm->rm_firstdatacol;
1250 ASSERT3U(cc, >=, rm->rm_firstdatacol);
1251 ASSERT3U(cc, <, rm->rm_cols);
1252 ASSERT3U(cc, !=, c);
1253
1254 dst[j] = rm->rm_col[cc].rc_data;
1255 dcount[j] = rm->rm_col[cc].rc_size;
1256 }
1257
1258 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1259
1260 for (x = 0; x < ccount; x++, src++) {
1261 if (*src != 0)
1262 log = vdev_raidz_log2[*src];
1263
1264 for (cc = 0; cc < nmissing; cc++) {
1265 if (x >= dcount[cc])
1266 continue;
1267
1268 if (*src == 0) {
1269 val = 0;
1270 } else {
1271 if ((ll = log + invlog[cc][i]) >= 255)
1272 ll -= 255;
1273 val = vdev_raidz_pow2[ll];
1274 }
1275
1276 if (i == 0)
1277 dst[cc][x] = val;
1278 else
1279 dst[cc][x] ^= val;
1280 }
1281 }
1282 }
1283
1284 kmem_free(p, psize);
1285 }
1286
1287 static int
1288 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1289 {
1290 int n, i, c, t, tt;
1291 int nmissing_rows;
1292 int missing_rows[VDEV_RAIDZ_MAXPARITY];
1293 int parity_map[VDEV_RAIDZ_MAXPARITY];
1294
1295 uint8_t *p, *pp;
1296 size_t psize;
1297
1298 uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1299 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1300 uint8_t *used;
1301
1302 int code = 0;
1303
1304
1305 n = rm->rm_cols - rm->rm_firstdatacol;
1306
1307 /*
1308 * Figure out which data columns are missing.
1309 */
1310 nmissing_rows = 0;
1311 for (t = 0; t < ntgts; t++) {
1312 if (tgts[t] >= rm->rm_firstdatacol) {
1313 missing_rows[nmissing_rows++] =
1314 tgts[t] - rm->rm_firstdatacol;
1315 }
1316 }
1317
1318 /*
1319 * Figure out which parity columns to use to help generate the missing
1320 * data columns.
1321 */
1322 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1323 ASSERT(tt < ntgts);
1324 ASSERT(c < rm->rm_firstdatacol);
1325
1326 /*
1327 * Skip any targeted parity columns.
1328 */
1329 if (c == tgts[tt]) {
1330 tt++;
1331 continue;
1332 }
1333
1334 code |= 1 << c;
1335
1336 parity_map[i] = c;
1337 i++;
1338 }
1339
1340 ASSERT(code != 0);
1341 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1342
1343 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1344 nmissing_rows * n + sizeof (used[0]) * n;
1345 p = kmem_alloc(psize, KM_SLEEP);
1346
1347 for (pp = p, i = 0; i < nmissing_rows; i++) {
1348 rows[i] = pp;
1349 pp += n;
1350 invrows[i] = pp;
1351 pp += n;
1352 }
1353 used = pp;
1354
1355 for (i = 0; i < nmissing_rows; i++) {
1356 used[i] = parity_map[i];
1357 }
1358
1359 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1360 if (tt < nmissing_rows &&
1361 c == missing_rows[tt] + rm->rm_firstdatacol) {
1362 tt++;
1363 continue;
1364 }
1365
1366 ASSERT3S(i, <, n);
1367 used[i] = c;
1368 i++;
1369 }
1370
1371 /*
1372 * Initialize the interesting rows of the matrix.
1373 */
1374 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1375
1376 /*
1377 * Invert the matrix.
1378 */
1379 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1380 invrows, used);
1381
1382 /*
1383 * Reconstruct the missing data using the generated matrix.
1384 */
1385 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1386 invrows, used);
1387
1388 kmem_free(p, psize);
1389
1390 return (code);
1391 }
1392
1393 static int
1394 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1395 {
1396 int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1397 int ntgts;
1398 int i, c;
1399 int code;
1400 int nbadparity, nbaddata;
1401 int parity_valid[VDEV_RAIDZ_MAXPARITY];
1402
1403 /*
1404 * The tgts list must already be sorted.
1405 */
1406 for (i = 1; i < nt; i++) {
1407 ASSERT(t[i] > t[i - 1]);
1408 }
1409
1410 nbadparity = rm->rm_firstdatacol;
1411 nbaddata = rm->rm_cols - nbadparity;
1412 ntgts = 0;
1413 for (i = 0, c = 0; c < rm->rm_cols; c++) {
1414 if (c < rm->rm_firstdatacol)
1415 parity_valid[c] = B_FALSE;
1416
1417 if (i < nt && c == t[i]) {
1418 tgts[ntgts++] = c;
1419 i++;
1420 } else if (rm->rm_col[c].rc_error != 0) {
1421 tgts[ntgts++] = c;
1422 } else if (c >= rm->rm_firstdatacol) {
1423 nbaddata--;
1424 } else {
1425 parity_valid[c] = B_TRUE;
1426 nbadparity--;
1427 }
1428 }
1429
1430 ASSERT(ntgts >= nt);
1431 ASSERT(nbaddata >= 0);
1432 ASSERT(nbaddata + nbadparity == ntgts);
1433
1434 dt = &tgts[nbadparity];
1435
1436 /*
1437 * See if we can use any of our optimized reconstruction routines.
1438 */
1439 if (!vdev_raidz_default_to_general) {
1440 switch (nbaddata) {
1441 case 1:
1442 if (parity_valid[VDEV_RAIDZ_P])
1443 return (vdev_raidz_reconstruct_p(rm, dt, 1));
1444
1445 ASSERT(rm->rm_firstdatacol > 1);
1446
1447 if (parity_valid[VDEV_RAIDZ_Q])
1448 return (vdev_raidz_reconstruct_q(rm, dt, 1));
1449
1450 ASSERT(rm->rm_firstdatacol > 2);
1451 break;
1452
1453 case 2:
1454 ASSERT(rm->rm_firstdatacol > 1);
1455
1456 if (parity_valid[VDEV_RAIDZ_P] &&
1457 parity_valid[VDEV_RAIDZ_Q])
1458 return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1459
1460 ASSERT(rm->rm_firstdatacol > 2);
1461
1462 break;
1463 }
1464 }
1465
1466 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1467 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1468 ASSERT(code > 0);
1469 return (code);
1470 }
1471
1472 static int
1473 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1474 uint64_t *ashift)
1475 {
1476 vdev_t *cvd;
1477 uint64_t nparity = vd->vdev_nparity;
1478 int c;
1479 int lasterror = 0;
1480 int numerrors = 0;
1481
1482 ASSERT(nparity > 0);
1483
1484 if (nparity > VDEV_RAIDZ_MAXPARITY ||
1485 vd->vdev_children < nparity + 1) {
1486 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1487 return (SET_ERROR(EINVAL));
1488 }
1489
1490 vdev_open_children(vd);
1491
1492 for (c = 0; c < vd->vdev_children; c++) {
1493 cvd = vd->vdev_child[c];
1494
1495 if (cvd->vdev_open_error != 0) {
1496 lasterror = cvd->vdev_open_error;
1497 numerrors++;
1498 continue;
1499 }
1500
1501 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1502 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1503 *ashift = MAX(*ashift, cvd->vdev_ashift);
1504 }
1505
1506 *asize *= vd->vdev_children;
1507 *max_asize *= vd->vdev_children;
1508
1509 if (numerrors > nparity) {
1510 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1511 return (lasterror);
1512 }
1513
1514 return (0);
1515 }
1516
1517 static void
1518 vdev_raidz_close(vdev_t *vd)
1519 {
1520 int c;
1521
1522 for (c = 0; c < vd->vdev_children; c++)
1523 vdev_close(vd->vdev_child[c]);
1524 }
1525
1526 /*
1527 * Handle a read or write I/O to a RAID-Z dump device.
1528 *
1529 * The dump device is in a unique situation compared to other ZFS datasets:
1530 * writing to this device should be as simple and fast as possible. In
1531 * addition, durability matters much less since the dump will be extracted
1532 * once the machine reboots. For that reason, this function eschews parity for
1533 * performance and simplicity. The dump device uses the checksum setting
1534 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1535 * dataset.
1536 *
1537 * Blocks of size 128 KB have been preallocated for this volume. I/Os less than
1538 * 128 KB will not fill an entire block; in addition, they may not be properly
1539 * aligned. In that case, this function uses the preallocated 128 KB block and
1540 * omits reading or writing any "empty" portions of that block, as opposed to
1541 * allocating a fresh appropriately-sized block.
1542 *
1543 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1544 *
1545 * vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1546 *
1547 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1548 * allocated which spans all five child vdevs. 8 KB of data would be written to
1549 * each of four vdevs, with the fifth containing the parity bits.
1550 *
1551 * parity data data data data
1552 * | PP | XX | XX | XX | XX |
1553 * ^ ^ ^ ^ ^
1554 * | | | | |
1555 * 8 KB parity ------8 KB data blocks------
1556 *
1557 * However, when writing to the dump device, the behavior is different:
1558 *
1559 * vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1560 *
1561 * Unlike the normal RAID-Z case in which the block is allocated based on the
1562 * I/O size, reads and writes here always use a 128 KB logical I/O size. If the
1563 * I/O size is less than 128 KB, only the actual portions of data are written.
1564 * In this example the data is written to the third data vdev since that vdev
1565 * contains the offset [64 KB, 96 KB).
1566 *
1567 * parity data data data data
1568 * | | | | XX | |
1569 * ^
1570 * |
1571 * 32 KB data block
1572 *
1573 * As a result, an individual I/O may not span all child vdevs; moreover, a
1574 * small I/O may only operate on a single child vdev.
1575 *
1576 * Note that since there are no parity bits calculated or written, this format
1577 * remains the same no matter how many parity bits are used in a normal RAID-Z
1578 * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above
1579 * would look like:
1580 *
1581 * parity parity parity data data data data
1582 * | | | | | | XX | |
1583 * ^
1584 * |
1585 * 32 KB data block
1586 */
1587 int
1588 vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
1589 uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
1590 {
1591 vdev_t *tvd = vd->vdev_top;
1592 vdev_t *cvd;
1593 raidz_map_t *rm;
1594 raidz_col_t *rc;
1595 int c, err = 0;
1596
1597 uint64_t start, end, colstart, colend;
1598 uint64_t coloffset, colsize, colskip;
1599
1600 int flags = doread ? B_READ : B_WRITE;
1601
1602 #ifdef _KERNEL
1603
1604 /*
1605 * Don't write past the end of the block
1606 */
1607 VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
1608
1609 start = offset;
1610 end = start + size;
1611
1612 /*
1613 * Allocate a RAID-Z map for this block. Note that this block starts
1614 * from the "original" offset, this is, the offset of the extent which
1615 * contains the requisite offset of the data being read or written.
1616 *
1617 * Even if this I/O operation doesn't span the full block size, let's
1618 * treat the on-disk format as if the only blocks are the complete 128
1619 * KB size.
1620 */
1621 rm = vdev_raidz_map_alloc(data - (offset - origoffset),
1622 SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
1623 vd->vdev_children, vd->vdev_nparity);
1624
1625 coloffset = origoffset;
1626
1627 for (c = rm->rm_firstdatacol; c < rm->rm_cols;
1628 c++, coloffset += rc->rc_size) {
1629 rc = &rm->rm_col[c];
1630 cvd = vd->vdev_child[rc->rc_devidx];
1631
1632 /*
1633 * Find the start and end of this column in the RAID-Z map,
1634 * keeping in mind that the stated size and offset of the
1635 * operation may not fill the entire column for this vdev.
1636 *
1637 * If any portion of the data spans this column, issue the
1638 * appropriate operation to the vdev.
1639 */
1640 if (coloffset + rc->rc_size <= start)
1641 continue;
1642 if (coloffset >= end)
1643 continue;
1644
1645 colstart = MAX(coloffset, start);
1646 colend = MIN(end, coloffset + rc->rc_size);
1647 colsize = colend - colstart;
1648 colskip = colstart - coloffset;
1649
1650 VERIFY3U(colsize, <=, rc->rc_size);
1651 VERIFY3U(colskip, <=, rc->rc_size);
1652
1653 /*
1654 * Note that the child vdev will have a vdev label at the start
1655 * of its range of offsets, hence the need for
1656 * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another
1657 * example of why this calculation is needed.
1658 */
1659 if ((err = vdev_disk_physio(cvd,
1660 ((char *)rc->rc_data) + colskip, colsize,
1661 VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
1662 flags, isdump)) != 0)
1663 break;
1664 }
1665
1666 vdev_raidz_map_free(rm);
1667 #endif	/* _KERNEL */
1668
1669 return (err);
1670 }
1671
1672 static uint64_t
1673 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1674 {
1675 uint64_t asize;
1676 uint64_t ashift = vd->vdev_top->vdev_ashift;
1677 uint64_t cols = vd->vdev_children;
1678 uint64_t nparity = vd->vdev_nparity;
1679
1680 asize = ((psize - 1) >> ashift) + 1;
1681 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1682 asize = roundup(asize, nparity + 1) << ashift;
1683
1684 return (asize);
1685 }
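/*
 * For example (illustrative numbers only): with ashift = 9, cols = 5
 * children, and nparity = 1, a psize of 8192 bytes yields
 *
 *	asize = ((8192 - 1) >> 9) + 1 = 16 data sectors
 *	asize += 1 * ((16 + 5 - 1 - 1) / (5 - 1)) = 16 + 4 = 20 sectors
 *	asize = roundup(20, 2) << 9 = 10240 bytes
 *
 * matching the rm_asize computed by vdev_raidz_map_alloc() for the same
 * I/O in the 8 KB example above.
 */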
1686
1687 static void
1688 vdev_raidz_child_done(zio_t *zio)
1689 {
1690 raidz_col_t *rc = zio->io_private;
1691
1692 rc->rc_error = zio->io_error;
1693 rc->rc_tried = 1;
1694 rc->rc_skipped = 0;
1695 }
1696
1697 /*
1698  * Start an I/O operation on a RAID-Z vdev
1699 *
1700 * Outline:
1701 * - For write operations:
1702 * 1. Generate the parity data
1703 * 2. Create child zio write operations to each column's vdev, for both
1704 * data and parity.
1705 * 3. If the column skips any sectors for padding, create optional dummy
1706 * write zio children for those areas to improve aggregation continuity.
1707 * - For read operations:
1708 * 1. Create child zio read operations to each data column's vdev to read
1709 * the range of data required for zio.
1710 * 2. If this is a scrub or resilver operation, or if any of the data
1711 * vdevs have had errors, then create zio read operations to the parity
1712 * columns' VDevs as well.
1713 */
1714 static void
1715 vdev_raidz_io_start(zio_t *zio)
1716 {
1717 vdev_t *vd = zio->io_vd;
1718 vdev_t *tvd = vd->vdev_top;
1719 vdev_t *cvd;
1720 raidz_map_t *rm;
1721 raidz_col_t *rc;
1722 int c, i;
1723
1724 rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
1725 tvd->vdev_ashift, vd->vdev_children,
1726 vd->vdev_nparity);
1727
1728 zio->io_vsd = rm;
1729 zio->io_vsd_ops = &vdev_raidz_vsd_ops;
1730
1731 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1732
1733 if (zio->io_type == ZIO_TYPE_WRITE) {
1734 vdev_raidz_generate_parity(rm);
1735
1736 for (c = 0; c < rm->rm_cols; c++) {
1737 rc = &rm->rm_col[c];
1738 cvd = vd->vdev_child[rc->rc_devidx];
1739 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1740 rc->rc_offset, rc->rc_data, rc->rc_size,
1741 zio->io_type, zio->io_priority, 0,
1742 vdev_raidz_child_done, rc));
1743 }
1744
1745 /*
1746 * Generate optional I/Os for any skipped sectors to improve
1747 * aggregation contiguity.
1748 */
1749 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1750 ASSERT(c <= rm->rm_scols);
1751 if (c == rm->rm_scols)
1752 c = 0;
1753 rc = &rm->rm_col[c];
1754 cvd = vd->vdev_child[rc->rc_devidx];
1755 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1756 rc->rc_offset + rc->rc_size, NULL,
1757 1 << tvd->vdev_ashift,
1758 zio->io_type, zio->io_priority,
1759 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1760 }
1761
1762 zio_execute(zio);
1763 return;
1764 }

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		if (cvd->vdev_avoid_read) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ENXIO);
			rc->rc_skipped = 1;	/* only try if necessary */
			continue;
		}
		if (!vdev_readable(cvd)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ENXIO);
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ESTALE);
			rc->rc_skipped = 1;
			continue;
		}
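		/*
		 * Data columns are always read here; parity columns are read
		 * only once some data is already known to be missing, or when
		 * this is a scrub or resilver, which must verify every
		 * column.
		 */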
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	zio_execute(zio);
}


/*
 * Report a checksum error for a child of a RAID-Z device.
 */
static void
raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
{
	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];

	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		zio_bad_cksum_t zbc;
		raidz_map_t *rm = zio->io_vsd;

		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_checksum_errors++;
		mutex_exit(&vd->vdev_stat_lock);

		zbc.zbc_has_cksum = 0;
		zbc.zbc_injected = rm->rm_ecksuminjected;

		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
		    &zbc);
	}
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(zio_t *zio)
{
	zio_bad_cksum_t zbc;
	raidz_map_t *rm = zio->io_vsd;

	int ret = zio_checksum_error(zio, &zbc);
	if (ret != 0 && zbc.zbc_injected != 0)
		rm->rm_ecksuminjected = 1;

	return (ret);
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches
 * the data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	blkptr_t *bp = zio->io_bp;
	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));

	if (checksum == ZIO_CHECKSUM_NOPARITY)
		return (ret);

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zio_buf_alloc(rc->rc_size);
		bcopy(rc->rc_data, orig[c], rc->rc_size);
	}

	vdev_raidz_generate_parity(rm);

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			raidz_checksum_error(zio, rc, orig[c]);
			rc->rc_error = SET_ERROR(ECKSUM);
			ret++;
		}
		zio_buf_free(orig[c], rc->rc_size);
	}

	return (ret);
}

/*
 * Keep statistics on all the ways that we used parity to correct data.
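 * The index is the code returned by vdev_raidz_reconstruct() -- a bitmask
 * recording which parity columns were used (see its use in
 * vdev_raidz_combrec() and vdev_raidz_io_done() below).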
 */
static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];

static int
vdev_raidz_worst_error(raidz_map_t *rm)
{
	int error = 0;

	for (int c = 0; c < rm->rm_cols; c++)
		error = zio_worst_error(error, rm->rm_col[c].rc_error);

	return (error);
}

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
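 *
 * As a concrete illustration: with double parity (rm_firstdatacol == 2)
 * and no column errors, the loop below tries n == 1 first and then n == 2,
 * and for each n enumerates target sets in lexicographic order over the
 * error-free columns -- with the last target forced into the data columns
 * when no data errors were seen.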
 */
static int
vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
{
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					if (rc->rc_tried)
						raidz_checksum_error(zio, rc,
						    orig[i]);
					rc->rc_error = SET_ERROR(ECKSUM);
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

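			/*
			 * Advance to the next combination of targets.  This
			 * behaves like an odometer over the error-free
			 * columns: bump the lowest target that has room to
			 * move; when a target runs into the one above it,
			 * reset it to just past its lower neighbor and carry
			 * into the next position.
			 */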
			do {
				/*
				 * Find the next valid column after the
				 * current position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
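	/*
	 * Every combination failed.  The loop above has left n one greater
	 * than the number of orig[] buffers actually allocated, so step it
	 * back before freeing them below.
	 */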
	n--;
done:
	for (i = 0; i < n; i++) {
		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}

/*
 * Complete an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Check for errors on the child IOs.
 *   2. Return, setting an error code if too few child VDevs were written
 *      to reconstruct the data later.  Note that partial writes are
 *      considered successful if they can be reconstructed at all.
 * - For read operations:
 *   1. Check for errors on the child IOs.
 *   2. If data errors occurred:
 *      a. Try to reassemble the data from the parity available.
 *      b. If we haven't yet read the parity drives, read them now.
 *      c. If all parity drives have been read but the data still doesn't
 *         reassemble with a correct checksum, then try combinatorial
 *         reconstruction.
 *      d. If that doesn't work, return an error.
 *   3. If there were unexpected errors or this is a resilver operation,
 *      rewrite the vdevs that had errors.
 */
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	int unexpected_errors = 0;
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	ASSERT(zio->io_bp != NULL);	/* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as a success.
		 * (If we couldn't write enough columns to reconstruct
		 * the data, the I/O failed.  Otherwise, good enough.)
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
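	 *
	 * For example, on raidz2 (two parity columns) a single data error is
	 * correctable here only if at least one parity column was actually
	 * read, since 1 <= 2 - parity_untried requires parity_untried <= 1.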
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(zio) == 0) {
				/*
				 * If we read parity information
				 * (unnecessarily, as it happens, since no
				 * reconstruction was needed), regenerate and
				 * verify the parity.  We also regenerate
				 * parity when resilvering so we can write it
				 * out to the failed device later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

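	/*
	 * The outer loop below locates the first untried column; only then
	 * is zio_vdev_io_redone() called (at most once), with the inner loop
	 * issuing reads for every remaining untried column before returning.
	 */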
	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		zio_vdev_io_redone(zio);
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		return;
	}

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		zio->io_error = vdev_raidz_worst_error(rm);

	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(zio, rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * If the IO wasn't speculative, start checksum ereports for
		 * all children that haven't failed.
		 */
		zio->io_error = SET_ERROR(ECKSUM);

		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error == 0) {
					zio_bad_cksum_t zbc;
					zbc.zbc_has_cksum = 0;
					zbc.zbc_injected =
					    rm->rm_ecksuminjected;

					zfs_ereport_start_checksum(
					    zio->io_spa,
					    vd->vdev_child[rc->rc_devidx],
					    zio, rc->rc_offset, rc->rc_size,
					    (void *)(uintptr_t)c, &zbc);
				}
			}
		}
	}

done:
	zio_checksum_verified(zio);

	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error == 0)
				continue;

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}

static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
	if (faulted > vd->vdev_nparity)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_NO_REPLICAS);
	else if (degraded + faulted != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	else
		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}

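/*
 * Note: the two NULL entries below are, in the vdev_ops_t layout assumed
 * here, the vdev_op_hold and vdev_op_rele slots, which RAID-Z does not
 * implement.
 */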
vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	NULL,
	NULL,
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};