1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/zfs_context.h> 28 #include <sys/spa.h> 29 #include <sys/vdev_impl.h> 30 #include <sys/zio.h> 31 #include <sys/zio_checksum.h> 32 #include <sys/fs/zfs.h> 33 #include <sys/fm/fs/zfs.h> 34 35 /* 36 * Virtual device vector for RAID-Z. 37 * 38 * This vdev supports both single and double parity. For single parity, we 39 * use a simple XOR of all the data columns. For double parity, we use both 40 * the simple XOR as well as a technique described in "The mathematics of 41 * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8), 42 * over the integers expressable in a single byte. Briefly, the operations on 43 * the field are defined as follows: 44 * 45 * o addition (+) is represented by a bitwise XOR 46 * o subtraction (-) is therefore identical to addition: A + B = A - B 47 * o multiplication of A by 2 is defined by the following bitwise expression: 48 * (A * 2)_7 = A_6 49 * (A * 2)_6 = A_5 50 * (A * 2)_5 = A_4 51 * (A * 2)_4 = A_3 + A_7 52 * (A * 2)_3 = A_2 + A_7 53 * (A * 2)_2 = A_1 + A_7 54 * (A * 2)_1 = A_0 55 * (A * 2)_0 = A_7 56 * 57 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). 58 * 59 * Observe that any number in the field (except for 0) can be expressed as a 60 * power of 2 -- a generator for the field. We store a table of the powers of 61 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can 62 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather 63 * than field addition). The inverse of a field element A (A^-1) is A^254. 64 * 65 * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1, 66 * can be expressed by field operations: 67 * 68 * P = D_0 + D_1 + ... + D_n-2 + D_n-1 69 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 70 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 71 * 72 * See the reconstruction code below for how P and Q can used individually or 73 * in concert to recover missing data columns. 74 */ 75 76 typedef struct raidz_col { 77 uint64_t rc_devidx; /* child device index for I/O */ 78 uint64_t rc_offset; /* device offset */ 79 uint64_t rc_size; /* I/O size */ 80 void *rc_data; /* I/O data */ 81 int rc_error; /* I/O error for this device */ 82 uint8_t rc_tried; /* Did we attempt this I/O column? */ 83 uint8_t rc_skipped; /* Did we skip this I/O column? */ 84 } raidz_col_t; 85 86 typedef struct raidz_map { 87 uint64_t rm_cols; /* Column count */ 88 uint64_t rm_bigcols; /* Number of oversized columns */ 89 uint64_t rm_asize; /* Actual total I/O size */ 90 uint64_t rm_missingdata; /* Count of missing data devices */ 91 uint64_t rm_missingparity; /* Count of missing parity devices */ 92 uint64_t rm_firstdatacol; /* First data column/parity count */ 93 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ 94 } raidz_map_t; 95 96 #define VDEV_RAIDZ_P 0 97 #define VDEV_RAIDZ_Q 1 98 99 #define VDEV_RAIDZ_MAXPARITY 2 100 101 #define VDEV_RAIDZ_MUL_2(a) (((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0)) 102 103 /* 104 * These two tables represent powers and logs of 2 in the Galois field defined 105 * above. These values were computed by repeatedly multiplying by 2 as above. 106 */ 107 static const uint8_t vdev_raidz_pow2[256] = { 108 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 109 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 110 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 111 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 112 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 113 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 114 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, 115 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 116 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 117 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, 118 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 119 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 120 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 121 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 122 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 123 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 124 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 125 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 126 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 127 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 128 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 129 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 130 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 131 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 132 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, 133 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 134 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 135 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, 136 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 137 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, 138 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 139 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 140 }; 141 static const uint8_t vdev_raidz_log2[256] = { 142 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 143 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, 144 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, 145 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, 146 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, 147 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, 148 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, 149 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, 150 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, 151 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, 152 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, 153 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, 154 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, 155 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, 156 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, 157 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, 158 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, 159 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, 160 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, 161 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, 162 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, 163 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, 164 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, 165 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, 166 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, 167 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, 168 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, 169 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, 170 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, 171 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, 172 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, 173 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, 174 }; 175 176 /* 177 * Multiply a given number by 2 raised to the given power. 178 */ 179 static uint8_t 180 vdev_raidz_exp2(uint_t a, int exp) 181 { 182 if (a == 0) 183 return (0); 184 185 ASSERT(exp >= 0); 186 ASSERT(vdev_raidz_log2[a] > 0 || a == 1); 187 188 exp += vdev_raidz_log2[a]; 189 if (exp > 255) 190 exp -= 255; 191 192 return (vdev_raidz_pow2[exp]); 193 } 194 195 static void 196 vdev_raidz_map_free(zio_t *zio) 197 { 198 raidz_map_t *rm = zio->io_vsd; 199 int c; 200 201 for (c = 0; c < rm->rm_firstdatacol; c++) 202 zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); 203 204 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols])); 205 } 206 207 static raidz_map_t * 208 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, 209 uint64_t nparity) 210 { 211 raidz_map_t *rm; 212 uint64_t b = zio->io_offset >> unit_shift; 213 uint64_t s = zio->io_size >> unit_shift; 214 uint64_t f = b % dcols; 215 uint64_t o = (b / dcols) << unit_shift; 216 uint64_t q, r, c, bc, col, acols, coff, devidx; 217 218 q = s / (dcols - nparity); 219 r = s - q * (dcols - nparity); 220 bc = (r == 0 ? 0 : r + nparity); 221 222 acols = (q == 0 ? bc : dcols); 223 224 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP); 225 226 rm->rm_cols = acols; 227 rm->rm_bigcols = bc; 228 rm->rm_asize = 0; 229 rm->rm_missingdata = 0; 230 rm->rm_missingparity = 0; 231 rm->rm_firstdatacol = nparity; 232 233 for (c = 0; c < acols; c++) { 234 col = f + c; 235 coff = o; 236 if (col >= dcols) { 237 col -= dcols; 238 coff += 1ULL << unit_shift; 239 } 240 rm->rm_col[c].rc_devidx = col; 241 rm->rm_col[c].rc_offset = coff; 242 rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift; 243 rm->rm_col[c].rc_data = NULL; 244 rm->rm_col[c].rc_error = 0; 245 rm->rm_col[c].rc_tried = 0; 246 rm->rm_col[c].rc_skipped = 0; 247 rm->rm_asize += rm->rm_col[c].rc_size; 248 } 249 250 rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift); 251 252 for (c = 0; c < rm->rm_firstdatacol; c++) 253 rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); 254 255 rm->rm_col[c].rc_data = zio->io_data; 256 257 for (c = c + 1; c < acols; c++) 258 rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + 259 rm->rm_col[c - 1].rc_size; 260 261 /* 262 * If all data stored spans all columns, there's a danger that parity 263 * will always be on the same device and, since parity isn't read 264 * during normal operation, that that device's I/O bandwidth won't be 265 * used effectively. We therefore switch the parity every 1MB. 266 * 267 * ... at least that was, ostensibly, the theory. As a practical 268 * matter unless we juggle the parity between all devices evenly, we 269 * won't see any benefit. Further, occasional writes that aren't a 270 * multiple of the LCM of the number of children and the minimum 271 * stripe width are sufficient to avoid pessimal behavior. 272 * Unfortunately, this decision created an implicit on-disk format 273 * requirement that we need to support for all eternity, but only 274 * for single-parity RAID-Z. 275 */ 276 ASSERT(rm->rm_cols >= 2); 277 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 278 279 if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { 280 devidx = rm->rm_col[0].rc_devidx; 281 o = rm->rm_col[0].rc_offset; 282 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; 283 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 284 rm->rm_col[1].rc_devidx = devidx; 285 rm->rm_col[1].rc_offset = o; 286 } 287 288 zio->io_vsd = rm; 289 zio->io_vsd_free = vdev_raidz_map_free; 290 return (rm); 291 } 292 293 static void 294 vdev_raidz_generate_parity_p(raidz_map_t *rm) 295 { 296 uint64_t *p, *src, pcount, ccount, i; 297 int c; 298 299 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 300 301 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 302 src = rm->rm_col[c].rc_data; 303 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 304 ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 305 306 if (c == rm->rm_firstdatacol) { 307 ASSERT(ccount == pcount); 308 for (i = 0; i < ccount; i++, p++, src++) { 309 *p = *src; 310 } 311 } else { 312 ASSERT(ccount <= pcount); 313 for (i = 0; i < ccount; i++, p++, src++) { 314 *p ^= *src; 315 } 316 } 317 } 318 } 319 320 static void 321 vdev_raidz_generate_parity_pq(raidz_map_t *rm) 322 { 323 uint64_t *q, *p, *src, pcount, ccount, mask, i; 324 int c; 325 326 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 327 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 328 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 329 330 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 331 src = rm->rm_col[c].rc_data; 332 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 333 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 334 ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 335 336 if (c == rm->rm_firstdatacol) { 337 ASSERT(ccount == pcount || ccount == 0); 338 for (i = 0; i < ccount; i++, p++, q++, src++) { 339 *q = *src; 340 *p = *src; 341 } 342 for (; i < pcount; i++, p++, q++, src++) { 343 *q = 0; 344 *p = 0; 345 } 346 } else { 347 ASSERT(ccount <= pcount); 348 349 /* 350 * Rather than multiplying each byte individually (as 351 * described above), we are able to handle 8 at once 352 * by generating a mask based on the high bit in each 353 * byte and using that to conditionally XOR in 0x1d. 354 */ 355 for (i = 0; i < ccount; i++, p++, q++, src++) { 356 mask = *q & 0x8080808080808080ULL; 357 mask = (mask << 1) - (mask >> 7); 358 *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ 359 (mask & 0x1d1d1d1d1d1d1d1dULL); 360 *q ^= *src; 361 *p ^= *src; 362 } 363 364 /* 365 * Treat short columns as though they are full of 0s. 366 */ 367 for (; i < pcount; i++, q++) { 368 mask = *q & 0x8080808080808080ULL; 369 mask = (mask << 1) - (mask >> 7); 370 *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ 371 (mask & 0x1d1d1d1d1d1d1d1dULL); 372 } 373 } 374 } 375 } 376 377 static void 378 vdev_raidz_reconstruct_p(raidz_map_t *rm, int x) 379 { 380 uint64_t *dst, *src, xcount, ccount, count, i; 381 int c; 382 383 xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 384 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); 385 ASSERT(xcount > 0); 386 387 src = rm->rm_col[VDEV_RAIDZ_P].rc_data; 388 dst = rm->rm_col[x].rc_data; 389 for (i = 0; i < xcount; i++, dst++, src++) { 390 *dst = *src; 391 } 392 393 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 394 src = rm->rm_col[c].rc_data; 395 dst = rm->rm_col[x].rc_data; 396 397 if (c == x) 398 continue; 399 400 ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 401 count = MIN(ccount, xcount); 402 403 for (i = 0; i < count; i++, dst++, src++) { 404 *dst ^= *src; 405 } 406 } 407 } 408 409 static void 410 vdev_raidz_reconstruct_q(raidz_map_t *rm, int x) 411 { 412 uint64_t *dst, *src, xcount, ccount, count, mask, i; 413 uint8_t *b; 414 int c, j, exp; 415 416 xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 417 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); 418 419 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 420 src = rm->rm_col[c].rc_data; 421 dst = rm->rm_col[x].rc_data; 422 423 if (c == x) 424 ccount = 0; 425 else 426 ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 427 428 count = MIN(ccount, xcount); 429 430 if (c == rm->rm_firstdatacol) { 431 for (i = 0; i < count; i++, dst++, src++) { 432 *dst = *src; 433 } 434 for (; i < xcount; i++, dst++) { 435 *dst = 0; 436 } 437 438 } else { 439 /* 440 * For an explanation of this, see the comment in 441 * vdev_raidz_generate_parity_pq() above. 442 */ 443 for (i = 0; i < count; i++, dst++, src++) { 444 mask = *dst & 0x8080808080808080ULL; 445 mask = (mask << 1) - (mask >> 7); 446 *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ 447 (mask & 0x1d1d1d1d1d1d1d1dULL); 448 *dst ^= *src; 449 } 450 451 for (; i < xcount; i++, dst++) { 452 mask = *dst & 0x8080808080808080ULL; 453 mask = (mask << 1) - (mask >> 7); 454 *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ 455 (mask & 0x1d1d1d1d1d1d1d1dULL); 456 } 457 } 458 } 459 460 src = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 461 dst = rm->rm_col[x].rc_data; 462 exp = 255 - (rm->rm_cols - 1 - x); 463 464 for (i = 0; i < xcount; i++, dst++, src++) { 465 *dst ^= *src; 466 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 467 *b = vdev_raidz_exp2(*b, exp); 468 } 469 } 470 } 471 472 static void 473 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y) 474 { 475 uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; 476 void *pdata, *qdata; 477 uint64_t xsize, ysize, i; 478 479 ASSERT(x < y); 480 ASSERT(x >= rm->rm_firstdatacol); 481 ASSERT(y < rm->rm_cols); 482 483 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); 484 485 /* 486 * Move the parity data aside -- we're going to compute parity as 487 * though columns x and y were full of zeros -- Pxy and Qxy. We want to 488 * reuse the parity generation mechanism without trashing the actual 489 * parity so we make those columns appear to be full of zeros by 490 * setting their lengths to zero. 491 */ 492 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data; 493 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 494 xsize = rm->rm_col[x].rc_size; 495 ysize = rm->rm_col[y].rc_size; 496 497 rm->rm_col[VDEV_RAIDZ_P].rc_data = 498 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); 499 rm->rm_col[VDEV_RAIDZ_Q].rc_data = 500 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); 501 rm->rm_col[x].rc_size = 0; 502 rm->rm_col[y].rc_size = 0; 503 504 vdev_raidz_generate_parity_pq(rm); 505 506 rm->rm_col[x].rc_size = xsize; 507 rm->rm_col[y].rc_size = ysize; 508 509 p = pdata; 510 q = qdata; 511 pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; 512 qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 513 xd = rm->rm_col[x].rc_data; 514 yd = rm->rm_col[y].rc_data; 515 516 /* 517 * We now have: 518 * Pxy = P + D_x + D_y 519 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 520 * 521 * We can then solve for D_x: 522 * D_x = A * (P + Pxy) + B * (Q + Qxy) 523 * where 524 * A = 2^(x - y) * (2^(x - y) + 1)^-1 525 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 526 * 527 * With D_x in hand, we can easily solve for D_y: 528 * D_y = P + Pxy + D_x 529 */ 530 531 a = vdev_raidz_pow2[255 + x - y]; 532 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; 533 tmp = 255 - vdev_raidz_log2[a ^ 1]; 534 535 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 536 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 537 538 for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { 539 *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ 540 vdev_raidz_exp2(*q ^ *qxy, bexp); 541 542 if (i < ysize) 543 *yd = *p ^ *pxy ^ *xd; 544 } 545 546 zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, 547 rm->rm_col[VDEV_RAIDZ_P].rc_size); 548 zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, 549 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 550 551 /* 552 * Restore the saved parity data. 553 */ 554 rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; 555 rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; 556 } 557 558 559 static int 560 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) 561 { 562 uint64_t nparity = vd->vdev_nparity; 563 int lasterror = 0; 564 int numerrors = 0; 565 566 ASSERT(nparity > 0); 567 568 if (nparity > VDEV_RAIDZ_MAXPARITY || 569 vd->vdev_children < nparity + 1) { 570 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 571 return (EINVAL); 572 } 573 574 vdev_open_children(vd); 575 576 for (int c = 0; c < vd->vdev_children; c++) { 577 vdev_t *cvd = vd->vdev_child[c]; 578 579 if (cvd->vdev_open_error) { 580 lasterror = cvd->vdev_open_error; 581 numerrors++; 582 continue; 583 } 584 585 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 586 *ashift = MAX(*ashift, cvd->vdev_ashift); 587 } 588 589 *asize *= vd->vdev_children; 590 591 if (numerrors > nparity) { 592 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 593 return (lasterror); 594 } 595 596 return (0); 597 } 598 599 static void 600 vdev_raidz_close(vdev_t *vd) 601 { 602 for (int c = 0; c < vd->vdev_children; c++) 603 vdev_close(vd->vdev_child[c]); 604 } 605 606 static uint64_t 607 vdev_raidz_asize(vdev_t *vd, uint64_t psize) 608 { 609 uint64_t asize; 610 uint64_t ashift = vd->vdev_top->vdev_ashift; 611 uint64_t cols = vd->vdev_children; 612 uint64_t nparity = vd->vdev_nparity; 613 614 asize = ((psize - 1) >> ashift) + 1; 615 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 616 asize = roundup(asize, nparity + 1) << ashift; 617 618 return (asize); 619 } 620 621 static void 622 vdev_raidz_child_done(zio_t *zio) 623 { 624 raidz_col_t *rc = zio->io_private; 625 626 rc->rc_error = zio->io_error; 627 rc->rc_tried = 1; 628 rc->rc_skipped = 0; 629 } 630 631 static int 632 vdev_raidz_io_start(zio_t *zio) 633 { 634 vdev_t *vd = zio->io_vd; 635 vdev_t *tvd = vd->vdev_top; 636 vdev_t *cvd; 637 blkptr_t *bp = zio->io_bp; 638 raidz_map_t *rm; 639 raidz_col_t *rc; 640 int c; 641 642 rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, 643 vd->vdev_nparity); 644 645 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); 646 647 if (zio->io_type == ZIO_TYPE_WRITE) { 648 /* 649 * Generate RAID parity in the first virtual columns. 650 */ 651 if (rm->rm_firstdatacol == 1) 652 vdev_raidz_generate_parity_p(rm); 653 else 654 vdev_raidz_generate_parity_pq(rm); 655 656 for (c = 0; c < rm->rm_cols; c++) { 657 rc = &rm->rm_col[c]; 658 cvd = vd->vdev_child[rc->rc_devidx]; 659 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 660 rc->rc_offset, rc->rc_data, rc->rc_size, 661 zio->io_type, zio->io_priority, 0, 662 vdev_raidz_child_done, rc)); 663 } 664 665 return (ZIO_PIPELINE_CONTINUE); 666 } 667 668 ASSERT(zio->io_type == ZIO_TYPE_READ); 669 670 /* 671 * Iterate over the columns in reverse order so that we hit the parity 672 * last -- any errors along the way will force us to read the parity 673 * data. 674 */ 675 for (c = rm->rm_cols - 1; c >= 0; c--) { 676 rc = &rm->rm_col[c]; 677 cvd = vd->vdev_child[rc->rc_devidx]; 678 if (!vdev_readable(cvd)) { 679 if (c >= rm->rm_firstdatacol) 680 rm->rm_missingdata++; 681 else 682 rm->rm_missingparity++; 683 rc->rc_error = ENXIO; 684 rc->rc_tried = 1; /* don't even try */ 685 rc->rc_skipped = 1; 686 continue; 687 } 688 if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) { 689 if (c >= rm->rm_firstdatacol) 690 rm->rm_missingdata++; 691 else 692 rm->rm_missingparity++; 693 rc->rc_error = ESTALE; 694 rc->rc_skipped = 1; 695 continue; 696 } 697 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || 698 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { 699 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 700 rc->rc_offset, rc->rc_data, rc->rc_size, 701 zio->io_type, zio->io_priority, 0, 702 vdev_raidz_child_done, rc)); 703 } 704 } 705 706 return (ZIO_PIPELINE_CONTINUE); 707 } 708 709 /* 710 * Report a checksum error for a child of a RAID-Z device. 711 */ 712 static void 713 raidz_checksum_error(zio_t *zio, raidz_col_t *rc) 714 { 715 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 716 717 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 718 mutex_enter(&vd->vdev_stat_lock); 719 vd->vdev_stat.vs_checksum_errors++; 720 mutex_exit(&vd->vdev_stat_lock); 721 } 722 723 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) 724 zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, 725 zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size); 726 } 727 728 /* 729 * Generate the parity from the data columns. If we tried and were able to 730 * read the parity without error, verify that the generated parity matches the 731 * data we read. If it doesn't, we fire off a checksum error. Return the 732 * number such failures. 733 */ 734 static int 735 raidz_parity_verify(zio_t *zio, raidz_map_t *rm) 736 { 737 void *orig[VDEV_RAIDZ_MAXPARITY]; 738 int c, ret = 0; 739 raidz_col_t *rc; 740 741 for (c = 0; c < rm->rm_firstdatacol; c++) { 742 rc = &rm->rm_col[c]; 743 if (!rc->rc_tried || rc->rc_error != 0) 744 continue; 745 orig[c] = zio_buf_alloc(rc->rc_size); 746 bcopy(rc->rc_data, orig[c], rc->rc_size); 747 } 748 749 if (rm->rm_firstdatacol == 1) 750 vdev_raidz_generate_parity_p(rm); 751 else 752 vdev_raidz_generate_parity_pq(rm); 753 754 for (c = 0; c < rm->rm_firstdatacol; c++) { 755 rc = &rm->rm_col[c]; 756 if (!rc->rc_tried || rc->rc_error != 0) 757 continue; 758 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { 759 raidz_checksum_error(zio, rc); 760 rc->rc_error = ECKSUM; 761 ret++; 762 } 763 zio_buf_free(orig[c], rc->rc_size); 764 } 765 766 return (ret); 767 } 768 769 static uint64_t raidz_corrected_p; 770 static uint64_t raidz_corrected_q; 771 static uint64_t raidz_corrected_pq; 772 773 static int 774 vdev_raidz_worst_error(raidz_map_t *rm) 775 { 776 int error = 0; 777 778 for (int c = 0; c < rm->rm_cols; c++) 779 error = zio_worst_error(error, rm->rm_col[c].rc_error); 780 781 return (error); 782 } 783 784 static void 785 vdev_raidz_io_done(zio_t *zio) 786 { 787 vdev_t *vd = zio->io_vd; 788 vdev_t *cvd; 789 raidz_map_t *rm = zio->io_vsd; 790 raidz_col_t *rc, *rc1; 791 int unexpected_errors = 0; 792 int parity_errors = 0; 793 int parity_untried = 0; 794 int data_errors = 0; 795 int total_errors = 0; 796 int n, c, c1; 797 798 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ 799 800 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); 801 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); 802 803 for (c = 0; c < rm->rm_cols; c++) { 804 rc = &rm->rm_col[c]; 805 806 if (rc->rc_error) { 807 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 808 809 if (c < rm->rm_firstdatacol) 810 parity_errors++; 811 else 812 data_errors++; 813 814 if (!rc->rc_skipped) 815 unexpected_errors++; 816 817 total_errors++; 818 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { 819 parity_untried++; 820 } 821 } 822 823 if (zio->io_type == ZIO_TYPE_WRITE) { 824 /* 825 * XXX -- for now, treat partial writes as a success. 826 * (If we couldn't write enough columns to reconstruct 827 * the data, the I/O failed. Otherwise, good enough.) 828 * 829 * Now that we support write reallocation, it would be better 830 * to treat partial failure as real failure unless there are 831 * no non-degraded top-level vdevs left, and not update DTLs 832 * if we intend to reallocate. 833 */ 834 /* XXPOLICY */ 835 if (total_errors > rm->rm_firstdatacol) 836 zio->io_error = vdev_raidz_worst_error(rm); 837 838 return; 839 } 840 841 ASSERT(zio->io_type == ZIO_TYPE_READ); 842 /* 843 * There are three potential phases for a read: 844 * 1. produce valid data from the columns read 845 * 2. read all disks and try again 846 * 3. perform combinatorial reconstruction 847 * 848 * Each phase is progressively both more expensive and less likely to 849 * occur. If we encounter more errors than we can repair or all phases 850 * fail, we have no choice but to return an error. 851 */ 852 853 /* 854 * If the number of errors we saw was correctable -- less than or equal 855 * to the number of parity disks read -- attempt to produce data that 856 * has a valid checksum. Naturally, this case applies in the absence of 857 * any errors. 858 */ 859 if (total_errors <= rm->rm_firstdatacol - parity_untried) { 860 switch (data_errors) { 861 case 0: 862 if (zio_checksum_error(zio) == 0) { 863 /* 864 * If we read parity information (unnecessarily 865 * as it happens since no reconstruction was 866 * needed) regenerate and verify the parity. 867 * We also regenerate parity when resilvering 868 * so we can write it out to the failed device 869 * later. 870 */ 871 if (parity_errors + parity_untried < 872 rm->rm_firstdatacol || 873 (zio->io_flags & ZIO_FLAG_RESILVER)) { 874 n = raidz_parity_verify(zio, rm); 875 unexpected_errors += n; 876 ASSERT(parity_errors + n <= 877 rm->rm_firstdatacol); 878 } 879 goto done; 880 } 881 break; 882 883 case 1: 884 /* 885 * We either attempt to read all the parity columns or 886 * none of them. If we didn't try to read parity, we 887 * wouldn't be here in the correctable case. There must 888 * also have been fewer parity errors than parity 889 * columns or, again, we wouldn't be in this code path. 890 */ 891 ASSERT(parity_untried == 0); 892 ASSERT(parity_errors < rm->rm_firstdatacol); 893 894 /* 895 * Find the column that reported the error. 896 */ 897 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 898 rc = &rm->rm_col[c]; 899 if (rc->rc_error != 0) 900 break; 901 } 902 ASSERT(c != rm->rm_cols); 903 ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || 904 rc->rc_error == ESTALE); 905 906 if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { 907 vdev_raidz_reconstruct_p(rm, c); 908 } else { 909 ASSERT(rm->rm_firstdatacol > 1); 910 vdev_raidz_reconstruct_q(rm, c); 911 } 912 913 if (zio_checksum_error(zio) == 0) { 914 if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) 915 atomic_inc_64(&raidz_corrected_p); 916 else 917 atomic_inc_64(&raidz_corrected_q); 918 919 /* 920 * If there's more than one parity disk that 921 * was successfully read, confirm that the 922 * other parity disk produced the correct data. 923 * This routine is suboptimal in that it 924 * regenerates both the parity we wish to test 925 * as well as the parity we just used to 926 * perform the reconstruction, but this should 927 * be a relatively uncommon case, and can be 928 * optimized if it becomes a problem. 929 * We also regenerate parity when resilvering 930 * so we can write it out to the failed device 931 * later. 932 */ 933 if (parity_errors < rm->rm_firstdatacol - 1 || 934 (zio->io_flags & ZIO_FLAG_RESILVER)) { 935 n = raidz_parity_verify(zio, rm); 936 unexpected_errors += n; 937 ASSERT(parity_errors + n <= 938 rm->rm_firstdatacol); 939 } 940 941 goto done; 942 } 943 break; 944 945 case 2: 946 /* 947 * Two data column errors require double parity. 948 */ 949 ASSERT(rm->rm_firstdatacol == 2); 950 951 /* 952 * Find the two columns that reported errors. 953 */ 954 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 955 rc = &rm->rm_col[c]; 956 if (rc->rc_error != 0) 957 break; 958 } 959 ASSERT(c != rm->rm_cols); 960 ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || 961 rc->rc_error == ESTALE); 962 963 for (c1 = c++; c < rm->rm_cols; c++) { 964 rc = &rm->rm_col[c]; 965 if (rc->rc_error != 0) 966 break; 967 } 968 ASSERT(c != rm->rm_cols); 969 ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || 970 rc->rc_error == ESTALE); 971 972 vdev_raidz_reconstruct_pq(rm, c1, c); 973 974 if (zio_checksum_error(zio) == 0) { 975 atomic_inc_64(&raidz_corrected_pq); 976 goto done; 977 } 978 break; 979 980 default: 981 ASSERT(rm->rm_firstdatacol <= 2); 982 ASSERT(0); 983 } 984 } 985 986 /* 987 * This isn't a typical situation -- either we got a read error or 988 * a child silently returned bad data. Read every block so we can 989 * try again with as much data and parity as we can track down. If 990 * we've already been through once before, all children will be marked 991 * as tried so we'll proceed to combinatorial reconstruction. 992 */ 993 unexpected_errors = 1; 994 rm->rm_missingdata = 0; 995 rm->rm_missingparity = 0; 996 997 for (c = 0; c < rm->rm_cols; c++) { 998 if (rm->rm_col[c].rc_tried) 999 continue; 1000 1001 zio_vdev_io_redone(zio); 1002 do { 1003 rc = &rm->rm_col[c]; 1004 if (rc->rc_tried) 1005 continue; 1006 zio_nowait(zio_vdev_child_io(zio, NULL, 1007 vd->vdev_child[rc->rc_devidx], 1008 rc->rc_offset, rc->rc_data, rc->rc_size, 1009 zio->io_type, zio->io_priority, 0, 1010 vdev_raidz_child_done, rc)); 1011 } while (++c < rm->rm_cols); 1012 1013 return; 1014 } 1015 1016 /* 1017 * At this point we've attempted to reconstruct the data given the 1018 * errors we detected, and we've attempted to read all columns. There 1019 * must, therefore, be one or more additional problems -- silent errors 1020 * resulting in invalid data rather than explicit I/O errors resulting 1021 * in absent data. Before we attempt combinatorial reconstruction make 1022 * sure we have a chance of coming up with the right answer. 1023 */ 1024 if (total_errors >= rm->rm_firstdatacol) { 1025 zio->io_error = vdev_raidz_worst_error(rm); 1026 /* 1027 * If there were exactly as many device errors as parity 1028 * columns, yet we couldn't reconstruct the data, then at 1029 * least one device must have returned bad data silently. 1030 */ 1031 if (total_errors == rm->rm_firstdatacol) 1032 zio->io_error = zio_worst_error(zio->io_error, ECKSUM); 1033 goto done; 1034 } 1035 1036 if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { 1037 /* 1038 * Attempt to reconstruct the data from parity P. 1039 */ 1040 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1041 void *orig; 1042 rc = &rm->rm_col[c]; 1043 1044 orig = zio_buf_alloc(rc->rc_size); 1045 bcopy(rc->rc_data, orig, rc->rc_size); 1046 vdev_raidz_reconstruct_p(rm, c); 1047 1048 if (zio_checksum_error(zio) == 0) { 1049 zio_buf_free(orig, rc->rc_size); 1050 atomic_inc_64(&raidz_corrected_p); 1051 1052 /* 1053 * If this child didn't know that it returned 1054 * bad data, inform it. 1055 */ 1056 if (rc->rc_tried && rc->rc_error == 0) 1057 raidz_checksum_error(zio, rc); 1058 rc->rc_error = ECKSUM; 1059 goto done; 1060 } 1061 1062 bcopy(orig, rc->rc_data, rc->rc_size); 1063 zio_buf_free(orig, rc->rc_size); 1064 } 1065 } 1066 1067 if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { 1068 /* 1069 * Attempt to reconstruct the data from parity Q. 1070 */ 1071 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1072 void *orig; 1073 rc = &rm->rm_col[c]; 1074 1075 orig = zio_buf_alloc(rc->rc_size); 1076 bcopy(rc->rc_data, orig, rc->rc_size); 1077 vdev_raidz_reconstruct_q(rm, c); 1078 1079 if (zio_checksum_error(zio) == 0) { 1080 zio_buf_free(orig, rc->rc_size); 1081 atomic_inc_64(&raidz_corrected_q); 1082 1083 /* 1084 * If this child didn't know that it returned 1085 * bad data, inform it. 1086 */ 1087 if (rc->rc_tried && rc->rc_error == 0) 1088 raidz_checksum_error(zio, rc); 1089 rc->rc_error = ECKSUM; 1090 goto done; 1091 } 1092 1093 bcopy(orig, rc->rc_data, rc->rc_size); 1094 zio_buf_free(orig, rc->rc_size); 1095 } 1096 } 1097 1098 if (rm->rm_firstdatacol > 1 && 1099 rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 && 1100 rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { 1101 /* 1102 * Attempt to reconstruct the data from both P and Q. 1103 */ 1104 for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) { 1105 void *orig, *orig1; 1106 rc = &rm->rm_col[c]; 1107 1108 orig = zio_buf_alloc(rc->rc_size); 1109 bcopy(rc->rc_data, orig, rc->rc_size); 1110 1111 for (c1 = c + 1; c1 < rm->rm_cols; c1++) { 1112 rc1 = &rm->rm_col[c1]; 1113 1114 orig1 = zio_buf_alloc(rc1->rc_size); 1115 bcopy(rc1->rc_data, orig1, rc1->rc_size); 1116 1117 vdev_raidz_reconstruct_pq(rm, c, c1); 1118 1119 if (zio_checksum_error(zio) == 0) { 1120 zio_buf_free(orig, rc->rc_size); 1121 zio_buf_free(orig1, rc1->rc_size); 1122 atomic_inc_64(&raidz_corrected_pq); 1123 1124 /* 1125 * If these children didn't know they 1126 * returned bad data, inform them. 1127 */ 1128 if (rc->rc_tried && rc->rc_error == 0) 1129 raidz_checksum_error(zio, rc); 1130 if (rc1->rc_tried && rc1->rc_error == 0) 1131 raidz_checksum_error(zio, rc1); 1132 1133 rc->rc_error = ECKSUM; 1134 rc1->rc_error = ECKSUM; 1135 1136 goto done; 1137 } 1138 1139 bcopy(orig1, rc1->rc_data, rc1->rc_size); 1140 zio_buf_free(orig1, rc1->rc_size); 1141 } 1142 1143 bcopy(orig, rc->rc_data, rc->rc_size); 1144 zio_buf_free(orig, rc->rc_size); 1145 } 1146 } 1147 1148 /* 1149 * All combinations failed to checksum. Generate checksum ereports for 1150 * all children. 1151 */ 1152 zio->io_error = ECKSUM; 1153 1154 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 1155 for (c = 0; c < rm->rm_cols; c++) { 1156 rc = &rm->rm_col[c]; 1157 zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, 1158 zio->io_spa, vd->vdev_child[rc->rc_devidx], zio, 1159 rc->rc_offset, rc->rc_size); 1160 } 1161 } 1162 1163 done: 1164 zio_checksum_verified(zio); 1165 1166 if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 1167 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { 1168 /* 1169 * Use the good data we have in hand to repair damaged children. 1170 */ 1171 for (c = 0; c < rm->rm_cols; c++) { 1172 rc = &rm->rm_col[c]; 1173 cvd = vd->vdev_child[rc->rc_devidx]; 1174 1175 if (rc->rc_error == 0) 1176 continue; 1177 1178 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1179 rc->rc_offset, rc->rc_data, rc->rc_size, 1180 ZIO_TYPE_WRITE, zio->io_priority, 1181 ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 1182 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); 1183 } 1184 } 1185 } 1186 1187 static void 1188 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 1189 { 1190 if (faulted > vd->vdev_nparity) 1191 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1192 VDEV_AUX_NO_REPLICAS); 1193 else if (degraded + faulted != 0) 1194 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 1195 else 1196 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 1197 } 1198 1199 vdev_ops_t vdev_raidz_ops = { 1200 vdev_raidz_open, 1201 vdev_raidz_close, 1202 vdev_raidz_asize, 1203 vdev_raidz_io_start, 1204 vdev_raidz_io_done, 1205 vdev_raidz_state_change, 1206 VDEV_TYPE_RAIDZ, /* name of this vdev type */ 1207 B_FALSE /* not a leaf vdev */ 1208 }; 1209