1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/zfs_context.h> 28 #include <sys/spa.h> 29 #include <sys/vdev_impl.h> 30 #include <sys/zio.h> 31 #include <sys/zio_checksum.h> 32 #include <sys/fs/zfs.h> 33 #include <sys/fm/fs/zfs.h> 34 35 /* 36 * Virtual device vector for RAID-Z. 37 * 38 * This vdev supports both single and double parity. For single parity, we 39 * use a simple XOR of all the data columns. For double parity, we use both 40 * the simple XOR as well as a technique described in "The mathematics of 41 * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8), 42 * over the integers expressable in a single byte. Briefly, the operations on 43 * the field are defined as follows: 44 * 45 * o addition (+) is represented by a bitwise XOR 46 * o subtraction (-) is therefore identical to addition: A + B = A - B 47 * o multiplication of A by 2 is defined by the following bitwise expression: 48 * (A * 2)_7 = A_6 49 * (A * 2)_6 = A_5 50 * (A * 2)_5 = A_4 51 * (A * 2)_4 = A_3 + A_7 52 * (A * 2)_3 = A_2 + A_7 53 * (A * 2)_2 = A_1 + A_7 54 * (A * 2)_1 = A_0 55 * (A * 2)_0 = A_7 56 * 57 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). 58 * 59 * Observe that any number in the field (except for 0) can be expressed as a 60 * power of 2 -- a generator for the field. We store a table of the powers of 61 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can 62 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather 63 * than field addition). The inverse of a field element A (A^-1) is A^254. 64 * 65 * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1, 66 * can be expressed by field operations: 67 * 68 * P = D_0 + D_1 + ... + D_n-2 + D_n-1 69 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 70 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 71 * 72 * See the reconstruction code below for how P and Q can used individually or 73 * in concert to recover missing data columns. 74 */ 75 76 typedef struct raidz_col { 77 uint64_t rc_devidx; /* child device index for I/O */ 78 uint64_t rc_offset; /* device offset */ 79 uint64_t rc_size; /* I/O size */ 80 void *rc_data; /* I/O data */ 81 int rc_error; /* I/O error for this device */ 82 uint8_t rc_tried; /* Did we attempt this I/O column? */ 83 uint8_t rc_skipped; /* Did we skip this I/O column? */ 84 } raidz_col_t; 85 86 typedef struct raidz_map { 87 uint64_t rm_cols; /* Column count */ 88 uint64_t rm_bigcols; /* Number of oversized columns */ 89 uint64_t rm_asize; /* Actual total I/O size */ 90 uint64_t rm_missingdata; /* Count of missing data devices */ 91 uint64_t rm_missingparity; /* Count of missing parity devices */ 92 uint64_t rm_firstdatacol; /* First data column/parity count */ 93 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ 94 } raidz_map_t; 95 96 #define VDEV_RAIDZ_P 0 97 #define VDEV_RAIDZ_Q 1 98 99 #define VDEV_RAIDZ_MAXPARITY 2 100 101 #define VDEV_RAIDZ_MUL_2(a) (((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0)) 102 103 /* 104 * These two tables represent powers and logs of 2 in the Galois field defined 105 * above. These values were computed by repeatedly multiplying by 2 as above. 106 */ 107 static const uint8_t vdev_raidz_pow2[256] = { 108 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 109 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 110 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 111 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 112 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 113 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 114 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, 115 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 116 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 117 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, 118 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 119 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 120 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 121 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 122 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 123 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 124 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 125 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 126 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 127 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 128 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 129 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 130 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 131 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 132 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, 133 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 134 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 135 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, 136 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 137 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, 138 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 139 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 140 }; 141 static const uint8_t vdev_raidz_log2[256] = { 142 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 143 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, 144 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, 145 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, 146 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, 147 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, 148 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, 149 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, 150 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, 151 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, 152 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, 153 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, 154 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, 155 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, 156 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, 157 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, 158 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, 159 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, 160 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, 161 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, 162 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, 163 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, 164 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, 165 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, 166 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, 167 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, 168 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, 169 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, 170 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, 171 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, 172 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, 173 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, 174 }; 175 176 /* 177 * Multiply a given number by 2 raised to the given power. 178 */ 179 static uint8_t 180 vdev_raidz_exp2(uint_t a, int exp) 181 { 182 if (a == 0) 183 return (0); 184 185 ASSERT(exp >= 0); 186 ASSERT(vdev_raidz_log2[a] > 0 || a == 1); 187 188 exp += vdev_raidz_log2[a]; 189 if (exp > 255) 190 exp -= 255; 191 192 return (vdev_raidz_pow2[exp]); 193 } 194 195 static void 196 vdev_raidz_map_free(zio_t *zio) 197 { 198 raidz_map_t *rm = zio->io_vsd; 199 int c; 200 201 for (c = 0; c < rm->rm_firstdatacol; c++) 202 zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); 203 204 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols])); 205 } 206 207 static raidz_map_t * 208 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, 209 uint64_t nparity) 210 { 211 raidz_map_t *rm; 212 uint64_t b = zio->io_offset >> unit_shift; 213 uint64_t s = zio->io_size >> unit_shift; 214 uint64_t f = b % dcols; 215 uint64_t o = (b / dcols) << unit_shift; 216 uint64_t q, r, c, bc, col, acols, coff, devidx; 217 218 q = s / (dcols - nparity); 219 r = s - q * (dcols - nparity); 220 bc = (r == 0 ? 0 : r + nparity); 221 222 acols = (q == 0 ? bc : dcols); 223 224 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP); 225 226 rm->rm_cols = acols; 227 rm->rm_bigcols = bc; 228 rm->rm_asize = 0; 229 rm->rm_missingdata = 0; 230 rm->rm_missingparity = 0; 231 rm->rm_firstdatacol = nparity; 232 233 for (c = 0; c < acols; c++) { 234 col = f + c; 235 coff = o; 236 if (col >= dcols) { 237 col -= dcols; 238 coff += 1ULL << unit_shift; 239 } 240 rm->rm_col[c].rc_devidx = col; 241 rm->rm_col[c].rc_offset = coff; 242 rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift; 243 rm->rm_col[c].rc_data = NULL; 244 rm->rm_col[c].rc_error = 0; 245 rm->rm_col[c].rc_tried = 0; 246 rm->rm_col[c].rc_skipped = 0; 247 rm->rm_asize += rm->rm_col[c].rc_size; 248 } 249 250 rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift); 251 252 for (c = 0; c < rm->rm_firstdatacol; c++) 253 rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); 254 255 rm->rm_col[c].rc_data = zio->io_data; 256 257 for (c = c + 1; c < acols; c++) 258 rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + 259 rm->rm_col[c - 1].rc_size; 260 261 /* 262 * If all data stored spans all columns, there's a danger that parity 263 * will always be on the same device and, since parity isn't read 264 * during normal operation, that that device's I/O bandwidth won't be 265 * used effectively. We therefore switch the parity every 1MB. 266 * 267 * ... at least that was, ostensibly, the theory. As a practical 268 * matter unless we juggle the parity between all devices evenly, we 269 * won't see any benefit. Further, occasional writes that aren't a 270 * multiple of the LCM of the number of children and the minimum 271 * stripe width are sufficient to avoid pessimal behavior. 272 * Unfortunately, this decision created an implicit on-disk format 273 * requirement that we need to support for all eternity, but only 274 * for single-parity RAID-Z. 275 */ 276 ASSERT(rm->rm_cols >= 2); 277 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 278 279 if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { 280 devidx = rm->rm_col[0].rc_devidx; 281 o = rm->rm_col[0].rc_offset; 282 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; 283 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 284 rm->rm_col[1].rc_devidx = devidx; 285 rm->rm_col[1].rc_offset = o; 286 } 287 288 zio->io_vsd = rm; 289 zio->io_vsd_free = vdev_raidz_map_free; 290 return (rm); 291 } 292 293 static void 294 vdev_raidz_generate_parity_p(raidz_map_t *rm) 295 { 296 uint64_t *p, *src, pcount, ccount, i; 297 int c; 298 299 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 300 301 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 302 src = rm->rm_col[c].rc_data; 303 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 304 ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 305 306 if (c == rm->rm_firstdatacol) { 307 ASSERT(ccount == pcount); 308 for (i = 0; i < ccount; i++, p++, src++) { 309 *p = *src; 310 } 311 } else { 312 ASSERT(ccount <= pcount); 313 for (i = 0; i < ccount; i++, p++, src++) { 314 *p ^= *src; 315 } 316 } 317 } 318 } 319 320 static void 321 vdev_raidz_generate_parity_pq(raidz_map_t *rm) 322 { 323 uint64_t *q, *p, *src, pcount, ccount, mask, i; 324 int c; 325 326 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 327 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 328 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 329 330 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 331 src = rm->rm_col[c].rc_data; 332 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 333 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 334 ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 335 336 if (c == rm->rm_firstdatacol) { 337 ASSERT(ccount == pcount || ccount == 0); 338 for (i = 0; i < ccount; i++, p++, q++, src++) { 339 *q = *src; 340 *p = *src; 341 } 342 for (; i < pcount; i++, p++, q++, src++) { 343 *q = 0; 344 *p = 0; 345 } 346 } else { 347 ASSERT(ccount <= pcount); 348 349 /* 350 * Rather than multiplying each byte individually (as 351 * described above), we are able to handle 8 at once 352 * by generating a mask based on the high bit in each 353 * byte and using that to conditionally XOR in 0x1d. 354 */ 355 for (i = 0; i < ccount; i++, p++, q++, src++) { 356 mask = *q & 0x8080808080808080ULL; 357 mask = (mask << 1) - (mask >> 7); 358 *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ 359 (mask & 0x1d1d1d1d1d1d1d1dULL); 360 *q ^= *src; 361 *p ^= *src; 362 } 363 364 /* 365 * Treat short columns as though they are full of 0s. 366 */ 367 for (; i < pcount; i++, q++) { 368 mask = *q & 0x8080808080808080ULL; 369 mask = (mask << 1) - (mask >> 7); 370 *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ 371 (mask & 0x1d1d1d1d1d1d1d1dULL); 372 } 373 } 374 } 375 } 376 377 static void 378 vdev_raidz_reconstruct_p(raidz_map_t *rm, int x) 379 { 380 uint64_t *dst, *src, xcount, ccount, count, i; 381 int c; 382 383 xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 384 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); 385 ASSERT(xcount > 0); 386 387 src = rm->rm_col[VDEV_RAIDZ_P].rc_data; 388 dst = rm->rm_col[x].rc_data; 389 for (i = 0; i < xcount; i++, dst++, src++) { 390 *dst = *src; 391 } 392 393 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 394 src = rm->rm_col[c].rc_data; 395 dst = rm->rm_col[x].rc_data; 396 397 if (c == x) 398 continue; 399 400 ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 401 count = MIN(ccount, xcount); 402 403 for (i = 0; i < count; i++, dst++, src++) { 404 *dst ^= *src; 405 } 406 } 407 } 408 409 static void 410 vdev_raidz_reconstruct_q(raidz_map_t *rm, int x) 411 { 412 uint64_t *dst, *src, xcount, ccount, count, mask, i; 413 uint8_t *b; 414 int c, j, exp; 415 416 xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 417 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); 418 419 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 420 src = rm->rm_col[c].rc_data; 421 dst = rm->rm_col[x].rc_data; 422 423 if (c == x) 424 ccount = 0; 425 else 426 ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 427 428 count = MIN(ccount, xcount); 429 430 if (c == rm->rm_firstdatacol) { 431 for (i = 0; i < count; i++, dst++, src++) { 432 *dst = *src; 433 } 434 for (; i < xcount; i++, dst++) { 435 *dst = 0; 436 } 437 438 } else { 439 /* 440 * For an explanation of this, see the comment in 441 * vdev_raidz_generate_parity_pq() above. 442 */ 443 for (i = 0; i < count; i++, dst++, src++) { 444 mask = *dst & 0x8080808080808080ULL; 445 mask = (mask << 1) - (mask >> 7); 446 *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ 447 (mask & 0x1d1d1d1d1d1d1d1dULL); 448 *dst ^= *src; 449 } 450 451 for (; i < xcount; i++, dst++) { 452 mask = *dst & 0x8080808080808080ULL; 453 mask = (mask << 1) - (mask >> 7); 454 *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ 455 (mask & 0x1d1d1d1d1d1d1d1dULL); 456 } 457 } 458 } 459 460 src = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 461 dst = rm->rm_col[x].rc_data; 462 exp = 255 - (rm->rm_cols - 1 - x); 463 464 for (i = 0; i < xcount; i++, dst++, src++) { 465 *dst ^= *src; 466 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 467 *b = vdev_raidz_exp2(*b, exp); 468 } 469 } 470 } 471 472 static void 473 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y) 474 { 475 uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; 476 void *pdata, *qdata; 477 uint64_t xsize, ysize, i; 478 479 ASSERT(x < y); 480 ASSERT(x >= rm->rm_firstdatacol); 481 ASSERT(y < rm->rm_cols); 482 483 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); 484 485 /* 486 * Move the parity data aside -- we're going to compute parity as 487 * though columns x and y were full of zeros -- Pxy and Qxy. We want to 488 * reuse the parity generation mechanism without trashing the actual 489 * parity so we make those columns appear to be full of zeros by 490 * setting their lengths to zero. 491 */ 492 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data; 493 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 494 xsize = rm->rm_col[x].rc_size; 495 ysize = rm->rm_col[y].rc_size; 496 497 rm->rm_col[VDEV_RAIDZ_P].rc_data = 498 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); 499 rm->rm_col[VDEV_RAIDZ_Q].rc_data = 500 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); 501 rm->rm_col[x].rc_size = 0; 502 rm->rm_col[y].rc_size = 0; 503 504 vdev_raidz_generate_parity_pq(rm); 505 506 rm->rm_col[x].rc_size = xsize; 507 rm->rm_col[y].rc_size = ysize; 508 509 p = pdata; 510 q = qdata; 511 pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; 512 qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 513 xd = rm->rm_col[x].rc_data; 514 yd = rm->rm_col[y].rc_data; 515 516 /* 517 * We now have: 518 * Pxy = P + D_x + D_y 519 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 520 * 521 * We can then solve for D_x: 522 * D_x = A * (P + Pxy) + B * (Q + Qxy) 523 * where 524 * A = 2^(x - y) * (2^(x - y) + 1)^-1 525 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 526 * 527 * With D_x in hand, we can easily solve for D_y: 528 * D_y = P + Pxy + D_x 529 */ 530 531 a = vdev_raidz_pow2[255 + x - y]; 532 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; 533 tmp = 255 - vdev_raidz_log2[a ^ 1]; 534 535 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 536 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 537 538 for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { 539 *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ 540 vdev_raidz_exp2(*q ^ *qxy, bexp); 541 542 if (i < ysize) 543 *yd = *p ^ *pxy ^ *xd; 544 } 545 546 zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, 547 rm->rm_col[VDEV_RAIDZ_P].rc_size); 548 zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, 549 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 550 551 /* 552 * Restore the saved parity data. 553 */ 554 rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; 555 rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; 556 } 557 558 559 static int 560 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) 561 { 562 vdev_t *cvd; 563 uint64_t nparity = vd->vdev_nparity; 564 int c, error; 565 int lasterror = 0; 566 int numerrors = 0; 567 568 ASSERT(nparity > 0); 569 570 if (nparity > VDEV_RAIDZ_MAXPARITY || 571 vd->vdev_children < nparity + 1) { 572 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 573 return (EINVAL); 574 } 575 576 for (c = 0; c < vd->vdev_children; c++) { 577 cvd = vd->vdev_child[c]; 578 579 if ((error = vdev_open(cvd)) != 0) { 580 lasterror = error; 581 numerrors++; 582 continue; 583 } 584 585 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 586 *ashift = MAX(*ashift, cvd->vdev_ashift); 587 } 588 589 *asize *= vd->vdev_children; 590 591 if (numerrors > nparity) { 592 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 593 return (lasterror); 594 } 595 596 return (0); 597 } 598 599 static void 600 vdev_raidz_close(vdev_t *vd) 601 { 602 int c; 603 604 for (c = 0; c < vd->vdev_children; c++) 605 vdev_close(vd->vdev_child[c]); 606 } 607 608 static uint64_t 609 vdev_raidz_asize(vdev_t *vd, uint64_t psize) 610 { 611 uint64_t asize; 612 uint64_t ashift = vd->vdev_top->vdev_ashift; 613 uint64_t cols = vd->vdev_children; 614 uint64_t nparity = vd->vdev_nparity; 615 616 asize = ((psize - 1) >> ashift) + 1; 617 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 618 asize = roundup(asize, nparity + 1) << ashift; 619 620 return (asize); 621 } 622 623 static void 624 vdev_raidz_child_done(zio_t *zio) 625 { 626 raidz_col_t *rc = zio->io_private; 627 628 rc->rc_error = zio->io_error; 629 rc->rc_tried = 1; 630 rc->rc_skipped = 0; 631 } 632 633 static int 634 vdev_raidz_io_start(zio_t *zio) 635 { 636 vdev_t *vd = zio->io_vd; 637 vdev_t *tvd = vd->vdev_top; 638 vdev_t *cvd; 639 blkptr_t *bp = zio->io_bp; 640 raidz_map_t *rm; 641 raidz_col_t *rc; 642 int c; 643 644 rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, 645 vd->vdev_nparity); 646 647 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); 648 649 if (zio->io_type == ZIO_TYPE_WRITE) { 650 /* 651 * Generate RAID parity in the first virtual columns. 652 */ 653 if (rm->rm_firstdatacol == 1) 654 vdev_raidz_generate_parity_p(rm); 655 else 656 vdev_raidz_generate_parity_pq(rm); 657 658 for (c = 0; c < rm->rm_cols; c++) { 659 rc = &rm->rm_col[c]; 660 cvd = vd->vdev_child[rc->rc_devidx]; 661 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 662 rc->rc_offset, rc->rc_data, rc->rc_size, 663 zio->io_type, zio->io_priority, 0, 664 vdev_raidz_child_done, rc)); 665 } 666 667 return (ZIO_PIPELINE_CONTINUE); 668 } 669 670 ASSERT(zio->io_type == ZIO_TYPE_READ); 671 672 /* 673 * Iterate over the columns in reverse order so that we hit the parity 674 * last -- any errors along the way will force us to read the parity 675 * data. 676 */ 677 for (c = rm->rm_cols - 1; c >= 0; c--) { 678 rc = &rm->rm_col[c]; 679 cvd = vd->vdev_child[rc->rc_devidx]; 680 if (!vdev_readable(cvd)) { 681 if (c >= rm->rm_firstdatacol) 682 rm->rm_missingdata++; 683 else 684 rm->rm_missingparity++; 685 rc->rc_error = ENXIO; 686 rc->rc_tried = 1; /* don't even try */ 687 rc->rc_skipped = 1; 688 continue; 689 } 690 if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) { 691 if (c >= rm->rm_firstdatacol) 692 rm->rm_missingdata++; 693 else 694 rm->rm_missingparity++; 695 rc->rc_error = ESTALE; 696 rc->rc_skipped = 1; 697 continue; 698 } 699 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || 700 (zio->io_flags & ZIO_FLAG_SCRUB)) { 701 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 702 rc->rc_offset, rc->rc_data, rc->rc_size, 703 zio->io_type, zio->io_priority, 0, 704 vdev_raidz_child_done, rc)); 705 } 706 } 707 708 return (ZIO_PIPELINE_CONTINUE); 709 } 710 711 /* 712 * Report a checksum error for a child of a RAID-Z device. 713 */ 714 static void 715 raidz_checksum_error(zio_t *zio, raidz_col_t *rc) 716 { 717 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 718 719 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 720 mutex_enter(&vd->vdev_stat_lock); 721 vd->vdev_stat.vs_checksum_errors++; 722 mutex_exit(&vd->vdev_stat_lock); 723 } 724 725 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) 726 zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, 727 zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size); 728 } 729 730 /* 731 * Generate the parity from the data columns. If we tried and were able to 732 * read the parity without error, verify that the generated parity matches the 733 * data we read. If it doesn't, we fire off a checksum error. Return the 734 * number such failures. 735 */ 736 static int 737 raidz_parity_verify(zio_t *zio, raidz_map_t *rm) 738 { 739 void *orig[VDEV_RAIDZ_MAXPARITY]; 740 int c, ret = 0; 741 raidz_col_t *rc; 742 743 for (c = 0; c < rm->rm_firstdatacol; c++) { 744 rc = &rm->rm_col[c]; 745 if (!rc->rc_tried || rc->rc_error != 0) 746 continue; 747 orig[c] = zio_buf_alloc(rc->rc_size); 748 bcopy(rc->rc_data, orig[c], rc->rc_size); 749 } 750 751 if (rm->rm_firstdatacol == 1) 752 vdev_raidz_generate_parity_p(rm); 753 else 754 vdev_raidz_generate_parity_pq(rm); 755 756 for (c = 0; c < rm->rm_firstdatacol; c++) { 757 rc = &rm->rm_col[c]; 758 if (!rc->rc_tried || rc->rc_error != 0) 759 continue; 760 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { 761 raidz_checksum_error(zio, rc); 762 rc->rc_error = ECKSUM; 763 ret++; 764 } 765 zio_buf_free(orig[c], rc->rc_size); 766 } 767 768 return (ret); 769 } 770 771 static uint64_t raidz_corrected_p; 772 static uint64_t raidz_corrected_q; 773 static uint64_t raidz_corrected_pq; 774 775 static int 776 vdev_raidz_worst_error(raidz_map_t *rm) 777 { 778 int error = 0; 779 780 for (int c = 0; c < rm->rm_cols; c++) 781 error = zio_worst_error(error, rm->rm_col[c].rc_error); 782 783 return (error); 784 } 785 786 static void 787 vdev_raidz_io_done(zio_t *zio) 788 { 789 vdev_t *vd = zio->io_vd; 790 vdev_t *cvd; 791 raidz_map_t *rm = zio->io_vsd; 792 raidz_col_t *rc, *rc1; 793 int unexpected_errors = 0; 794 int parity_errors = 0; 795 int parity_untried = 0; 796 int data_errors = 0; 797 int total_errors = 0; 798 int n, c, c1; 799 800 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ 801 802 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); 803 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); 804 805 for (c = 0; c < rm->rm_cols; c++) { 806 rc = &rm->rm_col[c]; 807 808 if (rc->rc_error) { 809 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 810 811 if (c < rm->rm_firstdatacol) 812 parity_errors++; 813 else 814 data_errors++; 815 816 if (!rc->rc_skipped) 817 unexpected_errors++; 818 819 total_errors++; 820 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { 821 parity_untried++; 822 } 823 } 824 825 if (zio->io_type == ZIO_TYPE_WRITE) { 826 /* 827 * XXX -- for now, treat partial writes as a success. 828 * (If we couldn't write enough columns to reconstruct 829 * the data, the I/O failed. Otherwise, good enough.) 830 * 831 * Now that we support write reallocation, it would be better 832 * to treat partial failure as real failure unless there are 833 * no non-degraded top-level vdevs left, and not update DTLs 834 * if we intend to reallocate. 835 */ 836 /* XXPOLICY */ 837 if (total_errors > rm->rm_firstdatacol) 838 zio->io_error = vdev_raidz_worst_error(rm); 839 840 return; 841 } 842 843 ASSERT(zio->io_type == ZIO_TYPE_READ); 844 /* 845 * There are three potential phases for a read: 846 * 1. produce valid data from the columns read 847 * 2. read all disks and try again 848 * 3. perform combinatorial reconstruction 849 * 850 * Each phase is progressively both more expensive and less likely to 851 * occur. If we encounter more errors than we can repair or all phases 852 * fail, we have no choice but to return an error. 853 */ 854 855 /* 856 * If the number of errors we saw was correctable -- less than or equal 857 * to the number of parity disks read -- attempt to produce data that 858 * has a valid checksum. Naturally, this case applies in the absence of 859 * any errors. 860 */ 861 if (total_errors <= rm->rm_firstdatacol - parity_untried) { 862 switch (data_errors) { 863 case 0: 864 if (zio_checksum_error(zio) == 0) { 865 /* 866 * If we read parity information (unnecessarily 867 * as it happens since no reconstruction was 868 * needed) regenerate and verify the parity. 869 * We also regenerate parity when resilvering 870 * so we can write it out to the failed device 871 * later. 872 */ 873 if (parity_errors + parity_untried < 874 rm->rm_firstdatacol || 875 (zio->io_flags & ZIO_FLAG_RESILVER)) { 876 n = raidz_parity_verify(zio, rm); 877 unexpected_errors += n; 878 ASSERT(parity_errors + n <= 879 rm->rm_firstdatacol); 880 } 881 goto done; 882 } 883 break; 884 885 case 1: 886 /* 887 * We either attempt to read all the parity columns or 888 * none of them. If we didn't try to read parity, we 889 * wouldn't be here in the correctable case. There must 890 * also have been fewer parity errors than parity 891 * columns or, again, we wouldn't be in this code path. 892 */ 893 ASSERT(parity_untried == 0); 894 ASSERT(parity_errors < rm->rm_firstdatacol); 895 896 /* 897 * Find the column that reported the error. 898 */ 899 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 900 rc = &rm->rm_col[c]; 901 if (rc->rc_error != 0) 902 break; 903 } 904 ASSERT(c != rm->rm_cols); 905 ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || 906 rc->rc_error == ESTALE); 907 908 if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { 909 vdev_raidz_reconstruct_p(rm, c); 910 } else { 911 ASSERT(rm->rm_firstdatacol > 1); 912 vdev_raidz_reconstruct_q(rm, c); 913 } 914 915 if (zio_checksum_error(zio) == 0) { 916 if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) 917 atomic_inc_64(&raidz_corrected_p); 918 else 919 atomic_inc_64(&raidz_corrected_q); 920 921 /* 922 * If there's more than one parity disk that 923 * was successfully read, confirm that the 924 * other parity disk produced the correct data. 925 * This routine is suboptimal in that it 926 * regenerates both the parity we wish to test 927 * as well as the parity we just used to 928 * perform the reconstruction, but this should 929 * be a relatively uncommon case, and can be 930 * optimized if it becomes a problem. 931 * We also regenerate parity when resilvering 932 * so we can write it out to the failed device 933 * later. 934 */ 935 if (parity_errors < rm->rm_firstdatacol - 1 || 936 (zio->io_flags & ZIO_FLAG_RESILVER)) { 937 n = raidz_parity_verify(zio, rm); 938 unexpected_errors += n; 939 ASSERT(parity_errors + n <= 940 rm->rm_firstdatacol); 941 } 942 943 goto done; 944 } 945 break; 946 947 case 2: 948 /* 949 * Two data column errors require double parity. 950 */ 951 ASSERT(rm->rm_firstdatacol == 2); 952 953 /* 954 * Find the two columns that reported errors. 955 */ 956 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 957 rc = &rm->rm_col[c]; 958 if (rc->rc_error != 0) 959 break; 960 } 961 ASSERT(c != rm->rm_cols); 962 ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || 963 rc->rc_error == ESTALE); 964 965 for (c1 = c++; c < rm->rm_cols; c++) { 966 rc = &rm->rm_col[c]; 967 if (rc->rc_error != 0) 968 break; 969 } 970 ASSERT(c != rm->rm_cols); 971 ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || 972 rc->rc_error == ESTALE); 973 974 vdev_raidz_reconstruct_pq(rm, c1, c); 975 976 if (zio_checksum_error(zio) == 0) { 977 atomic_inc_64(&raidz_corrected_pq); 978 goto done; 979 } 980 break; 981 982 default: 983 ASSERT(rm->rm_firstdatacol <= 2); 984 ASSERT(0); 985 } 986 } 987 988 /* 989 * This isn't a typical situation -- either we got a read error or 990 * a child silently returned bad data. Read every block so we can 991 * try again with as much data and parity as we can track down. If 992 * we've already been through once before, all children will be marked 993 * as tried so we'll proceed to combinatorial reconstruction. 994 */ 995 unexpected_errors = 1; 996 rm->rm_missingdata = 0; 997 rm->rm_missingparity = 0; 998 999 for (c = 0; c < rm->rm_cols; c++) { 1000 if (rm->rm_col[c].rc_tried) 1001 continue; 1002 1003 zio_vdev_io_redone(zio); 1004 do { 1005 rc = &rm->rm_col[c]; 1006 if (rc->rc_tried) 1007 continue; 1008 zio_nowait(zio_vdev_child_io(zio, NULL, 1009 vd->vdev_child[rc->rc_devidx], 1010 rc->rc_offset, rc->rc_data, rc->rc_size, 1011 zio->io_type, zio->io_priority, 0, 1012 vdev_raidz_child_done, rc)); 1013 } while (++c < rm->rm_cols); 1014 1015 return; 1016 } 1017 1018 /* 1019 * At this point we've attempted to reconstruct the data given the 1020 * errors we detected, and we've attempted to read all columns. There 1021 * must, therefore, be one or more additional problems -- silent errors 1022 * resulting in invalid data rather than explicit I/O errors resulting 1023 * in absent data. Before we attempt combinatorial reconstruction make 1024 * sure we have a chance of coming up with the right answer. 1025 */ 1026 if (total_errors >= rm->rm_firstdatacol) { 1027 zio->io_error = vdev_raidz_worst_error(rm); 1028 /* 1029 * If there were exactly as many device errors as parity 1030 * columns, yet we couldn't reconstruct the data, then at 1031 * least one device must have returned bad data silently. 1032 */ 1033 if (total_errors == rm->rm_firstdatacol) 1034 zio->io_error = zio_worst_error(zio->io_error, ECKSUM); 1035 goto done; 1036 } 1037 1038 if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { 1039 /* 1040 * Attempt to reconstruct the data from parity P. 1041 */ 1042 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1043 void *orig; 1044 rc = &rm->rm_col[c]; 1045 1046 orig = zio_buf_alloc(rc->rc_size); 1047 bcopy(rc->rc_data, orig, rc->rc_size); 1048 vdev_raidz_reconstruct_p(rm, c); 1049 1050 if (zio_checksum_error(zio) == 0) { 1051 zio_buf_free(orig, rc->rc_size); 1052 atomic_inc_64(&raidz_corrected_p); 1053 1054 /* 1055 * If this child didn't know that it returned 1056 * bad data, inform it. 1057 */ 1058 if (rc->rc_tried && rc->rc_error == 0) 1059 raidz_checksum_error(zio, rc); 1060 rc->rc_error = ECKSUM; 1061 goto done; 1062 } 1063 1064 bcopy(orig, rc->rc_data, rc->rc_size); 1065 zio_buf_free(orig, rc->rc_size); 1066 } 1067 } 1068 1069 if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { 1070 /* 1071 * Attempt to reconstruct the data from parity Q. 1072 */ 1073 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1074 void *orig; 1075 rc = &rm->rm_col[c]; 1076 1077 orig = zio_buf_alloc(rc->rc_size); 1078 bcopy(rc->rc_data, orig, rc->rc_size); 1079 vdev_raidz_reconstruct_q(rm, c); 1080 1081 if (zio_checksum_error(zio) == 0) { 1082 zio_buf_free(orig, rc->rc_size); 1083 atomic_inc_64(&raidz_corrected_q); 1084 1085 /* 1086 * If this child didn't know that it returned 1087 * bad data, inform it. 1088 */ 1089 if (rc->rc_tried && rc->rc_error == 0) 1090 raidz_checksum_error(zio, rc); 1091 rc->rc_error = ECKSUM; 1092 goto done; 1093 } 1094 1095 bcopy(orig, rc->rc_data, rc->rc_size); 1096 zio_buf_free(orig, rc->rc_size); 1097 } 1098 } 1099 1100 if (rm->rm_firstdatacol > 1 && 1101 rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 && 1102 rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { 1103 /* 1104 * Attempt to reconstruct the data from both P and Q. 1105 */ 1106 for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) { 1107 void *orig, *orig1; 1108 rc = &rm->rm_col[c]; 1109 1110 orig = zio_buf_alloc(rc->rc_size); 1111 bcopy(rc->rc_data, orig, rc->rc_size); 1112 1113 for (c1 = c + 1; c1 < rm->rm_cols; c1++) { 1114 rc1 = &rm->rm_col[c1]; 1115 1116 orig1 = zio_buf_alloc(rc1->rc_size); 1117 bcopy(rc1->rc_data, orig1, rc1->rc_size); 1118 1119 vdev_raidz_reconstruct_pq(rm, c, c1); 1120 1121 if (zio_checksum_error(zio) == 0) { 1122 zio_buf_free(orig, rc->rc_size); 1123 zio_buf_free(orig1, rc1->rc_size); 1124 atomic_inc_64(&raidz_corrected_pq); 1125 1126 /* 1127 * If these children didn't know they 1128 * returned bad data, inform them. 1129 */ 1130 if (rc->rc_tried && rc->rc_error == 0) 1131 raidz_checksum_error(zio, rc); 1132 if (rc1->rc_tried && rc1->rc_error == 0) 1133 raidz_checksum_error(zio, rc1); 1134 1135 rc->rc_error = ECKSUM; 1136 rc1->rc_error = ECKSUM; 1137 1138 goto done; 1139 } 1140 1141 bcopy(orig1, rc1->rc_data, rc1->rc_size); 1142 zio_buf_free(orig1, rc1->rc_size); 1143 } 1144 1145 bcopy(orig, rc->rc_data, rc->rc_size); 1146 zio_buf_free(orig, rc->rc_size); 1147 } 1148 } 1149 1150 /* 1151 * All combinations failed to checksum. Generate checksum ereports for 1152 * all children. 1153 */ 1154 zio->io_error = ECKSUM; 1155 1156 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 1157 for (c = 0; c < rm->rm_cols; c++) { 1158 rc = &rm->rm_col[c]; 1159 zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, 1160 zio->io_spa, vd->vdev_child[rc->rc_devidx], zio, 1161 rc->rc_offset, rc->rc_size); 1162 } 1163 } 1164 1165 done: 1166 zio_checksum_verified(zio); 1167 1168 if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 1169 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { 1170 /* 1171 * Use the good data we have in hand to repair damaged children. 1172 */ 1173 for (c = 0; c < rm->rm_cols; c++) { 1174 rc = &rm->rm_col[c]; 1175 cvd = vd->vdev_child[rc->rc_devidx]; 1176 1177 if (rc->rc_error == 0) 1178 continue; 1179 1180 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1181 rc->rc_offset, rc->rc_data, rc->rc_size, 1182 ZIO_TYPE_WRITE, zio->io_priority, 1183 ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 1184 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); 1185 } 1186 } 1187 } 1188 1189 static void 1190 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 1191 { 1192 if (faulted > vd->vdev_nparity) 1193 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1194 VDEV_AUX_NO_REPLICAS); 1195 else if (degraded + faulted != 0) 1196 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 1197 else 1198 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 1199 } 1200 1201 vdev_ops_t vdev_raidz_ops = { 1202 vdev_raidz_open, 1203 vdev_raidz_close, 1204 vdev_raidz_asize, 1205 vdev_raidz_io_start, 1206 vdev_raidz_io_done, 1207 vdev_raidz_state_change, 1208 VDEV_TYPE_RAIDZ, /* name of this vdev type */ 1209 B_FALSE /* not a leaf vdev */ 1210 }; 1211