1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/zfs_context.h> 30 #include <sys/spa.h> 31 #include <sys/vdev_impl.h> 32 #include <sys/zio.h> 33 #include <sys/zio_checksum.h> 34 #include <sys/fs/zfs.h> 35 #include <sys/fm/fs/zfs.h> 36 37 /* 38 * Virtual device vector for RAID-Z. 39 * 40 * This vdev supports both single and double parity. For single parity, we 41 * use a simple XOR of all the data columns. For double parity, we use both 42 * the simple XOR as well as a technique described in "The mathematics of 43 * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8), 44 * over the integers expressable in a single byte. Briefly, the operations on 45 * the field are defined as follows: 46 * 47 * o addition (+) is represented by a bitwise XOR 48 * o subtraction (-) is therefore identical to addition: A + B = A - B 49 * o multiplication of A by 2 is defined by the following bitwise expression: 50 * (A * 2)_7 = A_6 51 * (A * 2)_6 = A_5 52 * (A * 2)_5 = A_4 53 * (A * 2)_4 = A_3 + A_7 54 * (A * 2)_3 = A_2 + A_7 55 * (A * 2)_2 = A_1 + A_7 56 * (A * 2)_1 = A_0 57 * (A * 2)_0 = A_7 58 * 59 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). 60 * 61 * Observe that any number in the field (except for 0) can be expressed as a 62 * power of 2 -- a generator for the field. We store a table of the powers of 63 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can 64 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather 65 * than field addition). The inverse of a field element A (A^-1) is A^254. 66 * 67 * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1, 68 * can be expressed by field operations: 69 * 70 * P = D_0 + D_1 + ... + D_n-2 + D_n-1 71 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 72 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 73 * 74 * See the reconstruction code below for how P and Q can used individually or 75 * in concert to recover missing data columns. 76 */ 77 78 typedef struct raidz_col { 79 uint64_t rc_devidx; /* child device index for I/O */ 80 uint64_t rc_offset; /* device offset */ 81 uint64_t rc_size; /* I/O size */ 82 void *rc_data; /* I/O data */ 83 int rc_error; /* I/O error for this device */ 84 uint8_t rc_tried; /* Did we attempt this I/O column? */ 85 uint8_t rc_skipped; /* Did we skip this I/O column? */ 86 } raidz_col_t; 87 88 typedef struct raidz_map { 89 uint64_t rm_cols; /* Column count */ 90 uint64_t rm_bigcols; /* Number of oversized columns */ 91 uint64_t rm_asize; /* Actual total I/O size */ 92 uint64_t rm_missingdata; /* Count of missing data devices */ 93 uint64_t rm_missingparity; /* Count of missing parity devices */ 94 uint64_t rm_firstdatacol; /* First data column/parity count */ 95 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ 96 } raidz_map_t; 97 98 #define VDEV_RAIDZ_P 0 99 #define VDEV_RAIDZ_Q 1 100 101 #define VDEV_RAIDZ_MAXPARITY 2 102 103 #define VDEV_RAIDZ_MUL_2(a) (((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0)) 104 105 /* 106 * These two tables represent powers and logs of 2 in the Galois field defined 107 * above. These values were computed by repeatedly multiplying by 2 as above. 108 */ 109 static const uint8_t vdev_raidz_pow2[256] = { 110 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 111 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 112 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 113 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 114 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 115 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 116 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, 117 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 118 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 119 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, 120 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 121 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 122 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 123 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 124 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 125 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 126 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 127 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 128 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 129 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 130 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 131 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 132 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 133 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 134 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, 135 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 136 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 137 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, 138 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 139 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, 140 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 141 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 142 }; 143 static const uint8_t vdev_raidz_log2[256] = { 144 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 145 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, 146 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, 147 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, 148 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, 149 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, 150 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, 151 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, 152 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, 153 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, 154 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, 155 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, 156 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, 157 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, 158 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, 159 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, 160 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, 161 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, 162 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, 163 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, 164 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, 165 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, 166 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, 167 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, 168 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, 169 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, 170 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, 171 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, 172 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, 173 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, 174 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, 175 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, 176 }; 177 178 /* 179 * Multiply a given number by 2 raised to the given power. 180 */ 181 static uint8_t 182 vdev_raidz_exp2(uint_t a, int exp) 183 { 184 if (a == 0) 185 return (0); 186 187 ASSERT(exp >= 0); 188 ASSERT(vdev_raidz_log2[a] > 0 || a == 1); 189 190 exp += vdev_raidz_log2[a]; 191 if (exp > 255) 192 exp -= 255; 193 194 return (vdev_raidz_pow2[exp]); 195 } 196 197 static raidz_map_t * 198 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, 199 uint64_t nparity) 200 { 201 raidz_map_t *rm; 202 uint64_t b = zio->io_offset >> unit_shift; 203 uint64_t s = zio->io_size >> unit_shift; 204 uint64_t f = b % dcols; 205 uint64_t o = (b / dcols) << unit_shift; 206 uint64_t q, r, c, bc, col, acols, coff, devidx; 207 208 q = s / (dcols - nparity); 209 r = s - q * (dcols - nparity); 210 bc = (r == 0 ? 0 : r + nparity); 211 212 acols = (q == 0 ? bc : dcols); 213 214 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP); 215 216 rm->rm_cols = acols; 217 rm->rm_bigcols = bc; 218 rm->rm_asize = 0; 219 rm->rm_missingdata = 0; 220 rm->rm_missingparity = 0; 221 rm->rm_firstdatacol = nparity; 222 223 for (c = 0; c < acols; c++) { 224 col = f + c; 225 coff = o; 226 if (col >= dcols) { 227 col -= dcols; 228 coff += 1ULL << unit_shift; 229 } 230 rm->rm_col[c].rc_devidx = col; 231 rm->rm_col[c].rc_offset = coff; 232 rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift; 233 rm->rm_col[c].rc_data = NULL; 234 rm->rm_col[c].rc_error = 0; 235 rm->rm_col[c].rc_tried = 0; 236 rm->rm_col[c].rc_skipped = 0; 237 rm->rm_asize += rm->rm_col[c].rc_size; 238 } 239 240 rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift); 241 242 for (c = 0; c < rm->rm_firstdatacol; c++) 243 rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); 244 245 rm->rm_col[c].rc_data = zio->io_data; 246 247 for (c = c + 1; c < acols; c++) 248 rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + 249 rm->rm_col[c - 1].rc_size; 250 251 /* 252 * If all data stored spans all columns, there's a danger that parity 253 * will always be on the same device and, since parity isn't read 254 * during normal operation, that that device's I/O bandwidth won't be 255 * used effectively. We therefore switch the parity every 1MB. 256 * 257 * ... at least that was, ostensibly, the theory. As a practical 258 * matter unless we juggle the parity between all devices evenly, we 259 * won't see any benefit. Further, occasional writes that aren't a 260 * multiple of the LCM of the number of children and the minimum 261 * stripe width are sufficient to avoid pessimal behavior. 262 * Unfortunately, this decision created an implicit on-disk format 263 * requirement that we need to support for all eternity, but only 264 * for single-parity RAID-Z. 265 */ 266 ASSERT(rm->rm_cols >= 2); 267 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 268 269 if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { 270 devidx = rm->rm_col[0].rc_devidx; 271 o = rm->rm_col[0].rc_offset; 272 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; 273 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 274 rm->rm_col[1].rc_devidx = devidx; 275 rm->rm_col[1].rc_offset = o; 276 } 277 278 zio->io_vsd = rm; 279 return (rm); 280 } 281 282 static void 283 vdev_raidz_map_free(zio_t *zio) 284 { 285 raidz_map_t *rm = zio->io_vsd; 286 int c; 287 288 for (c = 0; c < rm->rm_firstdatacol; c++) 289 zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); 290 291 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols])); 292 zio->io_vsd = NULL; 293 } 294 295 static void 296 vdev_raidz_generate_parity_p(raidz_map_t *rm) 297 { 298 uint64_t *p, *src, pcount, ccount, i; 299 int c; 300 301 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 302 303 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 304 src = rm->rm_col[c].rc_data; 305 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 306 ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 307 308 if (c == rm->rm_firstdatacol) { 309 ASSERT(ccount == pcount); 310 for (i = 0; i < ccount; i++, p++, src++) { 311 *p = *src; 312 } 313 } else { 314 ASSERT(ccount <= pcount); 315 for (i = 0; i < ccount; i++, p++, src++) { 316 *p ^= *src; 317 } 318 } 319 } 320 } 321 322 static void 323 vdev_raidz_generate_parity_pq(raidz_map_t *rm) 324 { 325 uint64_t *q, *p, *src, pcount, ccount, mask, i; 326 int c; 327 328 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 329 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 330 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 331 332 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 333 src = rm->rm_col[c].rc_data; 334 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 335 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 336 ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 337 338 if (c == rm->rm_firstdatacol) { 339 ASSERT(ccount == pcount || ccount == 0); 340 for (i = 0; i < ccount; i++, p++, q++, src++) { 341 *q = *src; 342 *p = *src; 343 } 344 for (; i < pcount; i++, p++, q++, src++) { 345 *q = 0; 346 *p = 0; 347 } 348 } else { 349 ASSERT(ccount <= pcount); 350 351 /* 352 * Rather than multiplying each byte individually (as 353 * described above), we are able to handle 8 at once 354 * by generating a mask based on the high bit in each 355 * byte and using that to conditionally XOR in 0x1d. 356 */ 357 for (i = 0; i < ccount; i++, p++, q++, src++) { 358 mask = *q & 0x8080808080808080ULL; 359 mask = (mask << 1) - (mask >> 7); 360 *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ 361 (mask & 0x1d1d1d1d1d1d1d1dULL); 362 *q ^= *src; 363 *p ^= *src; 364 } 365 366 /* 367 * Treat short columns as though they are full of 0s. 368 */ 369 for (; i < pcount; i++, q++) { 370 mask = *q & 0x8080808080808080ULL; 371 mask = (mask << 1) - (mask >> 7); 372 *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ 373 (mask & 0x1d1d1d1d1d1d1d1dULL); 374 } 375 } 376 } 377 } 378 379 static void 380 vdev_raidz_reconstruct_p(raidz_map_t *rm, int x) 381 { 382 uint64_t *dst, *src, xcount, ccount, count, i; 383 int c; 384 385 xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 386 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); 387 ASSERT(xcount > 0); 388 389 src = rm->rm_col[VDEV_RAIDZ_P].rc_data; 390 dst = rm->rm_col[x].rc_data; 391 for (i = 0; i < xcount; i++, dst++, src++) { 392 *dst = *src; 393 } 394 395 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 396 src = rm->rm_col[c].rc_data; 397 dst = rm->rm_col[x].rc_data; 398 399 if (c == x) 400 continue; 401 402 ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 403 count = MIN(ccount, xcount); 404 405 for (i = 0; i < count; i++, dst++, src++) { 406 *dst ^= *src; 407 } 408 } 409 } 410 411 static void 412 vdev_raidz_reconstruct_q(raidz_map_t *rm, int x) 413 { 414 uint64_t *dst, *src, xcount, ccount, count, mask, i; 415 uint8_t *b; 416 int c, j, exp; 417 418 xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 419 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); 420 421 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 422 src = rm->rm_col[c].rc_data; 423 dst = rm->rm_col[x].rc_data; 424 425 if (c == x) 426 ccount = 0; 427 else 428 ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 429 430 count = MIN(ccount, xcount); 431 432 if (c == rm->rm_firstdatacol) { 433 for (i = 0; i < count; i++, dst++, src++) { 434 *dst = *src; 435 } 436 for (; i < xcount; i++, dst++) { 437 *dst = 0; 438 } 439 440 } else { 441 /* 442 * For an explanation of this, see the comment in 443 * vdev_raidz_generate_parity_pq() above. 444 */ 445 for (i = 0; i < count; i++, dst++, src++) { 446 mask = *dst & 0x8080808080808080ULL; 447 mask = (mask << 1) - (mask >> 7); 448 *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ 449 (mask & 0x1d1d1d1d1d1d1d1dULL); 450 *dst ^= *src; 451 } 452 453 for (; i < xcount; i++, dst++) { 454 mask = *dst & 0x8080808080808080ULL; 455 mask = (mask << 1) - (mask >> 7); 456 *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ 457 (mask & 0x1d1d1d1d1d1d1d1dULL); 458 } 459 } 460 } 461 462 src = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 463 dst = rm->rm_col[x].rc_data; 464 exp = 255 - (rm->rm_cols - 1 - x); 465 466 for (i = 0; i < xcount; i++, dst++, src++) { 467 *dst ^= *src; 468 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 469 *b = vdev_raidz_exp2(*b, exp); 470 } 471 } 472 } 473 474 static void 475 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y) 476 { 477 uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; 478 void *pdata, *qdata; 479 uint64_t xsize, ysize, i; 480 481 ASSERT(x < y); 482 ASSERT(x >= rm->rm_firstdatacol); 483 ASSERT(y < rm->rm_cols); 484 485 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); 486 487 /* 488 * Move the parity data aside -- we're going to compute parity as 489 * though columns x and y were full of zeros -- Pxy and Qxy. We want to 490 * reuse the parity generation mechanism without trashing the actual 491 * parity so we make those columns appear to be full of zeros by 492 * setting their lengths to zero. 493 */ 494 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data; 495 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 496 xsize = rm->rm_col[x].rc_size; 497 ysize = rm->rm_col[y].rc_size; 498 499 rm->rm_col[VDEV_RAIDZ_P].rc_data = 500 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); 501 rm->rm_col[VDEV_RAIDZ_Q].rc_data = 502 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); 503 rm->rm_col[x].rc_size = 0; 504 rm->rm_col[y].rc_size = 0; 505 506 vdev_raidz_generate_parity_pq(rm); 507 508 rm->rm_col[x].rc_size = xsize; 509 rm->rm_col[y].rc_size = ysize; 510 511 p = pdata; 512 q = qdata; 513 pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; 514 qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 515 xd = rm->rm_col[x].rc_data; 516 yd = rm->rm_col[y].rc_data; 517 518 /* 519 * We now have: 520 * Pxy = P + D_x + D_y 521 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 522 * 523 * We can then solve for D_x: 524 * D_x = A * (P + Pxy) + B * (Q + Qxy) 525 * where 526 * A = 2^(x - y) * (2^(x - y) + 1)^-1 527 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 528 * 529 * With D_x in hand, we can easily solve for D_y: 530 * D_y = P + Pxy + D_x 531 */ 532 533 a = vdev_raidz_pow2[255 + x - y]; 534 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; 535 tmp = 255 - vdev_raidz_log2[a ^ 1]; 536 537 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 538 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 539 540 for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { 541 *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ 542 vdev_raidz_exp2(*q ^ *qxy, bexp); 543 544 if (i < ysize) 545 *yd = *p ^ *pxy ^ *xd; 546 } 547 548 zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, 549 rm->rm_col[VDEV_RAIDZ_P].rc_size); 550 zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, 551 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 552 553 /* 554 * Restore the saved parity data. 555 */ 556 rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; 557 rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; 558 } 559 560 561 static int 562 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) 563 { 564 vdev_t *cvd; 565 uint64_t nparity = vd->vdev_nparity; 566 int c, error; 567 int lasterror = 0; 568 int numerrors = 0; 569 570 ASSERT(nparity > 0); 571 572 if (nparity > VDEV_RAIDZ_MAXPARITY || 573 vd->vdev_children < nparity + 1) { 574 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 575 return (EINVAL); 576 } 577 578 for (c = 0; c < vd->vdev_children; c++) { 579 cvd = vd->vdev_child[c]; 580 581 if ((error = vdev_open(cvd)) != 0) { 582 lasterror = error; 583 numerrors++; 584 continue; 585 } 586 587 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 588 *ashift = MAX(*ashift, cvd->vdev_ashift); 589 } 590 591 *asize *= vd->vdev_children; 592 593 if (numerrors > nparity) { 594 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 595 return (lasterror); 596 } 597 598 return (0); 599 } 600 601 static void 602 vdev_raidz_close(vdev_t *vd) 603 { 604 int c; 605 606 for (c = 0; c < vd->vdev_children; c++) 607 vdev_close(vd->vdev_child[c]); 608 } 609 610 static uint64_t 611 vdev_raidz_asize(vdev_t *vd, uint64_t psize) 612 { 613 uint64_t asize; 614 uint64_t ashift = vd->vdev_top->vdev_ashift; 615 uint64_t cols = vd->vdev_children; 616 uint64_t nparity = vd->vdev_nparity; 617 618 asize = ((psize - 1) >> ashift) + 1; 619 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 620 asize = roundup(asize, nparity + 1) << ashift; 621 622 return (asize); 623 } 624 625 static void 626 vdev_raidz_child_done(zio_t *zio) 627 { 628 raidz_col_t *rc = zio->io_private; 629 630 rc->rc_error = zio->io_error; 631 rc->rc_tried = 1; 632 rc->rc_skipped = 0; 633 } 634 635 static void 636 vdev_raidz_repair_done(zio_t *zio) 637 { 638 ASSERT(zio->io_private == zio->io_parent); 639 vdev_raidz_map_free(zio->io_private); 640 } 641 642 static void 643 vdev_raidz_io_start(zio_t *zio) 644 { 645 vdev_t *vd = zio->io_vd; 646 vdev_t *tvd = vd->vdev_top; 647 vdev_t *cvd; 648 blkptr_t *bp = zio->io_bp; 649 raidz_map_t *rm; 650 raidz_col_t *rc; 651 int c; 652 653 rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, 654 vd->vdev_nparity); 655 656 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); 657 658 if (zio->io_type == ZIO_TYPE_WRITE) { 659 /* 660 * Generate RAID parity in the first virtual columns. 661 */ 662 if (rm->rm_firstdatacol == 1) 663 vdev_raidz_generate_parity_p(rm); 664 else 665 vdev_raidz_generate_parity_pq(rm); 666 667 for (c = 0; c < rm->rm_cols; c++) { 668 rc = &rm->rm_col[c]; 669 cvd = vd->vdev_child[rc->rc_devidx]; 670 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 671 rc->rc_offset, rc->rc_data, rc->rc_size, 672 zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, 673 vdev_raidz_child_done, rc)); 674 } 675 zio_wait_children_done(zio); 676 return; 677 } 678 679 ASSERT(zio->io_type == ZIO_TYPE_READ); 680 681 /* 682 * Iterate over the columns in reverse order so that we hit the parity 683 * last -- any errors along the way will force us to read the parity 684 * data. 685 */ 686 for (c = rm->rm_cols - 1; c >= 0; c--) { 687 rc = &rm->rm_col[c]; 688 cvd = vd->vdev_child[rc->rc_devidx]; 689 if (vdev_is_dead(cvd)) { 690 if (c >= rm->rm_firstdatacol) 691 rm->rm_missingdata++; 692 else 693 rm->rm_missingparity++; 694 rc->rc_error = ENXIO; 695 rc->rc_tried = 1; /* don't even try */ 696 rc->rc_skipped = 1; 697 continue; 698 } 699 if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) { 700 if (c >= rm->rm_firstdatacol) 701 rm->rm_missingdata++; 702 else 703 rm->rm_missingparity++; 704 rc->rc_error = ESTALE; 705 rc->rc_skipped = 1; 706 continue; 707 } 708 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || 709 (zio->io_flags & ZIO_FLAG_SCRUB)) { 710 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 711 rc->rc_offset, rc->rc_data, rc->rc_size, 712 zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, 713 vdev_raidz_child_done, rc)); 714 } 715 } 716 717 zio_wait_children_done(zio); 718 } 719 720 /* 721 * Report a checksum error for a child of a RAID-Z device. 722 */ 723 static void 724 raidz_checksum_error(zio_t *zio, raidz_col_t *rc) 725 { 726 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 727 dprintf_bp(zio->io_bp, "imputed checksum error on %s: ", 728 vdev_description(vd)); 729 730 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 731 mutex_enter(&vd->vdev_stat_lock); 732 vd->vdev_stat.vs_checksum_errors++; 733 mutex_exit(&vd->vdev_stat_lock); 734 } 735 736 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) 737 zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, 738 zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size); 739 } 740 741 /* 742 * Generate the parity from the data columns. If we tried and were able to 743 * read the parity without error, verify that the generated parity matches the 744 * data we read. If it doesn't, we fire off a checksum error. Return the 745 * number such failures. 746 */ 747 static int 748 raidz_parity_verify(zio_t *zio, raidz_map_t *rm) 749 { 750 void *orig[VDEV_RAIDZ_MAXPARITY]; 751 int c, ret = 0; 752 raidz_col_t *rc; 753 754 for (c = 0; c < rm->rm_firstdatacol; c++) { 755 rc = &rm->rm_col[c]; 756 if (!rc->rc_tried || rc->rc_error != 0) 757 continue; 758 orig[c] = zio_buf_alloc(rc->rc_size); 759 bcopy(rc->rc_data, orig[c], rc->rc_size); 760 } 761 762 if (rm->rm_firstdatacol == 1) 763 vdev_raidz_generate_parity_p(rm); 764 else 765 vdev_raidz_generate_parity_pq(rm); 766 767 for (c = 0; c < rm->rm_firstdatacol; c++) { 768 rc = &rm->rm_col[c]; 769 if (!rc->rc_tried || rc->rc_error != 0) 770 continue; 771 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { 772 raidz_checksum_error(zio, rc); 773 rc->rc_error = ECKSUM; 774 ret++; 775 } 776 zio_buf_free(orig[c], rc->rc_size); 777 } 778 779 return (ret); 780 } 781 782 static uint64_t raidz_corrected_p; 783 static uint64_t raidz_corrected_q; 784 static uint64_t raidz_corrected_pq; 785 786 static void 787 vdev_raidz_io_done(zio_t *zio) 788 { 789 vdev_t *vd = zio->io_vd; 790 vdev_t *cvd; 791 raidz_map_t *rm = zio->io_vsd; 792 raidz_col_t *rc, *rc1; 793 int unexpected_errors = 0; 794 int parity_errors = 0; 795 int parity_untried = 0; 796 int data_errors = 0; 797 int n, c, c1; 798 799 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ 800 801 zio->io_error = 0; 802 zio->io_numerrors = 0; 803 804 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); 805 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); 806 807 for (c = 0; c < rm->rm_cols; c++) { 808 rc = &rm->rm_col[c]; 809 810 /* 811 * We preserve any EIOs because those may be worth retrying; 812 * whereas ECKSUM and ENXIO are more likely to be persistent. 813 */ 814 if (rc->rc_error) { 815 if (zio->io_error != EIO) 816 zio->io_error = rc->rc_error; 817 818 if (c < rm->rm_firstdatacol) 819 parity_errors++; 820 else 821 data_errors++; 822 823 if (!rc->rc_skipped) 824 unexpected_errors++; 825 826 zio->io_numerrors++; 827 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { 828 parity_untried++; 829 } 830 } 831 832 if (zio->io_type == ZIO_TYPE_WRITE) { 833 /* 834 * If this is not a failfast write, and we were able to 835 * write enough columns to reconstruct the data, good enough. 836 */ 837 /* XXPOLICY */ 838 if (zio->io_numerrors <= rm->rm_firstdatacol && 839 !(zio->io_flags & ZIO_FLAG_FAILFAST)) 840 zio->io_error = 0; 841 842 vdev_raidz_map_free(zio); 843 zio_next_stage(zio); 844 return; 845 } 846 847 ASSERT(zio->io_type == ZIO_TYPE_READ); 848 /* 849 * There are three potential phases for a read: 850 * 1. produce valid data from the columns read 851 * 2. read all disks and try again 852 * 3. perform combinatorial reconstruction 853 * 854 * Each phase is progressively both more expensive and less likely to 855 * occur. If we encounter more errors than we can repair or all phases 856 * fail, we have no choice but to return an error. 857 */ 858 859 /* 860 * If the number of errors we saw was correctable -- less than or equal 861 * to the number of parity disks read -- attempt to produce data that 862 * has a valid checksum. Naturally, this case applies in the absence of 863 * any errors. 864 */ 865 if (zio->io_numerrors <= rm->rm_firstdatacol - parity_untried) { 866 switch (data_errors) { 867 case 0: 868 if (zio_checksum_error(zio) == 0) { 869 zio->io_error = 0; 870 if (parity_errors + parity_untried < 871 rm->rm_firstdatacol) { 872 n = raidz_parity_verify(zio, rm); 873 unexpected_errors += n; 874 ASSERT(parity_errors + n <= 875 rm->rm_firstdatacol); 876 } 877 goto done; 878 } 879 break; 880 881 case 1: 882 /* 883 * We either attempt to read all the parity columns or 884 * none of them. If we didn't try to read parity, we 885 * wouldn't be here in the correctable case. There must 886 * also have been fewer parity errors than parity 887 * columns or, again, we wouldn't be in this code path. 888 */ 889 ASSERT(parity_untried == 0); 890 ASSERT(parity_errors < rm->rm_firstdatacol); 891 892 /* 893 * Find the column that reported the error. 894 */ 895 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 896 rc = &rm->rm_col[c]; 897 if (rc->rc_error != 0) 898 break; 899 } 900 ASSERT(c != rm->rm_cols); 901 ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || 902 rc->rc_error == ESTALE); 903 904 if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { 905 vdev_raidz_reconstruct_p(rm, c); 906 } else { 907 ASSERT(rm->rm_firstdatacol > 1); 908 vdev_raidz_reconstruct_q(rm, c); 909 } 910 911 if (zio_checksum_error(zio) == 0) { 912 zio->io_error = 0; 913 if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) 914 atomic_inc_64(&raidz_corrected_p); 915 else 916 atomic_inc_64(&raidz_corrected_q); 917 918 /* 919 * If there's more than one parity disk that 920 * was successfully read, confirm that the 921 * other parity disk produced the correct data. 922 * This routine is suboptimal in that it 923 * regenerates both the parity we wish to test 924 * as well as the parity we just used to 925 * perform the reconstruction, but this should 926 * be a relatively uncommon case, and can be 927 * optimized if it becomes a problem. 928 */ 929 if (parity_errors < rm->rm_firstdatacol - 1) { 930 n = raidz_parity_verify(zio, rm); 931 unexpected_errors += n; 932 ASSERT(parity_errors + n <= 933 rm->rm_firstdatacol); 934 } 935 936 goto done; 937 } 938 break; 939 940 case 2: 941 /* 942 * Two data column errors require double parity. 943 */ 944 ASSERT(rm->rm_firstdatacol == 2); 945 946 /* 947 * Find the two columns that reported errors. 948 */ 949 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 950 rc = &rm->rm_col[c]; 951 if (rc->rc_error != 0) 952 break; 953 } 954 ASSERT(c != rm->rm_cols); 955 ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || 956 rc->rc_error == ESTALE); 957 958 for (c1 = c++; c < rm->rm_cols; c++) { 959 rc = &rm->rm_col[c]; 960 if (rc->rc_error != 0) 961 break; 962 } 963 ASSERT(c != rm->rm_cols); 964 ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || 965 rc->rc_error == ESTALE); 966 967 vdev_raidz_reconstruct_pq(rm, c1, c); 968 969 if (zio_checksum_error(zio) == 0) { 970 zio->io_error = 0; 971 atomic_inc_64(&raidz_corrected_pq); 972 973 goto done; 974 } 975 break; 976 977 default: 978 ASSERT(rm->rm_firstdatacol <= 2); 979 ASSERT(0); 980 } 981 } 982 983 /* 984 * This isn't a typical situation -- either we got a read error or 985 * a child silently returned bad data. Read every block so we can 986 * try again with as much data and parity as we can track down. If 987 * we've already been through once before, all children will be marked 988 * as tried so we'll proceed to combinatorial reconstruction. 989 */ 990 unexpected_errors = 1; 991 rm->rm_missingdata = 0; 992 rm->rm_missingparity = 0; 993 994 for (c = 0; c < rm->rm_cols; c++) { 995 if (rm->rm_col[c].rc_tried) 996 continue; 997 998 zio->io_error = 0; 999 zio_vdev_io_redone(zio); 1000 do { 1001 rc = &rm->rm_col[c]; 1002 if (rc->rc_tried) 1003 continue; 1004 zio_nowait(zio_vdev_child_io(zio, NULL, 1005 vd->vdev_child[rc->rc_devidx], 1006 rc->rc_offset, rc->rc_data, rc->rc_size, 1007 zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, 1008 vdev_raidz_child_done, rc)); 1009 } while (++c < rm->rm_cols); 1010 dprintf("rereading\n"); 1011 zio_wait_children_done(zio); 1012 return; 1013 } 1014 1015 /* 1016 * At this point we've attempted to reconstruct the data given the 1017 * errors we detected, and we've attempted to read all columns. There 1018 * must, therefore, be one or more additional problems -- silent errors 1019 * resulting in invalid data rather than explicit I/O errors resulting 1020 * in absent data. Before we attempt combinatorial reconstruction make 1021 * sure we have a chance of coming up with the right answer. 1022 */ 1023 if (zio->io_numerrors >= rm->rm_firstdatacol) { 1024 ASSERT(zio->io_error != 0); 1025 goto done; 1026 } 1027 1028 if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { 1029 /* 1030 * Attempt to reconstruct the data from parity P. 1031 */ 1032 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1033 void *orig; 1034 rc = &rm->rm_col[c]; 1035 1036 orig = zio_buf_alloc(rc->rc_size); 1037 bcopy(rc->rc_data, orig, rc->rc_size); 1038 vdev_raidz_reconstruct_p(rm, c); 1039 1040 if (zio_checksum_error(zio) == 0) { 1041 zio_buf_free(orig, rc->rc_size); 1042 zio->io_error = 0; 1043 atomic_inc_64(&raidz_corrected_p); 1044 1045 /* 1046 * If this child didn't know that it returned 1047 * bad data, inform it. 1048 */ 1049 if (rc->rc_tried && rc->rc_error == 0) 1050 raidz_checksum_error(zio, rc); 1051 rc->rc_error = ECKSUM; 1052 goto done; 1053 } 1054 1055 bcopy(orig, rc->rc_data, rc->rc_size); 1056 zio_buf_free(orig, rc->rc_size); 1057 } 1058 } 1059 1060 if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { 1061 /* 1062 * Attempt to reconstruct the data from parity Q. 1063 */ 1064 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1065 void *orig; 1066 rc = &rm->rm_col[c]; 1067 1068 orig = zio_buf_alloc(rc->rc_size); 1069 bcopy(rc->rc_data, orig, rc->rc_size); 1070 vdev_raidz_reconstruct_q(rm, c); 1071 1072 if (zio_checksum_error(zio) == 0) { 1073 zio_buf_free(orig, rc->rc_size); 1074 zio->io_error = 0; 1075 atomic_inc_64(&raidz_corrected_q); 1076 1077 /* 1078 * If this child didn't know that it returned 1079 * bad data, inform it. 1080 */ 1081 if (rc->rc_tried && rc->rc_error == 0) 1082 raidz_checksum_error(zio, rc); 1083 rc->rc_error = ECKSUM; 1084 goto done; 1085 } 1086 1087 bcopy(orig, rc->rc_data, rc->rc_size); 1088 zio_buf_free(orig, rc->rc_size); 1089 } 1090 } 1091 1092 if (rm->rm_firstdatacol > 1 && 1093 rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 && 1094 rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { 1095 /* 1096 * Attempt to reconstruct the data from both P and Q. 1097 */ 1098 for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) { 1099 void *orig, *orig1; 1100 rc = &rm->rm_col[c]; 1101 1102 orig = zio_buf_alloc(rc->rc_size); 1103 bcopy(rc->rc_data, orig, rc->rc_size); 1104 1105 for (c1 = c + 1; c1 < rm->rm_cols; c1++) { 1106 rc1 = &rm->rm_col[c1]; 1107 1108 orig1 = zio_buf_alloc(rc1->rc_size); 1109 bcopy(rc1->rc_data, orig1, rc1->rc_size); 1110 1111 vdev_raidz_reconstruct_pq(rm, c, c1); 1112 1113 if (zio_checksum_error(zio) == 0) { 1114 zio_buf_free(orig, rc->rc_size); 1115 zio_buf_free(orig1, rc1->rc_size); 1116 zio->io_error = 0; 1117 atomic_inc_64(&raidz_corrected_pq); 1118 1119 /* 1120 * If these children didn't know they 1121 * returned bad data, inform them. 1122 */ 1123 if (rc->rc_tried && rc->rc_error == 0) 1124 raidz_checksum_error(zio, rc); 1125 if (rc1->rc_tried && rc1->rc_error == 0) 1126 raidz_checksum_error(zio, rc1); 1127 1128 rc->rc_error = ECKSUM; 1129 rc1->rc_error = ECKSUM; 1130 1131 goto done; 1132 } 1133 1134 bcopy(orig1, rc1->rc_data, rc1->rc_size); 1135 zio_buf_free(orig1, rc1->rc_size); 1136 } 1137 1138 bcopy(orig, rc->rc_data, rc->rc_size); 1139 zio_buf_free(orig, rc->rc_size); 1140 } 1141 } 1142 1143 /* 1144 * All combinations failed to checksum. Generate checksum ereports for 1145 * all children. 1146 */ 1147 zio->io_error = ECKSUM; 1148 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 1149 for (c = 0; c < rm->rm_cols; c++) { 1150 rc = &rm->rm_col[c]; 1151 zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, 1152 zio->io_spa, vd->vdev_child[rc->rc_devidx], zio, 1153 rc->rc_offset, rc->rc_size); 1154 } 1155 } 1156 1157 done: 1158 zio_checksum_verified(zio); 1159 1160 if (zio->io_error == 0 && (spa_mode & FWRITE) && 1161 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { 1162 zio_t *rio; 1163 1164 /* 1165 * Use the good data we have in hand to repair damaged children. 1166 * 1167 * We issue all repair I/Os as children of 'rio' to arrange 1168 * that vdev_raidz_map_free(zio) will be invoked after all 1169 * repairs complete, but before we advance to the next stage. 1170 */ 1171 rio = zio_null(zio, zio->io_spa, 1172 vdev_raidz_repair_done, zio, ZIO_FLAG_CANFAIL); 1173 1174 for (c = 0; c < rm->rm_cols; c++) { 1175 rc = &rm->rm_col[c]; 1176 cvd = vd->vdev_child[rc->rc_devidx]; 1177 1178 if (rc->rc_error == 0) 1179 continue; 1180 1181 dprintf("%s resilvered %s @ 0x%llx error %d\n", 1182 vdev_description(vd), 1183 vdev_description(cvd), 1184 zio->io_offset, rc->rc_error); 1185 1186 zio_nowait(zio_vdev_child_io(rio, NULL, cvd, 1187 rc->rc_offset, rc->rc_data, rc->rc_size, 1188 ZIO_TYPE_WRITE, zio->io_priority, 1189 ZIO_FLAG_IO_REPAIR | ZIO_FLAG_DONT_PROPAGATE | 1190 ZIO_FLAG_CANFAIL, NULL, NULL)); 1191 } 1192 1193 zio_nowait(rio); 1194 zio_wait_children_done(zio); 1195 return; 1196 } 1197 1198 vdev_raidz_map_free(zio); 1199 zio_next_stage(zio); 1200 } 1201 1202 static void 1203 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 1204 { 1205 if (faulted > vd->vdev_nparity) 1206 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1207 VDEV_AUX_NO_REPLICAS); 1208 else if (degraded + faulted != 0) 1209 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 1210 else 1211 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 1212 } 1213 1214 vdev_ops_t vdev_raidz_ops = { 1215 vdev_raidz_open, 1216 vdev_raidz_close, 1217 vdev_raidz_asize, 1218 vdev_raidz_io_start, 1219 vdev_raidz_io_done, 1220 vdev_raidz_state_change, 1221 VDEV_TYPE_RAIDZ, /* name of this vdev type */ 1222 B_FALSE /* not a leaf vdev */ 1223 }; 1224