1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/zfs_context.h> 28 #include <sys/spa.h> 29 #include <sys/vdev_impl.h> 30 #include <sys/zio.h> 31 #include <sys/zio_checksum.h> 32 #include <sys/fs/zfs.h> 33 #include <sys/fm/fs/zfs.h> 34 35 /* 36 * Virtual device vector for RAID-Z. 37 * 38 * This vdev supports single, double, and triple parity. For single parity, 39 * we use a simple XOR of all the data columns. For double or triple parity, 40 * we use a special case of Reed-Solomon coding. This extends the 41 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by 42 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for 43 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the 44 * former is also based. The latter is designed to provide higher performance 45 * for writes. 46 * 47 * Note that the Plank paper claimed to support arbitrary N+M, but was then 48 * amended six years later identifying a critical flaw that invalidates its 49 * claims. 
 * Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all
 * elements can be expressed with a single byte. Briefly, the operations on
 * the field are defined as follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). The inverse of a field element A (A^-1) is therefore
 * A ^ (255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q, R over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ...
+ 4^1 * D_n-2 + 4^0 * D_n-1 90 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 91 * 92 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival 93 * XOR operation, and 2 and 4 can be computed quickly and generate linearly- 94 * independent coefficients. (There are no additional coefficients that have 95 * this property which is why the uncorrected Plank method breaks down.) 96 * 97 * See the reconstruction code below for how P, Q and R can used individually 98 * or in concert to recover missing data columns. 99 */ 100 101 typedef struct raidz_col { 102 uint64_t rc_devidx; /* child device index for I/O */ 103 uint64_t rc_offset; /* device offset */ 104 uint64_t rc_size; /* I/O size */ 105 void *rc_data; /* I/O data */ 106 int rc_error; /* I/O error for this device */ 107 uint8_t rc_tried; /* Did we attempt this I/O column? */ 108 uint8_t rc_skipped; /* Did we skip this I/O column? */ 109 } raidz_col_t; 110 111 typedef struct raidz_map { 112 uint64_t rm_cols; /* Regular column count */ 113 uint64_t rm_scols; /* Count including skipped columns */ 114 uint64_t rm_bigcols; /* Number of oversized columns */ 115 uint64_t rm_asize; /* Actual total I/O size */ 116 uint64_t rm_missingdata; /* Count of missing data devices */ 117 uint64_t rm_missingparity; /* Count of missing parity devices */ 118 uint64_t rm_firstdatacol; /* First data column/parity count */ 119 uint64_t rm_skipped; /* Skipped sectors for padding */ 120 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ 121 } raidz_map_t; 122 123 #define VDEV_RAIDZ_P 0 124 #define VDEV_RAIDZ_Q 1 125 #define VDEV_RAIDZ_R 2 126 #define VDEV_RAIDZ_MAXPARITY 3 127 128 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) 129 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) 130 131 /* 132 * We provide a mechanism to perform the field multiplication operation on a 133 * 64-bit value all at once rather than a byte at a time. 
This works by 134 * creating a mask from the top bit in each byte and using that to 135 * conditionally apply the XOR of 0x1d. 136 */ 137 #define VDEV_RAIDZ_64MUL_2(x, mask) \ 138 { \ 139 (mask) = (x) & 0x8080808080808080ULL; \ 140 (mask) = ((mask) << 1) - ((mask) >> 7); \ 141 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ 142 ((mask) & 0x1d1d1d1d1d1d1d1d); \ 143 } 144 145 #define VDEV_RAIDZ_64MUL_4(x, mask) \ 146 { \ 147 VDEV_RAIDZ_64MUL_2((x), mask); \ 148 VDEV_RAIDZ_64MUL_2((x), mask); \ 149 } 150 151 /* 152 * Force reconstruction to use the general purpose method. 153 */ 154 int vdev_raidz_default_to_general; 155 156 /* 157 * These two tables represent powers and logs of 2 in the Galois field defined 158 * above. These values were computed by repeatedly multiplying by 2 as above. 159 */ 160 static const uint8_t vdev_raidz_pow2[256] = { 161 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 162 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 163 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 164 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 165 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 166 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 167 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, 168 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 169 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 170 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, 171 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 172 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 173 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 174 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 175 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 176 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 177 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 178 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 179 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 180 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 181 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 182 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 183 
0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 184 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 185 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, 186 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 187 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 188 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, 189 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 190 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, 191 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 192 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 193 }; 194 static const uint8_t vdev_raidz_log2[256] = { 195 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 196 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, 197 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, 198 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, 199 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, 200 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, 201 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, 202 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, 203 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, 204 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, 205 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, 206 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, 207 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, 208 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, 209 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, 210 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, 211 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, 212 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, 213 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, 214 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, 215 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, 216 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, 217 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, 218 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, 219 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, 220 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, 221 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, 222 0xfb, 0x60, 0x86, 
0xb1, 0xbb, 0xcc, 0x3e, 0x5a, 223 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, 224 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, 225 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, 226 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, 227 }; 228 229 /* 230 * Multiply a given number by 2 raised to the given power. 231 */ 232 static uint8_t 233 vdev_raidz_exp2(uint_t a, int exp) 234 { 235 if (a == 0) 236 return (0); 237 238 ASSERT(exp >= 0); 239 ASSERT(vdev_raidz_log2[a] > 0 || a == 1); 240 241 exp += vdev_raidz_log2[a]; 242 if (exp > 255) 243 exp -= 255; 244 245 return (vdev_raidz_pow2[exp]); 246 } 247 248 static void 249 vdev_raidz_map_free(zio_t *zio) 250 { 251 raidz_map_t *rm = zio->io_vsd; 252 int c; 253 254 for (c = 0; c < rm->rm_firstdatacol; c++) 255 zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); 256 257 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); 258 } 259 260 static raidz_map_t * 261 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, 262 uint64_t nparity) 263 { 264 raidz_map_t *rm; 265 uint64_t b = zio->io_offset >> unit_shift; 266 uint64_t s = zio->io_size >> unit_shift; 267 uint64_t f = b % dcols; 268 uint64_t o = (b / dcols) << unit_shift; 269 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; 270 271 q = s / (dcols - nparity); 272 r = s - q * (dcols - nparity); 273 bc = (r == 0 ? 0 : r + nparity); 274 tot = s + nparity * (q + (r == 0 ? 
0 : 1)); 275 276 if (q == 0) { 277 acols = bc; 278 scols = MIN(dcols, roundup(bc, nparity + 1)); 279 } else { 280 acols = dcols; 281 scols = dcols; 282 } 283 284 ASSERT3U(acols, <=, scols); 285 286 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); 287 288 rm->rm_cols = acols; 289 rm->rm_scols = scols; 290 rm->rm_bigcols = bc; 291 rm->rm_missingdata = 0; 292 rm->rm_missingparity = 0; 293 rm->rm_firstdatacol = nparity; 294 295 asize = 0; 296 297 for (c = 0; c < scols; c++) { 298 col = f + c; 299 coff = o; 300 if (col >= dcols) { 301 col -= dcols; 302 coff += 1ULL << unit_shift; 303 } 304 rm->rm_col[c].rc_devidx = col; 305 rm->rm_col[c].rc_offset = coff; 306 rm->rm_col[c].rc_data = NULL; 307 rm->rm_col[c].rc_error = 0; 308 rm->rm_col[c].rc_tried = 0; 309 rm->rm_col[c].rc_skipped = 0; 310 311 if (c >= acols) 312 rm->rm_col[c].rc_size = 0; 313 else if (c < bc) 314 rm->rm_col[c].rc_size = (q + 1) << unit_shift; 315 else 316 rm->rm_col[c].rc_size = q << unit_shift; 317 318 asize += rm->rm_col[c].rc_size; 319 } 320 321 ASSERT3U(asize, ==, tot << unit_shift); 322 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); 323 rm->rm_skipped = roundup(tot, nparity + 1) - tot; 324 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_skipped << unit_shift); 325 ASSERT3U(rm->rm_skipped, <=, nparity); 326 327 for (c = 0; c < rm->rm_firstdatacol; c++) 328 rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); 329 330 rm->rm_col[c].rc_data = zio->io_data; 331 332 for (c = c + 1; c < acols; c++) 333 rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + 334 rm->rm_col[c - 1].rc_size; 335 336 /* 337 * If all data stored spans all columns, there's a danger that parity 338 * will always be on the same device and, since parity isn't read 339 * during normal operation, that that device's I/O bandwidth won't be 340 * used effectively. We therefore switch the parity every 1MB. 341 * 342 * ... at least that was, ostensibly, the theory. 
As a practical 343 * matter unless we juggle the parity between all devices evenly, we 344 * won't see any benefit. Further, occasional writes that aren't a 345 * multiple of the LCM of the number of children and the minimum 346 * stripe width are sufficient to avoid pessimal behavior. 347 * Unfortunately, this decision created an implicit on-disk format 348 * requirement that we need to support for all eternity, but only 349 * for single-parity RAID-Z. 350 */ 351 ASSERT(rm->rm_cols >= 2); 352 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 353 354 if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { 355 devidx = rm->rm_col[0].rc_devidx; 356 o = rm->rm_col[0].rc_offset; 357 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; 358 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 359 rm->rm_col[1].rc_devidx = devidx; 360 rm->rm_col[1].rc_offset = o; 361 } 362 363 zio->io_vsd = rm; 364 zio->io_vsd_free = vdev_raidz_map_free; 365 return (rm); 366 } 367 368 static void 369 vdev_raidz_generate_parity_p(raidz_map_t *rm) 370 { 371 uint64_t *p, *src, pcount, ccount, i; 372 int c; 373 374 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 375 376 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 377 src = rm->rm_col[c].rc_data; 378 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 379 ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 380 381 if (c == rm->rm_firstdatacol) { 382 ASSERT(ccount == pcount); 383 for (i = 0; i < ccount; i++, src++, p++) { 384 *p = *src; 385 } 386 } else { 387 ASSERT(ccount <= pcount); 388 for (i = 0; i < ccount; i++, src++, p++) { 389 *p ^= *src; 390 } 391 } 392 } 393 } 394 395 static void 396 vdev_raidz_generate_parity_pq(raidz_map_t *rm) 397 { 398 uint64_t *p, *q, *src, pcnt, ccnt, mask, i; 399 int c; 400 401 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 402 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 403 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 404 405 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 406 src = 
rm->rm_col[c].rc_data; 407 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 408 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 409 410 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 411 412 if (c == rm->rm_firstdatacol) { 413 ASSERT(ccnt == pcnt || ccnt == 0); 414 for (i = 0; i < ccnt; i++, src++, p++, q++) { 415 *p = *src; 416 *q = *src; 417 } 418 for (; i < pcnt; i++, src++, p++, q++) { 419 *p = 0; 420 *q = 0; 421 } 422 } else { 423 ASSERT(ccnt <= pcnt); 424 425 /* 426 * Apply the algorithm described above by multiplying 427 * the previous result and adding in the new value. 428 */ 429 for (i = 0; i < ccnt; i++, src++, p++, q++) { 430 *p ^= *src; 431 432 VDEV_RAIDZ_64MUL_2(*q, mask); 433 *q ^= *src; 434 } 435 436 /* 437 * Treat short columns as though they are full of 0s. 438 * Note that there's therefore nothing needed for P. 439 */ 440 for (; i < pcnt; i++, q++) { 441 VDEV_RAIDZ_64MUL_2(*q, mask); 442 } 443 } 444 } 445 } 446 447 static void 448 vdev_raidz_generate_parity_pqr(raidz_map_t *rm) 449 { 450 uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; 451 int c; 452 453 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 454 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 455 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 456 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 457 rm->rm_col[VDEV_RAIDZ_R].rc_size); 458 459 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 460 src = rm->rm_col[c].rc_data; 461 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 462 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 463 r = rm->rm_col[VDEV_RAIDZ_R].rc_data; 464 465 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 466 467 if (c == rm->rm_firstdatacol) { 468 ASSERT(ccnt == pcnt || ccnt == 0); 469 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 470 *p = *src; 471 *q = *src; 472 *r = *src; 473 } 474 for (; i < pcnt; i++, src++, p++, q++, r++) { 475 *p = 0; 476 *q = 0; 477 *r = 0; 478 } 479 } else { 480 ASSERT(ccnt <= pcnt); 481 482 /* 483 * Apply the algorithm described above by multiplying 484 * the previous result and adding 
in the new value. 485 */ 486 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 487 *p ^= *src; 488 489 VDEV_RAIDZ_64MUL_2(*q, mask); 490 *q ^= *src; 491 492 VDEV_RAIDZ_64MUL_4(*r, mask); 493 *r ^= *src; 494 } 495 496 /* 497 * Treat short columns as though they are full of 0s. 498 * Note that there's therefore nothing needed for P. 499 */ 500 for (; i < pcnt; i++, q++, r++) { 501 VDEV_RAIDZ_64MUL_2(*q, mask); 502 VDEV_RAIDZ_64MUL_4(*r, mask); 503 } 504 } 505 } 506 } 507 508 /* 509 * Generate RAID parity in the first virtual columns according to the number of 510 * parity columns available. 511 */ 512 static void 513 vdev_raidz_generate_parity(raidz_map_t *rm) 514 { 515 switch (rm->rm_firstdatacol) { 516 case 1: 517 vdev_raidz_generate_parity_p(rm); 518 break; 519 case 2: 520 vdev_raidz_generate_parity_pq(rm); 521 break; 522 case 3: 523 vdev_raidz_generate_parity_pqr(rm); 524 break; 525 default: 526 cmn_err(CE_PANIC, "invalid RAID-Z configuration"); 527 } 528 } 529 530 static int 531 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) 532 { 533 uint64_t *dst, *src, xcount, ccount, count, i; 534 int x = tgts[0]; 535 int c; 536 537 ASSERT(ntgts == 1); 538 ASSERT(x >= rm->rm_firstdatacol); 539 ASSERT(x < rm->rm_cols); 540 541 xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 542 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); 543 ASSERT(xcount > 0); 544 545 src = rm->rm_col[VDEV_RAIDZ_P].rc_data; 546 dst = rm->rm_col[x].rc_data; 547 for (i = 0; i < xcount; i++, dst++, src++) { 548 *dst = *src; 549 } 550 551 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 552 src = rm->rm_col[c].rc_data; 553 dst = rm->rm_col[x].rc_data; 554 555 if (c == x) 556 continue; 557 558 ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 559 count = MIN(ccount, xcount); 560 561 for (i = 0; i < count; i++, dst++, src++) { 562 *dst ^= *src; 563 } 564 } 565 566 return (1 << VDEV_RAIDZ_P); 567 } 568 569 static int 570 vdev_raidz_reconstruct_q(raidz_map_t *rm, 
int *tgts, int ntgts) 571 { 572 uint64_t *dst, *src, xcount, ccount, count, mask, i; 573 uint8_t *b; 574 int x = tgts[0]; 575 int c, j, exp; 576 577 ASSERT(ntgts == 1); 578 579 xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 580 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); 581 582 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 583 src = rm->rm_col[c].rc_data; 584 dst = rm->rm_col[x].rc_data; 585 586 if (c == x) 587 ccount = 0; 588 else 589 ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 590 591 count = MIN(ccount, xcount); 592 593 if (c == rm->rm_firstdatacol) { 594 for (i = 0; i < count; i++, dst++, src++) { 595 *dst = *src; 596 } 597 for (; i < xcount; i++, dst++) { 598 *dst = 0; 599 } 600 601 } else { 602 for (i = 0; i < count; i++, dst++, src++) { 603 VDEV_RAIDZ_64MUL_2(*dst, mask); 604 *dst ^= *src; 605 } 606 607 for (; i < xcount; i++, dst++) { 608 VDEV_RAIDZ_64MUL_2(*dst, mask); 609 } 610 } 611 } 612 613 src = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 614 dst = rm->rm_col[x].rc_data; 615 exp = 255 - (rm->rm_cols - 1 - x); 616 617 for (i = 0; i < xcount; i++, dst++, src++) { 618 *dst ^= *src; 619 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 620 *b = vdev_raidz_exp2(*b, exp); 621 } 622 } 623 624 return (1 << VDEV_RAIDZ_Q); 625 } 626 627 static int 628 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) 629 { 630 uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; 631 void *pdata, *qdata; 632 uint64_t xsize, ysize, i; 633 int x = tgts[0]; 634 int y = tgts[1]; 635 636 ASSERT(ntgts == 2); 637 ASSERT(x < y); 638 ASSERT(x >= rm->rm_firstdatacol); 639 ASSERT(y < rm->rm_cols); 640 641 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); 642 643 /* 644 * Move the parity data aside -- we're going to compute parity as 645 * though columns x and y were full of zeros -- Pxy and Qxy. 
We want to 646 * reuse the parity generation mechanism without trashing the actual 647 * parity so we make those columns appear to be full of zeros by 648 * setting their lengths to zero. 649 */ 650 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data; 651 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 652 xsize = rm->rm_col[x].rc_size; 653 ysize = rm->rm_col[y].rc_size; 654 655 rm->rm_col[VDEV_RAIDZ_P].rc_data = 656 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); 657 rm->rm_col[VDEV_RAIDZ_Q].rc_data = 658 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); 659 rm->rm_col[x].rc_size = 0; 660 rm->rm_col[y].rc_size = 0; 661 662 vdev_raidz_generate_parity_pq(rm); 663 664 rm->rm_col[x].rc_size = xsize; 665 rm->rm_col[y].rc_size = ysize; 666 667 p = pdata; 668 q = qdata; 669 pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; 670 qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 671 xd = rm->rm_col[x].rc_data; 672 yd = rm->rm_col[y].rc_data; 673 674 /* 675 * We now have: 676 * Pxy = P + D_x + D_y 677 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 678 * 679 * We can then solve for D_x: 680 * D_x = A * (P + Pxy) + B * (Q + Qxy) 681 * where 682 * A = 2^(x - y) * (2^(x - y) + 1)^-1 683 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 684 * 685 * With D_x in hand, we can easily solve for D_y: 686 * D_y = P + Pxy + D_x 687 */ 688 689 a = vdev_raidz_pow2[255 + x - y]; 690 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; 691 tmp = 255 - vdev_raidz_log2[a ^ 1]; 692 693 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 694 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 695 696 for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { 697 *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ 698 vdev_raidz_exp2(*q ^ *qxy, bexp); 699 700 if (i < ysize) 701 *yd = *p ^ *pxy ^ *xd; 702 } 703 704 zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, 705 rm->rm_col[VDEV_RAIDZ_P].rc_size); 706 zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, 707 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 708 709 /* 710 * Restore the saved parity data. 
711 */ 712 rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; 713 rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; 714 715 return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); 716 } 717 718 /* BEGIN CSTYLED */ 719 /* 720 * In the general case of reconstruction, we must solve the system of linear 721 * equations defined by the coeffecients used to generate parity as well as 722 * the contents of the data and parity disks. This can be expressed with 723 * vectors for the original data (D) and the actual data (d) and parity (p) 724 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 725 * 726 * __ __ __ __ 727 * | | __ __ | p_0 | 728 * | V | | D_0 | | p_m-1 | 729 * | | x | : | = | d_0 | 730 * | I | | D_n-1 | | : | 731 * | | ~~ ~~ | d_n-1 | 732 * ~~ ~~ ~~ ~~ 733 * 734 * I is simply a square identity matrix of size n, and V is a vandermonde 735 * matrix defined by the coeffecients we chose for the various parity columns 736 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 737 * computation as well as linear separability. 738 * 739 * __ __ __ __ 740 * | 1 .. 1 1 1 | | p_0 | 741 * | 2^n-1 .. 4 2 1 | __ __ | : | 742 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 743 * | 1 .. 0 0 0 | | D_1 | | d_0 | 744 * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 745 * | : : : : | | : | | d_2 | 746 * | 0 .. 1 0 0 | | D_n-1 | | : | 747 * | 0 .. 0 1 0 | ~~ ~~ | : | 748 * | 0 .. 0 0 1 | | d_n-1 | 749 * ~~ ~~ ~~ ~~ 750 * 751 * Note that I, V, d, and p are known. To compute D, we must invert the 752 * matrix and use the known data and parity values to reconstruct the unknown 753 * data values. We begin by removing the rows in V|I and d|p that correspond 754 * to failed or missing columns; we then make V|I square (n x n) and d|p 755 * sized n by removing rows corresponding to unused parity from the bottom up 756 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 757 * using Gauss-Jordan elimination. 
In the example below we use m=3 parity 758 * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 759 * __ __ 760 * | 1 1 1 1 1 1 1 1 | 761 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 762 * | 19 205 116 29 64 16 4 1 | / / 763 * | 1 0 0 0 0 0 0 0 | / / 764 * | 0 1 0 0 0 0 0 0 | <--' / 765 * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 766 * | 0 0 0 1 0 0 0 0 | 767 * | 0 0 0 0 1 0 0 0 | 768 * | 0 0 0 0 0 1 0 0 | 769 * | 0 0 0 0 0 0 1 0 | 770 * | 0 0 0 0 0 0 0 1 | 771 * ~~ ~~ 772 * __ __ 773 * | 1 1 1 1 1 1 1 1 | 774 * | 128 64 32 16 8 4 2 1 | 775 * | 19 205 116 29 64 16 4 1 | 776 * | 1 0 0 0 0 0 0 0 | 777 * | 0 1 0 0 0 0 0 0 | 778 * (V|I)' = | 0 0 1 0 0 0 0 0 | 779 * | 0 0 0 1 0 0 0 0 | 780 * | 0 0 0 0 1 0 0 0 | 781 * | 0 0 0 0 0 1 0 0 | 782 * | 0 0 0 0 0 0 1 0 | 783 * | 0 0 0 0 0 0 0 1 | 784 * ~~ ~~ 785 * 786 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 787 * have carefully chosen the seed values 1, 2, and 4 to ensure that this 788 * matrix is not singular. 
789 * __ __ 790 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 791 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 792 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 793 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 794 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 795 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 796 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 797 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 798 * ~~ ~~ 799 * __ __ 800 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 801 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 802 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 803 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 804 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 805 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 806 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 807 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 808 * ~~ ~~ 809 * __ __ 810 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 811 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 812 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 813 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 814 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 815 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 816 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 817 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 818 * ~~ ~~ 819 * __ __ 820 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 821 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 822 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 823 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 824 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 825 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 826 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 827 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 828 * ~~ ~~ 829 * __ __ 830 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 831 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 832 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 833 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 834 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 835 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 836 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 837 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 838 * ~~ ~~ 839 * __ __ 840 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 841 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 842 * | 0 0 1 0 0 0 0 0 166 100 4 40 
158 168 216 209 | 843 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 844 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 845 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 846 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 847 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 848 * ~~ ~~ 849 * __ __ 850 * | 0 0 1 0 0 0 0 0 | 851 * | 167 100 5 41 159 169 217 208 | 852 * | 166 100 4 40 158 168 216 209 | 853 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 854 * | 0 0 0 0 1 0 0 0 | 855 * | 0 0 0 0 0 1 0 0 | 856 * | 0 0 0 0 0 0 1 0 | 857 * | 0 0 0 0 0 0 0 1 | 858 * ~~ ~~ 859 * 860 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 861 * of the missing data. 862 * 863 * As is apparent from the example above, the only non-trivial rows in the 864 * inverse matrix correspond to the data disks that we're trying to 865 * reconstruct. Indeed, those are the only rows we need as the others would 866 * only be useful for reconstructing data known or assumed to be valid. For 867 * that reason, we only build the coefficients in the rows that correspond to 868 * targeted columns. 869 */ 870 /* END CSTYLED */ 871 872 static void 873 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, 874 uint8_t **rows) 875 { 876 int i, j; 877 int pow; 878 879 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); 880 881 /* 882 * Fill in the missing rows of interest. 
883 */ 884 for (i = 0; i < nmap; i++) { 885 ASSERT3S(0, <=, map[i]); 886 ASSERT3S(map[i], <=, 2); 887 888 pow = map[i] * n; 889 if (pow > 255) 890 pow -= 255; 891 ASSERT(pow <= 255); 892 893 for (j = 0; j < n; j++) { 894 pow -= map[i]; 895 if (pow < 0) 896 pow += 255; 897 rows[i][j] = vdev_raidz_pow2[pow]; 898 } 899 } 900 } 901 902 static void 903 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, 904 uint8_t **rows, uint8_t **invrows, const uint8_t *used) 905 { 906 int i, j, ii, jj; 907 uint8_t log; 908 909 /* 910 * Assert that the first nmissing entries from the array of used 911 * columns correspond to parity columns and that subsequent entries 912 * correspond to data columns. 913 */ 914 for (i = 0; i < nmissing; i++) { 915 ASSERT3S(used[i], <, rm->rm_firstdatacol); 916 } 917 for (; i < n; i++) { 918 ASSERT3S(used[i], >=, rm->rm_firstdatacol); 919 } 920 921 /* 922 * First initialize the storage where we'll compute the inverse rows. 923 */ 924 for (i = 0; i < nmissing; i++) { 925 for (j = 0; j < n; j++) { 926 invrows[i][j] = (i == j) ? 1 : 0; 927 } 928 } 929 930 /* 931 * Subtract all trivial rows from the rows of consequence. 932 */ 933 for (i = 0; i < nmissing; i++) { 934 for (j = nmissing; j < n; j++) { 935 ASSERT3U(used[j], >=, rm->rm_firstdatacol); 936 jj = used[j] - rm->rm_firstdatacol; 937 ASSERT3S(jj, <, n); 938 invrows[i][j] = rows[i][jj]; 939 rows[i][jj] = 0; 940 } 941 } 942 943 /* 944 * For each of the rows of interest, we must normalize it and subtract 945 * a multiple of it from the other rows. 946 */ 947 for (i = 0; i < nmissing; i++) { 948 for (j = 0; j < missing[i]; j++) { 949 ASSERT3U(rows[i][j], ==, 0); 950 } 951 ASSERT3U(rows[i][missing[i]], !=, 0); 952 953 /* 954 * Compute the inverse of the first element and multiply each 955 * element in the row by that value. 
956 */ 957 log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 958 959 for (j = 0; j < n; j++) { 960 rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 961 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 962 } 963 964 for (ii = 0; ii < nmissing; ii++) { 965 if (i == ii) 966 continue; 967 968 ASSERT3U(rows[ii][missing[i]], !=, 0); 969 970 log = vdev_raidz_log2[rows[ii][missing[i]]]; 971 972 for (j = 0; j < n; j++) { 973 rows[ii][j] ^= 974 vdev_raidz_exp2(rows[i][j], log); 975 invrows[ii][j] ^= 976 vdev_raidz_exp2(invrows[i][j], log); 977 } 978 } 979 } 980 981 /* 982 * Verify that the data that is left in the rows are properly part of 983 * an identity matrix. 984 */ 985 for (i = 0; i < nmissing; i++) { 986 for (j = 0; j < n; j++) { 987 if (j == missing[i]) { 988 ASSERT3U(rows[i][j], ==, 1); 989 } else { 990 ASSERT3U(rows[i][j], ==, 0); 991 } 992 } 993 } 994 } 995 996 static void 997 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, 998 int *missing, uint8_t **invrows, const uint8_t *used) 999 { 1000 int i, j, x, cc, c; 1001 uint8_t *src; 1002 uint64_t ccount; 1003 uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; 1004 uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; 1005 uint8_t log, val; 1006 int ll; 1007 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 1008 uint8_t *p, *pp; 1009 size_t psize; 1010 1011 psize = sizeof (invlog[0][0]) * n * nmissing; 1012 p = kmem_alloc(psize, KM_SLEEP); 1013 1014 for (pp = p, i = 0; i < nmissing; i++) { 1015 invlog[i] = pp; 1016 pp += n; 1017 } 1018 1019 for (i = 0; i < nmissing; i++) { 1020 for (j = 0; j < n; j++) { 1021 ASSERT3U(invrows[i][j], !=, 0); 1022 invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 1023 } 1024 } 1025 1026 for (i = 0; i < n; i++) { 1027 c = used[i]; 1028 ASSERT3U(c, <, rm->rm_cols); 1029 1030 src = rm->rm_col[c].rc_data; 1031 ccount = rm->rm_col[c].rc_size; 1032 for (j = 0; j < nmissing; j++) { 1033 cc = missing[j] + rm->rm_firstdatacol; 1034 ASSERT3U(cc, >=, rm->rm_firstdatacol); 1035 ASSERT3U(cc, <, rm->rm_cols); 1036 
ASSERT3U(cc, !=, c); 1037 1038 dst[j] = rm->rm_col[cc].rc_data; 1039 dcount[j] = rm->rm_col[cc].rc_size; 1040 } 1041 1042 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); 1043 1044 for (x = 0; x < ccount; x++, src++) { 1045 if (*src != 0) 1046 log = vdev_raidz_log2[*src]; 1047 1048 for (cc = 0; cc < nmissing; cc++) { 1049 if (x >= dcount[cc]) 1050 continue; 1051 1052 if (*src == 0) { 1053 val = 0; 1054 } else { 1055 if ((ll = log + invlog[cc][i]) >= 255) 1056 ll -= 255; 1057 val = vdev_raidz_pow2[ll]; 1058 } 1059 1060 if (i == 0) 1061 dst[cc][x] = val; 1062 else 1063 dst[cc][x] ^= val; 1064 } 1065 } 1066 } 1067 1068 kmem_free(p, psize); 1069 } 1070 1071 static int 1072 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) 1073 { 1074 int n, i, c, t, tt; 1075 int nmissing_rows; 1076 int missing_rows[VDEV_RAIDZ_MAXPARITY]; 1077 int parity_map[VDEV_RAIDZ_MAXPARITY]; 1078 1079 uint8_t *p, *pp; 1080 size_t psize; 1081 1082 uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 1083 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 1084 uint8_t *used; 1085 1086 int code = 0; 1087 1088 1089 n = rm->rm_cols - rm->rm_firstdatacol; 1090 1091 /* 1092 * Figure out which data columns are missing. 1093 */ 1094 nmissing_rows = 0; 1095 for (t = 0; t < ntgts; t++) { 1096 if (tgts[t] >= rm->rm_firstdatacol) { 1097 missing_rows[nmissing_rows++] = 1098 tgts[t] - rm->rm_firstdatacol; 1099 } 1100 } 1101 1102 /* 1103 * Figure out which parity columns to use to help generate the missing 1104 * data columns. 1105 */ 1106 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 1107 ASSERT(tt < ntgts); 1108 ASSERT(c < rm->rm_firstdatacol); 1109 1110 /* 1111 * Skip any targeted parity columns. 
1112 */ 1113 if (c == tgts[tt]) { 1114 tt++; 1115 continue; 1116 } 1117 1118 code |= 1 << c; 1119 1120 parity_map[i] = c; 1121 i++; 1122 } 1123 1124 ASSERT(code != 0); 1125 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); 1126 1127 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1128 nmissing_rows * n + sizeof (used[0]) * n; 1129 p = kmem_alloc(psize, KM_SLEEP); 1130 1131 for (pp = p, i = 0; i < nmissing_rows; i++) { 1132 rows[i] = pp; 1133 pp += n; 1134 invrows[i] = pp; 1135 pp += n; 1136 } 1137 used = pp; 1138 1139 for (i = 0; i < nmissing_rows; i++) { 1140 used[i] = parity_map[i]; 1141 } 1142 1143 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1144 if (tt < nmissing_rows && 1145 c == missing_rows[tt] + rm->rm_firstdatacol) { 1146 tt++; 1147 continue; 1148 } 1149 1150 ASSERT3S(i, <, n); 1151 used[i] = c; 1152 i++; 1153 } 1154 1155 /* 1156 * Initialize the interesting rows of the matrix. 1157 */ 1158 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); 1159 1160 /* 1161 * Invert the matrix. 1162 */ 1163 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, 1164 invrows, used); 1165 1166 /* 1167 * Reconstruct the missing data using the generated matrix. 1168 */ 1169 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, 1170 invrows, used); 1171 1172 kmem_free(p, psize); 1173 1174 return (code); 1175 } 1176 1177 static int 1178 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) 1179 { 1180 int tgts[VDEV_RAIDZ_MAXPARITY], *dt; 1181 int ntgts; 1182 int i, c; 1183 int code; 1184 int nbadparity, nbaddata; 1185 int parity_valid[VDEV_RAIDZ_MAXPARITY]; 1186 1187 /* 1188 * The tgts list must already be sorted. 
1189 */ 1190 for (i = 1; i < nt; i++) { 1191 ASSERT(t[i] > t[i - 1]); 1192 } 1193 1194 nbadparity = rm->rm_firstdatacol; 1195 nbaddata = rm->rm_cols - nbadparity; 1196 ntgts = 0; 1197 for (i = 0, c = 0; c < rm->rm_cols; c++) { 1198 if (c < rm->rm_firstdatacol) 1199 parity_valid[c] = B_FALSE; 1200 1201 if (i < nt && c == t[i]) { 1202 tgts[ntgts++] = c; 1203 i++; 1204 } else if (rm->rm_col[c].rc_error != 0) { 1205 tgts[ntgts++] = c; 1206 } else if (c >= rm->rm_firstdatacol) { 1207 nbaddata--; 1208 } else { 1209 parity_valid[c] = B_TRUE; 1210 nbadparity--; 1211 } 1212 } 1213 1214 ASSERT(ntgts >= nt); 1215 ASSERT(nbaddata >= 0); 1216 ASSERT(nbaddata + nbadparity == ntgts); 1217 1218 dt = &tgts[nbadparity]; 1219 1220 /* 1221 * See if we can use any of our optimized reconstruction routines. 1222 */ 1223 if (!vdev_raidz_default_to_general) { 1224 switch (nbaddata) { 1225 case 1: 1226 if (parity_valid[VDEV_RAIDZ_P]) 1227 return (vdev_raidz_reconstruct_p(rm, dt, 1)); 1228 1229 ASSERT(rm->rm_firstdatacol > 1); 1230 1231 if (parity_valid[VDEV_RAIDZ_Q]) 1232 return (vdev_raidz_reconstruct_q(rm, dt, 1)); 1233 1234 ASSERT(rm->rm_firstdatacol > 2); 1235 break; 1236 1237 case 2: 1238 ASSERT(rm->rm_firstdatacol > 1); 1239 1240 if (parity_valid[VDEV_RAIDZ_P] && 1241 parity_valid[VDEV_RAIDZ_Q]) 1242 return (vdev_raidz_reconstruct_pq(rm, dt, 2)); 1243 1244 ASSERT(rm->rm_firstdatacol > 2); 1245 1246 break; 1247 } 1248 } 1249 1250 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); 1251 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); 1252 ASSERT(code > 0); 1253 return (code); 1254 } 1255 1256 static int 1257 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) 1258 { 1259 vdev_t *cvd; 1260 uint64_t nparity = vd->vdev_nparity; 1261 int c; 1262 int lasterror = 0; 1263 int numerrors = 0; 1264 1265 ASSERT(nparity > 0); 1266 1267 if (nparity > VDEV_RAIDZ_MAXPARITY || 1268 vd->vdev_children < nparity + 1) { 1269 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 1270 return (EINVAL); 1271 } 
1272 1273 vdev_open_children(vd); 1274 1275 for (c = 0; c < vd->vdev_children; c++) { 1276 cvd = vd->vdev_child[c]; 1277 1278 if (cvd->vdev_open_error != 0) { 1279 lasterror = cvd->vdev_open_error; 1280 numerrors++; 1281 continue; 1282 } 1283 1284 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 1285 *ashift = MAX(*ashift, cvd->vdev_ashift); 1286 } 1287 1288 *asize *= vd->vdev_children; 1289 1290 if (numerrors > nparity) { 1291 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 1292 return (lasterror); 1293 } 1294 1295 return (0); 1296 } 1297 1298 static void 1299 vdev_raidz_close(vdev_t *vd) 1300 { 1301 int c; 1302 1303 for (c = 0; c < vd->vdev_children; c++) 1304 vdev_close(vd->vdev_child[c]); 1305 } 1306 1307 static uint64_t 1308 vdev_raidz_asize(vdev_t *vd, uint64_t psize) 1309 { 1310 uint64_t asize; 1311 uint64_t ashift = vd->vdev_top->vdev_ashift; 1312 uint64_t cols = vd->vdev_children; 1313 uint64_t nparity = vd->vdev_nparity; 1314 1315 asize = ((psize - 1) >> ashift) + 1; 1316 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 1317 asize = roundup(asize, nparity + 1) << ashift; 1318 1319 return (asize); 1320 } 1321 1322 static void 1323 vdev_raidz_child_done(zio_t *zio) 1324 { 1325 raidz_col_t *rc = zio->io_private; 1326 1327 rc->rc_error = zio->io_error; 1328 rc->rc_tried = 1; 1329 rc->rc_skipped = 0; 1330 } 1331 1332 static int 1333 vdev_raidz_io_start(zio_t *zio) 1334 { 1335 vdev_t *vd = zio->io_vd; 1336 vdev_t *tvd = vd->vdev_top; 1337 vdev_t *cvd; 1338 blkptr_t *bp = zio->io_bp; 1339 raidz_map_t *rm; 1340 raidz_col_t *rc; 1341 int c, i; 1342 1343 rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, 1344 vd->vdev_nparity); 1345 1346 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); 1347 1348 if (zio->io_type == ZIO_TYPE_WRITE) { 1349 vdev_raidz_generate_parity(rm); 1350 1351 for (c = 0; c < rm->rm_cols; c++) { 1352 rc = &rm->rm_col[c]; 1353 cvd = vd->vdev_child[rc->rc_devidx]; 1354 
zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1355 rc->rc_offset, rc->rc_data, rc->rc_size, 1356 zio->io_type, zio->io_priority, 0, 1357 vdev_raidz_child_done, rc)); 1358 } 1359 1360 /* 1361 * Generate optional I/Os for any skipped sectors to improve 1362 * aggregation contiguity. 1363 */ 1364 for (c = rm->rm_bigcols, i = 0; i < rm->rm_skipped; c++, i++) { 1365 ASSERT(c <= rm->rm_scols); 1366 if (c == rm->rm_scols) 1367 c = 0; 1368 rc = &rm->rm_col[c]; 1369 cvd = vd->vdev_child[rc->rc_devidx]; 1370 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1371 rc->rc_offset + rc->rc_size, NULL, 1372 1 << tvd->vdev_ashift, 1373 zio->io_type, zio->io_priority, 1374 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); 1375 } 1376 1377 return (ZIO_PIPELINE_CONTINUE); 1378 } 1379 1380 ASSERT(zio->io_type == ZIO_TYPE_READ); 1381 1382 /* 1383 * Iterate over the columns in reverse order so that we hit the parity 1384 * last -- any errors along the way will force us to read the parity. 1385 */ 1386 for (c = rm->rm_cols - 1; c >= 0; c--) { 1387 rc = &rm->rm_col[c]; 1388 cvd = vd->vdev_child[rc->rc_devidx]; 1389 if (!vdev_readable(cvd)) { 1390 if (c >= rm->rm_firstdatacol) 1391 rm->rm_missingdata++; 1392 else 1393 rm->rm_missingparity++; 1394 rc->rc_error = ENXIO; 1395 rc->rc_tried = 1; /* don't even try */ 1396 rc->rc_skipped = 1; 1397 continue; 1398 } 1399 if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) { 1400 if (c >= rm->rm_firstdatacol) 1401 rm->rm_missingdata++; 1402 else 1403 rm->rm_missingparity++; 1404 rc->rc_error = ESTALE; 1405 rc->rc_skipped = 1; 1406 continue; 1407 } 1408 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || 1409 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { 1410 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1411 rc->rc_offset, rc->rc_data, rc->rc_size, 1412 zio->io_type, zio->io_priority, 0, 1413 vdev_raidz_child_done, rc)); 1414 } 1415 } 1416 1417 return (ZIO_PIPELINE_CONTINUE); 1418 } 1419 1420 /* 1421 * Report a checksum error for a 
child of a RAID-Z device. 1422 */ 1423 static void 1424 raidz_checksum_error(zio_t *zio, raidz_col_t *rc) 1425 { 1426 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 1427 1428 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 1429 mutex_enter(&vd->vdev_stat_lock); 1430 vd->vdev_stat.vs_checksum_errors++; 1431 mutex_exit(&vd->vdev_stat_lock); 1432 } 1433 1434 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) 1435 zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, 1436 zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size); 1437 } 1438 1439 /* 1440 * Generate the parity from the data columns. If we tried and were able to 1441 * read the parity without error, verify that the generated parity matches the 1442 * data we read. If it doesn't, we fire off a checksum error. Return the 1443 * number such failures. 1444 */ 1445 static int 1446 raidz_parity_verify(zio_t *zio, raidz_map_t *rm) 1447 { 1448 void *orig[VDEV_RAIDZ_MAXPARITY]; 1449 int c, ret = 0; 1450 raidz_col_t *rc; 1451 1452 for (c = 0; c < rm->rm_firstdatacol; c++) { 1453 rc = &rm->rm_col[c]; 1454 if (!rc->rc_tried || rc->rc_error != 0) 1455 continue; 1456 orig[c] = zio_buf_alloc(rc->rc_size); 1457 bcopy(rc->rc_data, orig[c], rc->rc_size); 1458 } 1459 1460 vdev_raidz_generate_parity(rm); 1461 1462 for (c = 0; c < rm->rm_firstdatacol; c++) { 1463 rc = &rm->rm_col[c]; 1464 if (!rc->rc_tried || rc->rc_error != 0) 1465 continue; 1466 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { 1467 raidz_checksum_error(zio, rc); 1468 rc->rc_error = ECKSUM; 1469 ret++; 1470 } 1471 zio_buf_free(orig[c], rc->rc_size); 1472 } 1473 1474 return (ret); 1475 } 1476 1477 /* 1478 * Keep statistics on all the ways that we used parity to correct data. 
1479 */ 1480 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY]; 1481 1482 static int 1483 vdev_raidz_worst_error(raidz_map_t *rm) 1484 { 1485 int error = 0; 1486 1487 for (int c = 0; c < rm->rm_cols; c++) 1488 error = zio_worst_error(error, rm->rm_col[c].rc_error); 1489 1490 return (error); 1491 } 1492 1493 /* 1494 * Iterate over all combinations of bad data and attempt a reconstruction. 1495 * Note that the algorithm below is non-optimal because it doesn't take into 1496 * account how reconstruction is actually performed. For example, with 1497 * triple-parity RAID-Z the reconstruction procedure is the same if column 4 1498 * is targeted as invalid as if columns 1 and 4 are targeted since in both 1499 * cases we'd only use parity information in column 0. 1500 */ 1501 static int 1502 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) 1503 { 1504 raidz_map_t *rm = zio->io_vsd; 1505 raidz_col_t *rc; 1506 void *orig[VDEV_RAIDZ_MAXPARITY]; 1507 int tstore[VDEV_RAIDZ_MAXPARITY + 2]; 1508 int *tgts = &tstore[1]; 1509 int current, next, i, c, n; 1510 int code, ret = 0; 1511 1512 ASSERT(total_errors < rm->rm_firstdatacol); 1513 1514 /* 1515 * This simplifies one edge condition. 1516 */ 1517 tgts[-1] = -1; 1518 1519 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { 1520 /* 1521 * Initialize the targets array by finding the first n columns 1522 * that contain no error. 1523 * 1524 * If there were no data errors, we need to ensure that we're 1525 * always explicitly attempting to reconstruct at least one 1526 * data column. To do this, we simply push the highest target 1527 * up into the data columns. 
1528 */ 1529 for (c = 0, i = 0; i < n; i++) { 1530 if (i == n - 1 && data_errors == 0 && 1531 c < rm->rm_firstdatacol) { 1532 c = rm->rm_firstdatacol; 1533 } 1534 1535 while (rm->rm_col[c].rc_error != 0) { 1536 c++; 1537 ASSERT3S(c, <, rm->rm_cols); 1538 } 1539 1540 tgts[i] = c++; 1541 } 1542 1543 /* 1544 * Setting tgts[n] simplifies the other edge condition. 1545 */ 1546 tgts[n] = rm->rm_cols; 1547 1548 /* 1549 * These buffers were allocated in previous iterations. 1550 */ 1551 for (i = 0; i < n - 1; i++) { 1552 ASSERT(orig[i] != NULL); 1553 } 1554 1555 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size); 1556 1557 current = 0; 1558 next = tgts[current]; 1559 1560 while (current != n) { 1561 tgts[current] = next; 1562 current = 0; 1563 1564 /* 1565 * Save off the original data that we're going to 1566 * attempt to reconstruct. 1567 */ 1568 for (i = 0; i < n; i++) { 1569 ASSERT(orig[i] != NULL); 1570 c = tgts[i]; 1571 ASSERT3S(c, >=, 0); 1572 ASSERT3S(c, <, rm->rm_cols); 1573 rc = &rm->rm_col[c]; 1574 bcopy(rc->rc_data, orig[i], rc->rc_size); 1575 } 1576 1577 /* 1578 * Attempt a reconstruction and exit the outer loop on 1579 * success. 1580 */ 1581 code = vdev_raidz_reconstruct(rm, tgts, n); 1582 if (zio_checksum_error(zio) == 0) { 1583 atomic_inc_64(&raidz_corrected[code]); 1584 1585 for (i = 0; i < n; i++) { 1586 c = tgts[i]; 1587 rc = &rm->rm_col[c]; 1588 ASSERT(rc->rc_error == 0); 1589 if (rc->rc_tried) 1590 raidz_checksum_error(zio, rc); 1591 rc->rc_error = ECKSUM; 1592 } 1593 1594 ret = code; 1595 goto done; 1596 } 1597 1598 /* 1599 * Restore the original data. 1600 */ 1601 for (i = 0; i < n; i++) { 1602 c = tgts[i]; 1603 rc = &rm->rm_col[c]; 1604 bcopy(orig[i], rc->rc_data, rc->rc_size); 1605 } 1606 1607 do { 1608 /* 1609 * Find the next valid column after the current 1610 * position.. 
1611 */ 1612 for (next = tgts[current] + 1; 1613 next < rm->rm_cols && 1614 rm->rm_col[next].rc_error != 0; next++) 1615 continue; 1616 1617 ASSERT(next <= tgts[current + 1]); 1618 1619 /* 1620 * If that spot is available, we're done here. 1621 */ 1622 if (next != tgts[current + 1]) 1623 break; 1624 1625 /* 1626 * Otherwise, find the next valid column after 1627 * the previous position. 1628 */ 1629 for (c = tgts[current - 1] + 1; 1630 rm->rm_col[c].rc_error != 0; c++) 1631 continue; 1632 1633 tgts[current] = c; 1634 current++; 1635 1636 } while (current != n); 1637 } 1638 } 1639 n--; 1640 done: 1641 for (i = 0; i < n; i++) { 1642 zio_buf_free(orig[i], rm->rm_col[0].rc_size); 1643 } 1644 1645 return (ret); 1646 } 1647 1648 static void 1649 vdev_raidz_io_done(zio_t *zio) 1650 { 1651 vdev_t *vd = zio->io_vd; 1652 vdev_t *cvd; 1653 raidz_map_t *rm = zio->io_vsd; 1654 raidz_col_t *rc; 1655 int unexpected_errors = 0; 1656 int parity_errors = 0; 1657 int parity_untried = 0; 1658 int data_errors = 0; 1659 int total_errors = 0; 1660 int n, c; 1661 int tgts[VDEV_RAIDZ_MAXPARITY]; 1662 int code; 1663 1664 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ 1665 1666 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); 1667 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); 1668 1669 for (c = 0; c < rm->rm_cols; c++) { 1670 rc = &rm->rm_col[c]; 1671 1672 if (rc->rc_error) { 1673 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 1674 1675 if (c < rm->rm_firstdatacol) 1676 parity_errors++; 1677 else 1678 data_errors++; 1679 1680 if (!rc->rc_skipped) 1681 unexpected_errors++; 1682 1683 total_errors++; 1684 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { 1685 parity_untried++; 1686 } 1687 } 1688 1689 if (zio->io_type == ZIO_TYPE_WRITE) { 1690 /* 1691 * XXX -- for now, treat partial writes as a success. 1692 * (If we couldn't write enough columns to reconstruct 1693 * the data, the I/O failed. Otherwise, good enough.) 
1694 * 1695 * Now that we support write reallocation, it would be better 1696 * to treat partial failure as real failure unless there are 1697 * no non-degraded top-level vdevs left, and not update DTLs 1698 * if we intend to reallocate. 1699 */ 1700 /* XXPOLICY */ 1701 if (total_errors > rm->rm_firstdatacol) 1702 zio->io_error = vdev_raidz_worst_error(rm); 1703 1704 return; 1705 } 1706 1707 ASSERT(zio->io_type == ZIO_TYPE_READ); 1708 /* 1709 * There are three potential phases for a read: 1710 * 1. produce valid data from the columns read 1711 * 2. read all disks and try again 1712 * 3. perform combinatorial reconstruction 1713 * 1714 * Each phase is progressively both more expensive and less likely to 1715 * occur. If we encounter more errors than we can repair or all phases 1716 * fail, we have no choice but to return an error. 1717 */ 1718 1719 /* 1720 * If the number of errors we saw was correctable -- less than or equal 1721 * to the number of parity disks read -- attempt to produce data that 1722 * has a valid checksum. Naturally, this case applies in the absence of 1723 * any errors. 1724 */ 1725 if (total_errors <= rm->rm_firstdatacol - parity_untried) { 1726 if (data_errors == 0) { 1727 if (zio_checksum_error(zio) == 0) { 1728 /* 1729 * If we read parity information (unnecessarily 1730 * as it happens since no reconstruction was 1731 * needed) regenerate and verify the parity. 1732 * We also regenerate parity when resilvering 1733 * so we can write it out to the failed device 1734 * later. 1735 */ 1736 if (parity_errors + parity_untried < 1737 rm->rm_firstdatacol || 1738 (zio->io_flags & ZIO_FLAG_RESILVER)) { 1739 n = raidz_parity_verify(zio, rm); 1740 unexpected_errors += n; 1741 ASSERT(parity_errors + n <= 1742 rm->rm_firstdatacol); 1743 } 1744 goto done; 1745 } 1746 } else { 1747 /* 1748 * We either attempt to read all the parity columns or 1749 * none of them. If we didn't try to read parity, we 1750 * wouldn't be here in the correctable case. 
There must 1751 * also have been fewer parity errors than parity 1752 * columns or, again, we wouldn't be in this code path. 1753 */ 1754 ASSERT(parity_untried == 0); 1755 ASSERT(parity_errors < rm->rm_firstdatacol); 1756 1757 /* 1758 * Identify the data columns that reported an error. 1759 */ 1760 n = 0; 1761 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1762 rc = &rm->rm_col[c]; 1763 if (rc->rc_error != 0) { 1764 ASSERT(n < VDEV_RAIDZ_MAXPARITY); 1765 tgts[n++] = c; 1766 } 1767 } 1768 1769 ASSERT(rm->rm_firstdatacol >= n); 1770 1771 code = vdev_raidz_reconstruct(rm, tgts, n); 1772 1773 if (zio_checksum_error(zio) == 0) { 1774 atomic_inc_64(&raidz_corrected[code]); 1775 1776 /* 1777 * If we read more parity disks than were used 1778 * for reconstruction, confirm that the other 1779 * parity disks produced correct data. This 1780 * routine is suboptimal in that it regenerates 1781 * the parity that we already used in addition 1782 * to the parity that we're attempting to 1783 * verify, but this should be a relatively 1784 * uncommon case, and can be optimized if it 1785 * becomes a problem. Note that we regenerate 1786 * parity when resilvering so we can write it 1787 * out to failed devices later. 1788 */ 1789 if (parity_errors < rm->rm_firstdatacol - n || 1790 (zio->io_flags & ZIO_FLAG_RESILVER)) { 1791 n = raidz_parity_verify(zio, rm); 1792 unexpected_errors += n; 1793 ASSERT(parity_errors + n <= 1794 rm->rm_firstdatacol); 1795 } 1796 1797 goto done; 1798 } 1799 } 1800 } 1801 1802 /* 1803 * This isn't a typical situation -- either we got a read error or 1804 * a child silently returned bad data. Read every block so we can 1805 * try again with as much data and parity as we can track down. If 1806 * we've already been through once before, all children will be marked 1807 * as tried so we'll proceed to combinatorial reconstruction. 
1808 */ 1809 unexpected_errors = 1; 1810 rm->rm_missingdata = 0; 1811 rm->rm_missingparity = 0; 1812 1813 for (c = 0; c < rm->rm_cols; c++) { 1814 if (rm->rm_col[c].rc_tried) 1815 continue; 1816 1817 zio_vdev_io_redone(zio); 1818 do { 1819 rc = &rm->rm_col[c]; 1820 if (rc->rc_tried) 1821 continue; 1822 zio_nowait(zio_vdev_child_io(zio, NULL, 1823 vd->vdev_child[rc->rc_devidx], 1824 rc->rc_offset, rc->rc_data, rc->rc_size, 1825 zio->io_type, zio->io_priority, 0, 1826 vdev_raidz_child_done, rc)); 1827 } while (++c < rm->rm_cols); 1828 1829 return; 1830 } 1831 1832 /* 1833 * At this point we've attempted to reconstruct the data given the 1834 * errors we detected, and we've attempted to read all columns. There 1835 * must, therefore, be one or more additional problems -- silent errors 1836 * resulting in invalid data rather than explicit I/O errors resulting 1837 * in absent data. We check if there is enough additional data to 1838 * possibly reconstruct the data and then perform combinatorial 1839 * reconstruction over all possible combinations. If that fails, 1840 * we're cooked. 1841 */ 1842 if (total_errors >= rm->rm_firstdatacol) { 1843 zio->io_error = vdev_raidz_worst_error(rm); 1844 /* 1845 * If there were exactly as many device errors as parity 1846 * columns, yet we couldn't reconstruct the data, then at 1847 * least one device must have returned bad data silently. 1848 */ 1849 if (total_errors == rm->rm_firstdatacol) 1850 zio->io_error = zio_worst_error(zio->io_error, ECKSUM); 1851 1852 } else if ((code = vdev_raidz_combrec(zio, total_errors, 1853 data_errors)) != 0) { 1854 /* 1855 * If we didn't use all the available parity for the 1856 * combinatorial reconstruction, verify that the remaining 1857 * parity is correct. 1858 */ 1859 if (code != (1 << rm->rm_firstdatacol) - 1) 1860 (void) raidz_parity_verify(zio, rm); 1861 } else { 1862 /* 1863 * All combinations failed to checksum. Generate checksum 1864 * ereports for all children. 
1865 */ 1866 zio->io_error = ECKSUM; 1867 1868 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 1869 for (c = 0; c < rm->rm_cols; c++) { 1870 rc = &rm->rm_col[c]; 1871 zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, 1872 zio->io_spa, vd->vdev_child[rc->rc_devidx], 1873 zio, rc->rc_offset, rc->rc_size); 1874 } 1875 } 1876 } 1877 1878 done: 1879 zio_checksum_verified(zio); 1880 1881 if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 1882 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { 1883 /* 1884 * Use the good data we have in hand to repair damaged children. 1885 */ 1886 for (c = 0; c < rm->rm_cols; c++) { 1887 rc = &rm->rm_col[c]; 1888 cvd = vd->vdev_child[rc->rc_devidx]; 1889 1890 if (rc->rc_error == 0) 1891 continue; 1892 1893 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1894 rc->rc_offset, rc->rc_data, rc->rc_size, 1895 ZIO_TYPE_WRITE, zio->io_priority, 1896 ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 1897 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); 1898 } 1899 } 1900 } 1901 1902 static void 1903 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 1904 { 1905 if (faulted > vd->vdev_nparity) 1906 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1907 VDEV_AUX_NO_REPLICAS); 1908 else if (degraded + faulted != 0) 1909 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 1910 else 1911 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 1912 } 1913 1914 vdev_ops_t vdev_raidz_ops = { 1915 vdev_raidz_open, 1916 vdev_raidz_close, 1917 vdev_raidz_asize, 1918 vdev_raidz_io_start, 1919 vdev_raidz_io_done, 1920 vdev_raidz_state_change, 1921 VDEV_TYPE_RAIDZ, /* name of this vdev type */ 1922 B_FALSE /* not a leaf vdev */ 1923 }; 1924