1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/zfs_context.h> 28 #include <sys/spa.h> 29 #include <sys/vdev_impl.h> 30 #include <sys/zio.h> 31 #include <sys/zio_checksum.h> 32 #include <sys/fs/zfs.h> 33 #include <sys/fm/fs/zfs.h> 34 35 /* 36 * Virtual device vector for RAID-Z. 37 * 38 * This vdev supports single, double, and triple parity. For single parity, 39 * we use a simple XOR of all the data columns. For double or triple parity, 40 * we use a special case of Reed-Solomon coding. This extends the 41 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by 42 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for 43 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the 44 * former is also based. The latter is designed to provide higher performance 45 * for writes. 46 * 47 * Note that the Plank paper claimed to support arbitrary N+M, but was then 48 * amended six years later identifying a critical flaw that invalidates its 49 * claims. 
Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all
 * elements can be expressed with a single byte. Briefly, the operations on
 * the field are defined as follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). The inverse of a field element A (A^-1) is therefore
 * A ^ (255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q, R over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ...
 *	      + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property which is why the uncorrected Plank method breaks down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */

/*
 * Per-column state for one device's slice of a single RAID-Z I/O.
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

/*
 * Geometry of one logical I/O spread across the RAID-Z children: the first
 * rm_firstdatacol columns hold parity (P, then Q, then R), the remainder
 * hold data. Allocated with a trailing variable-length rm_col[] array of
 * rm_scols entries (see vdev_raidz_map_alloc/vdev_raidz_map_free).
 */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

/* Indices of the parity columns within rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2
#define	VDEV_RAIDZ_MAXPARITY	3

/* Multiply a single byte by 2 (respectively 4) in GF(2^8); see above. */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}

/*
 * Force reconstruction to use the general purpose method.
 */
int vdev_raidz_default_to_general;

/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint_t a, int exp)
{
	/* 0 has no logarithm; 0 times anything in the field is 0. */
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	/* a * 2^exp == 2^(log2(a) + exp), reduced mod 255. */
	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}

/*
 * zio->io_vsd destructor: free the parity buffers we allocated in
 * vdev_raidz_map_alloc() (the data columns alias the caller's io_data
 * buffer and are not freed here) and then the map itself.
 */
static void
vdev_raidz_map_free(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;
	int c;

	for (c = 0; c < rm->rm_firstdatacol; c++)
		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);

	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

/*
 * Divide a logical I/O into per-device columns and build the raidz_map_t
 * describing them.
 *
 *   zio		the I/O being mapped; io_offset/io_size define the range
 *   unit_shift		log2 of the device sector size
 *   dcols		number of child vdevs (data + parity)
 *   nparity		number of parity columns (1-3)
 *
 * The map is stored in zio->io_vsd with vdev_raidz_map_free as its
 * destructor, and also returned.
 */
static raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
    uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = zio->io_offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = zio->io_size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/* q: whole sectors per data column; r: leftover sectors. */
	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	/* bc: number of leading columns that carry one extra sector. */
	bc = (r == 0 ? 0 : r + nparity);
	/* tot: total sectors including parity. */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	if (q == 0) {
		/* Short write: fewer columns than devices. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		/* Wrap around to the first device, one sector further in. */
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	/* Parity columns get private buffers... */
	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);

	/*
	 * ...while data columns carve up the caller's buffer in order.
	 * Note that c == rm_firstdatacol when we get here.
	 */
	rm->rm_col[c].rc_data = zio->io_data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	zio->io_vsd = rm;
	zio->io_vsd_free = vdev_raidz_map_free;
	return (rm);
}

/*
 * Compute single parity: P is the plain XOR of all data columns.
 */
static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p, *src, pcount, ccount, i;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First column initializes P rather than XORing. */
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p = *src;
			}
		} else {
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p ^= *src;
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First column seeds both parity buffers. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}

/*
 * Compute triple parity: P (XOR), Q (generator 2) and R (generator 4);
 * same structure as vdev_raidz_generate_parity_pq() with R added.
 */
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First column seeds all three parity buffers. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
	}
}

/*
 * Reconstruct a single missing data column (tgts[0]) from P by XORing
 * the P column with every surviving data column. Returns a bitmap of
 * the parity columns used (P only).
 */
static int
vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, i;
	int x = tgts[0];
	int c;

	ASSERT(ntgts == 1);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(x < rm->rm_cols);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
	ASSERT(xcount > 0);

	/* Start from P... */
	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	dst = rm->rm_col[x].rc_data;
	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst = *src;
	}

	/* ...and XOR out every other data column. */
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		if (c == x)
			continue;

		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
		count = MIN(ccount, xcount);

		for (i = 0; i < count; i++, dst++, src++) {
			*dst ^= *src;
		}
	}

	return (1 << VDEV_RAIDZ_P);
}

/*
 * Reconstruct a single missing data column (tgts[0]) from Q: recompute Q
 * with the missing column treated as zero, XOR against the real Q, then
 * divide by the missing column's coefficient. Returns a bitmap of the
 * parity columns used (Q only).
 */
static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, mask, i;
	uint8_t *b;
	int x = tgts[0];
	int c, j, exp;

	ASSERT(ntgts == 1);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

	/* Accumulate Q over the surviving columns, zeroing column x. */
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		if (c == x)
			ccount = 0;
		else
			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		count = MIN(ccount, xcount);

		if (c == rm->rm_firstdatacol) {
			for (i = 0; i < count; i++, dst++, src++) {
				*dst = *src;
			}
			for (; i < xcount; i++, dst++) {
				*dst = 0;
			}

		} else {
			for (i = 0; i < count; i++, dst++, src++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
				*dst ^= *src;
			}

			for (; i < xcount; i++, dst++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
			}
		}
	}

	/*
	 * XOR against the stored Q and multiply each byte by the inverse
	 * of 2^(rm_cols - 1 - x), i.e. 2^(255 - (rm_cols - 1 - x)).
	 */
	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	dst = rm->rm_col[x].rc_data;
	exp = 255 - (rm->rm_cols - 1 - x);

	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst ^= *src;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, exp);
		}
	}

	return (1 << VDEV_RAIDZ_Q);
}

/*
 * Reconstruct two missing data columns (tgts[0] < tgts[1]) using both P
 * and Q. Returns a bitmap of the parity columns used (P and Q).
 */
static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
	void *pdata, *qdata;
	uint64_t xsize, ysize, i;
	int x = tgts[0];
	int y = tgts[1];

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	/* Compute Pxy/Qxy into the temporary parity buffers. */
	vdev_raidz_generate_parity_pq(rm);

	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = pdata;
	q = qdata;
	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xd = rm->rm_col[x].rc_data;
	yd = rm->rm_col[y].rc_data;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	/* tmp is the exponent of (2^(x - y) + 1)^-1, i.e. of (a ^ 1)^-1. */
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
		    vdev_raidz_exp2(*q ^ *qxy, bexp);

		/* Column y may be shorter than column x. */
		if (i < ysize)
			*yd = *p ^ *pxy ^ *xd;
	}

	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}

/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coeffecients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |                     |  p_0  |
 *            |  V  |       __     __     |   :   |
 *            |     |       |  D_0  |     | p_m-1 |
 *            |     |   x   |   :   |  =  |  d_0  |
 *            |  I  |       | D_n-1 |     |   :   |
 *            |     |       ~~     ~~     | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a vandermonde
 * matrix defined by the coeffecients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
 * computation as well as linear separability.
 *
 *      __         __                       __     __
 *      |   1   ..  1 1 1  |                |  p_0  |
 *      | 2^n-1 ..  4 2 1  |   __     __    |   :   |
 *      | 4^n-1 .. 16 4 1  |   |  D_0  |    | p_m-1 |
 *      |   1   ..  0 0 0  |   |  D_1  |    |  d_0  |
 *      |   0   ..  0 0 0  | x |  D_2  | =  |  d_1  |
 *      |   :       : : :  |   |   :   |    |  d_2  |
 *      |   0   ..  1 0 0  |   | D_n-1 |    |   :   |
 *      |   0   ..  0 1 0  |   ~~     ~~    |   :   |
 *      |   0   ..  0 0 1  |               | d_n-1 |
 *      ~~         ~~                       ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
 *           |  19 205 116  29  64  16  4   1  |      / /
 *           |  1   0   0   0   0   0   0   0  |     / /
 *           |  0   1   0   0   0   0   0   0  | <--' /
 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
 *           |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           |  19 205 116  29  64  16  4   1  |
 *           |  1   0   0   0   0   0   0   0  |
 *  (V|I)' = |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 * __                                                                 __
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0   1   0   0   0   0   0    166 100  4  40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   0   0   0   0   0   0    167 100  5  41 159 169 217 208 |
 * |  0   0   1   0   0   0   0   0    166 100  4  40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 *                   __                               __
 *                   |   0    0   1   0   0   0   0   0 |
 *                   | 167  100   5  41 159 169 217 208 |
 *                   | 166  100   4  40 158 168 216 209 |
 *       (V|I)'^-1 = |   0    0   0   1   0   0   0   0 |
 *                   |   0    0   0   0   1   0   0   0 |
 *                   |   0    0   0   0   0   1   0   0 |
 *                   |   0    0   0   0   0   0   1   0 |
 *                   |   0    0   0   0   0   0   0   1 |
 *                   ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

/*
 * Build the nmap rows of the dispersal matrix that correspond to the parity
 * columns listed in map[] (0 = P, 1 = Q, 2 = R): row i holds the powers of
 * 2^map[i] for each of the n data columns, reduced mod 255.
 */
static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		/* Exponent for the leftmost data column, mod 255. */
		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}

/*
 * Invert the nmissing rows of interest of the (V|I)' matrix using
 * Gauss-Jordan elimination, leaving the corresponding rows of the inverse
 * in invrows[]. rows[] is consumed (reduced to identity rows) in the
 * process.
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}

/*
 * Multiply the relevant rows of the inverse matrix by the (d|p)' vector to
 * recompute the bytes of each missing data column in place.
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	/*
	 * Precompute the logs of the inverse-matrix coefficients so the
	 * inner loop can multiply via log addition.
	 */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
1046 ASSERT3U(cc, !=, c); 1047 1048 dst[j] = rm->rm_col[cc].rc_data; 1049 dcount[j] = rm->rm_col[cc].rc_size; 1050 } 1051 1052 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); 1053 1054 for (x = 0; x < ccount; x++, src++) { 1055 if (*src != 0) 1056 log = vdev_raidz_log2[*src]; 1057 1058 for (cc = 0; cc < nmissing; cc++) { 1059 if (x >= dcount[cc]) 1060 continue; 1061 1062 if (*src == 0) { 1063 val = 0; 1064 } else { 1065 if ((ll = log + invlog[cc][i]) >= 255) 1066 ll -= 255; 1067 val = vdev_raidz_pow2[ll]; 1068 } 1069 1070 if (i == 0) 1071 dst[cc][x] = val; 1072 else 1073 dst[cc][x] ^= val; 1074 } 1075 } 1076 } 1077 1078 kmem_free(p, psize); 1079 } 1080 1081 static int 1082 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) 1083 { 1084 int n, i, c, t, tt; 1085 int nmissing_rows; 1086 int missing_rows[VDEV_RAIDZ_MAXPARITY]; 1087 int parity_map[VDEV_RAIDZ_MAXPARITY]; 1088 1089 uint8_t *p, *pp; 1090 size_t psize; 1091 1092 uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 1093 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 1094 uint8_t *used; 1095 1096 int code = 0; 1097 1098 1099 n = rm->rm_cols - rm->rm_firstdatacol; 1100 1101 /* 1102 * Figure out which data columns are missing. 1103 */ 1104 nmissing_rows = 0; 1105 for (t = 0; t < ntgts; t++) { 1106 if (tgts[t] >= rm->rm_firstdatacol) { 1107 missing_rows[nmissing_rows++] = 1108 tgts[t] - rm->rm_firstdatacol; 1109 } 1110 } 1111 1112 /* 1113 * Figure out which parity columns to use to help generate the missing 1114 * data columns. 1115 */ 1116 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 1117 ASSERT(tt < ntgts); 1118 ASSERT(c < rm->rm_firstdatacol); 1119 1120 /* 1121 * Skip any targeted parity columns. 
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/*
	 * One scratch row and one inverse row per missing column, plus the
	 * used[] column index array, all carved out of a single allocation.
	 */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	/* used[] lists the parity columns first, then the good data columns. */
	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}

/*
 * Reconstruct the data in the nt targeted columns listed (sorted) in t[],
 * preferring the optimized special-case routines (P, Q, or P+Q) when the
 * required parity is intact and falling back to the general matrix method
 * otherwise.  Returns the parity bitmask ("code") of the routine used.
 */
static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	/* Parity targets sort first, so dt points at the data targets. */
	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}

/*
 * Open a RAID-Z vdev: open all children and derive the aggregate asize
 * and ashift.  Fails if the configuration cannot supply the requested
 * parity, or if more children failed to open than parity can cover.
 */
static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
{
	vdev_t *cvd;
	uint64_t nparity = vd->vdev_nparity;
	int c;
	int lasterror = 0;
	int numerrors = 0;

	ASSERT(nparity > 0);

	if (nparity > VDEV_RAIDZ_MAXPARITY ||
	    vd->vdev_children < nparity + 1) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (EINVAL);
	}

	vdev_open_children(vd);

	for (c = 0; c < vd->vdev_children; c++) {
		cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error != 0) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}

		/* Clamp to the smallest child; every column must fit on it. */
		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*ashift = MAX(*ashift, cvd->vdev_ashift);
	}

	*asize *= vd->vdev_children;

	if (numerrors > nparity) {
		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}

/*
 * Close all child vdevs.
 */
static void
vdev_raidz_close(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_close(vd->vdev_child[c]);
}

/*
 * Convert a logical size (psize) to an allocated size (asize): account for
 * the parity overhead and round up to a multiple of nparity + 1 sectors.
 */
static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	uint64_t cols = vd->vdev_children;
	uint64_t nparity = vd->vdev_nparity;

	asize = ((psize - 1) >> ashift) + 1;
	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
	asize = roundup(asize, nparity + 1) << ashift;

	return (asize);
}

/*
 * Done callback for child I/Os: record the result in the raidz column.
 */
static void
vdev_raidz_child_done(zio_t *zio)
{
	raidz_col_t *rc = zio->io_private;

	rc->rc_error = zio->io_error;
	rc->rc_tried = 1;
	rc->rc_skipped = 0;
}

/*
 * Start a RAID-Z I/O: map the zio onto child columns and issue child I/Os.
 * Writes generate parity and write every column; reads issue the data
 * columns, adding the parity columns only when a column is already known
 * missing or the I/O is a scrub/resilver.
 */
static int
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	blkptr_t *bp = zio->io_bp;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, i;

	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_generate_parity(rm);

		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		/*
		 * Generate optional I/Os for any skipped sectors to improve
		 * aggregation contiguity.
		 */
		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
			ASSERT(c <= rm->rm_scols);
			if (c == rm->rm_scols)
				c = 0;
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset + rc->rc_size, NULL,
			    1 << tvd->vdev_ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
		}

		return (ZIO_PIPELINE_CONTINUE);
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		if (!vdev_readable(cvd)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Report a checksum error for a
 * child of a RAID-Z device.
 */
static void
raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
{
	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];

	/* Speculative I/Os neither bump error counts nor post ereports. */
	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_checksum_errors++;
		mutex_exit(&vd->vdev_stat_lock);
	}

	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
		zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
		    zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number such failures.
 */
static int
raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	/* Save copies of the parity columns we actually read. */
	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zio_buf_alloc(rc->rc_size);
		bcopy(rc->rc_data, orig[c], rc->rc_size);
	}

	vdev_raidz_generate_parity(rm);

	/* Compare each saved copy against the regenerated parity. */
	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			raidz_checksum_error(zio, rc);
			rc->rc_error = ECKSUM;
			ret++;
		}
		zio_buf_free(orig[c], rc->rc_size);
	}

	return (ret);
}

/*
 * Keep statistics on all the ways that we used parity to correct data.
 */
static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];

/*
 * Return the most severe error recorded across all columns.
 */
static int
vdev_raidz_worst_error(raidz_map_t *rm)
{
	int error = 0;

	for (int c = 0; c < rm->rm_cols; c++)
		error = zio_worst_error(error, rm->rm_col[c].rc_error);

	return (error);
}

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 */
static int
vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
{
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (zio_checksum_error(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					if (rc->rc_tried) {
						if (bcmp(orig[i], rc->rc_data,
						    rc->rc_size) == 0)
							continue;
						raidz_checksum_error(zio, rc);
					}
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			/*
			 * Advance to the next combination of n target
			 * columns, odometer-style.
			 */
			do {
				/*
				 * Find the next valid column after the current
				 * position..
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	n--;
done:
	for (i = 0; i < n; i++) {
		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}

/*
 * Complete a RAID-Z I/O.  Writes tolerate up to nparity column failures.
 * Reads walk through up to three recovery phases (see the comment below)
 * and, on success, repair any damaged children with the good data.
 */
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	int unexpected_errors = 0;
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	/* Tally the per-column results. */
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as a success.
		 * (If we couldn't write enough columns to reconstruct
		 * the data, the I/O failed.  Otherwise, good enough.)
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (zio_checksum_error(zio) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (zio_checksum_error(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		zio_vdev_io_redone(zio);
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		return;
	}

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors >= rm->rm_firstdatacol) {
		zio->io_error = vdev_raidz_worst_error(rm);
		/*
		 * If there were exactly as many device errors as parity
		 * columns, yet we couldn't reconstruct the data, then at
		 * least one device must have returned bad data silently.
		 */
		if (total_errors == rm->rm_firstdatacol)
			zio->io_error = zio_worst_error(zio->io_error, ECKSUM);

	} else if ((code = vdev_raidz_combrec(zio, total_errors,
	    data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(zio, rm);
	} else {
		/*
		 * All combinations failed to checksum. Generate checksum
		 * ereports for all children.
		 */
		zio->io_error = ECKSUM;

		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
				    zio->io_spa, vd->vdev_child[rc->rc_devidx],
				    zio, rc->rc_offset, rc->rc_size);
			}
		}
	}

done:
	zio_checksum_verified(zio);

	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error == 0)
				continue;

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    ZIO_TYPE_WRITE, zio->io_priority,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}

/*
 * Propagate child faulted/degraded counts up to this RAID-Z vdev's state.
 */
static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
	if (faulted > vd->vdev_nparity)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_NO_REPLICAS);
	else if (degraded + faulted != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	else
		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}

/* Operations vector for the RAID-Z vdev type. */
vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};