/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>

/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all
 * elements can be expressed with a single byte. Briefly, the operations on
 * the field are defined as follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition
 * rather than field addition). The inverse of a field element A (A^-1) is
 * therefore A^(255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q, R over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property which is why the uncorrected Plank method breaks down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */
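/*
 * As a concrete illustration of the arithmetic above (all values can be
 * checked against the vdev_raidz_pow2[] and vdev_raidz_log2[] tables
 * below):
 *
 *   o 2^7 = 0x80, and multiplying by 2 once more gives
 *	(0x80 << 1) ^ 0x1d = 0x1d, so vdev_raidz_pow2[8] == 0x1d.
 *
 *   o Multiplication via the log tables: 0x02 * 0x80 =
 *	2^(log_2(0x02) + log_2(0x80)) = 2^(1 + 7) = vdev_raidz_pow2[8] = 0x1d.
 *
 *   o Since 2^255 = 1, the inverse of 2^k is 2^(255 - k); for example,
 *	0x1d^-1 = 2^(255 - 8) = vdev_raidz_pow2[247] = 0x83.
 */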
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	void *rm_datacopy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t rm_freed;		/* map no longer has referencing ZIO */
	uint8_t rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2
#define	VDEV_RAIDZ_MAXPARITY	3

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
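/*
 * For example, applying VDEV_RAIDZ_64MUL_2 to x = 0x8001800180018001ULL
 * computes mask = 0x8000800080008000, which ((mask) << 1) - ((mask) >> 7)
 * expands to 0xff00ff00ff00ff00, so x becomes 0x1d021d021d021d02: each
 * 0x80 byte becomes 0x1d and each 0x01 byte becomes 0x02, exactly as if
 * VDEV_RAIDZ_MUL_2() had been applied to the eight bytes independently.
 */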
/*
 * Force reconstruction to use the general purpose method.
 */
int vdev_raidz_default_to_general;

/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
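/*
 * The two tables are inverses of one another: for any nonzero byte a,
 * vdev_raidz_pow2[vdev_raidz_log2[a]] == a; e.g. vdev_raidz_log2[0x1d]
 * is 0x08 and vdev_raidz_pow2[8] is 0x1d. The entry vdev_raidz_log2[0]
 * is merely a placeholder, as zero has no logarithm.
 */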
static void vdev_raidz_generate_parity(raidz_map_t *rm);

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}
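/*
 * For example, vdev_raidz_exp2(0x1d, 1) computes 0x1d * 2: exp becomes
 * 1 + vdev_raidz_log2[0x1d] = 9 and vdev_raidz_pow2[9] == 0x3a, which
 * agrees with VDEV_RAIDZ_MUL_2(0x1d) = (0x1d << 1) = 0x3a.
 */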
static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;
	size_t size;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);

		if (rm->rm_col[c].rc_gdata != NULL)
			zio_buf_free(rm->rm_col[c].rc_gdata,
			    rm->rm_col[c].rc_size);
	}

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	if (rm->rm_datacopy != NULL)
		zio_buf_free(rm->rm_datacopy, size);

	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	ASSERT3U(rm->rm_freed, ==, 0);
	rm->rm_freed = 1;

	if (rm->rm_reports == 0)
		vdev_raidz_map_free(rm);
}

/*ARGSUSED*/
static void
vdev_raidz_cksum_free(void *arg, size_t ignored)
{
	raidz_map_t *rm = arg;

	ASSERT3U(rm->rm_reports, >, 0);

	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
		vdev_raidz_map_free(rm);
}

static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;
	size_t x;

	const char *good = NULL;
	const char *bad = rm->rm_col[c].rc_data;

	if (good_data == NULL) {
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
			char *buf;

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_data;
				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
				    zio_buf_alloc(rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			buf = (char *)good_data;
			for (; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++)
				rm->rm_col[x].rc_data = bad_parity[x];

			buf = rm->rm_datacopy;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = rm->rm_col[c].rc_gdata;
	} else {
		/* adjust good_data to point at the start of our column */
		good = good_data;

		for (x = rm->rm_firstdatacol; x < c; x++)
			good += rm->rm_col[x].rc_size;
	}

	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
}

/*
 * Invoked indirectly by zfs_ereport_start_checksum(), called
 * below when our read operation fails completely.  The main point
 * is to keep a copy of everything we read from disk, so that at
 * vdev_raidz_cksum_finish() time we can compare it with the good data.
 */
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
	size_t c = (size_t)(uintptr_t)arg;
	caddr_t buf;

	raidz_map_t *rm = zio->io_vsd;
	size_t size;

	/* set up the report and bump the refcount */
	zcr->zcr_cbdata = rm;
	zcr->zcr_cbinfo = c;
	zcr->zcr_finish = vdev_raidz_cksum_finish;
	zcr->zcr_free = vdev_raidz_cksum_free;

	rm->rm_reports++;
	ASSERT3U(rm->rm_reports, >, 0);

	if (rm->rm_datacopy != NULL)
		return;

	/*
	 * It's the first time we're called for this raidz_map_t, so we need
	 * to copy the data aside; there's no guarantee that our zio's buffer
	 * won't be re-used for something else.
	 *
	 * Our parity data is already in separate buffers, so there's no need
	 * to copy them.
	 */

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	buf = rm->rm_datacopy = zio_buf_alloc(size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		raidz_col_t *col = &rm->rm_col[c];

		bcopy(col->rc_data, buf, col->rc_size);
		col->rc_data = buf;

		buf += col->rc_size;
	}
	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
}

static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};

static raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
    uint64_t nparity)
{
	raidz_map_t *rm;
	uint64_t b = zio->io_offset >> unit_shift;
	uint64_t s = zio->io_size >> unit_shift;
	uint64_t f = b % dcols;
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	bc = (r == 0 ? 0 : r + nparity);
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	if (q == 0) {
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_datacopy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);

	rm->rm_col[c].rc_data = zio->io_data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
	return (rm);
}
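/*
 * To illustrate the geometry computed above with some arbitrarily chosen
 * parameters: a 1KB write to a 5-wide raidz1 (dcols = 5, nparity = 1)
 * with 512-byte sectors (unit_shift = 9) gives s = 2, q = 0, r = 2,
 * bc = 3 and tot = 3 -- one parity sector plus two data sectors. Since
 * q == 0, acols = 3 and scols = MIN(5, roundup(3, 2)) = 4. The map then
 * spans rm_asize = roundup(3, 2) = 4 sectors, with rm_nskip = 1 padding
 * sector accounted for starting at column rm_skipstart = 3.
 */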
static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p, *src, pcount, ccount, i;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p = *src;
			}
		} else {
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p ^= *src;
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}
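/*
 * Unrolling the Q computation above for three data columns D_0, D_1, D_2
 * shows the Horner-style evaluation at work:
 *
 *	Q = 2 * (2 * D_0 + D_1) + D_2 = 4 * D_0 + 2 * D_1 + D_2
 *
 * which matches the definition of Q at the top of this file.
 */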
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
	}
}

static int
vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, i;
	int x = tgts[0];
	int c;

	ASSERT(ntgts == 1);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(x < rm->rm_cols);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
	ASSERT(xcount > 0);

	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	dst = rm->rm_col[x].rc_data;
	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst = *src;
	}

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		if (c == x)
			continue;

		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
		count = MIN(ccount, xcount);

		for (i = 0; i < count; i++, dst++, src++) {
			*dst ^= *src;
		}
	}

	return (1 << VDEV_RAIDZ_P);
}

static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, mask, i;
	uint8_t *b;
	int x = tgts[0];
	int c, j, exp;

	ASSERT(ntgts == 1);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		if (c == x)
			ccount = 0;
		else
			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		count = MIN(ccount, xcount);

		if (c == rm->rm_firstdatacol) {
			for (i = 0; i < count; i++, dst++, src++) {
				*dst = *src;
			}
			for (; i < xcount; i++, dst++) {
				*dst = 0;
			}

		} else {
			for (i = 0; i < count; i++, dst++, src++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
				*dst ^= *src;
			}

			for (; i < xcount; i++, dst++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
			}
		}
	}

	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	dst = rm->rm_col[x].rc_data;
	exp = 255 - (rm->rm_cols - 1 - x);

	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst ^= *src;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, exp);
		}
	}

	return (1 << VDEV_RAIDZ_Q);
}

static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
	void *pdata, *qdata;
	uint64_t xsize, ysize, i;
	int x = tgts[0];
	int y = tgts[1];

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = pdata;
	q = qdata;
	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xd = rm->rm_col[x].rc_data;
	yd = rm->rm_col[y].rc_data;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^-(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
		    vdev_raidz_exp2(*q ^ *qxy, bexp);

		if (i < ysize)
			*yd = *p ^ *pxy ^ *xd;
	}

	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
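/*
 * For reference, the expressions for A and B used above follow from
 * eliminating D_y between the two equations: multiplying
 * P + Pxy = D_x + D_y by 2^(ndevs - 1 - y) and adding it to
 * Q + Qxy = 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y yields
 *
 *	(2^(ndevs - 1 - x) + 2^(ndevs - 1 - y)) * D_x =
 *	    2^(ndevs - 1 - y) * (P + Pxy) + (Q + Qxy)
 *
 * Dividing both sides by the coefficient of D_x and factoring
 * 2^(ndevs - 1 - x) out of that coefficient gives exactly the A and B
 * shown above.
 */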
/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
 * computation as well as linear separability.
 *
 *      __               __               __     __
 *      |   1   ..  1 1 1 |               |  p_0  |
 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
 *      |   :       : : : |   |   :   |   |  d_2  |
 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
 *      |   0   ..  0 0 1 |               | d_n-1 |
 *      ~~               ~~               ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16   8   4   2   1  | <-----+-+-- missing disks
 *           |  19 205 116  29  64  16   4   1  |      / /
 *           |  1   0   0   0   0   0   0   0  |     / /
 *           |  0   1   0   0   0   0   0   0  | <--'  /
 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
 *           |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           |  19 205 116  29  64  16   4   1  |
 *           |  1   0   0   0   0   0   0   0  |
 *           |  0   0   0   1   0   0   0   0  |
 *  (V|I)' = |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 *  __                                                                 __
 *  |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 *  |  19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0  |
 *  |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *  |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *  |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *  |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *  |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *  |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *  ~~                                                                 ~~
 *  __                                                                 __
 *  |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *  |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 *  |  19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0  |
 *  |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *  |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *  |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *  |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *  |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *  ~~                                                                 ~~
 *  __                                                                 __
 *  |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *  |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 *  |  0 205 116   0   0   0   0   0     0   1  19  29  64  16   4   1  |
 *  |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *  |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *  |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *  |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *  |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *  ~~                                                                 ~~
 *  __                                                                 __
 *  |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *  |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 *  |  0   0 185   0   0   0   0   0   205   1 222 208 141 221 201 204  |
 *  |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *  |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *  |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *  |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *  |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *  ~~                                                                 ~~
 *  __                                                                 __
 *  |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *  |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 *  |  0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209  |
 *  |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *  |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *  |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *  |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *  |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *  ~~                                                                 ~~
 *  __                                                                 __
 *  |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *  |  0   1   0   0   0   0   0   0   167 100   5  41 159 169 217 208  |
 *  |  0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209  |
 *  |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *  |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *  |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *  |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *  |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *  ~~                                                                 ~~
 *                   __                               __
 *                   |   0   0   1   0   0   0   0   0  |
 *                   | 167 100   5  41 159 169 217 208  |
 *                   | 166 100   4  40 158 168 216 209  |
 *   (V|I)'^-1 =     |   0   0   0   1   0   0   0   0  |
 *                   |   0   0   0   0   1   0   0   0  |
 *                   |   0   0   0   0   0   1   0   0  |
 *                   |   0   0   0   0   0   0   1   0  |
 *                   |   0   0   0   0   0   0   0   1  |
 *                   ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
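/*
 * For example, with n = 8 data columns, the row generated for Q
 * (map[i] == 1) is
 *
 *	{ 2^7, 2^6, ... 2^1, 2^0 } = { 128, 64, 32, 16, 8, 4, 2, 1 }
 *
 * and the row for R (map[i] == 2) is { 4^7, ... 4^1, 4^0 } =
 * { 19, 205, 116, 29, 64, 16, 4, 1 }, matching the second and third rows
 * of the example dispersal matrix above.
 */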
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows is properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}

static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}

static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;

	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}
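/*
 * The value returned above is a bitmap of the parity columns that were
 * consumed, using the same encoding as the specialized routines: for
 * example, a reconstruction drawing on P and R alone returns
 * (1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_R) == 5. These codes index the
 * raidz_corrected[] counters below.
 */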
static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}

static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
{
	vdev_t *cvd;
	uint64_t nparity = vd->vdev_nparity;
	int c;
	int lasterror = 0;
	int numerrors = 0;

	ASSERT(nparity > 0);

	if (nparity > VDEV_RAIDZ_MAXPARITY ||
	    vd->vdev_children < nparity + 1) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (EINVAL);
	}

	vdev_open_children(vd);

	for (c = 0; c < vd->vdev_children; c++) {
		cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error != 0) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}

		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*ashift = MAX(*ashift, cvd->vdev_ashift);
	}

	*asize *= vd->vdev_children;

	if (numerrors > nparity) {
		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}

static void
vdev_raidz_close(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_close(vd->vdev_child[c]);
}

static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	uint64_t cols = vd->vdev_children;
	uint64_t nparity = vd->vdev_nparity;

	asize = ((psize - 1) >> ashift) + 1;
	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
	asize = roundup(asize, nparity + 1) << ashift;

	return (asize);
}
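/*
 * For example, on the 5-wide raidz1 with 512-byte sectors described
 * earlier, a 1KB psize becomes asize = 2 data sectors plus
 * ceil(2 / 4) = 1 parity sector, rounded up to a multiple of
 * nparity + 1 = 2 sectors: 4 sectors, or 2KB. This agrees with the
 * rm_asize that vdev_raidz_map_alloc() computes for the same I/O.
 */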
static void
vdev_raidz_child_done(zio_t *zio)
{
	raidz_col_t *rc = zio->io_private;

	rc->rc_error = zio->io_error;
	rc->rc_tried = 1;
	rc->rc_skipped = 0;
}

static int
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	blkptr_t *bp = zio->io_bp;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, i;

	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_generate_parity(rm);

		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		/*
		 * Generate optional I/Os for any skipped sectors to improve
		 * aggregation contiguity.
		 */
		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
			ASSERT(c <= rm->rm_scols);
			if (c == rm->rm_scols)
				c = 0;
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset + rc->rc_size, NULL,
			    1 << tvd->vdev_ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
		}

		return (ZIO_PIPELINE_CONTINUE);
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		if (!vdev_readable(cvd)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Report a checksum error for a child of a RAID-Z device.
 */
static void
raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
{
	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];

	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		zio_bad_cksum_t zbc;
		raidz_map_t *rm = zio->io_vsd;

		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_checksum_errors++;
		mutex_exit(&vd->vdev_stat_lock);

		zbc.zbc_has_cksum = 0;
		zbc.zbc_injected = rm->rm_ecksuminjected;

		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
		    &zbc);
	}
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(zio_t *zio)
{
	zio_bad_cksum_t zbc;
	raidz_map_t *rm = zio->io_vsd;

	int ret = zio_checksum_error(zio, &zbc);
	if (ret != 0 && zbc.zbc_injected != 0)
		rm->rm_ecksuminjected = 1;

	return (ret);
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zio_buf_alloc(rc->rc_size);
		bcopy(rc->rc_data, orig[c], rc->rc_size);
	}

	vdev_raidz_generate_parity(rm);

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			raidz_checksum_error(zio, rc, orig[c]);
			rc->rc_error = ECKSUM;
			ret++;
		}
		zio_buf_free(orig[c], rc->rc_size);
	}

	return (ret);
}

/*
 * Keep statistics on all the ways that we used parity to correct data.
 */
static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];

static int
vdev_raidz_worst_error(raidz_map_t *rm)
{
	int error = 0;

	for (int c = 0; c < rm->rm_cols; c++)
		error = zio_worst_error(error, rm->rm_col[c].rc_error);

	return (error);
}

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 */
static int
vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
{
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					if (rc->rc_tried)
						raidz_checksum_error(zio, rc,
						    orig[i]);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	n--;
done:
	for (i = 0; i < n; i++) {
		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}
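/*
 * To sketch the enumeration above: with double parity and no known
 * errors, the n = 1 pass tries each single data column ({ 2 }, { 3 },
 * ...), and the n = 2 pass walks tgts[] like an odometer -- { 0, 2 },
 * { 1, 2 }, { 0, 3 }, { 1, 3 }, { 2, 3 }, and so on -- until some
 * combination of targets yields data with a valid checksum.
 */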
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	int unexpected_errors = 0;
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	ASSERT(zio->io_bp != NULL);	/* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as a success.
		 * (If we couldn't write enough columns to reconstruct
		 * the data, the I/O failed.  Otherwise, good enough.)
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(zio) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		zio_vdev_io_redone(zio);
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		return;
	}

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		zio->io_error = vdev_raidz_worst_error(rm);

	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(zio, rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed.
		 */
		zio->io_error = ECKSUM;

		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			if (rc->rc_error == 0) {
				zio_bad_cksum_t zbc;
				zbc.zbc_has_cksum = 0;
				zbc.zbc_injected = rm->rm_ecksuminjected;

				zfs_ereport_start_checksum(
				    zio->io_spa, vd->vdev_child[rc->rc_devidx],
				    zio, rc->rc_offset, rc->rc_size,
				    (void *)(uintptr_t)c, &zbc);
			}
		}
	}

done:
	zio_checksum_verified(zio);

	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error == 0)
				continue;

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    ZIO_TYPE_WRITE, zio->io_priority,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}

static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
	if (faulted > vd->vdev_nparity)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_NO_REPLICAS);
	else if (degraded + faulted != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	else
		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}

vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};