1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/zfs_context.h> 28 #include <sys/spa.h> 29 #include <sys/vdev_impl.h> 30 #include <sys/zio.h> 31 #include <sys/zio_checksum.h> 32 #include <sys/fs/zfs.h> 33 #include <sys/fm/fs/zfs.h> 34 35 /* 36 * Virtual device vector for RAID-Z. 37 * 38 * This vdev supports single, double, and triple parity. For single parity, 39 * we use a simple XOR of all the data columns. For double or triple parity, 40 * we use a special case of Reed-Solomon coding. This extends the 41 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by 42 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for 43 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the 44 * former is also based. The latter is designed to provide higher performance 45 * for writes. 46 * 47 * Note that the Plank paper claimed to support arbitrary N+M, but was then 48 * amended six years later identifying a critical flaw that invalidates its 49 * claims. 
Nevertheless, the technique can be adapted to work for up to 50 * triple parity. For additional parity, the amendment "Note: Correction to 51 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding 52 * is viable, but the additional complexity means that write performance will 53 * suffer. 54 * 55 * All of the methods above operate on a Galois field, defined over the 56 * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements 57 * can be expressed with a single byte. Briefly, the operations on the 58 * field are defined as follows: 59 * 60 * o addition (+) is represented by a bitwise XOR 61 * o subtraction (-) is therefore identical to addition: A + B = A - B 62 * o multiplication of A by 2 is defined by the following bitwise expression: 63 * (A * 2)_7 = A_6 64 * (A * 2)_6 = A_5 65 * (A * 2)_5 = A_4 66 * (A * 2)_4 = A_3 + A_7 67 * (A * 2)_3 = A_2 + A_7 68 * (A * 2)_2 = A_1 + A_7 69 * (A * 2)_1 = A_0 70 * (A * 2)_0 = A_7 71 * 72 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). 73 * As an aside, this multiplication is derived from the error correcting 74 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. 75 * 76 * Observe that any number in the field (except for 0) can be expressed as a 77 * power of 2 -- a generator for the field. We store a table of the powers of 78 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can 79 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather 80 * than field addition). The inverse of a field element A (A^-1) is therefore 81 * A ^ (255 - 1) = A^254. 82 * 83 * The up-to-three parity columns, P, Q, R over several data columns, 84 * D_0, ... D_n-1, can be expressed by field operations: 85 * 86 * P = D_0 + D_1 + ... + D_n-2 + D_n-1 87 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 88 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 89 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... 
+ 4^1 * D_n-2 + 4^0 * D_n-1 90 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 91 * 92 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival 93 * XOR operation, and 2 and 4 can be computed quickly and generate linearly- 94 * independent coefficients. (There are no additional coefficients that have 95 * this property which is why the uncorrected Plank method breaks down.) 96 * 97 * See the reconstruction code below for how P, Q and R can used individually 98 * or in concert to recover missing data columns. 99 */ 100 101 typedef struct raidz_col { 102 uint64_t rc_devidx; /* child device index for I/O */ 103 uint64_t rc_offset; /* device offset */ 104 uint64_t rc_size; /* I/O size */ 105 void *rc_data; /* I/O data */ 106 void *rc_gdata; /* used to store the "good" version */ 107 int rc_error; /* I/O error for this device */ 108 uint8_t rc_tried; /* Did we attempt this I/O column? */ 109 uint8_t rc_skipped; /* Did we skip this I/O column? */ 110 } raidz_col_t; 111 112 typedef struct raidz_map { 113 uint64_t rm_cols; /* Regular column count */ 114 uint64_t rm_scols; /* Count including skipped columns */ 115 uint64_t rm_bigcols; /* Number of oversized columns */ 116 uint64_t rm_asize; /* Actual total I/O size */ 117 uint64_t rm_missingdata; /* Count of missing data devices */ 118 uint64_t rm_missingparity; /* Count of missing parity devices */ 119 uint64_t rm_firstdatacol; /* First data column/parity count */ 120 uint64_t rm_nskip; /* Skipped sectors for padding */ 121 uint64_t rm_skipstart; /* Column index of padding start */ 122 void *rm_datacopy; /* rm_asize-buffer of copied data */ 123 uintptr_t rm_reports; /* # of referencing checksum reports */ 124 uint8_t rm_freed; /* map no longer has referencing ZIO */ 125 uint8_t rm_ecksuminjected; /* checksum error was injected */ 126 uint64_t rm_skipped; /* Skipped sectors for padding */ 127 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ 128 } raidz_map_t; 129 130 #define 
VDEV_RAIDZ_P 0 131 #define VDEV_RAIDZ_Q 1 132 #define VDEV_RAIDZ_R 2 133 #define VDEV_RAIDZ_MAXPARITY 3 134 135 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) 136 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) 137 138 /* 139 * We provide a mechanism to perform the field multiplication operation on a 140 * 64-bit value all at once rather than a byte at a time. This works by 141 * creating a mask from the top bit in each byte and using that to 142 * conditionally apply the XOR of 0x1d. 143 */ 144 #define VDEV_RAIDZ_64MUL_2(x, mask) \ 145 { \ 146 (mask) = (x) & 0x8080808080808080ULL; \ 147 (mask) = ((mask) << 1) - ((mask) >> 7); \ 148 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ 149 ((mask) & 0x1d1d1d1d1d1d1d1d); \ 150 } 151 152 #define VDEV_RAIDZ_64MUL_4(x, mask) \ 153 { \ 154 VDEV_RAIDZ_64MUL_2((x), mask); \ 155 VDEV_RAIDZ_64MUL_2((x), mask); \ 156 } 157 158 /* 159 * Force reconstruction to use the general purpose method. 160 */ 161 int vdev_raidz_default_to_general; 162 163 /* 164 * These two tables represent powers and logs of 2 in the Galois field defined 165 * above. These values were computed by repeatedly multiplying by 2 as above. 
166 */ 167 static const uint8_t vdev_raidz_pow2[256] = { 168 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 169 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 170 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 171 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 172 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 173 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 174 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, 175 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 176 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 177 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, 178 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 179 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 180 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 181 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 182 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 183 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 184 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 185 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 186 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 187 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 188 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 189 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 190 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 191 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 192 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, 193 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 194 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 195 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, 196 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 197 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, 198 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 199 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 200 }; 201 static const uint8_t vdev_raidz_log2[256] = { 202 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 203 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, 204 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, 205 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, 206 0x05, 
0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, 207 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, 208 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, 209 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, 210 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, 211 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, 212 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, 213 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, 214 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, 215 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, 216 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, 217 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, 218 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, 219 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, 220 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, 221 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, 222 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, 223 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, 224 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, 225 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, 226 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, 227 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, 228 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, 229 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, 230 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, 231 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, 232 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, 233 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, 234 }; 235 236 static void vdev_raidz_generate_parity(raidz_map_t *rm); 237 238 /* 239 * Multiply a given number by 2 raised to the given power. 
 */
static uint8_t
vdev_raidz_exp2(uint_t a, int exp)
{
	/* 0 has no logarithm; 0 times anything is 0 */
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	/* wrap the exponent sum: the multiplicative group has order 255 */
	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}

/*
 * Free a raidz map: the parity column buffers (and any "good" parity copies),
 * the saved data copy if one was made for checksum reporting, and the map
 * itself. Data column buffers point into the zio's buffer or rm_datacopy and
 * are not freed individually.
 */
static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;
	size_t size = rm->rm_asize;	/* will hold data-size after the loop */

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		size -= rm->rm_col[c].rc_size;

		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);

		if (rm->rm_col[c].rc_gdata != NULL)
			zio_buf_free(rm->rm_col[c].rc_gdata,
			    rm->rm_col[c].rc_size);
	}

	if (rm->rm_datacopy != NULL)
		zio_buf_free(rm->rm_datacopy, size);

	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

/*
 * ZIO vsd destructor. Mark the map as released by its ZIO; if checksum
 * reports still reference it, the last vdev_raidz_cksum_free() call will
 * perform the actual free instead.
 */
static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	ASSERT3U(rm->rm_freed, ==, 0);
	rm->rm_freed = 1;

	if (rm->rm_reports == 0)
		vdev_raidz_map_free(rm);
}

/*
 * Drop one checksum-report reference on the map; free it once the owning
 * ZIO has already released it (rm_freed) and this was the last report.
 */
/*ARGSUSED*/
static void
vdev_raidz_cksum_free(void *arg, size_t ignored)
{
	raidz_map_t *rm = arg;

	ASSERT3U(rm->rm_reports, >, 0);
	ASSERT3U(rm->rm_freed, !=, 0);

	if (--rm->rm_reports == 0)
		vdev_raidz_map_free(rm);
}

/*
 * Complete a deferred checksum ereport for column c: derive the known-good
 * version of that column from good_data (regenerating parity if c is a
 * parity column), then hand the good/bad pair to the ereport machinery.
 */
static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;
	size_t x;

	const char *good = NULL;
	const char *bad = rm->rm_col[c].rc_data;

	if (good_data == NULL) {
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
			char *buf;

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_data;
				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
				    zio_buf_alloc(rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			buf = (char *)good_data;
			for (; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++)
				rm->rm_col[x].rc_data = bad_parity[x];

			/* data columns point back into the saved copy */
			buf = rm->rm_datacopy;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = rm->rm_col[c].rc_gdata;
	} else {
		/* adjust good_data to point at the start of our column */
		good = good_data;

		for (x = rm->rm_firstdatacol; x < c; x++)
			good += rm->rm_col[x].rc_size;
	}

	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
}

/*
 * Invoked indirectly by zfs_ereport_start_checksum(), called
 * below when our read operation fails completely. The main point
 * is to keep a copy of everything we read from disk, so that at
 * vdev_raidz_cksum_finish() time we can compare it with the good data.
 */
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
	size_t c = (size_t)(uintptr_t)arg;
	caddr_t buf;

	raidz_map_t *rm = zio->io_vsd;
	size_t size;

	/* set up the report and bump the refcount */
	zcr->zcr_cbdata = rm;
	zcr->zcr_cbinfo = c;
	zcr->zcr_finish = vdev_raidz_cksum_finish;
	zcr->zcr_free = vdev_raidz_cksum_free;

	rm->rm_reports++;
	ASSERT3U(rm->rm_reports, >, 0);

	/* only the first report needs to snapshot the data */
	if (rm->rm_reports != 1)
		return;

	/*
	 * It's the first time we're called, so we need to copy the data
	 * aside; there's no guarantee that our zio's buffer won't be
	 * re-used for something else.
	 *
	 * Our parity data is already in separate buffers, so there's no need
	 * to copy them.
	 */
	ASSERT3P(rm->rm_datacopy, ==, NULL);

	/* rm_asize includes the parity blocks; subtract them out */
	size = rm->rm_asize;
	for (c = 0; c < rm->rm_firstdatacol; c++)
		size -= rm->rm_col[c].rc_size;

	buf = rm->rm_datacopy = zio_buf_alloc(size);
	for (; c < rm->rm_cols; c++) {
		raidz_col_t *col = &rm->rm_col[c];

		bcopy(col->rc_data, buf, col->rc_size);
		col->rc_data = buf;

		buf += col->rc_size;
	}
	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
}

/* Hooks handed to the ZIO layer through zio->io_vsd_ops (set below). */
static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};

/*
 * Map the logical I/O described by zio onto the RAID-Z group: nparity
 * leading parity columns followed by data columns distributed round-robin
 * over the dcols children, with sectors of 1 << unit_shift bytes.
 * Allocates the parity buffers, points the data columns into zio->io_data,
 * and installs the map as zio->io_vsd. Returns the new map.
 */
static raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
    uint64_t nparity)
{
	raidz_map_t *rm;
	uint64_t b = zio->io_offset >> unit_shift;
	uint64_t s = zio->io_size >> unit_shift;
	uint64_t f = b % dcols;
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/*
	 * q: whole sectors per data column; r: leftover sectors, which make
	 * the first bc columns (parity plus r data columns) one sector
	 * bigger; tot: total sectors including one parity sector per row.
	 */
	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	bc = (r == 0 ? 0 : r + nparity);
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	if (q == 0) {
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_datacopy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		/* wrap onto the next stripe row of the group */
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	/* parity columns get their own buffers ... */
	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);

	/* ... data columns carve up the zio's buffer in order */
	rm->rm_col[c].rc_data = zio->io_data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
	return (rm);
}

/*
 * P parity is the simple XOR (GF(2^8) addition) of all data columns.
 */
static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p, *src, pcount, ccount, i;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* first column initializes P */
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p = *src;
			}
		} else {
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p ^= *src;
			}
		}
	}
}

/*
 * Generate P (XOR) and Q (Reed-Solomon, generator 2) parity, operating on
 * 64 bits at a time via the VDEV_RAIDZ_64MUL_2 byte-mask trick.
 */
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}

/*
 * Generate P (XOR), Q (generator 2), and R (generator 4) parity together;
 * R advances by VDEV_RAIDZ_64MUL_4 per data column.
 */
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* first column seeds all three parity buffers */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	/* rm_firstdatacol doubles as the parity column count */
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
	}
}

/*
 * Reconstruct the single missing data column tgts[0] from P parity: copy P
 * into the target, then XOR in every surviving data column. Returns a
 * bitmask identifying which parity columns were used.
 */
static int
vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, i;
	int x = tgts[0];
	int c;

	ASSERT(ntgts == 1);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(x < rm->rm_cols);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
	ASSERT(xcount > 0);

	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	dst = rm->rm_col[x].rc_data;
	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst = *src;
	}

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		if (c == x)
			continue;

		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
		count = MIN(ccount, xcount);

		for (i = 0; i < count; i++, dst++, src++) {
			*dst ^= *src;
		}
	}

	return (1 << VDEV_RAIDZ_P);
}

/*
 * Reconstruct the single missing data column tgts[0] from Q parity:
 * recompute Q's contribution of the surviving columns in the target buffer,
 * fold in Q itself, then divide out the target's coefficient byte by byte.
 * Returns a bitmask identifying which parity columns were used.
 */
static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, mask, i;
	uint8_t *b;
	int x = tgts[0];
	int c, j, exp;

	ASSERT(ntgts == 1);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		/* the missing column contributes zeros */
		if (c == x)
			ccount = 0;
		else
			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		count = MIN(ccount, xcount);

		if (c == rm->rm_firstdatacol) {
			for (i = 0; i < count; i++, dst++, src++) {
				*dst = *src;
			}
			for (; i < xcount; i++, dst++) {
				*dst = 0;
			}

		} else {
			for (i = 0; i < count; i++, dst++, src++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
				*dst ^= *src;
			}

			for (; i < xcount; i++, dst++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
			}
		}
	}

	/*
	 * XOR in Q, then multiply each byte by the inverse of the target
	 * column's coefficient 2^(rm_cols - 1 - x), i.e. by 2^exp.
	 */
	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	dst = rm->rm_col[x].rc_data;
	exp = 255 - (rm->rm_cols - 1 - x);

	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst ^= *src;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, exp);
		}
	}

	return (1 << VDEV_RAIDZ_Q);
}

/*
 * Reconstruct the two missing data columns tgts[0] < tgts[1] using both P
 * and Q parity. Returns a bitmask identifying which parity columns were
 * used.
 */
static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
	void *pdata, *qdata;
	uint64_t xsize, ysize, i;
	int x = tgts[0];
	int y = tgts[1];

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
839 */ 840 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data; 841 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 842 xsize = rm->rm_col[x].rc_size; 843 ysize = rm->rm_col[y].rc_size; 844 845 rm->rm_col[VDEV_RAIDZ_P].rc_data = 846 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); 847 rm->rm_col[VDEV_RAIDZ_Q].rc_data = 848 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); 849 rm->rm_col[x].rc_size = 0; 850 rm->rm_col[y].rc_size = 0; 851 852 vdev_raidz_generate_parity_pq(rm); 853 854 rm->rm_col[x].rc_size = xsize; 855 rm->rm_col[y].rc_size = ysize; 856 857 p = pdata; 858 q = qdata; 859 pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; 860 qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 861 xd = rm->rm_col[x].rc_data; 862 yd = rm->rm_col[y].rc_data; 863 864 /* 865 * We now have: 866 * Pxy = P + D_x + D_y 867 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 868 * 869 * We can then solve for D_x: 870 * D_x = A * (P + Pxy) + B * (Q + Qxy) 871 * where 872 * A = 2^(x - y) * (2^(x - y) + 1)^-1 873 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 874 * 875 * With D_x in hand, we can easily solve for D_y: 876 * D_y = P + Pxy + D_x 877 */ 878 879 a = vdev_raidz_pow2[255 + x - y]; 880 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; 881 tmp = 255 - vdev_raidz_log2[a ^ 1]; 882 883 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 884 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 885 886 for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { 887 *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ 888 vdev_raidz_exp2(*q ^ *qxy, bexp); 889 890 if (i < ysize) 891 *yd = *p ^ *pxy ^ *xd; 892 } 893 894 zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, 895 rm->rm_col[VDEV_RAIDZ_P].rc_size); 896 zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, 897 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 898 899 /* 900 * Restore the saved parity data. 
901 */ 902 rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; 903 rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; 904 905 return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); 906 } 907 908 /* BEGIN CSTYLED */ 909 /* 910 * In the general case of reconstruction, we must solve the system of linear 911 * equations defined by the coeffecients used to generate parity as well as 912 * the contents of the data and parity disks. This can be expressed with 913 * vectors for the original data (D) and the actual data (d) and parity (p) 914 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 915 * 916 * __ __ __ __ 917 * | | __ __ | p_0 | 918 * | V | | D_0 | | p_m-1 | 919 * | | x | : | = | d_0 | 920 * | I | | D_n-1 | | : | 921 * | | ~~ ~~ | d_n-1 | 922 * ~~ ~~ ~~ ~~ 923 * 924 * I is simply a square identity matrix of size n, and V is a vandermonde 925 * matrix defined by the coeffecients we chose for the various parity columns 926 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 927 * computation as well as linear separability. 928 * 929 * __ __ __ __ 930 * | 1 .. 1 1 1 | | p_0 | 931 * | 2^n-1 .. 4 2 1 | __ __ | : | 932 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 933 * | 1 .. 0 0 0 | | D_1 | | d_0 | 934 * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 935 * | : : : : | | : | | d_2 | 936 * | 0 .. 1 0 0 | | D_n-1 | | : | 937 * | 0 .. 0 1 0 | ~~ ~~ | : | 938 * | 0 .. 0 0 1 | | d_n-1 | 939 * ~~ ~~ ~~ ~~ 940 * 941 * Note that I, V, d, and p are known. To compute D, we must invert the 942 * matrix and use the known data and parity values to reconstruct the unknown 943 * data values. We begin by removing the rows in V|I and d|p that correspond 944 * to failed or missing columns; we then make V|I square (n x n) and d|p 945 * sized n by removing rows corresponding to unused parity from the bottom up 946 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 947 * using Gauss-Jordan elimination. 
In the example below we use m=3 parity 948 * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 949 * __ __ 950 * | 1 1 1 1 1 1 1 1 | 951 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 952 * | 19 205 116 29 64 16 4 1 | / / 953 * | 1 0 0 0 0 0 0 0 | / / 954 * | 0 1 0 0 0 0 0 0 | <--' / 955 * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 956 * | 0 0 0 1 0 0 0 0 | 957 * | 0 0 0 0 1 0 0 0 | 958 * | 0 0 0 0 0 1 0 0 | 959 * | 0 0 0 0 0 0 1 0 | 960 * | 0 0 0 0 0 0 0 1 | 961 * ~~ ~~ 962 * __ __ 963 * | 1 1 1 1 1 1 1 1 | 964 * | 128 64 32 16 8 4 2 1 | 965 * | 19 205 116 29 64 16 4 1 | 966 * | 1 0 0 0 0 0 0 0 | 967 * | 0 1 0 0 0 0 0 0 | 968 * (V|I)' = | 0 0 1 0 0 0 0 0 | 969 * | 0 0 0 1 0 0 0 0 | 970 * | 0 0 0 0 1 0 0 0 | 971 * | 0 0 0 0 0 1 0 0 | 972 * | 0 0 0 0 0 0 1 0 | 973 * | 0 0 0 0 0 0 0 1 | 974 * ~~ ~~ 975 * 976 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 977 * have carefully chosen the seed values 1, 2, and 4 to ensure that this 978 * matrix is not singular. 
979 * __ __ 980 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 981 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 982 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 983 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 984 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 985 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 986 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 987 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 988 * ~~ ~~ 989 * __ __ 990 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 991 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 992 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 993 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 994 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 995 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 996 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 997 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 998 * ~~ ~~ 999 * __ __ 1000 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1001 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1002 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 1003 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1004 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1005 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1006 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1007 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1008 * ~~ ~~ 1009 * __ __ 1010 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1011 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1012 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 1013 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1014 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1015 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1016 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1017 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1018 * ~~ ~~ 1019 * __ __ 1020 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1021 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1022 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1023 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1024 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1025 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1026 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1027 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1028 * ~~ ~~ 1029 * __ __ 1030 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1031 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 1032 
 *           |  0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209  |
 *           |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *           ~~                                                                 ~~
 *           __                                  __
 *           |  0   0   1   0   0   0   0   0  |
 *           | 167 100   5  41 159 169 217 208  |
 *           | 166 100   4  40 158 168 216 209  |
 * (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                                  ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

/*
 * Initialize the rows of the reconstruction matrix that correspond to the
 * parity columns listed in map[] (0 = P, 1 = Q, 2 = R).  The row for
 * parity k is (2^k)^(n-1), ..., (2^k)^1, (2^k)^0; we generate it by
 * stepping the exponent down in log space and consulting the power table.
 */
static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		/*
		 * Start at exponent map[i] * n, reduced mod 255 (the order
		 * of the multiplicative group of GF(2^8)).
		 */
		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}

/*
 * Gauss-Jordan elimination over GF(2^8): reduce the rows of interest of
 * (V|I)' to rows of the identity, applying the same operations to invrows
 * so that on return invrows holds the corresponding rows of (V|I)'^-1.
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}

/*
 * Multiply the surviving columns (d|p)' by the precomputed inverse rows to
 * regenerate the missing data columns.  Multiplications are performed in
 * log space using a per-row table of logs of the inverse coefficients.
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	/* Scratch space for the log-form copy of the inverse rows. */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		/*
		 * NOTE(review): missing[0] is a data-column index relative to
		 * rm_firstdatacol, yet it indexes rm_col[] directly here --
		 * confirm this assertion references the intended column.
		 */
		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/*
				 * The first source column initializes the
				 * destination; subsequent ones accumulate.
				 */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}

/*
 * General reconstruction: determine which data columns are missing, choose
 * the parity columns that will regenerate them, build and invert the
 * reconstruction matrix, and recompute the missing data.  Returns a nonzero
 * bitmask (the "code") of the parity columns used.
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;


	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/*
	 * One allocation backs rows, invrows, and the used-column list.
	 */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	/*
	 * Note that i carries over: the surviving data columns follow the
	 * chosen parity columns in used[].
	 */
	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}

/*
 * Reconstruct the columns in t[] (sorted, nt entries) plus any columns
 * already marked in error.  Uses the hand-optimized P/Q/PQ routines when
 * applicable, falling back to the general matrix method.  Returns the
 * nonzero parity-usage code.
 */
static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	/* Parity targets sort first, so the data targets start here. */
	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}

/*
 * Open a RAID-Z vdev: open all children and derive the usable size (the
 * smallest child times the child count) and the largest required ashift.
 */
static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
{
	vdev_t *cvd;
	uint64_t nparity = vd->vdev_nparity;
	int c;
	int lasterror = 0;
	int numerrors = 0;

	ASSERT(nparity > 0);

	if (nparity > VDEV_RAIDZ_MAXPARITY ||
	    vd->vdev_children < nparity + 1) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (EINVAL);
	}

	vdev_open_children(vd);

	for (c = 0; c < vd->vdev_children; c++) {
		cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error != 0) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}

		/* Track the minimum child size across all children. */
		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*ashift = MAX(*ashift, cvd->vdev_ashift);
	}

	*asize *= vd->vdev_children;

	/* More failed children than parity means we cannot open. */
	if (numerrors > nparity) {
		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}

/*
 * Close all children of this RAID-Z vdev.
 */
static void
vdev_raidz_close(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_close(vd->vdev_child[c]);
}

/*
 * Convert a logical (physical) size to the allocated size on the RAID-Z
 * vdev, accounting for parity sectors and round-up to nparity + 1.
 */
static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	uint64_t cols = vd->vdev_children;
	uint64_t nparity = vd->vdev_nparity;

	asize = ((psize - 1) >> ashift) + 1;
	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
	asize = roundup(asize, nparity + 1) << ashift;

	return (asize);
}

/*
 * Completion callback for child I/Os: record the result on the column.
 */
static void
vdev_raidz_child_done(zio_t *zio)
{
	raidz_col_t *rc = zio->io_private;

	rc->rc_error = zio->io_error;
	rc->rc_tried = 1;
	rc->rc_skipped = 0;
}

/*
 * Start a RAID-Z I/O: map the logical I/O onto columns, then for writes
 * generate parity and issue all column writes (plus optional writes for
 * skipped sectors); for reads, issue reads of the data columns and of any
 * parity needed due to missing data or scrub/resilver.
 */
static int
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	blkptr_t *bp = zio->io_bp;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, i;

	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_generate_parity(rm);

		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		/*
		 * Generate optional I/Os for any skipped sectors to improve
		 * aggregation contiguity.
		 */
		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
			ASSERT(c <= rm->rm_scols);
			if (c == rm->rm_scols)
				c = 0;
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset + rc->rc_size, NULL,
			    1 << tvd->vdev_ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
		}

		return (ZIO_PIPELINE_CONTINUE);
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		if (!vdev_readable(cvd)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Report a checksum error for a
child of a RAID-Z device. 1612 */ 1613 static void 1614 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) 1615 { 1616 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 1617 1618 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 1619 zio_bad_cksum_t zbc; 1620 raidz_map_t *rm = zio->io_vsd; 1621 1622 mutex_enter(&vd->vdev_stat_lock); 1623 vd->vdev_stat.vs_checksum_errors++; 1624 mutex_exit(&vd->vdev_stat_lock); 1625 1626 zbc.zbc_has_cksum = 0; 1627 zbc.zbc_injected = rm->rm_ecksuminjected; 1628 1629 zfs_ereport_post_checksum(zio->io_spa, vd, zio, 1630 rc->rc_offset, rc->rc_size, rc->rc_data, bad_data, 1631 &zbc); 1632 } 1633 } 1634 1635 /* 1636 * We keep track of whether or not there were any injected errors, so that 1637 * any ereports we generate can note it. 1638 */ 1639 static int 1640 raidz_checksum_verify(zio_t *zio) 1641 { 1642 zio_bad_cksum_t zbc; 1643 raidz_map_t *rm = zio->io_vsd; 1644 1645 int ret = zio_checksum_error(zio, &zbc); 1646 if (ret != 0 && zbc.zbc_injected != 0) 1647 rm->rm_ecksuminjected = 1; 1648 1649 return (ret); 1650 } 1651 1652 /* 1653 * Generate the parity from the data columns. If we tried and were able to 1654 * read the parity without error, verify that the generated parity matches the 1655 * data we read. If it doesn't, we fire off a checksum error. Return the 1656 * number such failures. 
1657 */ 1658 static int 1659 raidz_parity_verify(zio_t *zio, raidz_map_t *rm) 1660 { 1661 void *orig[VDEV_RAIDZ_MAXPARITY]; 1662 int c, ret = 0; 1663 raidz_col_t *rc; 1664 1665 for (c = 0; c < rm->rm_firstdatacol; c++) { 1666 rc = &rm->rm_col[c]; 1667 if (!rc->rc_tried || rc->rc_error != 0) 1668 continue; 1669 orig[c] = zio_buf_alloc(rc->rc_size); 1670 bcopy(rc->rc_data, orig[c], rc->rc_size); 1671 } 1672 1673 vdev_raidz_generate_parity(rm); 1674 1675 for (c = 0; c < rm->rm_firstdatacol; c++) { 1676 rc = &rm->rm_col[c]; 1677 if (!rc->rc_tried || rc->rc_error != 0) 1678 continue; 1679 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { 1680 raidz_checksum_error(zio, rc, orig[c]); 1681 rc->rc_error = ECKSUM; 1682 ret++; 1683 } 1684 zio_buf_free(orig[c], rc->rc_size); 1685 } 1686 1687 return (ret); 1688 } 1689 1690 /* 1691 * Keep statistics on all the ways that we used parity to correct data. 1692 */ 1693 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY]; 1694 1695 static int 1696 vdev_raidz_worst_error(raidz_map_t *rm) 1697 { 1698 int error = 0; 1699 1700 for (int c = 0; c < rm->rm_cols; c++) 1701 error = zio_worst_error(error, rm->rm_col[c].rc_error); 1702 1703 return (error); 1704 } 1705 1706 /* 1707 * Iterate over all combinations of bad data and attempt a reconstruction. 1708 * Note that the algorithm below is non-optimal because it doesn't take into 1709 * account how reconstruction is actually performed. For example, with 1710 * triple-parity RAID-Z the reconstruction procedure is the same if column 4 1711 * is targeted as invalid as if columns 1 and 4 are targeted since in both 1712 * cases we'd only use parity information in column 0. 
 */
static int
vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
{
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		/*
		 * Odometer-style iteration over all n-column combinations of
		 * error-free columns; each pass tries one combination.
		 */
		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * Mark the targeted columns bad; any that had
				 * actually been read get a checksum ereport
				 * since they returned silently-wrong data.
				 */
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					if (rc->rc_tried)
						raidz_checksum_error(zio, rc,
						    orig[i]);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position..
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	/* All combinations failed; back n off so the frees below are right. */
	n--;
done:
	for (i = 0; i < n; i++) {
		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}

/*
 * I/O completion for a RAID-Z vdev.  Writes tolerate up to nparity column
 * failures; reads go through up to three recovery phases (reconstruct from
 * what was read, re-read everything, combinatorial reconstruction) and then
 * self-heal any children found to hold bad data.
 */
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	int unexpected_errors = 0;
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	/* Tally up errors per column class. */
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as a success.
		 * (If we couldn't write enough columns to reconstruct
		 * the data, the I/O failed.  Otherwise, good enough.)
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(zio) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		zio_vdev_io_redone(zio);
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		return;
	}

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		zio->io_error = vdev_raidz_worst_error(rm);

	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(zio, rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed.
		 */
		zio->io_error = ECKSUM;

		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			if (rc->rc_error == 0) {
				zio_bad_cksum_t zbc;
				zbc.zbc_has_cksum = 0;
				zbc.zbc_injected = rm->rm_ecksuminjected;

				zfs_ereport_start_checksum(
				    zio->io_spa, vd->vdev_child[rc->rc_devidx],
				    zio, rc->rc_offset, rc->rc_size,
				    (void *)(uintptr_t)c, &zbc);
			}
		}
	}

done:
	zio_checksum_verified(zio);

	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error == 0)
				continue;

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    ZIO_TYPE_WRITE, zio->io_priority,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}

/*
 * Propagate child state changes up: fault the vdev only when more children
 * have failed than parity can cover; otherwise mark degraded or healthy.
 */
static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
	if (faulted > vd->vdev_nparity)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_NO_REPLICAS);
	else if (degraded + faulted != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	else
		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}

/*
 * Operations vector for the RAID-Z vdev type.
 */
vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};