/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/* CRC-64 remainder table used by the ZAP hash; filled in by zfs_init_crc(). */
static uint64_t zfs_crc64_table[256];

/*
 * Populate zfs_crc64_table with the CRC-64 remainders used by zap_hash().
 * Idempotent: once generated, entry 128 equals ZFS_CRC64_POLY, so a repeat
 * call is a no-op.
 */
static void
zfs_init_crc(void)
{
	int i, j;
	uint64_t *ct;

	/*
	 * Calculate the crc64 table (used for the zap hash
	 * function).
	 */
	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
		for (i = 0; i < 256; i++)
			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
	}
}

/*
 * "off" checksum vector: always produces an all-zero checksum, regardless
 * of the buffer contents.
 */
static void
zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}

/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);

/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
	zio_checksum_t	*ci_func[2];	/* checksum function for each byteorder */
	int		ci_correctable;	/* number of correctable bits */
	int		ci_zbt;		/* uses zio block tail? */
	const char	*ci_name;	/* descriptive name */
} zio_checksum_info_t;

#include "fletcher.c"
#include "sha256.c"

/*
 * Checksum vectors, indexed by checksum type.
 * NOTE(review): the row order must match the on-disk ZIO_CHECKSUM_* enum
 * values — do not reorder.  Only the native-byteorder function slot is
 * populated for most entries in this boot environment.
 */
static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL, NULL}, 0, 0, "inherit"},
	{{NULL, NULL}, 0, 0, "on"},
	{{zio_checksum_off, zio_checksum_off}, 0, 0, "off"},
	{{zio_checksum_SHA256, NULL}, 1, 1, "label"},
	{{zio_checksum_SHA256, NULL}, 1, 1, "gang_header"},
	{{fletcher_2_native, NULL}, 0, 1, "zilog"},
	{{fletcher_2_native, NULL}, 0, 0, "fletcher2"},
	{{fletcher_4_native, NULL}, 1, 0, "fletcher4"},
	{{zio_checksum_SHA256, NULL}, 1, 0, "SHA256"},
};

/*
 * Common signature for all zio compress/decompress functions.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
	zio_compress_func_t	*ci_compress;	/* compression function */
	zio_decompress_func_t	*ci_decompress;	/* decompression function */
	int			ci_level;	/* level parameter */
	const char		*ci_name;	/* algorithm name */
} zio_compress_info_t;

#include "lzjb.c"

/*
 * Compression vectors.
107 */ 108 static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { 109 {NULL, NULL, 0, "inherit"}, 110 {NULL, NULL, 0, "on"}, 111 {NULL, NULL, 0, "uncompressed"}, 112 {NULL, lzjb_decompress, 0, "lzjb"}, 113 {NULL, NULL, 0, "empty"}, 114 {NULL, NULL, 1, "gzip-1"}, 115 {NULL, NULL, 2, "gzip-2"}, 116 {NULL, NULL, 3, "gzip-3"}, 117 {NULL, NULL, 4, "gzip-4"}, 118 {NULL, NULL, 5, "gzip-5"}, 119 {NULL, NULL, 6, "gzip-6"}, 120 {NULL, NULL, 7, "gzip-7"}, 121 {NULL, NULL, 8, "gzip-8"}, 122 {NULL, NULL, 9, "gzip-9"}, 123 }; 124 125 static int 126 zio_checksum_error(const blkptr_t *bp, void *data) 127 { 128 zio_cksum_t zc = bp->blk_cksum; 129 unsigned int checksum = BP_GET_CHECKSUM(bp); 130 uint64_t size = BP_GET_PSIZE(bp); 131 zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1; 132 zio_checksum_info_t *ci = &zio_checksum_table[checksum]; 133 zio_cksum_t actual_cksum, expected_cksum; 134 135 if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) 136 return (EINVAL); 137 138 if (ci->ci_zbt) { 139 expected_cksum = zbt->zbt_cksum; 140 zbt->zbt_cksum = zc; 141 ci->ci_func[0](data, size, &actual_cksum); 142 zbt->zbt_cksum = expected_cksum; 143 zc = expected_cksum; 144 } else { 145 /* ASSERT(!BP_IS_GANG(bp)); */ 146 ci->ci_func[0](data, size, &actual_cksum); 147 } 148 149 if (!ZIO_CHECKSUM_EQUAL(actual_cksum, zc)) { 150 /*printf("ZFS: read checksum failed\n");*/ 151 return (EIO); 152 } 153 154 return (0); 155 } 156 157 static int 158 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize, 159 void *dest, uint64_t destsize) 160 { 161 zio_compress_info_t *ci = &zio_compress_table[cpfunc]; 162 163 /* ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS); */ 164 if (!ci->ci_decompress) { 165 printf("ZFS: unsupported compression algorithm %u\n", cpfunc); 166 return (EIO); 167 } 168 169 return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level)); 170 } 171 172 static uint64_t 173 zap_hash(uint64_t salt, const char *name) 174 { 175 
const uint8_t *cp; 176 uint8_t c; 177 uint64_t crc = salt; 178 179 /*ASSERT(crc != 0);*/ 180 /*ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);*/ 181 for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++) 182 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF]; 183 184 /* 185 * Only use 28 bits, since we need 4 bits in the cookie for the 186 * collision differentiator. We MUST use the high bits, since 187 * those are the onces that we first pay attention to when 188 * chosing the bucket. 189 */ 190 crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1); 191 192 return (crc); 193 } 194 195 static char *zfs_alloc_temp(size_t sz); 196 197 typedef struct raidz_col { 198 uint64_t rc_devidx; /* child device index for I/O */ 199 uint64_t rc_offset; /* device offset */ 200 uint64_t rc_size; /* I/O size */ 201 void *rc_data; /* I/O data */ 202 int rc_error; /* I/O error for this device */ 203 uint8_t rc_tried; /* Did we attempt this I/O column? */ 204 uint8_t rc_skipped; /* Did we skip this I/O column? */ 205 } raidz_col_t; 206 207 #define VDEV_RAIDZ_P 0 208 #define VDEV_RAIDZ_Q 1 209 210 static void 211 vdev_raidz_reconstruct_p(raidz_col_t *cols, int nparity, int acols, int x) 212 { 213 uint64_t *dst, *src, xcount, ccount, count, i; 214 int c; 215 216 xcount = cols[x].rc_size / sizeof (src[0]); 217 //ASSERT(xcount <= cols[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); 218 //ASSERT(xcount > 0); 219 220 src = cols[VDEV_RAIDZ_P].rc_data; 221 dst = cols[x].rc_data; 222 for (i = 0; i < xcount; i++, dst++, src++) { 223 *dst = *src; 224 } 225 226 for (c = nparity; c < acols; c++) { 227 src = cols[c].rc_data; 228 dst = cols[x].rc_data; 229 230 if (c == x) 231 continue; 232 233 ccount = cols[c].rc_size / sizeof (src[0]); 234 count = MIN(ccount, xcount); 235 236 for (i = 0; i < count; i++, dst++, src++) { 237 *dst ^= *src; 238 } 239 } 240 } 241 242 /* 243 * These two tables represent powers and logs of 2 in the Galois field defined 244 * above. 
 * These values were computed by repeatedly multiplying by 2 as above.
 */
/* Powers of 2 in GF(2^8): vdev_raidz_pow2[i] == 2^i (mod the field poly). */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * Logs of 2 in GF(2^8): the inverse of the table above, i.e.
 * vdev_raidz_log2[vdev_raidz_pow2[i]] == i for i in 1..255.
 * Entry 0 has no valid logarithm; callers are expected to guard
 * against taking the log of 0 (see vdev_raidz_exp2's a == 0 check).
 */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint8_t a, int exp)
{
	/* 0 * anything == 0; also log2[0] is not defined. */
	if (a == 0)
		return (0);

	//ASSERT(exp >= 0);
	//ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	/* a * 2^exp == 2^(log2(a) + exp); reduce the exponent mod 255. */
	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}

/*
 * Generate both the P (XOR) and Q (GF(2^8)) parity columns from the data
 * columns cols[nparity..acols-1], writing into cols[VDEV_RAIDZ_P] and
 * cols[VDEV_RAIDZ_Q].  Processes 8 bytes per iteration.
 */
static void
vdev_raidz_generate_parity_pq(raidz_col_t *cols, int nparity, int acols)
{
	uint64_t *q, *p, *src, pcount, ccount, mask, i;
	int c;

	pcount = cols[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	//ASSERT(cols[VDEV_RAIDZ_P].rc_size == cols[VDEV_RAIDZ_Q].rc_size);

	for (c = nparity; c < acols; c++) {
		src = cols[c].rc_data;
		p = cols[VDEV_RAIDZ_P].rc_data;
		q = cols[VDEV_RAIDZ_Q].rc_data;
		ccount = cols[c].rc_size / sizeof (src[0]);

		if (c == nparity) {
			/* First data column: initialize P and Q. */
			//ASSERT(ccount == pcount || ccount == 0);
			for (i = 0; i < ccount; i++, p++, q++, src++) {
				*q = *src;
				*p = *src;
			}
			for (; i < pcount; i++, p++, q++, src++) {
				*q = 0;
				*p = 0;
			}
		} else {
			//ASSERT(ccount <= pcount);

			/*
			 * Rather than multiplying each byte
			 * individually (as described above), we are
			 * able to handle 8 at once by generating a
			 * mask based on the high bit in each byte and
			 * using that to conditionally XOR in 0x1d.
			 */
			for (i = 0; i < ccount; i++, p++, q++, src++) {
				mask = *q & 0x8080808080808080ULL;
				mask = (mask << 1) - (mask >> 7);
				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
				    (mask & 0x1d1d1d1d1d1d1d1dULL);
				*q ^= *src;
				*p ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 */
			for (; i < pcount; i++, q++) {
				mask = *q & 0x8080808080808080ULL;
				mask = (mask << 1) - (mask >> 7);
				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
				    (mask & 0x1d1d1d1d1d1d1d1dULL);
			}
		}
	}
}

/*
 * Reconstruct the single missing data column x using only the Q parity:
 * accumulate the surviving data columns into x with GF multiply-by-2
 * between columns, XOR in Q, then divide each byte by the appropriate
 * power of 2 for column x's position.
 */
static void
vdev_raidz_reconstruct_q(raidz_col_t *cols, int nparity, int acols, int x)
{
	uint64_t *dst, *src, xcount, ccount, count, mask, i;
	uint8_t *b;
	int c, j, exp;

	xcount = cols[x].rc_size / sizeof (src[0]);
	//ASSERT(xcount <= cols[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

	for (c = nparity; c < acols; c++) {
		src = cols[c].rc_data;
		dst = cols[x].rc_data;

		/* The missing column contributes zeros. */
		if (c == x)
			ccount = 0;
		else
			ccount = cols[c].rc_size / sizeof (src[0]);

		count = MIN(ccount, xcount);

		if (c == nparity) {
			/* First data column: initialize the accumulator. */
			for (i = 0; i < count; i++, dst++, src++) {
				*dst = *src;
			}
			for (; i < xcount; i++, dst++) {
				*dst = 0;
			}

		} else {
			/*
			 * For an explanation of this, see the comment in
			 * vdev_raidz_generate_parity_pq() above.
			 */
			for (i = 0; i < count; i++, dst++, src++) {
				mask = *dst & 0x8080808080808080ULL;
				mask = (mask << 1) - (mask >> 7);
				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
				    (mask & 0x1d1d1d1d1d1d1d1dULL);
				*dst ^= *src;
			}

			for (; i < xcount; i++, dst++) {
				mask = *dst & 0x8080808080808080ULL;
				mask = (mask << 1) - (mask >> 7);
				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
				    (mask & 0x1d1d1d1d1d1d1d1dULL);
			}
		}
	}

	/* XOR in Q and undo the accumulated power of 2, byte by byte. */
	src = cols[VDEV_RAIDZ_Q].rc_data;
	dst = cols[x].rc_data;
	exp = 255 - (acols - 1 - x);

	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst ^= *src;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, exp);
		}
	}
}


/*
 * Reconstruct the two missing data columns x and y (x < y) using both the
 * P and Q parity columns.  temp_p and temp_q are caller-supplied scratch
 * buffers at least as large as the parity columns.
 */
static void
vdev_raidz_reconstruct_pq(raidz_col_t *cols, int nparity, int acols,
    int x, int y, void *temp_p, void *temp_q)
{
	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
	void *pdata, *qdata;
	uint64_t xsize, ysize, i;

	//ASSERT(x < y);
	//ASSERT(x >= nparity);
	//ASSERT(y < acols);

	//ASSERT(cols[x].rc_size >= cols[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = cols[VDEV_RAIDZ_P].rc_data;
	qdata = cols[VDEV_RAIDZ_Q].rc_data;
	xsize = cols[x].rc_size;
	ysize = cols[y].rc_size;

	cols[VDEV_RAIDZ_P].rc_data = temp_p;
	cols[VDEV_RAIDZ_Q].rc_data = temp_q;
	cols[x].rc_size = 0;
	cols[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(cols, nparity, acols);

	cols[x].rc_size = xsize;
	cols[y].rc_size = ysize;

	p = pdata;
	q = qdata;
	pxy = cols[VDEV_RAIDZ_P].rc_data;
	qxy = cols[VDEV_RAIDZ_Q].rc_data;
	xd = cols[x].rc_data;
	yd = cols[y].rc_data;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (acols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
		    vdev_raidz_exp2(*q ^ *qxy, bexp);

		if (i < ysize)
			*yd = *p ^ *pxy ^ *xd;
	}

	/*
	 * Restore the saved parity data.
530 */ 531 cols[VDEV_RAIDZ_P].rc_data = pdata; 532 cols[VDEV_RAIDZ_Q].rc_data = qdata; 533 } 534 535 static int 536 vdev_raidz_read(vdev_t *vdev, const blkptr_t *bp, void *buf, 537 off_t offset, size_t bytes) 538 { 539 size_t psize = BP_GET_PSIZE(bp); 540 vdev_t *kid; 541 int unit_shift = vdev->v_ashift; 542 int dcols = vdev->v_nchildren; 543 int nparity = vdev->v_nparity; 544 int missingdata, missingparity; 545 int parity_errors, data_errors, unexpected_errors, total_errors; 546 int parity_untried; 547 uint64_t b = offset >> unit_shift; 548 uint64_t s = psize >> unit_shift; 549 uint64_t f = b % dcols; 550 uint64_t o = (b / dcols) << unit_shift; 551 uint64_t q, r, coff; 552 int c, c1, bc, col, acols, devidx, asize, n, max_rc_size; 553 static raidz_col_t cols[16]; 554 raidz_col_t *rc, *rc1; 555 void *orig, *orig1, *temp_p, *temp_q; 556 557 orig = orig1 = temp_p = temp_q = NULL; 558 559 q = s / (dcols - nparity); 560 r = s - q * (dcols - nparity); 561 bc = (r == 0 ? 0 : r + nparity); 562 563 acols = (q == 0 ? 
bc : dcols); 564 asize = 0; 565 max_rc_size = 0; 566 567 for (c = 0; c < acols; c++) { 568 col = f + c; 569 coff = o; 570 if (col >= dcols) { 571 col -= dcols; 572 coff += 1ULL << unit_shift; 573 } 574 cols[c].rc_devidx = col; 575 cols[c].rc_offset = coff; 576 cols[c].rc_size = (q + (c < bc)) << unit_shift; 577 cols[c].rc_data = NULL; 578 cols[c].rc_error = 0; 579 cols[c].rc_tried = 0; 580 cols[c].rc_skipped = 0; 581 asize += cols[c].rc_size; 582 if (cols[c].rc_size > max_rc_size) 583 max_rc_size = cols[c].rc_size; 584 } 585 586 asize = roundup(asize, (nparity + 1) << unit_shift); 587 588 for (c = 0; c < nparity; c++) { 589 cols[c].rc_data = zfs_alloc_temp(cols[c].rc_size); 590 } 591 592 cols[c].rc_data = buf; 593 594 for (c = c + 1; c < acols; c++) 595 cols[c].rc_data = (char *)cols[c - 1].rc_data + 596 cols[c - 1].rc_size; 597 598 /* 599 * If all data stored spans all columns, there's a danger that 600 * parity will always be on the same device and, since parity 601 * isn't read during normal operation, that that device's I/O 602 * bandwidth won't be used effectively. We therefore switch 603 * the parity every 1MB. 604 * 605 * ... at least that was, ostensibly, the theory. As a 606 * practical matter unless we juggle the parity between all 607 * devices evenly, we won't see any benefit. Further, 608 * occasional writes that aren't a multiple of the LCM of the 609 * number of children and the minimum stripe width are 610 * sufficient to avoid pessimal behavior. Unfortunately, this 611 * decision created an implicit on-disk format requirement 612 * that we need to support for all eternity, but only for 613 * single-parity RAID-Z. 
614 */ 615 //ASSERT(acols >= 2); 616 //ASSERT(cols[0].rc_size == cols[1].rc_size); 617 618 if (nparity == 1 && (offset & (1ULL << 20))) { 619 devidx = cols[0].rc_devidx; 620 o = cols[0].rc_offset; 621 cols[0].rc_devidx = cols[1].rc_devidx; 622 cols[0].rc_offset = cols[1].rc_offset; 623 cols[1].rc_devidx = devidx; 624 cols[1].rc_offset = o; 625 } 626 627 /* 628 * Iterate over the columns in reverse order so that we hit 629 * the parity last -- any errors along the way will force us 630 * to read the parity data. 631 */ 632 missingdata = 0; 633 missingparity = 0; 634 for (c = acols - 1; c >= 0; c--) { 635 rc = &cols[c]; 636 devidx = rc->rc_devidx; 637 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) 638 if (kid->v_id == devidx) 639 break; 640 if (kid == NULL || kid->v_state != VDEV_STATE_HEALTHY) { 641 if (c >= nparity) 642 missingdata++; 643 else 644 missingparity++; 645 rc->rc_error = ENXIO; 646 rc->rc_tried = 1; /* don't even try */ 647 rc->rc_skipped = 1; 648 continue; 649 } 650 #if 0 651 /* 652 * Too hard for the bootcode 653 */ 654 if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) { 655 if (c >= nparity) 656 rm->rm_missingdata++; 657 else 658 rm->rm_missingparity++; 659 rc->rc_error = ESTALE; 660 rc->rc_skipped = 1; 661 continue; 662 } 663 #endif 664 if (c >= nparity || missingdata > 0) { 665 if (rc->rc_data) 666 rc->rc_error = kid->v_read(kid, NULL, 667 rc->rc_data, rc->rc_offset, rc->rc_size); 668 else 669 rc->rc_error = ENXIO; 670 rc->rc_tried = 1; 671 rc->rc_skipped = 0; 672 } 673 } 674 675 reconstruct: 676 parity_errors = 0; 677 data_errors = 0; 678 unexpected_errors = 0; 679 total_errors = 0; 680 parity_untried = 0; 681 for (c = 0; c < acols; c++) { 682 rc = &cols[c]; 683 684 if (rc->rc_error) { 685 if (c < nparity) 686 parity_errors++; 687 else 688 data_errors++; 689 690 if (!rc->rc_skipped) 691 unexpected_errors++; 692 693 total_errors++; 694 } else if (c < nparity && !rc->rc_tried) { 695 parity_untried++; 696 } 697 } 698 699 /* 700 * 
There are three potential phases for a read: 701 * 1. produce valid data from the columns read 702 * 2. read all disks and try again 703 * 3. perform combinatorial reconstruction 704 * 705 * Each phase is progressively both more expensive and less 706 * likely to occur. If we encounter more errors than we can 707 * repair or all phases fail, we have no choice but to return 708 * an error. 709 */ 710 711 /* 712 * If the number of errors we saw was correctable -- less than 713 * or equal to the number of parity disks read -- attempt to 714 * produce data that has a valid checksum. Naturally, this 715 * case applies in the absence of any errors. 716 */ 717 if (total_errors <= nparity - parity_untried) { 718 switch (data_errors) { 719 case 0: 720 if (zio_checksum_error(bp, buf) == 0) 721 return (0); 722 break; 723 724 case 1: 725 /* 726 * We either attempt to read all the parity columns or 727 * none of them. If we didn't try to read parity, we 728 * wouldn't be here in the correctable case. There must 729 * also have been fewer parity errors than parity 730 * columns or, again, we wouldn't be in this code path. 731 */ 732 //ASSERT(parity_untried == 0); 733 //ASSERT(parity_errors < nparity); 734 735 /* 736 * Find the column that reported the error. 737 */ 738 for (c = nparity; c < acols; c++) { 739 rc = &cols[c]; 740 if (rc->rc_error != 0) 741 break; 742 } 743 //ASSERT(c != acols); 744 //ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE); 745 746 if (cols[VDEV_RAIDZ_P].rc_error == 0) { 747 vdev_raidz_reconstruct_p(cols, nparity, 748 acols, c); 749 } else { 750 //ASSERT(nparity > 1); 751 vdev_raidz_reconstruct_q(cols, nparity, 752 acols, c); 753 } 754 755 if (zio_checksum_error(bp, buf) == 0) 756 return (0); 757 break; 758 759 case 2: 760 /* 761 * Two data column errors require double parity. 762 */ 763 //ASSERT(nparity == 2); 764 765 /* 766 * Find the two columns that reported errors. 
767 */ 768 for (c = nparity; c < acols; c++) { 769 rc = &cols[c]; 770 if (rc->rc_error != 0) 771 break; 772 } 773 //ASSERT(c != acols); 774 //ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE); 775 776 for (c1 = c++; c < acols; c++) { 777 rc = &cols[c]; 778 if (rc->rc_error != 0) 779 break; 780 } 781 //ASSERT(c != acols); 782 //ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE); 783 784 if (temp_p == NULL) 785 temp_p = zfs_alloc_temp(max_rc_size); 786 if (temp_q == NULL) 787 temp_q = zfs_alloc_temp(max_rc_size); 788 789 vdev_raidz_reconstruct_pq(cols, nparity, acols, 790 c1, c, temp_p, temp_q); 791 792 if (zio_checksum_error(bp, buf) == 0) 793 return (0); 794 break; 795 796 default: 797 break; 798 //ASSERT(nparity <= 2); 799 //ASSERT(0); 800 } 801 } 802 803 /* 804 * This isn't a typical situation -- either we got a read 805 * error or a child silently returned bad data. Read every 806 * block so we can try again with as much data and parity as 807 * we can track down. If we've already been through once 808 * before, all children will be marked as tried so we'll 809 * proceed to combinatorial reconstruction. 810 */ 811 n = 0; 812 for (c = 0; c < acols; c++) { 813 rc = &cols[c]; 814 if (rc->rc_tried) 815 continue; 816 817 devidx = rc->rc_devidx; 818 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) 819 if (kid->v_id == devidx) 820 break; 821 if (kid == NULL || kid->v_state != VDEV_STATE_HEALTHY) { 822 rc->rc_error = ENXIO; 823 rc->rc_tried = 1; /* don't even try */ 824 rc->rc_skipped = 1; 825 continue; 826 } 827 if (rc->rc_data) 828 rc->rc_error = kid->v_read(kid, NULL, 829 rc->rc_data, rc->rc_offset, rc->rc_size); 830 else 831 rc->rc_error = ENXIO; 832 if (rc->rc_error == 0) 833 n++; 834 rc->rc_tried = 1; 835 rc->rc_skipped = 0; 836 } 837 838 /* 839 * If we managed to read anything more, retry the 840 * reconstruction. 
841 */ 842 if (n) 843 goto reconstruct; 844 845 /* 846 * At this point we've attempted to reconstruct the data given the 847 * errors we detected, and we've attempted to read all columns. There 848 * must, therefore, be one or more additional problems -- silent errors 849 * resulting in invalid data rather than explicit I/O errors resulting 850 * in absent data. Before we attempt combinatorial reconstruction make 851 * sure we have a chance of coming up with the right answer. 852 */ 853 if (total_errors >= nparity) { 854 return (EIO); 855 } 856 857 if (cols[VDEV_RAIDZ_P].rc_error == 0) { 858 /* 859 * Attempt to reconstruct the data from parity P. 860 */ 861 if (orig == NULL) 862 orig = zfs_alloc_temp(max_rc_size); 863 for (c = nparity; c < acols; c++) { 864 rc = &cols[c]; 865 866 memcpy(orig, rc->rc_data, rc->rc_size); 867 vdev_raidz_reconstruct_p(cols, nparity, acols, c); 868 869 if (zio_checksum_error(bp, buf) == 0) 870 return (0); 871 872 memcpy(rc->rc_data, orig, rc->rc_size); 873 } 874 } 875 876 if (nparity > 1 && cols[VDEV_RAIDZ_Q].rc_error == 0) { 877 /* 878 * Attempt to reconstruct the data from parity Q. 879 */ 880 if (orig == NULL) 881 orig = zfs_alloc_temp(max_rc_size); 882 for (c = nparity; c < acols; c++) { 883 rc = &cols[c]; 884 885 memcpy(orig, rc->rc_data, rc->rc_size); 886 vdev_raidz_reconstruct_q(cols, nparity, acols, c); 887 888 if (zio_checksum_error(bp, buf) == 0) 889 return (0); 890 891 memcpy(rc->rc_data, orig, rc->rc_size); 892 } 893 } 894 895 if (nparity > 1 && 896 cols[VDEV_RAIDZ_P].rc_error == 0 && 897 cols[VDEV_RAIDZ_Q].rc_error == 0) { 898 /* 899 * Attempt to reconstruct the data from both P and Q. 
900 */ 901 if (orig == NULL) 902 orig = zfs_alloc_temp(max_rc_size); 903 if (orig1 == NULL) 904 orig1 = zfs_alloc_temp(max_rc_size); 905 if (temp_p == NULL) 906 temp_p = zfs_alloc_temp(max_rc_size); 907 if (temp_q == NULL) 908 temp_q = zfs_alloc_temp(max_rc_size); 909 for (c = nparity; c < acols - 1; c++) { 910 rc = &cols[c]; 911 912 memcpy(orig, rc->rc_data, rc->rc_size); 913 914 for (c1 = c + 1; c1 < acols; c1++) { 915 rc1 = &cols[c1]; 916 917 memcpy(orig1, rc1->rc_data, rc1->rc_size); 918 919 vdev_raidz_reconstruct_pq(cols, nparity, 920 acols, c, c1, temp_p, temp_q); 921 922 if (zio_checksum_error(bp, buf) == 0) 923 return (0); 924 925 memcpy(rc1->rc_data, orig1, rc1->rc_size); 926 } 927 928 memcpy(rc->rc_data, orig, rc->rc_size); 929 } 930 } 931 932 return (EIO); 933 } 934 935