1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/cdefs.h> 27 __FBSDID("$FreeBSD$"); 28 29 static uint64_t zfs_crc64_table[256]; 30 31 static void 32 zfs_init_crc(void) 33 { 34 int i, j; 35 uint64_t *ct; 36 37 /* 38 * Calculate the crc64 table (used for the zap hash 39 * function). 40 */ 41 if (zfs_crc64_table[128] != ZFS_CRC64_POLY) { 42 memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table)); 43 for (i = 0; i < 256; i++) 44 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 45 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 46 } 47 } 48 49 static void 50 zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp) 51 { 52 ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); 53 } 54 55 /* 56 * Signature for checksum functions. 57 */ 58 typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp); 59 60 /* 61 * Information about each checksum function. 
62 */ 63 typedef struct zio_checksum_info { 64 zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */ 65 int ci_correctable; /* number of correctable bits */ 66 int ci_zbt; /* uses zio block tail? */ 67 const char *ci_name; /* descriptive name */ 68 } zio_checksum_info_t; 69 70 #include "fletcher.c" 71 #include "sha256.c" 72 73 static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { 74 {{NULL, NULL}, 0, 0, "inherit"}, 75 {{NULL, NULL}, 0, 0, "on"}, 76 {{zio_checksum_off, zio_checksum_off}, 0, 0, "off"}, 77 {{zio_checksum_SHA256, NULL}, 1, 1, "label"}, 78 {{zio_checksum_SHA256, NULL}, 1, 1, "gang_header"}, 79 {{fletcher_2_native, NULL}, 0, 1, "zilog"}, 80 {{fletcher_2_native, NULL}, 0, 0, "fletcher2"}, 81 {{fletcher_4_native, NULL}, 1, 0, "fletcher4"}, 82 {{zio_checksum_SHA256, NULL}, 1, 0, "SHA256"}, 83 }; 84 85 /* 86 * Common signature for all zio compress/decompress functions. 87 */ 88 typedef size_t zio_compress_func_t(void *src, void *dst, 89 size_t s_len, size_t d_len, int); 90 typedef int zio_decompress_func_t(void *src, void *dst, 91 size_t s_len, size_t d_len, int); 92 93 /* 94 * Information about each compression function. 95 */ 96 typedef struct zio_compress_info { 97 zio_compress_func_t *ci_compress; /* compression function */ 98 zio_decompress_func_t *ci_decompress; /* decompression function */ 99 int ci_level; /* level parameter */ 100 const char *ci_name; /* algorithm name */ 101 } zio_compress_info_t; 102 103 #include "lzjb.c" 104 105 /* 106 * Compression vectors. 
107 */ 108 static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { 109 {NULL, NULL, 0, "inherit"}, 110 {NULL, NULL, 0, "on"}, 111 {NULL, NULL, 0, "uncompressed"}, 112 {NULL, lzjb_decompress, 0, "lzjb"}, 113 {NULL, NULL, 0, "empty"}, 114 {NULL, NULL, 1, "gzip-1"}, 115 {NULL, NULL, 2, "gzip-2"}, 116 {NULL, NULL, 3, "gzip-3"}, 117 {NULL, NULL, 4, "gzip-4"}, 118 {NULL, NULL, 5, "gzip-5"}, 119 {NULL, NULL, 6, "gzip-6"}, 120 {NULL, NULL, 7, "gzip-7"}, 121 {NULL, NULL, 8, "gzip-8"}, 122 {NULL, NULL, 9, "gzip-9"}, 123 }; 124 125 static int 126 zio_checksum_error(const blkptr_t *bp, void *data) 127 { 128 zio_cksum_t zc = bp->blk_cksum; 129 unsigned int checksum = BP_GET_CHECKSUM(bp); 130 uint64_t size = BP_GET_PSIZE(bp); 131 zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1; 132 zio_checksum_info_t *ci = &zio_checksum_table[checksum]; 133 zio_cksum_t actual_cksum, expected_cksum; 134 135 if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) 136 return (EINVAL); 137 138 if (ci->ci_zbt) { 139 expected_cksum = zbt->zbt_cksum; 140 zbt->zbt_cksum = zc; 141 ci->ci_func[0](data, size, &actual_cksum); 142 zbt->zbt_cksum = expected_cksum; 143 zc = expected_cksum; 144 } else { 145 /* ASSERT(!BP_IS_GANG(bp)); */ 146 ci->ci_func[0](data, size, &actual_cksum); 147 } 148 149 if (!ZIO_CHECKSUM_EQUAL(actual_cksum, zc)) { 150 /*printf("ZFS: read checksum failed\n");*/ 151 return (EIO); 152 } 153 154 return (0); 155 } 156 157 static int 158 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize, 159 void *dest, uint64_t destsize) 160 { 161 zio_compress_info_t *ci = &zio_compress_table[cpfunc]; 162 163 /* ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS); */ 164 if (!ci->ci_decompress) { 165 printf("ZFS: unsupported compression algorithm %u\n", cpfunc); 166 return (EIO); 167 } 168 169 return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level)); 170 } 171 172 static uint64_t 173 zap_hash(uint64_t salt, const char *name) 174 { 175 
const uint8_t *cp; 176 uint8_t c; 177 uint64_t crc = salt; 178 179 /*ASSERT(crc != 0);*/ 180 /*ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);*/ 181 for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++) 182 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF]; 183 184 /* 185 * Only use 28 bits, since we need 4 bits in the cookie for the 186 * collision differentiator. We MUST use the high bits, since 187 * those are the onces that we first pay attention to when 188 * chosing the bucket. 189 */ 190 crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1); 191 192 return (crc); 193 } 194 195 static char *zfs_alloc_temp(size_t sz); 196 197 typedef struct raidz_col { 198 uint64_t rc_devidx; /* child device index for I/O */ 199 uint64_t rc_offset; /* device offset */ 200 uint64_t rc_size; /* I/O size */ 201 void *rc_data; /* I/O data */ 202 int rc_error; /* I/O error for this device */ 203 uint8_t rc_tried; /* Did we attempt this I/O column? */ 204 uint8_t rc_skipped; /* Did we skip this I/O column? */ 205 } raidz_col_t; 206 207 #define VDEV_RAIDZ_P 0 208 #define VDEV_RAIDZ_Q 1 209 210 static void 211 vdev_raidz_reconstruct_p(raidz_col_t *cols, int nparity, int acols, int x) 212 { 213 uint64_t *dst, *src, xcount, ccount, count, i; 214 int c; 215 216 xcount = cols[x].rc_size / sizeof (src[0]); 217 //ASSERT(xcount <= cols[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); 218 //ASSERT(xcount > 0); 219 220 src = cols[VDEV_RAIDZ_P].rc_data; 221 dst = cols[x].rc_data; 222 for (i = 0; i < xcount; i++, dst++, src++) { 223 *dst = *src; 224 } 225 226 for (c = nparity; c < acols; c++) { 227 src = cols[c].rc_data; 228 dst = cols[x].rc_data; 229 230 if (c == x) 231 continue; 232 233 ccount = cols[c].rc_size / sizeof (src[0]); 234 count = MIN(ccount, xcount); 235 236 for (i = 0; i < count; i++, dst++, src++) { 237 *dst ^= *src; 238 } 239 } 240 } 241 242 /* 243 * These two tables represent powers and logs of 2 in the Galois field defined 244 * above. 
/*
 * These two tables represent powers and logs of 2 in the Galois field
 * GF(2^8) generated by the polynomial x^8 + x^4 + x^3 + x^2 + 1 (0x11d).
 * vdev_raidz_pow2[k] == 2^k and vdev_raidz_log2[v] == log2(v); the
 * values were computed by repeatedly multiplying by 2 in the field.
 * log2[0] is a placeholder (0 has no logarithm).
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
317 */ 318 static uint8_t 319 vdev_raidz_exp2(uint8_t a, int exp) 320 { 321 if (a == 0) 322 return (0); 323 324 //ASSERT(exp >= 0); 325 //ASSERT(vdev_raidz_log2[a] > 0 || a == 1); 326 327 exp += vdev_raidz_log2[a]; 328 if (exp > 255) 329 exp -= 255; 330 331 return (vdev_raidz_pow2[exp]); 332 } 333 334 static void 335 vdev_raidz_generate_parity_pq(raidz_col_t *cols, int nparity, int acols) 336 { 337 uint64_t *q, *p, *src, pcount, ccount, mask, i; 338 int c; 339 340 pcount = cols[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 341 //ASSERT(cols[VDEV_RAIDZ_P].rc_size == cols[VDEV_RAIDZ_Q].rc_size); 342 343 for (c = nparity; c < acols; c++) { 344 src = cols[c].rc_data; 345 p = cols[VDEV_RAIDZ_P].rc_data; 346 q = cols[VDEV_RAIDZ_Q].rc_data; 347 ccount = cols[c].rc_size / sizeof (src[0]); 348 349 if (c == nparity) { 350 //ASSERT(ccount == pcount || ccount == 0); 351 for (i = 0; i < ccount; i++, p++, q++, src++) { 352 *q = *src; 353 *p = *src; 354 } 355 for (; i < pcount; i++, p++, q++, src++) { 356 *q = 0; 357 *p = 0; 358 } 359 } else { 360 //ASSERT(ccount <= pcount); 361 362 /* 363 * Rather than multiplying each byte 364 * individually (as described above), we are 365 * able to handle 8 at once by generating a 366 * mask based on the high bit in each byte and 367 * using that to conditionally XOR in 0x1d. 368 */ 369 for (i = 0; i < ccount; i++, p++, q++, src++) { 370 mask = *q & 0x8080808080808080ULL; 371 mask = (mask << 1) - (mask >> 7); 372 *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ 373 (mask & 0x1d1d1d1d1d1d1d1dULL); 374 *q ^= *src; 375 *p ^= *src; 376 } 377 378 /* 379 * Treat short columns as though they are full of 0s. 
380 */ 381 for (; i < pcount; i++, q++) { 382 mask = *q & 0x8080808080808080ULL; 383 mask = (mask << 1) - (mask >> 7); 384 *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ 385 (mask & 0x1d1d1d1d1d1d1d1dULL); 386 } 387 } 388 } 389 } 390 391 static void 392 vdev_raidz_reconstruct_q(raidz_col_t *cols, int nparity, int acols, int x) 393 { 394 uint64_t *dst, *src, xcount, ccount, count, mask, i; 395 uint8_t *b; 396 int c, j, exp; 397 398 xcount = cols[x].rc_size / sizeof (src[0]); 399 //ASSERT(xcount <= cols[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); 400 401 for (c = nparity; c < acols; c++) { 402 src = cols[c].rc_data; 403 dst = cols[x].rc_data; 404 405 if (c == x) 406 ccount = 0; 407 else 408 ccount = cols[c].rc_size / sizeof (src[0]); 409 410 count = MIN(ccount, xcount); 411 412 if (c == nparity) { 413 for (i = 0; i < count; i++, dst++, src++) { 414 *dst = *src; 415 } 416 for (; i < xcount; i++, dst++) { 417 *dst = 0; 418 } 419 420 } else { 421 /* 422 * For an explanation of this, see the comment in 423 * vdev_raidz_generate_parity_pq() above. 
424 */ 425 for (i = 0; i < count; i++, dst++, src++) { 426 mask = *dst & 0x8080808080808080ULL; 427 mask = (mask << 1) - (mask >> 7); 428 *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ 429 (mask & 0x1d1d1d1d1d1d1d1dULL); 430 *dst ^= *src; 431 } 432 433 for (; i < xcount; i++, dst++) { 434 mask = *dst & 0x8080808080808080ULL; 435 mask = (mask << 1) - (mask >> 7); 436 *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ 437 (mask & 0x1d1d1d1d1d1d1d1dULL); 438 } 439 } 440 } 441 442 src = cols[VDEV_RAIDZ_Q].rc_data; 443 dst = cols[x].rc_data; 444 exp = 255 - (acols - 1 - x); 445 446 for (i = 0; i < xcount; i++, dst++, src++) { 447 *dst ^= *src; 448 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 449 *b = vdev_raidz_exp2(*b, exp); 450 } 451 } 452 } 453 454 455 static void 456 vdev_raidz_reconstruct_pq(raidz_col_t *cols, int nparity, int acols, 457 int x, int y) 458 { 459 uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; 460 void *pdata, *qdata; 461 uint64_t xsize, ysize, i; 462 463 //ASSERT(x < y); 464 //ASSERT(x >= nparity); 465 //ASSERT(y < acols); 466 467 //ASSERT(cols[x].rc_size >= cols[y].rc_size); 468 469 /* 470 * Move the parity data aside -- we're going to compute parity as 471 * though columns x and y were full of zeros -- Pxy and Qxy. We want to 472 * reuse the parity generation mechanism without trashing the actual 473 * parity so we make those columns appear to be full of zeros by 474 * setting their lengths to zero. 
475 */ 476 pdata = cols[VDEV_RAIDZ_P].rc_data; 477 qdata = cols[VDEV_RAIDZ_Q].rc_data; 478 xsize = cols[x].rc_size; 479 ysize = cols[y].rc_size; 480 481 cols[VDEV_RAIDZ_P].rc_data = 482 zfs_alloc_temp(cols[VDEV_RAIDZ_P].rc_size); 483 cols[VDEV_RAIDZ_Q].rc_data = 484 zfs_alloc_temp(cols[VDEV_RAIDZ_Q].rc_size); 485 cols[x].rc_size = 0; 486 cols[y].rc_size = 0; 487 488 vdev_raidz_generate_parity_pq(cols, nparity, acols); 489 490 cols[x].rc_size = xsize; 491 cols[y].rc_size = ysize; 492 493 p = pdata; 494 q = qdata; 495 pxy = cols[VDEV_RAIDZ_P].rc_data; 496 qxy = cols[VDEV_RAIDZ_Q].rc_data; 497 xd = cols[x].rc_data; 498 yd = cols[y].rc_data; 499 500 /* 501 * We now have: 502 * Pxy = P + D_x + D_y 503 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 504 * 505 * We can then solve for D_x: 506 * D_x = A * (P + Pxy) + B * (Q + Qxy) 507 * where 508 * A = 2^(x - y) * (2^(x - y) + 1)^-1 509 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 510 * 511 * With D_x in hand, we can easily solve for D_y: 512 * D_y = P + Pxy + D_x 513 */ 514 515 a = vdev_raidz_pow2[255 + x - y]; 516 b = vdev_raidz_pow2[255 - (acols - 1 - x)]; 517 tmp = 255 - vdev_raidz_log2[a ^ 1]; 518 519 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 520 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 521 522 for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { 523 *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ 524 vdev_raidz_exp2(*q ^ *qxy, bexp); 525 526 if (i < ysize) 527 *yd = *p ^ *pxy ^ *xd; 528 } 529 530 /* 531 * Restore the saved parity data. 
532 */ 533 cols[VDEV_RAIDZ_P].rc_data = pdata; 534 cols[VDEV_RAIDZ_Q].rc_data = qdata; 535 } 536 537 static int 538 vdev_raidz_read(vdev_t *vdev, const blkptr_t *bp, void *buf, 539 off_t offset, size_t bytes) 540 { 541 size_t psize = BP_GET_PSIZE(bp); 542 vdev_t *kid; 543 int unit_shift = vdev->v_ashift; 544 int dcols = vdev->v_nchildren; 545 int nparity = vdev->v_nparity; 546 int missingdata, missingparity; 547 int parity_errors, data_errors, unexpected_errors, total_errors; 548 int parity_untried; 549 uint64_t b = offset >> unit_shift; 550 uint64_t s = psize >> unit_shift; 551 uint64_t f = b % dcols; 552 uint64_t o = (b / dcols) << unit_shift; 553 uint64_t q, r, coff; 554 int c, c1, bc, col, acols, devidx, asize, n; 555 static raidz_col_t cols[16]; 556 raidz_col_t *rc, *rc1; 557 558 q = s / (dcols - nparity); 559 r = s - q * (dcols - nparity); 560 bc = (r == 0 ? 0 : r + nparity); 561 562 acols = (q == 0 ? bc : dcols); 563 asize = 0; 564 565 for (c = 0; c < acols; c++) { 566 col = f + c; 567 coff = o; 568 if (col >= dcols) { 569 col -= dcols; 570 coff += 1ULL << unit_shift; 571 } 572 cols[c].rc_devidx = col; 573 cols[c].rc_offset = coff; 574 cols[c].rc_size = (q + (c < bc)) << unit_shift; 575 cols[c].rc_data = NULL; 576 cols[c].rc_error = 0; 577 cols[c].rc_tried = 0; 578 cols[c].rc_skipped = 0; 579 asize += cols[c].rc_size; 580 } 581 582 asize = roundup(asize, (nparity + 1) << unit_shift); 583 584 for (c = 0; c < nparity; c++) { 585 cols[c].rc_data = zfs_alloc_temp(cols[c].rc_size); 586 } 587 588 cols[c].rc_data = buf; 589 590 for (c = c + 1; c < acols; c++) 591 cols[c].rc_data = (char *)cols[c - 1].rc_data + 592 cols[c - 1].rc_size; 593 594 /* 595 * If all data stored spans all columns, there's a danger that 596 * parity will always be on the same device and, since parity 597 * isn't read during normal operation, that that device's I/O 598 * bandwidth won't be used effectively. We therefore switch 599 * the parity every 1MB. 600 * 601 * ... 
at least that was, ostensibly, the theory. As a 602 * practical matter unless we juggle the parity between all 603 * devices evenly, we won't see any benefit. Further, 604 * occasional writes that aren't a multiple of the LCM of the 605 * number of children and the minimum stripe width are 606 * sufficient to avoid pessimal behavior. Unfortunately, this 607 * decision created an implicit on-disk format requirement 608 * that we need to support for all eternity, but only for 609 * single-parity RAID-Z. 610 */ 611 //ASSERT(acols >= 2); 612 //ASSERT(cols[0].rc_size == cols[1].rc_size); 613 614 if (nparity == 1 && (offset & (1ULL << 20))) { 615 devidx = cols[0].rc_devidx; 616 o = cols[0].rc_offset; 617 cols[0].rc_devidx = cols[1].rc_devidx; 618 cols[0].rc_offset = cols[1].rc_offset; 619 cols[1].rc_devidx = devidx; 620 cols[1].rc_offset = o; 621 } 622 623 /* 624 * Iterate over the columns in reverse order so that we hit 625 * the parity last -- any errors along the way will force us 626 * to read the parity data. 
627 */ 628 missingdata = 0; 629 missingparity = 0; 630 for (c = acols - 1; c >= 0; c--) { 631 rc = &cols[c]; 632 devidx = rc->rc_devidx; 633 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) 634 if (kid->v_id == devidx) 635 break; 636 if (kid == NULL || kid->v_state != VDEV_STATE_HEALTHY) { 637 if (c >= nparity) 638 missingdata++; 639 else 640 missingparity++; 641 rc->rc_error = ENXIO; 642 rc->rc_tried = 1; /* don't even try */ 643 rc->rc_skipped = 1; 644 continue; 645 } 646 #if 0 647 /* 648 * Too hard for the bootcode 649 */ 650 if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) { 651 if (c >= nparity) 652 rm->rm_missingdata++; 653 else 654 rm->rm_missingparity++; 655 rc->rc_error = ESTALE; 656 rc->rc_skipped = 1; 657 continue; 658 } 659 #endif 660 if (c >= nparity || missingdata > 0) { 661 if (rc->rc_data) 662 rc->rc_error = kid->v_read(kid, NULL, 663 rc->rc_data, rc->rc_offset, rc->rc_size); 664 else 665 rc->rc_error = ENXIO; 666 rc->rc_tried = 1; 667 rc->rc_skipped = 0; 668 } 669 } 670 671 reconstruct: 672 parity_errors = 0; 673 data_errors = 0; 674 unexpected_errors = 0; 675 total_errors = 0; 676 parity_untried = 0; 677 for (c = 0; c < acols; c++) { 678 rc = &cols[c]; 679 680 if (rc->rc_error) { 681 if (c < nparity) 682 parity_errors++; 683 else 684 data_errors++; 685 686 if (!rc->rc_skipped) 687 unexpected_errors++; 688 689 total_errors++; 690 } else if (c < nparity && !rc->rc_tried) { 691 parity_untried++; 692 } 693 } 694 695 /* 696 * There are three potential phases for a read: 697 * 1. produce valid data from the columns read 698 * 2. read all disks and try again 699 * 3. perform combinatorial reconstruction 700 * 701 * Each phase is progressively both more expensive and less 702 * likely to occur. If we encounter more errors than we can 703 * repair or all phases fail, we have no choice but to return 704 * an error. 
705 */ 706 707 /* 708 * If the number of errors we saw was correctable -- less than 709 * or equal to the number of parity disks read -- attempt to 710 * produce data that has a valid checksum. Naturally, this 711 * case applies in the absence of any errors. 712 */ 713 if (total_errors <= nparity - parity_untried) { 714 switch (data_errors) { 715 case 0: 716 if (zio_checksum_error(bp, buf) == 0) 717 return (0); 718 break; 719 720 case 1: 721 /* 722 * We either attempt to read all the parity columns or 723 * none of them. If we didn't try to read parity, we 724 * wouldn't be here in the correctable case. There must 725 * also have been fewer parity errors than parity 726 * columns or, again, we wouldn't be in this code path. 727 */ 728 //ASSERT(parity_untried == 0); 729 //ASSERT(parity_errors < nparity); 730 731 /* 732 * Find the column that reported the error. 733 */ 734 for (c = nparity; c < acols; c++) { 735 rc = &cols[c]; 736 if (rc->rc_error != 0) 737 break; 738 } 739 //ASSERT(c != acols); 740 //ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE); 741 742 if (cols[VDEV_RAIDZ_P].rc_error == 0) { 743 vdev_raidz_reconstruct_p(cols, nparity, 744 acols, c); 745 } else { 746 //ASSERT(nparity > 1); 747 vdev_raidz_reconstruct_q(cols, nparity, 748 acols, c); 749 } 750 751 if (zio_checksum_error(bp, buf) == 0) 752 return (0); 753 break; 754 755 case 2: 756 /* 757 * Two data column errors require double parity. 758 */ 759 //ASSERT(nparity == 2); 760 761 /* 762 * Find the two columns that reported errors. 
763 */ 764 for (c = nparity; c < acols; c++) { 765 rc = &cols[c]; 766 if (rc->rc_error != 0) 767 break; 768 } 769 //ASSERT(c != acols); 770 //ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE); 771 772 for (c1 = c++; c < acols; c++) { 773 rc = &cols[c]; 774 if (rc->rc_error != 0) 775 break; 776 } 777 //ASSERT(c != acols); 778 //ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE); 779 780 vdev_raidz_reconstruct_pq(cols, nparity, acols, 781 c1, c); 782 783 if (zio_checksum_error(bp, buf) == 0) 784 return (0); 785 break; 786 787 default: 788 break; 789 //ASSERT(nparity <= 2); 790 //ASSERT(0); 791 } 792 } 793 794 /* 795 * This isn't a typical situation -- either we got a read 796 * error or a child silently returned bad data. Read every 797 * block so we can try again with as much data and parity as 798 * we can track down. If we've already been through once 799 * before, all children will be marked as tried so we'll 800 * proceed to combinatorial reconstruction. 801 */ 802 n = 0; 803 for (c = 0; c < acols; c++) { 804 rc = &cols[c]; 805 if (rc->rc_tried) 806 continue; 807 808 devidx = rc->rc_devidx; 809 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) 810 if (kid->v_id == devidx) 811 break; 812 if (kid == NULL || kid->v_state != VDEV_STATE_HEALTHY) { 813 rc->rc_error = ENXIO; 814 rc->rc_tried = 1; /* don't even try */ 815 rc->rc_skipped = 1; 816 continue; 817 } 818 if (rc->rc_data) 819 rc->rc_error = kid->v_read(kid, NULL, 820 rc->rc_data, rc->rc_offset, rc->rc_size); 821 else 822 rc->rc_error = ENXIO; 823 if (rc->rc_error == 0) 824 n++; 825 rc->rc_tried = 1; 826 rc->rc_skipped = 0; 827 } 828 829 /* 830 * If we managed to read anything more, retry the 831 * reconstruction. 832 */ 833 if (n) 834 goto reconstruct; 835 836 /* 837 * At this point we've attempted to reconstruct the data given the 838 * errors we detected, and we've attempted to read all columns. 
There 839 * must, therefore, be one or more additional problems -- silent errors 840 * resulting in invalid data rather than explicit I/O errors resulting 841 * in absent data. Before we attempt combinatorial reconstruction make 842 * sure we have a chance of coming up with the right answer. 843 */ 844 if (total_errors >= nparity) { 845 return (EIO); 846 } 847 848 asize = 0; 849 for (c = 0; c < acols; c++) { 850 rc = &cols[c]; 851 if (rc->rc_size > asize) 852 asize = rc->rc_size; 853 } 854 if (cols[VDEV_RAIDZ_P].rc_error == 0) { 855 /* 856 * Attempt to reconstruct the data from parity P. 857 */ 858 void *orig; 859 orig = zfs_alloc_temp(asize); 860 for (c = nparity; c < acols; c++) { 861 rc = &cols[c]; 862 863 memcpy(orig, rc->rc_data, rc->rc_size); 864 vdev_raidz_reconstruct_p(cols, nparity, acols, c); 865 866 if (zio_checksum_error(bp, buf) == 0) 867 return (0); 868 869 memcpy(rc->rc_data, orig, rc->rc_size); 870 } 871 } 872 873 if (nparity > 1 && cols[VDEV_RAIDZ_Q].rc_error == 0) { 874 /* 875 * Attempt to reconstruct the data from parity Q. 876 */ 877 void *orig; 878 orig = zfs_alloc_temp(asize); 879 for (c = nparity; c < acols; c++) { 880 rc = &cols[c]; 881 882 memcpy(orig, rc->rc_data, rc->rc_size); 883 vdev_raidz_reconstruct_q(cols, nparity, acols, c); 884 885 if (zio_checksum_error(bp, buf) == 0) 886 return (0); 887 888 memcpy(rc->rc_data, orig, rc->rc_size); 889 } 890 } 891 892 if (nparity > 1 && 893 cols[VDEV_RAIDZ_P].rc_error == 0 && 894 cols[VDEV_RAIDZ_Q].rc_error == 0) { 895 /* 896 * Attempt to reconstruct the data from both P and Q. 
897 */ 898 void *orig, *orig1; 899 orig = zfs_alloc_temp(asize); 900 orig1 = zfs_alloc_temp(asize); 901 for (c = nparity; c < acols - 1; c++) { 902 rc = &cols[c]; 903 904 memcpy(orig, rc->rc_data, rc->rc_size); 905 906 for (c1 = c + 1; c1 < acols; c1++) { 907 rc1 = &cols[c1]; 908 909 memcpy(orig1, rc1->rc_data, rc1->rc_size); 910 911 vdev_raidz_reconstruct_pq(cols, nparity, 912 acols, c, c1); 913 914 if (zio_checksum_error(bp, buf) == 0) 915 return (0); 916 917 memcpy(rc1->rc_data, orig1, rc1->rc_size); 918 } 919 920 memcpy(rc->rc_data, orig, rc->rc_size); 921 } 922 } 923 924 return (EIO); 925 } 926 927