/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 * CRC-64 remainder table used by the ZAP hash (zap_hash() below).
 * Filled in lazily by zfs_init_crc().
 */
static uint64_t zfs_crc64_table[256];

/*
 * Build zfs_crc64_table if it has not been built yet.  Entry 128 of the
 * finished table always equals ZFS_CRC64_POLY, so it doubles as the
 * "already initialized" sentinel, making this function idempotent.
 */
static void
zfs_init_crc(void)
{
	int i, j;
	uint64_t *ct;

	/*
	 * Calculate the crc64 table (used for the zap hash
	 * function).
	 */
	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
		for (i = 0; i < 256; i++)
			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
	}
}

/*
 * Checksum vector for the "off" setting: report an all-zero checksum
 * regardless of the data.
 */
static void
zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}

/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);

/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
	zio_checksum_t	*ci_func[2];	/* checksum function for each byteorder */
	int		ci_correctable;	/* number of correctable bits */
	int		ci_zbt;		/* uses zio block tail? */
	const char	*ci_name;	/* descriptive name */
} zio_checksum_info_t;

#include "fletcher.c"
#include "sha256.c"

/*
 * Checksum vectors, indexed by the ZIO_CHECKSUM_* value stored in the
 * block pointer.  Only the native-byteorder function (ci_func[0]) is
 * used by this read-only boot code.
 */
static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL, NULL}, 0, 0, "inherit"},
	{{NULL, NULL}, 0, 0, "on"},
	{{zio_checksum_off, zio_checksum_off}, 0, 0, "off"},
	{{zio_checksum_SHA256, NULL}, 1, 1, "label"},
	{{zio_checksum_SHA256, NULL}, 1, 1, "gang_header"},
	{{fletcher_2_native, NULL}, 0, 1, "zilog"},
	{{fletcher_2_native, NULL}, 0, 0, "fletcher2"},
	{{fletcher_4_native, NULL}, 1, 0, "fletcher4"},
	{{zio_checksum_SHA256, NULL}, 1, 0, "SHA256"},
};

/*
 * Common signature for all zio compress/decompress functions.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
	zio_compress_func_t	*ci_compress;	/* compression function */
	zio_decompress_func_t	*ci_decompress;	/* decompression function */
	int			ci_level;	/* level parameter */
	const char		*ci_name;	/* algorithm name */
} zio_compress_info_t;

#include "lzjb.c"

/*
 * Compression vectors.
107 */ 108 static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { 109 {NULL, NULL, 0, "inherit"}, 110 {NULL, NULL, 0, "on"}, 111 {NULL, NULL, 0, "uncompressed"}, 112 {NULL, lzjb_decompress, 0, "lzjb"}, 113 {NULL, NULL, 0, "empty"}, 114 {NULL, NULL, 1, "gzip-1"}, 115 {NULL, NULL, 2, "gzip-2"}, 116 {NULL, NULL, 3, "gzip-3"}, 117 {NULL, NULL, 4, "gzip-4"}, 118 {NULL, NULL, 5, "gzip-5"}, 119 {NULL, NULL, 6, "gzip-6"}, 120 {NULL, NULL, 7, "gzip-7"}, 121 {NULL, NULL, 8, "gzip-8"}, 122 {NULL, NULL, 9, "gzip-9"}, 123 }; 124 125 static int 126 zio_checksum_error(const blkptr_t *bp, void *data) 127 { 128 zio_cksum_t zc = bp->blk_cksum; 129 unsigned int checksum = BP_GET_CHECKSUM(bp); 130 uint64_t size = BP_GET_PSIZE(bp); 131 zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1; 132 zio_checksum_info_t *ci = &zio_checksum_table[checksum]; 133 zio_cksum_t actual_cksum, expected_cksum; 134 135 if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) 136 return (EINVAL); 137 138 if (ci->ci_zbt) { 139 expected_cksum = zbt->zbt_cksum; 140 zbt->zbt_cksum = zc; 141 ci->ci_func[0](data, size, &actual_cksum); 142 zbt->zbt_cksum = expected_cksum; 143 zc = expected_cksum; 144 } else { 145 /* ASSERT(!BP_IS_GANG(bp)); */ 146 ci->ci_func[0](data, size, &actual_cksum); 147 } 148 149 if (!ZIO_CHECKSUM_EQUAL(actual_cksum, zc)) { 150 /*printf("ZFS: read checksum failed\n");*/ 151 return (EIO); 152 } 153 154 return (0); 155 } 156 157 static int 158 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize, 159 void *dest, uint64_t destsize) 160 { 161 zio_compress_info_t *ci = &zio_compress_table[cpfunc]; 162 163 /* ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS); */ 164 if (!ci->ci_decompress) { 165 printf("ZFS: unsupported compression algorithm %u\n", cpfunc); 166 return (EIO); 167 } 168 169 return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level)); 170 } 171 172 static uint64_t 173 zap_hash(uint64_t salt, const char *name) 174 { 175 
const uint8_t *cp; 176 uint8_t c; 177 uint64_t crc = salt; 178 179 /*ASSERT(crc != 0);*/ 180 /*ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);*/ 181 for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++) 182 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF]; 183 184 /* 185 * Only use 28 bits, since we need 4 bits in the cookie for the 186 * collision differentiator. We MUST use the high bits, since 187 * those are the onces that we first pay attention to when 188 * chosing the bucket. 189 */ 190 crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1); 191 192 return (crc); 193 } 194 195 static char *zfs_alloc_temp(size_t sz); 196 197 typedef struct raidz_col { 198 uint64_t rc_devidx; /* child device index for I/O */ 199 uint64_t rc_offset; /* device offset */ 200 uint64_t rc_size; /* I/O size */ 201 void *rc_data; /* I/O data */ 202 int rc_error; /* I/O error for this device */ 203 uint8_t rc_tried; /* Did we attempt this I/O column? */ 204 uint8_t rc_skipped; /* Did we skip this I/O column? */ 205 } raidz_col_t; 206 207 #define VDEV_RAIDZ_P 0 208 #define VDEV_RAIDZ_Q 1 209 210 static void 211 vdev_raidz_reconstruct_p(raidz_col_t *cols, int nparity, int acols, int x) 212 { 213 uint64_t *dst, *src, xcount, ccount, count, i; 214 int c; 215 216 xcount = cols[x].rc_size / sizeof (src[0]); 217 //ASSERT(xcount <= cols[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); 218 //ASSERT(xcount > 0); 219 220 src = cols[VDEV_RAIDZ_P].rc_data; 221 dst = cols[x].rc_data; 222 for (i = 0; i < xcount; i++, dst++, src++) { 223 *dst = *src; 224 } 225 226 for (c = nparity; c < acols; c++) { 227 src = cols[c].rc_data; 228 dst = cols[x].rc_data; 229 230 if (c == x) 231 continue; 232 233 ccount = cols[c].rc_size / sizeof (src[0]); 234 count = MIN(ccount, xcount); 235 236 for (i = 0; i < count; i++, dst++, src++) { 237 *dst ^= *src; 238 } 239 } 240 } 241 242 /* 243 * These two tables represent powers and logs of 2 in the Galois field defined 244 * above. 
These values were computed by repeatedly multiplying by 2 as above.
 */
/* vdev_raidz_pow2[i] == 2^i in GF(2^8); note entry 255 wraps to 0x01. */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/* vdev_raidz_log2[i] == log base 2 of i in GF(2^8); log of 0 is undefined
 * (entry 0 is a placeholder and must not be used). */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint8_t a, int exp)
{
	/* 0 has no logarithm; a * 2^exp is 0 for any exp. */
	if (a == 0)
		return (0);

	//ASSERT(exp >= 0);
	//ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	/* Multiply via log/antilog tables, wrapping mod 255. */
	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}

/*
 * Generate both P (simple XOR) and Q (Reed-Solomon, GF(2^8)) parity
 * for the data columns [nparity, acols).  Columns shorter than the
 * parity column are treated as zero-padded.  The caller supplies the
 * parity buffers in cols[VDEV_RAIDZ_P] and cols[VDEV_RAIDZ_Q].
 */
static void
vdev_raidz_generate_parity_pq(raidz_col_t *cols, int nparity, int acols)
{
	uint64_t *q, *p, *src, pcount, ccount, mask, i;
	int c;

	pcount = cols[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	//ASSERT(cols[VDEV_RAIDZ_P].rc_size == cols[VDEV_RAIDZ_Q].rc_size);

	for (c = nparity; c < acols; c++) {
		src = cols[c].rc_data;
		p = cols[VDEV_RAIDZ_P].rc_data;
		q = cols[VDEV_RAIDZ_Q].rc_data;
		ccount = cols[c].rc_size / sizeof (src[0]);

		if (c == nparity) {
			/* First data column: initialize P and Q. */
			//ASSERT(ccount == pcount || ccount == 0);
			for (i = 0; i < ccount; i++, p++, q++, src++) {
				*q = *src;
				*p = *src;
			}
			for (; i < pcount; i++, p++, q++, src++) {
				*q = 0;
				*p = 0;
			}
		} else {
			//ASSERT(ccount <= pcount);

			/*
			 * Rather than multiplying each byte
			 * individually (as described above), we are
			 * able to handle 8 at once by generating a
			 * mask based on the high bit in each byte and
			 * using that to conditionally XOR in 0x1d.
			 */
			for (i = 0; i < ccount; i++, p++, q++, src++) {
				mask = *q & 0x8080808080808080ULL;
				mask = (mask << 1) - (mask >> 7);
				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
				    (mask & 0x1d1d1d1d1d1d1d1dULL);
				*q ^= *src;
				*p ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 */
			for (; i < pcount; i++, q++) {
				mask = *q & 0x8080808080808080ULL;
				mask = (mask << 1) - (mask >> 7);
				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
				    (mask & 0x1d1d1d1d1d1d1d1dULL);
			}
		}
	}
}

/*
 * Reconstruct the single missing data column x from the Q parity:
 * rebuild Q's contribution from the surviving data columns, XOR with
 * the stored Q, then divide out column x's coefficient byte by byte.
 */
static void
vdev_raidz_reconstruct_q(raidz_col_t *cols, int nparity, int acols, int x)
{
	uint64_t *dst, *src, xcount, ccount, count, mask, i;
	uint8_t *b;
	int c, j, exp;

	xcount = cols[x].rc_size / sizeof (src[0]);
	//ASSERT(xcount <= cols[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

	for (c = nparity; c < acols; c++) {
		src = cols[c].rc_data;
		dst = cols[x].rc_data;

		/* The missing column contributes zeros. */
		if (c == x)
			ccount = 0;
		else
			ccount = cols[c].rc_size / sizeof (src[0]);

		count = MIN(ccount, xcount);

		if (c == nparity) {
			/* First column seeds the accumulator. */
			for (i = 0; i < count; i++, dst++, src++) {
				*dst = *src;
			}
			for (; i < xcount; i++, dst++) {
				*dst = 0;
			}

		} else {
			/*
			 * For an explanation of this, see the comment in
			 * vdev_raidz_generate_parity_pq() above.
			 */
			for (i = 0; i < count; i++, dst++, src++) {
				mask = *dst & 0x8080808080808080ULL;
				mask = (mask << 1) - (mask >> 7);
				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
				    (mask & 0x1d1d1d1d1d1d1d1dULL);
				*dst ^= *src;
			}

			/* Short column: still multiply the accumulator by 2. */
			for (; i < xcount; i++, dst++) {
				mask = *dst & 0x8080808080808080ULL;
				mask = (mask << 1) - (mask >> 7);
				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
				    (mask & 0x1d1d1d1d1d1d1d1dULL);
			}
		}
	}

	/*
	 * XOR with stored Q, then multiply each byte by the inverse of
	 * column x's coefficient (2^-(acols-1-x) == 2^(255-(acols-1-x))).
	 */
	src = cols[VDEV_RAIDZ_Q].rc_data;
	dst = cols[x].rc_data;
	exp = 255 - (acols - 1 - x);

	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst ^= *src;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, exp);
		}
	}
}


/*
 * Reconstruct two missing data columns x and y (x < y) using both P
 * and Q parity by solving the two-equation GF(2^8) linear system.
 */
static void
vdev_raidz_reconstruct_pq(raidz_col_t *cols, int nparity, int acols,
    int x, int y)
{
	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
	void *pdata, *qdata;
	uint64_t xsize, ysize, i;

	//ASSERT(x < y);
	//ASSERT(x >= nparity);
	//ASSERT(y < acols);

	//ASSERT(cols[x].rc_size >= cols[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = cols[VDEV_RAIDZ_P].rc_data;
	qdata = cols[VDEV_RAIDZ_Q].rc_data;
	xsize = cols[x].rc_size;
	ysize = cols[y].rc_size;

	cols[VDEV_RAIDZ_P].rc_data =
	    zfs_alloc_temp(cols[VDEV_RAIDZ_P].rc_size);
	cols[VDEV_RAIDZ_Q].rc_data =
	    zfs_alloc_temp(cols[VDEV_RAIDZ_Q].rc_size);
	cols[x].rc_size = 0;
	cols[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(cols, nparity, acols);

	cols[x].rc_size = xsize;
	cols[y].rc_size = ysize;

	p = pdata;
	q = qdata;
	pxy = cols[VDEV_RAIDZ_P].rc_data;
	qxy = cols[VDEV_RAIDZ_Q].rc_data;
	xd = cols[x].rc_data;
	yd = cols[y].rc_data;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	/* Coefficients as table exponents (x - y < 0, hence the +255). */
	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (acols - 1 - x)];
	/* tmp is the log of (2^(x-y) + 1)^-1. */
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
		    vdev_raidz_exp2(*q ^ *qxy, bexp);

		/* y may be shorter than x; only fill its valid range. */
		if (i < ysize)
			*yd = *p ^ *pxy ^ *xd;
	}

	/*
	 * Restore the saved parity data.
	 */
	cols[VDEV_RAIDZ_P].rc_data = pdata;
	cols[VDEV_RAIDZ_Q].rc_data = qdata;
}

/*
 * Read `bytes` bytes at `offset` from a RAID-Z vdev into buf, mapping
 * the logical range onto per-child columns, reading the data columns,
 * and -- if the checksum fails or children are missing -- progressively
 * retrying with parity reconstruction, up to full combinatorial
 * reconstruction.  Returns 0 on success or EIO/ENXIO on failure.
 */
static int
vdev_raidz_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
    off_t offset, size_t bytes)
{
	size_t psize = BP_GET_PSIZE(bp);
	vdev_t *kid;
	int unit_shift = vdev->v_ashift;
	int dcols = vdev->v_nchildren;
	int nparity = vdev->v_nparity;
	int missingdata, missingparity;
	int parity_errors, data_errors, unexpected_errors, total_errors;
	int parity_untried;
	uint64_t b = offset >> unit_shift;	/* starting sector */
	uint64_t s = psize >> unit_shift;	/* sector count */
	uint64_t f = b % dcols;			/* first column */
	uint64_t o = (b / dcols) << unit_shift;	/* byte offset on each child */
	int q, r, c, c1, bc, col, acols, coff, devidx, asize, n;
	/*
	 * NOTE(review): fixed-size column array -- assumes acols <= 16,
	 * i.e. at most 16 children per RAID-Z vdev; not checked here.
	 */
	static raidz_col_t cols[16];
	raidz_col_t *rc, *rc1;

	q = s / (dcols - nparity);	/* full rows */
	r = s - q * (dcols - nparity);	/* remainder sectors */
	bc = (r == 0 ? 0 : r + nparity);	/* big columns */

	acols = (q == 0 ? bc : dcols);
	asize = 0;

	/* Lay out the columns: device index, offset and size for each. */
	for (c = 0; c < acols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		cols[c].rc_devidx = col;
		cols[c].rc_offset = coff;
		cols[c].rc_size = (q + (c < bc)) << unit_shift;
		cols[c].rc_data = NULL;
		cols[c].rc_error = 0;
		cols[c].rc_tried = 0;
		cols[c].rc_skipped = 0;
		asize += cols[c].rc_size;
	}

	asize = roundup(asize, (nparity + 1) << unit_shift);

	/* Parity columns get scratch buffers; data columns share buf. */
	for (c = 0; c < nparity; c++) {
		cols[c].rc_data = zfs_alloc_temp(cols[c].rc_size);
	}

	cols[c].rc_data = buf;

	for (c = c + 1; c < acols; c++)
		cols[c].rc_data = (char *)cols[c - 1].rc_data +
		    cols[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that
	 * parity will always be on the same device and, since parity
	 * isn't read during normal operation, that that device's I/O
	 * bandwidth won't be used effectively. We therefore switch
	 * the parity every 1MB.
	 *
	 * ...at least that was, ostensibly, the theory. As a
	 * practical matter unless we juggle the parity between all
	 * devices evenly, we won't see any benefit. Further,
	 * occasional writes that aren't a multiple of the LCM of the
	 * number of children and the minimum stripe width are
	 * sufficient to avoid pessimal behavior. Unfortunately, this
	 * decision created an implicit on-disk format requirement
	 * that we need to support for all eternity, but only for
	 * single-parity RAID-Z.
	 */
	//ASSERT(acols >= 2);
	//ASSERT(cols[0].rc_size == cols[1].rc_size);

	if (nparity == 1 && (offset & (1ULL << 20))) {
		devidx = cols[0].rc_devidx;
		o = cols[0].rc_offset;
		cols[0].rc_devidx = cols[1].rc_devidx;
		cols[0].rc_offset = cols[1].rc_offset;
		cols[1].rc_devidx = devidx;
		cols[1].rc_offset = o;
	}

	/*
	 * Iterate over the columns in reverse order so that we hit
	 * the parity last -- any errors along the way will force us
	 * to read the parity data.
	 */
	missingdata = 0;
	missingparity = 0;
	for (c = acols - 1; c >= 0; c--) {
		rc = &cols[c];
		devidx = rc->rc_devidx;
		/* Find the child vdev holding this column. */
		STAILQ_FOREACH(kid, &vdev->v_children, v_childlink)
			if (kid->v_id == devidx)
				break;
		if (kid == NULL || kid->v_state != VDEV_STATE_HEALTHY) {
			if (c >= nparity)
				missingdata++;
			else
				missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
#if 0
		/*
		 * Too hard for the bootcode
		 */
		if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
			if (c >= nparity)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
#endif
		/* Read data columns always; parity only if data is missing. */
		if (c >= nparity || missingdata > 0) {
			if (rc->rc_data)
				rc->rc_error = kid->v_read(kid, NULL,
				    rc->rc_data, rc->rc_offset, rc->rc_size);
			else
				rc->rc_error = ENXIO;
			rc->rc_tried = 1;
			rc->rc_skipped = 0;
		}
	}

reconstruct:
	/* Tally the error state of all columns for this attempt. */
	parity_errors = 0;
	data_errors = 0;
	unexpected_errors = 0;
	total_errors = 0;
	parity_untried = 0;
	for (c = 0; c < acols; c++) {
		rc = &cols[c];

		if (rc->rc_error) {
			if (c < nparity)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < nparity && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less
	 * likely to occur. If we encounter more errors than we can
	 * repair or all phases fail, we have no choice but to return
	 * an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than
	 * or equal to the number of parity disks read -- attempt to
	 * produce data that has a valid checksum. Naturally, this
	 * case applies in the absence of any errors.
	 */
	if (total_errors <= nparity - parity_untried) {
		switch (data_errors) {
		case 0:
			if (zio_checksum_error(bp, buf) == 0)
				return (0);
			break;

		case 1:
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			//ASSERT(parity_untried == 0);
			//ASSERT(parity_errors < nparity);

			/*
			 * Find the column that reported the error.
			 */
			for (c = nparity; c < acols; c++) {
				rc = &cols[c];
				if (rc->rc_error != 0)
					break;
			}
			//ASSERT(c != acols);
			//ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);

			/* Prefer the cheaper P reconstruction when P is good. */
			if (cols[VDEV_RAIDZ_P].rc_error == 0) {
				vdev_raidz_reconstruct_p(cols, nparity,
				    acols, c);
			} else {
				//ASSERT(nparity > 1);
				vdev_raidz_reconstruct_q(cols, nparity,
				    acols, c);
			}

			if (zio_checksum_error(bp, buf) == 0)
				return (0);
			break;

		case 2:
			/*
			 * Two data column errors require double parity.
			 */
			//ASSERT(nparity == 2);

			/*
			 * Find the two columns that reported errors.
			 */
			for (c = nparity; c < acols; c++) {
				rc = &cols[c];
				if (rc->rc_error != 0)
					break;
			}
			//ASSERT(c != acols);
			//ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);

			for (c1 = c++; c < acols; c++) {
				rc = &cols[c];
				if (rc->rc_error != 0)
					break;
			}
			//ASSERT(c != acols);
			//ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);

			vdev_raidz_reconstruct_pq(cols, nparity, acols,
			    c1, c);

			if (zio_checksum_error(bp, buf) == 0)
				return (0);
			break;

		default:
			break;
			//ASSERT(nparity <= 2);
			//ASSERT(0);
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	n = 0;
	for (c = 0; c < acols; c++) {
		rc = &cols[c];
		if (rc->rc_tried)
			continue;

		devidx = rc->rc_devidx;
		STAILQ_FOREACH(kid, &vdev->v_children, v_childlink)
			if (kid->v_id == devidx)
				break;
		if (kid == NULL || kid->v_state != VDEV_STATE_HEALTHY) {
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (rc->rc_data)
			rc->rc_error = kid->v_read(kid, NULL,
			    rc->rc_data, rc->rc_offset, rc->rc_size);
		else
			rc->rc_error = ENXIO;
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}

	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n)
		goto reconstruct;

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. Before we attempt combinatorial reconstruction make
	 * sure we have a chance of coming up with the right answer.
	 */
	if (total_errors >= nparity) {
		return (EIO);
	}

	/* Scratch buffer must hold the largest column. */
	asize = 0;
	for (c = 0; c < acols; c++) {
		rc = &cols[c];
		if (rc->rc_size > asize)
			asize = rc->rc_size;
	}
	if (cols[VDEV_RAIDZ_P].rc_error == 0) {
		/*
		 * Attempt to reconstruct the data from parity P.
		 */
		void *orig;
		orig = zfs_alloc_temp(asize);
		for (c = nparity; c < acols; c++) {
			rc = &cols[c];

			/* Save the column, try rebuilding it from P. */
			memcpy(orig, rc->rc_data, rc->rc_size);
			vdev_raidz_reconstruct_p(cols, nparity, acols, c);

			if (zio_checksum_error(bp, buf) == 0)
				return (0);

			/* Didn't verify; put the original data back. */
			memcpy(rc->rc_data, orig, rc->rc_size);
		}
	}

	if (nparity > 1 && cols[VDEV_RAIDZ_Q].rc_error == 0) {
		/*
		 * Attempt to reconstruct the data from parity Q.
		 */
		void *orig;
		orig = zfs_alloc_temp(asize);
		for (c = nparity; c < acols; c++) {
			rc = &cols[c];

			memcpy(orig, rc->rc_data, rc->rc_size);
			vdev_raidz_reconstruct_q(cols, nparity, acols, c);

			if (zio_checksum_error(bp, buf) == 0)
				return (0);

			memcpy(rc->rc_data, orig, rc->rc_size);
		}
	}

	if (nparity > 1 &&
	    cols[VDEV_RAIDZ_P].rc_error == 0 &&
	    cols[VDEV_RAIDZ_Q].rc_error == 0) {
		/*
		 * Attempt to reconstruct the data from both P and Q.
		 */
		void *orig, *orig1;
		orig = zfs_alloc_temp(asize);
		orig1 = zfs_alloc_temp(asize);
		/* Try every pair of data columns (c, c1). */
		for (c = nparity; c < acols - 1; c++) {
			rc = &cols[c];

			memcpy(orig, rc->rc_data, rc->rc_size);

			for (c1 = c + 1; c1 < acols; c1++) {
				rc1 = &cols[c1];

				memcpy(orig1, rc1->rc_data, rc1->rc_size);

				vdev_raidz_reconstruct_pq(cols, nparity,
				    acols, c, c1);

				if (zio_checksum_error(bp, buf) == 0)
					return (0);

				memcpy(rc1->rc_data, orig1, rc1->rc_size);
			}

			memcpy(rc->rc_data, orig, rc->rc_size);
		}
	}

	return (EIO);
}
