1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/cdefs.h> 27 __FBSDID("$FreeBSD$"); 28 29 static uint64_t zfs_crc64_table[256]; 30 31 #define ECKSUM 666 32 33 #define ASSERT(...) do { } while (0) 34 #define ASSERT3U(...) do { } while (0) 35 #define ASSERT3S(...) do { } while (0) 36 37 #define panic(...) do { \ 38 printf(__VA_ARGS__); \ 39 for (;;) ; \ 40 } while (0) 41 42 #define kmem_alloc(size, flag) zfs_alloc((size)) 43 #define kmem_free(ptr, size) zfs_free((ptr), (size)) 44 45 static void 46 zfs_init_crc(void) 47 { 48 int i, j; 49 uint64_t *ct; 50 51 /* 52 * Calculate the crc64 table (used for the zap hash 53 * function). 54 */ 55 if (zfs_crc64_table[128] != ZFS_CRC64_POLY) { 56 memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table)); 57 for (i = 0; i < 256; i++) 58 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 59 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 60 } 61 } 62 63 static void 64 zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp) 65 { 66 ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); 67 } 68 69 /* 70 * Signature for checksum functions. 71 */ 72 typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp); 73 74 /* 75 * Information about each checksum function. 76 */ 77 typedef struct zio_checksum_info { 78 zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */ 79 int ci_correctable; /* number of correctable bits */ 80 int ci_eck; /* uses zio embedded checksum? */ 81 int ci_dedup; /* strong enough for dedup? */ 82 const char *ci_name; /* descriptive name */ 83 } zio_checksum_info_t; 84 85 #include "fletcher.c" 86 #include "sha256.c" 87 88 static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { 89 {{NULL, NULL}, 0, 0, 0, "inherit"}, 90 {{NULL, NULL}, 0, 0, 0, "on"}, 91 {{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "off"}, 92 {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "label"}, 93 {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "gang_header"}, 94 {{fletcher_2_native, fletcher_2_byteswap}, 0, 1, 0, "zilog"}, 95 {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, 0, "fletcher2"}, 96 {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, 0, "fletcher4"}, 97 {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "SHA256"}, 98 {{fletcher_4_native, fletcher_4_byteswap}, 0, 1, 0, "zillog2"}, 99 }; 100 101 102 /* 103 * Common signature for all zio compress/decompress functions. 104 */ 105 typedef size_t zio_compress_func_t(void *src, void *dst, 106 size_t s_len, size_t d_len, int); 107 typedef int zio_decompress_func_t(void *src, void *dst, 108 size_t s_len, size_t d_len, int); 109 110 /* 111 * Information about each compression function. 112 */ 113 typedef struct zio_compress_info { 114 zio_compress_func_t *ci_compress; /* compression function */ 115 zio_decompress_func_t *ci_decompress; /* decompression function */ 116 int ci_level; /* level parameter */ 117 const char *ci_name; /* algorithm name */ 118 } zio_compress_info_t; 119 120 #include "lzjb.c" 121 #include "zle.c" 122 123 /* 124 * Compression vectors. 125 */ 126 static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { 127 {NULL, NULL, 0, "inherit"}, 128 {NULL, NULL, 0, "on"}, 129 {NULL, NULL, 0, "uncompressed"}, 130 {NULL, lzjb_decompress, 0, "lzjb"}, 131 {NULL, NULL, 0, "empty"}, 132 {NULL, NULL, 1, "gzip-1"}, 133 {NULL, NULL, 2, "gzip-2"}, 134 {NULL, NULL, 3, "gzip-3"}, 135 {NULL, NULL, 4, "gzip-4"}, 136 {NULL, NULL, 5, "gzip-5"}, 137 {NULL, NULL, 6, "gzip-6"}, 138 {NULL, NULL, 7, "gzip-7"}, 139 {NULL, NULL, 8, "gzip-8"}, 140 {NULL, NULL, 9, "gzip-9"}, 141 {NULL, zle_decompress, 64, "zle"}, 142 }; 143 144 static void 145 byteswap_uint64_array(void *vbuf, size_t size) 146 { 147 uint64_t *buf = vbuf; 148 size_t count = size >> 3; 149 int i; 150 151 ASSERT((size & 7) == 0); 152 153 for (i = 0; i < count; i++) 154 buf[i] = BSWAP_64(buf[i]); 155 } 156 157 /* 158 * Set the external verifier for a gang block based on <vdev, offset, txg>, 159 * a tuple which is guaranteed to be unique for the life of the pool. 160 */ 161 static void 162 zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp) 163 { 164 const dva_t *dva = BP_IDENTITY(bp); 165 uint64_t txg = BP_PHYSICAL_BIRTH(bp); 166 167 ASSERT(BP_IS_GANG(bp)); 168 169 ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0); 170 } 171 172 /* 173 * Set the external verifier for a label block based on its offset. 174 * The vdev is implicit, and the txg is unknowable at pool open time -- 175 * hence the logic in vdev_uberblock_load() to find the most recent copy. 176 */ 177 static void 178 zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset) 179 { 180 ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0); 181 } 182 183 static int 184 zio_checksum_verify(const blkptr_t *bp, void *data) 185 { 186 uint64_t size; 187 unsigned int checksum; 188 zio_checksum_info_t *ci; 189 zio_cksum_t actual_cksum, expected_cksum, verifier; 190 int byteswap; 191 192 checksum = BP_GET_CHECKSUM(bp); 193 size = BP_GET_PSIZE(bp); 194 195 if (checksum >= ZIO_CHECKSUM_FUNCTIONS) 196 return (EINVAL); 197 ci = &zio_checksum_table[checksum]; 198 if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL) 199 return (EINVAL); 200 201 if (ci->ci_eck) { 202 zio_eck_t *eck; 203 204 ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER || 205 checksum == ZIO_CHECKSUM_LABEL); 206 207 eck = (zio_eck_t *)((char *)data + size) - 1; 208 209 if (checksum == ZIO_CHECKSUM_GANG_HEADER) 210 zio_checksum_gang_verifier(&verifier, bp); 211 else if (checksum == ZIO_CHECKSUM_LABEL) 212 zio_checksum_label_verifier(&verifier, 213 DVA_GET_OFFSET(BP_IDENTITY(bp))); 214 else 215 verifier = bp->blk_cksum; 216 217 byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC)); 218 219 if (byteswap) 220 byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); 221 222 expected_cksum = eck->zec_cksum; 223 eck->zec_cksum = verifier; 224 ci->ci_func[byteswap](data, size, &actual_cksum); 225 eck->zec_cksum = expected_cksum; 226 227 if (byteswap) 228 byteswap_uint64_array(&expected_cksum, 229 sizeof (zio_cksum_t)); 230 } else { 231 expected_cksum = bp->blk_cksum; 232 ci->ci_func[0](data, size, &actual_cksum); 233 } 234 235 if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) { 236 /*printf("ZFS: read checksum failed\n");*/ 237 return (EIO); 238 } 239 240 return (0); 241 } 242 243 static int 244 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize, 245 void *dest, uint64_t destsize) 246 { 247 zio_compress_info_t *ci; 248 249 if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) { 250 printf("ZFS: unsupported compression algorithm %u\n", cpfunc); 251 return (EIO); 252 } 253 254 ci = &zio_compress_table[cpfunc]; 255 if (!ci->ci_decompress) { 256 printf("ZFS: unsupported compression algorithm %s\n", 257 ci->ci_name); 258 return (EIO); 259 } 260 261 return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level)); 262 } 263 264 static uint64_t 265 zap_hash(uint64_t salt, const char *name) 266 { 267 const uint8_t *cp; 268 uint8_t c; 269 uint64_t crc = salt; 270 271 ASSERT(crc != 0); 272 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 273 for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++) 274 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF]; 275 276 /* 277 * Only use 28 bits, since we need 4 bits in the cookie for the 278 * collision differentiator. We MUST use the high bits, since 279 * those are the onces that we first pay attention to when 280 * chosing the bucket. 281 */ 282 crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1); 283 284 return (crc); 285 } 286 287 static void *zfs_alloc(size_t size); 288 static void zfs_free(void *ptr, size_t size); 289 290 typedef struct raidz_col { 291 uint64_t rc_devidx; /* child device index for I/O */ 292 uint64_t rc_offset; /* device offset */ 293 uint64_t rc_size; /* I/O size */ 294 void *rc_data; /* I/O data */ 295 int rc_error; /* I/O error for this device */ 296 uint8_t rc_tried; /* Did we attempt this I/O column? */ 297 uint8_t rc_skipped; /* Did we skip this I/O column? */ 298 } raidz_col_t; 299 300 typedef struct raidz_map { 301 uint64_t rm_cols; /* Regular column count */ 302 uint64_t rm_scols; /* Count including skipped columns */ 303 uint64_t rm_bigcols; /* Number of oversized columns */ 304 uint64_t rm_asize; /* Actual total I/O size */ 305 uint64_t rm_missingdata; /* Count of missing data devices */ 306 uint64_t rm_missingparity; /* Count of missing parity devices */ 307 uint64_t rm_firstdatacol; /* First data column/parity count */ 308 uint64_t rm_nskip; /* Skipped sectors for padding */ 309 uint64_t rm_skipstart; /* Column index of padding start */ 310 uintptr_t rm_reports; /* # of referencing checksum reports */ 311 uint8_t rm_freed; /* map no longer has referencing ZIO */ 312 uint8_t rm_ecksuminjected; /* checksum error was injected */ 313 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ 314 } raidz_map_t; 315 316 #define VDEV_RAIDZ_P 0 317 #define VDEV_RAIDZ_Q 1 318 #define VDEV_RAIDZ_R 2 319 320 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) 321 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) 322 323 /* 324 * We provide a mechanism to perform the field multiplication operation on a 325 * 64-bit value all at once rather than a byte at a time. This works by 326 * creating a mask from the top bit in each byte and using that to 327 * conditionally apply the XOR of 0x1d. 328 */ 329 #define VDEV_RAIDZ_64MUL_2(x, mask) \ 330 { \ 331 (mask) = (x) & 0x8080808080808080ULL; \ 332 (mask) = ((mask) << 1) - ((mask) >> 7); \ 333 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ 334 ((mask) & 0x1d1d1d1d1d1d1d1dULL); \ 335 } 336 337 #define VDEV_RAIDZ_64MUL_4(x, mask) \ 338 { \ 339 VDEV_RAIDZ_64MUL_2((x), mask); \ 340 VDEV_RAIDZ_64MUL_2((x), mask); \ 341 } 342 343 /* 344 * These two tables represent powers and logs of 2 in the Galois field defined 345 * above. These values were computed by repeatedly multiplying by 2 as above. 346 */ 347 static const uint8_t vdev_raidz_pow2[256] = { 348 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 349 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 350 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 351 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 352 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 353 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 354 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, 355 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 356 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 357 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, 358 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 359 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 360 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 361 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 362 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 363 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 364 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 365 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 366 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 367 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 368 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 369 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 370 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 371 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 372 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, 373 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 374 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 375 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, 376 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 377 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, 378 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 379 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 380 }; 381 static const uint8_t vdev_raidz_log2[256] = { 382 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 383 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, 384 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, 385 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, 386 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, 387 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, 388 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, 389 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, 390 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, 391 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, 392 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, 393 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, 394 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, 395 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, 396 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, 397 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, 398 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, 399 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, 400 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, 401 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, 402 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, 403 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, 404 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, 405 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, 406 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, 407 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, 408 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, 409 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, 410 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, 411 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, 412 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, 413 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, 414 }; 415 416 /* 417 * Multiply a given number by 2 raised to the given power. 418 */ 419 static uint8_t 420 vdev_raidz_exp2(uint8_t a, int exp) 421 { 422 if (a == 0) 423 return (0); 424 425 ASSERT(exp >= 0); 426 ASSERT(vdev_raidz_log2[a] > 0 || a == 1); 427 428 exp += vdev_raidz_log2[a]; 429 if (exp > 255) 430 exp -= 255; 431 432 return (vdev_raidz_pow2[exp]); 433 } 434 435 static void 436 vdev_raidz_generate_parity_p(raidz_map_t *rm) 437 { 438 uint64_t *p, *src, pcount, ccount, i; 439 int c; 440 441 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 442 443 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 444 src = rm->rm_col[c].rc_data; 445 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 446 ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 447 448 if (c == rm->rm_firstdatacol) { 449 ASSERT(ccount == pcount); 450 for (i = 0; i < ccount; i++, src++, p++) { 451 *p = *src; 452 } 453 } else { 454 ASSERT(ccount <= pcount); 455 for (i = 0; i < ccount; i++, src++, p++) { 456 *p ^= *src; 457 } 458 } 459 } 460 } 461 462 static void 463 vdev_raidz_generate_parity_pq(raidz_map_t *rm) 464 { 465 uint64_t *p, *q, *src, pcnt, ccnt, mask, i; 466 int c; 467 468 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 469 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 470 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 471 472 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 473 src = rm->rm_col[c].rc_data; 474 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 475 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 476 477 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 478 479 if (c == rm->rm_firstdatacol) { 480 ASSERT(ccnt == pcnt || ccnt == 0); 481 for (i = 0; i < ccnt; i++, src++, p++, q++) { 482 *p = *src; 483 *q = *src; 484 } 485 for (; i < pcnt; i++, src++, p++, q++) { 486 *p = 0; 487 *q = 0; 488 } 489 } else { 490 ASSERT(ccnt <= pcnt); 491 492 /* 493 * Apply the algorithm described above by multiplying 494 * the previous result and adding in the new value. 495 */ 496 for (i = 0; i < ccnt; i++, src++, p++, q++) { 497 *p ^= *src; 498 499 VDEV_RAIDZ_64MUL_2(*q, mask); 500 *q ^= *src; 501 } 502 503 /* 504 * Treat short columns as though they are full of 0s. 505 * Note that there's therefore nothing needed for P. 506 */ 507 for (; i < pcnt; i++, q++) { 508 VDEV_RAIDZ_64MUL_2(*q, mask); 509 } 510 } 511 } 512 } 513 514 static void 515 vdev_raidz_generate_parity_pqr(raidz_map_t *rm) 516 { 517 uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; 518 int c; 519 520 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 521 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 522 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 523 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 524 rm->rm_col[VDEV_RAIDZ_R].rc_size); 525 526 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 527 src = rm->rm_col[c].rc_data; 528 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 529 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 530 r = rm->rm_col[VDEV_RAIDZ_R].rc_data; 531 532 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 533 534 if (c == rm->rm_firstdatacol) { 535 ASSERT(ccnt == pcnt || ccnt == 0); 536 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 537 *p = *src; 538 *q = *src; 539 *r = *src; 540 } 541 for (; i < pcnt; i++, src++, p++, q++, r++) { 542 *p = 0; 543 *q = 0; 544 *r = 0; 545 } 546 } else { 547 ASSERT(ccnt <= pcnt); 548 549 /* 550 * Apply the algorithm described above by multiplying 551 * the previous result and adding in the new value. 552 */ 553 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 554 *p ^= *src; 555 556 VDEV_RAIDZ_64MUL_2(*q, mask); 557 *q ^= *src; 558 559 VDEV_RAIDZ_64MUL_4(*r, mask); 560 *r ^= *src; 561 } 562 563 /* 564 * Treat short columns as though they are full of 0s. 565 * Note that there's therefore nothing needed for P. 566 */ 567 for (; i < pcnt; i++, q++, r++) { 568 VDEV_RAIDZ_64MUL_2(*q, mask); 569 VDEV_RAIDZ_64MUL_4(*r, mask); 570 } 571 } 572 } 573 } 574 575 /* 576 * Generate RAID parity in the first virtual columns according to the number of 577 * parity columns available. 578 */ 579 static void 580 vdev_raidz_generate_parity(raidz_map_t *rm) 581 { 582 switch (rm->rm_firstdatacol) { 583 case 1: 584 vdev_raidz_generate_parity_p(rm); 585 break; 586 case 2: 587 vdev_raidz_generate_parity_pq(rm); 588 break; 589 case 3: 590 vdev_raidz_generate_parity_pqr(rm); 591 break; 592 default: 593 panic("invalid RAID-Z configuration"); 594 } 595 } 596 597 /* BEGIN CSTYLED */ 598 /* 599 * In the general case of reconstruction, we must solve the system of linear 600 * equations defined by the coeffecients used to generate parity as well as 601 * the contents of the data and parity disks. This can be expressed with 602 * vectors for the original data (D) and the actual data (d) and parity (p) 603 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 604 * 605 * __ __ __ __ 606 * | | __ __ | p_0 | 607 * | V | | D_0 | | p_m-1 | 608 * | | x | : | = | d_0 | 609 * | I | | D_n-1 | | : | 610 * | | ~~ ~~ | d_n-1 | 611 * ~~ ~~ ~~ ~~ 612 * 613 * I is simply a square identity matrix of size n, and V is a vandermonde 614 * matrix defined by the coeffecients we chose for the various parity columns 615 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 616 * computation as well as linear separability. 617 * 618 * __ __ __ __ 619 * | 1 .. 1 1 1 | | p_0 | 620 * | 2^n-1 .. 4 2 1 | __ __ | : | 621 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 622 * | 1 .. 0 0 0 | | D_1 | | d_0 | 623 * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 624 * | : : : : | | : | | d_2 | 625 * | 0 .. 1 0 0 | | D_n-1 | | : | 626 * | 0 .. 0 1 0 | ~~ ~~ | : | 627 * | 0 .. 0 0 1 | | d_n-1 | 628 * ~~ ~~ ~~ ~~ 629 * 630 * Note that I, V, d, and p are known. To compute D, we must invert the 631 * matrix and use the known data and parity values to reconstruct the unknown 632 * data values. We begin by removing the rows in V|I and d|p that correspond 633 * to failed or missing columns; we then make V|I square (n x n) and d|p 634 * sized n by removing rows corresponding to unused parity from the bottom up 635 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 636 * using Gauss-Jordan elimination. In the example below we use m=3 parity 637 * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 638 * __ __ 639 * | 1 1 1 1 1 1 1 1 | 640 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 641 * | 19 205 116 29 64 16 4 1 | / / 642 * | 1 0 0 0 0 0 0 0 | / / 643 * | 0 1 0 0 0 0 0 0 | <--' / 644 * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 645 * | 0 0 0 1 0 0 0 0 | 646 * | 0 0 0 0 1 0 0 0 | 647 * | 0 0 0 0 0 1 0 0 | 648 * | 0 0 0 0 0 0 1 0 | 649 * | 0 0 0 0 0 0 0 1 | 650 * ~~ ~~ 651 * __ __ 652 * | 1 1 1 1 1 1 1 1 | 653 * | 128 64 32 16 8 4 2 1 | 654 * | 19 205 116 29 64 16 4 1 | 655 * | 1 0 0 0 0 0 0 0 | 656 * | 0 1 0 0 0 0 0 0 | 657 * (V|I)' = | 0 0 1 0 0 0 0 0 | 658 * | 0 0 0 1 0 0 0 0 | 659 * | 0 0 0 0 1 0 0 0 | 660 * | 0 0 0 0 0 1 0 0 | 661 * | 0 0 0 0 0 0 1 0 | 662 * | 0 0 0 0 0 0 0 1 | 663 * ~~ ~~ 664 * 665 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 666 * have carefully chosen the seed values 1, 2, and 4 to ensure that this 667 * matrix is not singular. 668 * __ __ 669 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 670 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 671 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 672 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 673 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 674 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 675 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 676 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 677 * ~~ ~~ 678 * __ __ 679 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 680 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 681 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 682 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 683 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 684 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 685 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 686 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 687 * ~~ ~~ 688 * __ __ 689 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 690 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 691 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 692 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 693 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 694 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 695 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 696 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 697 * ~~ ~~ 698 * __ __ 699 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 700 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 701 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 702 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 703 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 704 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 705 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 706 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 707 * ~~ ~~ 708 * __ __ 709 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 710 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 711 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 712 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 713 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 714 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 715 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 716 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 717 * ~~ ~~ 718 * __ __ 719 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 720 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 721 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 722 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 723 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 724 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 725 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 726 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 727 * ~~ ~~ 728 * __ __ 729 * | 0 0 1 0 0 0 0 0 | 730 * | 167 100 5 41 159 169 217 208 | 731 * | 166 100 4 40 158 168 216 209 | 732 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 733 * | 0 0 0 0 1 0 0 0 | 734 * | 0 0 0 0 0 1 0 0 | 735 * | 0 0 0 0 0 0 1 0 | 736 * | 0 0 0 0 0 0 0 1 | 737 * ~~ ~~ 738 * 739 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 740 * of the missing data. 741 * 742 * As is apparent from the example above, the only non-trivial rows in the 743 * inverse matrix correspond to the data disks that we're trying to 744 * reconstruct. Indeed, those are the only rows we need as the others would 745 * only be useful for reconstructing data known or assumed to be valid. For 746 * that reason, we only build the coefficients in the rows that correspond to 747 * targeted columns. 748 */ 749 /* END CSTYLED */ 750 751 static void 752 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, 753 uint8_t **rows) 754 { 755 int i, j; 756 int pow; 757 758 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); 759 760 /* 761 * Fill in the missing rows of interest. 762 */ 763 for (i = 0; i < nmap; i++) { 764 ASSERT3S(0, <=, map[i]); 765 ASSERT3S(map[i], <=, 2); 766 767 pow = map[i] * n; 768 if (pow > 255) 769 pow -= 255; 770 ASSERT(pow <= 255); 771 772 for (j = 0; j < n; j++) { 773 pow -= map[i]; 774 if (pow < 0) 775 pow += 255; 776 rows[i][j] = vdev_raidz_pow2[pow]; 777 } 778 } 779 } 780 781 static void 782 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, 783 uint8_t **rows, uint8_t **invrows, const uint8_t *used) 784 { 785 int i, j, ii, jj; 786 uint8_t log; 787 788 /* 789 * Assert that the first nmissing entries from the array of used 790 * columns correspond to parity columns and that subsequent entries 791 * correspond to data columns. 792 */ 793 for (i = 0; i < nmissing; i++) { 794 ASSERT3S(used[i], <, rm->rm_firstdatacol); 795 } 796 for (; i < n; i++) { 797 ASSERT3S(used[i], >=, rm->rm_firstdatacol); 798 } 799 800 /* 801 * First initialize the storage where we'll compute the inverse rows. 802 */ 803 for (i = 0; i < nmissing; i++) { 804 for (j = 0; j < n; j++) { 805 invrows[i][j] = (i == j) ? 1 : 0; 806 } 807 } 808 809 /* 810 * Subtract all trivial rows from the rows of consequence. 811 */ 812 for (i = 0; i < nmissing; i++) { 813 for (j = nmissing; j < n; j++) { 814 ASSERT3U(used[j], >=, rm->rm_firstdatacol); 815 jj = used[j] - rm->rm_firstdatacol; 816 ASSERT3S(jj, <, n); 817 invrows[i][j] = rows[i][jj]; 818 rows[i][jj] = 0; 819 } 820 } 821 822 /* 823 * For each of the rows of interest, we must normalize it and subtract 824 * a multiple of it from the other rows. 825 */ 826 for (i = 0; i < nmissing; i++) { 827 for (j = 0; j < missing[i]; j++) { 828 ASSERT3U(rows[i][j], ==, 0); 829 } 830 ASSERT3U(rows[i][missing[i]], !=, 0); 831 832 /* 833 * Compute the inverse of the first element and multiply each 834 * element in the row by that value. 835 */ 836 log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 837 838 for (j = 0; j < n; j++) { 839 rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 840 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 841 } 842 843 for (ii = 0; ii < nmissing; ii++) { 844 if (i == ii) 845 continue; 846 847 ASSERT3U(rows[ii][missing[i]], !=, 0); 848 849 log = vdev_raidz_log2[rows[ii][missing[i]]]; 850 851 for (j = 0; j < n; j++) { 852 rows[ii][j] ^= 853 vdev_raidz_exp2(rows[i][j], log); 854 invrows[ii][j] ^= 855 vdev_raidz_exp2(invrows[i][j], log); 856 } 857 } 858 } 859 860 /* 861 * Verify that the data that is left in the rows are properly part of 862 * an identity matrix. 863 */ 864 for (i = 0; i < nmissing; i++) { 865 for (j = 0; j < n; j++) { 866 if (j == missing[i]) { 867 ASSERT3U(rows[i][j], ==, 1); 868 } else { 869 ASSERT3U(rows[i][j], ==, 0); 870 } 871 } 872 } 873 } 874 875 static void 876 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, 877 int *missing, uint8_t **invrows, const uint8_t *used) 878 { 879 int i, j, x, cc, c; 880 uint8_t *src; 881 uint64_t ccount; 882 uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; 883 uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; 884 uint8_t log, val; 885 int ll; 886 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 887 uint8_t *p, *pp; 888 size_t psize; 889 890 log = 0; /* gcc */ 891 psize = sizeof (invlog[0][0]) * n * nmissing; 892 p = zfs_alloc(psize); 893 894 for (pp = p, i = 0; i < nmissing; i++) { 895 invlog[i] = pp; 896 pp += n; 897 } 898 899 for (i = 0; i < nmissing; i++) { 900 for (j = 0; j < n; j++) { 901 ASSERT3U(invrows[i][j], !=, 0); 902 invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 903 } 904 } 905 906 for (i = 0; i < n; i++) { 907 c = used[i]; 908 ASSERT3U(c, <, rm->rm_cols); 909 910 src = rm->rm_col[c].rc_data; 911 ccount = rm->rm_col[c].rc_size; 912 for (j = 0; j < nmissing; j++) { 913 cc = missing[j] + rm->rm_firstdatacol; 914 ASSERT3U(cc, >=, rm->rm_firstdatacol); 915 ASSERT3U(cc, <, rm->rm_cols); 916 ASSERT3U(cc, !=, c); 917 918 dst[j] = rm->rm_col[cc].rc_data; 919 dcount[j] = rm->rm_col[cc].rc_size; 920 } 921 922 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); 923 924 for (x = 0; x < ccount; x++, src++) { 925 if (*src != 0) 926 log = vdev_raidz_log2[*src]; 927 928 for (cc = 0; cc < nmissing; cc++) { 929 if (x >= dcount[cc]) 930 continue; 931 932 if (*src == 0) { 933 val = 0; 934 } else { 935 if ((ll = log + invlog[cc][i]) >= 255) 936 ll -= 255; 937 val = vdev_raidz_pow2[ll]; 938 } 939 940 if (i == 0) 941 dst[cc][x] = val; 942 else 943 dst[cc][x] ^= val; 944 } 945 } 946 } 947 948 zfs_free(p, psize); 949 } 950 951 static int 952 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) 953 { 954 int n, i, c, t, tt; 955 int nmissing_rows; 956 int missing_rows[VDEV_RAIDZ_MAXPARITY]; 957 int parity_map[VDEV_RAIDZ_MAXPARITY]; 958 959 uint8_t *p, *pp; 960 size_t psize; 961 962 uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 963 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 964 uint8_t *used; 965 966 int code = 0; 967 968 969 n = rm->rm_cols - rm->rm_firstdatacol; 970 971 /* 972 * Figure out which data columns are missing. 973 */ 974 nmissing_rows = 0; 975 for (t = 0; t < ntgts; t++) { 976 if (tgts[t] >= rm->rm_firstdatacol) { 977 missing_rows[nmissing_rows++] = 978 tgts[t] - rm->rm_firstdatacol; 979 } 980 } 981 982 /* 983 * Figure out which parity columns to use to help generate the missing 984 * data columns. 985 */ 986 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 987 ASSERT(tt < ntgts); 988 ASSERT(c < rm->rm_firstdatacol); 989 990 /* 991 * Skip any targeted parity columns. 992 */ 993 if (c == tgts[tt]) { 994 tt++; 995 continue; 996 } 997 998 code |= 1 << c; 999 1000 parity_map[i] = c; 1001 i++; 1002 } 1003 1004 ASSERT(code != 0); 1005 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); 1006 1007 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1008 nmissing_rows * n + sizeof (used[0]) * n; 1009 p = kmem_alloc(psize, KM_SLEEP); 1010 1011 for (pp = p, i = 0; i < nmissing_rows; i++) { 1012 rows[i] = pp; 1013 pp += n; 1014 invrows[i] = pp; 1015 pp += n; 1016 } 1017 used = pp; 1018 1019 for (i = 0; i < nmissing_rows; i++) { 1020 used[i] = parity_map[i]; 1021 } 1022 1023 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1024 if (tt < nmissing_rows && 1025 c == missing_rows[tt] + rm->rm_firstdatacol) { 1026 tt++; 1027 continue; 1028 } 1029 1030 ASSERT3S(i, <, n); 1031 used[i] = c; 1032 i++; 1033 } 1034 1035 /* 1036 * Initialize the interesting rows of the matrix. 1037 */ 1038 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); 1039 1040 /* 1041 * Invert the matrix. 1042 */ 1043 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, 1044 invrows, used); 1045 1046 /* 1047 * Reconstruct the missing data using the generated matrix. 1048 */ 1049 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, 1050 invrows, used); 1051 1052 kmem_free(p, psize); 1053 1054 return (code); 1055 } 1056 1057 static int 1058 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) 1059 { 1060 int tgts[VDEV_RAIDZ_MAXPARITY]; 1061 int ntgts; 1062 int i, c; 1063 int code; 1064 int nbadparity, nbaddata; 1065 1066 /* 1067 * The tgts list must already be sorted. 1068 */ 1069 for (i = 1; i < nt; i++) { 1070 ASSERT(t[i] > t[i - 1]); 1071 } 1072 1073 nbadparity = rm->rm_firstdatacol; 1074 nbaddata = rm->rm_cols - nbadparity; 1075 ntgts = 0; 1076 for (i = 0, c = 0; c < rm->rm_cols; c++) { 1077 if (i < nt && c == t[i]) { 1078 tgts[ntgts++] = c; 1079 i++; 1080 } else if (rm->rm_col[c].rc_error != 0) { 1081 tgts[ntgts++] = c; 1082 } else if (c >= rm->rm_firstdatacol) { 1083 nbaddata--; 1084 } else { 1085 nbadparity--; 1086 } 1087 } 1088 1089 ASSERT(ntgts >= nt); 1090 ASSERT(nbaddata >= 0); 1091 ASSERT(nbaddata + nbadparity == ntgts); 1092 1093 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); 1094 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); 1095 ASSERT(code > 0); 1096 return (code); 1097 } 1098 1099 static raidz_map_t * 1100 vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift, 1101 uint64_t dcols, uint64_t nparity) 1102 { 1103 raidz_map_t *rm; 1104 uint64_t b = offset >> unit_shift; 1105 uint64_t s = size >> unit_shift; 1106 uint64_t f = b % dcols; 1107 uint64_t o = (b / dcols) << unit_shift; 1108 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; 1109 1110 q = s / (dcols - nparity); 1111 r = s - q * (dcols - nparity); 1112 bc = (r == 0 ? 0 : r + nparity); 1113 tot = s + nparity * (q + (r == 0 ? 0 : 1)); 1114 1115 if (q == 0) { 1116 acols = bc; 1117 scols = MIN(dcols, roundup(bc, nparity + 1)); 1118 } else { 1119 acols = dcols; 1120 scols = dcols; 1121 } 1122 1123 ASSERT3U(acols, <=, scols); 1124 1125 rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols])); 1126 1127 rm->rm_cols = acols; 1128 rm->rm_scols = scols; 1129 rm->rm_bigcols = bc; 1130 rm->rm_skipstart = bc; 1131 rm->rm_missingdata = 0; 1132 rm->rm_missingparity = 0; 1133 rm->rm_firstdatacol = nparity; 1134 rm->rm_reports = 0; 1135 rm->rm_freed = 0; 1136 rm->rm_ecksuminjected = 0; 1137 1138 asize = 0; 1139 1140 for (c = 0; c < scols; c++) { 1141 col = f + c; 1142 coff = o; 1143 if (col >= dcols) { 1144 col -= dcols; 1145 coff += 1ULL << unit_shift; 1146 } 1147 rm->rm_col[c].rc_devidx = col; 1148 rm->rm_col[c].rc_offset = coff; 1149 rm->rm_col[c].rc_data = NULL; 1150 rm->rm_col[c].rc_error = 0; 1151 rm->rm_col[c].rc_tried = 0; 1152 rm->rm_col[c].rc_skipped = 0; 1153 1154 if (c >= acols) 1155 rm->rm_col[c].rc_size = 0; 1156 else if (c < bc) 1157 rm->rm_col[c].rc_size = (q + 1) << unit_shift; 1158 else 1159 rm->rm_col[c].rc_size = q << unit_shift; 1160 1161 asize += rm->rm_col[c].rc_size; 1162 } 1163 1164 ASSERT3U(asize, ==, tot << unit_shift); 1165 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); 1166 rm->rm_nskip = roundup(tot, nparity + 1) - tot; 1167 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); 1168 ASSERT3U(rm->rm_nskip, <=, nparity); 1169 1170 for (c = 0; c < rm->rm_firstdatacol; c++) 1171 rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size); 1172 1173 rm->rm_col[c].rc_data = data; 1174 1175 for (c = c + 1; c < acols; c++) 1176 rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + 1177 rm->rm_col[c - 1].rc_size; 1178 1179 /* 1180 * If all data stored spans all columns, there's a danger that parity 1181 * will always be on the same device and, since parity isn't read 1182 * during normal operation, that that device's I/O bandwidth won't be 1183 * used effectively. We therefore switch the parity every 1MB. 1184 * 1185 * ... at least that was, ostensibly, the theory. As a practical 1186 * matter unless we juggle the parity between all devices evenly, we 1187 * won't see any benefit. Further, occasional writes that aren't a 1188 * multiple of the LCM of the number of children and the minimum 1189 * stripe width are sufficient to avoid pessimal behavior. 1190 * Unfortunately, this decision created an implicit on-disk format 1191 * requirement that we need to support for all eternity, but only 1192 * for single-parity RAID-Z. 1193 * 1194 * If we intend to skip a sector in the zeroth column for padding 1195 * we must make sure to note this swap. We will never intend to 1196 * skip the first column since at least one data and one parity 1197 * column must appear in each row. 1198 */ 1199 ASSERT(rm->rm_cols >= 2); 1200 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 1201 1202 if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) { 1203 devidx = rm->rm_col[0].rc_devidx; 1204 o = rm->rm_col[0].rc_offset; 1205 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; 1206 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 1207 rm->rm_col[1].rc_devidx = devidx; 1208 rm->rm_col[1].rc_offset = o; 1209 1210 if (rm->rm_skipstart == 0) 1211 rm->rm_skipstart = 1; 1212 } 1213 1214 return (rm); 1215 } 1216 1217 static void 1218 vdev_raidz_map_free(raidz_map_t *rm) 1219 { 1220 int c; 1221 1222 for (c = rm->rm_firstdatacol - 1; c >= 0; c--) 1223 zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); 1224 1225 zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); 1226 } 1227 1228 static vdev_t * 1229 vdev_child(vdev_t *pvd, uint64_t devidx) 1230 { 1231 vdev_t *cvd; 1232 1233 STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) { 1234 if (cvd->v_id == devidx) 1235 break; 1236 } 1237 1238 return (cvd); 1239 } 1240 1241 /* 1242 * We keep track of whether or not there were any injected errors, so that 1243 * any ereports we generate can note it. 1244 */ 1245 static int 1246 raidz_checksum_verify(const blkptr_t *bp, void *data, uint64_t size) 1247 { 1248 1249 return (zio_checksum_verify(bp, data)); 1250 } 1251 1252 /* 1253 * Generate the parity from the data columns. If we tried and were able to 1254 * read the parity without error, verify that the generated parity matches the 1255 * data we read. If it doesn't, we fire off a checksum error. Return the 1256 * number such failures. 1257 */ 1258 static int 1259 raidz_parity_verify(raidz_map_t *rm) 1260 { 1261 void *orig[VDEV_RAIDZ_MAXPARITY]; 1262 int c, ret = 0; 1263 raidz_col_t *rc; 1264 1265 for (c = 0; c < rm->rm_firstdatacol; c++) { 1266 rc = &rm->rm_col[c]; 1267 if (!rc->rc_tried || rc->rc_error != 0) 1268 continue; 1269 orig[c] = zfs_alloc(rc->rc_size); 1270 bcopy(rc->rc_data, orig[c], rc->rc_size); 1271 } 1272 1273 vdev_raidz_generate_parity(rm); 1274 1275 for (c = rm->rm_firstdatacol - 1; c >= 0; c--) { 1276 rc = &rm->rm_col[c]; 1277 if (!rc->rc_tried || rc->rc_error != 0) 1278 continue; 1279 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { 1280 rc->rc_error = ECKSUM; 1281 ret++; 1282 } 1283 zfs_free(orig[c], rc->rc_size); 1284 } 1285 1286 return (ret); 1287 } 1288 1289 /* 1290 * Iterate over all combinations of bad data and attempt a reconstruction. 1291 * Note that the algorithm below is non-optimal because it doesn't take into 1292 * account how reconstruction is actually performed. For example, with 1293 * triple-parity RAID-Z the reconstruction procedure is the same if column 4 1294 * is targeted as invalid as if columns 1 and 4 are targeted since in both 1295 * cases we'd only use parity information in column 0. 1296 */ 1297 static int 1298 vdev_raidz_combrec(raidz_map_t *rm, const blkptr_t *bp, void *data, 1299 off_t offset, uint64_t bytes, int total_errors, int data_errors) 1300 { 1301 raidz_col_t *rc; 1302 void *orig[VDEV_RAIDZ_MAXPARITY]; 1303 int tstore[VDEV_RAIDZ_MAXPARITY + 2]; 1304 int *tgts = &tstore[1]; 1305 int current, next, i, c, n; 1306 int code, ret = 0; 1307 1308 ASSERT(total_errors < rm->rm_firstdatacol); 1309 1310 /* 1311 * This simplifies one edge condition. 1312 */ 1313 tgts[-1] = -1; 1314 1315 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { 1316 /* 1317 * Initialize the targets array by finding the first n columns 1318 * that contain no error. 1319 * 1320 * If there were no data errors, we need to ensure that we're 1321 * always explicitly attempting to reconstruct at least one 1322 * data column. To do this, we simply push the highest target 1323 * up into the data columns. 1324 */ 1325 for (c = 0, i = 0; i < n; i++) { 1326 if (i == n - 1 && data_errors == 0 && 1327 c < rm->rm_firstdatacol) { 1328 c = rm->rm_firstdatacol; 1329 } 1330 1331 while (rm->rm_col[c].rc_error != 0) { 1332 c++; 1333 ASSERT3S(c, <, rm->rm_cols); 1334 } 1335 1336 tgts[i] = c++; 1337 } 1338 1339 /* 1340 * Setting tgts[n] simplifies the other edge condition. 1341 */ 1342 tgts[n] = rm->rm_cols; 1343 1344 /* 1345 * These buffers were allocated in previous iterations. 1346 */ 1347 for (i = 0; i < n - 1; i++) { 1348 ASSERT(orig[i] != NULL); 1349 } 1350 1351 orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size); 1352 1353 current = 0; 1354 next = tgts[current]; 1355 1356 while (current != n) { 1357 tgts[current] = next; 1358 current = 0; 1359 1360 /* 1361 * Save off the original data that we're going to 1362 * attempt to reconstruct. 1363 */ 1364 for (i = 0; i < n; i++) { 1365 ASSERT(orig[i] != NULL); 1366 c = tgts[i]; 1367 ASSERT3S(c, >=, 0); 1368 ASSERT3S(c, <, rm->rm_cols); 1369 rc = &rm->rm_col[c]; 1370 bcopy(rc->rc_data, orig[i], rc->rc_size); 1371 } 1372 1373 /* 1374 * Attempt a reconstruction and exit the outer loop on 1375 * success. 1376 */ 1377 code = vdev_raidz_reconstruct(rm, tgts, n); 1378 if (raidz_checksum_verify(bp, data, bytes) == 0) { 1379 for (i = 0; i < n; i++) { 1380 c = tgts[i]; 1381 rc = &rm->rm_col[c]; 1382 ASSERT(rc->rc_error == 0); 1383 rc->rc_error = ECKSUM; 1384 } 1385 1386 ret = code; 1387 goto done; 1388 } 1389 1390 /* 1391 * Restore the original data. 1392 */ 1393 for (i = 0; i < n; i++) { 1394 c = tgts[i]; 1395 rc = &rm->rm_col[c]; 1396 bcopy(orig[i], rc->rc_data, rc->rc_size); 1397 } 1398 1399 do { 1400 /* 1401 * Find the next valid column after the current 1402 * position.. 1403 */ 1404 for (next = tgts[current] + 1; 1405 next < rm->rm_cols && 1406 rm->rm_col[next].rc_error != 0; next++) 1407 continue; 1408 1409 ASSERT(next <= tgts[current + 1]); 1410 1411 /* 1412 * If that spot is available, we're done here. 1413 */ 1414 if (next != tgts[current + 1]) 1415 break; 1416 1417 /* 1418 * Otherwise, find the next valid column after 1419 * the previous position. 1420 */ 1421 for (c = tgts[current - 1] + 1; 1422 rm->rm_col[c].rc_error != 0; c++) 1423 continue; 1424 1425 tgts[current] = c; 1426 current++; 1427 1428 } while (current != n); 1429 } 1430 } 1431 n--; 1432 done: 1433 for (i = n - 1; i >= 0; i--) { 1434 zfs_free(orig[i], rm->rm_col[0].rc_size); 1435 } 1436 1437 return (ret); 1438 } 1439 1440 static int 1441 vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data, 1442 off_t offset, size_t bytes) 1443 { 1444 vdev_t *tvd = vd->v_top; 1445 vdev_t *cvd; 1446 raidz_map_t *rm; 1447 raidz_col_t *rc; 1448 int c, error; 1449 int unexpected_errors; 1450 int parity_errors; 1451 int parity_untried; 1452 int data_errors; 1453 int total_errors; 1454 int n; 1455 int tgts[VDEV_RAIDZ_MAXPARITY]; 1456 int code; 1457 1458 rc = NULL; /* gcc */ 1459 error = 0; 1460 1461 rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift, 1462 vd->v_nchildren, vd->v_nparity); 1463 1464 /* 1465 * Iterate over the columns in reverse order so that we hit the parity 1466 * last -- any errors along the way will force us to read the parity. 1467 */ 1468 for (c = rm->rm_cols - 1; c >= 0; c--) { 1469 rc = &rm->rm_col[c]; 1470 cvd = vdev_child(vd, rc->rc_devidx); 1471 if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) { 1472 if (c >= rm->rm_firstdatacol) 1473 rm->rm_missingdata++; 1474 else 1475 rm->rm_missingparity++; 1476 rc->rc_error = ENXIO; 1477 rc->rc_tried = 1; /* don't even try */ 1478 rc->rc_skipped = 1; 1479 continue; 1480 } 1481 #if 0 /* XXX: Too hard for the boot code. */ 1482 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 1483 if (c >= rm->rm_firstdatacol) 1484 rm->rm_missingdata++; 1485 else 1486 rm->rm_missingparity++; 1487 rc->rc_error = ESTALE; 1488 rc->rc_skipped = 1; 1489 continue; 1490 } 1491 #endif 1492 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) { 1493 rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data, 1494 rc->rc_offset, rc->rc_size); 1495 rc->rc_tried = 1; 1496 rc->rc_skipped = 0; 1497 } 1498 } 1499 1500 reconstruct: 1501 unexpected_errors = 0; 1502 parity_errors = 0; 1503 parity_untried = 0; 1504 data_errors = 0; 1505 total_errors = 0; 1506 1507 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); 1508 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); 1509 1510 for (c = 0; c < rm->rm_cols; c++) { 1511 rc = &rm->rm_col[c]; 1512 1513 if (rc->rc_error) { 1514 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 1515 1516 if (c < rm->rm_firstdatacol) 1517 parity_errors++; 1518 else 1519 data_errors++; 1520 1521 if (!rc->rc_skipped) 1522 unexpected_errors++; 1523 1524 total_errors++; 1525 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { 1526 parity_untried++; 1527 } 1528 } 1529 1530 /* 1531 * There are three potential phases for a read: 1532 * 1. produce valid data from the columns read 1533 * 2. read all disks and try again 1534 * 3. perform combinatorial reconstruction 1535 * 1536 * Each phase is progressively both more expensive and less likely to 1537 * occur. If we encounter more errors than we can repair or all phases 1538 * fail, we have no choice but to return an error. 1539 */ 1540 1541 /* 1542 * If the number of errors we saw was correctable -- less than or equal 1543 * to the number of parity disks read -- attempt to produce data that 1544 * has a valid checksum. Naturally, this case applies in the absence of 1545 * any errors. 1546 */ 1547 if (total_errors <= rm->rm_firstdatacol - parity_untried) { 1548 if (data_errors == 0) { 1549 if (raidz_checksum_verify(bp, data, bytes) == 0) { 1550 /* 1551 * If we read parity information (unnecessarily 1552 * as it happens since no reconstruction was 1553 * needed) regenerate and verify the parity. 1554 * We also regenerate parity when resilvering 1555 * so we can write it out to the failed device 1556 * later. 1557 */ 1558 if (parity_errors + parity_untried < 1559 rm->rm_firstdatacol) { 1560 n = raidz_parity_verify(rm); 1561 unexpected_errors += n; 1562 ASSERT(parity_errors + n <= 1563 rm->rm_firstdatacol); 1564 } 1565 goto done; 1566 } 1567 } else { 1568 /* 1569 * We either attempt to read all the parity columns or 1570 * none of them. If we didn't try to read parity, we 1571 * wouldn't be here in the correctable case. There must 1572 * also have been fewer parity errors than parity 1573 * columns or, again, we wouldn't be in this code path. 1574 */ 1575 ASSERT(parity_untried == 0); 1576 ASSERT(parity_errors < rm->rm_firstdatacol); 1577 1578 /* 1579 * Identify the data columns that reported an error. 1580 */ 1581 n = 0; 1582 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1583 rc = &rm->rm_col[c]; 1584 if (rc->rc_error != 0) { 1585 ASSERT(n < VDEV_RAIDZ_MAXPARITY); 1586 tgts[n++] = c; 1587 } 1588 } 1589 1590 ASSERT(rm->rm_firstdatacol >= n); 1591 1592 code = vdev_raidz_reconstruct(rm, tgts, n); 1593 1594 if (raidz_checksum_verify(bp, data, bytes) == 0) { 1595 /* 1596 * If we read more parity disks than were used 1597 * for reconstruction, confirm that the other 1598 * parity disks produced correct data. This 1599 * routine is suboptimal in that it regenerates 1600 * the parity that we already used in addition 1601 * to the parity that we're attempting to 1602 * verify, but this should be a relatively 1603 * uncommon case, and can be optimized if it 1604 * becomes a problem. Note that we regenerate 1605 * parity when resilvering so we can write it 1606 * out to failed devices later. 1607 */ 1608 if (parity_errors < rm->rm_firstdatacol - n) { 1609 n = raidz_parity_verify(rm); 1610 unexpected_errors += n; 1611 ASSERT(parity_errors + n <= 1612 rm->rm_firstdatacol); 1613 } 1614 1615 goto done; 1616 } 1617 } 1618 } 1619 1620 /* 1621 * This isn't a typical situation -- either we got a read 1622 * error or a child silently returned bad data. Read every 1623 * block so we can try again with as much data and parity as 1624 * we can track down. If we've already been through once 1625 * before, all children will be marked as tried so we'll 1626 * proceed to combinatorial reconstruction. 1627 */ 1628 unexpected_errors = 1; 1629 rm->rm_missingdata = 0; 1630 rm->rm_missingparity = 0; 1631 1632 n = 0; 1633 for (c = 0; c < rm->rm_cols; c++) { 1634 rc = &rm->rm_col[c]; 1635 1636 if (rc->rc_tried) 1637 continue; 1638 1639 cvd = vdev_child(vd, rc->rc_devidx); 1640 ASSERT(cvd != NULL); 1641 rc->rc_error = cvd->v_read(cvd, NULL, 1642 rc->rc_data, rc->rc_offset, rc->rc_size); 1643 if (rc->rc_error == 0) 1644 n++; 1645 rc->rc_tried = 1; 1646 rc->rc_skipped = 0; 1647 } 1648 /* 1649 * If we managed to read anything more, retry the 1650 * reconstruction. 1651 */ 1652 if (n > 0) 1653 goto reconstruct; 1654 1655 /* 1656 * At this point we've attempted to reconstruct the data given the 1657 * errors we detected, and we've attempted to read all columns. There 1658 * must, therefore, be one or more additional problems -- silent errors 1659 * resulting in invalid data rather than explicit I/O errors resulting 1660 * in absent data. We check if there is enough additional data to 1661 * possibly reconstruct the data and then perform combinatorial 1662 * reconstruction over all possible combinations. If that fails, 1663 * we're cooked. 1664 */ 1665 if (total_errors > rm->rm_firstdatacol) { 1666 error = EIO; 1667 } else if (total_errors < rm->rm_firstdatacol && 1668 (code = vdev_raidz_combrec(rm, bp, data, offset, bytes, 1669 total_errors, data_errors)) != 0) { 1670 /* 1671 * If we didn't use all the available parity for the 1672 * combinatorial reconstruction, verify that the remaining 1673 * parity is correct. 1674 */ 1675 if (code != (1 << rm->rm_firstdatacol) - 1) 1676 (void) raidz_parity_verify(rm); 1677 } else { 1678 /* 1679 * We're here because either: 1680 * 1681 * total_errors == rm_first_datacol, or 1682 * vdev_raidz_combrec() failed 1683 * 1684 * In either case, there is enough bad data to prevent 1685 * reconstruction. 1686 * 1687 * Start checksum ereports for all children which haven't 1688 * failed, and the IO wasn't speculative. 1689 */ 1690 error = ECKSUM; 1691 } 1692 1693 done: 1694 vdev_raidz_map_free(rm); 1695 1696 return (error); 1697 } 1698