/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

static uint64_t zfs_crc64_table[256];

#define	ECKSUM	666

#define	ASSERT(...)	do { } while (0)
#define	ASSERT3U(...)	do { } while (0)
#define	ASSERT3S(...)	do { } while (0)

#define	panic(...)	do {				\
	printf(__VA_ARGS__);				\
	for (;;) ;					\
} while (0)

#define	kmem_alloc(size, flag)	zfs_alloc((size))
#define	kmem_free(ptr, size)	zfs_free((ptr), (size))

static void
zfs_init_crc(void)
{
	int i, j;
	uint64_t *ct;

	/*
	 * Calculate the crc64 table (used for the zap hash
	 * function).
	 */
	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
		for (i = 0; i < 256; i++)
			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
	}
}

static void
zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}

/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);

/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
	zio_checksum_t	*ci_func[2];	/* checksum function for each byteorder */
	int		ci_correctable;	/* number of correctable bits */
	int		ci_eck;		/* uses zio embedded checksum? */
	int		ci_dedup;	/* strong enough for dedup? */
	const char	*ci_name;	/* descriptive name */
} zio_checksum_info_t;

#include "fletcher.c"
#include "sha256.c"

static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL, NULL}, 0, 0, 0, "inherit"},
	{{NULL, NULL}, 0, 0, 0, "on"},
	{{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "off"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "label"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "gang_header"},
	{{fletcher_2_native, fletcher_2_byteswap}, 0, 1, 0, "zilog"},
	{{fletcher_2_native, fletcher_2_byteswap}, 0, 0, 0, "fletcher2"},
	{{fletcher_4_native, fletcher_4_byteswap}, 1, 0, 0, "fletcher4"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "SHA256"},
	{{fletcher_4_native, fletcher_4_byteswap}, 0, 1, 0, "zilog2"},
};

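/*
 * Illustrative only (disabled): a minimal sketch of how the table above
 * is meant to be consumed -- look up the entry for a checksum number and
 * dispatch on byte order.  The helper name and arguments are hypothetical,
 * not part of this file.
 */
#if 0
static void
example_checksum_dispatch(unsigned int cksum, int byteswap, const void *buf,
    uint64_t size, zio_cksum_t *zcp)
{
	zio_checksum_info_t *ci = &zio_checksum_table[cksum];

	/* ci_func[0] is native byte order, ci_func[1] is byteswapped. */
	if (ci->ci_func[byteswap] != NULL)
		ci->ci_func[byteswap](buf, size, zcp);
}
#endif
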
/*
 * Common signature for all zio compress/decompress functions.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
	zio_compress_func_t	*ci_compress;	/* compression function */
	zio_decompress_func_t	*ci_decompress;	/* decompression function */
	int			ci_level;	/* level parameter */
	const char		*ci_name;	/* algorithm name */
} zio_compress_info_t;

#include "lzjb.c"
#include "zle.c"
#include "lz4.c"

/*
 * Compression vectors.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
	{NULL, NULL, 0, "inherit"},
	{NULL, NULL, 0, "on"},
	{NULL, NULL, 0, "uncompressed"},
	{NULL, lzjb_decompress, 0, "lzjb"},
	{NULL, NULL, 0, "empty"},
	{NULL, NULL, 1, "gzip-1"},
	{NULL, NULL, 2, "gzip-2"},
	{NULL, NULL, 3, "gzip-3"},
	{NULL, NULL, 4, "gzip-4"},
	{NULL, NULL, 5, "gzip-5"},
	{NULL, NULL, 6, "gzip-6"},
	{NULL, NULL, 7, "gzip-7"},
	{NULL, NULL, 8, "gzip-8"},
	{NULL, NULL, 9, "gzip-9"},
	{NULL, zle_decompress, 64, "zle"},
	{NULL, lz4_decompress, 0, "lz4"},
};

static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	int i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}

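/*
 * Worked example (disabled): byteswap_uint64_array() swaps each 64-bit
 * word in place, so a two-word buffer { 0x0102030405060708, 0x1 }
 * becomes { 0x0807060504030201, 0x0100000000000000 }.  The values are
 * illustrative only.
 */
#if 0
static void
example_byteswap(void)
{
	uint64_t buf[2] = { 0x0102030405060708ULL, 0x0000000000000001ULL };

	byteswap_uint64_array(buf, sizeof (buf));
	/* buf[0] == 0x0807060504030201, buf[1] == 0x0100000000000000 */
}
#endif
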
/*
 * Set the external verifier for a gang block based on <vdev, offset, txg>,
 * a tuple which is guaranteed to be unique for the life of the pool.
 */
static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
{
	const dva_t *dva = BP_IDENTITY(bp);
	uint64_t txg = BP_PHYSICAL_BIRTH(bp);

	ASSERT(BP_IS_GANG(bp));

	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
}

/*
 * Set the external verifier for a label block based on its offset.
 * The vdev is implicit, and the txg is unknowable at pool open time --
 * hence the logic in vdev_uberblock_load() to find the most recent copy.
 */
static void
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
{
	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
}

static int
zio_checksum_verify(const blkptr_t *bp, void *data)
{
	uint64_t size;
	unsigned int checksum;
	zio_checksum_info_t *ci;
	zio_cksum_t actual_cksum, expected_cksum, verifier;
	int byteswap;

	checksum = BP_GET_CHECKSUM(bp);
	size = BP_GET_PSIZE(bp);

	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
		return (EINVAL);
	ci = &zio_checksum_table[checksum];
	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
		return (EINVAL);

	if (ci->ci_eck) {
		zio_eck_t *eck;

		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
		    checksum == ZIO_CHECKSUM_LABEL);

		eck = (zio_eck_t *)((char *)data + size) - 1;

		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
			zio_checksum_gang_verifier(&verifier, bp);
		else if (checksum == ZIO_CHECKSUM_LABEL)
			zio_checksum_label_verifier(&verifier,
			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
		else
			verifier = bp->blk_cksum;

		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

		if (byteswap)
			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

		expected_cksum = eck->zec_cksum;
		eck->zec_cksum = verifier;
		ci->ci_func[byteswap](data, size, &actual_cksum);
		eck->zec_cksum = expected_cksum;

		if (byteswap)
			byteswap_uint64_array(&expected_cksum,
			    sizeof (zio_cksum_t));
	} else {
		expected_cksum = bp->blk_cksum;
		ci->ci_func[0](data, size, &actual_cksum);
	}

	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
		/*printf("ZFS: read checksum failed\n");*/
		return (EIO);
	}

	return (0);
}

static int
zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
    void *dest, uint64_t destsize)
{
	zio_compress_info_t *ci;

	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
		return (EIO);
	}

	ci = &zio_compress_table[cpfunc];
	if (!ci->ci_decompress) {
		printf("ZFS: unsupported compression algorithm %s\n",
		    ci->ci_name);
		return (EIO);
	}

	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
}

static uint64_t
zap_hash(uint64_t salt, const char *name)
{
	const uint8_t *cp;
	uint8_t c;
	uint64_t crc = salt;

	ASSERT(crc != 0);
	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];

	/*
	 * Only use 28 bits, since we need 4 bits in the cookie for the
	 * collision differentiator.  We MUST use the high bits, since
	 * those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);

	return (crc);
}

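/*
 * Illustrative only (disabled): the upper ZAP_HASHBITS bits kept by
 * zap_hash() are what select the hash bucket; the low bits stay free for
 * the 4-bit collision differentiator mentioned above.  The helper name
 * and the modulo bucket selection below are hypothetical, purely to show
 * that the high-order bits carry the bucket choice.
 */
#if 0
static uint64_t
example_zap_bucket(uint64_t salt, const char *name, uint64_t nbuckets)
{
	uint64_t h = zap_hash(salt, name);

	/* Choose a bucket from the high-order bits of the hash. */
	return ((h >> (64 - ZAP_HASHBITS)) % nbuckets);
}
#endif
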
static void *zfs_alloc(size_t size);
static void zfs_free(void *ptr, size_t size);

typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}

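/*
 * Worked example (disabled): multiplying the packed bytes of
 * 0x8000000000000001 by 2 in GF(2^8).  The 0x80 byte overflows and is
 * reduced by the field polynomial (XOR 0x1d); the 0x01 byte simply
 * shifts.  The result is 0x1d00000000000002 -- one field multiplication
 * per byte, carried out in a single 64-bit operation.
 */
#if 0
static void
example_64mul(void)
{
	uint64_t x = 0x8000000000000001ULL, mask;

	VDEV_RAIDZ_64MUL_2(x, mask);
	/* x == 0x1d00000000000002 */
}
#endif
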
/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

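/*
 * Illustrative only (disabled): the two tables are inverses of each other
 * (vdev_raidz_pow2[vdev_raidz_log2[a]] == a for a != 0), so a full
 * GF(2^8) multiply can be done with two log lookups, an addition reduced
 * mod 255, and one power lookup.  This helper is a sketch for reference,
 * not used by the code below.
 */
#if 0
static uint8_t
example_gf_mul(uint8_t a, uint8_t b)
{
	int exp;

	if (a == 0 || b == 0)
		return (0);

	/* log(a*b) = log(a) + log(b); the generator 2 has order 255. */
	exp = vdev_raidz_log2[a] + vdev_raidz_log2[b];
	if (exp > 255)
		exp -= 255;
	return (vdev_raidz_pow2[exp]);
}
#endif
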
/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint8_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p, *src, pcount, ccount, i;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p = *src;
			}
		} else {
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p ^= *src;
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}

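/*
 * Worked example (disabled): P is a plain XOR across the data columns, so
 * any single missing data word is the XOR of P with the surviving data
 * words.  With data words 0x11 and 0x22, P is 0x33; losing the first
 * word, 0x33 ^ 0x22 recovers 0x11.  The values are illustrative only.
 */
#if 0
static void
example_p_recovery(void)
{
	uint64_t d0 = 0x11, d1 = 0x22;
	uint64_t p = d0 ^ d1;		/* parity generation */
	uint64_t recovered = p ^ d1;	/* reconstruct the missing d0 */

	ASSERT(recovered == d0);
}
#endif
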
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		panic("invalid RAID-Z configuration");
	}
}

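/*
 * Illustrative only (disabled): the Q loops above evaluate
 * q = d_0*2^(n-1) + d_1*2^(n-2) + ... + d_{n-1} over GF(2^8) by Horner's
 * rule -- multiply the accumulator by 2, then add (XOR) the next column.
 * A byte-at-a-time rendering of the same recurrence, for reference:
 */
#if 0
static uint8_t
example_q_byte(const uint8_t *d, int n)
{
	uint8_t q = 0;
	int c;

	for (c = 0; c < n; c++)
		q = VDEV_RAIDZ_MUL_2(q) ^ d[c];
	return (q);
}
#endif
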
/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
 * computation as well as linear separability.
 *
 *      __               __                   __     __
 *      |   1   ..  1 1 1  |                  |  p_0  |
 *      | 2^n-1 ..  4 2 1  |    __     __     |   :   |
 *      | 4^n-1 .. 16 4 1  |    |  D_0  |     | p_m-1 |
 *      |   1   ..  0 0 0  |    |  D_1  |     |  d_0  |
 *      |   0   ..  0 0 0  | x  |  D_2  |  =  |  d_1  |
 *      |   :       : : :  |    |   :   |     |  d_2  |
 *      |   0   ..  1 0 0  |    | D_n-1 |     |   :   |
 *      |   0   ..  0 1 0  |    ~~     ~~     |   :   |
 *      |   0   ..  0 0 1  |                  | d_n-1 |
 *      ~~               ~~                   ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
 *           |  19 205 116  29  64  16  4   1  |      / /
 *           |  1   0   0   0   0   0   0   0  |     / /
 *           |  0   1   0   0   0   0   0   0  | <--' /
 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
 *           |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           |  19 205 116  29  64  16  4   1  |
 *           |  1   0   0   0   0   0   0   0  |
 *  (V|I)' = |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 * __                                                                 __
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0   1   0   0   0   0   0    166 100  4  40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   0   0   0   0   0   0    167 100  5  41 159 169 217 208 |
 * |  0   0   1   0   0   0   0   0    166 100  4  40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 *
 *                   __                               __
 *                   |  0   0   1   0   0   0   0   0  |
 *                   | 167 100  5   41 159 169 217 208 |
 *                   | 166 100  4   40 158 168 216 209 |
 *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
 *                   |  0   0   0   0   1   0   0   0  |
 *                   |  0   0   0   0   0   1   0   0  |
 *                   |  0   0   0   0   0   0   1   0  |
 *                   |  0   0   0   0   0   0   0   1  |
 *                   ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

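/*
 * Illustrative only (disabled): once the inverse rows are known, each
 * missing data value is a GF(2^8) dot product of one inverse row with the
 * surviving (d|p)' values, mirroring D = (V|I)'^-1 x (d|p)'.  A
 * byte-at-a-time sketch, using the example_gf_mul() sketch above (both
 * disabled; the names are hypothetical):
 */
#if 0
static uint8_t
example_apply_inverse_row(const uint8_t *invrow, const uint8_t *dp, int n)
{
	uint8_t sum = 0;
	int j;

	for (j = 0; j < n; j++)
		sum ^= example_gf_mul(invrow[j], dp[j]);
	return (sum);
}
#endif
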
static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}

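/*
 * Worked example: for the Q row (map[i] == 1) with n == 8, the loop above
 * yields 2^7 .. 2^0, i.e. { 128, 64, 32, 16, 8, 4, 2, 1 }; for the R row
 * (map[i] == 2) it yields 2^14, 2^12, .., 2^0, i.e.
 * { 19, 205, 116, 29, 64, 16, 4, 1 } -- exactly the V rows pictured in
 * the comment above.
 */
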
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows is properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}

static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	log = 0;	/* gcc */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = zfs_alloc(psize);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	zfs_free(p, psize);
}

static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;

	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}

static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}

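/*
 * Worked example for the geometry computed by vdev_raidz_map_alloc()
 * below, under assumed values: a 5-wide RAID-Z1 (dcols = 5, nparity = 1)
 * with 512-byte sectors (unit_shift = 9) and a 3-sector write (s = 3)
 * gives q = 3 / 4 = 0, r = 3, bc = r + nparity = 4 oversized columns, and
 * tot = 3 + 1 * (0 + 1) = 4 sectors of total I/O; since q == 0,
 * acols = bc = 4 and scols = MIN(5, roundup(4, 2)) = 4.
 */
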
static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	uint64_t b = offset >> unit_shift;
	uint64_t s = size >> unit_shift;
	uint64_t f = b % dcols;
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	bc = (r == 0 ? 0 : r + nparity);
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	if (q == 0) {
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);

	rm->rm_col[c].rc_data = data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}

static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
		zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);

	zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

static vdev_t *
vdev_child(vdev_t *pvd, uint64_t devidx)
{
	vdev_t *cvd;

	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
		if (cvd->v_id == devidx)
			break;
	}

	return (cvd);
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(const blkptr_t *bp, void *data, uint64_t size)
{

	return (zio_checksum_verify(bp, data));
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zfs_alloc(rc->rc_size);
		bcopy(rc->rc_data, orig[c], rc->rc_size);
	}

	vdev_raidz_generate_parity(rm);

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			rc->rc_error = ECKSUM;
			ret++;
		}
		zfs_free(orig[c], rc->rc_size);
	}

	return (ret);
}

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 */
static int
vdev_raidz_combrec(raidz_map_t *rm, const blkptr_t *bp, void *data,
    off_t offset, uint64_t bytes, int total_errors, int data_errors)
{
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(bp, data, bytes) == 0) {
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	n--;
done:
	for (i = n - 1; i >= 0; i--) {
		zfs_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}

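/*
 * Illustrative only (disabled): the loop above varies the lowest target
 * fastest, so with 4 columns and n = 2 (and no known-bad columns) the
 * candidate sets come out as (0,1), (0,2), (1,2), (0,3), (1,3), (2,3).
 * A minimal sketch of that ordering; the helper name is hypothetical:
 */
#if 0
static void
example_enumerate_pairs(int ncols)
{
	int a, b;

	for (b = 1; b < ncols; b++)
		for (a = 0; a < b; a++)
			printf("try columns (%d,%d)\n", a, b);
}
#endif
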
static int
vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
    off_t offset, size_t bytes)
{
	vdev_t *tvd = vd->v_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, error;
	int unexpected_errors;
	int parity_errors;
	int parity_untried;
	int data_errors;
	int total_errors;
	int n;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	rc = NULL;	/* gcc */
	error = 0;

	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
	    vd->v_nchildren, vd->v_nparity);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vdev_child(vd, rc->rc_devidx);
		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
#if 0	/* XXX: Too hard for the boot code. */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
#endif
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
			    rc->rc_offset, rc->rc_size);
			rc->rc_tried = 1;
			rc->rc_skipped = 0;
		}
	}

reconstruct:
	unexpected_errors = 0;
	parity_errors = 0;
	parity_untried = 0;
	data_errors = 0;
	total_errors = 0;

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(bp, data, bytes) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(bp, data, bytes) == 0) {
				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	n = 0;
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_tried)
			continue;

		cvd = vdev_child(vd, rc->rc_devidx);
		ASSERT(cvd != NULL);
		rc->rc_error = cvd->v_read(cvd, NULL,
		    rc->rc_data, rc->rc_offset, rc->rc_size);
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}
	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n > 0)
		goto reconstruct;

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		error = EIO;
	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(rm, bp, data, offset, bytes,
	    total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		error = ECKSUM;
	}

done:
	vdev_raidz_map_free(rm);

	return (error);
}