/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

static uint64_t zfs_crc64_table[256];

#define	ECKSUM	666

#define	ASSERT3S(x, y, z)	((void)0)
#define	ASSERT3U(x, y, z)	((void)0)
#define	ASSERT3P(x, y, z)	((void)0)
#define	ASSERT0(x)		((void)0)
#define	ASSERT(x)		((void)0)

#define	panic(...)	do {						\
	printf(__VA_ARGS__);						\
	for (;;) ;							\
} while (0)

#define	kmem_alloc(size, flag)	zfs_alloc((size))
#define	kmem_free(ptr, size)	zfs_free((ptr), (size))

static void
zfs_init_crc(void)
{
	int i, j;
	uint64_t *ct;

	/*
	 * Calculate the crc64 table (used for the zap hash
	 * function).
	 */
	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
		for (i = 0; i < 256; i++)
			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
	}
}

static void
zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}

/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);

/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
	zio_checksum_t	*ci_func[2];	/* checksum function for each byteorder */
	int		ci_correctable;	/* number of correctable bits */
	int		ci_eck;		/* uses zio embedded checksum? */
	int		ci_dedup;	/* strong enough for dedup? */
	const char	*ci_name;	/* descriptive name */
} zio_checksum_info_t;

#include "blkptr.c"

#include "fletcher.c"
#include "sha256.c"

static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL, NULL}, 0, 0, 0, "inherit"},
	{{NULL, NULL}, 0, 0, 0, "on"},
	{{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "off"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "label"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "gang_header"},
	{{fletcher_2_native, fletcher_2_byteswap}, 0, 1, 0, "zilog"},
	{{fletcher_2_native, fletcher_2_byteswap}, 0, 0, 0, "fletcher2"},
	{{fletcher_4_native, fletcher_4_byteswap}, 1, 0, 0, "fletcher4"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "SHA256"},
	{{fletcher_4_native, fletcher_4_byteswap}, 0, 1, 0, "zilog2"},
};
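
/*
 * For example, a block pointer whose BP_GET_CHECKSUM(bp) is
 * ZIO_CHECKSUM_FLETCHER_4 selects the "fletcher4" entry above, since the
 * table is indexed by the ZIO_CHECKSUM_* value stored in the block
 * pointer.  ci_func[0] is the native-byte-order implementation and
 * ci_func[1] the byteswapped one; for embedded-checksum blocks,
 * zio_checksum_verify() below picks between them based on the byte order
 * of the zec_magic field.
 */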

/*
 * Common signature for all zio compress/decompress functions.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
	zio_compress_func_t	*ci_compress;	/* compression function */
	zio_decompress_func_t	*ci_decompress;	/* decompression function */
	int			ci_level;	/* level parameter */
	const char		*ci_name;	/* algorithm name */
} zio_compress_info_t;

#include "lzjb.c"
#include "zle.c"
#include "lz4.c"

/*
 * Compression vectors.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
	{NULL, NULL, 0, "inherit"},
	{NULL, NULL, 0, "on"},
	{NULL, NULL, 0, "uncompressed"},
	{NULL, lzjb_decompress, 0, "lzjb"},
	{NULL, NULL, 0, "empty"},
	{NULL, NULL, 1, "gzip-1"},
	{NULL, NULL, 2, "gzip-2"},
	{NULL, NULL, 3, "gzip-3"},
	{NULL, NULL, 4, "gzip-4"},
	{NULL, NULL, 5, "gzip-5"},
	{NULL, NULL, 6, "gzip-6"},
	{NULL, NULL, 7, "gzip-7"},
	{NULL, NULL, 8, "gzip-8"},
	{NULL, NULL, 9, "gzip-9"},
	{NULL, zle_decompress, 64, "zle"},
	{NULL, lz4_decompress, 0, "lz4"},
};
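
/*
 * Note that every ci_compress slot is NULL: the boot code only ever reads
 * from the pool, so only the decompression half of each vector is wired
 * up, and only for the algorithms the loader can decode (lzjb, zle and
 * lz4).  The gzip levels remain listed so that the table indices still
 * line up with the on-disk ZIO_COMPRESS_* values.
 */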

static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	int i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}

/*
 * Set the external verifier for a gang block based on <vdev, offset, txg>,
 * a tuple which is guaranteed to be unique for the life of the pool.
 */
static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
{
	const dva_t *dva = BP_IDENTITY(bp);
	uint64_t txg = BP_PHYSICAL_BIRTH(bp);

	ASSERT(BP_IS_GANG(bp));

	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
}

/*
 * Set the external verifier for a label block based on its offset.
 * The vdev is implicit, and the txg is unknowable at pool open time --
 * hence the logic in vdev_uberblock_load() to find the most recent copy.
 */
static void
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
{
	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
}

static int
zio_checksum_verify(const blkptr_t *bp, void *data)
{
	uint64_t size;
	unsigned int checksum;
	zio_checksum_info_t *ci;
	zio_cksum_t actual_cksum, expected_cksum, verifier;
	int byteswap;

	checksum = BP_GET_CHECKSUM(bp);
	size = BP_GET_PSIZE(bp);

	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
		return (EINVAL);
	ci = &zio_checksum_table[checksum];
	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
		return (EINVAL);

	if (ci->ci_eck) {
		zio_eck_t *eck;

		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
		    checksum == ZIO_CHECKSUM_LABEL);

		eck = (zio_eck_t *)((char *)data + size) - 1;

		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
			zio_checksum_gang_verifier(&verifier, bp);
		else if (checksum == ZIO_CHECKSUM_LABEL)
			zio_checksum_label_verifier(&verifier,
			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
		else
			verifier = bp->blk_cksum;

		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

		if (byteswap)
			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

		expected_cksum = eck->zec_cksum;
		eck->zec_cksum = verifier;
		ci->ci_func[byteswap](data, size, &actual_cksum);
		eck->zec_cksum = expected_cksum;

		if (byteswap)
			byteswap_uint64_array(&expected_cksum,
			    sizeof (zio_cksum_t));
	} else {
		expected_cksum = bp->blk_cksum;
		ci->ci_func[0](data, size, &actual_cksum);
	}

	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
		/*printf("ZFS: read checksum failed\n");*/
		return (EIO);
	}

	return (0);
}

static int
zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
    void *dest, uint64_t destsize)
{
	zio_compress_info_t *ci;

	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
		return (EIO);
	}

	ci = &zio_compress_table[cpfunc];
	if (!ci->ci_decompress) {
		printf("ZFS: unsupported compression algorithm %s\n",
		    ci->ci_name);
		return (EIO);
	}

	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
}

static uint64_t
zap_hash(uint64_t salt, const char *name)
{
	const uint8_t *cp;
	uint8_t c;
	uint64_t crc = salt;

	ASSERT(crc != 0);
	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];

	/*
	 * Only use 28 bits, since we need 4 bits in the cookie for the
	 * collision differentiator.  We MUST use the high bits, since
	 * those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);

	return (crc);
}
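
/*
 * A minimal usage sketch, assuming a caller that has already read the ZAP
 * object's zap_phys_t (the "zap" variable below belongs to that
 * hypothetical caller, not to this file):
 *
 *	uint64_t h = zap_hash(zap->zap_salt, "bootfs");
 *
 * Only the top ZAP_HASHBITS bits of h are meaningful; the fat-ZAP lookup
 * code shifts them down to pick a pointer-table bucket.
 */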

static void *zfs_alloc(size_t size);
static void zfs_free(void *ptr, size_t size);

typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t rm_freed;		/* map no longer has referencing ZIO */
	uint8_t rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
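
/*
 * Worked example: RAID-Z arithmetic takes place in GF(2^8) with the
 * generator polynomial x^8 + x^4 + x^3 + x^2 + 1, whose low byte is 0x1d.
 * Doubling 0x80 shifts the set high bit out of the byte, so the product
 * is reduced by XORing with 0x1d: as a byte, VDEV_RAIDZ_MUL_2(0x80) ==
 * 0x1d (see vdev_raidz_pow2[8] below).  Doubling a value with the high
 * bit clear is a plain shift, e.g. VDEV_RAIDZ_MUL_2(0x03) == 0x06.
 * VDEV_RAIDZ_64MUL_2 applies the same rule to eight packed bytes at once:
 * the mask expression turns each byte's top bit into 0xff or 0x00,
 * selecting which bytes receive the 0x1d reduction.
 */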

/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above.  These values were computed by repeatedly multiplying by 2 as above.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
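
/*
 * The two tables invert one another: vdev_raidz_pow2[vdev_raidz_log2[a]]
 * == a for any non-zero a.  A field multiplication can therefore be done
 * as a log lookup, an addition modulo 255 and a pow lookup.  For example,
 * 2 * 4: vdev_raidz_log2[2] + vdev_raidz_log2[4] == 0x01 + 0x02 == 0x03,
 * and vdev_raidz_pow2[0x03] == 0x08.  vdev_raidz_exp2() below implements
 * exactly this, including the mod-255 wraparound.
 */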

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint8_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p, *src, pcount, ccount, i;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p = *src;
			}
		} else {
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p ^= *src;
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}
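
/*
 * The Q column above is evaluated Horner-style: doubling the running
 * total before each XOR weights earlier columns by higher powers of 2, so
 * with data columns D_0 .. D_n-1 the result is
 *
 *	Q = D_0 * 2^(n-1) + D_1 * 2^(n-2) + ... + D_n-1
 *
 * over GF(2^8), i.e. every data column gets a distinct coefficient.
 * That is what makes Q linearly independent of P (which uses coefficient
 * 1 everywhere) and lets any two missing columns of a raidz2 stripe be
 * recovered.
 */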

static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		panic("invalid RAID-Z configuration");
	}
}
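
/*
 * rm_firstdatacol doubles as the parity count: a raidz1 map keeps one
 * parity column (P), raidz2 two (P and Q), and raidz3 three (P, Q and R),
 * with the data columns following.  Any other value indicates a corrupted
 * map, hence the panic above.
 */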

/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks.  This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |       __   __       |  p_0  |
 *            |  V  |       | D_0 |       | p_m-1 |
 *            |     |   x   |  :  |   =   |  d_0  |
 *            |  I  |       | D_n-1|      |   :   |
 *            |     |       ~~   ~~       | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4).  Note that these values were chosen for simplicity, speed of
 * computation, and linear separability.
 *
 *      __                   __               __   __
 *      |  1    ..  1   1   1 |               | p_0 |
 *      | 2^n-1 ..  4   2   1 |   __    __    |  :  |
 *      | 4^n-1 .. 16   4   1 |   |  D_0 |    | p_m-1 |
 *      |  1    ..  0   0   0 |   |  D_1 |    | d_0 |
 *      |  0    ..  0   0   0 | x |  D_2 | =  | d_1 |
 *      |  :        :   :   : |   |   :  |    | d_2 |
 *      |  0    ..  1   0   0 |   | D_n-1|    |  :  |
 *      |  0    ..  0   1   0 |   ~~    ~~    |  :  |
 *      |  0    ..  0   0   1 |               | d_n-1 |
 *      ~~                   ~~               ~~   ~~
 *
 * Note that I, V, d, and p are known.  To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values.  We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'.  We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination.  In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16   8   4   2   1  | <-----+-+-- missing disks
 *           |  19 205 116  29  64  16   4   1  |      / /
 *           |  1   0   0   0   0   0   0   0  |     / /
 *           |  0   1   0   0   0   0   0   0  | <--'  /
 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
 *           |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           |  19 205 116  29  64  16   4   1  |
 *           |  1   0   0   0   0   0   0   0  |
 *  (V|I)' = |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'.  We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 *  __                                                                 __
 *  |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 *  | 19  205 116  29  64  16   4   1    0   1   0   0   0   0   0   0  |
 *  |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *  |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *  |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *  |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *  |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *  |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *  ~~                                                                 ~~
 *  __                                                                 __
 *  |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *  |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 *  | 19  205 116  29  64  16   4   1    0   1   0   0   0   0   0   0  |
 *  |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *  |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *  |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *  |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *  |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *  ~~                                                                 ~~
 *  __                                                                 __
 *  |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *  |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 *  |  0  205 116  0   0   0   0   0     0   1  19  29  64  16   4   1  |
 *  |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *  |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *  |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *  |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *  |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *  ~~                                                                 ~~
 *  __                                                                 __
 *  |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *  |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 *  |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
 *  |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *  |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *  |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *  |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *  |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *  ~~                                                                 ~~
 *  __                                                                 __
 *  |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *  |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 *  |  0   0   1   0   0   0   0   0    166 100  4  40 158 168 216 209 |
 *  |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *  |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *  |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *  |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *  |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *  ~~                                                                 ~~
 *  __                                                                 __
 *  |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *  |  0   1   0   0   0   0   0   0    167 100  5  41 159 169 217 208 |
 *  |  0   0   1   0   0   0   0   0    166 100  4  40 158 168 216 209 |
 *  |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *  |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *  |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *  |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *  |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *  ~~                                                                 ~~
 *
 *                __                               __
 *                |  0   0   1   0   0   0   0   0  |
 *                | 167 100  5  41  159 169 217 208 |
 *                | 166 100  4  40  158 168 216 209 |
 *  (V|I)'^-1 =   |  0   0   0   1   0   0   0   0  |
 *                |  0   0   0   0   1   0   0   0  |
 *                |  0   0   0   0   0   1   0   0  |
 *                |  0   0   0   0   0   0   1   0  |
 *                |  0   0   0   0   0   0   0   1  |
 *                ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct.  Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid.  For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
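
/*
 * Worked example: map[i] selects the parity row being rebuilt (0 for P, 1
 * for Q, 2 for R).  For a Q row (map[i] == 1) over n == 4 data columns,
 * pow starts at 4 and the loop emits
 *
 *	{ vdev_raidz_pow2[3], vdev_raidz_pow2[2],
 *	  vdev_raidz_pow2[1], vdev_raidz_pow2[0] } == { 8, 4, 2, 1 },
 *
 * exactly the coefficients 2^(n-1) .. 2^0 that
 * vdev_raidz_generate_parity_pq() used to compute Q in the first place.
 */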

static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows is properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}

static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	log = 0;	/* gcc */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = zfs_alloc(psize);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	zfs_free(p, psize);
}
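
/*
 * The multiply in the inner loop above runs in the log domain: for a
 * non-zero source byte, *src times invrows[cc][i] is computed as
 * vdev_raidz_pow2[(vdev_raidz_log2[*src] + invlog[cc][i]) mod 255].
 * Caching the logs of the inverse-matrix entries up front turns each
 * reconstructed byte into one table lookup, one add and one more lookup.
 */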

static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;

	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}

static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}
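
/*
 * The return value is a bitmask of the parity columns consumed.  For
 * example, rebuilding one data column of a raidz3 stripe whose P column
 * is also dead skips P (it is in tgts) and uses Q, yielding code == 0x2
 * (1 << VDEV_RAIDZ_Q).  vdev_raidz_read() compares this mask against
 * "all parity used" to decide whether any untouched parity remains to be
 * verified.
 */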

static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	uint64_t b = offset >> unit_shift;
	uint64_t s = size >> unit_shift;
	uint64_t f = b % dcols;
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	bc = (r == 0 ? 0 : r + nparity);
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	if (q == 0) {
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);

	rm->rm_col[c].rc_data = data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
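
/*
 * Worked example of the geometry, assuming a 4-disk raidz1 (dcols == 4,
 * nparity == 1, unit_shift == 9) and a 3-sector read at offset 0:
 * b == 0, s == 3, q == 1, r == 0, bc == 0 and tot == 4, so acols ==
 * scols == 4 and every column carries one 512-byte sector; column 0
 * holds P and columns 1-3 the data.  rm_nskip comes out 0, so no padding
 * sectors are charged to the allocation.
 */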

static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
		zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);

	zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

static vdev_t *
vdev_child(vdev_t *pvd, uint64_t devidx)
{
	vdev_t *cvd;

	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
		if (cvd->v_id == devidx)
			break;
	}

	return (cvd);
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(const blkptr_t *bp, void *data, uint64_t size)
{

	return (zio_checksum_verify(bp, data));
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zfs_alloc(rc->rc_size);
		bcopy(rc->rc_data, orig[c], rc->rc_size);
	}

	vdev_raidz_generate_parity(rm);

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			rc->rc_error = ECKSUM;
			ret++;
		}
		zfs_free(orig[c], rc->rc_size);
	}

	return (ret);
}

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 */
static int
vdev_raidz_combrec(raidz_map_t *rm, const blkptr_t *bp, void *data,
    off_t offset, uint64_t bytes, int total_errors, int data_errors)
{
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(bp, data, bytes) == 0) {
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	n--;
done:
	for (i = n - 1; i >= 0; i--) {
		zfs_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}
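
/*
 * For example, on a raidz2 stripe with no columns known-bad (total_errors
 * == 0), the pass for n == 1 tries each data column singly (the highest
 * target is always pushed into the data columns, since a trial that only
 * rewrites parity cannot change the data checksum), and the n == 2 pass
 * tries every pair containing at least one data column, accepting the
 * first combination whose reconstruction passes the block-pointer
 * checksum.
 */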

static int
vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
    off_t offset, size_t bytes)
{
	vdev_t *tvd = vd->v_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, error;
	int unexpected_errors;
	int parity_errors;
	int parity_untried;
	int data_errors;
	int total_errors;
	int n;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	rc = NULL;	/* gcc */
	error = 0;

	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
	    vd->v_nchildren, vd->v_nparity);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vdev_child(vd, rc->rc_devidx);
		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
#if 0	/* XXX: Too hard for the boot code. */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
#endif
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
			    rc->rc_offset, rc->rc_size);
			rc->rc_tried = 1;
			rc->rc_skipped = 0;
		}
	}

reconstruct:
	unexpected_errors = 0;
	parity_errors = 0;
	parity_untried = 0;
	data_errors = 0;
	total_errors = 0;

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(bp, data, bytes) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(bp, data, bytes) == 0) {
				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	n = 0;
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_tried)
			continue;

		cvd = vdev_child(vd, rc->rc_devidx);
		ASSERT(cvd != NULL);
		rc->rc_error = cvd->v_read(cvd, NULL,
		    rc->rc_data, rc->rc_offset, rc->rc_size);
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}
	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n > 0)
		goto reconstruct;

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		error = EIO;
	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(rm, bp, data, offset, bytes,
	     total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, assuming the I/O wasn't speculative.
		 */
		error = ECKSUM;
	}

done:
	vdev_raidz_map_free(rm);

	return (error);
}