/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

static uint64_t zfs_crc64_table[256];

#define	ECKSUM	666

#define	ASSERT(...)	do { } while (0)
#define	ASSERT3U(...)	do { } while (0)
#define	ASSERT3S(...)	do { } while (0)

#define	panic(...)	do {			\
	printf(__VA_ARGS__);			\
	for (;;) ;				\
} while (0)

#define	kmem_alloc(size, flag)	zfs_alloc((size))
#define	kmem_free(ptr, size)	zfs_free((ptr), (size))

static void
zfs_init_crc(void)
{
	int i, j;
	uint64_t *ct;

	/*
	 * Calculate the crc64 table (used for the zap hash
	 * function).
	 */
	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
		for (i = 0; i < 256; i++)
			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
	}
}
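
/*
 * A note on the inner loop above: "-(*ct & 1)" is all ones when the low
 * bit of *ct is set and zero otherwise, so each of the eight steps
 * shifts right one bit and conditionally XORs in the polynomial -- the
 * usual bit-reflected CRC table construction.  For table entry 1, the
 * first step yields (1 >> 1) ^ ZFS_CRC64_POLY == ZFS_CRC64_POLY, since
 * the low bit was set.
 */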
static void
zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}

/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);

/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
	zio_checksum_t	*ci_func[2];	/* checksum function for each byteorder */
	int		ci_correctable;	/* number of correctable bits */
	int		ci_eck;		/* uses zio embedded checksum? */
	int		ci_dedup;	/* strong enough for dedup? */
	const char	*ci_name;	/* descriptive name */
} zio_checksum_info_t;

#include "fletcher.c"
#include "sha256.c"

static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL, NULL}, 0, 0, 0, "inherit"},
	{{NULL, NULL}, 0, 0, 0, "on"},
	{{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "off"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "label"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "gang_header"},
	{{fletcher_2_native, fletcher_2_byteswap}, 0, 1, 0, "zilog"},
	{{fletcher_2_native, fletcher_2_byteswap}, 0, 0, 0, "fletcher2"},
	{{fletcher_4_native, fletcher_4_byteswap}, 1, 0, 0, "fletcher4"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "SHA256"},
	{{fletcher_4_native, fletcher_4_byteswap}, 0, 1, 0, "zilog2"},
};

/*
 * Common signature for all zio compress/decompress functions.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
	zio_compress_func_t	*ci_compress;	/* compression function */
	zio_decompress_func_t	*ci_decompress;	/* decompression function */
	int			ci_level;	/* level parameter */
	const char		*ci_name;	/* algorithm name */
} zio_compress_info_t;

#include "lzjb.c"
#include "zle.c"

/*
 * Compression vectors.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
	{NULL, NULL, 0, "inherit"},
	{NULL, NULL, 0, "on"},
	{NULL, NULL, 0, "uncompressed"},
	{NULL, lzjb_decompress, 0, "lzjb"},
	{NULL, NULL, 0, "empty"},
	{NULL, NULL, 1, "gzip-1"},
	{NULL, NULL, 2, "gzip-2"},
	{NULL, NULL, 3, "gzip-3"},
	{NULL, NULL, 4, "gzip-4"},
	{NULL, NULL, 5, "gzip-5"},
	{NULL, NULL, 6, "gzip-6"},
	{NULL, NULL, 7, "gzip-7"},
	{NULL, NULL, 8, "gzip-8"},
	{NULL, NULL, 9, "gzip-9"},
	{NULL, zle_decompress, 64, "zle"},
};

static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	int i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}

/*
 * Set the external verifier for a gang block based on <vdev, offset, txg>,
 * a tuple which is guaranteed to be unique for the life of the pool.
 */
static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
{
	const dva_t *dva = BP_IDENTITY(bp);
	uint64_t txg = BP_PHYSICAL_BIRTH(bp);

	ASSERT(BP_IS_GANG(bp));

	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
}

/*
 * Set the external verifier for a label block based on its offset.
 * The vdev is implicit, and the txg is unknowable at pool open time --
 * hence the logic in vdev_uberblock_load() to find the most recent copy.
 */
static void
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
{
	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
}
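
/*
 * Verify the checksum of a block.  For checksums embedded in the data
 * itself (gang headers and labels; ci_eck above), the stored checksum
 * lives in a zio_eck_t trailer at the end of the buffer, and that slot
 * must hold a "verifier" -- a value derived from the block's identity
 * -- while the checksum is recomputed.  Hence the dance below: save the
 * stored checksum, substitute the verifier, run the checksum function,
 * then put the stored value back before comparing.
 */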
static int
zio_checksum_error(const blkptr_t *bp, void *data, uint64_t offset)
{
	unsigned int checksum = BP_IS_GANG(bp) ?
	    ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp);
	uint64_t size = BP_GET_PSIZE(bp);
	zio_checksum_info_t *ci;
	zio_cksum_t actual_cksum, expected_cksum, verifier;
	int byteswap;

	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
		return (EINVAL);
	ci = &zio_checksum_table[checksum];
	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
		return (EINVAL);

	if (ci->ci_eck) {
		zio_eck_t *eck;

		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
		    checksum == ZIO_CHECKSUM_LABEL);

		eck = (zio_eck_t *)((char *)data + size) - 1;

		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
			zio_checksum_gang_verifier(&verifier, bp);
		else if (checksum == ZIO_CHECKSUM_LABEL)
			zio_checksum_label_verifier(&verifier, offset);
		else
			verifier = bp->blk_cksum;

		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

		if (byteswap)
			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

		expected_cksum = eck->zec_cksum;
		eck->zec_cksum = verifier;
		ci->ci_func[byteswap](data, size, &actual_cksum);
		eck->zec_cksum = expected_cksum;

		if (byteswap)
			byteswap_uint64_array(&expected_cksum,
			    sizeof (zio_cksum_t));
	} else {
		ASSERT(!BP_IS_GANG(bp));
		expected_cksum = bp->blk_cksum;
		ci->ci_func[0](data, size, &actual_cksum);
	}

	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
		/*printf("ZFS: read checksum failed\n");*/
		return (EIO);
	}

	return (0);
}

static int
zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
    void *dest, uint64_t destsize)
{
	zio_compress_info_t *ci;

	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
		return (EIO);
	}

	ci = &zio_compress_table[cpfunc];
	if (!ci->ci_decompress) {
		printf("ZFS: unsupported compression algorithm %s\n",
		    ci->ci_name);
		return (EIO);
	}

	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
}

static uint64_t
zap_hash(uint64_t salt, const char *name)
{
	const uint8_t *cp;
	uint8_t c;
	uint64_t crc = salt;

	ASSERT(crc != 0);
	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];

	/*
	 * Only use 28 bits, since we need 4 bits in the cookie for the
	 * collision differentiator.  We MUST use the high bits, since
	 * those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);

	return (crc);
}
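
/*
 * An illustration of the masking above: with ZAP_HASHBITS == 28, the
 * low 64 - 28 == 36 bits are cleared and only the top 28 bits of the
 * CRC survive.  The ZAP code selects a bucket from the leading bits of
 * this value (roughly hash >> (64 - n) for an n-bit bucket index), so
 * the entropy has to live in the high bits, as the comment insists.
 */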

static void *zfs_alloc(size_t size);
static void zfs_free(void *ptr, size_t size);

typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t rm_freed;		/* map no longer has referencing ZIO */
	uint8_t rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
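
/*
 * Worked example of VDEV_RAIDZ_64MUL_2, looking at a single byte lane
 * holding 0x80: mask picks up the top bit (0x80), and
 * ((mask << 1) - (mask >> 7)) smears it into 0xff for that lane.  The
 * byte itself becomes ((0x80 << 1) & 0xfe) ^ (0xff & 0x1d) ==
 * 0x00 ^ 0x1d == 0x1d, which is exactly VDEV_RAIDZ_MUL_2(0x80).  Lanes
 * whose top bit is clear get a zero mask and are simply shifted left.
 */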

/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above.  These values were computed by repeatedly multiplying by 2 as above.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
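
/*
 * Taken together, these tables implement multiplication in GF(2^8):
 * for nonzero a and b, a * b == vdev_raidz_pow2[(vdev_raidz_log2[a] +
 * vdev_raidz_log2[b]) % 255].  For example, 2 * 0x80: log2[2] == 1 and
 * log2[0x80] == 7, so the product is pow2[8] == 0x1d, matching
 * VDEV_RAIDZ_MUL_2(0x80) above.  vdev_raidz_exp2() below is the same
 * idea with the exponent supplied directly.
 */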

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint8_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p, *src, pcount, ccount, i;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p = *src;
			}
		} else {
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p ^= *src;
			}
		}
	}
}
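
/*
 * A note on the "algorithm described above" referenced in the loops of
 * the PQ and PQR generators below: each higher-order parity row is a
 * polynomial in the generator 2 evaluated by Horner's rule.  With data
 * columns D_0 .. D_{n-1} and XOR as addition,
 *
 *	Q = ((...((D_0 * 2) ^ D_1) * 2 ^ ...) * 2) ^ D_{n-1}
 *	  = D_0 * 2^{n-1} ^ D_1 * 2^{n-2} ^ ... ^ D_{n-1},
 *
 * so each pass multiplies the running parity by 2 (VDEV_RAIDZ_64MUL_2)
 * and XORs in the next column; R is the same with multiplier 4.  P is
 * the plain XOR of all columns, as in the P-only generator above.
 */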

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		panic("invalid RAID-Z configuration");
	}
}

/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity, speed of
 * computation, and linear separability.
 *
 *      __               __               __     __
 *      |   1   ..  1 1 1 |               |  p_0  |
 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
 *      |   :       : : : |   |   :   |   |  d_2  |
 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
 *      |   0   ..  0 0 1 |               | d_n-1 |
 *      ~~               ~~               ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination.
 * In the example below we use m=3 parity columns, n=8 data columns,
 * with errors in d_1, d_2, and p_1:
 *
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
 *           |  19 205 116  29  64  16  4   1  |      / /
 *           |  1   0   0   0   0   0   0   0  |     / /
 *           |  0   1   0   0   0   0   0   0  | <--' /
 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
 *           |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           |  19 205 116  29  64  16  4   1  |
 *           |  1   0   0   0   0   0   0   0  |
 *  (V|I)' = |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 * __                                                                 __
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0   1   0   0   0   0   0    166 100  4  40 158 168 216 209  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   0   0   0   0   0   0    167 100  5  41 159 169 217 208  |
 * |  0   0   1   0   0   0   0   0    166 100  4  40 158 168 216 209  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 *                   __                               __
 *                   |  0   0   1   0   0   0   0   0  |
 *                   | 167 100  5   41 159 169 217 208 |
 *                   | 166 100  4   40 158 168 216 209 |
 *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
 *                   |  0   0   0   0   1   0   0   0  |
 *                   |  0   0   0   0   0   1   0   0  |
 *                   |  0   0   0   0   0   0   1   0  |
 *                   |  0   0   0   0   0   0   0   1  |
 *                   ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
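
/*
 * Worked example for vdev_raidz_matrix_init(): with n == 3 data columns
 * and map[i] == 1 (the Q parity row), pow starts at 3 and the inner
 * loop emits vdev_raidz_pow2[2], [1], [0] == 4, 2, 1 -- the Q
 * coefficients 2^{n-1} .. 2^0 from the Vandermonde matrix above.  The
 * P row (map[i] == 0) degenerates to all 1s and the R row (map[i] == 2)
 * to powers of 4.
 */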

static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows is properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}

static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	log = 0;	/* gcc */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = zfs_alloc(psize);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	zfs_free(p, psize);
}

static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;

	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}

static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}

static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	uint64_t b = offset >> unit_shift;
	uint64_t s = size >> unit_shift;
	uint64_t f = b % dcols;
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	bc = (r == 0 ? 0 : r + nparity);
	tot = s + nparity * (q + (r == 0 ? 0 : 1));
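
	/*
	 * Worked example (values chosen purely for illustration): with
	 * dcols == 5, nparity == 1 and a request of s == 7 units,
	 * q == 1, r == 3, bc == 4 and tot == 9.  The first bc columns
	 * then get q + 1 == 2 units each and the remaining column gets
	 * q == 1, so the 9 units of parity and data exactly cover the
	 * map set up below.
	 */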

	if (q == 0) {
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);

	rm->rm_col[c].rc_data = data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}

static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;
	size_t size;

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
		zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

static vdev_t *
vdev_child(vdev_t *pvd, uint64_t devidx)
{
	vdev_t *cvd;

	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
		if (cvd->v_id == devidx)
			break;
	}

	return (cvd);
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(const blkptr_t *bp, void *data)
{

	return (zio_checksum_error(bp, data, 0));
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zfs_alloc(rc->rc_size);
		bcopy(rc->rc_data, orig[c], rc->rc_size);
	}

	vdev_raidz_generate_parity(rm);

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			rc->rc_error = ECKSUM;
			ret++;
		}
		zfs_free(orig[c], rc->rc_size);
	}

	return (ret);
}

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 */
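/*
 * The candidate target sets below are enumerated odometer-style: for
 * each n, tgts[] starts at the first n non-failed columns, and after
 * each failed attempt the lowest position that can advance does so,
 * resetting the positions below it.  With n == 2 over four healthy
 * columns this visits (0,1), (0,2), (1,2), (0,3), (1,3), (2,3);
 * columns with known errors are skipped throughout.
 */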
static int
vdev_raidz_combrec(raidz_map_t *rm, const blkptr_t *bp, void *data,
    off_t offset, int total_errors, int data_errors)
{
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(bp, data) == 0) {
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
1422 */ 1423 for (c = tgts[current - 1] + 1; 1424 rm->rm_col[c].rc_error != 0; c++) 1425 continue; 1426 1427 tgts[current] = c; 1428 current++; 1429 1430 } while (current != n); 1431 } 1432 } 1433 n--; 1434 done: 1435 for (i = n - 1; i >= 0; i--) { 1436 zfs_free(orig[i], rm->rm_col[0].rc_size); 1437 } 1438 1439 return (ret); 1440 } 1441 1442 static int 1443 vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data, 1444 off_t offset, size_t bytes) 1445 { 1446 vdev_t *tvd = vd->v_top; 1447 vdev_t *cvd; 1448 raidz_map_t *rm; 1449 raidz_col_t *rc; 1450 int c, error; 1451 int unexpected_errors; 1452 int parity_errors; 1453 int parity_untried; 1454 int data_errors; 1455 int total_errors; 1456 int n; 1457 int tgts[VDEV_RAIDZ_MAXPARITY]; 1458 int code; 1459 1460 rc = NULL; /* gcc */ 1461 error = 0; 1462 1463 rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift, 1464 vd->v_nchildren, vd->v_nparity); 1465 1466 /* 1467 * Iterate over the columns in reverse order so that we hit the parity 1468 * last -- any errors along the way will force us to read the parity. 1469 */ 1470 for (c = rm->rm_cols - 1; c >= 0; c--) { 1471 rc = &rm->rm_col[c]; 1472 cvd = vdev_child(vd, rc->rc_devidx); 1473 if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) { 1474 if (c >= rm->rm_firstdatacol) 1475 rm->rm_missingdata++; 1476 else 1477 rm->rm_missingparity++; 1478 rc->rc_error = ENXIO; 1479 rc->rc_tried = 1; /* don't even try */ 1480 rc->rc_skipped = 1; 1481 continue; 1482 } 1483 #if 0 /* XXX: Too hard for the boot code. */ 1484 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 1485 if (c >= rm->rm_firstdatacol) 1486 rm->rm_missingdata++; 1487 else 1488 rm->rm_missingparity++; 1489 rc->rc_error = ESTALE; 1490 rc->rc_skipped = 1; 1491 continue; 1492 } 1493 #endif 1494 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) { 1495 rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data, 1496 rc->rc_offset, rc->rc_size); 1497 rc->rc_tried = 1; 1498 rc->rc_skipped = 0; 1499 } 1500 } 1501 1502 reconstruct: 1503 unexpected_errors = 0; 1504 parity_errors = 0; 1505 parity_untried = 0; 1506 data_errors = 0; 1507 total_errors = 0; 1508 1509 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); 1510 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); 1511 1512 for (c = 0; c < rm->rm_cols; c++) { 1513 rc = &rm->rm_col[c]; 1514 1515 if (rc->rc_error) { 1516 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 1517 1518 if (c < rm->rm_firstdatacol) 1519 parity_errors++; 1520 else 1521 data_errors++; 1522 1523 if (!rc->rc_skipped) 1524 unexpected_errors++; 1525 1526 total_errors++; 1527 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { 1528 parity_untried++; 1529 } 1530 } 1531 1532 /* 1533 * There are three potential phases for a read: 1534 * 1. produce valid data from the columns read 1535 * 2. read all disks and try again 1536 * 3. perform combinatorial reconstruction 1537 * 1538 * Each phase is progressively both more expensive and less likely to 1539 * occur. If we encounter more errors than we can repair or all phases 1540 * fail, we have no choice but to return an error. 1541 */ 1542 1543 /* 1544 * If the number of errors we saw was correctable -- less than or equal 1545 * to the number of parity disks read -- attempt to produce data that 1546 * has a valid checksum. Naturally, this case applies in the absence of 1547 * any errors. 
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(bp, data) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(bp, data) == 0) {
				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	n = 0;
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];
		if (rc->rc_tried)
			continue;

		cvd = vdev_child(vd, rc->rc_devidx);
		ASSERT(cvd != NULL);
		rc->rc_error = cvd->v_read(cvd, NULL,
		    rc->rc_data, rc->rc_offset, rc->rc_size);
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}
	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n > 0)
		goto reconstruct;

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data.
	 * We check if there is enough additional data to possibly reconstruct
	 * the data and then perform combinatorial reconstruction over all
	 * possible combinations. If that fails, we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		error = EIO;
	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(rm, bp, data, offset, total_errors,
	    data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the I/O wasn't speculative.
		 */
		error = ECKSUM;
	}

done:
	vdev_raidz_map_free(rm);

	return (error);
}