/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_file.h>
#include <sys/vdev_raidz.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>

/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all
 * elements can be expressed with a single byte. Briefly, the operations on
 * the field are defined as follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition
 * rather than field addition). The inverse of a field element A (A^-1) is
 * therefore A ^ (255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q, R, over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the
 * trivial XOR operation, and 2 and 4 can be computed quickly and generate
 * linearly-independent coefficients. (There are no additional coefficients
 * that have this property which is why the uncorrected Plank method breaks
 * down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */

typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	void *rm_datacopy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t rm_freed;		/* map no longer has referencing ZIO */
	uint8_t rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
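 *
 * For example, consider a byte equal to 0x80 in one lane of the 64-bit
 * word: ((mask) << 1) - ((mask) >> 7) widens that byte's top bit into
 * 0xff (bytes whose top bit is clear become 0x00), so the final XOR
 * applies 0x1d to exactly the bytes that overflowed -- the same result
 * as applying VDEV_RAIDZ_MUL_2() to each of the eight bytes separately.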
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}

#define	VDEV_LABEL_OFFSET(x)	(x + VDEV_LABEL_START_SIZE)

/*
 * Force reconstruction to use the general purpose method.
 */
int vdev_raidz_default_to_general;

/* Powers of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};

/* Logs of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

static void vdev_raidz_generate_parity(raidz_map_t *rm);

/*
 * Multiply a given number by 2 raised to the given power.
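 *
 * For example, vdev_raidz_exp2(0x01, 8) looks up vdev_raidz_pow2[8] and
 * returns 0x1d -- the same value VDEV_RAIDZ_MUL_2() yields when doubling
 * 0x80 (i.e., 2^7 * 2 = 2^8). A zero input short-circuits to zero, since
 * zero has no logarithm in the field.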
 */
static uint8_t
vdev_raidz_exp2(uint_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}

static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;
	size_t size;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);

		if (rm->rm_col[c].rc_gdata != NULL)
			zio_buf_free(rm->rm_col[c].rc_gdata,
			    rm->rm_col[c].rc_size);
	}

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	if (rm->rm_datacopy != NULL)
		zio_buf_free(rm->rm_datacopy, size);

	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	ASSERT0(rm->rm_freed);
	rm->rm_freed = 1;

	if (rm->rm_reports == 0)
		vdev_raidz_map_free(rm);
}

/*ARGSUSED*/
static void
vdev_raidz_cksum_free(void *arg, size_t ignored)
{
	raidz_map_t *rm = arg;

	ASSERT3U(rm->rm_reports, >, 0);

	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
		vdev_raidz_map_free(rm);
}

static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;
	size_t x;

	const char *good = NULL;
	const char *bad = rm->rm_col[c].rc_data;

	if (good_data == NULL) {
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
			char *buf;

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_data;
				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
				    zio_buf_alloc(rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			buf = (char *)good_data;
			for (; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++)
				rm->rm_col[x].rc_data = bad_parity[x];

			buf = rm->rm_datacopy;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = rm->rm_col[c].rc_gdata;
	} else {
		/* adjust good_data to point at the start of our column */
		good = good_data;

		for (x = rm->rm_firstdatacol; x < c; x++)
			good += rm->rm_col[x].rc_size;
	}

	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
}

/*
 * Invoked indirectly by zfs_ereport_start_checksum(), called
 * below when our read operation fails completely. The main point
 * is to keep a copy of everything we read from disk, so that at
 * vdev_raidz_cksum_finish() time we can compare it with the good data.
 */
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
	size_t c = (size_t)(uintptr_t)arg;
	caddr_t buf;

	raidz_map_t *rm = zio->io_vsd;
	size_t size;

	/* set up the report and bump the refcount */
	zcr->zcr_cbdata = rm;
	zcr->zcr_cbinfo = c;
	zcr->zcr_finish = vdev_raidz_cksum_finish;
	zcr->zcr_free = vdev_raidz_cksum_free;

	rm->rm_reports++;
	ASSERT3U(rm->rm_reports, >, 0);

	if (rm->rm_datacopy != NULL)
		return;

	/*
	 * It's the first time we're called for this raidz_map_t, so we need
	 * to copy the data aside; there's no guarantee that our zio's buffer
	 * won't be re-used for something else.
	 *
	 * Our parity data is already in separate buffers, so there's no need
	 * to copy them.
	 */

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	buf = rm->rm_datacopy = zio_buf_alloc(size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		raidz_col_t *col = &rm->rm_col[c];

		bcopy(col->rc_data, buf, col->rc_size);
		col->rc_data = buf;

		buf += col->rc_size;
	}
	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
}

static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};

/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 */
static raidz_map_t *
vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset,
    uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
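	 *
	 * For example, a 5-wide raidz1 (dcols = 5, nparity = 1) writing
	 * s = 9 sectors gets q = 9 / 4 = 2 and r = 1, so a single data
	 * column carries an extra "remainder" sector.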
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_datacopy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);

	rm->rm_col[c].rc_data = data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
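	 *
	 * (Concretely: bit 20 of the block's offset -- the 1MB bit -- decides
	 * below whether the parity lands in column 0 or column 1, so the
	 * choice flips every megabyte of offset.)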
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p, *src, pcount, ccount, i;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p = *src;
			}
		} else {
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p ^= *src;
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
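			 *
			 * This is Horner's scheme for the Q and R formulas
			 * in the block comment at the top of this file; with
			 * three data columns d0, d1, d2, for example,
			 * Q = ((d0 * 2) ^ d1) * 2 ^ d2, and likewise for R
			 * with multiplier 4.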
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
	}
}

static int
vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, i;
	int x = tgts[0];
	int c;

	ASSERT(ntgts == 1);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(x < rm->rm_cols);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
	ASSERT(xcount > 0);

	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	dst = rm->rm_col[x].rc_data;
	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst = *src;
	}

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		if (c == x)
			continue;

		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
		count = MIN(ccount, xcount);

		for (i = 0; i < count; i++, dst++, src++) {
			*dst ^= *src;
		}
	}

	return (1 << VDEV_RAIDZ_P);
}

static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, mask, i;
	uint8_t *b;
	int x = tgts[0];
	int c, j, exp;

	ASSERT(ntgts == 1);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		if (c == x)
			ccount = 0;
		else
			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		count = MIN(ccount, xcount);

		if (c == rm->rm_firstdatacol) {
			for (i = 0; i < count; i++, dst++, src++) {
				*dst = *src;
			}
			for (; i < xcount; i++, dst++) {
				*dst = 0;
			}

		} else {
			for (i = 0; i < count; i++, dst++, src++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
				*dst ^= *src;
			}

			for (; i < xcount; i++, dst++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
			}
		}
	}

	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	dst = rm->rm_col[x].rc_data;
	exp = 255 - (rm->rm_cols - 1 - x);

	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst ^= *src;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, exp);
		}
	}

	return (1 << VDEV_RAIDZ_Q);
}

static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
	void *pdata, *qdata;
	uint64_t xsize, ysize, i;
	int x = tgts[0];
	int y = tgts[1];

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = pdata;
	q = qdata;
	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xd = rm->rm_col[x].rc_data;
	yd = rm->rm_col[y].rc_data;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
		    vdev_raidz_exp2(*q ^ *qxy, bexp);

		if (i < ysize)
			*yd = *p ^ *pxy ^ *xd;
	}

	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}

/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity, speedy
 * computation, and linear separability.
 *
 *      __                   __                 __     __
 *      |  1    ..   1  1  1  |                 |  p_0  |
 *      | 2^n-1 ..   4  2  1  |     __     __   |   :   |
 *      | 4^n-1 ..  16  4  1  |     |  D_0  |   | p_m-1 |
 *      |  1    ..   0  0  0  |     |  D_1  |   |  d_0  |
 *      |  0    ..   0  0  0  |  x  |  D_2  | = |  d_1  |
 *      |  :         :  :  :  |     |   :   |   |  d_2  |
 *      |  0    ..   1  0  0  |     | D_n-1 |   |   :   |
 *      |  0    ..   0  1  0  |     ~~     ~~   |   :   |
 *      |  0    ..   0  0  1  |                 | d_n-1 |
 *      ~~                   ~~                 ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *
 *               __                               __
 *               |  1    1    1    1    1    1    1    1  |
 *               | 128   64   32   16    8    4    2    1  | <-----+-+-- missing disks
 *               |  19  205  116   29   64   16    4    1  |      / /
 *               |  1    0    0    0    0    0    0    0  |     / /
 *               |  0    1    0    0    0    0    0    0  | <--'  /
 *       (V|I) = |  0    0    1    0    0    0    0    0  | <---'
 *               |  0    0    0    1    0    0    0    0  |
 *               |  0    0    0    0    1    0    0    0  |
 *               |  0    0    0    0    0    1    0    0  |
 *               |  0    0    0    0    0    0    1    0  |
 *               |  0    0    0    0    0    0    0    1  |
 *               ~~                               ~~
 *
 *                __                               __
 *                |  1    1    1    1    1    1    1    1  |
 *                |  19  205  116   29   64   16    4    1  |
 *                |  1    0    0    0    0    0    0    0  |
 *       (V|I)' = |  0    0    0    1    0    0    0    0  |
 *                |  0    0    0    0    1    0    0    0  |
 *                |  0    0    0    0    0    1    0    0  |
 *                |  0    0    0    0    0    0    1    0  |
 *                |  0    0    0    0    0    0    0    1  |
 *                ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 *     __                                                                 __
 *     |   1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0 |
 *     |  19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0 |
 *     |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 *     |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 *     |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 *     |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 *     |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 *     |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 *     ~~                                                                 ~~
 *     __                                                                 __
 *     |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 *     |   1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0 |
 *     |  19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0 |
 *     |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 *     |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 *     |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 *     |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 *     |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 *     ~~                                                                 ~~
 *     __                                                                 __
 *     |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 *     |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1 |
 *     |   0 205 116   0   0   0   0   0     0   1  19  29  64  16   4   1 |
 *     |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 *     |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 *     |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 *     |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 *     |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 *     ~~                                                                 ~~
 *     __                                                                 __
 *     |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 *     |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1 |
 *     |   0   0 185   0   0   0   0   0   205   1 222 208 141 221 201 204 |
 *     |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 *     |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 *     |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 *     |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 *     |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 *     ~~                                                                 ~~
 *     __                                                                 __
 *     |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 *     |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1 |
 *     |   0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209 |
 *     |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 *     |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 *     |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 *     |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 *     |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 *     ~~                                                                 ~~
 *     __                                                                 __
 *     |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 *     |   0   1   0   0   0   0   0   0   167 100   5  41 159 169 217 208 |
 *     |   0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209 |
 *     |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 *     |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 *     |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 *     |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 *     |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 *     ~~                                                                 ~~
 *
 *                     __                                      __
 *                     |   0    0    1    0    0    0    0    0  |
 *                     | 167  100    5   41  159  169  217  208  |
 *                     | 166  100    4   40  158  168  216  209  |
 *       (V|I)'^-1 =   |   0    0    0    1    0    0    0    0  |
 *                     |   0    0    0    0    1    0    0    0  |
 *                     |   0    0    0    0    0    1    0    0  |
 *                     |   0    0    0    0    0    0    1    0  |
 *                     |   0    0    0    0    0    0    0    1  |
 *                     ~~                                      ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}

static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
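		 * (In GF(2^8) the inverse of a is 2^(255 - log2(a)), so a
		 * single exponent suffices to scale the whole row through
		 * vdev_raidz_exp2().)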
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data left in the rows forms, as expected, part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}

static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log = 0;
	uint8_t val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}

static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;

	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
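	 *
	 * For example, with triple parity, one missing data column, and Q
	 * also targeted, the loop below skips Q and selects P; 'code'
	 * records, as a bitmask, which parity columns were actually used.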
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}

static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}

static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
    uint64_t *ashift)
{
	vdev_t *cvd;
	uint64_t nparity = vd->vdev_nparity;
	int c;
	int lasterror = 0;
	int numerrors = 0;

	ASSERT(nparity > 0);

	if (nparity > VDEV_RAIDZ_MAXPARITY ||
	    vd->vdev_children < nparity + 1) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	vdev_open_children(vd);

	for (c = 0; c < vd->vdev_children; c++) {
		cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error != 0) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}

		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
		*ashift = MAX(*ashift, cvd->vdev_ashift);
	}

	*asize *= vd->vdev_children;
	*max_asize *= vd->vdev_children;

	if (numerrors > nparity) {
		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}

static void
vdev_raidz_close(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_close(vd->vdev_child[c]);
}

/*
 * Handle a read or write I/O to a RAID-Z dump device.
 *
 * The dump device is in a unique situation compared to other ZFS datasets:
 * writing to this device should be as simple and fast as possible. In
 * addition, durability matters much less since the dump will be extracted
 * once the machine reboots. For that reason, this function eschews parity for
 * performance and simplicity. The dump device uses the checksum setting
 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
 * dataset.
 *
 * Blocks of size 128 KB have been preallocated for this volume. I/Os less than
 * 128 KB will not fill an entire block; in addition, they may not be properly
 * aligned. In that case, this function uses the preallocated 128 KB block and
 * omits reading or writing any "empty" portions of that block, as opposed to
 * allocating a fresh appropriately-sized block.
 *
 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
 *
 *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
 *
 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
 * allocated which spans all five child vdevs. 8 KB of data would be written to
 * each of four vdevs, with the fifth containing the parity bits.
 *
 *       parity    data     data     data     data
 *     |   PP   |   XX   |   XX   |   XX   |   XX   |
 *         ^        ^        ^        ^        ^
 *         |        |        |        |        |
 *   8 KB parity    ------8 KB data blocks------
 *
 * However, when writing to the dump device, the behavior is different:
 *
 *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
 *
 * Unlike the normal RAID-Z case in which the block is allocated based on the
 * I/O size, reads and writes here always use a 128 KB logical I/O size. If the
 * I/O size is less than 128 KB, only the actual portions of data are written.
 * In this example the data is written to the third data vdev since that vdev
 * contains the offset [64 KB, 96 KB).
 *
 *       parity    data     data     data     data
 *     |        |        |        |   XX   |        |
 *                                    ^
 *                                    |
 *                             32 KB data block
 *
 * As a result, an individual I/O may not span all child vdevs; moreover, a
 * small I/O may only operate on a single child vdev.
 *
 * Note that since there are no parity bits calculated or written, this format
 * remains the same no matter how many parity bits are used in a normal RAID-Z
 * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above
 * would look like:
 *
 *       parity   parity   parity    data     data     data     data
 *     |        |        |        |        |        |   XX   |        |
 *                                                      ^
 *                                                      |
 *                                               32 KB data block
 */
int
vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
    uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
{
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, err = 0;

	uint64_t start, end, colstart, colend;
	uint64_t coloffset, colsize, colskip;

	int flags = doread ? B_READ : B_WRITE;

#ifdef	_KERNEL

	/*
	 * Don't write past the end of the block
	 */
	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);

	start = offset;
	end = start + size;

	/*
	 * Allocate a RAID-Z map for this block. Note that this block starts
	 * from the "original" offset, that is, the offset of the extent which
	 * contains the requisite offset of the data being read or written.
	 *
	 * Even if this I/O operation doesn't span the full block size, let's
	 * treat the on-disk format as if the only blocks are the complete 128
	 * KB size.
	 */
	rm = vdev_raidz_map_alloc(data - (offset - origoffset),
	    SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
	    vd->vdev_children, vd->vdev_nparity);

	coloffset = origoffset;

	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
	    c++, coloffset += rc->rc_size) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];

		/*
		 * Find the start and end of this column in the RAID-Z map,
		 * keeping in mind that the stated size and offset of the
		 * operation may not fill the entire column for this vdev.
		 *
		 * If any portion of the data spans this column, issue the
		 * appropriate operation to the vdev.
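		 *
		 * For the 32 KB write at offset 64 KB pictured above, the
		 * column covering [64 KB, 96 KB) gets colstart = 64 KB,
		 * colend = 96 KB, colsize = 32 KB, and colskip = 0, while
		 * columns entirely outside [start, end) are skipped.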
		 */
		if (coloffset + rc->rc_size <= start)
			continue;
		if (coloffset >= end)
			continue;

		colstart = MAX(coloffset, start);
		colend = MIN(end, coloffset + rc->rc_size);
		colsize = colend - colstart;
		colskip = colstart - coloffset;

		VERIFY3U(colsize, <=, rc->rc_size);
		VERIFY3U(colskip, <=, rc->rc_size);

		/*
		 * Note that the child vdev will have a vdev label at the start
		 * of its range of offsets, hence the need for
		 * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another
		 * example of why this calculation is needed.
		 */
		if ((err = vdev_disk_physio(cvd,
		    ((char *)rc->rc_data) + colskip, colsize,
		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
		    flags, isdump)) != 0)
			break;
	}

	vdev_raidz_map_free(rm);
#endif	/* _KERNEL */

	return (err);
}

static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	uint64_t cols = vd->vdev_children;
	uint64_t nparity = vd->vdev_nparity;

	asize = ((psize - 1) >> ashift) + 1;
	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
	asize = roundup(asize, nparity + 1) << ashift;

	return (asize);
}

static void
vdev_raidz_child_done(zio_t *zio)
{
	raidz_col_t *rc = zio->io_private;

	rc->rc_error = zio->io_error;
	rc->rc_tried = 1;
	rc->rc_skipped = 0;
}

/*
 * Start an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Generate the parity data
 *   2. Create child zio write operations to each column's vdev, for both
 *      data and parity.
 *   3. If the column skips any sectors for padding, create optional dummy
 *      write zio children for those areas to improve aggregation continuity.
 * - For read operations:
 *   1. Create child zio read operations to each data column's vdev to read
 *      the range of data required for zio.
 *   2. If this is a scrub or resilver operation, or if any of the data
 *      vdevs have had errors, then create zio read operations to the parity
 *      columns' VDevs as well.
 */
static void
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, i;

	rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
	    tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_generate_parity(rm);

		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		/*
		 * Generate optional I/Os for any skipped sectors to improve
		 * aggregation contiguity.
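		 *
		 * For example, with rm_nskip == 2 and rm_skipstart == 3, one
		 * sector-sized, data-less optional write is issued just past
		 * the end of columns 3 and 4 (wrapping to column 0 if the
		 * walk runs off the end of the map).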
1749 */ 1750 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { 1751 ASSERT(c <= rm->rm_scols); 1752 if (c == rm->rm_scols) 1753 c = 0; 1754 rc = &rm->rm_col[c]; 1755 cvd = vd->vdev_child[rc->rc_devidx]; 1756 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1757 rc->rc_offset + rc->rc_size, NULL, 1758 1 << tvd->vdev_ashift, 1759 zio->io_type, zio->io_priority, 1760 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); 1761 } 1762 1763 zio_execute(zio); 1764 return; 1765 } 1766 1767 ASSERT(zio->io_type == ZIO_TYPE_READ); 1768 1769 /* 1770 * Iterate over the columns in reverse order so that we hit the parity 1771 * last -- any errors along the way will force us to read the parity. 1772 */ 1773 for (c = rm->rm_cols - 1; c >= 0; c--) { 1774 rc = &rm->rm_col[c]; 1775 cvd = vd->vdev_child[rc->rc_devidx]; 1776 if (!vdev_readable(cvd)) { 1777 if (c >= rm->rm_firstdatacol) 1778 rm->rm_missingdata++; 1779 else 1780 rm->rm_missingparity++; 1781 rc->rc_error = SET_ERROR(ENXIO); 1782 rc->rc_tried = 1; /* don't even try */ 1783 rc->rc_skipped = 1; 1784 continue; 1785 } 1786 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 1787 if (c >= rm->rm_firstdatacol) 1788 rm->rm_missingdata++; 1789 else 1790 rm->rm_missingparity++; 1791 rc->rc_error = SET_ERROR(ESTALE); 1792 rc->rc_skipped = 1; 1793 continue; 1794 } 1795 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || 1796 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { 1797 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1798 rc->rc_offset, rc->rc_data, rc->rc_size, 1799 zio->io_type, zio->io_priority, 0, 1800 vdev_raidz_child_done, rc)); 1801 } 1802 } 1803 1804 zio_execute(zio); 1805 } 1806 1807 1808 /* 1809 * Report a checksum error for a child of a RAID-Z device. 1810 */ 1811 static void 1812 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) 1813 { 1814 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 1815 1816 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 1817 zio_bad_cksum_t zbc; 1818 raidz_map_t *rm = zio->io_vsd; 1819 1820 mutex_enter(&vd->vdev_stat_lock); 1821 vd->vdev_stat.vs_checksum_errors++; 1822 mutex_exit(&vd->vdev_stat_lock); 1823 1824 zbc.zbc_has_cksum = 0; 1825 zbc.zbc_injected = rm->rm_ecksuminjected; 1826 1827 zfs_ereport_post_checksum(zio->io_spa, vd, zio, 1828 rc->rc_offset, rc->rc_size, rc->rc_data, bad_data, 1829 &zbc); 1830 } 1831 } 1832 1833 /* 1834 * We keep track of whether or not there were any injected errors, so that 1835 * any ereports we generate can note it. 1836 */ 1837 static int 1838 raidz_checksum_verify(zio_t *zio) 1839 { 1840 zio_bad_cksum_t zbc; 1841 raidz_map_t *rm = zio->io_vsd; 1842 1843 int ret = zio_checksum_error(zio, &zbc); 1844 if (ret != 0 && zbc.zbc_injected != 0) 1845 rm->rm_ecksuminjected = 1; 1846 1847 return (ret); 1848 } 1849 1850 /* 1851 * Generate the parity from the data columns. If we tried and were able to 1852 * read the parity without error, verify that the generated parity matches the 1853 * data we read. If it doesn't, we fire off a checksum error. Return the 1854 * number such failures. 1855 */ 1856 static int 1857 raidz_parity_verify(zio_t *zio, raidz_map_t *rm) 1858 { 1859 void *orig[VDEV_RAIDZ_MAXPARITY]; 1860 int c, ret = 0; 1861 raidz_col_t *rc; 1862 1863 blkptr_t *bp = zio->io_bp; 1864 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : 1865 (BP_IS_GANG(bp) ? 
/*
 * Report a checksum error for a child of a RAID-Z device.
 */
static void
raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
{
	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];

	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		zio_bad_cksum_t zbc;
		raidz_map_t *rm = zio->io_vsd;

		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_checksum_errors++;
		mutex_exit(&vd->vdev_stat_lock);

		zbc.zbc_has_cksum = 0;
		zbc.zbc_injected = rm->rm_ecksuminjected;

		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
		    &zbc);
	}
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(zio_t *zio)
{
	zio_bad_cksum_t zbc;
	raidz_map_t *rm = zio->io_vsd;

	int ret = zio_checksum_error(zio, &zbc);
	if (ret != 0 && zbc.zbc_injected != 0)
		rm->rm_ecksuminjected = 1;

	return (ret);
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches
 * the data we read. If it doesn't, we fire off a checksum error. Return
 * the number of such failures.
 */
static int
raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	blkptr_t *bp = zio->io_bp;
	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));

	if (checksum == ZIO_CHECKSUM_NOPARITY)
		return (ret);

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zio_buf_alloc(rc->rc_size);
		bcopy(rc->rc_data, orig[c], rc->rc_size);
	}

	vdev_raidz_generate_parity(rm);

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			raidz_checksum_error(zio, rc, orig[c]);
			rc->rc_error = SET_ERROR(ECKSUM);
			ret++;
		}
		zio_buf_free(orig[c], rc->rc_size);
	}

	return (ret);
}

/*
 * Keep statistics on all the ways that we used parity to correct data.
 */
static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
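
/*
 * The index into raidz_corrected is the "code" returned by
 * vdev_raidz_reconstruct(): a bitmask with one bit per parity column the
 * reconstruction consumed. As a hypothetical illustration, on a raidz2
 * vdev a repair that used only Q would be counted in
 * raidz_corrected[1 << VDEV_RAIDZ_Q], while one that needed both P and Q
 * would land in raidz_corrected[(1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)].
 * A code of (1 << rm_firstdatacol) - 1 therefore means every parity
 * column was used, which is how the combinatorial path below decides
 * whether any parity remains to be verified.
 */
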
static int
vdev_raidz_worst_error(raidz_map_t *rm)
{
	int error = 0;

	for (int c = 0; c < rm->rm_cols; c++)
		error = zio_worst_error(error, rm->rm_col[c].rc_error);

	return (error);
}

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 */
static int
vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
{
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					if (rc->rc_tried)
						raidz_checksum_error(zio, rc,
						    orig[i]);
					rc->rc_error = SET_ERROR(ECKSUM);
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	n--;
done:
	for (i = 0; i < n; i++) {
		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}
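
/*
 * A brief sketch of the enumeration above, for an assumed raidz2 map with
 * five columns (0 and 1 parity, 2 through 4 data) and no known errors
 * (total_errors == data_errors == 0):
 *
 *	n = 1: the single target is pushed into the data columns, so it
 *	       walks through columns 2, 3 and 4.
 *	n = 2: the odometer-style do/while loop then visits every pair
 *	       except the all-parity pair (0, 1): (0, 2), (1, 2), (0, 3),
 *	       (1, 3), (2, 3), (0, 4), (1, 4), (2, 4), (3, 4).
 *
 * Reconstructing only parity columns could never change the data
 * checksum, which is why the initialization pushes the highest target
 * into the data columns.
 */
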
/*
 * Complete an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Check for errors on the child IOs.
 *   2. Return, setting an error code if too few child VDevs were written
 *      to reconstruct the data later. Note that partial writes are
 *      considered successful if they can be reconstructed at all.
 * - For read operations:
 *   1. Check for errors on the child IOs.
 *   2. If data errors occurred:
 *      a. Try to reassemble the data from the parity available.
 *      b. If we haven't yet read the parity drives, read them now.
 *      c. If all parity drives have been read but the data still doesn't
 *         reassemble with a correct checksum, then try combinatorial
 *         reconstruction.
 *      d. If that doesn't work, return an error.
 *   3. If there were unexpected errors or this is a resilver operation,
 *      rewrite the vdevs that had errors.
 */
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	int unexpected_errors = 0;
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	ASSERT(zio->io_bp != NULL);	/* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as a success.
		 * (If we couldn't write enough columns to reconstruct
		 * the data, the I/O failed. Otherwise, good enough.)
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */
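
	/*
	 * An illustrative walk-through (an assumed configuration, not a
	 * trace from a real pool): on a raidz2 map where one data child
	 * was unreadable at vdev_raidz_io_start() -- so rm_missingdata
	 * was set and the parity columns were read as well --
	 * total_errors == 1 and parity_untried == 0, and phase 1 below
	 * reconstructs the missing column from parity and verifies the
	 * block checksum. If instead the checksum fails with every column
	 * already tried, phase 2 has nothing left to read and phase 3,
	 * vdev_raidz_combrec(), searches for silently corrupt columns.
	 */
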
	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(zio) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		zio_vdev_io_redone(zio);
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		return;
	}
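
	/*
	 * For example (a hypothetical first pass): if the initial read
	 * issued only the three data columns of a 3+2 raidz2 map and the
	 * block checksum failed, the loop above calls zio_vdev_io_redone()
	 * and issues reads for the two untried parity columns, after which
	 * vdev_raidz_io_done() runs again with every column marked
	 * rc_tried and, if the checksum still fails, falls through to the
	 * combinatorial pass below.
	 */
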
	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		zio->io_error = vdev_raidz_worst_error(rm);

	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(zio, rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children that haven't
		 * failed, provided the I/O wasn't speculative.
		 */
		zio->io_error = SET_ERROR(ECKSUM);

		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error == 0) {
					zio_bad_cksum_t zbc;
					zbc.zbc_has_cksum = 0;
					zbc.zbc_injected =
					    rm->rm_ecksuminjected;

					zfs_ereport_start_checksum(
					    zio->io_spa,
					    vd->vdev_child[rc->rc_devidx],
					    zio, rc->rc_offset, rc->rc_size,
					    (void *)(uintptr_t)c, &zbc);
				}
			}
		}
	}

done:
	zio_checksum_verified(zio);

	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error == 0)
				continue;

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}

static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
	if (faulted > vd->vdev_nparity)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_NO_REPLICAS);
	else if (degraded + faulted != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	else
		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}

vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	NULL,
	NULL,
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};