/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_file.h>
#include <sys/vdev_raidz.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>

/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all
 * elements can be expressed with a single byte. Briefly, the operations on
 * the field are defined as follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition
 * rather than field addition). The inverse of a field element A (A^-1) is
 * therefore A ^ (255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q, R over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the
 * trivial XOR operation, and 2 and 4 can be computed quickly and generate
 * linearly-independent coefficients. (There are no additional coefficients
 * that have this property which is why the uncorrected Plank method breaks
 * down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */
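
/*
 * To make the Horner-style rewriting of Q above concrete, here is a
 * byte-wise sketch (illustrative only -- ndata and data[] are hypothetical
 * names; the real parity loops below operate on 64-bit words via
 * VDEV_RAIDZ_64MUL_2()):
 *
 *	uint8_t q = 0;
 *	for (c = 0; c < ndata; c++)
 *		q = VDEV_RAIDZ_MUL_2(q) ^ data[c];
 */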

typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	void *rm_datacopy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t rm_freed;		/* map no longer has referencing ZIO */
	uint8_t rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
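
/*
 * Worked example: for A = 0x80 the high bit is set, so A * 2 is
 * (0x80 << 1) XOR 0x1d, which is 0x1d in a byte. This is why 0x1d
 * immediately follows 0x80 in the table of powers of 2 below.
 */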

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}

#define	VDEV_LABEL_OFFSET(x)	(x + VDEV_LABEL_START_SIZE)

/*
 * Force reconstruction to use the general purpose method.
 */
int vdev_raidz_default_to_general;

/* Powers of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
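
/*
 * The log table below is the inverse mapping of the table above:
 * vdev_raidz_log2[vdev_raidz_pow2[i]] == i for 0 <= i <= 254. For example,
 * vdev_raidz_pow2[8] == 0x1d and vdev_raidz_log2[0x1d] == 8.
 */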
/* Logs of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

static void vdev_raidz_generate_parity(raidz_map_t *rm);
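
/*
 * Taken together, the two tables implement the log/exp multiplication
 * scheme described at the top of the file. A full field multiply can be
 * sketched as follows (illustrative only -- gf_mul() is a hypothetical
 * helper, not part of this file; vdev_raidz_exp2() below is the form the
 * code actually uses):
 *
 *	static uint8_t
 *	gf_mul(uint8_t a, uint8_t b)
 *	{
 *		int log;
 *
 *		if (a == 0 || b == 0)
 *			return (0);
 *		log = vdev_raidz_log2[a] + vdev_raidz_log2[b];
 *		if (log > 255)
 *			log -= 255;
 *		return (vdev_raidz_pow2[log]);
 *	}
 */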

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}
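
/*
 * Two useful identities (noted here for the reader; both follow from the
 * tables above): vdev_raidz_exp2(a, 1) equals VDEV_RAIDZ_MUL_2(a) truncated
 * to a byte, and vdev_raidz_exp2(a, 255 - vdev_raidz_log2[a]) equals 1.
 * The latter is how the reconstruction code below divides by a known
 * coefficient.
 */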

static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;
	size_t size;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);

		if (rm->rm_col[c].rc_gdata != NULL)
			zio_buf_free(rm->rm_col[c].rc_gdata,
			    rm->rm_col[c].rc_size);
	}

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	if (rm->rm_datacopy != NULL)
		zio_buf_free(rm->rm_datacopy, size);

	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	ASSERT0(rm->rm_freed);
	rm->rm_freed = 1;

	if (rm->rm_reports == 0)
		vdev_raidz_map_free(rm);
}

/*ARGSUSED*/
static void
vdev_raidz_cksum_free(void *arg, size_t ignored)
{
	raidz_map_t *rm = arg;

	ASSERT3U(rm->rm_reports, >, 0);

	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
		vdev_raidz_map_free(rm);
}

static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;
	size_t x;

	const char *good = NULL;
	const char *bad = rm->rm_col[c].rc_data;

	if (good_data == NULL) {
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
			char *buf;

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_data;
				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
				    zio_buf_alloc(rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			buf = (char *)good_data;
			for (; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++)
				rm->rm_col[x].rc_data = bad_parity[x];

			buf = rm->rm_datacopy;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = rm->rm_col[c].rc_gdata;
	} else {
		/* adjust good_data to point at the start of our column */
		good = good_data;

		for (x = rm->rm_firstdatacol; x < c; x++)
			good += rm->rm_col[x].rc_size;
	}

	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
}

/*
 * Invoked indirectly by zfs_ereport_start_checksum(), called
 * below when our read operation fails completely. The main point
 * is to keep a copy of everything we read from disk, so that at
 * vdev_raidz_cksum_finish() time we can compare it with the good data.
 */
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
	size_t c = (size_t)(uintptr_t)arg;
	caddr_t buf;

	raidz_map_t *rm = zio->io_vsd;
	size_t size;

	/* set up the report and bump the refcount */
	zcr->zcr_cbdata = rm;
	zcr->zcr_cbinfo = c;
	zcr->zcr_finish = vdev_raidz_cksum_finish;
	zcr->zcr_free = vdev_raidz_cksum_free;

	rm->rm_reports++;
	ASSERT3U(rm->rm_reports, >, 0);

	if (rm->rm_datacopy != NULL)
		return;

	/*
	 * It's the first time we're called for this raidz_map_t, so we need
	 * to copy the data aside; there's no guarantee that our zio's buffer
	 * won't be re-used for something else.
	 *
	 * Our parity data is already in separate buffers, so there's no need
	 * to copy them.
	 */

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	buf = rm->rm_datacopy = zio_buf_alloc(size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		raidz_col_t *col = &rm->rm_col[c];

		bcopy(col->rc_data, buf, col->rc_size);
		col->rc_data = buf;

		buf += col->rc_size;
	}
	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
}

static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};

/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 */
static raidz_map_t *
vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset,
    uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));
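
	/*
	 * Worked example (illustrative): a 32 KB write to a 5-wide raidz1
	 * with 4 KB sectors has s = 8, so q = 8 / (5 - 1) = 2, r = 0,
	 * bc = 0, and tot = 8 + 1 * (2 + 0) = 10 sectors in all.
	 */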

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_datacopy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);

	rm->rm_col[c].rc_data = data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p, *src, pcount, ccount, i;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p = *src;
			}
		} else {
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p ^= *src;
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
	}
}

static int
vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, i;
	int x = tgts[0];
	int c;

	ASSERT(ntgts == 1);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(x < rm->rm_cols);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
	ASSERT(xcount > 0);

	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	dst = rm->rm_col[x].rc_data;
	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst = *src;
	}

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		if (c == x)
			continue;

		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
		count = MIN(ccount, xcount);

		for (i = 0; i < count; i++, dst++, src++) {
			*dst ^= *src;
		}
	}

	return (1 << VDEV_RAIDZ_P);
}

static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, mask, i;
	uint8_t *b;
	int x = tgts[0];
	int c, j, exp;

	ASSERT(ntgts == 1);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		if (c == x)
			ccount = 0;
		else
			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		count = MIN(ccount, xcount);

		if (c == rm->rm_firstdatacol) {
			for (i = 0; i < count; i++, dst++, src++) {
				*dst = *src;
			}
			for (; i < xcount; i++, dst++) {
				*dst = 0;
			}

		} else {
			for (i = 0; i < count; i++, dst++, src++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
				*dst ^= *src;
			}

			for (; i < xcount; i++, dst++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
			}
		}
	}

	/*
	 * The loop above recomputed the Q recurrence with column x forced
	 * to zero, so dst ^ Q is now 2^(cols - 1 - x) * D_x; multiplying
	 * each byte by 2^(255 - (cols - 1 - x)) divides that coefficient
	 * back out.
	 */
	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	dst = rm->rm_col[x].rc_data;
	exp = 255 - (rm->rm_cols - 1 - x);

	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst ^= *src;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, exp);
		}
	}

	return (1 << VDEV_RAIDZ_Q);
}

static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
	void *pdata, *qdata;
	uint64_t xsize, ysize, i;
	int x = tgts[0];
	int y = tgts[1];

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = pdata;
	q = qdata;
	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xd = rm->rm_col[x].rc_data;
	yd = rm->rm_col[y].rc_data;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	/* a = 2^(x - y) and b = 2^(ndevs - 1 - x), exponents taken mod 255 */
	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];

	/* tmp is the log of (2^(x - y) + 1)^-1; a ^ 1 is the field sum a + 1 */
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
		    vdev_raidz_exp2(*q ^ *qxy, bexp);

		if (i < ysize)
			*yd = *p ^ *pxy ^ *xd;
	}

	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}

/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *           __   __                     __     __
 *           |     |       __     __     |  p_0  |
 *           |  V  |       |  D_0  |     | p_m-1 |
 *           |     |   x   |   :   |  =  |  d_0  |
 *           |  I  |       | D_n-1 |     |   :   |
 *           |     |       ~~     ~~     | d_n-1 |
 *           ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen both for simplicity and
 * speedy computation, as well as for linear separability.
 *
 *           __                __                 __     __
 *           | 1     ..  1 1 1 |                  |  p_0  |
 *           | 2^n-1 ..  4 2 1 |    __     __     |   :   |
 *           | 4^n-1 .. 16 4 1 |    |  D_0  |     | p_m-1 |
 *           | 1     ..  0 0 0 |    |  D_1  |     |  d_0  |
 *           | 0     ..  0 0 0 | x  |  D_2  |  =  |  d_1  |
 *           | :      :  : : : |    |   :   |     |  d_2  |
 *           | 0     ..  1 0 0 |    | D_n-1 |     |   :   |
 *           | 0     ..  0 1 0 |    ~~     ~~     |   :   |
 *           | 0     ..  0 0 1 |                  | d_n-1 |
 *           ~~                ~~                 ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *
 *           __                               __
 *           |   1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16   8   4   2   1  | <-----+-+-- missing disks
 *           |  19 205 116  29  64  16   4   1  |      / /
 *           |   1   0   0   0   0   0   0   0  |     / /
 *           |   0   1   0   0   0   0   0   0  | <--'  /
 *  (V|I)  = |   0   0   1   0   0   0   0   0  | <---'
 *           |   0   0   0   1   0   0   0   0  |
 *           |   0   0   0   0   1   0   0   0  |
 *           |   0   0   0   0   0   1   0   0  |
 *           |   0   0   0   0   0   0   1   0  |
 *           |   0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *           __                               __
 *           |   1   1   1   1   1   1   1   1  |
 *           |  19 205 116  29  64  16   4   1  |
 *           |   1   0   0   0   0   0   0   0  |
 *  (V|I)' = |   0   0   0   1   0   0   0   0  |
 *           |   0   0   0   0   1   0   0   0  |
 *           |   0   0   0   0   0   1   0   0  |
 *           |   0   0   0   0   0   0   1   0  |
 *           |   0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 *             __                                                     __
 *             |  1   1   1   1   1   1   1   1   1   0   0   0   0   0   0   0  |
 *             | 19 205 116  29  64  16   4   1   0   1   0   0   0   0   0   0  |
 *             |  1   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0  |
 *             |  0   0   0   1   0   0   0   0   0   0   0   1   0   0   0   0  |
 *             |  0   0   0   0   1   0   0   0   0   0   0   0   1   0   0   0  |
 *             |  0   0   0   0   0   1   0   0   0   0   0   0   0   1   0   0  |
 *             |  0   0   0   0   0   0   1   0   0   0   0   0   0   0   1   0  |
 *             |  0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   1  |
 *             ~~                                                     ~~
 *             __                                                     __
 *             |  1   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0  |
 *             |  1   1   1   1   1   1   1   1   1   0   0   0   0   0   0   0  |
 *             | 19 205 116  29  64  16   4   1   0   1   0   0   0   0   0   0  |
 *             |  0   0   0   1   0   0   0   0   0   0   0   1   0   0   0   0  |
 *             |  0   0   0   0   1   0   0   0   0   0   0   0   1   0   0   0  |
 *             |  0   0   0   0   0   1   0   0   0   0   0   0   0   1   0   0  |
 *             |  0   0   0   0   0   0   1   0   0   0   0   0   0   0   1   0  |
 *             |  0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   1  |
 *             ~~                                                     ~~
 *             __                                                     __
 *             |  1   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0  |
 *             |  0   1   1   0   0   0   0   0   1   0   1   1   1   1   1   1  |
 *             |  0 205 116   0   0   0   0   0   0   1  19  29  64  16   4   1  |
 *             |  0   0   0   1   0   0   0   0   0   0   0   1   0   0   0   0  |
 *             |  0   0   0   0   1   0   0   0   0   0   0   0   1   0   0   0  |
 *             |  0   0   0   0   0   1   0   0   0   0   0   0   0   1   0   0  |
 *             |  0   0   0   0   0   0   1   0   0   0   0   0   0   0   1   0  |
 *             |  0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   1  |
 *             ~~                                                     ~~
 *             __                                                     __
 *             |  1   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0  |
 *             |  0   1   1   0   0   0   0   0   1   0   1   1   1   1   1   1  |
 *             |  0   0 185   0   0   0   0   0 205   1 222 208 141 221 201 204  |
 *             |  0   0   0   1   0   0   0   0   0   0   0   1   0   0   0   0  |
 *             |  0   0   0   0   1   0   0   0   0   0   0   0   1   0   0   0  |
 *             |  0   0   0   0   0   1   0   0   0   0   0   0   0   1   0   0  |
 *             |  0   0   0   0   0   0   1   0   0   0   0   0   0   0   1   0  |
 *             |  0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   1  |
 *             ~~                                                     ~~
 *             __                                                     __
 *             |  1   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0  |
 *             |  0   1   1   0   0   0   0   0   1   0   1   1   1   1   1   1  |
 *             |  0   0   1   0   0   0   0   0 166 100   4  40 158 168 216 209  |
 *             |  0   0   0   1   0   0   0   0   0   0   0   1   0   0   0   0  |
 *             |  0   0   0   0   1   0   0   0   0   0   0   0   1   0   0   0  |
 *             |  0   0   0   0   0   1   0   0   0   0   0   0   0   1   0   0  |
 *             |  0   0   0   0   0   0   1   0   0   0   0   0   0   0   1   0  |
 *             |  0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   1  |
 *             ~~                                                     ~~
 *             __                                                     __
 *             |  1   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0  |
 *             |  0   1   0   0   0   0   0   0 167 100   5  41 159 169 217 208  |
 *             |  0   0   1   0   0   0   0   0 166 100   4  40 158 168 216 209  |
 *             |  0   0   0   1   0   0   0   0   0   0   0   1   0   0   0   0  |
 *             |  0   0   0   0   1   0   0   0   0   0   0   0   1   0   0   0  |
 *             |  0   0   0   0   0   1   0   0   0   0   0   0   0   1   0   0  |
 *             |  0   0   0   0   0   0   1   0   0   0   0   0   0   0   1   0  |
 *             |  0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   1  |
 *             ~~                                                     ~~
 *
 *                          __                               __
 *                          |   0   0   1   0   0   0   0   0  |
 *                          | 167 100   5  41 159 169 217 208  |
 *                          | 166 100   4  40 158 168 216 209  |
 *             (V|I)'^-1 =  |   0   0   0   1   0   0   0   0  |
 *                          |   0   0   0   0   1   0   0   0  |
 *                          |   0   0   0   0   0   1   0   0  |
 *                          |   0   0   0   0   0   0   1   0  |
 *                          |   0   0   0   0   0   0   0   1  |
 *                          ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
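
/*
 * For example (illustrative): with n = 4 data columns, the row built for Q
 * (map[i] == 1) is filled in as 2^3, 2^2, 2^1, 2^0 == 8, 4, 2, 1 --
 * exactly the corresponding Vandermonde row pictured above.
 */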

static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows is properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}

static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log = 0;
	uint8_t val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}

static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;


	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}

static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}

static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
    uint64_t *ashift)
{
	vdev_t *cvd;
	uint64_t nparity = vd->vdev_nparity;
	int c;
	int lasterror = 0;
	int numerrors = 0;

	ASSERT(nparity > 0);

	if (nparity > VDEV_RAIDZ_MAXPARITY ||
	    vd->vdev_children < nparity + 1) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	vdev_open_children(vd);

	for (c = 0; c < vd->vdev_children; c++) {
		cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error != 0) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}

		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
		*ashift = MAX(*ashift, cvd->vdev_ashift);
	}

	*asize *= vd->vdev_children;
	*max_asize *= vd->vdev_children;

	if (numerrors > nparity) {
		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}

static void
vdev_raidz_close(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_close(vd->vdev_child[c]);
}

/*
 * Handle a read or write I/O to a RAID-Z dump device.
 *
 * The dump device is in a unique situation compared to other ZFS datasets:
 * writing to this device should be as simple and fast as possible. In
 * addition, durability matters much less since the dump will be extracted
 * once the machine reboots. For that reason, this function eschews parity for
 * performance and simplicity. The dump device uses the checksum setting
 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
 * dataset.
 *
 * Blocks of size 128 KB have been preallocated for this volume. I/Os less than
 * 128 KB will not fill an entire block; in addition, they may not be properly
 * aligned. In that case, this function uses the preallocated 128 KB block and
 * omits reading or writing any "empty" portions of that block, as opposed to
 * allocating a fresh appropriately-sized block.
 *
 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
 *
 *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
 *
 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
 * allocated which spans all five child vdevs. 8 KB of data would be written to
 * each of four vdevs, with the fifth containing the parity bits.
 *
 *       parity    data     data     data     data
 *     |   PP   |   XX   |   XX   |   XX   |   XX   |
 *         ^        ^        ^        ^        ^
 *         |        |        |        |        |
 *   8 KB parity    ------8 KB data blocks------
 *
 * However, when writing to the dump device, the behavior is different:
 *
 *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
 *
 * Unlike the normal RAID-Z case in which the block is allocated based on the
 * I/O size, reads and writes here always use a 128 KB logical I/O size. If the
 * I/O size is less than 128 KB, only the actual portions of data are written.
 * In this example the data is written to the third data vdev since that vdev
 * contains the offset [64 KB, 96 KB).
 *
 *       parity    data     data     data     data
 *     |        |        |        |   XX   |        |
 *                                    ^
 *                                    |
 *                             32 KB data block
 *
 * As a result, an individual I/O may not span all child vdevs; moreover, a
 * small I/O may only operate on a single child vdev.
 *
 * Note that since there are no parity bits calculated or written, this format
 * remains the same no matter how many parity bits are used in a normal RAID-Z
 * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above
 * would look like:
 *
 *       parity   parity   parity    data     data     data     data
 *     |        |        |        |        |        |   XX   |        |
 *                                                      ^
 *                                                      |
 *                                               32 KB data block
 */
int
vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
    uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
{
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, err = 0;

	uint64_t start, end, colstart, colend;
	uint64_t coloffset, colsize, colskip;

	int flags = doread ? B_READ : B_WRITE;

#ifdef	_KERNEL

	/*
	 * Don't write past the end of the block
	 */
	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);

	start = offset;
	end = start + size;

	/*
	 * Allocate a RAID-Z map for this block. Note that this block starts
	 * from the "original" offset, that is, the offset of the extent which
	 * contains the requisite offset of the data being read or written.
	 *
	 * Even if this I/O operation doesn't span the full block size, let's
	 * treat the on-disk format as if the only blocks are the complete 128
	 * KB size.
	 */
	rm = vdev_raidz_map_alloc(data - (offset - origoffset),
	    SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
	    vd->vdev_children, vd->vdev_nparity);

	coloffset = origoffset;

	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
	    c++, coloffset += rc->rc_size) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];

		/*
		 * Find the start and end of this column in the RAID-Z map,
		 * keeping in mind that the stated size and offset of the
		 * operation may not fill the entire column for this vdev.
		 *
		 * If any portion of the data spans this column, issue the
		 * appropriate operation to the vdev.
		 */
		if (coloffset + rc->rc_size <= start)
			continue;
		if (coloffset >= end)
			continue;

		colstart = MAX(coloffset, start);
		colend = MIN(end, coloffset + rc->rc_size);
		colsize = colend - colstart;
		colskip = colstart - coloffset;

		VERIFY3U(colsize, <=, rc->rc_size);
		VERIFY3U(colskip, <=, rc->rc_size);

		/*
		 * Note that the child vdev will have a vdev label at the start
		 * of its range of offsets, hence the need for
		 * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another
		 * example of why this calculation is needed.
		 */
		if ((err = vdev_disk_physio(cvd,
		    ((char *)rc->rc_data) + colskip, colsize,
		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
		    flags, isdump)) != 0)
			break;
	}

	vdev_raidz_map_free(rm);
#endif	/* _KERNEL */

	return (err);
}

static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	uint64_t cols = vd->vdev_children;
	uint64_t nparity = vd->vdev_nparity;

	asize = ((psize - 1) >> ashift) + 1;
	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
	asize = roundup(asize, nparity + 1) << ashift;

	return (asize);
}
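
/*
 * Worked example (illustrative, matching the 5-wide raidz1 discussion
 * above): with 4 KB sectors, psize = 32 KB gives asize = 8 data sectors,
 * plus 1 * ((8 + 3) / 4) = 2 parity sectors; roundup(10, 2) = 10 sectors,
 * or 40 KB -- the "at least 40 KB" figure cited in the dump-device comment.
 */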

static void
vdev_raidz_child_done(zio_t *zio)
{
	raidz_col_t *rc = zio->io_private;

	rc->rc_error = zio->io_error;
	rc->rc_tried = 1;
	rc->rc_skipped = 0;
}

/*
 * Start an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Generate the parity data
 *   2. Create child zio write operations to each column's vdev, for both
 *      data and parity.
 *   3. If the column skips any sectors for padding, create optional dummy
 *      write zio children for those areas to improve aggregation continuity.
 * - For read operations:
 *   1. Create child zio read operations to each data column's vdev to read
 *      the range of data required for zio.
 *   2. If this is a scrub or resilver operation, or if any of the data
 *      vdevs have had errors, then create zio read operations to the parity
 *      columns' VDevs as well.
 */
static void
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, i;

	rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
	    tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_generate_parity(rm);

		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		/*
		 * Generate optional I/Os for any skipped sectors to improve
		 * aggregation contiguity.
		 */
		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
			ASSERT(c <= rm->rm_scols);
			if (c == rm->rm_scols)
				c = 0;
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset + rc->rc_size, NULL,
			    1 << tvd->vdev_ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
		}

		zio_execute(zio);
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		if (cvd->vdev_avoid_read) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ENXIO);
			rc->rc_skipped = 1;	/* only try if necessary */
			continue;
		}
		if (!vdev_readable(cvd)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ENXIO);
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ESTALE);
			rc->rc_skipped = 1;
			continue;
		}
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	zio_execute(zio);
}


/*
 * Report a checksum error for a child of a RAID-Z device.
 */
static void
raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
{
	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];

	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		zio_bad_cksum_t zbc;
		raidz_map_t *rm = zio->io_vsd;

		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_checksum_errors++;
		mutex_exit(&vd->vdev_stat_lock);

		zbc.zbc_has_cksum = 0;
		zbc.zbc_injected = rm->rm_ecksuminjected;

		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
		    &zbc);
	}
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(zio_t *zio)
{
	zio_bad_cksum_t zbc;
	raidz_map_t *rm = zio->io_vsd;

	int ret = zio_checksum_error(zio, &zbc);
	if (ret != 0 && zbc.zbc_injected != 0)
		rm->rm_ecksuminjected = 1;

	return (ret);
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
 */
static int
raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	blkptr_t *bp = zio->io_bp;
	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));

	if (checksum == ZIO_CHECKSUM_NOPARITY)
		return (ret);

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zio_buf_alloc(rc->rc_size);
		bcopy(rc->rc_data, orig[c], rc->rc_size);
	}

	vdev_raidz_generate_parity(rm);

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			raidz_checksum_error(zio, rc, orig[c]);
			rc->rc_error = SET_ERROR(ECKSUM);
			ret++;
		}
		zio_buf_free(orig[c], rc->rc_size);
	}

	return (ret);
}

/*
 * Keep statistics on all the ways that we used parity to correct data.
 */
static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];

static int
vdev_raidz_worst_error(raidz_map_t *rm)
{
	int error = 0;

	for (int c = 0; c < rm->rm_cols; c++)
		error = zio_worst_error(error, rm->rm_col[c].rc_error);

	return (error);
}

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 */
static int
vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
{
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
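		 * Only orig[n - 1] needs to be allocated below; orig[0]
		 * through orig[n - 2] were allocated on earlier passes,
		 * when n was smaller, and are reused as-is.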
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					if (rc->rc_tried)
						raidz_checksum_error(zio, rc,
						    orig[i]);
					rc->rc_error = SET_ERROR(ECKSUM);
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position.  (See the stand-alone sketch at
				 * the bottom of this file for the order in
				 * which this walk visits the target
				 * combinations.)
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	n--;
done:
	for (i = 0; i < n; i++) {
		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}

/*
 * Complete an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Check for errors on the child IOs.
 *   2. Return, setting an error code if too few child VDevs were written
 *      to reconstruct the data later. Note that partial writes are
 *      considered successful if they can be reconstructed at all.
 * - For read operations:
 *   1. Check for errors on the child IOs.
 *   2. If data errors occurred:
 *      a. Try to reassemble the data from the parity available.
 *      b. If we haven't yet read the parity drives, read them now.
 *      c. If all parity drives have been read but the data still doesn't
 *         reassemble with a correct checksum, then try combinatorial
 *         reconstruction.
 *      d. If that doesn't work, return an error.
 *   3. If there were unexpected errors or this is a resilver operation,
 *      rewrite the vdevs that had errors.
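 *
 * For writes, "too few" concretely means more than nparity (i.e.
 * rm_firstdatacol) failed columns; up to nparity failed columns can still
 * be reconstructed later, so the write is reported as a success (see the
 * XXPOLICY comment in the code below).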
 */
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	int unexpected_errors = 0;
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	ASSERT(zio->io_bp != NULL);	/* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as a success.
		 * (If we couldn't write enough columns to reconstruct
		 * the data, the I/O failed.  Otherwise, good enough.)
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(zio) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
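			 * There can be at most rm_firstdatacol of them
			 * here, since the correctable-case test above
			 * admitted at most that many total errors (see
			 * the ASSERT below).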
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		zio_vdev_io_redone(zio);
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		return;
	}

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		zio->io_error = vdev_raidz_worst_error(rm);

	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(zio, rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * If the I/O wasn't speculative, start checksum ereports for
		 * all children that haven't failed.
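		 * Each report carries the column index as its private
		 * argument (the (void *)(uintptr_t)c cast below) so the
		 * checksum-report machinery can later identify which
		 * column a report refers to.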
		 */
		zio->io_error = SET_ERROR(ECKSUM);

		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error == 0) {
					zio_bad_cksum_t zbc;
					zbc.zbc_has_cksum = 0;
					zbc.zbc_injected =
					    rm->rm_ecksuminjected;

					zfs_ereport_start_checksum(
					    zio->io_spa,
					    vd->vdev_child[rc->rc_devidx],
					    zio, rc->rc_offset, rc->rc_size,
					    (void *)(uintptr_t)c, &zbc);
				}
			}
		}
	}

done:
	zio_checksum_verified(zio);

	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error == 0)
				continue;

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}

static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
	if (faulted > vd->vdev_nparity)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_NO_REPLICAS);
	else if (degraded + faulted != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	else
		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}

vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	NULL,
	NULL,
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};
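
/*
 * For illustration only: the following stand-alone, user-space sketch (not
 * part of this driver) walks target combinations in the same order as the
 * do/while loop in vdev_raidz_combrec() above, under two simplifying
 * assumptions: no column has a known error, and the rule that pushes the
 * last target into the data columns is ignored.
 *
 *	#include <stdio.h>
 *
 *	static void
 *	enumerate(int cols, int n)
 *	{
 *		int tstore[16 + 2];
 *		int *tgts = &tstore[1];
 *		int current, next, i;
 *
 *		tgts[-1] = -1;		// simplifies one edge condition
 *		for (i = 0; i < n; i++)
 *			tgts[i] = i;	// start with the first n columns
 *		tgts[n] = cols;		// sentinel for the other edge
 *
 *		current = 0;
 *		next = tgts[current];
 *
 *		while (current != n) {
 *			tgts[current] = next;
 *			current = 0;
 *
 *			for (i = 0; i < n; i++)
 *				printf("%d%c", tgts[i],
 *				    i == n - 1 ? '\n' : ' ');
 *
 *			do {
 *				// Advance the lowest target; on collision
 *				// with the next target, reset it and carry
 *				// the increment upward.
 *				next = tgts[current] + 1;
 *				if (next != tgts[current + 1])
 *					break;
 *				tgts[current] = tgts[current - 1] + 1;
 *				current++;
 *			} while (current != n);
 *		}
 *	}
 *
 *	int
 *	main(void)
 *	{
 *		// For cols = 5, n = 2 this prints the pairs in the order
 *		// {0,1}, {0,2}, {1,2}, {0,3}, {1,3}, {2,3}, {0,4}, ...
 *		enumerate(5, 2);
 *		return (0);
 *	}
 */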