/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_file.h>
#include <sys/vdev_raidz.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>

/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all
 * elements can be expressed with a single byte. Briefly, the operations on
 * the field are defined as follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick lookups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition
 * rather than field addition). The inverse of a field element A (A^-1) is
 * therefore A ^ (255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q, R, over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the
 * trivial XOR operation, and 2 and 4 can be computed quickly and generate
 * linearly-independent coefficients. (There are no additional coefficients
 * that have this property which is why the uncorrected Plank method breaks
 * down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */
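/*
 * Illustrative sketch (not compiled into the driver): general multiplication
 * and inversion in GF(2^8) using the vdev_raidz_pow2[] and vdev_raidz_log2[]
 * tables defined below. The helpers gf_mul() and gf_inv() are hypothetical
 * names local to this sketch; the driver open-codes these operations where it
 * needs them (see vdev_raidz_exp2() and VDEV_RAIDZ_MUL_2()).
 */
#if 0
static uint8_t
gf_mul(uint8_t a, uint8_t b)	/* hypothetical helper */
{
	int exp;

	if (a == 0 || b == 0)
		return (0);

	/* A * B = 2^(log_2(A) + log_2(B)); the exponent is reduced mod 255 */
	exp = vdev_raidz_log2[a] + vdev_raidz_log2[b];
	if (exp >= 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}

static uint8_t
gf_inv(uint8_t a)		/* hypothetical helper */
{
	/* A^-1 = A^254 = 2^(255 - log_2(A)), per the comment above */
	ASSERT(a != 0);
	return (vdev_raidz_pow2[255 - vdev_raidz_log2[a]]);
}
#endif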
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	abd_t *rc_abd;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	abd_t *rm_abd_copy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t rm_freed;		/* map no longer has referencing ZIO */
	uint8_t rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
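/*
 * Illustrative sketch (not compiled into the driver): the 64-bit macros above
 * are just eight byte-wise GF(2^8) multiplications by 2 performed in
 * parallel, one per lane. The hypothetical check below relates them to the
 * byte-at-a-time VDEV_RAIDZ_MUL_2() definition.
 */
#if 0
static void
vdev_raidz_64mul_check(uint64_t x)	/* hypothetical helper */
{
	uint64_t y = x, mask;
	int i;

	VDEV_RAIDZ_64MUL_2(y, mask);

	for (i = 0; i < 8; i++) {
		uint8_t b = (x >> (i * 8)) & 0xff;

		/* each lane matches the single-byte multiply */
		ASSERT3U((y >> (i * 8)) & 0xff, ==,
		    (uint8_t)VDEV_RAIDZ_MUL_2(b));
	}
}
#endif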
#define	VDEV_LABEL_OFFSET(x)	(x + VDEV_LABEL_START_SIZE)

/*
 * Force reconstruction to use the general purpose method.
 */
int vdev_raidz_default_to_general;

/* Powers of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/* Logs of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

static void vdev_raidz_generate_parity(raidz_map_t *rm);
/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}

static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;
	size_t size;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		abd_free(rm->rm_col[c].rc_abd);

		if (rm->rm_col[c].rc_gdata != NULL)
			zio_buf_free(rm->rm_col[c].rc_gdata,
			    rm->rm_col[c].rc_size);
	}

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		abd_put(rm->rm_col[c].rc_abd);
		size += rm->rm_col[c].rc_size;
	}

	if (rm->rm_abd_copy != NULL)
		abd_free(rm->rm_abd_copy);

	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	ASSERT0(rm->rm_freed);
	rm->rm_freed = 1;

	if (rm->rm_reports == 0)
		vdev_raidz_map_free(rm);
}

/*ARGSUSED*/
static void
vdev_raidz_cksum_free(void *arg, size_t ignored)
{
	raidz_map_t *rm = arg;

	ASSERT3U(rm->rm_reports, >, 0);

	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
		vdev_raidz_map_free(rm);
}
static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;
	size_t x;

	const char *good = NULL;
	char *bad;

	if (good_data == NULL) {
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
			char *buf;
			int offset;

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_abd;
				rm->rm_col[x].rc_gdata =
				    zio_buf_alloc(rm->rm_col[x].rc_size);
				rm->rm_col[x].rc_abd =
				    abd_get_from_buf(rm->rm_col[x].rc_gdata,
				    rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			buf = (char *)good_data;
			for (; x < rm->rm_cols; x++) {
				abd_put(rm->rm_col[x].rc_abd);
				rm->rm_col[x].rc_abd = abd_get_from_buf(buf,
				    rm->rm_col[x].rc_size);
				buf += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				abd_put(rm->rm_col[x].rc_abd);
				rm->rm_col[x].rc_abd = bad_parity[x];
			}

			offset = 0;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				abd_put(rm->rm_col[x].rc_abd);
				rm->rm_col[x].rc_abd = abd_get_offset(
				    rm->rm_abd_copy, offset);
				offset += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = rm->rm_col[c].rc_gdata;
	} else {
		/* adjust good_data to point at the start of our column */
		good = good_data;

		for (x = rm->rm_firstdatacol; x < c; x++)
			good += rm->rm_col[x].rc_size;
	}

	bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size);
	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
	abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size);
}

/*
 * Invoked indirectly by zfs_ereport_start_checksum(), called
 * below when our read operation fails completely. The main point
 * is to keep a copy of everything we read from disk, so that at
 * vdev_raidz_cksum_finish() time we can compare it with the good data.
 */
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
	size_t c = (size_t)(uintptr_t)arg;
	size_t offset;

	raidz_map_t *rm = zio->io_vsd;
	size_t size;

	/* set up the report and bump the refcount */
	zcr->zcr_cbdata = rm;
	zcr->zcr_cbinfo = c;
	zcr->zcr_finish = vdev_raidz_cksum_finish;
	zcr->zcr_free = vdev_raidz_cksum_free;

	rm->rm_reports++;
	ASSERT3U(rm->rm_reports, >, 0);

	if (rm->rm_abd_copy != NULL)
		return;

	/*
	 * It's the first time we're called for this raidz_map_t, so we need
	 * to copy the data aside; there's no guarantee that our zio's buffer
	 * won't be re-used for something else.
	 *
	 * Our parity data is already in separate buffers, so there's no need
	 * to copy them.
	 */

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	rm->rm_abd_copy =
	    abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size);

	for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		raidz_col_t *col = &rm->rm_col[c];
		abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset);

		abd_copy(tmp, col->rc_abd, col->rc_size);
		abd_put(col->rc_abd);
		col->rc_abd = tmp;

		offset += col->rc_size;
	}
	ASSERT3U(offset, ==, size);
}

static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};
/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 */
static raidz_map_t *
vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset,
    uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
	uint64_t off = 0;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_abd_copy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_abd = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_abd =
		    abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);

	rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
	off = rm->rm_col[c].rc_size;

	for (c = c + 1; c < acols; c++) {
		rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
		off += rm->rm_col[c].rc_size;
	}

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
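/*
 * Editorial worked example for the geometry above (no new code): consider a
 * 32 KB write to a 5-wide RAIDZ1 (dcols = 5, nparity = 1) with 512-byte
 * sectors (unit_shift = 9). Then s = 64 data sectors, q = 64 / 4 = 16,
 * r = 0, bc = 0, and tot = 64 + 1 * 16 = 80 sectors. Since q > 0 the I/O
 * spans all children: acols = scols = 5, every column gets q << 9 = 8 KB,
 * rm_asize = 80 << 9 = 40 KB, and rm_nskip = roundup(80, 2) - 80 = 0. This
 * matches the 8 KB-per-child, 40 KB allocation described for the same I/O
 * in the dump-device comment further below.
 */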
struct pqr_struct {
	uint64_t *p;
	uint64_t *q;
	uint64_t *r;
};

static int
vdev_raidz_p_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && !pqr->q && !pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++)
		*pqr->p ^= *src;

	return (0);
}

static int
vdev_raidz_pq_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	uint64_t mask;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && pqr->q && !pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
		*pqr->p ^= *src;
		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
		*pqr->q ^= *src;
	}

	return (0);
}

static int
vdev_raidz_pqr_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	uint64_t mask;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && pqr->q && pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
		*pqr->p ^= *src;
		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
		*pqr->q ^= *src;
		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
		*pqr->r ^= *src;
	}

	return (0);
}

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p;
	int c;
	abd_t *src;

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);

		if (c == rm->rm_firstdatacol) {
			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, NULL, NULL };
			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_p_func, &pqr);
		}
	}
}
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, pcnt, ccnt, mask, i;
	int c;
	abd_t *src;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);

		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);

		if (c == rm->rm_firstdatacol) {
			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
			(void) memcpy(q, p, rm->rm_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, q, NULL };
			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_pq_func, &pqr);
		}

		if (c == rm->rm_firstdatacol) {
			for (i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
			}
		} else {
			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
	int c;
	abd_t *src;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
		r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);

		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);

		if (c == rm->rm_firstdatacol) {
			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
			(void) memcpy(q, p, rm->rm_col[c].rc_size);
			(void) memcpy(r, p, rm->rm_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, q, r };
			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_pqr_func, &pqr);
		}

		if (c == rm->rm_firstdatacol) {
			for (i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
				r[i] = 0;
			}
		} else {
			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
				VDEV_RAIDZ_64MUL_4(r[i], mask);
			}
		}
	}
}
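/*
 * Illustrative sketch (not compiled into the driver): a byte-at-a-time
 * reference for the P, Q and R generators above, written as the Horner-style
 * evaluation described in the big comment at the top of this file. The name
 * and signature are hypothetical, and equal-sized columns are assumed; the
 * real code iterates 64 bits at a time over ABDs and zero-pads short columns.
 */
#if 0
static void
raidz_pqr_reference(const uint8_t **d, int ndata, size_t size,
    uint8_t *p, uint8_t *q, uint8_t *r)
{
	size_t i;
	int c;

	for (i = 0; i < size; i++) {
		uint8_t pv = 0, qv = 0, rv = 0;

		for (c = 0; c < ndata; c++) {
			pv ^= d[c][i];				/* P: XOR */
			qv = VDEV_RAIDZ_MUL_2(qv) ^ d[c][i];	/* Q: * 2 */
			rv = VDEV_RAIDZ_MUL_4(rv) ^ d[c][i];	/* R: * 4 */
		}
		p[i] = pv;
		q[i] = qv;
		r[i] = rv;
	}
}
#endif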
/*
 * Generate RAID parity in the first virtual columns according to the number
 * of parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
	}
}

/* ARGSUSED */
static int
vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
{
	uint64_t *dst = dbuf;
	uint64_t *src = sbuf;
	int cnt = size / sizeof (src[0]);

	for (int i = 0; i < cnt; i++) {
		dst[i] ^= src[i];
	}

	return (0);
}

/* ARGSUSED */
static int
vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
    void *private)
{
	uint64_t *dst = dbuf;
	uint64_t *src = sbuf;
	uint64_t mask;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++, src++) {
		VDEV_RAIDZ_64MUL_2(*dst, mask);
		*dst ^= *src;
	}

	return (0);
}

/* ARGSUSED */
static int
vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
{
	uint64_t *dst = buf;
	uint64_t mask;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++) {
		/* same operation as vdev_raidz_reconst_q_pre_func() on dst */
		VDEV_RAIDZ_64MUL_2(*dst, mask);
	}

	return (0);
}

struct reconst_q_struct {
	uint64_t *q;
	int exp;
};

static int
vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
{
	struct reconst_q_struct *rq = private;
	uint64_t *dst = buf;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++, rq->q++) {
		*dst ^= *rq->q;

		int j;
		uint8_t *b;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, rq->exp);
		}
	}

	return (0);
}

struct reconst_pq_struct {
	uint8_t *p;
	uint8_t *q;
	uint8_t *pxy;
	uint8_t *qxy;
	int aexp;
	int bexp;
};

static int
vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
{
	struct reconst_pq_struct *rpq = private;
	uint8_t *xd = xbuf;
	uint8_t *yd = ybuf;

	for (int i = 0; i < size;
	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
	}

	return (0);
}

static int
vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
{
	struct reconst_pq_struct *rpq = private;
	uint8_t *xd = xbuf;

	for (int i = 0; i < size;
	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
		/* same operation as vdev_raidz_reconst_pq_func() on xd */
		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
	}

	return (0);
}

static int
vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
{
	int x = tgts[0];
	int c;
	abd_t *dst, *src;

	ASSERT(ntgts == 1);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(x < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
	ASSERT(rm->rm_col[x].rc_size > 0);

	src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
	dst = rm->rm_col[x].rc_abd;

	abd_copy_from_buf(dst, abd_to_buf(src), rm->rm_col[x].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		uint64_t size = MIN(rm->rm_col[x].rc_size,
		    rm->rm_col[c].rc_size);

		src = rm->rm_col[c].rc_abd;
		dst = rm->rm_col[x].rc_abd;

		if (c == x)
			continue;

		(void) abd_iterate_func2(dst, src, 0, 0, size,
		    vdev_raidz_reconst_p_func, NULL);
	}

	return (1 << VDEV_RAIDZ_P);
}
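/*
 * Illustrative sketch (not compiled into the driver): the P-only rebuild
 * above is plain XOR. Assuming equal-sized columns, a byte-wise reference
 * looks like this; the name and signature are hypothetical.
 */
#if 0
static void
raidz_rec_p_reference(uint8_t **d, int ndata, int x, const uint8_t *p,
    size_t size)
{
	size_t i;
	int c;

	for (i = 0; i < size; i++) {
		uint8_t v = p[i];		/* P = D_0 + ... + D_n-1 */

		for (c = 0; c < ndata; c++) {
			if (c != x)
				v ^= d[c][i];	/* cancel surviving columns */
		}
		d[x][i] = v;			/* what remains is D_x */
	}
}
#endif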
static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	int x = tgts[0];
	int c, exp;
	abd_t *dst, *src;

	ASSERT(ntgts == 1);

	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
		    rm->rm_col[c].rc_size);

		src = rm->rm_col[c].rc_abd;
		dst = rm->rm_col[x].rc_abd;

		if (c == rm->rm_firstdatacol) {
			abd_copy(dst, src, size);
			if (rm->rm_col[x].rc_size > size)
				abd_zero_off(dst, size,
				    rm->rm_col[x].rc_size - size);
		} else {
			ASSERT3U(size, <=, rm->rm_col[x].rc_size);
			(void) abd_iterate_func2(dst, src, 0, 0, size,
			    vdev_raidz_reconst_q_pre_func, NULL);
			(void) abd_iterate_func(dst,
			    size, rm->rm_col[x].rc_size - size,
			    vdev_raidz_reconst_q_pre_tail_func, NULL);
		}
	}

	src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
	dst = rm->rm_col[x].rc_abd;
	exp = 255 - (rm->rm_cols - 1 - x);

	struct reconst_q_struct rq = { abd_to_buf(src), exp };
	(void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
	    vdev_raidz_reconst_q_post_func, &rq);

	return (1 << VDEV_RAIDZ_Q);
}
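/*
 * Editorial restatement of the Q-only rebuild above (no new code). The pre
 * pass recomputes Q with D_x treated as zero -- call it Q' -- in the target
 * column (column x contributes a zero-length buffer, so its step is just the
 * multiply by 2). The post pass then solves for D_x:
 *
 *	Q + Q' = 2^(rm_cols - 1 - x) * D_x
 *	D_x    = (Q + Q') * 2^(255 - (rm_cols - 1 - x))
 *
 * which is exactly the exp = 255 - (rm_cols - 1 - x) handed to
 * vdev_raidz_exp2() in vdev_raidz_reconst_q_post_func().
 */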
static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
	abd_t *pdata, *qdata;
	uint64_t xsize, ysize;
	int x = tgts[0];
	int y = tgts[1];
	abd_t *xd, *yd;

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_abd =
	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
	rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = abd_to_buf(pdata);
	q = abd_to_buf(qdata);
	pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
	qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
	xd = rm->rm_col[x].rc_abd;
	yd = rm->rm_col[y].rc_abd;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^-(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	ASSERT3U(xsize, >=, ysize);
	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
	    vdev_raidz_reconst_pq_func, &rpq);
	(void) abd_iterate_func(xd, ysize, xsize - ysize,
	    vdev_raidz_reconst_pq_tail_func, &rpq);

	abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
	abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
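/*
 * Editorial worked derivation for the coefficients above (no new code).
 * Writing a = 2^(x - y) and b = 2^-(ndevs - 1 - x), scaling the Q equation
 * by b gives:
 *
 *	    P + Pxy   = D_x + D_y
 *	b * (Q + Qxy) = D_x + a * D_y
 *
 * Multiplying the first equation by a and adding the second eliminates D_y:
 *
 *	(a + 1) * D_x = a * (P + Pxy) + b * (Q + Qxy)
 *
 * so A = a * (a + 1)^-1 and B = b * (a + 1)^-1, which is why the code above
 * computes tmp = 255 - log_2(a ^ 1) -- the log of (a + 1)^-1 -- and folds it
 * into both aexp and bexp.
 */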
/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen both for simplicity and
 * speedy computation as well as linear separability.
 *
 *      __             __                     __     __
 *      |   1  ..  1 1 1 |                    |  p_0  |
 *      | 2^n-1 .. 4 2 1 |      __     __     |   :   |
 *      | 4^n-1 .. 16 4 1 |     |  D_0  |     | p_m-1 |
 *      |   1  ..  0 0 0 |     |  D_1  |     |  d_0  |
 *      |   0  ..  0 0 0 |  x  |  D_2  |  =  |  d_1  |
 *      |   :      : : : |     |   :   |     |  d_2  |
 *      |   0  ..  1 0 0 |     | D_n-1 |     |   :   |
 *      |   0  ..  0 1 0 |     ~~     ~~     |   :   |
 *      |   0  ..  0 0 1 |                   | d_n-1 |
 *      ~~             ~~                    ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *             __                               __
 *            |   1    1    1    1    1    1    1    1  |
 *            | 128   64   32   16    8    4    2    1  | <-----+-+-- missing disks
 *            |  19  205  116   29   64   16    4    1  |      / /
 *            |   1    0    0    0    0    0    0    0  |     / /
 *            |   0    1    0    0    0    0    0    0  | <--' /
 *  (V|I)  =  |   0    0    1    0    0    0    0    0  | <---'
 *            |   0    0    0    1    0    0    0    0  |
 *            |   0    0    0    0    1    0    0    0  |
 *            |   0    0    0    0    0    1    0    0  |
 *            |   0    0    0    0    0    0    1    0  |
 *            |   0    0    0    0    0    0    0    1  |
 *            ~~                                       ~~
 *             __                               __
 *            |   1    1    1    1    1    1    1    1  |
 *            |  19  205  116   29   64   16    4    1  |
 *            |   1    0    0    0    0    0    0    0  |
 *  (V|I)' =  |   0    0    0    1    0    0    0    0  |
 *            |   0    0    0    0    1    0    0    0  |
 *            |   0    0    0    0    0    1    0    0  |
 *            |   0    0    0    0    0    0    1    0  |
 *            |   0    0    0    0    0    0    0    1  |
 *            ~~                                       ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 *  __                                                                 __
 * |   1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0  |
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *  ~~                                                                 ~~
 *  __                                                                 __
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |   1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0  |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *  ~~                                                                 ~~
 *  __                                                                 __
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |   0 205 116   0   0   0   0   0     0   1  19  29  64  16   4   1  |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *  ~~                                                                 ~~
 *  __                                                                 __
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |   0   0 185   0   0   0   0   0   205   1 222 208 141 221 201 204  |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *  ~~                                                                 ~~
 *  __                                                                 __
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |   0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209  |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *  ~~                                                                 ~~
 *  __                                                                 __
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |   0   1   0   0   0   0   0   0   167 100   5  41 159 169 217 208  |
 * |   0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209  |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *  ~~                                                                 ~~
 *                   __                               __
 *                  |   0    0    1    0    0    0    0    0  |
 *                  | 167  100    5   41  159  169  217  208  |
 *                  | 166  100    4   40  158  168  216  209  |
 *   (V|I)'^-1  =   |   0    0    0    1    0    0    0    0  |
 *                  |   0    0    0    0    1    0    0    0  |
 *                  |   0    0    0    0    0    1    0    0  |
 *                  |   0    0    0    0    0    0    1    0  |
 *                  |   0    0    0    0    0    0    0    1  |
 *                  ~~                                       ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
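/*
 * Editorial worked example for the row generation above (no new code): with
 * n = 8 data columns, the Q row (map[i] = 1) starts at pow = 8 and steps
 * down by 1, yielding 2^7 .. 2^0 = 128 64 32 16 8 4 2 1; the R row
 * (map[i] = 2) starts at pow = 16 and steps down by 2, yielding
 * 2^14 .. 2^0 = 19 205 116 29 64 16 4 1 (exponents reduced through
 * vdev_raidz_pow2[]). These are exactly the second and third rows of (V|I)
 * in the block comment above.
 */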
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}

static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log = 0;
	uint8_t val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = abd_to_buf(rm->rm_col[c].rc_abd);
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	abd_t **bufs = NULL;

	int code = 0;

	/*
	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
	 * temporary linear ABDs.
	 */
	if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
		bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);

		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
			raidz_col_t *col = &rm->rm_col[c];

			bufs[c] = col->rc_abd;
			col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
			abd_copy(col->rc_abd, bufs[c], col->rc_size);
		}
	}

	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	/*
	 * copy back from temporary linear abds and free them
	 */
	if (bufs) {
		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
			raidz_col_t *col = &rm->rm_col[c];

			abd_copy(bufs[c], col->rc_abd, col->rc_size);
			abd_free(col->rc_abd);
			col->rc_abd = bufs[c];
		}
		kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
	}

	return (code);
}
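/*
 * Editorial worked example (no new code): for the failure scenario in the
 * big comment above (RAID-Z3, n = 8 data columns, d_1, d_2 and p_1 lost),
 * the map columns are P = 0, Q = 1, R = 2 and d_0 .. d_7 = 3 .. 10, so the
 * routine is entered with tgts = { 1, 4, 5 }. It derives
 * missing_rows = { 1, 2 }, selects the surviving parity columns P and R
 * (parity_map = { 0, 2 }, code = (1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_R)),
 * and builds used[] = { 0, 2, 3, 6, 7, 8, 9, 10 } before initializing and
 * inverting the matrix.
 */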
static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}

static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
    uint64_t *ashift)
{
	vdev_t *cvd;
	uint64_t nparity = vd->vdev_nparity;
	int c;
	int lasterror = 0;
	int numerrors = 0;

	ASSERT(nparity > 0);

	if (nparity > VDEV_RAIDZ_MAXPARITY ||
	    vd->vdev_children < nparity + 1) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	vdev_open_children(vd);

	for (c = 0; c < vd->vdev_children; c++) {
		cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error != 0) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}

		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
		*ashift = MAX(*ashift, cvd->vdev_ashift);
	}

	*asize *= vd->vdev_children;
	*max_asize *= vd->vdev_children;

	if (numerrors > nparity) {
		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}

static void
vdev_raidz_close(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_close(vd->vdev_child[c]);
}
/*
 * Handle a read or write I/O to a RAID-Z dump device.
 *
 * The dump device is in a unique situation compared to other ZFS datasets:
 * writing to this device should be as simple and fast as possible. In
 * addition, durability matters much less since the dump will be extracted
 * once the machine reboots. For that reason, this function eschews parity for
 * performance and simplicity. The dump device uses the checksum setting
 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
 * dataset.
 *
 * Blocks of size 128 KB have been preallocated for this volume. I/Os less
 * than 128 KB will not fill an entire block; in addition, they may not be
 * properly aligned. In that case, this function uses the preallocated 128 KB
 * block and omits reading or writing any "empty" portions of that block, as
 * opposed to allocating a fresh appropriately-sized block.
 *
 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
 *
 *	vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
 *
 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
 * allocated which spans all five child vdevs. 8 KB of data would be written to
 * each of four vdevs, with the fifth containing the parity bits.
 *
 *	  parity    data     data     data     data
 *	|   PP   |   XX   |   XX   |   XX   |   XX   |
 *	     ^        ^        ^        ^        ^
 *	     |        |        |        |        |
 *	8 KB parity   ------8 KB data blocks------
 *
 * However, when writing to the dump device, the behavior is different:
 *
 *	vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
 *
 * Unlike the normal RAID-Z case in which the block is allocated based on the
 * I/O size, reads and writes here always use a 128 KB logical I/O size. If the
 * I/O size is less than 128 KB, only the actual portions of data are written.
 * In this example the data is written to the third data vdev since that vdev
 * contains the offset [64 KB, 96 KB).
 *
 *	  parity    data     data     data     data
 *	|        |        |        |   XX   |        |
 *	                               ^
 *	                               |
 *	                       32 KB data block
 *
 * As a result, an individual I/O may not span all child vdevs; moreover, a
 * small I/O may only operate on a single child vdev.
 *
 * Note that since there are no parity bits calculated or written, this format
 * remains the same no matter how many parity bits are used in a normal RAID-Z
 * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above
 * would look like:
 *
 *	  parity   parity   parity    data     data     data     data
 *	|        |        |        |        |        |   XX   |        |
 *	                                                  ^
 *	                                                  |
 *	                                          32 KB data block
 */
int
vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
    uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
{
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, err = 0;

	uint64_t start, end, colstart, colend;
	uint64_t coloffset, colsize, colskip;

	int flags = doread ? B_READ : B_WRITE;

#ifdef	_KERNEL

	/*
	 * Don't write past the end of the block
	 */
	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);

	start = offset;
	end = start + size;

	/*
	 * Allocate a RAID-Z map for this block. Note that this block starts
	 * from the "original" offset, that is, the offset of the extent which
	 * contains the requisite offset of the data being read or written.
	 *
	 * Even if this I/O operation doesn't span the full block size, let's
	 * treat the on-disk format as if the only blocks are the complete 128
	 * KB size.
	 */
	abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
	    SPA_OLD_MAXBLOCKSIZE);
	rm = vdev_raidz_map_alloc(abd,
	    SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
	    vd->vdev_children, vd->vdev_nparity);

	coloffset = origoffset;

	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
	    c++, coloffset += rc->rc_size) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];

		/*
		 * Find the start and end of this column in the RAID-Z map,
		 * keeping in mind that the stated size and offset of the
		 * operation may not fill the entire column for this vdev.
		 *
		 * If any portion of the data spans this column, issue the
		 * appropriate operation to the vdev.
		 */
		if (coloffset + rc->rc_size <= start)
			continue;
		if (coloffset >= end)
			continue;

		colstart = MAX(coloffset, start);
		colend = MIN(end, coloffset + rc->rc_size);
		colsize = colend - colstart;
		colskip = colstart - coloffset;

		VERIFY3U(colsize, <=, rc->rc_size);
		VERIFY3U(colskip, <=, rc->rc_size);

		/*
		 * Note that the child vdev will have a vdev label at the start
		 * of its range of offsets, hence the need for
		 * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another
		 * example of why this calculation is needed.
		 */
		if ((err = vdev_disk_physio(cvd,
		    ((char *)rc->rc_abd) + colskip, colsize,
		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
		    flags, isdump)) != 0)
			break;
	}

	vdev_raidz_map_free(rm);
	abd_put(abd);
#endif	/* _KERNEL */

	return (err);
}

static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	uint64_t cols = vd->vdev_children;
	uint64_t nparity = vd->vdev_nparity;

	asize = ((psize - 1) >> ashift) + 1;
	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
	asize = roundup(asize, nparity + 1) << ashift;

	return (asize);
}
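/*
 * Editorial worked example for the asize computation above (no new code):
 * for the 32 KB block on the 5-wide RAIDZ1 from the dump-device comment
 * (ashift = 9, cols = 5, nparity = 1), asize starts at
 * ((32768 - 1) >> 9) + 1 = 64 sectors, gains 1 * ((64 + 3) / 4) = 16 parity
 * sectors, and roundup(80, 2) << 9 yields the 40 KB allocation that comment
 * describes.
 */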
1904 */ 1905 static void 1906 vdev_raidz_io_start(zio_t *zio) 1907 { 1908 vdev_t *vd = zio->io_vd; 1909 vdev_t *tvd = vd->vdev_top; 1910 vdev_t *cvd; 1911 raidz_map_t *rm; 1912 raidz_col_t *rc; 1913 int c, i; 1914 1915 rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset, 1916 tvd->vdev_ashift, vd->vdev_children, 1917 vd->vdev_nparity); 1918 1919 zio->io_vsd = rm; 1920 zio->io_vsd_ops = &vdev_raidz_vsd_ops; 1921 1922 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); 1923 1924 if (zio->io_type == ZIO_TYPE_WRITE) { 1925 vdev_raidz_generate_parity(rm); 1926 1927 for (c = 0; c < rm->rm_cols; c++) { 1928 rc = &rm->rm_col[c]; 1929 cvd = vd->vdev_child[rc->rc_devidx]; 1930 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1931 rc->rc_offset, rc->rc_abd, rc->rc_size, 1932 zio->io_type, zio->io_priority, 0, 1933 vdev_raidz_child_done, rc)); 1934 } 1935 1936 /* 1937 * Generate optional I/Os for any skipped sectors to improve 1938 * aggregation contiguity. 1939 */ 1940 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { 1941 ASSERT(c <= rm->rm_scols); 1942 if (c == rm->rm_scols) 1943 c = 0; 1944 rc = &rm->rm_col[c]; 1945 cvd = vd->vdev_child[rc->rc_devidx]; 1946 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1947 rc->rc_offset + rc->rc_size, NULL, 1948 1 << tvd->vdev_ashift, 1949 zio->io_type, zio->io_priority, 1950 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); 1951 } 1952 1953 zio_execute(zio); 1954 return; 1955 } 1956 1957 ASSERT(zio->io_type == ZIO_TYPE_READ); 1958 1959 /* 1960 * Iterate over the columns in reverse order so that we hit the parity 1961 * last -- any errors along the way will force us to read the parity. 1962 */ 1963 for (c = rm->rm_cols - 1; c >= 0; c--) { 1964 rc = &rm->rm_col[c]; 1965 cvd = vd->vdev_child[rc->rc_devidx]; 1966 if (!vdev_readable(cvd)) { 1967 if (c >= rm->rm_firstdatacol) 1968 rm->rm_missingdata++; 1969 else 1970 rm->rm_missingparity++; 1971 rc->rc_error = SET_ERROR(ENXIO); 1972 rc->rc_tried = 1; /* don't even try */ 1973 rc->rc_skipped = 1; 1974 continue; 1975 } 1976 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 1977 if (c >= rm->rm_firstdatacol) 1978 rm->rm_missingdata++; 1979 else 1980 rm->rm_missingparity++; 1981 rc->rc_error = SET_ERROR(ESTALE); 1982 rc->rc_skipped = 1; 1983 continue; 1984 } 1985 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || 1986 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { 1987 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1988 rc->rc_offset, rc->rc_abd, rc->rc_size, 1989 zio->io_type, zio->io_priority, 0, 1990 vdev_raidz_child_done, rc)); 1991 } 1992 } 1993 1994 zio_execute(zio); 1995 } 1996 1997 1998 /* 1999 * Report a checksum error for a child of a RAID-Z device. 
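 *
 * Nothing is posted for speculative I/Os. At the time of the call,
 * rc_abd holds the expected ("good") data, so it is copied out with
 * abd_borrow_buf_copy() and handed to the ereport alongside bad_data,
 * the data actually read from the child.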
2000 */ 2001 static void 2002 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) 2003 { 2004 void *buf; 2005 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 2006 2007 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2008 zio_bad_cksum_t zbc; 2009 raidz_map_t *rm = zio->io_vsd; 2010 2011 mutex_enter(&vd->vdev_stat_lock); 2012 vd->vdev_stat.vs_checksum_errors++; 2013 mutex_exit(&vd->vdev_stat_lock); 2014 2015 zbc.zbc_has_cksum = 0; 2016 zbc.zbc_injected = rm->rm_ecksuminjected; 2017 2018 buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size); 2019 zfs_ereport_post_checksum(zio->io_spa, vd, zio, 2020 rc->rc_offset, rc->rc_size, buf, bad_data, 2021 &zbc); 2022 abd_return_buf(rc->rc_abd, buf, rc->rc_size); 2023 } 2024 } 2025 2026 /* 2027 * We keep track of whether or not there were any injected errors, so that 2028 * any ereports we generate can note it. 2029 */ 2030 static int 2031 raidz_checksum_verify(zio_t *zio) 2032 { 2033 zio_bad_cksum_t zbc; 2034 raidz_map_t *rm = zio->io_vsd; 2035 2036 int ret = zio_checksum_error(zio, &zbc); 2037 if (ret != 0 && zbc.zbc_injected != 0) 2038 rm->rm_ecksuminjected = 1; 2039 2040 return (ret); 2041 } 2042 2043 /* 2044 * Generate the parity from the data columns. If we tried and were able to 2045 * read the parity without error, verify that the generated parity matches the 2046 * data we read. If it doesn't, we fire off a checksum error. Return the 2047 * number of such failures. 2048 */ 2049 static int 2050 raidz_parity_verify(zio_t *zio, raidz_map_t *rm) 2051 { 2052 void *orig[VDEV_RAIDZ_MAXPARITY]; 2053 int c, ret = 0; 2054 raidz_col_t *rc; 2055 2056 blkptr_t *bp = zio->io_bp; 2057 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : 2058 (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); 2059 2060 if (checksum == ZIO_CHECKSUM_NOPARITY) 2061 return (ret); 2062 2063 for (c = 0; c < rm->rm_firstdatacol; c++) { 2064 rc = &rm->rm_col[c]; 2065 if (!rc->rc_tried || rc->rc_error != 0) 2066 continue; 2067 orig[c] = zio_buf_alloc(rc->rc_size); 2068 abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size); 2069 } 2070 2071 vdev_raidz_generate_parity(rm); 2072 2073 for (c = 0; c < rm->rm_firstdatacol; c++) { 2074 rc = &rm->rm_col[c]; 2075 if (!rc->rc_tried || rc->rc_error != 0) 2076 continue; 2077 if (bcmp(orig[c], abd_to_buf(rc->rc_abd), rc->rc_size) != 0) { 2078 raidz_checksum_error(zio, rc, orig[c]); 2079 rc->rc_error = SET_ERROR(ECKSUM); 2080 ret++; 2081 } 2082 zio_buf_free(orig[c], rc->rc_size); 2083 } 2084 2085 return (ret); 2086 } 2087 2088 /* 2089 * Keep statistics on all the ways that we used parity to correct data. 2090 */ 2091 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY]; 2092 2093 static int 2094 vdev_raidz_worst_error(raidz_map_t *rm) 2095 { 2096 int error = 0; 2097 2098 for (int c = 0; c < rm->rm_cols; c++) 2099 error = zio_worst_error(error, rm->rm_col[c].rc_error); 2100 2101 return (error); 2102 } 2103 2104 /* 2105 * Iterate over all combinations of bad data and attempt a reconstruction. 2106 * Note that the algorithm below is non-optimal because it doesn't take into 2107 * account how reconstruction is actually performed. For example, with 2108 * triple-parity RAID-Z the reconstruction procedure is the same if column 4 2109 * is targeted as invalid as if columns 1 and 4 are targeted since in both 2110 * cases we'd only use parity information in column 0.
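 *
 * As an illustration of the search order (an example, not code): for a
 * raidz2 map with columns P, Q, D0, D1, D2 and no known-bad columns, the
 * n = 1 pass tries each data column alone (D0, then D1, then D2), and
 * the n = 2 pass tries every pair that includes at least one data
 * column: (P,D0), (Q,D0), (P,D1), (Q,D1), (D0,D1), ..., (D1,D2). An
 * attempt is accepted only if the reassembled block passes its checksum.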
2111 */ 2112 static int 2113 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) 2114 { 2115 raidz_map_t *rm = zio->io_vsd; 2116 raidz_col_t *rc; 2117 void *orig[VDEV_RAIDZ_MAXPARITY]; 2118 int tstore[VDEV_RAIDZ_MAXPARITY + 2]; 2119 int *tgts = &tstore[1]; 2120 int current, next, i, c, n; 2121 int code, ret = 0; 2122 2123 ASSERT(total_errors < rm->rm_firstdatacol); 2124 2125 /* 2126 * This simplifies one edge condition. 2127 */ 2128 tgts[-1] = -1; 2129 2130 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { 2131 /* 2132 * Initialize the targets array by finding the first n columns 2133 * that contain no error. 2134 * 2135 * If there were no data errors, we need to ensure that we're 2136 * always explicitly attempting to reconstruct at least one 2137 * data column. To do this, we simply push the highest target 2138 * up into the data columns. 2139 */ 2140 for (c = 0, i = 0; i < n; i++) { 2141 if (i == n - 1 && data_errors == 0 && 2142 c < rm->rm_firstdatacol) { 2143 c = rm->rm_firstdatacol; 2144 } 2145 2146 while (rm->rm_col[c].rc_error != 0) { 2147 c++; 2148 ASSERT3S(c, <, rm->rm_cols); 2149 } 2150 2151 tgts[i] = c++; 2152 } 2153 2154 /* 2155 * Setting tgts[n] simplifies the other edge condition. 2156 */ 2157 tgts[n] = rm->rm_cols; 2158 2159 /* 2160 * These buffers were allocated in previous iterations. 2161 */ 2162 for (i = 0; i < n - 1; i++) { 2163 ASSERT(orig[i] != NULL); 2164 } 2165 2166 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size); 2167 2168 current = 0; 2169 next = tgts[current]; 2170 2171 while (current != n) { 2172 tgts[current] = next; 2173 current = 0; 2174 2175 /* 2176 * Save off the original data that we're going to 2177 * attempt to reconstruct. 2178 */ 2179 for (i = 0; i < n; i++) { 2180 ASSERT(orig[i] != NULL); 2181 c = tgts[i]; 2182 ASSERT3S(c, >=, 0); 2183 ASSERT3S(c, <, rm->rm_cols); 2184 rc = &rm->rm_col[c]; 2185 abd_copy_to_buf(orig[i], rc->rc_abd, 2186 rc->rc_size); 2187 } 2188 2189 /* 2190 * Attempt a reconstruction and exit the outer loop on 2191 * success. 2192 */ 2193 code = vdev_raidz_reconstruct(rm, tgts, n); 2194 if (raidz_checksum_verify(zio) == 0) { 2195 atomic_inc_64(&raidz_corrected[code]); 2196 2197 for (i = 0; i < n; i++) { 2198 c = tgts[i]; 2199 rc = &rm->rm_col[c]; 2200 ASSERT(rc->rc_error == 0); 2201 if (rc->rc_tried) 2202 raidz_checksum_error(zio, rc, 2203 orig[i]); 2204 rc->rc_error = SET_ERROR(ECKSUM); 2205 } 2206 2207 ret = code; 2208 goto done; 2209 } 2210 2211 /* 2212 * Restore the original data. 2213 */ 2214 for (i = 0; i < n; i++) { 2215 c = tgts[i]; 2216 rc = &rm->rm_col[c]; 2217 abd_copy_from_buf(rc->rc_abd, orig[i], 2218 rc->rc_size); 2219 } 2220 2221 do { 2222 /* 2223 * Find the next valid column after the current 2224 * position. 2225 */ 2226 for (next = tgts[current] + 1; 2227 next < rm->rm_cols && 2228 rm->rm_col[next].rc_error != 0; next++) 2229 continue; 2230 2231 ASSERT(next <= tgts[current + 1]); 2232 2233 /* 2234 * If that spot is available, we're done here. 2235 */ 2236 if (next != tgts[current + 1]) 2237 break; 2238 2239 /* 2240 * Otherwise, find the next valid column after 2241 * the previous position.
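 * This is the "carry" case: tgts[current] has run into
 * tgts[current + 1], so reset it to the first valid column
 * past tgts[current - 1] and advance the next position
 * instead, as when incrementing a multi-digit counter.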
2242 */ 2243 for (c = tgts[current - 1] + 1; 2244 rm->rm_col[c].rc_error != 0; c++) 2245 continue; 2246 2247 tgts[current] = c; 2248 current++; 2249 2250 } while (current != n); 2251 } 2252 } 2253 n--; 2254 done: 2255 for (i = 0; i < n; i++) { 2256 zio_buf_free(orig[i], rm->rm_col[0].rc_size); 2257 } 2258 2259 return (ret); 2260 } 2261 2262 /* 2263 * Complete an IO operation on a RAIDZ VDev 2264 * 2265 * Outline: 2266 * - For write operations: 2267 * 1. Check for errors on the child IOs. 2268 * 2. Return, setting an error code if too few child VDevs were written 2269 * to reconstruct the data later. Note that partial writes are 2270 * considered successful if they can be reconstructed at all. 2271 * - For read operations: 2272 * 1. Check for errors on the child IOs. 2273 * 2. If data errors occurred: 2274 * a. Try to reassemble the data from the parity available. 2275 * b. If we haven't yet read the parity drives, read them now. 2276 * c. If all parity drives have been read but the data still doesn't 2277 * reassemble with a correct checksum, then try combinatorial 2278 * reconstruction. 2279 * d. If that doesn't work, return an error. 2280 * 3. If there were unexpected errors or this is a resilver operation, 2281 * rewrite the vdevs that had errors. 2282 */ 2283 static void 2284 vdev_raidz_io_done(zio_t *zio) 2285 { 2286 vdev_t *vd = zio->io_vd; 2287 vdev_t *cvd; 2288 raidz_map_t *rm = zio->io_vsd; 2289 raidz_col_t *rc; 2290 int unexpected_errors = 0; 2291 int parity_errors = 0; 2292 int parity_untried = 0; 2293 int data_errors = 0; 2294 int total_errors = 0; 2295 int n, c; 2296 int tgts[VDEV_RAIDZ_MAXPARITY]; 2297 int code; 2298 2299 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ 2300 2301 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); 2302 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); 2303 2304 for (c = 0; c < rm->rm_cols; c++) { 2305 rc = &rm->rm_col[c]; 2306 2307 if (rc->rc_error) { 2308 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 2309 2310 if (c < rm->rm_firstdatacol) 2311 parity_errors++; 2312 else 2313 data_errors++; 2314 2315 if (!rc->rc_skipped) 2316 unexpected_errors++; 2317 2318 total_errors++; 2319 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { 2320 parity_untried++; 2321 } 2322 } 2323 2324 if (zio->io_type == ZIO_TYPE_WRITE) { 2325 /* 2326 * XXX -- for now, treat partial writes as a success. 2327 * (If we couldn't write enough columns to reconstruct 2328 * the data, the I/O failed. Otherwise, good enough.) 2329 * 2330 * Now that we support write reallocation, it would be better 2331 * to treat partial failure as real failure unless there are 2332 * no non-degraded top-level vdevs left, and not update DTLs 2333 * if we intend to reallocate. 2334 */ 2335 /* XXPOLICY */ 2336 if (total_errors > rm->rm_firstdatacol) 2337 zio->io_error = vdev_raidz_worst_error(rm); 2338 2339 return; 2340 } 2341 2342 ASSERT(zio->io_type == ZIO_TYPE_READ); 2343 /* 2344 * There are three potential phases for a read: 2345 * 1. produce valid data from the columns read 2346 * 2. read all disks and try again 2347 * 3. perform combinatorial reconstruction 2348 * 2349 * Each phase is progressively both more expensive and less likely to 2350 * occur. If we encounter more errors than we can repair or all phases 2351 * fail, we have no choice but to return an error. 
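 *
 * Phase 1 is the block immediately below. Phase 2 is the re-read loop
 * further down, which issues the remaining child reads and returns;
 * this function runs again once they complete. Phase 3 is
 * vdev_raidz_combrec().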
2352 */ 2353 2354 /* 2355 * If the number of errors we saw was correctable -- less than or equal 2356 * to the number of parity disks read -- attempt to produce data that 2357 * has a valid checksum. Naturally, this case applies in the absence of 2358 * any errors. 2359 */ 2360 if (total_errors <= rm->rm_firstdatacol - parity_untried) { 2361 if (data_errors == 0) { 2362 if (raidz_checksum_verify(zio) == 0) { 2363 /* 2364 * If we read parity information (unnecessarily 2365 * as it happens since no reconstruction was 2366 * needed), regenerate and verify the parity. 2367 * We also regenerate parity when resilvering 2368 * so we can write it out to the failed device 2369 * later. 2370 */ 2371 if (parity_errors + parity_untried < 2372 rm->rm_firstdatacol || 2373 (zio->io_flags & ZIO_FLAG_RESILVER)) { 2374 n = raidz_parity_verify(zio, rm); 2375 unexpected_errors += n; 2376 ASSERT(parity_errors + n <= 2377 rm->rm_firstdatacol); 2378 } 2379 goto done; 2380 } 2381 } else { 2382 /* 2383 * We either attempt to read all the parity columns or 2384 * none of them. If we didn't try to read parity, we 2385 * wouldn't be here in the correctable case. There must 2386 * also have been fewer parity errors than parity 2387 * columns or, again, we wouldn't be in this code path. 2388 */ 2389 ASSERT(parity_untried == 0); 2390 ASSERT(parity_errors < rm->rm_firstdatacol); 2391 2392 /* 2393 * Identify the data columns that reported an error. 2394 */ 2395 n = 0; 2396 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 2397 rc = &rm->rm_col[c]; 2398 if (rc->rc_error != 0) { 2399 ASSERT(n < VDEV_RAIDZ_MAXPARITY); 2400 tgts[n++] = c; 2401 } 2402 } 2403 2404 ASSERT(rm->rm_firstdatacol >= n); 2405 2406 code = vdev_raidz_reconstruct(rm, tgts, n); 2407 2408 if (raidz_checksum_verify(zio) == 0) { 2409 atomic_inc_64(&raidz_corrected[code]); 2410 2411 /* 2412 * If we read more parity disks than were used 2413 * for reconstruction, confirm that the other 2414 * parity disks produced correct data. This 2415 * routine is suboptimal in that it regenerates 2416 * the parity that we already used in addition 2417 * to the parity that we're attempting to 2418 * verify, but this should be a relatively 2419 * uncommon case, and can be optimized if it 2420 * becomes a problem. Note that we regenerate 2421 * parity when resilvering so we can write it 2422 * out to failed devices later. 2423 */ 2424 if (parity_errors < rm->rm_firstdatacol - n || 2425 (zio->io_flags & ZIO_FLAG_RESILVER)) { 2426 n = raidz_parity_verify(zio, rm); 2427 unexpected_errors += n; 2428 ASSERT(parity_errors + n <= 2429 rm->rm_firstdatacol); 2430 } 2431 2432 goto done; 2433 } 2434 } 2435 } 2436 2437 /* 2438 * This isn't a typical situation -- either we got a read error or 2439 * a child silently returned bad data. Read every block so we can 2440 * try again with as much data and parity as we can track down. If 2441 * we've already been through once before, all children will be marked 2442 * as tried so we'll proceed to combinatorial reconstruction.
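 * Before the retry, rm_missingdata and rm_missingparity are reset and
 * any column not yet tried is re-read; columns that have already been
 * tried keep their recorded errors.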
2443 */ 2444 unexpected_errors = 1; 2445 rm->rm_missingdata = 0; 2446 rm->rm_missingparity = 0; 2447 2448 for (c = 0; c < rm->rm_cols; c++) { 2449 if (rm->rm_col[c].rc_tried) 2450 continue; 2451 2452 zio_vdev_io_redone(zio); 2453 do { 2454 rc = &rm->rm_col[c]; 2455 if (rc->rc_tried) 2456 continue; 2457 zio_nowait(zio_vdev_child_io(zio, NULL, 2458 vd->vdev_child[rc->rc_devidx], 2459 rc->rc_offset, rc->rc_abd, rc->rc_size, 2460 zio->io_type, zio->io_priority, 0, 2461 vdev_raidz_child_done, rc)); 2462 } while (++c < rm->rm_cols); 2463 2464 return; 2465 } 2466 2467 /* 2468 * At this point we've attempted to reconstruct the data given the 2469 * errors we detected, and we've attempted to read all columns. There 2470 * must, therefore, be one or more additional problems -- silent errors 2471 * resulting in invalid data rather than explicit I/O errors resulting 2472 * in absent data. We check if there is enough additional data to 2473 * possibly reconstruct the data and then perform combinatorial 2474 * reconstruction over all possible combinations. If that fails, 2475 * we're cooked. 2476 */ 2477 if (total_errors > rm->rm_firstdatacol) { 2478 zio->io_error = vdev_raidz_worst_error(rm); 2479 2480 } else if (total_errors < rm->rm_firstdatacol && 2481 (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) { 2482 /* 2483 * If we didn't use all the available parity for the 2484 * combinatorial reconstruction, verify that the remaining 2485 * parity is correct. 2486 */ 2487 if (code != (1 << rm->rm_firstdatacol) - 1) 2488 (void) raidz_parity_verify(zio, rm); 2489 } else { 2490 /* 2491 * We're here because either: 2492 * 2493 * total_errors == rm_firstdatacol, or 2494 * vdev_raidz_combrec() failed 2495 * 2496 * In either case, there is enough bad data to prevent 2497 * reconstruction. 2498 * 2499 * Start checksum ereports for all children that haven't 2500 * failed, provided the I/O wasn't speculative. 2501 */ 2502 zio->io_error = SET_ERROR(ECKSUM); 2503 2504 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2505 for (c = 0; c < rm->rm_cols; c++) { 2506 rc = &rm->rm_col[c]; 2507 if (rc->rc_error == 0) { 2508 zio_bad_cksum_t zbc; 2509 zbc.zbc_has_cksum = 0; 2510 zbc.zbc_injected = 2511 rm->rm_ecksuminjected; 2512 2513 zfs_ereport_start_checksum( 2514 zio->io_spa, 2515 vd->vdev_child[rc->rc_devidx], 2516 zio, rc->rc_offset, rc->rc_size, 2517 (void *)(uintptr_t)c, &zbc); 2518 } 2519 } 2520 } 2521 } 2522 2523 done: 2524 zio_checksum_verified(zio); 2525 2526 if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 2527 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { 2528 /* 2529 * Use the good data we have in hand to repair damaged children. 2530 */ 2531 for (c = 0; c < rm->rm_cols; c++) { 2532 rc = &rm->rm_col[c]; 2533 cvd = vd->vdev_child[rc->rc_devidx]; 2534 2535 if (rc->rc_error == 0) 2536 continue; 2537 2538 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2539 rc->rc_offset, rc->rc_abd, rc->rc_size, 2540 ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, 2541 ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); 2543 } 2544 } 2545 } 2546 2547 static void 2548 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 2549 { 2550 if (faulted > vd->vdev_nparity) 2551 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2552 VDEV_AUX_NO_REPLICAS); 2553 else if (degraded + faulted != 0) 2554 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 2555 else 2556 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 2557 } 2558 2559 vdev_ops_t vdev_raidz_ops = { 2560 vdev_raidz_open, 2561 vdev_raidz_close, 2562 vdev_raidz_asize, 2563 vdev_raidz_io_start, 2564 vdev_raidz_io_done, 2565 vdev_raidz_state_change, 2566 NULL, /* vdev_op_hold */ 2567 NULL, /* vdev_op_rele */ 2568 VDEV_TYPE_RAIDZ, /* name of this vdev type */ 2569 B_FALSE /* not a leaf vdev */ 2570 }; 2571