/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
 * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
#include <sys/vdev_raidz.h>
#include <sys/vdev_raidz_impl.h>
#include <sys/vdev_draid.h>

#ifdef ZFS_DEBUG
#include <sys/vdev.h>	/* For vdev_xlate() in vdev_raidz_io_verify() */
#endif

/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all
 * elements can be expressed with a single byte. Briefly, the operations on
 * the field are defined as follows:
 *
 * o addition (+) is represented by a bitwise XOR
 * o subtraction (-) is therefore identical to addition: A + B = A - B
 * o multiplication of A by 2 is defined by the following bitwise expression:
 *
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
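 *
 * To make the arithmetic concrete, a full multiply can be sketched with the
 * shift-and-reduce rule above (gf256_mul2() and gf256_mul() are purely
 * illustrative, hypothetical helpers; the driver itself uses the
 * vdev_raidz_pow2/vdev_raidz_log2 tables described below):
 *
 *	static uint8_t
 *	gf256_mul2(uint8_t a)
 *	{
 *		return ((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
 *	}
 *
 *	static uint8_t
 *	gf256_mul(uint8_t a, uint8_t b)
 *	{
 *		uint8_t p = 0;
 *
 *		while (b != 0) {
 *			if (b & 1)
 *				p ^= a;
 *			a = gf256_mul2(a);
 *			b >>= 1;
 *		}
 *		return (p);
 *	}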
85 * 86 * Observe that any number in the field (except for 0) can be expressed as a 87 * power of 2 -- a generator for the field. We store a table of the powers of 88 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can 89 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather 90 * than field addition). The inverse of a field element A (A^-1) is therefore 91 * A ^ (255 - 1) = A^254. 92 * 93 * The up-to-three parity columns, P, Q, R over several data columns, 94 * D_0, ... D_n-1, can be expressed by field operations: 95 * 96 * P = D_0 + D_1 + ... + D_n-2 + D_n-1 97 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 98 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 99 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 100 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 101 * 102 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial 103 * XOR operation, and 2 and 4 can be computed quickly and generate linearly- 104 * independent coefficients. (There are no additional coefficients that have 105 * this property which is why the uncorrected Plank method breaks down.) 106 * 107 * See the reconstruction code below for how P, Q and R can used individually 108 * or in concert to recover missing data columns. 109 */ 110 111 #define VDEV_RAIDZ_P 0 112 #define VDEV_RAIDZ_Q 1 113 #define VDEV_RAIDZ_R 2 114 115 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) 116 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) 117 118 /* 119 * We provide a mechanism to perform the field multiplication operation on a 120 * 64-bit value all at once rather than a byte at a time. This works by 121 * creating a mask from the top bit in each byte and using that to 122 * conditionally apply the XOR of 0x1d. 123 */ 124 #define VDEV_RAIDZ_64MUL_2(x, mask) \ 125 { \ 126 (mask) = (x) & 0x8080808080808080ULL; \ 127 (mask) = ((mask) << 1) - ((mask) >> 7); \ 128 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ 129 ((mask) & 0x1d1d1d1d1d1d1d1dULL); \ 130 } 131 132 #define VDEV_RAIDZ_64MUL_4(x, mask) \ 133 { \ 134 VDEV_RAIDZ_64MUL_2((x), mask); \ 135 VDEV_RAIDZ_64MUL_2((x), mask); \ 136 } 137 138 static void 139 vdev_raidz_row_free(raidz_row_t *rr) 140 { 141 for (int c = 0; c < rr->rr_cols; c++) { 142 raidz_col_t *rc = &rr->rr_col[c]; 143 144 if (rc->rc_size != 0) 145 abd_free(rc->rc_abd); 146 if (rc->rc_orig_data != NULL) 147 abd_free(rc->rc_orig_data); 148 } 149 150 if (rr->rr_abd_empty != NULL) 151 abd_free(rr->rr_abd_empty); 152 153 kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols])); 154 } 155 156 void 157 vdev_raidz_map_free(raidz_map_t *rm) 158 { 159 for (int i = 0; i < rm->rm_nrows; i++) 160 vdev_raidz_row_free(rm->rm_row[i]); 161 162 kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows])); 163 } 164 165 static void 166 vdev_raidz_map_free_vsd(zio_t *zio) 167 { 168 raidz_map_t *rm = zio->io_vsd; 169 170 vdev_raidz_map_free(rm); 171 } 172 173 const zio_vsd_ops_t vdev_raidz_vsd_ops = { 174 .vsd_free = vdev_raidz_map_free_vsd, 175 }; 176 177 static void 178 vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift) 179 { 180 int c; 181 int nwrapped = 0; 182 uint64_t off = 0; 183 raidz_row_t *rr = rm->rm_row[0]; 184 185 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); 186 ASSERT3U(rm->rm_nrows, ==, 1); 187 188 /* 189 * Pad any parity columns with additional space to account for skip 190 * sectors. 
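 *
 * As a worked example (the numbers follow directly from the geometry
 * computed in vdev_raidz_map_alloc()): a 3-sector write to a 5-wide raidz2
 * gives q = 3 / (5 - 2) = 1, r = 0, bc = 0, tot = 3 + 2 * 1 = 5.  Rounding
 * tot up to a multiple of nparity + 1 = 3 leaves rm_nskip = 1 and
 * rm_skipstart = bc = 0.  Since rm_skipstart < rr_firstdatacol, that skip
 * sector wraps onto a parity column, so nwrapped = 1 and the first parity
 * column below is allocated one extra zeroed sector.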
191 */ 192 if (rm->rm_skipstart < rr->rr_firstdatacol) { 193 ASSERT0(rm->rm_skipstart); 194 nwrapped = rm->rm_nskip; 195 } else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) { 196 nwrapped = 197 (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols; 198 } 199 200 /* 201 * Optional single skip sectors (rc_size == 0) will be handled in 202 * vdev_raidz_io_start_write(). 203 */ 204 int skipped = rr->rr_scols - rr->rr_cols; 205 206 /* Allocate buffers for the parity columns */ 207 for (c = 0; c < rr->rr_firstdatacol; c++) { 208 raidz_col_t *rc = &rr->rr_col[c]; 209 210 /* 211 * Parity columns will pad out a linear ABD to account for 212 * the skip sector. A linear ABD is used here because 213 * parity calculations use the ABD buffer directly to calculate 214 * parity. This avoids doing a memcpy back to the ABD after the 215 * parity has been calculated. By issuing the parity column 216 * with the skip sector we can reduce contention on the child 217 * VDEV queue locks (vq_lock). 218 */ 219 if (c < nwrapped) { 220 rc->rc_abd = abd_alloc_linear( 221 rc->rc_size + (1ULL << ashift), B_FALSE); 222 abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift); 223 skipped++; 224 } else { 225 rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); 226 } 227 } 228 229 for (off = 0; c < rr->rr_cols; c++) { 230 raidz_col_t *rc = &rr->rr_col[c]; 231 abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct, 232 zio->io_abd, off, rc->rc_size); 233 234 /* 235 * Generate I/O for skip sectors to improve aggregation 236 * continuity. We will use gang ABD's to reduce contention 237 * on the child VDEV queue locks (vq_lock) by issuing 238 * a single I/O that contains the data and skip sector. 239 * 240 * It is important to make sure that rc_size is not updated 241 * even though we are adding a skip sector to the ABD. When 242 * calculating the parity in vdev_raidz_generate_parity_row() 243 * the rc_size is used to iterate through the ABD's. We can 244 * not have zero'd out skip sectors used for calculating 245 * parity for raidz, because those same sectors are not used 246 * during reconstruction. 247 */ 248 if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) { 249 rc->rc_abd = abd_alloc_gang(); 250 abd_gang_add(rc->rc_abd, abd, B_TRUE); 251 abd_gang_add(rc->rc_abd, 252 abd_get_zeros(1ULL << ashift), B_TRUE); 253 skipped++; 254 } else { 255 rc->rc_abd = abd; 256 } 257 off += rc->rc_size; 258 } 259 260 ASSERT3U(off, ==, zio->io_size); 261 ASSERT3S(skipped, ==, rm->rm_nskip); 262 } 263 264 static void 265 vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm) 266 { 267 int c; 268 raidz_row_t *rr = rm->rm_row[0]; 269 270 ASSERT3U(rm->rm_nrows, ==, 1); 271 272 /* Allocate buffers for the parity columns */ 273 for (c = 0; c < rr->rr_firstdatacol; c++) 274 rr->rr_col[c].rc_abd = 275 abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE); 276 277 for (uint64_t off = 0; c < rr->rr_cols; c++) { 278 raidz_col_t *rc = &rr->rr_col[c]; 279 rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct, 280 zio->io_abd, off, rc->rc_size); 281 off += rc->rc_size; 282 } 283 } 284 285 /* 286 * Divides the IO evenly across all child vdevs; usually, dcols is 287 * the number of children in the target vdev. 288 * 289 * Avoid inlining the function to keep vdev_raidz_io_start(), which 290 * is this functions only caller, as small as possible on the stack. 
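 *
 * As a concrete illustration of the geometry computed in the function below
 * (the values follow from its q/r/bc/tot expressions): with 4K sectors
 * (ashift = 12), a 16K write to a 5-wide raidz1 has s = 4 data sectors, so
 * q = 4 / (5 - 1) = 1, r = 0, bc = 0, and every column carries exactly one
 * sector (tot = 5: four data plus one parity).  A 12K write instead gives
 * s = 3, q = 0, r = 3, bc = 4, tot = 4; only four of the five children are
 * touched (acols = scols = 4) and the fifth is skipped entirely.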
291 */ 292 noinline raidz_map_t * 293 vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, 294 uint64_t nparity) 295 { 296 raidz_row_t *rr; 297 /* The starting RAIDZ (parent) vdev sector of the block. */ 298 uint64_t b = zio->io_offset >> ashift; 299 /* The zio's size in units of the vdev's minimum sector size. */ 300 uint64_t s = zio->io_size >> ashift; 301 /* The first column for this stripe. */ 302 uint64_t f = b % dcols; 303 /* The starting byte offset on each child vdev. */ 304 uint64_t o = (b / dcols) << ashift; 305 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; 306 307 raidz_map_t *rm = 308 kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP); 309 rm->rm_nrows = 1; 310 311 /* 312 * "Quotient": The number of data sectors for this stripe on all but 313 * the "big column" child vdevs that also contain "remainder" data. 314 */ 315 q = s / (dcols - nparity); 316 317 /* 318 * "Remainder": The number of partial stripe data sectors in this I/O. 319 * This will add a sector to some, but not all, child vdevs. 320 */ 321 r = s - q * (dcols - nparity); 322 323 /* The number of "big columns" - those which contain remainder data. */ 324 bc = (r == 0 ? 0 : r + nparity); 325 326 /* 327 * The total number of data and parity sectors associated with 328 * this I/O. 329 */ 330 tot = s + nparity * (q + (r == 0 ? 0 : 1)); 331 332 /* 333 * acols: The columns that will be accessed. 334 * scols: The columns that will be accessed or skipped. 335 */ 336 if (q == 0) { 337 /* Our I/O request doesn't span all child vdevs. */ 338 acols = bc; 339 scols = MIN(dcols, roundup(bc, nparity + 1)); 340 } else { 341 acols = dcols; 342 scols = dcols; 343 } 344 345 ASSERT3U(acols, <=, scols); 346 347 rr = kmem_alloc(offsetof(raidz_row_t, rr_col[scols]), KM_SLEEP); 348 rm->rm_row[0] = rr; 349 350 rr->rr_cols = acols; 351 rr->rr_scols = scols; 352 rr->rr_bigcols = bc; 353 rr->rr_missingdata = 0; 354 rr->rr_missingparity = 0; 355 rr->rr_firstdatacol = nparity; 356 rr->rr_abd_empty = NULL; 357 rr->rr_nempty = 0; 358 #ifdef ZFS_DEBUG 359 rr->rr_offset = zio->io_offset; 360 rr->rr_size = zio->io_size; 361 #endif 362 363 asize = 0; 364 365 for (c = 0; c < scols; c++) { 366 raidz_col_t *rc = &rr->rr_col[c]; 367 col = f + c; 368 coff = o; 369 if (col >= dcols) { 370 col -= dcols; 371 coff += 1ULL << ashift; 372 } 373 rc->rc_devidx = col; 374 rc->rc_offset = coff; 375 rc->rc_abd = NULL; 376 rc->rc_orig_data = NULL; 377 rc->rc_error = 0; 378 rc->rc_tried = 0; 379 rc->rc_skipped = 0; 380 rc->rc_force_repair = 0; 381 rc->rc_allow_repair = 1; 382 rc->rc_need_orig_restore = B_FALSE; 383 384 if (c >= acols) 385 rc->rc_size = 0; 386 else if (c < bc) 387 rc->rc_size = (q + 1) << ashift; 388 else 389 rc->rc_size = q << ashift; 390 391 asize += rc->rc_size; 392 } 393 394 ASSERT3U(asize, ==, tot << ashift); 395 rm->rm_nskip = roundup(tot, nparity + 1) - tot; 396 rm->rm_skipstart = bc; 397 398 /* 399 * If all data stored spans all columns, there's a danger that parity 400 * will always be on the same device and, since parity isn't read 401 * during normal operation, that device's I/O bandwidth won't be 402 * used effectively. We therefore switch the parity every 1MB. 403 * 404 * ... at least that was, ostensibly, the theory. As a practical 405 * matter unless we juggle the parity between all devices evenly, we 406 * won't see any benefit. Further, occasional writes that aren't a 407 * multiple of the LCM of the number of children and the minimum 408 * stripe width are sufficient to avoid pessimal behavior. 
409 * Unfortunately, this decision created an implicit on-disk format 410 * requirement that we need to support for all eternity, but only 411 * for single-parity RAID-Z. 412 * 413 * If we intend to skip a sector in the zeroth column for padding 414 * we must make sure to note this swap. We will never intend to 415 * skip the first column since at least one data and one parity 416 * column must appear in each row. 417 */ 418 ASSERT(rr->rr_cols >= 2); 419 ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); 420 421 if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { 422 devidx = rr->rr_col[0].rc_devidx; 423 o = rr->rr_col[0].rc_offset; 424 rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; 425 rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; 426 rr->rr_col[1].rc_devidx = devidx; 427 rr->rr_col[1].rc_offset = o; 428 429 if (rm->rm_skipstart == 0) 430 rm->rm_skipstart = 1; 431 } 432 433 if (zio->io_type == ZIO_TYPE_WRITE) { 434 vdev_raidz_map_alloc_write(zio, rm, ashift); 435 } else { 436 vdev_raidz_map_alloc_read(zio, rm); 437 } 438 439 /* init RAIDZ parity ops */ 440 rm->rm_ops = vdev_raidz_math_get_ops(); 441 442 return (rm); 443 } 444 445 struct pqr_struct { 446 uint64_t *p; 447 uint64_t *q; 448 uint64_t *r; 449 }; 450 451 static int 452 vdev_raidz_p_func(void *buf, size_t size, void *private) 453 { 454 struct pqr_struct *pqr = private; 455 const uint64_t *src = buf; 456 int i, cnt = size / sizeof (src[0]); 457 458 ASSERT(pqr->p && !pqr->q && !pqr->r); 459 460 for (i = 0; i < cnt; i++, src++, pqr->p++) 461 *pqr->p ^= *src; 462 463 return (0); 464 } 465 466 static int 467 vdev_raidz_pq_func(void *buf, size_t size, void *private) 468 { 469 struct pqr_struct *pqr = private; 470 const uint64_t *src = buf; 471 uint64_t mask; 472 int i, cnt = size / sizeof (src[0]); 473 474 ASSERT(pqr->p && pqr->q && !pqr->r); 475 476 for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { 477 *pqr->p ^= *src; 478 VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 479 *pqr->q ^= *src; 480 } 481 482 return (0); 483 } 484 485 static int 486 vdev_raidz_pqr_func(void *buf, size_t size, void *private) 487 { 488 struct pqr_struct *pqr = private; 489 const uint64_t *src = buf; 490 uint64_t mask; 491 int i, cnt = size / sizeof (src[0]); 492 493 ASSERT(pqr->p && pqr->q && pqr->r); 494 495 for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { 496 *pqr->p ^= *src; 497 VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 498 *pqr->q ^= *src; 499 VDEV_RAIDZ_64MUL_4(*pqr->r, mask); 500 *pqr->r ^= *src; 501 } 502 503 return (0); 504 } 505 506 static void 507 vdev_raidz_generate_parity_p(raidz_row_t *rr) 508 { 509 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 510 511 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 512 abd_t *src = rr->rr_col[c].rc_abd; 513 514 if (c == rr->rr_firstdatacol) { 515 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 516 } else { 517 struct pqr_struct pqr = { p, NULL, NULL }; 518 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 519 vdev_raidz_p_func, &pqr); 520 } 521 } 522 } 523 524 static void 525 vdev_raidz_generate_parity_pq(raidz_row_t *rr) 526 { 527 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 528 uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 529 uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 530 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 531 rr->rr_col[VDEV_RAIDZ_Q].rc_size); 532 533 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 534 abd_t *src = rr->rr_col[c].rc_abd; 535 536 uint64_t ccnt = rr->rr_col[c].rc_size / 
sizeof (p[0]); 537 538 if (c == rr->rr_firstdatacol) { 539 ASSERT(ccnt == pcnt || ccnt == 0); 540 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 541 (void) memcpy(q, p, rr->rr_col[c].rc_size); 542 543 for (uint64_t i = ccnt; i < pcnt; i++) { 544 p[i] = 0; 545 q[i] = 0; 546 } 547 } else { 548 struct pqr_struct pqr = { p, q, NULL }; 549 550 ASSERT(ccnt <= pcnt); 551 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 552 vdev_raidz_pq_func, &pqr); 553 554 /* 555 * Treat short columns as though they are full of 0s. 556 * Note that there's therefore nothing needed for P. 557 */ 558 uint64_t mask; 559 for (uint64_t i = ccnt; i < pcnt; i++) { 560 VDEV_RAIDZ_64MUL_2(q[i], mask); 561 } 562 } 563 } 564 } 565 566 static void 567 vdev_raidz_generate_parity_pqr(raidz_row_t *rr) 568 { 569 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 570 uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 571 uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd); 572 uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 573 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 574 rr->rr_col[VDEV_RAIDZ_Q].rc_size); 575 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 576 rr->rr_col[VDEV_RAIDZ_R].rc_size); 577 578 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 579 abd_t *src = rr->rr_col[c].rc_abd; 580 581 uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); 582 583 if (c == rr->rr_firstdatacol) { 584 ASSERT(ccnt == pcnt || ccnt == 0); 585 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 586 (void) memcpy(q, p, rr->rr_col[c].rc_size); 587 (void) memcpy(r, p, rr->rr_col[c].rc_size); 588 589 for (uint64_t i = ccnt; i < pcnt; i++) { 590 p[i] = 0; 591 q[i] = 0; 592 r[i] = 0; 593 } 594 } else { 595 struct pqr_struct pqr = { p, q, r }; 596 597 ASSERT(ccnt <= pcnt); 598 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 599 vdev_raidz_pqr_func, &pqr); 600 601 /* 602 * Treat short columns as though they are full of 0s. 603 * Note that there's therefore nothing needed for P. 604 */ 605 uint64_t mask; 606 for (uint64_t i = ccnt; i < pcnt; i++) { 607 VDEV_RAIDZ_64MUL_2(q[i], mask); 608 VDEV_RAIDZ_64MUL_4(r[i], mask); 609 } 610 } 611 } 612 } 613 614 /* 615 * Generate RAID parity in the first virtual columns according to the number of 616 * parity columns available. 
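 *
 * Per byte position, the three generators implement the Horner-style
 * recurrences from the block comment at the top of this file.  A purely
 * illustrative byte-at-a-time sketch (gf256_mul2()/gf256_mul4() are the
 * hypothetical helpers sketched earlier; the real code works 64 bits at a
 * time via VDEV_RAIDZ_64MUL_2/4 or a vectorized math implementation):
 *
 *	uint8_t p = 0, q = 0, r = 0;
 *
 *	for (c = rr_firstdatacol; c < rr_cols; c++) {
 *		uint8_t d = D_c[i];		(byte i of column c, 0 if short)
 *		p ^= d;
 *		q = gf256_mul2(q) ^ d;
 *		r = gf256_mul4(r) ^ d;		(mul4 is just two doublings)
 *	}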
617 */ 618 void 619 vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr) 620 { 621 ASSERT3U(rr->rr_cols, !=, 0); 622 623 /* Generate using the new math implementation */ 624 if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL) 625 return; 626 627 switch (rr->rr_firstdatacol) { 628 case 1: 629 vdev_raidz_generate_parity_p(rr); 630 break; 631 case 2: 632 vdev_raidz_generate_parity_pq(rr); 633 break; 634 case 3: 635 vdev_raidz_generate_parity_pqr(rr); 636 break; 637 default: 638 cmn_err(CE_PANIC, "invalid RAID-Z configuration"); 639 } 640 } 641 642 void 643 vdev_raidz_generate_parity(raidz_map_t *rm) 644 { 645 for (int i = 0; i < rm->rm_nrows; i++) { 646 raidz_row_t *rr = rm->rm_row[i]; 647 vdev_raidz_generate_parity_row(rm, rr); 648 } 649 } 650 651 static int 652 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) 653 { 654 (void) private; 655 uint64_t *dst = dbuf; 656 uint64_t *src = sbuf; 657 int cnt = size / sizeof (src[0]); 658 659 for (int i = 0; i < cnt; i++) { 660 dst[i] ^= src[i]; 661 } 662 663 return (0); 664 } 665 666 static int 667 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size, 668 void *private) 669 { 670 (void) private; 671 uint64_t *dst = dbuf; 672 uint64_t *src = sbuf; 673 uint64_t mask; 674 int cnt = size / sizeof (dst[0]); 675 676 for (int i = 0; i < cnt; i++, dst++, src++) { 677 VDEV_RAIDZ_64MUL_2(*dst, mask); 678 *dst ^= *src; 679 } 680 681 return (0); 682 } 683 684 static int 685 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private) 686 { 687 (void) private; 688 uint64_t *dst = buf; 689 uint64_t mask; 690 int cnt = size / sizeof (dst[0]); 691 692 for (int i = 0; i < cnt; i++, dst++) { 693 /* same operation as vdev_raidz_reconst_q_pre_func() on dst */ 694 VDEV_RAIDZ_64MUL_2(*dst, mask); 695 } 696 697 return (0); 698 } 699 700 struct reconst_q_struct { 701 uint64_t *q; 702 int exp; 703 }; 704 705 static int 706 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private) 707 { 708 struct reconst_q_struct *rq = private; 709 uint64_t *dst = buf; 710 int cnt = size / sizeof (dst[0]); 711 712 for (int i = 0; i < cnt; i++, dst++, rq->q++) { 713 int j; 714 uint8_t *b; 715 716 *dst ^= *rq->q; 717 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 718 *b = vdev_raidz_exp2(*b, rq->exp); 719 } 720 } 721 722 return (0); 723 } 724 725 struct reconst_pq_struct { 726 uint8_t *p; 727 uint8_t *q; 728 uint8_t *pxy; 729 uint8_t *qxy; 730 int aexp; 731 int bexp; 732 }; 733 734 static int 735 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private) 736 { 737 struct reconst_pq_struct *rpq = private; 738 uint8_t *xd = xbuf; 739 uint8_t *yd = ybuf; 740 741 for (int i = 0; i < size; 742 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) { 743 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 744 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 745 *yd = *rpq->p ^ *rpq->pxy ^ *xd; 746 } 747 748 return (0); 749 } 750 751 static int 752 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) 753 { 754 struct reconst_pq_struct *rpq = private; 755 uint8_t *xd = xbuf; 756 757 for (int i = 0; i < size; 758 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) { 759 /* same operation as vdev_raidz_reconst_pq_func() on xd */ 760 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 761 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 762 } 763 764 return (0); 765 } 766 767 static void 768 vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts) 769 { 
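	/*
	 * Reconstruction from P alone is plain XOR: since
	 * P = D_0 + D_1 + ... + D_n-1 (where '+' is XOR), the single missing
	 * column is simply
	 *
	 *	D_x = P + (sum of all surviving data columns)
	 *
	 * which is what the copy of P followed by the abd_iterate_func2()
	 * XOR passes below computes.
	 */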
770 int x = tgts[0]; 771 abd_t *dst, *src; 772 773 ASSERT3U(ntgts, ==, 1); 774 ASSERT3U(x, >=, rr->rr_firstdatacol); 775 ASSERT3U(x, <, rr->rr_cols); 776 777 ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size); 778 779 src = rr->rr_col[VDEV_RAIDZ_P].rc_abd; 780 dst = rr->rr_col[x].rc_abd; 781 782 abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size); 783 784 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 785 uint64_t size = MIN(rr->rr_col[x].rc_size, 786 rr->rr_col[c].rc_size); 787 788 src = rr->rr_col[c].rc_abd; 789 790 if (c == x) 791 continue; 792 793 (void) abd_iterate_func2(dst, src, 0, 0, size, 794 vdev_raidz_reconst_p_func, NULL); 795 } 796 } 797 798 static void 799 vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts) 800 { 801 int x = tgts[0]; 802 int c, exp; 803 abd_t *dst, *src; 804 805 ASSERT(ntgts == 1); 806 807 ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size); 808 809 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 810 uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size, 811 rr->rr_col[c].rc_size); 812 813 src = rr->rr_col[c].rc_abd; 814 dst = rr->rr_col[x].rc_abd; 815 816 if (c == rr->rr_firstdatacol) { 817 abd_copy(dst, src, size); 818 if (rr->rr_col[x].rc_size > size) { 819 abd_zero_off(dst, size, 820 rr->rr_col[x].rc_size - size); 821 } 822 } else { 823 ASSERT3U(size, <=, rr->rr_col[x].rc_size); 824 (void) abd_iterate_func2(dst, src, 0, 0, size, 825 vdev_raidz_reconst_q_pre_func, NULL); 826 (void) abd_iterate_func(dst, 827 size, rr->rr_col[x].rc_size - size, 828 vdev_raidz_reconst_q_pre_tail_func, NULL); 829 } 830 } 831 832 src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; 833 dst = rr->rr_col[x].rc_abd; 834 exp = 255 - (rr->rr_cols - 1 - x); 835 836 struct reconst_q_struct rq = { abd_to_buf(src), exp }; 837 (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size, 838 vdev_raidz_reconst_q_post_func, &rq); 839 } 840 841 static void 842 vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) 843 { 844 uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; 845 abd_t *pdata, *qdata; 846 uint64_t xsize, ysize; 847 int x = tgts[0]; 848 int y = tgts[1]; 849 abd_t *xd, *yd; 850 851 ASSERT(ntgts == 2); 852 ASSERT(x < y); 853 ASSERT(x >= rr->rr_firstdatacol); 854 ASSERT(y < rr->rr_cols); 855 856 ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size); 857 858 /* 859 * Move the parity data aside -- we're going to compute parity as 860 * though columns x and y were full of zeros -- Pxy and Qxy. We want to 861 * reuse the parity generation mechanism without trashing the actual 862 * parity so we make those columns appear to be full of zeros by 863 * setting their lengths to zero. 
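 * Because vdev_raidz_generate_parity_pq() treats short columns as if they
 * were zero-filled, the P and Q regenerated with rc_size forced to 0 for
 * columns x and y are exactly the Pxy and Qxy terms used in the algebra
 * below.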
864 */ 865 pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd; 866 qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; 867 xsize = rr->rr_col[x].rc_size; 868 ysize = rr->rr_col[y].rc_size; 869 870 rr->rr_col[VDEV_RAIDZ_P].rc_abd = 871 abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE); 872 rr->rr_col[VDEV_RAIDZ_Q].rc_abd = 873 abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); 874 rr->rr_col[x].rc_size = 0; 875 rr->rr_col[y].rc_size = 0; 876 877 vdev_raidz_generate_parity_pq(rr); 878 879 rr->rr_col[x].rc_size = xsize; 880 rr->rr_col[y].rc_size = ysize; 881 882 p = abd_to_buf(pdata); 883 q = abd_to_buf(qdata); 884 pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 885 qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 886 xd = rr->rr_col[x].rc_abd; 887 yd = rr->rr_col[y].rc_abd; 888 889 /* 890 * We now have: 891 * Pxy = P + D_x + D_y 892 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 893 * 894 * We can then solve for D_x: 895 * D_x = A * (P + Pxy) + B * (Q + Qxy) 896 * where 897 * A = 2^(x - y) * (2^(x - y) + 1)^-1 898 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 899 * 900 * With D_x in hand, we can easily solve for D_y: 901 * D_y = P + Pxy + D_x 902 */ 903 904 a = vdev_raidz_pow2[255 + x - y]; 905 b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)]; 906 tmp = 255 - vdev_raidz_log2[a ^ 1]; 907 908 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 909 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 910 911 ASSERT3U(xsize, >=, ysize); 912 struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp }; 913 914 (void) abd_iterate_func2(xd, yd, 0, 0, ysize, 915 vdev_raidz_reconst_pq_func, &rpq); 916 (void) abd_iterate_func(xd, ysize, xsize - ysize, 917 vdev_raidz_reconst_pq_tail_func, &rpq); 918 919 abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 920 abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 921 922 /* 923 * Restore the saved parity data. 924 */ 925 rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata; 926 rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata; 927 } 928 929 /* 930 * In the general case of reconstruction, we must solve the system of linear 931 * equations defined by the coefficients used to generate parity as well as 932 * the contents of the data and parity disks. This can be expressed with 933 * vectors for the original data (D) and the actual data (d) and parity (p) 934 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 935 * 936 * __ __ __ __ 937 * | | __ __ | p_0 | 938 * | V | | D_0 | | p_m-1 | 939 * | | x | : | = | d_0 | 940 * | I | | D_n-1 | | : | 941 * | | ~~ ~~ | d_n-1 | 942 * ~~ ~~ ~~ ~~ 943 * 944 * I is simply a square identity matrix of size n, and V is a vandermonde 945 * matrix defined by the coefficients we chose for the various parity columns 946 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 947 * computation as well as linear separability. 948 * 949 * __ __ __ __ 950 * | 1 .. 1 1 1 | | p_0 | 951 * | 2^n-1 .. 4 2 1 | __ __ | : | 952 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 953 * | 1 .. 0 0 0 | | D_1 | | d_0 | 954 * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 955 * | : : : : | | : | | d_2 | 956 * | 0 .. 1 0 0 | | D_n-1 | | : | 957 * | 0 .. 0 1 0 | ~~ ~~ | : | 958 * | 0 .. 0 0 1 | | d_n-1 | 959 * ~~ ~~ ~~ ~~ 960 * 961 * Note that I, V, d, and p are known. To compute D, we must invert the 962 * matrix and use the known data and parity values to reconstruct the unknown 963 * data values. 
We begin by removing the rows in V|I and d|p that correspond 964 * to failed or missing columns; we then make V|I square (n x n) and d|p 965 * sized n by removing rows corresponding to unused parity from the bottom up 966 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 967 * using Gauss-Jordan elimination. In the example below we use m=3 parity 968 * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 969 * __ __ 970 * | 1 1 1 1 1 1 1 1 | 971 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 972 * | 19 205 116 29 64 16 4 1 | / / 973 * | 1 0 0 0 0 0 0 0 | / / 974 * | 0 1 0 0 0 0 0 0 | <--' / 975 * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 976 * | 0 0 0 1 0 0 0 0 | 977 * | 0 0 0 0 1 0 0 0 | 978 * | 0 0 0 0 0 1 0 0 | 979 * | 0 0 0 0 0 0 1 0 | 980 * | 0 0 0 0 0 0 0 1 | 981 * ~~ ~~ 982 * __ __ 983 * | 1 1 1 1 1 1 1 1 | 984 * | 128 64 32 16 8 4 2 1 | 985 * | 19 205 116 29 64 16 4 1 | 986 * | 1 0 0 0 0 0 0 0 | 987 * | 0 1 0 0 0 0 0 0 | 988 * (V|I)' = | 0 0 1 0 0 0 0 0 | 989 * | 0 0 0 1 0 0 0 0 | 990 * | 0 0 0 0 1 0 0 0 | 991 * | 0 0 0 0 0 1 0 0 | 992 * | 0 0 0 0 0 0 1 0 | 993 * | 0 0 0 0 0 0 0 1 | 994 * ~~ ~~ 995 * 996 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 997 * have carefully chosen the seed values 1, 2, and 4 to ensure that this 998 * matrix is not singular. 999 * __ __ 1000 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1001 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1002 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1003 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1004 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1005 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1006 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1007 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1008 * ~~ ~~ 1009 * __ __ 1010 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1011 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1012 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1013 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1014 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1015 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1016 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1017 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1018 * ~~ ~~ 1019 * __ __ 1020 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1021 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1022 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 1023 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1024 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1025 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1026 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1027 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1028 * ~~ ~~ 1029 * __ __ 1030 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1031 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1032 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 1033 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1034 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1035 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1036 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1037 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1038 * ~~ ~~ 1039 * __ __ 1040 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1041 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1042 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1043 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1044 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1045 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1046 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1047 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1048 * ~~ ~~ 1049 * __ __ 1050 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1051 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 1052 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1053 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1054 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1055 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1056 * | 0 
0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1057 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1058 * ~~ ~~ 1059 * __ __ 1060 * | 0 0 1 0 0 0 0 0 | 1061 * | 167 100 5 41 159 169 217 208 | 1062 * | 166 100 4 40 158 168 216 209 | 1063 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 1064 * | 0 0 0 0 1 0 0 0 | 1065 * | 0 0 0 0 0 1 0 0 | 1066 * | 0 0 0 0 0 0 1 0 | 1067 * | 0 0 0 0 0 0 0 1 | 1068 * ~~ ~~ 1069 * 1070 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 1071 * of the missing data. 1072 * 1073 * As is apparent from the example above, the only non-trivial rows in the 1074 * inverse matrix correspond to the data disks that we're trying to 1075 * reconstruct. Indeed, those are the only rows we need as the others would 1076 * only be useful for reconstructing data known or assumed to be valid. For 1077 * that reason, we only build the coefficients in the rows that correspond to 1078 * targeted columns. 1079 */ 1080 1081 static void 1082 vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map, 1083 uint8_t **rows) 1084 { 1085 int i, j; 1086 int pow; 1087 1088 ASSERT(n == rr->rr_cols - rr->rr_firstdatacol); 1089 1090 /* 1091 * Fill in the missing rows of interest. 1092 */ 1093 for (i = 0; i < nmap; i++) { 1094 ASSERT3S(0, <=, map[i]); 1095 ASSERT3S(map[i], <=, 2); 1096 1097 pow = map[i] * n; 1098 if (pow > 255) 1099 pow -= 255; 1100 ASSERT(pow <= 255); 1101 1102 for (j = 0; j < n; j++) { 1103 pow -= map[i]; 1104 if (pow < 0) 1105 pow += 255; 1106 rows[i][j] = vdev_raidz_pow2[pow]; 1107 } 1108 } 1109 } 1110 1111 static void 1112 vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing, 1113 uint8_t **rows, uint8_t **invrows, const uint8_t *used) 1114 { 1115 int i, j, ii, jj; 1116 uint8_t log; 1117 1118 /* 1119 * Assert that the first nmissing entries from the array of used 1120 * columns correspond to parity columns and that subsequent entries 1121 * correspond to data columns. 1122 */ 1123 for (i = 0; i < nmissing; i++) { 1124 ASSERT3S(used[i], <, rr->rr_firstdatacol); 1125 } 1126 for (; i < n; i++) { 1127 ASSERT3S(used[i], >=, rr->rr_firstdatacol); 1128 } 1129 1130 /* 1131 * First initialize the storage where we'll compute the inverse rows. 1132 */ 1133 for (i = 0; i < nmissing; i++) { 1134 for (j = 0; j < n; j++) { 1135 invrows[i][j] = (i == j) ? 1 : 0; 1136 } 1137 } 1138 1139 /* 1140 * Subtract all trivial rows from the rows of consequence. 1141 */ 1142 for (i = 0; i < nmissing; i++) { 1143 for (j = nmissing; j < n; j++) { 1144 ASSERT3U(used[j], >=, rr->rr_firstdatacol); 1145 jj = used[j] - rr->rr_firstdatacol; 1146 ASSERT3S(jj, <, n); 1147 invrows[i][j] = rows[i][jj]; 1148 rows[i][jj] = 0; 1149 } 1150 } 1151 1152 /* 1153 * For each of the rows of interest, we must normalize it and subtract 1154 * a multiple of it from the other rows. 1155 */ 1156 for (i = 0; i < nmissing; i++) { 1157 for (j = 0; j < missing[i]; j++) { 1158 ASSERT0(rows[i][j]); 1159 } 1160 ASSERT3U(rows[i][missing[i]], !=, 0); 1161 1162 /* 1163 * Compute the inverse of the first element and multiply each 1164 * element in the row by that value. 
1165 */ 1166 log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 1167 1168 for (j = 0; j < n; j++) { 1169 rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 1170 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 1171 } 1172 1173 for (ii = 0; ii < nmissing; ii++) { 1174 if (i == ii) 1175 continue; 1176 1177 ASSERT3U(rows[ii][missing[i]], !=, 0); 1178 1179 log = vdev_raidz_log2[rows[ii][missing[i]]]; 1180 1181 for (j = 0; j < n; j++) { 1182 rows[ii][j] ^= 1183 vdev_raidz_exp2(rows[i][j], log); 1184 invrows[ii][j] ^= 1185 vdev_raidz_exp2(invrows[i][j], log); 1186 } 1187 } 1188 } 1189 1190 /* 1191 * Verify that the data that is left in the rows are properly part of 1192 * an identity matrix. 1193 */ 1194 for (i = 0; i < nmissing; i++) { 1195 for (j = 0; j < n; j++) { 1196 if (j == missing[i]) { 1197 ASSERT3U(rows[i][j], ==, 1); 1198 } else { 1199 ASSERT0(rows[i][j]); 1200 } 1201 } 1202 } 1203 } 1204 1205 static void 1206 vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing, 1207 int *missing, uint8_t **invrows, const uint8_t *used) 1208 { 1209 int i, j, x, cc, c; 1210 uint8_t *src; 1211 uint64_t ccount; 1212 uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL }; 1213 uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 }; 1214 uint8_t log = 0; 1215 uint8_t val; 1216 int ll; 1217 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 1218 uint8_t *p, *pp; 1219 size_t psize; 1220 1221 psize = sizeof (invlog[0][0]) * n * nmissing; 1222 p = kmem_alloc(psize, KM_SLEEP); 1223 1224 for (pp = p, i = 0; i < nmissing; i++) { 1225 invlog[i] = pp; 1226 pp += n; 1227 } 1228 1229 for (i = 0; i < nmissing; i++) { 1230 for (j = 0; j < n; j++) { 1231 ASSERT3U(invrows[i][j], !=, 0); 1232 invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 1233 } 1234 } 1235 1236 for (i = 0; i < n; i++) { 1237 c = used[i]; 1238 ASSERT3U(c, <, rr->rr_cols); 1239 1240 ccount = rr->rr_col[c].rc_size; 1241 ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0); 1242 if (ccount == 0) 1243 continue; 1244 src = abd_to_buf(rr->rr_col[c].rc_abd); 1245 for (j = 0; j < nmissing; j++) { 1246 cc = missing[j] + rr->rr_firstdatacol; 1247 ASSERT3U(cc, >=, rr->rr_firstdatacol); 1248 ASSERT3U(cc, <, rr->rr_cols); 1249 ASSERT3U(cc, !=, c); 1250 1251 dcount[j] = rr->rr_col[cc].rc_size; 1252 if (dcount[j] != 0) 1253 dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd); 1254 } 1255 1256 for (x = 0; x < ccount; x++, src++) { 1257 if (*src != 0) 1258 log = vdev_raidz_log2[*src]; 1259 1260 for (cc = 0; cc < nmissing; cc++) { 1261 if (x >= dcount[cc]) 1262 continue; 1263 1264 if (*src == 0) { 1265 val = 0; 1266 } else { 1267 if ((ll = log + invlog[cc][i]) >= 255) 1268 ll -= 255; 1269 val = vdev_raidz_pow2[ll]; 1270 } 1271 1272 if (i == 0) 1273 dst[cc][x] = val; 1274 else 1275 dst[cc][x] ^= val; 1276 } 1277 } 1278 } 1279 1280 kmem_free(p, psize); 1281 } 1282 1283 static void 1284 vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) 1285 { 1286 int n, i, c, t, tt; 1287 int nmissing_rows; 1288 int missing_rows[VDEV_RAIDZ_MAXPARITY]; 1289 int parity_map[VDEV_RAIDZ_MAXPARITY]; 1290 uint8_t *p, *pp; 1291 size_t psize; 1292 uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 1293 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 1294 uint8_t *used; 1295 1296 abd_t **bufs = NULL; 1297 1298 /* 1299 * Matrix reconstruction can't use scatter ABDs yet, so we allocate 1300 * temporary linear ABDs if any non-linear ABDs are found. 
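 * (vdev_raidz_matrix_reconstruct() walks flat buffers obtained with
 * abd_to_buf(), which requires linear ABDs, hence the temporary copies
 * made here.)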
1301 */ 1302 for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) { 1303 if (!abd_is_linear(rr->rr_col[i].rc_abd)) { 1304 bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *), 1305 KM_PUSHPAGE); 1306 1307 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1308 raidz_col_t *col = &rr->rr_col[c]; 1309 1310 bufs[c] = col->rc_abd; 1311 if (bufs[c] != NULL) { 1312 col->rc_abd = abd_alloc_linear( 1313 col->rc_size, B_TRUE); 1314 abd_copy(col->rc_abd, bufs[c], 1315 col->rc_size); 1316 } 1317 } 1318 1319 break; 1320 } 1321 } 1322 1323 n = rr->rr_cols - rr->rr_firstdatacol; 1324 1325 /* 1326 * Figure out which data columns are missing. 1327 */ 1328 nmissing_rows = 0; 1329 for (t = 0; t < ntgts; t++) { 1330 if (tgts[t] >= rr->rr_firstdatacol) { 1331 missing_rows[nmissing_rows++] = 1332 tgts[t] - rr->rr_firstdatacol; 1333 } 1334 } 1335 1336 /* 1337 * Figure out which parity columns to use to help generate the missing 1338 * data columns. 1339 */ 1340 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 1341 ASSERT(tt < ntgts); 1342 ASSERT(c < rr->rr_firstdatacol); 1343 1344 /* 1345 * Skip any targeted parity columns. 1346 */ 1347 if (c == tgts[tt]) { 1348 tt++; 1349 continue; 1350 } 1351 1352 parity_map[i] = c; 1353 i++; 1354 } 1355 1356 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1357 nmissing_rows * n + sizeof (used[0]) * n; 1358 p = kmem_alloc(psize, KM_SLEEP); 1359 1360 for (pp = p, i = 0; i < nmissing_rows; i++) { 1361 rows[i] = pp; 1362 pp += n; 1363 invrows[i] = pp; 1364 pp += n; 1365 } 1366 used = pp; 1367 1368 for (i = 0; i < nmissing_rows; i++) { 1369 used[i] = parity_map[i]; 1370 } 1371 1372 for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1373 if (tt < nmissing_rows && 1374 c == missing_rows[tt] + rr->rr_firstdatacol) { 1375 tt++; 1376 continue; 1377 } 1378 1379 ASSERT3S(i, <, n); 1380 used[i] = c; 1381 i++; 1382 } 1383 1384 /* 1385 * Initialize the interesting rows of the matrix. 1386 */ 1387 vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows); 1388 1389 /* 1390 * Invert the matrix. 1391 */ 1392 vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows, 1393 invrows, used); 1394 1395 /* 1396 * Reconstruct the missing data using the generated matrix. 
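 *
 * Byte by byte, this step computes the matrix-vector product
 * D_missing = (V|I)'^-1 x (d|p)': for each surviving column c and each
 * missing column j, dst_j ^= invrow_j[c] * src_c in GF(2^8), carried out
 * with the log/exp tables (2^(log2(src) + log2(invrow)) when src != 0)
 * as implemented in vdev_raidz_matrix_reconstruct().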
1397 */ 1398 vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows, 1399 invrows, used); 1400 1401 kmem_free(p, psize); 1402 1403 /* 1404 * copy back from temporary linear abds and free them 1405 */ 1406 if (bufs) { 1407 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1408 raidz_col_t *col = &rr->rr_col[c]; 1409 1410 if (bufs[c] != NULL) { 1411 abd_copy(bufs[c], col->rc_abd, col->rc_size); 1412 abd_free(col->rc_abd); 1413 } 1414 col->rc_abd = bufs[c]; 1415 } 1416 kmem_free(bufs, rr->rr_cols * sizeof (abd_t *)); 1417 } 1418 } 1419 1420 static void 1421 vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr, 1422 const int *t, int nt) 1423 { 1424 int tgts[VDEV_RAIDZ_MAXPARITY], *dt; 1425 int ntgts; 1426 int i, c, ret; 1427 int nbadparity, nbaddata; 1428 int parity_valid[VDEV_RAIDZ_MAXPARITY]; 1429 1430 nbadparity = rr->rr_firstdatacol; 1431 nbaddata = rr->rr_cols - nbadparity; 1432 ntgts = 0; 1433 for (i = 0, c = 0; c < rr->rr_cols; c++) { 1434 if (c < rr->rr_firstdatacol) 1435 parity_valid[c] = B_FALSE; 1436 1437 if (i < nt && c == t[i]) { 1438 tgts[ntgts++] = c; 1439 i++; 1440 } else if (rr->rr_col[c].rc_error != 0) { 1441 tgts[ntgts++] = c; 1442 } else if (c >= rr->rr_firstdatacol) { 1443 nbaddata--; 1444 } else { 1445 parity_valid[c] = B_TRUE; 1446 nbadparity--; 1447 } 1448 } 1449 1450 ASSERT(ntgts >= nt); 1451 ASSERT(nbaddata >= 0); 1452 ASSERT(nbaddata + nbadparity == ntgts); 1453 1454 dt = &tgts[nbadparity]; 1455 1456 /* Reconstruct using the new math implementation */ 1457 ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata); 1458 if (ret != RAIDZ_ORIGINAL_IMPL) 1459 return; 1460 1461 /* 1462 * See if we can use any of our optimized reconstruction routines. 1463 */ 1464 switch (nbaddata) { 1465 case 1: 1466 if (parity_valid[VDEV_RAIDZ_P]) { 1467 vdev_raidz_reconstruct_p(rr, dt, 1); 1468 return; 1469 } 1470 1471 ASSERT(rr->rr_firstdatacol > 1); 1472 1473 if (parity_valid[VDEV_RAIDZ_Q]) { 1474 vdev_raidz_reconstruct_q(rr, dt, 1); 1475 return; 1476 } 1477 1478 ASSERT(rr->rr_firstdatacol > 2); 1479 break; 1480 1481 case 2: 1482 ASSERT(rr->rr_firstdatacol > 1); 1483 1484 if (parity_valid[VDEV_RAIDZ_P] && 1485 parity_valid[VDEV_RAIDZ_Q]) { 1486 vdev_raidz_reconstruct_pq(rr, dt, 2); 1487 return; 1488 } 1489 1490 ASSERT(rr->rr_firstdatacol > 2); 1491 1492 break; 1493 } 1494 1495 vdev_raidz_reconstruct_general(rr, tgts, ntgts); 1496 } 1497 1498 static int 1499 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, 1500 uint64_t *logical_ashift, uint64_t *physical_ashift) 1501 { 1502 vdev_raidz_t *vdrz = vd->vdev_tsd; 1503 uint64_t nparity = vdrz->vd_nparity; 1504 int c; 1505 int lasterror = 0; 1506 int numerrors = 0; 1507 1508 ASSERT(nparity > 0); 1509 1510 if (nparity > VDEV_RAIDZ_MAXPARITY || 1511 vd->vdev_children < nparity + 1) { 1512 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 1513 return (SET_ERROR(EINVAL)); 1514 } 1515 1516 vdev_open_children(vd); 1517 1518 for (c = 0; c < vd->vdev_children; c++) { 1519 vdev_t *cvd = vd->vdev_child[c]; 1520 1521 if (cvd->vdev_open_error != 0) { 1522 lasterror = cvd->vdev_open_error; 1523 numerrors++; 1524 continue; 1525 } 1526 1527 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 1528 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; 1529 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); 1530 } 1531 for (c = 0; c < vd->vdev_children; c++) { 1532 vdev_t *cvd = vd->vdev_child[c]; 1533 1534 if (cvd->vdev_open_error != 0) 1535 continue; 1536 *physical_ashift = 
vdev_best_ashift(*logical_ashift, 1537 *physical_ashift, cvd->vdev_physical_ashift); 1538 } 1539 1540 *asize *= vd->vdev_children; 1541 *max_asize *= vd->vdev_children; 1542 1543 if (numerrors > nparity) { 1544 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 1545 return (lasterror); 1546 } 1547 1548 return (0); 1549 } 1550 1551 static void 1552 vdev_raidz_close(vdev_t *vd) 1553 { 1554 for (int c = 0; c < vd->vdev_children; c++) { 1555 if (vd->vdev_child[c] != NULL) 1556 vdev_close(vd->vdev_child[c]); 1557 } 1558 } 1559 1560 static uint64_t 1561 vdev_raidz_asize(vdev_t *vd, uint64_t psize) 1562 { 1563 vdev_raidz_t *vdrz = vd->vdev_tsd; 1564 uint64_t asize; 1565 uint64_t ashift = vd->vdev_top->vdev_ashift; 1566 uint64_t cols = vdrz->vd_logical_width; 1567 uint64_t nparity = vdrz->vd_nparity; 1568 1569 asize = ((psize - 1) >> ashift) + 1; 1570 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 1571 asize = roundup(asize, nparity + 1) << ashift; 1572 1573 return (asize); 1574 } 1575 1576 /* 1577 * The allocatable space for a raidz vdev is N * sizeof(smallest child) 1578 * so each child must provide at least 1/Nth of its asize. 1579 */ 1580 static uint64_t 1581 vdev_raidz_min_asize(vdev_t *vd) 1582 { 1583 return ((vd->vdev_min_asize + vd->vdev_children - 1) / 1584 vd->vdev_children); 1585 } 1586 1587 void 1588 vdev_raidz_child_done(zio_t *zio) 1589 { 1590 raidz_col_t *rc = zio->io_private; 1591 1592 ASSERT3P(rc->rc_abd, !=, NULL); 1593 rc->rc_error = zio->io_error; 1594 rc->rc_tried = 1; 1595 rc->rc_skipped = 0; 1596 } 1597 1598 static void 1599 vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col) 1600 { 1601 #ifdef ZFS_DEBUG 1602 vdev_t *tvd = vd->vdev_top; 1603 1604 range_seg64_t logical_rs, physical_rs, remain_rs; 1605 logical_rs.rs_start = rr->rr_offset; 1606 logical_rs.rs_end = logical_rs.rs_start + 1607 vdev_raidz_asize(vd, rr->rr_size); 1608 1609 raidz_col_t *rc = &rr->rr_col[col]; 1610 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 1611 1612 vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs); 1613 ASSERT(vdev_xlate_is_empty(&remain_rs)); 1614 ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); 1615 ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); 1616 /* 1617 * It would be nice to assert that rs_end is equal 1618 * to rc_offset + rc_size but there might be an 1619 * optional I/O at the end that is not accounted in 1620 * rc_size. 1621 */ 1622 if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { 1623 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + 1624 rc->rc_size + (1 << tvd->vdev_ashift)); 1625 } else { 1626 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); 1627 } 1628 #endif 1629 } 1630 1631 static void 1632 vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift) 1633 { 1634 vdev_t *vd = zio->io_vd; 1635 raidz_map_t *rm = zio->io_vsd; 1636 1637 vdev_raidz_generate_parity_row(rm, rr); 1638 1639 for (int c = 0; c < rr->rr_scols; c++) { 1640 raidz_col_t *rc = &rr->rr_col[c]; 1641 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 1642 1643 /* Verify physical to logical translation */ 1644 vdev_raidz_io_verify(vd, rr, c); 1645 1646 if (rc->rc_size > 0) { 1647 ASSERT3P(rc->rc_abd, !=, NULL); 1648 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1649 rc->rc_offset, rc->rc_abd, 1650 abd_get_size(rc->rc_abd), zio->io_type, 1651 zio->io_priority, 0, vdev_raidz_child_done, rc)); 1652 } else { 1653 /* 1654 * Generate optional write for skip sector to improve 1655 * aggregation contiguity. 
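 * The child I/O issued below has no data buffer (NULL abd, one sector of
 * size 1 << ashift) and is flagged ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL,
 * so the vdev queue is free to use it to bridge adjacent writes into a
 * single aggregated I/O, or to drop it if it does not help.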
1656 */ 1657 ASSERT3P(rc->rc_abd, ==, NULL); 1658 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1659 rc->rc_offset, NULL, 1ULL << ashift, 1660 zio->io_type, zio->io_priority, 1661 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, 1662 NULL)); 1663 } 1664 } 1665 } 1666 1667 static void 1668 vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr) 1669 { 1670 vdev_t *vd = zio->io_vd; 1671 1672 /* 1673 * Iterate over the columns in reverse order so that we hit the parity 1674 * last -- any errors along the way will force us to read the parity. 1675 */ 1676 for (int c = rr->rr_cols - 1; c >= 0; c--) { 1677 raidz_col_t *rc = &rr->rr_col[c]; 1678 if (rc->rc_size == 0) 1679 continue; 1680 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 1681 if (!vdev_readable(cvd)) { 1682 if (c >= rr->rr_firstdatacol) 1683 rr->rr_missingdata++; 1684 else 1685 rr->rr_missingparity++; 1686 rc->rc_error = SET_ERROR(ENXIO); 1687 rc->rc_tried = 1; /* don't even try */ 1688 rc->rc_skipped = 1; 1689 continue; 1690 } 1691 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 1692 if (c >= rr->rr_firstdatacol) 1693 rr->rr_missingdata++; 1694 else 1695 rr->rr_missingparity++; 1696 rc->rc_error = SET_ERROR(ESTALE); 1697 rc->rc_skipped = 1; 1698 continue; 1699 } 1700 if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || 1701 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { 1702 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1703 rc->rc_offset, rc->rc_abd, rc->rc_size, 1704 zio->io_type, zio->io_priority, 0, 1705 vdev_raidz_child_done, rc)); 1706 } 1707 } 1708 } 1709 1710 /* 1711 * Start an IO operation on a RAIDZ VDev 1712 * 1713 * Outline: 1714 * - For write operations: 1715 * 1. Generate the parity data 1716 * 2. Create child zio write operations to each column's vdev, for both 1717 * data and parity. 1718 * 3. If the column skips any sectors for padding, create optional dummy 1719 * write zio children for those areas to improve aggregation continuity. 1720 * - For read operations: 1721 * 1. Create child zio read operations to each data column's vdev to read 1722 * the range of data required for zio. 1723 * 2. If this is a scrub or resilver operation, or if any of the data 1724 * vdevs have had errors, then create zio read operations to the parity 1725 * columns' VDevs as well. 1726 */ 1727 static void 1728 vdev_raidz_io_start(zio_t *zio) 1729 { 1730 vdev_t *vd = zio->io_vd; 1731 vdev_t *tvd = vd->vdev_top; 1732 vdev_raidz_t *vdrz = vd->vdev_tsd; 1733 1734 raidz_map_t *rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, 1735 vdrz->vd_logical_width, vdrz->vd_nparity); 1736 zio->io_vsd = rm; 1737 zio->io_vsd_ops = &vdev_raidz_vsd_ops; 1738 1739 /* 1740 * Until raidz expansion is implemented all maps for a raidz vdev 1741 * contain a single row. 1742 */ 1743 ASSERT3U(rm->rm_nrows, ==, 1); 1744 raidz_row_t *rr = rm->rm_row[0]; 1745 1746 if (zio->io_type == ZIO_TYPE_WRITE) { 1747 vdev_raidz_io_start_write(zio, rr, tvd->vdev_ashift); 1748 } else { 1749 ASSERT(zio->io_type == ZIO_TYPE_READ); 1750 vdev_raidz_io_start_read(zio, rr); 1751 } 1752 1753 zio_execute(zio); 1754 } 1755 1756 /* 1757 * Report a checksum error for a child of a RAID-Z device. 
1758 */ 1759 void 1760 vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) 1761 { 1762 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 1763 1764 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) && 1765 zio->io_priority != ZIO_PRIORITY_REBUILD) { 1766 zio_bad_cksum_t zbc; 1767 raidz_map_t *rm = zio->io_vsd; 1768 1769 zbc.zbc_has_cksum = 0; 1770 zbc.zbc_injected = rm->rm_ecksuminjected; 1771 1772 mutex_enter(&vd->vdev_stat_lock); 1773 vd->vdev_stat.vs_checksum_errors++; 1774 mutex_exit(&vd->vdev_stat_lock); 1775 (void) zfs_ereport_post_checksum(zio->io_spa, vd, 1776 &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, 1777 rc->rc_abd, bad_data, &zbc); 1778 } 1779 } 1780 1781 /* 1782 * We keep track of whether or not there were any injected errors, so that 1783 * any ereports we generate can note it. 1784 */ 1785 static int 1786 raidz_checksum_verify(zio_t *zio) 1787 { 1788 zio_bad_cksum_t zbc = {0}; 1789 raidz_map_t *rm = zio->io_vsd; 1790 1791 int ret = zio_checksum_error(zio, &zbc); 1792 if (ret != 0 && zbc.zbc_injected != 0) 1793 rm->rm_ecksuminjected = 1; 1794 1795 return (ret); 1796 } 1797 1798 /* 1799 * Generate the parity from the data columns. If we tried and were able to 1800 * read the parity without error, verify that the generated parity matches the 1801 * data we read. If it doesn't, we fire off a checksum error. Return the 1802 * number of such failures. 1803 */ 1804 static int 1805 raidz_parity_verify(zio_t *zio, raidz_row_t *rr) 1806 { 1807 abd_t *orig[VDEV_RAIDZ_MAXPARITY]; 1808 int c, ret = 0; 1809 raidz_map_t *rm = zio->io_vsd; 1810 raidz_col_t *rc; 1811 1812 blkptr_t *bp = zio->io_bp; 1813 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : 1814 (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); 1815 1816 if (checksum == ZIO_CHECKSUM_NOPARITY) 1817 return (ret); 1818 1819 for (c = 0; c < rr->rr_firstdatacol; c++) { 1820 rc = &rr->rr_col[c]; 1821 if (!rc->rc_tried || rc->rc_error != 0) 1822 continue; 1823 1824 orig[c] = rc->rc_abd; 1825 ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size); 1826 rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); 1827 } 1828 1829 /* 1830 * Verify any empty sectors are zero filled to ensure the parity 1831 * is calculated correctly even if these non-data sectors are damaged. 1832 */ 1833 if (rr->rr_nempty && rr->rr_abd_empty != NULL) 1834 ret += vdev_draid_map_verify_empty(zio, rr); 1835 1836 /* 1837 * Regenerates parity even for !tried||rc_error!=0 columns. This 1838 * isn't harmful but it does have the side effect of fixing stuff 1839 * we didn't realize was necessary (i.e. even if we return 0). 
1840 */ 1841 vdev_raidz_generate_parity_row(rm, rr); 1842 1843 for (c = 0; c < rr->rr_firstdatacol; c++) { 1844 rc = &rr->rr_col[c]; 1845 1846 if (!rc->rc_tried || rc->rc_error != 0) 1847 continue; 1848 1849 if (abd_cmp(orig[c], rc->rc_abd) != 0) { 1850 vdev_raidz_checksum_error(zio, rc, orig[c]); 1851 rc->rc_error = SET_ERROR(ECKSUM); 1852 ret++; 1853 } 1854 abd_free(orig[c]); 1855 } 1856 1857 return (ret); 1858 } 1859 1860 static int 1861 vdev_raidz_worst_error(raidz_row_t *rr) 1862 { 1863 int error = 0; 1864 1865 for (int c = 0; c < rr->rr_cols; c++) 1866 error = zio_worst_error(error, rr->rr_col[c].rc_error); 1867 1868 return (error); 1869 } 1870 1871 static void 1872 vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) 1873 { 1874 int unexpected_errors = 0; 1875 int parity_errors = 0; 1876 int parity_untried = 0; 1877 int data_errors = 0; 1878 1879 ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); 1880 1881 for (int c = 0; c < rr->rr_cols; c++) { 1882 raidz_col_t *rc = &rr->rr_col[c]; 1883 1884 if (rc->rc_error) { 1885 if (c < rr->rr_firstdatacol) 1886 parity_errors++; 1887 else 1888 data_errors++; 1889 1890 if (!rc->rc_skipped) 1891 unexpected_errors++; 1892 } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { 1893 parity_untried++; 1894 } 1895 1896 if (rc->rc_force_repair) 1897 unexpected_errors++; 1898 } 1899 1900 /* 1901 * If we read more parity disks than were used for 1902 * reconstruction, confirm that the other parity disks produced 1903 * correct data. 1904 * 1905 * Note that we also regenerate parity when resilvering so we 1906 * can write it out to failed devices later. 1907 */ 1908 if (parity_errors + parity_untried < 1909 rr->rr_firstdatacol - data_errors || 1910 (zio->io_flags & ZIO_FLAG_RESILVER)) { 1911 int n = raidz_parity_verify(zio, rr); 1912 unexpected_errors += n; 1913 } 1914 1915 if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 1916 (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) { 1917 /* 1918 * Use the good data we have in hand to repair damaged children. 1919 */ 1920 for (int c = 0; c < rr->rr_cols; c++) { 1921 raidz_col_t *rc = &rr->rr_col[c]; 1922 vdev_t *vd = zio->io_vd; 1923 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 1924 1925 if (!rc->rc_allow_repair) { 1926 continue; 1927 } else if (!rc->rc_force_repair && 1928 (rc->rc_error == 0 || rc->rc_size == 0)) { 1929 continue; 1930 } 1931 1932 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1933 rc->rc_offset, rc->rc_abd, rc->rc_size, 1934 ZIO_TYPE_WRITE, 1935 zio->io_priority == ZIO_PRIORITY_REBUILD ? 1936 ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE, 1937 ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 
static void
raidz_restore_orig_data(raidz_map_t *rm)
{
	for (int i = 0; i < rm->rm_nrows; i++) {
		raidz_row_t *rr = rm->rm_row[i];
		for (int c = 0; c < rr->rr_cols; c++) {
			raidz_col_t *rc = &rr->rr_col[c];
			if (rc->rc_need_orig_restore) {
				abd_copy(rc->rc_abd,
				    rc->rc_orig_data, rc->rc_size);
				rc->rc_need_orig_restore = B_FALSE;
			}
		}
	}
}

/*
 * returns EINVAL if reconstruction of the block will not be possible
 * returns ECKSUM if this specific reconstruction failed
 * returns 0 on successful reconstruction
 */
static int
raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
{
	raidz_map_t *rm = zio->io_vsd;

	/* Reconstruct each row */
	for (int r = 0; r < rm->rm_nrows; r++) {
		raidz_row_t *rr = rm->rm_row[r];
		int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */
		int t = 0;
		int dead = 0;
		int dead_data = 0;

		for (int c = 0; c < rr->rr_cols; c++) {
			raidz_col_t *rc = &rr->rr_col[c];
			ASSERT0(rc->rc_need_orig_restore);
			if (rc->rc_error != 0) {
				dead++;
				if (c >= nparity)
					dead_data++;
				continue;
			}
			if (rc->rc_size == 0)
				continue;
			for (int lt = 0; lt < ntgts; lt++) {
				if (rc->rc_devidx == ltgts[lt]) {
					if (rc->rc_orig_data == NULL) {
						rc->rc_orig_data =
						    abd_alloc_linear(
						    rc->rc_size, B_TRUE);
						abd_copy(rc->rc_orig_data,
						    rc->rc_abd, rc->rc_size);
					}
					rc->rc_need_orig_restore = B_TRUE;

					dead++;
					if (c >= nparity)
						dead_data++;
					my_tgts[t++] = c;
					break;
				}
			}
		}
		if (dead > nparity) {
			/* reconstruction not possible */
			raidz_restore_orig_data(rm);
			return (EINVAL);
		}
		if (dead_data > 0)
			vdev_raidz_reconstruct_row(rm, rr, my_tgts, t);
	}

	/* Check for success */
	if (raidz_checksum_verify(zio) == 0) {

		/* Reconstruction succeeded - report errors */
		for (int i = 0; i < rm->rm_nrows; i++) {
			raidz_row_t *rr = rm->rm_row[i];

			for (int c = 0; c < rr->rr_cols; c++) {
				raidz_col_t *rc = &rr->rr_col[c];
				if (rc->rc_need_orig_restore) {
					/*
					 * Note: if this is a parity column,
					 * we don't really know if it's wrong.
					 * We need to let
					 * vdev_raidz_io_done_verified() check
					 * it, and if we set rc_error, it will
					 * think that it is a "known" error
					 * that doesn't need to be checked
					 * or corrected.
					 */
					if (rc->rc_error == 0 &&
					    c >= rr->rr_firstdatacol) {
						vdev_raidz_checksum_error(zio,
						    rc, rc->rc_orig_data);
						rc->rc_error =
						    SET_ERROR(ECKSUM);
					}
					rc->rc_need_orig_restore = B_FALSE;
				}
			}

			vdev_raidz_io_done_verified(zio, rr);
		}

		zio_checksum_verified(zio);

		return (0);
	}

	/* Reconstruction failed - restore original data */
	raidz_restore_orig_data(rm);
	return (ECKSUM);
}

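/*
 * Illustration of the ltgts[] / my_tgts[] distinction in
 * raidz_reconstruct() above, with a hypothetical layout: ltgts[] holds
 * logical child indices of the top-level vdev, while my_tgts[] holds
 * column indices within one row. If ltgts = { 2, 5 } on a 6-child vdev
 * and a row's columns map to children { 4, 5, 0, 1, 2, 3 } (the per-column
 * rc_devidx), then columns 1 and 4 are the ones whose contents are saved
 * to rc_orig_data and passed to vdev_raidz_reconstruct_row() for that row.
 */
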
/*
 * Iterate over all combinations of N bad vdevs and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 *
 * The order in which we find the various possible combinations of failed
 * disks is dictated by these rules:
 * - Examine each "slot" (the "i" in tgts[i])
 *   - Try to increment this slot (tgts[i] = tgts[i] + 1)
 *   - if we can't increment because it runs into the next slot,
 *     reset our slot to the minimum, and examine the next slot
 *
 * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
 * 3 columns to reconstruct), we will generate the following sequence:
 *
 * STATE	ACTION
 * 0 1 2	special case: skip since these are all parity
 * 0 1 3	first slot: reset to 0; middle slot: increment to 2
 * 0 2 3	first slot: increment to 1
 * 1 2 3	first: reset to 0; middle: reset to 1; last: increment to 4
 * 0 1 4	first: reset to 0; middle: increment to 2
 * 0 2 4	first: increment to 1
 * 1 2 4	first: reset to 0; middle: increment to 3
 * 0 3 4	first: increment to 1
 * 1 3 4	first: increment to 2
 * 2 3 4	first: reset to 0; middle: reset to 1; last: increment to 5
 * 0 1 5	first: reset to 0; middle: increment to 2
 * 0 2 5	first: increment to 1
 * 1 2 5	first: reset to 0; middle: increment to 3
 * 0 3 5	first: increment to 1
 * 1 3 5	first: increment to 2
 * 2 3 5	first: reset to 0; middle: increment to 4
 * 0 4 5	first: increment to 1
 * 1 4 5	first: increment to 2
 * 2 4 5	first: increment to 3
 * 3 4 5	done
 *
 * This strategy works for dRAID but is less efficient when there are a large
 * number of child vdevs and therefore permutations to check. Furthermore,
 * since the raidz_map_t rows likely do not overlap, reconstruction would be
 * possible as long as there are no more than nparity data errors per row.
 * These additional permutations are not currently checked but could be
 * checked as a future improvement.
 */
static int
vdev_raidz_combrec(zio_t *zio)
{
	int nparity = vdev_get_nparity(zio->io_vd);
	raidz_map_t *rm = zio->io_vsd;

	/* Check if there's enough data to attempt reconstruction. */
	for (int i = 0; i < rm->rm_nrows; i++) {
		raidz_row_t *rr = rm->rm_row[i];
		int total_errors = 0;

		for (int c = 0; c < rr->rr_cols; c++) {
			if (rr->rr_col[c].rc_error)
				total_errors++;
		}

		if (total_errors > nparity)
			return (vdev_raidz_worst_error(rr));
	}

	for (int num_failures = 1; num_failures <= nparity; num_failures++) {
		int tstore[VDEV_RAIDZ_MAXPARITY + 2];
		int *ltgts = &tstore[1]; /* value is logical child ID */

		/* Determine number of logical children, n */
		int n = zio->io_vd->vdev_children;

		ASSERT3U(num_failures, <=, nparity);
		ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);

		/* Handle corner cases in combrec logic */
		ltgts[-1] = -1;
		for (int i = 0; i < num_failures; i++) {
			ltgts[i] = i;
		}
		ltgts[num_failures] = n;

		for (;;) {
			int err = raidz_reconstruct(zio, ltgts, num_failures,
			    nparity);
			if (err == EINVAL) {
				/*
				 * Reconstruction is not possible with this
				 * number of failures; try more failures.
				 */
				break;
			} else if (err == 0)
				return (0);

			/* Compute next targets to try */
			for (int t = 0; ; t++) {
				ASSERT3U(t, <, num_failures);
				ltgts[t]++;
				if (ltgts[t] == n) {
					/* try more failures */
					ASSERT3U(t, ==, num_failures - 1);
					break;
				}

				ASSERT3U(ltgts[t], <, n);
				ASSERT3U(ltgts[t], <=, ltgts[t + 1]);

				/*
				 * If that spot is available, we're done here.
				 * Try the next combination.
				 */
				if (ltgts[t] != ltgts[t + 1])
					break;

				/*
				 * Otherwise, reset this tgt to the minimum,
				 * and move on to the next tgt.
				 */
				ltgts[t] = ltgts[t - 1] + 1;
				ASSERT3U(ltgts[t], ==, t);
			}

			/* Increase the number of failures and keep trying. */
			if (ltgts[num_failures - 1] == n)
				break;
		}
	}

	return (ECKSUM);
}

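/*
 * A minimal user-space sketch of the combination walk performed by
 * vdev_raidz_combrec() above, assuming the same ltgts[] layout (sentinels
 * at ltgts[-1] == -1 and ltgts[num_failures] == n). The function name and
 * the printf() reporting are illustrative only and not part of the driver;
 * the sketch simply prints each combination of logical children that
 * raidz_reconstruct() would be asked to try.
 *
 *	#include <stdio.h>
 *
 *	static void
 *	walk_combinations(int num_failures, int n)
 *	{
 *		int tstore[num_failures + 2];
 *		int *ltgts = &tstore[1];
 *
 *		ltgts[-1] = -1;
 *		for (int i = 0; i < num_failures; i++)
 *			ltgts[i] = i;
 *		ltgts[num_failures] = n;
 *
 *		for (;;) {
 *			for (int i = 0; i < num_failures; i++)
 *				printf("%d ", ltgts[i]);
 *			printf("\n");
 *
 *			for (int t = 0; ; t++) {
 *				ltgts[t]++;
 *				if (ltgts[t] == n)
 *					break;
 *				if (ltgts[t] != ltgts[t + 1])
 *					break;
 *				ltgts[t] = ltgts[t - 1] + 1;
 *			}
 *			if (ltgts[num_failures - 1] == n)
 *				break;
 *		}
 *	}
 *
 * For num_failures == 2 and n == 4 this visits: 0 1, 0 2, 1 2, 0 3, 1 3,
 * 2 3 -- the same order the driver uses, minus the actual reconstruction
 * attempts.
 */
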
void
vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
{
	for (uint64_t row = 0; row < rm->rm_nrows; row++) {
		raidz_row_t *rr = rm->rm_row[row];
		vdev_raidz_reconstruct_row(rm, rr, t, nt);
	}
}

/*
 * Complete a write IO operation on a RAIDZ VDev
 *
 * Outline:
 * 1. Check for errors on the child IOs.
 * 2. Return, setting an error code if too few child VDevs were written
 *    to reconstruct the data later.  Note that partial writes are
 *    considered successful if they can be reconstructed at all.
 */
static void
vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
{
	int total_errors = 0;

	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);

	for (int c = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			total_errors++;
		}
	}

	/*
	 * Treat partial writes as a success. If we couldn't write enough
	 * columns to reconstruct the data, the I/O failed.  Otherwise,
	 * good enough.
	 *
	 * Now that we support write reallocation, it would be better
	 * to treat partial failure as real failure unless there are
	 * no non-degraded top-level vdevs left, and not update DTLs
	 * if we intend to reallocate.
	 */
	if (total_errors > rr->rr_firstdatacol) {
		zio->io_error = zio_worst_error(zio->io_error,
		    vdev_raidz_worst_error(rr));
	}
}

static void
vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
    raidz_row_t *rr)
{
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;

	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);

	for (int c = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		/*
		 * If scrubbing and a replacing/sparing child vdev determined
		 * that not all of its children have an identical copy of the
		 * data, then clear the error so the column is treated like
		 * any other read and force a repair to correct the damage.
		 */
		if (rc->rc_error == ECKSUM) {
			ASSERT(zio->io_flags & ZIO_FLAG_SCRUB);
			vdev_raidz_checksum_error(zio, rc, rc->rc_abd);
			rc->rc_force_repair = 1;
			rc->rc_error = 0;
		}

		if (rc->rc_error) {
			if (c < rr->rr_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			total_errors++;
		} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * If there were data errors and the number of errors we saw was
	 * correctable -- less than or equal to the number of parity disks read
	 * -- reconstruct based on the missing data.
	 */
	if (data_errors != 0 &&
	    total_errors <= rr->rr_firstdatacol - parity_untried) {
		/*
		 * We either attempt to read all the parity columns or
		 * none of them. If we didn't try to read parity, we
		 * wouldn't be here in the correctable case. There must
		 * also have been fewer parity errors than parity
		 * columns or, again, we wouldn't be in this code path.
		 */
		ASSERT(parity_untried == 0);
		ASSERT(parity_errors < rr->rr_firstdatacol);

		/*
		 * Identify the data columns that reported an error.
		 */
		int n = 0;
		int tgts[VDEV_RAIDZ_MAXPARITY];
		for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
			raidz_col_t *rc = &rr->rr_col[c];
			if (rc->rc_error != 0) {
				ASSERT(n < VDEV_RAIDZ_MAXPARITY);
				tgts[n++] = c;
			}
		}

		ASSERT(rr->rr_firstdatacol >= n);

		vdev_raidz_reconstruct_row(rm, rr, tgts, n);
	}
}

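/*
 * Worked examples of the correctability test above, again for a
 * hypothetical raidz2 row (rr_firstdatacol == 2); the counts are
 * illustrative only:
 *
 * - One data column failed and both parity columns were read cleanly:
 *   total_errors = 1 <= 2 - 0, so the row is reconstructed here.
 * - One data column and one parity column failed (all parity was tried):
 *   total_errors = 2 <= 2 - 0, so the data is rebuilt from the surviving
 *   parity column.
 * - One data column failed but parity was never read (parity_untried == 2):
 *   1 <= 2 - 2 is false, so nothing is done here; vdev_raidz_io_done()
 *   will read the remaining columns and retry instead.
 * - Three columns failed: total_errors = 3 > 2, so the row is left for
 *   combinatorial reconstruction or unrecoverable handling below.
 */
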
/*
 * Return the number of reads issued.
 */
static int
vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
{
	vdev_t *vd = zio->io_vd;
	int nread = 0;

	rr->rr_missingdata = 0;
	rr->rr_missingparity = 0;

	/*
	 * If this row contains empty sectors which are not required
	 * for a normal read then allocate an ABD for them now so they
	 * may be read, verified, and any needed repairs performed.
	 */
	if (rr->rr_nempty && rr->rr_abd_empty == NULL)
		vdev_draid_map_alloc_empty(zio, rr);

	for (int c = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		if (rc->rc_tried || rc->rc_size == 0)
			continue;

		zio_nowait(zio_vdev_child_io(zio, NULL,
		    vd->vdev_child[rc->rc_devidx],
		    rc->rc_offset, rc->rc_abd, rc->rc_size,
		    zio->io_type, zio->io_priority, 0,
		    vdev_raidz_child_done, rc));
		nread++;
	}
	return (nread);
}

/*
 * We're here because either there were too many errors to even attempt
 * reconstruction (total_errors > nparity for some row), or vdev_*_combrec()
 * failed. In either case, there is enough bad data to prevent reconstruction.
 * Start checksum ereports for all children which haven't failed.
 */
static void
vdev_raidz_io_done_unrecoverable(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	for (int i = 0; i < rm->rm_nrows; i++) {
		raidz_row_t *rr = rm->rm_row[i];

		for (int c = 0; c < rr->rr_cols; c++) {
			raidz_col_t *rc = &rr->rr_col[c];
			vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error != 0)
				continue;

			zio_bad_cksum_t zbc;
			zbc.zbc_has_cksum = 0;
			zbc.zbc_injected = rm->rm_ecksuminjected;

			mutex_enter(&cvd->vdev_stat_lock);
			cvd->vdev_stat.vs_checksum_errors++;
			mutex_exit(&cvd->vdev_stat_lock);
			(void) zfs_ereport_start_checksum(zio->io_spa,
			    cvd, &zio->io_bookmark, zio, rc->rc_offset,
			    rc->rc_size, &zbc);
		}
	}
}

void
vdev_raidz_io_done(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	if (zio->io_type == ZIO_TYPE_WRITE) {
		for (int i = 0; i < rm->rm_nrows; i++) {
			vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
		}
	} else {
		for (int i = 0; i < rm->rm_nrows; i++) {
			raidz_row_t *rr = rm->rm_row[i];
			vdev_raidz_io_done_reconstruct_known_missing(zio,
			    rm, rr);
		}

		if (raidz_checksum_verify(zio) == 0) {
			for (int i = 0; i < rm->rm_nrows; i++) {
				raidz_row_t *rr = rm->rm_row[i];
				vdev_raidz_io_done_verified(zio, rr);
			}
			zio_checksum_verified(zio);
		} else {
			/*
			 * A sequential resilver has no checksum, which makes
			 * combinatorial reconstruction impossible. This code
			 * path is unreachable for rebuild I/Os because
			 * raidz_checksum_verify() then has no checksum to
			 * verify and must succeed.
			 */
			ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD);

			/*
			 * This isn't a typical situation -- either we got a
			 * read error or a child silently returned bad data.
			 * Read every block so we can try again with as much
			 * data and parity as we can track down. If we've
			 * already been through once before, all children will
			 * be marked as tried so we'll proceed to combinatorial
			 * reconstruction.
			 */
			int nread = 0;
			for (int i = 0; i < rm->rm_nrows; i++) {
				nread += vdev_raidz_read_all(zio,
				    rm->rm_row[i]);
			}
			if (nread != 0) {
				/*
				 * Normally our stage is VDEV_IO_DONE, but if
				 * we've already called redone(), it will have
				 * changed to VDEV_IO_START, in which case we
				 * don't want to call redone() again.
				 */
				if (zio->io_stage != ZIO_STAGE_VDEV_IO_START)
					zio_vdev_io_redone(zio);
				return;
			}

			zio->io_error = vdev_raidz_combrec(zio);
			if (zio->io_error == ECKSUM &&
			    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
				vdev_raidz_io_done_unrecoverable(zio);
			}
		}
	}
}

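/*
 * Informal summary of the read path of vdev_raidz_io_done() above:
 *
 * 1. Reconstruct any columns with known errors from parity
 *    (vdev_raidz_io_done_reconstruct_known_missing()).
 * 2. If the block checksum now verifies, cross-check and repair the
 *    remaining columns (vdev_raidz_io_done_verified()) and finish.
 * 3. Otherwise, issue reads for every column not yet tried and re-enter
 *    this function via zio_vdev_io_redone().
 * 4. Once every column has been tried, fall back to combinatorial
 *    reconstruction (vdev_raidz_combrec()); if that also fails, post
 *    checksum ereports for the children that did not report errors.
 */
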
static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
	vdev_raidz_t *vdrz = vd->vdev_tsd;
	if (faulted > vdrz->vd_nparity)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_NO_REPLICAS);
	else if (degraded + faulted != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	else
		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}

/*
 * Determine if any portion of the provided block resides on a child vdev
 * with a dirty DTL and therefore needs to be resilvered.  The function
 * assumes that at least one DTL is dirty which implies that full stripe
 * width blocks must be resilvered.
 */
static boolean_t
vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
    uint64_t phys_birth)
{
	vdev_raidz_t *vdrz = vd->vdev_tsd;
	uint64_t dcols = vd->vdev_children;
	uint64_t nparity = vdrz->vd_nparity;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = DVA_GET_OFFSET(dva) >> ashift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = ((psize - 1) >> ashift) + 1;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;

	/* Unreachable by sequential resilver. */
	ASSERT3U(phys_birth, !=, TXG_UNKNOWN);

	if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
		return (B_FALSE);

	if (s + nparity >= dcols)
		return (B_TRUE);

	for (uint64_t c = 0; c < s + nparity; c++) {
		uint64_t devidx = (f + c) % dcols;
		vdev_t *cvd = vd->vdev_child[devidx];

		/*
		 * dsl_scan_need_resilver() already checked vd with
		 * vdev_dtl_contains(). So here just check cvd with
		 * vdev_dtl_empty(), cheaper and a good approximation.
		 */
		if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
			return (B_TRUE);
	}

	return (B_FALSE);
}

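/*
 * Worked example for the stripe math above, using hypothetical values:
 * a 6-child raidz2 (dcols == 6, nparity == 2) with ashift == 9, a DVA
 * offset of 0x3200, and psize == 0x800:
 *
 *	b = 0x3200 >> 9 = 25			(starting parent sector)
 *	s = ((0x800 - 1) >> 9) + 1 = 4		(sectors of data)
 *	f = 25 % 6 = 1				(first child column touched)
 *
 * Here s + nparity == 6 >= dcols, so the block spans a full stripe width
 * and the function returns B_TRUE without checking individual children.
 * With a smaller block (say s == 2) only the children at (f + c) % dcols
 * for c in [0, s + nparity), i.e. children 1 through 4, would have their
 * DTLs checked.
 */
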
static void
vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs,
    range_seg64_t *physical_rs, range_seg64_t *remain_rs)
{
	(void) remain_rs;

	vdev_t *raidvd = cvd->vdev_parent;
	ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);

	uint64_t width = raidvd->vdev_children;
	uint64_t tgt_col = cvd->vdev_id;
	uint64_t ashift = raidvd->vdev_top->vdev_ashift;

	/* make sure the offsets are block-aligned */
	ASSERT0(logical_rs->rs_start % (1 << ashift));
	ASSERT0(logical_rs->rs_end % (1 << ashift));
	uint64_t b_start = logical_rs->rs_start >> ashift;
	uint64_t b_end = logical_rs->rs_end >> ashift;

	uint64_t start_row = 0;
	if (b_start > tgt_col) /* avoid underflow */
		start_row = ((b_start - tgt_col - 1) / width) + 1;

	uint64_t end_row = 0;
	if (b_end > tgt_col)
		end_row = ((b_end - tgt_col - 1) / width) + 1;

	physical_rs->rs_start = start_row << ashift;
	physical_rs->rs_end = end_row << ashift;

	ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start);
	ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
	    logical_rs->rs_end - logical_rs->rs_start);
}

/*
 * Initialize private RAIDZ specific fields from the nvlist.
 */
static int
vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
{
	vdev_raidz_t *vdrz;
	uint64_t nparity;

	uint_t children;
	nvlist_t **child;
	int error = nvlist_lookup_nvlist_array(nv,
	    ZPOOL_CONFIG_CHILDREN, &child, &children);
	if (error != 0)
		return (SET_ERROR(EINVAL));

	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) {
		if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
			return (SET_ERROR(EINVAL));

		/*
		 * Previous versions could only support 1 or 2 parity
		 * devices.
		 */
		if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2)
			return (SET_ERROR(EINVAL));
		else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3)
			return (SET_ERROR(EINVAL));
	} else {
		/*
		 * We require the parity to be specified for SPAs that
		 * support multiple parity levels.
		 */
		if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
			return (SET_ERROR(EINVAL));

		/*
		 * Otherwise, we default to 1 parity device for RAID-Z.
		 */
		nparity = 1;
	}

	vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
	vdrz->vd_logical_width = children;
	vdrz->vd_nparity = nparity;

	*tsd = vdrz;

	return (0);
}

static void
vdev_raidz_fini(vdev_t *vd)
{
	kmem_free(vd->vdev_tsd, sizeof (vdev_raidz_t));
}

/*
 * Add RAIDZ specific fields to the config nvlist.
 */
static void
vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
{
	ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
	vdev_raidz_t *vdrz = vd->vdev_tsd;

	/*
	 * Make sure someone hasn't managed to sneak a fancy new vdev
	 * into a crufty old storage pool.
	 */
	ASSERT(vdrz->vd_nparity == 1 ||
	    (vdrz->vd_nparity <= 2 &&
	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) ||
	    (vdrz->vd_nparity <= 3 &&
	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3));

	/*
	 * Note that we'll add these even on storage pools where they
	 * aren't strictly required -- older software will just ignore
	 * them.
	 */
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);
}

static uint64_t
vdev_raidz_nparity(vdev_t *vd)
{
	vdev_raidz_t *vdrz = vd->vdev_tsd;
	return (vdrz->vd_nparity);
}

static uint64_t
vdev_raidz_ndisks(vdev_t *vd)
{
	return (vd->vdev_children);
}

vdev_ops_t vdev_raidz_ops = {
	.vdev_op_init = vdev_raidz_init,
	.vdev_op_fini = vdev_raidz_fini,
	.vdev_op_open = vdev_raidz_open,
	.vdev_op_close = vdev_raidz_close,
	.vdev_op_asize = vdev_raidz_asize,
	.vdev_op_min_asize = vdev_raidz_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_raidz_io_start,
	.vdev_op_io_done = vdev_raidz_io_done,
	.vdev_op_state_change = vdev_raidz_state_change,
	.vdev_op_need_resilver = vdev_raidz_need_resilver,
	.vdev_op_hold = NULL,
	.vdev_op_rele = NULL,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_raidz_xlate,
	.vdev_op_rebuild_asize = NULL,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = vdev_raidz_config_generate,
	.vdev_op_nparity = vdev_raidz_nparity,
	.vdev_op_ndisks = vdev_raidz_ndisks,
	.vdev_op_type = VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
};