/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
 * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
#include <sys/vdev_raidz.h>
#include <sys/vdev_raidz_impl.h>
#include <sys/vdev_draid.h>

#ifdef ZFS_DEBUG
#include <sys/vdev.h>	/* For vdev_xlate() in vdev_raidz_io_verify() */
#endif

/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all
 * elements can be expressed with a single byte. Briefly, the operations on
 * the field are defined as follows:
 *
 *	o addition (+) is represented by a bitwise XOR
 *	o subtraction (-) is therefore identical to addition: A + B = A - B
 *	o multiplication of A by 2 is defined by the following bitwise
 *	  expression:
 *
 *		(A * 2)_7 = A_6
 *		(A * 2)_6 = A_5
 *		(A * 2)_5 = A_4
 *		(A * 2)_4 = A_3 + A_7
 *		(A * 2)_3 = A_2 + A_7
 *		(A * 2)_2 = A_1 + A_7
 *		(A * 2)_1 = A_0
 *		(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
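 *
 * For example, with A = 0x53 (0101 0011) the high bit is clear, so A * 2 is
 * simply the shift 0xa6 (1010 0110). With A = 0xd0 (1101 0000) the high bit
 * is set, so the shifted value 0xa0 is XORed with 0x1d, giving 0xbd.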
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition
 * rather than field addition). The inverse of a field element A (A^-1) is
 * therefore A ^ (255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q and R, over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property which is why the uncorrected Plank method breaks down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}

static void
vdev_raidz_row_free(raidz_row_t *rr)
{
	for (int c = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		if (rc->rc_size != 0)
			abd_free(rc->rc_abd);
		if (rc->rc_orig_data != NULL)
			abd_free(rc->rc_orig_data);
	}

	if (rr->rr_abd_empty != NULL)
		abd_free(rr->rr_abd_empty);

	kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols]));
}

void
vdev_raidz_map_free(raidz_map_t *rm)
{
	for (int i = 0; i < rm->rm_nrows; i++)
		vdev_raidz_row_free(rm->rm_row[i]);

	kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
}

static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	vdev_raidz_map_free(rm);
}

const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	.vsd_free = vdev_raidz_map_free_vsd,
};

static void
vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift)
{
	int c;
	int nwrapped = 0;
	uint64_t off = 0;
	raidz_row_t *rr = rm->rm_row[0];

	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
	ASSERT3U(rm->rm_nrows, ==, 1);

	/*
	 * Pad any parity columns with additional space to account for skip
	 * sectors.
	 */
	if (rm->rm_skipstart < rr->rr_firstdatacol) {
		ASSERT0(rm->rm_skipstart);
		nwrapped = rm->rm_nskip;
	} else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) {
		nwrapped =
		    (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols;
	}

	/*
	 * Optional single skip sectors (rc_size == 0) will be handled in
	 * vdev_raidz_io_start_write().
	 */
	int skipped = rr->rr_scols - rr->rr_cols;

	/* Allocate buffers for the parity columns */
	for (c = 0; c < rr->rr_firstdatacol; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		/*
		 * Parity columns will pad out a linear ABD to account for
		 * the skip sector. A linear ABD is used here because
		 * parity calculations use the ABD buffer directly to calculate
		 * parity. This avoids doing a memcpy back to the ABD after the
		 * parity has been calculated. By issuing the parity column
		 * with the skip sector we can reduce contention on the child
		 * VDEV queue locks (vq_lock).
		 */
		if (c < nwrapped) {
			rc->rc_abd = abd_alloc_linear(
			    rc->rc_size + (1ULL << ashift), B_FALSE);
			abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift);
			skipped++;
		} else {
			rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
		}
	}

	for (off = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct,
		    zio->io_abd, off, rc->rc_size);

		/*
		 * Generate I/O for skip sectors to improve aggregation
		 * continuity. We will use gang ABD's to reduce contention
		 * on the child VDEV queue locks (vq_lock) by issuing
		 * a single I/O that contains the data and skip sector.
		 *
		 * It is important to make sure that rc_size is not updated
		 * even though we are adding a skip sector to the ABD. When
		 * calculating the parity in vdev_raidz_generate_parity_row()
		 * the rc_size is used to iterate through the ABD's. We cannot
		 * have zero'd out skip sectors used for calculating parity
		 * for raidz, because those same sectors are not used during
		 * reconstruction.
		 */
		if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) {
			rc->rc_abd = abd_alloc_gang();
			abd_gang_add(rc->rc_abd, abd, B_TRUE);
			abd_gang_add(rc->rc_abd,
			    abd_get_zeros(1ULL << ashift), B_TRUE);
			skipped++;
		} else {
			rc->rc_abd = abd;
		}
		off += rc->rc_size;
	}

	ASSERT3U(off, ==, zio->io_size);
	ASSERT3S(skipped, ==, rm->rm_nskip);
}

static void
vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm)
{
	int c;
	raidz_row_t *rr = rm->rm_row[0];

	ASSERT3U(rm->rm_nrows, ==, 1);

	/* Allocate buffers for the parity columns */
	for (c = 0; c < rr->rr_firstdatacol; c++)
		rr->rr_col[c].rc_abd =
		    abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);

	for (uint64_t off = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
		    zio->io_abd, off, rc->rc_size);
		off += rc->rc_size;
	}
}

/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 *
 * Avoid inlining the function to keep vdev_raidz_io_start(), which
 * is this function's only caller, as small as possible on the stack.
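 *
 * For example, a 16 KiB write to a 5-wide raidz1 with 512-byte sectors
 * (ashift = 9) is s = 32 data sectors spread over dcols - nparity = 4 data
 * columns: q = 8 full rows, r = 0 remainder sectors, and one parity sector
 * per row, for a total of tot = 40 sectors allocated on disk.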
291 */ 292 noinline raidz_map_t * 293 vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, 294 uint64_t nparity) 295 { 296 raidz_row_t *rr; 297 /* The starting RAIDZ (parent) vdev sector of the block. */ 298 uint64_t b = zio->io_offset >> ashift; 299 /* The zio's size in units of the vdev's minimum sector size. */ 300 uint64_t s = zio->io_size >> ashift; 301 /* The first column for this stripe. */ 302 uint64_t f = b % dcols; 303 /* The starting byte offset on each child vdev. */ 304 uint64_t o = (b / dcols) << ashift; 305 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; 306 307 raidz_map_t *rm = 308 kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP); 309 rm->rm_nrows = 1; 310 311 /* 312 * "Quotient": The number of data sectors for this stripe on all but 313 * the "big column" child vdevs that also contain "remainder" data. 314 */ 315 q = s / (dcols - nparity); 316 317 /* 318 * "Remainder": The number of partial stripe data sectors in this I/O. 319 * This will add a sector to some, but not all, child vdevs. 320 */ 321 r = s - q * (dcols - nparity); 322 323 /* The number of "big columns" - those which contain remainder data. */ 324 bc = (r == 0 ? 0 : r + nparity); 325 326 /* 327 * The total number of data and parity sectors associated with 328 * this I/O. 329 */ 330 tot = s + nparity * (q + (r == 0 ? 0 : 1)); 331 332 /* 333 * acols: The columns that will be accessed. 334 * scols: The columns that will be accessed or skipped. 335 */ 336 if (q == 0) { 337 /* Our I/O request doesn't span all child vdevs. */ 338 acols = bc; 339 scols = MIN(dcols, roundup(bc, nparity + 1)); 340 } else { 341 acols = dcols; 342 scols = dcols; 343 } 344 345 ASSERT3U(acols, <=, scols); 346 347 rr = kmem_alloc(offsetof(raidz_row_t, rr_col[scols]), KM_SLEEP); 348 rm->rm_row[0] = rr; 349 350 rr->rr_cols = acols; 351 rr->rr_scols = scols; 352 rr->rr_bigcols = bc; 353 rr->rr_missingdata = 0; 354 rr->rr_missingparity = 0; 355 rr->rr_firstdatacol = nparity; 356 rr->rr_abd_empty = NULL; 357 rr->rr_nempty = 0; 358 #ifdef ZFS_DEBUG 359 rr->rr_offset = zio->io_offset; 360 rr->rr_size = zio->io_size; 361 #endif 362 363 asize = 0; 364 365 for (c = 0; c < scols; c++) { 366 raidz_col_t *rc = &rr->rr_col[c]; 367 col = f + c; 368 coff = o; 369 if (col >= dcols) { 370 col -= dcols; 371 coff += 1ULL << ashift; 372 } 373 rc->rc_devidx = col; 374 rc->rc_offset = coff; 375 rc->rc_abd = NULL; 376 rc->rc_orig_data = NULL; 377 rc->rc_error = 0; 378 rc->rc_tried = 0; 379 rc->rc_skipped = 0; 380 rc->rc_force_repair = 0; 381 rc->rc_allow_repair = 1; 382 rc->rc_need_orig_restore = B_FALSE; 383 384 if (c >= acols) 385 rc->rc_size = 0; 386 else if (c < bc) 387 rc->rc_size = (q + 1) << ashift; 388 else 389 rc->rc_size = q << ashift; 390 391 asize += rc->rc_size; 392 } 393 394 ASSERT3U(asize, ==, tot << ashift); 395 rm->rm_nskip = roundup(tot, nparity + 1) - tot; 396 rm->rm_skipstart = bc; 397 398 /* 399 * If all data stored spans all columns, there's a danger that parity 400 * will always be on the same device and, since parity isn't read 401 * during normal operation, that device's I/O bandwidth won't be 402 * used effectively. We therefore switch the parity every 1MB. 403 * 404 * ... at least that was, ostensibly, the theory. As a practical 405 * matter unless we juggle the parity between all devices evenly, we 406 * won't see any benefit. Further, occasional writes that aren't a 407 * multiple of the LCM of the number of children and the minimum 408 * stripe width are sufficient to avoid pessimal behavior. 
409 * Unfortunately, this decision created an implicit on-disk format 410 * requirement that we need to support for all eternity, but only 411 * for single-parity RAID-Z. 412 * 413 * If we intend to skip a sector in the zeroth column for padding 414 * we must make sure to note this swap. We will never intend to 415 * skip the first column since at least one data and one parity 416 * column must appear in each row. 417 */ 418 ASSERT(rr->rr_cols >= 2); 419 ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); 420 421 if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { 422 devidx = rr->rr_col[0].rc_devidx; 423 o = rr->rr_col[0].rc_offset; 424 rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; 425 rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; 426 rr->rr_col[1].rc_devidx = devidx; 427 rr->rr_col[1].rc_offset = o; 428 429 if (rm->rm_skipstart == 0) 430 rm->rm_skipstart = 1; 431 } 432 433 if (zio->io_type == ZIO_TYPE_WRITE) { 434 vdev_raidz_map_alloc_write(zio, rm, ashift); 435 } else { 436 vdev_raidz_map_alloc_read(zio, rm); 437 } 438 439 /* init RAIDZ parity ops */ 440 rm->rm_ops = vdev_raidz_math_get_ops(); 441 442 return (rm); 443 } 444 445 struct pqr_struct { 446 uint64_t *p; 447 uint64_t *q; 448 uint64_t *r; 449 }; 450 451 static int 452 vdev_raidz_p_func(void *buf, size_t size, void *private) 453 { 454 struct pqr_struct *pqr = private; 455 const uint64_t *src = buf; 456 int i, cnt = size / sizeof (src[0]); 457 458 ASSERT(pqr->p && !pqr->q && !pqr->r); 459 460 for (i = 0; i < cnt; i++, src++, pqr->p++) 461 *pqr->p ^= *src; 462 463 return (0); 464 } 465 466 static int 467 vdev_raidz_pq_func(void *buf, size_t size, void *private) 468 { 469 struct pqr_struct *pqr = private; 470 const uint64_t *src = buf; 471 uint64_t mask; 472 int i, cnt = size / sizeof (src[0]); 473 474 ASSERT(pqr->p && pqr->q && !pqr->r); 475 476 for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { 477 *pqr->p ^= *src; 478 VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 479 *pqr->q ^= *src; 480 } 481 482 return (0); 483 } 484 485 static int 486 vdev_raidz_pqr_func(void *buf, size_t size, void *private) 487 { 488 struct pqr_struct *pqr = private; 489 const uint64_t *src = buf; 490 uint64_t mask; 491 int i, cnt = size / sizeof (src[0]); 492 493 ASSERT(pqr->p && pqr->q && pqr->r); 494 495 for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { 496 *pqr->p ^= *src; 497 VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 498 *pqr->q ^= *src; 499 VDEV_RAIDZ_64MUL_4(*pqr->r, mask); 500 *pqr->r ^= *src; 501 } 502 503 return (0); 504 } 505 506 static void 507 vdev_raidz_generate_parity_p(raidz_row_t *rr) 508 { 509 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 510 511 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 512 abd_t *src = rr->rr_col[c].rc_abd; 513 514 if (c == rr->rr_firstdatacol) { 515 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 516 } else { 517 struct pqr_struct pqr = { p, NULL, NULL }; 518 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 519 vdev_raidz_p_func, &pqr); 520 } 521 } 522 } 523 524 static void 525 vdev_raidz_generate_parity_pq(raidz_row_t *rr) 526 { 527 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 528 uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 529 uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 530 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 531 rr->rr_col[VDEV_RAIDZ_Q].rc_size); 532 533 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 534 abd_t *src = rr->rr_col[c].rc_abd; 535 536 uint64_t ccnt = rr->rr_col[c].rc_size / 
sizeof (p[0]); 537 538 if (c == rr->rr_firstdatacol) { 539 ASSERT(ccnt == pcnt || ccnt == 0); 540 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 541 (void) memcpy(q, p, rr->rr_col[c].rc_size); 542 543 for (uint64_t i = ccnt; i < pcnt; i++) { 544 p[i] = 0; 545 q[i] = 0; 546 } 547 } else { 548 struct pqr_struct pqr = { p, q, NULL }; 549 550 ASSERT(ccnt <= pcnt); 551 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 552 vdev_raidz_pq_func, &pqr); 553 554 /* 555 * Treat short columns as though they are full of 0s. 556 * Note that there's therefore nothing needed for P. 557 */ 558 uint64_t mask; 559 for (uint64_t i = ccnt; i < pcnt; i++) { 560 VDEV_RAIDZ_64MUL_2(q[i], mask); 561 } 562 } 563 } 564 } 565 566 static void 567 vdev_raidz_generate_parity_pqr(raidz_row_t *rr) 568 { 569 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 570 uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 571 uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd); 572 uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 573 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 574 rr->rr_col[VDEV_RAIDZ_Q].rc_size); 575 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 576 rr->rr_col[VDEV_RAIDZ_R].rc_size); 577 578 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 579 abd_t *src = rr->rr_col[c].rc_abd; 580 581 uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); 582 583 if (c == rr->rr_firstdatacol) { 584 ASSERT(ccnt == pcnt || ccnt == 0); 585 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 586 (void) memcpy(q, p, rr->rr_col[c].rc_size); 587 (void) memcpy(r, p, rr->rr_col[c].rc_size); 588 589 for (uint64_t i = ccnt; i < pcnt; i++) { 590 p[i] = 0; 591 q[i] = 0; 592 r[i] = 0; 593 } 594 } else { 595 struct pqr_struct pqr = { p, q, r }; 596 597 ASSERT(ccnt <= pcnt); 598 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 599 vdev_raidz_pqr_func, &pqr); 600 601 /* 602 * Treat short columns as though they are full of 0s. 603 * Note that there's therefore nothing needed for P. 604 */ 605 uint64_t mask; 606 for (uint64_t i = ccnt; i < pcnt; i++) { 607 VDEV_RAIDZ_64MUL_2(q[i], mask); 608 VDEV_RAIDZ_64MUL_4(r[i], mask); 609 } 610 } 611 } 612 } 613 614 /* 615 * Generate RAID parity in the first virtual columns according to the number of 616 * parity columns available. 
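 *
 * rr_firstdatacol equals the parity count, so it selects among the P, PQ,
 * and PQR generators below; the selected raidz math implementation (see
 * vdev_raidz_math_generate()) is tried first and these scalar routines
 * serve as the original fallback.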
617 */ 618 void 619 vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr) 620 { 621 ASSERT3U(rr->rr_cols, !=, 0); 622 623 /* Generate using the new math implementation */ 624 if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL) 625 return; 626 627 switch (rr->rr_firstdatacol) { 628 case 1: 629 vdev_raidz_generate_parity_p(rr); 630 break; 631 case 2: 632 vdev_raidz_generate_parity_pq(rr); 633 break; 634 case 3: 635 vdev_raidz_generate_parity_pqr(rr); 636 break; 637 default: 638 cmn_err(CE_PANIC, "invalid RAID-Z configuration"); 639 } 640 } 641 642 void 643 vdev_raidz_generate_parity(raidz_map_t *rm) 644 { 645 for (int i = 0; i < rm->rm_nrows; i++) { 646 raidz_row_t *rr = rm->rm_row[i]; 647 vdev_raidz_generate_parity_row(rm, rr); 648 } 649 } 650 651 static int 652 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) 653 { 654 (void) private; 655 uint64_t *dst = dbuf; 656 uint64_t *src = sbuf; 657 int cnt = size / sizeof (src[0]); 658 659 for (int i = 0; i < cnt; i++) { 660 dst[i] ^= src[i]; 661 } 662 663 return (0); 664 } 665 666 static int 667 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size, 668 void *private) 669 { 670 (void) private; 671 uint64_t *dst = dbuf; 672 uint64_t *src = sbuf; 673 uint64_t mask; 674 int cnt = size / sizeof (dst[0]); 675 676 for (int i = 0; i < cnt; i++, dst++, src++) { 677 VDEV_RAIDZ_64MUL_2(*dst, mask); 678 *dst ^= *src; 679 } 680 681 return (0); 682 } 683 684 static int 685 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private) 686 { 687 (void) private; 688 uint64_t *dst = buf; 689 uint64_t mask; 690 int cnt = size / sizeof (dst[0]); 691 692 for (int i = 0; i < cnt; i++, dst++) { 693 /* same operation as vdev_raidz_reconst_q_pre_func() on dst */ 694 VDEV_RAIDZ_64MUL_2(*dst, mask); 695 } 696 697 return (0); 698 } 699 700 struct reconst_q_struct { 701 uint64_t *q; 702 int exp; 703 }; 704 705 static int 706 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private) 707 { 708 struct reconst_q_struct *rq = private; 709 uint64_t *dst = buf; 710 int cnt = size / sizeof (dst[0]); 711 712 for (int i = 0; i < cnt; i++, dst++, rq->q++) { 713 int j; 714 uint8_t *b; 715 716 *dst ^= *rq->q; 717 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 718 *b = vdev_raidz_exp2(*b, rq->exp); 719 } 720 } 721 722 return (0); 723 } 724 725 struct reconst_pq_struct { 726 uint8_t *p; 727 uint8_t *q; 728 uint8_t *pxy; 729 uint8_t *qxy; 730 int aexp; 731 int bexp; 732 }; 733 734 static int 735 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private) 736 { 737 struct reconst_pq_struct *rpq = private; 738 uint8_t *xd = xbuf; 739 uint8_t *yd = ybuf; 740 741 for (int i = 0; i < size; 742 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) { 743 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 744 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 745 *yd = *rpq->p ^ *rpq->pxy ^ *xd; 746 } 747 748 return (0); 749 } 750 751 static int 752 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) 753 { 754 struct reconst_pq_struct *rpq = private; 755 uint8_t *xd = xbuf; 756 757 for (int i = 0; i < size; 758 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) { 759 /* same operation as vdev_raidz_reconst_pq_func() on xd */ 760 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 761 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 762 } 763 764 return (0); 765 } 766 767 static void 768 vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts) 769 { 
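	/*
	 * P is a plain XOR of all data columns, so a single missing data
	 * column is recovered by copying P into it and XORing back out
	 * every surviving data column.
	 */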
770 int x = tgts[0]; 771 abd_t *dst, *src; 772 773 ASSERT3U(ntgts, ==, 1); 774 ASSERT3U(x, >=, rr->rr_firstdatacol); 775 ASSERT3U(x, <, rr->rr_cols); 776 777 ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size); 778 779 src = rr->rr_col[VDEV_RAIDZ_P].rc_abd; 780 dst = rr->rr_col[x].rc_abd; 781 782 abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size); 783 784 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 785 uint64_t size = MIN(rr->rr_col[x].rc_size, 786 rr->rr_col[c].rc_size); 787 788 src = rr->rr_col[c].rc_abd; 789 790 if (c == x) 791 continue; 792 793 (void) abd_iterate_func2(dst, src, 0, 0, size, 794 vdev_raidz_reconst_p_func, NULL); 795 } 796 } 797 798 static void 799 vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts) 800 { 801 int x = tgts[0]; 802 int c, exp; 803 abd_t *dst, *src; 804 805 ASSERT(ntgts == 1); 806 807 ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size); 808 809 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 810 uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size, 811 rr->rr_col[c].rc_size); 812 813 src = rr->rr_col[c].rc_abd; 814 dst = rr->rr_col[x].rc_abd; 815 816 if (c == rr->rr_firstdatacol) { 817 abd_copy(dst, src, size); 818 if (rr->rr_col[x].rc_size > size) { 819 abd_zero_off(dst, size, 820 rr->rr_col[x].rc_size - size); 821 } 822 } else { 823 ASSERT3U(size, <=, rr->rr_col[x].rc_size); 824 (void) abd_iterate_func2(dst, src, 0, 0, size, 825 vdev_raidz_reconst_q_pre_func, NULL); 826 (void) abd_iterate_func(dst, 827 size, rr->rr_col[x].rc_size - size, 828 vdev_raidz_reconst_q_pre_tail_func, NULL); 829 } 830 } 831 832 src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; 833 dst = rr->rr_col[x].rc_abd; 834 exp = 255 - (rr->rr_cols - 1 - x); 835 836 struct reconst_q_struct rq = { abd_to_buf(src), exp }; 837 (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size, 838 vdev_raidz_reconst_q_post_func, &rq); 839 } 840 841 static void 842 vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) 843 { 844 uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; 845 abd_t *pdata, *qdata; 846 uint64_t xsize, ysize; 847 int x = tgts[0]; 848 int y = tgts[1]; 849 abd_t *xd, *yd; 850 851 ASSERT(ntgts == 2); 852 ASSERT(x < y); 853 ASSERT(x >= rr->rr_firstdatacol); 854 ASSERT(y < rr->rr_cols); 855 856 ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size); 857 858 /* 859 * Move the parity data aside -- we're going to compute parity as 860 * though columns x and y were full of zeros -- Pxy and Qxy. We want to 861 * reuse the parity generation mechanism without trashing the actual 862 * parity so we make those columns appear to be full of zeros by 863 * setting their lengths to zero. 
	 */
	pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
	qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
	xsize = rr->rr_col[x].rc_size;
	ysize = rr->rr_col[y].rc_size;

	rr->rr_col[VDEV_RAIDZ_P].rc_abd =
	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
	rr->rr_col[VDEV_RAIDZ_Q].rc_abd =
	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
	rr->rr_col[x].rc_size = 0;
	rr->rr_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rr);

	rr->rr_col[x].rc_size = xsize;
	rr->rr_col[y].rc_size = ysize;

	p = abd_to_buf(pdata);
	q = abd_to_buf(qdata);
	pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
	qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
	xd = rr->rr_col[x].rc_abd;
	yd = rr->rr_col[y].rc_abd;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	ASSERT3U(xsize, >=, ysize);
	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };

	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
	    vdev_raidz_reconst_pq_func, &rpq);
	(void) abd_iterate_func(xd, ysize, xsize - ysize,
	    vdev_raidz_reconst_pq_tail_func, &rpq);

	abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
	abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);

	/*
	 * Restore the saved parity data.
	 */
	rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata;
	rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;
}

/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity, speedy
 * computation, and linear separability.
 *
 *      __               __               __     __
 *      |   1   ..  1 1 1 |               |  p_0  |
 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
 *      |   :       : : : |   |   :   |   |  d_2  |
 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
 *      |   0   ..  0 0 1 |               | d_n-1 |
 *      ~~               ~~               ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values.
 * We begin by removing the rows in V|I and d|p that correspond to failed or
 * missing columns; we then make V|I square (n x n) and d|p sized n by
 * removing rows corresponding to unused parity from the bottom up to
 * generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *
 *               __                                __
 *               |   1   1   1   1   1   1   1   1 |
 *               | 128  64  32  16   8   4   2   1 | <-----+-+-- missing disks
 *               |  19 205 116  29  64  16   4   1 |      / /
 *               |   1   0   0   0   0   0   0   0 |     / /
 *               |   0   1   0   0   0   0   0   0 | <--' /
 *    (V|I)  =   |   0   0   1   0   0   0   0   0 | <---'
 *               |   0   0   0   1   0   0   0   0 |
 *               |   0   0   0   0   1   0   0   0 |
 *               |   0   0   0   0   0   1   0   0 |
 *               |   0   0   0   0   0   0   1   0 |
 *               |   0   0   0   0   0   0   0   1 |
 *               ~~                                ~~
 *
 *               __                                __
 *               |   1   1   1   1   1   1   1   1 |
 *               |  19 205 116  29  64  16   4   1 |
 *               |   1   0   0   0   0   0   0   0 |
 *    (V|I)' =   |   0   0   0   1   0   0   0   0 |
 *               |   0   0   0   0   1   0   0   0 |
 *               |   0   0   0   0   0   1   0   0 |
 *               |   0   0   0   0   0   0   1   0 |
 *               |   0   0   0   0   0   0   0   1 |
 *               ~~                                ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 *
 *   __                                                                  __
 *   |   1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0 |
 *   |  19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0 |
 *   |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 *   |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 *   |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 *   |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 *   |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 *   |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 *   ~~                                                                  ~~
 *   __                                                                  __
 *   |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 *   |   1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0 |
 *   |  19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0 |
 *   |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 *   |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 *   |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 *   |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 *   |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 *   ~~                                                                  ~~
 *   __                                                                  __
 *   |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 *   |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1 |
 *   |   0 205 116   0   0   0   0   0     0   1  19  29  64  16   4   1 |
 *   |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 *   |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 *   |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 *   |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 *   |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 *   ~~                                                                  ~~
 *   __                                                                  __
 *   |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 *   |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1 |
 *   |   0   0 185   0   0   0   0   0   205   1 222 208 141 221 201 204 |
 *   |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 *   |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 *   |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 *   |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 *   |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 *   ~~                                                                  ~~
 *   __                                                                  __
 *   |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 *   |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1 |
 *   |   0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209 |
 *   |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 *   |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 *   |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 *   |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 *   |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 *   ~~                                                                  ~~
 *   __                                                                  __
 *   |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 *   |   0   1   0   0   0   0   0   0   167 100   5  41 159 169 217 208 |
 *   |   0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209 |
 *   |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 *   |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 *   |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 *   |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 *   |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 *   ~~                                                                  ~~
 *
 *                  __                                __
 *                  |   0   0   1   0   0   0   0   0 |
 *                  | 167 100   5  41 159 169 217 208 |
 *                  | 166 100   4  40 158 168 216 209 |
 *   (V|I)'^-1  =   |   0   0   0   1   0   0   0   0 |
 *                  |   0   0   0   0   1   0   0   0 |
 *                  |   0   0   0   0   0   1   0   0 |
 *                  |   0   0   0   0   0   0   1   0 |
 *                  |   0   0   0   0   0   0   0   1 |
 *                  ~~                                ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */

static void
vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rr->rr_cols - rr->rr_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}

static void
vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rr->rr_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rr->rr_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rr->rr_firstdatacol);
			jj = used[j] - rr->rr_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
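		 * In the log domain this is a subtraction: the non-zero
		 * elements of GF(2^8) form a multiplicative group of order
		 * 255, so multiplying by the inverse of the pivot is the same
		 * as adding 255 - log_2(pivot) to each element's log, taken
		 * modulo 255.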
1165 */ 1166 log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 1167 1168 for (j = 0; j < n; j++) { 1169 rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 1170 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 1171 } 1172 1173 for (ii = 0; ii < nmissing; ii++) { 1174 if (i == ii) 1175 continue; 1176 1177 ASSERT3U(rows[ii][missing[i]], !=, 0); 1178 1179 log = vdev_raidz_log2[rows[ii][missing[i]]]; 1180 1181 for (j = 0; j < n; j++) { 1182 rows[ii][j] ^= 1183 vdev_raidz_exp2(rows[i][j], log); 1184 invrows[ii][j] ^= 1185 vdev_raidz_exp2(invrows[i][j], log); 1186 } 1187 } 1188 } 1189 1190 /* 1191 * Verify that the data that is left in the rows are properly part of 1192 * an identity matrix. 1193 */ 1194 for (i = 0; i < nmissing; i++) { 1195 for (j = 0; j < n; j++) { 1196 if (j == missing[i]) { 1197 ASSERT3U(rows[i][j], ==, 1); 1198 } else { 1199 ASSERT0(rows[i][j]); 1200 } 1201 } 1202 } 1203 } 1204 1205 static void 1206 vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing, 1207 int *missing, uint8_t **invrows, const uint8_t *used) 1208 { 1209 int i, j, x, cc, c; 1210 uint8_t *src; 1211 uint64_t ccount; 1212 uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL }; 1213 uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 }; 1214 uint8_t log = 0; 1215 uint8_t val; 1216 int ll; 1217 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 1218 uint8_t *p, *pp; 1219 size_t psize; 1220 1221 psize = sizeof (invlog[0][0]) * n * nmissing; 1222 p = kmem_alloc(psize, KM_SLEEP); 1223 1224 for (pp = p, i = 0; i < nmissing; i++) { 1225 invlog[i] = pp; 1226 pp += n; 1227 } 1228 1229 for (i = 0; i < nmissing; i++) { 1230 for (j = 0; j < n; j++) { 1231 ASSERT3U(invrows[i][j], !=, 0); 1232 invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 1233 } 1234 } 1235 1236 for (i = 0; i < n; i++) { 1237 c = used[i]; 1238 ASSERT3U(c, <, rr->rr_cols); 1239 1240 ccount = rr->rr_col[c].rc_size; 1241 ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0); 1242 if (ccount == 0) 1243 continue; 1244 src = abd_to_buf(rr->rr_col[c].rc_abd); 1245 for (j = 0; j < nmissing; j++) { 1246 cc = missing[j] + rr->rr_firstdatacol; 1247 ASSERT3U(cc, >=, rr->rr_firstdatacol); 1248 ASSERT3U(cc, <, rr->rr_cols); 1249 ASSERT3U(cc, !=, c); 1250 1251 dcount[j] = rr->rr_col[cc].rc_size; 1252 if (dcount[j] != 0) 1253 dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd); 1254 } 1255 1256 for (x = 0; x < ccount; x++, src++) { 1257 if (*src != 0) 1258 log = vdev_raidz_log2[*src]; 1259 1260 for (cc = 0; cc < nmissing; cc++) { 1261 if (x >= dcount[cc]) 1262 continue; 1263 1264 if (*src == 0) { 1265 val = 0; 1266 } else { 1267 if ((ll = log + invlog[cc][i]) >= 255) 1268 ll -= 255; 1269 val = vdev_raidz_pow2[ll]; 1270 } 1271 1272 if (i == 0) 1273 dst[cc][x] = val; 1274 else 1275 dst[cc][x] ^= val; 1276 } 1277 } 1278 } 1279 1280 kmem_free(p, psize); 1281 } 1282 1283 static void 1284 vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) 1285 { 1286 int n, i, c, t, tt; 1287 int nmissing_rows; 1288 int missing_rows[VDEV_RAIDZ_MAXPARITY]; 1289 int parity_map[VDEV_RAIDZ_MAXPARITY]; 1290 uint8_t *p, *pp; 1291 size_t psize; 1292 uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 1293 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 1294 uint8_t *used; 1295 1296 abd_t **bufs = NULL; 1297 1298 /* 1299 * Matrix reconstruction can't use scatter ABDs yet, so we allocate 1300 * temporary linear ABDs if any non-linear ABDs are found. 
1301 */ 1302 for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) { 1303 if (!abd_is_linear(rr->rr_col[i].rc_abd)) { 1304 bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *), 1305 KM_PUSHPAGE); 1306 1307 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1308 raidz_col_t *col = &rr->rr_col[c]; 1309 1310 bufs[c] = col->rc_abd; 1311 if (bufs[c] != NULL) { 1312 col->rc_abd = abd_alloc_linear( 1313 col->rc_size, B_TRUE); 1314 abd_copy(col->rc_abd, bufs[c], 1315 col->rc_size); 1316 } 1317 } 1318 1319 break; 1320 } 1321 } 1322 1323 n = rr->rr_cols - rr->rr_firstdatacol; 1324 1325 /* 1326 * Figure out which data columns are missing. 1327 */ 1328 nmissing_rows = 0; 1329 for (t = 0; t < ntgts; t++) { 1330 if (tgts[t] >= rr->rr_firstdatacol) { 1331 missing_rows[nmissing_rows++] = 1332 tgts[t] - rr->rr_firstdatacol; 1333 } 1334 } 1335 1336 /* 1337 * Figure out which parity columns to use to help generate the missing 1338 * data columns. 1339 */ 1340 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 1341 ASSERT(tt < ntgts); 1342 ASSERT(c < rr->rr_firstdatacol); 1343 1344 /* 1345 * Skip any targeted parity columns. 1346 */ 1347 if (c == tgts[tt]) { 1348 tt++; 1349 continue; 1350 } 1351 1352 parity_map[i] = c; 1353 i++; 1354 } 1355 1356 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1357 nmissing_rows * n + sizeof (used[0]) * n; 1358 p = kmem_alloc(psize, KM_SLEEP); 1359 1360 for (pp = p, i = 0; i < nmissing_rows; i++) { 1361 rows[i] = pp; 1362 pp += n; 1363 invrows[i] = pp; 1364 pp += n; 1365 } 1366 used = pp; 1367 1368 for (i = 0; i < nmissing_rows; i++) { 1369 used[i] = parity_map[i]; 1370 } 1371 1372 for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1373 if (tt < nmissing_rows && 1374 c == missing_rows[tt] + rr->rr_firstdatacol) { 1375 tt++; 1376 continue; 1377 } 1378 1379 ASSERT3S(i, <, n); 1380 used[i] = c; 1381 i++; 1382 } 1383 1384 /* 1385 * Initialize the interesting rows of the matrix. 1386 */ 1387 vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows); 1388 1389 /* 1390 * Invert the matrix. 1391 */ 1392 vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows, 1393 invrows, used); 1394 1395 /* 1396 * Reconstruct the missing data using the generated matrix. 
1397 */ 1398 vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows, 1399 invrows, used); 1400 1401 kmem_free(p, psize); 1402 1403 /* 1404 * copy back from temporary linear abds and free them 1405 */ 1406 if (bufs) { 1407 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1408 raidz_col_t *col = &rr->rr_col[c]; 1409 1410 if (bufs[c] != NULL) { 1411 abd_copy(bufs[c], col->rc_abd, col->rc_size); 1412 abd_free(col->rc_abd); 1413 } 1414 col->rc_abd = bufs[c]; 1415 } 1416 kmem_free(bufs, rr->rr_cols * sizeof (abd_t *)); 1417 } 1418 } 1419 1420 static void 1421 vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr, 1422 const int *t, int nt) 1423 { 1424 int tgts[VDEV_RAIDZ_MAXPARITY], *dt; 1425 int ntgts; 1426 int i, c, ret; 1427 int nbadparity, nbaddata; 1428 int parity_valid[VDEV_RAIDZ_MAXPARITY]; 1429 1430 nbadparity = rr->rr_firstdatacol; 1431 nbaddata = rr->rr_cols - nbadparity; 1432 ntgts = 0; 1433 for (i = 0, c = 0; c < rr->rr_cols; c++) { 1434 if (c < rr->rr_firstdatacol) 1435 parity_valid[c] = B_FALSE; 1436 1437 if (i < nt && c == t[i]) { 1438 tgts[ntgts++] = c; 1439 i++; 1440 } else if (rr->rr_col[c].rc_error != 0) { 1441 tgts[ntgts++] = c; 1442 } else if (c >= rr->rr_firstdatacol) { 1443 nbaddata--; 1444 } else { 1445 parity_valid[c] = B_TRUE; 1446 nbadparity--; 1447 } 1448 } 1449 1450 ASSERT(ntgts >= nt); 1451 ASSERT(nbaddata >= 0); 1452 ASSERT(nbaddata + nbadparity == ntgts); 1453 1454 dt = &tgts[nbadparity]; 1455 1456 /* Reconstruct using the new math implementation */ 1457 ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata); 1458 if (ret != RAIDZ_ORIGINAL_IMPL) 1459 return; 1460 1461 /* 1462 * See if we can use any of our optimized reconstruction routines. 1463 */ 1464 switch (nbaddata) { 1465 case 1: 1466 if (parity_valid[VDEV_RAIDZ_P]) { 1467 vdev_raidz_reconstruct_p(rr, dt, 1); 1468 return; 1469 } 1470 1471 ASSERT(rr->rr_firstdatacol > 1); 1472 1473 if (parity_valid[VDEV_RAIDZ_Q]) { 1474 vdev_raidz_reconstruct_q(rr, dt, 1); 1475 return; 1476 } 1477 1478 ASSERT(rr->rr_firstdatacol > 2); 1479 break; 1480 1481 case 2: 1482 ASSERT(rr->rr_firstdatacol > 1); 1483 1484 if (parity_valid[VDEV_RAIDZ_P] && 1485 parity_valid[VDEV_RAIDZ_Q]) { 1486 vdev_raidz_reconstruct_pq(rr, dt, 2); 1487 return; 1488 } 1489 1490 ASSERT(rr->rr_firstdatacol > 2); 1491 1492 break; 1493 } 1494 1495 vdev_raidz_reconstruct_general(rr, tgts, ntgts); 1496 } 1497 1498 static int 1499 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, 1500 uint64_t *logical_ashift, uint64_t *physical_ashift) 1501 { 1502 vdev_raidz_t *vdrz = vd->vdev_tsd; 1503 uint64_t nparity = vdrz->vd_nparity; 1504 int c; 1505 int lasterror = 0; 1506 int numerrors = 0; 1507 1508 ASSERT(nparity > 0); 1509 1510 if (nparity > VDEV_RAIDZ_MAXPARITY || 1511 vd->vdev_children < nparity + 1) { 1512 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 1513 return (SET_ERROR(EINVAL)); 1514 } 1515 1516 vdev_open_children(vd); 1517 1518 for (c = 0; c < vd->vdev_children; c++) { 1519 vdev_t *cvd = vd->vdev_child[c]; 1520 1521 if (cvd->vdev_open_error != 0) { 1522 lasterror = cvd->vdev_open_error; 1523 numerrors++; 1524 continue; 1525 } 1526 1527 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 1528 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; 1529 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); 1530 *physical_ashift = MAX(*physical_ashift, 1531 cvd->vdev_physical_ashift); 1532 } 1533 1534 *asize *= vd->vdev_children; 1535 *max_asize *= vd->vdev_children; 1536 1537 if (numerrors > nparity) 
{ 1538 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 1539 return (lasterror); 1540 } 1541 1542 return (0); 1543 } 1544 1545 static void 1546 vdev_raidz_close(vdev_t *vd) 1547 { 1548 for (int c = 0; c < vd->vdev_children; c++) { 1549 if (vd->vdev_child[c] != NULL) 1550 vdev_close(vd->vdev_child[c]); 1551 } 1552 } 1553 1554 static uint64_t 1555 vdev_raidz_asize(vdev_t *vd, uint64_t psize) 1556 { 1557 vdev_raidz_t *vdrz = vd->vdev_tsd; 1558 uint64_t asize; 1559 uint64_t ashift = vd->vdev_top->vdev_ashift; 1560 uint64_t cols = vdrz->vd_logical_width; 1561 uint64_t nparity = vdrz->vd_nparity; 1562 1563 asize = ((psize - 1) >> ashift) + 1; 1564 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 1565 asize = roundup(asize, nparity + 1) << ashift; 1566 1567 return (asize); 1568 } 1569 1570 /* 1571 * The allocatable space for a raidz vdev is N * sizeof(smallest child) 1572 * so each child must provide at least 1/Nth of its asize. 1573 */ 1574 static uint64_t 1575 vdev_raidz_min_asize(vdev_t *vd) 1576 { 1577 return ((vd->vdev_min_asize + vd->vdev_children - 1) / 1578 vd->vdev_children); 1579 } 1580 1581 void 1582 vdev_raidz_child_done(zio_t *zio) 1583 { 1584 raidz_col_t *rc = zio->io_private; 1585 1586 ASSERT3P(rc->rc_abd, !=, NULL); 1587 rc->rc_error = zio->io_error; 1588 rc->rc_tried = 1; 1589 rc->rc_skipped = 0; 1590 } 1591 1592 static void 1593 vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col) 1594 { 1595 #ifdef ZFS_DEBUG 1596 vdev_t *tvd = vd->vdev_top; 1597 1598 range_seg64_t logical_rs, physical_rs, remain_rs; 1599 logical_rs.rs_start = rr->rr_offset; 1600 logical_rs.rs_end = logical_rs.rs_start + 1601 vdev_raidz_asize(vd, rr->rr_size); 1602 1603 raidz_col_t *rc = &rr->rr_col[col]; 1604 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 1605 1606 vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs); 1607 ASSERT(vdev_xlate_is_empty(&remain_rs)); 1608 ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); 1609 ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); 1610 /* 1611 * It would be nice to assert that rs_end is equal 1612 * to rc_offset + rc_size but there might be an 1613 * optional I/O at the end that is not accounted in 1614 * rc_size. 1615 */ 1616 if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { 1617 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + 1618 rc->rc_size + (1 << tvd->vdev_ashift)); 1619 } else { 1620 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); 1621 } 1622 #endif 1623 } 1624 1625 static void 1626 vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift) 1627 { 1628 vdev_t *vd = zio->io_vd; 1629 raidz_map_t *rm = zio->io_vsd; 1630 1631 vdev_raidz_generate_parity_row(rm, rr); 1632 1633 for (int c = 0; c < rr->rr_scols; c++) { 1634 raidz_col_t *rc = &rr->rr_col[c]; 1635 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 1636 1637 /* Verify physical to logical translation */ 1638 vdev_raidz_io_verify(vd, rr, c); 1639 1640 if (rc->rc_size > 0) { 1641 ASSERT3P(rc->rc_abd, !=, NULL); 1642 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1643 rc->rc_offset, rc->rc_abd, 1644 abd_get_size(rc->rc_abd), zio->io_type, 1645 zio->io_priority, 0, vdev_raidz_child_done, rc)); 1646 } else { 1647 /* 1648 * Generate optional write for skip sector to improve 1649 * aggregation contiguity. 
1650 */ 1651 ASSERT3P(rc->rc_abd, ==, NULL); 1652 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1653 rc->rc_offset, NULL, 1ULL << ashift, 1654 zio->io_type, zio->io_priority, 1655 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, 1656 NULL)); 1657 } 1658 } 1659 } 1660 1661 static void 1662 vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr) 1663 { 1664 vdev_t *vd = zio->io_vd; 1665 1666 /* 1667 * Iterate over the columns in reverse order so that we hit the parity 1668 * last -- any errors along the way will force us to read the parity. 1669 */ 1670 for (int c = rr->rr_cols - 1; c >= 0; c--) { 1671 raidz_col_t *rc = &rr->rr_col[c]; 1672 if (rc->rc_size == 0) 1673 continue; 1674 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 1675 if (!vdev_readable(cvd)) { 1676 if (c >= rr->rr_firstdatacol) 1677 rr->rr_missingdata++; 1678 else 1679 rr->rr_missingparity++; 1680 rc->rc_error = SET_ERROR(ENXIO); 1681 rc->rc_tried = 1; /* don't even try */ 1682 rc->rc_skipped = 1; 1683 continue; 1684 } 1685 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 1686 if (c >= rr->rr_firstdatacol) 1687 rr->rr_missingdata++; 1688 else 1689 rr->rr_missingparity++; 1690 rc->rc_error = SET_ERROR(ESTALE); 1691 rc->rc_skipped = 1; 1692 continue; 1693 } 1694 if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || 1695 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { 1696 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1697 rc->rc_offset, rc->rc_abd, rc->rc_size, 1698 zio->io_type, zio->io_priority, 0, 1699 vdev_raidz_child_done, rc)); 1700 } 1701 } 1702 } 1703 1704 /* 1705 * Start an IO operation on a RAIDZ VDev 1706 * 1707 * Outline: 1708 * - For write operations: 1709 * 1. Generate the parity data 1710 * 2. Create child zio write operations to each column's vdev, for both 1711 * data and parity. 1712 * 3. If the column skips any sectors for padding, create optional dummy 1713 * write zio children for those areas to improve aggregation continuity. 1714 * - For read operations: 1715 * 1. Create child zio read operations to each data column's vdev to read 1716 * the range of data required for zio. 1717 * 2. If this is a scrub or resilver operation, or if any of the data 1718 * vdevs have had errors, then create zio read operations to the parity 1719 * columns' VDevs as well. 1720 */ 1721 static void 1722 vdev_raidz_io_start(zio_t *zio) 1723 { 1724 vdev_t *vd = zio->io_vd; 1725 vdev_t *tvd = vd->vdev_top; 1726 vdev_raidz_t *vdrz = vd->vdev_tsd; 1727 1728 raidz_map_t *rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, 1729 vdrz->vd_logical_width, vdrz->vd_nparity); 1730 zio->io_vsd = rm; 1731 zio->io_vsd_ops = &vdev_raidz_vsd_ops; 1732 1733 /* 1734 * Until raidz expansion is implemented all maps for a raidz vdev 1735 * contain a single row. 1736 */ 1737 ASSERT3U(rm->rm_nrows, ==, 1); 1738 raidz_row_t *rr = rm->rm_row[0]; 1739 1740 if (zio->io_type == ZIO_TYPE_WRITE) { 1741 vdev_raidz_io_start_write(zio, rr, tvd->vdev_ashift); 1742 } else { 1743 ASSERT(zio->io_type == ZIO_TYPE_READ); 1744 vdev_raidz_io_start_read(zio, rr); 1745 } 1746 1747 zio_execute(zio); 1748 } 1749 1750 /* 1751 * Report a checksum error for a child of a RAID-Z device. 
1752 */ 1753 void 1754 vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) 1755 { 1756 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 1757 1758 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) && 1759 zio->io_priority != ZIO_PRIORITY_REBUILD) { 1760 zio_bad_cksum_t zbc; 1761 raidz_map_t *rm = zio->io_vsd; 1762 1763 zbc.zbc_has_cksum = 0; 1764 zbc.zbc_injected = rm->rm_ecksuminjected; 1765 1766 (void) zfs_ereport_post_checksum(zio->io_spa, vd, 1767 &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, 1768 rc->rc_abd, bad_data, &zbc); 1769 mutex_enter(&vd->vdev_stat_lock); 1770 vd->vdev_stat.vs_checksum_errors++; 1771 mutex_exit(&vd->vdev_stat_lock); 1772 } 1773 } 1774 1775 /* 1776 * We keep track of whether or not there were any injected errors, so that 1777 * any ereports we generate can note it. 1778 */ 1779 static int 1780 raidz_checksum_verify(zio_t *zio) 1781 { 1782 zio_bad_cksum_t zbc = {{{0}}}; 1783 raidz_map_t *rm = zio->io_vsd; 1784 1785 int ret = zio_checksum_error(zio, &zbc); 1786 if (ret != 0 && zbc.zbc_injected != 0) 1787 rm->rm_ecksuminjected = 1; 1788 1789 return (ret); 1790 } 1791 1792 /* 1793 * Generate the parity from the data columns. If we tried and were able to 1794 * read the parity without error, verify that the generated parity matches the 1795 * data we read. If it doesn't, we fire off a checksum error. Return the 1796 * number of such failures. 1797 */ 1798 static int 1799 raidz_parity_verify(zio_t *zio, raidz_row_t *rr) 1800 { 1801 abd_t *orig[VDEV_RAIDZ_MAXPARITY]; 1802 int c, ret = 0; 1803 raidz_map_t *rm = zio->io_vsd; 1804 raidz_col_t *rc; 1805 1806 blkptr_t *bp = zio->io_bp; 1807 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : 1808 (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); 1809 1810 if (checksum == ZIO_CHECKSUM_NOPARITY) 1811 return (ret); 1812 1813 for (c = 0; c < rr->rr_firstdatacol; c++) { 1814 rc = &rr->rr_col[c]; 1815 if (!rc->rc_tried || rc->rc_error != 0) 1816 continue; 1817 1818 orig[c] = rc->rc_abd; 1819 ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size); 1820 rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); 1821 } 1822 1823 /* 1824 * Verify any empty sectors are zero filled to ensure the parity 1825 * is calculated correctly even if these non-data sectors are damaged. 1826 */ 1827 if (rr->rr_nempty && rr->rr_abd_empty != NULL) 1828 ret += vdev_draid_map_verify_empty(zio, rr); 1829 1830 /* 1831 * Regenerates parity even for !tried||rc_error!=0 columns. This 1832 * isn't harmful but it does have the side effect of fixing stuff 1833 * we didn't realize was necessary (i.e. even if we return 0). 
1834 */ 1835 vdev_raidz_generate_parity_row(rm, rr); 1836 1837 for (c = 0; c < rr->rr_firstdatacol; c++) { 1838 rc = &rr->rr_col[c]; 1839 1840 if (!rc->rc_tried || rc->rc_error != 0) 1841 continue; 1842 1843 if (abd_cmp(orig[c], rc->rc_abd) != 0) { 1844 vdev_raidz_checksum_error(zio, rc, orig[c]); 1845 rc->rc_error = SET_ERROR(ECKSUM); 1846 ret++; 1847 } 1848 abd_free(orig[c]); 1849 } 1850 1851 return (ret); 1852 } 1853 1854 static int 1855 vdev_raidz_worst_error(raidz_row_t *rr) 1856 { 1857 int error = 0; 1858 1859 for (int c = 0; c < rr->rr_cols; c++) 1860 error = zio_worst_error(error, rr->rr_col[c].rc_error); 1861 1862 return (error); 1863 } 1864 1865 static void 1866 vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) 1867 { 1868 int unexpected_errors = 0; 1869 int parity_errors = 0; 1870 int parity_untried = 0; 1871 int data_errors = 0; 1872 1873 ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); 1874 1875 for (int c = 0; c < rr->rr_cols; c++) { 1876 raidz_col_t *rc = &rr->rr_col[c]; 1877 1878 if (rc->rc_error) { 1879 if (c < rr->rr_firstdatacol) 1880 parity_errors++; 1881 else 1882 data_errors++; 1883 1884 if (!rc->rc_skipped) 1885 unexpected_errors++; 1886 } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { 1887 parity_untried++; 1888 } 1889 1890 if (rc->rc_force_repair) 1891 unexpected_errors++; 1892 } 1893 1894 /* 1895 * If we read more parity disks than were used for 1896 * reconstruction, confirm that the other parity disks produced 1897 * correct data. 1898 * 1899 * Note that we also regenerate parity when resilvering so we 1900 * can write it out to failed devices later. 1901 */ 1902 if (parity_errors + parity_untried < 1903 rr->rr_firstdatacol - data_errors || 1904 (zio->io_flags & ZIO_FLAG_RESILVER)) { 1905 int n = raidz_parity_verify(zio, rr); 1906 unexpected_errors += n; 1907 } 1908 1909 if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 1910 (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) { 1911 /* 1912 * Use the good data we have in hand to repair damaged children. 1913 */ 1914 for (int c = 0; c < rr->rr_cols; c++) { 1915 raidz_col_t *rc = &rr->rr_col[c]; 1916 vdev_t *vd = zio->io_vd; 1917 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 1918 1919 if (!rc->rc_allow_repair) { 1920 continue; 1921 } else if (!rc->rc_force_repair && 1922 (rc->rc_error == 0 || rc->rc_size == 0)) { 1923 continue; 1924 } 1925 1926 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1927 rc->rc_offset, rc->rc_abd, rc->rc_size, 1928 ZIO_TYPE_WRITE, 1929 zio->io_priority == ZIO_PRIORITY_REBUILD ? 1930 ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE, 1931 ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 
static void
raidz_restore_orig_data(raidz_map_t *rm)
{
	for (int i = 0; i < rm->rm_nrows; i++) {
		raidz_row_t *rr = rm->rm_row[i];
		for (int c = 0; c < rr->rr_cols; c++) {
			raidz_col_t *rc = &rr->rr_col[c];
			if (rc->rc_need_orig_restore) {
				abd_copy(rc->rc_abd,
				    rc->rc_orig_data, rc->rc_size);
				rc->rc_need_orig_restore = B_FALSE;
			}
		}
	}
}

/*
 * returns EINVAL if reconstruction of the block will not be possible
 * returns ECKSUM if this specific reconstruction failed
 * returns 0 on successful reconstruction
 */
static int
raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
{
	raidz_map_t *rm = zio->io_vsd;

	/* Reconstruct each row */
	for (int r = 0; r < rm->rm_nrows; r++) {
		raidz_row_t *rr = rm->rm_row[r];
		int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */
		int t = 0;
		int dead = 0;
		int dead_data = 0;

		for (int c = 0; c < rr->rr_cols; c++) {
			raidz_col_t *rc = &rr->rr_col[c];
			ASSERT0(rc->rc_need_orig_restore);
			if (rc->rc_error != 0) {
				dead++;
				if (c >= nparity)
					dead_data++;
				continue;
			}
			if (rc->rc_size == 0)
				continue;
			for (int lt = 0; lt < ntgts; lt++) {
				if (rc->rc_devidx == ltgts[lt]) {
					if (rc->rc_orig_data == NULL) {
						rc->rc_orig_data =
						    abd_alloc_linear(
						    rc->rc_size, B_TRUE);
						abd_copy(rc->rc_orig_data,
						    rc->rc_abd, rc->rc_size);
					}
					rc->rc_need_orig_restore = B_TRUE;

					dead++;
					if (c >= nparity)
						dead_data++;
					my_tgts[t++] = c;
					break;
				}
			}
		}
		if (dead > nparity) {
			/* reconstruction not possible */
			raidz_restore_orig_data(rm);
			return (EINVAL);
		}
		if (dead_data > 0)
			vdev_raidz_reconstruct_row(rm, rr, my_tgts, t);
	}

	/* Check for success */
	if (raidz_checksum_verify(zio) == 0) {

		/* Reconstruction succeeded - report errors */
		for (int i = 0; i < rm->rm_nrows; i++) {
			raidz_row_t *rr = rm->rm_row[i];

			for (int c = 0; c < rr->rr_cols; c++) {
				raidz_col_t *rc = &rr->rr_col[c];
				if (rc->rc_need_orig_restore) {
					/*
					 * Note: if this is a parity column,
					 * we don't really know if it's wrong.
					 * We need to let
					 * vdev_raidz_io_done_verified() check
					 * it, and if we set rc_error, it will
					 * think that it is a "known" error
					 * that doesn't need to be checked
					 * or corrected.
					 */
					if (rc->rc_error == 0 &&
					    c >= rr->rr_firstdatacol) {
						vdev_raidz_checksum_error(zio,
						    rc, rc->rc_orig_data);
						rc->rc_error =
						    SET_ERROR(ECKSUM);
					}
					rc->rc_need_orig_restore = B_FALSE;
				}
			}

			vdev_raidz_io_done_verified(zio, rr);
		}

		zio_checksum_verified(zio);

		return (0);
	}

	/* Reconstruction failed - restore original data */
	raidz_restore_orig_data(rm);
	return (ECKSUM);
}

/*
 * Iterate over all combinations of N bad vdevs and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 *
 * The order that we find the various possible combinations of failed
 * disks is dictated by these rules:
 * - Examine each "slot" (the "i" in tgts[i])
 *   - Try to increment this slot (tgts[i] = tgts[i] + 1)
 *   - if we can't increment because it runs into the next slot,
 *     reset our slot to the minimum, and examine the next slot
 *
 * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
 * 3 columns to reconstruct), we will generate the following sequence:
 *
 * STATE	ACTION
 * 0 1 2	special case: skip since these are all parity
 * 0 1 3	first slot: reset to 0; middle slot: increment to 2
 * 0 2 3	first slot: increment to 1
 * 1 2 3	first: reset to 0; middle: reset to 1; last: increment to 4
 * 0 1 4	first: reset to 0; middle: increment to 2
 * 0 2 4	first: increment to 1
 * 1 2 4	first: reset to 0; middle: increment to 3
 * 0 3 4	first: increment to 1
 * 1 3 4	first: increment to 2
 * 2 3 4	first: reset to 0; middle: reset to 1; last: increment to 5
 * 0 1 5	first: reset to 0; middle: increment to 2
 * 0 2 5	first: increment to 1
 * 1 2 5	first: reset to 0; middle: increment to 3
 * 0 3 5	first: increment to 1
 * 1 3 5	first: increment to 2
 * 2 3 5	first: reset to 0; middle: increment to 4
 * 0 4 5	first: increment to 1
 * 1 4 5	first: increment to 2
 * 2 4 5	first: increment to 3
 * 3 4 5	done
 *
 * This strategy works for dRAID but is less efficient when there are a large
 * number of child vdevs and therefore permutations to check. Furthermore,
 * since the raidz_map_t rows likely do not overlap, reconstruction would be
 * possible as long as there are no more than nparity data errors per row.
 * These additional permutations are not currently checked but could be as
 * a future improvement.
 */
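
/*
 * The stand-alone sketch below is illustrative only (hypothetical userland
 * demo, excluded from the build).  It reproduces the STATE column of the
 * table above by applying the same "increment a slot, otherwise reset it and
 * carry into the next slot" rule that vdev_raidz_combrec() uses, for a
 * 6-wide RAIDZ3 with no known errors.
 */
#if 0	/* Illustrative sketch only; not compiled into the driver. */
#include <stdio.h>

int
main(void)
{
	int tstore[3 + 2];
	int *ltgts = &tstore[1];	/* slots, with a sentinel on each side */
	const int n = 6;		/* 6-wide RAIDZ3 */
	const int nfail = 3;		/* choose 3 columns to reconstruct */

	ltgts[-1] = -1;			/* sentinel below the first slot */
	for (int i = 0; i < nfail; i++)
		ltgts[i] = i;
	ltgts[nfail] = n;		/* sentinel above the last slot */

	for (;;) {
		printf("%d %d %d\n", ltgts[0], ltgts[1], ltgts[2]);

		/* Advance to the next combination. */
		for (int t = 0; ; t++) {
			ltgts[t]++;
			if (ltgts[t] == n)
				break;	/* carried out of the last slot */
			if (ltgts[t] != ltgts[t + 1])
				break;	/* this spot is free; done */
			/* Otherwise reset this slot and carry upward. */
			ltgts[t] = ltgts[t - 1] + 1;
		}
		if (ltgts[nfail - 1] == n)
			break;		/* all combinations visited */
	}
	return (0);
}
#endif
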
static int
vdev_raidz_combrec(zio_t *zio)
{
	int nparity = vdev_get_nparity(zio->io_vd);
	raidz_map_t *rm = zio->io_vsd;

	/* Check if there's enough data to attempt reconstruction. */
	for (int i = 0; i < rm->rm_nrows; i++) {
		raidz_row_t *rr = rm->rm_row[i];
		int total_errors = 0;

		for (int c = 0; c < rr->rr_cols; c++) {
			if (rr->rr_col[c].rc_error)
				total_errors++;
		}

		if (total_errors > nparity)
			return (vdev_raidz_worst_error(rr));
	}

	for (int num_failures = 1; num_failures <= nparity; num_failures++) {
		int tstore[VDEV_RAIDZ_MAXPARITY + 2];
		int *ltgts = &tstore[1]; /* value is logical child ID */

		/* Determine number of logical children, n */
		int n = zio->io_vd->vdev_children;

		ASSERT3U(num_failures, <=, nparity);
		ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);

		/* Handle corner cases in combrec logic */
		ltgts[-1] = -1;
		for (int i = 0; i < num_failures; i++) {
			ltgts[i] = i;
		}
		ltgts[num_failures] = n;

		for (;;) {
			int err = raidz_reconstruct(zio, ltgts, num_failures,
			    nparity);
			if (err == EINVAL) {
				/*
				 * Reconstruction not possible with this #
				 * failures; try more failures.
				 */
				break;
			} else if (err == 0)
				return (0);

			/* Compute next targets to try */
			for (int t = 0; ; t++) {
				ASSERT3U(t, <, num_failures);
				ltgts[t]++;
				if (ltgts[t] == n) {
					/* try more failures */
					ASSERT3U(t, ==, num_failures - 1);
					break;
				}

				ASSERT3U(ltgts[t], <, n);
				ASSERT3U(ltgts[t], <=, ltgts[t + 1]);

				/*
				 * If that spot is available, we're done here.
				 * Try the next combination.
				 */
				if (ltgts[t] != ltgts[t + 1])
					break;

				/*
				 * Otherwise, reset this tgt to the minimum,
				 * and move on to the next tgt.
				 */
				ltgts[t] = ltgts[t - 1] + 1;
				ASSERT3U(ltgts[t], ==, t);
			}

			/* Increase the number of failures and keep trying. */
			if (ltgts[num_failures - 1] == n)
				break;
		}
	}

	return (ECKSUM);
}

void
vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
{
	for (uint64_t row = 0; row < rm->rm_nrows; row++) {
		raidz_row_t *rr = rm->rm_row[row];
		vdev_raidz_reconstruct_row(rm, rr, t, nt);
	}
}

/*
 * Complete a write IO operation on a RAIDZ VDev
 *
 * Outline:
 * 1. Check for errors on the child IOs.
 * 2. Return, setting an error code if too few child VDevs were written
 *    to reconstruct the data later.  Note that partial writes are
 *    considered successful if they can be reconstructed at all.
 */
static void
vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
{
	int total_errors = 0;

	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);

	for (int c = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			total_errors++;
		}
	}

	/*
	 * Treat partial writes as a success. If we couldn't write enough
	 * columns to reconstruct the data, the I/O failed.  Otherwise,
	 * good enough.
	 *
	 * Now that we support write reallocation, it would be better
	 * to treat partial failure as real failure unless there are
	 * no non-degraded top-level vdevs left, and not update DTLs
	 * if we intend to reallocate.
	 */
	if (total_errors > rr->rr_firstdatacol) {
		zio->io_error = zio_worst_error(zio->io_error,
		    vdev_raidz_worst_error(rr));
	}
}

static void
vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
    raidz_row_t *rr)
{
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;

	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);

	for (int c = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		/*
		 * If scrubbing and a replacing/sparing child vdev determined
		 * that not all of its children have an identical copy of the
		 * data, then clear the error so the column is treated like
		 * any other read and force a repair to correct the damage.
		 */
		if (rc->rc_error == ECKSUM) {
			ASSERT(zio->io_flags & ZIO_FLAG_SCRUB);
			vdev_raidz_checksum_error(zio, rc, rc->rc_abd);
			rc->rc_force_repair = 1;
			rc->rc_error = 0;
		}

		if (rc->rc_error) {
			if (c < rr->rr_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			total_errors++;
		} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * If there were data errors and the number of errors we saw was
	 * correctable -- less than or equal to the number of parity disks read
	 * -- reconstruct based on the missing data.
	 */
	if (data_errors != 0 &&
	    total_errors <= rr->rr_firstdatacol - parity_untried) {
		/*
		 * We either attempt to read all the parity columns or
		 * none of them. If we didn't try to read parity, we
		 * wouldn't be here in the correctable case. There must
		 * also have been fewer parity errors than parity
		 * columns or, again, we wouldn't be in this code path.
		 */
		ASSERT(parity_untried == 0);
		ASSERT(parity_errors < rr->rr_firstdatacol);

		/*
		 * Identify the data columns that reported an error.
		 */
		int n = 0;
		int tgts[VDEV_RAIDZ_MAXPARITY];
		for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
			raidz_col_t *rc = &rr->rr_col[c];
			if (rc->rc_error != 0) {
				ASSERT(n < VDEV_RAIDZ_MAXPARITY);
				tgts[n++] = c;
			}
		}

		ASSERT(rr->rr_firstdatacol >= n);

		vdev_raidz_reconstruct_row(rm, rr, tgts, n);
	}
}
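
/*
 * The correctability test in vdev_raidz_io_done_reconstruct_known_missing()
 * above, with concrete numbers.  The sketch below is illustrative only
 * (hypothetical helper, excluded from the build): for a raidz3 row
 * (rr_firstdatacol == 3) where all parity was read (parity_untried == 0),
 * one parity error plus two data errors gives total_errors == 3 <= 3 - 0,
 * so the two bad data columns can be reconstructed from the surviving parity.
 */
#if 0	/* Illustrative sketch only; not compiled into the driver. */
static boolean_t
raidz_example_known_errors_correctable(int nparity, int parity_untried,
    int data_errors, int total_errors)
{
	/* Same condition as vdev_raidz_io_done_reconstruct_known_missing(). */
	return (data_errors != 0 &&
	    total_errors <= nparity - parity_untried);
}
#endif
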
/*
 * Return the number of reads issued.
 */
static int
vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
{
	vdev_t *vd = zio->io_vd;
	int nread = 0;

	rr->rr_missingdata = 0;
	rr->rr_missingparity = 0;

	/*
	 * If this row contains empty sectors which are not required
	 * for a normal read then allocate an ABD for them now so they
	 * may be read, verified, and any needed repairs performed.
	 */
	if (rr->rr_nempty && rr->rr_abd_empty == NULL)
		vdev_draid_map_alloc_empty(zio, rr);

	for (int c = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		if (rc->rc_tried || rc->rc_size == 0)
			continue;

		zio_nowait(zio_vdev_child_io(zio, NULL,
		    vd->vdev_child[rc->rc_devidx],
		    rc->rc_offset, rc->rc_abd, rc->rc_size,
		    zio->io_type, zio->io_priority, 0,
		    vdev_raidz_child_done, rc));
		nread++;
	}
	return (nread);
}

/*
 * We're here because either there were too many errors to even attempt
 * reconstruction (total_errors == rr_firstdatacol), or vdev_*_combrec()
 * failed. In either case, there is enough bad data to prevent reconstruction.
 * Start checksum ereports for all children which haven't failed.
 */
static void
vdev_raidz_io_done_unrecoverable(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	for (int i = 0; i < rm->rm_nrows; i++) {
		raidz_row_t *rr = rm->rm_row[i];

		for (int c = 0; c < rr->rr_cols; c++) {
			raidz_col_t *rc = &rr->rr_col[c];
			vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error != 0)
				continue;

			zio_bad_cksum_t zbc;
			zbc.zbc_has_cksum = 0;
			zbc.zbc_injected = rm->rm_ecksuminjected;

			(void) zfs_ereport_start_checksum(zio->io_spa,
			    cvd, &zio->io_bookmark, zio, rc->rc_offset,
			    rc->rc_size, &zbc);
			mutex_enter(&cvd->vdev_stat_lock);
			cvd->vdev_stat.vs_checksum_errors++;
			mutex_exit(&cvd->vdev_stat_lock);
		}
	}
}

void
vdev_raidz_io_done(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	if (zio->io_type == ZIO_TYPE_WRITE) {
		for (int i = 0; i < rm->rm_nrows; i++) {
			vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
		}
	} else {
		for (int i = 0; i < rm->rm_nrows; i++) {
			raidz_row_t *rr = rm->rm_row[i];
			vdev_raidz_io_done_reconstruct_known_missing(zio,
			    rm, rr);
		}

		if (raidz_checksum_verify(zio) == 0) {
			for (int i = 0; i < rm->rm_nrows; i++) {
				raidz_row_t *rr = rm->rm_row[i];
				vdev_raidz_io_done_verified(zio, rr);
			}
			zio_checksum_verified(zio);
		} else {
			/*
			 * A sequential resilver has no checksum, which makes
			 * combinatorial reconstruction impossible. This code
			 * path is unreachable since raidz_checksum_verify()
			 * has no checksum to verify and must succeed.
			 */
			ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD);

			/*
			 * This isn't a typical situation -- either we got a
			 * read error or a child silently returned bad data.
			 * Read every block so we can try again with as much
			 * data and parity as we can track down. If we've
			 * already been through once before, all children will
			 * be marked as tried so we'll proceed to combinatorial
			 * reconstruction.
			 */
			int nread = 0;
			for (int i = 0; i < rm->rm_nrows; i++) {
				nread += vdev_raidz_read_all(zio,
				    rm->rm_row[i]);
			}
			if (nread != 0) {
				/*
				 * Normally our stage is VDEV_IO_DONE, but if
				 * we've already called redone(), it will have
				 * changed to VDEV_IO_START, in which case we
				 * don't want to call redone() again.
				 */
				if (zio->io_stage != ZIO_STAGE_VDEV_IO_START)
					zio_vdev_io_redone(zio);
				return;
			}

			zio->io_error = vdev_raidz_combrec(zio);
			if (zio->io_error == ECKSUM &&
			    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
				vdev_raidz_io_done_unrecoverable(zio);
			}
		}
	}
}

static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
	vdev_raidz_t *vdrz = vd->vdev_tsd;
	if (faulted > vdrz->vd_nparity)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_NO_REPLICAS);
	else if (degraded + faulted != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	else
		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}

/*
 * Determine if any portion of the provided block resides on a child vdev
 * with a dirty DTL and therefore needs to be resilvered.  The function
 * assumes that at least one DTL is dirty which implies that full stripe
 * width blocks must be resilvered.
 */
static boolean_t
vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
    uint64_t phys_birth)
{
	vdev_raidz_t *vdrz = vd->vdev_tsd;
	uint64_t dcols = vd->vdev_children;
	uint64_t nparity = vdrz->vd_nparity;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = DVA_GET_OFFSET(dva) >> ashift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = ((psize - 1) >> ashift) + 1;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;

	/* Unreachable by sequential resilver. */
	ASSERT3U(phys_birth, !=, TXG_UNKNOWN);

	if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
		return (B_FALSE);

	if (s + nparity >= dcols)
		return (B_TRUE);

	for (uint64_t c = 0; c < s + nparity; c++) {
		uint64_t devidx = (f + c) % dcols;
		vdev_t *cvd = vd->vdev_child[devidx];

		/*
		 * dsl_scan_need_resilver() already checked vd with
		 * vdev_dtl_contains(). So here just check cvd with
		 * vdev_dtl_empty(), which is cheaper and a good approximation.
		 */
		if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
			return (B_TRUE);
	}

	return (B_FALSE);
}
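
/*
 * The column walk in vdev_raidz_need_resilver() above, with concrete numbers.
 * The sketch below is illustrative only (hypothetical helper, excluded from
 * the build).  For a hypothetical 8-wide raidz2 (dcols == 8, nparity == 2)
 * and a 4 KiB block at sector offset b == 21 on a 512 B ashift vdev:
 *
 *	s = ((4096 - 1) >> 9) + 1 = 8 sectors
 *	f = 21 % 8 = 5
 *
 * Since s + nparity (10) >= dcols (8), the block spans a full stripe and
 * every child must be consulted; a smaller block would only touch the
 * children at indexes (f + c) % dcols for c in [0, s + nparity).
 */
#if 0	/* Illustrative sketch only; not compiled into the driver. */
static boolean_t
raidz_example_block_touches_child(uint64_t dcols, uint64_t nparity,
    uint64_t b, uint64_t s, uint64_t child)
{
	if (s + nparity >= dcols)
		return (B_TRUE);	/* full stripe width */

	for (uint64_t c = 0; c < s + nparity; c++) {
		if ((b % dcols + c) % dcols == child)
			return (B_TRUE);
	}
	return (B_FALSE);
}
#endif
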
static void
vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs,
    range_seg64_t *physical_rs, range_seg64_t *remain_rs)
{
	(void) remain_rs;

	vdev_t *raidvd = cvd->vdev_parent;
	ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);

	uint64_t width = raidvd->vdev_children;
	uint64_t tgt_col = cvd->vdev_id;
	uint64_t ashift = raidvd->vdev_top->vdev_ashift;

	/* make sure the offsets are block-aligned */
	ASSERT0(logical_rs->rs_start % (1 << ashift));
	ASSERT0(logical_rs->rs_end % (1 << ashift));
	uint64_t b_start = logical_rs->rs_start >> ashift;
	uint64_t b_end = logical_rs->rs_end >> ashift;

	uint64_t start_row = 0;
	if (b_start > tgt_col) /* avoid underflow */
		start_row = ((b_start - tgt_col - 1) / width) + 1;

	uint64_t end_row = 0;
	if (b_end > tgt_col)
		end_row = ((b_end - tgt_col - 1) / width) + 1;

	physical_rs->rs_start = start_row << ashift;
	physical_rs->rs_end = end_row << ashift;

	ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start);
	ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
	    logical_rs->rs_end - logical_rs->rs_start);
}

/*
 * Initialize private RAIDZ specific fields from the nvlist.
 */
static int
vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
{
	vdev_raidz_t *vdrz;
	uint64_t nparity;

	uint_t children;
	nvlist_t **child;
	int error = nvlist_lookup_nvlist_array(nv,
	    ZPOOL_CONFIG_CHILDREN, &child, &children);
	if (error != 0)
		return (SET_ERROR(EINVAL));

	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) {
		if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
			return (SET_ERROR(EINVAL));

		/*
		 * Previous versions could only support 1 or 2 parity
		 * devices.
		 */
		if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2)
			return (SET_ERROR(EINVAL));
		else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3)
			return (SET_ERROR(EINVAL));
	} else {
		/*
		 * We require the parity to be specified for SPAs that
		 * support multiple parity levels.
		 */
		if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
			return (SET_ERROR(EINVAL));

		/*
		 * Otherwise, we default to 1 parity device for RAID-Z.
		 */
		nparity = 1;
	}

	vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
	vdrz->vd_logical_width = children;
	vdrz->vd_nparity = nparity;

	*tsd = vdrz;

	return (0);
}
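
/*
 * A hedged illustration of the config shape that vdev_raidz_init() above
 * expects: a hypothetical raidz2 top-level vdev with three children.  The
 * sketch is excluded from the build; the device paths are placeholders, and
 * real configs carry many additional pairs (GUIDs, ashift, etc.) that are
 * omitted here.
 */
#if 0	/* Illustrative sketch only; not compiled into the driver. */
static nvlist_t *
raidz_example_config(void)
{
	nvlist_t *child[3];
	const char *paths[3] = { "/dev/sda1", "/dev/sdb1", "/dev/sdc1" };

	for (int i = 0; i < 3; i++) {
		child[i] = fnvlist_alloc();
		fnvlist_add_string(child[i], ZPOOL_CONFIG_TYPE,
		    VDEV_TYPE_DISK);
		fnvlist_add_string(child[i], ZPOOL_CONFIG_PATH, paths[i]);
	}

	nvlist_t *nv = fnvlist_alloc();
	fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_RAIDZ);
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, 2);
	fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, child, 3);

	for (int i = 0; i < 3; i++)
		fnvlist_free(child[i]);

	return (nv);
}
#endif
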
static void
vdev_raidz_fini(vdev_t *vd)
{
	kmem_free(vd->vdev_tsd, sizeof (vdev_raidz_t));
}

/*
 * Add RAIDZ specific fields to the config nvlist.
 */
static void
vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
{
	ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
	vdev_raidz_t *vdrz = vd->vdev_tsd;

	/*
	 * Make sure someone hasn't managed to sneak a fancy new vdev
	 * into a crufty old storage pool.
	 */
	ASSERT(vdrz->vd_nparity == 1 ||
	    (vdrz->vd_nparity <= 2 &&
	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) ||
	    (vdrz->vd_nparity <= 3 &&
	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3));

	/*
	 * Note that we'll add this even on storage pools where it
	 * isn't strictly required -- older software will just ignore
	 * it.
	 */
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);
}

static uint64_t
vdev_raidz_nparity(vdev_t *vd)
{
	vdev_raidz_t *vdrz = vd->vdev_tsd;
	return (vdrz->vd_nparity);
}

static uint64_t
vdev_raidz_ndisks(vdev_t *vd)
{
	return (vd->vdev_children);
}

vdev_ops_t vdev_raidz_ops = {
	.vdev_op_init = vdev_raidz_init,
	.vdev_op_fini = vdev_raidz_fini,
	.vdev_op_open = vdev_raidz_open,
	.vdev_op_close = vdev_raidz_close,
	.vdev_op_asize = vdev_raidz_asize,
	.vdev_op_min_asize = vdev_raidz_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_raidz_io_start,
	.vdev_op_io_done = vdev_raidz_io_done,
	.vdev_op_state_change = vdev_raidz_state_change,
	.vdev_op_need_resilver = vdev_raidz_need_resilver,
	.vdev_op_hold = NULL,
	.vdev_op_rele = NULL,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_raidz_xlate,
	.vdev_op_rebuild_asize = NULL,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = vdev_raidz_config_generate,
	.vdev_op_nparity = vdev_raidz_nparity,
	.vdev_op_ndisks = vdev_raidz_ndisks,
	.vdev_op_type = VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
};