/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
 * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
#include <sys/vdev_raidz.h>
#include <sys/vdev_raidz_impl.h>
#include <sys/vdev_draid.h>

#ifdef ZFS_DEBUG
#include <sys/vdev.h>	/* For vdev_xlate() in vdev_raidz_io_verify() */
#endif

/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all
 * elements can be expressed with a single byte. Briefly, the operations on
 * the field are defined as follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
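 *
 * As a purely illustrative worked example (values chosen here, not taken
 * from the code): multiplying 0xd4 by 2 first shifts to 0xa8; because the
 * high bit of 0xd4 was set, we then XOR with 0x1d, giving 0xd4 * 2 = 0xb5
 * in GF(2^8). Applying the same expression twice multiplies by 4, which is
 * exactly what VDEV_RAIDZ_MUL_4() below does.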
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). The inverse of a field element A (A^-1) is therefore
 * A ^ (255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q, R, over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate
 * linearly-independent coefficients. (There are no additional coefficients
 * that have this property, which is why the uncorrected Plank method breaks
 * down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
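 *
 * As an illustrative example (assumed values, showing only two byte lanes):
 * for x = 0x8040, mask becomes 0x8000, (mask << 1) - (mask >> 7) becomes
 * 0xff00, and the final result is 0x1d80 -- the high lane computed
 * 0x80 * 2 = 0x1d and the low lane computed 0x40 * 2 = 0x80, with no carries
 * leaking between lanes.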
123 */ 124 #define VDEV_RAIDZ_64MUL_2(x, mask) \ 125 { \ 126 (mask) = (x) & 0x8080808080808080ULL; \ 127 (mask) = ((mask) << 1) - ((mask) >> 7); \ 128 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ 129 ((mask) & 0x1d1d1d1d1d1d1d1dULL); \ 130 } 131 132 #define VDEV_RAIDZ_64MUL_4(x, mask) \ 133 { \ 134 VDEV_RAIDZ_64MUL_2((x), mask); \ 135 VDEV_RAIDZ_64MUL_2((x), mask); \ 136 } 137 138 static void 139 vdev_raidz_row_free(raidz_row_t *rr) 140 { 141 for (int c = 0; c < rr->rr_cols; c++) { 142 raidz_col_t *rc = &rr->rr_col[c]; 143 144 if (rc->rc_size != 0) 145 abd_free(rc->rc_abd); 146 if (rc->rc_gdata != NULL) 147 abd_free(rc->rc_gdata); 148 if (rc->rc_orig_data != NULL) 149 zio_buf_free(rc->rc_orig_data, rc->rc_size); 150 } 151 152 if (rr->rr_abd_copy != NULL) 153 abd_free(rr->rr_abd_copy); 154 155 if (rr->rr_abd_empty != NULL) 156 abd_free(rr->rr_abd_empty); 157 158 kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols])); 159 } 160 161 void 162 vdev_raidz_map_free(raidz_map_t *rm) 163 { 164 for (int i = 0; i < rm->rm_nrows; i++) 165 vdev_raidz_row_free(rm->rm_row[i]); 166 167 kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows])); 168 } 169 170 static void 171 vdev_raidz_map_free_vsd(zio_t *zio) 172 { 173 raidz_map_t *rm = zio->io_vsd; 174 175 ASSERT0(rm->rm_freed); 176 rm->rm_freed = B_TRUE; 177 178 if (rm->rm_reports == 0) { 179 vdev_raidz_map_free(rm); 180 } 181 } 182 183 /*ARGSUSED*/ 184 static void 185 vdev_raidz_cksum_free(void *arg, size_t ignored) 186 { 187 raidz_map_t *rm = arg; 188 189 ASSERT3U(rm->rm_reports, >, 0); 190 191 if (--rm->rm_reports == 0 && rm->rm_freed) 192 vdev_raidz_map_free(rm); 193 } 194 195 static void 196 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data) 197 { 198 raidz_map_t *rm = zcr->zcr_cbdata; 199 const size_t c = zcr->zcr_cbinfo; 200 size_t x, offset; 201 202 if (good_data == NULL) { 203 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); 204 return; 205 } 206 207 ASSERT3U(rm->rm_nrows, ==, 1); 208 raidz_row_t *rr = rm->rm_row[0]; 209 210 const abd_t *good = NULL; 211 const abd_t *bad = rr->rr_col[c].rc_abd; 212 213 if (c < rr->rr_firstdatacol) { 214 /* 215 * The first time through, calculate the parity blocks for 216 * the good data (this relies on the fact that the good 217 * data never changes for a given logical ZIO) 218 */ 219 if (rr->rr_col[0].rc_gdata == NULL) { 220 abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY]; 221 222 /* 223 * Set up the rr_col[]s to generate the parity for 224 * good_data, first saving the parity bufs and 225 * replacing them with buffers to hold the result. 226 */ 227 for (x = 0; x < rr->rr_firstdatacol; x++) { 228 bad_parity[x] = rr->rr_col[x].rc_abd; 229 rr->rr_col[x].rc_abd = rr->rr_col[x].rc_gdata = 230 abd_alloc_sametype(rr->rr_col[x].rc_abd, 231 rr->rr_col[x].rc_size); 232 } 233 234 /* fill in the data columns from good_data */ 235 offset = 0; 236 for (; x < rr->rr_cols; x++) { 237 abd_free(rr->rr_col[x].rc_abd); 238 239 rr->rr_col[x].rc_abd = 240 abd_get_offset_size((abd_t *)good_data, 241 offset, rr->rr_col[x].rc_size); 242 offset += rr->rr_col[x].rc_size; 243 } 244 245 /* 246 * Construct the parity from the good data. 
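			 * At this point the data columns reference good_data
			 * and the parity columns point at the freshly
			 * allocated rc_gdata buffers, so the call below fills
			 * rc_gdata with the parity implied by the good data;
			 * that regenerated parity serves as the "good" copy
			 * in the checksum ereport.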
			 */
			vdev_raidz_generate_parity_row(rm, rr);

			/* restore everything back to its original state */
			for (x = 0; x < rr->rr_firstdatacol; x++)
				rr->rr_col[x].rc_abd = bad_parity[x];

			offset = 0;
			for (x = rr->rr_firstdatacol; x < rr->rr_cols; x++) {
				abd_free(rr->rr_col[x].rc_abd);
				rr->rr_col[x].rc_abd = abd_get_offset_size(
				    rr->rr_abd_copy, offset,
				    rr->rr_col[x].rc_size);
				offset += rr->rr_col[x].rc_size;
			}
		}

		ASSERT3P(rr->rr_col[c].rc_gdata, !=, NULL);
		good = abd_get_offset_size(rr->rr_col[c].rc_gdata, 0,
		    rr->rr_col[c].rc_size);
	} else {
		/* adjust good_data to point at the start of our column */
		offset = 0;
		for (x = rr->rr_firstdatacol; x < c; x++)
			offset += rr->rr_col[x].rc_size;

		good = abd_get_offset_size((abd_t *)good_data, offset,
		    rr->rr_col[c].rc_size);
	}

	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
	abd_free((abd_t *)good);
}

/*
 * Invoked indirectly by zfs_ereport_start_checksum(), called
 * below when our read operation fails completely. The main point
 * is to keep a copy of everything we read from disk, so that at
 * vdev_raidz_cksum_finish() time we can compare it with the good data.
 */
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
	size_t c = (size_t)(uintptr_t)arg;
	raidz_map_t *rm = zio->io_vsd;

	/* set up the report and bump the refcount */
	zcr->zcr_cbdata = rm;
	zcr->zcr_cbinfo = c;
	zcr->zcr_finish = vdev_raidz_cksum_finish;
	zcr->zcr_free = vdev_raidz_cksum_free;

	rm->rm_reports++;
	ASSERT3U(rm->rm_reports, >, 0);
	ASSERT3U(rm->rm_nrows, ==, 1);

	if (rm->rm_row[0]->rr_abd_copy != NULL)
		return;

	/*
	 * It's the first time we're called for this raidz_map_t, so we need
	 * to copy the data aside; there's no guarantee that our zio's buffer
	 * won't be re-used for something else.
	 *
	 * Our parity data is already in separate buffers, so there's no need
	 * to copy them.
	 */
	for (int i = 0; i < rm->rm_nrows; i++) {
		raidz_row_t *rr = rm->rm_row[i];
		size_t offset = 0;
		size_t size = 0;

		for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++)
			size += rr->rr_col[c].rc_size;

		rr->rr_abd_copy = abd_alloc_for_io(size, B_FALSE);

		for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
			raidz_col_t *col = &rr->rr_col[c];
			abd_t *tmp = abd_get_offset_size(rr->rr_abd_copy,
			    offset, col->rc_size);

			abd_copy(tmp, col->rc_abd, col->rc_size);

			abd_free(col->rc_abd);
			col->rc_abd = tmp;

			offset += col->rc_size;
		}
		ASSERT3U(offset, ==, size);
	}
}

static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	.vsd_free = vdev_raidz_map_free_vsd,
	.vsd_cksum_report = vdev_raidz_cksum_report
};

/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 *
 * Avoid inlining the function to keep vdev_raidz_io_start(), which
 * is this function's only caller, as small as possible on the stack.
 */
noinline raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
    uint64_t nparity)
{
	raidz_row_t *rr;
	/* The starting RAIDZ (parent) vdev sector of the block.
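	 * As an illustrative example of the geometry computed below (values
	 * assumed, not taken from a real pool): with ashift = 9, dcols = 5,
	 * nparity = 1, io_offset = 7 << 9 and io_size = 6 << 9, we get b = 7,
	 * s = 6, f = 2, o = 1 << 9, q = 1, r = 2, bc = 3 and tot = 8; logical
	 * columns 0-2 (including the parity column) receive q + 1 = 2 sectors
	 * each, columns 3-4 receive q = 1 sector, and the row starts at child
	 * vdev f = 2.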
*/ 359 uint64_t b = zio->io_offset >> ashift; 360 /* The zio's size in units of the vdev's minimum sector size. */ 361 uint64_t s = zio->io_size >> ashift; 362 /* The first column for this stripe. */ 363 uint64_t f = b % dcols; 364 /* The starting byte offset on each child vdev. */ 365 uint64_t o = (b / dcols) << ashift; 366 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; 367 368 raidz_map_t *rm = 369 kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP); 370 rm->rm_nrows = 1; 371 372 /* 373 * "Quotient": The number of data sectors for this stripe on all but 374 * the "big column" child vdevs that also contain "remainder" data. 375 */ 376 q = s / (dcols - nparity); 377 378 /* 379 * "Remainder": The number of partial stripe data sectors in this I/O. 380 * This will add a sector to some, but not all, child vdevs. 381 */ 382 r = s - q * (dcols - nparity); 383 384 /* The number of "big columns" - those which contain remainder data. */ 385 bc = (r == 0 ? 0 : r + nparity); 386 387 /* 388 * The total number of data and parity sectors associated with 389 * this I/O. 390 */ 391 tot = s + nparity * (q + (r == 0 ? 0 : 1)); 392 393 /* 394 * acols: The columns that will be accessed. 395 * scols: The columns that will be accessed or skipped. 396 */ 397 if (q == 0) { 398 /* Our I/O request doesn't span all child vdevs. */ 399 acols = bc; 400 scols = MIN(dcols, roundup(bc, nparity + 1)); 401 } else { 402 acols = dcols; 403 scols = dcols; 404 } 405 406 ASSERT3U(acols, <=, scols); 407 408 rr = kmem_alloc(offsetof(raidz_row_t, rr_col[scols]), KM_SLEEP); 409 rm->rm_row[0] = rr; 410 411 rr->rr_cols = acols; 412 rr->rr_scols = scols; 413 rr->rr_bigcols = bc; 414 rr->rr_missingdata = 0; 415 rr->rr_missingparity = 0; 416 rr->rr_firstdatacol = nparity; 417 rr->rr_abd_copy = NULL; 418 rr->rr_abd_empty = NULL; 419 rr->rr_nempty = 0; 420 #ifdef ZFS_DEBUG 421 rr->rr_offset = zio->io_offset; 422 rr->rr_size = zio->io_size; 423 #endif 424 425 asize = 0; 426 427 for (c = 0; c < scols; c++) { 428 raidz_col_t *rc = &rr->rr_col[c]; 429 col = f + c; 430 coff = o; 431 if (col >= dcols) { 432 col -= dcols; 433 coff += 1ULL << ashift; 434 } 435 rc->rc_devidx = col; 436 rc->rc_offset = coff; 437 rc->rc_abd = NULL; 438 rc->rc_gdata = NULL; 439 rc->rc_orig_data = NULL; 440 rc->rc_error = 0; 441 rc->rc_tried = 0; 442 rc->rc_skipped = 0; 443 rc->rc_repair = 0; 444 rc->rc_need_orig_restore = B_FALSE; 445 446 if (c >= acols) 447 rc->rc_size = 0; 448 else if (c < bc) 449 rc->rc_size = (q + 1) << ashift; 450 else 451 rc->rc_size = q << ashift; 452 453 asize += rc->rc_size; 454 } 455 456 ASSERT3U(asize, ==, tot << ashift); 457 rm->rm_nskip = roundup(tot, nparity + 1) - tot; 458 rm->rm_skipstart = bc; 459 460 for (c = 0; c < rr->rr_firstdatacol; c++) 461 rr->rr_col[c].rc_abd = 462 abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE); 463 464 for (uint64_t off = 0; c < acols; c++) { 465 raidz_col_t *rc = &rr->rr_col[c]; 466 rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct, 467 zio->io_abd, off, rc->rc_size); 468 off += rc->rc_size; 469 } 470 471 /* 472 * If all data stored spans all columns, there's a danger that parity 473 * will always be on the same device and, since parity isn't read 474 * during normal operation, that device's I/O bandwidth won't be 475 * used effectively. We therefore switch the parity every 1MB. 476 * 477 * ... at least that was, ostensibly, the theory. As a practical 478 * matter unless we juggle the parity between all devices evenly, we 479 * won't see any benefit. 
Further, occasional writes that aren't a 480 * multiple of the LCM of the number of children and the minimum 481 * stripe width are sufficient to avoid pessimal behavior. 482 * Unfortunately, this decision created an implicit on-disk format 483 * requirement that we need to support for all eternity, but only 484 * for single-parity RAID-Z. 485 * 486 * If we intend to skip a sector in the zeroth column for padding 487 * we must make sure to note this swap. We will never intend to 488 * skip the first column since at least one data and one parity 489 * column must appear in each row. 490 */ 491 ASSERT(rr->rr_cols >= 2); 492 ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); 493 494 if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { 495 devidx = rr->rr_col[0].rc_devidx; 496 o = rr->rr_col[0].rc_offset; 497 rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; 498 rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; 499 rr->rr_col[1].rc_devidx = devidx; 500 rr->rr_col[1].rc_offset = o; 501 502 if (rm->rm_skipstart == 0) 503 rm->rm_skipstart = 1; 504 } 505 506 /* init RAIDZ parity ops */ 507 rm->rm_ops = vdev_raidz_math_get_ops(); 508 509 return (rm); 510 } 511 512 struct pqr_struct { 513 uint64_t *p; 514 uint64_t *q; 515 uint64_t *r; 516 }; 517 518 static int 519 vdev_raidz_p_func(void *buf, size_t size, void *private) 520 { 521 struct pqr_struct *pqr = private; 522 const uint64_t *src = buf; 523 int i, cnt = size / sizeof (src[0]); 524 525 ASSERT(pqr->p && !pqr->q && !pqr->r); 526 527 for (i = 0; i < cnt; i++, src++, pqr->p++) 528 *pqr->p ^= *src; 529 530 return (0); 531 } 532 533 static int 534 vdev_raidz_pq_func(void *buf, size_t size, void *private) 535 { 536 struct pqr_struct *pqr = private; 537 const uint64_t *src = buf; 538 uint64_t mask; 539 int i, cnt = size / sizeof (src[0]); 540 541 ASSERT(pqr->p && pqr->q && !pqr->r); 542 543 for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { 544 *pqr->p ^= *src; 545 VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 546 *pqr->q ^= *src; 547 } 548 549 return (0); 550 } 551 552 static int 553 vdev_raidz_pqr_func(void *buf, size_t size, void *private) 554 { 555 struct pqr_struct *pqr = private; 556 const uint64_t *src = buf; 557 uint64_t mask; 558 int i, cnt = size / sizeof (src[0]); 559 560 ASSERT(pqr->p && pqr->q && pqr->r); 561 562 for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { 563 *pqr->p ^= *src; 564 VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 565 *pqr->q ^= *src; 566 VDEV_RAIDZ_64MUL_4(*pqr->r, mask); 567 *pqr->r ^= *src; 568 } 569 570 return (0); 571 } 572 573 static void 574 vdev_raidz_generate_parity_p(raidz_row_t *rr) 575 { 576 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 577 578 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 579 abd_t *src = rr->rr_col[c].rc_abd; 580 581 if (c == rr->rr_firstdatacol) { 582 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 583 } else { 584 struct pqr_struct pqr = { p, NULL, NULL }; 585 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 586 vdev_raidz_p_func, &pqr); 587 } 588 } 589 } 590 591 static void 592 vdev_raidz_generate_parity_pq(raidz_row_t *rr) 593 { 594 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 595 uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 596 uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 597 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 598 rr->rr_col[VDEV_RAIDZ_Q].rc_size); 599 600 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 601 abd_t *src = rr->rr_col[c].rc_abd; 602 603 uint64_t ccnt = 
rr->rr_col[c].rc_size / sizeof (p[0]); 604 605 if (c == rr->rr_firstdatacol) { 606 ASSERT(ccnt == pcnt || ccnt == 0); 607 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 608 (void) memcpy(q, p, rr->rr_col[c].rc_size); 609 610 for (uint64_t i = ccnt; i < pcnt; i++) { 611 p[i] = 0; 612 q[i] = 0; 613 } 614 } else { 615 struct pqr_struct pqr = { p, q, NULL }; 616 617 ASSERT(ccnt <= pcnt); 618 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 619 vdev_raidz_pq_func, &pqr); 620 621 /* 622 * Treat short columns as though they are full of 0s. 623 * Note that there's therefore nothing needed for P. 624 */ 625 uint64_t mask; 626 for (uint64_t i = ccnt; i < pcnt; i++) { 627 VDEV_RAIDZ_64MUL_2(q[i], mask); 628 } 629 } 630 } 631 } 632 633 static void 634 vdev_raidz_generate_parity_pqr(raidz_row_t *rr) 635 { 636 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 637 uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 638 uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd); 639 uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 640 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 641 rr->rr_col[VDEV_RAIDZ_Q].rc_size); 642 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 643 rr->rr_col[VDEV_RAIDZ_R].rc_size); 644 645 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 646 abd_t *src = rr->rr_col[c].rc_abd; 647 648 uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); 649 650 if (c == rr->rr_firstdatacol) { 651 ASSERT(ccnt == pcnt || ccnt == 0); 652 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 653 (void) memcpy(q, p, rr->rr_col[c].rc_size); 654 (void) memcpy(r, p, rr->rr_col[c].rc_size); 655 656 for (uint64_t i = ccnt; i < pcnt; i++) { 657 p[i] = 0; 658 q[i] = 0; 659 r[i] = 0; 660 } 661 } else { 662 struct pqr_struct pqr = { p, q, r }; 663 664 ASSERT(ccnt <= pcnt); 665 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 666 vdev_raidz_pqr_func, &pqr); 667 668 /* 669 * Treat short columns as though they are full of 0s. 670 * Note that there's therefore nothing needed for P. 671 */ 672 uint64_t mask; 673 for (uint64_t i = ccnt; i < pcnt; i++) { 674 VDEV_RAIDZ_64MUL_2(q[i], mask); 675 VDEV_RAIDZ_64MUL_4(r[i], mask); 676 } 677 } 678 } 679 } 680 681 /* 682 * Generate RAID parity in the first virtual columns according to the number of 683 * parity columns available. 
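 *
 * As an illustrative check of the PQ generator above: for data columns
 * D_0, D_1, D_2 the Q loop computes ((D_0 * 2) ^ D_1) * 2 ^ D_2 =
 * 4*D_0 ^ 2*D_1 ^ D_2 for each byte (vectorized eight bytes at a time),
 * which is exactly the Horner form of the Q equation given at the top of
 * this file.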
684 */ 685 void 686 vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr) 687 { 688 ASSERT3U(rr->rr_cols, !=, 0); 689 690 /* Generate using the new math implementation */ 691 if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL) 692 return; 693 694 switch (rr->rr_firstdatacol) { 695 case 1: 696 vdev_raidz_generate_parity_p(rr); 697 break; 698 case 2: 699 vdev_raidz_generate_parity_pq(rr); 700 break; 701 case 3: 702 vdev_raidz_generate_parity_pqr(rr); 703 break; 704 default: 705 cmn_err(CE_PANIC, "invalid RAID-Z configuration"); 706 } 707 } 708 709 void 710 vdev_raidz_generate_parity(raidz_map_t *rm) 711 { 712 for (int i = 0; i < rm->rm_nrows; i++) { 713 raidz_row_t *rr = rm->rm_row[i]; 714 vdev_raidz_generate_parity_row(rm, rr); 715 } 716 } 717 718 /* ARGSUSED */ 719 static int 720 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) 721 { 722 uint64_t *dst = dbuf; 723 uint64_t *src = sbuf; 724 int cnt = size / sizeof (src[0]); 725 726 for (int i = 0; i < cnt; i++) { 727 dst[i] ^= src[i]; 728 } 729 730 return (0); 731 } 732 733 /* ARGSUSED */ 734 static int 735 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size, 736 void *private) 737 { 738 uint64_t *dst = dbuf; 739 uint64_t *src = sbuf; 740 uint64_t mask; 741 int cnt = size / sizeof (dst[0]); 742 743 for (int i = 0; i < cnt; i++, dst++, src++) { 744 VDEV_RAIDZ_64MUL_2(*dst, mask); 745 *dst ^= *src; 746 } 747 748 return (0); 749 } 750 751 /* ARGSUSED */ 752 static int 753 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private) 754 { 755 uint64_t *dst = buf; 756 uint64_t mask; 757 int cnt = size / sizeof (dst[0]); 758 759 for (int i = 0; i < cnt; i++, dst++) { 760 /* same operation as vdev_raidz_reconst_q_pre_func() on dst */ 761 VDEV_RAIDZ_64MUL_2(*dst, mask); 762 } 763 764 return (0); 765 } 766 767 struct reconst_q_struct { 768 uint64_t *q; 769 int exp; 770 }; 771 772 static int 773 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private) 774 { 775 struct reconst_q_struct *rq = private; 776 uint64_t *dst = buf; 777 int cnt = size / sizeof (dst[0]); 778 779 for (int i = 0; i < cnt; i++, dst++, rq->q++) { 780 int j; 781 uint8_t *b; 782 783 *dst ^= *rq->q; 784 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 785 *b = vdev_raidz_exp2(*b, rq->exp); 786 } 787 } 788 789 return (0); 790 } 791 792 struct reconst_pq_struct { 793 uint8_t *p; 794 uint8_t *q; 795 uint8_t *pxy; 796 uint8_t *qxy; 797 int aexp; 798 int bexp; 799 }; 800 801 static int 802 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private) 803 { 804 struct reconst_pq_struct *rpq = private; 805 uint8_t *xd = xbuf; 806 uint8_t *yd = ybuf; 807 808 for (int i = 0; i < size; 809 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) { 810 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 811 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 812 *yd = *rpq->p ^ *rpq->pxy ^ *xd; 813 } 814 815 return (0); 816 } 817 818 static int 819 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) 820 { 821 struct reconst_pq_struct *rpq = private; 822 uint8_t *xd = xbuf; 823 824 for (int i = 0; i < size; 825 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) { 826 /* same operation as vdev_raidz_reconst_pq_func() on xd */ 827 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 828 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 829 } 830 831 return (0); 832 } 833 834 static int 835 vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts) 836 { 837 
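	/*
	 * Single-column reconstruction from P is plain XOR: copy P into the
	 * missing column and XOR every surviving data column back out. For
	 * example (illustrative only), if P = D_0 ^ D_1 ^ D_2 and D_1 is
	 * missing, then D_1 = P ^ D_0 ^ D_2.
	 */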
int x = tgts[0]; 838 abd_t *dst, *src; 839 840 ASSERT3U(ntgts, ==, 1); 841 ASSERT3U(x, >=, rr->rr_firstdatacol); 842 ASSERT3U(x, <, rr->rr_cols); 843 844 ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size); 845 846 src = rr->rr_col[VDEV_RAIDZ_P].rc_abd; 847 dst = rr->rr_col[x].rc_abd; 848 849 abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size); 850 851 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 852 uint64_t size = MIN(rr->rr_col[x].rc_size, 853 rr->rr_col[c].rc_size); 854 855 src = rr->rr_col[c].rc_abd; 856 857 if (c == x) 858 continue; 859 860 (void) abd_iterate_func2(dst, src, 0, 0, size, 861 vdev_raidz_reconst_p_func, NULL); 862 } 863 864 return (1 << VDEV_RAIDZ_P); 865 } 866 867 static int 868 vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts) 869 { 870 int x = tgts[0]; 871 int c, exp; 872 abd_t *dst, *src; 873 874 ASSERT(ntgts == 1); 875 876 ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size); 877 878 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 879 uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size, 880 rr->rr_col[c].rc_size); 881 882 src = rr->rr_col[c].rc_abd; 883 dst = rr->rr_col[x].rc_abd; 884 885 if (c == rr->rr_firstdatacol) { 886 abd_copy(dst, src, size); 887 if (rr->rr_col[x].rc_size > size) { 888 abd_zero_off(dst, size, 889 rr->rr_col[x].rc_size - size); 890 } 891 } else { 892 ASSERT3U(size, <=, rr->rr_col[x].rc_size); 893 (void) abd_iterate_func2(dst, src, 0, 0, size, 894 vdev_raidz_reconst_q_pre_func, NULL); 895 (void) abd_iterate_func(dst, 896 size, rr->rr_col[x].rc_size - size, 897 vdev_raidz_reconst_q_pre_tail_func, NULL); 898 } 899 } 900 901 src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; 902 dst = rr->rr_col[x].rc_abd; 903 exp = 255 - (rr->rr_cols - 1 - x); 904 905 struct reconst_q_struct rq = { abd_to_buf(src), exp }; 906 (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size, 907 vdev_raidz_reconst_q_post_func, &rq); 908 909 return (1 << VDEV_RAIDZ_Q); 910 } 911 912 static int 913 vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) 914 { 915 uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; 916 abd_t *pdata, *qdata; 917 uint64_t xsize, ysize; 918 int x = tgts[0]; 919 int y = tgts[1]; 920 abd_t *xd, *yd; 921 922 ASSERT(ntgts == 2); 923 ASSERT(x < y); 924 ASSERT(x >= rr->rr_firstdatacol); 925 ASSERT(y < rr->rr_cols); 926 927 ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size); 928 929 /* 930 * Move the parity data aside -- we're going to compute parity as 931 * though columns x and y were full of zeros -- Pxy and Qxy. We want to 932 * reuse the parity generation mechanism without trashing the actual 933 * parity so we make those columns appear to be full of zeros by 934 * setting their lengths to zero. 
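	 * This works because the parity generators above treat short columns
	 * as if they were padded with zeros, so a zero-length column
	 * contributes nothing to the regenerated P and Q.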
935 */ 936 pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd; 937 qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; 938 xsize = rr->rr_col[x].rc_size; 939 ysize = rr->rr_col[y].rc_size; 940 941 rr->rr_col[VDEV_RAIDZ_P].rc_abd = 942 abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE); 943 rr->rr_col[VDEV_RAIDZ_Q].rc_abd = 944 abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); 945 rr->rr_col[x].rc_size = 0; 946 rr->rr_col[y].rc_size = 0; 947 948 vdev_raidz_generate_parity_pq(rr); 949 950 rr->rr_col[x].rc_size = xsize; 951 rr->rr_col[y].rc_size = ysize; 952 953 p = abd_to_buf(pdata); 954 q = abd_to_buf(qdata); 955 pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 956 qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 957 xd = rr->rr_col[x].rc_abd; 958 yd = rr->rr_col[y].rc_abd; 959 960 /* 961 * We now have: 962 * Pxy = P + D_x + D_y 963 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 964 * 965 * We can then solve for D_x: 966 * D_x = A * (P + Pxy) + B * (Q + Qxy) 967 * where 968 * A = 2^(x - y) * (2^(x - y) + 1)^-1 969 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 970 * 971 * With D_x in hand, we can easily solve for D_y: 972 * D_y = P + Pxy + D_x 973 */ 974 975 a = vdev_raidz_pow2[255 + x - y]; 976 b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)]; 977 tmp = 255 - vdev_raidz_log2[a ^ 1]; 978 979 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 980 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 981 982 ASSERT3U(xsize, >=, ysize); 983 struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp }; 984 985 (void) abd_iterate_func2(xd, yd, 0, 0, ysize, 986 vdev_raidz_reconst_pq_func, &rpq); 987 (void) abd_iterate_func(xd, ysize, xsize - ysize, 988 vdev_raidz_reconst_pq_tail_func, &rpq); 989 990 abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 991 abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 992 993 /* 994 * Restore the saved parity data. 995 */ 996 rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata; 997 rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata; 998 999 return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); 1000 } 1001 1002 /* BEGIN CSTYLED */ 1003 /* 1004 * In the general case of reconstruction, we must solve the system of linear 1005 * equations defined by the coefficients used to generate parity as well as 1006 * the contents of the data and parity disks. This can be expressed with 1007 * vectors for the original data (D) and the actual data (d) and parity (p) 1008 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 1009 * 1010 * __ __ __ __ 1011 * | | __ __ | p_0 | 1012 * | V | | D_0 | | p_m-1 | 1013 * | | x | : | = | d_0 | 1014 * | I | | D_n-1 | | : | 1015 * | | ~~ ~~ | d_n-1 | 1016 * ~~ ~~ ~~ ~~ 1017 * 1018 * I is simply a square identity matrix of size n, and V is a vandermonde 1019 * matrix defined by the coefficients we chose for the various parity columns 1020 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 1021 * computation as well as linear separability. 1022 * 1023 * __ __ __ __ 1024 * | 1 .. 1 1 1 | | p_0 | 1025 * | 2^n-1 .. 4 2 1 | __ __ | : | 1026 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 1027 * | 1 .. 0 0 0 | | D_1 | | d_0 | 1028 * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 1029 * | : : : : | | : | | d_2 | 1030 * | 0 .. 1 0 0 | | D_n-1 | | : | 1031 * | 0 .. 0 1 0 | ~~ ~~ | : | 1032 * | 0 .. 0 0 1 | | d_n-1 | 1033 * ~~ ~~ ~~ ~~ 1034 * 1035 * Note that I, V, d, and p are known. To compute D, we must invert the 1036 * matrix and use the known data and parity values to reconstruct the unknown 1037 * data values. 
We begin by removing the rows in V|I and d|p that correspond 1038 * to failed or missing columns; we then make V|I square (n x n) and d|p 1039 * sized n by removing rows corresponding to unused parity from the bottom up 1040 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 1041 * using Gauss-Jordan elimination. In the example below we use m=3 parity 1042 * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 1043 * __ __ 1044 * | 1 1 1 1 1 1 1 1 | 1045 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 1046 * | 19 205 116 29 64 16 4 1 | / / 1047 * | 1 0 0 0 0 0 0 0 | / / 1048 * | 0 1 0 0 0 0 0 0 | <--' / 1049 * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 1050 * | 0 0 0 1 0 0 0 0 | 1051 * | 0 0 0 0 1 0 0 0 | 1052 * | 0 0 0 0 0 1 0 0 | 1053 * | 0 0 0 0 0 0 1 0 | 1054 * | 0 0 0 0 0 0 0 1 | 1055 * ~~ ~~ 1056 * __ __ 1057 * | 1 1 1 1 1 1 1 1 | 1058 * | 128 64 32 16 8 4 2 1 | 1059 * | 19 205 116 29 64 16 4 1 | 1060 * | 1 0 0 0 0 0 0 0 | 1061 * | 0 1 0 0 0 0 0 0 | 1062 * (V|I)' = | 0 0 1 0 0 0 0 0 | 1063 * | 0 0 0 1 0 0 0 0 | 1064 * | 0 0 0 0 1 0 0 0 | 1065 * | 0 0 0 0 0 1 0 0 | 1066 * | 0 0 0 0 0 0 1 0 | 1067 * | 0 0 0 0 0 0 0 1 | 1068 * ~~ ~~ 1069 * 1070 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 1071 * have carefully chosen the seed values 1, 2, and 4 to ensure that this 1072 * matrix is not singular. 1073 * __ __ 1074 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1075 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1076 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1077 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1078 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1079 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1080 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1081 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1082 * ~~ ~~ 1083 * __ __ 1084 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1085 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1086 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1087 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1088 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1089 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1090 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1091 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1092 * ~~ ~~ 1093 * __ __ 1094 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1095 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1096 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 1097 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1098 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1099 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1100 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1101 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1102 * ~~ ~~ 1103 * __ __ 1104 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1105 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1106 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 1107 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1108 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1109 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1110 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1111 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1112 * ~~ ~~ 1113 * __ __ 1114 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1115 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1116 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1117 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1118 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1119 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1120 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1121 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1122 * ~~ ~~ 1123 * __ __ 1124 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1125 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 1126 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1127 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1128 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1129 * | 0 0 0 0 0 
1 0 0 0 0 0 0 0 1 0 0 | 1130 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1131 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1132 * ~~ ~~ 1133 * __ __ 1134 * | 0 0 1 0 0 0 0 0 | 1135 * | 167 100 5 41 159 169 217 208 | 1136 * | 166 100 4 40 158 168 216 209 | 1137 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 1138 * | 0 0 0 0 1 0 0 0 | 1139 * | 0 0 0 0 0 1 0 0 | 1140 * | 0 0 0 0 0 0 1 0 | 1141 * | 0 0 0 0 0 0 0 1 | 1142 * ~~ ~~ 1143 * 1144 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 1145 * of the missing data. 1146 * 1147 * As is apparent from the example above, the only non-trivial rows in the 1148 * inverse matrix correspond to the data disks that we're trying to 1149 * reconstruct. Indeed, those are the only rows we need as the others would 1150 * only be useful for reconstructing data known or assumed to be valid. For 1151 * that reason, we only build the coefficients in the rows that correspond to 1152 * targeted columns. 1153 */ 1154 /* END CSTYLED */ 1155 1156 static void 1157 vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map, 1158 uint8_t **rows) 1159 { 1160 int i, j; 1161 int pow; 1162 1163 ASSERT(n == rr->rr_cols - rr->rr_firstdatacol); 1164 1165 /* 1166 * Fill in the missing rows of interest. 1167 */ 1168 for (i = 0; i < nmap; i++) { 1169 ASSERT3S(0, <=, map[i]); 1170 ASSERT3S(map[i], <=, 2); 1171 1172 pow = map[i] * n; 1173 if (pow > 255) 1174 pow -= 255; 1175 ASSERT(pow <= 255); 1176 1177 for (j = 0; j < n; j++) { 1178 pow -= map[i]; 1179 if (pow < 0) 1180 pow += 255; 1181 rows[i][j] = vdev_raidz_pow2[pow]; 1182 } 1183 } 1184 } 1185 1186 static void 1187 vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing, 1188 uint8_t **rows, uint8_t **invrows, const uint8_t *used) 1189 { 1190 int i, j, ii, jj; 1191 uint8_t log; 1192 1193 /* 1194 * Assert that the first nmissing entries from the array of used 1195 * columns correspond to parity columns and that subsequent entries 1196 * correspond to data columns. 1197 */ 1198 for (i = 0; i < nmissing; i++) { 1199 ASSERT3S(used[i], <, rr->rr_firstdatacol); 1200 } 1201 for (; i < n; i++) { 1202 ASSERT3S(used[i], >=, rr->rr_firstdatacol); 1203 } 1204 1205 /* 1206 * First initialize the storage where we'll compute the inverse rows. 1207 */ 1208 for (i = 0; i < nmissing; i++) { 1209 for (j = 0; j < n; j++) { 1210 invrows[i][j] = (i == j) ? 1 : 0; 1211 } 1212 } 1213 1214 /* 1215 * Subtract all trivial rows from the rows of consequence. 1216 */ 1217 for (i = 0; i < nmissing; i++) { 1218 for (j = nmissing; j < n; j++) { 1219 ASSERT3U(used[j], >=, rr->rr_firstdatacol); 1220 jj = used[j] - rr->rr_firstdatacol; 1221 ASSERT3S(jj, <, n); 1222 invrows[i][j] = rows[i][jj]; 1223 rows[i][jj] = 0; 1224 } 1225 } 1226 1227 /* 1228 * For each of the rows of interest, we must normalize it and subtract 1229 * a multiple of it from the other rows. 1230 */ 1231 for (i = 0; i < nmissing; i++) { 1232 for (j = 0; j < missing[i]; j++) { 1233 ASSERT0(rows[i][j]); 1234 } 1235 ASSERT3U(rows[i][missing[i]], !=, 0); 1236 1237 /* 1238 * Compute the inverse of the first element and multiply each 1239 * element in the row by that value. 
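		 * In GF(2^8) the multiplicative inverse of a nonzero element
		 * a is 2^(255 - log2(a)), so the "255 - vdev_raidz_log2[...]"
		 * below is the log of that inverse and vdev_raidz_exp2()
		 * applies it as a multiplication.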
1240 */ 1241 log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 1242 1243 for (j = 0; j < n; j++) { 1244 rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 1245 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 1246 } 1247 1248 for (ii = 0; ii < nmissing; ii++) { 1249 if (i == ii) 1250 continue; 1251 1252 ASSERT3U(rows[ii][missing[i]], !=, 0); 1253 1254 log = vdev_raidz_log2[rows[ii][missing[i]]]; 1255 1256 for (j = 0; j < n; j++) { 1257 rows[ii][j] ^= 1258 vdev_raidz_exp2(rows[i][j], log); 1259 invrows[ii][j] ^= 1260 vdev_raidz_exp2(invrows[i][j], log); 1261 } 1262 } 1263 } 1264 1265 /* 1266 * Verify that the data that is left in the rows are properly part of 1267 * an identity matrix. 1268 */ 1269 for (i = 0; i < nmissing; i++) { 1270 for (j = 0; j < n; j++) { 1271 if (j == missing[i]) { 1272 ASSERT3U(rows[i][j], ==, 1); 1273 } else { 1274 ASSERT0(rows[i][j]); 1275 } 1276 } 1277 } 1278 } 1279 1280 static void 1281 vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing, 1282 int *missing, uint8_t **invrows, const uint8_t *used) 1283 { 1284 int i, j, x, cc, c; 1285 uint8_t *src; 1286 uint64_t ccount; 1287 uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL }; 1288 uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 }; 1289 uint8_t log = 0; 1290 uint8_t val; 1291 int ll; 1292 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 1293 uint8_t *p, *pp; 1294 size_t psize; 1295 1296 psize = sizeof (invlog[0][0]) * n * nmissing; 1297 p = kmem_alloc(psize, KM_SLEEP); 1298 1299 for (pp = p, i = 0; i < nmissing; i++) { 1300 invlog[i] = pp; 1301 pp += n; 1302 } 1303 1304 for (i = 0; i < nmissing; i++) { 1305 for (j = 0; j < n; j++) { 1306 ASSERT3U(invrows[i][j], !=, 0); 1307 invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 1308 } 1309 } 1310 1311 for (i = 0; i < n; i++) { 1312 c = used[i]; 1313 ASSERT3U(c, <, rr->rr_cols); 1314 1315 ccount = rr->rr_col[c].rc_size; 1316 ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0); 1317 if (ccount == 0) 1318 continue; 1319 src = abd_to_buf(rr->rr_col[c].rc_abd); 1320 for (j = 0; j < nmissing; j++) { 1321 cc = missing[j] + rr->rr_firstdatacol; 1322 ASSERT3U(cc, >=, rr->rr_firstdatacol); 1323 ASSERT3U(cc, <, rr->rr_cols); 1324 ASSERT3U(cc, !=, c); 1325 1326 dcount[j] = rr->rr_col[cc].rc_size; 1327 if (dcount[j] != 0) 1328 dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd); 1329 } 1330 1331 for (x = 0; x < ccount; x++, src++) { 1332 if (*src != 0) 1333 log = vdev_raidz_log2[*src]; 1334 1335 for (cc = 0; cc < nmissing; cc++) { 1336 if (x >= dcount[cc]) 1337 continue; 1338 1339 if (*src == 0) { 1340 val = 0; 1341 } else { 1342 if ((ll = log + invlog[cc][i]) >= 255) 1343 ll -= 255; 1344 val = vdev_raidz_pow2[ll]; 1345 } 1346 1347 if (i == 0) 1348 dst[cc][x] = val; 1349 else 1350 dst[cc][x] ^= val; 1351 } 1352 } 1353 } 1354 1355 kmem_free(p, psize); 1356 } 1357 1358 static int 1359 vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) 1360 { 1361 int n, i, c, t, tt; 1362 int nmissing_rows; 1363 int missing_rows[VDEV_RAIDZ_MAXPARITY]; 1364 int parity_map[VDEV_RAIDZ_MAXPARITY]; 1365 uint8_t *p, *pp; 1366 size_t psize; 1367 uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 1368 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 1369 uint8_t *used; 1370 1371 abd_t **bufs = NULL; 1372 1373 int code = 0; 1374 1375 /* 1376 * Matrix reconstruction can't use scatter ABDs yet, so we allocate 1377 * temporary linear ABDs if any non-linear ABDs are found. 
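	 * The temporary linear copies are copied back into the original
	 * scatter ABDs and freed at the bottom of this function once
	 * reconstruction is complete.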
1378 */ 1379 for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) { 1380 if (!abd_is_linear(rr->rr_col[i].rc_abd)) { 1381 bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *), 1382 KM_PUSHPAGE); 1383 1384 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1385 raidz_col_t *col = &rr->rr_col[c]; 1386 1387 bufs[c] = col->rc_abd; 1388 if (bufs[c] != NULL) { 1389 col->rc_abd = abd_alloc_linear( 1390 col->rc_size, B_TRUE); 1391 abd_copy(col->rc_abd, bufs[c], 1392 col->rc_size); 1393 } 1394 } 1395 1396 break; 1397 } 1398 } 1399 1400 n = rr->rr_cols - rr->rr_firstdatacol; 1401 1402 /* 1403 * Figure out which data columns are missing. 1404 */ 1405 nmissing_rows = 0; 1406 for (t = 0; t < ntgts; t++) { 1407 if (tgts[t] >= rr->rr_firstdatacol) { 1408 missing_rows[nmissing_rows++] = 1409 tgts[t] - rr->rr_firstdatacol; 1410 } 1411 } 1412 1413 /* 1414 * Figure out which parity columns to use to help generate the missing 1415 * data columns. 1416 */ 1417 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 1418 ASSERT(tt < ntgts); 1419 ASSERT(c < rr->rr_firstdatacol); 1420 1421 /* 1422 * Skip any targeted parity columns. 1423 */ 1424 if (c == tgts[tt]) { 1425 tt++; 1426 continue; 1427 } 1428 1429 code |= 1 << c; 1430 1431 parity_map[i] = c; 1432 i++; 1433 } 1434 1435 ASSERT(code != 0); 1436 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); 1437 1438 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1439 nmissing_rows * n + sizeof (used[0]) * n; 1440 p = kmem_alloc(psize, KM_SLEEP); 1441 1442 for (pp = p, i = 0; i < nmissing_rows; i++) { 1443 rows[i] = pp; 1444 pp += n; 1445 invrows[i] = pp; 1446 pp += n; 1447 } 1448 used = pp; 1449 1450 for (i = 0; i < nmissing_rows; i++) { 1451 used[i] = parity_map[i]; 1452 } 1453 1454 for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1455 if (tt < nmissing_rows && 1456 c == missing_rows[tt] + rr->rr_firstdatacol) { 1457 tt++; 1458 continue; 1459 } 1460 1461 ASSERT3S(i, <, n); 1462 used[i] = c; 1463 i++; 1464 } 1465 1466 /* 1467 * Initialize the interesting rows of the matrix. 1468 */ 1469 vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows); 1470 1471 /* 1472 * Invert the matrix. 1473 */ 1474 vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows, 1475 invrows, used); 1476 1477 /* 1478 * Reconstruct the missing data using the generated matrix. 
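	 * vdev_raidz_matrix_reconstruct() walks every surviving column,
	 * multiplies it by the matching inverse-row coefficient using the
	 * log/exp tables, and XOR-accumulates the products into the columns
	 * being rebuilt.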
1479 */ 1480 vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows, 1481 invrows, used); 1482 1483 kmem_free(p, psize); 1484 1485 /* 1486 * copy back from temporary linear abds and free them 1487 */ 1488 if (bufs) { 1489 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1490 raidz_col_t *col = &rr->rr_col[c]; 1491 1492 if (bufs[c] != NULL) { 1493 abd_copy(bufs[c], col->rc_abd, col->rc_size); 1494 abd_free(col->rc_abd); 1495 } 1496 col->rc_abd = bufs[c]; 1497 } 1498 kmem_free(bufs, rr->rr_cols * sizeof (abd_t *)); 1499 } 1500 1501 return (code); 1502 } 1503 1504 static int 1505 vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr, 1506 const int *t, int nt) 1507 { 1508 int tgts[VDEV_RAIDZ_MAXPARITY], *dt; 1509 int ntgts; 1510 int i, c, ret; 1511 int code; 1512 int nbadparity, nbaddata; 1513 int parity_valid[VDEV_RAIDZ_MAXPARITY]; 1514 1515 nbadparity = rr->rr_firstdatacol; 1516 nbaddata = rr->rr_cols - nbadparity; 1517 ntgts = 0; 1518 for (i = 0, c = 0; c < rr->rr_cols; c++) { 1519 if (c < rr->rr_firstdatacol) 1520 parity_valid[c] = B_FALSE; 1521 1522 if (i < nt && c == t[i]) { 1523 tgts[ntgts++] = c; 1524 i++; 1525 } else if (rr->rr_col[c].rc_error != 0) { 1526 tgts[ntgts++] = c; 1527 } else if (c >= rr->rr_firstdatacol) { 1528 nbaddata--; 1529 } else { 1530 parity_valid[c] = B_TRUE; 1531 nbadparity--; 1532 } 1533 } 1534 1535 ASSERT(ntgts >= nt); 1536 ASSERT(nbaddata >= 0); 1537 ASSERT(nbaddata + nbadparity == ntgts); 1538 1539 dt = &tgts[nbadparity]; 1540 1541 /* Reconstruct using the new math implementation */ 1542 ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata); 1543 if (ret != RAIDZ_ORIGINAL_IMPL) 1544 return (ret); 1545 1546 /* 1547 * See if we can use any of our optimized reconstruction routines. 1548 */ 1549 switch (nbaddata) { 1550 case 1: 1551 if (parity_valid[VDEV_RAIDZ_P]) 1552 return (vdev_raidz_reconstruct_p(rr, dt, 1)); 1553 1554 ASSERT(rr->rr_firstdatacol > 1); 1555 1556 if (parity_valid[VDEV_RAIDZ_Q]) 1557 return (vdev_raidz_reconstruct_q(rr, dt, 1)); 1558 1559 ASSERT(rr->rr_firstdatacol > 2); 1560 break; 1561 1562 case 2: 1563 ASSERT(rr->rr_firstdatacol > 1); 1564 1565 if (parity_valid[VDEV_RAIDZ_P] && 1566 parity_valid[VDEV_RAIDZ_Q]) 1567 return (vdev_raidz_reconstruct_pq(rr, dt, 2)); 1568 1569 ASSERT(rr->rr_firstdatacol > 2); 1570 1571 break; 1572 } 1573 1574 code = vdev_raidz_reconstruct_general(rr, tgts, ntgts); 1575 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); 1576 ASSERT(code > 0); 1577 return (code); 1578 } 1579 1580 static int 1581 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, 1582 uint64_t *logical_ashift, uint64_t *physical_ashift) 1583 { 1584 vdev_raidz_t *vdrz = vd->vdev_tsd; 1585 uint64_t nparity = vdrz->vd_nparity; 1586 int c; 1587 int lasterror = 0; 1588 int numerrors = 0; 1589 1590 ASSERT(nparity > 0); 1591 1592 if (nparity > VDEV_RAIDZ_MAXPARITY || 1593 vd->vdev_children < nparity + 1) { 1594 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 1595 return (SET_ERROR(EINVAL)); 1596 } 1597 1598 vdev_open_children(vd); 1599 1600 for (c = 0; c < vd->vdev_children; c++) { 1601 vdev_t *cvd = vd->vdev_child[c]; 1602 1603 if (cvd->vdev_open_error != 0) { 1604 lasterror = cvd->vdev_open_error; 1605 numerrors++; 1606 continue; 1607 } 1608 1609 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 1610 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; 1611 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); 1612 *physical_ashift = MAX(*physical_ashift, 1613 cvd->vdev_physical_ashift); 1614 } 1615 
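	/*
	 * Each child contributes no more than the smallest child's size
	 * (clamped above), so the raidz vdev advertises smallest-child times
	 * the number of children, matching vdev_raidz_min_asize() below.
	 */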
1616 *asize *= vd->vdev_children; 1617 *max_asize *= vd->vdev_children; 1618 1619 if (numerrors > nparity) { 1620 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 1621 return (lasterror); 1622 } 1623 1624 return (0); 1625 } 1626 1627 static void 1628 vdev_raidz_close(vdev_t *vd) 1629 { 1630 for (int c = 0; c < vd->vdev_children; c++) { 1631 if (vd->vdev_child[c] != NULL) 1632 vdev_close(vd->vdev_child[c]); 1633 } 1634 } 1635 1636 static uint64_t 1637 vdev_raidz_asize(vdev_t *vd, uint64_t psize) 1638 { 1639 vdev_raidz_t *vdrz = vd->vdev_tsd; 1640 uint64_t asize; 1641 uint64_t ashift = vd->vdev_top->vdev_ashift; 1642 uint64_t cols = vdrz->vd_logical_width; 1643 uint64_t nparity = vdrz->vd_nparity; 1644 1645 asize = ((psize - 1) >> ashift) + 1; 1646 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 1647 asize = roundup(asize, nparity + 1) << ashift; 1648 1649 return (asize); 1650 } 1651 1652 /* 1653 * The allocatable space for a raidz vdev is N * sizeof(smallest child) 1654 * so each child must provide at least 1/Nth of its asize. 1655 */ 1656 static uint64_t 1657 vdev_raidz_min_asize(vdev_t *vd) 1658 { 1659 return ((vd->vdev_min_asize + vd->vdev_children - 1) / 1660 vd->vdev_children); 1661 } 1662 1663 void 1664 vdev_raidz_child_done(zio_t *zio) 1665 { 1666 raidz_col_t *rc = zio->io_private; 1667 1668 rc->rc_error = zio->io_error; 1669 rc->rc_tried = 1; 1670 rc->rc_skipped = 0; 1671 } 1672 1673 static void 1674 vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col) 1675 { 1676 #ifdef ZFS_DEBUG 1677 vdev_t *tvd = vd->vdev_top; 1678 1679 range_seg64_t logical_rs, physical_rs, remain_rs; 1680 logical_rs.rs_start = rr->rr_offset; 1681 logical_rs.rs_end = logical_rs.rs_start + 1682 vdev_raidz_asize(vd, rr->rr_size); 1683 1684 raidz_col_t *rc = &rr->rr_col[col]; 1685 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 1686 1687 vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs); 1688 ASSERT(vdev_xlate_is_empty(&remain_rs)); 1689 ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); 1690 ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); 1691 /* 1692 * It would be nice to assert that rs_end is equal 1693 * to rc_offset + rc_size but there might be an 1694 * optional I/O at the end that is not accounted in 1695 * rc_size. 1696 */ 1697 if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { 1698 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + 1699 rc->rc_size + (1 << tvd->vdev_ashift)); 1700 } else { 1701 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); 1702 } 1703 #endif 1704 } 1705 1706 static void 1707 vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift) 1708 { 1709 vdev_t *vd = zio->io_vd; 1710 raidz_map_t *rm = zio->io_vsd; 1711 int c, i; 1712 1713 vdev_raidz_generate_parity_row(rm, rr); 1714 1715 for (int c = 0; c < rr->rr_cols; c++) { 1716 raidz_col_t *rc = &rr->rr_col[c]; 1717 if (rc->rc_size == 0) 1718 continue; 1719 1720 /* Verify physical to logical translation */ 1721 vdev_raidz_io_verify(vd, rr, c); 1722 1723 zio_nowait(zio_vdev_child_io(zio, NULL, 1724 vd->vdev_child[rc->rc_devidx], rc->rc_offset, 1725 rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, 1726 0, vdev_raidz_child_done, rc)); 1727 } 1728 1729 /* 1730 * Generate optional I/Os for skip sectors to improve aggregation 1731 * contiguity. 
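	 *
	 * For illustration (values assumed): a map whose row covers tot = 7
	 * sectors on a single-parity vdev has rm_nskip = roundup(7, 2) - 7 =
	 * 1 skip sector, so one optional, dataless child I/O is issued below;
	 * with tot = 8, rm_nskip = 0 and the loop below does nothing.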
1732 */ 1733 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { 1734 ASSERT(c <= rr->rr_scols); 1735 if (c == rr->rr_scols) 1736 c = 0; 1737 1738 raidz_col_t *rc = &rr->rr_col[c]; 1739 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 1740 1741 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1742 rc->rc_offset + rc->rc_size, NULL, 1ULL << ashift, 1743 zio->io_type, zio->io_priority, 1744 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); 1745 } 1746 } 1747 1748 static void 1749 vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr) 1750 { 1751 vdev_t *vd = zio->io_vd; 1752 1753 /* 1754 * Iterate over the columns in reverse order so that we hit the parity 1755 * last -- any errors along the way will force us to read the parity. 1756 */ 1757 for (int c = rr->rr_cols - 1; c >= 0; c--) { 1758 raidz_col_t *rc = &rr->rr_col[c]; 1759 if (rc->rc_size == 0) 1760 continue; 1761 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 1762 if (!vdev_readable(cvd)) { 1763 if (c >= rr->rr_firstdatacol) 1764 rr->rr_missingdata++; 1765 else 1766 rr->rr_missingparity++; 1767 rc->rc_error = SET_ERROR(ENXIO); 1768 rc->rc_tried = 1; /* don't even try */ 1769 rc->rc_skipped = 1; 1770 continue; 1771 } 1772 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 1773 if (c >= rr->rr_firstdatacol) 1774 rr->rr_missingdata++; 1775 else 1776 rr->rr_missingparity++; 1777 rc->rc_error = SET_ERROR(ESTALE); 1778 rc->rc_skipped = 1; 1779 continue; 1780 } 1781 if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || 1782 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { 1783 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1784 rc->rc_offset, rc->rc_abd, rc->rc_size, 1785 zio->io_type, zio->io_priority, 0, 1786 vdev_raidz_child_done, rc)); 1787 } 1788 } 1789 } 1790 1791 /* 1792 * Start an IO operation on a RAIDZ VDev 1793 * 1794 * Outline: 1795 * - For write operations: 1796 * 1. Generate the parity data 1797 * 2. Create child zio write operations to each column's vdev, for both 1798 * data and parity. 1799 * 3. If the column skips any sectors for padding, create optional dummy 1800 * write zio children for those areas to improve aggregation continuity. 1801 * - For read operations: 1802 * 1. Create child zio read operations to each data column's vdev to read 1803 * the range of data required for zio. 1804 * 2. If this is a scrub or resilver operation, or if any of the data 1805 * vdevs have had errors, then create zio read operations to the parity 1806 * columns' VDevs as well. 1807 */ 1808 static void 1809 vdev_raidz_io_start(zio_t *zio) 1810 { 1811 vdev_t *vd = zio->io_vd; 1812 vdev_t *tvd = vd->vdev_top; 1813 vdev_raidz_t *vdrz = vd->vdev_tsd; 1814 raidz_map_t *rm; 1815 1816 rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, 1817 vdrz->vd_logical_width, vdrz->vd_nparity); 1818 1819 /* 1820 * Until raidz expansion is implemented all maps for a raidz vdev 1821 * contain a single row. 1822 */ 1823 ASSERT3U(rm->rm_nrows, ==, 1); 1824 raidz_row_t *rr = rm->rm_row[0]; 1825 1826 zio->io_vsd = rm; 1827 zio->io_vsd_ops = &vdev_raidz_vsd_ops; 1828 1829 if (zio->io_type == ZIO_TYPE_WRITE) { 1830 vdev_raidz_io_start_write(zio, rr, tvd->vdev_ashift); 1831 } else { 1832 ASSERT(zio->io_type == ZIO_TYPE_READ); 1833 vdev_raidz_io_start_read(zio, rr); 1834 } 1835 1836 zio_execute(zio); 1837 } 1838 1839 /* 1840 * Report a checksum error for a child of a RAID-Z device. 
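 * Speculative reads and rebuild-priority I/Os are skipped below so that they
 * neither generate spurious checksum ereports nor inflate the child vdev's
 * checksum error count.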
1841 */ 1842 static void 1843 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) 1844 { 1845 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 1846 1847 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) && 1848 zio->io_priority != ZIO_PRIORITY_REBUILD) { 1849 zio_bad_cksum_t zbc; 1850 raidz_map_t *rm = zio->io_vsd; 1851 1852 zbc.zbc_has_cksum = 0; 1853 zbc.zbc_injected = rm->rm_ecksuminjected; 1854 1855 (void) zfs_ereport_post_checksum(zio->io_spa, vd, 1856 &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, 1857 rc->rc_abd, bad_data, &zbc); 1858 mutex_enter(&vd->vdev_stat_lock); 1859 vd->vdev_stat.vs_checksum_errors++; 1860 mutex_exit(&vd->vdev_stat_lock); 1861 } 1862 } 1863 1864 /* 1865 * We keep track of whether or not there were any injected errors, so that 1866 * any ereports we generate can note it. 1867 */ 1868 static int 1869 raidz_checksum_verify(zio_t *zio) 1870 { 1871 zio_bad_cksum_t zbc; 1872 raidz_map_t *rm = zio->io_vsd; 1873 1874 bzero(&zbc, sizeof (zio_bad_cksum_t)); 1875 1876 int ret = zio_checksum_error(zio, &zbc); 1877 if (ret != 0 && zbc.zbc_injected != 0) 1878 rm->rm_ecksuminjected = 1; 1879 1880 return (ret); 1881 } 1882 1883 /* 1884 * Generate the parity from the data columns. If we tried and were able to 1885 * read the parity without error, verify that the generated parity matches the 1886 * data we read. If it doesn't, we fire off a checksum error. Return the 1887 * number of such failures. 1888 */ 1889 static int 1890 raidz_parity_verify(zio_t *zio, raidz_row_t *rr) 1891 { 1892 abd_t *orig[VDEV_RAIDZ_MAXPARITY]; 1893 int c, ret = 0; 1894 raidz_map_t *rm = zio->io_vsd; 1895 raidz_col_t *rc; 1896 1897 blkptr_t *bp = zio->io_bp; 1898 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : 1899 (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); 1900 1901 if (checksum == ZIO_CHECKSUM_NOPARITY) 1902 return (ret); 1903 1904 for (c = 0; c < rr->rr_firstdatacol; c++) { 1905 rc = &rr->rr_col[c]; 1906 if (!rc->rc_tried || rc->rc_error != 0) 1907 continue; 1908 1909 orig[c] = abd_alloc_sametype(rc->rc_abd, rc->rc_size); 1910 abd_copy(orig[c], rc->rc_abd, rc->rc_size); 1911 } 1912 1913 /* 1914 * Regenerates parity even for !tried||rc_error!=0 columns. This 1915 * isn't harmful but it does have the side effect of fixing stuff 1916 * we didn't realize was necessary (i.e. even if we return 0). 
1917 */ 1918 vdev_raidz_generate_parity_row(rm, rr); 1919 1920 for (c = 0; c < rr->rr_firstdatacol; c++) { 1921 rc = &rr->rr_col[c]; 1922 1923 if (!rc->rc_tried || rc->rc_error != 0) 1924 continue; 1925 1926 if (abd_cmp(orig[c], rc->rc_abd) != 0) { 1927 raidz_checksum_error(zio, rc, orig[c]); 1928 rc->rc_error = SET_ERROR(ECKSUM); 1929 ret++; 1930 } 1931 abd_free(orig[c]); 1932 } 1933 1934 return (ret); 1935 } 1936 1937 static int 1938 vdev_raidz_worst_error(raidz_row_t *rr) 1939 { 1940 int error = 0; 1941 1942 for (int c = 0; c < rr->rr_cols; c++) 1943 error = zio_worst_error(error, rr->rr_col[c].rc_error); 1944 1945 return (error); 1946 } 1947 1948 static void 1949 vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) 1950 { 1951 int unexpected_errors = 0; 1952 int parity_errors = 0; 1953 int parity_untried = 0; 1954 int data_errors = 0; 1955 1956 ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); 1957 1958 for (int c = 0; c < rr->rr_cols; c++) { 1959 raidz_col_t *rc = &rr->rr_col[c]; 1960 1961 if (rc->rc_error) { 1962 if (c < rr->rr_firstdatacol) 1963 parity_errors++; 1964 else 1965 data_errors++; 1966 1967 if (!rc->rc_skipped) 1968 unexpected_errors++; 1969 } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { 1970 parity_untried++; 1971 } 1972 } 1973 1974 /* 1975 * If we read more parity disks than were used for 1976 * reconstruction, confirm that the other parity disks produced 1977 * correct data. 1978 * 1979 * Note that we also regenerate parity when resilvering so we 1980 * can write it out to failed devices later. 1981 */ 1982 if (parity_errors + parity_untried < 1983 rr->rr_firstdatacol - data_errors || 1984 (zio->io_flags & ZIO_FLAG_RESILVER)) { 1985 int n = raidz_parity_verify(zio, rr); 1986 unexpected_errors += n; 1987 ASSERT3U(parity_errors + n, <=, rr->rr_firstdatacol); 1988 } 1989 1990 if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 1991 (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) { 1992 /* 1993 * Use the good data we have in hand to repair damaged children. 1994 */ 1995 for (int c = 0; c < rr->rr_cols; c++) { 1996 raidz_col_t *rc = &rr->rr_col[c]; 1997 vdev_t *vd = zio->io_vd; 1998 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 1999 2000 if ((rc->rc_error == 0 || rc->rc_size == 0) && 2001 (rc->rc_repair == 0)) { 2002 continue; 2003 } 2004 2005 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2006 rc->rc_offset, rc->rc_abd, rc->rc_size, 2007 ZIO_TYPE_WRITE, 2008 zio->io_priority == ZIO_PRIORITY_REBUILD ? 2009 ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE, 2010 ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 
	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (int c = 0; c < rr->rr_cols; c++) {
			raidz_col_t *rc = &rr->rr_col[c];
			vdev_t *vd = zio->io_vd;
			vdev_t *cvd = vd->vdev_child[rc->rc_devidx];

			if ((rc->rc_error == 0 || rc->rc_size == 0) &&
			    (rc->rc_repair == 0)) {
				continue;
			}

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    ZIO_TYPE_WRITE,
			    zio->io_priority == ZIO_PRIORITY_REBUILD ?
			    ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}

static void
raidz_restore_orig_data(raidz_map_t *rm)
{
	for (int i = 0; i < rm->rm_nrows; i++) {
		raidz_row_t *rr = rm->rm_row[i];
		for (int c = 0; c < rr->rr_cols; c++) {
			raidz_col_t *rc = &rr->rr_col[c];
			if (rc->rc_need_orig_restore) {
				abd_copy_from_buf(rc->rc_abd,
				    rc->rc_orig_data, rc->rc_size);
				rc->rc_need_orig_restore = B_FALSE;
			}
		}
	}
}

/*
 * returns EINVAL if reconstruction of the block will not be possible
 * returns ECKSUM if this specific reconstruction failed
 * returns 0 on successful reconstruction
 */
static int
raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
{
	raidz_map_t *rm = zio->io_vsd;

	/* Reconstruct each row */
	for (int r = 0; r < rm->rm_nrows; r++) {
		raidz_row_t *rr = rm->rm_row[r];
		int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */
		int t = 0;
		int dead = 0;
		int dead_data = 0;

		for (int c = 0; c < rr->rr_cols; c++) {
			raidz_col_t *rc = &rr->rr_col[c];
			ASSERT0(rc->rc_need_orig_restore);
			if (rc->rc_error != 0) {
				dead++;
				if (c >= nparity)
					dead_data++;
				continue;
			}
			if (rc->rc_size == 0)
				continue;
			for (int lt = 0; lt < ntgts; lt++) {
				if (rc->rc_devidx == ltgts[lt]) {
					if (rc->rc_orig_data == NULL) {
						rc->rc_orig_data =
						    zio_buf_alloc(rc->rc_size);
						abd_copy_to_buf(
						    rc->rc_orig_data,
						    rc->rc_abd, rc->rc_size);
					}
					rc->rc_need_orig_restore = B_TRUE;

					dead++;
					if (c >= nparity)
						dead_data++;
					my_tgts[t++] = c;
					break;
				}
			}
		}
		if (dead > nparity) {
			/* reconstruction not possible */
			raidz_restore_orig_data(rm);
			return (EINVAL);
		}
		rr->rr_code = 0;
		if (dead_data > 0)
			rr->rr_code = vdev_raidz_reconstruct_row(rm, rr,
			    my_tgts, t);
	}

	/* Check for success */
	if (raidz_checksum_verify(zio) == 0) {

		/* Reconstruction succeeded - report errors */
		for (int i = 0; i < rm->rm_nrows; i++) {
			raidz_row_t *rr = rm->rm_row[i];

			for (int c = 0; c < rr->rr_cols; c++) {
				raidz_col_t *rc = &rr->rr_col[c];
				if (rc->rc_need_orig_restore) {
					/*
					 * Note: if this is a parity column,
					 * we don't really know if it's wrong.
					 * We need to let
					 * vdev_raidz_io_done_verified() check
					 * it, and if we set rc_error, it will
					 * think that it is a "known" error
					 * that doesn't need to be checked
					 * or corrected.
					 */
					if (rc->rc_error == 0 &&
					    c >= rr->rr_firstdatacol) {
						raidz_checksum_error(zio,
						    rc, rc->rc_gdata);
						rc->rc_error =
						    SET_ERROR(ECKSUM);
					}
					rc->rc_need_orig_restore = B_FALSE;
				}
			}

			vdev_raidz_io_done_verified(zio, rr);
		}

		zio_checksum_verified(zio);

		return (0);
	}

	/* Reconstruction failed - restore original data */
	raidz_restore_orig_data(rm);
	return (ECKSUM);
}
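/*
 * Combinatorial reconstruction (vdev_raidz_combrec() below) is speculative:
 * raidz_reconstruct() overwrites the candidate columns in place, so before
 * doing so it stashes the data that was actually read from disk in
 * rc_orig_data and marks the column rc_need_orig_restore.  If the attempt
 * does not produce a block with a valid checksum, raidz_restore_orig_data()
 * puts the on-disk contents back so the next combination starts from the
 * original data.
 */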
/*
 * Iterate over all combinations of N bad vdevs and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 *
 * The order that we find the various possible combinations of failed
 * disks is dictated by these rules:
 * - Examine each "slot" (the "i" in tgts[i])
 *   - Try to increment this slot (tgts[i] = tgts[i] + 1)
 *     - if we can't increment because it runs into the next slot,
 *       reset our slot to the minimum, and examine the next slot
 *
 * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
 * 3 columns to reconstruct), we will generate the following sequence:
 *
 * STATE        ACTION
 * 0 1 2        special case: skip since these are all parity
 * 0 1 3        first slot: reset to 0; middle slot: increment to 2
 * 0 2 3        first slot: increment to 1
 * 1 2 3        first: reset to 0; middle: reset to 1; last: increment to 4
 * 0 1 4        first: reset to 0; middle: increment to 2
 * 0 2 4        first: increment to 1
 * 1 2 4        first: reset to 0; middle: increment to 3
 * 0 3 4        first: increment to 1
 * 1 3 4        first: increment to 2
 * 2 3 4        first: reset to 0; middle: reset to 1; last: increment to 5
 * 0 1 5        first: reset to 0; middle: increment to 2
 * 0 2 5        first: increment to 1
 * 1 2 5        first: reset to 0; middle: increment to 3
 * 0 3 5        first: increment to 1
 * 1 3 5        first: increment to 2
 * 2 3 5        first: reset to 0; middle: increment to 4
 * 0 4 5        first: increment to 1
 * 1 4 5        first: increment to 2
 * 2 4 5        first: increment to 3
 * 3 4 5        done
 *
 * This strategy works for dRAID but is less efficient when there are a large
 * number of child vdevs and therefore permutations to check. Furthermore,
 * since the raidz_map_t rows likely do not overlap, reconstruction would be
 * possible as long as there are no more than nparity data errors per row.
 * These additional permutations are not currently checked but could be as
 * a future improvement.
 */
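/*
 * For illustration only, the "compute next targets" step described above,
 * stripped of the reconstruction work itself, is roughly:
 *
 *	for (t = 0; ; t++) {
 *		ltgts[t]++;
 *		if (ltgts[t] == n)
 *			break;			(no more combinations)
 *		if (ltgts[t] != ltgts[t + 1])
 *			break;			(found the next combination)
 *		ltgts[t] = ltgts[t - 1] + 1;	(reset this slot and carry)
 *	}
 *
 * The sentinels ltgts[-1] == -1 and ltgts[num_failures] == n make the reset
 * and termination checks uniform, which is why tstore[] below is sized
 * VDEV_RAIDZ_MAXPARITY + 2.
 */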
static int
vdev_raidz_combrec(zio_t *zio)
{
	int nparity = vdev_get_nparity(zio->io_vd);
	raidz_map_t *rm = zio->io_vsd;

	/* Check if there's enough data to attempt reconstruction. */
	for (int i = 0; i < rm->rm_nrows; i++) {
		raidz_row_t *rr = rm->rm_row[i];
		int total_errors = 0;

		for (int c = 0; c < rr->rr_cols; c++) {
			if (rr->rr_col[c].rc_error)
				total_errors++;
		}

		if (total_errors > nparity)
			return (vdev_raidz_worst_error(rr));
	}

	for (int num_failures = 1; num_failures <= nparity; num_failures++) {
		int tstore[VDEV_RAIDZ_MAXPARITY + 2];
		int *ltgts = &tstore[1]; /* value is logical child ID */

		/* Determine number of logical children, n */
		int n = zio->io_vd->vdev_children;

		ASSERT3U(num_failures, <=, nparity);
		ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);

		/* Handle corner cases in combrec logic */
		ltgts[-1] = -1;
		for (int i = 0; i < num_failures; i++) {
			ltgts[i] = i;
		}
		ltgts[num_failures] = n;

		for (;;) {
			int err = raidz_reconstruct(zio, ltgts, num_failures,
			    nparity);
			if (err == EINVAL) {
				/*
				 * Reconstruction not possible with this #
				 * failures; try more failures.
				 */
				break;
			} else if (err == 0)
				return (0);

			/* Compute next targets to try */
			for (int t = 0; ; t++) {
				ASSERT3U(t, <, num_failures);
				ltgts[t]++;
				if (ltgts[t] == n) {
					/* try more failures */
					ASSERT3U(t, ==, num_failures - 1);
					break;
				}

				ASSERT3U(ltgts[t], <, n);
				ASSERT3U(ltgts[t], <=, ltgts[t + 1]);

				/*
				 * If that spot is available, we're done here.
				 * Try the next combination.
				 */
				if (ltgts[t] != ltgts[t + 1])
					break;

				/*
				 * Otherwise, reset this tgt to the minimum,
				 * and move on to the next tgt.
				 */
				ltgts[t] = ltgts[t - 1] + 1;
				ASSERT3U(ltgts[t], ==, t);
			}

			/* Increase the number of failures and keep trying. */
			if (ltgts[num_failures - 1] == n)
				break;
		}
	}

	return (ECKSUM);
}

void
vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
{
	for (uint64_t row = 0; row < rm->rm_nrows; row++) {
		raidz_row_t *rr = rm->rm_row[row];
		vdev_raidz_reconstruct_row(rm, rr, t, nt);
	}
}

/*
 * Complete a write IO operation on a RAIDZ VDev
 *
 * Outline:
 * 1. Check for errors on the child IOs.
 * 2. Return, setting an error code if too few child VDevs were written
 *    to reconstruct the data later.  Note that partial writes are
 *    considered successful if they can be reconstructed at all.
 */
static void
vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
{
	int total_errors = 0;

	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);

	for (int c = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			total_errors++;
		}
	}

	/*
	 * Treat partial writes as a success. If we couldn't write enough
	 * columns to reconstruct the data, the I/O failed.  Otherwise,
	 * good enough.
	 *
	 * Now that we support write reallocation, it would be better
	 * to treat partial failure as real failure unless there are
	 * no non-degraded top-level vdevs left, and not update DTLs
	 * if we intend to reallocate.
	 */
	if (total_errors > rr->rr_firstdatacol) {
		zio->io_error = zio_worst_error(zio->io_error,
		    vdev_raidz_worst_error(rr));
	}
}
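/*
 * Read completion (see vdev_raidz_io_done() below) proceeds in stages:
 * first reconstruct any columns whose child reads failed outright, then
 * verify the block checksum.  Only if the checksum still fails are the
 * remaining columns read in and combinatorial reconstruction attempted,
 * and only if that also fails is the block declared unrecoverable and
 * checksum ereports started.
 */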
/*
 * return 0 if no reconstruction occurred, otherwise the "code" from
 * vdev_raidz_reconstruct().
 */
static int
vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
    raidz_row_t *rr)
{
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;
	int code = 0;

	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);

	for (int c = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rr->rr_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			total_errors++;
		} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * If there were data errors and the number of errors we saw was
	 * correctable -- less than or equal to the number of parity disks read
	 * -- reconstruct based on the missing data.
	 */
	if (data_errors != 0 &&
	    total_errors <= rr->rr_firstdatacol - parity_untried) {
		/*
		 * We either attempt to read all the parity columns or
		 * none of them. If we didn't try to read parity, we
		 * wouldn't be here in the correctable case. There must
		 * also have been fewer parity errors than parity
		 * columns or, again, we wouldn't be in this code path.
		 */
		ASSERT(parity_untried == 0);
		ASSERT(parity_errors < rr->rr_firstdatacol);

		/*
		 * Identify the data columns that reported an error.
		 */
		int n = 0;
		int tgts[VDEV_RAIDZ_MAXPARITY];
		for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
			raidz_col_t *rc = &rr->rr_col[c];
			if (rc->rc_error != 0) {
				ASSERT(n < VDEV_RAIDZ_MAXPARITY);
				tgts[n++] = c;
			}
		}

		ASSERT(rr->rr_firstdatacol >= n);

		code = vdev_raidz_reconstruct_row(rm, rr, tgts, n);
	}

	return (code);
}
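/*
 * vdev_raidz_read_all() below is used once checksum verification has
 * failed: it issues child reads for every column that has not yet been
 * tried, including parity and, for dRAID, any empty sectors a normal
 * read would skip, so the combinatorial pass that follows has as much
 * data and parity available as possible.
 */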
/*
 * Return the number of reads issued.
 */
static int
vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
{
	vdev_t *vd = zio->io_vd;
	int nread = 0;

	rr->rr_missingdata = 0;
	rr->rr_missingparity = 0;

	/*
	 * If this row contains empty sectors which are not required
	 * for a normal read then allocate an ABD for them now so they
	 * may be read, verified, and any needed repairs performed.
	 */
	if (rr->rr_nempty && rr->rr_abd_empty == NULL)
		vdev_draid_map_alloc_empty(zio, rr);

	for (int c = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		if (rc->rc_tried || rc->rc_size == 0)
			continue;

		zio_nowait(zio_vdev_child_io(zio, NULL,
		    vd->vdev_child[rc->rc_devidx],
		    rc->rc_offset, rc->rc_abd, rc->rc_size,
		    zio->io_type, zio->io_priority, 0,
		    vdev_raidz_child_done, rc));
		nread++;
	}
	return (nread);
}

/*
 * We're here because either there were too many errors to even attempt
 * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec()
 * failed. In either case, there is enough bad data to prevent reconstruction.
 * Start checksum ereports for all children which haven't failed.
 */
static void
vdev_raidz_io_done_unrecoverable(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	for (int i = 0; i < rm->rm_nrows; i++) {
		raidz_row_t *rr = rm->rm_row[i];

		for (int c = 0; c < rr->rr_cols; c++) {
			raidz_col_t *rc = &rr->rr_col[c];
			vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error != 0)
				continue;

			zio_bad_cksum_t zbc;
			zbc.zbc_has_cksum = 0;
			zbc.zbc_injected = rm->rm_ecksuminjected;

			(void) zfs_ereport_start_checksum(zio->io_spa,
			    cvd, &zio->io_bookmark, zio, rc->rc_offset,
			    rc->rc_size, (void *)(uintptr_t)c, &zbc);
			mutex_enter(&cvd->vdev_stat_lock);
			cvd->vdev_stat.vs_checksum_errors++;
			mutex_exit(&cvd->vdev_stat_lock);
		}
	}
}

void
vdev_raidz_io_done(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	if (zio->io_type == ZIO_TYPE_WRITE) {
		for (int i = 0; i < rm->rm_nrows; i++) {
			vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
		}
	} else {
		for (int i = 0; i < rm->rm_nrows; i++) {
			raidz_row_t *rr = rm->rm_row[i];
			rr->rr_code =
			    vdev_raidz_io_done_reconstruct_known_missing(zio,
			    rm, rr);
		}

		if (raidz_checksum_verify(zio) == 0) {
			for (int i = 0; i < rm->rm_nrows; i++) {
				raidz_row_t *rr = rm->rm_row[i];
				vdev_raidz_io_done_verified(zio, rr);
			}
			zio_checksum_verified(zio);
		} else {
			/*
			 * A sequential resilver has no checksum which makes
			 * combinatorial reconstruction impossible. This code
			 * path is unreachable since raidz_checksum_verify()
			 * has no checksum to verify and must succeed.
			 */
			ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD);

			/*
			 * This isn't a typical situation -- either we got a
			 * read error or a child silently returned bad data.
			 * Read every block so we can try again with as much
			 * data and parity as we can track down. If we've
			 * already been through once before, all children will
			 * be marked as tried so we'll proceed to combinatorial
			 * reconstruction.
			 */
			int nread = 0;
			for (int i = 0; i < rm->rm_nrows; i++) {
				nread += vdev_raidz_read_all(zio,
				    rm->rm_row[i]);
			}
			if (nread != 0) {
				/*
				 * Normally our stage is VDEV_IO_DONE, but if
				 * we've already called redone(), it will have
				 * changed to VDEV_IO_START, in which case we
				 * don't want to call redone() again.
				 */
				if (zio->io_stage != ZIO_STAGE_VDEV_IO_START)
					zio_vdev_io_redone(zio);
				return;
			}

			zio->io_error = vdev_raidz_combrec(zio);
			if (zio->io_error == ECKSUM &&
			    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
				vdev_raidz_io_done_unrecoverable(zio);
			}
		}
	}
}
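/*
 * Fold the health of our children into the state of the raidz vdev
 * itself: with more faulted children than parity devices the vdev can't
 * be opened, with any faulted or degraded children it is degraded, and
 * otherwise it is healthy.  For example, a raidz2 (vd_nparity == 2) with
 * three faulted children reports VDEV_STATE_CANT_OPEN, while one or two
 * faulted children leave it at VDEV_STATE_DEGRADED.
 */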
static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
	vdev_raidz_t *vdrz = vd->vdev_tsd;
	if (faulted > vdrz->vd_nparity)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_NO_REPLICAS);
	else if (degraded + faulted != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	else
		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}

/*
 * Determine if any portion of the provided block resides on a child vdev
 * with a dirty DTL and therefore needs to be resilvered.  The function
 * assumes that at least one DTL is dirty which implies that full stripe
 * width blocks must be resilvered.
 */
static boolean_t
vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
    uint64_t phys_birth)
{
	vdev_raidz_t *vdrz = vd->vdev_tsd;
	uint64_t dcols = vd->vdev_children;
	uint64_t nparity = vdrz->vd_nparity;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = DVA_GET_OFFSET(dva) >> ashift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = ((psize - 1) >> ashift) + 1;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;

	/* Unreachable by sequential resilver. */
	ASSERT3U(phys_birth, !=, TXG_UNKNOWN);

	if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
		return (B_FALSE);

	if (s + nparity >= dcols)
		return (B_TRUE);

	for (uint64_t c = 0; c < s + nparity; c++) {
		uint64_t devidx = (f + c) % dcols;
		vdev_t *cvd = vd->vdev_child[devidx];

		/*
		 * dsl_scan_need_resilver() already checked vd with
		 * vdev_dtl_contains(). So here just check cvd with
		 * vdev_dtl_empty(), cheaper and a good approximation.
		 */
		if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
			return (B_TRUE);
	}

	return (B_FALSE);
}
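/*
 * Translate a logical (top-level raidz) offset range into the physical
 * range it occupies on one child vdev.  Sectors of the parent are laid
 * out round-robin across the children, so the first row of child
 * "tgt_col" at or after parent sector b is ((b - tgt_col - 1) / width) + 1
 * when b > tgt_col, and 0 otherwise.  For example, with four children,
 * an ashift of 9 and cvd->vdev_id == 2, a logical range starting at
 * parent sector 10 maps to a physical range starting at row 2 on that
 * child, i.e. offset 2 << 9 == 1024.
 */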
static void
vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs,
    range_seg64_t *physical_rs, range_seg64_t *remain_rs)
{
	vdev_t *raidvd = cvd->vdev_parent;
	ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);

	uint64_t width = raidvd->vdev_children;
	uint64_t tgt_col = cvd->vdev_id;
	uint64_t ashift = raidvd->vdev_top->vdev_ashift;

	/* make sure the offsets are block-aligned */
	ASSERT0(logical_rs->rs_start % (1 << ashift));
	ASSERT0(logical_rs->rs_end % (1 << ashift));
	uint64_t b_start = logical_rs->rs_start >> ashift;
	uint64_t b_end = logical_rs->rs_end >> ashift;

	uint64_t start_row = 0;
	if (b_start > tgt_col) /* avoid underflow */
		start_row = ((b_start - tgt_col - 1) / width) + 1;

	uint64_t end_row = 0;
	if (b_end > tgt_col)
		end_row = ((b_end - tgt_col - 1) / width) + 1;

	physical_rs->rs_start = start_row << ashift;
	physical_rs->rs_end = end_row << ashift;

	ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start);
	ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
	    logical_rs->rs_end - logical_rs->rs_start);
}

/*
 * Initialize private RAIDZ specific fields from the nvlist.
 */
static int
vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
{
	vdev_raidz_t *vdrz;
	uint64_t nparity;

	uint_t children;
	nvlist_t **child;
	int error = nvlist_lookup_nvlist_array(nv,
	    ZPOOL_CONFIG_CHILDREN, &child, &children);
	if (error != 0)
		return (SET_ERROR(EINVAL));

	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) {
		if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
			return (SET_ERROR(EINVAL));

		/*
		 * Previous versions could only support 1 or 2 parity
		 * devices.
		 */
		if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2)
			return (SET_ERROR(EINVAL));
		else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3)
			return (SET_ERROR(EINVAL));
	} else {
		/*
		 * We require the parity to be specified for SPAs that
		 * support multiple parity levels.
		 */
		if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
			return (SET_ERROR(EINVAL));

		/*
		 * Otherwise, we default to 1 parity device for RAID-Z.
		 */
		nparity = 1;
	}

	vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
	vdrz->vd_logical_width = children;
	vdrz->vd_nparity = nparity;

	*tsd = vdrz;

	return (0);
}

static void
vdev_raidz_fini(vdev_t *vd)
{
	kmem_free(vd->vdev_tsd, sizeof (vdev_raidz_t));
}

/*
 * Add RAIDZ specific fields to the config nvlist.
 */
static void
vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
{
	ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
	vdev_raidz_t *vdrz = vd->vdev_tsd;

	/*
	 * Make sure someone hasn't managed to sneak a fancy new vdev
	 * into a crufty old storage pool.
	 */
	ASSERT(vdrz->vd_nparity == 1 ||
	    (vdrz->vd_nparity <= 2 &&
	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) ||
	    (vdrz->vd_nparity <= 3 &&
	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3));

	/*
	 * Note that we'll add these even on storage pools where they
	 * aren't strictly required -- older software will just ignore
	 * them.
	 */
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);
}

static uint64_t
vdev_raidz_nparity(vdev_t *vd)
{
	vdev_raidz_t *vdrz = vd->vdev_tsd;
	return (vdrz->vd_nparity);
}

static uint64_t
vdev_raidz_ndisks(vdev_t *vd)
{
	return (vd->vdev_children);
}

vdev_ops_t vdev_raidz_ops = {
	.vdev_op_init = vdev_raidz_init,
	.vdev_op_fini = vdev_raidz_fini,
	.vdev_op_open = vdev_raidz_open,
	.vdev_op_close = vdev_raidz_close,
	.vdev_op_asize = vdev_raidz_asize,
	.vdev_op_min_asize = vdev_raidz_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_raidz_io_start,
	.vdev_op_io_done = vdev_raidz_io_done,
	.vdev_op_state_change = vdev_raidz_state_change,
	.vdev_op_need_resilver = vdev_raidz_need_resilver,
	.vdev_op_hold = NULL,
	.vdev_op_rele = NULL,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_raidz_xlate,
	.vdev_op_rebuild_asize = NULL,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = vdev_raidz_config_generate,
	.vdev_op_nparity = vdev_raidz_nparity,
	.vdev_op_ndisks = vdev_raidz_ndisks,
	.vdev_op_type = VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
};