1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2012, 2020 by Delphix. All rights reserved. 25 * Copyright (c) 2016 Gvozden Nešković. All rights reserved. 26 */ 27 28 #include <sys/zfs_context.h> 29 #include <sys/spa.h> 30 #include <sys/vdev_impl.h> 31 #include <sys/zio.h> 32 #include <sys/zio_checksum.h> 33 #include <sys/abd.h> 34 #include <sys/fs/zfs.h> 35 #include <sys/fm/fs/zfs.h> 36 #include <sys/vdev_raidz.h> 37 #include <sys/vdev_raidz_impl.h> 38 #include <sys/vdev_draid.h> 39 40 #ifdef ZFS_DEBUG 41 #include <sys/vdev.h> /* For vdev_xlate() in vdev_raidz_io_verify() */ 42 #endif 43 44 /* 45 * Virtual device vector for RAID-Z. 46 * 47 * This vdev supports single, double, and triple parity. For single parity, 48 * we use a simple XOR of all the data columns. For double or triple parity, 49 * we use a special case of Reed-Solomon coding. This extends the 50 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by 51 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for 52 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the 53 * former is also based. The latter is designed to provide higher performance 54 * for writes. 55 * 56 * Note that the Plank paper claimed to support arbitrary N+M, but was then 57 * amended six years later identifying a critical flaw that invalidates its 58 * claims. Nevertheless, the technique can be adapted to work for up to 59 * triple parity. For additional parity, the amendment "Note: Correction to 60 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding 61 * is viable, but the additional complexity means that write performance will 62 * suffer. 63 * 64 * All of the methods above operate on a Galois field, defined over the 65 * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements 66 * can be expressed with a single byte. Briefly, the operations on the 67 * field are defined as follows: 68 * 69 * o addition (+) is represented by a bitwise XOR 70 * o subtraction (-) is therefore identical to addition: A + B = A - B 71 * o multiplication of A by 2 is defined by the following bitwise expression: 72 * 73 * (A * 2)_7 = A_6 74 * (A * 2)_6 = A_5 75 * (A * 2)_5 = A_4 76 * (A * 2)_4 = A_3 + A_7 77 * (A * 2)_3 = A_2 + A_7 78 * (A * 2)_2 = A_1 + A_7 79 * (A * 2)_1 = A_0 80 * (A * 2)_0 = A_7 81 * 82 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). 83 * As an aside, this multiplication is derived from the error correcting 84 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. 
85 * 86 * Observe that any number in the field (except for 0) can be expressed as a 87 * power of 2 -- a generator for the field. We store a table of the powers of 88 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can 89 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather 90 * than field addition). The inverse of a field element A (A^-1) is therefore 91 * A ^ (255 - 1) = A^254. 92 * 93 * The up-to-three parity columns, P, Q, R over several data columns, 94 * D_0, ... D_n-1, can be expressed by field operations: 95 * 96 * P = D_0 + D_1 + ... + D_n-2 + D_n-1 97 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 98 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 99 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 100 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 101 * 102 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial 103 * XOR operation, and 2 and 4 can be computed quickly and generate linearly- 104 * independent coefficients. (There are no additional coefficients that have 105 * this property which is why the uncorrected Plank method breaks down.) 106 * 107 * See the reconstruction code below for how P, Q and R can used individually 108 * or in concert to recover missing data columns. 109 */ 110 111 #define VDEV_RAIDZ_P 0 112 #define VDEV_RAIDZ_Q 1 113 #define VDEV_RAIDZ_R 2 114 115 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) 116 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) 117 118 /* 119 * We provide a mechanism to perform the field multiplication operation on a 120 * 64-bit value all at once rather than a byte at a time. This works by 121 * creating a mask from the top bit in each byte and using that to 122 * conditionally apply the XOR of 0x1d. 123 */ 124 #define VDEV_RAIDZ_64MUL_2(x, mask) \ 125 { \ 126 (mask) = (x) & 0x8080808080808080ULL; \ 127 (mask) = ((mask) << 1) - ((mask) >> 7); \ 128 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ 129 ((mask) & 0x1d1d1d1d1d1d1d1dULL); \ 130 } 131 132 #define VDEV_RAIDZ_64MUL_4(x, mask) \ 133 { \ 134 VDEV_RAIDZ_64MUL_2((x), mask); \ 135 VDEV_RAIDZ_64MUL_2((x), mask); \ 136 } 137 138 static void 139 vdev_raidz_row_free(raidz_row_t *rr) 140 { 141 for (int c = 0; c < rr->rr_cols; c++) { 142 raidz_col_t *rc = &rr->rr_col[c]; 143 144 if (rc->rc_size != 0) 145 abd_free(rc->rc_abd); 146 if (rc->rc_orig_data != NULL) 147 abd_free(rc->rc_orig_data); 148 } 149 150 if (rr->rr_abd_empty != NULL) 151 abd_free(rr->rr_abd_empty); 152 153 kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols])); 154 } 155 156 void 157 vdev_raidz_map_free(raidz_map_t *rm) 158 { 159 for (int i = 0; i < rm->rm_nrows; i++) 160 vdev_raidz_row_free(rm->rm_row[i]); 161 162 kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows])); 163 } 164 165 static void 166 vdev_raidz_map_free_vsd(zio_t *zio) 167 { 168 raidz_map_t *rm = zio->io_vsd; 169 170 vdev_raidz_map_free(rm); 171 } 172 173 const zio_vsd_ops_t vdev_raidz_vsd_ops = { 174 .vsd_free = vdev_raidz_map_free_vsd, 175 }; 176 177 static void 178 vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift) 179 { 180 int c; 181 int nwrapped = 0; 182 uint64_t off = 0; 183 raidz_row_t *rr = rm->rm_row[0]; 184 185 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); 186 ASSERT3U(rm->rm_nrows, ==, 1); 187 188 /* 189 * Pad any parity columns with additional space to account for skip 190 * sectors. 
191 */ 192 if (rm->rm_skipstart < rr->rr_firstdatacol) { 193 ASSERT0(rm->rm_skipstart); 194 nwrapped = rm->rm_nskip; 195 } else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) { 196 nwrapped = 197 (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols; 198 } 199 200 /* 201 * Optional single skip sectors (rc_size == 0) will be handled in 202 * vdev_raidz_io_start_write(). 203 */ 204 int skipped = rr->rr_scols - rr->rr_cols; 205 206 /* Allocate buffers for the parity columns */ 207 for (c = 0; c < rr->rr_firstdatacol; c++) { 208 raidz_col_t *rc = &rr->rr_col[c]; 209 210 /* 211 * Parity columns will pad out a linear ABD to account for 212 * the skip sector. A linear ABD is used here because 213 * parity calculations use the ABD buffer directly to calculate 214 * parity. This avoids doing a memcpy back to the ABD after the 215 * parity has been calculated. By issuing the parity column 216 * with the skip sector we can reduce contention on the child 217 * VDEV queue locks (vq_lock). 218 */ 219 if (c < nwrapped) { 220 rc->rc_abd = abd_alloc_linear( 221 rc->rc_size + (1ULL << ashift), B_FALSE); 222 abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift); 223 skipped++; 224 } else { 225 rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); 226 } 227 } 228 229 for (off = 0; c < rr->rr_cols; c++) { 230 raidz_col_t *rc = &rr->rr_col[c]; 231 abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct, 232 zio->io_abd, off, rc->rc_size); 233 234 /* 235 * Generate I/O for skip sectors to improve aggregation 236 * continuity. We will use gang ABD's to reduce contention 237 * on the child VDEV queue locks (vq_lock) by issuing 238 * a single I/O that contains the data and skip sector. 239 * 240 * It is important to make sure that rc_size is not updated 241 * even though we are adding a skip sector to the ABD. When 242 * calculating the parity in vdev_raidz_generate_parity_row() 243 * the rc_size is used to iterate through the ABD's. We can 244 * not have zero'd out skip sectors used for calculating 245 * parity for raidz, because those same sectors are not used 246 * during reconstruction. 247 */ 248 if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) { 249 rc->rc_abd = abd_alloc_gang(); 250 abd_gang_add(rc->rc_abd, abd, B_TRUE); 251 abd_gang_add(rc->rc_abd, 252 abd_get_zeros(1ULL << ashift), B_TRUE); 253 skipped++; 254 } else { 255 rc->rc_abd = abd; 256 } 257 off += rc->rc_size; 258 } 259 260 ASSERT3U(off, ==, zio->io_size); 261 ASSERT3S(skipped, ==, rm->rm_nskip); 262 } 263 264 static void 265 vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm) 266 { 267 int c; 268 raidz_row_t *rr = rm->rm_row[0]; 269 270 ASSERT3U(rm->rm_nrows, ==, 1); 271 272 /* Allocate buffers for the parity columns */ 273 for (c = 0; c < rr->rr_firstdatacol; c++) 274 rr->rr_col[c].rc_abd = 275 abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE); 276 277 for (uint64_t off = 0; c < rr->rr_cols; c++) { 278 raidz_col_t *rc = &rr->rr_col[c]; 279 rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct, 280 zio->io_abd, off, rc->rc_size); 281 off += rc->rc_size; 282 } 283 } 284 285 /* 286 * Divides the IO evenly across all child vdevs; usually, dcols is 287 * the number of children in the target vdev. 288 * 289 * Avoid inlining the function to keep vdev_raidz_io_start(), which 290 * is this functions only caller, as small as possible on the stack. 
291 */ 292 noinline raidz_map_t * 293 vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, 294 uint64_t nparity) 295 { 296 raidz_row_t *rr; 297 /* The starting RAIDZ (parent) vdev sector of the block. */ 298 uint64_t b = zio->io_offset >> ashift; 299 /* The zio's size in units of the vdev's minimum sector size. */ 300 uint64_t s = zio->io_size >> ashift; 301 /* The first column for this stripe. */ 302 uint64_t f = b % dcols; 303 /* The starting byte offset on each child vdev. */ 304 uint64_t o = (b / dcols) << ashift; 305 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; 306 307 raidz_map_t *rm = 308 kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP); 309 rm->rm_nrows = 1; 310 311 /* 312 * "Quotient": The number of data sectors for this stripe on all but 313 * the "big column" child vdevs that also contain "remainder" data. 314 */ 315 q = s / (dcols - nparity); 316 317 /* 318 * "Remainder": The number of partial stripe data sectors in this I/O. 319 * This will add a sector to some, but not all, child vdevs. 320 */ 321 r = s - q * (dcols - nparity); 322 323 /* The number of "big columns" - those which contain remainder data. */ 324 bc = (r == 0 ? 0 : r + nparity); 325 326 /* 327 * The total number of data and parity sectors associated with 328 * this I/O. 329 */ 330 tot = s + nparity * (q + (r == 0 ? 0 : 1)); 331 332 /* 333 * acols: The columns that will be accessed. 334 * scols: The columns that will be accessed or skipped. 335 */ 336 if (q == 0) { 337 /* Our I/O request doesn't span all child vdevs. */ 338 acols = bc; 339 scols = MIN(dcols, roundup(bc, nparity + 1)); 340 } else { 341 acols = dcols; 342 scols = dcols; 343 } 344 345 ASSERT3U(acols, <=, scols); 346 347 rr = kmem_alloc(offsetof(raidz_row_t, rr_col[scols]), KM_SLEEP); 348 rm->rm_row[0] = rr; 349 350 rr->rr_cols = acols; 351 rr->rr_scols = scols; 352 rr->rr_bigcols = bc; 353 rr->rr_missingdata = 0; 354 rr->rr_missingparity = 0; 355 rr->rr_firstdatacol = nparity; 356 rr->rr_abd_empty = NULL; 357 rr->rr_nempty = 0; 358 #ifdef ZFS_DEBUG 359 rr->rr_offset = zio->io_offset; 360 rr->rr_size = zio->io_size; 361 #endif 362 363 asize = 0; 364 365 for (c = 0; c < scols; c++) { 366 raidz_col_t *rc = &rr->rr_col[c]; 367 col = f + c; 368 coff = o; 369 if (col >= dcols) { 370 col -= dcols; 371 coff += 1ULL << ashift; 372 } 373 rc->rc_devidx = col; 374 rc->rc_offset = coff; 375 rc->rc_abd = NULL; 376 rc->rc_orig_data = NULL; 377 rc->rc_error = 0; 378 rc->rc_tried = 0; 379 rc->rc_skipped = 0; 380 rc->rc_force_repair = 0; 381 rc->rc_allow_repair = 1; 382 rc->rc_need_orig_restore = B_FALSE; 383 384 if (c >= acols) 385 rc->rc_size = 0; 386 else if (c < bc) 387 rc->rc_size = (q + 1) << ashift; 388 else 389 rc->rc_size = q << ashift; 390 391 asize += rc->rc_size; 392 } 393 394 ASSERT3U(asize, ==, tot << ashift); 395 rm->rm_nskip = roundup(tot, nparity + 1) - tot; 396 rm->rm_skipstart = bc; 397 398 /* 399 * If all data stored spans all columns, there's a danger that parity 400 * will always be on the same device and, since parity isn't read 401 * during normal operation, that device's I/O bandwidth won't be 402 * used effectively. We therefore switch the parity every 1MB. 403 * 404 * ... at least that was, ostensibly, the theory. As a practical 405 * matter unless we juggle the parity between all devices evenly, we 406 * won't see any benefit. Further, occasional writes that aren't a 407 * multiple of the LCM of the number of children and the minimum 408 * stripe width are sufficient to avoid pessimal behavior. 
409 * Unfortunately, this decision created an implicit on-disk format 410 * requirement that we need to support for all eternity, but only 411 * for single-parity RAID-Z. 412 * 413 * If we intend to skip a sector in the zeroth column for padding 414 * we must make sure to note this swap. We will never intend to 415 * skip the first column since at least one data and one parity 416 * column must appear in each row. 417 */ 418 ASSERT(rr->rr_cols >= 2); 419 ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); 420 421 if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { 422 devidx = rr->rr_col[0].rc_devidx; 423 o = rr->rr_col[0].rc_offset; 424 rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; 425 rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; 426 rr->rr_col[1].rc_devidx = devidx; 427 rr->rr_col[1].rc_offset = o; 428 429 if (rm->rm_skipstart == 0) 430 rm->rm_skipstart = 1; 431 } 432 433 if (zio->io_type == ZIO_TYPE_WRITE) { 434 vdev_raidz_map_alloc_write(zio, rm, ashift); 435 } else { 436 vdev_raidz_map_alloc_read(zio, rm); 437 } 438 439 /* init RAIDZ parity ops */ 440 rm->rm_ops = vdev_raidz_math_get_ops(); 441 442 return (rm); 443 } 444 445 struct pqr_struct { 446 uint64_t *p; 447 uint64_t *q; 448 uint64_t *r; 449 }; 450 451 static int 452 vdev_raidz_p_func(void *buf, size_t size, void *private) 453 { 454 struct pqr_struct *pqr = private; 455 const uint64_t *src = buf; 456 int i, cnt = size / sizeof (src[0]); 457 458 ASSERT(pqr->p && !pqr->q && !pqr->r); 459 460 for (i = 0; i < cnt; i++, src++, pqr->p++) 461 *pqr->p ^= *src; 462 463 return (0); 464 } 465 466 static int 467 vdev_raidz_pq_func(void *buf, size_t size, void *private) 468 { 469 struct pqr_struct *pqr = private; 470 const uint64_t *src = buf; 471 uint64_t mask; 472 int i, cnt = size / sizeof (src[0]); 473 474 ASSERT(pqr->p && pqr->q && !pqr->r); 475 476 for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { 477 *pqr->p ^= *src; 478 VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 479 *pqr->q ^= *src; 480 } 481 482 return (0); 483 } 484 485 static int 486 vdev_raidz_pqr_func(void *buf, size_t size, void *private) 487 { 488 struct pqr_struct *pqr = private; 489 const uint64_t *src = buf; 490 uint64_t mask; 491 int i, cnt = size / sizeof (src[0]); 492 493 ASSERT(pqr->p && pqr->q && pqr->r); 494 495 for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { 496 *pqr->p ^= *src; 497 VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 498 *pqr->q ^= *src; 499 VDEV_RAIDZ_64MUL_4(*pqr->r, mask); 500 *pqr->r ^= *src; 501 } 502 503 return (0); 504 } 505 506 static void 507 vdev_raidz_generate_parity_p(raidz_row_t *rr) 508 { 509 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 510 511 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 512 abd_t *src = rr->rr_col[c].rc_abd; 513 514 if (c == rr->rr_firstdatacol) { 515 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 516 } else { 517 struct pqr_struct pqr = { p, NULL, NULL }; 518 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 519 vdev_raidz_p_func, &pqr); 520 } 521 } 522 } 523 524 static void 525 vdev_raidz_generate_parity_pq(raidz_row_t *rr) 526 { 527 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 528 uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 529 uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 530 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 531 rr->rr_col[VDEV_RAIDZ_Q].rc_size); 532 533 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 534 abd_t *src = rr->rr_col[c].rc_abd; 535 536 uint64_t ccnt = rr->rr_col[c].rc_size / 
sizeof (p[0]); 537 538 if (c == rr->rr_firstdatacol) { 539 ASSERT(ccnt == pcnt || ccnt == 0); 540 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 541 (void) memcpy(q, p, rr->rr_col[c].rc_size); 542 543 for (uint64_t i = ccnt; i < pcnt; i++) { 544 p[i] = 0; 545 q[i] = 0; 546 } 547 } else { 548 struct pqr_struct pqr = { p, q, NULL }; 549 550 ASSERT(ccnt <= pcnt); 551 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 552 vdev_raidz_pq_func, &pqr); 553 554 /* 555 * Treat short columns as though they are full of 0s. 556 * Note that there's therefore nothing needed for P. 557 */ 558 uint64_t mask; 559 for (uint64_t i = ccnt; i < pcnt; i++) { 560 VDEV_RAIDZ_64MUL_2(q[i], mask); 561 } 562 } 563 } 564 } 565 566 static void 567 vdev_raidz_generate_parity_pqr(raidz_row_t *rr) 568 { 569 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 570 uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 571 uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd); 572 uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 573 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 574 rr->rr_col[VDEV_RAIDZ_Q].rc_size); 575 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 576 rr->rr_col[VDEV_RAIDZ_R].rc_size); 577 578 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 579 abd_t *src = rr->rr_col[c].rc_abd; 580 581 uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); 582 583 if (c == rr->rr_firstdatacol) { 584 ASSERT(ccnt == pcnt || ccnt == 0); 585 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 586 (void) memcpy(q, p, rr->rr_col[c].rc_size); 587 (void) memcpy(r, p, rr->rr_col[c].rc_size); 588 589 for (uint64_t i = ccnt; i < pcnt; i++) { 590 p[i] = 0; 591 q[i] = 0; 592 r[i] = 0; 593 } 594 } else { 595 struct pqr_struct pqr = { p, q, r }; 596 597 ASSERT(ccnt <= pcnt); 598 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 599 vdev_raidz_pqr_func, &pqr); 600 601 /* 602 * Treat short columns as though they are full of 0s. 603 * Note that there's therefore nothing needed for P. 604 */ 605 uint64_t mask; 606 for (uint64_t i = ccnt; i < pcnt; i++) { 607 VDEV_RAIDZ_64MUL_2(q[i], mask); 608 VDEV_RAIDZ_64MUL_4(r[i], mask); 609 } 610 } 611 } 612 } 613 614 /* 615 * Generate RAID parity in the first virtual columns according to the number of 616 * parity columns available. 
617 */ 618 void 619 vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr) 620 { 621 ASSERT3U(rr->rr_cols, !=, 0); 622 623 /* Generate using the new math implementation */ 624 if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL) 625 return; 626 627 switch (rr->rr_firstdatacol) { 628 case 1: 629 vdev_raidz_generate_parity_p(rr); 630 break; 631 case 2: 632 vdev_raidz_generate_parity_pq(rr); 633 break; 634 case 3: 635 vdev_raidz_generate_parity_pqr(rr); 636 break; 637 default: 638 cmn_err(CE_PANIC, "invalid RAID-Z configuration"); 639 } 640 } 641 642 void 643 vdev_raidz_generate_parity(raidz_map_t *rm) 644 { 645 for (int i = 0; i < rm->rm_nrows; i++) { 646 raidz_row_t *rr = rm->rm_row[i]; 647 vdev_raidz_generate_parity_row(rm, rr); 648 } 649 } 650 651 /* ARGSUSED */ 652 static int 653 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) 654 { 655 uint64_t *dst = dbuf; 656 uint64_t *src = sbuf; 657 int cnt = size / sizeof (src[0]); 658 659 for (int i = 0; i < cnt; i++) { 660 dst[i] ^= src[i]; 661 } 662 663 return (0); 664 } 665 666 /* ARGSUSED */ 667 static int 668 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size, 669 void *private) 670 { 671 uint64_t *dst = dbuf; 672 uint64_t *src = sbuf; 673 uint64_t mask; 674 int cnt = size / sizeof (dst[0]); 675 676 for (int i = 0; i < cnt; i++, dst++, src++) { 677 VDEV_RAIDZ_64MUL_2(*dst, mask); 678 *dst ^= *src; 679 } 680 681 return (0); 682 } 683 684 /* ARGSUSED */ 685 static int 686 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private) 687 { 688 uint64_t *dst = buf; 689 uint64_t mask; 690 int cnt = size / sizeof (dst[0]); 691 692 for (int i = 0; i < cnt; i++, dst++) { 693 /* same operation as vdev_raidz_reconst_q_pre_func() on dst */ 694 VDEV_RAIDZ_64MUL_2(*dst, mask); 695 } 696 697 return (0); 698 } 699 700 struct reconst_q_struct { 701 uint64_t *q; 702 int exp; 703 }; 704 705 static int 706 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private) 707 { 708 struct reconst_q_struct *rq = private; 709 uint64_t *dst = buf; 710 int cnt = size / sizeof (dst[0]); 711 712 for (int i = 0; i < cnt; i++, dst++, rq->q++) { 713 int j; 714 uint8_t *b; 715 716 *dst ^= *rq->q; 717 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 718 *b = vdev_raidz_exp2(*b, rq->exp); 719 } 720 } 721 722 return (0); 723 } 724 725 struct reconst_pq_struct { 726 uint8_t *p; 727 uint8_t *q; 728 uint8_t *pxy; 729 uint8_t *qxy; 730 int aexp; 731 int bexp; 732 }; 733 734 static int 735 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private) 736 { 737 struct reconst_pq_struct *rpq = private; 738 uint8_t *xd = xbuf; 739 uint8_t *yd = ybuf; 740 741 for (int i = 0; i < size; 742 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) { 743 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 744 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 745 *yd = *rpq->p ^ *rpq->pxy ^ *xd; 746 } 747 748 return (0); 749 } 750 751 static int 752 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) 753 { 754 struct reconst_pq_struct *rpq = private; 755 uint8_t *xd = xbuf; 756 757 for (int i = 0; i < size; 758 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) { 759 /* same operation as vdev_raidz_reconst_pq_func() on xd */ 760 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 761 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 762 } 763 764 return (0); 765 } 766 767 static void 768 vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts) 769 { 
770 int x = tgts[0]; 771 abd_t *dst, *src; 772 773 ASSERT3U(ntgts, ==, 1); 774 ASSERT3U(x, >=, rr->rr_firstdatacol); 775 ASSERT3U(x, <, rr->rr_cols); 776 777 ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size); 778 779 src = rr->rr_col[VDEV_RAIDZ_P].rc_abd; 780 dst = rr->rr_col[x].rc_abd; 781 782 abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size); 783 784 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 785 uint64_t size = MIN(rr->rr_col[x].rc_size, 786 rr->rr_col[c].rc_size); 787 788 src = rr->rr_col[c].rc_abd; 789 790 if (c == x) 791 continue; 792 793 (void) abd_iterate_func2(dst, src, 0, 0, size, 794 vdev_raidz_reconst_p_func, NULL); 795 } 796 } 797 798 static void 799 vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts) 800 { 801 int x = tgts[0]; 802 int c, exp; 803 abd_t *dst, *src; 804 805 ASSERT(ntgts == 1); 806 807 ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size); 808 809 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 810 uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size, 811 rr->rr_col[c].rc_size); 812 813 src = rr->rr_col[c].rc_abd; 814 dst = rr->rr_col[x].rc_abd; 815 816 if (c == rr->rr_firstdatacol) { 817 abd_copy(dst, src, size); 818 if (rr->rr_col[x].rc_size > size) { 819 abd_zero_off(dst, size, 820 rr->rr_col[x].rc_size - size); 821 } 822 } else { 823 ASSERT3U(size, <=, rr->rr_col[x].rc_size); 824 (void) abd_iterate_func2(dst, src, 0, 0, size, 825 vdev_raidz_reconst_q_pre_func, NULL); 826 (void) abd_iterate_func(dst, 827 size, rr->rr_col[x].rc_size - size, 828 vdev_raidz_reconst_q_pre_tail_func, NULL); 829 } 830 } 831 832 src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; 833 dst = rr->rr_col[x].rc_abd; 834 exp = 255 - (rr->rr_cols - 1 - x); 835 836 struct reconst_q_struct rq = { abd_to_buf(src), exp }; 837 (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size, 838 vdev_raidz_reconst_q_post_func, &rq); 839 } 840 841 static void 842 vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) 843 { 844 uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; 845 abd_t *pdata, *qdata; 846 uint64_t xsize, ysize; 847 int x = tgts[0]; 848 int y = tgts[1]; 849 abd_t *xd, *yd; 850 851 ASSERT(ntgts == 2); 852 ASSERT(x < y); 853 ASSERT(x >= rr->rr_firstdatacol); 854 ASSERT(y < rr->rr_cols); 855 856 ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size); 857 858 /* 859 * Move the parity data aside -- we're going to compute parity as 860 * though columns x and y were full of zeros -- Pxy and Qxy. We want to 861 * reuse the parity generation mechanism without trashing the actual 862 * parity so we make those columns appear to be full of zeros by 863 * setting their lengths to zero. 
864 */ 865 pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd; 866 qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; 867 xsize = rr->rr_col[x].rc_size; 868 ysize = rr->rr_col[y].rc_size; 869 870 rr->rr_col[VDEV_RAIDZ_P].rc_abd = 871 abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE); 872 rr->rr_col[VDEV_RAIDZ_Q].rc_abd = 873 abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); 874 rr->rr_col[x].rc_size = 0; 875 rr->rr_col[y].rc_size = 0; 876 877 vdev_raidz_generate_parity_pq(rr); 878 879 rr->rr_col[x].rc_size = xsize; 880 rr->rr_col[y].rc_size = ysize; 881 882 p = abd_to_buf(pdata); 883 q = abd_to_buf(qdata); 884 pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 885 qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 886 xd = rr->rr_col[x].rc_abd; 887 yd = rr->rr_col[y].rc_abd; 888 889 /* 890 * We now have: 891 * Pxy = P + D_x + D_y 892 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 893 * 894 * We can then solve for D_x: 895 * D_x = A * (P + Pxy) + B * (Q + Qxy) 896 * where 897 * A = 2^(x - y) * (2^(x - y) + 1)^-1 898 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 899 * 900 * With D_x in hand, we can easily solve for D_y: 901 * D_y = P + Pxy + D_x 902 */ 903 904 a = vdev_raidz_pow2[255 + x - y]; 905 b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)]; 906 tmp = 255 - vdev_raidz_log2[a ^ 1]; 907 908 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 909 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 910 911 ASSERT3U(xsize, >=, ysize); 912 struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp }; 913 914 (void) abd_iterate_func2(xd, yd, 0, 0, ysize, 915 vdev_raidz_reconst_pq_func, &rpq); 916 (void) abd_iterate_func(xd, ysize, xsize - ysize, 917 vdev_raidz_reconst_pq_tail_func, &rpq); 918 919 abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 920 abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 921 922 /* 923 * Restore the saved parity data. 924 */ 925 rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata; 926 rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata; 927 } 928 929 /* BEGIN CSTYLED */ 930 /* 931 * In the general case of reconstruction, we must solve the system of linear 932 * equations defined by the coefficients used to generate parity as well as 933 * the contents of the data and parity disks. This can be expressed with 934 * vectors for the original data (D) and the actual data (d) and parity (p) 935 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 936 * 937 * __ __ __ __ 938 * | | __ __ | p_0 | 939 * | V | | D_0 | | p_m-1 | 940 * | | x | : | = | d_0 | 941 * | I | | D_n-1 | | : | 942 * | | ~~ ~~ | d_n-1 | 943 * ~~ ~~ ~~ ~~ 944 * 945 * I is simply a square identity matrix of size n, and V is a vandermonde 946 * matrix defined by the coefficients we chose for the various parity columns 947 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 948 * computation as well as linear separability. 949 * 950 * __ __ __ __ 951 * | 1 .. 1 1 1 | | p_0 | 952 * | 2^n-1 .. 4 2 1 | __ __ | : | 953 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 954 * | 1 .. 0 0 0 | | D_1 | | d_0 | 955 * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 956 * | : : : : | | : | | d_2 | 957 * | 0 .. 1 0 0 | | D_n-1 | | : | 958 * | 0 .. 0 1 0 | ~~ ~~ | : | 959 * | 0 .. 0 0 1 | | d_n-1 | 960 * ~~ ~~ ~~ ~~ 961 * 962 * Note that I, V, d, and p are known. To compute D, we must invert the 963 * matrix and use the known data and parity values to reconstruct the unknown 964 * data values. 
We begin by removing the rows in V|I and d|p that correspond 965 * to failed or missing columns; we then make V|I square (n x n) and d|p 966 * sized n by removing rows corresponding to unused parity from the bottom up 967 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 968 * using Gauss-Jordan elimination. In the example below we use m=3 parity 969 * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 970 * __ __ 971 * | 1 1 1 1 1 1 1 1 | 972 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 973 * | 19 205 116 29 64 16 4 1 | / / 974 * | 1 0 0 0 0 0 0 0 | / / 975 * | 0 1 0 0 0 0 0 0 | <--' / 976 * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 977 * | 0 0 0 1 0 0 0 0 | 978 * | 0 0 0 0 1 0 0 0 | 979 * | 0 0 0 0 0 1 0 0 | 980 * | 0 0 0 0 0 0 1 0 | 981 * | 0 0 0 0 0 0 0 1 | 982 * ~~ ~~ 983 * __ __ 984 * | 1 1 1 1 1 1 1 1 | 985 * | 128 64 32 16 8 4 2 1 | 986 * | 19 205 116 29 64 16 4 1 | 987 * | 1 0 0 0 0 0 0 0 | 988 * | 0 1 0 0 0 0 0 0 | 989 * (V|I)' = | 0 0 1 0 0 0 0 0 | 990 * | 0 0 0 1 0 0 0 0 | 991 * | 0 0 0 0 1 0 0 0 | 992 * | 0 0 0 0 0 1 0 0 | 993 * | 0 0 0 0 0 0 1 0 | 994 * | 0 0 0 0 0 0 0 1 | 995 * ~~ ~~ 996 * 997 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 998 * have carefully chosen the seed values 1, 2, and 4 to ensure that this 999 * matrix is not singular. 1000 * __ __ 1001 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1002 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1003 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1004 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1005 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1006 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1007 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1008 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1009 * ~~ ~~ 1010 * __ __ 1011 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1012 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1013 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1014 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1015 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1016 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1017 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1018 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1019 * ~~ ~~ 1020 * __ __ 1021 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1022 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1023 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 1024 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1025 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1026 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1027 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1028 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1029 * ~~ ~~ 1030 * __ __ 1031 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1032 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1033 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 1034 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1035 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1036 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1037 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1038 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1039 * ~~ ~~ 1040 * __ __ 1041 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1042 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1043 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1044 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1045 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1046 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1047 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1048 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1049 * ~~ ~~ 1050 * __ __ 1051 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1052 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 1053 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1054 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1055 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1056 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1057 * | 0 
0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1058 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1059 * ~~ ~~ 1060 * __ __ 1061 * | 0 0 1 0 0 0 0 0 | 1062 * | 167 100 5 41 159 169 217 208 | 1063 * | 166 100 4 40 158 168 216 209 | 1064 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 1065 * | 0 0 0 0 1 0 0 0 | 1066 * | 0 0 0 0 0 1 0 0 | 1067 * | 0 0 0 0 0 0 1 0 | 1068 * | 0 0 0 0 0 0 0 1 | 1069 * ~~ ~~ 1070 * 1071 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 1072 * of the missing data. 1073 * 1074 * As is apparent from the example above, the only non-trivial rows in the 1075 * inverse matrix correspond to the data disks that we're trying to 1076 * reconstruct. Indeed, those are the only rows we need as the others would 1077 * only be useful for reconstructing data known or assumed to be valid. For 1078 * that reason, we only build the coefficients in the rows that correspond to 1079 * targeted columns. 1080 */ 1081 /* END CSTYLED */ 1082 1083 static void 1084 vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map, 1085 uint8_t **rows) 1086 { 1087 int i, j; 1088 int pow; 1089 1090 ASSERT(n == rr->rr_cols - rr->rr_firstdatacol); 1091 1092 /* 1093 * Fill in the missing rows of interest. 1094 */ 1095 for (i = 0; i < nmap; i++) { 1096 ASSERT3S(0, <=, map[i]); 1097 ASSERT3S(map[i], <=, 2); 1098 1099 pow = map[i] * n; 1100 if (pow > 255) 1101 pow -= 255; 1102 ASSERT(pow <= 255); 1103 1104 for (j = 0; j < n; j++) { 1105 pow -= map[i]; 1106 if (pow < 0) 1107 pow += 255; 1108 rows[i][j] = vdev_raidz_pow2[pow]; 1109 } 1110 } 1111 } 1112 1113 static void 1114 vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing, 1115 uint8_t **rows, uint8_t **invrows, const uint8_t *used) 1116 { 1117 int i, j, ii, jj; 1118 uint8_t log; 1119 1120 /* 1121 * Assert that the first nmissing entries from the array of used 1122 * columns correspond to parity columns and that subsequent entries 1123 * correspond to data columns. 1124 */ 1125 for (i = 0; i < nmissing; i++) { 1126 ASSERT3S(used[i], <, rr->rr_firstdatacol); 1127 } 1128 for (; i < n; i++) { 1129 ASSERT3S(used[i], >=, rr->rr_firstdatacol); 1130 } 1131 1132 /* 1133 * First initialize the storage where we'll compute the inverse rows. 1134 */ 1135 for (i = 0; i < nmissing; i++) { 1136 for (j = 0; j < n; j++) { 1137 invrows[i][j] = (i == j) ? 1 : 0; 1138 } 1139 } 1140 1141 /* 1142 * Subtract all trivial rows from the rows of consequence. 1143 */ 1144 for (i = 0; i < nmissing; i++) { 1145 for (j = nmissing; j < n; j++) { 1146 ASSERT3U(used[j], >=, rr->rr_firstdatacol); 1147 jj = used[j] - rr->rr_firstdatacol; 1148 ASSERT3S(jj, <, n); 1149 invrows[i][j] = rows[i][jj]; 1150 rows[i][jj] = 0; 1151 } 1152 } 1153 1154 /* 1155 * For each of the rows of interest, we must normalize it and subtract 1156 * a multiple of it from the other rows. 1157 */ 1158 for (i = 0; i < nmissing; i++) { 1159 for (j = 0; j < missing[i]; j++) { 1160 ASSERT0(rows[i][j]); 1161 } 1162 ASSERT3U(rows[i][missing[i]], !=, 0); 1163 1164 /* 1165 * Compute the inverse of the first element and multiply each 1166 * element in the row by that value. 
1167 */ 1168 log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 1169 1170 for (j = 0; j < n; j++) { 1171 rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 1172 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 1173 } 1174 1175 for (ii = 0; ii < nmissing; ii++) { 1176 if (i == ii) 1177 continue; 1178 1179 ASSERT3U(rows[ii][missing[i]], !=, 0); 1180 1181 log = vdev_raidz_log2[rows[ii][missing[i]]]; 1182 1183 for (j = 0; j < n; j++) { 1184 rows[ii][j] ^= 1185 vdev_raidz_exp2(rows[i][j], log); 1186 invrows[ii][j] ^= 1187 vdev_raidz_exp2(invrows[i][j], log); 1188 } 1189 } 1190 } 1191 1192 /* 1193 * Verify that the data that is left in the rows are properly part of 1194 * an identity matrix. 1195 */ 1196 for (i = 0; i < nmissing; i++) { 1197 for (j = 0; j < n; j++) { 1198 if (j == missing[i]) { 1199 ASSERT3U(rows[i][j], ==, 1); 1200 } else { 1201 ASSERT0(rows[i][j]); 1202 } 1203 } 1204 } 1205 } 1206 1207 static void 1208 vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing, 1209 int *missing, uint8_t **invrows, const uint8_t *used) 1210 { 1211 int i, j, x, cc, c; 1212 uint8_t *src; 1213 uint64_t ccount; 1214 uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL }; 1215 uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 }; 1216 uint8_t log = 0; 1217 uint8_t val; 1218 int ll; 1219 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 1220 uint8_t *p, *pp; 1221 size_t psize; 1222 1223 psize = sizeof (invlog[0][0]) * n * nmissing; 1224 p = kmem_alloc(psize, KM_SLEEP); 1225 1226 for (pp = p, i = 0; i < nmissing; i++) { 1227 invlog[i] = pp; 1228 pp += n; 1229 } 1230 1231 for (i = 0; i < nmissing; i++) { 1232 for (j = 0; j < n; j++) { 1233 ASSERT3U(invrows[i][j], !=, 0); 1234 invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 1235 } 1236 } 1237 1238 for (i = 0; i < n; i++) { 1239 c = used[i]; 1240 ASSERT3U(c, <, rr->rr_cols); 1241 1242 ccount = rr->rr_col[c].rc_size; 1243 ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0); 1244 if (ccount == 0) 1245 continue; 1246 src = abd_to_buf(rr->rr_col[c].rc_abd); 1247 for (j = 0; j < nmissing; j++) { 1248 cc = missing[j] + rr->rr_firstdatacol; 1249 ASSERT3U(cc, >=, rr->rr_firstdatacol); 1250 ASSERT3U(cc, <, rr->rr_cols); 1251 ASSERT3U(cc, !=, c); 1252 1253 dcount[j] = rr->rr_col[cc].rc_size; 1254 if (dcount[j] != 0) 1255 dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd); 1256 } 1257 1258 for (x = 0; x < ccount; x++, src++) { 1259 if (*src != 0) 1260 log = vdev_raidz_log2[*src]; 1261 1262 for (cc = 0; cc < nmissing; cc++) { 1263 if (x >= dcount[cc]) 1264 continue; 1265 1266 if (*src == 0) { 1267 val = 0; 1268 } else { 1269 if ((ll = log + invlog[cc][i]) >= 255) 1270 ll -= 255; 1271 val = vdev_raidz_pow2[ll]; 1272 } 1273 1274 if (i == 0) 1275 dst[cc][x] = val; 1276 else 1277 dst[cc][x] ^= val; 1278 } 1279 } 1280 } 1281 1282 kmem_free(p, psize); 1283 } 1284 1285 static void 1286 vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) 1287 { 1288 int n, i, c, t, tt; 1289 int nmissing_rows; 1290 int missing_rows[VDEV_RAIDZ_MAXPARITY]; 1291 int parity_map[VDEV_RAIDZ_MAXPARITY]; 1292 uint8_t *p, *pp; 1293 size_t psize; 1294 uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 1295 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 1296 uint8_t *used; 1297 1298 abd_t **bufs = NULL; 1299 1300 /* 1301 * Matrix reconstruction can't use scatter ABDs yet, so we allocate 1302 * temporary linear ABDs if any non-linear ABDs are found. 
1303 */ 1304 for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) { 1305 if (!abd_is_linear(rr->rr_col[i].rc_abd)) { 1306 bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *), 1307 KM_PUSHPAGE); 1308 1309 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1310 raidz_col_t *col = &rr->rr_col[c]; 1311 1312 bufs[c] = col->rc_abd; 1313 if (bufs[c] != NULL) { 1314 col->rc_abd = abd_alloc_linear( 1315 col->rc_size, B_TRUE); 1316 abd_copy(col->rc_abd, bufs[c], 1317 col->rc_size); 1318 } 1319 } 1320 1321 break; 1322 } 1323 } 1324 1325 n = rr->rr_cols - rr->rr_firstdatacol; 1326 1327 /* 1328 * Figure out which data columns are missing. 1329 */ 1330 nmissing_rows = 0; 1331 for (t = 0; t < ntgts; t++) { 1332 if (tgts[t] >= rr->rr_firstdatacol) { 1333 missing_rows[nmissing_rows++] = 1334 tgts[t] - rr->rr_firstdatacol; 1335 } 1336 } 1337 1338 /* 1339 * Figure out which parity columns to use to help generate the missing 1340 * data columns. 1341 */ 1342 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 1343 ASSERT(tt < ntgts); 1344 ASSERT(c < rr->rr_firstdatacol); 1345 1346 /* 1347 * Skip any targeted parity columns. 1348 */ 1349 if (c == tgts[tt]) { 1350 tt++; 1351 continue; 1352 } 1353 1354 parity_map[i] = c; 1355 i++; 1356 } 1357 1358 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1359 nmissing_rows * n + sizeof (used[0]) * n; 1360 p = kmem_alloc(psize, KM_SLEEP); 1361 1362 for (pp = p, i = 0; i < nmissing_rows; i++) { 1363 rows[i] = pp; 1364 pp += n; 1365 invrows[i] = pp; 1366 pp += n; 1367 } 1368 used = pp; 1369 1370 for (i = 0; i < nmissing_rows; i++) { 1371 used[i] = parity_map[i]; 1372 } 1373 1374 for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1375 if (tt < nmissing_rows && 1376 c == missing_rows[tt] + rr->rr_firstdatacol) { 1377 tt++; 1378 continue; 1379 } 1380 1381 ASSERT3S(i, <, n); 1382 used[i] = c; 1383 i++; 1384 } 1385 1386 /* 1387 * Initialize the interesting rows of the matrix. 1388 */ 1389 vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows); 1390 1391 /* 1392 * Invert the matrix. 1393 */ 1394 vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows, 1395 invrows, used); 1396 1397 /* 1398 * Reconstruct the missing data using the generated matrix. 
1399 */ 1400 vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows, 1401 invrows, used); 1402 1403 kmem_free(p, psize); 1404 1405 /* 1406 * copy back from temporary linear abds and free them 1407 */ 1408 if (bufs) { 1409 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1410 raidz_col_t *col = &rr->rr_col[c]; 1411 1412 if (bufs[c] != NULL) { 1413 abd_copy(bufs[c], col->rc_abd, col->rc_size); 1414 abd_free(col->rc_abd); 1415 } 1416 col->rc_abd = bufs[c]; 1417 } 1418 kmem_free(bufs, rr->rr_cols * sizeof (abd_t *)); 1419 } 1420 } 1421 1422 static void 1423 vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr, 1424 const int *t, int nt) 1425 { 1426 int tgts[VDEV_RAIDZ_MAXPARITY], *dt; 1427 int ntgts; 1428 int i, c, ret; 1429 int nbadparity, nbaddata; 1430 int parity_valid[VDEV_RAIDZ_MAXPARITY]; 1431 1432 nbadparity = rr->rr_firstdatacol; 1433 nbaddata = rr->rr_cols - nbadparity; 1434 ntgts = 0; 1435 for (i = 0, c = 0; c < rr->rr_cols; c++) { 1436 if (c < rr->rr_firstdatacol) 1437 parity_valid[c] = B_FALSE; 1438 1439 if (i < nt && c == t[i]) { 1440 tgts[ntgts++] = c; 1441 i++; 1442 } else if (rr->rr_col[c].rc_error != 0) { 1443 tgts[ntgts++] = c; 1444 } else if (c >= rr->rr_firstdatacol) { 1445 nbaddata--; 1446 } else { 1447 parity_valid[c] = B_TRUE; 1448 nbadparity--; 1449 } 1450 } 1451 1452 ASSERT(ntgts >= nt); 1453 ASSERT(nbaddata >= 0); 1454 ASSERT(nbaddata + nbadparity == ntgts); 1455 1456 dt = &tgts[nbadparity]; 1457 1458 /* Reconstruct using the new math implementation */ 1459 ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata); 1460 if (ret != RAIDZ_ORIGINAL_IMPL) 1461 return; 1462 1463 /* 1464 * See if we can use any of our optimized reconstruction routines. 1465 */ 1466 switch (nbaddata) { 1467 case 1: 1468 if (parity_valid[VDEV_RAIDZ_P]) { 1469 vdev_raidz_reconstruct_p(rr, dt, 1); 1470 return; 1471 } 1472 1473 ASSERT(rr->rr_firstdatacol > 1); 1474 1475 if (parity_valid[VDEV_RAIDZ_Q]) { 1476 vdev_raidz_reconstruct_q(rr, dt, 1); 1477 return; 1478 } 1479 1480 ASSERT(rr->rr_firstdatacol > 2); 1481 break; 1482 1483 case 2: 1484 ASSERT(rr->rr_firstdatacol > 1); 1485 1486 if (parity_valid[VDEV_RAIDZ_P] && 1487 parity_valid[VDEV_RAIDZ_Q]) { 1488 vdev_raidz_reconstruct_pq(rr, dt, 2); 1489 return; 1490 } 1491 1492 ASSERT(rr->rr_firstdatacol > 2); 1493 1494 break; 1495 } 1496 1497 vdev_raidz_reconstruct_general(rr, tgts, ntgts); 1498 } 1499 1500 static int 1501 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, 1502 uint64_t *logical_ashift, uint64_t *physical_ashift) 1503 { 1504 vdev_raidz_t *vdrz = vd->vdev_tsd; 1505 uint64_t nparity = vdrz->vd_nparity; 1506 int c; 1507 int lasterror = 0; 1508 int numerrors = 0; 1509 1510 ASSERT(nparity > 0); 1511 1512 if (nparity > VDEV_RAIDZ_MAXPARITY || 1513 vd->vdev_children < nparity + 1) { 1514 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 1515 return (SET_ERROR(EINVAL)); 1516 } 1517 1518 vdev_open_children(vd); 1519 1520 for (c = 0; c < vd->vdev_children; c++) { 1521 vdev_t *cvd = vd->vdev_child[c]; 1522 1523 if (cvd->vdev_open_error != 0) { 1524 lasterror = cvd->vdev_open_error; 1525 numerrors++; 1526 continue; 1527 } 1528 1529 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 1530 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; 1531 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); 1532 *physical_ashift = MAX(*physical_ashift, 1533 cvd->vdev_physical_ashift); 1534 } 1535 1536 *asize *= vd->vdev_children; 1537 *max_asize *= vd->vdev_children; 1538 1539 if (numerrors > nparity) 
{ 1540 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 1541 return (lasterror); 1542 } 1543 1544 return (0); 1545 } 1546 1547 static void 1548 vdev_raidz_close(vdev_t *vd) 1549 { 1550 for (int c = 0; c < vd->vdev_children; c++) { 1551 if (vd->vdev_child[c] != NULL) 1552 vdev_close(vd->vdev_child[c]); 1553 } 1554 } 1555 1556 static uint64_t 1557 vdev_raidz_asize(vdev_t *vd, uint64_t psize) 1558 { 1559 vdev_raidz_t *vdrz = vd->vdev_tsd; 1560 uint64_t asize; 1561 uint64_t ashift = vd->vdev_top->vdev_ashift; 1562 uint64_t cols = vdrz->vd_logical_width; 1563 uint64_t nparity = vdrz->vd_nparity; 1564 1565 asize = ((psize - 1) >> ashift) + 1; 1566 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 1567 asize = roundup(asize, nparity + 1) << ashift; 1568 1569 return (asize); 1570 } 1571 1572 /* 1573 * The allocatable space for a raidz vdev is N * sizeof(smallest child) 1574 * so each child must provide at least 1/Nth of its asize. 1575 */ 1576 static uint64_t 1577 vdev_raidz_min_asize(vdev_t *vd) 1578 { 1579 return ((vd->vdev_min_asize + vd->vdev_children - 1) / 1580 vd->vdev_children); 1581 } 1582 1583 void 1584 vdev_raidz_child_done(zio_t *zio) 1585 { 1586 raidz_col_t *rc = zio->io_private; 1587 1588 ASSERT3P(rc->rc_abd, !=, NULL); 1589 rc->rc_error = zio->io_error; 1590 rc->rc_tried = 1; 1591 rc->rc_skipped = 0; 1592 } 1593 1594 static void 1595 vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col) 1596 { 1597 #ifdef ZFS_DEBUG 1598 vdev_t *tvd = vd->vdev_top; 1599 1600 range_seg64_t logical_rs, physical_rs, remain_rs; 1601 logical_rs.rs_start = rr->rr_offset; 1602 logical_rs.rs_end = logical_rs.rs_start + 1603 vdev_raidz_asize(vd, rr->rr_size); 1604 1605 raidz_col_t *rc = &rr->rr_col[col]; 1606 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 1607 1608 vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs); 1609 ASSERT(vdev_xlate_is_empty(&remain_rs)); 1610 ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); 1611 ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); 1612 /* 1613 * It would be nice to assert that rs_end is equal 1614 * to rc_offset + rc_size but there might be an 1615 * optional I/O at the end that is not accounted in 1616 * rc_size. 1617 */ 1618 if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { 1619 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + 1620 rc->rc_size + (1 << tvd->vdev_ashift)); 1621 } else { 1622 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); 1623 } 1624 #endif 1625 } 1626 1627 static void 1628 vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift) 1629 { 1630 vdev_t *vd = zio->io_vd; 1631 raidz_map_t *rm = zio->io_vsd; 1632 1633 vdev_raidz_generate_parity_row(rm, rr); 1634 1635 for (int c = 0; c < rr->rr_scols; c++) { 1636 raidz_col_t *rc = &rr->rr_col[c]; 1637 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 1638 1639 /* Verify physical to logical translation */ 1640 vdev_raidz_io_verify(vd, rr, c); 1641 1642 if (rc->rc_size > 0) { 1643 ASSERT3P(rc->rc_abd, !=, NULL); 1644 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1645 rc->rc_offset, rc->rc_abd, 1646 abd_get_size(rc->rc_abd), zio->io_type, 1647 zio->io_priority, 0, vdev_raidz_child_done, rc)); 1648 } else { 1649 /* 1650 * Generate optional write for skip sector to improve 1651 * aggregation contiguity. 
1652 */ 1653 ASSERT3P(rc->rc_abd, ==, NULL); 1654 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1655 rc->rc_offset, NULL, 1ULL << ashift, 1656 zio->io_type, zio->io_priority, 1657 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, 1658 NULL)); 1659 } 1660 } 1661 } 1662 1663 static void 1664 vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr) 1665 { 1666 vdev_t *vd = zio->io_vd; 1667 1668 /* 1669 * Iterate over the columns in reverse order so that we hit the parity 1670 * last -- any errors along the way will force us to read the parity. 1671 */ 1672 for (int c = rr->rr_cols - 1; c >= 0; c--) { 1673 raidz_col_t *rc = &rr->rr_col[c]; 1674 if (rc->rc_size == 0) 1675 continue; 1676 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 1677 if (!vdev_readable(cvd)) { 1678 if (c >= rr->rr_firstdatacol) 1679 rr->rr_missingdata++; 1680 else 1681 rr->rr_missingparity++; 1682 rc->rc_error = SET_ERROR(ENXIO); 1683 rc->rc_tried = 1; /* don't even try */ 1684 rc->rc_skipped = 1; 1685 continue; 1686 } 1687 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 1688 if (c >= rr->rr_firstdatacol) 1689 rr->rr_missingdata++; 1690 else 1691 rr->rr_missingparity++; 1692 rc->rc_error = SET_ERROR(ESTALE); 1693 rc->rc_skipped = 1; 1694 continue; 1695 } 1696 if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || 1697 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { 1698 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1699 rc->rc_offset, rc->rc_abd, rc->rc_size, 1700 zio->io_type, zio->io_priority, 0, 1701 vdev_raidz_child_done, rc)); 1702 } 1703 } 1704 } 1705 1706 /* 1707 * Start an IO operation on a RAIDZ VDev 1708 * 1709 * Outline: 1710 * - For write operations: 1711 * 1. Generate the parity data 1712 * 2. Create child zio write operations to each column's vdev, for both 1713 * data and parity. 1714 * 3. If the column skips any sectors for padding, create optional dummy 1715 * write zio children for those areas to improve aggregation continuity. 1716 * - For read operations: 1717 * 1. Create child zio read operations to each data column's vdev to read 1718 * the range of data required for zio. 1719 * 2. If this is a scrub or resilver operation, or if any of the data 1720 * vdevs have had errors, then create zio read operations to the parity 1721 * columns' VDevs as well. 1722 */ 1723 static void 1724 vdev_raidz_io_start(zio_t *zio) 1725 { 1726 vdev_t *vd = zio->io_vd; 1727 vdev_t *tvd = vd->vdev_top; 1728 vdev_raidz_t *vdrz = vd->vdev_tsd; 1729 1730 raidz_map_t *rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, 1731 vdrz->vd_logical_width, vdrz->vd_nparity); 1732 zio->io_vsd = rm; 1733 zio->io_vsd_ops = &vdev_raidz_vsd_ops; 1734 1735 /* 1736 * Until raidz expansion is implemented all maps for a raidz vdev 1737 * contain a single row. 1738 */ 1739 ASSERT3U(rm->rm_nrows, ==, 1); 1740 raidz_row_t *rr = rm->rm_row[0]; 1741 1742 if (zio->io_type == ZIO_TYPE_WRITE) { 1743 vdev_raidz_io_start_write(zio, rr, tvd->vdev_ashift); 1744 } else { 1745 ASSERT(zio->io_type == ZIO_TYPE_READ); 1746 vdev_raidz_io_start_read(zio, rr); 1747 } 1748 1749 zio_execute(zio); 1750 } 1751 1752 /* 1753 * Report a checksum error for a child of a RAID-Z device. 
1754 */ 1755 static void 1756 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) 1757 { 1758 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 1759 1760 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) && 1761 zio->io_priority != ZIO_PRIORITY_REBUILD) { 1762 zio_bad_cksum_t zbc; 1763 raidz_map_t *rm = zio->io_vsd; 1764 1765 zbc.zbc_has_cksum = 0; 1766 zbc.zbc_injected = rm->rm_ecksuminjected; 1767 1768 (void) zfs_ereport_post_checksum(zio->io_spa, vd, 1769 &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, 1770 rc->rc_abd, bad_data, &zbc); 1771 mutex_enter(&vd->vdev_stat_lock); 1772 vd->vdev_stat.vs_checksum_errors++; 1773 mutex_exit(&vd->vdev_stat_lock); 1774 } 1775 } 1776 1777 /* 1778 * We keep track of whether or not there were any injected errors, so that 1779 * any ereports we generate can note it. 1780 */ 1781 static int 1782 raidz_checksum_verify(zio_t *zio) 1783 { 1784 zio_bad_cksum_t zbc; 1785 raidz_map_t *rm = zio->io_vsd; 1786 1787 bzero(&zbc, sizeof (zio_bad_cksum_t)); 1788 1789 int ret = zio_checksum_error(zio, &zbc); 1790 if (ret != 0 && zbc.zbc_injected != 0) 1791 rm->rm_ecksuminjected = 1; 1792 1793 return (ret); 1794 } 1795 1796 /* 1797 * Generate the parity from the data columns. If we tried and were able to 1798 * read the parity without error, verify that the generated parity matches the 1799 * data we read. If it doesn't, we fire off a checksum error. Return the 1800 * number of such failures. 1801 */ 1802 static int 1803 raidz_parity_verify(zio_t *zio, raidz_row_t *rr) 1804 { 1805 abd_t *orig[VDEV_RAIDZ_MAXPARITY]; 1806 int c, ret = 0; 1807 raidz_map_t *rm = zio->io_vsd; 1808 raidz_col_t *rc; 1809 1810 blkptr_t *bp = zio->io_bp; 1811 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : 1812 (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); 1813 1814 if (checksum == ZIO_CHECKSUM_NOPARITY) 1815 return (ret); 1816 1817 for (c = 0; c < rr->rr_firstdatacol; c++) { 1818 rc = &rr->rr_col[c]; 1819 if (!rc->rc_tried || rc->rc_error != 0) 1820 continue; 1821 1822 orig[c] = abd_alloc_sametype(rc->rc_abd, rc->rc_size); 1823 abd_copy(orig[c], rc->rc_abd, rc->rc_size); 1824 } 1825 1826 /* 1827 * Regenerates parity even for !tried||rc_error!=0 columns. This 1828 * isn't harmful but it does have the side effect of fixing stuff 1829 * we didn't realize was necessary (i.e. even if we return 0). 
1830 */ 1831 vdev_raidz_generate_parity_row(rm, rr); 1832 1833 for (c = 0; c < rr->rr_firstdatacol; c++) { 1834 rc = &rr->rr_col[c]; 1835 1836 if (!rc->rc_tried || rc->rc_error != 0) 1837 continue; 1838 1839 if (abd_cmp(orig[c], rc->rc_abd) != 0) { 1840 raidz_checksum_error(zio, rc, orig[c]); 1841 rc->rc_error = SET_ERROR(ECKSUM); 1842 ret++; 1843 } 1844 abd_free(orig[c]); 1845 } 1846 1847 return (ret); 1848 } 1849 1850 static int 1851 vdev_raidz_worst_error(raidz_row_t *rr) 1852 { 1853 int error = 0; 1854 1855 for (int c = 0; c < rr->rr_cols; c++) 1856 error = zio_worst_error(error, rr->rr_col[c].rc_error); 1857 1858 return (error); 1859 } 1860 1861 static void 1862 vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) 1863 { 1864 int unexpected_errors = 0; 1865 int parity_errors = 0; 1866 int parity_untried = 0; 1867 int data_errors = 0; 1868 1869 ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); 1870 1871 for (int c = 0; c < rr->rr_cols; c++) { 1872 raidz_col_t *rc = &rr->rr_col[c]; 1873 1874 if (rc->rc_error) { 1875 if (c < rr->rr_firstdatacol) 1876 parity_errors++; 1877 else 1878 data_errors++; 1879 1880 if (!rc->rc_skipped) 1881 unexpected_errors++; 1882 } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { 1883 parity_untried++; 1884 } 1885 } 1886 1887 /* 1888 * If we read more parity disks than were used for 1889 * reconstruction, confirm that the other parity disks produced 1890 * correct data. 1891 * 1892 * Note that we also regenerate parity when resilvering so we 1893 * can write it out to failed devices later. 1894 */ 1895 if (parity_errors + parity_untried < 1896 rr->rr_firstdatacol - data_errors || 1897 (zio->io_flags & ZIO_FLAG_RESILVER)) { 1898 int n = raidz_parity_verify(zio, rr); 1899 unexpected_errors += n; 1900 ASSERT3U(parity_errors + n, <=, rr->rr_firstdatacol); 1901 } 1902 1903 if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 1904 (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) { 1905 /* 1906 * Use the good data we have in hand to repair damaged children. 1907 */ 1908 for (int c = 0; c < rr->rr_cols; c++) { 1909 raidz_col_t *rc = &rr->rr_col[c]; 1910 vdev_t *vd = zio->io_vd; 1911 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 1912 1913 if (!rc->rc_allow_repair) { 1914 continue; 1915 } else if (!rc->rc_force_repair && 1916 (rc->rc_error == 0 || rc->rc_size == 0)) { 1917 continue; 1918 } 1919 1920 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1921 rc->rc_offset, rc->rc_abd, rc->rc_size, 1922 ZIO_TYPE_WRITE, 1923 zio->io_priority == ZIO_PRIORITY_REBUILD ? 1924 ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE, 1925 ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 
1926 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); 1927 } 1928 } 1929 } 1930 1931 static void 1932 raidz_restore_orig_data(raidz_map_t *rm) 1933 { 1934 for (int i = 0; i < rm->rm_nrows; i++) { 1935 raidz_row_t *rr = rm->rm_row[i]; 1936 for (int c = 0; c < rr->rr_cols; c++) { 1937 raidz_col_t *rc = &rr->rr_col[c]; 1938 if (rc->rc_need_orig_restore) { 1939 abd_copy(rc->rc_abd, 1940 rc->rc_orig_data, rc->rc_size); 1941 rc->rc_need_orig_restore = B_FALSE; 1942 } 1943 } 1944 } 1945 } 1946 1947 /* 1948 * returns EINVAL if reconstruction of the block will not be possible 1949 * returns ECKSUM if this specific reconstruction failed 1950 * returns 0 on successful reconstruction 1951 */ 1952 static int 1953 raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) 1954 { 1955 raidz_map_t *rm = zio->io_vsd; 1956 1957 /* Reconstruct each row */ 1958 for (int r = 0; r < rm->rm_nrows; r++) { 1959 raidz_row_t *rr = rm->rm_row[r]; 1960 int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */ 1961 int t = 0; 1962 int dead = 0; 1963 int dead_data = 0; 1964 1965 for (int c = 0; c < rr->rr_cols; c++) { 1966 raidz_col_t *rc = &rr->rr_col[c]; 1967 ASSERT0(rc->rc_need_orig_restore); 1968 if (rc->rc_error != 0) { 1969 dead++; 1970 if (c >= nparity) 1971 dead_data++; 1972 continue; 1973 } 1974 if (rc->rc_size == 0) 1975 continue; 1976 for (int lt = 0; lt < ntgts; lt++) { 1977 if (rc->rc_devidx == ltgts[lt]) { 1978 if (rc->rc_orig_data == NULL) { 1979 rc->rc_orig_data = 1980 abd_alloc_linear( 1981 rc->rc_size, B_TRUE); 1982 abd_copy(rc->rc_orig_data, 1983 rc->rc_abd, rc->rc_size); 1984 } 1985 rc->rc_need_orig_restore = B_TRUE; 1986 1987 dead++; 1988 if (c >= nparity) 1989 dead_data++; 1990 my_tgts[t++] = c; 1991 break; 1992 } 1993 } 1994 } 1995 if (dead > nparity) { 1996 /* reconstruction not possible */ 1997 raidz_restore_orig_data(rm); 1998 return (EINVAL); 1999 } 2000 if (dead_data > 0) 2001 vdev_raidz_reconstruct_row(rm, rr, my_tgts, t); 2002 } 2003 2004 /* Check for success */ 2005 if (raidz_checksum_verify(zio) == 0) { 2006 2007 /* Reconstruction succeeded - report errors */ 2008 for (int i = 0; i < rm->rm_nrows; i++) { 2009 raidz_row_t *rr = rm->rm_row[i]; 2010 2011 for (int c = 0; c < rr->rr_cols; c++) { 2012 raidz_col_t *rc = &rr->rr_col[c]; 2013 if (rc->rc_need_orig_restore) { 2014 /* 2015 * Note: if this is a parity column, 2016 * we don't really know if it's wrong. 2017 * We need to let 2018 * vdev_raidz_io_done_verified() check 2019 * it, and if we set rc_error, it will 2020 * think that it is a "known" error 2021 * that doesn't need to be checked 2022 * or corrected. 2023 */ 2024 if (rc->rc_error == 0 && 2025 c >= rr->rr_firstdatacol) { 2026 raidz_checksum_error(zio, 2027 rc, rc->rc_orig_data); 2028 rc->rc_error = 2029 SET_ERROR(ECKSUM); 2030 } 2031 rc->rc_need_orig_restore = B_FALSE; 2032 } 2033 } 2034 2035 vdev_raidz_io_done_verified(zio, rr); 2036 } 2037 2038 zio_checksum_verified(zio); 2039 2040 return (0); 2041 } 2042 2043 /* Reconstruction failed - restore original data */ 2044 raidz_restore_orig_data(rm); 2045 return (ECKSUM); 2046 } 2047 2048 /* 2049 * Iterate over all combinations of N bad vdevs and attempt a reconstruction. 2050 * Note that the algorithm below is non-optimal because it doesn't take into 2051 * account how reconstruction is actually performed. 
For example, with 2052 * triple-parity RAID-Z the reconstruction procedure is the same if column 4 2053 * is targeted as invalid as if columns 1 and 4 are targeted since in both 2054 * cases we'd only use parity information in column 0. 2055 * 2056 * The order that we find the various possible combinations of failed 2057 * disks is dictated by these rules: 2058 * - Examine each "slot" (the "i" in tgts[i]) 2059 * - Try to increment this slot (tgts[i] = tgts[i] + 1) 2060 * - if we can't increment because it runs into the next slot, 2061 * reset our slot to the minimum, and examine the next slot 2062 * 2063 * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose 2064 * 3 columns to reconstruct), we will generate the following sequence: 2065 * 2066 * STATE ACTION 2067 * 0 1 2 special case: skip since these are all parity 2068 * 0 1 3 first slot: reset to 0; middle slot: increment to 2 2069 * 0 2 3 first slot: increment to 1 2070 * 1 2 3 first: reset to 0; middle: reset to 1; last: increment to 4 2071 * 0 1 4 first: reset to 0; middle: increment to 2 2072 * 0 2 4 first: increment to 1 2073 * 1 2 4 first: reset to 0; middle: increment to 3 2074 * 0 3 4 first: increment to 1 2075 * 1 3 4 first: increment to 2 2076 * 2 3 4 first: reset to 0; middle: reset to 1; last: increment to 5 2077 * 0 1 5 first: reset to 0; middle: increment to 2 2078 * 0 2 5 first: increment to 1 2079 * 1 2 5 first: reset to 0; middle: increment to 3 2080 * 0 3 5 first: increment to 1 2081 * 1 3 5 first: increment to 2 2082 * 2 3 5 first: reset to 0; middle: increment to 4 2083 * 0 4 5 first: increment to 1 2084 * 1 4 5 first: increment to 2 2085 * 2 4 5 first: increment to 3 2086 * 3 4 5 done 2087 * 2088 * This strategy works for dRAID but is less efficient when there are a large 2089 * number of child vdevs and therefore permutations to check. Furthermore, 2090 * since the raidz_map_t rows likely do not overlap, reconstruction would be 2091 * possible as long as there are no more than nparity data errors per row. 2092 * These additional permutations are not currently checked but could be as 2093 * a future improvement. 2094 */ 2095 static int 2096 vdev_raidz_combrec(zio_t *zio) 2097 { 2098 int nparity = vdev_get_nparity(zio->io_vd); 2099 raidz_map_t *rm = zio->io_vsd; 2100 2101 /* Check if there's enough data to attempt reconstruction. */ 2102 for (int i = 0; i < rm->rm_nrows; i++) { 2103 raidz_row_t *rr = rm->rm_row[i]; 2104 int total_errors = 0; 2105 2106 for (int c = 0; c < rr->rr_cols; c++) { 2107 if (rr->rr_col[c].rc_error) 2108 total_errors++; 2109 } 2110 2111 if (total_errors > nparity) 2112 return (vdev_raidz_worst_error(rr)); 2113 } 2114 2115 for (int num_failures = 1; num_failures <= nparity; num_failures++) { 2116 int tstore[VDEV_RAIDZ_MAXPARITY + 2]; 2117 int *ltgts = &tstore[1]; /* value is logical child ID */ 2118 2119 /* Determine number of logical children, n */ 2120 int n = zio->io_vd->vdev_children; 2121 2122 ASSERT3U(num_failures, <=, nparity); 2123 ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY); 2124 2125 /* Handle corner cases in combrec logic */ 2126 ltgts[-1] = -1; 2127 for (int i = 0; i < num_failures; i++) { 2128 ltgts[i] = i; 2129 } 2130 ltgts[num_failures] = n; 2131 2132 for (;;) { 2133 int err = raidz_reconstruct(zio, ltgts, num_failures, 2134 nparity); 2135 if (err == EINVAL) { 2136 /* 2137 * Reconstruction not possible with this # 2138 * failures; try more failures.
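 * (raidz_reconstruct() returns EINVAL when some row ends up with more
 * dead columns -- known errors plus the simulated targets in ltgts --
 * than there are parity columns; see the dead > nparity check above.)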
2139 */ 2140 break; 2141 } else if (err == 0) 2142 return (0); 2143 2144 /* Compute next targets to try */ 2145 for (int t = 0; ; t++) { 2146 ASSERT3U(t, <, num_failures); 2147 ltgts[t]++; 2148 if (ltgts[t] == n) { 2149 /* try more failures */ 2150 ASSERT3U(t, ==, num_failures - 1); 2151 break; 2152 } 2153 2154 ASSERT3U(ltgts[t], <, n); 2155 ASSERT3U(ltgts[t], <=, ltgts[t + 1]); 2156 2157 /* 2158 * If that spot is available, we're done here. 2159 * Try the next combination. 2160 */ 2161 if (ltgts[t] != ltgts[t + 1]) 2162 break; 2163 2164 /* 2165 * Otherwise, reset this tgt to the minimum, 2166 * and move on to the next tgt. 2167 */ 2168 ltgts[t] = ltgts[t - 1] + 1; 2169 ASSERT3U(ltgts[t], ==, t); 2170 } 2171 2172 /* Increase the number of failures and keep trying. */ 2173 if (ltgts[num_failures - 1] == n) 2174 break; 2175 } 2176 } 2177 2178 return (ECKSUM); 2179 } 2180 2181 void 2182 vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) 2183 { 2184 for (uint64_t row = 0; row < rm->rm_nrows; row++) { 2185 raidz_row_t *rr = rm->rm_row[row]; 2186 vdev_raidz_reconstruct_row(rm, rr, t, nt); 2187 } 2188 } 2189 2190 /* 2191 * Complete a write IO operation on a RAIDZ VDev 2192 * 2193 * Outline: 2194 * 1. Check for errors on the child IOs. 2195 * 2. Return, setting an error code if too few child VDevs were written 2196 * to reconstruct the data later. Note that partial writes are 2197 * considered successful if they can be reconstructed at all. 2198 */ 2199 static void 2200 vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) 2201 { 2202 int total_errors = 0; 2203 2204 ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); 2205 ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); 2206 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); 2207 2208 for (int c = 0; c < rr->rr_cols; c++) { 2209 raidz_col_t *rc = &rr->rr_col[c]; 2210 2211 if (rc->rc_error) { 2212 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 2213 2214 total_errors++; 2215 } 2216 } 2217 2218 /* 2219 * Treat partial writes as a success. If we couldn't write enough 2220 * columns to reconstruct the data, the I/O failed. Otherwise, 2221 * good enough. 2222 * 2223 * Now that we support write reallocation, it would be better 2224 * to treat partial failure as real failure unless there are 2225 * no non-degraded top-level vdevs left, and not update DTLs 2226 * if we intend to reallocate. 
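 *
 * To illustrate the current behavior with a purely hypothetical case:
 * for a raidz2 row (rr_firstdatacol == 2), failed writes to any two
 * columns still leave the row reconstructible and are tolerated, while
 * a third failed column trips the check below and promotes the worst
 * child error to the zio.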
2227 */ 2228 if (total_errors > rr->rr_firstdatacol) { 2229 zio->io_error = zio_worst_error(zio->io_error, 2230 vdev_raidz_worst_error(rr)); 2231 } 2232 } 2233 2234 static void 2235 vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm, 2236 raidz_row_t *rr) 2237 { 2238 int parity_errors = 0; 2239 int parity_untried = 0; 2240 int data_errors = 0; 2241 int total_errors = 0; 2242 2243 ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); 2244 ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); 2245 ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); 2246 2247 for (int c = 0; c < rr->rr_cols; c++) { 2248 raidz_col_t *rc = &rr->rr_col[c]; 2249 2250 if (rc->rc_error) { 2251 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 2252 2253 if (c < rr->rr_firstdatacol) 2254 parity_errors++; 2255 else 2256 data_errors++; 2257 2258 total_errors++; 2259 } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { 2260 parity_untried++; 2261 } 2262 } 2263 2264 /* 2265 * If there were data errors and the number of errors we saw was 2266 * correctable -- less than or equal to the number of parity disks read 2267 * -- reconstruct based on the missing data. 2268 */ 2269 if (data_errors != 0 && 2270 total_errors <= rr->rr_firstdatacol - parity_untried) { 2271 /* 2272 * We either attempt to read all the parity columns or 2273 * none of them. If we didn't try to read parity, we 2274 * wouldn't be here in the correctable case. There must 2275 * also have been fewer parity errors than parity 2276 * columns or, again, we wouldn't be in this code path. 2277 */ 2278 ASSERT(parity_untried == 0); 2279 ASSERT(parity_errors < rr->rr_firstdatacol); 2280 2281 /* 2282 * Identify the data columns that reported an error. 2283 */ 2284 int n = 0; 2285 int tgts[VDEV_RAIDZ_MAXPARITY]; 2286 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 2287 raidz_col_t *rc = &rr->rr_col[c]; 2288 if (rc->rc_error != 0) { 2289 ASSERT(n < VDEV_RAIDZ_MAXPARITY); 2290 tgts[n++] = c; 2291 } 2292 } 2293 2294 ASSERT(rr->rr_firstdatacol >= n); 2295 2296 vdev_raidz_reconstruct_row(rm, rr, tgts, n); 2297 } 2298 } 2299 2300 /* 2301 * Return the number of reads issued. 2302 */ 2303 static int 2304 vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr) 2305 { 2306 vdev_t *vd = zio->io_vd; 2307 int nread = 0; 2308 2309 rr->rr_missingdata = 0; 2310 rr->rr_missingparity = 0; 2311 2312 /* 2313 * If this row contains empty sectors which are not required 2314 * for a normal read, then allocate an ABD for them now so they 2315 * may be read, verified, and any needed repairs performed. 2316 */ 2317 if (rr->rr_nempty && rr->rr_abd_empty == NULL) 2318 vdev_draid_map_alloc_empty(zio, rr); 2319 2320 for (int c = 0; c < rr->rr_cols; c++) { 2321 raidz_col_t *rc = &rr->rr_col[c]; 2322 if (rc->rc_tried || rc->rc_size == 0) 2323 continue; 2324 2325 zio_nowait(zio_vdev_child_io(zio, NULL, 2326 vd->vdev_child[rc->rc_devidx], 2327 rc->rc_offset, rc->rc_abd, rc->rc_size, 2328 zio->io_type, zio->io_priority, 0, 2329 vdev_raidz_child_done, rc)); 2330 nread++; 2331 } 2332 return (nread); 2333 } 2334 2335 /* 2336 * We're here because either there were too many errors to even attempt 2337 * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec() 2338 * failed. In either case, there is enough bad data to prevent reconstruction. 2339 * Start checksum ereports for all children which haven't failed.
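 * (The loop below skips children that already have rc_error set; for
 * each remaining child a checksum ereport is started and its
 * vs_checksum_errors count is incremented.)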
2340 */ 2341 static void 2342 vdev_raidz_io_done_unrecoverable(zio_t *zio) 2343 { 2344 raidz_map_t *rm = zio->io_vsd; 2345 2346 for (int i = 0; i < rm->rm_nrows; i++) { 2347 raidz_row_t *rr = rm->rm_row[i]; 2348 2349 for (int c = 0; c < rr->rr_cols; c++) { 2350 raidz_col_t *rc = &rr->rr_col[c]; 2351 vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; 2352 2353 if (rc->rc_error != 0) 2354 continue; 2355 2356 zio_bad_cksum_t zbc; 2357 zbc.zbc_has_cksum = 0; 2358 zbc.zbc_injected = rm->rm_ecksuminjected; 2359 2360 (void) zfs_ereport_start_checksum(zio->io_spa, 2361 cvd, &zio->io_bookmark, zio, rc->rc_offset, 2362 rc->rc_size, &zbc); 2363 mutex_enter(&cvd->vdev_stat_lock); 2364 cvd->vdev_stat.vs_checksum_errors++; 2365 mutex_exit(&cvd->vdev_stat_lock); 2366 } 2367 } 2368 } 2369 2370 void 2371 vdev_raidz_io_done(zio_t *zio) 2372 { 2373 raidz_map_t *rm = zio->io_vsd; 2374 2375 if (zio->io_type == ZIO_TYPE_WRITE) { 2376 for (int i = 0; i < rm->rm_nrows; i++) { 2377 vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]); 2378 } 2379 } else { 2380 for (int i = 0; i < rm->rm_nrows; i++) { 2381 raidz_row_t *rr = rm->rm_row[i]; 2382 vdev_raidz_io_done_reconstruct_known_missing(zio, 2383 rm, rr); 2384 } 2385 2386 if (raidz_checksum_verify(zio) == 0) { 2387 for (int i = 0; i < rm->rm_nrows; i++) { 2388 raidz_row_t *rr = rm->rm_row[i]; 2389 vdev_raidz_io_done_verified(zio, rr); 2390 } 2391 zio_checksum_verified(zio); 2392 } else { 2393 /* 2394 * A sequential resilver has no checksum, which makes 2395 * combinatorial reconstruction impossible. This code 2396 * path is unreachable since raidz_checksum_verify() 2397 * has no checksum to verify and must succeed. 2398 */ 2399 ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD); 2400 2401 /* 2402 * This isn't a typical situation -- either we got a 2403 * read error or a child silently returned bad data. 2404 * Read every block so we can try again with as much 2405 * data and parity as we can track down. If we've 2406 * already been through once before, all children will 2407 * be marked as tried so we'll proceed to combinatorial 2408 * reconstruction. 2409 */ 2410 int nread = 0; 2411 for (int i = 0; i < rm->rm_nrows; i++) { 2412 nread += vdev_raidz_read_all(zio, 2413 rm->rm_row[i]); 2414 } 2415 if (nread != 0) { 2416 /* 2417 * Normally our stage is VDEV_IO_DONE, but if 2418 * we've already called redone(), it will have 2419 * changed to VDEV_IO_START, in which case we 2420 * don't want to call redone() again. 2421 */ 2422 if (zio->io_stage != ZIO_STAGE_VDEV_IO_START) 2423 zio_vdev_io_redone(zio); 2424 return; 2425 } 2426 2427 zio->io_error = vdev_raidz_combrec(zio); 2428 if (zio->io_error == ECKSUM && 2429 !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2430 vdev_raidz_io_done_unrecoverable(zio); 2431 } 2432 } 2433 } 2434 } 2435 2436 static void 2437 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 2438 { 2439 vdev_raidz_t *vdrz = vd->vdev_tsd; 2440 if (faulted > vdrz->vd_nparity) 2441 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2442 VDEV_AUX_NO_REPLICAS); 2443 else if (degraded + faulted != 0) 2444 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 2445 else 2446 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 2447 } 2448 2449 /* 2450 * Determine if any portion of the provided block resides on a child vdev 2451 * with a dirty DTL and therefore needs to be resilvered. The function 2452 * assumes that at least one DTL is dirty, which implies that full stripe 2453 * width blocks must be resilvered.
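 *
 * Purely illustrative example (values chosen arbitrarily): with
 * dcols = 5, nparity = 1, and ashift = 9, a 3-sector block whose DVA
 * offset is 0x1600 yields b = 11, s = 3, and f = 1. Since
 * s + nparity = 4 < dcols, only columns (1 + c) % 5 for c in [0, 4) --
 * children 1, 2, 3, and 4 -- have their DTLs examined below.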
2454 */ 2455 static boolean_t 2456 vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, 2457 uint64_t phys_birth) 2458 { 2459 vdev_raidz_t *vdrz = vd->vdev_tsd; 2460 uint64_t dcols = vd->vdev_children; 2461 uint64_t nparity = vdrz->vd_nparity; 2462 uint64_t ashift = vd->vdev_top->vdev_ashift; 2463 /* The starting RAIDZ (parent) vdev sector of the block. */ 2464 uint64_t b = DVA_GET_OFFSET(dva) >> ashift; 2465 /* The zio's size in units of the vdev's minimum sector size. */ 2466 uint64_t s = ((psize - 1) >> ashift) + 1; 2467 /* The first column for this stripe. */ 2468 uint64_t f = b % dcols; 2469 2470 /* Unreachable by sequential resilver. */ 2471 ASSERT3U(phys_birth, !=, TXG_UNKNOWN); 2472 2473 if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) 2474 return (B_FALSE); 2475 2476 if (s + nparity >= dcols) 2477 return (B_TRUE); 2478 2479 for (uint64_t c = 0; c < s + nparity; c++) { 2480 uint64_t devidx = (f + c) % dcols; 2481 vdev_t *cvd = vd->vdev_child[devidx]; 2482 2483 /* 2484 * dsl_scan_need_resilver() already checked vd with 2485 * vdev_dtl_contains(). So here just check cvd with 2486 * vdev_dtl_empty(), cheaper and a good approximation. 2487 */ 2488 if (!vdev_dtl_empty(cvd, DTL_PARTIAL)) 2489 return (B_TRUE); 2490 } 2491 2492 return (B_FALSE); 2493 } 2494 2495 static void 2496 vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs, 2497 range_seg64_t *physical_rs, range_seg64_t *remain_rs) 2498 { 2499 vdev_t *raidvd = cvd->vdev_parent; 2500 ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); 2501 2502 uint64_t width = raidvd->vdev_children; 2503 uint64_t tgt_col = cvd->vdev_id; 2504 uint64_t ashift = raidvd->vdev_top->vdev_ashift; 2505 2506 /* make sure the offsets are block-aligned */ 2507 ASSERT0(logical_rs->rs_start % (1 << ashift)); 2508 ASSERT0(logical_rs->rs_end % (1 << ashift)); 2509 uint64_t b_start = logical_rs->rs_start >> ashift; 2510 uint64_t b_end = logical_rs->rs_end >> ashift; 2511 2512 uint64_t start_row = 0; 2513 if (b_start > tgt_col) /* avoid underflow */ 2514 start_row = ((b_start - tgt_col - 1) / width) + 1; 2515 2516 uint64_t end_row = 0; 2517 if (b_end > tgt_col) 2518 end_row = ((b_end - tgt_col - 1) / width) + 1; 2519 2520 physical_rs->rs_start = start_row << ashift; 2521 physical_rs->rs_end = end_row << ashift; 2522 2523 ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start); 2524 ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=, 2525 logical_rs->rs_end - logical_rs->rs_start); 2526 } 2527 2528 /* 2529 * Initialize private RAIDZ specific fields from the nvlist. 2530 */ 2531 static int 2532 vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) 2533 { 2534 vdev_raidz_t *vdrz; 2535 uint64_t nparity; 2536 2537 uint_t children; 2538 nvlist_t **child; 2539 int error = nvlist_lookup_nvlist_array(nv, 2540 ZPOOL_CONFIG_CHILDREN, &child, &children); 2541 if (error != 0) 2542 return (SET_ERROR(EINVAL)); 2543 2544 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { 2545 if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) 2546 return (SET_ERROR(EINVAL)); 2547 2548 /* 2549 * Previous versions could only support 1 or 2 parity 2550 * devices. 2551 */ 2552 if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2) 2553 return (SET_ERROR(EINVAL)); 2554 else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3) 2555 return (SET_ERROR(EINVAL)); 2556 } else { 2557 /* 2558 * We require the parity to be specified for SPAs that 2559 * support multiple parity levels.
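 * (If ZPOOL_CONFIG_NPARITY is absent on such a pool the config is
 * rejected below with EINVAL; only pools predating SPA_VERSION_RAIDZ2
 * may omit it, in which case we fall back to single parity.)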
2560 */ 2561 if (spa_version(spa) >= SPA_VERSION_RAIDZ2) 2562 return (SET_ERROR(EINVAL)); 2563 2564 /* 2565 * Otherwise, we default to 1 parity device for RAID-Z. 2566 */ 2567 nparity = 1; 2568 } 2569 2570 vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP); 2571 vdrz->vd_logical_width = children; 2572 vdrz->vd_nparity = nparity; 2573 2574 *tsd = vdrz; 2575 2576 return (0); 2577 } 2578 2579 static void 2580 vdev_raidz_fini(vdev_t *vd) 2581 { 2582 kmem_free(vd->vdev_tsd, sizeof (vdev_raidz_t)); 2583 } 2584 2585 /* 2586 * Add RAIDZ specific fields to the config nvlist. 2587 */ 2588 static void 2589 vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) 2590 { 2591 ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops); 2592 vdev_raidz_t *vdrz = vd->vdev_tsd; 2593 2594 /* 2595 * Make sure someone hasn't managed to sneak a fancy new vdev 2596 * into a crufty old storage pool. 2597 */ 2598 ASSERT(vdrz->vd_nparity == 1 || 2599 (vdrz->vd_nparity <= 2 && 2600 spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) || 2601 (vdrz->vd_nparity <= 3 && 2602 spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3)); 2603 2604 /* 2605 * Note that we'll add these even on storage pools where they 2606 * aren't strictly required -- older software will just ignore 2607 * it. 2608 */ 2609 fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity); 2610 } 2611 2612 static uint64_t 2613 vdev_raidz_nparity(vdev_t *vd) 2614 { 2615 vdev_raidz_t *vdrz = vd->vdev_tsd; 2616 return (vdrz->vd_nparity); 2617 } 2618 2619 static uint64_t 2620 vdev_raidz_ndisks(vdev_t *vd) 2621 { 2622 return (vd->vdev_children); 2623 } 2624 2625 vdev_ops_t vdev_raidz_ops = { 2626 .vdev_op_init = vdev_raidz_init, 2627 .vdev_op_fini = vdev_raidz_fini, 2628 .vdev_op_open = vdev_raidz_open, 2629 .vdev_op_close = vdev_raidz_close, 2630 .vdev_op_asize = vdev_raidz_asize, 2631 .vdev_op_min_asize = vdev_raidz_min_asize, 2632 .vdev_op_min_alloc = NULL, 2633 .vdev_op_io_start = vdev_raidz_io_start, 2634 .vdev_op_io_done = vdev_raidz_io_done, 2635 .vdev_op_state_change = vdev_raidz_state_change, 2636 .vdev_op_need_resilver = vdev_raidz_need_resilver, 2637 .vdev_op_hold = NULL, 2638 .vdev_op_rele = NULL, 2639 .vdev_op_remap = NULL, 2640 .vdev_op_xlate = vdev_raidz_xlate, 2641 .vdev_op_rebuild_asize = NULL, 2642 .vdev_op_metaslab_init = NULL, 2643 .vdev_op_config_generate = vdev_raidz_config_generate, 2644 .vdev_op_nparity = vdev_raidz_nparity, 2645 .vdev_op_ndisks = vdev_raidz_ndisks, 2646 .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */ 2647 .vdev_op_leaf = B_FALSE /* not a leaf vdev */ 2648 }; 2649