1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2012, 2019 by Delphix. All rights reserved. 25 * Copyright (c) 2016 Gvozden Nešković. All rights reserved. 26 */ 27 28 #include <sys/zfs_context.h> 29 #include <sys/spa.h> 30 #include <sys/vdev_impl.h> 31 #include <sys/zio.h> 32 #include <sys/zio_checksum.h> 33 #include <sys/abd.h> 34 #include <sys/fs/zfs.h> 35 #include <sys/fm/fs/zfs.h> 36 #include <sys/vdev_raidz.h> 37 #include <sys/vdev_raidz_impl.h> 38 39 #ifdef ZFS_DEBUG 40 #include <sys/vdev.h> /* For vdev_xlate() in vdev_raidz_io_verify() */ 41 #endif 42 43 /* 44 * Virtual device vector for RAID-Z. 45 * 46 * This vdev supports single, double, and triple parity. For single parity, 47 * we use a simple XOR of all the data columns. For double or triple parity, 48 * we use a special case of Reed-Solomon coding. This extends the 49 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by 50 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for 51 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the 52 * former is also based. The latter is designed to provide higher performance 53 * for writes. 54 * 55 * Note that the Plank paper claimed to support arbitrary N+M, but was then 56 * amended six years later identifying a critical flaw that invalidates its 57 * claims. Nevertheless, the technique can be adapted to work for up to 58 * triple parity. For additional parity, the amendment "Note: Correction to 59 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding 60 * is viable, but the additional complexity means that write performance will 61 * suffer. 62 * 63 * All of the methods above operate on a Galois field, defined over the 64 * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements 65 * can be expressed with a single byte. Briefly, the operations on the 66 * field are defined as follows: 67 * 68 * o addition (+) is represented by a bitwise XOR 69 * o subtraction (-) is therefore identical to addition: A + B = A - B 70 * o multiplication of A by 2 is defined by the following bitwise expression: 71 * 72 * (A * 2)_7 = A_6 73 * (A * 2)_6 = A_5 74 * (A * 2)_5 = A_4 75 * (A * 2)_4 = A_3 + A_7 76 * (A * 2)_3 = A_2 + A_7 77 * (A * 2)_2 = A_1 + A_7 78 * (A * 2)_1 = A_0 79 * (A * 2)_0 = A_7 80 * 81 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). 82 * As an aside, this multiplication is derived from the error correcting 83 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. 84 * 85 * Observe that any number in the field (except for 0) can be expressed as a 86 * power of 2 -- a generator for the field. We store a table of the powers of 87 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can 88 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather 89 * than field addition). The inverse of a field element A (A^-1) is therefore 90 * A ^ (255 - 1) = A^254. 91 * 92 * The up-to-three parity columns, P, Q, R over several data columns, 93 * D_0, ... D_n-1, can be expressed by field operations: 94 * 95 * P = D_0 + D_1 + ... + D_n-2 + D_n-1 96 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 97 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 98 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 99 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 100 * 101 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial 102 * XOR operation, and 2 and 4 can be computed quickly and generate linearly- 103 * independent coefficients. (There are no additional coefficients that have 104 * this property which is why the uncorrected Plank method breaks down.) 105 * 106 * See the reconstruction code below for how P, Q and R can used individually 107 * or in concert to recover missing data columns. 108 */ 109 110 #define VDEV_RAIDZ_P 0 111 #define VDEV_RAIDZ_Q 1 112 #define VDEV_RAIDZ_R 2 113 114 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) 115 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) 116 117 /* 118 * We provide a mechanism to perform the field multiplication operation on a 119 * 64-bit value all at once rather than a byte at a time. This works by 120 * creating a mask from the top bit in each byte and using that to 121 * conditionally apply the XOR of 0x1d. 122 */ 123 #define VDEV_RAIDZ_64MUL_2(x, mask) \ 124 { \ 125 (mask) = (x) & 0x8080808080808080ULL; \ 126 (mask) = ((mask) << 1) - ((mask) >> 7); \ 127 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ 128 ((mask) & 0x1d1d1d1d1d1d1d1dULL); \ 129 } 130 131 #define VDEV_RAIDZ_64MUL_4(x, mask) \ 132 { \ 133 VDEV_RAIDZ_64MUL_2((x), mask); \ 134 VDEV_RAIDZ_64MUL_2((x), mask); \ 135 } 136 137 void 138 vdev_raidz_map_free(raidz_map_t *rm) 139 { 140 int c; 141 142 for (c = 0; c < rm->rm_firstdatacol; c++) { 143 abd_free(rm->rm_col[c].rc_abd); 144 145 if (rm->rm_col[c].rc_gdata != NULL) 146 abd_free(rm->rm_col[c].rc_gdata); 147 } 148 149 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) 150 abd_put(rm->rm_col[c].rc_abd); 151 152 if (rm->rm_abd_copy != NULL) 153 abd_free(rm->rm_abd_copy); 154 155 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); 156 } 157 158 static void 159 vdev_raidz_map_free_vsd(zio_t *zio) 160 { 161 raidz_map_t *rm = zio->io_vsd; 162 163 ASSERT0(rm->rm_freed); 164 rm->rm_freed = 1; 165 166 if (rm->rm_reports == 0) 167 vdev_raidz_map_free(rm); 168 } 169 170 /*ARGSUSED*/ 171 static void 172 vdev_raidz_cksum_free(void *arg, size_t ignored) 173 { 174 raidz_map_t *rm = arg; 175 176 ASSERT3U(rm->rm_reports, >, 0); 177 178 if (--rm->rm_reports == 0 && rm->rm_freed != 0) 179 vdev_raidz_map_free(rm); 180 } 181 182 static void 183 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data) 184 { 185 raidz_map_t *rm = zcr->zcr_cbdata; 186 const size_t c = zcr->zcr_cbinfo; 187 size_t x, offset; 188 189 const abd_t *good = NULL; 190 const abd_t *bad = rm->rm_col[c].rc_abd; 191 192 if (good_data == NULL) { 193 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); 194 return; 195 } 196 197 if (c < rm->rm_firstdatacol) { 198 /* 199 * The first time through, calculate the parity blocks for 200 * the good data (this relies on the fact that the good 201 * data never changes for a given logical ZIO) 202 */ 203 if (rm->rm_col[0].rc_gdata == NULL) { 204 abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY]; 205 206 /* 207 * Set up the rm_col[]s to generate the parity for 208 * good_data, first saving the parity bufs and 209 * replacing them with buffers to hold the result. 210 */ 211 for (x = 0; x < rm->rm_firstdatacol; x++) { 212 bad_parity[x] = rm->rm_col[x].rc_abd; 213 rm->rm_col[x].rc_abd = 214 rm->rm_col[x].rc_gdata = 215 abd_alloc_sametype(rm->rm_col[x].rc_abd, 216 rm->rm_col[x].rc_size); 217 } 218 219 /* fill in the data columns from good_data */ 220 offset = 0; 221 for (; x < rm->rm_cols; x++) { 222 abd_put(rm->rm_col[x].rc_abd); 223 224 rm->rm_col[x].rc_abd = 225 abd_get_offset_size((abd_t *)good_data, 226 offset, rm->rm_col[x].rc_size); 227 offset += rm->rm_col[x].rc_size; 228 } 229 230 /* 231 * Construct the parity from the good data. 232 */ 233 vdev_raidz_generate_parity(rm); 234 235 /* restore everything back to its original state */ 236 for (x = 0; x < rm->rm_firstdatacol; x++) 237 rm->rm_col[x].rc_abd = bad_parity[x]; 238 239 offset = 0; 240 for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { 241 abd_put(rm->rm_col[x].rc_abd); 242 rm->rm_col[x].rc_abd = abd_get_offset_size( 243 rm->rm_abd_copy, offset, 244 rm->rm_col[x].rc_size); 245 offset += rm->rm_col[x].rc_size; 246 } 247 } 248 249 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); 250 good = abd_get_offset_size(rm->rm_col[c].rc_gdata, 0, 251 rm->rm_col[c].rc_size); 252 } else { 253 /* adjust good_data to point at the start of our column */ 254 offset = 0; 255 for (x = rm->rm_firstdatacol; x < c; x++) 256 offset += rm->rm_col[x].rc_size; 257 258 good = abd_get_offset_size((abd_t *)good_data, offset, 259 rm->rm_col[c].rc_size); 260 } 261 262 /* we drop the ereport if it ends up that the data was good */ 263 zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); 264 abd_put((abd_t *)good); 265 } 266 267 /* 268 * Invoked indirectly by zfs_ereport_start_checksum(), called 269 * below when our read operation fails completely. The main point 270 * is to keep a copy of everything we read from disk, so that at 271 * vdev_raidz_cksum_finish() time we can compare it with the good data. 272 */ 273 static void 274 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) 275 { 276 size_t c = (size_t)(uintptr_t)arg; 277 size_t offset; 278 279 raidz_map_t *rm = zio->io_vsd; 280 size_t size; 281 282 /* set up the report and bump the refcount */ 283 zcr->zcr_cbdata = rm; 284 zcr->zcr_cbinfo = c; 285 zcr->zcr_finish = vdev_raidz_cksum_finish; 286 zcr->zcr_free = vdev_raidz_cksum_free; 287 288 rm->rm_reports++; 289 ASSERT3U(rm->rm_reports, >, 0); 290 291 if (rm->rm_abd_copy != NULL) 292 return; 293 294 /* 295 * It's the first time we're called for this raidz_map_t, so we need 296 * to copy the data aside; there's no guarantee that our zio's buffer 297 * won't be re-used for something else. 298 * 299 * Our parity data is already in separate buffers, so there's no need 300 * to copy them. 301 */ 302 303 size = 0; 304 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) 305 size += rm->rm_col[c].rc_size; 306 307 rm->rm_abd_copy = abd_alloc_for_io(size, B_FALSE); 308 309 for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 310 raidz_col_t *col = &rm->rm_col[c]; 311 abd_t *tmp = abd_get_offset_size(rm->rm_abd_copy, offset, 312 col->rc_size); 313 314 abd_copy(tmp, col->rc_abd, col->rc_size); 315 316 abd_put(col->rc_abd); 317 col->rc_abd = tmp; 318 319 offset += col->rc_size; 320 } 321 ASSERT3U(offset, ==, size); 322 } 323 324 static const zio_vsd_ops_t vdev_raidz_vsd_ops = { 325 .vsd_free = vdev_raidz_map_free_vsd, 326 .vsd_cksum_report = vdev_raidz_cksum_report 327 }; 328 329 /* 330 * Divides the IO evenly across all child vdevs; usually, dcols is 331 * the number of children in the target vdev. 332 * 333 * Avoid inlining the function to keep vdev_raidz_io_start(), which 334 * is this functions only caller, as small as possible on the stack. 335 */ 336 noinline raidz_map_t * 337 vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, 338 uint64_t nparity) 339 { 340 raidz_map_t *rm; 341 /* The starting RAIDZ (parent) vdev sector of the block. */ 342 uint64_t b = zio->io_offset >> ashift; 343 /* The zio's size in units of the vdev's minimum sector size. */ 344 uint64_t s = zio->io_size >> ashift; 345 /* The first column for this stripe. */ 346 uint64_t f = b % dcols; 347 /* The starting byte offset on each child vdev. */ 348 uint64_t o = (b / dcols) << ashift; 349 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; 350 uint64_t off = 0; 351 352 /* 353 * "Quotient": The number of data sectors for this stripe on all but 354 * the "big column" child vdevs that also contain "remainder" data. 355 */ 356 q = s / (dcols - nparity); 357 358 /* 359 * "Remainder": The number of partial stripe data sectors in this I/O. 360 * This will add a sector to some, but not all, child vdevs. 361 */ 362 r = s - q * (dcols - nparity); 363 364 /* The number of "big columns" - those which contain remainder data. */ 365 bc = (r == 0 ? 0 : r + nparity); 366 367 /* 368 * The total number of data and parity sectors associated with 369 * this I/O. 370 */ 371 tot = s + nparity * (q + (r == 0 ? 0 : 1)); 372 373 /* acols: The columns that will be accessed. */ 374 /* scols: The columns that will be accessed or skipped. */ 375 if (q == 0) { 376 /* Our I/O request doesn't span all child vdevs. */ 377 acols = bc; 378 scols = MIN(dcols, roundup(bc, nparity + 1)); 379 } else { 380 acols = dcols; 381 scols = dcols; 382 } 383 384 ASSERT3U(acols, <=, scols); 385 386 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); 387 388 rm->rm_cols = acols; 389 rm->rm_scols = scols; 390 rm->rm_bigcols = bc; 391 rm->rm_skipstart = bc; 392 rm->rm_missingdata = 0; 393 rm->rm_missingparity = 0; 394 rm->rm_firstdatacol = nparity; 395 rm->rm_abd_copy = NULL; 396 rm->rm_reports = 0; 397 rm->rm_freed = 0; 398 rm->rm_ecksuminjected = 0; 399 400 asize = 0; 401 402 for (c = 0; c < scols; c++) { 403 col = f + c; 404 coff = o; 405 if (col >= dcols) { 406 col -= dcols; 407 coff += 1ULL << ashift; 408 } 409 rm->rm_col[c].rc_devidx = col; 410 rm->rm_col[c].rc_offset = coff; 411 rm->rm_col[c].rc_abd = NULL; 412 rm->rm_col[c].rc_gdata = NULL; 413 rm->rm_col[c].rc_error = 0; 414 rm->rm_col[c].rc_tried = 0; 415 rm->rm_col[c].rc_skipped = 0; 416 417 if (c >= acols) 418 rm->rm_col[c].rc_size = 0; 419 else if (c < bc) 420 rm->rm_col[c].rc_size = (q + 1) << ashift; 421 else 422 rm->rm_col[c].rc_size = q << ashift; 423 424 asize += rm->rm_col[c].rc_size; 425 } 426 427 ASSERT3U(asize, ==, tot << ashift); 428 rm->rm_asize = roundup(asize, (nparity + 1) << ashift); 429 rm->rm_nskip = roundup(tot, nparity + 1) - tot; 430 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << ashift); 431 ASSERT3U(rm->rm_nskip, <=, nparity); 432 433 for (c = 0; c < rm->rm_firstdatacol; c++) 434 rm->rm_col[c].rc_abd = 435 abd_alloc_linear(rm->rm_col[c].rc_size, B_FALSE); 436 437 rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, 0, 438 rm->rm_col[c].rc_size); 439 off = rm->rm_col[c].rc_size; 440 441 for (c = c + 1; c < acols; c++) { 442 rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, off, 443 rm->rm_col[c].rc_size); 444 off += rm->rm_col[c].rc_size; 445 } 446 447 /* 448 * If all data stored spans all columns, there's a danger that parity 449 * will always be on the same device and, since parity isn't read 450 * during normal operation, that device's I/O bandwidth won't be 451 * used effectively. We therefore switch the parity every 1MB. 452 * 453 * ... at least that was, ostensibly, the theory. As a practical 454 * matter unless we juggle the parity between all devices evenly, we 455 * won't see any benefit. Further, occasional writes that aren't a 456 * multiple of the LCM of the number of children and the minimum 457 * stripe width are sufficient to avoid pessimal behavior. 458 * Unfortunately, this decision created an implicit on-disk format 459 * requirement that we need to support for all eternity, but only 460 * for single-parity RAID-Z. 461 * 462 * If we intend to skip a sector in the zeroth column for padding 463 * we must make sure to note this swap. We will never intend to 464 * skip the first column since at least one data and one parity 465 * column must appear in each row. 466 */ 467 ASSERT(rm->rm_cols >= 2); 468 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 469 470 if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { 471 devidx = rm->rm_col[0].rc_devidx; 472 o = rm->rm_col[0].rc_offset; 473 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; 474 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 475 rm->rm_col[1].rc_devidx = devidx; 476 rm->rm_col[1].rc_offset = o; 477 478 if (rm->rm_skipstart == 0) 479 rm->rm_skipstart = 1; 480 } 481 482 zio->io_vsd = rm; 483 zio->io_vsd_ops = &vdev_raidz_vsd_ops; 484 485 /* init RAIDZ parity ops */ 486 rm->rm_ops = vdev_raidz_math_get_ops(); 487 488 return (rm); 489 } 490 491 struct pqr_struct { 492 uint64_t *p; 493 uint64_t *q; 494 uint64_t *r; 495 }; 496 497 static int 498 vdev_raidz_p_func(void *buf, size_t size, void *private) 499 { 500 struct pqr_struct *pqr = private; 501 const uint64_t *src = buf; 502 int i, cnt = size / sizeof (src[0]); 503 504 ASSERT(pqr->p && !pqr->q && !pqr->r); 505 506 for (i = 0; i < cnt; i++, src++, pqr->p++) 507 *pqr->p ^= *src; 508 509 return (0); 510 } 511 512 static int 513 vdev_raidz_pq_func(void *buf, size_t size, void *private) 514 { 515 struct pqr_struct *pqr = private; 516 const uint64_t *src = buf; 517 uint64_t mask; 518 int i, cnt = size / sizeof (src[0]); 519 520 ASSERT(pqr->p && pqr->q && !pqr->r); 521 522 for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { 523 *pqr->p ^= *src; 524 VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 525 *pqr->q ^= *src; 526 } 527 528 return (0); 529 } 530 531 static int 532 vdev_raidz_pqr_func(void *buf, size_t size, void *private) 533 { 534 struct pqr_struct *pqr = private; 535 const uint64_t *src = buf; 536 uint64_t mask; 537 int i, cnt = size / sizeof (src[0]); 538 539 ASSERT(pqr->p && pqr->q && pqr->r); 540 541 for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { 542 *pqr->p ^= *src; 543 VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 544 *pqr->q ^= *src; 545 VDEV_RAIDZ_64MUL_4(*pqr->r, mask); 546 *pqr->r ^= *src; 547 } 548 549 return (0); 550 } 551 552 static void 553 vdev_raidz_generate_parity_p(raidz_map_t *rm) 554 { 555 uint64_t *p; 556 int c; 557 abd_t *src; 558 559 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 560 src = rm->rm_col[c].rc_abd; 561 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); 562 563 if (c == rm->rm_firstdatacol) { 564 abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); 565 } else { 566 struct pqr_struct pqr = { p, NULL, NULL }; 567 (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, 568 vdev_raidz_p_func, &pqr); 569 } 570 } 571 } 572 573 static void 574 vdev_raidz_generate_parity_pq(raidz_map_t *rm) 575 { 576 uint64_t *p, *q, pcnt, ccnt, mask, i; 577 int c; 578 abd_t *src; 579 580 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 581 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 582 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 583 584 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 585 src = rm->rm_col[c].rc_abd; 586 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); 587 q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); 588 589 ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); 590 591 if (c == rm->rm_firstdatacol) { 592 ASSERT(ccnt == pcnt || ccnt == 0); 593 abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); 594 (void) memcpy(q, p, rm->rm_col[c].rc_size); 595 596 for (i = ccnt; i < pcnt; i++) { 597 p[i] = 0; 598 q[i] = 0; 599 } 600 } else { 601 struct pqr_struct pqr = { p, q, NULL }; 602 603 ASSERT(ccnt <= pcnt); 604 (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, 605 vdev_raidz_pq_func, &pqr); 606 607 /* 608 * Treat short columns as though they are full of 0s. 609 * Note that there's therefore nothing needed for P. 610 */ 611 for (i = ccnt; i < pcnt; i++) { 612 VDEV_RAIDZ_64MUL_2(q[i], mask); 613 } 614 } 615 } 616 } 617 618 static void 619 vdev_raidz_generate_parity_pqr(raidz_map_t *rm) 620 { 621 uint64_t *p, *q, *r, pcnt, ccnt, mask, i; 622 int c; 623 abd_t *src; 624 625 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 626 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 627 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 628 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 629 rm->rm_col[VDEV_RAIDZ_R].rc_size); 630 631 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 632 src = rm->rm_col[c].rc_abd; 633 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); 634 q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); 635 r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd); 636 637 ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); 638 639 if (c == rm->rm_firstdatacol) { 640 ASSERT(ccnt == pcnt || ccnt == 0); 641 abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); 642 (void) memcpy(q, p, rm->rm_col[c].rc_size); 643 (void) memcpy(r, p, rm->rm_col[c].rc_size); 644 645 for (i = ccnt; i < pcnt; i++) { 646 p[i] = 0; 647 q[i] = 0; 648 r[i] = 0; 649 } 650 } else { 651 struct pqr_struct pqr = { p, q, r }; 652 653 ASSERT(ccnt <= pcnt); 654 (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, 655 vdev_raidz_pqr_func, &pqr); 656 657 /* 658 * Treat short columns as though they are full of 0s. 659 * Note that there's therefore nothing needed for P. 660 */ 661 for (i = ccnt; i < pcnt; i++) { 662 VDEV_RAIDZ_64MUL_2(q[i], mask); 663 VDEV_RAIDZ_64MUL_4(r[i], mask); 664 } 665 } 666 } 667 } 668 669 /* 670 * Generate RAID parity in the first virtual columns according to the number of 671 * parity columns available. 672 */ 673 void 674 vdev_raidz_generate_parity(raidz_map_t *rm) 675 { 676 /* Generate using the new math implementation */ 677 if (vdev_raidz_math_generate(rm) != RAIDZ_ORIGINAL_IMPL) 678 return; 679 680 switch (rm->rm_firstdatacol) { 681 case 1: 682 vdev_raidz_generate_parity_p(rm); 683 break; 684 case 2: 685 vdev_raidz_generate_parity_pq(rm); 686 break; 687 case 3: 688 vdev_raidz_generate_parity_pqr(rm); 689 break; 690 default: 691 cmn_err(CE_PANIC, "invalid RAID-Z configuration"); 692 } 693 } 694 695 /* ARGSUSED */ 696 static int 697 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) 698 { 699 uint64_t *dst = dbuf; 700 uint64_t *src = sbuf; 701 int cnt = size / sizeof (src[0]); 702 703 for (int i = 0; i < cnt; i++) { 704 dst[i] ^= src[i]; 705 } 706 707 return (0); 708 } 709 710 /* ARGSUSED */ 711 static int 712 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size, 713 void *private) 714 { 715 uint64_t *dst = dbuf; 716 uint64_t *src = sbuf; 717 uint64_t mask; 718 int cnt = size / sizeof (dst[0]); 719 720 for (int i = 0; i < cnt; i++, dst++, src++) { 721 VDEV_RAIDZ_64MUL_2(*dst, mask); 722 *dst ^= *src; 723 } 724 725 return (0); 726 } 727 728 /* ARGSUSED */ 729 static int 730 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private) 731 { 732 uint64_t *dst = buf; 733 uint64_t mask; 734 int cnt = size / sizeof (dst[0]); 735 736 for (int i = 0; i < cnt; i++, dst++) { 737 /* same operation as vdev_raidz_reconst_q_pre_func() on dst */ 738 VDEV_RAIDZ_64MUL_2(*dst, mask); 739 } 740 741 return (0); 742 } 743 744 struct reconst_q_struct { 745 uint64_t *q; 746 int exp; 747 }; 748 749 static int 750 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private) 751 { 752 struct reconst_q_struct *rq = private; 753 uint64_t *dst = buf; 754 int cnt = size / sizeof (dst[0]); 755 756 for (int i = 0; i < cnt; i++, dst++, rq->q++) { 757 int j; 758 uint8_t *b; 759 760 *dst ^= *rq->q; 761 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 762 *b = vdev_raidz_exp2(*b, rq->exp); 763 } 764 } 765 766 return (0); 767 } 768 769 struct reconst_pq_struct { 770 uint8_t *p; 771 uint8_t *q; 772 uint8_t *pxy; 773 uint8_t *qxy; 774 int aexp; 775 int bexp; 776 }; 777 778 static int 779 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private) 780 { 781 struct reconst_pq_struct *rpq = private; 782 uint8_t *xd = xbuf; 783 uint8_t *yd = ybuf; 784 785 for (int i = 0; i < size; 786 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) { 787 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 788 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 789 *yd = *rpq->p ^ *rpq->pxy ^ *xd; 790 } 791 792 return (0); 793 } 794 795 static int 796 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) 797 { 798 struct reconst_pq_struct *rpq = private; 799 uint8_t *xd = xbuf; 800 801 for (int i = 0; i < size; 802 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) { 803 /* same operation as vdev_raidz_reconst_pq_func() on xd */ 804 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 805 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 806 } 807 808 return (0); 809 } 810 811 static int 812 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) 813 { 814 int x = tgts[0]; 815 int c; 816 abd_t *dst, *src; 817 818 ASSERT(ntgts == 1); 819 ASSERT(x >= rm->rm_firstdatacol); 820 ASSERT(x < rm->rm_cols); 821 822 ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size); 823 ASSERT(rm->rm_col[x].rc_size > 0); 824 825 src = rm->rm_col[VDEV_RAIDZ_P].rc_abd; 826 dst = rm->rm_col[x].rc_abd; 827 828 abd_copy_from_buf(dst, abd_to_buf(src), rm->rm_col[x].rc_size); 829 830 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 831 uint64_t size = MIN(rm->rm_col[x].rc_size, 832 rm->rm_col[c].rc_size); 833 834 src = rm->rm_col[c].rc_abd; 835 dst = rm->rm_col[x].rc_abd; 836 837 if (c == x) 838 continue; 839 840 (void) abd_iterate_func2(dst, src, 0, 0, size, 841 vdev_raidz_reconst_p_func, NULL); 842 } 843 844 return (1 << VDEV_RAIDZ_P); 845 } 846 847 static int 848 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) 849 { 850 int x = tgts[0]; 851 int c, exp; 852 abd_t *dst, *src; 853 854 ASSERT(ntgts == 1); 855 856 ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size); 857 858 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 859 uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size, 860 rm->rm_col[c].rc_size); 861 862 src = rm->rm_col[c].rc_abd; 863 dst = rm->rm_col[x].rc_abd; 864 865 if (c == rm->rm_firstdatacol) { 866 abd_copy(dst, src, size); 867 if (rm->rm_col[x].rc_size > size) 868 abd_zero_off(dst, size, 869 rm->rm_col[x].rc_size - size); 870 871 } else { 872 ASSERT3U(size, <=, rm->rm_col[x].rc_size); 873 (void) abd_iterate_func2(dst, src, 0, 0, size, 874 vdev_raidz_reconst_q_pre_func, NULL); 875 (void) abd_iterate_func(dst, 876 size, rm->rm_col[x].rc_size - size, 877 vdev_raidz_reconst_q_pre_tail_func, NULL); 878 } 879 } 880 881 src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; 882 dst = rm->rm_col[x].rc_abd; 883 exp = 255 - (rm->rm_cols - 1 - x); 884 885 struct reconst_q_struct rq = { abd_to_buf(src), exp }; 886 (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size, 887 vdev_raidz_reconst_q_post_func, &rq); 888 889 return (1 << VDEV_RAIDZ_Q); 890 } 891 892 static int 893 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) 894 { 895 uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; 896 abd_t *pdata, *qdata; 897 uint64_t xsize, ysize; 898 int x = tgts[0]; 899 int y = tgts[1]; 900 abd_t *xd, *yd; 901 902 ASSERT(ntgts == 2); 903 ASSERT(x < y); 904 ASSERT(x >= rm->rm_firstdatacol); 905 ASSERT(y < rm->rm_cols); 906 907 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); 908 909 /* 910 * Move the parity data aside -- we're going to compute parity as 911 * though columns x and y were full of zeros -- Pxy and Qxy. We want to 912 * reuse the parity generation mechanism without trashing the actual 913 * parity so we make those columns appear to be full of zeros by 914 * setting their lengths to zero. 915 */ 916 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd; 917 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; 918 xsize = rm->rm_col[x].rc_size; 919 ysize = rm->rm_col[y].rc_size; 920 921 rm->rm_col[VDEV_RAIDZ_P].rc_abd = 922 abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE); 923 rm->rm_col[VDEV_RAIDZ_Q].rc_abd = 924 abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); 925 rm->rm_col[x].rc_size = 0; 926 rm->rm_col[y].rc_size = 0; 927 928 vdev_raidz_generate_parity_pq(rm); 929 930 rm->rm_col[x].rc_size = xsize; 931 rm->rm_col[y].rc_size = ysize; 932 933 p = abd_to_buf(pdata); 934 q = abd_to_buf(qdata); 935 pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); 936 qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); 937 xd = rm->rm_col[x].rc_abd; 938 yd = rm->rm_col[y].rc_abd; 939 940 /* 941 * We now have: 942 * Pxy = P + D_x + D_y 943 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 944 * 945 * We can then solve for D_x: 946 * D_x = A * (P + Pxy) + B * (Q + Qxy) 947 * where 948 * A = 2^(x - y) * (2^(x - y) + 1)^-1 949 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 950 * 951 * With D_x in hand, we can easily solve for D_y: 952 * D_y = P + Pxy + D_x 953 */ 954 955 a = vdev_raidz_pow2[255 + x - y]; 956 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; 957 tmp = 255 - vdev_raidz_log2[a ^ 1]; 958 959 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 960 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 961 962 ASSERT3U(xsize, >=, ysize); 963 struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp }; 964 965 (void) abd_iterate_func2(xd, yd, 0, 0, ysize, 966 vdev_raidz_reconst_pq_func, &rpq); 967 (void) abd_iterate_func(xd, ysize, xsize - ysize, 968 vdev_raidz_reconst_pq_tail_func, &rpq); 969 970 abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd); 971 abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); 972 973 /* 974 * Restore the saved parity data. 975 */ 976 rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata; 977 rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata; 978 979 return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); 980 } 981 982 /* BEGIN CSTYLED */ 983 /* 984 * In the general case of reconstruction, we must solve the system of linear 985 * equations defined by the coefficients used to generate parity as well as 986 * the contents of the data and parity disks. This can be expressed with 987 * vectors for the original data (D) and the actual data (d) and parity (p) 988 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 989 * 990 * __ __ __ __ 991 * | | __ __ | p_0 | 992 * | V | | D_0 | | p_m-1 | 993 * | | x | : | = | d_0 | 994 * | I | | D_n-1 | | : | 995 * | | ~~ ~~ | d_n-1 | 996 * ~~ ~~ ~~ ~~ 997 * 998 * I is simply a square identity matrix of size n, and V is a vandermonde 999 * matrix defined by the coefficients we chose for the various parity columns 1000 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 1001 * computation as well as linear separability. 1002 * 1003 * __ __ __ __ 1004 * | 1 .. 1 1 1 | | p_0 | 1005 * | 2^n-1 .. 4 2 1 | __ __ | : | 1006 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 1007 * | 1 .. 0 0 0 | | D_1 | | d_0 | 1008 * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 1009 * | : : : : | | : | | d_2 | 1010 * | 0 .. 1 0 0 | | D_n-1 | | : | 1011 * | 0 .. 0 1 0 | ~~ ~~ | : | 1012 * | 0 .. 0 0 1 | | d_n-1 | 1013 * ~~ ~~ ~~ ~~ 1014 * 1015 * Note that I, V, d, and p are known. To compute D, we must invert the 1016 * matrix and use the known data and parity values to reconstruct the unknown 1017 * data values. We begin by removing the rows in V|I and d|p that correspond 1018 * to failed or missing columns; we then make V|I square (n x n) and d|p 1019 * sized n by removing rows corresponding to unused parity from the bottom up 1020 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 1021 * using Gauss-Jordan elimination. In the example below we use m=3 parity 1022 * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 1023 * __ __ 1024 * | 1 1 1 1 1 1 1 1 | 1025 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 1026 * | 19 205 116 29 64 16 4 1 | / / 1027 * | 1 0 0 0 0 0 0 0 | / / 1028 * | 0 1 0 0 0 0 0 0 | <--' / 1029 * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 1030 * | 0 0 0 1 0 0 0 0 | 1031 * | 0 0 0 0 1 0 0 0 | 1032 * | 0 0 0 0 0 1 0 0 | 1033 * | 0 0 0 0 0 0 1 0 | 1034 * | 0 0 0 0 0 0 0 1 | 1035 * ~~ ~~ 1036 * __ __ 1037 * | 1 1 1 1 1 1 1 1 | 1038 * | 128 64 32 16 8 4 2 1 | 1039 * | 19 205 116 29 64 16 4 1 | 1040 * | 1 0 0 0 0 0 0 0 | 1041 * | 0 1 0 0 0 0 0 0 | 1042 * (V|I)' = | 0 0 1 0 0 0 0 0 | 1043 * | 0 0 0 1 0 0 0 0 | 1044 * | 0 0 0 0 1 0 0 0 | 1045 * | 0 0 0 0 0 1 0 0 | 1046 * | 0 0 0 0 0 0 1 0 | 1047 * | 0 0 0 0 0 0 0 1 | 1048 * ~~ ~~ 1049 * 1050 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 1051 * have carefully chosen the seed values 1, 2, and 4 to ensure that this 1052 * matrix is not singular. 1053 * __ __ 1054 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1055 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1056 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1057 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1058 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1059 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1060 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1061 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1062 * ~~ ~~ 1063 * __ __ 1064 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1065 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1066 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1067 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1068 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1069 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1070 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1071 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1072 * ~~ ~~ 1073 * __ __ 1074 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1075 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1076 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 1077 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1078 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1079 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1080 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1081 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1082 * ~~ ~~ 1083 * __ __ 1084 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1085 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1086 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 1087 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1088 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1089 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1090 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1091 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1092 * ~~ ~~ 1093 * __ __ 1094 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1095 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1096 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1097 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1098 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1099 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1100 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1101 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1102 * ~~ ~~ 1103 * __ __ 1104 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1105 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 1106 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1107 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1108 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1109 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1110 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1111 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1112 * ~~ ~~ 1113 * __ __ 1114 * | 0 0 1 0 0 0 0 0 | 1115 * | 167 100 5 41 159 169 217 208 | 1116 * | 166 100 4 40 158 168 216 209 | 1117 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 1118 * | 0 0 0 0 1 0 0 0 | 1119 * | 0 0 0 0 0 1 0 0 | 1120 * | 0 0 0 0 0 0 1 0 | 1121 * | 0 0 0 0 0 0 0 1 | 1122 * ~~ ~~ 1123 * 1124 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 1125 * of the missing data. 1126 * 1127 * As is apparent from the example above, the only non-trivial rows in the 1128 * inverse matrix correspond to the data disks that we're trying to 1129 * reconstruct. Indeed, those are the only rows we need as the others would 1130 * only be useful for reconstructing data known or assumed to be valid. For 1131 * that reason, we only build the coefficients in the rows that correspond to 1132 * targeted columns. 1133 */ 1134 /* END CSTYLED */ 1135 1136 static void 1137 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, 1138 uint8_t **rows) 1139 { 1140 int i, j; 1141 int pow; 1142 1143 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); 1144 1145 /* 1146 * Fill in the missing rows of interest. 1147 */ 1148 for (i = 0; i < nmap; i++) { 1149 ASSERT3S(0, <=, map[i]); 1150 ASSERT3S(map[i], <=, 2); 1151 1152 pow = map[i] * n; 1153 if (pow > 255) 1154 pow -= 255; 1155 ASSERT(pow <= 255); 1156 1157 for (j = 0; j < n; j++) { 1158 pow -= map[i]; 1159 if (pow < 0) 1160 pow += 255; 1161 rows[i][j] = vdev_raidz_pow2[pow]; 1162 } 1163 } 1164 } 1165 1166 static void 1167 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, 1168 uint8_t **rows, uint8_t **invrows, const uint8_t *used) 1169 { 1170 int i, j, ii, jj; 1171 uint8_t log; 1172 1173 /* 1174 * Assert that the first nmissing entries from the array of used 1175 * columns correspond to parity columns and that subsequent entries 1176 * correspond to data columns. 1177 */ 1178 for (i = 0; i < nmissing; i++) { 1179 ASSERT3S(used[i], <, rm->rm_firstdatacol); 1180 } 1181 for (; i < n; i++) { 1182 ASSERT3S(used[i], >=, rm->rm_firstdatacol); 1183 } 1184 1185 /* 1186 * First initialize the storage where we'll compute the inverse rows. 1187 */ 1188 for (i = 0; i < nmissing; i++) { 1189 for (j = 0; j < n; j++) { 1190 invrows[i][j] = (i == j) ? 1 : 0; 1191 } 1192 } 1193 1194 /* 1195 * Subtract all trivial rows from the rows of consequence. 1196 */ 1197 for (i = 0; i < nmissing; i++) { 1198 for (j = nmissing; j < n; j++) { 1199 ASSERT3U(used[j], >=, rm->rm_firstdatacol); 1200 jj = used[j] - rm->rm_firstdatacol; 1201 ASSERT3S(jj, <, n); 1202 invrows[i][j] = rows[i][jj]; 1203 rows[i][jj] = 0; 1204 } 1205 } 1206 1207 /* 1208 * For each of the rows of interest, we must normalize it and subtract 1209 * a multiple of it from the other rows. 1210 */ 1211 for (i = 0; i < nmissing; i++) { 1212 for (j = 0; j < missing[i]; j++) { 1213 ASSERT0(rows[i][j]); 1214 } 1215 ASSERT3U(rows[i][missing[i]], !=, 0); 1216 1217 /* 1218 * Compute the inverse of the first element and multiply each 1219 * element in the row by that value. 1220 */ 1221 log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 1222 1223 for (j = 0; j < n; j++) { 1224 rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 1225 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 1226 } 1227 1228 for (ii = 0; ii < nmissing; ii++) { 1229 if (i == ii) 1230 continue; 1231 1232 ASSERT3U(rows[ii][missing[i]], !=, 0); 1233 1234 log = vdev_raidz_log2[rows[ii][missing[i]]]; 1235 1236 for (j = 0; j < n; j++) { 1237 rows[ii][j] ^= 1238 vdev_raidz_exp2(rows[i][j], log); 1239 invrows[ii][j] ^= 1240 vdev_raidz_exp2(invrows[i][j], log); 1241 } 1242 } 1243 } 1244 1245 /* 1246 * Verify that the data that is left in the rows are properly part of 1247 * an identity matrix. 1248 */ 1249 for (i = 0; i < nmissing; i++) { 1250 for (j = 0; j < n; j++) { 1251 if (j == missing[i]) { 1252 ASSERT3U(rows[i][j], ==, 1); 1253 } else { 1254 ASSERT0(rows[i][j]); 1255 } 1256 } 1257 } 1258 } 1259 1260 static void 1261 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, 1262 int *missing, uint8_t **invrows, const uint8_t *used) 1263 { 1264 int i, j, x, cc, c; 1265 uint8_t *src; 1266 uint64_t ccount; 1267 uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL }; 1268 uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 }; 1269 uint8_t log = 0; 1270 uint8_t val; 1271 int ll; 1272 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 1273 uint8_t *p, *pp; 1274 size_t psize; 1275 1276 psize = sizeof (invlog[0][0]) * n * nmissing; 1277 p = kmem_alloc(psize, KM_SLEEP); 1278 1279 for (pp = p, i = 0; i < nmissing; i++) { 1280 invlog[i] = pp; 1281 pp += n; 1282 } 1283 1284 for (i = 0; i < nmissing; i++) { 1285 for (j = 0; j < n; j++) { 1286 ASSERT3U(invrows[i][j], !=, 0); 1287 invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 1288 } 1289 } 1290 1291 for (i = 0; i < n; i++) { 1292 c = used[i]; 1293 ASSERT3U(c, <, rm->rm_cols); 1294 1295 src = abd_to_buf(rm->rm_col[c].rc_abd); 1296 ccount = rm->rm_col[c].rc_size; 1297 for (j = 0; j < nmissing; j++) { 1298 cc = missing[j] + rm->rm_firstdatacol; 1299 ASSERT3U(cc, >=, rm->rm_firstdatacol); 1300 ASSERT3U(cc, <, rm->rm_cols); 1301 ASSERT3U(cc, !=, c); 1302 1303 dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd); 1304 dcount[j] = rm->rm_col[cc].rc_size; 1305 } 1306 1307 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); 1308 1309 for (x = 0; x < ccount; x++, src++) { 1310 if (*src != 0) 1311 log = vdev_raidz_log2[*src]; 1312 1313 for (cc = 0; cc < nmissing; cc++) { 1314 if (x >= dcount[cc]) 1315 continue; 1316 1317 if (*src == 0) { 1318 val = 0; 1319 } else { 1320 if ((ll = log + invlog[cc][i]) >= 255) 1321 ll -= 255; 1322 val = vdev_raidz_pow2[ll]; 1323 } 1324 1325 if (i == 0) 1326 dst[cc][x] = val; 1327 else 1328 dst[cc][x] ^= val; 1329 } 1330 } 1331 } 1332 1333 kmem_free(p, psize); 1334 } 1335 1336 static int 1337 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) 1338 { 1339 int n, i, c, t, tt; 1340 int nmissing_rows; 1341 int missing_rows[VDEV_RAIDZ_MAXPARITY]; 1342 int parity_map[VDEV_RAIDZ_MAXPARITY]; 1343 1344 uint8_t *p, *pp; 1345 size_t psize; 1346 1347 uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 1348 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 1349 uint8_t *used; 1350 1351 abd_t **bufs = NULL; 1352 1353 int code = 0; 1354 1355 /* 1356 * Matrix reconstruction can't use scatter ABDs yet, so we allocate 1357 * temporary linear ABDs. 1358 */ 1359 if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) { 1360 bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE); 1361 1362 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1363 raidz_col_t *col = &rm->rm_col[c]; 1364 1365 bufs[c] = col->rc_abd; 1366 col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE); 1367 abd_copy(col->rc_abd, bufs[c], col->rc_size); 1368 } 1369 } 1370 1371 n = rm->rm_cols - rm->rm_firstdatacol; 1372 1373 /* 1374 * Figure out which data columns are missing. 1375 */ 1376 nmissing_rows = 0; 1377 for (t = 0; t < ntgts; t++) { 1378 if (tgts[t] >= rm->rm_firstdatacol) { 1379 missing_rows[nmissing_rows++] = 1380 tgts[t] - rm->rm_firstdatacol; 1381 } 1382 } 1383 1384 /* 1385 * Figure out which parity columns to use to help generate the missing 1386 * data columns. 1387 */ 1388 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 1389 ASSERT(tt < ntgts); 1390 ASSERT(c < rm->rm_firstdatacol); 1391 1392 /* 1393 * Skip any targeted parity columns. 1394 */ 1395 if (c == tgts[tt]) { 1396 tt++; 1397 continue; 1398 } 1399 1400 code |= 1 << c; 1401 1402 parity_map[i] = c; 1403 i++; 1404 } 1405 1406 ASSERT(code != 0); 1407 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); 1408 1409 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1410 nmissing_rows * n + sizeof (used[0]) * n; 1411 p = kmem_alloc(psize, KM_SLEEP); 1412 1413 for (pp = p, i = 0; i < nmissing_rows; i++) { 1414 rows[i] = pp; 1415 pp += n; 1416 invrows[i] = pp; 1417 pp += n; 1418 } 1419 used = pp; 1420 1421 for (i = 0; i < nmissing_rows; i++) { 1422 used[i] = parity_map[i]; 1423 } 1424 1425 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1426 if (tt < nmissing_rows && 1427 c == missing_rows[tt] + rm->rm_firstdatacol) { 1428 tt++; 1429 continue; 1430 } 1431 1432 ASSERT3S(i, <, n); 1433 used[i] = c; 1434 i++; 1435 } 1436 1437 /* 1438 * Initialize the interesting rows of the matrix. 1439 */ 1440 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); 1441 1442 /* 1443 * Invert the matrix. 1444 */ 1445 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, 1446 invrows, used); 1447 1448 /* 1449 * Reconstruct the missing data using the generated matrix. 1450 */ 1451 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, 1452 invrows, used); 1453 1454 kmem_free(p, psize); 1455 1456 /* 1457 * copy back from temporary linear abds and free them 1458 */ 1459 if (bufs) { 1460 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1461 raidz_col_t *col = &rm->rm_col[c]; 1462 1463 abd_copy(bufs[c], col->rc_abd, col->rc_size); 1464 abd_free(col->rc_abd); 1465 col->rc_abd = bufs[c]; 1466 } 1467 kmem_free(bufs, rm->rm_cols * sizeof (abd_t *)); 1468 } 1469 1470 return (code); 1471 } 1472 1473 int 1474 vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) 1475 { 1476 int tgts[VDEV_RAIDZ_MAXPARITY], *dt; 1477 int ntgts; 1478 int i, c, ret; 1479 int code; 1480 int nbadparity, nbaddata; 1481 int parity_valid[VDEV_RAIDZ_MAXPARITY]; 1482 1483 /* 1484 * The tgts list must already be sorted. 1485 */ 1486 for (i = 1; i < nt; i++) { 1487 ASSERT(t[i] > t[i - 1]); 1488 } 1489 1490 nbadparity = rm->rm_firstdatacol; 1491 nbaddata = rm->rm_cols - nbadparity; 1492 ntgts = 0; 1493 for (i = 0, c = 0; c < rm->rm_cols; c++) { 1494 if (c < rm->rm_firstdatacol) 1495 parity_valid[c] = B_FALSE; 1496 1497 if (i < nt && c == t[i]) { 1498 tgts[ntgts++] = c; 1499 i++; 1500 } else if (rm->rm_col[c].rc_error != 0) { 1501 tgts[ntgts++] = c; 1502 } else if (c >= rm->rm_firstdatacol) { 1503 nbaddata--; 1504 } else { 1505 parity_valid[c] = B_TRUE; 1506 nbadparity--; 1507 } 1508 } 1509 1510 ASSERT(ntgts >= nt); 1511 ASSERT(nbaddata >= 0); 1512 ASSERT(nbaddata + nbadparity == ntgts); 1513 1514 dt = &tgts[nbadparity]; 1515 1516 /* Reconstruct using the new math implementation */ 1517 ret = vdev_raidz_math_reconstruct(rm, parity_valid, dt, nbaddata); 1518 if (ret != RAIDZ_ORIGINAL_IMPL) 1519 return (ret); 1520 1521 /* 1522 * See if we can use any of our optimized reconstruction routines. 1523 */ 1524 switch (nbaddata) { 1525 case 1: 1526 if (parity_valid[VDEV_RAIDZ_P]) 1527 return (vdev_raidz_reconstruct_p(rm, dt, 1)); 1528 1529 ASSERT(rm->rm_firstdatacol > 1); 1530 1531 if (parity_valid[VDEV_RAIDZ_Q]) 1532 return (vdev_raidz_reconstruct_q(rm, dt, 1)); 1533 1534 ASSERT(rm->rm_firstdatacol > 2); 1535 break; 1536 1537 case 2: 1538 ASSERT(rm->rm_firstdatacol > 1); 1539 1540 if (parity_valid[VDEV_RAIDZ_P] && 1541 parity_valid[VDEV_RAIDZ_Q]) 1542 return (vdev_raidz_reconstruct_pq(rm, dt, 2)); 1543 1544 ASSERT(rm->rm_firstdatacol > 2); 1545 1546 break; 1547 } 1548 1549 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); 1550 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); 1551 ASSERT(code > 0); 1552 return (code); 1553 } 1554 1555 static int 1556 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, 1557 uint64_t *logical_ashift, uint64_t *physical_ashift) 1558 { 1559 vdev_t *cvd; 1560 uint64_t nparity = vd->vdev_nparity; 1561 int c; 1562 int lasterror = 0; 1563 int numerrors = 0; 1564 1565 ASSERT(nparity > 0); 1566 1567 if (nparity > VDEV_RAIDZ_MAXPARITY || 1568 vd->vdev_children < nparity + 1) { 1569 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 1570 return (SET_ERROR(EINVAL)); 1571 } 1572 1573 vdev_open_children(vd); 1574 1575 for (c = 0; c < vd->vdev_children; c++) { 1576 cvd = vd->vdev_child[c]; 1577 1578 if (cvd->vdev_open_error != 0) { 1579 lasterror = cvd->vdev_open_error; 1580 numerrors++; 1581 continue; 1582 } 1583 1584 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 1585 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; 1586 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); 1587 *physical_ashift = MAX(*physical_ashift, 1588 cvd->vdev_physical_ashift); 1589 } 1590 1591 *asize *= vd->vdev_children; 1592 *max_asize *= vd->vdev_children; 1593 1594 if (numerrors > nparity) { 1595 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 1596 return (lasterror); 1597 } 1598 1599 return (0); 1600 } 1601 1602 static void 1603 vdev_raidz_close(vdev_t *vd) 1604 { 1605 int c; 1606 1607 for (c = 0; c < vd->vdev_children; c++) 1608 vdev_close(vd->vdev_child[c]); 1609 } 1610 1611 static uint64_t 1612 vdev_raidz_asize(vdev_t *vd, uint64_t psize) 1613 { 1614 uint64_t asize; 1615 uint64_t ashift = vd->vdev_top->vdev_ashift; 1616 uint64_t cols = vd->vdev_children; 1617 uint64_t nparity = vd->vdev_nparity; 1618 1619 asize = ((psize - 1) >> ashift) + 1; 1620 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 1621 asize = roundup(asize, nparity + 1) << ashift; 1622 1623 return (asize); 1624 } 1625 1626 static void 1627 vdev_raidz_child_done(zio_t *zio) 1628 { 1629 raidz_col_t *rc = zio->io_private; 1630 1631 rc->rc_error = zio->io_error; 1632 rc->rc_tried = 1; 1633 rc->rc_skipped = 0; 1634 } 1635 1636 static void 1637 vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col) 1638 { 1639 #ifdef ZFS_DEBUG 1640 vdev_t *vd = zio->io_vd; 1641 vdev_t *tvd = vd->vdev_top; 1642 1643 range_seg64_t logical_rs, physical_rs; 1644 logical_rs.rs_start = zio->io_offset; 1645 logical_rs.rs_end = logical_rs.rs_start + 1646 vdev_raidz_asize(zio->io_vd, zio->io_size); 1647 1648 raidz_col_t *rc = &rm->rm_col[col]; 1649 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 1650 1651 vdev_xlate(cvd, &logical_rs, &physical_rs); 1652 ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); 1653 ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); 1654 /* 1655 * It would be nice to assert that rs_end is equal 1656 * to rc_offset + rc_size but there might be an 1657 * optional I/O at the end that is not accounted in 1658 * rc_size. 1659 */ 1660 if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { 1661 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + 1662 rc->rc_size + (1 << tvd->vdev_ashift)); 1663 } else { 1664 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); 1665 } 1666 #endif 1667 } 1668 1669 /* 1670 * Start an IO operation on a RAIDZ VDev 1671 * 1672 * Outline: 1673 * - For write operations: 1674 * 1. Generate the parity data 1675 * 2. Create child zio write operations to each column's vdev, for both 1676 * data and parity. 1677 * 3. If the column skips any sectors for padding, create optional dummy 1678 * write zio children for those areas to improve aggregation continuity. 1679 * - For read operations: 1680 * 1. Create child zio read operations to each data column's vdev to read 1681 * the range of data required for zio. 1682 * 2. If this is a scrub or resilver operation, or if any of the data 1683 * vdevs have had errors, then create zio read operations to the parity 1684 * columns' VDevs as well. 1685 */ 1686 static void 1687 vdev_raidz_io_start(zio_t *zio) 1688 { 1689 vdev_t *vd = zio->io_vd; 1690 vdev_t *tvd = vd->vdev_top; 1691 vdev_t *cvd; 1692 raidz_map_t *rm; 1693 raidz_col_t *rc; 1694 int c, i; 1695 1696 rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, 1697 vd->vdev_nparity); 1698 1699 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); 1700 1701 if (zio->io_type == ZIO_TYPE_WRITE) { 1702 vdev_raidz_generate_parity(rm); 1703 1704 for (c = 0; c < rm->rm_cols; c++) { 1705 rc = &rm->rm_col[c]; 1706 cvd = vd->vdev_child[rc->rc_devidx]; 1707 1708 /* 1709 * Verify physical to logical translation. 1710 */ 1711 vdev_raidz_io_verify(zio, rm, c); 1712 1713 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1714 rc->rc_offset, rc->rc_abd, rc->rc_size, 1715 zio->io_type, zio->io_priority, 0, 1716 vdev_raidz_child_done, rc)); 1717 } 1718 1719 /* 1720 * Generate optional I/Os for any skipped sectors to improve 1721 * aggregation contiguity. 1722 */ 1723 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { 1724 ASSERT(c <= rm->rm_scols); 1725 if (c == rm->rm_scols) 1726 c = 0; 1727 rc = &rm->rm_col[c]; 1728 cvd = vd->vdev_child[rc->rc_devidx]; 1729 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1730 rc->rc_offset + rc->rc_size, NULL, 1731 1 << tvd->vdev_ashift, 1732 zio->io_type, zio->io_priority, 1733 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); 1734 } 1735 1736 zio_execute(zio); 1737 return; 1738 } 1739 1740 ASSERT(zio->io_type == ZIO_TYPE_READ); 1741 1742 /* 1743 * Iterate over the columns in reverse order so that we hit the parity 1744 * last -- any errors along the way will force us to read the parity. 1745 */ 1746 for (c = rm->rm_cols - 1; c >= 0; c--) { 1747 rc = &rm->rm_col[c]; 1748 cvd = vd->vdev_child[rc->rc_devidx]; 1749 if (!vdev_readable(cvd)) { 1750 if (c >= rm->rm_firstdatacol) 1751 rm->rm_missingdata++; 1752 else 1753 rm->rm_missingparity++; 1754 rc->rc_error = SET_ERROR(ENXIO); 1755 rc->rc_tried = 1; /* don't even try */ 1756 rc->rc_skipped = 1; 1757 continue; 1758 } 1759 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 1760 if (c >= rm->rm_firstdatacol) 1761 rm->rm_missingdata++; 1762 else 1763 rm->rm_missingparity++; 1764 rc->rc_error = SET_ERROR(ESTALE); 1765 rc->rc_skipped = 1; 1766 continue; 1767 } 1768 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || 1769 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { 1770 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1771 rc->rc_offset, rc->rc_abd, rc->rc_size, 1772 zio->io_type, zio->io_priority, 0, 1773 vdev_raidz_child_done, rc)); 1774 } 1775 } 1776 1777 zio_execute(zio); 1778 } 1779 1780 1781 /* 1782 * Report a checksum error for a child of a RAID-Z device. 1783 */ 1784 static void 1785 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) 1786 { 1787 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 1788 1789 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 1790 zio_bad_cksum_t zbc; 1791 raidz_map_t *rm = zio->io_vsd; 1792 1793 mutex_enter(&vd->vdev_stat_lock); 1794 vd->vdev_stat.vs_checksum_errors++; 1795 mutex_exit(&vd->vdev_stat_lock); 1796 1797 zbc.zbc_has_cksum = 0; 1798 zbc.zbc_injected = rm->rm_ecksuminjected; 1799 1800 (void) zfs_ereport_post_checksum(zio->io_spa, vd, 1801 &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, 1802 rc->rc_abd, bad_data, &zbc); 1803 } 1804 } 1805 1806 /* 1807 * We keep track of whether or not there were any injected errors, so that 1808 * any ereports we generate can note it. 1809 */ 1810 static int 1811 raidz_checksum_verify(zio_t *zio) 1812 { 1813 zio_bad_cksum_t zbc; 1814 raidz_map_t *rm = zio->io_vsd; 1815 1816 bzero(&zbc, sizeof (zio_bad_cksum_t)); 1817 1818 int ret = zio_checksum_error(zio, &zbc); 1819 if (ret != 0 && zbc.zbc_injected != 0) 1820 rm->rm_ecksuminjected = 1; 1821 1822 return (ret); 1823 } 1824 1825 /* 1826 * Generate the parity from the data columns. If we tried and were able to 1827 * read the parity without error, verify that the generated parity matches the 1828 * data we read. If it doesn't, we fire off a checksum error. Return the 1829 * number such failures. 1830 */ 1831 static int 1832 raidz_parity_verify(zio_t *zio, raidz_map_t *rm) 1833 { 1834 abd_t *orig[VDEV_RAIDZ_MAXPARITY]; 1835 int c, ret = 0; 1836 raidz_col_t *rc; 1837 1838 blkptr_t *bp = zio->io_bp; 1839 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : 1840 (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); 1841 1842 if (checksum == ZIO_CHECKSUM_NOPARITY) 1843 return (ret); 1844 1845 for (c = 0; c < rm->rm_firstdatacol; c++) { 1846 rc = &rm->rm_col[c]; 1847 if (!rc->rc_tried || rc->rc_error != 0) 1848 continue; 1849 1850 orig[c] = abd_alloc_sametype(rc->rc_abd, rc->rc_size); 1851 abd_copy(orig[c], rc->rc_abd, rc->rc_size); 1852 } 1853 1854 vdev_raidz_generate_parity(rm); 1855 1856 for (c = 0; c < rm->rm_firstdatacol; c++) { 1857 rc = &rm->rm_col[c]; 1858 if (!rc->rc_tried || rc->rc_error != 0) 1859 continue; 1860 if (abd_cmp(orig[c], rc->rc_abd) != 0) { 1861 raidz_checksum_error(zio, rc, orig[c]); 1862 rc->rc_error = SET_ERROR(ECKSUM); 1863 ret++; 1864 } 1865 abd_free(orig[c]); 1866 } 1867 1868 return (ret); 1869 } 1870 1871 static int 1872 vdev_raidz_worst_error(raidz_map_t *rm) 1873 { 1874 int error = 0; 1875 1876 for (int c = 0; c < rm->rm_cols; c++) 1877 error = zio_worst_error(error, rm->rm_col[c].rc_error); 1878 1879 return (error); 1880 } 1881 1882 /* 1883 * Iterate over all combinations of bad data and attempt a reconstruction. 1884 * Note that the algorithm below is non-optimal because it doesn't take into 1885 * account how reconstruction is actually performed. For example, with 1886 * triple-parity RAID-Z the reconstruction procedure is the same if column 4 1887 * is targeted as invalid as if columns 1 and 4 are targeted since in both 1888 * cases we'd only use parity information in column 0. 1889 */ 1890 static int 1891 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) 1892 { 1893 raidz_map_t *rm = zio->io_vsd; 1894 raidz_col_t *rc; 1895 abd_t *orig[VDEV_RAIDZ_MAXPARITY]; 1896 int tstore[VDEV_RAIDZ_MAXPARITY + 2]; 1897 int *tgts = &tstore[1]; 1898 int curr, next, i, c, n; 1899 int code, ret = 0; 1900 1901 ASSERT(total_errors < rm->rm_firstdatacol); 1902 1903 /* 1904 * This simplifies one edge condition. 1905 */ 1906 tgts[-1] = -1; 1907 1908 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { 1909 /* 1910 * Initialize the targets array by finding the first n columns 1911 * that contain no error. 1912 * 1913 * If there were no data errors, we need to ensure that we're 1914 * always explicitly attempting to reconstruct at least one 1915 * data column. To do this, we simply push the highest target 1916 * up into the data columns. 1917 */ 1918 for (c = 0, i = 0; i < n; i++) { 1919 if (i == n - 1 && data_errors == 0 && 1920 c < rm->rm_firstdatacol) { 1921 c = rm->rm_firstdatacol; 1922 } 1923 1924 while (rm->rm_col[c].rc_error != 0) { 1925 c++; 1926 ASSERT3S(c, <, rm->rm_cols); 1927 } 1928 1929 tgts[i] = c++; 1930 } 1931 1932 /* 1933 * Setting tgts[n] simplifies the other edge condition. 1934 */ 1935 tgts[n] = rm->rm_cols; 1936 1937 /* 1938 * These buffers were allocated in previous iterations. 1939 */ 1940 for (i = 0; i < n - 1; i++) { 1941 ASSERT(orig[i] != NULL); 1942 } 1943 1944 orig[n - 1] = abd_alloc_sametype(rm->rm_col[0].rc_abd, 1945 rm->rm_col[0].rc_size); 1946 1947 curr = 0; 1948 next = tgts[curr]; 1949 1950 while (curr != n) { 1951 tgts[curr] = next; 1952 curr = 0; 1953 1954 /* 1955 * Save off the original data that we're going to 1956 * attempt to reconstruct. 1957 */ 1958 for (i = 0; i < n; i++) { 1959 ASSERT(orig[i] != NULL); 1960 c = tgts[i]; 1961 ASSERT3S(c, >=, 0); 1962 ASSERT3S(c, <, rm->rm_cols); 1963 rc = &rm->rm_col[c]; 1964 abd_copy(orig[i], rc->rc_abd, rc->rc_size); 1965 } 1966 1967 /* 1968 * Attempt a reconstruction and exit the outer loop on 1969 * success. 1970 */ 1971 code = vdev_raidz_reconstruct(rm, tgts, n); 1972 if (raidz_checksum_verify(zio) == 0) { 1973 1974 for (i = 0; i < n; i++) { 1975 c = tgts[i]; 1976 rc = &rm->rm_col[c]; 1977 ASSERT(rc->rc_error == 0); 1978 if (rc->rc_tried) 1979 raidz_checksum_error(zio, rc, 1980 orig[i]); 1981 rc->rc_error = SET_ERROR(ECKSUM); 1982 } 1983 1984 ret = code; 1985 goto done; 1986 } 1987 1988 /* 1989 * Restore the original data. 1990 */ 1991 for (i = 0; i < n; i++) { 1992 c = tgts[i]; 1993 rc = &rm->rm_col[c]; 1994 abd_copy(rc->rc_abd, orig[i], rc->rc_size); 1995 } 1996 1997 do { 1998 /* 1999 * Find the next valid column after the curr 2000 * position.. 2001 */ 2002 for (next = tgts[curr] + 1; 2003 next < rm->rm_cols && 2004 rm->rm_col[next].rc_error != 0; next++) 2005 continue; 2006 2007 ASSERT(next <= tgts[curr + 1]); 2008 2009 /* 2010 * If that spot is available, we're done here. 2011 */ 2012 if (next != tgts[curr + 1]) 2013 break; 2014 2015 /* 2016 * Otherwise, find the next valid column after 2017 * the previous position. 2018 */ 2019 for (c = tgts[curr - 1] + 1; 2020 rm->rm_col[c].rc_error != 0; c++) 2021 continue; 2022 2023 tgts[curr] = c; 2024 curr++; 2025 2026 } while (curr != n); 2027 } 2028 } 2029 n--; 2030 done: 2031 for (i = 0; i < n; i++) 2032 abd_free(orig[i]); 2033 2034 return (ret); 2035 } 2036 2037 /* 2038 * Complete an IO operation on a RAIDZ VDev 2039 * 2040 * Outline: 2041 * - For write operations: 2042 * 1. Check for errors on the child IOs. 2043 * 2. Return, setting an error code if too few child VDevs were written 2044 * to reconstruct the data later. Note that partial writes are 2045 * considered successful if they can be reconstructed at all. 2046 * - For read operations: 2047 * 1. Check for errors on the child IOs. 2048 * 2. If data errors occurred: 2049 * a. Try to reassemble the data from the parity available. 2050 * b. If we haven't yet read the parity drives, read them now. 2051 * c. If all parity drives have been read but the data still doesn't 2052 * reassemble with a correct checksum, then try combinatorial 2053 * reconstruction. 2054 * d. If that doesn't work, return an error. 2055 * 3. If there were unexpected errors or this is a resilver operation, 2056 * rewrite the vdevs that had errors. 2057 */ 2058 static void 2059 vdev_raidz_io_done(zio_t *zio) 2060 { 2061 vdev_t *vd = zio->io_vd; 2062 vdev_t *cvd; 2063 raidz_map_t *rm = zio->io_vsd; 2064 raidz_col_t *rc = NULL; 2065 int unexpected_errors = 0; 2066 int parity_errors = 0; 2067 int parity_untried = 0; 2068 int data_errors = 0; 2069 int total_errors = 0; 2070 int n, c; 2071 int tgts[VDEV_RAIDZ_MAXPARITY]; 2072 int code; 2073 2074 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ 2075 2076 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); 2077 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); 2078 2079 for (c = 0; c < rm->rm_cols; c++) { 2080 rc = &rm->rm_col[c]; 2081 2082 if (rc->rc_error) { 2083 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 2084 2085 if (c < rm->rm_firstdatacol) 2086 parity_errors++; 2087 else 2088 data_errors++; 2089 2090 if (!rc->rc_skipped) 2091 unexpected_errors++; 2092 2093 total_errors++; 2094 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { 2095 parity_untried++; 2096 } 2097 } 2098 2099 if (zio->io_type == ZIO_TYPE_WRITE) { 2100 /* 2101 * XXX -- for now, treat partial writes as a success. 2102 * (If we couldn't write enough columns to reconstruct 2103 * the data, the I/O failed. Otherwise, good enough.) 2104 * 2105 * Now that we support write reallocation, it would be better 2106 * to treat partial failure as real failure unless there are 2107 * no non-degraded top-level vdevs left, and not update DTLs 2108 * if we intend to reallocate. 2109 */ 2110 /* XXPOLICY */ 2111 if (total_errors > rm->rm_firstdatacol) 2112 zio->io_error = vdev_raidz_worst_error(rm); 2113 2114 return; 2115 } 2116 2117 ASSERT(zio->io_type == ZIO_TYPE_READ); 2118 /* 2119 * There are three potential phases for a read: 2120 * 1. produce valid data from the columns read 2121 * 2. read all disks and try again 2122 * 3. perform combinatorial reconstruction 2123 * 2124 * Each phase is progressively both more expensive and less likely to 2125 * occur. If we encounter more errors than we can repair or all phases 2126 * fail, we have no choice but to return an error. 2127 */ 2128 2129 /* 2130 * If the number of errors we saw was correctable -- less than or equal 2131 * to the number of parity disks read -- attempt to produce data that 2132 * has a valid checksum. Naturally, this case applies in the absence of 2133 * any errors. 2134 */ 2135 if (total_errors <= rm->rm_firstdatacol - parity_untried) { 2136 if (data_errors == 0) { 2137 if (raidz_checksum_verify(zio) == 0) { 2138 /* 2139 * If we read parity information (unnecessarily 2140 * as it happens since no reconstruction was 2141 * needed) regenerate and verify the parity. 2142 * We also regenerate parity when resilvering 2143 * so we can write it out to the failed device 2144 * later. 2145 */ 2146 if (parity_errors + parity_untried < 2147 rm->rm_firstdatacol || 2148 (zio->io_flags & ZIO_FLAG_RESILVER)) { 2149 n = raidz_parity_verify(zio, rm); 2150 unexpected_errors += n; 2151 ASSERT(parity_errors + n <= 2152 rm->rm_firstdatacol); 2153 } 2154 goto done; 2155 } 2156 } else { 2157 /* 2158 * We either attempt to read all the parity columns or 2159 * none of them. If we didn't try to read parity, we 2160 * wouldn't be here in the correctable case. There must 2161 * also have been fewer parity errors than parity 2162 * columns or, again, we wouldn't be in this code path. 2163 */ 2164 ASSERT(parity_untried == 0); 2165 ASSERT(parity_errors < rm->rm_firstdatacol); 2166 2167 /* 2168 * Identify the data columns that reported an error. 2169 */ 2170 n = 0; 2171 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 2172 rc = &rm->rm_col[c]; 2173 if (rc->rc_error != 0) { 2174 ASSERT(n < VDEV_RAIDZ_MAXPARITY); 2175 tgts[n++] = c; 2176 } 2177 } 2178 2179 ASSERT(rm->rm_firstdatacol >= n); 2180 2181 code = vdev_raidz_reconstruct(rm, tgts, n); 2182 2183 if (raidz_checksum_verify(zio) == 0) { 2184 /* 2185 * If we read more parity disks than were used 2186 * for reconstruction, confirm that the other 2187 * parity disks produced correct data. This 2188 * routine is suboptimal in that it regenerates 2189 * the parity that we already used in addition 2190 * to the parity that we're attempting to 2191 * verify, but this should be a relatively 2192 * uncommon case, and can be optimized if it 2193 * becomes a problem. Note that we regenerate 2194 * parity when resilvering so we can write it 2195 * out to failed devices later. 2196 */ 2197 if (parity_errors < rm->rm_firstdatacol - n || 2198 (zio->io_flags & ZIO_FLAG_RESILVER)) { 2199 n = raidz_parity_verify(zio, rm); 2200 unexpected_errors += n; 2201 ASSERT(parity_errors + n <= 2202 rm->rm_firstdatacol); 2203 } 2204 2205 goto done; 2206 } 2207 } 2208 } 2209 2210 /* 2211 * This isn't a typical situation -- either we got a read error or 2212 * a child silently returned bad data. Read every block so we can 2213 * try again with as much data and parity as we can track down. If 2214 * we've already been through once before, all children will be marked 2215 * as tried so we'll proceed to combinatorial reconstruction. 2216 */ 2217 unexpected_errors = 1; 2218 rm->rm_missingdata = 0; 2219 rm->rm_missingparity = 0; 2220 2221 for (c = 0; c < rm->rm_cols; c++) { 2222 if (rm->rm_col[c].rc_tried) 2223 continue; 2224 2225 zio_vdev_io_redone(zio); 2226 do { 2227 rc = &rm->rm_col[c]; 2228 if (rc->rc_tried) 2229 continue; 2230 zio_nowait(zio_vdev_child_io(zio, NULL, 2231 vd->vdev_child[rc->rc_devidx], 2232 rc->rc_offset, rc->rc_abd, rc->rc_size, 2233 zio->io_type, zio->io_priority, 0, 2234 vdev_raidz_child_done, rc)); 2235 } while (++c < rm->rm_cols); 2236 2237 return; 2238 } 2239 2240 /* 2241 * At this point we've attempted to reconstruct the data given the 2242 * errors we detected, and we've attempted to read all columns. There 2243 * must, therefore, be one or more additional problems -- silent errors 2244 * resulting in invalid data rather than explicit I/O errors resulting 2245 * in absent data. We check if there is enough additional data to 2246 * possibly reconstruct the data and then perform combinatorial 2247 * reconstruction over all possible combinations. If that fails, 2248 * we're cooked. 2249 */ 2250 if (total_errors > rm->rm_firstdatacol) { 2251 zio->io_error = vdev_raidz_worst_error(rm); 2252 2253 } else if (total_errors < rm->rm_firstdatacol && 2254 (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) { 2255 /* 2256 * If we didn't use all the available parity for the 2257 * combinatorial reconstruction, verify that the remaining 2258 * parity is correct. 2259 */ 2260 if (code != (1 << rm->rm_firstdatacol) - 1) 2261 (void) raidz_parity_verify(zio, rm); 2262 } else { 2263 /* 2264 * We're here because either: 2265 * 2266 * total_errors == rm_first_datacol, or 2267 * vdev_raidz_combrec() failed 2268 * 2269 * In either case, there is enough bad data to prevent 2270 * reconstruction. 2271 * 2272 * Start checksum ereports for all children which haven't 2273 * failed, and the IO wasn't speculative. 2274 */ 2275 zio->io_error = SET_ERROR(ECKSUM); 2276 2277 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2278 for (c = 0; c < rm->rm_cols; c++) { 2279 vdev_t *cvd; 2280 rc = &rm->rm_col[c]; 2281 cvd = vd->vdev_child[rc->rc_devidx]; 2282 if (rc->rc_error == 0) { 2283 zio_bad_cksum_t zbc; 2284 zbc.zbc_has_cksum = 0; 2285 zbc.zbc_injected = 2286 rm->rm_ecksuminjected; 2287 2288 mutex_enter(&cvd->vdev_stat_lock); 2289 cvd->vdev_stat.vs_checksum_errors++; 2290 mutex_exit(&cvd->vdev_stat_lock); 2291 2292 zfs_ereport_start_checksum( 2293 zio->io_spa, cvd, 2294 &zio->io_bookmark, zio, 2295 rc->rc_offset, rc->rc_size, 2296 (void *)(uintptr_t)c, &zbc); 2297 } 2298 } 2299 } 2300 } 2301 2302 done: 2303 zio_checksum_verified(zio); 2304 2305 if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 2306 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { 2307 /* 2308 * Use the good data we have in hand to repair damaged children. 2309 */ 2310 for (c = 0; c < rm->rm_cols; c++) { 2311 rc = &rm->rm_col[c]; 2312 cvd = vd->vdev_child[rc->rc_devidx]; 2313 2314 if (rc->rc_error == 0) 2315 continue; 2316 2317 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2318 rc->rc_offset, rc->rc_abd, rc->rc_size, 2319 ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, 2320 ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 2321 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); 2322 } 2323 } 2324 } 2325 2326 static void 2327 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 2328 { 2329 if (faulted > vd->vdev_nparity) 2330 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2331 VDEV_AUX_NO_REPLICAS); 2332 else if (degraded + faulted != 0) 2333 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 2334 else 2335 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 2336 } 2337 2338 /* 2339 * Determine if any portion of the provided block resides on a child vdev 2340 * with a dirty DTL and therefore needs to be resilvered. The function 2341 * assumes that at least one DTL is dirty which implies that full stripe 2342 * width blocks must be resilvered. 2343 */ 2344 static boolean_t 2345 vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) 2346 { 2347 uint64_t dcols = vd->vdev_children; 2348 uint64_t nparity = vd->vdev_nparity; 2349 uint64_t ashift = vd->vdev_top->vdev_ashift; 2350 /* The starting RAIDZ (parent) vdev sector of the block. */ 2351 uint64_t b = offset >> ashift; 2352 /* The zio's size in units of the vdev's minimum sector size. */ 2353 uint64_t s = ((psize - 1) >> ashift) + 1; 2354 /* The first column for this stripe. */ 2355 uint64_t f = b % dcols; 2356 2357 if (s + nparity >= dcols) 2358 return (B_TRUE); 2359 2360 for (uint64_t c = 0; c < s + nparity; c++) { 2361 uint64_t devidx = (f + c) % dcols; 2362 vdev_t *cvd = vd->vdev_child[devidx]; 2363 2364 /* 2365 * dsl_scan_need_resilver() already checked vd with 2366 * vdev_dtl_contains(). So here just check cvd with 2367 * vdev_dtl_empty(), cheaper and a good approximation. 2368 */ 2369 if (!vdev_dtl_empty(cvd, DTL_PARTIAL)) 2370 return (B_TRUE); 2371 } 2372 2373 return (B_FALSE); 2374 } 2375 2376 static void 2377 vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *in, range_seg64_t *res) 2378 { 2379 vdev_t *raidvd = cvd->vdev_parent; 2380 ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); 2381 2382 uint64_t width = raidvd->vdev_children; 2383 uint64_t tgt_col = cvd->vdev_id; 2384 uint64_t ashift = raidvd->vdev_top->vdev_ashift; 2385 2386 /* make sure the offsets are block-aligned */ 2387 ASSERT0(in->rs_start % (1 << ashift)); 2388 ASSERT0(in->rs_end % (1 << ashift)); 2389 uint64_t b_start = in->rs_start >> ashift; 2390 uint64_t b_end = in->rs_end >> ashift; 2391 2392 uint64_t start_row = 0; 2393 if (b_start > tgt_col) /* avoid underflow */ 2394 start_row = ((b_start - tgt_col - 1) / width) + 1; 2395 2396 uint64_t end_row = 0; 2397 if (b_end > tgt_col) 2398 end_row = ((b_end - tgt_col - 1) / width) + 1; 2399 2400 res->rs_start = start_row << ashift; 2401 res->rs_end = end_row << ashift; 2402 2403 ASSERT3U(res->rs_start, <=, in->rs_start); 2404 ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start); 2405 } 2406 2407 vdev_ops_t vdev_raidz_ops = { 2408 .vdev_op_open = vdev_raidz_open, 2409 .vdev_op_close = vdev_raidz_close, 2410 .vdev_op_asize = vdev_raidz_asize, 2411 .vdev_op_io_start = vdev_raidz_io_start, 2412 .vdev_op_io_done = vdev_raidz_io_done, 2413 .vdev_op_state_change = vdev_raidz_state_change, 2414 .vdev_op_need_resilver = vdev_raidz_need_resilver, 2415 .vdev_op_hold = NULL, 2416 .vdev_op_rele = NULL, 2417 .vdev_op_remap = NULL, 2418 .vdev_op_xlate = vdev_raidz_xlate, 2419 .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */ 2420 .vdev_op_leaf = B_FALSE /* not a leaf vdev */ 2421 }; 2422