1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2012, 2019 by Delphix. All rights reserved. 25 * Copyright (c) 2016 Gvozden Nešković. All rights reserved. 26 * Copyright 2019 Joyent, Inc. 27 * Copyright (c) 2014 Integros [integros.com] 28 */ 29 30 #include <sys/zfs_context.h> 31 #include <sys/spa.h> 32 #include <sys/vdev_impl.h> 33 #include <sys/vdev_file.h> 34 #include <sys/zio.h> 35 #include <sys/zio_checksum.h> 36 #include <sys/abd.h> 37 #include <sys/fs/zfs.h> 38 #include <sys/fm/fs/zfs.h> 39 #include <sys/vdev_raidz.h> 40 #include <sys/vdev_raidz_impl.h> 41 42 #ifdef ZFS_DEBUG 43 #include <sys/vdev.h> /* For vdev_xlate() in vdev_raidz_io_verify() */ 44 #endif 45 46 /* 47 * Virtual device vector for RAID-Z. 48 * 49 * This vdev supports single, double, and triple parity. For single parity, 50 * we use a simple XOR of all the data columns. For double or triple parity, 51 * we use a special case of Reed-Solomon coding. This extends the 52 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by 53 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for 54 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the 55 * former is also based. The latter is designed to provide higher performance 56 * for writes. 57 * 58 * Note that the Plank paper claimed to support arbitrary N+M, but was then 59 * amended six years later identifying a critical flaw that invalidates its 60 * claims. Nevertheless, the technique can be adapted to work for up to 61 * triple parity. For additional parity, the amendment "Note: Correction to 62 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding 63 * is viable, but the additional complexity means that write performance will 64 * suffer. 65 * 66 * All of the methods above operate on a Galois field, defined over the 67 * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements 68 * can be expressed with a single byte. Briefly, the operations on the 69 * field are defined as follows: 70 * 71 * o addition (+) is represented by a bitwise XOR 72 * o subtraction (-) is therefore identical to addition: A + B = A - B 73 * o multiplication of A by 2 is defined by the following bitwise expression: 74 * 75 * (A * 2)_7 = A_6 76 * (A * 2)_6 = A_5 77 * (A * 2)_5 = A_4 78 * (A * 2)_4 = A_3 + A_7 79 * (A * 2)_3 = A_2 + A_7 80 * (A * 2)_2 = A_1 + A_7 81 * (A * 2)_1 = A_0 82 * (A * 2)_0 = A_7 83 * 84 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). 
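 *
 * For example (purely illustrative, not used by the code): multiplying
 * A = 0x80 by 2 shifts the lone high bit out, leaving 0x00, and the
 * conditional XOR with 0x1d yields 0x1d; multiplying A = 0x01 by 2
 * simply yields 0x02.
 *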
85 * As an aside, this multiplication is derived from the error correcting 86 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. 87 * 88 * Observe that any number in the field (except for 0) can be expressed as a 89 * power of 2 -- a generator for the field. We store a table of the powers of 90 * 2 and logs base 2 for quick lookups, and exploit the fact that A * B can 91 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather 92 * than field addition). The inverse of a field element A (A^-1) is therefore 93 * A ^ (255 - 1) = A^254. 94 * 95 * The up-to-three parity columns, P, Q, R over several data columns, 96 * D_0, ... D_n-1, can be expressed by field operations: 97 * 98 * P = D_0 + D_1 + ... + D_n-2 + D_n-1 99 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 100 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 101 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 102 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 103 * 104 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial 105 * XOR operation, and 2 and 4 can be computed quickly and generate linearly- 106 * independent coefficients. (There are no additional coefficients that have 107 * this property which is why the uncorrected Plank method breaks down.) 108 * 109 * See the reconstruction code below for how P, Q and R can be used individually 110 * or in concert to recover missing data columns. 111 */ 112 113 #define VDEV_RAIDZ_P 0 114 #define VDEV_RAIDZ_Q 1 115 #define VDEV_RAIDZ_R 2 116 117 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) 118 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) 119 120 /* 121 * We provide a mechanism to perform the field multiplication operation on a 122 * 64-bit value all at once rather than a byte at a time. This works by 123 * creating a mask from the top bit in each byte and using that to 124 * conditionally apply the XOR of 0x1d.
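 *
 * To illustrate on a single byte lane (a sketch of the macro below, not
 * extra code): a lane whose top bit is set contributes 0x80 to the mask,
 * and ((mask) << 1) - ((mask) >> 7) turns that into 0xff for the lane
 * (0x100 - 0x01), while lanes with a clear top bit become 0x00. The
 * 0xfefe... constant keeps the left shift from bleeding into the next
 * byte, and ANDing the resulting 0xff/0x00 mask with 0x1d1d... applies
 * the reduction only to the lanes that overflowed.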
125 */ 126 #define VDEV_RAIDZ_64MUL_2(x, mask) \ 127 { \ 128 (mask) = (x) & 0x8080808080808080ULL; \ 129 (mask) = ((mask) << 1) - ((mask) >> 7); \ 130 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ 131 ((mask) & 0x1d1d1d1d1d1d1d1dULL); \ 132 } 133 134 #define VDEV_RAIDZ_64MUL_4(x, mask) \ 135 { \ 136 VDEV_RAIDZ_64MUL_2((x), mask); \ 137 VDEV_RAIDZ_64MUL_2((x), mask); \ 138 } 139 140 #define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE) 141 142 void 143 vdev_raidz_map_free(raidz_map_t *rm) 144 { 145 int c; 146 147 for (c = 0; c < rm->rm_firstdatacol; c++) { 148 abd_free(rm->rm_col[c].rc_abd); 149 150 if (rm->rm_col[c].rc_gdata != NULL) 151 abd_free(rm->rm_col[c].rc_gdata); 152 } 153 154 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) 155 abd_put(rm->rm_col[c].rc_abd); 156 157 if (rm->rm_abd_copy != NULL) 158 abd_free(rm->rm_abd_copy); 159 160 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); 161 } 162 163 static void 164 vdev_raidz_map_free_vsd(zio_t *zio) 165 { 166 raidz_map_t *rm = zio->io_vsd; 167 168 ASSERT0(rm->rm_freed); 169 rm->rm_freed = 1; 170 171 if (rm->rm_reports == 0) 172 vdev_raidz_map_free(rm); 173 } 174 175 /*ARGSUSED*/ 176 static void 177 vdev_raidz_cksum_free(void *arg, size_t ignored) 178 { 179 raidz_map_t *rm = arg; 180 181 ASSERT3U(rm->rm_reports, >, 0); 182 183 if (--rm->rm_reports == 0 && rm->rm_freed != 0) 184 vdev_raidz_map_free(rm); 185 } 186 187 static void 188 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data) 189 { 190 raidz_map_t *rm = zcr->zcr_cbdata; 191 const size_t c = zcr->zcr_cbinfo; 192 size_t x, offset; 193 194 const abd_t *good = NULL; 195 const abd_t *bad = rm->rm_col[c].rc_abd; 196 197 if (good_data == NULL) { 198 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); 199 return; 200 } 201 202 if (c < rm->rm_firstdatacol) { 203 /* 204 * The first time through, calculate the parity blocks for 205 * the good data (this relies on the fact that the good 206 * data never changes for a given logical ZIO) 207 */ 208 if (rm->rm_col[0].rc_gdata == NULL) { 209 abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY]; 210 211 /* 212 * Set up the rm_col[]s to generate the parity for 213 * good_data, first saving the parity bufs and 214 * replacing them with buffers to hold the result. 215 */ 216 for (x = 0; x < rm->rm_firstdatacol; x++) { 217 bad_parity[x] = rm->rm_col[x].rc_abd; 218 rm->rm_col[x].rc_abd = 219 rm->rm_col[x].rc_gdata = 220 abd_alloc_sametype(rm->rm_col[x].rc_abd, 221 rm->rm_col[x].rc_size); 222 } 223 224 /* fill in the data columns from good_data */ 225 offset = 0; 226 for (; x < rm->rm_cols; x++) { 227 abd_put(rm->rm_col[x].rc_abd); 228 229 rm->rm_col[x].rc_abd = 230 abd_get_offset_size((abd_t *)good_data, 231 offset, rm->rm_col[x].rc_size); 232 offset += rm->rm_col[x].rc_size; 233 } 234 235 /* 236 * Construct the parity from the good data. 
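 * At this point the parity columns' rc_abd point at the freshly
 * allocated rc_gdata buffers and the data columns point into
 * good_data, so the call below computes the parity implied by the
 * good data and leaves it in rc_gdata for later comparison.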
237 */ 238 vdev_raidz_generate_parity(rm); 239 240 /* restore everything back to its original state */ 241 for (x = 0; x < rm->rm_firstdatacol; x++) 242 rm->rm_col[x].rc_abd = bad_parity[x]; 243 244 offset = 0; 245 for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { 246 abd_put(rm->rm_col[x].rc_abd); 247 rm->rm_col[x].rc_abd = abd_get_offset_size( 248 rm->rm_abd_copy, offset, 249 rm->rm_col[x].rc_size); 250 offset += rm->rm_col[x].rc_size; 251 } 252 } 253 254 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); 255 good = abd_get_offset_size(rm->rm_col[c].rc_gdata, 0, 256 rm->rm_col[c].rc_size); 257 } else { 258 /* adjust good_data to point at the start of our column */ 259 offset = 0; 260 for (x = rm->rm_firstdatacol; x < c; x++) 261 offset += rm->rm_col[x].rc_size; 262 263 good = abd_get_offset_size((abd_t *)good_data, offset, 264 rm->rm_col[c].rc_size); 265 } 266 267 /* we drop the ereport if it ends up that the data was good */ 268 zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); 269 abd_put((abd_t *)good); 270 } 271 272 /* 273 * Invoked indirectly by zfs_ereport_start_checksum(), called 274 * below when our read operation fails completely. The main point 275 * is to keep a copy of everything we read from disk, so that at 276 * vdev_raidz_cksum_finish() time we can compare it with the good data. 277 */ 278 static void 279 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) 280 { 281 size_t c = (size_t)(uintptr_t)arg; 282 size_t offset; 283 284 raidz_map_t *rm = zio->io_vsd; 285 size_t size; 286 287 /* set up the report and bump the refcount */ 288 zcr->zcr_cbdata = rm; 289 zcr->zcr_cbinfo = c; 290 zcr->zcr_finish = vdev_raidz_cksum_finish; 291 zcr->zcr_free = vdev_raidz_cksum_free; 292 293 rm->rm_reports++; 294 ASSERT3U(rm->rm_reports, >, 0); 295 296 if (rm->rm_abd_copy != NULL) 297 return; 298 299 /* 300 * It's the first time we're called for this raidz_map_t, so we need 301 * to copy the data aside; there's no guarantee that our zio's buffer 302 * won't be re-used for something else. 303 * 304 * Our parity data is already in separate buffers, so there's no need 305 * to copy them. 306 */ 307 308 size = 0; 309 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) 310 size += rm->rm_col[c].rc_size; 311 312 rm->rm_abd_copy = abd_alloc_for_io(size, B_FALSE); 313 314 for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 315 raidz_col_t *col = &rm->rm_col[c]; 316 abd_t *tmp = abd_get_offset_size(rm->rm_abd_copy, offset, 317 col->rc_size); 318 319 ASSERT3S(tmp->abd_size, >=, col->rc_size); 320 ASSERT3S(col->rc_abd->abd_size, >=, col->rc_size); 321 abd_copy_off(tmp, col->rc_abd, 0, 0, col->rc_size); 322 abd_put(col->rc_abd); 323 col->rc_abd = tmp; 324 325 offset += col->rc_size; 326 } 327 ASSERT3U(offset, ==, size); 328 } 329 330 static const zio_vsd_ops_t vdev_raidz_vsd_ops = { 331 vdev_raidz_map_free_vsd, 332 vdev_raidz_cksum_report 333 }; 334 335 /* 336 * Divides the IO evenly across all child vdevs; usually, dcols is 337 * the number of children in the target vdev. 338 */ 339 raidz_map_t * 340 vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, 341 uint64_t nparity) 342 { 343 raidz_map_t *rm; 344 /* The starting RAIDZ (parent) vdev sector of the block. */ 345 uint64_t b = zio->io_offset >> ashift; 346 /* The zio's size in units of the vdev's minimum sector size. */ 347 uint64_t s = zio->io_size >> ashift; 348 /* The first column for this stripe. */ 349 uint64_t f = b % dcols; 350 /* The starting byte offset on each child vdev. 
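 * (For example, with purely illustrative numbers: a block whose first
 * parent sector is b = 100 on a 4-wide group starts o = (100 / 4) <<
 * ashift bytes into the first child vdev it touches.)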
*/ 351 uint64_t o = (b / dcols) << ashift; 352 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; 353 uint64_t off = 0; 354 355 /* 356 * "Quotient": The number of data sectors for this stripe on all but 357 * the "big column" child vdevs that also contain "remainder" data. 358 */ 359 q = s / (dcols - nparity); 360 361 /* 362 * "Remainder": The number of partial stripe data sectors in this I/O. 363 * This will add a sector to some, but not all, child vdevs. 364 */ 365 r = s - q * (dcols - nparity); 366 367 /* The number of "big columns" - those which contain remainder data. */ 368 bc = (r == 0 ? 0 : r + nparity); 369 370 /* 371 * The total number of data and parity sectors associated with 372 * this I/O. 373 */ 374 tot = s + nparity * (q + (r == 0 ? 0 : 1)); 375 376 /* acols: The columns that will be accessed. */ 377 /* scols: The columns that will be accessed or skipped. */ 378 if (q == 0) { 379 /* Our I/O request doesn't span all child vdevs. */ 380 acols = bc; 381 scols = MIN(dcols, roundup(bc, nparity + 1)); 382 } else { 383 acols = dcols; 384 scols = dcols; 385 } 386 387 ASSERT3U(acols, <=, scols); 388 389 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); 390 391 rm->rm_cols = acols; 392 rm->rm_scols = scols; 393 rm->rm_bigcols = bc; 394 rm->rm_skipstart = bc; 395 rm->rm_missingdata = 0; 396 rm->rm_missingparity = 0; 397 rm->rm_firstdatacol = nparity; 398 rm->rm_abd_copy = NULL; 399 rm->rm_reports = 0; 400 rm->rm_freed = 0; 401 rm->rm_ecksuminjected = 0; 402 403 asize = 0; 404 405 for (c = 0; c < scols; c++) { 406 col = f + c; 407 coff = o; 408 if (col >= dcols) { 409 col -= dcols; 410 coff += 1ULL << ashift; 411 } 412 rm->rm_col[c].rc_devidx = col; 413 rm->rm_col[c].rc_offset = coff; 414 rm->rm_col[c].rc_abd = NULL; 415 rm->rm_col[c].rc_gdata = NULL; 416 rm->rm_col[c].rc_error = 0; 417 rm->rm_col[c].rc_tried = 0; 418 rm->rm_col[c].rc_skipped = 0; 419 420 if (c >= acols) 421 rm->rm_col[c].rc_size = 0; 422 else if (c < bc) 423 rm->rm_col[c].rc_size = (q + 1) << ashift; 424 else 425 rm->rm_col[c].rc_size = q << ashift; 426 427 asize += rm->rm_col[c].rc_size; 428 } 429 430 ASSERT3U(asize, ==, tot << ashift); 431 rm->rm_asize = roundup(asize, (nparity + 1) << ashift); 432 rm->rm_nskip = roundup(tot, nparity + 1) - tot; 433 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << ashift); 434 ASSERT3U(rm->rm_nskip, <=, nparity); 435 436 for (c = 0; c < rm->rm_firstdatacol; c++) 437 rm->rm_col[c].rc_abd = 438 abd_alloc_linear(rm->rm_col[c].rc_size, B_FALSE); 439 440 rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, 0, 441 rm->rm_col[c].rc_size); 442 off = rm->rm_col[c].rc_size; 443 444 for (c = c + 1; c < acols; c++) { 445 rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, off, 446 rm->rm_col[c].rc_size); 447 off += rm->rm_col[c].rc_size; 448 } 449 450 /* 451 * If all data stored spans all columns, there's a danger that parity 452 * will always be on the same device and, since parity isn't read 453 * during normal operation, that device's I/O bandwidth won't be 454 * used effectively. We therefore switch the parity every 1MB. 455 * 456 * ... at least that was, ostensibly, the theory. As a practical 457 * matter unless we juggle the parity between all devices evenly, we 458 * won't see any benefit. Further, occasional writes that aren't a 459 * multiple of the LCM of the number of children and the minimum 460 * stripe width are sufficient to avoid pessimal behavior. 
461 * Unfortunately, this decision created an implicit on-disk format 462 * requirement that we need to support for all eternity, but only 463 * for single-parity RAID-Z. 464 * 465 * If we intend to skip a sector in the zeroth column for padding 466 * we must make sure to note this swap. We will never intend to 467 * skip the first column since at least one data and one parity 468 * column must appear in each row. 469 */ 470 ASSERT(rm->rm_cols >= 2); 471 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 472 473 if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { 474 devidx = rm->rm_col[0].rc_devidx; 475 o = rm->rm_col[0].rc_offset; 476 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; 477 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 478 rm->rm_col[1].rc_devidx = devidx; 479 rm->rm_col[1].rc_offset = o; 480 481 if (rm->rm_skipstart == 0) 482 rm->rm_skipstart = 1; 483 } 484 485 /* init RAIDZ parity ops */ 486 rm->rm_ops = vdev_raidz_math_get_ops(); 487 488 return (rm); 489 } 490 491 struct pqr_struct { 492 uint64_t *p; 493 uint64_t *q; 494 uint64_t *r; 495 }; 496 497 static int 498 vdev_raidz_p_func(void *buf, size_t size, void *private) 499 { 500 struct pqr_struct *pqr = private; 501 const uint64_t *src = buf; 502 int i, cnt = size / sizeof (src[0]); 503 504 ASSERT(pqr->p && !pqr->q && !pqr->r); 505 506 for (i = 0; i < cnt; i++, src++, pqr->p++) 507 *pqr->p ^= *src; 508 509 return (0); 510 } 511 512 static int 513 vdev_raidz_pq_func(void *buf, size_t size, void *private) 514 { 515 struct pqr_struct *pqr = private; 516 const uint64_t *src = buf; 517 uint64_t mask; 518 int i, cnt = size / sizeof (src[0]); 519 520 ASSERT(pqr->p && pqr->q && !pqr->r); 521 522 for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { 523 *pqr->p ^= *src; 524 VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 525 *pqr->q ^= *src; 526 } 527 528 return (0); 529 } 530 531 static int 532 vdev_raidz_pqr_func(void *buf, size_t size, void *private) 533 { 534 struct pqr_struct *pqr = private; 535 const uint64_t *src = buf; 536 uint64_t mask; 537 int i, cnt = size / sizeof (src[0]); 538 539 ASSERT(pqr->p && pqr->q && pqr->r); 540 541 for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { 542 *pqr->p ^= *src; 543 VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 544 *pqr->q ^= *src; 545 VDEV_RAIDZ_64MUL_4(*pqr->r, mask); 546 *pqr->r ^= *src; 547 } 548 549 return (0); 550 } 551 552 static void 553 vdev_raidz_generate_parity_p(raidz_map_t *rm) 554 { 555 uint64_t *p; 556 int c; 557 abd_t *src; 558 559 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 560 src = rm->rm_col[c].rc_abd; 561 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); 562 563 if (c == rm->rm_firstdatacol) { 564 abd_copy_to_buf_off(p, src, 0, rm->rm_col[c].rc_size); 565 } else { 566 struct pqr_struct pqr = { p, NULL, NULL }; 567 (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, 568 vdev_raidz_p_func, &pqr); 569 } 570 } 571 } 572 573 static void 574 vdev_raidz_generate_parity_pq(raidz_map_t *rm) 575 { 576 uint64_t *p, *q, pcnt, ccnt, mask, i; 577 int c; 578 abd_t *src; 579 580 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 581 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 582 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 583 584 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 585 src = rm->rm_col[c].rc_abd; 586 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); 587 q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); 588 589 ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); 590 591 if (c == rm->rm_firstdatacol) { 592 ASSERT(ccnt == pcnt || ccnt == 0); 593 
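			/*
			 * The first data column seeds the parity: P and Q
			 * both start as a copy of D_0 (zero-padded if this
			 * column is shorter than the parity columns), and
			 * the remaining columns are folded in below via
			 * vdev_raidz_pq_func().
			 */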
594 abd_copy_to_buf_off(p, src, 0, rm->rm_col[c].rc_size); 595 (void) memcpy(q, p, rm->rm_col[c].rc_size); 596 for (i = ccnt; i < pcnt; i++) { 597 p[i] = 0; 598 q[i] = 0; 599 } 600 } else { 601 struct pqr_struct pqr = { p, q, NULL }; 602 603 ASSERT(ccnt <= pcnt); 604 605 (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, 606 vdev_raidz_pq_func, &pqr); 607 608 /* 609 * Treat short columns as though they are full of 0s. 610 * Note that there's therefore nothing needed for P. 611 */ 612 for (i = ccnt; i < pcnt; i++) { 613 VDEV_RAIDZ_64MUL_2(q[i], mask); 614 } 615 } 616 } 617 } 618 619 static void 620 vdev_raidz_generate_parity_pqr(raidz_map_t *rm) 621 { 622 uint64_t *p, *q, *r, pcnt, ccnt, mask, i; 623 int c; 624 abd_t *src; 625 626 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 627 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 628 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 629 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 630 rm->rm_col[VDEV_RAIDZ_R].rc_size); 631 632 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 633 src = rm->rm_col[c].rc_abd; 634 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); 635 q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); 636 r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd); 637 638 ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); 639 640 if (c == rm->rm_firstdatacol) { 641 ASSERT3S(src->abd_size, >=, rm->rm_col[c].rc_size); 642 ASSERT(ccnt == pcnt || ccnt == 0); 643 abd_copy_to_buf_off(p, src, 0, rm->rm_col[c].rc_size); 644 (void) memcpy(q, p, rm->rm_col[c].rc_size); 645 (void) memcpy(r, p, rm->rm_col[c].rc_size); 646 647 for (i = ccnt; i < pcnt; i++) { 648 p[i] = 0; 649 q[i] = 0; 650 r[i] = 0; 651 } 652 } else { 653 struct pqr_struct pqr = { p, q, r }; 654 655 ASSERT(ccnt <= pcnt); 656 (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, 657 vdev_raidz_pqr_func, &pqr); 658 659 /* 660 * Treat short columns as though they are full of 0s. 661 * Note that there's therefore nothing needed for P. 662 */ 663 for (i = ccnt; i < pcnt; i++) { 664 VDEV_RAIDZ_64MUL_2(q[i], mask); 665 VDEV_RAIDZ_64MUL_4(r[i], mask); 666 } 667 } 668 } 669 } 670 671 /* 672 * Generate RAID parity in the first virtual columns according to the number of 673 * parity columns available. 
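 * (rm_firstdatacol equals the number of parity devices, so the switch
 * below falls back to the P, PQ, or PQR generator whenever the
 * optimized math implementation declines the request.)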
674 */ 675 void 676 vdev_raidz_generate_parity(raidz_map_t *rm) 677 { 678 /* Generate using the new math implementation */ 679 if (vdev_raidz_math_generate(rm) != RAIDZ_ORIGINAL_IMPL) 680 return; 681 682 switch (rm->rm_firstdatacol) { 683 case 1: 684 vdev_raidz_generate_parity_p(rm); 685 break; 686 case 2: 687 vdev_raidz_generate_parity_pq(rm); 688 break; 689 case 3: 690 vdev_raidz_generate_parity_pqr(rm); 691 break; 692 default: 693 cmn_err(CE_PANIC, "invalid RAID-Z configuration"); 694 } 695 } 696 697 /* ARGSUSED */ 698 static int 699 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) 700 { 701 uint64_t *dst = dbuf; 702 uint64_t *src = sbuf; 703 int cnt = size / sizeof (src[0]); 704 705 for (int i = 0; i < cnt; i++) { 706 dst[i] ^= src[i]; 707 } 708 709 return (0); 710 } 711 712 /* ARGSUSED */ 713 static int 714 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size, 715 void *private) 716 { 717 uint64_t *dst = dbuf; 718 uint64_t *src = sbuf; 719 uint64_t mask; 720 int cnt = size / sizeof (dst[0]); 721 722 for (int i = 0; i < cnt; i++, dst++, src++) { 723 VDEV_RAIDZ_64MUL_2(*dst, mask); 724 *dst ^= *src; 725 } 726 727 return (0); 728 } 729 730 /* ARGSUSED */ 731 static int 732 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private) 733 { 734 uint64_t *dst = buf; 735 uint64_t mask; 736 int cnt = size / sizeof (dst[0]); 737 738 for (int i = 0; i < cnt; i++, dst++) { 739 /* same operation as vdev_raidz_reconst_q_pre_func() on dst */ 740 VDEV_RAIDZ_64MUL_2(*dst, mask); 741 } 742 743 return (0); 744 } 745 746 struct reconst_q_struct { 747 uint64_t *q; 748 int exp; 749 }; 750 751 static int 752 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private) 753 { 754 struct reconst_q_struct *rq = private; 755 uint64_t *dst = buf; 756 int cnt = size / sizeof (dst[0]); 757 758 for (int i = 0; i < cnt; i++, dst++, rq->q++) { 759 760 *dst ^= *rq->q; 761 int j; 762 uint8_t *b; 763 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 764 *b = vdev_raidz_exp2(*b, rq->exp); 765 } 766 } 767 768 return (0); 769 } 770 771 struct reconst_pq_struct { 772 uint8_t *p; 773 uint8_t *q; 774 uint8_t *pxy; 775 uint8_t *qxy; 776 int aexp; 777 int bexp; 778 }; 779 780 static int 781 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private) 782 { 783 struct reconst_pq_struct *rpq = private; 784 uint8_t *xd = xbuf; 785 uint8_t *yd = ybuf; 786 787 for (int i = 0; i < size; 788 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) { 789 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 790 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 791 *yd = *rpq->p ^ *rpq->pxy ^ *xd; 792 } 793 794 return (0); 795 } 796 797 static int 798 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) 799 { 800 struct reconst_pq_struct *rpq = private; 801 uint8_t *xd = xbuf; 802 803 for (int i = 0; i < size; 804 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) { 805 /* same operation as vdev_raidz_reconst_pq_func() on xd */ 806 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 807 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 808 } 809 810 return (0); 811 } 812 813 static int 814 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) 815 { 816 int x = tgts[0]; 817 int c; 818 abd_t *dst, *src; 819 820 ASSERT(ntgts == 1); 821 ASSERT(x >= rm->rm_firstdatacol); 822 ASSERT(x < rm->rm_cols); 823 824 ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size); 825 ASSERT(rm->rm_col[x].rc_size > 0); 826 827 
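	/*
	 * Since P = D_0 + D_1 + ... + D_n-1 (all XOR), the missing column
	 * is simply P XORed with every surviving data column: start from a
	 * copy of P and fold the remaining columns in below.
	 */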
src = rm->rm_col[VDEV_RAIDZ_P].rc_abd; 828 dst = rm->rm_col[x].rc_abd; 829 830 ASSERT3S(dst->abd_size, >=, rm->rm_col[x].rc_size); 831 ASSERT3S(src->abd_size, >=, rm->rm_col[x].rc_size); 832 abd_copy_off(dst, src, 0, 0, rm->rm_col[x].rc_size); 833 834 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 835 uint64_t size = MIN(rm->rm_col[x].rc_size, 836 rm->rm_col[c].rc_size); 837 838 src = rm->rm_col[c].rc_abd; 839 dst = rm->rm_col[x].rc_abd; 840 841 if (c == x) 842 continue; 843 844 (void) abd_iterate_func2(dst, src, 0, 0, size, 845 vdev_raidz_reconst_p_func, NULL); 846 } 847 848 return (1 << VDEV_RAIDZ_P); 849 } 850 851 static int 852 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) 853 { 854 int x = tgts[0]; 855 int c, exp; 856 abd_t *dst, *src; 857 858 ASSERT(ntgts == 1); 859 860 ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size); 861 862 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 863 uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size, 864 rm->rm_col[c].rc_size); 865 866 src = rm->rm_col[c].rc_abd; 867 dst = rm->rm_col[x].rc_abd; 868 869 if (c == rm->rm_firstdatacol) { 870 if (dst != src) { 871 ASSERT3S(dst->abd_size, >=, size); 872 ASSERT3S(src->abd_size, >=, size); 873 abd_copy_off(dst, src, 0, 0, size); 874 } 875 if (rm->rm_col[x].rc_size > size) 876 abd_zero_off(dst, size, 877 rm->rm_col[x].rc_size - size); 878 } else { 879 ASSERT3U(size, <=, rm->rm_col[x].rc_size); 880 if (src != dst) 881 (void) abd_iterate_func2(dst, src, 0, 0, size, 882 vdev_raidz_reconst_q_pre_func, NULL); 883 (void) abd_iterate_func(dst, 884 size, rm->rm_col[x].rc_size - size, 885 vdev_raidz_reconst_q_pre_tail_func, NULL); 886 } 887 } 888 889 src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; 890 dst = rm->rm_col[x].rc_abd; 891 exp = 255 - (rm->rm_cols - 1 - x); 892 893 struct reconst_q_struct rq = { abd_to_buf(src), exp }; 894 (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size, 895 vdev_raidz_reconst_q_post_func, &rq); 896 897 return (1 << VDEV_RAIDZ_Q); 898 } 899 900 static int 901 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) 902 { 903 uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; 904 abd_t *pdata, *qdata; 905 uint64_t xsize, ysize; 906 int x = tgts[0]; 907 int y = tgts[1]; 908 abd_t *xd, *yd; 909 910 ASSERT(ntgts == 2); 911 ASSERT(x < y); 912 ASSERT(x >= rm->rm_firstdatacol); 913 ASSERT(y < rm->rm_cols); 914 915 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); 916 917 /* 918 * Move the parity data aside -- we're going to compute parity as 919 * though columns x and y were full of zeros -- Pxy and Qxy. We want to 920 * reuse the parity generation mechanism without trashing the actual 921 * parity so we make those columns appear to be full of zeros by 922 * setting their lengths to zero. 
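 * (The true sizes are saved in xsize and ysize below and restored as
 * soon as vdev_raidz_generate_parity_pq() has produced Pxy and Qxy.)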
923 */ 924 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd; 925 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; 926 xsize = rm->rm_col[x].rc_size; 927 ysize = rm->rm_col[y].rc_size; 928 929 rm->rm_col[VDEV_RAIDZ_P].rc_abd = 930 abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE); 931 rm->rm_col[VDEV_RAIDZ_Q].rc_abd = 932 abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); 933 rm->rm_col[x].rc_size = 0; 934 rm->rm_col[y].rc_size = 0; 935 936 vdev_raidz_generate_parity_pq(rm); 937 938 rm->rm_col[x].rc_size = xsize; 939 rm->rm_col[y].rc_size = ysize; 940 941 p = abd_to_buf(pdata); 942 q = abd_to_buf(qdata); 943 pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); 944 qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); 945 xd = rm->rm_col[x].rc_abd; 946 yd = rm->rm_col[y].rc_abd; 947 948 /* 949 * We now have: 950 * Pxy = P + D_x + D_y 951 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 952 * 953 * We can then solve for D_x: 954 * D_x = A * (P + Pxy) + B * (Q + Qxy) 955 * where 956 * A = 2^(x - y) * (2^(x - y) + 1)^-1 957 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 958 * 959 * With D_x in hand, we can easily solve for D_y: 960 * D_y = P + Pxy + D_x 961 */ 962 963 a = vdev_raidz_pow2[255 + x - y]; 964 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; 965 tmp = 255 - vdev_raidz_log2[a ^ 1]; 966 967 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 968 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 969 970 ASSERT3U(xsize, >=, ysize); 971 struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp }; 972 (void) abd_iterate_func2(xd, yd, 0, 0, ysize, 973 vdev_raidz_reconst_pq_func, &rpq); 974 (void) abd_iterate_func(xd, ysize, xsize - ysize, 975 vdev_raidz_reconst_pq_tail_func, &rpq); 976 977 abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd); 978 abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); 979 980 /* 981 * Restore the saved parity data. 982 */ 983 rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata; 984 rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata; 985 986 return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); 987 } 988 989 /* BEGIN CSTYLED */ 990 /* 991 * In the general case of reconstruction, we must solve the system of linear 992 * equations defined by the coeffecients used to generate parity as well as 993 * the contents of the data and parity disks. This can be expressed with 994 * vectors for the original data (D) and the actual data (d) and parity (p) 995 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 996 * 997 * __ __ __ __ 998 * | | __ __ | p_0 | 999 * | V | | D_0 | | p_m-1 | 1000 * | | x | : | = | d_0 | 1001 * | I | | D_n-1 | | : | 1002 * | | ~~ ~~ | d_n-1 | 1003 * ~~ ~~ ~~ ~~ 1004 * 1005 * I is simply a square identity matrix of size n, and V is a vandermonde 1006 * matrix defined by the coeffecients we chose for the various parity columns 1007 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 1008 * computation as well as linear separability. 1009 * 1010 * __ __ __ __ 1011 * | 1 .. 1 1 1 | | p_0 | 1012 * | 2^n-1 .. 4 2 1 | __ __ | : | 1013 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 1014 * | 1 .. 0 0 0 | | D_1 | | d_0 | 1015 * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 1016 * | : : : : | | : | | d_2 | 1017 * | 0 .. 1 0 0 | | D_n-1 | | : | 1018 * | 0 .. 0 1 0 | ~~ ~~ | : | 1019 * | 0 .. 0 0 1 | | d_n-1 | 1020 * ~~ ~~ ~~ ~~ 1021 * 1022 * Note that I, V, d, and p are known. To compute D, we must invert the 1023 * matrix and use the known data and parity values to reconstruct the unknown 1024 * data values. 
We begin by removing the rows in V|I and d|p that correspond 1025 * to failed or missing columns; we then make V|I square (n x n) and d|p 1026 * sized n by removing rows corresponding to unused parity from the bottom up 1027 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 1028 * using Gauss-Jordan elimination. In the example below we use m=3 parity 1029 * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 1030 * __ __ 1031 * | 1 1 1 1 1 1 1 1 | 1032 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 1033 * | 19 205 116 29 64 16 4 1 | / / 1034 * | 1 0 0 0 0 0 0 0 | / / 1035 * | 0 1 0 0 0 0 0 0 | <--' / 1036 * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 1037 * | 0 0 0 1 0 0 0 0 | 1038 * | 0 0 0 0 1 0 0 0 | 1039 * | 0 0 0 0 0 1 0 0 | 1040 * | 0 0 0 0 0 0 1 0 | 1041 * | 0 0 0 0 0 0 0 1 | 1042 * ~~ ~~ 1043 * __ __ 1044 * | 1 1 1 1 1 1 1 1 | 1045 * | 128 64 32 16 8 4 2 1 | 1046 * | 19 205 116 29 64 16 4 1 | 1047 * | 1 0 0 0 0 0 0 0 | 1048 * | 0 1 0 0 0 0 0 0 | 1049 * (V|I)' = | 0 0 1 0 0 0 0 0 | 1050 * | 0 0 0 1 0 0 0 0 | 1051 * | 0 0 0 0 1 0 0 0 | 1052 * | 0 0 0 0 0 1 0 0 | 1053 * | 0 0 0 0 0 0 1 0 | 1054 * | 0 0 0 0 0 0 0 1 | 1055 * ~~ ~~ 1056 * 1057 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 1058 * have carefully chosen the seed values 1, 2, and 4 to ensure that this 1059 * matrix is not singular. 1060 * __ __ 1061 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1062 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1063 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1064 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1065 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1066 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1067 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1068 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1069 * ~~ ~~ 1070 * __ __ 1071 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1072 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1073 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1074 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1075 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1076 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1077 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1078 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1079 * ~~ ~~ 1080 * __ __ 1081 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1082 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1083 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 1084 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1085 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1086 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1087 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1088 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1089 * ~~ ~~ 1090 * __ __ 1091 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1092 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1093 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 1094 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1095 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1096 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1097 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1098 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1099 * ~~ ~~ 1100 * __ __ 1101 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1102 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1103 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1104 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1105 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1106 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1107 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1108 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1109 * ~~ ~~ 1110 * __ __ 1111 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1112 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 1113 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1114 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1115 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1116 * | 0 0 0 0 0 
1 0 0 0 0 0 0 0 1 0 0 | 1117 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1118 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1119 * ~~ ~~ 1120 * __ __ 1121 * | 0 0 1 0 0 0 0 0 | 1122 * | 167 100 5 41 159 169 217 208 | 1123 * | 166 100 4 40 158 168 216 209 | 1124 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 1125 * | 0 0 0 0 1 0 0 0 | 1126 * | 0 0 0 0 0 1 0 0 | 1127 * | 0 0 0 0 0 0 1 0 | 1128 * | 0 0 0 0 0 0 0 1 | 1129 * ~~ ~~ 1130 * 1131 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 1132 * of the missing data. 1133 * 1134 * As is apparent from the example above, the only non-trivial rows in the 1135 * inverse matrix correspond to the data disks that we're trying to 1136 * reconstruct. Indeed, those are the only rows we need as the others would 1137 * only be useful for reconstructing data known or assumed to be valid. For 1138 * that reason, we only build the coefficients in the rows that correspond to 1139 * targeted columns. 1140 */ 1141 /* END CSTYLED */ 1142 1143 static void 1144 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, 1145 uint8_t **rows) 1146 { 1147 int i, j; 1148 int pow; 1149 1150 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); 1151 1152 /* 1153 * Fill in the missing rows of interest. 1154 */ 1155 for (i = 0; i < nmap; i++) { 1156 ASSERT3S(0, <=, map[i]); 1157 ASSERT3S(map[i], <=, 2); 1158 1159 pow = map[i] * n; 1160 if (pow > 255) 1161 pow -= 255; 1162 ASSERT(pow <= 255); 1163 1164 for (j = 0; j < n; j++) { 1165 pow -= map[i]; 1166 if (pow < 0) 1167 pow += 255; 1168 rows[i][j] = vdev_raidz_pow2[pow]; 1169 } 1170 } 1171 } 1172 1173 static void 1174 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, 1175 uint8_t **rows, uint8_t **invrows, const uint8_t *used) 1176 { 1177 int i, j, ii, jj; 1178 uint8_t log; 1179 1180 /* 1181 * Assert that the first nmissing entries from the array of used 1182 * columns correspond to parity columns and that subsequent entries 1183 * correspond to data columns. 1184 */ 1185 for (i = 0; i < nmissing; i++) { 1186 ASSERT3S(used[i], <, rm->rm_firstdatacol); 1187 } 1188 for (; i < n; i++) { 1189 ASSERT3S(used[i], >=, rm->rm_firstdatacol); 1190 } 1191 1192 /* 1193 * First initialize the storage where we'll compute the inverse rows. 1194 */ 1195 for (i = 0; i < nmissing; i++) { 1196 for (j = 0; j < n; j++) { 1197 invrows[i][j] = (i == j) ? 1 : 0; 1198 } 1199 } 1200 1201 /* 1202 * Subtract all trivial rows from the rows of consequence. 1203 */ 1204 for (i = 0; i < nmissing; i++) { 1205 for (j = nmissing; j < n; j++) { 1206 ASSERT3U(used[j], >=, rm->rm_firstdatacol); 1207 jj = used[j] - rm->rm_firstdatacol; 1208 ASSERT3S(jj, <, n); 1209 invrows[i][j] = rows[i][jj]; 1210 rows[i][jj] = 0; 1211 } 1212 } 1213 1214 /* 1215 * For each of the rows of interest, we must normalize it and subtract 1216 * a multiple of it from the other rows. 1217 */ 1218 for (i = 0; i < nmissing; i++) { 1219 for (j = 0; j < missing[i]; j++) { 1220 ASSERT0(rows[i][j]); 1221 } 1222 ASSERT3U(rows[i][missing[i]], !=, 0); 1223 1224 /* 1225 * Compute the inverse of the first element and multiply each 1226 * element in the row by that value. 
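 * In GF(2^8) the inverse of an element a is 2^(255 - log_2(a)), since
 * the multiplicative group has order 255 (a * 2^(255 - log_2(a)) =
 * 2^255 = 1); scaling the row by that power of 2 is therefore a
 * division by the pivot.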
1227 */ 1228 log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 1229 1230 for (j = 0; j < n; j++) { 1231 rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 1232 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 1233 } 1234 1235 for (ii = 0; ii < nmissing; ii++) { 1236 if (i == ii) 1237 continue; 1238 1239 ASSERT3U(rows[ii][missing[i]], !=, 0); 1240 1241 log = vdev_raidz_log2[rows[ii][missing[i]]]; 1242 1243 for (j = 0; j < n; j++) { 1244 rows[ii][j] ^= 1245 vdev_raidz_exp2(rows[i][j], log); 1246 invrows[ii][j] ^= 1247 vdev_raidz_exp2(invrows[i][j], log); 1248 } 1249 } 1250 } 1251 1252 /* 1253 * Verify that the data that is left in the rows are properly part of 1254 * an identity matrix. 1255 */ 1256 for (i = 0; i < nmissing; i++) { 1257 for (j = 0; j < n; j++) { 1258 if (j == missing[i]) { 1259 ASSERT3U(rows[i][j], ==, 1); 1260 } else { 1261 ASSERT0(rows[i][j]); 1262 } 1263 } 1264 } 1265 } 1266 1267 static void 1268 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, 1269 int *missing, uint8_t **invrows, const uint8_t *used) 1270 { 1271 int i, j, x, cc, c; 1272 uint8_t *src; 1273 uint64_t ccount; 1274 uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL }; 1275 uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 }; 1276 uint8_t log = 0; 1277 uint8_t val; 1278 int ll; 1279 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 1280 uint8_t *p, *pp; 1281 size_t psize; 1282 1283 psize = sizeof (invlog[0][0]) * n * nmissing; 1284 p = kmem_alloc(psize, KM_SLEEP); 1285 1286 for (pp = p, i = 0; i < nmissing; i++) { 1287 invlog[i] = pp; 1288 pp += n; 1289 } 1290 1291 for (i = 0; i < nmissing; i++) { 1292 for (j = 0; j < n; j++) { 1293 ASSERT3U(invrows[i][j], !=, 0); 1294 invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 1295 } 1296 } 1297 1298 for (i = 0; i < n; i++) { 1299 c = used[i]; 1300 ASSERT3U(c, <, rm->rm_cols); 1301 1302 src = abd_to_buf(rm->rm_col[c].rc_abd); 1303 ccount = rm->rm_col[c].rc_size; 1304 for (j = 0; j < nmissing; j++) { 1305 cc = missing[j] + rm->rm_firstdatacol; 1306 ASSERT3U(cc, >=, rm->rm_firstdatacol); 1307 ASSERT3U(cc, <, rm->rm_cols); 1308 ASSERT3U(cc, !=, c); 1309 1310 dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd); 1311 dcount[j] = rm->rm_col[cc].rc_size; 1312 } 1313 1314 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); 1315 1316 for (x = 0; x < ccount; x++, src++) { 1317 if (*src != 0) 1318 log = vdev_raidz_log2[*src]; 1319 1320 for (cc = 0; cc < nmissing; cc++) { 1321 if (x >= dcount[cc]) 1322 continue; 1323 1324 if (*src == 0) { 1325 val = 0; 1326 } else { 1327 if ((ll = log + invlog[cc][i]) >= 255) 1328 ll -= 255; 1329 val = vdev_raidz_pow2[ll]; 1330 } 1331 1332 if (i == 0) 1333 dst[cc][x] = val; 1334 else 1335 dst[cc][x] ^= val; 1336 } 1337 } 1338 } 1339 1340 kmem_free(p, psize); 1341 } 1342 1343 static int 1344 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) 1345 { 1346 int n, i, c, t, tt; 1347 int nmissing_rows; 1348 int missing_rows[VDEV_RAIDZ_MAXPARITY]; 1349 int parity_map[VDEV_RAIDZ_MAXPARITY]; 1350 1351 uint8_t *p, *pp; 1352 size_t psize; 1353 1354 uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 1355 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 1356 uint8_t *used; 1357 1358 abd_t **bufs = NULL; 1359 1360 int code = 0; 1361 1362 /* 1363 * Matrix reconstruction can't use scatter ABDs yet, so we allocate 1364 * temporary linear ABDs. 
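 * The original scatter ABDs are stashed in bufs[] so that, once the
 * matrix math is done, the reconstructed bytes can be copied back into
 * them and the temporary linear ABDs freed.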
1365 */ 1366 if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) { 1367 bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE); 1368 1369 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1370 raidz_col_t *col = &rm->rm_col[c]; 1371 1372 bufs[c] = col->rc_abd; 1373 col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE); 1374 ASSERT3S(col->rc_abd->abd_size, >=, col->rc_size); 1375 ASSERT3S(bufs[c]->abd_size, >=, col->rc_size); 1376 abd_copy_off(col->rc_abd, bufs[c], 0, 0, col->rc_size); 1377 } 1378 } 1379 1380 n = rm->rm_cols - rm->rm_firstdatacol; 1381 1382 /* 1383 * Figure out which data columns are missing. 1384 */ 1385 nmissing_rows = 0; 1386 for (t = 0; t < ntgts; t++) { 1387 if (tgts[t] >= rm->rm_firstdatacol) { 1388 missing_rows[nmissing_rows++] = 1389 tgts[t] - rm->rm_firstdatacol; 1390 } 1391 } 1392 1393 /* 1394 * Figure out which parity columns to use to help generate the missing 1395 * data columns. 1396 */ 1397 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 1398 ASSERT(tt < ntgts); 1399 ASSERT(c < rm->rm_firstdatacol); 1400 1401 /* 1402 * Skip any targeted parity columns. 1403 */ 1404 if (c == tgts[tt]) { 1405 tt++; 1406 continue; 1407 } 1408 1409 code |= 1 << c; 1410 1411 parity_map[i] = c; 1412 i++; 1413 } 1414 1415 ASSERT(code != 0); 1416 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); 1417 1418 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1419 nmissing_rows * n + sizeof (used[0]) * n; 1420 p = kmem_alloc(psize, KM_SLEEP); 1421 1422 for (pp = p, i = 0; i < nmissing_rows; i++) { 1423 rows[i] = pp; 1424 pp += n; 1425 invrows[i] = pp; 1426 pp += n; 1427 } 1428 used = pp; 1429 1430 for (i = 0; i < nmissing_rows; i++) { 1431 used[i] = parity_map[i]; 1432 } 1433 1434 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1435 if (tt < nmissing_rows && 1436 c == missing_rows[tt] + rm->rm_firstdatacol) { 1437 tt++; 1438 continue; 1439 } 1440 1441 ASSERT3S(i, <, n); 1442 used[i] = c; 1443 i++; 1444 } 1445 1446 /* 1447 * Initialize the interesting rows of the matrix. 1448 */ 1449 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); 1450 1451 /* 1452 * Invert the matrix. 1453 */ 1454 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, 1455 invrows, used); 1456 1457 /* 1458 * Reconstruct the missing data using the generated matrix. 1459 */ 1460 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, 1461 invrows, used); 1462 1463 kmem_free(p, psize); 1464 1465 /* 1466 * copy back from temporary linear abds and free them 1467 */ 1468 if (bufs) { 1469 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1470 raidz_col_t *col = &rm->rm_col[c]; 1471 1472 ASSERT3S(bufs[c]->abd_size, >=, col->rc_size); 1473 ASSERT3S(col->rc_abd->abd_size, >=, col->rc_size); 1474 abd_copy_off(bufs[c], col->rc_abd, 0, 0, col->rc_size); 1475 abd_free(col->rc_abd); 1476 col->rc_abd = bufs[c]; 1477 } 1478 kmem_free(bufs, rm->rm_cols * sizeof (abd_t *)); 1479 } 1480 1481 return (code); 1482 } 1483 1484 int 1485 vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) 1486 { 1487 int tgts[VDEV_RAIDZ_MAXPARITY], *dt; 1488 int ntgts; 1489 int i, c, ret; 1490 int code; 1491 int nbadparity, nbaddata; 1492 int parity_valid[VDEV_RAIDZ_MAXPARITY]; 1493 1494 /* 1495 * The tgts list must already be sorted. 
1496 */ 1497 for (i = 1; i < nt; i++) { 1498 ASSERT(t[i] > t[i - 1]); 1499 } 1500 1501 nbadparity = rm->rm_firstdatacol; 1502 nbaddata = rm->rm_cols - nbadparity; 1503 ntgts = 0; 1504 for (i = 0, c = 0; c < rm->rm_cols; c++) { 1505 if (c < rm->rm_firstdatacol) 1506 parity_valid[c] = B_FALSE; 1507 1508 if (i < nt && c == t[i]) { 1509 tgts[ntgts++] = c; 1510 i++; 1511 } else if (rm->rm_col[c].rc_error != 0) { 1512 tgts[ntgts++] = c; 1513 } else if (c >= rm->rm_firstdatacol) { 1514 nbaddata--; 1515 } else { 1516 parity_valid[c] = B_TRUE; 1517 nbadparity--; 1518 } 1519 } 1520 1521 ASSERT(ntgts >= nt); 1522 ASSERT(nbaddata >= 0); 1523 ASSERT(nbaddata + nbadparity == ntgts); 1524 1525 dt = &tgts[nbadparity]; 1526 1527 /* Reconstruct using the new math implementation */ 1528 ret = vdev_raidz_math_reconstruct(rm, parity_valid, dt, nbaddata); 1529 if (ret != RAIDZ_ORIGINAL_IMPL) 1530 return (ret); 1531 1532 /* 1533 * See if we can use any of our optimized reconstruction routines. 1534 */ 1535 switch (nbaddata) { 1536 case 1: 1537 if (parity_valid[VDEV_RAIDZ_P]) 1538 return (vdev_raidz_reconstruct_p(rm, dt, 1)); 1539 1540 ASSERT(rm->rm_firstdatacol > 1); 1541 1542 if (parity_valid[VDEV_RAIDZ_Q]) 1543 return (vdev_raidz_reconstruct_q(rm, dt, 1)); 1544 1545 ASSERT(rm->rm_firstdatacol > 2); 1546 break; 1547 1548 case 2: 1549 ASSERT(rm->rm_firstdatacol > 1); 1550 1551 if (parity_valid[VDEV_RAIDZ_P] && 1552 parity_valid[VDEV_RAIDZ_Q]) 1553 return (vdev_raidz_reconstruct_pq(rm, dt, 2)); 1554 1555 ASSERT(rm->rm_firstdatacol > 2); 1556 1557 break; 1558 } 1559 1560 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); 1561 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); 1562 ASSERT(code > 0); 1563 return (code); 1564 } 1565 1566 static int 1567 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, 1568 uint64_t *ashift) 1569 { 1570 vdev_t *cvd; 1571 uint64_t nparity = vd->vdev_nparity; 1572 int c; 1573 int lasterror = 0; 1574 int numerrors = 0; 1575 1576 ASSERT(nparity > 0); 1577 1578 if (nparity > VDEV_RAIDZ_MAXPARITY || 1579 vd->vdev_children < nparity + 1) { 1580 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 1581 return (SET_ERROR(EINVAL)); 1582 } 1583 1584 vdev_open_children(vd); 1585 1586 for (c = 0; c < vd->vdev_children; c++) { 1587 cvd = vd->vdev_child[c]; 1588 1589 if (cvd->vdev_open_error != 0) { 1590 lasterror = cvd->vdev_open_error; 1591 numerrors++; 1592 continue; 1593 } 1594 1595 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 1596 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; 1597 *ashift = MAX(*ashift, cvd->vdev_ashift); 1598 } 1599 1600 *asize *= vd->vdev_children; 1601 *max_asize *= vd->vdev_children; 1602 1603 if (numerrors > nparity) { 1604 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 1605 return (lasterror); 1606 } 1607 1608 return (0); 1609 } 1610 1611 static void 1612 vdev_raidz_close(vdev_t *vd) 1613 { 1614 int c; 1615 1616 for (c = 0; c < vd->vdev_children; c++) 1617 vdev_close(vd->vdev_child[c]); 1618 } 1619 1620 /* 1621 * Handle a read or write I/O to a RAID-Z dump device. 1622 * 1623 * The dump device is in a unique situation compared to other ZFS datasets: 1624 * writing to this device should be as simple and fast as possible. In 1625 * addition, durability matters much less since the dump will be extracted 1626 * once the machine reboots. For that reason, this function eschews parity for 1627 * performance and simplicity. 
The dump device uses the checksum setting 1628 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this 1629 * dataset. 1630 * 1631 * Blocks of size 128 KB have been preallocated for this volume. I/Os less than 1632 * 128 KB will not fill an entire block; in addition, they may not be properly 1633 * aligned. In that case, this function uses the preallocated 128 KB block and 1634 * omits reading or writing any "empty" portions of that block, as opposed to 1635 * allocating a fresh appropriately-sized block. 1636 * 1637 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs: 1638 * 1639 * vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB) 1640 * 1641 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be 1642 * allocated which spans all five child vdevs. 8 KB of data would be written to 1643 * each of four vdevs, with the fifth containing the parity bits. 1644 * 1645 * parity data data data data 1646 * | PP | XX | XX | XX | XX | 1647 * ^ ^ ^ ^ ^ 1648 * | | | | | 1649 * 8 KB parity ------8 KB data blocks------ 1650 * 1651 * However, when writing to the dump device, the behavior is different: 1652 * 1653 * vdev_raidz_dumpio(data, size: 32 KB, offset: 64 KB) 1654 * 1655 * Unlike the normal RAID-Z case in which the block is allocated based on the 1656 * I/O size, reads and writes here always use a 128 KB logical I/O size. If the 1657 * I/O size is less than 128 KB, only the actual portions of data are written. 1658 * In this example the data is written to the third data vdev since that vdev 1659 * contains the offset [64 KB, 96 KB). 1660 * 1661 * parity data data data data 1662 * | | | | XX | | 1663 * ^ 1664 * | 1665 * 32 KB data block 1666 * 1667 * As a result, an individual I/O may not span all child vdevs; moreover, a 1668 * small I/O may only operate on a single child vdev. 1669 * 1670 * Note that since there are no parity bits calculated or written, this format 1671 * remains the same no matter how many parity bits are used in a normal RAID-Z 1672 * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above 1673 * would look like: 1674 * 1675 * parity parity parity data data data data 1676 * | | | | | | XX | | 1677 * ^ 1678 * | 1679 * 32 KB data block 1680 */ 1681 static int 1682 vdev_raidz_dumpio(vdev_t *vd, caddr_t data, size_t size, 1683 uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump) 1684 { 1685 vdev_t *tvd = vd->vdev_top; 1686 vdev_t *cvd; 1687 raidz_map_t *rm; 1688 raidz_col_t *rc; 1689 int c, err = 0; 1690 1691 uint64_t start, end, colstart, colend; 1692 uint64_t coloffset, colsize, colskip; 1693 1694 #ifdef _KERNEL 1695 1696 /* 1697 * Don't write past the end of the block 1698 */ 1699 VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE); 1700 1701 start = offset; 1702 end = start + size; 1703 1704 /* 1705 * Allocate a RAID-Z map for this block. Note that this block starts 1706 * from the "original" offset, this is, the offset of the extent which 1707 * contains the requisite offset of the data being read or written. 1708 * 1709 * Even if this I/O operation doesn't span the full block size, let's 1710 * treat the on-disk format as if the only blocks are the complete 128 1711 * KB size. 1712 */ 1713 1714 /* First, fake a zio for vdev_raidz_map_alloc. 
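 * Only the fields vdev_raidz_map_alloc() actually reads -- io_offset,
 * io_size and io_abd -- need to be filled in; the ABD is set up to
 * cover the full 128 KB block that contains the requested range.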
*/ 1715 zio_t *zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP); 1716 zio->io_offset = origoffset; 1717 zio->io_size = SPA_OLD_MAXBLOCKSIZE; 1718 zio->io_abd = abd_get_from_buf(data - (offset - origoffset), 1719 SPA_OLD_MAXBLOCKSIZE); 1720 1721 rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, 1722 vd->vdev_nparity); 1723 1724 coloffset = origoffset; 1725 1726 for (c = rm->rm_firstdatacol; c < rm->rm_cols; 1727 c++, coloffset += rc->rc_size) { 1728 rc = &rm->rm_col[c]; 1729 cvd = vd->vdev_child[rc->rc_devidx]; 1730 1731 if (cvd->vdev_ops->vdev_op_dumpio == NULL) { 1732 err = EINVAL; 1733 break; 1734 } 1735 1736 /* 1737 * Find the start and end of this column in the RAID-Z map, 1738 * keeping in mind that the stated size and offset of the 1739 * operation may not fill the entire column for this vdev. 1740 * 1741 * If any portion of the data spans this column, issue the 1742 * appropriate operation to the vdev. 1743 */ 1744 if (coloffset + rc->rc_size <= start) 1745 continue; 1746 if (coloffset >= end) 1747 continue; 1748 1749 colstart = MAX(coloffset, start); 1750 colend = MIN(end, coloffset + rc->rc_size); 1751 colsize = colend - colstart; 1752 colskip = colstart - coloffset; 1753 1754 VERIFY3U(colsize, <=, rc->rc_size); 1755 VERIFY3U(colskip, <=, rc->rc_size); 1756 1757 /* 1758 * Note that the child vdev will have a vdev label at the start 1759 * of its range of offsets, hence the need for 1760 * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another 1761 * example of why this calculation is needed. 1762 */ 1763 if ((err = cvd->vdev_ops->vdev_op_dumpio(cvd, 1764 ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize, 1765 VDEV_LABEL_OFFSET(rc->rc_offset) + colskip, 0, 1766 doread, isdump)) != 0) 1767 break; 1768 } 1769 1770 vdev_raidz_map_free(rm); 1771 abd_put(zio->io_abd); 1772 kmem_free(zio, sizeof (zio_t)); 1773 1774 #endif /* KERNEL */ 1775 1776 return (err); 1777 } 1778 1779 static uint64_t 1780 vdev_raidz_asize(vdev_t *vd, uint64_t psize) 1781 { 1782 uint64_t asize; 1783 uint64_t ashift = vd->vdev_top->vdev_ashift; 1784 uint64_t cols = vd->vdev_children; 1785 uint64_t nparity = vd->vdev_nparity; 1786 1787 asize = ((psize - 1) >> ashift) + 1; 1788 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 1789 asize = roundup(asize, nparity + 1) << ashift; 1790 1791 return (asize); 1792 } 1793 1794 static void 1795 vdev_raidz_child_done(zio_t *zio) 1796 { 1797 raidz_col_t *rc = zio->io_private; 1798 1799 rc->rc_error = zio->io_error; 1800 rc->rc_tried = 1; 1801 rc->rc_skipped = 0; 1802 } 1803 1804 static void 1805 vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col) 1806 { 1807 #ifdef ZFS_DEBUG 1808 vdev_t *vd = zio->io_vd; 1809 vdev_t *tvd = vd->vdev_top; 1810 1811 range_seg64_t logical_rs, physical_rs; 1812 logical_rs.rs_start = zio->io_offset; 1813 logical_rs.rs_end = logical_rs.rs_start + 1814 vdev_raidz_asize(zio->io_vd, zio->io_size); 1815 1816 raidz_col_t *rc = &rm->rm_col[col]; 1817 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 1818 1819 vdev_xlate(cvd, &logical_rs, &physical_rs); 1820 ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); 1821 ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); 1822 /* 1823 * It would be nice to assert that rs_end is equal 1824 * to rc_offset + rc_size but there might be an 1825 * optional I/O at the end that is not accounted in 1826 * rc_size. 
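 * (That optional I/O is the skip-sector child zio issued with
 * ZIO_FLAG_OPTIONAL from vdev_raidz_io_start(), which can extend the
 * physical range by exactly one sector -- hence the 1 << ashift
 * allowance below.)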
1827 */ 1828 if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { 1829 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + 1830 rc->rc_size + (1 << tvd->vdev_ashift)); 1831 } else { 1832 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); 1833 } 1834 #endif 1835 } 1836 1837 /* 1838 * Start an IO operation on a RAIDZ VDev 1839 * 1840 * Outline: 1841 * - For write operations: 1842 * 1. Generate the parity data 1843 * 2. Create child zio write operations to each column's vdev, for both 1844 * data and parity. 1845 * 3. If the column skips any sectors for padding, create optional dummy 1846 * write zio children for those areas to improve aggregation continuity. 1847 * - For read operations: 1848 * 1. Create child zio read operations to each data column's vdev to read 1849 * the range of data required for zio. 1850 * 2. If this is a scrub or resilver operation, or if any of the data 1851 * vdevs have had errors, then create zio read operations to the parity 1852 * columns' VDevs as well. 1853 */ 1854 static void 1855 vdev_raidz_io_start(zio_t *zio) 1856 { 1857 vdev_t *vd = zio->io_vd; 1858 vdev_t *tvd = vd->vdev_top; 1859 vdev_t *cvd; 1860 raidz_map_t *rm; 1861 raidz_col_t *rc; 1862 int c, i; 1863 1864 rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, 1865 vd->vdev_nparity); 1866 1867 zio->io_vsd = rm; 1868 zio->io_vsd_ops = &vdev_raidz_vsd_ops; 1869 1870 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); 1871 1872 if (zio->io_type == ZIO_TYPE_WRITE) { 1873 vdev_raidz_generate_parity(rm); 1874 1875 for (c = 0; c < rm->rm_cols; c++) { 1876 rc = &rm->rm_col[c]; 1877 cvd = vd->vdev_child[rc->rc_devidx]; 1878 1879 /* 1880 * Verify physical to logical translation. 1881 */ 1882 vdev_raidz_io_verify(zio, rm, c); 1883 1884 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1885 rc->rc_offset, rc->rc_abd, rc->rc_size, 1886 zio->io_type, zio->io_priority, 0, 1887 vdev_raidz_child_done, rc)); 1888 } 1889 1890 /* 1891 * Generate optional I/Os for any skipped sectors to improve 1892 * aggregation contiguity. 1893 */ 1894 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { 1895 ASSERT(c <= rm->rm_scols); 1896 if (c == rm->rm_scols) 1897 c = 0; 1898 rc = &rm->rm_col[c]; 1899 cvd = vd->vdev_child[rc->rc_devidx]; 1900 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1901 rc->rc_offset + rc->rc_size, NULL, 1902 1 << tvd->vdev_ashift, 1903 zio->io_type, zio->io_priority, 1904 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); 1905 } 1906 1907 zio_execute(zio); 1908 return; 1909 } 1910 1911 ASSERT(zio->io_type == ZIO_TYPE_READ); 1912 1913 /* 1914 * Iterate over the columns in reverse order so that we hit the parity 1915 * last -- any errors along the way will force us to read the parity. 
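 * In practice a healthy, non-scrub read only issues child I/Os to the
 * data columns; parity columns (c < rm_firstdatacol) are read only when
 * a data column is already known to be missing or when the zio is a
 * scrub or resilver.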
static void
vdev_raidz_child_done(zio_t *zio)
{
        raidz_col_t *rc = zio->io_private;

        rc->rc_error = zio->io_error;
        rc->rc_tried = 1;
        rc->rc_skipped = 0;
}

static void
vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col)
{
#ifdef ZFS_DEBUG
        vdev_t *vd = zio->io_vd;
        vdev_t *tvd = vd->vdev_top;

        range_seg64_t logical_rs, physical_rs;
        logical_rs.rs_start = zio->io_offset;
        logical_rs.rs_end = logical_rs.rs_start +
            vdev_raidz_asize(zio->io_vd, zio->io_size);

        raidz_col_t *rc = &rm->rm_col[col];
        vdev_t *cvd = vd->vdev_child[rc->rc_devidx];

        vdev_xlate(cvd, &logical_rs, &physical_rs);
        ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
        ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
        /*
         * It would be nice to assert that rs_end is equal
         * to rc_offset + rc_size but there might be an
         * optional I/O at the end that is not accounted in
         * rc_size.
         */
        if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
                ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
                    rc->rc_size + (1 << tvd->vdev_ashift));
        } else {
                ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
        }
#endif
}

/*
 * Start an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Generate the parity data
 *   2. Create child zio write operations to each column's vdev, for both
 *      data and parity.
 *   3. If the column skips any sectors for padding, create optional dummy
 *      write zio children for those areas to improve aggregation contiguity.
 * - For read operations:
 *   1. Create child zio read operations to each data column's vdev to read
 *      the range of data required for zio.
 *   2. If this is a scrub or resilver operation, or if any of the data
 *      vdevs have had errors, then create zio read operations to the parity
 *      columns' VDevs as well.
 */
static void
vdev_raidz_io_start(zio_t *zio)
{
        vdev_t *vd = zio->io_vd;
        vdev_t *tvd = vd->vdev_top;
        vdev_t *cvd;
        raidz_map_t *rm;
        raidz_col_t *rc;
        int c, i;

        rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
            vd->vdev_nparity);

        zio->io_vsd = rm;
        zio->io_vsd_ops = &vdev_raidz_vsd_ops;

        ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

        if (zio->io_type == ZIO_TYPE_WRITE) {
                vdev_raidz_generate_parity(rm);

                for (c = 0; c < rm->rm_cols; c++) {
                        rc = &rm->rm_col[c];
                        cvd = vd->vdev_child[rc->rc_devidx];

                        /*
                         * Verify physical to logical translation.
                         */
                        vdev_raidz_io_verify(zio, rm, c);

                        zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
                            rc->rc_offset, rc->rc_abd, rc->rc_size,
                            zio->io_type, zio->io_priority, 0,
                            vdev_raidz_child_done, rc));
                }

                /*
                 * Generate optional I/Os for any skipped sectors to improve
                 * aggregation contiguity.
                 */
                for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
                        ASSERT(c <= rm->rm_scols);
                        if (c == rm->rm_scols)
                                c = 0;
                        rc = &rm->rm_col[c];
                        cvd = vd->vdev_child[rc->rc_devidx];
                        zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
                            rc->rc_offset + rc->rc_size, NULL,
                            1 << tvd->vdev_ashift,
                            zio->io_type, zio->io_priority,
                            ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
                }

                zio_execute(zio);
                return;
        }

        ASSERT(zio->io_type == ZIO_TYPE_READ);

        /*
         * Iterate over the columns in reverse order so that we hit the parity
         * last -- any errors along the way will force us to read the parity.
         */
        for (c = rm->rm_cols - 1; c >= 0; c--) {
                rc = &rm->rm_col[c];
                cvd = vd->vdev_child[rc->rc_devidx];
                if (!vdev_readable(cvd)) {
                        if (c >= rm->rm_firstdatacol)
                                rm->rm_missingdata++;
                        else
                                rm->rm_missingparity++;
                        rc->rc_error = SET_ERROR(ENXIO);
                        rc->rc_tried = 1;  /* don't even try */
                        rc->rc_skipped = 1;
                        continue;
                }
                if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
                        if (c >= rm->rm_firstdatacol)
                                rm->rm_missingdata++;
                        else
                                rm->rm_missingparity++;
                        rc->rc_error = SET_ERROR(ESTALE);
                        rc->rc_skipped = 1;
                        continue;
                }
                if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
                    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
                        zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
                            rc->rc_offset, rc->rc_abd, rc->rc_size,
                            zio->io_type, zio->io_priority, 0,
                            vdev_raidz_child_done, rc));
                }
        }

        zio_execute(zio);
}

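/*
 * For illustration, consider vdev_raidz_io_start() on the hypothetical
 * raidz2 example above (6 children, ashift = 9, a 6144-byte allocation):
 *
 * o A write issues one child write per column (six here, two of them
 *   parity), plus a 512-byte optional ZIO_FLAG_NODATA write for each
 *   skip sector the map contains, so that writes on either side of the
 *   padding hole can still be aggregated.
 *
 * o A normal read issues child reads only to the four data columns; the
 *   parity columns are left untried (rc_tried == 0) so that
 *   vdev_raidz_io_done() can fetch them later if the data fails its
 *   checksum.  A scrub/resilver read, or a read with a missing data
 *   child, issues reads to the parity columns as well.
 */
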

/*
 * Report a checksum error for a child of a RAID-Z device.
 */
static void
raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
{
        vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];

        if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
                zio_bad_cksum_t zbc;
                raidz_map_t *rm = zio->io_vsd;

                mutex_enter(&vd->vdev_stat_lock);
                vd->vdev_stat.vs_checksum_errors++;
                mutex_exit(&vd->vdev_stat_lock);

                zbc.zbc_has_cksum = 0;
                zbc.zbc_injected = rm->rm_ecksuminjected;

                (void) zfs_ereport_post_checksum(zio->io_spa, vd,
                    &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
                    rc->rc_abd, bad_data, &zbc);
        }
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(zio_t *zio)
{
        zio_bad_cksum_t zbc;
        raidz_map_t *rm = zio->io_vsd;

        int ret = zio_checksum_error(zio, &zbc);
        if (ret != 0 && zbc.zbc_injected != 0)
                rm->rm_ecksuminjected = 1;

        return (ret);
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
{
        abd_t *orig[VDEV_RAIDZ_MAXPARITY];
        int c, ret = 0;
        raidz_col_t *rc;

        blkptr_t *bp = zio->io_bp;
        enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
            (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));

        if (checksum == ZIO_CHECKSUM_NOPARITY)
                return (ret);

        for (c = 0; c < rm->rm_firstdatacol; c++) {
                rc = &rm->rm_col[c];
                if (!rc->rc_tried || rc->rc_error != 0)
                        continue;
                orig[c] = abd_alloc_sametype(rc->rc_abd, rc->rc_size);
                abd_copy(orig[c], rc->rc_abd, rc->rc_size);
        }

        vdev_raidz_generate_parity(rm);

        for (c = 0; c < rm->rm_firstdatacol; c++) {
                rc = &rm->rm_col[c];
                if (!rc->rc_tried || rc->rc_error != 0)
                        continue;
                if (abd_cmp(orig[c], rc->rc_abd, rc->rc_abd->abd_size) != 0) {
                        raidz_checksum_error(zio, rc, orig[c]);
                        rc->rc_error = SET_ERROR(ECKSUM);
                        ret++;
                }
                abd_free(orig[c]);
        }

        return (ret);
}

static int
vdev_raidz_worst_error(raidz_map_t *rm)
{
        int error = 0;

        for (int c = 0; c < rm->rm_cols; c++)
                error = zio_worst_error(error, rm->rm_col[c].rc_error);

        return (error);
}

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 */
static int
vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
{
        raidz_map_t *rm = zio->io_vsd;
        raidz_col_t *rc;
        abd_t *orig[VDEV_RAIDZ_MAXPARITY];
        int tstore[VDEV_RAIDZ_MAXPARITY + 2];
        int *tgts = &tstore[1];
        int current, next, i, c, n;
        int code, ret = 0;

        ASSERT(total_errors < rm->rm_firstdatacol);

        /*
         * This simplifies one edge condition.
         */
        tgts[-1] = -1;

        for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
                /*
                 * Initialize the targets array by finding the first n columns
                 * that contain no error.
                 *
                 * If there were no data errors, we need to ensure that we're
                 * always explicitly attempting to reconstruct at least one
                 * data column. To do this, we simply push the highest target
                 * up into the data columns.
                 */
                for (c = 0, i = 0; i < n; i++) {
                        if (i == n - 1 && data_errors == 0 &&
                            c < rm->rm_firstdatacol) {
                                c = rm->rm_firstdatacol;
                        }

                        while (rm->rm_col[c].rc_error != 0) {
                                c++;
                                ASSERT3S(c, <, rm->rm_cols);
                        }

                        tgts[i] = c++;
                }

                /*
                 * Setting tgts[n] simplifies the other edge condition.
                 */
                tgts[n] = rm->rm_cols;

                /*
                 * These buffers were allocated in previous iterations.
                 */
                for (i = 0; i < n - 1; i++) {
                        ASSERT(orig[i] != NULL);
                }

                orig[n - 1] = abd_alloc_sametype(rm->rm_col[0].rc_abd,
                    rm->rm_col[0].rc_size);

                current = 0;
                next = tgts[current];

                while (current != n) {
                        tgts[current] = next;
                        current = 0;

                        /*
                         * Save off the original data that we're going to
                         * attempt to reconstruct.
                         */
                        for (i = 0; i < n; i++) {
                                ASSERT(orig[i] != NULL);
                                c = tgts[i];
                                ASSERT3S(c, >=, 0);
                                ASSERT3S(c, <, rm->rm_cols);
                                rc = &rm->rm_col[c];
                                ASSERT3S(orig[i]->abd_size, >=, rc->rc_size);
                                ASSERT3S(rc->rc_abd->abd_size, >=, rc->rc_size);
                                abd_copy_off(orig[i], rc->rc_abd, 0, 0,
                                    rc->rc_size);
                        }

                        /*
                         * Attempt a reconstruction and exit the outer loop on
                         * success.
                         */
                        code = vdev_raidz_reconstruct(rm, tgts, n);
                        if (raidz_checksum_verify(zio) == 0) {

                                for (i = 0; i < n; i++) {
                                        c = tgts[i];
                                        rc = &rm->rm_col[c];
                                        ASSERT(rc->rc_error == 0);
                                        if (rc->rc_tried)
                                                raidz_checksum_error(zio, rc,
                                                    orig[i]);
                                        rc->rc_error = SET_ERROR(ECKSUM);
                                }

                                ret = code;
                                goto done;
                        }

                        /*
                         * Restore the original data.
                         */
                        for (i = 0; i < n; i++) {
                                c = tgts[i];
                                rc = &rm->rm_col[c];
                                ASSERT3S(rc->rc_abd->abd_size, >=, rc->rc_size);
                                ASSERT3S(orig[i]->abd_size, >=, rc->rc_size);
                                abd_copy_off(rc->rc_abd, orig[i], 0, 0,
                                    rc->rc_size);
                        }

                        do {
                                /*
                                 * Find the next valid column after the current
                                 * position.
                                 */
                                for (next = tgts[current] + 1;
                                    next < rm->rm_cols &&
                                    rm->rm_col[next].rc_error != 0; next++)
                                        continue;

                                ASSERT(next <= tgts[current + 1]);

                                /*
                                 * If that spot is available, we're done here.
                                 */
                                if (next != tgts[current + 1])
                                        break;

                                /*
                                 * Otherwise, find the next valid column after
                                 * the previous position.
                                 */
                                for (c = tgts[current - 1] + 1;
                                    rm->rm_col[c].rc_error != 0; c++)
                                        continue;

                                tgts[current] = c;
                                current++;

                        } while (current != n);
                }
        }
        n--;
done:
        for (i = 0; i < n; i++)
                abd_free(orig[i]);

        return (ret);
}

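/*
 * For example, vdev_raidz_combrec() on a raidz2 map with columns P, Q,
 * D0, D1, D2 and no I/O errors (total_errors == data_errors == 0) first
 * tries each single data column as the reconstruction target (n == 1:
 * {D0}, {D1}, {D2}), and then every pair of columns that includes at
 * least one data column (n == 2).  Each attempt regenerates the targeted
 * columns from the remaining ones and is accepted only if the block then
 * passes its checksum.
 */
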
/*
 * Complete an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Check for errors on the child IOs.
 *   2. Return, setting an error code if too few child VDevs were written
 *      to reconstruct the data later.  Note that partial writes are
 *      considered successful if they can be reconstructed at all.
 * - For read operations:
 *   1. Check for errors on the child IOs.
 *   2. If data errors occurred:
 *      a. Try to reassemble the data from the parity available.
 *      b. If we haven't yet read the parity drives, read them now.
 *      c. If all parity drives have been read but the data still doesn't
 *         reassemble with a correct checksum, then try combinatorial
 *         reconstruction.
 *      d. If that doesn't work, return an error.
 *   3. If there were unexpected errors or this is a resilver operation,
 *      rewrite the vdevs that had errors.
 */
static void
vdev_raidz_io_done(zio_t *zio)
{
        vdev_t *vd = zio->io_vd;
        vdev_t *cvd;
        raidz_map_t *rm = zio->io_vsd;
        raidz_col_t *rc;
        int unexpected_errors = 0;
        int parity_errors = 0;
        int parity_untried = 0;
        int data_errors = 0;
        int total_errors = 0;
        int n, c;
        int tgts[VDEV_RAIDZ_MAXPARITY];
        int code;

        ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */

        ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
        ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

        for (c = 0; c < rm->rm_cols; c++) {
                rc = &rm->rm_col[c];

                if (rc->rc_error) {
                        ASSERT(rc->rc_error != ECKSUM);  /* child has no bp */

                        if (c < rm->rm_firstdatacol)
                                parity_errors++;
                        else
                                data_errors++;

                        if (!rc->rc_skipped)
                                unexpected_errors++;

                        total_errors++;
                } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
                        parity_untried++;
                }
        }

        if (zio->io_type == ZIO_TYPE_WRITE) {
                /*
                 * XXX -- for now, treat partial writes as a success.
                 * (If we couldn't write enough columns to reconstruct
                 * the data, the I/O failed.  Otherwise, good enough.)
                 *
                 * Now that we support write reallocation, it would be better
                 * to treat partial failure as real failure unless there are
                 * no non-degraded top-level vdevs left, and not update DTLs
                 * if we intend to reallocate.
                 */
                /* XXPOLICY */
                if (total_errors > rm->rm_firstdatacol)
                        zio->io_error = vdev_raidz_worst_error(rm);

                return;
        }

        ASSERT(zio->io_type == ZIO_TYPE_READ);
        /*
         * There are three potential phases for a read:
         *      1. produce valid data from the columns read
         *      2. read all disks and try again
         *      3. perform combinatorial reconstruction
         *
         * Each phase is progressively both more expensive and less likely to
         * occur. If we encounter more errors than we can repair or all phases
         * fail, we have no choice but to return an error.
         */

        /*
         * If the number of errors we saw was correctable -- less than or equal
         * to the number of parity disks read -- attempt to produce data that
         * has a valid checksum. Naturally, this case applies in the absence of
         * any errors.
         */
        if (total_errors <= rm->rm_firstdatacol - parity_untried) {
                if (data_errors == 0) {
                        if (raidz_checksum_verify(zio) == 0) {
                                /*
                                 * If we read parity information (unnecessarily
                                 * as it happens since no reconstruction was
                                 * needed) regenerate and verify the parity.
                                 * We also regenerate parity when resilvering
                                 * so we can write it out to the failed device
                                 * later.
                                 */
                                if (parity_errors + parity_untried <
                                    rm->rm_firstdatacol ||
                                    (zio->io_flags & ZIO_FLAG_RESILVER)) {
                                        n = raidz_parity_verify(zio, rm);
                                        unexpected_errors += n;
                                        ASSERT(parity_errors + n <=
                                            rm->rm_firstdatacol);
                                }
                                goto done;
                        }
                } else {
                        /*
                         * We either attempt to read all the parity columns or
                         * none of them. If we didn't try to read parity, we
                         * wouldn't be here in the correctable case. There must
                         * also have been fewer parity errors than parity
                         * columns or, again, we wouldn't be in this code path.
                         */
                        ASSERT(parity_untried == 0);
                        ASSERT(parity_errors < rm->rm_firstdatacol);

                        /*
                         * Identify the data columns that reported an error.
                         */
                        n = 0;
                        for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
                                rc = &rm->rm_col[c];
                                if (rc->rc_error != 0) {
                                        ASSERT(n < VDEV_RAIDZ_MAXPARITY);
                                        tgts[n++] = c;
                                }
                        }

                        ASSERT(rm->rm_firstdatacol >= n);

                        code = vdev_raidz_reconstruct(rm, tgts, n);

                        if (raidz_checksum_verify(zio) == 0) {
                                /*
                                 * If we read more parity disks than were used
                                 * for reconstruction, confirm that the other
                                 * parity disks produced correct data. This
                                 * routine is suboptimal in that it regenerates
                                 * the parity that we already used in addition
                                 * to the parity that we're attempting to
                                 * verify, but this should be a relatively
                                 * uncommon case, and can be optimized if it
                                 * becomes a problem. Note that we regenerate
                                 * parity when resilvering so we can write it
                                 * out to failed devices later.
                                 */
                                if (parity_errors < rm->rm_firstdatacol - n ||
                                    (zio->io_flags & ZIO_FLAG_RESILVER)) {
                                        n = raidz_parity_verify(zio, rm);
                                        unexpected_errors += n;
                                        ASSERT(parity_errors + n <=
                                            rm->rm_firstdatacol);
                                }

                                goto done;
                        }
                }
        }

        /*
         * This isn't a typical situation -- either we got a read error or
         * a child silently returned bad data. Read every block so we can
         * try again with as much data and parity as we can track down. If
         * we've already been through once before, all children will be marked
         * as tried so we'll proceed to combinatorial reconstruction.
         */
        unexpected_errors = 1;
        rm->rm_missingdata = 0;
        rm->rm_missingparity = 0;

        for (c = 0; c < rm->rm_cols; c++) {
                if (rm->rm_col[c].rc_tried)
                        continue;

                zio_vdev_io_redone(zio);
                do {
                        rc = &rm->rm_col[c];
                        if (rc->rc_tried)
                                continue;
                        zio_nowait(zio_vdev_child_io(zio, NULL,
                            vd->vdev_child[rc->rc_devidx],
                            rc->rc_offset, rc->rc_abd, rc->rc_size,
                            zio->io_type, zio->io_priority, 0,
                            vdev_raidz_child_done, rc));
                } while (++c < rm->rm_cols);

                return;
        }

        /*
         * At this point we've attempted to reconstruct the data given the
         * errors we detected, and we've attempted to read all columns. There
         * must, therefore, be one or more additional problems -- silent errors
         * resulting in invalid data rather than explicit I/O errors resulting
         * in absent data. We check if there is enough additional data to
         * possibly reconstruct the data and then perform combinatorial
         * reconstruction over all possible combinations. If that fails,
         * we're cooked.
         */
        if (total_errors > rm->rm_firstdatacol) {
                zio->io_error = vdev_raidz_worst_error(rm);

        } else if (total_errors < rm->rm_firstdatacol &&
            (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
                /*
                 * If we didn't use all the available parity for the
                 * combinatorial reconstruction, verify that the remaining
                 * parity is correct.
                 */
                if (code != (1 << rm->rm_firstdatacol) - 1)
                        (void) raidz_parity_verify(zio, rm);
        } else {
                /*
                 * We're here because either:
                 *
                 *      total_errors == rm_firstdatacol, or
                 *      vdev_raidz_combrec() failed
                 *
                 * In either case, there is enough bad data to prevent
                 * reconstruction.
                 *
                 * Start checksum ereports for all children which haven't
                 * failed, provided the I/O wasn't speculative.
                 */
                zio->io_error = SET_ERROR(ECKSUM);

                if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
                        for (c = 0; c < rm->rm_cols; c++) {
                                rc = &rm->rm_col[c];
                                if (rc->rc_error == 0) {
                                        zio_bad_cksum_t zbc;
                                        zbc.zbc_has_cksum = 0;
                                        zbc.zbc_injected =
                                            rm->rm_ecksuminjected;

                                        zfs_ereport_start_checksum(
                                            zio->io_spa,
                                            vd->vdev_child[rc->rc_devidx],
                                            &zio->io_bookmark, zio,
                                            rc->rc_offset, rc->rc_size,
                                            (void *)(uintptr_t)c, &zbc);
                                }
                        }
                }
        }

done:
        zio_checksum_verified(zio);

        if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
            (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
                /*
                 * Use the good data we have in hand to repair damaged children.
                 */
                for (c = 0; c < rm->rm_cols; c++) {
                        rc = &rm->rm_col[c];
                        cvd = vd->vdev_child[rc->rc_devidx];

                        if (rc->rc_error == 0)
                                continue;

                        zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
                            rc->rc_offset, rc->rc_abd, rc->rc_size,
                            ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
                            ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
                            ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
                }
        }
}

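/*
 * For example, suppose a raidz2 read completes with no child I/O errors
 * but the block fails its checksum because one data child silently
 * returned bad data.  Phase 1 finds nothing to reconstruct (no column
 * reported an error), so vdev_raidz_io_done() re-issues reads to the
 * untried parity columns and returns.  On the second pass every column
 * is marked tried, so phase 3 -- vdev_raidz_combrec() -- tries each data
 * column, then each qualifying pair of columns, as the presumed bad ones
 * until the checksum verifies.  The column ultimately flagged with
 * ECKSUM is then rewritten from the reconstructed data by the repair
 * loop at the end of the function.
 */
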
static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
        if (faulted > vd->vdev_nparity)
                vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
                    VDEV_AUX_NO_REPLICAS);
        else if (degraded + faulted != 0)
                vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
        else
                vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}

/*
 * Determine if any portion of the provided block resides on a child vdev
 * with a dirty DTL and therefore needs to be resilvered. The function
 * assumes that at least one DTL is dirty which implies that full stripe
 * width blocks must be resilvered.
 */
static boolean_t
vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
{
        uint64_t dcols = vd->vdev_children;
        uint64_t nparity = vd->vdev_nparity;
        uint64_t ashift = vd->vdev_top->vdev_ashift;
        /* The starting RAIDZ (parent) vdev sector of the block. */
        uint64_t b = offset >> ashift;
        /* The zio's size in units of the vdev's minimum sector size. */
        uint64_t s = ((psize - 1) >> ashift) + 1;
        /* The first column for this stripe. */
        uint64_t f = b % dcols;

        if (s + nparity >= dcols)
                return (B_TRUE);

        for (uint64_t c = 0; c < s + nparity; c++) {
                uint64_t devidx = (f + c) % dcols;
                vdev_t *cvd = vd->vdev_child[devidx];

                /*
                 * dsl_scan_need_resilver() already checked vd with
                 * vdev_dtl_contains(). So here just check cvd with
                 * vdev_dtl_empty(), cheaper and a good approximation.
                 */
                if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
                        return (B_TRUE);
        }

        return (B_FALSE);
}

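/*
 * Worked example for vdev_raidz_need_resilver() (hypothetical
 * configuration): a 6-wide raidz2 vdev with ashift = 9 asked about a
 * 1024-byte block at offset 0x2600:
 *
 *      b = 0x2600 >> 9 = 19
 *      s = ((1024 - 1) >> 9) + 1 = 2
 *      f = 19 % 6 = 1
 *
 * Since s + nparity = 4 < dcols, only children 1, 2, 3 and 4 can hold a
 * piece of this block (data or parity), so only their DTLs are checked;
 * a dirty DTL on child 0 or 5 alone would not require this block to be
 * resilvered.
 */
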
static void
vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *in, range_seg64_t *res)
{
        vdev_t *raidvd = cvd->vdev_parent;
        ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);

        uint64_t width = raidvd->vdev_children;
        uint64_t tgt_col = cvd->vdev_id;
        uint64_t ashift = raidvd->vdev_top->vdev_ashift;

        /* make sure the offsets are block-aligned */
        ASSERT0(in->rs_start % (1 << ashift));
        ASSERT0(in->rs_end % (1 << ashift));
        uint64_t b_start = in->rs_start >> ashift;
        uint64_t b_end = in->rs_end >> ashift;

        uint64_t start_row = 0;
        if (b_start > tgt_col)  /* avoid underflow */
                start_row = ((b_start - tgt_col - 1) / width) + 1;

        uint64_t end_row = 0;
        if (b_end > tgt_col)
                end_row = ((b_end - tgt_col - 1) / width) + 1;

        res->rs_start = start_row << ashift;
        res->rs_end = end_row << ashift;

        ASSERT3U(res->rs_start, <=, in->rs_start);
        ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start);
}

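/*
 * Worked example for vdev_raidz_xlate() (hypothetical configuration):
 * a 6-wide raidz vdev with ashift = 9, translating the logical range
 * [0x1400, 0x2800) (parent sectors 10 through 19) for the child with
 * vdev_id == 2:
 *
 *      b_start = 10, b_end = 20
 *      start_row = ((10 - 2 - 1) / 6) + 1 = 2
 *      end_row   = ((20 - 2 - 1) / 6) + 1 = 3
 *
 * so the result is [0x400, 0x600): exactly one sector on that child,
 * corresponding to parent sector 2 * 6 + 2 = 14, the only sector of
 * column 2 that falls inside the input range.
 */
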
vdev_ops_t vdev_raidz_ops = {
        .vdev_op_open = vdev_raidz_open,
        .vdev_op_close = vdev_raidz_close,
        .vdev_op_asize = vdev_raidz_asize,
        .vdev_op_io_start = vdev_raidz_io_start,
        .vdev_op_io_done = vdev_raidz_io_done,
        .vdev_op_state_change = vdev_raidz_state_change,
        .vdev_op_need_resilver = vdev_raidz_need_resilver,
        .vdev_op_hold = NULL,
        .vdev_op_rele = NULL,
        .vdev_op_remap = NULL,
        .vdev_op_xlate = vdev_raidz_xlate,
        .vdev_op_dumpio = vdev_raidz_dumpio,
        .vdev_op_type = VDEV_TYPE_RAIDZ,  /* name of this vdev type */
        .vdev_op_leaf = B_FALSE           /* not a leaf vdev */
};