/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
 * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 * Copyright (c) 2014 Integros [integros.com]
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_file.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
#include <sys/vdev_raidz.h>
#include <sys/vdev_raidz_impl.h>

#ifdef ZFS_DEBUG
#include <sys/vdev.h>	/* For vdev_xlate() in vdev_raidz_io_verify() */
#endif

/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank, on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later to identify a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field of 2^N elements,
 * GF(2^N). In our case we choose N=8 so that every element can be expressed
 * with a single byte. Briefly, the operations on the field are defined as
 * follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). Since A^255 = 1 for any nonzero A, the inverse of a
 * field element A (A^-1) is A^(255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q, R over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^(n-1) * D_0 + 2^(n-2) * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^(n-1) * D_0 + 4^(n-2) * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property, which is why the uncorrected Plank method breaks down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
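 *
 * For example (an illustrative walk-through only, not code used by the
 * driver), take a 64-bit word whose low byte is 0xa3 and whose other bytes
 * are zero:
 *
 *	x    = 0x00000000000000a3
 *	mask = x & 0x8080808080808080           = 0x0000000000000080
 *	mask = (mask << 1) - (mask >> 7)        = 0x00000000000000ff
 *	x    = ((x << 1) & 0xfefefefefefefefe) ^ (mask & 0x1d1d1d1d1d1d1d1d)
 *	     = 0x0000000000000046 ^ 0x000000000000001d
 *	     = 0x000000000000005b
 *
 * which agrees with the single-byte rule above: ((0xa3 << 1) ^ 0x1d) & 0xff
 * is 0x5b. Bytes whose top bit is clear get a 0x00 mask and are simply
 * shifted left.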
125 */ 126 #define VDEV_RAIDZ_64MUL_2(x, mask) \ 127 { \ 128 (mask) = (x) & 0x8080808080808080ULL; \ 129 (mask) = ((mask) << 1) - ((mask) >> 7); \ 130 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ 131 ((mask) & 0x1d1d1d1d1d1d1d1dULL); \ 132 } 133 134 #define VDEV_RAIDZ_64MUL_4(x, mask) \ 135 { \ 136 VDEV_RAIDZ_64MUL_2((x), mask); \ 137 VDEV_RAIDZ_64MUL_2((x), mask); \ 138 } 139 140 void 141 vdev_raidz_map_free(raidz_map_t *rm) 142 { 143 int c; 144 145 for (c = 0; c < rm->rm_firstdatacol; c++) { 146 abd_free(rm->rm_col[c].rc_abd); 147 148 if (rm->rm_col[c].rc_gdata != NULL) 149 abd_free(rm->rm_col[c].rc_gdata); 150 } 151 152 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) 153 abd_put(rm->rm_col[c].rc_abd); 154 155 if (rm->rm_abd_copy != NULL) 156 abd_free(rm->rm_abd_copy); 157 158 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); 159 } 160 161 static void 162 vdev_raidz_map_free_vsd(zio_t *zio) 163 { 164 raidz_map_t *rm = zio->io_vsd; 165 166 ASSERT0(rm->rm_freed); 167 rm->rm_freed = 1; 168 169 if (rm->rm_reports == 0) 170 vdev_raidz_map_free(rm); 171 } 172 173 /*ARGSUSED*/ 174 static void 175 vdev_raidz_cksum_free(void *arg, size_t ignored) 176 { 177 raidz_map_t *rm = arg; 178 179 ASSERT3U(rm->rm_reports, >, 0); 180 181 if (--rm->rm_reports == 0 && rm->rm_freed != 0) 182 vdev_raidz_map_free(rm); 183 } 184 185 static void 186 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data) 187 { 188 raidz_map_t *rm = zcr->zcr_cbdata; 189 const size_t c = zcr->zcr_cbinfo; 190 size_t x, offset; 191 192 const abd_t *good = NULL; 193 const abd_t *bad = rm->rm_col[c].rc_abd; 194 195 if (good_data == NULL) { 196 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); 197 return; 198 } 199 200 if (c < rm->rm_firstdatacol) { 201 /* 202 * The first time through, calculate the parity blocks for 203 * the good data (this relies on the fact that the good 204 * data never changes for a given logical ZIO) 205 */ 206 if (rm->rm_col[0].rc_gdata == NULL) { 207 abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY]; 208 209 /* 210 * Set up the rm_col[]s to generate the parity for 211 * good_data, first saving the parity bufs and 212 * replacing them with buffers to hold the result. 213 */ 214 for (x = 0; x < rm->rm_firstdatacol; x++) { 215 bad_parity[x] = rm->rm_col[x].rc_abd; 216 rm->rm_col[x].rc_abd = 217 rm->rm_col[x].rc_gdata = 218 abd_alloc_sametype(rm->rm_col[x].rc_abd, 219 rm->rm_col[x].rc_size); 220 } 221 222 /* fill in the data columns from good_data */ 223 offset = 0; 224 for (; x < rm->rm_cols; x++) { 225 abd_put(rm->rm_col[x].rc_abd); 226 227 rm->rm_col[x].rc_abd = 228 abd_get_offset_size((abd_t *)good_data, 229 offset, rm->rm_col[x].rc_size); 230 offset += rm->rm_col[x].rc_size; 231 } 232 233 /* 234 * Construct the parity from the good data. 
235 */ 236 vdev_raidz_generate_parity(rm); 237 238 /* restore everything back to its original state */ 239 for (x = 0; x < rm->rm_firstdatacol; x++) 240 rm->rm_col[x].rc_abd = bad_parity[x]; 241 242 offset = 0; 243 for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { 244 abd_put(rm->rm_col[x].rc_abd); 245 rm->rm_col[x].rc_abd = abd_get_offset_size( 246 rm->rm_abd_copy, offset, 247 rm->rm_col[x].rc_size); 248 offset += rm->rm_col[x].rc_size; 249 } 250 } 251 252 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); 253 good = abd_get_offset_size(rm->rm_col[c].rc_gdata, 0, 254 rm->rm_col[c].rc_size); 255 } else { 256 /* adjust good_data to point at the start of our column */ 257 offset = 0; 258 for (x = rm->rm_firstdatacol; x < c; x++) 259 offset += rm->rm_col[x].rc_size; 260 261 good = abd_get_offset_size((abd_t *)good_data, offset, 262 rm->rm_col[c].rc_size); 263 } 264 265 /* we drop the ereport if it ends up that the data was good */ 266 zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); 267 abd_put((abd_t *)good); 268 } 269 270 /* 271 * Invoked indirectly by zfs_ereport_start_checksum(), called 272 * below when our read operation fails completely. The main point 273 * is to keep a copy of everything we read from disk, so that at 274 * vdev_raidz_cksum_finish() time we can compare it with the good data. 275 */ 276 static void 277 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) 278 { 279 size_t c = (size_t)(uintptr_t)arg; 280 size_t offset; 281 282 raidz_map_t *rm = zio->io_vsd; 283 size_t size; 284 285 /* set up the report and bump the refcount */ 286 zcr->zcr_cbdata = rm; 287 zcr->zcr_cbinfo = c; 288 zcr->zcr_finish = vdev_raidz_cksum_finish; 289 zcr->zcr_free = vdev_raidz_cksum_free; 290 291 rm->rm_reports++; 292 ASSERT3U(rm->rm_reports, >, 0); 293 294 if (rm->rm_abd_copy != NULL) 295 return; 296 297 /* 298 * It's the first time we're called for this raidz_map_t, so we need 299 * to copy the data aside; there's no guarantee that our zio's buffer 300 * won't be re-used for something else. 301 * 302 * Our parity data is already in separate buffers, so there's no need 303 * to copy them. 304 */ 305 306 size = 0; 307 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) 308 size += rm->rm_col[c].rc_size; 309 310 rm->rm_abd_copy = abd_alloc_for_io(size, B_FALSE); 311 312 for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 313 raidz_col_t *col = &rm->rm_col[c]; 314 abd_t *tmp = abd_get_offset_size(rm->rm_abd_copy, offset, 315 col->rc_size); 316 317 ASSERT3S(tmp->abd_size, >=, col->rc_size); 318 ASSERT3S(col->rc_abd->abd_size, >=, col->rc_size); 319 abd_copy_off(tmp, col->rc_abd, 0, 0, col->rc_size); 320 abd_put(col->rc_abd); 321 col->rc_abd = tmp; 322 323 offset += col->rc_size; 324 } 325 ASSERT3U(offset, ==, size); 326 } 327 328 static const zio_vsd_ops_t vdev_raidz_vsd_ops = { 329 vdev_raidz_map_free_vsd, 330 vdev_raidz_cksum_report 331 }; 332 333 /* 334 * Divides the IO evenly across all child vdevs; usually, dcols is 335 * the number of children in the target vdev. 336 */ 337 raidz_map_t * 338 vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, 339 uint64_t nparity) 340 { 341 raidz_map_t *rm; 342 /* The starting RAIDZ (parent) vdev sector of the block. */ 343 uint64_t b = zio->io_offset >> ashift; 344 /* The zio's size in units of the vdev's minimum sector size. */ 345 uint64_t s = zio->io_size >> ashift; 346 /* The first column for this stripe. */ 347 uint64_t f = b % dcols; 348 /* The starting byte offset on each child vdev. 
*/ 349 uint64_t o = (b / dcols) << ashift; 350 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; 351 uint64_t off = 0; 352 353 /* 354 * "Quotient": The number of data sectors for this stripe on all but 355 * the "big column" child vdevs that also contain "remainder" data. 356 */ 357 q = s / (dcols - nparity); 358 359 /* 360 * "Remainder": The number of partial stripe data sectors in this I/O. 361 * This will add a sector to some, but not all, child vdevs. 362 */ 363 r = s - q * (dcols - nparity); 364 365 /* The number of "big columns" - those which contain remainder data. */ 366 bc = (r == 0 ? 0 : r + nparity); 367 368 /* 369 * The total number of data and parity sectors associated with 370 * this I/O. 371 */ 372 tot = s + nparity * (q + (r == 0 ? 0 : 1)); 373 374 /* acols: The columns that will be accessed. */ 375 /* scols: The columns that will be accessed or skipped. */ 376 if (q == 0) { 377 /* Our I/O request doesn't span all child vdevs. */ 378 acols = bc; 379 scols = MIN(dcols, roundup(bc, nparity + 1)); 380 } else { 381 acols = dcols; 382 scols = dcols; 383 } 384 385 ASSERT3U(acols, <=, scols); 386 387 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); 388 389 rm->rm_cols = acols; 390 rm->rm_scols = scols; 391 rm->rm_bigcols = bc; 392 rm->rm_skipstart = bc; 393 rm->rm_missingdata = 0; 394 rm->rm_missingparity = 0; 395 rm->rm_firstdatacol = nparity; 396 rm->rm_abd_copy = NULL; 397 rm->rm_reports = 0; 398 rm->rm_freed = 0; 399 rm->rm_ecksuminjected = 0; 400 401 asize = 0; 402 403 for (c = 0; c < scols; c++) { 404 col = f + c; 405 coff = o; 406 if (col >= dcols) { 407 col -= dcols; 408 coff += 1ULL << ashift; 409 } 410 rm->rm_col[c].rc_devidx = col; 411 rm->rm_col[c].rc_offset = coff; 412 rm->rm_col[c].rc_abd = NULL; 413 rm->rm_col[c].rc_gdata = NULL; 414 rm->rm_col[c].rc_error = 0; 415 rm->rm_col[c].rc_tried = 0; 416 rm->rm_col[c].rc_skipped = 0; 417 418 if (c >= acols) 419 rm->rm_col[c].rc_size = 0; 420 else if (c < bc) 421 rm->rm_col[c].rc_size = (q + 1) << ashift; 422 else 423 rm->rm_col[c].rc_size = q << ashift; 424 425 asize += rm->rm_col[c].rc_size; 426 } 427 428 ASSERT3U(asize, ==, tot << ashift); 429 rm->rm_asize = roundup(asize, (nparity + 1) << ashift); 430 rm->rm_nskip = roundup(tot, nparity + 1) - tot; 431 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << ashift); 432 ASSERT3U(rm->rm_nskip, <=, nparity); 433 434 for (c = 0; c < rm->rm_firstdatacol; c++) 435 rm->rm_col[c].rc_abd = 436 abd_alloc_linear(rm->rm_col[c].rc_size, B_FALSE); 437 438 rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, 0, 439 rm->rm_col[c].rc_size); 440 off = rm->rm_col[c].rc_size; 441 442 for (c = c + 1; c < acols; c++) { 443 rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, off, 444 rm->rm_col[c].rc_size); 445 off += rm->rm_col[c].rc_size; 446 } 447 448 /* 449 * If all data stored spans all columns, there's a danger that parity 450 * will always be on the same device and, since parity isn't read 451 * during normal operation, that device's I/O bandwidth won't be 452 * used effectively. We therefore switch the parity every 1MB. 453 * 454 * ... at least that was, ostensibly, the theory. As a practical 455 * matter unless we juggle the parity between all devices evenly, we 456 * won't see any benefit. Further, occasional writes that aren't a 457 * multiple of the LCM of the number of children and the minimum 458 * stripe width are sufficient to avoid pessimal behavior. 
459 * Unfortunately, this decision created an implicit on-disk format 460 * requirement that we need to support for all eternity, but only 461 * for single-parity RAID-Z. 462 * 463 * If we intend to skip a sector in the zeroth column for padding 464 * we must make sure to note this swap. We will never intend to 465 * skip the first column since at least one data and one parity 466 * column must appear in each row. 467 */ 468 ASSERT(rm->rm_cols >= 2); 469 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 470 471 if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { 472 devidx = rm->rm_col[0].rc_devidx; 473 o = rm->rm_col[0].rc_offset; 474 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; 475 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 476 rm->rm_col[1].rc_devidx = devidx; 477 rm->rm_col[1].rc_offset = o; 478 479 if (rm->rm_skipstart == 0) 480 rm->rm_skipstart = 1; 481 } 482 483 /* init RAIDZ parity ops */ 484 rm->rm_ops = vdev_raidz_math_get_ops(); 485 486 return (rm); 487 } 488 489 struct pqr_struct { 490 uint64_t *p; 491 uint64_t *q; 492 uint64_t *r; 493 }; 494 495 static int 496 vdev_raidz_p_func(void *buf, size_t size, void *private) 497 { 498 struct pqr_struct *pqr = private; 499 const uint64_t *src = buf; 500 int i, cnt = size / sizeof (src[0]); 501 502 ASSERT(pqr->p && !pqr->q && !pqr->r); 503 504 for (i = 0; i < cnt; i++, src++, pqr->p++) 505 *pqr->p ^= *src; 506 507 return (0); 508 } 509 510 static int 511 vdev_raidz_pq_func(void *buf, size_t size, void *private) 512 { 513 struct pqr_struct *pqr = private; 514 const uint64_t *src = buf; 515 uint64_t mask; 516 int i, cnt = size / sizeof (src[0]); 517 518 ASSERT(pqr->p && pqr->q && !pqr->r); 519 520 for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { 521 *pqr->p ^= *src; 522 VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 523 *pqr->q ^= *src; 524 } 525 526 return (0); 527 } 528 529 static int 530 vdev_raidz_pqr_func(void *buf, size_t size, void *private) 531 { 532 struct pqr_struct *pqr = private; 533 const uint64_t *src = buf; 534 uint64_t mask; 535 int i, cnt = size / sizeof (src[0]); 536 537 ASSERT(pqr->p && pqr->q && pqr->r); 538 539 for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { 540 *pqr->p ^= *src; 541 VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 542 *pqr->q ^= *src; 543 VDEV_RAIDZ_64MUL_4(*pqr->r, mask); 544 *pqr->r ^= *src; 545 } 546 547 return (0); 548 } 549 550 static void 551 vdev_raidz_generate_parity_p(raidz_map_t *rm) 552 { 553 uint64_t *p; 554 int c; 555 abd_t *src; 556 557 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 558 src = rm->rm_col[c].rc_abd; 559 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); 560 561 if (c == rm->rm_firstdatacol) { 562 abd_copy_to_buf_off(p, src, 0, rm->rm_col[c].rc_size); 563 } else { 564 struct pqr_struct pqr = { p, NULL, NULL }; 565 (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, 566 vdev_raidz_p_func, &pqr); 567 } 568 } 569 } 570 571 static void 572 vdev_raidz_generate_parity_pq(raidz_map_t *rm) 573 { 574 uint64_t *p, *q, pcnt, ccnt, mask, i; 575 int c; 576 abd_t *src; 577 578 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 579 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 580 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 581 582 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 583 src = rm->rm_col[c].rc_abd; 584 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); 585 q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); 586 587 ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); 588 589 if (c == rm->rm_firstdatacol) { 590 ASSERT(ccnt == pcnt || ccnt == 0); 591 
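			/*
			 * For the first data column, P and Q both start out
			 * as a copy of the data (zero-padded below out to the
			 * parity column size). Later columns are folded in by
			 * vdev_raidz_pq_func(), which multiplies the running
			 * Q by 2 before XORing in the new data, matching the
			 * Horner form of Q given at the top of this file.
			 */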
592 abd_copy_to_buf_off(p, src, 0, rm->rm_col[c].rc_size); 593 (void) memcpy(q, p, rm->rm_col[c].rc_size); 594 for (i = ccnt; i < pcnt; i++) { 595 p[i] = 0; 596 q[i] = 0; 597 } 598 } else { 599 struct pqr_struct pqr = { p, q, NULL }; 600 601 ASSERT(ccnt <= pcnt); 602 603 (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, 604 vdev_raidz_pq_func, &pqr); 605 606 /* 607 * Treat short columns as though they are full of 0s. 608 * Note that there's therefore nothing needed for P. 609 */ 610 for (i = ccnt; i < pcnt; i++) { 611 VDEV_RAIDZ_64MUL_2(q[i], mask); 612 } 613 } 614 } 615 } 616 617 static void 618 vdev_raidz_generate_parity_pqr(raidz_map_t *rm) 619 { 620 uint64_t *p, *q, *r, pcnt, ccnt, mask, i; 621 int c; 622 abd_t *src; 623 624 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 625 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 626 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 627 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 628 rm->rm_col[VDEV_RAIDZ_R].rc_size); 629 630 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 631 src = rm->rm_col[c].rc_abd; 632 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); 633 q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); 634 r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd); 635 636 ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); 637 638 if (c == rm->rm_firstdatacol) { 639 ASSERT3S(src->abd_size, >=, rm->rm_col[c].rc_size); 640 ASSERT(ccnt == pcnt || ccnt == 0); 641 abd_copy_to_buf_off(p, src, 0, rm->rm_col[c].rc_size); 642 (void) memcpy(q, p, rm->rm_col[c].rc_size); 643 (void) memcpy(r, p, rm->rm_col[c].rc_size); 644 645 for (i = ccnt; i < pcnt; i++) { 646 p[i] = 0; 647 q[i] = 0; 648 r[i] = 0; 649 } 650 } else { 651 struct pqr_struct pqr = { p, q, r }; 652 653 ASSERT(ccnt <= pcnt); 654 (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, 655 vdev_raidz_pqr_func, &pqr); 656 657 /* 658 * Treat short columns as though they are full of 0s. 659 * Note that there's therefore nothing needed for P. 660 */ 661 for (i = ccnt; i < pcnt; i++) { 662 VDEV_RAIDZ_64MUL_2(q[i], mask); 663 VDEV_RAIDZ_64MUL_4(r[i], mask); 664 } 665 } 666 } 667 } 668 669 /* 670 * Generate RAID parity in the first virtual columns according to the number of 671 * parity columns available. 
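 *
 * A minimal usage sketch, assuming "zio" describes the block being written
 * and "vd" is the RAID-Z vdev (in the driver itself the map is normally
 * freed through the zio's vsd ops rather than directly):
 *
 *	rm = vdev_raidz_map_alloc(zio, vd->vdev_top->vdev_ashift,
 *	    vd->vdev_children, vd->vdev_nparity);
 *	vdev_raidz_generate_parity(rm);
 *	... issue one child write per rm->rm_col[c] ...
 *	vdev_raidz_map_free(rm);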
672 */ 673 void 674 vdev_raidz_generate_parity(raidz_map_t *rm) 675 { 676 /* Generate using the new math implementation */ 677 if (vdev_raidz_math_generate(rm) != RAIDZ_ORIGINAL_IMPL) 678 return; 679 680 switch (rm->rm_firstdatacol) { 681 case 1: 682 vdev_raidz_generate_parity_p(rm); 683 break; 684 case 2: 685 vdev_raidz_generate_parity_pq(rm); 686 break; 687 case 3: 688 vdev_raidz_generate_parity_pqr(rm); 689 break; 690 default: 691 cmn_err(CE_PANIC, "invalid RAID-Z configuration"); 692 } 693 } 694 695 /* ARGSUSED */ 696 static int 697 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) 698 { 699 uint64_t *dst = dbuf; 700 uint64_t *src = sbuf; 701 int cnt = size / sizeof (src[0]); 702 703 for (int i = 0; i < cnt; i++) { 704 dst[i] ^= src[i]; 705 } 706 707 return (0); 708 } 709 710 /* ARGSUSED */ 711 static int 712 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size, 713 void *private) 714 { 715 uint64_t *dst = dbuf; 716 uint64_t *src = sbuf; 717 uint64_t mask; 718 int cnt = size / sizeof (dst[0]); 719 720 for (int i = 0; i < cnt; i++, dst++, src++) { 721 VDEV_RAIDZ_64MUL_2(*dst, mask); 722 *dst ^= *src; 723 } 724 725 return (0); 726 } 727 728 /* ARGSUSED */ 729 static int 730 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private) 731 { 732 uint64_t *dst = buf; 733 uint64_t mask; 734 int cnt = size / sizeof (dst[0]); 735 736 for (int i = 0; i < cnt; i++, dst++) { 737 /* same operation as vdev_raidz_reconst_q_pre_func() on dst */ 738 VDEV_RAIDZ_64MUL_2(*dst, mask); 739 } 740 741 return (0); 742 } 743 744 struct reconst_q_struct { 745 uint64_t *q; 746 int exp; 747 }; 748 749 static int 750 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private) 751 { 752 struct reconst_q_struct *rq = private; 753 uint64_t *dst = buf; 754 int cnt = size / sizeof (dst[0]); 755 756 for (int i = 0; i < cnt; i++, dst++, rq->q++) { 757 758 *dst ^= *rq->q; 759 int j; 760 uint8_t *b; 761 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 762 *b = vdev_raidz_exp2(*b, rq->exp); 763 } 764 } 765 766 return (0); 767 } 768 769 struct reconst_pq_struct { 770 uint8_t *p; 771 uint8_t *q; 772 uint8_t *pxy; 773 uint8_t *qxy; 774 int aexp; 775 int bexp; 776 }; 777 778 static int 779 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private) 780 { 781 struct reconst_pq_struct *rpq = private; 782 uint8_t *xd = xbuf; 783 uint8_t *yd = ybuf; 784 785 for (int i = 0; i < size; 786 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) { 787 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 788 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 789 *yd = *rpq->p ^ *rpq->pxy ^ *xd; 790 } 791 792 return (0); 793 } 794 795 static int 796 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) 797 { 798 struct reconst_pq_struct *rpq = private; 799 uint8_t *xd = xbuf; 800 801 for (int i = 0; i < size; 802 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) { 803 /* same operation as vdev_raidz_reconst_pq_func() on xd */ 804 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 805 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 806 } 807 808 return (0); 809 } 810 811 static int 812 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) 813 { 814 int x = tgts[0]; 815 int c; 816 abd_t *dst, *src; 817 818 ASSERT(ntgts == 1); 819 ASSERT(x >= rm->rm_firstdatacol); 820 ASSERT(x < rm->rm_cols); 821 822 ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size); 823 ASSERT(rm->rm_col[x].rc_size > 0); 824 825 
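	/*
	 * P is the XOR of all data columns, so the missing column is
	 * recovered by starting from P and XORing every surviving data
	 * column back out (addition and subtraction are both XOR in the
	 * field).
	 */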
src = rm->rm_col[VDEV_RAIDZ_P].rc_abd; 826 dst = rm->rm_col[x].rc_abd; 827 828 ASSERT3S(dst->abd_size, >=, rm->rm_col[x].rc_size); 829 ASSERT3S(src->abd_size, >=, rm->rm_col[x].rc_size); 830 abd_copy_off(dst, src, 0, 0, rm->rm_col[x].rc_size); 831 832 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 833 uint64_t size = MIN(rm->rm_col[x].rc_size, 834 rm->rm_col[c].rc_size); 835 836 src = rm->rm_col[c].rc_abd; 837 dst = rm->rm_col[x].rc_abd; 838 839 if (c == x) 840 continue; 841 842 (void) abd_iterate_func2(dst, src, 0, 0, size, 843 vdev_raidz_reconst_p_func, NULL); 844 } 845 846 return (1 << VDEV_RAIDZ_P); 847 } 848 849 static int 850 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) 851 { 852 int x = tgts[0]; 853 int c, exp; 854 abd_t *dst, *src; 855 856 ASSERT(ntgts == 1); 857 858 ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size); 859 860 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 861 uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size, 862 rm->rm_col[c].rc_size); 863 864 src = rm->rm_col[c].rc_abd; 865 dst = rm->rm_col[x].rc_abd; 866 867 if (c == rm->rm_firstdatacol) { 868 if (dst != src) { 869 ASSERT3S(dst->abd_size, >=, size); 870 ASSERT3S(src->abd_size, >=, size); 871 abd_copy_off(dst, src, 0, 0, size); 872 } 873 if (rm->rm_col[x].rc_size > size) 874 abd_zero_off(dst, size, 875 rm->rm_col[x].rc_size - size); 876 } else { 877 ASSERT3U(size, <=, rm->rm_col[x].rc_size); 878 if (src != dst) 879 (void) abd_iterate_func2(dst, src, 0, 0, size, 880 vdev_raidz_reconst_q_pre_func, NULL); 881 (void) abd_iterate_func(dst, 882 size, rm->rm_col[x].rc_size - size, 883 vdev_raidz_reconst_q_pre_tail_func, NULL); 884 } 885 } 886 887 src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; 888 dst = rm->rm_col[x].rc_abd; 889 exp = 255 - (rm->rm_cols - 1 - x); 890 891 struct reconst_q_struct rq = { abd_to_buf(src), exp }; 892 (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size, 893 vdev_raidz_reconst_q_post_func, &rq); 894 895 return (1 << VDEV_RAIDZ_Q); 896 } 897 898 static int 899 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) 900 { 901 uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; 902 abd_t *pdata, *qdata; 903 uint64_t xsize, ysize; 904 int x = tgts[0]; 905 int y = tgts[1]; 906 abd_t *xd, *yd; 907 908 ASSERT(ntgts == 2); 909 ASSERT(x < y); 910 ASSERT(x >= rm->rm_firstdatacol); 911 ASSERT(y < rm->rm_cols); 912 913 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); 914 915 /* 916 * Move the parity data aside -- we're going to compute parity as 917 * though columns x and y were full of zeros -- Pxy and Qxy. We want to 918 * reuse the parity generation mechanism without trashing the actual 919 * parity so we make those columns appear to be full of zeros by 920 * setting their lengths to zero. 
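 * A zero-length column is treated by the parity generation code as a
 * column of zeros: it leaves P untouched, while Q is still multiplied by
 * 2 at that column's step of the Horner evaluation. That is exactly the
 * definition of Pxy and Qxy.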
 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_abd =
	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
	rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = abd_to_buf(pdata);
	q = abd_to_buf(qdata);
	pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
	qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
	xd = rm->rm_col[x].rc_abd;
	yd = rm->rm_col[y].rc_abd;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	ASSERT3U(xsize, >=, ysize);
	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
	    vdev_raidz_reconst_pq_func, &rpq);
	(void) abd_iterate_func(xd, ysize, xsize - ysize,
	    vdev_raidz_reconst_pq_tail_func, &rpq);

	abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
	abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}

/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity and speed of
 * computation, as well as for linear separability.
 *
 *      __                    __                 __     __
 *      |  1     ..  1  1  1  |                  |  p_0  |
 *      |  2^n-1 ..  4  2  1  |    __     __     |   :   |
 *      |  4^n-1 .. 16  4  1  |    |  D_0  |     | p_m-1 |
 *      |  1     ..  0  0  0  |    |  D_1  |     |  d_0  |
 *      |  0     ..  0  0  0  | x  |  D_2  |  =  |  d_1  |
 *      |  :         :  :  :  |    |   :   |     |  d_2  |
 *      |  0     ..  1  0  0  |    | D_n-1 |     |   :   |
 *      |  0     ..  0  1  0  |    ~~     ~~     |   :   |
 *      |  0     ..  0  0  1  |                  | d_n-1 |
 *      ~~                    ~~                 ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values.
We begin by removing the rows in V|I and d|p that correspond 1023 * to failed or missing columns; we then make V|I square (n x n) and d|p 1024 * sized n by removing rows corresponding to unused parity from the bottom up 1025 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 1026 * using Gauss-Jordan elimination. In the example below we use m=3 parity 1027 * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 1028 * __ __ 1029 * | 1 1 1 1 1 1 1 1 | 1030 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 1031 * | 19 205 116 29 64 16 4 1 | / / 1032 * | 1 0 0 0 0 0 0 0 | / / 1033 * | 0 1 0 0 0 0 0 0 | <--' / 1034 * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 1035 * | 0 0 0 1 0 0 0 0 | 1036 * | 0 0 0 0 1 0 0 0 | 1037 * | 0 0 0 0 0 1 0 0 | 1038 * | 0 0 0 0 0 0 1 0 | 1039 * | 0 0 0 0 0 0 0 1 | 1040 * ~~ ~~ 1041 * __ __ 1042 * | 1 1 1 1 1 1 1 1 | 1043 * | 128 64 32 16 8 4 2 1 | 1044 * | 19 205 116 29 64 16 4 1 | 1045 * | 1 0 0 0 0 0 0 0 | 1046 * | 0 1 0 0 0 0 0 0 | 1047 * (V|I)' = | 0 0 1 0 0 0 0 0 | 1048 * | 0 0 0 1 0 0 0 0 | 1049 * | 0 0 0 0 1 0 0 0 | 1050 * | 0 0 0 0 0 1 0 0 | 1051 * | 0 0 0 0 0 0 1 0 | 1052 * | 0 0 0 0 0 0 0 1 | 1053 * ~~ ~~ 1054 * 1055 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 1056 * have carefully chosen the seed values 1, 2, and 4 to ensure that this 1057 * matrix is not singular. 1058 * __ __ 1059 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1060 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1061 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1062 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1063 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1064 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1065 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1066 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1067 * ~~ ~~ 1068 * __ __ 1069 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1070 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1071 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1072 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1073 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1074 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1075 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1076 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1077 * ~~ ~~ 1078 * __ __ 1079 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1080 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1081 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 1082 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1083 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1084 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1085 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1086 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1087 * ~~ ~~ 1088 * __ __ 1089 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1090 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1091 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 1092 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1093 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1094 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1095 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1096 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1097 * ~~ ~~ 1098 * __ __ 1099 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1100 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1101 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1102 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1103 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1104 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1105 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1106 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1107 * ~~ ~~ 1108 * __ __ 1109 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1110 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 1111 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1112 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1113 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1114 * | 0 0 0 0 0 
1 0 0 0 0 0 0 0 1 0 0 | 1115 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1116 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1117 * ~~ ~~ 1118 * __ __ 1119 * | 0 0 1 0 0 0 0 0 | 1120 * | 167 100 5 41 159 169 217 208 | 1121 * | 166 100 4 40 158 168 216 209 | 1122 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 1123 * | 0 0 0 0 1 0 0 0 | 1124 * | 0 0 0 0 0 1 0 0 | 1125 * | 0 0 0 0 0 0 1 0 | 1126 * | 0 0 0 0 0 0 0 1 | 1127 * ~~ ~~ 1128 * 1129 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 1130 * of the missing data. 1131 * 1132 * As is apparent from the example above, the only non-trivial rows in the 1133 * inverse matrix correspond to the data disks that we're trying to 1134 * reconstruct. Indeed, those are the only rows we need as the others would 1135 * only be useful for reconstructing data known or assumed to be valid. For 1136 * that reason, we only build the coefficients in the rows that correspond to 1137 * targeted columns. 1138 */ 1139 /* END CSTYLED */ 1140 1141 static void 1142 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, 1143 uint8_t **rows) 1144 { 1145 int i, j; 1146 int pow; 1147 1148 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); 1149 1150 /* 1151 * Fill in the missing rows of interest. 1152 */ 1153 for (i = 0; i < nmap; i++) { 1154 ASSERT3S(0, <=, map[i]); 1155 ASSERT3S(map[i], <=, 2); 1156 1157 pow = map[i] * n; 1158 if (pow > 255) 1159 pow -= 255; 1160 ASSERT(pow <= 255); 1161 1162 for (j = 0; j < n; j++) { 1163 pow -= map[i]; 1164 if (pow < 0) 1165 pow += 255; 1166 rows[i][j] = vdev_raidz_pow2[pow]; 1167 } 1168 } 1169 } 1170 1171 static void 1172 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, 1173 uint8_t **rows, uint8_t **invrows, const uint8_t *used) 1174 { 1175 int i, j, ii, jj; 1176 uint8_t log; 1177 1178 /* 1179 * Assert that the first nmissing entries from the array of used 1180 * columns correspond to parity columns and that subsequent entries 1181 * correspond to data columns. 1182 */ 1183 for (i = 0; i < nmissing; i++) { 1184 ASSERT3S(used[i], <, rm->rm_firstdatacol); 1185 } 1186 for (; i < n; i++) { 1187 ASSERT3S(used[i], >=, rm->rm_firstdatacol); 1188 } 1189 1190 /* 1191 * First initialize the storage where we'll compute the inverse rows. 1192 */ 1193 for (i = 0; i < nmissing; i++) { 1194 for (j = 0; j < n; j++) { 1195 invrows[i][j] = (i == j) ? 1 : 0; 1196 } 1197 } 1198 1199 /* 1200 * Subtract all trivial rows from the rows of consequence. 1201 */ 1202 for (i = 0; i < nmissing; i++) { 1203 for (j = nmissing; j < n; j++) { 1204 ASSERT3U(used[j], >=, rm->rm_firstdatacol); 1205 jj = used[j] - rm->rm_firstdatacol; 1206 ASSERT3S(jj, <, n); 1207 invrows[i][j] = rows[i][jj]; 1208 rows[i][jj] = 0; 1209 } 1210 } 1211 1212 /* 1213 * For each of the rows of interest, we must normalize it and subtract 1214 * a multiple of it from the other rows. 1215 */ 1216 for (i = 0; i < nmissing; i++) { 1217 for (j = 0; j < missing[i]; j++) { 1218 ASSERT0(rows[i][j]); 1219 } 1220 ASSERT3U(rows[i][missing[i]], !=, 0); 1221 1222 /* 1223 * Compute the inverse of the first element and multiply each 1224 * element in the row by that value. 
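 * In GF(2^8) the inverse of a nonzero element e is e^254, which is
 * 2^(255 - log2(e)); the normalization below is therefore one log-table
 * lookup followed by exp-table multiplications.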
1225 */ 1226 log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 1227 1228 for (j = 0; j < n; j++) { 1229 rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 1230 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 1231 } 1232 1233 for (ii = 0; ii < nmissing; ii++) { 1234 if (i == ii) 1235 continue; 1236 1237 ASSERT3U(rows[ii][missing[i]], !=, 0); 1238 1239 log = vdev_raidz_log2[rows[ii][missing[i]]]; 1240 1241 for (j = 0; j < n; j++) { 1242 rows[ii][j] ^= 1243 vdev_raidz_exp2(rows[i][j], log); 1244 invrows[ii][j] ^= 1245 vdev_raidz_exp2(invrows[i][j], log); 1246 } 1247 } 1248 } 1249 1250 /* 1251 * Verify that the data that is left in the rows are properly part of 1252 * an identity matrix. 1253 */ 1254 for (i = 0; i < nmissing; i++) { 1255 for (j = 0; j < n; j++) { 1256 if (j == missing[i]) { 1257 ASSERT3U(rows[i][j], ==, 1); 1258 } else { 1259 ASSERT0(rows[i][j]); 1260 } 1261 } 1262 } 1263 } 1264 1265 static void 1266 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, 1267 int *missing, uint8_t **invrows, const uint8_t *used) 1268 { 1269 int i, j, x, cc, c; 1270 uint8_t *src; 1271 uint64_t ccount; 1272 uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL }; 1273 uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 }; 1274 uint8_t log = 0; 1275 uint8_t val; 1276 int ll; 1277 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 1278 uint8_t *p, *pp; 1279 size_t psize; 1280 1281 psize = sizeof (invlog[0][0]) * n * nmissing; 1282 p = kmem_alloc(psize, KM_SLEEP); 1283 1284 for (pp = p, i = 0; i < nmissing; i++) { 1285 invlog[i] = pp; 1286 pp += n; 1287 } 1288 1289 for (i = 0; i < nmissing; i++) { 1290 for (j = 0; j < n; j++) { 1291 ASSERT3U(invrows[i][j], !=, 0); 1292 invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 1293 } 1294 } 1295 1296 for (i = 0; i < n; i++) { 1297 c = used[i]; 1298 ASSERT3U(c, <, rm->rm_cols); 1299 1300 src = abd_to_buf(rm->rm_col[c].rc_abd); 1301 ccount = rm->rm_col[c].rc_size; 1302 for (j = 0; j < nmissing; j++) { 1303 cc = missing[j] + rm->rm_firstdatacol; 1304 ASSERT3U(cc, >=, rm->rm_firstdatacol); 1305 ASSERT3U(cc, <, rm->rm_cols); 1306 ASSERT3U(cc, !=, c); 1307 1308 dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd); 1309 dcount[j] = rm->rm_col[cc].rc_size; 1310 } 1311 1312 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); 1313 1314 for (x = 0; x < ccount; x++, src++) { 1315 if (*src != 0) 1316 log = vdev_raidz_log2[*src]; 1317 1318 for (cc = 0; cc < nmissing; cc++) { 1319 if (x >= dcount[cc]) 1320 continue; 1321 1322 if (*src == 0) { 1323 val = 0; 1324 } else { 1325 if ((ll = log + invlog[cc][i]) >= 255) 1326 ll -= 255; 1327 val = vdev_raidz_pow2[ll]; 1328 } 1329 1330 if (i == 0) 1331 dst[cc][x] = val; 1332 else 1333 dst[cc][x] ^= val; 1334 } 1335 } 1336 } 1337 1338 kmem_free(p, psize); 1339 } 1340 1341 static int 1342 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) 1343 { 1344 int n, i, c, t, tt; 1345 int nmissing_rows; 1346 int missing_rows[VDEV_RAIDZ_MAXPARITY]; 1347 int parity_map[VDEV_RAIDZ_MAXPARITY]; 1348 1349 uint8_t *p, *pp; 1350 size_t psize; 1351 1352 uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 1353 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 1354 uint8_t *used; 1355 1356 abd_t **bufs = NULL; 1357 1358 int code = 0; 1359 1360 /* 1361 * Matrix reconstruction can't use scatter ABDs yet, so we allocate 1362 * temporary linear ABDs. 
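 * The original scatter ABDs are stashed in bufs[] and the reconstructed
 * contents are copied back into them (and the temporaries freed) at the
 * end of this function.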
1363 */ 1364 if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) { 1365 bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE); 1366 1367 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1368 raidz_col_t *col = &rm->rm_col[c]; 1369 1370 bufs[c] = col->rc_abd; 1371 col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE); 1372 ASSERT3S(col->rc_abd->abd_size, >=, col->rc_size); 1373 ASSERT3S(bufs[c]->abd_size, >=, col->rc_size); 1374 abd_copy_off(col->rc_abd, bufs[c], 0, 0, col->rc_size); 1375 } 1376 } 1377 1378 n = rm->rm_cols - rm->rm_firstdatacol; 1379 1380 /* 1381 * Figure out which data columns are missing. 1382 */ 1383 nmissing_rows = 0; 1384 for (t = 0; t < ntgts; t++) { 1385 if (tgts[t] >= rm->rm_firstdatacol) { 1386 missing_rows[nmissing_rows++] = 1387 tgts[t] - rm->rm_firstdatacol; 1388 } 1389 } 1390 1391 /* 1392 * Figure out which parity columns to use to help generate the missing 1393 * data columns. 1394 */ 1395 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 1396 ASSERT(tt < ntgts); 1397 ASSERT(c < rm->rm_firstdatacol); 1398 1399 /* 1400 * Skip any targeted parity columns. 1401 */ 1402 if (c == tgts[tt]) { 1403 tt++; 1404 continue; 1405 } 1406 1407 code |= 1 << c; 1408 1409 parity_map[i] = c; 1410 i++; 1411 } 1412 1413 ASSERT(code != 0); 1414 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); 1415 1416 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1417 nmissing_rows * n + sizeof (used[0]) * n; 1418 p = kmem_alloc(psize, KM_SLEEP); 1419 1420 for (pp = p, i = 0; i < nmissing_rows; i++) { 1421 rows[i] = pp; 1422 pp += n; 1423 invrows[i] = pp; 1424 pp += n; 1425 } 1426 used = pp; 1427 1428 for (i = 0; i < nmissing_rows; i++) { 1429 used[i] = parity_map[i]; 1430 } 1431 1432 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1433 if (tt < nmissing_rows && 1434 c == missing_rows[tt] + rm->rm_firstdatacol) { 1435 tt++; 1436 continue; 1437 } 1438 1439 ASSERT3S(i, <, n); 1440 used[i] = c; 1441 i++; 1442 } 1443 1444 /* 1445 * Initialize the interesting rows of the matrix. 1446 */ 1447 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); 1448 1449 /* 1450 * Invert the matrix. 1451 */ 1452 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, 1453 invrows, used); 1454 1455 /* 1456 * Reconstruct the missing data using the generated matrix. 1457 */ 1458 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, 1459 invrows, used); 1460 1461 kmem_free(p, psize); 1462 1463 /* 1464 * copy back from temporary linear abds and free them 1465 */ 1466 if (bufs) { 1467 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1468 raidz_col_t *col = &rm->rm_col[c]; 1469 1470 ASSERT3S(bufs[c]->abd_size, >=, col->rc_size); 1471 ASSERT3S(col->rc_abd->abd_size, >=, col->rc_size); 1472 abd_copy_off(bufs[c], col->rc_abd, 0, 0, col->rc_size); 1473 abd_free(col->rc_abd); 1474 col->rc_abd = bufs[c]; 1475 } 1476 kmem_free(bufs, rm->rm_cols * sizeof (abd_t *)); 1477 } 1478 1479 return (code); 1480 } 1481 1482 int 1483 vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) 1484 { 1485 int tgts[VDEV_RAIDZ_MAXPARITY], *dt; 1486 int ntgts; 1487 int i, c, ret; 1488 int code; 1489 int nbadparity, nbaddata; 1490 int parity_valid[VDEV_RAIDZ_MAXPARITY]; 1491 1492 /* 1493 * The tgts list must already be sorted. 
1494 */ 1495 for (i = 1; i < nt; i++) { 1496 ASSERT(t[i] > t[i - 1]); 1497 } 1498 1499 nbadparity = rm->rm_firstdatacol; 1500 nbaddata = rm->rm_cols - nbadparity; 1501 ntgts = 0; 1502 for (i = 0, c = 0; c < rm->rm_cols; c++) { 1503 if (c < rm->rm_firstdatacol) 1504 parity_valid[c] = B_FALSE; 1505 1506 if (i < nt && c == t[i]) { 1507 tgts[ntgts++] = c; 1508 i++; 1509 } else if (rm->rm_col[c].rc_error != 0) { 1510 tgts[ntgts++] = c; 1511 } else if (c >= rm->rm_firstdatacol) { 1512 nbaddata--; 1513 } else { 1514 parity_valid[c] = B_TRUE; 1515 nbadparity--; 1516 } 1517 } 1518 1519 ASSERT(ntgts >= nt); 1520 ASSERT(nbaddata >= 0); 1521 ASSERT(nbaddata + nbadparity == ntgts); 1522 1523 dt = &tgts[nbadparity]; 1524 1525 /* Reconstruct using the new math implementation */ 1526 ret = vdev_raidz_math_reconstruct(rm, parity_valid, dt, nbaddata); 1527 if (ret != RAIDZ_ORIGINAL_IMPL) 1528 return (ret); 1529 1530 /* 1531 * See if we can use any of our optimized reconstruction routines. 1532 */ 1533 switch (nbaddata) { 1534 case 1: 1535 if (parity_valid[VDEV_RAIDZ_P]) 1536 return (vdev_raidz_reconstruct_p(rm, dt, 1)); 1537 1538 ASSERT(rm->rm_firstdatacol > 1); 1539 1540 if (parity_valid[VDEV_RAIDZ_Q]) 1541 return (vdev_raidz_reconstruct_q(rm, dt, 1)); 1542 1543 ASSERT(rm->rm_firstdatacol > 2); 1544 break; 1545 1546 case 2: 1547 ASSERT(rm->rm_firstdatacol > 1); 1548 1549 if (parity_valid[VDEV_RAIDZ_P] && 1550 parity_valid[VDEV_RAIDZ_Q]) 1551 return (vdev_raidz_reconstruct_pq(rm, dt, 2)); 1552 1553 ASSERT(rm->rm_firstdatacol > 2); 1554 1555 break; 1556 } 1557 1558 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); 1559 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); 1560 ASSERT(code > 0); 1561 return (code); 1562 } 1563 1564 static int 1565 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, 1566 uint64_t *ashift) 1567 { 1568 vdev_t *cvd; 1569 uint64_t nparity = vd->vdev_nparity; 1570 int c; 1571 int lasterror = 0; 1572 int numerrors = 0; 1573 1574 ASSERT(nparity > 0); 1575 1576 if (nparity > VDEV_RAIDZ_MAXPARITY || 1577 vd->vdev_children < nparity + 1) { 1578 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 1579 return (SET_ERROR(EINVAL)); 1580 } 1581 1582 vdev_open_children(vd); 1583 1584 for (c = 0; c < vd->vdev_children; c++) { 1585 cvd = vd->vdev_child[c]; 1586 1587 if (cvd->vdev_open_error != 0) { 1588 lasterror = cvd->vdev_open_error; 1589 numerrors++; 1590 continue; 1591 } 1592 1593 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 1594 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; 1595 *ashift = MAX(*ashift, cvd->vdev_ashift); 1596 } 1597 1598 *asize *= vd->vdev_children; 1599 *max_asize *= vd->vdev_children; 1600 1601 if (numerrors > nparity) { 1602 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 1603 return (lasterror); 1604 } 1605 1606 return (0); 1607 } 1608 1609 static void 1610 vdev_raidz_close(vdev_t *vd) 1611 { 1612 int c; 1613 1614 for (c = 0; c < vd->vdev_children; c++) 1615 vdev_close(vd->vdev_child[c]); 1616 } 1617 1618 /* 1619 * Handle a read or write I/O to a RAID-Z dump device. 1620 * 1621 * The dump device is in a unique situation compared to other ZFS datasets: 1622 * writing to this device should be as simple and fast as possible. In 1623 * addition, durability matters much less since the dump will be extracted 1624 * once the machine reboots. For that reason, this function eschews parity for 1625 * performance and simplicity. 
The dump device uses the checksum setting 1626 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this 1627 * dataset. 1628 * 1629 * Blocks of size 128 KB have been preallocated for this volume. I/Os less than 1630 * 128 KB will not fill an entire block; in addition, they may not be properly 1631 * aligned. In that case, this function uses the preallocated 128 KB block and 1632 * omits reading or writing any "empty" portions of that block, as opposed to 1633 * allocating a fresh appropriately-sized block. 1634 * 1635 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs: 1636 * 1637 * vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB) 1638 * 1639 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be 1640 * allocated which spans all five child vdevs. 8 KB of data would be written to 1641 * each of four vdevs, with the fifth containing the parity bits. 1642 * 1643 * parity data data data data 1644 * | PP | XX | XX | XX | XX | 1645 * ^ ^ ^ ^ ^ 1646 * | | | | | 1647 * 8 KB parity ------8 KB data blocks------ 1648 * 1649 * However, when writing to the dump device, the behavior is different: 1650 * 1651 * vdev_raidz_dumpio(data, size: 32 KB, offset: 64 KB) 1652 * 1653 * Unlike the normal RAID-Z case in which the block is allocated based on the 1654 * I/O size, reads and writes here always use a 128 KB logical I/O size. If the 1655 * I/O size is less than 128 KB, only the actual portions of data are written. 1656 * In this example the data is written to the third data vdev since that vdev 1657 * contains the offset [64 KB, 96 KB). 1658 * 1659 * parity data data data data 1660 * | | | | XX | | 1661 * ^ 1662 * | 1663 * 32 KB data block 1664 * 1665 * As a result, an individual I/O may not span all child vdevs; moreover, a 1666 * small I/O may only operate on a single child vdev. 1667 * 1668 * Note that since there are no parity bits calculated or written, this format 1669 * remains the same no matter how many parity bits are used in a normal RAID-Z 1670 * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above 1671 * would look like: 1672 * 1673 * parity parity parity data data data data 1674 * | | | | | | XX | | 1675 * ^ 1676 * | 1677 * 32 KB data block 1678 */ 1679 static int 1680 vdev_raidz_dumpio(vdev_t *vd, caddr_t data, size_t size, 1681 uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump) 1682 { 1683 vdev_t *tvd = vd->vdev_top; 1684 vdev_t *cvd; 1685 raidz_map_t *rm; 1686 raidz_col_t *rc; 1687 int c, err = 0; 1688 1689 uint64_t start, end, colstart, colend; 1690 uint64_t coloffset, colsize, colskip; 1691 1692 #ifdef _KERNEL 1693 1694 /* 1695 * Don't write past the end of the block 1696 */ 1697 VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE); 1698 1699 start = offset; 1700 end = start + size; 1701 1702 /* 1703 * Allocate a RAID-Z map for this block. Note that this block starts 1704 * from the "original" offset, this is, the offset of the extent which 1705 * contains the requisite offset of the data being read or written. 1706 * 1707 * Even if this I/O operation doesn't span the full block size, let's 1708 * treat the on-disk format as if the only blocks are the complete 128 1709 * KB size. 1710 */ 1711 1712 /* First, fake a zio for vdev_raidz_map_alloc. 
*/ 1713 zio_t *zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP); 1714 zio->io_offset = origoffset; 1715 zio->io_size = SPA_OLD_MAXBLOCKSIZE; 1716 zio->io_abd = abd_get_from_buf(data - (offset - origoffset), 1717 SPA_OLD_MAXBLOCKSIZE); 1718 1719 rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, 1720 vd->vdev_nparity); 1721 1722 coloffset = origoffset; 1723 1724 for (c = rm->rm_firstdatacol; c < rm->rm_cols; 1725 c++, coloffset += rc->rc_size) { 1726 rc = &rm->rm_col[c]; 1727 cvd = vd->vdev_child[rc->rc_devidx]; 1728 1729 if (cvd->vdev_ops->vdev_op_dumpio == NULL) { 1730 err = EINVAL; 1731 break; 1732 } 1733 1734 /* 1735 * Find the start and end of this column in the RAID-Z map, 1736 * keeping in mind that the stated size and offset of the 1737 * operation may not fill the entire column for this vdev. 1738 * 1739 * If any portion of the data spans this column, issue the 1740 * appropriate operation to the vdev. 1741 */ 1742 if (coloffset + rc->rc_size <= start) 1743 continue; 1744 if (coloffset >= end) 1745 continue; 1746 1747 colstart = MAX(coloffset, start); 1748 colend = MIN(end, coloffset + rc->rc_size); 1749 colsize = colend - colstart; 1750 colskip = colstart - coloffset; 1751 1752 VERIFY3U(colsize, <=, rc->rc_size); 1753 VERIFY3U(colskip, <=, rc->rc_size); 1754 1755 if ((err = cvd->vdev_ops->vdev_op_dumpio(cvd, 1756 ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize, 1757 rc->rc_offset + colskip, 0, doread, isdump)) != 0) { 1758 break; 1759 } 1760 } 1761 1762 vdev_raidz_map_free(rm); 1763 abd_put(zio->io_abd); 1764 kmem_free(zio, sizeof (zio_t)); 1765 1766 #endif /* KERNEL */ 1767 1768 return (err); 1769 } 1770 1771 static uint64_t 1772 vdev_raidz_asize(vdev_t *vd, uint64_t psize) 1773 { 1774 uint64_t asize; 1775 uint64_t ashift = vd->vdev_top->vdev_ashift; 1776 uint64_t cols = vd->vdev_children; 1777 uint64_t nparity = vd->vdev_nparity; 1778 1779 asize = ((psize - 1) >> ashift) + 1; 1780 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 1781 asize = roundup(asize, nparity + 1) << ashift; 1782 1783 return (asize); 1784 } 1785 1786 static void 1787 vdev_raidz_child_done(zio_t *zio) 1788 { 1789 raidz_col_t *rc = zio->io_private; 1790 1791 rc->rc_error = zio->io_error; 1792 rc->rc_tried = 1; 1793 rc->rc_skipped = 0; 1794 } 1795 1796 static void 1797 vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col) 1798 { 1799 #ifdef ZFS_DEBUG 1800 vdev_t *vd = zio->io_vd; 1801 vdev_t *tvd = vd->vdev_top; 1802 1803 range_seg64_t logical_rs, physical_rs; 1804 logical_rs.rs_start = zio->io_offset; 1805 logical_rs.rs_end = logical_rs.rs_start + 1806 vdev_raidz_asize(zio->io_vd, zio->io_size); 1807 1808 raidz_col_t *rc = &rm->rm_col[col]; 1809 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 1810 1811 vdev_xlate(cvd, &logical_rs, &physical_rs); 1812 ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); 1813 ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); 1814 /* 1815 * It would be nice to assert that rs_end is equal 1816 * to rc_offset + rc_size but there might be an 1817 * optional I/O at the end that is not accounted in 1818 * rc_size. 1819 */ 1820 if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { 1821 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + 1822 rc->rc_size + (1 << tvd->vdev_ashift)); 1823 } else { 1824 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); 1825 } 1826 #endif 1827 } 1828 1829 /* 1830 * Start an IO operation on a RAIDZ VDev 1831 * 1832 * Outline: 1833 * - For write operations: 1834 * 1. Generate the parity data 1835 * 2. 
Create child zio write operations to each column's vdev, for both 1836 * data and parity. 1837 * 3. If the column skips any sectors for padding, create optional dummy 1838 * write zio children for those areas to improve aggregation continuity. 1839 * - For read operations: 1840 * 1. Create child zio read operations to each data column's vdev to read 1841 * the range of data required for zio. 1842 * 2. If this is a scrub or resilver operation, or if any of the data 1843 * vdevs have had errors, then create zio read operations to the parity 1844 * columns' VDevs as well. 1845 */ 1846 static void 1847 vdev_raidz_io_start(zio_t *zio) 1848 { 1849 vdev_t *vd = zio->io_vd; 1850 vdev_t *tvd = vd->vdev_top; 1851 vdev_t *cvd; 1852 raidz_map_t *rm; 1853 raidz_col_t *rc; 1854 int c, i; 1855 1856 rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, 1857 vd->vdev_nparity); 1858 1859 zio->io_vsd = rm; 1860 zio->io_vsd_ops = &vdev_raidz_vsd_ops; 1861 1862 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); 1863 1864 if (zio->io_type == ZIO_TYPE_WRITE) { 1865 vdev_raidz_generate_parity(rm); 1866 1867 for (c = 0; c < rm->rm_cols; c++) { 1868 rc = &rm->rm_col[c]; 1869 cvd = vd->vdev_child[rc->rc_devidx]; 1870 1871 /* 1872 * Verify physical to logical translation. 1873 */ 1874 vdev_raidz_io_verify(zio, rm, c); 1875 1876 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1877 rc->rc_offset, rc->rc_abd, rc->rc_size, 1878 zio->io_type, zio->io_priority, 0, 1879 vdev_raidz_child_done, rc)); 1880 } 1881 1882 /* 1883 * Generate optional I/Os for any skipped sectors to improve 1884 * aggregation contiguity. 1885 */ 1886 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { 1887 ASSERT(c <= rm->rm_scols); 1888 if (c == rm->rm_scols) 1889 c = 0; 1890 rc = &rm->rm_col[c]; 1891 cvd = vd->vdev_child[rc->rc_devidx]; 1892 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1893 rc->rc_offset + rc->rc_size, NULL, 1894 1 << tvd->vdev_ashift, 1895 zio->io_type, zio->io_priority, 1896 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); 1897 } 1898 1899 zio_execute(zio); 1900 return; 1901 } 1902 1903 ASSERT(zio->io_type == ZIO_TYPE_READ); 1904 1905 /* 1906 * Iterate over the columns in reverse order so that we hit the parity 1907 * last -- any errors along the way will force us to read the parity. 1908 */ 1909 for (c = rm->rm_cols - 1; c >= 0; c--) { 1910 rc = &rm->rm_col[c]; 1911 cvd = vd->vdev_child[rc->rc_devidx]; 1912 if (!vdev_readable(cvd)) { 1913 if (c >= rm->rm_firstdatacol) 1914 rm->rm_missingdata++; 1915 else 1916 rm->rm_missingparity++; 1917 rc->rc_error = SET_ERROR(ENXIO); 1918 rc->rc_tried = 1; /* don't even try */ 1919 rc->rc_skipped = 1; 1920 continue; 1921 } 1922 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 1923 if (c >= rm->rm_firstdatacol) 1924 rm->rm_missingdata++; 1925 else 1926 rm->rm_missingparity++; 1927 rc->rc_error = SET_ERROR(ESTALE); 1928 rc->rc_skipped = 1; 1929 continue; 1930 } 1931 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || 1932 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { 1933 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1934 rc->rc_offset, rc->rc_abd, rc->rc_size, 1935 zio->io_type, zio->io_priority, 0, 1936 vdev_raidz_child_done, rc)); 1937 } 1938 } 1939 1940 zio_execute(zio); 1941 } 1942 1943 1944 /* 1945 * Report a checksum error for a child of a RAID-Z device. 
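 * Unless the zio is speculative, the child vdev's checksum error count is
 * bumped and an ereport is posted pairing the data actually read with the
 * data we believe to be correct.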
1946 */ 1947 static void 1948 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) 1949 { 1950 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 1951 1952 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 1953 zio_bad_cksum_t zbc; 1954 raidz_map_t *rm = zio->io_vsd; 1955 1956 mutex_enter(&vd->vdev_stat_lock); 1957 vd->vdev_stat.vs_checksum_errors++; 1958 mutex_exit(&vd->vdev_stat_lock); 1959 1960 zbc.zbc_has_cksum = 0; 1961 zbc.zbc_injected = rm->rm_ecksuminjected; 1962 1963 (void) zfs_ereport_post_checksum(zio->io_spa, vd, 1964 &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, 1965 rc->rc_abd, bad_data, &zbc); 1966 } 1967 } 1968 1969 /* 1970 * We keep track of whether or not there were any injected errors, so that 1971 * any ereports we generate can note it. 1972 */ 1973 static int 1974 raidz_checksum_verify(zio_t *zio) 1975 { 1976 zio_bad_cksum_t zbc; 1977 raidz_map_t *rm = zio->io_vsd; 1978 1979 int ret = zio_checksum_error(zio, &zbc); 1980 if (ret != 0 && zbc.zbc_injected != 0) 1981 rm->rm_ecksuminjected = 1; 1982 1983 return (ret); 1984 } 1985 1986 /* 1987 * Generate the parity from the data columns. If we tried and were able to 1988 * read the parity without error, verify that the generated parity matches the 1989 * data we read. If it doesn't, we fire off a checksum error. Return the 1990 * number such failures. 1991 */ 1992 static int 1993 raidz_parity_verify(zio_t *zio, raidz_map_t *rm) 1994 { 1995 abd_t *orig[VDEV_RAIDZ_MAXPARITY]; 1996 int c, ret = 0; 1997 raidz_col_t *rc; 1998 1999 blkptr_t *bp = zio->io_bp; 2000 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : 2001 (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); 2002 2003 if (checksum == ZIO_CHECKSUM_NOPARITY) 2004 return (ret); 2005 2006 for (c = 0; c < rm->rm_firstdatacol; c++) { 2007 rc = &rm->rm_col[c]; 2008 if (!rc->rc_tried || rc->rc_error != 0) 2009 continue; 2010 orig[c] = abd_alloc_sametype(rc->rc_abd, rc->rc_size); 2011 abd_copy(orig[c], rc->rc_abd, rc->rc_size); 2012 } 2013 2014 vdev_raidz_generate_parity(rm); 2015 2016 for (c = 0; c < rm->rm_firstdatacol; c++) { 2017 rc = &rm->rm_col[c]; 2018 if (!rc->rc_tried || rc->rc_error != 0) 2019 continue; 2020 if (abd_cmp(orig[c], rc->rc_abd, rc->rc_abd->abd_size) != 0) { 2021 raidz_checksum_error(zio, rc, orig[c]); 2022 rc->rc_error = SET_ERROR(ECKSUM); 2023 ret++; 2024 } 2025 abd_free(orig[c]); 2026 } 2027 2028 return (ret); 2029 } 2030 2031 static int 2032 vdev_raidz_worst_error(raidz_map_t *rm) 2033 { 2034 int error = 0; 2035 2036 for (int c = 0; c < rm->rm_cols; c++) 2037 error = zio_worst_error(error, rm->rm_col[c].rc_error); 2038 2039 return (error); 2040 } 2041 2042 /* 2043 * Iterate over all combinations of bad data and attempt a reconstruction. 2044 * Note that the algorithm below is non-optimal because it doesn't take into 2045 * account how reconstruction is actually performed. For example, with 2046 * triple-parity RAID-Z the reconstruction procedure is the same if column 4 2047 * is targeted as invalid as if columns 1 and 4 are targeted since in both 2048 * cases we'd only use parity information in column 0. 
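 *
 * As a purely hypothetical illustration of the search order: for a raidz2
 * map with columns 0-4 (parity in columns 0 and 1, data in columns 2-4)
 * and no known-bad columns, the target sets attempted are roughly
 *
 *	n = 1:	{2} {3} {4}
 *	n = 2:	{0,2} {1,2} {0,3} {1,3} {2,3} {0,4} {1,4} {2,4} {3,4}
 *
 * i.e. every subset of at most nparity columns that contains at least one
 * data column; subsets consisting solely of parity columns are never tried
 * since regenerating parity alone cannot change the checksum of the data.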
2049 */ 2050 static int 2051 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) 2052 { 2053 raidz_map_t *rm = zio->io_vsd; 2054 raidz_col_t *rc; 2055 abd_t *orig[VDEV_RAIDZ_MAXPARITY]; 2056 int tstore[VDEV_RAIDZ_MAXPARITY + 2]; 2057 int *tgts = &tstore[1]; 2058 int current, next, i, c, n; 2059 int code, ret = 0; 2060 2061 ASSERT(total_errors < rm->rm_firstdatacol); 2062 2063 /* 2064 * This simplifies one edge condition. 2065 */ 2066 tgts[-1] = -1; 2067 2068 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { 2069 /* 2070 * Initialize the targets array by finding the first n columns 2071 * that contain no error. 2072 * 2073 * If there were no data errors, we need to ensure that we're 2074 * always explicitly attempting to reconstruct at least one 2075 * data column. To do this, we simply push the highest target 2076 * up into the data columns. 2077 */ 2078 for (c = 0, i = 0; i < n; i++) { 2079 if (i == n - 1 && data_errors == 0 && 2080 c < rm->rm_firstdatacol) { 2081 c = rm->rm_firstdatacol; 2082 } 2083 2084 while (rm->rm_col[c].rc_error != 0) { 2085 c++; 2086 ASSERT3S(c, <, rm->rm_cols); 2087 } 2088 2089 tgts[i] = c++; 2090 } 2091 2092 /* 2093 * Setting tgts[n] simplifies the other edge condition. 2094 */ 2095 tgts[n] = rm->rm_cols; 2096 2097 /* 2098 * These buffers were allocated in previous iterations. 2099 */ 2100 for (i = 0; i < n - 1; i++) { 2101 ASSERT(orig[i] != NULL); 2102 } 2103 2104 orig[n - 1] = abd_alloc_sametype(rm->rm_col[0].rc_abd, 2105 rm->rm_col[0].rc_size); 2106 2107 current = 0; 2108 next = tgts[current]; 2109 2110 while (current != n) { 2111 tgts[current] = next; 2112 current = 0; 2113 2114 /* 2115 * Save off the original data that we're going to 2116 * attempt to reconstruct. 2117 */ 2118 for (i = 0; i < n; i++) { 2119 ASSERT(orig[i] != NULL); 2120 c = tgts[i]; 2121 ASSERT3S(c, >=, 0); 2122 ASSERT3S(c, <, rm->rm_cols); 2123 rc = &rm->rm_col[c]; 2124 ASSERT3S(orig[i]->abd_size, >=, rc->rc_size); 2125 ASSERT3S(rc->rc_abd->abd_size, >=, rc->rc_size); 2126 abd_copy_off(orig[i], rc->rc_abd, 0, 0, 2127 rc->rc_size); 2128 } 2129 2130 /* 2131 * Attempt a reconstruction and exit the outer loop on 2132 * success. 2133 */ 2134 code = vdev_raidz_reconstruct(rm, tgts, n); 2135 if (raidz_checksum_verify(zio) == 0) { 2136 2137 for (i = 0; i < n; i++) { 2138 c = tgts[i]; 2139 rc = &rm->rm_col[c]; 2140 ASSERT(rc->rc_error == 0); 2141 if (rc->rc_tried) 2142 raidz_checksum_error(zio, rc, 2143 orig[i]); 2144 rc->rc_error = SET_ERROR(ECKSUM); 2145 } 2146 2147 ret = code; 2148 goto done; 2149 } 2150 2151 /* 2152 * Restore the original data. 2153 */ 2154 for (i = 0; i < n; i++) { 2155 c = tgts[i]; 2156 rc = &rm->rm_col[c]; 2157 ASSERT3S(rc->rc_abd->abd_size, >=, rc->rc_size); 2158 ASSERT3S(orig[i]->abd_size, >=, rc->rc_size); 2159 abd_copy_off(rc->rc_abd, orig[i], 0, 0, 2160 rc->rc_size); 2161 } 2162 2163 do { 2164 /* 2165 * Find the next valid column after the current 2166 * position.. 2167 */ 2168 for (next = tgts[current] + 1; 2169 next < rm->rm_cols && 2170 rm->rm_col[next].rc_error != 0; next++) 2171 continue; 2172 2173 ASSERT(next <= tgts[current + 1]); 2174 2175 /* 2176 * If that spot is available, we're done here. 2177 */ 2178 if (next != tgts[current + 1]) 2179 break; 2180 2181 /* 2182 * Otherwise, find the next valid column after 2183 * the previous position. 
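 * This reset, together with the assignment at the top of the outer
 * while loop, makes the target array behave like an odometer over the
 * non-failed columns: lower positions cycle fastest, and when a
 * position runs out of room it is parked at the first good column past
 * the position below it while the carry advances position current + 1.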
2184 */ 2185 for (c = tgts[current - 1] + 1; 2186 rm->rm_col[c].rc_error != 0; c++) 2187 continue; 2188 2189 tgts[current] = c; 2190 current++; 2191 2192 } while (current != n); 2193 } 2194 } 2195 n--; 2196 done: 2197 for (i = 0; i < n; i++) 2198 abd_free(orig[i]); 2199 2200 return (ret); 2201 } 2202 2203 /* 2204 * Complete an IO operation on a RAIDZ VDev 2205 * 2206 * Outline: 2207 * - For write operations: 2208 * 1. Check for errors on the child IOs. 2209 * 2. Return, setting an error code if too few child VDevs were written 2210 * to reconstruct the data later. Note that partial writes are 2211 * considered successful if they can be reconstructed at all. 2212 * - For read operations: 2213 * 1. Check for errors on the child IOs. 2214 * 2. If data errors occurred: 2215 * a. Try to reassemble the data from the parity available. 2216 * b. If we haven't yet read the parity drives, read them now. 2217 * c. If all parity drives have been read but the data still doesn't 2218 * reassemble with a correct checksum, then try combinatorial 2219 * reconstruction. 2220 * d. If that doesn't work, return an error. 2221 * 3. If there were unexpected errors or this is a resilver operation, 2222 * rewrite the vdevs that had errors. 2223 */ 2224 static void 2225 vdev_raidz_io_done(zio_t *zio) 2226 { 2227 vdev_t *vd = zio->io_vd; 2228 vdev_t *cvd; 2229 raidz_map_t *rm = zio->io_vsd; 2230 raidz_col_t *rc; 2231 int unexpected_errors = 0; 2232 int parity_errors = 0; 2233 int parity_untried = 0; 2234 int data_errors = 0; 2235 int total_errors = 0; 2236 int n, c; 2237 int tgts[VDEV_RAIDZ_MAXPARITY]; 2238 int code; 2239 2240 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ 2241 2242 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); 2243 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); 2244 2245 for (c = 0; c < rm->rm_cols; c++) { 2246 rc = &rm->rm_col[c]; 2247 2248 if (rc->rc_error) { 2249 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 2250 2251 if (c < rm->rm_firstdatacol) 2252 parity_errors++; 2253 else 2254 data_errors++; 2255 2256 if (!rc->rc_skipped) 2257 unexpected_errors++; 2258 2259 total_errors++; 2260 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { 2261 parity_untried++; 2262 } 2263 } 2264 2265 if (zio->io_type == ZIO_TYPE_WRITE) { 2266 /* 2267 * XXX -- for now, treat partial writes as a success. 2268 * (If we couldn't write enough columns to reconstruct 2269 * the data, the I/O failed. Otherwise, good enough.) 2270 * 2271 * Now that we support write reallocation, it would be better 2272 * to treat partial failure as real failure unless there are 2273 * no non-degraded top-level vdevs left, and not update DTLs 2274 * if we intend to reallocate. 2275 */ 2276 /* XXPOLICY */ 2277 if (total_errors > rm->rm_firstdatacol) 2278 zio->io_error = vdev_raidz_worst_error(rm); 2279 2280 return; 2281 } 2282 2283 ASSERT(zio->io_type == ZIO_TYPE_READ); 2284 /* 2285 * There are three potential phases for a read: 2286 * 1. produce valid data from the columns read 2287 * 2. read all disks and try again 2288 * 3. perform combinatorial reconstruction 2289 * 2290 * Each phase is progressively both more expensive and less likely to 2291 * occur. If we encounter more errors than we can repair or all phases 2292 * fail, we have no choice but to return an error. 2293 */ 2294 2295 /* 2296 * If the number of errors we saw was correctable -- less than or equal 2297 * to the number of parity disks read -- attempt to produce data that 2298 * has a valid checksum. 
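 * (rm_firstdatacol equals the number of parity columns, so
 * rm_firstdatacol - parity_untried is the number of parity columns for
 * which reads were actually issued.)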
Naturally, this case applies in the absence of 2299 * any errors. 2300 */ 2301 if (total_errors <= rm->rm_firstdatacol - parity_untried) { 2302 if (data_errors == 0) { 2303 if (raidz_checksum_verify(zio) == 0) { 2304 /* 2305 * If we read parity information (unnecessarily 2306 * as it happens since no reconstruction was 2307 * needed) regenerate and verify the parity. 2308 * We also regenerate parity when resilvering 2309 * so we can write it out to the failed device 2310 * later. 2311 */ 2312 if (parity_errors + parity_untried < 2313 rm->rm_firstdatacol || 2314 (zio->io_flags & ZIO_FLAG_RESILVER)) { 2315 n = raidz_parity_verify(zio, rm); 2316 unexpected_errors += n; 2317 ASSERT(parity_errors + n <= 2318 rm->rm_firstdatacol); 2319 } 2320 goto done; 2321 } 2322 } else { 2323 /* 2324 * We either attempt to read all the parity columns or 2325 * none of them. If we didn't try to read parity, we 2326 * wouldn't be here in the correctable case. There must 2327 * also have been fewer parity errors than parity 2328 * columns or, again, we wouldn't be in this code path. 2329 */ 2330 ASSERT(parity_untried == 0); 2331 ASSERT(parity_errors < rm->rm_firstdatacol); 2332 2333 /* 2334 * Identify the data columns that reported an error. 2335 */ 2336 n = 0; 2337 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 2338 rc = &rm->rm_col[c]; 2339 if (rc->rc_error != 0) { 2340 ASSERT(n < VDEV_RAIDZ_MAXPARITY); 2341 tgts[n++] = c; 2342 } 2343 } 2344 2345 ASSERT(rm->rm_firstdatacol >= n); 2346 2347 code = vdev_raidz_reconstruct(rm, tgts, n); 2348 2349 if (raidz_checksum_verify(zio) == 0) { 2350 /* 2351 * If we read more parity disks than were used 2352 * for reconstruction, confirm that the other 2353 * parity disks produced correct data. This 2354 * routine is suboptimal in that it regenerates 2355 * the parity that we already used in addition 2356 * to the parity that we're attempting to 2357 * verify, but this should be a relatively 2358 * uncommon case, and can be optimized if it 2359 * becomes a problem. Note that we regenerate 2360 * parity when resilvering so we can write it 2361 * out to failed devices later. 2362 */ 2363 if (parity_errors < rm->rm_firstdatacol - n || 2364 (zio->io_flags & ZIO_FLAG_RESILVER)) { 2365 n = raidz_parity_verify(zio, rm); 2366 unexpected_errors += n; 2367 ASSERT(parity_errors + n <= 2368 rm->rm_firstdatacol); 2369 } 2370 2371 goto done; 2372 } 2373 } 2374 } 2375 2376 /* 2377 * This isn't a typical situation -- either we got a read error or 2378 * a child silently returned bad data. Read every block so we can 2379 * try again with as much data and parity as we can track down. If 2380 * we've already been through once before, all children will be marked 2381 * as tried so we'll proceed to combinatorial reconstruction. 2382 */ 2383 unexpected_errors = 1; 2384 rm->rm_missingdata = 0; 2385 rm->rm_missingparity = 0; 2386 2387 for (c = 0; c < rm->rm_cols; c++) { 2388 if (rm->rm_col[c].rc_tried) 2389 continue; 2390 2391 zio_vdev_io_redone(zio); 2392 do { 2393 rc = &rm->rm_col[c]; 2394 if (rc->rc_tried) 2395 continue; 2396 zio_nowait(zio_vdev_child_io(zio, NULL, 2397 vd->vdev_child[rc->rc_devidx], 2398 rc->rc_offset, rc->rc_abd, rc->rc_size, 2399 zio->io_type, zio->io_priority, 0, 2400 vdev_raidz_child_done, rc)); 2401 } while (++c < rm->rm_cols); 2402 2403 return; 2404 } 2405 2406 /* 2407 * At this point we've attempted to reconstruct the data given the 2408 * errors we detected, and we've attempted to read all columns. 
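 * (Every column now has rc_tried set, so the retry loop above issued no
 * new child reads and did not return early.)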
There 2409 * must, therefore, be one or more additional problems -- silent errors 2410 * resulting in invalid data rather than explicit I/O errors resulting 2411 * in absent data. We check if there is enough additional data to 2412 * possibly reconstruct the data and then perform combinatorial 2413 * reconstruction over all possible combinations. If that fails, 2414 * we're cooked. 2415 */ 2416 if (total_errors > rm->rm_firstdatacol) { 2417 zio->io_error = vdev_raidz_worst_error(rm); 2418 2419 } else if (total_errors < rm->rm_firstdatacol && 2420 (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) { 2421 /* 2422 * If we didn't use all the available parity for the 2423 * combinatorial reconstruction, verify that the remaining 2424 * parity is correct. 2425 */ 2426 if (code != (1 << rm->rm_firstdatacol) - 1) 2427 (void) raidz_parity_verify(zio, rm); 2428 } else { 2429 /* 2430 * We're here because either: 2431 * 2432 * total_errors == rm_first_datacol, or 2433 * vdev_raidz_combrec() failed 2434 * 2435 * In either case, there is enough bad data to prevent 2436 * reconstruction. 2437 * 2438 * Start checksum ereports for all children which haven't 2439 * failed, and the IO wasn't speculative. 2440 */ 2441 zio->io_error = SET_ERROR(ECKSUM); 2442 2443 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2444 for (c = 0; c < rm->rm_cols; c++) { 2445 rc = &rm->rm_col[c]; 2446 if (rc->rc_error == 0) { 2447 zio_bad_cksum_t zbc; 2448 zbc.zbc_has_cksum = 0; 2449 zbc.zbc_injected = 2450 rm->rm_ecksuminjected; 2451 2452 zfs_ereport_start_checksum( 2453 zio->io_spa, 2454 vd->vdev_child[rc->rc_devidx], 2455 &zio->io_bookmark, zio, 2456 rc->rc_offset, rc->rc_size, 2457 (void *)(uintptr_t)c, &zbc); 2458 } 2459 } 2460 } 2461 } 2462 2463 done: 2464 zio_checksum_verified(zio); 2465 2466 if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 2467 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { 2468 /* 2469 * Use the good data we have in hand to repair damaged children. 2470 */ 2471 for (c = 0; c < rm->rm_cols; c++) { 2472 rc = &rm->rm_col[c]; 2473 cvd = vd->vdev_child[rc->rc_devidx]; 2474 2475 if (rc->rc_error == 0) 2476 continue; 2477 2478 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2479 rc->rc_offset, rc->rc_abd, rc->rc_size, 2480 ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, 2481 ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 2482 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); 2483 } 2484 } 2485 } 2486 2487 static void 2488 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 2489 { 2490 if (faulted > vd->vdev_nparity) 2491 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2492 VDEV_AUX_NO_REPLICAS); 2493 else if (degraded + faulted != 0) 2494 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 2495 else 2496 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 2497 } 2498 2499 /* 2500 * Determine if any portion of the provided block resides on a child vdev 2501 * with a dirty DTL and therefore needs to be resilvered. The function 2502 * assumes that at least one DTL is dirty which implies that full stripe 2503 * width blocks must be resilvered. 2504 */ 2505 static boolean_t 2506 vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) 2507 { 2508 uint64_t dcols = vd->vdev_children; 2509 uint64_t nparity = vd->vdev_nparity; 2510 uint64_t ashift = vd->vdev_top->vdev_ashift; 2511 /* The starting RAIDZ (parent) vdev sector of the block. */ 2512 uint64_t b = offset >> ashift; 2513 /* The zio's size in units of the vdev's minimum sector size. 
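 * E.g. a 16K psize on a top-level vdev with ashift 9 (512-byte sectors)
 * gives s = ((16384 - 1) >> 9) + 1 = 32.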
*/ 2514 uint64_t s = ((psize - 1) >> ashift) + 1; 2515 /* The first column for this stripe. */ 2516 uint64_t f = b % dcols; 2517 2518 if (s + nparity >= dcols) 2519 return (B_TRUE); 2520 2521 for (uint64_t c = 0; c < s + nparity; c++) { 2522 uint64_t devidx = (f + c) % dcols; 2523 vdev_t *cvd = vd->vdev_child[devidx]; 2524 2525 /* 2526 * dsl_scan_need_resilver() already checked vd with 2527 * vdev_dtl_contains(). So here just check cvd with 2528 * vdev_dtl_empty(), cheaper and a good approximation. 2529 */ 2530 if (!vdev_dtl_empty(cvd, DTL_PARTIAL)) 2531 return (B_TRUE); 2532 } 2533 2534 return (B_FALSE); 2535 } 2536 2537 static void 2538 vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *in, range_seg64_t *res) 2539 { 2540 vdev_t *raidvd = cvd->vdev_parent; 2541 ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); 2542 2543 uint64_t width = raidvd->vdev_children; 2544 uint64_t tgt_col = cvd->vdev_id; 2545 uint64_t ashift = raidvd->vdev_top->vdev_ashift; 2546 2547 /* make sure the offsets are block-aligned */ 2548 ASSERT0(in->rs_start % (1 << ashift)); 2549 ASSERT0(in->rs_end % (1 << ashift)); 2550 uint64_t b_start = in->rs_start >> ashift; 2551 uint64_t b_end = in->rs_end >> ashift; 2552 2553 uint64_t start_row = 0; 2554 if (b_start > tgt_col) /* avoid underflow */ 2555 start_row = ((b_start - tgt_col - 1) / width) + 1; 2556 2557 uint64_t end_row = 0; 2558 if (b_end > tgt_col) 2559 end_row = ((b_end - tgt_col - 1) / width) + 1; 2560 2561 res->rs_start = start_row << ashift; 2562 res->rs_end = end_row << ashift; 2563 2564 ASSERT3U(res->rs_start, <=, in->rs_start); 2565 ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start); 2566 } 2567 2568 vdev_ops_t vdev_raidz_ops = { 2569 .vdev_op_open = vdev_raidz_open, 2570 .vdev_op_close = vdev_raidz_close, 2571 .vdev_op_asize = vdev_raidz_asize, 2572 .vdev_op_io_start = vdev_raidz_io_start, 2573 .vdev_op_io_done = vdev_raidz_io_done, 2574 .vdev_op_state_change = vdev_raidz_state_change, 2575 .vdev_op_need_resilver = vdev_raidz_need_resilver, 2576 .vdev_op_hold = NULL, 2577 .vdev_op_rele = NULL, 2578 .vdev_op_remap = NULL, 2579 .vdev_op_xlate = vdev_raidz_xlate, 2580 .vdev_op_dumpio = vdev_raidz_dumpio, 2581 .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */ 2582 .vdev_op_leaf = B_FALSE /* not a leaf vdev */ 2583 }; 2584