1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2012, 2017 by Delphix. All rights reserved. 25 * Copyright (c) 2013, Joyent, Inc. All rights reserved. 26 * Copyright (c) 2014 Integros [integros.com] 27 */ 28 29 #include <sys/zfs_context.h> 30 #include <sys/spa.h> 31 #include <sys/vdev_impl.h> 32 #include <sys/vdev_disk.h> 33 #include <sys/vdev_file.h> 34 #include <sys/vdev_raidz.h> 35 #include <sys/zio.h> 36 #include <sys/zio_checksum.h> 37 #include <sys/abd.h> 38 #include <sys/fs/zfs.h> 39 #include <sys/fm/fs/zfs.h> 40 41 /* 42 * Virtual device vector for RAID-Z. 43 * 44 * This vdev supports single, double, and triple parity. For single parity, 45 * we use a simple XOR of all the data columns. For double or triple parity, 46 * we use a special case of Reed-Solomon coding. This extends the 47 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by 48 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for 49 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the 50 * former is also based. The latter is designed to provide higher performance 51 * for writes. 52 * 53 * Note that the Plank paper claimed to support arbitrary N+M, but was then 54 * amended six years later identifying a critical flaw that invalidates its 55 * claims. Nevertheless, the technique can be adapted to work for up to 56 * triple parity. For additional parity, the amendment "Note: Correction to 57 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding 58 * is viable, but the additional complexity means that write performance will 59 * suffer. 60 * 61 * All of the methods above operate on a Galois field, defined over the 62 * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements 63 * can be expressed with a single byte. Briefly, the operations on the 64 * field are defined as follows: 65 * 66 * o addition (+) is represented by a bitwise XOR 67 * o subtraction (-) is therefore identical to addition: A + B = A - B 68 * o multiplication of A by 2 is defined by the following bitwise expression: 69 * 70 * (A * 2)_7 = A_6 71 * (A * 2)_6 = A_5 72 * (A * 2)_5 = A_4 73 * (A * 2)_4 = A_3 + A_7 74 * (A * 2)_3 = A_2 + A_7 75 * (A * 2)_2 = A_1 + A_7 76 * (A * 2)_1 = A_0 77 * (A * 2)_0 = A_7 78 * 79 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). 80 * As an aside, this multiplication is derived from the error correcting 81 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. 82 * 83 * Observe that any number in the field (except for 0) can be expressed as a 84 * power of 2 -- a generator for the field. We store a table of the powers of 85 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can 86 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather 87 * than field addition). The inverse of a field element A (A^-1) is therefore 88 * A ^ (255 - 1) = A^254. 89 * 90 * The up-to-three parity columns, P, Q, R over several data columns, 91 * D_0, ... D_n-1, can be expressed by field operations: 92 * 93 * P = D_0 + D_1 + ... + D_n-2 + D_n-1 94 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 95 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 96 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 97 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 98 * 99 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival 100 * XOR operation, and 2 and 4 can be computed quickly and generate linearly- 101 * independent coefficients. (There are no additional coefficients that have 102 * this property which is why the uncorrected Plank method breaks down.) 103 * 104 * See the reconstruction code below for how P, Q and R can used individually 105 * or in concert to recover missing data columns. 106 */ 107 108 typedef struct raidz_col { 109 uint64_t rc_devidx; /* child device index for I/O */ 110 uint64_t rc_offset; /* device offset */ 111 uint64_t rc_size; /* I/O size */ 112 abd_t *rc_abd; /* I/O data */ 113 void *rc_gdata; /* used to store the "good" version */ 114 int rc_error; /* I/O error for this device */ 115 uint8_t rc_tried; /* Did we attempt this I/O column? */ 116 uint8_t rc_skipped; /* Did we skip this I/O column? */ 117 } raidz_col_t; 118 119 typedef struct raidz_map { 120 uint64_t rm_cols; /* Regular column count */ 121 uint64_t rm_scols; /* Count including skipped columns */ 122 uint64_t rm_bigcols; /* Number of oversized columns */ 123 uint64_t rm_asize; /* Actual total I/O size */ 124 uint64_t rm_missingdata; /* Count of missing data devices */ 125 uint64_t rm_missingparity; /* Count of missing parity devices */ 126 uint64_t rm_firstdatacol; /* First data column/parity count */ 127 uint64_t rm_nskip; /* Skipped sectors for padding */ 128 uint64_t rm_skipstart; /* Column index of padding start */ 129 abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */ 130 uintptr_t rm_reports; /* # of referencing checksum reports */ 131 uint8_t rm_freed; /* map no longer has referencing ZIO */ 132 uint8_t rm_ecksuminjected; /* checksum error was injected */ 133 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ 134 } raidz_map_t; 135 136 #define VDEV_RAIDZ_P 0 137 #define VDEV_RAIDZ_Q 1 138 #define VDEV_RAIDZ_R 2 139 140 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) 141 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) 142 143 /* 144 * We provide a mechanism to perform the field multiplication operation on a 145 * 64-bit value all at once rather than a byte at a time. This works by 146 * creating a mask from the top bit in each byte and using that to 147 * conditionally apply the XOR of 0x1d. 148 */ 149 #define VDEV_RAIDZ_64MUL_2(x, mask) \ 150 { \ 151 (mask) = (x) & 0x8080808080808080ULL; \ 152 (mask) = ((mask) << 1) - ((mask) >> 7); \ 153 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ 154 ((mask) & 0x1d1d1d1d1d1d1d1d); \ 155 } 156 157 #define VDEV_RAIDZ_64MUL_4(x, mask) \ 158 { \ 159 VDEV_RAIDZ_64MUL_2((x), mask); \ 160 VDEV_RAIDZ_64MUL_2((x), mask); \ 161 } 162 163 #define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE) 164 165 /* 166 * Force reconstruction to use the general purpose method. 167 */ 168 int vdev_raidz_default_to_general; 169 170 /* Powers of 2 in the Galois field defined above. */ 171 static const uint8_t vdev_raidz_pow2[256] = { 172 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 173 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 174 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 175 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 176 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 177 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 178 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, 179 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 180 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 181 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, 182 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 183 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 184 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 185 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 186 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 187 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 188 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 189 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 190 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 191 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 192 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 193 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 194 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 195 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 196 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, 197 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 198 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 199 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, 200 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 201 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, 202 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 203 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 204 }; 205 /* Logs of 2 in the Galois field defined above. */ 206 static const uint8_t vdev_raidz_log2[256] = { 207 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 208 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, 209 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, 210 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, 211 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, 212 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, 213 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, 214 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, 215 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, 216 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, 217 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, 218 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, 219 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, 220 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, 221 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, 222 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, 223 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, 224 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, 225 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, 226 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, 227 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, 228 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, 229 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, 230 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, 231 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, 232 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, 233 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, 234 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, 235 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, 236 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, 237 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, 238 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, 239 }; 240 241 static void vdev_raidz_generate_parity(raidz_map_t *rm); 242 243 /* 244 * Multiply a given number by 2 raised to the given power. 245 */ 246 static uint8_t 247 vdev_raidz_exp2(uint_t a, int exp) 248 { 249 if (a == 0) 250 return (0); 251 252 ASSERT(exp >= 0); 253 ASSERT(vdev_raidz_log2[a] > 0 || a == 1); 254 255 exp += vdev_raidz_log2[a]; 256 if (exp > 255) 257 exp -= 255; 258 259 return (vdev_raidz_pow2[exp]); 260 } 261 262 static void 263 vdev_raidz_map_free(raidz_map_t *rm) 264 { 265 int c; 266 size_t size; 267 268 for (c = 0; c < rm->rm_firstdatacol; c++) { 269 abd_free(rm->rm_col[c].rc_abd); 270 271 if (rm->rm_col[c].rc_gdata != NULL) 272 zio_buf_free(rm->rm_col[c].rc_gdata, 273 rm->rm_col[c].rc_size); 274 } 275 276 size = 0; 277 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 278 abd_put(rm->rm_col[c].rc_abd); 279 size += rm->rm_col[c].rc_size; 280 } 281 282 if (rm->rm_abd_copy != NULL) 283 abd_free(rm->rm_abd_copy); 284 285 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); 286 } 287 288 static void 289 vdev_raidz_map_free_vsd(zio_t *zio) 290 { 291 raidz_map_t *rm = zio->io_vsd; 292 293 ASSERT0(rm->rm_freed); 294 rm->rm_freed = 1; 295 296 if (rm->rm_reports == 0) 297 vdev_raidz_map_free(rm); 298 } 299 300 /*ARGSUSED*/ 301 static void 302 vdev_raidz_cksum_free(void *arg, size_t ignored) 303 { 304 raidz_map_t *rm = arg; 305 306 ASSERT3U(rm->rm_reports, >, 0); 307 308 if (--rm->rm_reports == 0 && rm->rm_freed != 0) 309 vdev_raidz_map_free(rm); 310 } 311 312 static void 313 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) 314 { 315 raidz_map_t *rm = zcr->zcr_cbdata; 316 size_t c = zcr->zcr_cbinfo; 317 size_t x; 318 319 const char *good = NULL; 320 char *bad; 321 322 if (good_data == NULL) { 323 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); 324 return; 325 } 326 327 if (c < rm->rm_firstdatacol) { 328 /* 329 * The first time through, calculate the parity blocks for 330 * the good data (this relies on the fact that the good 331 * data never changes for a given logical ZIO) 332 */ 333 if (rm->rm_col[0].rc_gdata == NULL) { 334 abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY]; 335 char *buf; 336 int offset; 337 338 /* 339 * Set up the rm_col[]s to generate the parity for 340 * good_data, first saving the parity bufs and 341 * replacing them with buffers to hold the result. 342 */ 343 for (x = 0; x < rm->rm_firstdatacol; x++) { 344 bad_parity[x] = rm->rm_col[x].rc_abd; 345 rm->rm_col[x].rc_gdata = 346 zio_buf_alloc(rm->rm_col[x].rc_size); 347 rm->rm_col[x].rc_abd = 348 abd_get_from_buf(rm->rm_col[x].rc_gdata, 349 rm->rm_col[x].rc_size); 350 } 351 352 /* fill in the data columns from good_data */ 353 buf = (char *)good_data; 354 for (; x < rm->rm_cols; x++) { 355 abd_put(rm->rm_col[x].rc_abd); 356 rm->rm_col[x].rc_abd = abd_get_from_buf(buf, 357 rm->rm_col[x].rc_size); 358 buf += rm->rm_col[x].rc_size; 359 } 360 361 /* 362 * Construct the parity from the good data. 363 */ 364 vdev_raidz_generate_parity(rm); 365 366 /* restore everything back to its original state */ 367 for (x = 0; x < rm->rm_firstdatacol; x++) { 368 abd_put(rm->rm_col[x].rc_abd); 369 rm->rm_col[x].rc_abd = bad_parity[x]; 370 } 371 372 offset = 0; 373 for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { 374 abd_put(rm->rm_col[x].rc_abd); 375 rm->rm_col[x].rc_abd = abd_get_offset( 376 rm->rm_abd_copy, offset); 377 offset += rm->rm_col[x].rc_size; 378 } 379 } 380 381 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); 382 good = rm->rm_col[c].rc_gdata; 383 } else { 384 /* adjust good_data to point at the start of our column */ 385 good = good_data; 386 387 for (x = rm->rm_firstdatacol; x < c; x++) 388 good += rm->rm_col[x].rc_size; 389 } 390 391 bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size); 392 /* we drop the ereport if it ends up that the data was good */ 393 zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); 394 abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size); 395 } 396 397 /* 398 * Invoked indirectly by zfs_ereport_start_checksum(), called 399 * below when our read operation fails completely. The main point 400 * is to keep a copy of everything we read from disk, so that at 401 * vdev_raidz_cksum_finish() time we can compare it with the good data. 402 */ 403 static void 404 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) 405 { 406 size_t c = (size_t)(uintptr_t)arg; 407 size_t offset; 408 409 raidz_map_t *rm = zio->io_vsd; 410 size_t size; 411 412 /* set up the report and bump the refcount */ 413 zcr->zcr_cbdata = rm; 414 zcr->zcr_cbinfo = c; 415 zcr->zcr_finish = vdev_raidz_cksum_finish; 416 zcr->zcr_free = vdev_raidz_cksum_free; 417 418 rm->rm_reports++; 419 ASSERT3U(rm->rm_reports, >, 0); 420 421 if (rm->rm_abd_copy != NULL) 422 return; 423 424 /* 425 * It's the first time we're called for this raidz_map_t, so we need 426 * to copy the data aside; there's no guarantee that our zio's buffer 427 * won't be re-used for something else. 428 * 429 * Our parity data is already in separate buffers, so there's no need 430 * to copy them. 431 */ 432 433 size = 0; 434 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) 435 size += rm->rm_col[c].rc_size; 436 437 rm->rm_abd_copy = 438 abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size); 439 440 for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 441 raidz_col_t *col = &rm->rm_col[c]; 442 abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset); 443 444 abd_copy(tmp, col->rc_abd, col->rc_size); 445 abd_put(col->rc_abd); 446 col->rc_abd = tmp; 447 448 offset += col->rc_size; 449 } 450 ASSERT3U(offset, ==, size); 451 } 452 453 static const zio_vsd_ops_t vdev_raidz_vsd_ops = { 454 vdev_raidz_map_free_vsd, 455 vdev_raidz_cksum_report 456 }; 457 458 /* 459 * Divides the IO evenly across all child vdevs; usually, dcols is 460 * the number of children in the target vdev. 461 */ 462 static raidz_map_t * 463 vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, 464 uint64_t unit_shift, uint64_t dcols, uint64_t nparity) 465 { 466 raidz_map_t *rm; 467 /* The starting RAIDZ (parent) vdev sector of the block. */ 468 uint64_t b = offset >> unit_shift; 469 /* The zio's size in units of the vdev's minimum sector size. */ 470 uint64_t s = size >> unit_shift; 471 /* The first column for this stripe. */ 472 uint64_t f = b % dcols; 473 /* The starting byte offset on each child vdev. */ 474 uint64_t o = (b / dcols) << unit_shift; 475 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; 476 uint64_t off = 0; 477 478 /* 479 * "Quotient": The number of data sectors for this stripe on all but 480 * the "big column" child vdevs that also contain "remainder" data. 481 */ 482 q = s / (dcols - nparity); 483 484 /* 485 * "Remainder": The number of partial stripe data sectors in this I/O. 486 * This will add a sector to some, but not all, child vdevs. 487 */ 488 r = s - q * (dcols - nparity); 489 490 /* The number of "big columns" - those which contain remainder data. */ 491 bc = (r == 0 ? 0 : r + nparity); 492 493 /* 494 * The total number of data and parity sectors associated with 495 * this I/O. 496 */ 497 tot = s + nparity * (q + (r == 0 ? 0 : 1)); 498 499 /* acols: The columns that will be accessed. */ 500 /* scols: The columns that will be accessed or skipped. */ 501 if (q == 0) { 502 /* Our I/O request doesn't span all child vdevs. */ 503 acols = bc; 504 scols = MIN(dcols, roundup(bc, nparity + 1)); 505 } else { 506 acols = dcols; 507 scols = dcols; 508 } 509 510 ASSERT3U(acols, <=, scols); 511 512 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); 513 514 rm->rm_cols = acols; 515 rm->rm_scols = scols; 516 rm->rm_bigcols = bc; 517 rm->rm_skipstart = bc; 518 rm->rm_missingdata = 0; 519 rm->rm_missingparity = 0; 520 rm->rm_firstdatacol = nparity; 521 rm->rm_abd_copy = NULL; 522 rm->rm_reports = 0; 523 rm->rm_freed = 0; 524 rm->rm_ecksuminjected = 0; 525 526 asize = 0; 527 528 for (c = 0; c < scols; c++) { 529 col = f + c; 530 coff = o; 531 if (col >= dcols) { 532 col -= dcols; 533 coff += 1ULL << unit_shift; 534 } 535 rm->rm_col[c].rc_devidx = col; 536 rm->rm_col[c].rc_offset = coff; 537 rm->rm_col[c].rc_abd = NULL; 538 rm->rm_col[c].rc_gdata = NULL; 539 rm->rm_col[c].rc_error = 0; 540 rm->rm_col[c].rc_tried = 0; 541 rm->rm_col[c].rc_skipped = 0; 542 543 if (c >= acols) 544 rm->rm_col[c].rc_size = 0; 545 else if (c < bc) 546 rm->rm_col[c].rc_size = (q + 1) << unit_shift; 547 else 548 rm->rm_col[c].rc_size = q << unit_shift; 549 550 asize += rm->rm_col[c].rc_size; 551 } 552 553 ASSERT3U(asize, ==, tot << unit_shift); 554 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); 555 rm->rm_nskip = roundup(tot, nparity + 1) - tot; 556 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); 557 ASSERT3U(rm->rm_nskip, <=, nparity); 558 559 for (c = 0; c < rm->rm_firstdatacol; c++) 560 rm->rm_col[c].rc_abd = 561 abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE); 562 563 rm->rm_col[c].rc_abd = abd_get_offset(abd, 0); 564 off = rm->rm_col[c].rc_size; 565 566 for (c = c + 1; c < acols; c++) { 567 rm->rm_col[c].rc_abd = abd_get_offset(abd, off); 568 off += rm->rm_col[c].rc_size; 569 } 570 571 /* 572 * If all data stored spans all columns, there's a danger that parity 573 * will always be on the same device and, since parity isn't read 574 * during normal operation, that that device's I/O bandwidth won't be 575 * used effectively. We therefore switch the parity every 1MB. 576 * 577 * ... at least that was, ostensibly, the theory. As a practical 578 * matter unless we juggle the parity between all devices evenly, we 579 * won't see any benefit. Further, occasional writes that aren't a 580 * multiple of the LCM of the number of children and the minimum 581 * stripe width are sufficient to avoid pessimal behavior. 582 * Unfortunately, this decision created an implicit on-disk format 583 * requirement that we need to support for all eternity, but only 584 * for single-parity RAID-Z. 585 * 586 * If we intend to skip a sector in the zeroth column for padding 587 * we must make sure to note this swap. We will never intend to 588 * skip the first column since at least one data and one parity 589 * column must appear in each row. 590 */ 591 ASSERT(rm->rm_cols >= 2); 592 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 593 594 if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) { 595 devidx = rm->rm_col[0].rc_devidx; 596 o = rm->rm_col[0].rc_offset; 597 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; 598 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 599 rm->rm_col[1].rc_devidx = devidx; 600 rm->rm_col[1].rc_offset = o; 601 602 if (rm->rm_skipstart == 0) 603 rm->rm_skipstart = 1; 604 } 605 606 return (rm); 607 } 608 609 struct pqr_struct { 610 uint64_t *p; 611 uint64_t *q; 612 uint64_t *r; 613 }; 614 615 static int 616 vdev_raidz_p_func(void *buf, size_t size, void *private) 617 { 618 struct pqr_struct *pqr = private; 619 const uint64_t *src = buf; 620 int i, cnt = size / sizeof (src[0]); 621 622 ASSERT(pqr->p && !pqr->q && !pqr->r); 623 624 for (i = 0; i < cnt; i++, src++, pqr->p++) 625 *pqr->p ^= *src; 626 627 return (0); 628 } 629 630 static int 631 vdev_raidz_pq_func(void *buf, size_t size, void *private) 632 { 633 struct pqr_struct *pqr = private; 634 const uint64_t *src = buf; 635 uint64_t mask; 636 int i, cnt = size / sizeof (src[0]); 637 638 ASSERT(pqr->p && pqr->q && !pqr->r); 639 640 for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { 641 *pqr->p ^= *src; 642 VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 643 *pqr->q ^= *src; 644 } 645 646 return (0); 647 } 648 649 static int 650 vdev_raidz_pqr_func(void *buf, size_t size, void *private) 651 { 652 struct pqr_struct *pqr = private; 653 const uint64_t *src = buf; 654 uint64_t mask; 655 int i, cnt = size / sizeof (src[0]); 656 657 ASSERT(pqr->p && pqr->q && pqr->r); 658 659 for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { 660 *pqr->p ^= *src; 661 VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 662 *pqr->q ^= *src; 663 VDEV_RAIDZ_64MUL_4(*pqr->r, mask); 664 *pqr->r ^= *src; 665 } 666 667 return (0); 668 } 669 670 static void 671 vdev_raidz_generate_parity_p(raidz_map_t *rm) 672 { 673 uint64_t *p; 674 int c; 675 abd_t *src; 676 677 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 678 src = rm->rm_col[c].rc_abd; 679 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); 680 681 if (c == rm->rm_firstdatacol) { 682 abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); 683 } else { 684 struct pqr_struct pqr = { p, NULL, NULL }; 685 (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, 686 vdev_raidz_p_func, &pqr); 687 } 688 } 689 } 690 691 static void 692 vdev_raidz_generate_parity_pq(raidz_map_t *rm) 693 { 694 uint64_t *p, *q, pcnt, ccnt, mask, i; 695 int c; 696 abd_t *src; 697 698 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 699 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 700 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 701 702 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 703 src = rm->rm_col[c].rc_abd; 704 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); 705 q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); 706 707 ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); 708 709 if (c == rm->rm_firstdatacol) { 710 abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); 711 (void) memcpy(q, p, rm->rm_col[c].rc_size); 712 } else { 713 struct pqr_struct pqr = { p, q, NULL }; 714 (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, 715 vdev_raidz_pq_func, &pqr); 716 } 717 718 if (c == rm->rm_firstdatacol) { 719 for (i = ccnt; i < pcnt; i++) { 720 p[i] = 0; 721 q[i] = 0; 722 } 723 } else { 724 /* 725 * Treat short columns as though they are full of 0s. 726 * Note that there's therefore nothing needed for P. 727 */ 728 for (i = ccnt; i < pcnt; i++) { 729 VDEV_RAIDZ_64MUL_2(q[i], mask); 730 } 731 } 732 } 733 } 734 735 static void 736 vdev_raidz_generate_parity_pqr(raidz_map_t *rm) 737 { 738 uint64_t *p, *q, *r, pcnt, ccnt, mask, i; 739 int c; 740 abd_t *src; 741 742 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 743 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 744 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 745 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 746 rm->rm_col[VDEV_RAIDZ_R].rc_size); 747 748 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 749 src = rm->rm_col[c].rc_abd; 750 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); 751 q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); 752 r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd); 753 754 ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); 755 756 if (c == rm->rm_firstdatacol) { 757 abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); 758 (void) memcpy(q, p, rm->rm_col[c].rc_size); 759 (void) memcpy(r, p, rm->rm_col[c].rc_size); 760 } else { 761 struct pqr_struct pqr = { p, q, r }; 762 (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, 763 vdev_raidz_pqr_func, &pqr); 764 } 765 766 if (c == rm->rm_firstdatacol) { 767 for (i = ccnt; i < pcnt; i++) { 768 p[i] = 0; 769 q[i] = 0; 770 r[i] = 0; 771 } 772 } else { 773 /* 774 * Treat short columns as though they are full of 0s. 775 * Note that there's therefore nothing needed for P. 776 */ 777 for (i = ccnt; i < pcnt; i++) { 778 VDEV_RAIDZ_64MUL_2(q[i], mask); 779 VDEV_RAIDZ_64MUL_4(r[i], mask); 780 } 781 } 782 } 783 } 784 785 /* 786 * Generate RAID parity in the first virtual columns according to the number of 787 * parity columns available. 788 */ 789 static void 790 vdev_raidz_generate_parity(raidz_map_t *rm) 791 { 792 switch (rm->rm_firstdatacol) { 793 case 1: 794 vdev_raidz_generate_parity_p(rm); 795 break; 796 case 2: 797 vdev_raidz_generate_parity_pq(rm); 798 break; 799 case 3: 800 vdev_raidz_generate_parity_pqr(rm); 801 break; 802 default: 803 cmn_err(CE_PANIC, "invalid RAID-Z configuration"); 804 } 805 } 806 807 /* ARGSUSED */ 808 static int 809 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) 810 { 811 uint64_t *dst = dbuf; 812 uint64_t *src = sbuf; 813 int cnt = size / sizeof (src[0]); 814 815 for (int i = 0; i < cnt; i++) { 816 dst[i] ^= src[i]; 817 } 818 819 return (0); 820 } 821 822 /* ARGSUSED */ 823 static int 824 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size, 825 void *private) 826 { 827 uint64_t *dst = dbuf; 828 uint64_t *src = sbuf; 829 uint64_t mask; 830 int cnt = size / sizeof (dst[0]); 831 832 for (int i = 0; i < cnt; i++, dst++, src++) { 833 VDEV_RAIDZ_64MUL_2(*dst, mask); 834 *dst ^= *src; 835 } 836 837 return (0); 838 } 839 840 /* ARGSUSED */ 841 static int 842 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private) 843 { 844 uint64_t *dst = buf; 845 uint64_t mask; 846 int cnt = size / sizeof (dst[0]); 847 848 for (int i = 0; i < cnt; i++, dst++) { 849 /* same operation as vdev_raidz_reconst_q_pre_func() on dst */ 850 VDEV_RAIDZ_64MUL_2(*dst, mask); 851 } 852 853 return (0); 854 } 855 856 struct reconst_q_struct { 857 uint64_t *q; 858 int exp; 859 }; 860 861 static int 862 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private) 863 { 864 struct reconst_q_struct *rq = private; 865 uint64_t *dst = buf; 866 int cnt = size / sizeof (dst[0]); 867 868 for (int i = 0; i < cnt; i++, dst++, rq->q++) { 869 *dst ^= *rq->q; 870 871 int j; 872 uint8_t *b; 873 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 874 *b = vdev_raidz_exp2(*b, rq->exp); 875 } 876 } 877 878 return (0); 879 } 880 881 struct reconst_pq_struct { 882 uint8_t *p; 883 uint8_t *q; 884 uint8_t *pxy; 885 uint8_t *qxy; 886 int aexp; 887 int bexp; 888 }; 889 890 static int 891 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private) 892 { 893 struct reconst_pq_struct *rpq = private; 894 uint8_t *xd = xbuf; 895 uint8_t *yd = ybuf; 896 897 for (int i = 0; i < size; 898 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) { 899 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 900 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 901 *yd = *rpq->p ^ *rpq->pxy ^ *xd; 902 } 903 904 return (0); 905 } 906 907 static int 908 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) 909 { 910 struct reconst_pq_struct *rpq = private; 911 uint8_t *xd = xbuf; 912 913 for (int i = 0; i < size; 914 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) { 915 /* same operation as vdev_raidz_reconst_pq_func() on xd */ 916 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 917 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 918 } 919 920 return (0); 921 } 922 923 static int 924 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) 925 { 926 int x = tgts[0]; 927 int c; 928 abd_t *dst, *src; 929 930 ASSERT(ntgts == 1); 931 ASSERT(x >= rm->rm_firstdatacol); 932 ASSERT(x < rm->rm_cols); 933 934 ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size); 935 ASSERT(rm->rm_col[x].rc_size > 0); 936 937 src = rm->rm_col[VDEV_RAIDZ_P].rc_abd; 938 dst = rm->rm_col[x].rc_abd; 939 940 abd_copy(dst, src, rm->rm_col[x].rc_size); 941 942 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 943 uint64_t size = MIN(rm->rm_col[x].rc_size, 944 rm->rm_col[c].rc_size); 945 946 src = rm->rm_col[c].rc_abd; 947 dst = rm->rm_col[x].rc_abd; 948 949 if (c == x) 950 continue; 951 952 (void) abd_iterate_func2(dst, src, 0, 0, size, 953 vdev_raidz_reconst_p_func, NULL); 954 } 955 956 return (1 << VDEV_RAIDZ_P); 957 } 958 959 static int 960 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) 961 { 962 int x = tgts[0]; 963 int c, exp; 964 abd_t *dst, *src; 965 966 ASSERT(ntgts == 1); 967 968 ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size); 969 970 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 971 uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size, 972 rm->rm_col[c].rc_size); 973 974 src = rm->rm_col[c].rc_abd; 975 dst = rm->rm_col[x].rc_abd; 976 977 if (c == rm->rm_firstdatacol) { 978 abd_copy(dst, src, size); 979 if (rm->rm_col[x].rc_size > size) 980 abd_zero_off(dst, size, 981 rm->rm_col[x].rc_size - size); 982 } else { 983 ASSERT3U(size, <=, rm->rm_col[x].rc_size); 984 (void) abd_iterate_func2(dst, src, 0, 0, size, 985 vdev_raidz_reconst_q_pre_func, NULL); 986 (void) abd_iterate_func(dst, 987 size, rm->rm_col[x].rc_size - size, 988 vdev_raidz_reconst_q_pre_tail_func, NULL); 989 } 990 } 991 992 src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; 993 dst = rm->rm_col[x].rc_abd; 994 exp = 255 - (rm->rm_cols - 1 - x); 995 996 struct reconst_q_struct rq = { abd_to_buf(src), exp }; 997 (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size, 998 vdev_raidz_reconst_q_post_func, &rq); 999 1000 return (1 << VDEV_RAIDZ_Q); 1001 } 1002 1003 static int 1004 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) 1005 { 1006 uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; 1007 abd_t *pdata, *qdata; 1008 uint64_t xsize, ysize; 1009 int x = tgts[0]; 1010 int y = tgts[1]; 1011 abd_t *xd, *yd; 1012 1013 ASSERT(ntgts == 2); 1014 ASSERT(x < y); 1015 ASSERT(x >= rm->rm_firstdatacol); 1016 ASSERT(y < rm->rm_cols); 1017 1018 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); 1019 1020 /* 1021 * Move the parity data aside -- we're going to compute parity as 1022 * though columns x and y were full of zeros -- Pxy and Qxy. We want to 1023 * reuse the parity generation mechanism without trashing the actual 1024 * parity so we make those columns appear to be full of zeros by 1025 * setting their lengths to zero. 1026 */ 1027 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd; 1028 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; 1029 xsize = rm->rm_col[x].rc_size; 1030 ysize = rm->rm_col[y].rc_size; 1031 1032 rm->rm_col[VDEV_RAIDZ_P].rc_abd = 1033 abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE); 1034 rm->rm_col[VDEV_RAIDZ_Q].rc_abd = 1035 abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); 1036 rm->rm_col[x].rc_size = 0; 1037 rm->rm_col[y].rc_size = 0; 1038 1039 vdev_raidz_generate_parity_pq(rm); 1040 1041 rm->rm_col[x].rc_size = xsize; 1042 rm->rm_col[y].rc_size = ysize; 1043 1044 p = abd_to_buf(pdata); 1045 q = abd_to_buf(qdata); 1046 pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); 1047 qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); 1048 xd = rm->rm_col[x].rc_abd; 1049 yd = rm->rm_col[y].rc_abd; 1050 1051 /* 1052 * We now have: 1053 * Pxy = P + D_x + D_y 1054 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 1055 * 1056 * We can then solve for D_x: 1057 * D_x = A * (P + Pxy) + B * (Q + Qxy) 1058 * where 1059 * A = 2^(x - y) * (2^(x - y) + 1)^-1 1060 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 1061 * 1062 * With D_x in hand, we can easily solve for D_y: 1063 * D_y = P + Pxy + D_x 1064 */ 1065 1066 a = vdev_raidz_pow2[255 + x - y]; 1067 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; 1068 tmp = 255 - vdev_raidz_log2[a ^ 1]; 1069 1070 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 1071 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 1072 1073 ASSERT3U(xsize, >=, ysize); 1074 struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp }; 1075 (void) abd_iterate_func2(xd, yd, 0, 0, ysize, 1076 vdev_raidz_reconst_pq_func, &rpq); 1077 (void) abd_iterate_func(xd, ysize, xsize - ysize, 1078 vdev_raidz_reconst_pq_tail_func, &rpq); 1079 1080 abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd); 1081 abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); 1082 1083 /* 1084 * Restore the saved parity data. 1085 */ 1086 rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata; 1087 rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata; 1088 1089 return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); 1090 } 1091 1092 /* BEGIN CSTYLED */ 1093 /* 1094 * In the general case of reconstruction, we must solve the system of linear 1095 * equations defined by the coeffecients used to generate parity as well as 1096 * the contents of the data and parity disks. This can be expressed with 1097 * vectors for the original data (D) and the actual data (d) and parity (p) 1098 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 1099 * 1100 * __ __ __ __ 1101 * | | __ __ | p_0 | 1102 * | V | | D_0 | | p_m-1 | 1103 * | | x | : | = | d_0 | 1104 * | I | | D_n-1 | | : | 1105 * | | ~~ ~~ | d_n-1 | 1106 * ~~ ~~ ~~ ~~ 1107 * 1108 * I is simply a square identity matrix of size n, and V is a vandermonde 1109 * matrix defined by the coeffecients we chose for the various parity columns 1110 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 1111 * computation as well as linear separability. 1112 * 1113 * __ __ __ __ 1114 * | 1 .. 1 1 1 | | p_0 | 1115 * | 2^n-1 .. 4 2 1 | __ __ | : | 1116 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 1117 * | 1 .. 0 0 0 | | D_1 | | d_0 | 1118 * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 1119 * | : : : : | | : | | d_2 | 1120 * | 0 .. 1 0 0 | | D_n-1 | | : | 1121 * | 0 .. 0 1 0 | ~~ ~~ | : | 1122 * | 0 .. 0 0 1 | | d_n-1 | 1123 * ~~ ~~ ~~ ~~ 1124 * 1125 * Note that I, V, d, and p are known. To compute D, we must invert the 1126 * matrix and use the known data and parity values to reconstruct the unknown 1127 * data values. We begin by removing the rows in V|I and d|p that correspond 1128 * to failed or missing columns; we then make V|I square (n x n) and d|p 1129 * sized n by removing rows corresponding to unused parity from the bottom up 1130 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 1131 * using Gauss-Jordan elimination. In the example below we use m=3 parity 1132 * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 1133 * __ __ 1134 * | 1 1 1 1 1 1 1 1 | 1135 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 1136 * | 19 205 116 29 64 16 4 1 | / / 1137 * | 1 0 0 0 0 0 0 0 | / / 1138 * | 0 1 0 0 0 0 0 0 | <--' / 1139 * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 1140 * | 0 0 0 1 0 0 0 0 | 1141 * | 0 0 0 0 1 0 0 0 | 1142 * | 0 0 0 0 0 1 0 0 | 1143 * | 0 0 0 0 0 0 1 0 | 1144 * | 0 0 0 0 0 0 0 1 | 1145 * ~~ ~~ 1146 * __ __ 1147 * | 1 1 1 1 1 1 1 1 | 1148 * | 19 205 116 29 64 16 4 1 | 1149 * | 1 0 0 0 0 0 0 0 | 1150 * (V|I)' = | 0 0 0 1 0 0 0 0 | 1151 * | 0 0 0 0 1 0 0 0 | 1152 * | 0 0 0 0 0 1 0 0 | 1153 * | 0 0 0 0 0 0 1 0 | 1154 * | 0 0 0 0 0 0 0 1 | 1155 * ~~ ~~ 1156 * 1157 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 1158 * have carefully chosen the seed values 1, 2, and 4 to ensure that this 1159 * matrix is not singular. 1160 * __ __ 1161 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1162 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1163 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1164 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1165 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1166 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1167 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1168 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1169 * ~~ ~~ 1170 * __ __ 1171 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1172 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1173 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1174 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1175 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1176 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1177 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1178 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1179 * ~~ ~~ 1180 * __ __ 1181 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1182 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1183 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 1184 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1185 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1186 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1187 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1188 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1189 * ~~ ~~ 1190 * __ __ 1191 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1192 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1193 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 1194 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1195 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1196 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1197 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1198 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1199 * ~~ ~~ 1200 * __ __ 1201 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1202 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1203 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1204 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1205 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1206 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1207 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1208 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1209 * ~~ ~~ 1210 * __ __ 1211 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1212 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 1213 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1214 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1215 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1216 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1217 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1218 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1219 * ~~ ~~ 1220 * __ __ 1221 * | 0 0 1 0 0 0 0 0 | 1222 * | 167 100 5 41 159 169 217 208 | 1223 * | 166 100 4 40 158 168 216 209 | 1224 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 1225 * | 0 0 0 0 1 0 0 0 | 1226 * | 0 0 0 0 0 1 0 0 | 1227 * | 0 0 0 0 0 0 1 0 | 1228 * | 0 0 0 0 0 0 0 1 | 1229 * ~~ ~~ 1230 * 1231 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 1232 * of the missing data. 1233 * 1234 * As is apparent from the example above, the only non-trivial rows in the 1235 * inverse matrix correspond to the data disks that we're trying to 1236 * reconstruct. Indeed, those are the only rows we need as the others would 1237 * only be useful for reconstructing data known or assumed to be valid. For 1238 * that reason, we only build the coefficients in the rows that correspond to 1239 * targeted columns. 1240 */ 1241 /* END CSTYLED */ 1242 1243 static void 1244 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, 1245 uint8_t **rows) 1246 { 1247 int i, j; 1248 int pow; 1249 1250 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); 1251 1252 /* 1253 * Fill in the missing rows of interest. 1254 */ 1255 for (i = 0; i < nmap; i++) { 1256 ASSERT3S(0, <=, map[i]); 1257 ASSERT3S(map[i], <=, 2); 1258 1259 pow = map[i] * n; 1260 if (pow > 255) 1261 pow -= 255; 1262 ASSERT(pow <= 255); 1263 1264 for (j = 0; j < n; j++) { 1265 pow -= map[i]; 1266 if (pow < 0) 1267 pow += 255; 1268 rows[i][j] = vdev_raidz_pow2[pow]; 1269 } 1270 } 1271 } 1272 1273 static void 1274 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, 1275 uint8_t **rows, uint8_t **invrows, const uint8_t *used) 1276 { 1277 int i, j, ii, jj; 1278 uint8_t log; 1279 1280 /* 1281 * Assert that the first nmissing entries from the array of used 1282 * columns correspond to parity columns and that subsequent entries 1283 * correspond to data columns. 1284 */ 1285 for (i = 0; i < nmissing; i++) { 1286 ASSERT3S(used[i], <, rm->rm_firstdatacol); 1287 } 1288 for (; i < n; i++) { 1289 ASSERT3S(used[i], >=, rm->rm_firstdatacol); 1290 } 1291 1292 /* 1293 * First initialize the storage where we'll compute the inverse rows. 1294 */ 1295 for (i = 0; i < nmissing; i++) { 1296 for (j = 0; j < n; j++) { 1297 invrows[i][j] = (i == j) ? 1 : 0; 1298 } 1299 } 1300 1301 /* 1302 * Subtract all trivial rows from the rows of consequence. 1303 */ 1304 for (i = 0; i < nmissing; i++) { 1305 for (j = nmissing; j < n; j++) { 1306 ASSERT3U(used[j], >=, rm->rm_firstdatacol); 1307 jj = used[j] - rm->rm_firstdatacol; 1308 ASSERT3S(jj, <, n); 1309 invrows[i][j] = rows[i][jj]; 1310 rows[i][jj] = 0; 1311 } 1312 } 1313 1314 /* 1315 * For each of the rows of interest, we must normalize it and subtract 1316 * a multiple of it from the other rows. 1317 */ 1318 for (i = 0; i < nmissing; i++) { 1319 for (j = 0; j < missing[i]; j++) { 1320 ASSERT0(rows[i][j]); 1321 } 1322 ASSERT3U(rows[i][missing[i]], !=, 0); 1323 1324 /* 1325 * Compute the inverse of the first element and multiply each 1326 * element in the row by that value. 1327 */ 1328 log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 1329 1330 for (j = 0; j < n; j++) { 1331 rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 1332 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 1333 } 1334 1335 for (ii = 0; ii < nmissing; ii++) { 1336 if (i == ii) 1337 continue; 1338 1339 ASSERT3U(rows[ii][missing[i]], !=, 0); 1340 1341 log = vdev_raidz_log2[rows[ii][missing[i]]]; 1342 1343 for (j = 0; j < n; j++) { 1344 rows[ii][j] ^= 1345 vdev_raidz_exp2(rows[i][j], log); 1346 invrows[ii][j] ^= 1347 vdev_raidz_exp2(invrows[i][j], log); 1348 } 1349 } 1350 } 1351 1352 /* 1353 * Verify that the data that is left in the rows are properly part of 1354 * an identity matrix. 1355 */ 1356 for (i = 0; i < nmissing; i++) { 1357 for (j = 0; j < n; j++) { 1358 if (j == missing[i]) { 1359 ASSERT3U(rows[i][j], ==, 1); 1360 } else { 1361 ASSERT0(rows[i][j]); 1362 } 1363 } 1364 } 1365 } 1366 1367 static void 1368 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, 1369 int *missing, uint8_t **invrows, const uint8_t *used) 1370 { 1371 int i, j, x, cc, c; 1372 uint8_t *src; 1373 uint64_t ccount; 1374 uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; 1375 uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; 1376 uint8_t log = 0; 1377 uint8_t val; 1378 int ll; 1379 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 1380 uint8_t *p, *pp; 1381 size_t psize; 1382 1383 psize = sizeof (invlog[0][0]) * n * nmissing; 1384 p = kmem_alloc(psize, KM_SLEEP); 1385 1386 for (pp = p, i = 0; i < nmissing; i++) { 1387 invlog[i] = pp; 1388 pp += n; 1389 } 1390 1391 for (i = 0; i < nmissing; i++) { 1392 for (j = 0; j < n; j++) { 1393 ASSERT3U(invrows[i][j], !=, 0); 1394 invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 1395 } 1396 } 1397 1398 for (i = 0; i < n; i++) { 1399 c = used[i]; 1400 ASSERT3U(c, <, rm->rm_cols); 1401 1402 src = abd_to_buf(rm->rm_col[c].rc_abd); 1403 ccount = rm->rm_col[c].rc_size; 1404 for (j = 0; j < nmissing; j++) { 1405 cc = missing[j] + rm->rm_firstdatacol; 1406 ASSERT3U(cc, >=, rm->rm_firstdatacol); 1407 ASSERT3U(cc, <, rm->rm_cols); 1408 ASSERT3U(cc, !=, c); 1409 1410 dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd); 1411 dcount[j] = rm->rm_col[cc].rc_size; 1412 } 1413 1414 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); 1415 1416 for (x = 0; x < ccount; x++, src++) { 1417 if (*src != 0) 1418 log = vdev_raidz_log2[*src]; 1419 1420 for (cc = 0; cc < nmissing; cc++) { 1421 if (x >= dcount[cc]) 1422 continue; 1423 1424 if (*src == 0) { 1425 val = 0; 1426 } else { 1427 if ((ll = log + invlog[cc][i]) >= 255) 1428 ll -= 255; 1429 val = vdev_raidz_pow2[ll]; 1430 } 1431 1432 if (i == 0) 1433 dst[cc][x] = val; 1434 else 1435 dst[cc][x] ^= val; 1436 } 1437 } 1438 } 1439 1440 kmem_free(p, psize); 1441 } 1442 1443 static int 1444 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) 1445 { 1446 int n, i, c, t, tt; 1447 int nmissing_rows; 1448 int missing_rows[VDEV_RAIDZ_MAXPARITY]; 1449 int parity_map[VDEV_RAIDZ_MAXPARITY]; 1450 1451 uint8_t *p, *pp; 1452 size_t psize; 1453 1454 uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 1455 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 1456 uint8_t *used; 1457 1458 abd_t **bufs = NULL; 1459 1460 int code = 0; 1461 1462 /* 1463 * Matrix reconstruction can't use scatter ABDs yet, so we allocate 1464 * temporary linear ABDs. 1465 */ 1466 if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) { 1467 bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE); 1468 1469 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1470 raidz_col_t *col = &rm->rm_col[c]; 1471 1472 bufs[c] = col->rc_abd; 1473 col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE); 1474 abd_copy(col->rc_abd, bufs[c], col->rc_size); 1475 } 1476 } 1477 1478 n = rm->rm_cols - rm->rm_firstdatacol; 1479 1480 /* 1481 * Figure out which data columns are missing. 1482 */ 1483 nmissing_rows = 0; 1484 for (t = 0; t < ntgts; t++) { 1485 if (tgts[t] >= rm->rm_firstdatacol) { 1486 missing_rows[nmissing_rows++] = 1487 tgts[t] - rm->rm_firstdatacol; 1488 } 1489 } 1490 1491 /* 1492 * Figure out which parity columns to use to help generate the missing 1493 * data columns. 1494 */ 1495 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 1496 ASSERT(tt < ntgts); 1497 ASSERT(c < rm->rm_firstdatacol); 1498 1499 /* 1500 * Skip any targeted parity columns. 1501 */ 1502 if (c == tgts[tt]) { 1503 tt++; 1504 continue; 1505 } 1506 1507 code |= 1 << c; 1508 1509 parity_map[i] = c; 1510 i++; 1511 } 1512 1513 ASSERT(code != 0); 1514 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); 1515 1516 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1517 nmissing_rows * n + sizeof (used[0]) * n; 1518 p = kmem_alloc(psize, KM_SLEEP); 1519 1520 for (pp = p, i = 0; i < nmissing_rows; i++) { 1521 rows[i] = pp; 1522 pp += n; 1523 invrows[i] = pp; 1524 pp += n; 1525 } 1526 used = pp; 1527 1528 for (i = 0; i < nmissing_rows; i++) { 1529 used[i] = parity_map[i]; 1530 } 1531 1532 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1533 if (tt < nmissing_rows && 1534 c == missing_rows[tt] + rm->rm_firstdatacol) { 1535 tt++; 1536 continue; 1537 } 1538 1539 ASSERT3S(i, <, n); 1540 used[i] = c; 1541 i++; 1542 } 1543 1544 /* 1545 * Initialize the interesting rows of the matrix. 1546 */ 1547 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); 1548 1549 /* 1550 * Invert the matrix. 1551 */ 1552 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, 1553 invrows, used); 1554 1555 /* 1556 * Reconstruct the missing data using the generated matrix. 1557 */ 1558 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, 1559 invrows, used); 1560 1561 kmem_free(p, psize); 1562 1563 /* 1564 * copy back from temporary linear abds and free them 1565 */ 1566 if (bufs) { 1567 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1568 raidz_col_t *col = &rm->rm_col[c]; 1569 1570 abd_copy(bufs[c], col->rc_abd, col->rc_size); 1571 abd_free(col->rc_abd); 1572 col->rc_abd = bufs[c]; 1573 } 1574 kmem_free(bufs, rm->rm_cols * sizeof (abd_t *)); 1575 } 1576 1577 return (code); 1578 } 1579 1580 static int 1581 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) 1582 { 1583 int tgts[VDEV_RAIDZ_MAXPARITY], *dt; 1584 int ntgts; 1585 int i, c; 1586 int code; 1587 int nbadparity, nbaddata; 1588 int parity_valid[VDEV_RAIDZ_MAXPARITY]; 1589 1590 /* 1591 * The tgts list must already be sorted. 1592 */ 1593 for (i = 1; i < nt; i++) { 1594 ASSERT(t[i] > t[i - 1]); 1595 } 1596 1597 nbadparity = rm->rm_firstdatacol; 1598 nbaddata = rm->rm_cols - nbadparity; 1599 ntgts = 0; 1600 for (i = 0, c = 0; c < rm->rm_cols; c++) { 1601 if (c < rm->rm_firstdatacol) 1602 parity_valid[c] = B_FALSE; 1603 1604 if (i < nt && c == t[i]) { 1605 tgts[ntgts++] = c; 1606 i++; 1607 } else if (rm->rm_col[c].rc_error != 0) { 1608 tgts[ntgts++] = c; 1609 } else if (c >= rm->rm_firstdatacol) { 1610 nbaddata--; 1611 } else { 1612 parity_valid[c] = B_TRUE; 1613 nbadparity--; 1614 } 1615 } 1616 1617 ASSERT(ntgts >= nt); 1618 ASSERT(nbaddata >= 0); 1619 ASSERT(nbaddata + nbadparity == ntgts); 1620 1621 dt = &tgts[nbadparity]; 1622 1623 /* 1624 * See if we can use any of our optimized reconstruction routines. 1625 */ 1626 if (!vdev_raidz_default_to_general) { 1627 switch (nbaddata) { 1628 case 1: 1629 if (parity_valid[VDEV_RAIDZ_P]) 1630 return (vdev_raidz_reconstruct_p(rm, dt, 1)); 1631 1632 ASSERT(rm->rm_firstdatacol > 1); 1633 1634 if (parity_valid[VDEV_RAIDZ_Q]) 1635 return (vdev_raidz_reconstruct_q(rm, dt, 1)); 1636 1637 ASSERT(rm->rm_firstdatacol > 2); 1638 break; 1639 1640 case 2: 1641 ASSERT(rm->rm_firstdatacol > 1); 1642 1643 if (parity_valid[VDEV_RAIDZ_P] && 1644 parity_valid[VDEV_RAIDZ_Q]) 1645 return (vdev_raidz_reconstruct_pq(rm, dt, 2)); 1646 1647 ASSERT(rm->rm_firstdatacol > 2); 1648 1649 break; 1650 } 1651 } 1652 1653 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); 1654 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); 1655 ASSERT(code > 0); 1656 return (code); 1657 } 1658 1659 static int 1660 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, 1661 uint64_t *ashift) 1662 { 1663 vdev_t *cvd; 1664 uint64_t nparity = vd->vdev_nparity; 1665 int c; 1666 int lasterror = 0; 1667 int numerrors = 0; 1668 1669 ASSERT(nparity > 0); 1670 1671 if (nparity > VDEV_RAIDZ_MAXPARITY || 1672 vd->vdev_children < nparity + 1) { 1673 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 1674 return (SET_ERROR(EINVAL)); 1675 } 1676 1677 vdev_open_children(vd); 1678 1679 for (c = 0; c < vd->vdev_children; c++) { 1680 cvd = vd->vdev_child[c]; 1681 1682 if (cvd->vdev_open_error != 0) { 1683 lasterror = cvd->vdev_open_error; 1684 numerrors++; 1685 continue; 1686 } 1687 1688 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 1689 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; 1690 *ashift = MAX(*ashift, cvd->vdev_ashift); 1691 } 1692 1693 *asize *= vd->vdev_children; 1694 *max_asize *= vd->vdev_children; 1695 1696 if (numerrors > nparity) { 1697 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 1698 return (lasterror); 1699 } 1700 1701 return (0); 1702 } 1703 1704 static void 1705 vdev_raidz_close(vdev_t *vd) 1706 { 1707 int c; 1708 1709 for (c = 0; c < vd->vdev_children; c++) 1710 vdev_close(vd->vdev_child[c]); 1711 } 1712 1713 /* 1714 * Handle a read or write I/O to a RAID-Z dump device. 1715 * 1716 * The dump device is in a unique situation compared to other ZFS datasets: 1717 * writing to this device should be as simple and fast as possible. In 1718 * addition, durability matters much less since the dump will be extracted 1719 * once the machine reboots. For that reason, this function eschews parity for 1720 * performance and simplicity. The dump device uses the checksum setting 1721 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this 1722 * dataset. 1723 * 1724 * Blocks of size 128 KB have been preallocated for this volume. I/Os less than 1725 * 128 KB will not fill an entire block; in addition, they may not be properly 1726 * aligned. In that case, this function uses the preallocated 128 KB block and 1727 * omits reading or writing any "empty" portions of that block, as opposed to 1728 * allocating a fresh appropriately-sized block. 1729 * 1730 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs: 1731 * 1732 * vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB) 1733 * 1734 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be 1735 * allocated which spans all five child vdevs. 8 KB of data would be written to 1736 * each of four vdevs, with the fifth containing the parity bits. 1737 * 1738 * parity data data data data 1739 * | PP | XX | XX | XX | XX | 1740 * ^ ^ ^ ^ ^ 1741 * | | | | | 1742 * 8 KB parity ------8 KB data blocks------ 1743 * 1744 * However, when writing to the dump device, the behavior is different: 1745 * 1746 * vdev_raidz_physio(data, size: 32 KB, offset: 64 KB) 1747 * 1748 * Unlike the normal RAID-Z case in which the block is allocated based on the 1749 * I/O size, reads and writes here always use a 128 KB logical I/O size. If the 1750 * I/O size is less than 128 KB, only the actual portions of data are written. 1751 * In this example the data is written to the third data vdev since that vdev 1752 * contains the offset [64 KB, 96 KB). 1753 * 1754 * parity data data data data 1755 * | | | | XX | | 1756 * ^ 1757 * | 1758 * 32 KB data block 1759 * 1760 * As a result, an individual I/O may not span all child vdevs; moreover, a 1761 * small I/O may only operate on a single child vdev. 1762 * 1763 * Note that since there are no parity bits calculated or written, this format 1764 * remains the same no matter how many parity bits are used in a normal RAID-Z 1765 * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above 1766 * would look like: 1767 * 1768 * parity parity parity data data data data 1769 * | | | | | | XX | | 1770 * ^ 1771 * | 1772 * 32 KB data block 1773 */ 1774 int 1775 vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size, 1776 uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump) 1777 { 1778 vdev_t *tvd = vd->vdev_top; 1779 vdev_t *cvd; 1780 raidz_map_t *rm; 1781 raidz_col_t *rc; 1782 int c, err = 0; 1783 1784 uint64_t start, end, colstart, colend; 1785 uint64_t coloffset, colsize, colskip; 1786 1787 int flags = doread ? B_READ : B_WRITE; 1788 1789 #ifdef _KERNEL 1790 1791 /* 1792 * Don't write past the end of the block 1793 */ 1794 VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE); 1795 1796 start = offset; 1797 end = start + size; 1798 1799 /* 1800 * Allocate a RAID-Z map for this block. Note that this block starts 1801 * from the "original" offset, this is, the offset of the extent which 1802 * contains the requisite offset of the data being read or written. 1803 * 1804 * Even if this I/O operation doesn't span the full block size, let's 1805 * treat the on-disk format as if the only blocks are the complete 128 1806 * KB size. 1807 */ 1808 abd_t *abd = abd_get_from_buf(data - (offset - origoffset), 1809 SPA_OLD_MAXBLOCKSIZE); 1810 rm = vdev_raidz_map_alloc(abd, 1811 SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift, 1812 vd->vdev_children, vd->vdev_nparity); 1813 1814 coloffset = origoffset; 1815 1816 for (c = rm->rm_firstdatacol; c < rm->rm_cols; 1817 c++, coloffset += rc->rc_size) { 1818 rc = &rm->rm_col[c]; 1819 cvd = vd->vdev_child[rc->rc_devidx]; 1820 1821 /* 1822 * Find the start and end of this column in the RAID-Z map, 1823 * keeping in mind that the stated size and offset of the 1824 * operation may not fill the entire column for this vdev. 1825 * 1826 * If any portion of the data spans this column, issue the 1827 * appropriate operation to the vdev. 1828 */ 1829 if (coloffset + rc->rc_size <= start) 1830 continue; 1831 if (coloffset >= end) 1832 continue; 1833 1834 colstart = MAX(coloffset, start); 1835 colend = MIN(end, coloffset + rc->rc_size); 1836 colsize = colend - colstart; 1837 colskip = colstart - coloffset; 1838 1839 VERIFY3U(colsize, <=, rc->rc_size); 1840 VERIFY3U(colskip, <=, rc->rc_size); 1841 1842 /* 1843 * Note that the child vdev will have a vdev label at the start 1844 * of its range of offsets, hence the need for 1845 * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another 1846 * example of why this calculation is needed. 1847 */ 1848 if ((err = vdev_disk_physio(cvd, 1849 ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize, 1850 VDEV_LABEL_OFFSET(rc->rc_offset) + colskip, 1851 flags, isdump)) != 0) 1852 break; 1853 } 1854 1855 vdev_raidz_map_free(rm); 1856 abd_put(abd); 1857 #endif /* KERNEL */ 1858 1859 return (err); 1860 } 1861 1862 static uint64_t 1863 vdev_raidz_asize(vdev_t *vd, uint64_t psize) 1864 { 1865 uint64_t asize; 1866 uint64_t ashift = vd->vdev_top->vdev_ashift; 1867 uint64_t cols = vd->vdev_children; 1868 uint64_t nparity = vd->vdev_nparity; 1869 1870 asize = ((psize - 1) >> ashift) + 1; 1871 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 1872 asize = roundup(asize, nparity + 1) << ashift; 1873 1874 return (asize); 1875 } 1876 1877 static void 1878 vdev_raidz_child_done(zio_t *zio) 1879 { 1880 raidz_col_t *rc = zio->io_private; 1881 1882 rc->rc_error = zio->io_error; 1883 rc->rc_tried = 1; 1884 rc->rc_skipped = 0; 1885 } 1886 1887 /* 1888 * Start an IO operation on a RAIDZ VDev 1889 * 1890 * Outline: 1891 * - For write operations: 1892 * 1. Generate the parity data 1893 * 2. Create child zio write operations to each column's vdev, for both 1894 * data and parity. 1895 * 3. If the column skips any sectors for padding, create optional dummy 1896 * write zio children for those areas to improve aggregation continuity. 1897 * - For read operations: 1898 * 1. Create child zio read operations to each data column's vdev to read 1899 * the range of data required for zio. 1900 * 2. If this is a scrub or resilver operation, or if any of the data 1901 * vdevs have had errors, then create zio read operations to the parity 1902 * columns' VDevs as well. 1903 */ 1904 static void 1905 vdev_raidz_io_start(zio_t *zio) 1906 { 1907 vdev_t *vd = zio->io_vd; 1908 vdev_t *tvd = vd->vdev_top; 1909 vdev_t *cvd; 1910 raidz_map_t *rm; 1911 raidz_col_t *rc; 1912 int c, i; 1913 1914 rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset, 1915 tvd->vdev_ashift, vd->vdev_children, 1916 vd->vdev_nparity); 1917 1918 zio->io_vsd = rm; 1919 zio->io_vsd_ops = &vdev_raidz_vsd_ops; 1920 1921 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); 1922 1923 if (zio->io_type == ZIO_TYPE_WRITE) { 1924 vdev_raidz_generate_parity(rm); 1925 1926 for (c = 0; c < rm->rm_cols; c++) { 1927 rc = &rm->rm_col[c]; 1928 cvd = vd->vdev_child[rc->rc_devidx]; 1929 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1930 rc->rc_offset, rc->rc_abd, rc->rc_size, 1931 zio->io_type, zio->io_priority, 0, 1932 vdev_raidz_child_done, rc)); 1933 } 1934 1935 /* 1936 * Generate optional I/Os for any skipped sectors to improve 1937 * aggregation contiguity. 1938 */ 1939 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { 1940 ASSERT(c <= rm->rm_scols); 1941 if (c == rm->rm_scols) 1942 c = 0; 1943 rc = &rm->rm_col[c]; 1944 cvd = vd->vdev_child[rc->rc_devidx]; 1945 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1946 rc->rc_offset + rc->rc_size, NULL, 1947 1 << tvd->vdev_ashift, 1948 zio->io_type, zio->io_priority, 1949 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); 1950 } 1951 1952 zio_execute(zio); 1953 return; 1954 } 1955 1956 ASSERT(zio->io_type == ZIO_TYPE_READ); 1957 1958 /* 1959 * Iterate over the columns in reverse order so that we hit the parity 1960 * last -- any errors along the way will force us to read the parity. 1961 */ 1962 for (c = rm->rm_cols - 1; c >= 0; c--) { 1963 rc = &rm->rm_col[c]; 1964 cvd = vd->vdev_child[rc->rc_devidx]; 1965 if (!vdev_readable(cvd)) { 1966 if (c >= rm->rm_firstdatacol) 1967 rm->rm_missingdata++; 1968 else 1969 rm->rm_missingparity++; 1970 rc->rc_error = SET_ERROR(ENXIO); 1971 rc->rc_tried = 1; /* don't even try */ 1972 rc->rc_skipped = 1; 1973 continue; 1974 } 1975 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 1976 if (c >= rm->rm_firstdatacol) 1977 rm->rm_missingdata++; 1978 else 1979 rm->rm_missingparity++; 1980 rc->rc_error = SET_ERROR(ESTALE); 1981 rc->rc_skipped = 1; 1982 continue; 1983 } 1984 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || 1985 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { 1986 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1987 rc->rc_offset, rc->rc_abd, rc->rc_size, 1988 zio->io_type, zio->io_priority, 0, 1989 vdev_raidz_child_done, rc)); 1990 } 1991 } 1992 1993 zio_execute(zio); 1994 } 1995 1996 1997 /* 1998 * Report a checksum error for a child of a RAID-Z device. 1999 */ 2000 static void 2001 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) 2002 { 2003 void *buf; 2004 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 2005 2006 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2007 zio_bad_cksum_t zbc; 2008 raidz_map_t *rm = zio->io_vsd; 2009 2010 mutex_enter(&vd->vdev_stat_lock); 2011 vd->vdev_stat.vs_checksum_errors++; 2012 mutex_exit(&vd->vdev_stat_lock); 2013 2014 zbc.zbc_has_cksum = 0; 2015 zbc.zbc_injected = rm->rm_ecksuminjected; 2016 2017 buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size); 2018 zfs_ereport_post_checksum(zio->io_spa, vd, zio, 2019 rc->rc_offset, rc->rc_size, buf, bad_data, 2020 &zbc); 2021 abd_return_buf(rc->rc_abd, buf, rc->rc_size); 2022 } 2023 } 2024 2025 /* 2026 * We keep track of whether or not there were any injected errors, so that 2027 * any ereports we generate can note it. 2028 */ 2029 static int 2030 raidz_checksum_verify(zio_t *zio) 2031 { 2032 zio_bad_cksum_t zbc; 2033 raidz_map_t *rm = zio->io_vsd; 2034 2035 int ret = zio_checksum_error(zio, &zbc); 2036 if (ret != 0 && zbc.zbc_injected != 0) 2037 rm->rm_ecksuminjected = 1; 2038 2039 return (ret); 2040 } 2041 2042 /* 2043 * Generate the parity from the data columns. If we tried and were able to 2044 * read the parity without error, verify that the generated parity matches the 2045 * data we read. If it doesn't, we fire off a checksum error. Return the 2046 * number such failures. 2047 */ 2048 static int 2049 raidz_parity_verify(zio_t *zio, raidz_map_t *rm) 2050 { 2051 void *orig[VDEV_RAIDZ_MAXPARITY]; 2052 int c, ret = 0; 2053 raidz_col_t *rc; 2054 2055 blkptr_t *bp = zio->io_bp; 2056 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : 2057 (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); 2058 2059 if (checksum == ZIO_CHECKSUM_NOPARITY) 2060 return (ret); 2061 2062 for (c = 0; c < rm->rm_firstdatacol; c++) { 2063 rc = &rm->rm_col[c]; 2064 if (!rc->rc_tried || rc->rc_error != 0) 2065 continue; 2066 orig[c] = zio_buf_alloc(rc->rc_size); 2067 abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size); 2068 } 2069 2070 vdev_raidz_generate_parity(rm); 2071 2072 for (c = 0; c < rm->rm_firstdatacol; c++) { 2073 rc = &rm->rm_col[c]; 2074 if (!rc->rc_tried || rc->rc_error != 0) 2075 continue; 2076 if (abd_cmp_buf(rc->rc_abd, orig[c], rc->rc_size) != 0) { 2077 raidz_checksum_error(zio, rc, orig[c]); 2078 rc->rc_error = SET_ERROR(ECKSUM); 2079 ret++; 2080 } 2081 zio_buf_free(orig[c], rc->rc_size); 2082 } 2083 2084 return (ret); 2085 } 2086 2087 /* 2088 * Keep statistics on all the ways that we used parity to correct data. 2089 */ 2090 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY]; 2091 2092 static int 2093 vdev_raidz_worst_error(raidz_map_t *rm) 2094 { 2095 int error = 0; 2096 2097 for (int c = 0; c < rm->rm_cols; c++) 2098 error = zio_worst_error(error, rm->rm_col[c].rc_error); 2099 2100 return (error); 2101 } 2102 2103 /* 2104 * Iterate over all combinations of bad data and attempt a reconstruction. 2105 * Note that the algorithm below is non-optimal because it doesn't take into 2106 * account how reconstruction is actually performed. For example, with 2107 * triple-parity RAID-Z the reconstruction procedure is the same if column 4 2108 * is targeted as invalid as if columns 1 and 4 are targeted since in both 2109 * cases we'd only use parity information in column 0. 2110 */ 2111 static int 2112 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) 2113 { 2114 raidz_map_t *rm = zio->io_vsd; 2115 raidz_col_t *rc; 2116 void *orig[VDEV_RAIDZ_MAXPARITY]; 2117 int tstore[VDEV_RAIDZ_MAXPARITY + 2]; 2118 int *tgts = &tstore[1]; 2119 int current, next, i, c, n; 2120 int code, ret = 0; 2121 2122 ASSERT(total_errors < rm->rm_firstdatacol); 2123 2124 /* 2125 * This simplifies one edge condition. 2126 */ 2127 tgts[-1] = -1; 2128 2129 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { 2130 /* 2131 * Initialize the targets array by finding the first n columns 2132 * that contain no error. 2133 * 2134 * If there were no data errors, we need to ensure that we're 2135 * always explicitly attempting to reconstruct at least one 2136 * data column. To do this, we simply push the highest target 2137 * up into the data columns. 2138 */ 2139 for (c = 0, i = 0; i < n; i++) { 2140 if (i == n - 1 && data_errors == 0 && 2141 c < rm->rm_firstdatacol) { 2142 c = rm->rm_firstdatacol; 2143 } 2144 2145 while (rm->rm_col[c].rc_error != 0) { 2146 c++; 2147 ASSERT3S(c, <, rm->rm_cols); 2148 } 2149 2150 tgts[i] = c++; 2151 } 2152 2153 /* 2154 * Setting tgts[n] simplifies the other edge condition. 2155 */ 2156 tgts[n] = rm->rm_cols; 2157 2158 /* 2159 * These buffers were allocated in previous iterations. 2160 */ 2161 for (i = 0; i < n - 1; i++) { 2162 ASSERT(orig[i] != NULL); 2163 } 2164 2165 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size); 2166 2167 current = 0; 2168 next = tgts[current]; 2169 2170 while (current != n) { 2171 tgts[current] = next; 2172 current = 0; 2173 2174 /* 2175 * Save off the original data that we're going to 2176 * attempt to reconstruct. 2177 */ 2178 for (i = 0; i < n; i++) { 2179 ASSERT(orig[i] != NULL); 2180 c = tgts[i]; 2181 ASSERT3S(c, >=, 0); 2182 ASSERT3S(c, <, rm->rm_cols); 2183 rc = &rm->rm_col[c]; 2184 abd_copy_to_buf(orig[i], rc->rc_abd, 2185 rc->rc_size); 2186 } 2187 2188 /* 2189 * Attempt a reconstruction and exit the outer loop on 2190 * success. 2191 */ 2192 code = vdev_raidz_reconstruct(rm, tgts, n); 2193 if (raidz_checksum_verify(zio) == 0) { 2194 atomic_inc_64(&raidz_corrected[code]); 2195 2196 for (i = 0; i < n; i++) { 2197 c = tgts[i]; 2198 rc = &rm->rm_col[c]; 2199 ASSERT(rc->rc_error == 0); 2200 if (rc->rc_tried) 2201 raidz_checksum_error(zio, rc, 2202 orig[i]); 2203 rc->rc_error = SET_ERROR(ECKSUM); 2204 } 2205 2206 ret = code; 2207 goto done; 2208 } 2209 2210 /* 2211 * Restore the original data. 2212 */ 2213 for (i = 0; i < n; i++) { 2214 c = tgts[i]; 2215 rc = &rm->rm_col[c]; 2216 abd_copy_from_buf(rc->rc_abd, orig[i], 2217 rc->rc_size); 2218 } 2219 2220 do { 2221 /* 2222 * Find the next valid column after the current 2223 * position.. 2224 */ 2225 for (next = tgts[current] + 1; 2226 next < rm->rm_cols && 2227 rm->rm_col[next].rc_error != 0; next++) 2228 continue; 2229 2230 ASSERT(next <= tgts[current + 1]); 2231 2232 /* 2233 * If that spot is available, we're done here. 2234 */ 2235 if (next != tgts[current + 1]) 2236 break; 2237 2238 /* 2239 * Otherwise, find the next valid column after 2240 * the previous position. 2241 */ 2242 for (c = tgts[current - 1] + 1; 2243 rm->rm_col[c].rc_error != 0; c++) 2244 continue; 2245 2246 tgts[current] = c; 2247 current++; 2248 2249 } while (current != n); 2250 } 2251 } 2252 n--; 2253 done: 2254 for (i = 0; i < n; i++) { 2255 zio_buf_free(orig[i], rm->rm_col[0].rc_size); 2256 } 2257 2258 return (ret); 2259 } 2260 2261 /* 2262 * Complete an IO operation on a RAIDZ VDev 2263 * 2264 * Outline: 2265 * - For write operations: 2266 * 1. Check for errors on the child IOs. 2267 * 2. Return, setting an error code if too few child VDevs were written 2268 * to reconstruct the data later. Note that partial writes are 2269 * considered successful if they can be reconstructed at all. 2270 * - For read operations: 2271 * 1. Check for errors on the child IOs. 2272 * 2. If data errors occurred: 2273 * a. Try to reassemble the data from the parity available. 2274 * b. If we haven't yet read the parity drives, read them now. 2275 * c. If all parity drives have been read but the data still doesn't 2276 * reassemble with a correct checksum, then try combinatorial 2277 * reconstruction. 2278 * d. If that doesn't work, return an error. 2279 * 3. If there were unexpected errors or this is a resilver operation, 2280 * rewrite the vdevs that had errors. 2281 */ 2282 static void 2283 vdev_raidz_io_done(zio_t *zio) 2284 { 2285 vdev_t *vd = zio->io_vd; 2286 vdev_t *cvd; 2287 raidz_map_t *rm = zio->io_vsd; 2288 raidz_col_t *rc; 2289 int unexpected_errors = 0; 2290 int parity_errors = 0; 2291 int parity_untried = 0; 2292 int data_errors = 0; 2293 int total_errors = 0; 2294 int n, c; 2295 int tgts[VDEV_RAIDZ_MAXPARITY]; 2296 int code; 2297 2298 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ 2299 2300 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); 2301 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); 2302 2303 for (c = 0; c < rm->rm_cols; c++) { 2304 rc = &rm->rm_col[c]; 2305 2306 if (rc->rc_error) { 2307 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 2308 2309 if (c < rm->rm_firstdatacol) 2310 parity_errors++; 2311 else 2312 data_errors++; 2313 2314 if (!rc->rc_skipped) 2315 unexpected_errors++; 2316 2317 total_errors++; 2318 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { 2319 parity_untried++; 2320 } 2321 } 2322 2323 if (zio->io_type == ZIO_TYPE_WRITE) { 2324 /* 2325 * XXX -- for now, treat partial writes as a success. 2326 * (If we couldn't write enough columns to reconstruct 2327 * the data, the I/O failed. Otherwise, good enough.) 2328 * 2329 * Now that we support write reallocation, it would be better 2330 * to treat partial failure as real failure unless there are 2331 * no non-degraded top-level vdevs left, and not update DTLs 2332 * if we intend to reallocate. 2333 */ 2334 /* XXPOLICY */ 2335 if (total_errors > rm->rm_firstdatacol) 2336 zio->io_error = vdev_raidz_worst_error(rm); 2337 2338 return; 2339 } 2340 2341 ASSERT(zio->io_type == ZIO_TYPE_READ); 2342 /* 2343 * There are three potential phases for a read: 2344 * 1. produce valid data from the columns read 2345 * 2. read all disks and try again 2346 * 3. perform combinatorial reconstruction 2347 * 2348 * Each phase is progressively both more expensive and less likely to 2349 * occur. If we encounter more errors than we can repair or all phases 2350 * fail, we have no choice but to return an error. 2351 */ 2352 2353 /* 2354 * If the number of errors we saw was correctable -- less than or equal 2355 * to the number of parity disks read -- attempt to produce data that 2356 * has a valid checksum. Naturally, this case applies in the absence of 2357 * any errors. 2358 */ 2359 if (total_errors <= rm->rm_firstdatacol - parity_untried) { 2360 if (data_errors == 0) { 2361 if (raidz_checksum_verify(zio) == 0) { 2362 /* 2363 * If we read parity information (unnecessarily 2364 * as it happens since no reconstruction was 2365 * needed) regenerate and verify the parity. 2366 * We also regenerate parity when resilvering 2367 * so we can write it out to the failed device 2368 * later. 2369 */ 2370 if (parity_errors + parity_untried < 2371 rm->rm_firstdatacol || 2372 (zio->io_flags & ZIO_FLAG_RESILVER)) { 2373 n = raidz_parity_verify(zio, rm); 2374 unexpected_errors += n; 2375 ASSERT(parity_errors + n <= 2376 rm->rm_firstdatacol); 2377 } 2378 goto done; 2379 } 2380 } else { 2381 /* 2382 * We either attempt to read all the parity columns or 2383 * none of them. If we didn't try to read parity, we 2384 * wouldn't be here in the correctable case. There must 2385 * also have been fewer parity errors than parity 2386 * columns or, again, we wouldn't be in this code path. 2387 */ 2388 ASSERT(parity_untried == 0); 2389 ASSERT(parity_errors < rm->rm_firstdatacol); 2390 2391 /* 2392 * Identify the data columns that reported an error. 2393 */ 2394 n = 0; 2395 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 2396 rc = &rm->rm_col[c]; 2397 if (rc->rc_error != 0) { 2398 ASSERT(n < VDEV_RAIDZ_MAXPARITY); 2399 tgts[n++] = c; 2400 } 2401 } 2402 2403 ASSERT(rm->rm_firstdatacol >= n); 2404 2405 code = vdev_raidz_reconstruct(rm, tgts, n); 2406 2407 if (raidz_checksum_verify(zio) == 0) { 2408 atomic_inc_64(&raidz_corrected[code]); 2409 2410 /* 2411 * If we read more parity disks than were used 2412 * for reconstruction, confirm that the other 2413 * parity disks produced correct data. This 2414 * routine is suboptimal in that it regenerates 2415 * the parity that we already used in addition 2416 * to the parity that we're attempting to 2417 * verify, but this should be a relatively 2418 * uncommon case, and can be optimized if it 2419 * becomes a problem. Note that we regenerate 2420 * parity when resilvering so we can write it 2421 * out to failed devices later. 2422 */ 2423 if (parity_errors < rm->rm_firstdatacol - n || 2424 (zio->io_flags & ZIO_FLAG_RESILVER)) { 2425 n = raidz_parity_verify(zio, rm); 2426 unexpected_errors += n; 2427 ASSERT(parity_errors + n <= 2428 rm->rm_firstdatacol); 2429 } 2430 2431 goto done; 2432 } 2433 } 2434 } 2435 2436 /* 2437 * This isn't a typical situation -- either we got a read error or 2438 * a child silently returned bad data. Read every block so we can 2439 * try again with as much data and parity as we can track down. If 2440 * we've already been through once before, all children will be marked 2441 * as tried so we'll proceed to combinatorial reconstruction. 2442 */ 2443 unexpected_errors = 1; 2444 rm->rm_missingdata = 0; 2445 rm->rm_missingparity = 0; 2446 2447 for (c = 0; c < rm->rm_cols; c++) { 2448 if (rm->rm_col[c].rc_tried) 2449 continue; 2450 2451 zio_vdev_io_redone(zio); 2452 do { 2453 rc = &rm->rm_col[c]; 2454 if (rc->rc_tried) 2455 continue; 2456 zio_nowait(zio_vdev_child_io(zio, NULL, 2457 vd->vdev_child[rc->rc_devidx], 2458 rc->rc_offset, rc->rc_abd, rc->rc_size, 2459 zio->io_type, zio->io_priority, 0, 2460 vdev_raidz_child_done, rc)); 2461 } while (++c < rm->rm_cols); 2462 2463 return; 2464 } 2465 2466 /* 2467 * At this point we've attempted to reconstruct the data given the 2468 * errors we detected, and we've attempted to read all columns. There 2469 * must, therefore, be one or more additional problems -- silent errors 2470 * resulting in invalid data rather than explicit I/O errors resulting 2471 * in absent data. We check if there is enough additional data to 2472 * possibly reconstruct the data and then perform combinatorial 2473 * reconstruction over all possible combinations. If that fails, 2474 * we're cooked. 2475 */ 2476 if (total_errors > rm->rm_firstdatacol) { 2477 zio->io_error = vdev_raidz_worst_error(rm); 2478 2479 } else if (total_errors < rm->rm_firstdatacol && 2480 (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) { 2481 /* 2482 * If we didn't use all the available parity for the 2483 * combinatorial reconstruction, verify that the remaining 2484 * parity is correct. 2485 */ 2486 if (code != (1 << rm->rm_firstdatacol) - 1) 2487 (void) raidz_parity_verify(zio, rm); 2488 } else { 2489 /* 2490 * We're here because either: 2491 * 2492 * total_errors == rm_first_datacol, or 2493 * vdev_raidz_combrec() failed 2494 * 2495 * In either case, there is enough bad data to prevent 2496 * reconstruction. 2497 * 2498 * Start checksum ereports for all children which haven't 2499 * failed, and the IO wasn't speculative. 2500 */ 2501 zio->io_error = SET_ERROR(ECKSUM); 2502 2503 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2504 for (c = 0; c < rm->rm_cols; c++) { 2505 rc = &rm->rm_col[c]; 2506 if (rc->rc_error == 0) { 2507 zio_bad_cksum_t zbc; 2508 zbc.zbc_has_cksum = 0; 2509 zbc.zbc_injected = 2510 rm->rm_ecksuminjected; 2511 2512 zfs_ereport_start_checksum( 2513 zio->io_spa, 2514 vd->vdev_child[rc->rc_devidx], 2515 zio, rc->rc_offset, rc->rc_size, 2516 (void *)(uintptr_t)c, &zbc); 2517 } 2518 } 2519 } 2520 } 2521 2522 done: 2523 zio_checksum_verified(zio); 2524 2525 if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 2526 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { 2527 /* 2528 * Use the good data we have in hand to repair damaged children. 2529 */ 2530 for (c = 0; c < rm->rm_cols; c++) { 2531 rc = &rm->rm_col[c]; 2532 cvd = vd->vdev_child[rc->rc_devidx]; 2533 2534 if (rc->rc_error == 0) 2535 continue; 2536 2537 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2538 rc->rc_offset, rc->rc_abd, rc->rc_size, 2539 ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, 2540 ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 2541 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); 2542 } 2543 } 2544 } 2545 2546 static void 2547 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 2548 { 2549 if (faulted > vd->vdev_nparity) 2550 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2551 VDEV_AUX_NO_REPLICAS); 2552 else if (degraded + faulted != 0) 2553 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 2554 else 2555 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 2556 } 2557 2558 vdev_ops_t vdev_raidz_ops = { 2559 vdev_raidz_open, 2560 vdev_raidz_close, 2561 vdev_raidz_asize, 2562 vdev_raidz_io_start, 2563 vdev_raidz_io_done, 2564 vdev_raidz_state_change, 2565 NULL, 2566 NULL, 2567 NULL, 2568 VDEV_TYPE_RAIDZ, /* name of this vdev type */ 2569 B_FALSE /* not a leaf vdev */ 2570 }; 2571