/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_file.h>
#include <sys/vdev_raidz.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>

#ifdef ZFS_DEBUG
#include <sys/vdev_initialize.h>	/* vdev_xlate testing */
#endif

/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all
 * elements can be expressed with a single byte. Briefly, the operations on
 * the field are defined as follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition
 * rather than field addition). The inverse of a field element A (A^-1) is
 * therefore A ^ (255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q, R over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property which is why the uncorrected Plank method breaks down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */
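/*
 * Illustrative sketch (not part of the driver): how the pow/log tables
 * below turn GF(2^8) multiplication and inversion into table look-ups.
 * The helpers gf_mul() and gf_inv() are hypothetical names for this
 * example only; the driver itself uses vdev_raidz_exp2() and the
 * VDEV_RAIDZ_MUL_*() macros.
 *
 *	static uint8_t
 *	gf_mul(uint8_t a, uint8_t b)
 *	{
 *		int log;
 *
 *		if (a == 0 || b == 0)
 *			return (0);
 *		log = vdev_raidz_log2[a] + vdev_raidz_log2[b];
 *		if (log > 255)
 *			log -= 255;
 *		return (vdev_raidz_pow2[log]);
 *	}
 *
 *	static uint8_t
 *	gf_inv(uint8_t a)
 *	{
 *		ASSERT(a != 0);
 *		return (vdev_raidz_pow2[255 - vdev_raidz_log2[a]]);
 *	}
 *
 * For example, gf_mul(2, 0x80) == 0x1d (the wrap through the primitive
 * polynomial), and gf_mul(a, gf_inv(a)) == 1 for any nonzero a, matching
 * the A^254 inverse identity above.
 */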
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	abd_t *rc_abd;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	abd_t *rm_abd_copy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t rm_freed;		/* map no longer has referencing ZIO */
	uint8_t rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
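/*
 * Illustrative sketch (not part of the driver): VDEV_RAIDZ_64MUL_2() is
 * equivalent to applying VDEV_RAIDZ_MUL_2() to each of the eight bytes
 * packed into the 64-bit word. check_64mul_2() is a hypothetical test
 * helper written only to show the equivalence:
 *
 *	static void
 *	check_64mul_2(uint64_t x)
 *	{
 *		uint64_t w = x, mask, expect = 0;
 *
 *		for (int i = 0; i < 8; i++) {
 *			uint8_t b = (x >> (i * 8)) & 0xff;
 *			expect |= (uint64_t)(VDEV_RAIDZ_MUL_2(b) & 0xff) <<
 *			    (i * 8);
 *		}
 *		VDEV_RAIDZ_64MUL_2(w, mask);
 *		ASSERT3U(w, ==, expect);
 *	}
 *
 * The mask expression ((mask) << 1) - ((mask) >> 7) expands each byte's
 * top bit into 0xff (or 0x00), selecting which bytes receive the 0x1d XOR.
 */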
#define	VDEV_LABEL_OFFSET(x)	(x + VDEV_LABEL_START_SIZE)

/*
 * Force reconstruction to use the general purpose method.
 */
int vdev_raidz_default_to_general;

/* Powers of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/* Logs of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

static void vdev_raidz_generate_parity(raidz_map_t *rm);

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}
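/*
 * Worked values, for illustration: vdev_raidz_exp2(1, 3) == 8, i.e.
 * 1 * 2^3, and vdev_raidz_exp2(0x1d, 0) == 0x1d. This helper is the
 * log/pow-table realization of the field multiplication described at the
 * top of this file.
 */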
static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;
	size_t size;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		abd_free(rm->rm_col[c].rc_abd);

		if (rm->rm_col[c].rc_gdata != NULL)
			zio_buf_free(rm->rm_col[c].rc_gdata,
			    rm->rm_col[c].rc_size);
	}

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		abd_put(rm->rm_col[c].rc_abd);
		size += rm->rm_col[c].rc_size;
	}

	if (rm->rm_abd_copy != NULL)
		abd_free(rm->rm_abd_copy);

	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	ASSERT0(rm->rm_freed);
	rm->rm_freed = 1;

	if (rm->rm_reports == 0)
		vdev_raidz_map_free(rm);
}

/*ARGSUSED*/
static void
vdev_raidz_cksum_free(void *arg, size_t ignored)
{
	raidz_map_t *rm = arg;

	ASSERT3U(rm->rm_reports, >, 0);

	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
		vdev_raidz_map_free(rm);
}

static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;
	size_t x;

	const char *good = NULL;
	char *bad;

	if (good_data == NULL) {
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
			char *buf;
			int offset;

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_abd;
				rm->rm_col[x].rc_gdata =
				    zio_buf_alloc(rm->rm_col[x].rc_size);
				rm->rm_col[x].rc_abd =
				    abd_get_from_buf(rm->rm_col[x].rc_gdata,
				    rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			buf = (char *)good_data;
			for (; x < rm->rm_cols; x++) {
				abd_put(rm->rm_col[x].rc_abd);
				rm->rm_col[x].rc_abd = abd_get_from_buf(buf,
				    rm->rm_col[x].rc_size);
				buf += rm->rm_col[x].rc_size;
			}
			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				abd_put(rm->rm_col[x].rc_abd);
				rm->rm_col[x].rc_abd = bad_parity[x];
			}

			offset = 0;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				abd_put(rm->rm_col[x].rc_abd);
				rm->rm_col[x].rc_abd = abd_get_offset(
				    rm->rm_abd_copy, offset);
				offset += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = rm->rm_col[c].rc_gdata;
	} else {
		/* adjust good_data to point at the start of our column */
		good = good_data;

		for (x = rm->rm_firstdatacol; x < c; x++)
			good += rm->rm_col[x].rc_size;
	}

	bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size);
	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
	abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size);
}

/*
 * Invoked indirectly by zfs_ereport_start_checksum(), called
 * below when our read operation fails completely. The main point
 * is to keep a copy of everything we read from disk, so that at
 * vdev_raidz_cksum_finish() time we can compare it with the good data.
 */
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
	size_t c = (size_t)(uintptr_t)arg;
	size_t offset;

	raidz_map_t *rm = zio->io_vsd;
	size_t size;

	/* set up the report and bump the refcount */
	zcr->zcr_cbdata = rm;
	zcr->zcr_cbinfo = c;
	zcr->zcr_finish = vdev_raidz_cksum_finish;
	zcr->zcr_free = vdev_raidz_cksum_free;

	rm->rm_reports++;
	ASSERT3U(rm->rm_reports, >, 0);

	if (rm->rm_abd_copy != NULL)
		return;

	/*
	 * It's the first time we're called for this raidz_map_t, so we need
	 * to copy the data aside; there's no guarantee that our zio's buffer
	 * won't be re-used for something else.
	 *
	 * Our parity data is already in separate buffers, so there's no need
	 * to copy them.
	 */

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	rm->rm_abd_copy =
	    abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size);

	for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		raidz_col_t *col = &rm->rm_col[c];
		abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset);

		abd_copy(tmp, col->rc_abd, col->rc_size);
		abd_put(col->rc_abd);
		col->rc_abd = tmp;

		offset += col->rc_size;
	}
	ASSERT3U(offset, ==, size);
}

static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};

/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 */
static raidz_map_t *
vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset,
    uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
	uint64_t off = 0;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_abd_copy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_abd = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_abd =
		    abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);

	rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
	off = rm->rm_col[c].rc_size;

	for (c = c + 1; c < acols; c++) {
		rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
		off += rm->rm_col[c].rc_size;
	}

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
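/*
 * A worked example of the geometry above (an assumed configuration, for
 * illustration only): a 32 KB write to a 5-wide RAID-Z1 with 512-byte
 * sectors (unit_shift = 9) gives s = 64, q = 64 / (5 - 1) = 16, r = 0,
 * bc = 0, and tot = 64 + 1 * 16 = 80; all five columns (one parity, four
 * data) get 16 sectors (8 KB) each and no skip sectors are needed. A 1 KB
 * write to the same vdev instead gives s = 2, q = 0, r = 2, bc = 3,
 * tot = 3, acols = 3, scols = 4, and rm_nskip = roundup(3, 2) - 3 = 1
 * skip sector starting at rm_skipstart = bc = 3.
 */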
struct pqr_struct {
	uint64_t *p;
	uint64_t *q;
	uint64_t *r;
};

static int
vdev_raidz_p_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && !pqr->q && !pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++)
		*pqr->p ^= *src;

	return (0);
}

static int
vdev_raidz_pq_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	uint64_t mask;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && pqr->q && !pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
		*pqr->p ^= *src;
		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
		*pqr->q ^= *src;
	}

	return (0);
}

static int
vdev_raidz_pqr_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	uint64_t mask;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && pqr->q && pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
		*pqr->p ^= *src;
		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
		*pqr->q ^= *src;
		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
		*pqr->r ^= *src;
	}

	return (0);
}
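/*
 * Illustrative sketch (not part of the driver): the callbacks above build
 * Q (and R) by Horner's rule -- each time a new data column is folded in,
 * the running Q is first multiplied by 2 (by 4 for R) and then the new
 * data is added. For a hypothetical stripe of three one-byte columns:
 *
 *	uint8_t d[3] = { 0x01, 0x02, 0x03 };
 *	uint8_t q = 0;
 *
 *	for (int c = 0; c < 3; c++) {
 *		q = VDEV_RAIDZ_MUL_2(q);
 *		q ^= d[c];
 *	}
 *
 * leaves q == (0x01 * 4) ^ (0x02 * 2) ^ 0x03 == 0x04 ^ 0x04 ^ 0x03 == 0x03,
 * matching the closed form Q = 2^(n-1) D_0 + ... + 2^0 D_n-1 given at the
 * top of this file.
 */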
static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p;
	int c;
	abd_t *src;

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);

		if (c == rm->rm_firstdatacol) {
			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, NULL, NULL };
			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_p_func, &pqr);
		}
	}
}

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, pcnt, ccnt, mask, i;
	int c;
	abd_t *src;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);

		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);

		if (c == rm->rm_firstdatacol) {
			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
			(void) memcpy(q, p, rm->rm_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, q, NULL };
			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_pq_func, &pqr);
		}

		if (c == rm->rm_firstdatacol) {
			for (i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
			}
		} else {
			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
			}
		}
	}
}
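/*
 * Note on the short-column loops above and below: when a data column is
 * shorter than the parity column, the tail of Q (and R) must still be
 * multiplied by 2 (by 4) for that column so that the earlier columns end
 * up with the correct coefficients -- exactly as if the short column had
 * been padded with zeros out to the full parity length.
 */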
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
	int c;
	abd_t *src;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
		r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);

		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);

		if (c == rm->rm_firstdatacol) {
			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
			(void) memcpy(q, p, rm->rm_col[c].rc_size);
			(void) memcpy(r, p, rm->rm_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, q, r };
			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_pqr_func, &pqr);
		}

		if (c == rm->rm_firstdatacol) {
			for (i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
				r[i] = 0;
			}
		} else {
			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
				VDEV_RAIDZ_64MUL_4(r[i], mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
	}
}

/* ARGSUSED */
static int
vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
{
	uint64_t *dst = dbuf;
	uint64_t *src = sbuf;
	int cnt = size / sizeof (src[0]);

	for (int i = 0; i < cnt; i++) {
		dst[i] ^= src[i];
	}

	return (0);
}

/* ARGSUSED */
static int
vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
    void *private)
{
	uint64_t *dst = dbuf;
	uint64_t *src = sbuf;
	uint64_t mask;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++, src++) {
		VDEV_RAIDZ_64MUL_2(*dst, mask);
		*dst ^= *src;
	}

	return (0);
}

/* ARGSUSED */
static int
vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
{
	uint64_t *dst = buf;
	uint64_t mask;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++) {
		/* same operation as vdev_raidz_reconst_q_pre_func() on dst */
		VDEV_RAIDZ_64MUL_2(*dst, mask);
	}

	return (0);
}

struct reconst_q_struct {
	uint64_t *q;
	int exp;
};

static int
vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
{
	struct reconst_q_struct *rq = private;
	uint64_t *dst = buf;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++, rq->q++) {
		*dst ^= *rq->q;

		int j;
		uint8_t *b;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, rq->exp);
		}
	}

	return (0);
}

struct reconst_pq_struct {
	uint8_t *p;
	uint8_t *q;
	uint8_t *pxy;
	uint8_t *qxy;
	int aexp;
	int bexp;
};

static int
vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
{
	struct reconst_pq_struct *rpq = private;
	uint8_t *xd = xbuf;
	uint8_t *yd = ybuf;

	for (int i = 0; i < size;
	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
	}

	return (0);
}

static int
vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
{
	struct reconst_pq_struct *rpq = private;
	uint8_t *xd = xbuf;

	for (int i = 0; i < size;
	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
		/* same operation as vdev_raidz_reconst_pq_func() on xd */
		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
	}

	return (0);
}

static int
vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
{
	int x = tgts[0];
	int c;
	abd_t *dst, *src;

	ASSERT(ntgts == 1);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(x < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
	ASSERT(rm->rm_col[x].rc_size > 0);

	src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
	dst = rm->rm_col[x].rc_abd;

	abd_copy(dst, src, rm->rm_col[x].rc_size);
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		uint64_t size = MIN(rm->rm_col[x].rc_size,
		    rm->rm_col[c].rc_size);

		src = rm->rm_col[c].rc_abd;
		dst = rm->rm_col[x].rc_abd;

		if (c == x)
			continue;

		(void) abd_iterate_func2(dst, src, 0, 0, size,
		    vdev_raidz_reconst_p_func, NULL);
	}

	return (1 << VDEV_RAIDZ_P);
}

static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	int x = tgts[0];
	int c, exp;
	abd_t *dst, *src;

	ASSERT(ntgts == 1);

	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
		    rm->rm_col[c].rc_size);

		src = rm->rm_col[c].rc_abd;
		dst = rm->rm_col[x].rc_abd;

		if (c == rm->rm_firstdatacol) {
			abd_copy(dst, src, size);
			if (rm->rm_col[x].rc_size > size)
				abd_zero_off(dst, size,
				    rm->rm_col[x].rc_size - size);
		} else {
			ASSERT3U(size, <=, rm->rm_col[x].rc_size);
			(void) abd_iterate_func2(dst, src, 0, 0, size,
			    vdev_raidz_reconst_q_pre_func, NULL);
			(void) abd_iterate_func(dst,
			    size, rm->rm_col[x].rc_size - size,
			    vdev_raidz_reconst_q_pre_tail_func, NULL);
		}
	}

	src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
	dst = rm->rm_col[x].rc_abd;
	exp = 255 - (rm->rm_cols - 1 - x);

	struct reconst_q_struct rq = { abd_to_buf(src), exp };
	(void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
	    vdev_raidz_reconst_q_post_func, &rq);

	return (1 << VDEV_RAIDZ_Q);
}
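/*
 * A note on the exponent above (restating the math, not adding mechanism):
 * after the pre/tail passes, the target column holds Qx, the Q parity
 * recomputed with column x treated as all zeros, so
 * Q + Qx == 2^(rm_cols - 1 - x) * D_x. The post function adds Q and then
 * multiplies each byte by the inverse coefficient,
 * 2^(255 - (rm_cols - 1 - x)). For example, with rm_cols == 5 and x == 3,
 * exp == 254 and each byte b is recovered as vdev_raidz_exp2(b, 254),
 * i.e. b divided by 2 in the field.
 */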
static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
	abd_t *pdata, *qdata;
	uint64_t xsize, ysize;
	int x = tgts[0];
	int y = tgts[1];
	abd_t *xd, *yd;

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_abd =
	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
	rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = abd_to_buf(pdata);
	q = abd_to_buf(qdata);
	pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
	qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
	xd = rm->rm_col[x].rc_abd;
	yd = rm->rm_col[y].rc_abd;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(255 - (ndevs - 1 - x)) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	ASSERT3U(xsize, >=, ysize);
	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
	    vdev_raidz_reconst_pq_func, &rpq);
	(void) abd_iterate_func(xd, ysize, xsize - ysize,
	    vdev_raidz_reconst_pq_tail_func, &rpq);

	abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
	abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
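/*
 * Derivation sketch for the A and B coefficients above (restating the
 * math, not adding mechanism): from P + Pxy = D_x + D_y, substitute
 * D_y = (P + Pxy) + D_x into Q + Qxy = 2^(n-1-x) D_x + 2^(n-1-y) D_y and
 * collect terms; dividing through by 2^(n-1-x) gives
 *
 *	(2^(x-y) + 1) D_x = 2^(x-y) (P + Pxy) + 2^-(n-1-x) (Q + Qxy)
 *
 * Both right-hand coefficients are then multiplied by (2^(x-y) + 1)^-1 --
 * the vdev_raidz_exp2(..., tmp) calls above -- yielding A and B.
 */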
/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen both for simplicity and
 * speedy computation, as well as for linear separability.
 *
 *      __             __               __     __
 *      | 1   ..  1 1 1 |               |  p_0  |
 *      | 2^n-1 .. 4 2 1 |   __     __   |   :   |
 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 *      |   1   .. 0 0 0 |   |  D_1  |   |  d_0  |
 *      |   0   .. 0 0 0 | x |  D_2  | = |  d_1  |
 *      |   :      : : : |   |   :   |   |  d_2  |
 *      |   0   .. 1 0 0 |   | D_n-1 |   |   :   |
 *      |   0   .. 0 1 0 |   ~~     ~~   |   :   |
 *      |   0   .. 0 0 1 |               | d_n-1 |
 *      ~~             ~~               ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16   8   4   2   1  | <-----+-+-- missing disks
 *           |  19 205 116  29  64  16   4   1  |      / /
 *           |  1   0   0   0   0   0   0   0  |     / /
 *           |  0   1   0   0   0   0   0   0  | <--' /
 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
 *           |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           |  19 205 116  29  64  16   4   1  |
 *           |  1   0   0   0   0   0   0   0  |
 *  (V|I)' = |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 * __                                                                 __
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * | 19  205 116  29  64  16   4   1     0   1   0   0   0   0   0   0  |
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * | 19  205 116  29  64  16   4   1     0   1   0   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0  205 116   0   0   0   0   0     0   1  19  29  64  16   4   1  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0  185   0   0   0   0   0   205   1  222 208 141 221 201 204 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   0   0   0   0   0   0   167 100   5  41 159 169 217 208 |
 * |  0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 *                 __                               __
 *                 |  0   0   1   0   0   0   0   0  |
 *                 | 167 100   5  41 159 169 217 208 |
 *                 | 166 100   4  40 158 168 216 209 |
 *   (V|I)'^-1 =   |  0   0   0   1   0   0   0   0  |
 *                 |  0   0   0   0   1   0   0   0  |
 *                 |  0   0   0   0   0   1   0   0  |
 *                 |  0   0   0   0   0   0   1   0  |
 *                 |  0   0   0   0   0   0   0   1  |
 *                 ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
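/*
 * For example, with n == 4 data columns the three possible rows of
 * interest are (map value 0 for P, 1 for Q, 2 for R):
 *
 *	P: {  1,  1, 1, 1 }
 *	Q: {  8,  4, 2, 1 }		i.e. 2^3 .. 2^0
 *	R: { 64, 16, 4, 1 }		i.e. 4^3 .. 4^0
 *
 * matching the coefficients of the V portion of the dispersal matrix
 * described in the comment above.
 */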
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows is properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}

static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log = 0;
	uint8_t val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = abd_to_buf(rm->rm_col[c].rc_abd);
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}
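/*
 * Illustrative note (equivalent math, not new mechanism): the innermost
 * loop above is a GF(2^8) multiply-accumulate carried out in the log
 * domain. For a single byte it computes the same value as
 *
 *	dst[cc][x] ^= vdev_raidz_exp2(*src, invlog[cc][i]);
 *
 * (with plain assignment when i == 0), but with the logarithm of the
 * inverse-matrix entry hoisted out of the per-byte loop.
 */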
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	abd_t **bufs = NULL;

	int code = 0;

	/*
	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
	 * temporary linear ABDs.
	 */
	if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
		bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);

		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
			raidz_col_t *col = &rm->rm_col[c];

			bufs[c] = col->rc_abd;
			col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
			abd_copy(col->rc_abd, bufs[c], col->rc_size);
		}
	}

	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	/*
	 * copy back from temporary linear abds and free them
	 */
	if (bufs) {
		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
			raidz_col_t *col = &rm->rm_col[c];

			abd_copy(bufs[c], col->rc_abd, col->rc_size);
			abd_free(col->rc_abd);
			col->rc_abd = bufs[c];
		}
		kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
	}

	return (code);
}
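/*
 * Note on the return value: each reconstruction routine returns a bitmask
 * of the parity columns it consumed, e.g. (1 << VDEV_RAIDZ_P) |
 * (1 << VDEV_RAIDZ_Q) == 3 when both P and Q were needed. This "code" is
 * what distinguishes the different ways parity can be used to correct
 * data, and is sized by the raidz_corrected[] statistics array declared
 * later in this file.
 */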
static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}
static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
    uint64_t *ashift)
{
	vdev_t *cvd;
	uint64_t nparity = vd->vdev_nparity;
	int c;
	int lasterror = 0;
	int numerrors = 0;

	ASSERT(nparity > 0);

	if (nparity > VDEV_RAIDZ_MAXPARITY ||
	    vd->vdev_children < nparity + 1) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	vdev_open_children(vd);

	for (c = 0; c < vd->vdev_children; c++) {
		cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error != 0) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}

		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
		*ashift = MAX(*ashift, cvd->vdev_ashift);
	}

	*asize *= vd->vdev_children;
	*max_asize *= vd->vdev_children;

	if (numerrors > nparity) {
		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}

static void
vdev_raidz_close(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_close(vd->vdev_child[c]);
}

/*
 * Handle a read or write I/O to a RAID-Z dump device.
 *
 * The dump device is in a unique situation compared to other ZFS datasets:
 * writing to this device should be as simple and fast as possible. In
 * addition, durability matters much less since the dump will be extracted
 * once the machine reboots. For that reason, this function eschews parity for
 * performance and simplicity. The dump device uses the checksum setting
 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
 * dataset.
 *
 * Blocks of size 128 KB have been preallocated for this volume. I/Os less than
 * 128 KB will not fill an entire block; in addition, they may not be properly
 * aligned. In that case, this function uses the preallocated 128 KB block and
 * omits reading or writing any "empty" portions of that block, as opposed to
 * allocating a fresh appropriately-sized block.
 *
 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
 *
 *	vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
 *
 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
 * allocated which spans all five child vdevs. 8 KB of data would be written to
 * each of four vdevs, with the fifth containing the parity bits.
 *
 *	 parity	  data	   data	   data	   data
 *	|   PP	|   XX	|   XX	|   XX	|   XX	|
 *	    ^	    ^	    ^	    ^	    ^
 *	    |	    |	    |	    |	    |
 *	8 KB parity	------8 KB data blocks------
 *
 * However, when writing to the dump device, the behavior is different:
 *
 *	vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
 *
 * Unlike the normal RAID-Z case in which the block is allocated based on the
 * I/O size, reads and writes here always use a 128 KB logical I/O size. If the
 * I/O size is less than 128 KB, only the actual portions of data are written.
 * In this example the data is written to the third data vdev since that vdev
 * contains the offset [64 KB, 96 KB).
 *
 *	 parity	  data	   data	   data	   data
 *	|	|	|	|   XX	|	|
 *				     ^
 *				     |
 *			     32 KB data block
 *
 * As a result, an individual I/O may not span all child vdevs; moreover, a
 * small I/O may only operate on a single child vdev.
 *
 * Note that since there are no parity bits calculated or written, this format
 * remains the same no matter how many parity bits are used in a normal RAID-Z
 * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above
 * would look like:
 *
 *	 parity	 parity	 parity	  data	   data	   data	   data
 *	|	|	|	|	|	|   XX	|	|
 *					             ^
 *					             |
 *				             32 KB data block
 */
int
vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
    uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
{
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, err = 0;

	uint64_t start, end, colstart, colend;
	uint64_t coloffset, colsize, colskip;

	int flags = doread ? B_READ : B_WRITE;

#ifdef _KERNEL

	/*
	 * Don't write past the end of the block
	 */
	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);

	start = offset;
	end = start + size;

	/*
	 * Allocate a RAID-Z map for this block. Note that this block starts
	 * from the "original" offset, that is, the offset of the extent which
	 * contains the requisite offset of the data being read or written.
	 *
	 * Even if this I/O operation doesn't span the full block size, let's
	 * treat the on-disk format as if the only blocks are the complete 128
	 * KB size.
	 */
	abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
	    SPA_OLD_MAXBLOCKSIZE);
	rm = vdev_raidz_map_alloc(abd,
	    SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
	    vd->vdev_children, vd->vdev_nparity);

	coloffset = origoffset;

	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
	    c++, coloffset += rc->rc_size) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];

		/*
		 * Find the start and end of this column in the RAID-Z map,
		 * keeping in mind that the stated size and offset of the
		 * operation may not fill the entire column for this vdev.
		 *
		 * If any portion of the data spans this column, issue the
		 * appropriate operation to the vdev.
		 */
		if (coloffset + rc->rc_size <= start)
			continue;
		if (coloffset >= end)
			continue;

		colstart = MAX(coloffset, start);
		colend = MIN(end, coloffset + rc->rc_size);
		colsize = colend - colstart;
		colskip = colstart - coloffset;

		VERIFY3U(colsize, <=, rc->rc_size);
		VERIFY3U(colskip, <=, rc->rc_size);

		/*
		 * Note that the child vdev will have a vdev label at the start
		 * of its range of offsets, hence the need for
		 * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another
		 * example of why this calculation is needed.
		 */
		if ((err = vdev_disk_physio(cvd,
		    ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize,
		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
		    flags, isdump)) != 0)
			break;
	}

	vdev_raidz_map_free(rm);
	abd_put(abd);
#endif	/* _KERNEL */

	return (err);
}

static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	uint64_t cols = vd->vdev_children;
	uint64_t nparity = vd->vdev_nparity;

	asize = ((psize - 1) >> ashift) + 1;
	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
	asize = roundup(asize, nparity + 1) << ashift;

	return (asize);
}
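/*
 * A worked example of the allocation size math above (an assumed
 * configuration, for illustration only): psize = 128 KB on a 6-wide
 * RAID-Z2 with ashift = 9 gives 256 data sectors, plus
 * 2 * ceil(256 / 4) = 128 parity sectors, for 384 sectors in all;
 * roundup(384, 3) leaves 384, so asize = 384 << 9 = 192 KB.
 */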

static void
vdev_raidz_child_done(zio_t *zio)
{
	raidz_col_t *rc = zio->io_private;

	rc->rc_error = zio->io_error;
	rc->rc_tried = 1;
	rc->rc_skipped = 0;
}

static void
vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col)
{
#ifdef ZFS_DEBUG
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;

	range_seg_t logical_rs, physical_rs;
	logical_rs.rs_start = zio->io_offset;
	logical_rs.rs_end = logical_rs.rs_start +
	    vdev_raidz_asize(zio->io_vd, zio->io_size);

	raidz_col_t *rc = &rm->rm_col[col];
	vdev_t *cvd = vd->vdev_child[rc->rc_devidx];

	vdev_xlate(cvd, &logical_rs, &physical_rs);
	ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
	ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
	/*
	 * It would be nice to assert that rs_end is equal
	 * to rc_offset + rc_size but there might be an
	 * optional I/O at the end that is not accounted in
	 * rc_size.
	 */
	if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
		    rc->rc_size + (1 << tvd->vdev_ashift));
	} else {
		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
	}
#endif
}

/*
 * Start an I/O operation on a RAID-Z vdev.
 *
 * Outline:
 * - For write operations:
 *   1. Generate the parity data
 *   2. Create child zio write operations to each column's vdev, for both
 *      data and parity.
 *   3. If the column skips any sectors for padding, create optional dummy
 *      write zio children for those areas to improve aggregation contiguity.
 * - For read operations:
 *   1. Create child zio read operations to each data column's vdev to read
 *      the range of data required for zio.
 *   2. If this is a scrub or resilver operation, or if any of the data
 *      vdevs have had errors, then create zio read operations to the parity
 *      columns' vdevs as well.
 */
static void
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, i;

	rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
	    tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_generate_parity(rm);

		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			/*
			 * Verify physical to logical translation.
			 */
			vdev_raidz_io_verify(zio, rm, c);

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		/*
		 * Generate optional I/Os for any skipped sectors to improve
		 * aggregation contiguity.
		 */
		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
			ASSERT(c <= rm->rm_scols);
			if (c == rm->rm_scols)
				c = 0;
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset + rc->rc_size, NULL,
			    1 << tvd->vdev_ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
		}

		zio_execute(zio);
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		if (!vdev_readable(cvd)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ENXIO);
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ESTALE);
			rc->rc_skipped = 1;
			continue;
		}
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	zio_execute(zio);
}
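
/*
 * A note on the optional skip-sector writes issued above (an illustrative
 * sketch, assuming ashift = 9): vdev_raidz_asize() rounds every allocation
 * up to a multiple of nparity + 1 sectors, so a 1 KB block on a 5-wide
 * RAID-Z1 maps to 2 data sectors plus 1 parity sector, rounded up to 4 --
 * leaving rm_nskip = 1 pad sector that never carries real data. Issuing a
 * one-sector ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL child write for that hole
 * lets the vdev queue aggregate the writes on either side of it into one
 * larger I/O; an optional zio that cannot be aggregated is simply dropped.
 */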

/*
 * Report a checksum error for a child of a RAID-Z device.
 */
static void
raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
{
	void *buf;
	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];

	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		zio_bad_cksum_t zbc;
		raidz_map_t *rm = zio->io_vsd;

		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_checksum_errors++;
		mutex_exit(&vd->vdev_stat_lock);

		zbc.zbc_has_cksum = 0;
		zbc.zbc_injected = rm->rm_ecksuminjected;

		buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size);
		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
		    rc->rc_offset, rc->rc_size, buf, bad_data,
		    &zbc);
		abd_return_buf(rc->rc_abd, buf, rc->rc_size);
	}
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(zio_t *zio)
{
	zio_bad_cksum_t zbc;
	raidz_map_t *rm = zio->io_vsd;

	int ret = zio_checksum_error(zio, &zbc);
	if (ret != 0 && zbc.zbc_injected != 0)
		rm->rm_ecksuminjected = 1;

	return (ret);
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches
 * the data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	blkptr_t *bp = zio->io_bp;
	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));

	if (checksum == ZIO_CHECKSUM_NOPARITY)
		return (ret);

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zio_buf_alloc(rc->rc_size);
		abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size);
	}

	vdev_raidz_generate_parity(rm);

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (abd_cmp_buf(rc->rc_abd, orig[c], rc->rc_size) != 0) {
			raidz_checksum_error(zio, rc, orig[c]);
			rc->rc_error = SET_ERROR(ECKSUM);
			ret++;
		}
		zio_buf_free(orig[c], rc->rc_size);
	}

	return (ret);
}

/*
 * Keep statistics on all the ways that we used parity to correct data.
 */
static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];

static int
vdev_raidz_worst_error(raidz_map_t *rm)
{
	int error = 0;

	for (int c = 0; c < rm->rm_cols; c++)
		error = zio_worst_error(error, rm->rm_col[c].rc_error);

	return (error);
}
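
/*
 * An illustration of the combinatorial search performed by
 * vdev_raidz_combrec() below (a sketch, not additional policy): on a RAID-Z2
 * map with columns 0-1 holding parity and 2-5 holding data, and no known
 * errors, the n = 1 pass tries reconstructing each data column in turn
 * ({2}, {3}, {4}, {5}); if none of those yields a valid checksum, the n = 2
 * pass walks the pairs {0,2}, {1,2}, {0,3}, {1,3}, {2,3}, {0,4}, ... --
 * every pair whose highest member is a data column, since reconstructing
 * only parity columns could never change the data that failed its checksum.
 */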

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 */
static int
vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
{
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				abd_copy_to_buf(orig[i], rc->rc_abd,
				    rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					if (rc->rc_tried)
						raidz_checksum_error(zio, rc,
						    orig[i]);
					rc->rc_error = SET_ERROR(ECKSUM);
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				abd_copy_from_buf(rc->rc_abd, orig[i],
				    rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	n--;
done:
	for (i = 0; i < n; i++) {
		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}

/*
 * Complete an I/O operation on a RAID-Z vdev.
 *
 * Outline:
 * - For write operations:
 *   1. Check for errors on the child I/Os.
 *   2. Return, setting an error code if too few child vdevs were written
 *      to reconstruct the data later. Note that partial writes are
 *      considered successful if they can be reconstructed at all.
 * - For read operations:
 *   1. Check for errors on the child I/Os.
 *   2. If data errors occurred:
 *      a. Try to reassemble the data from the parity available.
 *      b. If we haven't yet read the parity drives, read them now.
 *      c. If all parity drives have been read but the data still doesn't
 *         reassemble with a correct checksum, then try combinatorial
 *         reconstruction.
 *      d. If that doesn't work, return an error.
 *   3. If there were unexpected errors or this is a resilver operation,
 *      rewrite the vdevs that had errors.
 */
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	int unexpected_errors = 0;
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	ASSERT(zio->io_bp != NULL);	/* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as a success.
		 * (If we couldn't write enough columns to reconstruct
		 * the data, the I/O failed. Otherwise, good enough.)
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
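
	/*
	 * A plausible walk-through of the phases below (an illustration,
	 * assuming RAID-Z1 and a single silently corrupt data column): the
	 * first pass read only the data columns, so parity_untried == 1 and
	 * the checksum fails; no column reported an error, so we fall through
	 * to phase 2 and reread every untried column. On reentry the checksum
	 * still fails, but now everything has been tried, so phase 3 invokes
	 * vdev_raidz_combrec(), which reconstructs each data column from
	 * parity in turn until the checksum verifies, fires a checksum
	 * ereport against the corrupt child, and lets the repair code at the
	 * bottom of this function rewrite it.
	 */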
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(zio) == 0) {
				/*
				 * If we read parity information (unnecessarily,
				 * as it happens, since no reconstruction was
				 * needed), regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}
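
	/*
	 * For instance (a worked example, not new policy): on RAID-Z2 during
	 * a scrub, all columns are read, so parity_untried == 0. If exactly
	 * one data child returns EIO, then total_errors == 1 <= 2 and
	 * data_errors == 1, and the else-branch above reconstructs that
	 * single column from parity before verifying the checksum.
	 */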

	/*
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		zio_vdev_io_redone(zio);
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		return;
	}

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		zio->io_error = vdev_raidz_worst_error(rm);

	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(zio, rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children that haven't
		 * failed, provided the I/O wasn't speculative.
		 */
		zio->io_error = SET_ERROR(ECKSUM);

		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error == 0) {
					zio_bad_cksum_t zbc;
					zbc.zbc_has_cksum = 0;
					zbc.zbc_injected =
					    rm->rm_ecksuminjected;

					zfs_ereport_start_checksum(
					    zio->io_spa,
					    vd->vdev_child[rc->rc_devidx],
					    zio, rc->rc_offset, rc->rc_size,
					    (void *)(uintptr_t)c, &zbc);
				}
			}
		}
	}

done:
	zio_checksum_verified(zio);

	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error == 0)
				continue;

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}

static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
	if (faulted > vd->vdev_nparity)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_NO_REPLICAS);
	else if (degraded + faulted != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	else
		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}

/*
 * Determine if any portion of the provided block resides on a child vdev
 * with a dirty DTL and therefore needs to be resilvered. The function
 * assumes that at least one DTL is dirty, which implies that full stripe
 * width blocks must be resilvered.
 */
static boolean_t
vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
{
	uint64_t dcols = vd->vdev_children;
	uint64_t nparity = vd->vdev_nparity;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> ashift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = ((psize - 1) >> ashift) + 1;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;

	if (s + nparity >= dcols)
		return (B_TRUE);

	for (uint64_t c = 0; c < s + nparity; c++) {
		uint64_t devidx = (f + c) % dcols;
		vdev_t *cvd = vd->vdev_child[devidx];

		/*
		 * dsl_scan_need_resilver() already checked vd with
		 * vdev_dtl_contains(). So here just check cvd with
		 * vdev_dtl_empty(), cheaper and a good approximation.
		 */
		if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
			return (B_TRUE);
	}

	return (B_FALSE);
}

static void
vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res)
{
	vdev_t *raidvd = cvd->vdev_parent;
	ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);

	uint64_t width = raidvd->vdev_children;
	uint64_t tgt_col = cvd->vdev_id;
	uint64_t ashift = raidvd->vdev_top->vdev_ashift;

	/* Make sure the offsets are block-aligned. */
	ASSERT0(in->rs_start % (1 << ashift));
	ASSERT0(in->rs_end % (1 << ashift));
	uint64_t b_start = in->rs_start >> ashift;
	uint64_t b_end = in->rs_end >> ashift;

	uint64_t start_row = 0;
	if (b_start > tgt_col)	/* avoid underflow */
		start_row = ((b_start - tgt_col - 1) / width) + 1;

	uint64_t end_row = 0;
	if (b_end > tgt_col)
		end_row = ((b_end - tgt_col - 1) / width) + 1;

	res->rs_start = start_row << ashift;
	res->rs_end = end_row << ashift;

	ASSERT3U(res->rs_start, <=, in->rs_start);
	ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start);
}

vdev_ops_t vdev_raidz_ops = {
	.vdev_op_open = vdev_raidz_open,
	.vdev_op_close = vdev_raidz_close,
	.vdev_op_asize = vdev_raidz_asize,
	.vdev_op_io_start = vdev_raidz_io_start,
	.vdev_op_io_done = vdev_raidz_io_done,
	.vdev_op_state_change = vdev_raidz_state_change,
	.vdev_op_need_resilver = vdev_raidz_need_resilver,
	.vdev_op_hold = NULL,
	.vdev_op_rele = NULL,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_raidz_xlate,
	.vdev_op_type = VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
};
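
/*
 * A worked check of vdev_raidz_xlate() above (an illustration, assuming a
 * 5-wide parent and ashift = 9): parent sectors are assigned round-robin, so
 * child column 2 holds parent sectors 2, 7, 12, ... at child rows 0, 1, 2, ...
 * For the parent range [sector 3, sector 8), b_start = 3 > tgt_col = 2 gives
 * start_row = ((3 - 2 - 1) / 5) + 1 = 1, and b_end = 8 gives
 * end_row = ((8 - 2 - 1) / 5) + 1 = 2 -- a child range of [row 1, row 2),
 * which indeed covers parent sector 7, the only sector of column 2 that
 * falls within the input range.
 */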