/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 * Copyright (c) 2014 Integros [integros.com]
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_file.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>

#ifdef ZFS_DEBUG
#include <sys/vdev.h>	/* For vdev_xlate() in vdev_raidz_io_verify() */
#endif

/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all
 * elements can be expressed with a single byte. Briefly, the operations on
 * the field are defined as follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
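 *
 * As an illustrative check of that expression: for A = 0x80 the high bit is
 * set, so A * 2 = ((0x80 << 1) & 0xff) ^ 0x1d = 0x00 ^ 0x1d = 0x1d, which is
 * why 0x1d immediately follows 0x80 in the power table below.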
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). The inverse of a field element A (A^-1) is therefore
 * A ^ (255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q, R, over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property which is why the uncorrected Plank method breaks down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */

typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	abd_t *rc_abd;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	abd_t *rm_abd_copy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
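
/*
 * An illustrative walk-through of that trick on a single byte lane: for a
 * lane holding 0x80, mask = 0x80, then mask = (0x80 << 1) - (0x80 >> 7) =
 * 0x100 - 0x01 = 0xff, so the lane becomes ((0x80 << 1) & 0xfe) ^
 * (0xff & 0x1d) = 0x00 ^ 0x1d = 0x1d -- the same result as
 * VDEV_RAIDZ_MUL_2(0x80). A lane whose top bit is clear yields a mask of
 * 0x00 and is simply shifted left.
 */
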
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}

#define	VDEV_LABEL_OFFSET(x)	(x + VDEV_LABEL_START_SIZE)

/*
 * Force reconstruction to use the general purpose method.
 */
int vdev_raidz_default_to_general;

/* Powers of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
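
/*
 * The power table above and the log table below are inverses of one another
 * on nonzero elements; for example, vdev_raidz_pow2[8] == 0x1d and
 * vdev_raidz_log2[0x1d] == 8.
 */
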
/* Logs of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

static void vdev_raidz_generate_parity(raidz_map_t *rm);
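
/*
 * An illustrative sketch (not part of the driver) of a full GF(2^8)
 * multiply built from the two tables above, using the identity
 * A * B = 2^(log_2(A) + log_2(B)) with exponents wrapping mod 255.
 * The helper name is ours, not ZFS's:
 *
 *	static uint8_t
 *	gf_mul_example(uint8_t a, uint8_t b)
 *	{
 *		int lg;
 *
 *		if (a == 0 || b == 0)
 *			return (0);
 *		lg = vdev_raidz_log2[a] + vdev_raidz_log2[b];
 *		if (lg > 255)
 *			lg -= 255;
 *		return (vdev_raidz_pow2[lg]);
 *	}
 *
 * vdev_raidz_exp2() below is the same idea for the case where one factor
 * is already expressed as a power of 2.
 */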
247 */ 248 static uint8_t 249 vdev_raidz_exp2(uint_t a, int exp) 250 { 251 if (a == 0) 252 return (0); 253 254 ASSERT(exp >= 0); 255 ASSERT(vdev_raidz_log2[a] > 0 || a == 1); 256 257 exp += vdev_raidz_log2[a]; 258 if (exp > 255) 259 exp -= 255; 260 261 return (vdev_raidz_pow2[exp]); 262 } 263 264 static void 265 vdev_raidz_map_free(raidz_map_t *rm) 266 { 267 int c; 268 269 for (c = 0; c < rm->rm_firstdatacol; c++) { 270 abd_free(rm->rm_col[c].rc_abd); 271 272 if (rm->rm_col[c].rc_gdata != NULL) 273 abd_free(rm->rm_col[c].rc_gdata); 274 275 } 276 277 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) 278 abd_put(rm->rm_col[c].rc_abd); 279 280 if (rm->rm_abd_copy != NULL) 281 abd_free(rm->rm_abd_copy); 282 283 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); 284 } 285 286 static void 287 vdev_raidz_map_free_vsd(zio_t *zio) 288 { 289 raidz_map_t *rm = zio->io_vsd; 290 291 ASSERT0(rm->rm_freed); 292 rm->rm_freed = 1; 293 294 if (rm->rm_reports == 0) 295 vdev_raidz_map_free(rm); 296 } 297 298 /*ARGSUSED*/ 299 static void 300 vdev_raidz_cksum_free(void *arg, size_t ignored) 301 { 302 raidz_map_t *rm = arg; 303 304 ASSERT3U(rm->rm_reports, >, 0); 305 306 if (--rm->rm_reports == 0 && rm->rm_freed != 0) 307 vdev_raidz_map_free(rm); 308 } 309 310 static void 311 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data) 312 { 313 raidz_map_t *rm = zcr->zcr_cbdata; 314 size_t c = zcr->zcr_cbinfo; 315 size_t x, offset; 316 317 const abd_t *good = NULL; 318 const abd_t *bad = rm->rm_col[c].rc_abd; 319 320 if (good_data == NULL) { 321 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); 322 return; 323 } 324 325 if (c < rm->rm_firstdatacol) { 326 /* 327 * The first time through, calculate the parity blocks for 328 * the good data (this relies on the fact that the good 329 * data never changes for a given logical ZIO) 330 */ 331 if (rm->rm_col[0].rc_gdata == NULL) { 332 abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY]; 333 334 /* 335 * Set up the rm_col[]s to generate the parity for 336 * good_data, first saving the parity bufs and 337 * replacing them with buffers to hold the result. 338 */ 339 for (x = 0; x < rm->rm_firstdatacol; x++) { 340 bad_parity[x] = rm->rm_col[x].rc_abd; 341 rm->rm_col[x].rc_abd = 342 rm->rm_col[x].rc_gdata = 343 abd_alloc_sametype(rm->rm_col[x].rc_abd, 344 rm->rm_col[x].rc_size); 345 } 346 347 /* fill in the data columns from good_data */ 348 offset = 0; 349 for (; x < rm->rm_cols; x++) { 350 abd_put(rm->rm_col[x].rc_abd); 351 352 rm->rm_col[x].rc_abd = 353 abd_get_offset_size((abd_t *)good_data, 354 offset, rm->rm_col[x].rc_size); 355 offset += rm->rm_col[x].rc_size; 356 } 357 358 /* 359 * Construct the parity from the good data. 
360 */ 361 vdev_raidz_generate_parity(rm); 362 363 /* restore everything back to its original state */ 364 for (x = 0; x < rm->rm_firstdatacol; x++) 365 rm->rm_col[x].rc_abd = bad_parity[x]; 366 367 offset = 0; 368 for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { 369 abd_put(rm->rm_col[x].rc_abd); 370 rm->rm_col[x].rc_abd = abd_get_offset_size( 371 rm->rm_abd_copy, offset, 372 rm->rm_col[x].rc_size); 373 offset += rm->rm_col[x].rc_size; 374 } 375 } 376 377 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); 378 good = abd_get_offset_size(rm->rm_col[c].rc_gdata, 0, 379 rm->rm_col[c].rc_size); 380 } else { 381 /* adjust good_data to point at the start of our column */ 382 offset = 0; 383 for (x = rm->rm_firstdatacol; x < c; x++) 384 offset += rm->rm_col[x].rc_size; 385 386 good = abd_get_offset_size((abd_t *)good_data, offset, 387 rm->rm_col[c].rc_size); 388 } 389 390 /* we drop the ereport if it ends up that the data was good */ 391 zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); 392 abd_put((abd_t *)good); 393 } 394 395 /* 396 * Invoked indirectly by zfs_ereport_start_checksum(), called 397 * below when our read operation fails completely. The main point 398 * is to keep a copy of everything we read from disk, so that at 399 * vdev_raidz_cksum_finish() time we can compare it with the good data. 400 */ 401 static void 402 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) 403 { 404 size_t c = (size_t)(uintptr_t)arg; 405 size_t offset; 406 407 raidz_map_t *rm = zio->io_vsd; 408 size_t size; 409 410 /* set up the report and bump the refcount */ 411 zcr->zcr_cbdata = rm; 412 zcr->zcr_cbinfo = c; 413 zcr->zcr_finish = vdev_raidz_cksum_finish; 414 zcr->zcr_free = vdev_raidz_cksum_free; 415 416 rm->rm_reports++; 417 ASSERT3U(rm->rm_reports, >, 0); 418 419 if (rm->rm_abd_copy != NULL) 420 return; 421 422 /* 423 * It's the first time we're called for this raidz_map_t, so we need 424 * to copy the data aside; there's no guarantee that our zio's buffer 425 * won't be re-used for something else. 426 * 427 * Our parity data is already in separate buffers, so there's no need 428 * to copy them. 429 */ 430 431 size = 0; 432 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) 433 size += rm->rm_col[c].rc_size; 434 435 rm->rm_abd_copy = abd_alloc_for_io(size, B_FALSE); 436 437 for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 438 raidz_col_t *col = &rm->rm_col[c]; 439 abd_t *tmp = abd_get_offset_size(rm->rm_abd_copy, offset, 440 col->rc_size); 441 442 ASSERT3S(tmp->abd_size, >=, col->rc_size); 443 ASSERT3S(col->rc_abd->abd_size, >=, col->rc_size); 444 abd_copy_off(tmp, col->rc_abd, 0, 0, col->rc_size); 445 abd_put(col->rc_abd); 446 col->rc_abd = tmp; 447 448 offset += col->rc_size; 449 } 450 ASSERT3U(offset, ==, size); 451 } 452 453 static const zio_vsd_ops_t vdev_raidz_vsd_ops = { 454 vdev_raidz_map_free_vsd, 455 vdev_raidz_cksum_report 456 }; 457 458 /* 459 * Divides the IO evenly across all child vdevs; usually, dcols is 460 * the number of children in the target vdev. 461 */ 462 static raidz_map_t * 463 vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, 464 uint64_t unit_shift, uint64_t dcols, uint64_t nparity) 465 { 466 raidz_map_t *rm; 467 /* The starting RAIDZ (parent) vdev sector of the block. */ 468 uint64_t b = offset >> unit_shift; 469 /* The zio's size in units of the vdev's minimum sector size. */ 470 uint64_t s = size >> unit_shift; 471 /* The first column for this stripe. 

/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 */
static raidz_map_t *
vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset,
    uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
	uint64_t off = 0;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_abd_copy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_abd = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_abd =
		    abd_alloc_linear(rm->rm_col[c].rc_size, B_FALSE);

	rm->rm_col[c].rc_abd = abd_get_offset_size(abd, 0,
	    rm->rm_col[c].rc_size);
	off = rm->rm_col[c].rc_size;

	for (c = c + 1; c < acols; c++) {
		rm->rm_col[c].rc_abd = abd_get_offset_size(abd, off,
		    rm->rm_col[c].rc_size);
		off += rm->rm_col[c].rc_size;
	}

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit.
	 * Further, occasional writes that aren't a multiple of the LCM of
	 * the number of children and the minimum stripe width are
	 * sufficient to avoid pessimal behavior. Unfortunately, this
	 * decision created an implicit on-disk format requirement that we
	 * need to support for all eternity, but only for single-parity
	 * RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}

struct pqr_struct {
	uint64_t *p;
	uint64_t *q;
	uint64_t *r;
};

static int
vdev_raidz_p_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && !pqr->q && !pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++)
		*pqr->p ^= *src;

	return (0);
}

static int
vdev_raidz_pq_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	uint64_t mask;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && pqr->q && !pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
		*pqr->p ^= *src;
		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
		*pqr->q ^= *src;
	}

	return (0);
}

static int
vdev_raidz_pqr_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	uint64_t mask;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && pqr->q && pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
		*pqr->p ^= *src;
		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
		*pqr->q ^= *src;
		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
		*pqr->r ^= *src;
	}

	return (0);
}

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p;
	int c;
	abd_t *src;

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);

		if (c == rm->rm_firstdatacol) {
			ASSERT3U(src->abd_size, >=, rm->rm_col[c].rc_size);
			abd_copy_to_buf_off(p, src, 0, rm->rm_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, NULL, NULL };
			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_p_func, &pqr);
		}
	}
}

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, pcnt, ccnt, mask, i;
	int c;
	abd_t *src;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);

		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);

			abd_copy_to_buf_off(p, src, 0, rm->rm_col[c].rc_size);
			(void) memcpy(q, p, rm->rm_col[c].rc_size);
			for (i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
			}
		} else {
			struct pqr_struct pqr = { p, q, NULL };

			ASSERT(ccnt <= pcnt);

			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_pq_func, &pqr);

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
	int c;
	abd_t *src;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
		r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);

		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT3S(src->abd_size, >=, rm->rm_col[c].rc_size);
			ASSERT(ccnt == pcnt || ccnt == 0);
			abd_copy_to_buf_off(p, src, 0, rm->rm_col[c].rc_size);
			(void) memcpy(q, p, rm->rm_col[c].rc_size);
			(void) memcpy(r, p, rm->rm_col[c].rc_size);

			for (i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
				r[i] = 0;
			}
		} else {
			struct pqr_struct pqr = { p, q, r };

			ASSERT(ccnt <= pcnt);
			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_pqr_func, &pqr);

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
				VDEV_RAIDZ_64MUL_4(r[i], mask);
			}
		}
	}
}
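
/*
 * The update loops above evaluate the parity polynomials by Horner's rule,
 * one 64-bit lane at a time. As an illustrative example with three data
 * columns, after visiting D_0, D_1, and D_2 in order the Q accumulator
 * holds (D_0 * 2 + D_1) * 2 + D_2 = 2^2 * D_0 + 2 * D_1 + D_2, matching
 * the Q equation in the block comment at the top of this file; R is the
 * same scheme with multiplier 4.
 */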
795 */ 796 static void 797 vdev_raidz_generate_parity(raidz_map_t *rm) 798 { 799 switch (rm->rm_firstdatacol) { 800 case 1: 801 vdev_raidz_generate_parity_p(rm); 802 break; 803 case 2: 804 vdev_raidz_generate_parity_pq(rm); 805 break; 806 case 3: 807 vdev_raidz_generate_parity_pqr(rm); 808 break; 809 default: 810 cmn_err(CE_PANIC, "invalid RAID-Z configuration"); 811 } 812 } 813 814 /* ARGSUSED */ 815 static int 816 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) 817 { 818 uint64_t *dst = dbuf; 819 uint64_t *src = sbuf; 820 int cnt = size / sizeof (src[0]); 821 822 for (int i = 0; i < cnt; i++) { 823 dst[i] ^= src[i]; 824 } 825 826 return (0); 827 } 828 829 /* ARGSUSED */ 830 static int 831 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size, 832 void *private) 833 { 834 uint64_t *dst = dbuf; 835 uint64_t *src = sbuf; 836 uint64_t mask; 837 int cnt = size / sizeof (dst[0]); 838 839 for (int i = 0; i < cnt; i++, dst++, src++) { 840 VDEV_RAIDZ_64MUL_2(*dst, mask); 841 *dst ^= *src; 842 } 843 844 return (0); 845 } 846 847 /* ARGSUSED */ 848 static int 849 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private) 850 { 851 uint64_t *dst = buf; 852 uint64_t mask; 853 int cnt = size / sizeof (dst[0]); 854 855 for (int i = 0; i < cnt; i++, dst++) { 856 /* same operation as vdev_raidz_reconst_q_pre_func() on dst */ 857 VDEV_RAIDZ_64MUL_2(*dst, mask); 858 } 859 860 return (0); 861 } 862 863 struct reconst_q_struct { 864 uint64_t *q; 865 int exp; 866 }; 867 868 static int 869 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private) 870 { 871 struct reconst_q_struct *rq = private; 872 uint64_t *dst = buf; 873 int cnt = size / sizeof (dst[0]); 874 875 for (int i = 0; i < cnt; i++, dst++, rq->q++) { 876 *dst ^= *rq->q; 877 878 int j; 879 uint8_t *b; 880 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 881 *b = vdev_raidz_exp2(*b, rq->exp); 882 } 883 } 884 885 return (0); 886 } 887 888 struct reconst_pq_struct { 889 uint8_t *p; 890 uint8_t *q; 891 uint8_t *pxy; 892 uint8_t *qxy; 893 int aexp; 894 int bexp; 895 }; 896 897 static int 898 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private) 899 { 900 struct reconst_pq_struct *rpq = private; 901 uint8_t *xd = xbuf; 902 uint8_t *yd = ybuf; 903 904 for (int i = 0; i < size; 905 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) { 906 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 907 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 908 *yd = *rpq->p ^ *rpq->pxy ^ *xd; 909 } 910 911 return (0); 912 } 913 914 static int 915 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) 916 { 917 struct reconst_pq_struct *rpq = private; 918 uint8_t *xd = xbuf; 919 920 for (int i = 0; i < size; 921 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) { 922 /* same operation as vdev_raidz_reconst_pq_func() on xd */ 923 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 924 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 925 } 926 927 return (0); 928 } 929 930 static int 931 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) 932 { 933 int x = tgts[0]; 934 int c; 935 abd_t *dst, *src; 936 937 ASSERT(ntgts == 1); 938 ASSERT(x >= rm->rm_firstdatacol); 939 ASSERT(x < rm->rm_cols); 940 941 ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size); 942 ASSERT(rm->rm_col[x].rc_size > 0); 943 944 src = rm->rm_col[VDEV_RAIDZ_P].rc_abd; 945 dst = rm->rm_col[x].rc_abd; 946 947 ASSERT3S(dst->abd_size, >=, 
	ASSERT3S(dst->abd_size, >=, rm->rm_col[x].rc_size);
	ASSERT3S(src->abd_size, >=, rm->rm_col[x].rc_size);
	abd_copy_off(dst, src, 0, 0, rm->rm_col[x].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		uint64_t size = MIN(rm->rm_col[x].rc_size,
		    rm->rm_col[c].rc_size);

		src = rm->rm_col[c].rc_abd;
		dst = rm->rm_col[x].rc_abd;

		if (c == x)
			continue;

		(void) abd_iterate_func2(dst, src, 0, 0, size,
		    vdev_raidz_reconst_p_func, NULL);
	}

	return (1 << VDEV_RAIDZ_P);
}

static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	int x = tgts[0];
	int c, exp;
	abd_t *dst, *src;

	ASSERT(ntgts == 1);

	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
		    rm->rm_col[c].rc_size);

		src = rm->rm_col[c].rc_abd;
		dst = rm->rm_col[x].rc_abd;

		if (c == rm->rm_firstdatacol) {
			if (dst != src) {
				ASSERT3S(dst->abd_size, >=, size);
				ASSERT3S(src->abd_size, >=, size);
				abd_copy_off(dst, src, 0, 0, size);
			}
			if (rm->rm_col[x].rc_size > size)
				abd_zero_off(dst, size,
				    rm->rm_col[x].rc_size - size);
		} else {
			ASSERT3U(size, <=, rm->rm_col[x].rc_size);
			if (src != dst)
				(void) abd_iterate_func2(dst, src, 0, 0, size,
				    vdev_raidz_reconst_q_pre_func, NULL);
			(void) abd_iterate_func(dst,
			    size, rm->rm_col[x].rc_size - size,
			    vdev_raidz_reconst_q_pre_tail_func, NULL);
		}
	}

	src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
	dst = rm->rm_col[x].rc_abd;
	exp = 255 - (rm->rm_cols - 1 - x);

	struct reconst_q_struct rq = { abd_to_buf(src), exp };
	(void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
	    vdev_raidz_reconst_q_post_func, &rq);

	return (1 << VDEV_RAIDZ_Q);
}

static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
	abd_t *pdata, *qdata;
	uint64_t xsize, ysize;
	int x = tgts[0];
	int y = tgts[1];
	abd_t *xd, *yd;

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
1040 */ 1041 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd; 1042 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; 1043 xsize = rm->rm_col[x].rc_size; 1044 ysize = rm->rm_col[y].rc_size; 1045 1046 rm->rm_col[VDEV_RAIDZ_P].rc_abd = 1047 abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE); 1048 rm->rm_col[VDEV_RAIDZ_Q].rc_abd = 1049 abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); 1050 rm->rm_col[x].rc_size = 0; 1051 rm->rm_col[y].rc_size = 0; 1052 1053 vdev_raidz_generate_parity_pq(rm); 1054 1055 rm->rm_col[x].rc_size = xsize; 1056 rm->rm_col[y].rc_size = ysize; 1057 1058 p = abd_to_buf(pdata); 1059 q = abd_to_buf(qdata); 1060 pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); 1061 qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); 1062 xd = rm->rm_col[x].rc_abd; 1063 yd = rm->rm_col[y].rc_abd; 1064 1065 /* 1066 * We now have: 1067 * Pxy = P + D_x + D_y 1068 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 1069 * 1070 * We can then solve for D_x: 1071 * D_x = A * (P + Pxy) + B * (Q + Qxy) 1072 * where 1073 * A = 2^(x - y) * (2^(x - y) + 1)^-1 1074 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 1075 * 1076 * With D_x in hand, we can easily solve for D_y: 1077 * D_y = P + Pxy + D_x 1078 */ 1079 1080 a = vdev_raidz_pow2[255 + x - y]; 1081 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; 1082 tmp = 255 - vdev_raidz_log2[a ^ 1]; 1083 1084 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 1085 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 1086 1087 ASSERT3U(xsize, >=, ysize); 1088 struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp }; 1089 (void) abd_iterate_func2(xd, yd, 0, 0, ysize, 1090 vdev_raidz_reconst_pq_func, &rpq); 1091 (void) abd_iterate_func(xd, ysize, xsize - ysize, 1092 vdev_raidz_reconst_pq_tail_func, &rpq); 1093 1094 abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd); 1095 abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); 1096 1097 /* 1098 * Restore the saved parity data. 1099 */ 1100 rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata; 1101 rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata; 1102 1103 return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); 1104 } 1105 1106 /* BEGIN CSTYLED */ 1107 /* 1108 * In the general case of reconstruction, we must solve the system of linear 1109 * equations defined by the coeffecients used to generate parity as well as 1110 * the contents of the data and parity disks. This can be expressed with 1111 * vectors for the original data (D) and the actual data (d) and parity (p) 1112 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 1113 * 1114 * __ __ __ __ 1115 * | | __ __ | p_0 | 1116 * | V | | D_0 | | p_m-1 | 1117 * | | x | : | = | d_0 | 1118 * | I | | D_n-1 | | : | 1119 * | | ~~ ~~ | d_n-1 | 1120 * ~~ ~~ ~~ ~~ 1121 * 1122 * I is simply a square identity matrix of size n, and V is a vandermonde 1123 * matrix defined by the coeffecients we chose for the various parity columns 1124 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 1125 * computation as well as linear separability. 1126 * 1127 * __ __ __ __ 1128 * | 1 .. 1 1 1 | | p_0 | 1129 * | 2^n-1 .. 4 2 1 | __ __ | : | 1130 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 1131 * | 1 .. 0 0 0 | | D_1 | | d_0 | 1132 * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 1133 * | : : : : | | : | | d_2 | 1134 * | 0 .. 1 0 0 | | D_n-1 | | : | 1135 * | 0 .. 0 1 0 | ~~ ~~ | : | 1136 * | 0 .. 0 0 1 | | d_n-1 | 1137 * ~~ ~~ ~~ ~~ 1138 * 1139 * Note that I, V, d, and p are known. 
 * To compute D, we must invert the matrix and use the known data and parity
 * values to reconstruct the unknown data values. We begin by removing the
 * rows in V|I and d|p that correspond to failed or missing columns; we then
 * make V|I square (n x n) and d|p sized n by removing rows corresponding to
 * unused parity from the bottom up to generate (V|I)' and (d|p)'. We can
 * then generate the inverse of (V|I)' using Gauss-Jordan elimination. In the
 * example below we use m=3 parity columns, n=8 data columns, with errors in
 * d_1, d_2, and p_1:
 *	          __                                  __
 *	          |   1   1   1   1   1   1   1   1   |
 *	          | 128  64  32  16   8   4   2   1   | <-----+-+-- missing disks
 *	          |  19 205 116  29  64  16   4   1   |      / /
 *	          |   1   0   0   0   0   0   0   0   |     / /
 *	          |   0   1   0   0   0   0   0   0   | <--' /
 *	(V|I)  =  |   0   0   1   0   0   0   0   0   | <---'
 *	          |   0   0   0   1   0   0   0   0   |
 *	          |   0   0   0   0   1   0   0   0   |
 *	          |   0   0   0   0   0   1   0   0   |
 *	          |   0   0   0   0   0   0   1   0   |
 *	          |   0   0   0   0   0   0   0   1   |
 *	          ~~                                  ~~
 *	          __                                  __
 *	          |   1   1   1   1   1   1   1   1   |
 *	          |  19 205 116  29  64  16   4   1   |
 *	          |   1   0   0   0   0   0   0   0   |
 *	(V|I)' =  |   0   0   0   1   0   0   0   0   |
 *	          |   0   0   0   0   1   0   0   0   |
 *	          |   0   0   0   0   0   1   0   0   |
 *	          |   0   0   0   0   0   0   1   0   |
 *	          |   0   0   0   0   0   0   0   1   |
 *	          ~~                                  ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 *	__                                                                   __
 *	|   1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 *	|  19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0  |
 *	|   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *	|   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *	|   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *	|   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *	|   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *	|   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *	~~                                                                   ~~
 *	__                                                                   __
 *	|   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *	|   1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 *	|  19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0  |
 *	|   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *	|   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *	|   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *	|   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *	|   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *	~~                                                                   ~~
 *	__                                                                   __
 *	|   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *	|   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 *	|   0 205 116   0   0   0   0   0     0   1  19  29  64  16   4   1  |
 *	|   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *	|   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *	|   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *	|   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *	|   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *	~~                                                                   ~~
 *	__                                                                   __
 *	|   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *	|   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 *	|   0   0 185   0   0   0   0   0   205   1 222 208 141 221 201 204  |
 *	|   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *	|   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *	|   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *	|   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *	|   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *	~~                                                                   ~~
 *	__                                                                   __
 *	|   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *	|   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 *	|   0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209  |
 *	|   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *	|   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *	|   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *	|   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *	|   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *	~~                                                                   ~~
 *	__                                                                   __
 *	|   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *	|   0   1   0   0   0   0   0   0   167 100   5  41 159 169 217 208  |
 *	|   0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209  |
 *	|   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *	|   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *	|   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *	|   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *	|   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *	~~                                                                   ~~
 *	             __                                  __
 *	             |   0   0   1   0   0   0   0   0   |
 *	             | 167 100   5  41 159 169 217 208   |
 *	             | 166 100   4  40 158 168 216 209   |
 *	(V|I)'^-1 =  |   0   0   0   1   0   0   0   0   |
 *	             |   0   0   0   0   1   0   0   0   |
 *	             |   0   0   0   0   0   1   0   0   |
 *	             |   0   0   0   0   0   0   1   0   |
 *	             |   0   0   0   0   0   0   0   1   |
 *	             ~~                                  ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}

static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
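		 *
		 * In log space the inverse of a nonzero element v is
		 * vdev_raidz_pow2[255 - vdev_raidz_log2[v]], since
		 * v * 2^(255 - log_2(v)) = 2^255 = 1; the assignment just
		 * below and vdev_raidz_exp2() implement exactly that.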
1341 */ 1342 log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 1343 1344 for (j = 0; j < n; j++) { 1345 rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 1346 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 1347 } 1348 1349 for (ii = 0; ii < nmissing; ii++) { 1350 if (i == ii) 1351 continue; 1352 1353 ASSERT3U(rows[ii][missing[i]], !=, 0); 1354 1355 log = vdev_raidz_log2[rows[ii][missing[i]]]; 1356 1357 for (j = 0; j < n; j++) { 1358 rows[ii][j] ^= 1359 vdev_raidz_exp2(rows[i][j], log); 1360 invrows[ii][j] ^= 1361 vdev_raidz_exp2(invrows[i][j], log); 1362 } 1363 } 1364 } 1365 1366 /* 1367 * Verify that the data that is left in the rows are properly part of 1368 * an identity matrix. 1369 */ 1370 for (i = 0; i < nmissing; i++) { 1371 for (j = 0; j < n; j++) { 1372 if (j == missing[i]) { 1373 ASSERT3U(rows[i][j], ==, 1); 1374 } else { 1375 ASSERT0(rows[i][j]); 1376 } 1377 } 1378 } 1379 } 1380 1381 static void 1382 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, 1383 int *missing, uint8_t **invrows, const uint8_t *used) 1384 { 1385 int i, j, x, cc, c; 1386 uint8_t *src; 1387 uint64_t ccount; 1388 uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; 1389 uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; 1390 uint8_t log = 0; 1391 uint8_t val; 1392 int ll; 1393 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 1394 uint8_t *p, *pp; 1395 size_t psize; 1396 1397 psize = sizeof (invlog[0][0]) * n * nmissing; 1398 p = kmem_alloc(psize, KM_SLEEP); 1399 1400 for (pp = p, i = 0; i < nmissing; i++) { 1401 invlog[i] = pp; 1402 pp += n; 1403 } 1404 1405 for (i = 0; i < nmissing; i++) { 1406 for (j = 0; j < n; j++) { 1407 ASSERT3U(invrows[i][j], !=, 0); 1408 invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 1409 } 1410 } 1411 1412 for (i = 0; i < n; i++) { 1413 c = used[i]; 1414 ASSERT3U(c, <, rm->rm_cols); 1415 1416 src = abd_to_buf(rm->rm_col[c].rc_abd); 1417 ccount = rm->rm_col[c].rc_size; 1418 for (j = 0; j < nmissing; j++) { 1419 cc = missing[j] + rm->rm_firstdatacol; 1420 ASSERT3U(cc, >=, rm->rm_firstdatacol); 1421 ASSERT3U(cc, <, rm->rm_cols); 1422 ASSERT3U(cc, !=, c); 1423 1424 dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd); 1425 dcount[j] = rm->rm_col[cc].rc_size; 1426 } 1427 1428 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); 1429 1430 for (x = 0; x < ccount; x++, src++) { 1431 if (*src != 0) 1432 log = vdev_raidz_log2[*src]; 1433 1434 for (cc = 0; cc < nmissing; cc++) { 1435 if (x >= dcount[cc]) 1436 continue; 1437 1438 if (*src == 0) { 1439 val = 0; 1440 } else { 1441 if ((ll = log + invlog[cc][i]) >= 255) 1442 ll -= 255; 1443 val = vdev_raidz_pow2[ll]; 1444 } 1445 1446 if (i == 0) 1447 dst[cc][x] = val; 1448 else 1449 dst[cc][x] ^= val; 1450 } 1451 } 1452 } 1453 1454 kmem_free(p, psize); 1455 } 1456 1457 static int 1458 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) 1459 { 1460 int n, i, c, t, tt; 1461 int nmissing_rows; 1462 int missing_rows[VDEV_RAIDZ_MAXPARITY]; 1463 int parity_map[VDEV_RAIDZ_MAXPARITY]; 1464 1465 uint8_t *p, *pp; 1466 size_t psize; 1467 1468 uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 1469 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 1470 uint8_t *used; 1471 1472 abd_t **bufs = NULL; 1473 1474 int code = 0; 1475 1476 /* 1477 * Matrix reconstruction can't use scatter ABDs yet, so we allocate 1478 * temporary linear ABDs. 
1479 */ 1480 if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) { 1481 bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE); 1482 1483 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1484 raidz_col_t *col = &rm->rm_col[c]; 1485 1486 bufs[c] = col->rc_abd; 1487 col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE); 1488 ASSERT3S(col->rc_abd->abd_size, >=, col->rc_size); 1489 ASSERT3S(bufs[c]->abd_size, >=, col->rc_size); 1490 abd_copy_off(col->rc_abd, bufs[c], 0, 0, col->rc_size); 1491 } 1492 } 1493 1494 n = rm->rm_cols - rm->rm_firstdatacol; 1495 1496 /* 1497 * Figure out which data columns are missing. 1498 */ 1499 nmissing_rows = 0; 1500 for (t = 0; t < ntgts; t++) { 1501 if (tgts[t] >= rm->rm_firstdatacol) { 1502 missing_rows[nmissing_rows++] = 1503 tgts[t] - rm->rm_firstdatacol; 1504 } 1505 } 1506 1507 /* 1508 * Figure out which parity columns to use to help generate the missing 1509 * data columns. 1510 */ 1511 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 1512 ASSERT(tt < ntgts); 1513 ASSERT(c < rm->rm_firstdatacol); 1514 1515 /* 1516 * Skip any targeted parity columns. 1517 */ 1518 if (c == tgts[tt]) { 1519 tt++; 1520 continue; 1521 } 1522 1523 code |= 1 << c; 1524 1525 parity_map[i] = c; 1526 i++; 1527 } 1528 1529 ASSERT(code != 0); 1530 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); 1531 1532 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1533 nmissing_rows * n + sizeof (used[0]) * n; 1534 p = kmem_alloc(psize, KM_SLEEP); 1535 1536 for (pp = p, i = 0; i < nmissing_rows; i++) { 1537 rows[i] = pp; 1538 pp += n; 1539 invrows[i] = pp; 1540 pp += n; 1541 } 1542 used = pp; 1543 1544 for (i = 0; i < nmissing_rows; i++) { 1545 used[i] = parity_map[i]; 1546 } 1547 1548 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1549 if (tt < nmissing_rows && 1550 c == missing_rows[tt] + rm->rm_firstdatacol) { 1551 tt++; 1552 continue; 1553 } 1554 1555 ASSERT3S(i, <, n); 1556 used[i] = c; 1557 i++; 1558 } 1559 1560 /* 1561 * Initialize the interesting rows of the matrix. 1562 */ 1563 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); 1564 1565 /* 1566 * Invert the matrix. 1567 */ 1568 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, 1569 invrows, used); 1570 1571 /* 1572 * Reconstruct the missing data using the generated matrix. 1573 */ 1574 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, 1575 invrows, used); 1576 1577 kmem_free(p, psize); 1578 1579 /* 1580 * copy back from temporary linear abds and free them 1581 */ 1582 if (bufs) { 1583 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1584 raidz_col_t *col = &rm->rm_col[c]; 1585 1586 ASSERT3S(bufs[c]->abd_size, >=, col->rc_size); 1587 ASSERT3S(col->rc_abd->abd_size, >=, col->rc_size); 1588 abd_copy_off(bufs[c], col->rc_abd, 0, 0, col->rc_size); 1589 abd_free(col->rc_abd); 1590 col->rc_abd = bufs[c]; 1591 } 1592 kmem_free(bufs, rm->rm_cols * sizeof (abd_t *)); 1593 } 1594 1595 return (code); 1596 } 1597 1598 static int 1599 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) 1600 { 1601 int tgts[VDEV_RAIDZ_MAXPARITY], *dt; 1602 int ntgts; 1603 int i, c; 1604 int code; 1605 int nbadparity, nbaddata; 1606 int parity_valid[VDEV_RAIDZ_MAXPARITY]; 1607 1608 /* 1609 * The tgts list must already be sorted. 
1610 */ 1611 for (i = 1; i < nt; i++) { 1612 ASSERT(t[i] > t[i - 1]); 1613 } 1614 1615 nbadparity = rm->rm_firstdatacol; 1616 nbaddata = rm->rm_cols - nbadparity; 1617 ntgts = 0; 1618 for (i = 0, c = 0; c < rm->rm_cols; c++) { 1619 if (c < rm->rm_firstdatacol) 1620 parity_valid[c] = B_FALSE; 1621 1622 if (i < nt && c == t[i]) { 1623 tgts[ntgts++] = c; 1624 i++; 1625 } else if (rm->rm_col[c].rc_error != 0) { 1626 tgts[ntgts++] = c; 1627 } else if (c >= rm->rm_firstdatacol) { 1628 nbaddata--; 1629 } else { 1630 parity_valid[c] = B_TRUE; 1631 nbadparity--; 1632 } 1633 } 1634 1635 ASSERT(ntgts >= nt); 1636 ASSERT(nbaddata >= 0); 1637 ASSERT(nbaddata + nbadparity == ntgts); 1638 1639 dt = &tgts[nbadparity]; 1640 1641 /* 1642 * See if we can use any of our optimized reconstruction routines. 1643 */ 1644 if (!vdev_raidz_default_to_general) { 1645 switch (nbaddata) { 1646 case 1: 1647 if (parity_valid[VDEV_RAIDZ_P]) 1648 return (vdev_raidz_reconstruct_p(rm, dt, 1)); 1649 1650 ASSERT(rm->rm_firstdatacol > 1); 1651 1652 if (parity_valid[VDEV_RAIDZ_Q]) 1653 return (vdev_raidz_reconstruct_q(rm, dt, 1)); 1654 1655 ASSERT(rm->rm_firstdatacol > 2); 1656 break; 1657 1658 case 2: 1659 ASSERT(rm->rm_firstdatacol > 1); 1660 1661 if (parity_valid[VDEV_RAIDZ_P] && 1662 parity_valid[VDEV_RAIDZ_Q]) 1663 return (vdev_raidz_reconstruct_pq(rm, dt, 2)); 1664 1665 ASSERT(rm->rm_firstdatacol > 2); 1666 1667 break; 1668 } 1669 } 1670 1671 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); 1672 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); 1673 ASSERT(code > 0); 1674 return (code); 1675 } 1676 1677 static int 1678 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, 1679 uint64_t *ashift) 1680 { 1681 vdev_t *cvd; 1682 uint64_t nparity = vd->vdev_nparity; 1683 int c; 1684 int lasterror = 0; 1685 int numerrors = 0; 1686 1687 ASSERT(nparity > 0); 1688 1689 if (nparity > VDEV_RAIDZ_MAXPARITY || 1690 vd->vdev_children < nparity + 1) { 1691 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 1692 return (SET_ERROR(EINVAL)); 1693 } 1694 1695 vdev_open_children(vd); 1696 1697 for (c = 0; c < vd->vdev_children; c++) { 1698 cvd = vd->vdev_child[c]; 1699 1700 if (cvd->vdev_open_error != 0) { 1701 lasterror = cvd->vdev_open_error; 1702 numerrors++; 1703 continue; 1704 } 1705 1706 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 1707 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; 1708 *ashift = MAX(*ashift, cvd->vdev_ashift); 1709 } 1710 1711 *asize *= vd->vdev_children; 1712 *max_asize *= vd->vdev_children; 1713 1714 if (numerrors > nparity) { 1715 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 1716 return (lasterror); 1717 } 1718 1719 return (0); 1720 } 1721 1722 static void 1723 vdev_raidz_close(vdev_t *vd) 1724 { 1725 int c; 1726 1727 for (c = 0; c < vd->vdev_children; c++) 1728 vdev_close(vd->vdev_child[c]); 1729 } 1730 1731 /* 1732 * Handle a read or write I/O to a RAID-Z dump device. 1733 * 1734 * The dump device is in a unique situation compared to other ZFS datasets: 1735 * writing to this device should be as simple and fast as possible. In 1736 * addition, durability matters much less since the dump will be extracted 1737 * once the machine reboots. For that reason, this function eschews parity for 1738 * performance and simplicity. The dump device uses the checksum setting 1739 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this 1740 * dataset. 1741 * 1742 * Blocks of size 128 KB have been preallocated for this volume. 
 * I/Os less than 128 KB will not fill an entire block; in addition, they
 * may not be properly aligned. In that case, this function uses the
 * preallocated 128 KB block and omits reading or writing any "empty"
 * portions of that block, as opposed to allocating a fresh
 * appropriately-sized block.
 *
 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
 *
 *	vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
 *
 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
 * allocated which spans all five child vdevs. 8 KB of data would be written to
 * each of four vdevs, with the fifth containing the parity bits.
 *
 *	     parity    data     data     data     data
 *	   |   PP   |   XX   |   XX   |   XX   |   XX   |
 *	       ^        ^        ^        ^        ^
 *	       |        |        |        |        |
 *	 8 KB parity    ------8 KB data blocks------
 *
 * However, when writing to the dump device, the behavior is different:
 *
 *	vdev_raidz_dumpio(data, size: 32 KB, offset: 64 KB)
 *
 * Unlike the normal RAID-Z case in which the block is allocated based on the
 * I/O size, reads and writes here always use a 128 KB logical I/O size. If the
 * I/O size is less than 128 KB, only the actual portions of data are written.
 * In this example the data is written to the third data vdev since that vdev
 * contains the offset [64 KB, 96 KB).
 *
 *	     parity    data     data     data     data
 *	   |        |        |        |   XX   |        |
 *	                                   ^
 *	                                   |
 *	                           32 KB data block
 *
 * As a result, an individual I/O may not span all child vdevs; moreover, a
 * small I/O may only operate on a single child vdev.
 *
 * Note that since there are no parity bits calculated or written, this format
 * remains the same no matter how many parity bits are used in a normal RAID-Z
 * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above
 * would look like:
 *
 *	     parity   parity   parity    data     data     data     data
 *	   |        |        |        |        |        |   XX   |        |
 *	                                                     ^
 *	                                                     |
 *	                                             32 KB data block
 */
static int
vdev_raidz_dumpio(vdev_t *vd, caddr_t data, size_t size,
    uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
{
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, err = 0;

	uint64_t start, end, colstart, colend;
	uint64_t coloffset, colsize, colskip;

#ifdef _KERNEL

	/*
	 * Don't write past the end of the block
	 */
	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);

	start = offset;
	end = start + size;

	/*
	 * Allocate a RAID-Z map for this block. Note that this block starts
	 * from the "original" offset, that is, the offset of the extent which
	 * contains the requisite offset of the data being read or written.
	 *
	 * Even if this I/O operation doesn't span the full block size, let's
	 * treat the on-disk format as if it consisted only of complete 128 KB
	 * blocks.
1823 */ 1824 abd_t *abd = abd_get_from_buf(data - (offset - origoffset), 1825 SPA_OLD_MAXBLOCKSIZE); 1826 rm = vdev_raidz_map_alloc(abd, 1827 SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift, 1828 vd->vdev_children, vd->vdev_nparity); 1829 1830 coloffset = origoffset; 1831 1832 for (c = rm->rm_firstdatacol; c < rm->rm_cols; 1833 c++, coloffset += rc->rc_size) { 1834 rc = &rm->rm_col[c]; 1835 cvd = vd->vdev_child[rc->rc_devidx]; 1836 1837 if (cvd->vdev_ops->vdev_op_dumpio == NULL) { 1838 err = EINVAL; 1839 break; 1840 } 1841 1842 /* 1843 * Find the start and end of this column in the RAID-Z map, 1844 * keeping in mind that the stated size and offset of the 1845 * operation may not fill the entire column for this vdev. 1846 * 1847 * If any portion of the data spans this column, issue the 1848 * appropriate operation to the vdev. 1849 */ 1850 if (coloffset + rc->rc_size <= start) 1851 continue; 1852 if (coloffset >= end) 1853 continue; 1854 1855 colstart = MAX(coloffset, start); 1856 colend = MIN(end, coloffset + rc->rc_size); 1857 colsize = colend - colstart; 1858 colskip = colstart - coloffset; 1859 1860 VERIFY3U(colsize, <=, rc->rc_size); 1861 VERIFY3U(colskip, <=, rc->rc_size); 1862 1863 /* 1864 * Note that the child vdev will have a vdev label at the start 1865 * of its range of offsets, hence the need for 1866 * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another 1867 * example of why this calculation is needed. 1868 */ 1869 if ((err = cvd->vdev_ops->vdev_op_dumpio(cvd, 1870 ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize, 1871 VDEV_LABEL_OFFSET(rc->rc_offset) + colskip, 0, 1872 doread, isdump)) != 0) 1873 break; 1874 } 1875 1876 vdev_raidz_map_free(rm); 1877 abd_put(abd); 1878 #endif /* KERNEL */ 1879 1880 return (err); 1881 } 1882 1883 static uint64_t 1884 vdev_raidz_asize(vdev_t *vd, uint64_t psize) 1885 { 1886 uint64_t asize; 1887 uint64_t ashift = vd->vdev_top->vdev_ashift; 1888 uint64_t cols = vd->vdev_children; 1889 uint64_t nparity = vd->vdev_nparity; 1890 1891 asize = ((psize - 1) >> ashift) + 1; 1892 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 1893 asize = roundup(asize, nparity + 1) << ashift; 1894 1895 return (asize); 1896 } 1897 1898 static void 1899 vdev_raidz_child_done(zio_t *zio) 1900 { 1901 raidz_col_t *rc = zio->io_private; 1902 1903 rc->rc_error = zio->io_error; 1904 rc->rc_tried = 1; 1905 rc->rc_skipped = 0; 1906 } 1907 1908 static void 1909 vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col) 1910 { 1911 #ifdef ZFS_DEBUG 1912 vdev_t *vd = zio->io_vd; 1913 vdev_t *tvd = vd->vdev_top; 1914 1915 range_seg64_t logical_rs, physical_rs; 1916 logical_rs.rs_start = zio->io_offset; 1917 logical_rs.rs_end = logical_rs.rs_start + 1918 vdev_raidz_asize(zio->io_vd, zio->io_size); 1919 1920 raidz_col_t *rc = &rm->rm_col[col]; 1921 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 1922 1923 vdev_xlate(cvd, &logical_rs, &physical_rs); 1924 ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); 1925 ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); 1926 /* 1927 * It would be nice to assert that rs_end is equal 1928 * to rc_offset + rc_size but there might be an 1929 * optional I/O at the end that is not accounted in 1930 * rc_size. 

/*
 * Start an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Generate the parity data
 *   2. Create child zio write operations to each column's vdev, for both
 *      data and parity.
 *   3. If the column skips any sectors for padding, create optional dummy
 *      write zio children for those areas to improve aggregation contiguity.
 * - For read operations:
 *   1. Create child zio read operations to each data column's vdev to read
 *      the range of data required for zio.
 *   2. If this is a scrub or resilver operation, or if any of the data
 *      vdevs have had errors, then create zio read operations to the parity
 *      columns' VDevs as well.
 */
static void
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, i;

	rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
	    tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_generate_parity(rm);

		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			/*
			 * Verify physical to logical translation.
			 */
			vdev_raidz_io_verify(zio, rm, c);

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		/*
		 * Generate optional I/Os for any skipped sectors to improve
		 * aggregation contiguity.
		 */
		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
			ASSERT(c <= rm->rm_scols);
			if (c == rm->rm_scols)
				c = 0;
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset + rc->rc_size, NULL,
			    1 << tvd->vdev_ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
		}

		zio_execute(zio);
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		if (!vdev_readable(cvd)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ENXIO);
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ESTALE);
			rc->rc_skipped = 1;
			continue;
		}
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	zio_execute(zio);
}
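
/*
 * Illustrative sketch (not compiled into the driver): the wrap-around walk
 * used above to place the optional padding I/Os. Starting at rm_skipstart,
 * the loop visits rm_nskip columns, wrapping back to column 0 when it runs
 * off the end of the column count. For example, with rm_scols = 5,
 * rm_skipstart = 3 and rm_nskip = 3:
 *
 *	uint64_t c = 3;				// rm_skipstart
 *	for (uint64_t i = 0; i < 3; c++, i++) {	// rm_nskip iterations
 *		if (c == 5)			// c == rm_scols, wrap
 *			c = 0;
 *		// visits columns 3, 4, 0 in that order
 *	}
 */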

/*
 * Report a checksum error for a child of a RAID-Z device.
 */
static void
raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
{
	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];

	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		zio_bad_cksum_t zbc;
		raidz_map_t *rm = zio->io_vsd;

		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_checksum_errors++;
		mutex_exit(&vd->vdev_stat_lock);

		zbc.zbc_has_cksum = 0;
		zbc.zbc_injected = rm->rm_ecksuminjected;

		zfs_ereport_post_checksum(zio->io_spa, vd,
		    &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
		    rc->rc_abd, bad_data, &zbc);
	}
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(zio_t *zio)
{
	zio_bad_cksum_t zbc;
	raidz_map_t *rm = zio->io_vsd;

	int ret = zio_checksum_error(zio, &zbc);
	if (ret != 0 && zbc.zbc_injected != 0)
		rm->rm_ecksuminjected = 1;

	return (ret);
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
{
	abd_t *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	blkptr_t *bp = zio->io_bp;
	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));

	if (checksum == ZIO_CHECKSUM_NOPARITY)
		return (ret);

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = abd_alloc_sametype(rc->rc_abd, rc->rc_size);
		abd_copy(orig[c], rc->rc_abd, rc->rc_size);
	}

	vdev_raidz_generate_parity(rm);

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (abd_cmp(orig[c], rc->rc_abd, rc->rc_abd->abd_size) != 0) {
			raidz_checksum_error(zio, rc, orig[c]);
			rc->rc_error = SET_ERROR(ECKSUM);
			ret++;
		}
		abd_free(orig[c]);
	}

	return (ret);
}
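
/*
 * Illustrative sketch (not compiled into the driver): the check above in
 * its simplest single-parity (P-only) form. Regenerate P as the XOR of the
 * data columns and compare it with the parity that was actually read; a
 * mismatch corresponds to a checksum error on the parity column. All names
 * below are local to this sketch.
 *
 *	int
 *	p_verify(const uint8_t *p_read, uint8_t **data, int ncols, size_t len)
 *	{
 *		for (size_t i = 0; i < len; i++) {
 *			uint8_t p = 0;
 *			for (int c = 0; c < ncols; c++)
 *				p ^= data[c][i];	// GF(2^8) addition
 *			if (p != p_read[i])
 *				return (1);		// parity mismatch
 *		}
 *		return (0);
 *	}
 */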

/*
 * Keep statistics on all the ways that we used parity to correct data.
 */
static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];

static int
vdev_raidz_worst_error(raidz_map_t *rm)
{
	int error = 0;

	for (int c = 0; c < rm->rm_cols; c++)
		error = zio_worst_error(error, rm->rm_col[c].rc_error);

	return (error);
}

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 */
static int
vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
{
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	abd_t *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = abd_alloc_sametype(rm->rm_col[0].rc_abd,
		    rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				ASSERT3S(orig[i]->abd_size, >=, rc->rc_size);
				ASSERT3S(rc->rc_abd->abd_size, >=, rc->rc_size);
				abd_copy_off(orig[i], rc->rc_abd, 0, 0,
				    rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					if (rc->rc_tried)
						raidz_checksum_error(zio, rc,
						    orig[i]);
					rc->rc_error = SET_ERROR(ECKSUM);
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				ASSERT3S(rc->rc_abd->abd_size, >=, rc->rc_size);
				ASSERT3S(orig[i]->abd_size, >=, rc->rc_size);
				abd_copy_off(rc->rc_abd, orig[i], 0, 0,
				    rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	n--;
done:
	for (i = 0; i < n; i++)
		abd_free(orig[i]);

	return (ret);
}
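
/*
 * Illustrative sketch (not compiled into the driver): the tgts[] walk above
 * is an odometer-style enumeration of all n-column subsets of the candidate
 * columns. A compact standalone version over columns 0 .. cols-1 (ignoring
 * the error filtering and the data-column constraint) looks like:
 *
 *	void
 *	enum_subsets(int cols, int n)
 *	{
 *		int tgts[n];
 *		for (int i = 0; i < n; i++)	// first subset: 0, 1, .. n-1
 *			tgts[i] = i;
 *		for (;;) {
 *			// reconstruct-and-verify would run here
 *			int i = n - 1;
 *			while (i >= 0 && tgts[i] == cols - n + i)
 *				i--;		// rightmost digit to bump
 *			if (i < 0)
 *				return;		// all subsets visited
 *			tgts[i]++;
 *			for (int j = i + 1; j < n; j++)
 *				tgts[j] = tgts[j - 1] + 1;
 *		}
 *	}
 */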
2235 */ 2236 for (i = 0; i < n; i++) { 2237 ASSERT(orig[i] != NULL); 2238 c = tgts[i]; 2239 ASSERT3S(c, >=, 0); 2240 ASSERT3S(c, <, rm->rm_cols); 2241 rc = &rm->rm_col[c]; 2242 ASSERT3S(orig[i]->abd_size, >=, rc->rc_size); 2243 ASSERT3S(rc->rc_abd->abd_size, >=, rc->rc_size); 2244 abd_copy_off(orig[i], rc->rc_abd, 0, 0, 2245 rc->rc_size); 2246 } 2247 2248 /* 2249 * Attempt a reconstruction and exit the outer loop on 2250 * success. 2251 */ 2252 code = vdev_raidz_reconstruct(rm, tgts, n); 2253 if (raidz_checksum_verify(zio) == 0) { 2254 atomic_inc_64(&raidz_corrected[code]); 2255 2256 for (i = 0; i < n; i++) { 2257 c = tgts[i]; 2258 rc = &rm->rm_col[c]; 2259 ASSERT(rc->rc_error == 0); 2260 if (rc->rc_tried) 2261 raidz_checksum_error(zio, rc, 2262 orig[i]); 2263 rc->rc_error = SET_ERROR(ECKSUM); 2264 } 2265 2266 ret = code; 2267 goto done; 2268 } 2269 2270 /* 2271 * Restore the original data. 2272 */ 2273 for (i = 0; i < n; i++) { 2274 c = tgts[i]; 2275 rc = &rm->rm_col[c]; 2276 ASSERT3S(rc->rc_abd->abd_size, >=, rc->rc_size); 2277 ASSERT3S(orig[i]->abd_size, >=, rc->rc_size); 2278 abd_copy_off(rc->rc_abd, orig[i], 0, 0, 2279 rc->rc_size); 2280 } 2281 2282 do { 2283 /* 2284 * Find the next valid column after the current 2285 * position.. 2286 */ 2287 for (next = tgts[current] + 1; 2288 next < rm->rm_cols && 2289 rm->rm_col[next].rc_error != 0; next++) 2290 continue; 2291 2292 ASSERT(next <= tgts[current + 1]); 2293 2294 /* 2295 * If that spot is available, we're done here. 2296 */ 2297 if (next != tgts[current + 1]) 2298 break; 2299 2300 /* 2301 * Otherwise, find the next valid column after 2302 * the previous position. 2303 */ 2304 for (c = tgts[current - 1] + 1; 2305 rm->rm_col[c].rc_error != 0; c++) 2306 continue; 2307 2308 tgts[current] = c; 2309 current++; 2310 2311 } while (current != n); 2312 } 2313 } 2314 n--; 2315 done: 2316 for (i = 0; i < n; i++) 2317 abd_free(orig[i]); 2318 2319 return (ret); 2320 } 2321 2322 /* 2323 * Complete an IO operation on a RAIDZ VDev 2324 * 2325 * Outline: 2326 * - For write operations: 2327 * 1. Check for errors on the child IOs. 2328 * 2. Return, setting an error code if too few child VDevs were written 2329 * to reconstruct the data later. Note that partial writes are 2330 * considered successful if they can be reconstructed at all. 2331 * - For read operations: 2332 * 1. Check for errors on the child IOs. 2333 * 2. If data errors occurred: 2334 * a. Try to reassemble the data from the parity available. 2335 * b. If we haven't yet read the parity drives, read them now. 2336 * c. If all parity drives have been read but the data still doesn't 2337 * reassemble with a correct checksum, then try combinatorial 2338 * reconstruction. 2339 * d. If that doesn't work, return an error. 2340 * 3. If there were unexpected errors or this is a resilver operation, 2341 * rewrite the vdevs that had errors. 
2342 */ 2343 static void 2344 vdev_raidz_io_done(zio_t *zio) 2345 { 2346 vdev_t *vd = zio->io_vd; 2347 vdev_t *cvd; 2348 raidz_map_t *rm = zio->io_vsd; 2349 raidz_col_t *rc; 2350 int unexpected_errors = 0; 2351 int parity_errors = 0; 2352 int parity_untried = 0; 2353 int data_errors = 0; 2354 int total_errors = 0; 2355 int n, c; 2356 int tgts[VDEV_RAIDZ_MAXPARITY]; 2357 int code; 2358 2359 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ 2360 2361 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); 2362 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); 2363 2364 for (c = 0; c < rm->rm_cols; c++) { 2365 rc = &rm->rm_col[c]; 2366 2367 if (rc->rc_error) { 2368 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 2369 2370 if (c < rm->rm_firstdatacol) 2371 parity_errors++; 2372 else 2373 data_errors++; 2374 2375 if (!rc->rc_skipped) 2376 unexpected_errors++; 2377 2378 total_errors++; 2379 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { 2380 parity_untried++; 2381 } 2382 } 2383 2384 if (zio->io_type == ZIO_TYPE_WRITE) { 2385 /* 2386 * XXX -- for now, treat partial writes as a success. 2387 * (If we couldn't write enough columns to reconstruct 2388 * the data, the I/O failed. Otherwise, good enough.) 2389 * 2390 * Now that we support write reallocation, it would be better 2391 * to treat partial failure as real failure unless there are 2392 * no non-degraded top-level vdevs left, and not update DTLs 2393 * if we intend to reallocate. 2394 */ 2395 /* XXPOLICY */ 2396 if (total_errors > rm->rm_firstdatacol) 2397 zio->io_error = vdev_raidz_worst_error(rm); 2398 2399 return; 2400 } 2401 2402 ASSERT(zio->io_type == ZIO_TYPE_READ); 2403 /* 2404 * There are three potential phases for a read: 2405 * 1. produce valid data from the columns read 2406 * 2. read all disks and try again 2407 * 3. perform combinatorial reconstruction 2408 * 2409 * Each phase is progressively both more expensive and less likely to 2410 * occur. If we encounter more errors than we can repair or all phases 2411 * fail, we have no choice but to return an error. 2412 */ 2413 2414 /* 2415 * If the number of errors we saw was correctable -- less than or equal 2416 * to the number of parity disks read -- attempt to produce data that 2417 * has a valid checksum. Naturally, this case applies in the absence of 2418 * any errors. 2419 */ 2420 if (total_errors <= rm->rm_firstdatacol - parity_untried) { 2421 if (data_errors == 0) { 2422 if (raidz_checksum_verify(zio) == 0) { 2423 /* 2424 * If we read parity information (unnecessarily 2425 * as it happens since no reconstruction was 2426 * needed) regenerate and verify the parity. 2427 * We also regenerate parity when resilvering 2428 * so we can write it out to the failed device 2429 * later. 2430 */ 2431 if (parity_errors + parity_untried < 2432 rm->rm_firstdatacol || 2433 (zio->io_flags & ZIO_FLAG_RESILVER)) { 2434 n = raidz_parity_verify(zio, rm); 2435 unexpected_errors += n; 2436 ASSERT(parity_errors + n <= 2437 rm->rm_firstdatacol); 2438 } 2439 goto done; 2440 } 2441 } else { 2442 /* 2443 * We either attempt to read all the parity columns or 2444 * none of them. If we didn't try to read parity, we 2445 * wouldn't be here in the correctable case. There must 2446 * also have been fewer parity errors than parity 2447 * columns or, again, we wouldn't be in this code path. 2448 */ 2449 ASSERT(parity_untried == 0); 2450 ASSERT(parity_errors < rm->rm_firstdatacol); 2451 2452 /* 2453 * Identify the data columns that reported an error. 
2454 */ 2455 n = 0; 2456 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 2457 rc = &rm->rm_col[c]; 2458 if (rc->rc_error != 0) { 2459 ASSERT(n < VDEV_RAIDZ_MAXPARITY); 2460 tgts[n++] = c; 2461 } 2462 } 2463 2464 ASSERT(rm->rm_firstdatacol >= n); 2465 2466 code = vdev_raidz_reconstruct(rm, tgts, n); 2467 2468 if (raidz_checksum_verify(zio) == 0) { 2469 atomic_inc_64(&raidz_corrected[code]); 2470 2471 /* 2472 * If we read more parity disks than were used 2473 * for reconstruction, confirm that the other 2474 * parity disks produced correct data. This 2475 * routine is suboptimal in that it regenerates 2476 * the parity that we already used in addition 2477 * to the parity that we're attempting to 2478 * verify, but this should be a relatively 2479 * uncommon case, and can be optimized if it 2480 * becomes a problem. Note that we regenerate 2481 * parity when resilvering so we can write it 2482 * out to failed devices later. 2483 */ 2484 if (parity_errors < rm->rm_firstdatacol - n || 2485 (zio->io_flags & ZIO_FLAG_RESILVER)) { 2486 n = raidz_parity_verify(zio, rm); 2487 unexpected_errors += n; 2488 ASSERT(parity_errors + n <= 2489 rm->rm_firstdatacol); 2490 } 2491 2492 goto done; 2493 } 2494 } 2495 } 2496 2497 /* 2498 * This isn't a typical situation -- either we got a read error or 2499 * a child silently returned bad data. Read every block so we can 2500 * try again with as much data and parity as we can track down. If 2501 * we've already been through once before, all children will be marked 2502 * as tried so we'll proceed to combinatorial reconstruction. 2503 */ 2504 unexpected_errors = 1; 2505 rm->rm_missingdata = 0; 2506 rm->rm_missingparity = 0; 2507 2508 for (c = 0; c < rm->rm_cols; c++) { 2509 if (rm->rm_col[c].rc_tried) 2510 continue; 2511 2512 zio_vdev_io_redone(zio); 2513 do { 2514 rc = &rm->rm_col[c]; 2515 if (rc->rc_tried) 2516 continue; 2517 zio_nowait(zio_vdev_child_io(zio, NULL, 2518 vd->vdev_child[rc->rc_devidx], 2519 rc->rc_offset, rc->rc_abd, rc->rc_size, 2520 zio->io_type, zio->io_priority, 0, 2521 vdev_raidz_child_done, rc)); 2522 } while (++c < rm->rm_cols); 2523 2524 return; 2525 } 2526 2527 /* 2528 * At this point we've attempted to reconstruct the data given the 2529 * errors we detected, and we've attempted to read all columns. There 2530 * must, therefore, be one or more additional problems -- silent errors 2531 * resulting in invalid data rather than explicit I/O errors resulting 2532 * in absent data. We check if there is enough additional data to 2533 * possibly reconstruct the data and then perform combinatorial 2534 * reconstruction over all possible combinations. If that fails, 2535 * we're cooked. 2536 */ 2537 if (total_errors > rm->rm_firstdatacol) { 2538 zio->io_error = vdev_raidz_worst_error(rm); 2539 2540 } else if (total_errors < rm->rm_firstdatacol && 2541 (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) { 2542 /* 2543 * If we didn't use all the available parity for the 2544 * combinatorial reconstruction, verify that the remaining 2545 * parity is correct. 2546 */ 2547 if (code != (1 << rm->rm_firstdatacol) - 1) 2548 (void) raidz_parity_verify(zio, rm); 2549 } else { 2550 /* 2551 * We're here because either: 2552 * 2553 * total_errors == rm_first_datacol, or 2554 * vdev_raidz_combrec() failed 2555 * 2556 * In either case, there is enough bad data to prevent 2557 * reconstruction. 2558 * 2559 * Start checksum ereports for all children which haven't 2560 * failed, and the IO wasn't speculative. 
2561 */ 2562 zio->io_error = SET_ERROR(ECKSUM); 2563 2564 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2565 for (c = 0; c < rm->rm_cols; c++) { 2566 rc = &rm->rm_col[c]; 2567 if (rc->rc_error == 0) { 2568 zio_bad_cksum_t zbc; 2569 zbc.zbc_has_cksum = 0; 2570 zbc.zbc_injected = 2571 rm->rm_ecksuminjected; 2572 2573 zfs_ereport_start_checksum( 2574 zio->io_spa, 2575 vd->vdev_child[rc->rc_devidx], 2576 &zio->io_bookmark, zio, 2577 rc->rc_offset, rc->rc_size, 2578 (void *)(uintptr_t)c, &zbc); 2579 } 2580 } 2581 } 2582 } 2583 2584 done: 2585 zio_checksum_verified(zio); 2586 2587 if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 2588 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { 2589 /* 2590 * Use the good data we have in hand to repair damaged children. 2591 */ 2592 for (c = 0; c < rm->rm_cols; c++) { 2593 rc = &rm->rm_col[c]; 2594 cvd = vd->vdev_child[rc->rc_devidx]; 2595 2596 if (rc->rc_error == 0) 2597 continue; 2598 2599 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2600 rc->rc_offset, rc->rc_abd, rc->rc_size, 2601 ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, 2602 ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 2603 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); 2604 } 2605 } 2606 } 2607 2608 static void 2609 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 2610 { 2611 if (faulted > vd->vdev_nparity) 2612 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2613 VDEV_AUX_NO_REPLICAS); 2614 else if (degraded + faulted != 0) 2615 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 2616 else 2617 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 2618 } 2619 2620 /* 2621 * Determine if any portion of the provided block resides on a child vdev 2622 * with a dirty DTL and therefore needs to be resilvered. The function 2623 * assumes that at least one DTL is dirty which imples that full stripe 2624 * width blocks must be resilvered. 2625 */ 2626 static boolean_t 2627 vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) 2628 { 2629 uint64_t dcols = vd->vdev_children; 2630 uint64_t nparity = vd->vdev_nparity; 2631 uint64_t ashift = vd->vdev_top->vdev_ashift; 2632 /* The starting RAIDZ (parent) vdev sector of the block. */ 2633 uint64_t b = offset >> ashift; 2634 /* The zio's size in units of the vdev's minimum sector size. */ 2635 uint64_t s = ((psize - 1) >> ashift) + 1; 2636 /* The first column for this stripe. */ 2637 uint64_t f = b % dcols; 2638 2639 if (s + nparity >= dcols) 2640 return (B_TRUE); 2641 2642 for (uint64_t c = 0; c < s + nparity; c++) { 2643 uint64_t devidx = (f + c) % dcols; 2644 vdev_t *cvd = vd->vdev_child[devidx]; 2645 2646 /* 2647 * dsl_scan_need_resilver() already checked vd with 2648 * vdev_dtl_contains(). So here just check cvd with 2649 * vdev_dtl_empty(), cheaper and a good approximation. 
2650 */ 2651 if (!vdev_dtl_empty(cvd, DTL_PARTIAL)) 2652 return (B_TRUE); 2653 } 2654 2655 return (B_FALSE); 2656 } 2657 2658 static void 2659 vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *in, range_seg64_t *res) 2660 { 2661 vdev_t *raidvd = cvd->vdev_parent; 2662 ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); 2663 2664 uint64_t width = raidvd->vdev_children; 2665 uint64_t tgt_col = cvd->vdev_id; 2666 uint64_t ashift = raidvd->vdev_top->vdev_ashift; 2667 2668 /* make sure the offsets are block-aligned */ 2669 ASSERT0(in->rs_start % (1 << ashift)); 2670 ASSERT0(in->rs_end % (1 << ashift)); 2671 uint64_t b_start = in->rs_start >> ashift; 2672 uint64_t b_end = in->rs_end >> ashift; 2673 2674 uint64_t start_row = 0; 2675 if (b_start > tgt_col) /* avoid underflow */ 2676 start_row = ((b_start - tgt_col - 1) / width) + 1; 2677 2678 uint64_t end_row = 0; 2679 if (b_end > tgt_col) 2680 end_row = ((b_end - tgt_col - 1) / width) + 1; 2681 2682 res->rs_start = start_row << ashift; 2683 res->rs_end = end_row << ashift; 2684 2685 ASSERT3U(res->rs_start, <=, in->rs_start); 2686 ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start); 2687 } 2688 2689 vdev_ops_t vdev_raidz_ops = { 2690 .vdev_op_open = vdev_raidz_open, 2691 .vdev_op_close = vdev_raidz_close, 2692 .vdev_op_asize = vdev_raidz_asize, 2693 .vdev_op_io_start = vdev_raidz_io_start, 2694 .vdev_op_io_done = vdev_raidz_io_done, 2695 .vdev_op_state_change = vdev_raidz_state_change, 2696 .vdev_op_need_resilver = vdev_raidz_need_resilver, 2697 .vdev_op_hold = NULL, 2698 .vdev_op_rele = NULL, 2699 .vdev_op_remap = NULL, 2700 .vdev_op_xlate = vdev_raidz_xlate, 2701 .vdev_op_dumpio = vdev_raidz_dumpio, 2702 .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */ 2703 .vdev_op_leaf = B_FALSE /* not a leaf vdev */ 2704 }; 2705