/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_file.h>
#include <sys/vdev_raidz.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>

#ifdef ZFS_DEBUG
#include <sys/vdev.h>	/* For vdev_xlate() in vdev_raidz_io_verify() */
#endif

/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank, on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later to identify a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field of size 2^N. In our
 * case we choose N=8 for GF(2^8) so that all elements can be expressed with
 * a single byte. Briefly, the operations on the field are defined as
 * follows:
 *
 *	o addition (+) is represented by a bitwise XOR
 *	o subtraction (-) is therefore identical to addition: A + B = A - B
 *	o multiplication of A by 2 is defined by the following bitwise
 *	  expression:
 *
 *		(A * 2)_7 = A_6
 *		(A * 2)_6 = A_5
 *		(A * 2)_5 = A_4
 *		(A * 2)_4 = A_3 + A_7
 *		(A * 2)_3 = A_2 + A_7
 *		(A * 2)_2 = A_1 + A_7
 *		(A * 2)_1 = A_0
 *		(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick lookups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). The inverse of a field element A (A^-1) is therefore
 * A ^ (255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q, R, over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property, which is why the uncorrected Plank method breaks down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */
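
/*
 * For illustration only, and not part of the driver: multiplication by an
 * arbitrary constant can be built from the doubling rule above by
 * shift-and-XOR, one multiplier bit at a time. The helper below (a
 * hypothetical name, compiled out) shows the technique; for example,
 * gf_mul(2, 0x80) == 0x1d, the reduction by the primitive polynomial, and
 * gf_mul(a, 1) == a for any a.
 */
#if 0
static uint8_t
gf_mul(uint8_t a, uint8_t b)
{
	uint8_t p = 0;

	while (b != 0) {
		if (b & 1)
			p ^= a;		/* add 'a' into the product */
		/* a *= 2, reducing by 0x1d whenever bit 7 falls off */
		a = (a << 1) ^ ((a & 0x80) ? 0x1d : 0);
		b >>= 1;
	}
	return (p);
}
#endif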

typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	abd_t *rc_abd;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	abd_t *rm_abd_copy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
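
/*
 * For illustration only: a worked example of the mask trick. For each byte
 * whose top bit is set, ((mask << 1) - (mask >> 7)) yields 0xff (and 0x00
 * for the rest), so the final XOR reduces exactly those bytes by 0x1d.
 * The hypothetical, compiled-out check below doubles four 0x80 bytes and
 * four 0x01 bytes at once: each 0x80 becomes 0x1d and each 0x01 becomes
 * 0x02.
 */
#if 0
static void
vdev_raidz_64mul_check(void)
{
	uint64_t x = 0x8001800180018001ULL, mask;

	VDEV_RAIDZ_64MUL_2(x, mask);
	ASSERT3U(x, ==, 0x1d021d021d021d02ULL);
}
#endif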

#define	VDEV_LABEL_OFFSET(x)	(x + VDEV_LABEL_START_SIZE)

/*
 * Force reconstruction to use the general purpose method.
 */
int vdev_raidz_default_to_general;

/* Powers of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};

/* Logs of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
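
/*
 * Since zero has no logarithm, vdev_raidz_log2[0] is merely a placeholder;
 * every consumer must special-case zero, as vdev_raidz_exp2() below does.
 * And because the multiplicative group of the field has order 255,
 * vdev_raidz_pow2[255] wraps back around to 2^0 = 0x01.
 */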

static void vdev_raidz_generate_parity(raidz_map_t *rm);

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}
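
/*
 * For illustration only: the multiplicative inverse described in the block
 * comment at the top of the file follows directly from these tables, since
 * A^-1 = 2^(255 - log_2(A)). The helper below (a hypothetical name,
 * compiled out) makes that concrete; e.g. the inverse of 2 is
 * vdev_raidz_pow2[254] = 0x8e.
 */
#if 0
static uint8_t
gf_inv(uint8_t a)
{
	ASSERT(a != 0);		/* zero has no inverse */
	return (vdev_raidz_pow2[255 - vdev_raidz_log2[a]]);
}
#endif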

static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		abd_free(rm->rm_col[c].rc_abd);

		if (rm->rm_col[c].rc_gdata != NULL)
			abd_free(rm->rm_col[c].rc_gdata);
	}

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		abd_put(rm->rm_col[c].rc_abd);

	if (rm->rm_abd_copy != NULL)
		abd_free(rm->rm_abd_copy);

	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	ASSERT0(rm->rm_freed);
	rm->rm_freed = 1;

	if (rm->rm_reports == 0)
		vdev_raidz_map_free(rm);
}

/*ARGSUSED*/
static void
vdev_raidz_cksum_free(void *arg, size_t ignored)
{
	raidz_map_t *rm = arg;

	ASSERT3U(rm->rm_reports, >, 0);

	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
		vdev_raidz_map_free(rm);
}

static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;
	size_t x, offset;

	const abd_t *good = NULL;
	const abd_t *bad = rm->rm_col[c].rc_abd;

	if (good_data == NULL) {
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO).
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_abd;
				rm->rm_col[x].rc_abd =
				    rm->rm_col[x].rc_gdata =
				    abd_alloc_sametype(rm->rm_col[x].rc_abd,
				    rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			offset = 0;
			for (; x < rm->rm_cols; x++) {
				abd_put(rm->rm_col[x].rc_abd);

				rm->rm_col[x].rc_abd =
				    abd_get_offset_size((abd_t *)good_data,
				    offset, rm->rm_col[x].rc_size);
				offset += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
362 */ 363 vdev_raidz_generate_parity(rm); 364 365 /* restore everything back to its original state */ 366 for (x = 0; x < rm->rm_firstdatacol; x++) 367 rm->rm_col[x].rc_abd = bad_parity[x]; 368 369 offset = 0; 370 for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { 371 abd_put(rm->rm_col[x].rc_abd); 372 rm->rm_col[x].rc_abd = abd_get_offset_size( 373 rm->rm_abd_copy, offset, 374 rm->rm_col[x].rc_size); 375 offset += rm->rm_col[x].rc_size; 376 } 377 } 378 379 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); 380 good = abd_get_offset_size(rm->rm_col[c].rc_gdata, 0, 381 rm->rm_col[c].rc_size); 382 } else { 383 /* adjust good_data to point at the start of our column */ 384 offset = 0; 385 for (x = rm->rm_firstdatacol; x < c; x++) 386 offset += rm->rm_col[x].rc_size; 387 388 good = abd_get_offset_size((abd_t *)good_data, offset, 389 rm->rm_col[c].rc_size); 390 } 391 392 /* we drop the ereport if it ends up that the data was good */ 393 zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); 394 abd_put((abd_t *)good); 395 } 396 397 /* 398 * Invoked indirectly by zfs_ereport_start_checksum(), called 399 * below when our read operation fails completely. The main point 400 * is to keep a copy of everything we read from disk, so that at 401 * vdev_raidz_cksum_finish() time we can compare it with the good data. 402 */ 403 static void 404 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) 405 { 406 size_t c = (size_t)(uintptr_t)arg; 407 size_t offset; 408 409 raidz_map_t *rm = zio->io_vsd; 410 size_t size; 411 412 /* set up the report and bump the refcount */ 413 zcr->zcr_cbdata = rm; 414 zcr->zcr_cbinfo = c; 415 zcr->zcr_finish = vdev_raidz_cksum_finish; 416 zcr->zcr_free = vdev_raidz_cksum_free; 417 418 rm->rm_reports++; 419 ASSERT3U(rm->rm_reports, >, 0); 420 421 if (rm->rm_abd_copy != NULL) 422 return; 423 424 /* 425 * It's the first time we're called for this raidz_map_t, so we need 426 * to copy the data aside; there's no guarantee that our zio's buffer 427 * won't be re-used for something else. 428 * 429 * Our parity data is already in separate buffers, so there's no need 430 * to copy them. 431 */ 432 433 size = 0; 434 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) 435 size += rm->rm_col[c].rc_size; 436 437 rm->rm_abd_copy = abd_alloc_for_io(size, B_FALSE); 438 439 for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 440 raidz_col_t *col = &rm->rm_col[c]; 441 abd_t *tmp = abd_get_offset_size(rm->rm_abd_copy, offset, 442 col->rc_size); 443 444 ASSERT3S(tmp->abd_size, >=, col->rc_size); 445 ASSERT3S(col->rc_abd->abd_size, >=, col->rc_size); 446 abd_copy_off(tmp, col->rc_abd, 0, 0, col->rc_size); 447 abd_put(col->rc_abd); 448 col->rc_abd = tmp; 449 450 offset += col->rc_size; 451 } 452 ASSERT3U(offset, ==, size); 453 } 454 455 static const zio_vsd_ops_t vdev_raidz_vsd_ops = { 456 vdev_raidz_map_free_vsd, 457 vdev_raidz_cksum_report 458 }; 459 460 /* 461 * Divides the IO evenly across all child vdevs; usually, dcols is 462 * the number of children in the target vdev. 463 */ 464 static raidz_map_t * 465 vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, 466 uint64_t unit_shift, uint64_t dcols, uint64_t nparity) 467 { 468 raidz_map_t *rm; 469 /* The starting RAIDZ (parent) vdev sector of the block. */ 470 uint64_t b = offset >> unit_shift; 471 /* The zio's size in units of the vdev's minimum sector size. */ 472 uint64_t s = size >> unit_shift; 473 /* The first column for this stripe. 
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
	uint64_t off = 0;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_abd_copy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_abd = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_abd =
		    abd_alloc_linear(rm->rm_col[c].rc_size, B_FALSE);

	rm->rm_col[c].rc_abd = abd_get_offset_size(abd, 0,
	    rm->rm_col[c].rc_size);
	off = rm->rm_col[c].rc_size;

	for (c = c + 1; c < acols; c++) {
		rm->rm_col[c].rc_abd = abd_get_offset_size(abd, off,
		    rm->rm_col[c].rc_size);
		off += rm->rm_col[c].rc_size;
	}

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
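
/*
 * A worked example of the geometry above (illustrative only): a 6-sector
 * write (s = 6) to a 5-wide raidz1 (dcols = 5, nparity = 1) yields
 *
 *	q   = 6 / (5 - 1)     = 1	whole sectors on every column
 *	r   = 6 - 1 * (5 - 1) = 2	remainder sectors
 *	bc  = 2 + 1           = 3	"big columns", including parity
 *	tot = 6 + 1 * (1 + 1) = 8	total sectors, including parity
 *
 * so the parity column and the first two data columns receive q + 1 = 2
 * sectors each, and the remaining two data columns receive q = 1 each.
 */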

struct pqr_struct {
	uint64_t *p;
	uint64_t *q;
	uint64_t *r;
};

static int
vdev_raidz_p_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && !pqr->q && !pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++)
		*pqr->p ^= *src;

	return (0);
}

static int
vdev_raidz_pq_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	uint64_t mask;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && pqr->q && !pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
		*pqr->p ^= *src;
		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
		*pqr->q ^= *src;
	}

	return (0);
}

static int
vdev_raidz_pqr_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	uint64_t mask;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && pqr->q && pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
		*pqr->p ^= *src;
		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
		*pqr->q ^= *src;
		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
		*pqr->r ^= *src;
	}

	return (0);
}
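
/*
 * Note that the iterators above double Q (and quadruple R) before XORing
 * in each new data column: this is the Horner evaluation of the Q and R
 * polynomials from the block comment at the top of the file,
 *
 *	Q = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1,
 *
 * applied 64 bits at a time.
 */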

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p;
	int c;
	abd_t *src;

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);

		if (c == rm->rm_firstdatacol) {
			ASSERT3U(src->abd_size, >=, rm->rm_col[c].rc_size);
			abd_copy_to_buf_off(p, src, 0, rm->rm_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, NULL, NULL };
			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_p_func, &pqr);
		}
	}
}

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, pcnt, ccnt, mask, i;
	int c;
	abd_t *src;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);

		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);

			abd_copy_to_buf_off(p, src, 0, rm->rm_col[c].rc_size);
			(void) memcpy(q, p, rm->rm_col[c].rc_size);
			for (i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
			}
		} else {
			struct pqr_struct pqr = { p, q, NULL };

			ASSERT(ccnt <= pcnt);

			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_pq_func, &pqr);

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
	int c;
	abd_t *src;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
		r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);

		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT3S(src->abd_size, >=, rm->rm_col[c].rc_size);
			ASSERT(ccnt == pcnt || ccnt == 0);
			abd_copy_to_buf_off(p, src, 0, rm->rm_col[c].rc_size);
			(void) memcpy(q, p, rm->rm_col[c].rc_size);
			(void) memcpy(r, p, rm->rm_col[c].rc_size);

			for (i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
				r[i] = 0;
			}
		} else {
			struct pqr_struct pqr = { p, q, r };

			ASSERT(ccnt <= pcnt);
			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_pqr_func, &pqr);

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
				VDEV_RAIDZ_64MUL_4(r[i], mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
	}
}

/* ARGSUSED */
static int
vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
{
	uint64_t *dst = dbuf;
	uint64_t *src = sbuf;
	int cnt = size / sizeof (src[0]);

	for (int i = 0; i < cnt; i++) {
		dst[i] ^= src[i];
	}

	return (0);
}

/* ARGSUSED */
static int
vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
    void *private)
{
	uint64_t *dst = dbuf;
	uint64_t *src = sbuf;
	uint64_t mask;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++, src++) {
		VDEV_RAIDZ_64MUL_2(*dst, mask);
		*dst ^= *src;
	}

	return (0);
}

/* ARGSUSED */
static int
vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
{
	uint64_t *dst = buf;
	uint64_t mask;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++) {
		/* same operation as vdev_raidz_reconst_q_pre_func() on dst */
		VDEV_RAIDZ_64MUL_2(*dst, mask);
	}

	return (0);
}

struct reconst_q_struct {
	uint64_t *q;
	int exp;
};

static int
vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
{
	struct reconst_q_struct *rq = private;
	uint64_t *dst = buf;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++, rq->q++) {
		*dst ^= *rq->q;

		int j;
		uint8_t *b;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, rq->exp);
		}
	}

	return (0);
}

struct reconst_pq_struct {
	uint8_t *p;
	uint8_t *q;
	uint8_t *pxy;
	uint8_t *qxy;
	int aexp;
	int bexp;
};

static int
vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
{
	struct reconst_pq_struct *rpq = private;
	uint8_t *xd = xbuf;
	uint8_t *yd = ybuf;

	for (int i = 0; i < size;
	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
	}

	return (0);
}

static int
vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
{
	struct reconst_pq_struct *rpq = private;
	uint8_t *xd = xbuf;

	for (int i = 0; i < size;
	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
		/* same operation as vdev_raidz_reconst_pq_func() on xd */
		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
	}

	return (0);
}

static int
vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
{
	int x = tgts[0];
	int c;
	abd_t *dst, *src;

	ASSERT(ntgts == 1);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(x < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
	ASSERT(rm->rm_col[x].rc_size > 0);

	src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
	dst = rm->rm_col[x].rc_abd;

	ASSERT3S(dst->abd_size, >=, rm->rm_col[x].rc_size);
	ASSERT3S(src->abd_size, >=, rm->rm_col[x].rc_size);
	abd_copy_off(dst, src, 0, 0, rm->rm_col[x].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		uint64_t size = MIN(rm->rm_col[x].rc_size,
		    rm->rm_col[c].rc_size);

		src = rm->rm_col[c].rc_abd;
		dst = rm->rm_col[x].rc_abd;

		if (c == x)
			continue;

		(void) abd_iterate_func2(dst, src, 0, 0, size,
		    vdev_raidz_reconst_p_func, NULL);
	}

	return (1 << VDEV_RAIDZ_P);
}

static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	int x = tgts[0];
	int c, exp;
	abd_t *dst, *src;

	ASSERT(ntgts == 1);

	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
		    rm->rm_col[c].rc_size);

		src = rm->rm_col[c].rc_abd;
		dst = rm->rm_col[x].rc_abd;

		if (c == rm->rm_firstdatacol) {
			if (dst != src) {
				ASSERT3S(dst->abd_size, >=, size);
				ASSERT3S(src->abd_size, >=, size);
				abd_copy_off(dst, src, 0, 0, size);
			}
			if (rm->rm_col[x].rc_size > size)
				abd_zero_off(dst, size,
				    rm->rm_col[x].rc_size - size);
		} else {
			ASSERT3U(size, <=, rm->rm_col[x].rc_size);
			if (src != dst)
				(void) abd_iterate_func2(dst, src, 0, 0, size,
				    vdev_raidz_reconst_q_pre_func, NULL);
			(void) abd_iterate_func(dst,
			    size, rm->rm_col[x].rc_size - size,
			    vdev_raidz_reconst_q_pre_tail_func, NULL);
		}
	}

	src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
	dst = rm->rm_col[x].rc_abd;
	exp = 255 - (rm->rm_cols - 1 - x);

	struct reconst_q_struct rq = { abd_to_buf(src), exp };
	(void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
	    vdev_raidz_reconst_q_post_func, &rq);

	return (1 << VDEV_RAIDZ_Q);
}
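
/*
 * A brief derivation of the exponent used above: the pre-pass leaves the
 * target column holding Q', the Q parity recomputed with D_x treated as
 * zero, and the post-pass XORs in the real Q, leaving
 * Q + Q' = 2^(ndevs - 1 - x) * D_x. Multiplying by
 * 2^(255 - (ndevs - 1 - x)) then recovers D_x, since 2^255 = 1 in GF(2^8).
 */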

static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
	abd_t *pdata, *qdata;
	uint64_t xsize, ysize;
	int x = tgts[0];
	int y = tgts[1];
	abd_t *xd, *yd;

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_abd =
	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
	rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = abd_to_buf(pdata);
	q = abd_to_buf(qdata);
	pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
	qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
	xd = rm->rm_col[x].rc_abd;
	yd = rm->rm_col[y].rc_abd;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	ASSERT3U(xsize, >=, ysize);
	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
	    vdev_raidz_reconst_pq_func, &rpq);
	(void) abd_iterate_func(xd, ysize, xsize - ysize,
	    vdev_raidz_reconst_pq_tail_func, &rpq);

	abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
	abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}

/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity and speedy
 * computation, as well as for linear separability.
 *
 *      __               __               __     __
 *      |   1   ..  1 1 1 |               |  p_0  |
 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
 *      |   :       : : : |   |   :   |   |  d_2  |
 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
 *      |   0   ..  0 0 1 |               | d_n-1 |
 *      ~~               ~~               ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *
 *            __                               __
 *            |   1   1   1   1   1   1   1   1  |
 *            | 128  64  32  16   8   4   2   1  | <-----+-+-- missing disks
 *            |  19 205 116  29  64  16   4   1  |      / /
 *            |   1   0   0   0   0   0   0   0  |     / /
 *            |   0   1   0   0   0   0   0   0  | <--'  /
 *  (V|I)  =  |   0   0   1   0   0   0   0   0  | <---'
 *            |   0   0   0   1   0   0   0   0  |
 *            |   0   0   0   0   1   0   0   0  |
 *            |   0   0   0   0   0   1   0   0  |
 *            |   0   0   0   0   0   0   1   0  |
 *            |   0   0   0   0   0   0   0   1  |
 *            ~~                               ~~
 *            __                               __
 *            |   1   1   1   1   1   1   1   1  |
 *            |  19 205 116  29  64  16   4   1  |
 *            |   1   0   0   0   0   0   0   0  |
 *  (V|I)' =  |   0   0   0   1   0   0   0   0  |
 *            |   0   0   0   0   1   0   0   0  |
 *            |   0   0   0   0   0   1   0   0  |
 *            |   0   0   0   0   0   0   1   0  |
 *            |   0   0   0   0   0   0   0   1  |
 *            ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 *  __                                                                 __
 *  |   1   1   1   1   1   1   1   1      1   0   0   0   0   0   0   0  |
 *  |  19 205 116  29  64  16   4   1      0   1   0   0   0   0   0   0  |
 *  |   1   0   0   0   0   0   0   0      0   0   1   0   0   0   0   0  |
 *  |   0   0   0   1   0   0   0   0      0   0   0   1   0   0   0   0  |
 *  |   0   0   0   0   1   0   0   0      0   0   0   0   1   0   0   0  |
 *  |   0   0   0   0   0   1   0   0      0   0   0   0   0   1   0   0  |
 *  |   0   0   0   0   0   0   1   0      0   0   0   0   0   0   1   0  |
 *  |   0   0   0   0   0   0   0   1      0   0   0   0   0   0   0   1  |
 *  ~~                                                                 ~~
 *  __                                                                 __
 *  |   1   0   0   0   0   0   0   0      0   0   1   0   0   0   0   0  |
 *  |   1   1   1   1   1   1   1   1      1   0   0   0   0   0   0   0  |
 *  |  19 205 116  29  64  16   4   1      0   1   0   0   0   0   0   0  |
 *  |   0   0   0   1   0   0   0   0      0   0   0   1   0   0   0   0  |
 *  |   0   0   0   0   1   0   0   0      0   0   0   0   1   0   0   0  |
 *  |   0   0   0   0   0   1   0   0      0   0   0   0   0   1   0   0  |
 *  |   0   0   0   0   0   0   1   0      0   0   0   0   0   0   1   0  |
 *  |   0   0   0   0   0   0   0   1      0   0   0   0   0   0   0   1  |
 *  ~~                                                                 ~~
 *  __                                                                 __
 *  |   1   0   0   0   0   0   0   0      0   0   1   0   0   0   0   0  |
 *  |   0   1   1   0   0   0   0   0      1   0   1   1   1   1   1   1  |
 *  |   0 205 116   0   0   0   0   0      0   1  19  29  64  16   4   1  |
 *  |   0   0   0   1   0   0   0   0      0   0   0   1   0   0   0   0  |
 *  |   0   0   0   0   1   0   0   0      0   0   0   0   1   0   0   0  |
 *  |   0   0   0   0   0   1   0   0      0   0   0   0   0   1   0   0  |
 *  |   0   0   0   0   0   0   1   0      0   0   0   0   0   0   1   0  |
 *  |   0   0   0   0   0   0   0   1      0   0   0   0   0   0   0   1  |
 *  ~~                                                                 ~~
 *  __                                                                 __
 *  |   1   0   0   0   0   0   0   0      0   0   1   0   0   0   0   0  |
 *  |   0   1   1   0   0   0   0   0      1   0   1   1   1   1   1   1  |
 *  |   0   0 185   0   0   0   0   0    205   1 222 208 141 221 201 204  |
 *  |   0   0   0   1   0   0   0   0      0   0   0   1   0   0   0   0  |
 *  |   0   0   0   0   1   0   0   0      0   0   0   0   1   0   0   0  |
 *  |   0   0   0   0   0   1   0   0      0   0   0   0   0   1   0   0  |
 *  |   0   0   0   0   0   0   1   0      0   0   0   0   0   0   1   0  |
 *  |   0   0   0   0   0   0   0   1      0   0   0   0   0   0   0   1  |
 *  ~~                                                                 ~~
 *  __                                                                 __
 *  |   1   0   0   0   0   0   0   0      0   0   1   0   0   0   0   0  |
 *  |   0   1   1   0   0   0   0   0      1   0   1   1   1   1   1   1  |
 *  |   0   0   1   0   0   0   0   0    166 100   4  40 158 168 216 209  |
 *  |   0   0   0   1   0   0   0   0      0   0   0   1   0   0   0   0  |
 *  |   0   0   0   0   1   0   0   0      0   0   0   0   1   0   0   0  |
 *  |   0   0   0   0   0   1   0   0      0   0   0   0   0   1   0   0  |
 *  |   0   0   0   0   0   0   1   0      0   0   0   0   0   0   1   0  |
 *  |   0   0   0   0   0   0   0   1      0   0   0   0   0   0   0   1  |
 *  ~~                                                                 ~~
 *  __                                                                 __
 *  |   1   0   0   0   0   0   0   0      0   0   1   0   0   0   0   0  |
 *  |   0   1   0   0   0   0   0   0    167 100   5  41 159 169 217 208  |
 *  |   0   0   1   0   0   0   0   0    166 100   4  40 158 168 216 209  |
 *  |   0   0   0   1   0   0   0   0      0   0   0   1   0   0   0   0  |
 *  |   0   0   0   0   1   0   0   0      0   0   0   0   1   0   0   0  |
 *  |   0   0   0   0   0   1   0   0      0   0   0   0   0   1   0   0  |
 *  |   0   0   0   0   0   0   1   0      0   0   0   0   0   0   1   0  |
 *  |   0   0   0   0   0   0   0   1      0   0   0   0   0   0   0   1  |
 *  ~~                                                                 ~~
 *
 *               __                               __
 *               |   0   0   1   0   0   0   0   0  |
 *               | 167 100   5  41 159 169 217 208  |
 *               | 166 100   4  40 158 168 216 209  |
 *  (V|I)'^-1 =  |   0   0   0   1   0   0   0   0  |
 *               |   0   0   0   0   1   0   0   0  |
 *               |   0   0   0   0   0   1   0   0  |
 *               |   0   0   0   0   0   0   1   0  |
 *               |   0   0   0   0   0   0   0   1  |
 *               ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
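
/*
 * To see what vdev_raidz_matrix_init() builds: parity row map[i] = m uses
 * the generator 2^m (1, 2, or 4), and its entry for data column j is
 * (2^m)^(n-1-j). The running "pow" above starts at m * n and steps down by
 * m, so for m = 1 (the Q row) it emits 2^(n-1), 2^(n-2), ... 2^0 --
 * exactly the Q coefficients of the dispersal matrix.
 */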

static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows is properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}

static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log = 0;
	uint8_t val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = abd_to_buf(rm->rm_col[c].rc_abd);
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}

static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	abd_t **bufs = NULL;

	int code = 0;

	/*
	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
	 * temporary linear ABDs.
	 */
	if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
		bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);

		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
			raidz_col_t *col = &rm->rm_col[c];

			bufs[c] = col->rc_abd;
			col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
			ASSERT3S(col->rc_abd->abd_size, >=, col->rc_size);
			ASSERT3S(bufs[c]->abd_size, >=, col->rc_size);
			abd_copy_off(col->rc_abd, bufs[c], 0, 0, col->rc_size);
		}
	}

	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	/*
	 * copy back from temporary linear abds and free them
	 */
	if (bufs) {
		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
			raidz_col_t *col = &rm->rm_col[c];

			ASSERT3S(bufs[c]->abd_size, >=, col->rc_size);
			ASSERT3S(col->rc_abd->abd_size, >=, col->rc_size);
			abd_copy_off(bufs[c], col->rc_abd, 0, 0, col->rc_size);
			abd_free(col->rc_abd);
			col->rc_abd = bufs[c];
		}
		kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
	}

	return (code);
}

static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}
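
/*
 * Every reconstruction path above returns the same kind of value: a
 * bitmask of the parity columns it consumed. Reconstruction from P alone
 * returns (1 << VDEV_RAIDZ_P) == 1; from P and Q together it returns
 * (1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q) == 3; and so on. The
 * raidz_corrected[] array below is indexed by this code.
 */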

static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
    uint64_t *ashift)
{
	vdev_t *cvd;
	uint64_t nparity = vd->vdev_nparity;
	int c;
	int lasterror = 0;
	int numerrors = 0;

	ASSERT(nparity > 0);

	if (nparity > VDEV_RAIDZ_MAXPARITY ||
	    vd->vdev_children < nparity + 1) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	vdev_open_children(vd);

	for (c = 0; c < vd->vdev_children; c++) {
		cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error != 0) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}

		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
		*ashift = MAX(*ashift, cvd->vdev_ashift);
	}

	*asize *= vd->vdev_children;
	*max_asize *= vd->vdev_children;

	if (numerrors > nparity) {
		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}

static void
vdev_raidz_close(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_close(vd->vdev_child[c]);
}

/*
 * Handle a read or write I/O to a RAID-Z dump device.
 *
 * The dump device is in a unique situation compared to other ZFS datasets:
 * writing to this device should be as simple and fast as possible. In
 * addition, durability matters much less since the dump will be extracted
 * once the machine reboots. For that reason, this function eschews parity for
 * performance and simplicity. The dump device uses the checksum setting
 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
 * dataset.
 *
 * Blocks of size 128 KB have been preallocated for this volume. I/Os less than
 * 128 KB will not fill an entire block; in addition, they may not be properly
 * aligned. In that case, this function uses the preallocated 128 KB block and
 * omits reading or writing any "empty" portions of that block, as opposed to
 * allocating a fresh appropriately-sized block.
 *
 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
 *
 *	vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
 *
 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
 * allocated which spans all five child vdevs. 8 KB of data would be written to
 * each of four vdevs, with the fifth containing the parity bits.
 *
 *	     parity    data     data     data     data
 *	   |   PP   |   XX   |   XX   |   XX   |   XX   |
 *	       ^        ^        ^        ^        ^
 *	       |        |        |        |        |
 *	   8 KB parity  ------8 KB data blocks------
 *
 * However, when writing to the dump device, the behavior is different:
 *
 *	vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
 *
 * Unlike the normal RAID-Z case in which the block is allocated based on the
 * I/O size, reads and writes here always use a 128 KB logical I/O size. If the
 * I/O size is less than 128 KB, only the actual portions of data are written.
 * In this example the data is written to the third data vdev since that vdev
 * contains the offset [64 KB, 96 KB).
 *
 *	     parity    data     data     data     data
 *	   |        |        |        |   XX   |        |
 *	                                   ^
 *	                                   |
 *	                             32 KB data block
 *
 * As a result, an individual I/O may not span all child vdevs; moreover, a
 * small I/O may only operate on a single child vdev.
 *
 * Note that since there are no parity bits calculated or written, this format
 * remains the same no matter how many parity bits are used in a normal RAID-Z
 * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above
 * would look like:
 *
 *	     parity   parity   parity    data     data     data     data
 *	   |        |        |        |        |        |   XX   |        |
 *	                                                     ^
 *	                                                     |
 *	                                               32 KB data block
 */
int
vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
    uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
{
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, err = 0;

	uint64_t start, end, colstart, colend;
	uint64_t coloffset, colsize, colskip;

	int flags = doread ? B_READ : B_WRITE;

#ifdef	_KERNEL

	/*
	 * Don't write past the end of the block.
	 */
	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);

	start = offset;
	end = start + size;

	/*
	 * Allocate a RAID-Z map for this block. Note that this block starts
	 * from the "original" offset, that is, the offset of the extent which
	 * contains the requisite offset of the data being read or written.
	 *
	 * Even if this I/O operation doesn't span the full block size, let's
	 * treat the on-disk format as if the only blocks are the complete
	 * 128 KB size.
	 */
	abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
	    SPA_OLD_MAXBLOCKSIZE);
	rm = vdev_raidz_map_alloc(abd,
	    SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
	    vd->vdev_children, vd->vdev_nparity);

	coloffset = origoffset;

	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
	    c++, coloffset += rc->rc_size) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];

		/*
		 * Find the start and end of this column in the RAID-Z map,
		 * keeping in mind that the stated size and offset of the
		 * operation may not fill the entire column for this vdev.
		 *
		 * If any portion of the data spans this column, issue the
		 * appropriate operation to the vdev.
		 */
		if (coloffset + rc->rc_size <= start)
			continue;
		if (coloffset >= end)
			continue;

		colstart = MAX(coloffset, start);
		colend = MIN(end, coloffset + rc->rc_size);
		colsize = colend - colstart;
		colskip = colstart - coloffset;

		VERIFY3U(colsize, <=, rc->rc_size);
		VERIFY3U(colskip, <=, rc->rc_size);

		/*
		 * Note that the child vdev will have a vdev label at the start
		 * of its range of offsets, hence the need for
		 * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another
		 * example of why this calculation is needed.
		 */
		if ((err = vdev_disk_physio(cvd,
		    ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize,
		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
		    flags, isdump)) != 0)
			break;
	}

	vdev_raidz_map_free(rm);
	abd_put(abd);
#endif	/* KERNEL */

	return (err);
}

static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	uint64_t cols = vd->vdev_children;
	uint64_t nparity = vd->vdev_nparity;

	asize = ((psize - 1) >> ashift) + 1;
	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
	asize = roundup(asize, nparity + 1) << ashift;

	return (asize);
}
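
/*
 * A worked example of the arithmetic above (illustrative only): a 32 KB
 * block on a 5-wide raidz1 with ashift = 9 needs 64 data sectors, plus
 * nparity * ceil(64 / 4) = 16 parity sectors, rounded up to a multiple of
 * (nparity + 1) = 2, giving 80 sectors = 40 KB -- the "at least 40 KB"
 * figure used in the dump-device example above.
 */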
 *   2. Create child zio write operations to each column's vdev, for both
 *      data and parity.
 *   3. If the column skips any sectors for padding, create optional dummy
 *      write zio children for those areas to improve aggregation continuity.
 * - For read operations:
 *   1. Create child zio read operations to each data column's vdev to read
 *      the range of data required for zio.
 *   2. If this is a scrub or resilver operation, or if any of the data
 *      vdevs have had errors, then create zio read operations to the parity
 *      columns' VDevs as well.
 */
static void
vdev_raidz_io_start(zio_t *zio)
{
        vdev_t *vd = zio->io_vd;
        vdev_t *tvd = vd->vdev_top;
        vdev_t *cvd;
        raidz_map_t *rm;
        raidz_col_t *rc;
        int c, i;

        rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
            tvd->vdev_ashift, vd->vdev_children,
            vd->vdev_nparity);

        zio->io_vsd = rm;
        zio->io_vsd_ops = &vdev_raidz_vsd_ops;

        ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

        if (zio->io_type == ZIO_TYPE_WRITE) {
                vdev_raidz_generate_parity(rm);

                for (c = 0; c < rm->rm_cols; c++) {
                        rc = &rm->rm_col[c];
                        cvd = vd->vdev_child[rc->rc_devidx];

                        /*
                         * Verify physical to logical translation.
                         */
                        vdev_raidz_io_verify(zio, rm, c);

                        zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
                            rc->rc_offset, rc->rc_abd, rc->rc_size,
                            zio->io_type, zio->io_priority, 0,
                            vdev_raidz_child_done, rc));
                }

                /*
                 * Generate optional I/Os for any skipped sectors to improve
                 * aggregation contiguity.
                 */
                for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
                        ASSERT(c <= rm->rm_scols);
                        if (c == rm->rm_scols)
                                c = 0;
                        rc = &rm->rm_col[c];
                        cvd = vd->vdev_child[rc->rc_devidx];
                        zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
                            rc->rc_offset + rc->rc_size, NULL,
                            1 << tvd->vdev_ashift,
                            zio->io_type, zio->io_priority,
                            ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
                }

                zio_execute(zio);
                return;
        }

        ASSERT(zio->io_type == ZIO_TYPE_READ);

        /*
         * Iterate over the columns in reverse order so that we hit the parity
         * last -- any errors along the way will force us to read the parity.
         */
        for (c = rm->rm_cols - 1; c >= 0; c--) {
                rc = &rm->rm_col[c];
                cvd = vd->vdev_child[rc->rc_devidx];
                if (!vdev_readable(cvd)) {
                        if (c >= rm->rm_firstdatacol)
                                rm->rm_missingdata++;
                        else
                                rm->rm_missingparity++;
                        rc->rc_error = SET_ERROR(ENXIO);
                        rc->rc_tried = 1;       /* don't even try */
                        rc->rc_skipped = 1;
                        continue;
                }
                if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
                        if (c >= rm->rm_firstdatacol)
                                rm->rm_missingdata++;
                        else
                                rm->rm_missingparity++;
                        rc->rc_error = SET_ERROR(ESTALE);
                        rc->rc_skipped = 1;
                        continue;
                }
                if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
                    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
                        zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
                            rc->rc_offset, rc->rc_abd, rc->rc_size,
                            zio->io_type, zio->io_priority, 0,
                            vdev_raidz_child_done, rc));
                }
        }

        zio_execute(zio);
}


/*
 * Report a checksum error for a child of a RAID-Z device.
 */
static void
raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
{
        vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];

        if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
                zio_bad_cksum_t zbc;
                raidz_map_t *rm = zio->io_vsd;

                mutex_enter(&vd->vdev_stat_lock);
                vd->vdev_stat.vs_checksum_errors++;
                mutex_exit(&vd->vdev_stat_lock);

                zbc.zbc_has_cksum = 0;
                zbc.zbc_injected = rm->rm_ecksuminjected;

                zfs_ereport_post_checksum(zio->io_spa, vd,
                    &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
                    rc->rc_abd, bad_data, &zbc);
        }
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(zio_t *zio)
{
        zio_bad_cksum_t zbc;
        raidz_map_t *rm = zio->io_vsd;

        int ret = zio_checksum_error(zio, &zbc);
        if (ret != 0 && zbc.zbc_injected != 0)
                rm->rm_ecksuminjected = 1;

        return (ret);
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
{
        abd_t *orig[VDEV_RAIDZ_MAXPARITY];
        int c, ret = 0;
        raidz_col_t *rc;

        blkptr_t *bp = zio->io_bp;
        enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
            (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));

        if (checksum == ZIO_CHECKSUM_NOPARITY)
                return (ret);

        for (c = 0; c < rm->rm_firstdatacol; c++) {
                rc = &rm->rm_col[c];
                if (!rc->rc_tried || rc->rc_error != 0)
                        continue;
                orig[c] = abd_alloc_sametype(rc->rc_abd, rc->rc_size);
                abd_copy(orig[c], rc->rc_abd, rc->rc_size);
        }

        vdev_raidz_generate_parity(rm);

        for (c = 0; c < rm->rm_firstdatacol; c++) {
                rc = &rm->rm_col[c];
                if (!rc->rc_tried || rc->rc_error != 0)
                        continue;
                if (abd_cmp(orig[c], rc->rc_abd, rc->rc_abd->abd_size) != 0) {
                        raidz_checksum_error(zio, rc, orig[c]);
                        rc->rc_error = SET_ERROR(ECKSUM);
                        ret++;
                }
                abd_free(orig[c]);
        }

        return (ret);
}

/*
 * Keep statistics on all the ways that we used parity to correct data.
 */
static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];

static int
vdev_raidz_worst_error(raidz_map_t *rm)
{
        int error = 0;

        for (int c = 0; c < rm->rm_cols; c++)
                error = zio_worst_error(error, rm->rm_col[c].rc_error);

        return (error);
}

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
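 *
 * For illustration: on a raidz2 map with columns { P, Q, D0, D1, D2 } and
 * no known errors, the loop below first tries each single data column
 * ({ D0 }, { D1 }, { D2 }) and then every pair that includes at least one
 * data column ({ P, D0 }, { Q, D0 }, { P, D1 }, ... { D1, D2 }).
 * Parity-only combinations such as { P, Q } are never generated, since
 * regenerating parity alone cannot produce data with a valid checksum.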
 */
static int
vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
{
        raidz_map_t *rm = zio->io_vsd;
        raidz_col_t *rc;
        abd_t *orig[VDEV_RAIDZ_MAXPARITY];
        int tstore[VDEV_RAIDZ_MAXPARITY + 2];
        int *tgts = &tstore[1];
        int current, next, i, c, n;
        int code, ret = 0;

        ASSERT(total_errors < rm->rm_firstdatacol);

        /*
         * This simplifies one edge condition.
         */
        tgts[-1] = -1;

        for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
                /*
                 * Initialize the targets array by finding the first n columns
                 * that contain no error.
                 *
                 * If there were no data errors, we need to ensure that we're
                 * always explicitly attempting to reconstruct at least one
                 * data column. To do this, we simply push the highest target
                 * up into the data columns.
                 */
                for (c = 0, i = 0; i < n; i++) {
                        if (i == n - 1 && data_errors == 0 &&
                            c < rm->rm_firstdatacol) {
                                c = rm->rm_firstdatacol;
                        }

                        while (rm->rm_col[c].rc_error != 0) {
                                c++;
                                ASSERT3S(c, <, rm->rm_cols);
                        }

                        tgts[i] = c++;
                }

                /*
                 * Setting tgts[n] simplifies the other edge condition.
                 */
                tgts[n] = rm->rm_cols;

                /*
                 * These buffers were allocated in previous iterations.
                 */
                for (i = 0; i < n - 1; i++) {
                        ASSERT(orig[i] != NULL);
                }

                orig[n - 1] = abd_alloc_sametype(rm->rm_col[0].rc_abd,
                    rm->rm_col[0].rc_size);

                current = 0;
                next = tgts[current];

                while (current != n) {
                        tgts[current] = next;
                        current = 0;

                        /*
                         * Save off the original data that we're going to
                         * attempt to reconstruct.
                         */
                        for (i = 0; i < n; i++) {
                                ASSERT(orig[i] != NULL);
                                c = tgts[i];
                                ASSERT3S(c, >=, 0);
                                ASSERT3S(c, <, rm->rm_cols);
                                rc = &rm->rm_col[c];
                                ASSERT3S(orig[i]->abd_size, >=, rc->rc_size);
                                ASSERT3S(rc->rc_abd->abd_size, >=,
                                    rc->rc_size);
                                abd_copy_off(orig[i], rc->rc_abd, 0, 0,
                                    rc->rc_size);
                        }

                        /*
                         * Attempt a reconstruction and exit the outer loop on
                         * success.
                         */
                        code = vdev_raidz_reconstruct(rm, tgts, n);
                        if (raidz_checksum_verify(zio) == 0) {
                                atomic_inc_64(&raidz_corrected[code]);

                                for (i = 0; i < n; i++) {
                                        c = tgts[i];
                                        rc = &rm->rm_col[c];
                                        ASSERT(rc->rc_error == 0);
                                        if (rc->rc_tried)
                                                raidz_checksum_error(zio, rc,
                                                    orig[i]);
                                        rc->rc_error = SET_ERROR(ECKSUM);
                                }

                                ret = code;
                                goto done;
                        }

                        /*
                         * Restore the original data.
                         */
                        for (i = 0; i < n; i++) {
                                c = tgts[i];
                                rc = &rm->rm_col[c];
                                ASSERT3S(rc->rc_abd->abd_size, >=,
                                    rc->rc_size);
                                ASSERT3S(orig[i]->abd_size, >=, rc->rc_size);
                                abd_copy_off(rc->rc_abd, orig[i], 0, 0,
                                    rc->rc_size);
                        }

                        do {
                                /*
                                 * Find the next valid column after the current
                                 * position.
                                 */
                                for (next = tgts[current] + 1;
                                    next < rm->rm_cols &&
                                    rm->rm_col[next].rc_error != 0; next++)
                                        continue;

                                ASSERT(next <= tgts[current + 1]);

                                /*
                                 * If that spot is available, we're done here.
                                 */
                                if (next != tgts[current + 1])
                                        break;

                                /*
                                 * Otherwise, find the next valid column after
                                 * the previous position.
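                                 * E.g., on a raidz2 map with no known errors,
                                 * once { Q, D0 } has been tried the low slot
                                 * resets to the first valid column (P) and
                                 * the next slot advances, yielding { P, D1 }.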
                                 */
                                for (c = tgts[current - 1] + 1;
                                    rm->rm_col[c].rc_error != 0; c++)
                                        continue;

                                tgts[current] = c;
                                current++;

                        } while (current != n);
                }
        }
        n--;
done:
        for (i = 0; i < n; i++)
                abd_free(orig[i]);

        return (ret);
}

/*
 * Complete an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Check for errors on the child IOs.
 *   2. Return, setting an error code if too few child VDevs were written
 *      to reconstruct the data later. Note that partial writes are
 *      considered successful if they can be reconstructed at all.
 * - For read operations:
 *   1. Check for errors on the child IOs.
 *   2. If data errors occurred:
 *      a. Try to reassemble the data from the parity available.
 *      b. If we haven't yet read the parity drives, read them now.
 *      c. If all parity drives have been read but the data still doesn't
 *         reassemble with a correct checksum, then try combinatorial
 *         reconstruction.
 *      d. If that doesn't work, return an error.
 *   3. If there were unexpected errors or this is a resilver operation,
 *      rewrite the vdevs that had errors.
 */
static void
vdev_raidz_io_done(zio_t *zio)
{
        vdev_t *vd = zio->io_vd;
        vdev_t *cvd;
        raidz_map_t *rm = zio->io_vsd;
        raidz_col_t *rc;
        int unexpected_errors = 0;
        int parity_errors = 0;
        int parity_untried = 0;
        int data_errors = 0;
        int total_errors = 0;
        int n, c;
        int tgts[VDEV_RAIDZ_MAXPARITY];
        int code;

        ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */

        ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
        ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

        for (c = 0; c < rm->rm_cols; c++) {
                rc = &rm->rm_col[c];

                if (rc->rc_error) {
                        ASSERT(rc->rc_error != ECKSUM); /* child has no bp */

                        if (c < rm->rm_firstdatacol)
                                parity_errors++;
                        else
                                data_errors++;

                        if (!rc->rc_skipped)
                                unexpected_errors++;

                        total_errors++;
                } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
                        parity_untried++;
                }
        }

        if (zio->io_type == ZIO_TYPE_WRITE) {
                /*
                 * XXX -- for now, treat partial writes as a success.
                 * (If we couldn't write enough columns to reconstruct
                 * the data, the I/O failed. Otherwise, good enough.)
                 *
                 * Now that we support write reallocation, it would be better
                 * to treat partial failure as real failure unless there are
                 * no non-degraded top-level vdevs left, and not update DTLs
                 * if we intend to reallocate.
                 */
                /* XXPOLICY */
                if (total_errors > rm->rm_firstdatacol)
                        zio->io_error = vdev_raidz_worst_error(rm);

                return;
        }

        ASSERT(zio->io_type == ZIO_TYPE_READ);
        /*
         * There are three potential phases for a read:
         *      1. produce valid data from the columns read
         *      2. read all disks and try again
         *      3. perform combinatorial reconstruction
         *
         * Each phase is progressively both more expensive and less likely to
         * occur. If we encounter more errors than we can repair or all phases
         * fail, we have no choice but to return an error.
         */

        /*
         * If the number of errors we saw was correctable -- less than or equal
         * to the number of parity disks read -- attempt to produce data that
         * has a valid checksum.
         * Naturally, this case applies in the absence of any errors.
         */
        if (total_errors <= rm->rm_firstdatacol - parity_untried) {
                if (data_errors == 0) {
                        if (raidz_checksum_verify(zio) == 0) {
                                /*
                                 * If we read parity information (unnecessarily
                                 * as it happens since no reconstruction was
                                 * needed) regenerate and verify the parity.
                                 * We also regenerate parity when resilvering
                                 * so we can write it out to the failed device
                                 * later.
                                 */
                                if (parity_errors + parity_untried <
                                    rm->rm_firstdatacol ||
                                    (zio->io_flags & ZIO_FLAG_RESILVER)) {
                                        n = raidz_parity_verify(zio, rm);
                                        unexpected_errors += n;
                                        ASSERT(parity_errors + n <=
                                            rm->rm_firstdatacol);
                                }
                                goto done;
                        }
                } else {
                        /*
                         * We either attempt to read all the parity columns or
                         * none of them. If we didn't try to read parity, we
                         * wouldn't be here in the correctable case. There must
                         * also have been fewer parity errors than parity
                         * columns or, again, we wouldn't be in this code path.
                         */
                        ASSERT(parity_untried == 0);
                        ASSERT(parity_errors < rm->rm_firstdatacol);

                        /*
                         * Identify the data columns that reported an error.
                         */
                        n = 0;
                        for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
                                rc = &rm->rm_col[c];
                                if (rc->rc_error != 0) {
                                        ASSERT(n < VDEV_RAIDZ_MAXPARITY);
                                        tgts[n++] = c;
                                }
                        }

                        ASSERT(rm->rm_firstdatacol >= n);

                        code = vdev_raidz_reconstruct(rm, tgts, n);

                        if (raidz_checksum_verify(zio) == 0) {
                                atomic_inc_64(&raidz_corrected[code]);

                                /*
                                 * If we read more parity disks than were used
                                 * for reconstruction, confirm that the other
                                 * parity disks produced correct data. This
                                 * routine is suboptimal in that it regenerates
                                 * the parity that we already used in addition
                                 * to the parity that we're attempting to
                                 * verify, but this should be a relatively
                                 * uncommon case, and can be optimized if it
                                 * becomes a problem. Note that we regenerate
                                 * parity when resilvering so we can write it
                                 * out to failed devices later.
                                 */
                                if (parity_errors < rm->rm_firstdatacol - n ||
                                    (zio->io_flags & ZIO_FLAG_RESILVER)) {
                                        n = raidz_parity_verify(zio, rm);
                                        unexpected_errors += n;
                                        ASSERT(parity_errors + n <=
                                            rm->rm_firstdatacol);
                                }

                                goto done;
                        }
                }
        }

        /*
         * This isn't a typical situation -- either we got a read error or
         * a child silently returned bad data. Read every block so we can
         * try again with as much data and parity as we can track down. If
         * we've already been through once before, all children will be marked
         * as tried so we'll proceed to combinatorial reconstruction.
         */
        unexpected_errors = 1;
        rm->rm_missingdata = 0;
        rm->rm_missingparity = 0;

        for (c = 0; c < rm->rm_cols; c++) {
                if (rm->rm_col[c].rc_tried)
                        continue;

                zio_vdev_io_redone(zio);
                do {
                        rc = &rm->rm_col[c];
                        if (rc->rc_tried)
                                continue;
                        zio_nowait(zio_vdev_child_io(zio, NULL,
                            vd->vdev_child[rc->rc_devidx],
                            rc->rc_offset, rc->rc_abd, rc->rc_size,
                            zio->io_type, zio->io_priority, 0,
                            vdev_raidz_child_done, rc));
                } while (++c < rm->rm_cols);

                return;
        }

        /*
         * At this point we've attempted to reconstruct the data given the
         * errors we detected, and we've attempted to read all columns.
         * There must, therefore, be one or more additional problems -- silent
         * errors resulting in invalid data rather than explicit I/O errors
         * resulting in absent data. We check if there is enough additional
         * data to possibly reconstruct the data and then perform combinatorial
         * reconstruction over all possible combinations. If that fails,
         * we're cooked.
         */
        if (total_errors > rm->rm_firstdatacol) {
                zio->io_error = vdev_raidz_worst_error(rm);

        } else if (total_errors < rm->rm_firstdatacol &&
            (code = vdev_raidz_combrec(zio, total_errors,
            data_errors)) != 0) {
                /*
                 * If we didn't use all the available parity for the
                 * combinatorial reconstruction, verify that the remaining
                 * parity is correct.
                 */
                if (code != (1 << rm->rm_firstdatacol) - 1)
                        (void) raidz_parity_verify(zio, rm);
        } else {
                /*
                 * We're here because either:
                 *
                 *      total_errors == rm_firstdatacol, or
                 *      vdev_raidz_combrec() failed
                 *
                 * In either case, there is enough bad data to prevent
                 * reconstruction.
                 *
                 * Start checksum ereports for all children which haven't
                 * failed, provided the I/O wasn't speculative.
                 */
                zio->io_error = SET_ERROR(ECKSUM);

                if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
                        for (c = 0; c < rm->rm_cols; c++) {
                                rc = &rm->rm_col[c];
                                if (rc->rc_error == 0) {
                                        zio_bad_cksum_t zbc;
                                        zbc.zbc_has_cksum = 0;
                                        zbc.zbc_injected =
                                            rm->rm_ecksuminjected;

                                        zfs_ereport_start_checksum(
                                            zio->io_spa,
                                            vd->vdev_child[rc->rc_devidx],
                                            &zio->io_bookmark, zio,
                                            rc->rc_offset, rc->rc_size,
                                            (void *)(uintptr_t)c, &zbc);
                                }
                        }
                }
        }

done:
        zio_checksum_verified(zio);

        if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
            (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
                /*
                 * Use the good data we have in hand to repair damaged
                 * children.
                 */
                for (c = 0; c < rm->rm_cols; c++) {
                        rc = &rm->rm_col[c];
                        cvd = vd->vdev_child[rc->rc_devidx];

                        if (rc->rc_error == 0)
                                continue;

                        zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
                            rc->rc_offset, rc->rc_abd, rc->rc_size,
                            ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
                            ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
                            ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
                }
        }
}

static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
        if (faulted > vd->vdev_nparity)
                vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
                    VDEV_AUX_NO_REPLICAS);
        else if (degraded + faulted != 0)
                vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
                    VDEV_AUX_NONE);
        else
                vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY,
                    VDEV_AUX_NONE);
}

/*
 * Determine if any portion of the provided block resides on a child vdev
 * with a dirty DTL and therefore needs to be resilvered. The function
 * assumes that at least one DTL is dirty which implies that full stripe
 * width blocks must be resilvered.
 */
static boolean_t
vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
{
        uint64_t dcols = vd->vdev_children;
        uint64_t nparity = vd->vdev_nparity;
        uint64_t ashift = vd->vdev_top->vdev_ashift;
        /* The starting RAIDZ (parent) vdev sector of the block. */
        uint64_t b = offset >> ashift;
        /*
         * The zio's size in units of the vdev's minimum sector size.
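         * For example, with ashift = 9 (512-byte sectors), a psize of 5 KB
         * gives s = ((5120 - 1) >> 9) + 1 = 10 sectors.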
         */
        uint64_t s = ((psize - 1) >> ashift) + 1;
        /* The first column for this stripe. */
        uint64_t f = b % dcols;

        if (s + nparity >= dcols)
                return (B_TRUE);

        for (uint64_t c = 0; c < s + nparity; c++) {
                uint64_t devidx = (f + c) % dcols;
                vdev_t *cvd = vd->vdev_child[devidx];

                /*
                 * dsl_scan_need_resilver() already checked vd with
                 * vdev_dtl_contains(). So here just check cvd with
                 * vdev_dtl_empty(), cheaper and a good approximation.
                 */
                if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
                        return (B_TRUE);
        }

        return (B_FALSE);
}

static void
vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *in, range_seg64_t *res)
{
        vdev_t *raidvd = cvd->vdev_parent;
        ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);

        uint64_t width = raidvd->vdev_children;
        uint64_t tgt_col = cvd->vdev_id;
        uint64_t ashift = raidvd->vdev_top->vdev_ashift;

        /* make sure the offsets are block-aligned */
        ASSERT0(in->rs_start % (1 << ashift));
        ASSERT0(in->rs_end % (1 << ashift));
        uint64_t b_start = in->rs_start >> ashift;
        uint64_t b_end = in->rs_end >> ashift;

        uint64_t start_row = 0;
        if (b_start > tgt_col) /* avoid underflow */
                start_row = ((b_start - tgt_col - 1) / width) + 1;

        uint64_t end_row = 0;
        if (b_end > tgt_col)
                end_row = ((b_end - tgt_col - 1) / width) + 1;

        res->rs_start = start_row << ashift;
        res->rs_end = end_row << ashift;

        ASSERT3U(res->rs_start, <=, in->rs_start);
        ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start);
}

vdev_ops_t vdev_raidz_ops = {
        .vdev_op_open = vdev_raidz_open,
        .vdev_op_close = vdev_raidz_close,
        .vdev_op_asize = vdev_raidz_asize,
        .vdev_op_io_start = vdev_raidz_io_start,
        .vdev_op_io_done = vdev_raidz_io_done,
        .vdev_op_state_change = vdev_raidz_state_change,
        .vdev_op_need_resilver = vdev_raidz_need_resilver,
        .vdev_op_hold = NULL,
        .vdev_op_rele = NULL,
        .vdev_op_remap = NULL,
        .vdev_op_xlate = vdev_raidz_xlate,
        .vdev_op_type = VDEV_TYPE_RAIDZ,        /* name of this vdev type */
        .vdev_op_leaf = B_FALSE                 /* not a leaf vdev */
};
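
/*
 * A worked example of vdev_raidz_xlate() above, assuming for illustration
 * a raidz vdev of width 5 with ashift 9 and the child at vdev_id 2, given
 * the logical range [19 << 9, 24 << 9):
 *
 *      b_start = 19, b_end = 24
 *      start_row = ((19 - 2 - 1) / 5) + 1 = 4
 *      end_row   = ((24 - 2 - 1) / 5) + 1 = 5
 *
 * Of the parent sectors [19, 24), only sector 22 (= 4 * 5 + 2) falls on
 * column 2, so the child's physical range is the single sector
 * [4 << 9, 5 << 9), which satisfies the asserts in vdev_raidz_xlate().
 */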