1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2012, 2020 by Delphix. All rights reserved. 25 * Copyright (c) 2016 Gvozden Nešković. All rights reserved. 26 */ 27 28 #include <sys/zfs_context.h> 29 #include <sys/spa.h> 30 #include <sys/spa_impl.h> 31 #include <sys/zap.h> 32 #include <sys/vdev_impl.h> 33 #include <sys/metaslab_impl.h> 34 #include <sys/zio.h> 35 #include <sys/zio_checksum.h> 36 #include <sys/dmu_tx.h> 37 #include <sys/abd.h> 38 #include <sys/zfs_rlock.h> 39 #include <sys/fs/zfs.h> 40 #include <sys/fm/fs/zfs.h> 41 #include <sys/vdev_raidz.h> 42 #include <sys/vdev_raidz_impl.h> 43 #include <sys/vdev_draid.h> 44 #include <sys/uberblock_impl.h> 45 #include <sys/dsl_scan.h> 46 47 #ifdef ZFS_DEBUG 48 #include <sys/vdev.h> /* For vdev_xlate() in vdev_raidz_io_verify() */ 49 #endif 50 51 /* 52 * Virtual device vector for RAID-Z. 53 * 54 * This vdev supports single, double, and triple parity. For single parity, 55 * we use a simple XOR of all the data columns. For double or triple parity, 56 * we use a special case of Reed-Solomon coding. This extends the 57 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by 58 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for 59 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the 60 * former is also based. The latter is designed to provide higher performance 61 * for writes. 62 * 63 * Note that the Plank paper claimed to support arbitrary N+M, but was then 64 * amended six years later identifying a critical flaw that invalidates its 65 * claims. Nevertheless, the technique can be adapted to work for up to 66 * triple parity. For additional parity, the amendment "Note: Correction to 67 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding 68 * is viable, but the additional complexity means that write performance will 69 * suffer. 70 * 71 * All of the methods above operate on a Galois field, defined over the 72 * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements 73 * can be expressed with a single byte. Briefly, the operations on the 74 * field are defined as follows: 75 * 76 * o addition (+) is represented by a bitwise XOR 77 * o subtraction (-) is therefore identical to addition: A + B = A - B 78 * o multiplication of A by 2 is defined by the following bitwise expression: 79 * 80 * (A * 2)_7 = A_6 81 * (A * 2)_6 = A_5 82 * (A * 2)_5 = A_4 83 * (A * 2)_4 = A_3 + A_7 84 * (A * 2)_3 = A_2 + A_7 85 * (A * 2)_2 = A_1 + A_7 86 * (A * 2)_1 = A_0 87 * (A * 2)_0 = A_7 88 * 89 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 
0x1d : 0)). 90 * As an aside, this multiplication is derived from the error correcting 91 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. 92 * 93 * Observe that any number in the field (except for 0) can be expressed as a 94 * power of 2 -- a generator for the field. We store a table of the powers of 95 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can 96 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather 97 * than field addition). The inverse of a field element A (A^-1) is therefore 98 * A ^ (255 - 1) = A^254. 99 * 100 * The up-to-three parity columns, P, Q, R over several data columns, 101 * D_0, ... D_n-1, can be expressed by field operations: 102 * 103 * P = D_0 + D_1 + ... + D_n-2 + D_n-1 104 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 105 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 106 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 107 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 108 * 109 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial 110 * XOR operation, and 2 and 4 can be computed quickly and generate linearly- 111 * independent coefficients. (There are no additional coefficients that have 112 * this property which is why the uncorrected Plank method breaks down.) 113 * 114 * See the reconstruction code below for how P, Q and R can used individually 115 * or in concert to recover missing data columns. 116 */ 117 118 #define VDEV_RAIDZ_P 0 119 #define VDEV_RAIDZ_Q 1 120 #define VDEV_RAIDZ_R 2 121 122 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) 123 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) 124 125 /* 126 * We provide a mechanism to perform the field multiplication operation on a 127 * 64-bit value all at once rather than a byte at a time. This works by 128 * creating a mask from the top bit in each byte and using that to 129 * conditionally apply the XOR of 0x1d. 130 */ 131 #define VDEV_RAIDZ_64MUL_2(x, mask) \ 132 { \ 133 (mask) = (x) & 0x8080808080808080ULL; \ 134 (mask) = ((mask) << 1) - ((mask) >> 7); \ 135 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ 136 ((mask) & 0x1d1d1d1d1d1d1d1dULL); \ 137 } 138 139 #define VDEV_RAIDZ_64MUL_4(x, mask) \ 140 { \ 141 VDEV_RAIDZ_64MUL_2((x), mask); \ 142 VDEV_RAIDZ_64MUL_2((x), mask); \ 143 } 144 145 146 /* 147 * Big Theory Statement for how a RAIDZ VDEV is expanded 148 * 149 * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion 150 * works with all three RAIDZ parity choices, including RAIDZ1, 2, or 3. VDEVs 151 * that have been previously expanded can be expanded again. 152 * 153 * The RAIDZ VDEV must be healthy (must be able to write to all the drives in 154 * the VDEV) when an expansion starts. And the expansion will pause if any 155 * disk in the VDEV fails, and resume once the VDEV is healthy again. All other 156 * operations on the pool can continue while an expansion is in progress (e.g. 157 * read/write, snapshot, zpool add, etc). Except zpool checkpoint, zpool trim, 158 * and zpool initialize which can't be run during an expansion. Following a 159 * reboot or export/import, the expansion resumes where it left off. 160 * 161 * == Reflowing the Data == 162 * 163 * The expansion involves reflowing (copying) the data from the current set 164 * of disks to spread it across the new set which now has one more disk. 
This 165 * reflow operation is similar to reflowing text when the column width of a 166 * text editor window is expanded. The text doesn’t change but the location of 167 * the text changes to accommodate the new width. An example reflow result for 168 * a 4-wide RAIDZ1 to a 5-wide is shown below. 169 * 170 * Reflow End State 171 * Each letter indicates a parity group (logical stripe) 172 * 173 * Before expansion After Expansion 174 * D1 D2 D3 D4 D1 D2 D3 D4 D5 175 * +------+------+------+------+ +------+------+------+------+------+ 176 * | | | | | | | | | | | 177 * | A | A | A | A | | A | A | A | A | B | 178 * | 1| 2| 3| 4| | 1| 2| 3| 4| 5| 179 * +------+------+------+------+ +------+------+------+------+------+ 180 * | | | | | | | | | | | 181 * | B | B | C | C | | B | C | C | C | C | 182 * | 5| 6| 7| 8| | 6| 7| 8| 9| 10| 183 * +------+------+------+------+ +------+------+------+------+------+ 184 * | | | | | | | | | | | 185 * | C | C | D | D | | D | D | E | E | E | 186 * | 9| 10| 11| 12| | 11| 12| 13| 14| 15| 187 * +------+------+------+------+ +------+------+------+------+------+ 188 * | | | | | | | | | | | 189 * | E | E | E | E | --> | E | F | F | G | G | 190 * | 13| 14| 15| 16| | 16| 17| 18|p 19| 20| 191 * +------+------+------+------+ +------+------+------+------+------+ 192 * | | | | | | | | | | | 193 * | F | F | G | G | | G | G | H | H | H | 194 * | 17| 18| 19| 20| | 21| 22| 23| 24| 25| 195 * +------+------+------+------+ +------+------+------+------+------+ 196 * | | | | | | | | | | | 197 * | G | G | H | H | | H | I | I | J | J | 198 * | 21| 22| 23| 24| | 26| 27| 28| 29| 30| 199 * +------+------+------+------+ +------+------+------+------+------+ 200 * | | | | | | | | | | | 201 * | H | H | I | I | | J | J | | | K | 202 * | 25| 26| 27| 28| | 31| 32| 33| 34| 35| 203 * +------+------+------+------+ +------+------+------+------+------+ 204 * 205 * This reflow approach has several advantages. There is no need to read or 206 * modify the block pointers or recompute any block checksums. The reflow 207 * doesn’t need to know where the parity sectors reside. We can read and write 208 * data sequentially and the copy can occur in a background thread in open 209 * context. The design also allows for fast discovery of what data to copy. 210 * 211 * The VDEV metaslabs are processed, one at a time, to copy the block data to 212 * have it flow across all the disks. The metaslab is disabled for allocations 213 * during the copy. As an optimization, we only copy the allocated data which 214 * can be determined by looking at the metaslab range tree. During the copy we 215 * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still 216 * need to be able to survive losing parity count disks). This means we 217 * cannot overwrite data during the reflow that would be needed if a disk is 218 * lost. 219 * 220 * After the reflow completes, all newly-written blocks will have the new 221 * layout, i.e., they will have the parity to data ratio implied by the new 222 * number of disks in the RAIDZ group. Even though the reflow copies all of 223 * the allocated space (data and parity), it is only rearranged, not changed. 224 * 225 * This act of reflowing the data has a few implications about blocks 226 * that were written before the reflow completes: 227 * 228 * - Old blocks will still use the same amount of space (i.e., they will have 229 * the parity to data ratio implied by the old number of disks in the RAIDZ 230 * group). 
231 * - Reading old blocks will be slightly slower than before the reflow, for 232 * two reasons. First, we will have to read from all disks in the RAIDZ 233 * VDEV, rather than being able to skip the children that contain only 234 * parity of this block (because the data of a single block is now spread 235 * out across all the disks). Second, in most cases there will be an extra 236 * bcopy, needed to rearrange the data back to its original layout in memory. 237 * 238 * == Scratch Area == 239 * 240 * As we copy the block data, we can only progress to the point that writes 241 * will not overlap with blocks whose progress has not yet been recorded on 242 * disk. Since partially-copied rows are always read from the old location, 243 * we need to stop one row before the sector-wise overlap, to prevent any 244 * row-wise overlap. For example, in the diagram above, when we reflow sector 245 * B6 it will overwite the original location for B5. 246 * 247 * To get around this, a scratch space is used so that we can start copying 248 * without risking data loss by overlapping the row. As an added benefit, it 249 * improves performance at the beginning of the reflow, but that small perf 250 * boost wouldn't be worth the complexity on its own. 251 * 252 * Ideally we want to copy at least 2 * (new_width)^2 so that we have a 253 * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max 254 * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice 255 * the widths will likely be single digits so we can get a substantial chuck 256 * size using only a few MB of scratch per disk. 257 * 258 * The scratch area is persisted to disk which holds a large amount of reflowed 259 * state. We can always read the partially written stripes when a disk fails or 260 * the copy is interrupted (crash) during the initial copying phase and also 261 * get past a small chunk size restriction. At a minimum, the scratch space 262 * must be large enough to get us to the point that one row does not overlap 263 * itself when moved (i.e new_width^2). But going larger is even better. We 264 * use the 3.5 MiB reserved "boot" space that resides after the ZFS disk labels 265 * as our scratch space to handle overwriting the initial part of the VDEV. 266 * 267 * 0 256K 512K 4M 268 * +------+------+-----------------------+----------------------------- 269 * | VDEV | VDEV | Boot Block (3.5M) | Allocatable space ... 270 * | L0 | L1 | Reserved | (Metaslabs) 271 * +------+------+-----------------------+------------------------------- 272 * Scratch Area 273 * 274 * == Reflow Progress Updates == 275 * After the initial scratch-based reflow, the expansion process works 276 * similarly to device removal. We create a new open context thread which 277 * reflows the data, and periodically kicks off sync tasks to update logical 278 * state. In this case, state is the committed progress (offset of next data 279 * to copy). We need to persist the completed offset on disk, so that if we 280 * crash we know which format each VDEV offset is in. 281 * 282 * == Time Dependent Geometry == 283 * 284 * In non-expanded RAIDZ, blocks are read from disk in a column by column 285 * fashion. For a multi-row block, the second sector is in the first column 286 * not in the second column. This allows us to issue full reads for each 287 * column directly into the request buffer. The block data is thus laid out 288 * sequentially in a column-by-column fashion. 
289 * 290 * For example, in the before expansion diagram above, one logical block might 291 * be sectors G19-H26. The parity is in G19,H23; and the data is in 292 * G20,H24,G21,H25,G22,H26. 293 * 294 * After a block is reflowed, the sectors that were all in the original column 295 * data can now reside in different columns. When reading from an expanded 296 * VDEV, we need to know the logical stripe width for each block so we can 297 * reconstitute the block’s data after the reads are completed. Likewise, 298 * when we perform the combinatorial reconstruction we need to know the 299 * original width so we can retry combinations from the past layouts. 300 * 301 * Time dependent geometry is what we call having blocks with different layouts 302 * (stripe widths) in the same VDEV. This time-dependent geometry uses the 303 * block’s birth time (+ the time expansion ended) to establish the correct 304 * width for a given block. After an expansion completes, we record the time 305 * for blocks written with a particular width (geometry). 306 * 307 * == On Disk Format Changes == 308 * 309 * New pool feature flag, 'raidz_expansion' whose reference count is the number 310 * of RAIDZ VDEVs that have been expanded. 311 * 312 * The blocks on expanded RAIDZ VDEV can have different logical stripe widths. 313 * 314 * Since the uberblock can point to arbitrary blocks, which might be on the 315 * expanding RAIDZ, and might or might not have been expanded. We need to know 316 * which way a block is laid out before reading it. This info is the next 317 * offset that needs to be reflowed and we persist that in the uberblock, in 318 * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev label. 319 * After the expansion is complete, we then use the raidz_expand_txgs array 320 * (see below) to determine how to read a block and the ub_raidz_reflow_info 321 * field no longer required. 322 * 323 * The uberblock's ub_raidz_reflow_info field also holds the scratch space 324 * state (i.e., active or not) which is also required before reading a block 325 * during the initial phase of reflowing the data. 326 * 327 * The top-level RAIDZ VDEV has two new entries in the nvlist: 328 * 329 * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here 330 * and used after the expansion is complete to 331 * determine how to read a raidz block 332 * 'raidz_expanding' boolean: present during reflow and removed after completion 333 * used during a spa import to resume an unfinished 334 * expansion 335 * 336 * And finally the VDEVs top zap adds the following informational entries: 337 * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE 338 * VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME 339 * VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME 340 * VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED 341 */ 342 343 /* 344 * For testing only: pause the raidz expansion after reflowing this amount. 345 * (accessed by ZTS and ztest) 346 */ 347 #ifdef _KERNEL 348 static 349 #endif /* _KERNEL */ 350 unsigned long raidz_expand_max_reflow_bytes = 0; 351 352 /* 353 * For testing only: pause the raidz expansion at a certain point. 354 */ 355 uint_t raidz_expand_pause_point = 0; 356 357 /* 358 * Maximum amount of copy io's outstanding at once. 359 */ 360 static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE; 361 362 /* 363 * Apply raidz map abds aggregation if the number of rows in the map is equal 364 * or greater than the value below. 
365 */ 366 static unsigned long raidz_io_aggregate_rows = 4; 367 368 /* 369 * Automatically start a pool scrub when a RAIDZ expansion completes in 370 * order to verify the checksums of all blocks which have been copied 371 * during the expansion. Automatic scrubbing is enabled by default and 372 * is strongly recommended. 373 */ 374 static int zfs_scrub_after_expand = 1; 375 376 static void 377 vdev_raidz_row_free(raidz_row_t *rr) 378 { 379 for (int c = 0; c < rr->rr_cols; c++) { 380 raidz_col_t *rc = &rr->rr_col[c]; 381 382 if (rc->rc_size != 0) 383 abd_free(rc->rc_abd); 384 if (rc->rc_orig_data != NULL) 385 abd_free(rc->rc_orig_data); 386 } 387 388 if (rr->rr_abd_empty != NULL) 389 abd_free(rr->rr_abd_empty); 390 391 kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols])); 392 } 393 394 void 395 vdev_raidz_map_free(raidz_map_t *rm) 396 { 397 for (int i = 0; i < rm->rm_nrows; i++) 398 vdev_raidz_row_free(rm->rm_row[i]); 399 400 if (rm->rm_nphys_cols) { 401 for (int i = 0; i < rm->rm_nphys_cols; i++) { 402 if (rm->rm_phys_col[i].rc_abd != NULL) 403 abd_free(rm->rm_phys_col[i].rc_abd); 404 } 405 406 kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) * 407 rm->rm_nphys_cols); 408 } 409 410 ASSERT3P(rm->rm_lr, ==, NULL); 411 kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows])); 412 } 413 414 static void 415 vdev_raidz_map_free_vsd(zio_t *zio) 416 { 417 raidz_map_t *rm = zio->io_vsd; 418 419 vdev_raidz_map_free(rm); 420 } 421 422 static int 423 vdev_raidz_reflow_compare(const void *x1, const void *x2) 424 { 425 const reflow_node_t *l = x1; 426 const reflow_node_t *r = x2; 427 428 return (TREE_CMP(l->re_txg, r->re_txg)); 429 } 430 431 const zio_vsd_ops_t vdev_raidz_vsd_ops = { 432 .vsd_free = vdev_raidz_map_free_vsd, 433 }; 434 435 raidz_row_t * 436 vdev_raidz_row_alloc(int cols, zio_t *zio) 437 { 438 raidz_row_t *rr = 439 kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP); 440 441 rr->rr_cols = cols; 442 rr->rr_scols = cols; 443 444 for (int c = 0; c < cols; c++) { 445 raidz_col_t *rc = &rr->rr_col[c]; 446 rc->rc_shadow_devidx = INT_MAX; 447 rc->rc_shadow_offset = UINT64_MAX; 448 /* 449 * We can not allow self healing to take place for Direct I/O 450 * reads. There is nothing that stops the buffer contents from 451 * being manipulated while the I/O is in flight. It is possible 452 * that the checksum could be verified on the buffer and then 453 * the contents of that buffer are manipulated afterwards. This 454 * could lead to bad data being written out during self 455 * healing. 456 */ 457 if (!(zio->io_flags & ZIO_FLAG_DIO_READ)) 458 rc->rc_allow_repair = 1; 459 } 460 return (rr); 461 } 462 463 static void 464 vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift) 465 { 466 int c; 467 int nwrapped = 0; 468 uint64_t off = 0; 469 raidz_row_t *rr = rm->rm_row[0]; 470 471 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); 472 ASSERT3U(rm->rm_nrows, ==, 1); 473 474 /* 475 * Pad any parity columns with additional space to account for skip 476 * sectors. 477 */ 478 if (rm->rm_skipstart < rr->rr_firstdatacol) { 479 ASSERT0(rm->rm_skipstart); 480 nwrapped = rm->rm_nskip; 481 } else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) { 482 nwrapped = 483 (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols; 484 } 485 486 /* 487 * Optional single skip sectors (rc_size == 0) will be handled in 488 * vdev_raidz_io_start_write(). 
489 */ 490 int skipped = rr->rr_scols - rr->rr_cols; 491 492 /* Allocate buffers for the parity columns */ 493 for (c = 0; c < rr->rr_firstdatacol; c++) { 494 raidz_col_t *rc = &rr->rr_col[c]; 495 496 /* 497 * Parity columns will pad out a linear ABD to account for 498 * the skip sector. A linear ABD is used here because 499 * parity calculations use the ABD buffer directly to calculate 500 * parity. This avoids doing a memcpy back to the ABD after the 501 * parity has been calculated. By issuing the parity column 502 * with the skip sector we can reduce contention on the child 503 * VDEV queue locks (vq_lock). 504 */ 505 if (c < nwrapped) { 506 rc->rc_abd = abd_alloc_linear( 507 rc->rc_size + (1ULL << ashift), B_FALSE); 508 abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift); 509 skipped++; 510 } else { 511 rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); 512 } 513 } 514 515 for (off = 0; c < rr->rr_cols; c++) { 516 raidz_col_t *rc = &rr->rr_col[c]; 517 abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct, 518 zio->io_abd, off, rc->rc_size); 519 520 /* 521 * Generate I/O for skip sectors to improve aggregation 522 * continuity. We will use gang ABD's to reduce contention 523 * on the child VDEV queue locks (vq_lock) by issuing 524 * a single I/O that contains the data and skip sector. 525 * 526 * It is important to make sure that rc_size is not updated 527 * even though we are adding a skip sector to the ABD. When 528 * calculating the parity in vdev_raidz_generate_parity_row() 529 * the rc_size is used to iterate through the ABD's. We can 530 * not have zero'd out skip sectors used for calculating 531 * parity for raidz, because those same sectors are not used 532 * during reconstruction. 533 */ 534 if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) { 535 rc->rc_abd = abd_alloc_gang(); 536 abd_gang_add(rc->rc_abd, abd, B_TRUE); 537 abd_gang_add(rc->rc_abd, 538 abd_get_zeros(1ULL << ashift), B_TRUE); 539 skipped++; 540 } else { 541 rc->rc_abd = abd; 542 } 543 off += rc->rc_size; 544 } 545 546 ASSERT3U(off, ==, zio->io_size); 547 ASSERT3S(skipped, ==, rm->rm_nskip); 548 } 549 550 static void 551 vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm) 552 { 553 int c; 554 raidz_row_t *rr = rm->rm_row[0]; 555 556 ASSERT3U(rm->rm_nrows, ==, 1); 557 558 /* Allocate buffers for the parity columns */ 559 for (c = 0; c < rr->rr_firstdatacol; c++) 560 rr->rr_col[c].rc_abd = 561 abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE); 562 563 for (uint64_t off = 0; c < rr->rr_cols; c++) { 564 raidz_col_t *rc = &rr->rr_col[c]; 565 rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct, 566 zio->io_abd, off, rc->rc_size); 567 off += rc->rc_size; 568 } 569 } 570 571 /* 572 * Divides the IO evenly across all child vdevs; usually, dcols is 573 * the number of children in the target vdev. 574 * 575 * Avoid inlining the function to keep vdev_raidz_io_start(), which 576 * is this functions only caller, as small as possible on the stack. 577 */ 578 noinline raidz_map_t * 579 vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, 580 uint64_t nparity) 581 { 582 raidz_row_t *rr; 583 /* The starting RAIDZ (parent) vdev sector of the block. */ 584 uint64_t b = zio->io_offset >> ashift; 585 /* The zio's size in units of the vdev's minimum sector size. */ 586 uint64_t s = zio->io_size >> ashift; 587 /* The first column for this stripe. */ 588 uint64_t f = b % dcols; 589 /* The starting byte offset on each child vdev. 
*/ 590 uint64_t o = (b / dcols) << ashift; 591 uint64_t acols, scols; 592 593 raidz_map_t *rm = 594 kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP); 595 rm->rm_nrows = 1; 596 597 /* 598 * "Quotient": The number of data sectors for this stripe on all but 599 * the "big column" child vdevs that also contain "remainder" data. 600 */ 601 uint64_t q = s / (dcols - nparity); 602 603 /* 604 * "Remainder": The number of partial stripe data sectors in this I/O. 605 * This will add a sector to some, but not all, child vdevs. 606 */ 607 uint64_t r = s - q * (dcols - nparity); 608 609 /* The number of "big columns" - those which contain remainder data. */ 610 uint64_t bc = (r == 0 ? 0 : r + nparity); 611 612 /* 613 * The total number of data and parity sectors associated with 614 * this I/O. 615 */ 616 uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1)); 617 618 /* 619 * acols: The columns that will be accessed. 620 * scols: The columns that will be accessed or skipped. 621 */ 622 if (q == 0) { 623 /* Our I/O request doesn't span all child vdevs. */ 624 acols = bc; 625 scols = MIN(dcols, roundup(bc, nparity + 1)); 626 } else { 627 acols = dcols; 628 scols = dcols; 629 } 630 631 ASSERT3U(acols, <=, scols); 632 rr = vdev_raidz_row_alloc(scols, zio); 633 rm->rm_row[0] = rr; 634 rr->rr_cols = acols; 635 rr->rr_bigcols = bc; 636 rr->rr_firstdatacol = nparity; 637 #ifdef ZFS_DEBUG 638 rr->rr_offset = zio->io_offset; 639 rr->rr_size = zio->io_size; 640 #endif 641 642 uint64_t asize = 0; 643 644 for (uint64_t c = 0; c < scols; c++) { 645 raidz_col_t *rc = &rr->rr_col[c]; 646 uint64_t col = f + c; 647 uint64_t coff = o; 648 if (col >= dcols) { 649 col -= dcols; 650 coff += 1ULL << ashift; 651 } 652 rc->rc_devidx = col; 653 rc->rc_offset = coff; 654 655 if (c >= acols) 656 rc->rc_size = 0; 657 else if (c < bc) 658 rc->rc_size = (q + 1) << ashift; 659 else 660 rc->rc_size = q << ashift; 661 662 asize += rc->rc_size; 663 } 664 665 ASSERT3U(asize, ==, tot << ashift); 666 rm->rm_nskip = roundup(tot, nparity + 1) - tot; 667 rm->rm_skipstart = bc; 668 669 /* 670 * If all data stored spans all columns, there's a danger that parity 671 * will always be on the same device and, since parity isn't read 672 * during normal operation, that device's I/O bandwidth won't be 673 * used effectively. We therefore switch the parity every 1MB. 674 * 675 * ... at least that was, ostensibly, the theory. As a practical 676 * matter unless we juggle the parity between all devices evenly, we 677 * won't see any benefit. Further, occasional writes that aren't a 678 * multiple of the LCM of the number of children and the minimum 679 * stripe width are sufficient to avoid pessimal behavior. 680 * Unfortunately, this decision created an implicit on-disk format 681 * requirement that we need to support for all eternity, but only 682 * for single-parity RAID-Z. 683 * 684 * If we intend to skip a sector in the zeroth column for padding 685 * we must make sure to note this swap. We will never intend to 686 * skip the first column since at least one data and one parity 687 * column must appear in each row. 
688 */ 689 ASSERT(rr->rr_cols >= 2); 690 ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); 691 692 if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { 693 uint64_t devidx = rr->rr_col[0].rc_devidx; 694 o = rr->rr_col[0].rc_offset; 695 rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; 696 rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; 697 rr->rr_col[1].rc_devidx = devidx; 698 rr->rr_col[1].rc_offset = o; 699 if (rm->rm_skipstart == 0) 700 rm->rm_skipstart = 1; 701 } 702 703 if (zio->io_type == ZIO_TYPE_WRITE) { 704 vdev_raidz_map_alloc_write(zio, rm, ashift); 705 } else { 706 vdev_raidz_map_alloc_read(zio, rm); 707 } 708 /* init RAIDZ parity ops */ 709 rm->rm_ops = vdev_raidz_math_get_ops(); 710 711 return (rm); 712 } 713 714 /* 715 * Everything before reflow_offset_synced should have been moved to the new 716 * location (read and write completed). However, this may not yet be reflected 717 * in the on-disk format (e.g. raidz_reflow_sync() has been called but the 718 * uberblock has not yet been written). If reflow is not in progress, 719 * reflow_offset_synced should be UINT64_MAX. For each row, if the row is 720 * entirely before reflow_offset_synced, it will come from the new location. 721 * Otherwise this row will come from the old location. Therefore, rows that 722 * straddle the reflow_offset_synced will come from the old location. 723 * 724 * For writes, reflow_offset_next is the next offset to copy. If a sector has 725 * been copied, but not yet reflected in the on-disk progress 726 * (reflow_offset_synced), it will also be written to the new (already copied) 727 * offset. 728 */ 729 noinline raidz_map_t * 730 vdev_raidz_map_alloc_expanded(zio_t *zio, 731 uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols, 732 uint64_t nparity, uint64_t reflow_offset_synced, 733 uint64_t reflow_offset_next, boolean_t use_scratch) 734 { 735 abd_t *abd = zio->io_abd; 736 uint64_t offset = zio->io_offset; 737 uint64_t size = zio->io_size; 738 739 /* The zio's size in units of the vdev's minimum sector size. */ 740 uint64_t s = size >> ashift; 741 742 /* 743 * "Quotient": The number of data sectors for this stripe on all but 744 * the "big column" child vdevs that also contain "remainder" data. 745 * AKA "full rows" 746 */ 747 uint64_t q = s / (logical_cols - nparity); 748 749 /* 750 * "Remainder": The number of partial stripe data sectors in this I/O. 751 * This will add a sector to some, but not all, child vdevs. 752 */ 753 uint64_t r = s - q * (logical_cols - nparity); 754 755 /* The number of "big columns" - those which contain remainder data. */ 756 uint64_t bc = (r == 0 ? 0 : r + nparity); 757 758 /* 759 * The total number of data and parity sectors associated with 760 * this I/O. 761 */ 762 uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1)); 763 764 /* How many rows contain data (not skip) */ 765 uint64_t rows = howmany(tot, logical_cols); 766 int cols = MIN(tot, logical_cols); 767 768 raidz_map_t *rm = 769 kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]), 770 KM_SLEEP); 771 rm->rm_nrows = rows; 772 rm->rm_nskip = roundup(tot, nparity + 1) - tot; 773 rm->rm_skipstart = bc; 774 uint64_t asize = 0; 775 776 for (uint64_t row = 0; row < rows; row++) { 777 boolean_t row_use_scratch = B_FALSE; 778 raidz_row_t *rr = vdev_raidz_row_alloc(cols, zio); 779 rm->rm_row[row] = rr; 780 781 /* The starting RAIDZ (parent) vdev sector of the row. 
*/ 782 uint64_t b = (offset >> ashift) + row * logical_cols; 783 784 /* 785 * If we are in the middle of a reflow, and the copying has 786 * not yet completed for any part of this row, then use the 787 * old location of this row. Note that reflow_offset_synced 788 * reflects the i/o that's been completed, because it's 789 * updated by a synctask, after zio_wait(spa_txg_zio[]). 790 * This is sufficient for our check, even if that progress 791 * has not yet been recorded to disk (reflected in 792 * spa_ubsync). Also note that we consider the last row to 793 * be "full width" (`cols`-wide rather than `bc`-wide) for 794 * this calculation. This causes a tiny bit of unnecessary 795 * double-writes but is safe and simpler to calculate. 796 */ 797 int row_phys_cols = physical_cols; 798 if (b + cols > reflow_offset_synced >> ashift) 799 row_phys_cols--; 800 else if (use_scratch) 801 row_use_scratch = B_TRUE; 802 803 /* starting child of this row */ 804 uint64_t child_id = b % row_phys_cols; 805 /* The starting byte offset on each child vdev. */ 806 uint64_t child_offset = (b / row_phys_cols) << ashift; 807 808 /* 809 * Note, rr_cols is the entire width of the block, even 810 * if this row is shorter. This is needed because parity 811 * generation (for Q and R) needs to know the entire width, 812 * because it treats the short row as though it was 813 * full-width (and the "phantom" sectors were zero-filled). 814 * 815 * Another approach to this would be to set cols shorter 816 * (to just the number of columns that we might do i/o to) 817 * and have another mechanism to tell the parity generation 818 * about the "entire width". Reconstruction (at least 819 * vdev_raidz_reconstruct_general()) would also need to 820 * know about the "entire width". 821 */ 822 rr->rr_firstdatacol = nparity; 823 #ifdef ZFS_DEBUG 824 /* 825 * note: rr_size is PSIZE, not ASIZE 826 */ 827 rr->rr_offset = b << ashift; 828 rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift; 829 #endif 830 831 for (int c = 0; c < rr->rr_cols; c++, child_id++) { 832 if (child_id >= row_phys_cols) { 833 child_id -= row_phys_cols; 834 child_offset += 1ULL << ashift; 835 } 836 raidz_col_t *rc = &rr->rr_col[c]; 837 rc->rc_devidx = child_id; 838 rc->rc_offset = child_offset; 839 840 /* 841 * Get this from the scratch space if appropriate. 842 * This only happens if we crashed in the middle of 843 * raidz_reflow_scratch_sync() (while it's running, 844 * the rangelock prevents us from doing concurrent 845 * io), and even then only during zpool import or 846 * when the pool is imported readonly. 847 */ 848 if (row_use_scratch) 849 rc->rc_offset -= VDEV_BOOT_SIZE; 850 851 uint64_t dc = c - rr->rr_firstdatacol; 852 if (c < rr->rr_firstdatacol) { 853 rc->rc_size = 1ULL << ashift; 854 855 /* 856 * Parity sectors' rc_abd's are set below 857 * after determining if this is an aggregation. 858 */ 859 } else if (row == rows - 1 && bc != 0 && c >= bc) { 860 /* 861 * Past the end of the block (even including 862 * skip sectors). This sector is part of the 863 * map so that we have full rows for p/q parity 864 * generation. 
865 */ 866 rc->rc_size = 0; 867 rc->rc_abd = NULL; 868 } else { 869 /* "data column" (col excluding parity) */ 870 uint64_t off; 871 872 if (c < bc || r == 0) { 873 off = dc * rows + row; 874 } else { 875 off = r * rows + 876 (dc - r) * (rows - 1) + row; 877 } 878 rc->rc_size = 1ULL << ashift; 879 rc->rc_abd = abd_get_offset_struct( 880 &rc->rc_abdstruct, abd, off << ashift, 881 rc->rc_size); 882 } 883 884 if (rc->rc_size == 0) 885 continue; 886 887 /* 888 * If any part of this row is in both old and new 889 * locations, the primary location is the old 890 * location. If this sector was already copied to the 891 * new location, we need to also write to the new, 892 * "shadow" location. 893 * 894 * Note, `row_phys_cols != physical_cols` indicates 895 * that the primary location is the old location. 896 * `b+c < reflow_offset_next` indicates that the copy 897 * to the new location has been initiated. We know 898 * that the copy has completed because we have the 899 * rangelock, which is held exclusively while the 900 * copy is in progress. 901 */ 902 if (row_use_scratch || 903 (row_phys_cols != physical_cols && 904 b + c < reflow_offset_next >> ashift)) { 905 rc->rc_shadow_devidx = (b + c) % physical_cols; 906 rc->rc_shadow_offset = 907 ((b + c) / physical_cols) << ashift; 908 if (row_use_scratch) 909 rc->rc_shadow_offset -= VDEV_BOOT_SIZE; 910 } 911 912 asize += rc->rc_size; 913 } 914 915 /* 916 * See comment in vdev_raidz_map_alloc() 917 */ 918 if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 && 919 (offset & (1ULL << 20))) { 920 ASSERT(rr->rr_cols >= 2); 921 ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); 922 923 int devidx0 = rr->rr_col[0].rc_devidx; 924 uint64_t offset0 = rr->rr_col[0].rc_offset; 925 int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx; 926 uint64_t shadow_offset0 = 927 rr->rr_col[0].rc_shadow_offset; 928 929 rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; 930 rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; 931 rr->rr_col[0].rc_shadow_devidx = 932 rr->rr_col[1].rc_shadow_devidx; 933 rr->rr_col[0].rc_shadow_offset = 934 rr->rr_col[1].rc_shadow_offset; 935 936 rr->rr_col[1].rc_devidx = devidx0; 937 rr->rr_col[1].rc_offset = offset0; 938 rr->rr_col[1].rc_shadow_devidx = shadow_devidx0; 939 rr->rr_col[1].rc_shadow_offset = shadow_offset0; 940 } 941 } 942 ASSERT3U(asize, ==, tot << ashift); 943 944 /* 945 * Determine if the block is contiguous, in which case we can use 946 * an aggregation. 947 */ 948 if (rows >= raidz_io_aggregate_rows) { 949 rm->rm_nphys_cols = physical_cols; 950 rm->rm_phys_col = 951 kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols, 952 KM_SLEEP); 953 954 /* 955 * Determine the aggregate io's offset and size, and check 956 * that the io is contiguous. 957 */ 958 for (int i = 0; 959 i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) { 960 raidz_row_t *rr = rm->rm_row[i]; 961 for (int c = 0; c < rr->rr_cols; c++) { 962 raidz_col_t *rc = &rr->rr_col[c]; 963 raidz_col_t *prc = 964 &rm->rm_phys_col[rc->rc_devidx]; 965 966 if (rc->rc_size == 0) 967 continue; 968 969 if (prc->rc_size == 0) { 970 ASSERT0(prc->rc_offset); 971 prc->rc_offset = rc->rc_offset; 972 } else if (prc->rc_offset + prc->rc_size != 973 rc->rc_offset) { 974 /* 975 * This block is not contiguous and 976 * therefore can't be aggregated. 977 * This is expected to be rare, so 978 * the cost of allocating and then 979 * freeing rm_phys_col is not 980 * significant. 
981 */ 982 kmem_free(rm->rm_phys_col, 983 sizeof (raidz_col_t) * 984 rm->rm_nphys_cols); 985 rm->rm_phys_col = NULL; 986 rm->rm_nphys_cols = 0; 987 break; 988 } 989 prc->rc_size += rc->rc_size; 990 } 991 } 992 } 993 if (rm->rm_phys_col != NULL) { 994 /* 995 * Allocate aggregate ABD's. 996 */ 997 for (int i = 0; i < rm->rm_nphys_cols; i++) { 998 raidz_col_t *prc = &rm->rm_phys_col[i]; 999 1000 prc->rc_devidx = i; 1001 1002 if (prc->rc_size == 0) 1003 continue; 1004 1005 prc->rc_abd = 1006 abd_alloc_linear(rm->rm_phys_col[i].rc_size, 1007 B_FALSE); 1008 } 1009 1010 /* 1011 * Point the parity abd's into the aggregate abd's. 1012 */ 1013 for (int i = 0; i < rm->rm_nrows; i++) { 1014 raidz_row_t *rr = rm->rm_row[i]; 1015 for (int c = 0; c < rr->rr_firstdatacol; c++) { 1016 raidz_col_t *rc = &rr->rr_col[c]; 1017 raidz_col_t *prc = 1018 &rm->rm_phys_col[rc->rc_devidx]; 1019 rc->rc_abd = 1020 abd_get_offset_struct(&rc->rc_abdstruct, 1021 prc->rc_abd, 1022 rc->rc_offset - prc->rc_offset, 1023 rc->rc_size); 1024 } 1025 } 1026 } else { 1027 /* 1028 * Allocate new abd's for the parity sectors. 1029 */ 1030 for (int i = 0; i < rm->rm_nrows; i++) { 1031 raidz_row_t *rr = rm->rm_row[i]; 1032 for (int c = 0; c < rr->rr_firstdatacol; c++) { 1033 raidz_col_t *rc = &rr->rr_col[c]; 1034 rc->rc_abd = 1035 abd_alloc_linear(rc->rc_size, 1036 B_TRUE); 1037 } 1038 } 1039 } 1040 /* init RAIDZ parity ops */ 1041 rm->rm_ops = vdev_raidz_math_get_ops(); 1042 1043 return (rm); 1044 } 1045 1046 struct pqr_struct { 1047 uint64_t *p; 1048 uint64_t *q; 1049 uint64_t *r; 1050 }; 1051 1052 static int 1053 vdev_raidz_p_func(void *buf, size_t size, void *private) 1054 { 1055 struct pqr_struct *pqr = private; 1056 const uint64_t *src = buf; 1057 int cnt = size / sizeof (src[0]); 1058 1059 ASSERT(pqr->p && !pqr->q && !pqr->r); 1060 1061 for (int i = 0; i < cnt; i++, src++, pqr->p++) 1062 *pqr->p ^= *src; 1063 1064 return (0); 1065 } 1066 1067 static int 1068 vdev_raidz_pq_func(void *buf, size_t size, void *private) 1069 { 1070 struct pqr_struct *pqr = private; 1071 const uint64_t *src = buf; 1072 uint64_t mask; 1073 int cnt = size / sizeof (src[0]); 1074 1075 ASSERT(pqr->p && pqr->q && !pqr->r); 1076 1077 for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { 1078 *pqr->p ^= *src; 1079 VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 1080 *pqr->q ^= *src; 1081 } 1082 1083 return (0); 1084 } 1085 1086 static int 1087 vdev_raidz_pqr_func(void *buf, size_t size, void *private) 1088 { 1089 struct pqr_struct *pqr = private; 1090 const uint64_t *src = buf; 1091 uint64_t mask; 1092 int cnt = size / sizeof (src[0]); 1093 1094 ASSERT(pqr->p && pqr->q && pqr->r); 1095 1096 for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { 1097 *pqr->p ^= *src; 1098 VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 1099 *pqr->q ^= *src; 1100 VDEV_RAIDZ_64MUL_4(*pqr->r, mask); 1101 *pqr->r ^= *src; 1102 } 1103 1104 return (0); 1105 } 1106 1107 static void 1108 vdev_raidz_generate_parity_p(raidz_row_t *rr) 1109 { 1110 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 1111 1112 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1113 abd_t *src = rr->rr_col[c].rc_abd; 1114 1115 if (c == rr->rr_firstdatacol) { 1116 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 1117 } else { 1118 struct pqr_struct pqr = { p, NULL, NULL }; 1119 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 1120 vdev_raidz_p_func, &pqr); 1121 } 1122 } 1123 } 1124 1125 static void 1126 vdev_raidz_generate_parity_pq(raidz_row_t *rr) 1127 { 1128 uint64_t *p = 
abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 1129 uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 1130 uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 1131 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 1132 rr->rr_col[VDEV_RAIDZ_Q].rc_size); 1133 1134 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1135 abd_t *src = rr->rr_col[c].rc_abd; 1136 1137 uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); 1138 1139 if (c == rr->rr_firstdatacol) { 1140 ASSERT(ccnt == pcnt || ccnt == 0); 1141 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 1142 (void) memcpy(q, p, rr->rr_col[c].rc_size); 1143 1144 for (uint64_t i = ccnt; i < pcnt; i++) { 1145 p[i] = 0; 1146 q[i] = 0; 1147 } 1148 } else { 1149 struct pqr_struct pqr = { p, q, NULL }; 1150 1151 ASSERT(ccnt <= pcnt); 1152 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 1153 vdev_raidz_pq_func, &pqr); 1154 1155 /* 1156 * Treat short columns as though they are full of 0s. 1157 * Note that there's therefore nothing needed for P. 1158 */ 1159 uint64_t mask; 1160 for (uint64_t i = ccnt; i < pcnt; i++) { 1161 VDEV_RAIDZ_64MUL_2(q[i], mask); 1162 } 1163 } 1164 } 1165 } 1166 1167 static void 1168 vdev_raidz_generate_parity_pqr(raidz_row_t *rr) 1169 { 1170 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 1171 uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 1172 uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd); 1173 uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 1174 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 1175 rr->rr_col[VDEV_RAIDZ_Q].rc_size); 1176 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 1177 rr->rr_col[VDEV_RAIDZ_R].rc_size); 1178 1179 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1180 abd_t *src = rr->rr_col[c].rc_abd; 1181 1182 uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); 1183 1184 if (c == rr->rr_firstdatacol) { 1185 ASSERT(ccnt == pcnt || ccnt == 0); 1186 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 1187 (void) memcpy(q, p, rr->rr_col[c].rc_size); 1188 (void) memcpy(r, p, rr->rr_col[c].rc_size); 1189 1190 for (uint64_t i = ccnt; i < pcnt; i++) { 1191 p[i] = 0; 1192 q[i] = 0; 1193 r[i] = 0; 1194 } 1195 } else { 1196 struct pqr_struct pqr = { p, q, r }; 1197 1198 ASSERT(ccnt <= pcnt); 1199 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 1200 vdev_raidz_pqr_func, &pqr); 1201 1202 /* 1203 * Treat short columns as though they are full of 0s. 1204 * Note that there's therefore nothing needed for P. 1205 */ 1206 uint64_t mask; 1207 for (uint64_t i = ccnt; i < pcnt; i++) { 1208 VDEV_RAIDZ_64MUL_2(q[i], mask); 1209 VDEV_RAIDZ_64MUL_4(r[i], mask); 1210 } 1211 } 1212 } 1213 } 1214 1215 /* 1216 * Generate RAID parity in the first virtual columns according to the number of 1217 * parity columns available. 1218 */ 1219 void 1220 vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr) 1221 { 1222 if (rr->rr_cols == 0) { 1223 /* 1224 * We are handling this block one row at a time (because 1225 * this block has a different logical vs physical width, 1226 * due to RAIDZ expansion), and this is a pad-only row, 1227 * which has no parity. 
1228 */ 1229 return; 1230 } 1231 1232 /* Generate using the new math implementation */ 1233 if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL) 1234 return; 1235 1236 switch (rr->rr_firstdatacol) { 1237 case 1: 1238 vdev_raidz_generate_parity_p(rr); 1239 break; 1240 case 2: 1241 vdev_raidz_generate_parity_pq(rr); 1242 break; 1243 case 3: 1244 vdev_raidz_generate_parity_pqr(rr); 1245 break; 1246 default: 1247 cmn_err(CE_PANIC, "invalid RAID-Z configuration"); 1248 } 1249 } 1250 1251 void 1252 vdev_raidz_generate_parity(raidz_map_t *rm) 1253 { 1254 for (int i = 0; i < rm->rm_nrows; i++) { 1255 raidz_row_t *rr = rm->rm_row[i]; 1256 vdev_raidz_generate_parity_row(rm, rr); 1257 } 1258 } 1259 1260 static int 1261 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) 1262 { 1263 (void) private; 1264 uint64_t *dst = dbuf; 1265 uint64_t *src = sbuf; 1266 int cnt = size / sizeof (src[0]); 1267 1268 for (int i = 0; i < cnt; i++) { 1269 dst[i] ^= src[i]; 1270 } 1271 1272 return (0); 1273 } 1274 1275 static int 1276 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size, 1277 void *private) 1278 { 1279 (void) private; 1280 uint64_t *dst = dbuf; 1281 uint64_t *src = sbuf; 1282 uint64_t mask; 1283 int cnt = size / sizeof (dst[0]); 1284 1285 for (int i = 0; i < cnt; i++, dst++, src++) { 1286 VDEV_RAIDZ_64MUL_2(*dst, mask); 1287 *dst ^= *src; 1288 } 1289 1290 return (0); 1291 } 1292 1293 static int 1294 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private) 1295 { 1296 (void) private; 1297 uint64_t *dst = buf; 1298 uint64_t mask; 1299 int cnt = size / sizeof (dst[0]); 1300 1301 for (int i = 0; i < cnt; i++, dst++) { 1302 /* same operation as vdev_raidz_reconst_q_pre_func() on dst */ 1303 VDEV_RAIDZ_64MUL_2(*dst, mask); 1304 } 1305 1306 return (0); 1307 } 1308 1309 struct reconst_q_struct { 1310 uint64_t *q; 1311 int exp; 1312 }; 1313 1314 static int 1315 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private) 1316 { 1317 struct reconst_q_struct *rq = private; 1318 uint64_t *dst = buf; 1319 int cnt = size / sizeof (dst[0]); 1320 1321 for (int i = 0; i < cnt; i++, dst++, rq->q++) { 1322 int j; 1323 uint8_t *b; 1324 1325 *dst ^= *rq->q; 1326 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 1327 *b = vdev_raidz_exp2(*b, rq->exp); 1328 } 1329 } 1330 1331 return (0); 1332 } 1333 1334 struct reconst_pq_struct { 1335 uint8_t *p; 1336 uint8_t *q; 1337 uint8_t *pxy; 1338 uint8_t *qxy; 1339 int aexp; 1340 int bexp; 1341 }; 1342 1343 static int 1344 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private) 1345 { 1346 struct reconst_pq_struct *rpq = private; 1347 uint8_t *xd = xbuf; 1348 uint8_t *yd = ybuf; 1349 1350 for (int i = 0; i < size; 1351 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) { 1352 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 1353 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 1354 *yd = *rpq->p ^ *rpq->pxy ^ *xd; 1355 } 1356 1357 return (0); 1358 } 1359 1360 static int 1361 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) 1362 { 1363 struct reconst_pq_struct *rpq = private; 1364 uint8_t *xd = xbuf; 1365 1366 for (int i = 0; i < size; 1367 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) { 1368 /* same operation as vdev_raidz_reconst_pq_func() on xd */ 1369 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 1370 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 1371 } 1372 1373 return (0); 1374 } 1375 1376 static void 1377 
vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts) 1378 { 1379 int x = tgts[0]; 1380 abd_t *dst, *src; 1381 1382 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1383 zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x); 1384 1385 ASSERT3U(ntgts, ==, 1); 1386 ASSERT3U(x, >=, rr->rr_firstdatacol); 1387 ASSERT3U(x, <, rr->rr_cols); 1388 1389 ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size); 1390 1391 src = rr->rr_col[VDEV_RAIDZ_P].rc_abd; 1392 dst = rr->rr_col[x].rc_abd; 1393 1394 abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size); 1395 1396 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1397 uint64_t size = MIN(rr->rr_col[x].rc_size, 1398 rr->rr_col[c].rc_size); 1399 1400 src = rr->rr_col[c].rc_abd; 1401 1402 if (c == x) 1403 continue; 1404 1405 (void) abd_iterate_func2(dst, src, 0, 0, size, 1406 vdev_raidz_reconst_p_func, NULL); 1407 } 1408 } 1409 1410 static void 1411 vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts) 1412 { 1413 int x = tgts[0]; 1414 int c, exp; 1415 abd_t *dst, *src; 1416 1417 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1418 zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x); 1419 1420 ASSERT(ntgts == 1); 1421 1422 ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size); 1423 1424 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1425 uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size, 1426 rr->rr_col[c].rc_size); 1427 1428 src = rr->rr_col[c].rc_abd; 1429 dst = rr->rr_col[x].rc_abd; 1430 1431 if (c == rr->rr_firstdatacol) { 1432 abd_copy(dst, src, size); 1433 if (rr->rr_col[x].rc_size > size) { 1434 abd_zero_off(dst, size, 1435 rr->rr_col[x].rc_size - size); 1436 } 1437 } else { 1438 ASSERT3U(size, <=, rr->rr_col[x].rc_size); 1439 (void) abd_iterate_func2(dst, src, 0, 0, size, 1440 vdev_raidz_reconst_q_pre_func, NULL); 1441 (void) abd_iterate_func(dst, 1442 size, rr->rr_col[x].rc_size - size, 1443 vdev_raidz_reconst_q_pre_tail_func, NULL); 1444 } 1445 } 1446 1447 src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; 1448 dst = rr->rr_col[x].rc_abd; 1449 exp = 255 - (rr->rr_cols - 1 - x); 1450 1451 struct reconst_q_struct rq = { abd_to_buf(src), exp }; 1452 (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size, 1453 vdev_raidz_reconst_q_post_func, &rq); 1454 } 1455 1456 static void 1457 vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) 1458 { 1459 uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; 1460 abd_t *pdata, *qdata; 1461 uint64_t xsize, ysize; 1462 int x = tgts[0]; 1463 int y = tgts[1]; 1464 abd_t *xd, *yd; 1465 1466 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1467 zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y); 1468 1469 ASSERT(ntgts == 2); 1470 ASSERT(x < y); 1471 ASSERT(x >= rr->rr_firstdatacol); 1472 ASSERT(y < rr->rr_cols); 1473 1474 ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size); 1475 1476 /* 1477 * Move the parity data aside -- we're going to compute parity as 1478 * though columns x and y were full of zeros -- Pxy and Qxy. We want to 1479 * reuse the parity generation mechanism without trashing the actual 1480 * parity so we make those columns appear to be full of zeros by 1481 * setting their lengths to zero. 
1482 */ 1483 pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd; 1484 qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; 1485 xsize = rr->rr_col[x].rc_size; 1486 ysize = rr->rr_col[y].rc_size; 1487 1488 rr->rr_col[VDEV_RAIDZ_P].rc_abd = 1489 abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE); 1490 rr->rr_col[VDEV_RAIDZ_Q].rc_abd = 1491 abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); 1492 rr->rr_col[x].rc_size = 0; 1493 rr->rr_col[y].rc_size = 0; 1494 1495 vdev_raidz_generate_parity_pq(rr); 1496 1497 rr->rr_col[x].rc_size = xsize; 1498 rr->rr_col[y].rc_size = ysize; 1499 1500 p = abd_to_buf(pdata); 1501 q = abd_to_buf(qdata); 1502 pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 1503 qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 1504 xd = rr->rr_col[x].rc_abd; 1505 yd = rr->rr_col[y].rc_abd; 1506 1507 /* 1508 * We now have: 1509 * Pxy = P + D_x + D_y 1510 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 1511 * 1512 * We can then solve for D_x: 1513 * D_x = A * (P + Pxy) + B * (Q + Qxy) 1514 * where 1515 * A = 2^(x - y) * (2^(x - y) + 1)^-1 1516 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 1517 * 1518 * With D_x in hand, we can easily solve for D_y: 1519 * D_y = P + Pxy + D_x 1520 */ 1521 1522 a = vdev_raidz_pow2[255 + x - y]; 1523 b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)]; 1524 tmp = 255 - vdev_raidz_log2[a ^ 1]; 1525 1526 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 1527 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 1528 1529 ASSERT3U(xsize, >=, ysize); 1530 struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp }; 1531 1532 (void) abd_iterate_func2(xd, yd, 0, 0, ysize, 1533 vdev_raidz_reconst_pq_func, &rpq); 1534 (void) abd_iterate_func(xd, ysize, xsize - ysize, 1535 vdev_raidz_reconst_pq_tail_func, &rpq); 1536 1537 abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 1538 abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 1539 1540 /* 1541 * Restore the saved parity data. 1542 */ 1543 rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata; 1544 rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata; 1545 } 1546 1547 /* 1548 * In the general case of reconstruction, we must solve the system of linear 1549 * equations defined by the coefficients used to generate parity as well as 1550 * the contents of the data and parity disks. This can be expressed with 1551 * vectors for the original data (D) and the actual data (d) and parity (p) 1552 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 1553 * 1554 * __ __ __ __ 1555 * | | __ __ | p_0 | 1556 * | V | | D_0 | | p_m-1 | 1557 * | | x | : | = | d_0 | 1558 * | I | | D_n-1 | | : | 1559 * | | ~~ ~~ | d_n-1 | 1560 * ~~ ~~ ~~ ~~ 1561 * 1562 * I is simply a square identity matrix of size n, and V is a vandermonde 1563 * matrix defined by the coefficients we chose for the various parity columns 1564 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 1565 * computation as well as linear separability. 1566 * 1567 * __ __ __ __ 1568 * | 1 .. 1 1 1 | | p_0 | 1569 * | 2^n-1 .. 4 2 1 | __ __ | : | 1570 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 1571 * | 1 .. 0 0 0 | | D_1 | | d_0 | 1572 * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 1573 * | : : : : | | : | | d_2 | 1574 * | 0 .. 1 0 0 | | D_n-1 | | : | 1575 * | 0 .. 0 1 0 | ~~ ~~ | : | 1576 * | 0 .. 0 0 1 | | d_n-1 | 1577 * ~~ ~~ ~~ ~~ 1578 * 1579 * Note that I, V, d, and p are known. To compute D, we must invert the 1580 * matrix and use the known data and parity values to reconstruct the unknown 1581 * data values. 
We begin by removing the rows in V|I and d|p that correspond 1582 * to failed or missing columns; we then make V|I square (n x n) and d|p 1583 * sized n by removing rows corresponding to unused parity from the bottom up 1584 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 1585 * using Gauss-Jordan elimination. In the example below we use m=3 parity 1586 * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 1587 * __ __ 1588 * | 1 1 1 1 1 1 1 1 | 1589 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 1590 * | 19 205 116 29 64 16 4 1 | / / 1591 * | 1 0 0 0 0 0 0 0 | / / 1592 * | 0 1 0 0 0 0 0 0 | <--' / 1593 * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 1594 * | 0 0 0 1 0 0 0 0 | 1595 * | 0 0 0 0 1 0 0 0 | 1596 * | 0 0 0 0 0 1 0 0 | 1597 * | 0 0 0 0 0 0 1 0 | 1598 * | 0 0 0 0 0 0 0 1 | 1599 * ~~ ~~ 1600 * __ __ 1601 * | 1 1 1 1 1 1 1 1 | 1602 * | 128 64 32 16 8 4 2 1 | 1603 * | 19 205 116 29 64 16 4 1 | 1604 * | 1 0 0 0 0 0 0 0 | 1605 * | 0 1 0 0 0 0 0 0 | 1606 * (V|I)' = | 0 0 1 0 0 0 0 0 | 1607 * | 0 0 0 1 0 0 0 0 | 1608 * | 0 0 0 0 1 0 0 0 | 1609 * | 0 0 0 0 0 1 0 0 | 1610 * | 0 0 0 0 0 0 1 0 | 1611 * | 0 0 0 0 0 0 0 1 | 1612 * ~~ ~~ 1613 * 1614 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 1615 * have carefully chosen the seed values 1, 2, and 4 to ensure that this 1616 * matrix is not singular. 1617 * __ __ 1618 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1619 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1620 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1621 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1622 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1623 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1624 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1625 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1626 * ~~ ~~ 1627 * __ __ 1628 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1629 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1630 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1631 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1632 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1633 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1634 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1635 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1636 * ~~ ~~ 1637 * __ __ 1638 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1639 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1640 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 1641 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1642 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1643 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1644 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1645 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1646 * ~~ ~~ 1647 * __ __ 1648 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1649 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1650 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 1651 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1652 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1653 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1654 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1655 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1656 * ~~ ~~ 1657 * __ __ 1658 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1659 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1660 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1661 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1662 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1663 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1664 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1665 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1666 * ~~ ~~ 1667 * __ __ 1668 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1669 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 1670 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1671 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1672 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1673 * | 0 0 0 0 0 
1 0 0 0 0 0 0 0 1 0 0 | 1674 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1675 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1676 * ~~ ~~ 1677 * __ __ 1678 * | 0 0 1 0 0 0 0 0 | 1679 * | 167 100 5 41 159 169 217 208 | 1680 * | 166 100 4 40 158 168 216 209 | 1681 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 1682 * | 0 0 0 0 1 0 0 0 | 1683 * | 0 0 0 0 0 1 0 0 | 1684 * | 0 0 0 0 0 0 1 0 | 1685 * | 0 0 0 0 0 0 0 1 | 1686 * ~~ ~~ 1687 * 1688 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 1689 * of the missing data. 1690 * 1691 * As is apparent from the example above, the only non-trivial rows in the 1692 * inverse matrix correspond to the data disks that we're trying to 1693 * reconstruct. Indeed, those are the only rows we need as the others would 1694 * only be useful for reconstructing data known or assumed to be valid. For 1695 * that reason, we only build the coefficients in the rows that correspond to 1696 * targeted columns. 1697 */ 1698 1699 static void 1700 vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map, 1701 uint8_t **rows) 1702 { 1703 int i, j; 1704 int pow; 1705 1706 ASSERT(n == rr->rr_cols - rr->rr_firstdatacol); 1707 1708 /* 1709 * Fill in the missing rows of interest. 1710 */ 1711 for (i = 0; i < nmap; i++) { 1712 ASSERT3S(0, <=, map[i]); 1713 ASSERT3S(map[i], <=, 2); 1714 1715 pow = map[i] * n; 1716 if (pow > 255) 1717 pow -= 255; 1718 ASSERT(pow <= 255); 1719 1720 for (j = 0; j < n; j++) { 1721 pow -= map[i]; 1722 if (pow < 0) 1723 pow += 255; 1724 rows[i][j] = vdev_raidz_pow2[pow]; 1725 } 1726 } 1727 } 1728 1729 static void 1730 vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing, 1731 uint8_t **rows, uint8_t **invrows, const uint8_t *used) 1732 { 1733 int i, j, ii, jj; 1734 uint8_t log; 1735 1736 /* 1737 * Assert that the first nmissing entries from the array of used 1738 * columns correspond to parity columns and that subsequent entries 1739 * correspond to data columns. 1740 */ 1741 for (i = 0; i < nmissing; i++) { 1742 ASSERT3S(used[i], <, rr->rr_firstdatacol); 1743 } 1744 for (; i < n; i++) { 1745 ASSERT3S(used[i], >=, rr->rr_firstdatacol); 1746 } 1747 1748 /* 1749 * First initialize the storage where we'll compute the inverse rows. 1750 */ 1751 for (i = 0; i < nmissing; i++) { 1752 for (j = 0; j < n; j++) { 1753 invrows[i][j] = (i == j) ? 1 : 0; 1754 } 1755 } 1756 1757 /* 1758 * Subtract all trivial rows from the rows of consequence. 1759 */ 1760 for (i = 0; i < nmissing; i++) { 1761 for (j = nmissing; j < n; j++) { 1762 ASSERT3U(used[j], >=, rr->rr_firstdatacol); 1763 jj = used[j] - rr->rr_firstdatacol; 1764 ASSERT3S(jj, <, n); 1765 invrows[i][j] = rows[i][jj]; 1766 rows[i][jj] = 0; 1767 } 1768 } 1769 1770 /* 1771 * For each of the rows of interest, we must normalize it and subtract 1772 * a multiple of it from the other rows. 1773 */ 1774 for (i = 0; i < nmissing; i++) { 1775 for (j = 0; j < missing[i]; j++) { 1776 ASSERT0(rows[i][j]); 1777 } 1778 ASSERT3U(rows[i][missing[i]], !=, 0); 1779 1780 /* 1781 * Compute the inverse of the first element and multiply each 1782 * element in the row by that value. 
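 *
 * Illustrative sketch only (gf_inverse() and gf_mul() are hypothetical
 * helpers, not part of this file): in GF(2^8) the inverse of a nonzero
 * element a is 2^(255 - log2(a)), so both the inverse and a general
 * multiply fall out of the same log/exp tables used below:
 *
 *	static uint8_t
 *	gf_inverse(uint8_t a)			// assumes a != 0
 *	{
 *		return (vdev_raidz_pow2[255 - vdev_raidz_log2[a]]);
 *	}
 *
 *	static uint8_t
 *	gf_mul(uint8_t a, uint8_t b)
 *	{
 *		if (a == 0 || b == 0)
 *			return (0);
 *		return (vdev_raidz_pow2[(vdev_raidz_log2[a] +
 *		    vdev_raidz_log2[b]) % 255]);
 *	}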
1783 */ 1784 log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 1785 1786 for (j = 0; j < n; j++) { 1787 rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 1788 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 1789 } 1790 1791 for (ii = 0; ii < nmissing; ii++) { 1792 if (i == ii) 1793 continue; 1794 1795 ASSERT3U(rows[ii][missing[i]], !=, 0); 1796 1797 log = vdev_raidz_log2[rows[ii][missing[i]]]; 1798 1799 for (j = 0; j < n; j++) { 1800 rows[ii][j] ^= 1801 vdev_raidz_exp2(rows[i][j], log); 1802 invrows[ii][j] ^= 1803 vdev_raidz_exp2(invrows[i][j], log); 1804 } 1805 } 1806 } 1807 1808 /* 1809 * Verify that the data that is left in the rows are properly part of 1810 * an identity matrix. 1811 */ 1812 for (i = 0; i < nmissing; i++) { 1813 for (j = 0; j < n; j++) { 1814 if (j == missing[i]) { 1815 ASSERT3U(rows[i][j], ==, 1); 1816 } else { 1817 ASSERT0(rows[i][j]); 1818 } 1819 } 1820 } 1821 } 1822 1823 static void 1824 vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing, 1825 int *missing, uint8_t **invrows, const uint8_t *used) 1826 { 1827 int i, j, x, cc, c; 1828 uint8_t *src; 1829 uint64_t ccount; 1830 uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL }; 1831 uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 }; 1832 uint8_t log = 0; 1833 uint8_t val; 1834 int ll; 1835 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 1836 uint8_t *p, *pp; 1837 size_t psize; 1838 1839 psize = sizeof (invlog[0][0]) * n * nmissing; 1840 p = kmem_alloc(psize, KM_SLEEP); 1841 1842 for (pp = p, i = 0; i < nmissing; i++) { 1843 invlog[i] = pp; 1844 pp += n; 1845 } 1846 1847 for (i = 0; i < nmissing; i++) { 1848 for (j = 0; j < n; j++) { 1849 ASSERT3U(invrows[i][j], !=, 0); 1850 invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 1851 } 1852 } 1853 1854 for (i = 0; i < n; i++) { 1855 c = used[i]; 1856 ASSERT3U(c, <, rr->rr_cols); 1857 1858 ccount = rr->rr_col[c].rc_size; 1859 ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0); 1860 if (ccount == 0) 1861 continue; 1862 src = abd_to_buf(rr->rr_col[c].rc_abd); 1863 for (j = 0; j < nmissing; j++) { 1864 cc = missing[j] + rr->rr_firstdatacol; 1865 ASSERT3U(cc, >=, rr->rr_firstdatacol); 1866 ASSERT3U(cc, <, rr->rr_cols); 1867 ASSERT3U(cc, !=, c); 1868 1869 dcount[j] = rr->rr_col[cc].rc_size; 1870 if (dcount[j] != 0) 1871 dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd); 1872 } 1873 1874 for (x = 0; x < ccount; x++, src++) { 1875 if (*src != 0) 1876 log = vdev_raidz_log2[*src]; 1877 1878 for (cc = 0; cc < nmissing; cc++) { 1879 if (x >= dcount[cc]) 1880 continue; 1881 1882 if (*src == 0) { 1883 val = 0; 1884 } else { 1885 if ((ll = log + invlog[cc][i]) >= 255) 1886 ll -= 255; 1887 val = vdev_raidz_pow2[ll]; 1888 } 1889 1890 if (i == 0) 1891 dst[cc][x] = val; 1892 else 1893 dst[cc][x] ^= val; 1894 } 1895 } 1896 } 1897 1898 kmem_free(p, psize); 1899 } 1900 1901 static void 1902 vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) 1903 { 1904 int i, c, t, tt; 1905 unsigned int n; 1906 unsigned int nmissing_rows; 1907 int missing_rows[VDEV_RAIDZ_MAXPARITY]; 1908 int parity_map[VDEV_RAIDZ_MAXPARITY]; 1909 uint8_t *p, *pp; 1910 size_t psize; 1911 uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 1912 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 1913 uint8_t *used; 1914 1915 abd_t **bufs = NULL; 1916 1917 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1918 zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts); 1919 /* 1920 * Matrix reconstruction can't use scatter ABDs yet, so we allocate 1921 * temporary linear ABDs if any non-linear ABDs are found. 
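 *
 * The staging pattern used below, reduced to a sketch (stage_linear()
 * is a hypothetical helper; the real loop also remembers the original
 * ABDs in bufs[] so they can be restored and freed afterwards):
 *
 *	static abd_t *
 *	stage_linear(abd_t *src, uint64_t size)
 *	{
 *		abd_t *tmp = abd_alloc_linear(size, B_TRUE);
 *		abd_copy(tmp, src, size);	// flatten for direct byte access
 *		return (tmp);
 *	}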
1922 */ 1923 for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) { 1924 ASSERT(rr->rr_col[i].rc_abd != NULL); 1925 if (!abd_is_linear(rr->rr_col[i].rc_abd)) { 1926 bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *), 1927 KM_PUSHPAGE); 1928 1929 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1930 raidz_col_t *col = &rr->rr_col[c]; 1931 1932 bufs[c] = col->rc_abd; 1933 if (bufs[c] != NULL) { 1934 col->rc_abd = abd_alloc_linear( 1935 col->rc_size, B_TRUE); 1936 abd_copy(col->rc_abd, bufs[c], 1937 col->rc_size); 1938 } 1939 } 1940 1941 break; 1942 } 1943 } 1944 1945 n = rr->rr_cols - rr->rr_firstdatacol; 1946 1947 /* 1948 * Figure out which data columns are missing. 1949 */ 1950 nmissing_rows = 0; 1951 for (t = 0; t < ntgts; t++) { 1952 if (tgts[t] >= rr->rr_firstdatacol) { 1953 missing_rows[nmissing_rows++] = 1954 tgts[t] - rr->rr_firstdatacol; 1955 } 1956 } 1957 1958 /* 1959 * Figure out which parity columns to use to help generate the missing 1960 * data columns. 1961 */ 1962 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 1963 ASSERT(tt < ntgts); 1964 ASSERT(c < rr->rr_firstdatacol); 1965 1966 /* 1967 * Skip any targeted parity columns. 1968 */ 1969 if (c == tgts[tt]) { 1970 tt++; 1971 continue; 1972 } 1973 1974 parity_map[i] = c; 1975 i++; 1976 } 1977 1978 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1979 nmissing_rows * n + sizeof (used[0]) * n; 1980 p = kmem_alloc(psize, KM_SLEEP); 1981 1982 for (pp = p, i = 0; i < nmissing_rows; i++) { 1983 rows[i] = pp; 1984 pp += n; 1985 invrows[i] = pp; 1986 pp += n; 1987 } 1988 used = pp; 1989 1990 for (i = 0; i < nmissing_rows; i++) { 1991 used[i] = parity_map[i]; 1992 } 1993 1994 for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1995 if (tt < nmissing_rows && 1996 c == missing_rows[tt] + rr->rr_firstdatacol) { 1997 tt++; 1998 continue; 1999 } 2000 2001 ASSERT3S(i, <, n); 2002 used[i] = c; 2003 i++; 2004 } 2005 2006 /* 2007 * Initialize the interesting rows of the matrix. 2008 */ 2009 vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows); 2010 2011 /* 2012 * Invert the matrix. 2013 */ 2014 vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows, 2015 invrows, used); 2016 2017 /* 2018 * Reconstruct the missing data using the generated matrix. 
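 *
 * Conceptually this is the product D = (V|I)'^-1 x (d|p)' described in
 * the comment above vdev_raidz_matrix_init(). A simplified per-byte
 * sketch, using the gf_mul() sketch from earlier (surviving[] and
 * reconstructed[] are illustrative names; the real code keeps the
 * inverse rows in log form and streams whole columns at a time):
 *
 *	for (int m = 0; m < nmissing; m++) {
 *		uint8_t v = 0;
 *		for (int k = 0; k < n; k++)
 *			v ^= gf_mul(invrows[m][k], surviving[k]);
 *		reconstructed[m] = v;
 *	}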
2019 */ 2020 vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows, 2021 invrows, used); 2022 2023 kmem_free(p, psize); 2024 2025 /* 2026 * copy back from temporary linear abds and free them 2027 */ 2028 if (bufs) { 2029 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 2030 raidz_col_t *col = &rr->rr_col[c]; 2031 2032 if (bufs[c] != NULL) { 2033 abd_copy(bufs[c], col->rc_abd, col->rc_size); 2034 abd_free(col->rc_abd); 2035 } 2036 col->rc_abd = bufs[c]; 2037 } 2038 kmem_free(bufs, rr->rr_cols * sizeof (abd_t *)); 2039 } 2040 } 2041 2042 static void 2043 vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr, 2044 const int *t, int nt) 2045 { 2046 int tgts[VDEV_RAIDZ_MAXPARITY], *dt; 2047 int ntgts; 2048 int i, c, ret; 2049 int nbadparity, nbaddata; 2050 int parity_valid[VDEV_RAIDZ_MAXPARITY]; 2051 2052 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { 2053 zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)", 2054 rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata, 2055 (int)rr->rr_missingparity); 2056 } 2057 2058 nbadparity = rr->rr_firstdatacol; 2059 nbaddata = rr->rr_cols - nbadparity; 2060 ntgts = 0; 2061 for (i = 0, c = 0; c < rr->rr_cols; c++) { 2062 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { 2063 zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u " 2064 "offset=%llx error=%u)", 2065 rr, c, (int)rr->rr_col[c].rc_devidx, 2066 (long long)rr->rr_col[c].rc_offset, 2067 (int)rr->rr_col[c].rc_error); 2068 } 2069 if (c < rr->rr_firstdatacol) 2070 parity_valid[c] = B_FALSE; 2071 2072 if (i < nt && c == t[i]) { 2073 tgts[ntgts++] = c; 2074 i++; 2075 } else if (rr->rr_col[c].rc_error != 0) { 2076 tgts[ntgts++] = c; 2077 } else if (c >= rr->rr_firstdatacol) { 2078 nbaddata--; 2079 } else { 2080 parity_valid[c] = B_TRUE; 2081 nbadparity--; 2082 } 2083 } 2084 2085 ASSERT(ntgts >= nt); 2086 ASSERT(nbaddata >= 0); 2087 ASSERT(nbaddata + nbadparity == ntgts); 2088 2089 dt = &tgts[nbadparity]; 2090 2091 /* Reconstruct using the new math implementation */ 2092 ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata); 2093 if (ret != RAIDZ_ORIGINAL_IMPL) 2094 return; 2095 2096 /* 2097 * See if we can use any of our optimized reconstruction routines. 
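 *
 * For example, with a single missing data column and valid P parity,
 * the missing bytes are simply the XOR of P with every surviving data
 * column, which is what vdev_raidz_reconstruct_p() computes (names
 * below are illustrative only):
 *
 *	uint8_t d = p[x];
 *	for (int c = 0; c < ndata; c++) {
 *		if (c == missing)
 *			continue;
 *		d ^= data[c][x];	// GF(2^8) addition is XOR
 *	}
 *	// d is now byte x of the missing column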
2098 */ 2099 switch (nbaddata) { 2100 case 1: 2101 if (parity_valid[VDEV_RAIDZ_P]) { 2102 vdev_raidz_reconstruct_p(rr, dt, 1); 2103 return; 2104 } 2105 2106 ASSERT(rr->rr_firstdatacol > 1); 2107 2108 if (parity_valid[VDEV_RAIDZ_Q]) { 2109 vdev_raidz_reconstruct_q(rr, dt, 1); 2110 return; 2111 } 2112 2113 ASSERT(rr->rr_firstdatacol > 2); 2114 break; 2115 2116 case 2: 2117 ASSERT(rr->rr_firstdatacol > 1); 2118 2119 if (parity_valid[VDEV_RAIDZ_P] && 2120 parity_valid[VDEV_RAIDZ_Q]) { 2121 vdev_raidz_reconstruct_pq(rr, dt, 2); 2122 return; 2123 } 2124 2125 ASSERT(rr->rr_firstdatacol > 2); 2126 2127 break; 2128 } 2129 2130 vdev_raidz_reconstruct_general(rr, tgts, ntgts); 2131 } 2132 2133 static int 2134 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, 2135 uint64_t *logical_ashift, uint64_t *physical_ashift) 2136 { 2137 vdev_raidz_t *vdrz = vd->vdev_tsd; 2138 uint64_t nparity = vdrz->vd_nparity; 2139 int c; 2140 int lasterror = 0; 2141 int numerrors = 0; 2142 2143 ASSERT(nparity > 0); 2144 2145 if (nparity > VDEV_RAIDZ_MAXPARITY || 2146 vd->vdev_children < nparity + 1) { 2147 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 2148 return (SET_ERROR(EINVAL)); 2149 } 2150 2151 vdev_open_children(vd); 2152 2153 for (c = 0; c < vd->vdev_children; c++) { 2154 vdev_t *cvd = vd->vdev_child[c]; 2155 2156 if (cvd->vdev_open_error != 0) { 2157 lasterror = cvd->vdev_open_error; 2158 numerrors++; 2159 continue; 2160 } 2161 2162 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 2163 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; 2164 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); 2165 } 2166 for (c = 0; c < vd->vdev_children; c++) { 2167 vdev_t *cvd = vd->vdev_child[c]; 2168 2169 if (cvd->vdev_open_error != 0) 2170 continue; 2171 *physical_ashift = vdev_best_ashift(*logical_ashift, 2172 *physical_ashift, cvd->vdev_physical_ashift); 2173 } 2174 2175 if (vd->vdev_rz_expanding) { 2176 *asize *= vd->vdev_children - 1; 2177 *max_asize *= vd->vdev_children - 1; 2178 2179 vd->vdev_min_asize = *asize; 2180 } else { 2181 *asize *= vd->vdev_children; 2182 *max_asize *= vd->vdev_children; 2183 } 2184 2185 if (numerrors > nparity) { 2186 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 2187 return (lasterror); 2188 } 2189 2190 return (0); 2191 } 2192 2193 static void 2194 vdev_raidz_close(vdev_t *vd) 2195 { 2196 for (int c = 0; c < vd->vdev_children; c++) { 2197 if (vd->vdev_child[c] != NULL) 2198 vdev_close(vd->vdev_child[c]); 2199 } 2200 } 2201 2202 /* 2203 * Return the logical width to use, given the txg in which the allocation 2204 * happened. Note that BP_GET_BIRTH() is usually the txg in which the 2205 * BP was allocated. Remapped BP's (that were relocated due to device 2206 * removal, see remap_blkptr_cb()), will have a more recent physical birth 2207 * which reflects when the BP was relocated, but we can ignore these because 2208 * they can't be on RAIDZ (device removal doesn't support RAIDZ). 
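 *
 * For example (hypothetical txgs): if a 4-wide RAIDZ recorded
 * expansions to width 5 at txg 1000 and to width 6 at txg 2000, then a
 * BP born in txg 1500 maps to logical width 5, one born before txg
 * 1000 maps to the original width of 4, and one born at or after txg
 * 2000 maps to 6.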
2209 */ 2210 static uint64_t 2211 vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg) 2212 { 2213 reflow_node_t lookup = { 2214 .re_txg = txg, 2215 }; 2216 avl_index_t where; 2217 2218 uint64_t width; 2219 mutex_enter(&vdrz->vd_expand_lock); 2220 reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where); 2221 if (re != NULL) { 2222 width = re->re_logical_width; 2223 } else { 2224 re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE); 2225 if (re != NULL) 2226 width = re->re_logical_width; 2227 else 2228 width = vdrz->vd_original_width; 2229 } 2230 mutex_exit(&vdrz->vd_expand_lock); 2231 return (width); 2232 } 2233 2234 /* 2235 * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated 2236 * more space due to the lower data-to-parity ratio. In this case it's 2237 * important to pass in the correct txg. Note that vdev_gang_header_asize() 2238 * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE, 2239 * regardless of txg. This is assured because for a single data sector, we 2240 * allocate P+1 sectors regardless of width ("cols", which is at least P+1). 2241 */ 2242 static uint64_t 2243 vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg) 2244 { 2245 vdev_raidz_t *vdrz = vd->vdev_tsd; 2246 uint64_t asize; 2247 uint64_t ashift = vd->vdev_top->vdev_ashift; 2248 uint64_t cols = vdrz->vd_original_width; 2249 uint64_t nparity = vdrz->vd_nparity; 2250 2251 cols = vdev_raidz_get_logical_width(vdrz, txg); 2252 2253 asize = ((psize - 1) >> ashift) + 1; 2254 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 2255 asize = roundup(asize, nparity + 1) << ashift; 2256 2257 #ifdef ZFS_DEBUG 2258 uint64_t asize_new = ((psize - 1) >> ashift) + 1; 2259 uint64_t ncols_new = vdrz->vd_physical_width; 2260 asize_new += nparity * ((asize_new + ncols_new - nparity - 1) / 2261 (ncols_new - nparity)); 2262 asize_new = roundup(asize_new, nparity + 1) << ashift; 2263 VERIFY3U(asize_new, <=, asize); 2264 #endif 2265 2266 return (asize); 2267 } 2268 2269 /* 2270 * The allocatable space for a raidz vdev is N * sizeof(smallest child) 2271 * so each child must provide at least 1/Nth of its asize. 2272 */ 2273 static uint64_t 2274 vdev_raidz_min_asize(vdev_t *vd) 2275 { 2276 return ((vd->vdev_min_asize + vd->vdev_children - 1) / 2277 vd->vdev_children); 2278 } 2279 2280 void 2281 vdev_raidz_child_done(zio_t *zio) 2282 { 2283 raidz_col_t *rc = zio->io_private; 2284 2285 ASSERT3P(rc->rc_abd, !=, NULL); 2286 rc->rc_error = zio->io_error; 2287 rc->rc_tried = 1; 2288 rc->rc_skipped = 0; 2289 } 2290 2291 static void 2292 vdev_raidz_shadow_child_done(zio_t *zio) 2293 { 2294 raidz_col_t *rc = zio->io_private; 2295 2296 rc->rc_shadow_error = zio->io_error; 2297 } 2298 2299 static void 2300 vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col) 2301 { 2302 (void) rm; 2303 #ifdef ZFS_DEBUG 2304 range_seg64_t logical_rs, physical_rs, remain_rs; 2305 logical_rs.rs_start = rr->rr_offset; 2306 logical_rs.rs_end = logical_rs.rs_start + 2307 vdev_raidz_asize(zio->io_vd, rr->rr_size, 2308 BP_GET_BIRTH(zio->io_bp)); 2309 2310 raidz_col_t *rc = &rr->rr_col[col]; 2311 vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; 2312 2313 vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs); 2314 ASSERT(vdev_xlate_is_empty(&remain_rs)); 2315 if (vdev_xlate_is_empty(&physical_rs)) { 2316 /* 2317 * If we are in the middle of expansion, the 2318 * physical->logical mapping is changing so vdev_xlate() 2319 * can't give us a reliable answer. 
2320 */ 2321 return; 2322 } 2323 ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); 2324 ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); 2325 /* 2326 * It would be nice to assert that rs_end is equal 2327 * to rc_offset + rc_size but there might be an 2328 * optional I/O at the end that is not accounted in 2329 * rc_size. 2330 */ 2331 if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { 2332 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + 2333 rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift)); 2334 } else { 2335 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); 2336 } 2337 #endif 2338 } 2339 2340 static void 2341 vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr) 2342 { 2343 vdev_t *vd = zio->io_vd; 2344 raidz_map_t *rm = zio->io_vsd; 2345 2346 vdev_raidz_generate_parity_row(rm, rr); 2347 2348 for (int c = 0; c < rr->rr_scols; c++) { 2349 raidz_col_t *rc = &rr->rr_col[c]; 2350 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 2351 2352 /* Verify physical to logical translation */ 2353 vdev_raidz_io_verify(zio, rm, rr, c); 2354 2355 if (rc->rc_size == 0) 2356 continue; 2357 2358 ASSERT3U(rc->rc_offset + rc->rc_size, <, 2359 cvd->vdev_psize - VDEV_LABEL_END_SIZE); 2360 2361 ASSERT3P(rc->rc_abd, !=, NULL); 2362 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2363 rc->rc_offset, rc->rc_abd, 2364 abd_get_size(rc->rc_abd), zio->io_type, 2365 zio->io_priority, 0, vdev_raidz_child_done, rc)); 2366 2367 if (rc->rc_shadow_devidx != INT_MAX) { 2368 vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx]; 2369 2370 ASSERT3U( 2371 rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <, 2372 cvd2->vdev_psize - VDEV_LABEL_END_SIZE); 2373 2374 zio_nowait(zio_vdev_child_io(zio, NULL, cvd2, 2375 rc->rc_shadow_offset, rc->rc_abd, 2376 abd_get_size(rc->rc_abd), 2377 zio->io_type, zio->io_priority, 0, 2378 vdev_raidz_shadow_child_done, rc)); 2379 } 2380 } 2381 } 2382 2383 /* 2384 * Generate optional I/Os for skip sectors to improve aggregation contiguity. 2385 * This only works for vdev_raidz_map_alloc() (not _expanded()). 2386 */ 2387 static void 2388 raidz_start_skip_writes(zio_t *zio) 2389 { 2390 vdev_t *vd = zio->io_vd; 2391 uint64_t ashift = vd->vdev_top->vdev_ashift; 2392 raidz_map_t *rm = zio->io_vsd; 2393 ASSERT3U(rm->rm_nrows, ==, 1); 2394 raidz_row_t *rr = rm->rm_row[0]; 2395 for (int c = 0; c < rr->rr_scols; c++) { 2396 raidz_col_t *rc = &rr->rr_col[c]; 2397 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 2398 if (rc->rc_size != 0) 2399 continue; 2400 ASSERT3P(rc->rc_abd, ==, NULL); 2401 2402 ASSERT3U(rc->rc_offset, <, 2403 cvd->vdev_psize - VDEV_LABEL_END_SIZE); 2404 2405 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, 2406 NULL, 1ULL << ashift, zio->io_type, zio->io_priority, 2407 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); 2408 } 2409 } 2410 2411 static void 2412 vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) 2413 { 2414 vdev_t *vd = zio->io_vd; 2415 2416 /* 2417 * Iterate over the columns in reverse order so that we hit the parity 2418 * last -- any errors along the way will force us to read the parity. 
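 *
 * For example, on a healthy vdev a normal read only issues I/O to the
 * data columns; the parity columns, visited last, are skipped because
 * rr_missingdata is still zero. If a data column turns out to be
 * unreadable or stale, rr_missingdata is bumped before we reach the
 * parity columns, so they are read as well.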
2419 */ 2420 for (int c = rr->rr_cols - 1; c >= 0; c--) { 2421 raidz_col_t *rc = &rr->rr_col[c]; 2422 if (rc->rc_size == 0) 2423 continue; 2424 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 2425 if (!vdev_readable(cvd)) { 2426 if (c >= rr->rr_firstdatacol) 2427 rr->rr_missingdata++; 2428 else 2429 rr->rr_missingparity++; 2430 rc->rc_error = SET_ERROR(ENXIO); 2431 rc->rc_tried = 1; /* don't even try */ 2432 rc->rc_skipped = 1; 2433 continue; 2434 } 2435 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 2436 if (c >= rr->rr_firstdatacol) 2437 rr->rr_missingdata++; 2438 else 2439 rr->rr_missingparity++; 2440 rc->rc_error = SET_ERROR(ESTALE); 2441 rc->rc_skipped = 1; 2442 continue; 2443 } 2444 if (forceparity || 2445 c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || 2446 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { 2447 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2448 rc->rc_offset, rc->rc_abd, rc->rc_size, 2449 zio->io_type, zio->io_priority, 0, 2450 vdev_raidz_child_done, rc)); 2451 } 2452 } 2453 } 2454 2455 static void 2456 vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm) 2457 { 2458 vdev_t *vd = zio->io_vd; 2459 2460 for (int i = 0; i < rm->rm_nphys_cols; i++) { 2461 raidz_col_t *prc = &rm->rm_phys_col[i]; 2462 if (prc->rc_size == 0) 2463 continue; 2464 2465 ASSERT3U(prc->rc_devidx, ==, i); 2466 vdev_t *cvd = vd->vdev_child[i]; 2467 if (!vdev_readable(cvd)) { 2468 prc->rc_error = SET_ERROR(ENXIO); 2469 prc->rc_tried = 1; /* don't even try */ 2470 prc->rc_skipped = 1; 2471 continue; 2472 } 2473 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 2474 prc->rc_error = SET_ERROR(ESTALE); 2475 prc->rc_skipped = 1; 2476 continue; 2477 } 2478 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2479 prc->rc_offset, prc->rc_abd, prc->rc_size, 2480 zio->io_type, zio->io_priority, 0, 2481 vdev_raidz_child_done, prc)); 2482 } 2483 } 2484 2485 static void 2486 vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm) 2487 { 2488 /* 2489 * If there are multiple rows, we will be hitting 2490 * all disks, so go ahead and read the parity so 2491 * that we are reading in decent size chunks. 2492 */ 2493 boolean_t forceparity = rm->rm_nrows > 1; 2494 2495 if (rm->rm_phys_col) { 2496 vdev_raidz_io_start_read_phys_cols(zio, rm); 2497 } else { 2498 for (int i = 0; i < rm->rm_nrows; i++) { 2499 raidz_row_t *rr = rm->rm_row[i]; 2500 vdev_raidz_io_start_read_row(zio, rr, forceparity); 2501 } 2502 } 2503 } 2504 2505 /* 2506 * Start an IO operation on a RAIDZ VDev 2507 * 2508 * Outline: 2509 * - For write operations: 2510 * 1. Generate the parity data 2511 * 2. Create child zio write operations to each column's vdev, for both 2512 * data and parity. 2513 * 3. If the column skips any sectors for padding, create optional dummy 2514 * write zio children for those areas to improve aggregation continuity. 2515 * - For read operations: 2516 * 1. Create child zio read operations to each data column's vdev to read 2517 * the range of data required for zio. 2518 * 2. If this is a scrub or resilver operation, or if any of the data 2519 * vdevs have had errors, then create zio read operations to the parity 2520 * columns' VDevs as well. 
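 *
 * A condensed sketch of the write path above (assertions, skip-sector
 * handling, and expansion shadow writes omitted; see
 * vdev_raidz_io_start_write() for the real logic):
 *
 *	vdev_raidz_generate_parity_row(rm, rr);
 *	for (int c = 0; c < rr->rr_scols; c++) {
 *		raidz_col_t *rc = &rr->rr_col[c];
 *		if (rc->rc_size == 0)
 *			continue;
 *		zio_nowait(zio_vdev_child_io(zio, NULL,
 *		    vd->vdev_child[rc->rc_devidx], rc->rc_offset,
 *		    rc->rc_abd, abd_get_size(rc->rc_abd), zio->io_type,
 *		    zio->io_priority, 0, vdev_raidz_child_done, rc));
 *	}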
2521 */ 2522 static void 2523 vdev_raidz_io_start(zio_t *zio) 2524 { 2525 vdev_t *vd = zio->io_vd; 2526 vdev_t *tvd = vd->vdev_top; 2527 vdev_raidz_t *vdrz = vd->vdev_tsd; 2528 raidz_map_t *rm; 2529 2530 uint64_t logical_width = vdev_raidz_get_logical_width(vdrz, 2531 BP_GET_BIRTH(zio->io_bp)); 2532 if (logical_width != vdrz->vd_physical_width) { 2533 zfs_locked_range_t *lr = NULL; 2534 uint64_t synced_offset = UINT64_MAX; 2535 uint64_t next_offset = UINT64_MAX; 2536 boolean_t use_scratch = B_FALSE; 2537 /* 2538 * Note: when the expansion is completing, we set 2539 * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync()) 2540 * in a later txg than when we last update spa_ubsync's state 2541 * (see the end of spa_raidz_expand_thread()). Therefore we 2542 * may see vre_state!=SCANNING before 2543 * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected 2544 * on disk, but the copying progress has been synced to disk 2545 * (and reflected in spa_ubsync). In this case it's fine to 2546 * treat the expansion as completed, since if we crash there's 2547 * no additional copying to do. 2548 */ 2549 if (vdrz->vn_vre.vre_state == DSS_SCANNING) { 2550 ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==, 2551 &vdrz->vn_vre); 2552 lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock, 2553 zio->io_offset, zio->io_size, RL_READER); 2554 use_scratch = 2555 (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) == 2556 RRSS_SCRATCH_VALID); 2557 synced_offset = 2558 RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync); 2559 next_offset = vdrz->vn_vre.vre_offset; 2560 /* 2561 * If we haven't resumed expanding since importing the 2562 * pool, vre_offset won't have been set yet. In 2563 * this case the next offset to be copied is the same 2564 * as what was synced. 2565 */ 2566 if (next_offset == UINT64_MAX) { 2567 next_offset = synced_offset; 2568 } 2569 } 2570 if (use_scratch) { 2571 zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced=" 2572 "%lld next_offset=%lld use_scratch=%u", 2573 zio, 2574 zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ", 2575 (long long)zio->io_offset, 2576 (long long)synced_offset, 2577 (long long)next_offset, 2578 use_scratch); 2579 } 2580 2581 rm = vdev_raidz_map_alloc_expanded(zio, 2582 tvd->vdev_ashift, vdrz->vd_physical_width, 2583 logical_width, vdrz->vd_nparity, 2584 synced_offset, next_offset, use_scratch); 2585 rm->rm_lr = lr; 2586 } else { 2587 rm = vdev_raidz_map_alloc(zio, 2588 tvd->vdev_ashift, logical_width, vdrz->vd_nparity); 2589 } 2590 rm->rm_original_width = vdrz->vd_original_width; 2591 2592 zio->io_vsd = rm; 2593 zio->io_vsd_ops = &vdev_raidz_vsd_ops; 2594 if (zio->io_type == ZIO_TYPE_WRITE) { 2595 for (int i = 0; i < rm->rm_nrows; i++) { 2596 vdev_raidz_io_start_write(zio, rm->rm_row[i]); 2597 } 2598 2599 if (logical_width == vdrz->vd_physical_width) { 2600 raidz_start_skip_writes(zio); 2601 } 2602 } else { 2603 ASSERT(zio->io_type == ZIO_TYPE_READ); 2604 vdev_raidz_io_start_read(zio, rm); 2605 } 2606 2607 zio_execute(zio); 2608 } 2609 2610 /* 2611 * Report a checksum error for a child of a RAID-Z device. 
2612 */ 2613 void 2614 vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) 2615 { 2616 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 2617 2618 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) && 2619 zio->io_priority != ZIO_PRIORITY_REBUILD) { 2620 zio_bad_cksum_t zbc; 2621 raidz_map_t *rm = zio->io_vsd; 2622 2623 zbc.zbc_has_cksum = 0; 2624 zbc.zbc_injected = rm->rm_ecksuminjected; 2625 2626 mutex_enter(&vd->vdev_stat_lock); 2627 vd->vdev_stat.vs_checksum_errors++; 2628 mutex_exit(&vd->vdev_stat_lock); 2629 (void) zfs_ereport_post_checksum(zio->io_spa, vd, 2630 &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, 2631 rc->rc_abd, bad_data, &zbc); 2632 } 2633 } 2634 2635 /* 2636 * We keep track of whether or not there were any injected errors, so that 2637 * any ereports we generate can note it. 2638 */ 2639 static int 2640 raidz_checksum_verify(zio_t *zio) 2641 { 2642 zio_bad_cksum_t zbc = {0}; 2643 raidz_map_t *rm = zio->io_vsd; 2644 2645 int ret = zio_checksum_error(zio, &zbc); 2646 /* 2647 * Any Direct I/O read that has a checksum error must be treated as 2648 * suspicious as the contents of the buffer could be getting 2649 * manipulated while the I/O is taking place. The checksum verify error 2650 * will be reported to the top-level RAIDZ VDEV. 2651 */ 2652 if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) { 2653 zio->io_error = ret; 2654 zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; 2655 zio_dio_chksum_verify_error_report(zio); 2656 zio_checksum_verified(zio); 2657 return (0); 2658 } 2659 2660 if (ret != 0 && zbc.zbc_injected != 0) 2661 rm->rm_ecksuminjected = 1; 2662 2663 return (ret); 2664 } 2665 2666 /* 2667 * Generate the parity from the data columns. If we tried and were able to 2668 * read the parity without error, verify that the generated parity matches the 2669 * data we read. If it doesn't, we fire off a checksum error. Return the 2670 * number of such failures. 2671 */ 2672 static int 2673 raidz_parity_verify(zio_t *zio, raidz_row_t *rr) 2674 { 2675 abd_t *orig[VDEV_RAIDZ_MAXPARITY]; 2676 int c, ret = 0; 2677 raidz_map_t *rm = zio->io_vsd; 2678 raidz_col_t *rc; 2679 2680 blkptr_t *bp = zio->io_bp; 2681 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : 2682 (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); 2683 2684 if (checksum == ZIO_CHECKSUM_NOPARITY) 2685 return (ret); 2686 2687 for (c = 0; c < rr->rr_firstdatacol; c++) { 2688 rc = &rr->rr_col[c]; 2689 if (!rc->rc_tried || rc->rc_error != 0) 2690 continue; 2691 2692 orig[c] = rc->rc_abd; 2693 ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size); 2694 rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); 2695 } 2696 2697 /* 2698 * Verify any empty sectors are zero filled to ensure the parity 2699 * is calculated correctly even if these non-data sectors are damaged. 2700 */ 2701 if (rr->rr_nempty && rr->rr_abd_empty != NULL) 2702 ret += vdev_draid_map_verify_empty(zio, rr); 2703 2704 /* 2705 * Regenerates parity even for !tried||rc_error!=0 columns. This 2706 * isn't harmful but it does have the side effect of fixing stuff 2707 * we didn't realize was necessary (i.e. even if we return 0). 
2708 */ 2709 vdev_raidz_generate_parity_row(rm, rr); 2710 2711 for (c = 0; c < rr->rr_firstdatacol; c++) { 2712 rc = &rr->rr_col[c]; 2713 2714 if (!rc->rc_tried || rc->rc_error != 0) 2715 continue; 2716 2717 if (abd_cmp(orig[c], rc->rc_abd) != 0) { 2718 zfs_dbgmsg("found error on col=%u devidx=%u off %llx", 2719 c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset); 2720 vdev_raidz_checksum_error(zio, rc, orig[c]); 2721 rc->rc_error = SET_ERROR(ECKSUM); 2722 ret++; 2723 } 2724 abd_free(orig[c]); 2725 } 2726 2727 return (ret); 2728 } 2729 2730 static int 2731 vdev_raidz_worst_error(raidz_row_t *rr) 2732 { 2733 int error = 0; 2734 2735 for (int c = 0; c < rr->rr_cols; c++) { 2736 error = zio_worst_error(error, rr->rr_col[c].rc_error); 2737 error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error); 2738 } 2739 2740 return (error); 2741 } 2742 2743 static void 2744 vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) 2745 { 2746 int unexpected_errors = 0; 2747 int parity_errors = 0; 2748 int parity_untried = 0; 2749 int data_errors = 0; 2750 2751 ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); 2752 2753 for (int c = 0; c < rr->rr_cols; c++) { 2754 raidz_col_t *rc = &rr->rr_col[c]; 2755 2756 if (rc->rc_error) { 2757 if (c < rr->rr_firstdatacol) 2758 parity_errors++; 2759 else 2760 data_errors++; 2761 2762 if (!rc->rc_skipped) 2763 unexpected_errors++; 2764 } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { 2765 parity_untried++; 2766 } 2767 2768 if (rc->rc_force_repair) 2769 unexpected_errors++; 2770 } 2771 2772 /* 2773 * If we read more parity disks than were used for 2774 * reconstruction, confirm that the other parity disks produced 2775 * correct data. 2776 * 2777 * Note that we also regenerate parity when resilvering so we 2778 * can write it out to failed devices later. 2779 */ 2780 if (parity_errors + parity_untried < 2781 rr->rr_firstdatacol - data_errors || 2782 (zio->io_flags & ZIO_FLAG_RESILVER)) { 2783 int n = raidz_parity_verify(zio, rr); 2784 unexpected_errors += n; 2785 } 2786 2787 if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 2788 (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) { 2789 /* 2790 * Use the good data we have in hand to repair damaged children. 2791 */ 2792 for (int c = 0; c < rr->rr_cols; c++) { 2793 raidz_col_t *rc = &rr->rr_col[c]; 2794 vdev_t *vd = zio->io_vd; 2795 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 2796 2797 if (!rc->rc_allow_repair) { 2798 continue; 2799 } else if (!rc->rc_force_repair && 2800 (rc->rc_error == 0 || rc->rc_size == 0)) { 2801 continue; 2802 } 2803 /* 2804 * We do not allow self healing for Direct I/O reads. 2805 * See comment in vdev_raid_row_alloc(). 2806 */ 2807 ASSERT0(zio->io_flags & ZIO_FLAG_DIO_READ); 2808 2809 zfs_dbgmsg("zio=%px repairing c=%u devidx=%u " 2810 "offset=%llx", 2811 zio, c, rc->rc_devidx, (long long)rc->rc_offset); 2812 2813 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2814 rc->rc_offset, rc->rc_abd, rc->rc_size, 2815 ZIO_TYPE_WRITE, 2816 zio->io_priority == ZIO_PRIORITY_REBUILD ? 2817 ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE, 2818 ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 2819 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); 2820 } 2821 } 2822 2823 /* 2824 * Scrub or resilver i/o's: overwrite any shadow locations with the 2825 * good data. This ensures that if we've already copied this sector, 2826 * it will be corrected if it was damaged. 
This writes more than is 2827 * necessary, but since expansion is paused during scrub/resilver, at 2828 * most a single row will have a shadow location. 2829 */ 2830 if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 2831 (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) { 2832 for (int c = 0; c < rr->rr_cols; c++) { 2833 raidz_col_t *rc = &rr->rr_col[c]; 2834 vdev_t *vd = zio->io_vd; 2835 2836 if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0) 2837 continue; 2838 vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx]; 2839 2840 /* 2841 * Note: We don't want to update the repair stats 2842 * because that would incorrectly indicate that there 2843 * was bad data to repair, which we aren't sure about. 2844 * By clearing the SCAN_THREAD flag, we prevent this 2845 * from happening, despite having the REPAIR flag set. 2846 * We need to set SELF_HEAL so that this i/o can't be 2847 * bypassed by zio_vdev_io_start(). 2848 */ 2849 zio_t *cio = zio_vdev_child_io(zio, NULL, cvd, 2850 rc->rc_shadow_offset, rc->rc_abd, rc->rc_size, 2851 ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, 2852 ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, 2853 NULL, NULL); 2854 cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD; 2855 zio_nowait(cio); 2856 } 2857 } 2858 } 2859 2860 static void 2861 raidz_restore_orig_data(raidz_map_t *rm) 2862 { 2863 for (int i = 0; i < rm->rm_nrows; i++) { 2864 raidz_row_t *rr = rm->rm_row[i]; 2865 for (int c = 0; c < rr->rr_cols; c++) { 2866 raidz_col_t *rc = &rr->rr_col[c]; 2867 if (rc->rc_need_orig_restore) { 2868 abd_copy(rc->rc_abd, 2869 rc->rc_orig_data, rc->rc_size); 2870 rc->rc_need_orig_restore = B_FALSE; 2871 } 2872 } 2873 } 2874 } 2875 2876 /* 2877 * During raidz_reconstruct() for expanded VDEV, we need special consideration 2878 * failure simulations. See note in raidz_reconstruct() on simulating failure 2879 * of a pre-expansion device. 2880 * 2881 * Treating logical child i as failed, return TRUE if the given column should 2882 * be treated as failed. The idea of logical children allows us to imagine 2883 * that a disk silently failed before a RAIDZ expansion (reads from this disk 2884 * succeed but return the wrong data). Since the expansion doesn't verify 2885 * checksums, the incorrect data will be moved to new locations spread among 2886 * the children (going diagonally across them). 2887 * 2888 * Higher "logical child failures" (values of `i`) indicate these 2889 * "pre-expansion failures". The first physical_width values imagine that a 2890 * current child failed; the next physical_width-1 values imagine that a 2891 * child failed before the most recent expansion; the next physical_width-2 2892 * values imagine a child failed in the expansion before that, etc. 
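 *
 * Worked example (hypothetical widths): with physical_width=6 and
 * original_width=4 there are 6+5+4=15 logical children. Logical ids
 * 0-5 simulate failure of one of the current children, ids 6-10 a
 * child that failed while the vdev was still 5 wide, and ids 11-14 a
 * child that failed back when it was 4 wide. raidz_simulate_failure()
 * below maps a logical id to the width w it belongs to and then treats
 * a column as failed when (sector_id % w) equals the id's position
 * within that width.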
2893 */ 2894 static boolean_t 2895 raidz_simulate_failure(int physical_width, int original_width, int ashift, 2896 int i, raidz_col_t *rc) 2897 { 2898 uint64_t sector_id = 2899 physical_width * (rc->rc_offset >> ashift) + 2900 rc->rc_devidx; 2901 2902 for (int w = physical_width; w >= original_width; w--) { 2903 if (i < w) { 2904 return (sector_id % w == i); 2905 } else { 2906 i -= w; 2907 } 2908 } 2909 ASSERT(!"invalid logical child id"); 2910 return (B_FALSE); 2911 } 2912 2913 /* 2914 * returns EINVAL if reconstruction of the block will not be possible 2915 * returns ECKSUM if this specific reconstruction failed 2916 * returns 0 on successful reconstruction 2917 */ 2918 static int 2919 raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) 2920 { 2921 raidz_map_t *rm = zio->io_vsd; 2922 int physical_width = zio->io_vd->vdev_children; 2923 int original_width = (rm->rm_original_width != 0) ? 2924 rm->rm_original_width : physical_width; 2925 int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT; 2926 2927 if (dbgmsg) { 2928 zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u " 2929 "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts); 2930 } 2931 2932 /* Reconstruct each row */ 2933 for (int r = 0; r < rm->rm_nrows; r++) { 2934 raidz_row_t *rr = rm->rm_row[r]; 2935 int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */ 2936 int t = 0; 2937 int dead = 0; 2938 int dead_data = 0; 2939 2940 if (dbgmsg) 2941 zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r); 2942 2943 for (int c = 0; c < rr->rr_cols; c++) { 2944 raidz_col_t *rc = &rr->rr_col[c]; 2945 ASSERT0(rc->rc_need_orig_restore); 2946 if (rc->rc_error != 0) { 2947 dead++; 2948 if (c >= nparity) 2949 dead_data++; 2950 continue; 2951 } 2952 if (rc->rc_size == 0) 2953 continue; 2954 for (int lt = 0; lt < ntgts; lt++) { 2955 if (raidz_simulate_failure(physical_width, 2956 original_width, 2957 zio->io_vd->vdev_top->vdev_ashift, 2958 ltgts[lt], rc)) { 2959 if (rc->rc_orig_data == NULL) { 2960 rc->rc_orig_data = 2961 abd_alloc_linear( 2962 rc->rc_size, B_TRUE); 2963 abd_copy(rc->rc_orig_data, 2964 rc->rc_abd, rc->rc_size); 2965 } 2966 rc->rc_need_orig_restore = B_TRUE; 2967 2968 dead++; 2969 if (c >= nparity) 2970 dead_data++; 2971 /* 2972 * Note: simulating failure of a 2973 * pre-expansion device can hit more 2974 * than one column, in which case we 2975 * might try to simulate more failures 2976 * than can be reconstructed, which is 2977 * also more than the size of my_tgts. 2978 * This check prevents accessing past 2979 * the end of my_tgts. The "dead > 2980 * nparity" check below will fail this 2981 * reconstruction attempt. 
2982 */ 2983 if (t < VDEV_RAIDZ_MAXPARITY) { 2984 my_tgts[t++] = c; 2985 if (dbgmsg) { 2986 zfs_dbgmsg("simulating " 2987 "failure of col %u " 2988 "devidx %u", c, 2989 (int)rc->rc_devidx); 2990 } 2991 } 2992 break; 2993 } 2994 } 2995 } 2996 if (dead > nparity) { 2997 /* reconstruction not possible */ 2998 if (dbgmsg) { 2999 zfs_dbgmsg("reconstruction not possible; " 3000 "too many failures"); 3001 } 3002 raidz_restore_orig_data(rm); 3003 return (EINVAL); 3004 } 3005 if (dead_data > 0) 3006 vdev_raidz_reconstruct_row(rm, rr, my_tgts, t); 3007 } 3008 3009 /* Check for success */ 3010 if (raidz_checksum_verify(zio) == 0) { 3011 if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) 3012 return (0); 3013 3014 /* Reconstruction succeeded - report errors */ 3015 for (int i = 0; i < rm->rm_nrows; i++) { 3016 raidz_row_t *rr = rm->rm_row[i]; 3017 3018 for (int c = 0; c < rr->rr_cols; c++) { 3019 raidz_col_t *rc = &rr->rr_col[c]; 3020 if (rc->rc_need_orig_restore) { 3021 /* 3022 * Note: if this is a parity column, 3023 * we don't really know if it's wrong. 3024 * We need to let 3025 * vdev_raidz_io_done_verified() check 3026 * it, and if we set rc_error, it will 3027 * think that it is a "known" error 3028 * that doesn't need to be checked 3029 * or corrected. 3030 */ 3031 if (rc->rc_error == 0 && 3032 c >= rr->rr_firstdatacol) { 3033 vdev_raidz_checksum_error(zio, 3034 rc, rc->rc_orig_data); 3035 rc->rc_error = 3036 SET_ERROR(ECKSUM); 3037 } 3038 rc->rc_need_orig_restore = B_FALSE; 3039 } 3040 } 3041 3042 vdev_raidz_io_done_verified(zio, rr); 3043 } 3044 3045 zio_checksum_verified(zio); 3046 3047 if (dbgmsg) { 3048 zfs_dbgmsg("reconstruction successful " 3049 "(checksum verified)"); 3050 } 3051 return (0); 3052 } 3053 3054 /* Reconstruction failed - restore original data */ 3055 raidz_restore_orig_data(rm); 3056 if (dbgmsg) { 3057 zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum " 3058 "failed", zio); 3059 } 3060 return (ECKSUM); 3061 } 3062 3063 /* 3064 * Iterate over all combinations of N bad vdevs and attempt a reconstruction. 3065 * Note that the algorithm below is non-optimal because it doesn't take into 3066 * account how reconstruction is actually performed. For example, with 3067 * triple-parity RAID-Z the reconstruction procedure is the same if column 4 3068 * is targeted as invalid as if columns 1 and 4 are targeted since in both 3069 * cases we'd only use parity information in column 0. 
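 *
 * A compact, standalone sketch of the combination walk that the rules
 * and example sequence below describe (k slots chosen from n logical
 * children; try_combination() is a hypothetical callback, and the real
 * loop in vdev_raidz_combrec() additionally copes with known failures):
 *
 *	for (int i = 0; i < k; i++)
 *		tgt[i] = i;
 *	tgt[k] = n;				// sentinel
 *	for (;;) {
 *		try_combination(tgt, k);
 *		int t = 0;
 *		while (t < k && ++tgt[t] == tgt[t + 1]) {
 *			tgt[t] = t;		// reset slot to its minimum
 *			t++;
 *		}
 *		if (t == k)
 *			break;			// all combinations tried
 *	}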
3070 * 3071 * The order that we find the various possible combinations of failed 3072 * disks is dictated by these rules: 3073 * - Examine each "slot" (the "i" in tgts[i]) 3074 * - Try to increment this slot (tgts[i] += 1) 3075 * - if we can't increment because it runs into the next slot, 3076 * reset our slot to the minimum, and examine the next slot 3077 * 3078 * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose 3079 * 3 columns to reconstruct), we will generate the following sequence: 3080 * 3081 * STATE ACTION 3082 * 0 1 2 special case: skip since these are all parity 3083 * 0 1 3 first slot: reset to 0; middle slot: increment to 2 3084 * 0 2 3 first slot: increment to 1 3085 * 1 2 3 first: reset to 0; middle: reset to 1; last: increment to 4 3086 * 0 1 4 first: reset to 0; middle: increment to 2 3087 * 0 2 4 first: increment to 1 3088 * 1 2 4 first: reset to 0; middle: increment to 3 3089 * 0 3 4 first: increment to 1 3090 * 1 3 4 first: increment to 2 3091 * 2 3 4 first: reset to 0; middle: reset to 1; last: increment to 5 3092 * 0 1 5 first: reset to 0; middle: increment to 2 3093 * 0 2 5 first: increment to 1 3094 * 1 2 5 first: reset to 0; middle: increment to 3 3095 * 0 3 5 first: increment to 1 3096 * 1 3 5 first: increment to 2 3097 * 2 3 5 first: reset to 0; middle: increment to 4 3098 * 0 4 5 first: increment to 1 3099 * 1 4 5 first: increment to 2 3100 * 2 4 5 first: increment to 3 3101 * 3 4 5 done 3102 * 3103 * This strategy works for dRAID but is less efficient when there are a large 3104 * number of child vdevs and therefore permutations to check. Furthermore, 3105 * since the raidz_map_t rows likely do not overlap, reconstruction would be 3106 * possible as long as there are no more than nparity data errors per row. 3107 * These additional permutations are not currently checked but could be as 3108 * a future improvement. 3109 * 3110 * Returns 0 on success, ECKSUM on failure. 3111 */ 3112 static int 3113 vdev_raidz_combrec(zio_t *zio) 3114 { 3115 int nparity = vdev_get_nparity(zio->io_vd); 3116 raidz_map_t *rm = zio->io_vsd; 3117 int physical_width = zio->io_vd->vdev_children; 3118 int original_width = (rm->rm_original_width != 0) ? 3119 rm->rm_original_width : physical_width; 3120 3121 for (int i = 0; i < rm->rm_nrows; i++) { 3122 raidz_row_t *rr = rm->rm_row[i]; 3123 int total_errors = 0; 3124 3125 for (int c = 0; c < rr->rr_cols; c++) { 3126 if (rr->rr_col[c].rc_error) 3127 total_errors++; 3128 } 3129 3130 if (total_errors > nparity) 3131 return (vdev_raidz_worst_error(rr)); 3132 } 3133 3134 for (int num_failures = 1; num_failures <= nparity; num_failures++) { 3135 int tstore[VDEV_RAIDZ_MAXPARITY + 2]; 3136 int *ltgts = &tstore[1]; /* value is logical child ID */ 3137 3138 3139 /* 3140 * Determine number of logical children, n. See comment 3141 * above raidz_simulate_failure(). 3142 */ 3143 int n = 0; 3144 for (int w = physical_width; 3145 w >= original_width; w--) { 3146 n += w; 3147 } 3148 3149 ASSERT3U(num_failures, <=, nparity); 3150 ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY); 3151 3152 /* Handle corner cases in combrec logic */ 3153 ltgts[-1] = -1; 3154 for (int i = 0; i < num_failures; i++) { 3155 ltgts[i] = i; 3156 } 3157 ltgts[num_failures] = n; 3158 3159 for (;;) { 3160 int err = raidz_reconstruct(zio, ltgts, num_failures, 3161 nparity); 3162 if (err == EINVAL) { 3163 /* 3164 * Reconstruction not possible with this # 3165 * failures; try more failures. 
3166 */ 3167 break; 3168 } else if (err == 0) 3169 return (0); 3170 3171 /* Compute next targets to try */ 3172 for (int t = 0; ; t++) { 3173 ASSERT3U(t, <, num_failures); 3174 ltgts[t]++; 3175 if (ltgts[t] == n) { 3176 /* try more failures */ 3177 ASSERT3U(t, ==, num_failures - 1); 3178 if (zfs_flags & 3179 ZFS_DEBUG_RAIDZ_RECONSTRUCT) { 3180 zfs_dbgmsg("reconstruction " 3181 "failed for num_failures=" 3182 "%u; tried all " 3183 "combinations", 3184 num_failures); 3185 } 3186 break; 3187 } 3188 3189 ASSERT3U(ltgts[t], <, n); 3190 ASSERT3U(ltgts[t], <=, ltgts[t + 1]); 3191 3192 /* 3193 * If that spot is available, we're done here. 3194 * Try the next combination. 3195 */ 3196 if (ltgts[t] != ltgts[t + 1]) 3197 break; // found next combination 3198 3199 /* 3200 * Otherwise, reset this tgt to the minimum, 3201 * and move on to the next tgt. 3202 */ 3203 ltgts[t] = ltgts[t - 1] + 1; 3204 ASSERT3U(ltgts[t], ==, t); 3205 } 3206 3207 /* Increase the number of failures and keep trying. */ 3208 if (ltgts[num_failures - 1] == n) 3209 break; 3210 } 3211 } 3212 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 3213 zfs_dbgmsg("reconstruction failed for all num_failures"); 3214 return (ECKSUM); 3215 } 3216 3217 void 3218 vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) 3219 { 3220 for (uint64_t row = 0; row < rm->rm_nrows; row++) { 3221 raidz_row_t *rr = rm->rm_row[row]; 3222 vdev_raidz_reconstruct_row(rm, rr, t, nt); 3223 } 3224 } 3225 3226 /* 3227 * Complete a write IO operation on a RAIDZ VDev 3228 * 3229 * Outline: 3230 * 1. Check for errors on the child IOs. 3231 * 2. Return, setting an error code if too few child VDevs were written 3232 * to reconstruct the data later. Note that partial writes are 3233 * considered successful if they can be reconstructed at all. 3234 */ 3235 static void 3236 vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) 3237 { 3238 int normal_errors = 0; 3239 int shadow_errors = 0; 3240 3241 ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); 3242 ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); 3243 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); 3244 3245 for (int c = 0; c < rr->rr_cols; c++) { 3246 raidz_col_t *rc = &rr->rr_col[c]; 3247 3248 if (rc->rc_error != 0) { 3249 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 3250 normal_errors++; 3251 } 3252 if (rc->rc_shadow_error != 0) { 3253 ASSERT(rc->rc_shadow_error != ECKSUM); 3254 shadow_errors++; 3255 } 3256 } 3257 3258 /* 3259 * Treat partial writes as a success. If we couldn't write enough 3260 * columns to reconstruct the data, the I/O failed. Otherwise, good 3261 * enough. Note that in the case of a shadow write (during raidz 3262 * expansion), depending on if we crash, either the normal (old) or 3263 * shadow (new) location may become the "real" version of the block, 3264 * so both locations must have sufficient redundancy. 3265 * 3266 * Now that we support write reallocation, it would be better 3267 * to treat partial failure as real failure unless there are 3268 * no non-degraded top-level vdevs left, and not update DTLs 3269 * if we intend to reallocate. 
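 *
 * For example, on RAIDZ2 (rr_firstdatacol == 2) a row tolerates up to
 * two failed child writes in the normal location and up to two in the
 * shadow location and the write still succeeds; a third failure in
 * either set leaves too few columns to reconstruct from, so the worst
 * child error is propagated below.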
3270 */ 3271 if (normal_errors > rr->rr_firstdatacol || 3272 shadow_errors > rr->rr_firstdatacol) { 3273 zio->io_error = zio_worst_error(zio->io_error, 3274 vdev_raidz_worst_error(rr)); 3275 } 3276 } 3277 3278 static void 3279 vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm, 3280 raidz_row_t *rr) 3281 { 3282 int parity_errors = 0; 3283 int parity_untried = 0; 3284 int data_errors = 0; 3285 int total_errors = 0; 3286 3287 ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); 3288 ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); 3289 3290 for (int c = 0; c < rr->rr_cols; c++) { 3291 raidz_col_t *rc = &rr->rr_col[c]; 3292 3293 /* 3294 * If scrubbing and a replacing/sparing child vdev determined 3295 * that not all of its children have an identical copy of the 3296 * data, then clear the error so the column is treated like 3297 * any other read and force a repair to correct the damage. 3298 */ 3299 if (rc->rc_error == ECKSUM) { 3300 ASSERT(zio->io_flags & ZIO_FLAG_SCRUB); 3301 vdev_raidz_checksum_error(zio, rc, rc->rc_abd); 3302 rc->rc_force_repair = 1; 3303 rc->rc_error = 0; 3304 } 3305 3306 if (rc->rc_error) { 3307 if (c < rr->rr_firstdatacol) 3308 parity_errors++; 3309 else 3310 data_errors++; 3311 3312 total_errors++; 3313 } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { 3314 parity_untried++; 3315 } 3316 } 3317 3318 /* 3319 * If there were data errors and the number of errors we saw was 3320 * correctable -- less than or equal to the number of parity disks read 3321 * -- reconstruct based on the missing data. 3322 */ 3323 if (data_errors != 0 && 3324 total_errors <= rr->rr_firstdatacol - parity_untried) { 3325 /* 3326 * We either attempt to read all the parity columns or 3327 * none of them. If we didn't try to read parity, we 3328 * wouldn't be here in the correctable case. There must 3329 * also have been fewer parity errors than parity 3330 * columns or, again, we wouldn't be in this code path. 3331 */ 3332 ASSERT(parity_untried == 0); 3333 ASSERT(parity_errors < rr->rr_firstdatacol); 3334 3335 /* 3336 * Identify the data columns that reported an error. 3337 */ 3338 int n = 0; 3339 int tgts[VDEV_RAIDZ_MAXPARITY]; 3340 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 3341 raidz_col_t *rc = &rr->rr_col[c]; 3342 if (rc->rc_error != 0) { 3343 ASSERT(n < VDEV_RAIDZ_MAXPARITY); 3344 tgts[n++] = c; 3345 } 3346 } 3347 3348 ASSERT(rr->rr_firstdatacol >= n); 3349 3350 vdev_raidz_reconstruct_row(rm, rr, tgts, n); 3351 } 3352 } 3353 3354 /* 3355 * Return the number of reads issued. 3356 */ 3357 static int 3358 vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr) 3359 { 3360 vdev_t *vd = zio->io_vd; 3361 int nread = 0; 3362 3363 rr->rr_missingdata = 0; 3364 rr->rr_missingparity = 0; 3365 3366 /* 3367 * If this rows contains empty sectors which are not required 3368 * for a normal read then allocate an ABD for them now so they 3369 * may be read, verified, and any needed repairs performed. 
3370 */ 3371 if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL) 3372 vdev_draid_map_alloc_empty(zio, rr); 3373 3374 for (int c = 0; c < rr->rr_cols; c++) { 3375 raidz_col_t *rc = &rr->rr_col[c]; 3376 if (rc->rc_tried || rc->rc_size == 0) 3377 continue; 3378 3379 zio_nowait(zio_vdev_child_io(zio, NULL, 3380 vd->vdev_child[rc->rc_devidx], 3381 rc->rc_offset, rc->rc_abd, rc->rc_size, 3382 zio->io_type, zio->io_priority, 0, 3383 vdev_raidz_child_done, rc)); 3384 nread++; 3385 } 3386 return (nread); 3387 } 3388 3389 /* 3390 * We're here because either there were too many errors to even attempt 3391 * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec() 3392 * failed. In either case, there is enough bad data to prevent reconstruction. 3393 * Start checksum ereports for all children which haven't failed. 3394 */ 3395 static void 3396 vdev_raidz_io_done_unrecoverable(zio_t *zio) 3397 { 3398 raidz_map_t *rm = zio->io_vsd; 3399 3400 for (int i = 0; i < rm->rm_nrows; i++) { 3401 raidz_row_t *rr = rm->rm_row[i]; 3402 3403 for (int c = 0; c < rr->rr_cols; c++) { 3404 raidz_col_t *rc = &rr->rr_col[c]; 3405 vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; 3406 3407 if (rc->rc_error != 0) 3408 continue; 3409 3410 zio_bad_cksum_t zbc; 3411 zbc.zbc_has_cksum = 0; 3412 zbc.zbc_injected = rm->rm_ecksuminjected; 3413 mutex_enter(&cvd->vdev_stat_lock); 3414 cvd->vdev_stat.vs_checksum_errors++; 3415 mutex_exit(&cvd->vdev_stat_lock); 3416 (void) zfs_ereport_start_checksum(zio->io_spa, 3417 cvd, &zio->io_bookmark, zio, rc->rc_offset, 3418 rc->rc_size, &zbc); 3419 } 3420 } 3421 } 3422 3423 void 3424 vdev_raidz_io_done(zio_t *zio) 3425 { 3426 raidz_map_t *rm = zio->io_vsd; 3427 3428 ASSERT(zio->io_bp != NULL); 3429 if (zio->io_type == ZIO_TYPE_WRITE) { 3430 for (int i = 0; i < rm->rm_nrows; i++) { 3431 vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]); 3432 } 3433 } else { 3434 if (rm->rm_phys_col) { 3435 /* 3436 * This is an aggregated read. Copy the data and status 3437 * from the aggregate abd's to the individual rows. 3438 */ 3439 for (int i = 0; i < rm->rm_nrows; i++) { 3440 raidz_row_t *rr = rm->rm_row[i]; 3441 3442 for (int c = 0; c < rr->rr_cols; c++) { 3443 raidz_col_t *rc = &rr->rr_col[c]; 3444 if (rc->rc_tried || rc->rc_size == 0) 3445 continue; 3446 3447 raidz_col_t *prc = 3448 &rm->rm_phys_col[rc->rc_devidx]; 3449 rc->rc_error = prc->rc_error; 3450 rc->rc_tried = prc->rc_tried; 3451 rc->rc_skipped = prc->rc_skipped; 3452 if (c >= rr->rr_firstdatacol) { 3453 /* 3454 * Note: this is slightly faster 3455 * than using abd_copy_off(). 3456 */ 3457 char *physbuf = abd_to_buf( 3458 prc->rc_abd); 3459 void *physloc = physbuf + 3460 rc->rc_offset - 3461 prc->rc_offset; 3462 3463 abd_copy_from_buf(rc->rc_abd, 3464 physloc, rc->rc_size); 3465 } 3466 } 3467 } 3468 } 3469 3470 for (int i = 0; i < rm->rm_nrows; i++) { 3471 raidz_row_t *rr = rm->rm_row[i]; 3472 vdev_raidz_io_done_reconstruct_known_missing(zio, 3473 rm, rr); 3474 } 3475 3476 if (raidz_checksum_verify(zio) == 0) { 3477 if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) 3478 goto done; 3479 3480 for (int i = 0; i < rm->rm_nrows; i++) { 3481 raidz_row_t *rr = rm->rm_row[i]; 3482 vdev_raidz_io_done_verified(zio, rr); 3483 } 3484 zio_checksum_verified(zio); 3485 } else { 3486 /* 3487 * A sequential resilver has no checksum which makes 3488 * combinatoral reconstruction impossible. This code 3489 * path is unreachable since raidz_checksum_verify() 3490 * has no checksum to verify and must succeed. 
3491 */ 3492 ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD); 3493 3494 /* 3495 * This isn't a typical situation -- either we got a 3496 * read error or a child silently returned bad data. 3497 * Read every block so we can try again with as much 3498 * data and parity as we can track down. If we've 3499 * already been through once before, all children will 3500 * be marked as tried so we'll proceed to combinatorial 3501 * reconstruction. 3502 */ 3503 int nread = 0; 3504 for (int i = 0; i < rm->rm_nrows; i++) { 3505 nread += vdev_raidz_read_all(zio, 3506 rm->rm_row[i]); 3507 } 3508 if (nread != 0) { 3509 /* 3510 * Normally our stage is VDEV_IO_DONE, but if 3511 * we've already called redone(), it will have 3512 * changed to VDEV_IO_START, in which case we 3513 * don't want to call redone() again. 3514 */ 3515 if (zio->io_stage != ZIO_STAGE_VDEV_IO_START) 3516 zio_vdev_io_redone(zio); 3517 return; 3518 } 3519 /* 3520 * It would be too expensive to try every possible 3521 * combination of failed sectors in every row, so 3522 * instead we try every combination of failed current or 3523 * past physical disk. This means that if the incorrect 3524 * sectors were all on Nparity disks at any point in the 3525 * past, we will find the correct data. The only known 3526 * case where this is less durable than a non-expanded 3527 * RAIDZ, is if we have a silent failure during 3528 * expansion. In that case, one block could be 3529 * partially in the old format and partially in the 3530 * new format, so we'd lost some sectors from the old 3531 * format and some from the new format. 3532 * 3533 * e.g. logical_width=4 physical_width=6 3534 * the 15 (6+5+4) possible failed disks are: 3535 * width=6 child=0 3536 * width=6 child=1 3537 * width=6 child=2 3538 * width=6 child=3 3539 * width=6 child=4 3540 * width=6 child=5 3541 * width=5 child=0 3542 * width=5 child=1 3543 * width=5 child=2 3544 * width=5 child=3 3545 * width=5 child=4 3546 * width=4 child=0 3547 * width=4 child=1 3548 * width=4 child=2 3549 * width=4 child=3 3550 * And we will try every combination of Nparity of these 3551 * failing. 3552 * 3553 * As a first pass, we can generate every combo, 3554 * and try reconstructing, ignoring any known 3555 * failures. If any row has too many known + simulated 3556 * failures, then we bail on reconstructing with this 3557 * number of simulated failures. As an improvement, 3558 * we could detect the number of whole known failures 3559 * (i.e. we have known failures on these disks for 3560 * every row; the disks never succeeded), and 3561 * subtract that from the max # failures to simulate. 3562 * We could go even further like the current 3563 * combrec code, but that doesn't seem like it 3564 * gains us very much. If we simulate a failure 3565 * that is also a known failure, that's fine. 
3566 */ 3567 zio->io_error = vdev_raidz_combrec(zio); 3568 if (zio->io_error == ECKSUM && 3569 !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 3570 vdev_raidz_io_done_unrecoverable(zio); 3571 } 3572 } 3573 } 3574 done: 3575 if (rm->rm_lr != NULL) { 3576 zfs_rangelock_exit(rm->rm_lr); 3577 rm->rm_lr = NULL; 3578 } 3579 } 3580 3581 static void 3582 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 3583 { 3584 vdev_raidz_t *vdrz = vd->vdev_tsd; 3585 if (faulted > vdrz->vd_nparity) 3586 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 3587 VDEV_AUX_NO_REPLICAS); 3588 else if (degraded + faulted != 0) 3589 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 3590 else 3591 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 3592 } 3593 3594 /* 3595 * Determine if any portion of the provided block resides on a child vdev 3596 * with a dirty DTL and therefore needs to be resilvered. The function 3597 * assumes that at least one DTL is dirty which implies that full stripe 3598 * width blocks must be resilvered. 3599 */ 3600 static boolean_t 3601 vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, 3602 uint64_t phys_birth) 3603 { 3604 vdev_raidz_t *vdrz = vd->vdev_tsd; 3605 3606 /* 3607 * If we're in the middle of a RAIDZ expansion, this block may be in 3608 * the old and/or new location. For simplicity, always resilver it. 3609 */ 3610 if (vdrz->vn_vre.vre_state == DSS_SCANNING) 3611 return (B_TRUE); 3612 3613 uint64_t dcols = vd->vdev_children; 3614 uint64_t nparity = vdrz->vd_nparity; 3615 uint64_t ashift = vd->vdev_top->vdev_ashift; 3616 /* The starting RAIDZ (parent) vdev sector of the block. */ 3617 uint64_t b = DVA_GET_OFFSET(dva) >> ashift; 3618 /* The zio's size in units of the vdev's minimum sector size. */ 3619 uint64_t s = ((psize - 1) >> ashift) + 1; 3620 /* The first column for this stripe. */ 3621 uint64_t f = b % dcols; 3622 3623 /* Unreachable by sequential resilver. */ 3624 ASSERT3U(phys_birth, !=, TXG_UNKNOWN); 3625 3626 if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) 3627 return (B_FALSE); 3628 3629 if (s + nparity >= dcols) 3630 return (B_TRUE); 3631 3632 for (uint64_t c = 0; c < s + nparity; c++) { 3633 uint64_t devidx = (f + c) % dcols; 3634 vdev_t *cvd = vd->vdev_child[devidx]; 3635 3636 /* 3637 * dsl_scan_need_resilver() already checked vd with 3638 * vdev_dtl_contains(). So here just check cvd with 3639 * vdev_dtl_empty(), cheaper and a good approximation. 3640 */ 3641 if (!vdev_dtl_empty(cvd, DTL_PARTIAL)) 3642 return (B_TRUE); 3643 } 3644 3645 return (B_FALSE); 3646 } 3647 3648 static void 3649 vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs, 3650 range_seg64_t *physical_rs, range_seg64_t *remain_rs) 3651 { 3652 (void) remain_rs; 3653 3654 vdev_t *raidvd = cvd->vdev_parent; 3655 ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); 3656 3657 vdev_raidz_t *vdrz = raidvd->vdev_tsd; 3658 3659 if (vdrz->vn_vre.vre_state == DSS_SCANNING) { 3660 /* 3661 * We're in the middle of expansion, in which case the 3662 * translation is in flux. Any answer we give may be wrong 3663 * by the time we return, so it isn't safe for the caller to 3664 * act on it. Therefore we say that this range isn't present 3665 * on any children. The only consumers of this are "zpool 3666 * initialize" and trimming, both of which are "best effort" 3667 * anyway. 
3668 */ 3669 physical_rs->rs_start = physical_rs->rs_end = 0; 3670 remain_rs->rs_start = remain_rs->rs_end = 0; 3671 return; 3672 } 3673 3674 uint64_t width = vdrz->vd_physical_width; 3675 uint64_t tgt_col = cvd->vdev_id; 3676 uint64_t ashift = raidvd->vdev_top->vdev_ashift; 3677 3678 /* make sure the offsets are block-aligned */ 3679 ASSERT0(logical_rs->rs_start % (1 << ashift)); 3680 ASSERT0(logical_rs->rs_end % (1 << ashift)); 3681 uint64_t b_start = logical_rs->rs_start >> ashift; 3682 uint64_t b_end = logical_rs->rs_end >> ashift; 3683 3684 uint64_t start_row = 0; 3685 if (b_start > tgt_col) /* avoid underflow */ 3686 start_row = ((b_start - tgt_col - 1) / width) + 1; 3687 3688 uint64_t end_row = 0; 3689 if (b_end > tgt_col) 3690 end_row = ((b_end - tgt_col - 1) / width) + 1; 3691 3692 physical_rs->rs_start = start_row << ashift; 3693 physical_rs->rs_end = end_row << ashift; 3694 3695 ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start); 3696 ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=, 3697 logical_rs->rs_end - logical_rs->rs_start); 3698 } 3699 3700 static void 3701 raidz_reflow_sync(void *arg, dmu_tx_t *tx) 3702 { 3703 spa_t *spa = arg; 3704 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 3705 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 3706 3707 /* 3708 * Ensure there are no i/os to the range that is being committed. 3709 */ 3710 uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock); 3711 ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset); 3712 3713 mutex_enter(&vre->vre_lock); 3714 uint64_t new_offset = 3715 MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset); 3716 /* 3717 * We should not have committed anything that failed. 3718 */ 3719 VERIFY3U(vre->vre_failed_offset, >=, old_offset); 3720 mutex_exit(&vre->vre_lock); 3721 3722 zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, 3723 old_offset, new_offset - old_offset, 3724 RL_WRITER); 3725 3726 /* 3727 * Update the uberblock that will be written when this txg completes. 3728 */ 3729 RAIDZ_REFLOW_SET(&spa->spa_uberblock, 3730 RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset); 3731 vre->vre_offset_pertxg[txgoff] = 0; 3732 zfs_rangelock_exit(lr); 3733 3734 mutex_enter(&vre->vre_lock); 3735 vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff]; 3736 vre->vre_bytes_copied_pertxg[txgoff] = 0; 3737 mutex_exit(&vre->vre_lock); 3738 3739 vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); 3740 VERIFY0(zap_update(spa->spa_meta_objset, 3741 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, 3742 sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx)); 3743 } 3744 3745 static void 3746 raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx) 3747 { 3748 spa_t *spa = arg; 3749 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 3750 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 3751 vdev_raidz_t *vdrz = raidvd->vdev_tsd; 3752 3753 for (int i = 0; i < TXG_SIZE; i++) 3754 VERIFY0(vre->vre_offset_pertxg[i]); 3755 3756 reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); 3757 re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES; 3758 re->re_logical_width = vdrz->vd_physical_width; 3759 mutex_enter(&vdrz->vd_expand_lock); 3760 avl_add(&vdrz->vd_expand_txgs, re); 3761 mutex_exit(&vdrz->vd_expand_lock); 3762 3763 vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); 3764 3765 /* 3766 * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS 3767 * will get written (based on vd_expand_txgs). 
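 * (vdev_raidz_config_generate() walks vd_expand_txgs to build that
 * nvlist entry.)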
3768 */ 3769 vdev_config_dirty(vd); 3770 3771 /* 3772 * Before we change vre_state, the on-disk state must reflect that we 3773 * have completed all copying, so that vdev_raidz_io_start() can use 3774 * vre_state to determine if the reflow is in progress. See also the 3775 * end of spa_raidz_expand_thread(). 3776 */ 3777 VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, 3778 raidvd->vdev_ms_count << raidvd->vdev_ms_shift); 3779 3780 vre->vre_end_time = gethrestime_sec(); 3781 vre->vre_state = DSS_FINISHED; 3782 3783 uint64_t state = vre->vre_state; 3784 VERIFY0(zap_update(spa->spa_meta_objset, 3785 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, 3786 sizeof (state), 1, &state, tx)); 3787 3788 uint64_t end_time = vre->vre_end_time; 3789 VERIFY0(zap_update(spa->spa_meta_objset, 3790 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, 3791 sizeof (end_time), 1, &end_time, tx)); 3792 3793 spa->spa_uberblock.ub_raidz_reflow_info = 0; 3794 3795 spa_history_log_internal(spa, "raidz vdev expansion completed", tx, 3796 "%s vdev %llu new width %llu", spa_name(spa), 3797 (unsigned long long)vd->vdev_id, 3798 (unsigned long long)vd->vdev_children); 3799 3800 spa->spa_raidz_expand = NULL; 3801 raidvd->vdev_rz_expanding = B_FALSE; 3802 3803 spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); 3804 spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); 3805 spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); 3806 3807 spa_notify_waiters(spa); 3808 3809 /* 3810 * While we're in syncing context, take the opportunity to 3811 * set up a scrub. All the data has been successfully copied 3812 * but we have not validated any checksums. 3813 */ 3814 pool_scan_func_t func = POOL_SCAN_SCRUB; 3815 if (zfs_scrub_after_expand && dsl_scan_setup_check(&func, tx) == 0) 3816 dsl_scan_setup_sync(&func, tx); 3817 } 3818 3819 /* 3820 * Struct for one copy zio. 3821 */ 3822 typedef struct raidz_reflow_arg { 3823 vdev_raidz_expand_t *rra_vre; 3824 zfs_locked_range_t *rra_lr; 3825 uint64_t rra_txg; 3826 } raidz_reflow_arg_t; 3827 3828 /* 3829 * The write of the new location is done. 3830 */ 3831 static void 3832 raidz_reflow_write_done(zio_t *zio) 3833 { 3834 raidz_reflow_arg_t *rra = zio->io_private; 3835 vdev_raidz_expand_t *vre = rra->rra_vre; 3836 3837 abd_free(zio->io_abd); 3838 3839 mutex_enter(&vre->vre_lock); 3840 if (zio->io_error != 0) { 3841 /* Force a reflow pause on errors */ 3842 vre->vre_failed_offset = 3843 MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); 3844 } 3845 ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size); 3846 vre->vre_outstanding_bytes -= zio->io_size; 3847 if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length < 3848 vre->vre_failed_offset) { 3849 vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] += 3850 zio->io_size; 3851 } 3852 cv_signal(&vre->vre_cv); 3853 mutex_exit(&vre->vre_lock); 3854 3855 zfs_rangelock_exit(rra->rra_lr); 3856 3857 kmem_free(rra, sizeof (*rra)); 3858 spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa); 3859 } 3860 3861 /* 3862 * The read of the old location is done. The parent zio is the write to 3863 * the new location. Allow it to start. 3864 */ 3865 static void 3866 raidz_reflow_read_done(zio_t *zio) 3867 { 3868 raidz_reflow_arg_t *rra = zio->io_private; 3869 vdev_raidz_expand_t *vre = rra->rra_vre; 3870 3871 /* 3872 * If the read failed, or if it was done on a vdev that is not fully 3873 * healthy (e.g. a child that has a resilver in progress), we may not 3874 * have the correct data. Note that it's OK if the write proceeds.
3875 * It may write garbage but the location is otherwise unused and we 3876 * will retry later due to vre_failed_offset. 3877 */ 3878 if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) { 3879 zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu " 3880 "err=%u partial_dtl_empty=%u missing_dtl_empty=%u", 3881 (long long)rra->rra_lr->lr_offset, 3882 (long long)rra->rra_lr->lr_length, 3883 (long long)rra->rra_txg, 3884 zio->io_error, 3885 vdev_dtl_empty(zio->io_vd, DTL_PARTIAL), 3886 vdev_dtl_empty(zio->io_vd, DTL_MISSING)); 3887 mutex_enter(&vre->vre_lock); 3888 /* Force a reflow pause on errors */ 3889 vre->vre_failed_offset = 3890 MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); 3891 mutex_exit(&vre->vre_lock); 3892 } 3893 3894 zio_nowait(zio_unique_parent(zio)); 3895 } 3896 3897 static void 3898 raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset, 3899 dmu_tx_t *tx) 3900 { 3901 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 3902 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 3903 3904 if (offset == 0) 3905 return; 3906 3907 mutex_enter(&vre->vre_lock); 3908 ASSERT3U(vre->vre_offset, <=, offset); 3909 vre->vre_offset = offset; 3910 mutex_exit(&vre->vre_lock); 3911 3912 if (vre->vre_offset_pertxg[txgoff] == 0) { 3913 dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync, 3914 spa, tx); 3915 } 3916 vre->vre_offset_pertxg[txgoff] = offset; 3917 } 3918 3919 static boolean_t 3920 vdev_raidz_expand_child_replacing(vdev_t *raidz_vd) 3921 { 3922 for (int i = 0; i < raidz_vd->vdev_children; i++) { 3923 /* Quick check if a child is being replaced */ 3924 if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf) 3925 return (B_TRUE); 3926 } 3927 return (B_FALSE); 3928 } 3929 3930 static boolean_t 3931 raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt, 3932 dmu_tx_t *tx) 3933 { 3934 spa_t *spa = vd->vdev_spa; 3935 int ashift = vd->vdev_top->vdev_ashift; 3936 uint64_t offset, size; 3937 3938 if (!range_tree_find_in(rt, 0, vd->vdev_top->vdev_asize, 3939 &offset, &size)) { 3940 return (B_FALSE); 3941 } 3942 ASSERT(IS_P2ALIGNED(offset, 1 << ashift)); 3943 ASSERT3U(size, >=, 1 << ashift); 3944 uint64_t length = 1 << ashift; 3945 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 3946 3947 uint64_t blkid = offset >> ashift; 3948 3949 int old_children = vd->vdev_children - 1; 3950 3951 /* 3952 * We can only progress to the point that writes will not overlap 3953 * with blocks whose progress has not yet been recorded on disk. 3954 * Since partially-copied rows are still read from the old location, 3955 * we need to stop one row before the sector-wise overlap, to prevent 3956 * row-wise overlap. 3957 * 3958 * Note that even if we are skipping over a large unallocated region, 3959 * we can't move the on-disk progress to `offset`, because concurrent 3960 * writes/allocations could still use the currently-unallocated 3961 * region. 
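 *
 * As a purely illustrative example (the numbers are hypothetical):
 * with old_children = 4 and a synced reflow offset of 1000 blocks,
 * the computation below gives next_overwrite_blkid =
 * 1000 + 1000 / 4 - 4 = 1246, so copying can proceed up to (but not
 * including) block 1246 before the on-disk progress must be synced
 * again.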
3962 */ 3963 uint64_t ubsync_blkid = 3964 RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift; 3965 uint64_t next_overwrite_blkid = ubsync_blkid + 3966 ubsync_blkid / old_children - old_children; 3967 VERIFY3U(next_overwrite_blkid, >, ubsync_blkid); 3968 3969 if (blkid >= next_overwrite_blkid) { 3970 raidz_reflow_record_progress(vre, 3971 next_overwrite_blkid << ashift, tx); 3972 return (B_TRUE); 3973 } 3974 3975 range_tree_remove(rt, offset, length); 3976 3977 raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra), KM_SLEEP); 3978 rra->rra_vre = vre; 3979 rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock, 3980 offset, length, RL_WRITER); 3981 rra->rra_txg = dmu_tx_get_txg(tx); 3982 3983 raidz_reflow_record_progress(vre, offset + length, tx); 3984 3985 mutex_enter(&vre->vre_lock); 3986 vre->vre_outstanding_bytes += length; 3987 mutex_exit(&vre->vre_lock); 3988 3989 /* 3990 * SCL_STATE will be released when the read and write are done, 3991 * by raidz_reflow_write_done(). 3992 */ 3993 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 3994 3995 /* check if a replacing vdev was added, if so treat it as an error */ 3996 if (vdev_raidz_expand_child_replacing(vd)) { 3997 zfs_dbgmsg("replacing vdev encountered, reflow paused at " 3998 "offset=%llu txg=%llu", 3999 (long long)rra->rra_lr->lr_offset, 4000 (long long)rra->rra_txg); 4001 4002 mutex_enter(&vre->vre_lock); 4003 vre->vre_failed_offset = 4004 MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); 4005 cv_signal(&vre->vre_cv); 4006 mutex_exit(&vre->vre_lock); 4007 4008 /* drop everything we acquired */ 4009 zfs_rangelock_exit(rra->rra_lr); 4010 kmem_free(rra, sizeof (*rra)); 4011 spa_config_exit(spa, SCL_STATE, spa); 4012 return (B_TRUE); 4013 } 4014 4015 zio_t *pio = spa->spa_txg_zio[txgoff]; 4016 abd_t *abd = abd_alloc_for_io(length, B_FALSE); 4017 zio_t *write_zio = zio_vdev_child_io(pio, NULL, 4018 vd->vdev_child[blkid % vd->vdev_children], 4019 (blkid / vd->vdev_children) << ashift, 4020 abd, length, 4021 ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, 4022 ZIO_FLAG_CANFAIL, 4023 raidz_reflow_write_done, rra); 4024 4025 zio_nowait(zio_vdev_child_io(write_zio, NULL, 4026 vd->vdev_child[blkid % old_children], 4027 (blkid / old_children) << ashift, 4028 abd, length, 4029 ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 4030 ZIO_FLAG_CANFAIL, 4031 raidz_reflow_read_done, rra)); 4032 4033 return (B_FALSE); 4034 } 4035 4036 /* 4037 * For testing (ztest specific) 4038 */ 4039 static void 4040 raidz_expand_pause(uint_t pause_point) 4041 { 4042 while (raidz_expand_pause_point != 0 && 4043 raidz_expand_pause_point <= pause_point) 4044 delay(hz); 4045 } 4046 4047 static void 4048 raidz_scratch_child_done(zio_t *zio) 4049 { 4050 zio_t *pio = zio->io_private; 4051 4052 mutex_enter(&pio->io_lock); 4053 pio->io_error = zio_worst_error(pio->io_error, zio->io_error); 4054 mutex_exit(&pio->io_lock); 4055 } 4056 4057 /* 4058 * Reflow the beginning portion of the vdev into an intermediate scratch area 4059 * in memory and on disk. This operation must be persisted on disk before we 4060 * proceed to overwrite the beginning portion with the reflowed data. 4061 * 4062 * This multi-step task can fail to complete if disk errors are encountered 4063 * and we can return here after a pause (waiting for disk to become healthy). 
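 *
 * In outline: read the beginning of the vdev from the old
 * (pre-expansion) child layout, reflow those sectors in memory, write
 * the result to the scratch (boot) area on every child and flush, mark
 * the scratch copy valid in the uberblock, overwrite the original
 * location with the reflowed data and flush, and finally mark the
 * scratch copy as synced/invalid. If a previous attempt already wrote
 * a valid scratch copy, it is read back instead and only the final
 * overwrite is repeated.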
4064 */ 4065 static void 4066 raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) 4067 { 4068 vdev_raidz_expand_t *vre = arg; 4069 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 4070 zio_t *pio; 4071 int error; 4072 4073 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4074 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4075 int ashift = raidvd->vdev_ashift; 4076 uint64_t write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << ashift, 4077 uint64_t); 4078 uint64_t logical_size = write_size * raidvd->vdev_children; 4079 uint64_t read_size = 4080 P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)), 4081 1 << ashift); 4082 4083 /* 4084 * The scratch space must be large enough to get us to the point 4085 * that one row does not overlap itself when moved. This is checked 4086 * by vdev_raidz_attach_check(). 4087 */ 4088 VERIFY3U(write_size, >=, raidvd->vdev_children << ashift); 4089 VERIFY3U(write_size, <=, VDEV_BOOT_SIZE); 4090 VERIFY3U(write_size, <=, read_size); 4091 4092 zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, 4093 0, logical_size, RL_WRITER); 4094 4095 abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), 4096 KM_SLEEP); 4097 for (int i = 0; i < raidvd->vdev_children; i++) { 4098 abds[i] = abd_alloc_linear(read_size, B_FALSE); 4099 } 4100 4101 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1); 4102 4103 /* 4104 * If we have already written the scratch area then we must read from 4105 * there, since new writes were redirected there while we were paused 4106 * or the original location may have been partially overwritten with 4107 * reflowed data. 4108 */ 4109 if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) { 4110 VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size); 4111 /* 4112 * Read from scratch space. 4113 */ 4114 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4115 for (int i = 0; i < raidvd->vdev_children; i++) { 4116 /* 4117 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE 4118 * to the offset to calculate the physical offset to 4119 * write to. Passing in a negative offset makes us 4120 * access the scratch area. 4121 */ 4122 zio_nowait(zio_vdev_child_io(pio, NULL, 4123 raidvd->vdev_child[i], 4124 VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], 4125 write_size, ZIO_TYPE_READ, ZIO_PRIORITY_ASYNC_READ, 4126 ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); 4127 } 4128 error = zio_wait(pio); 4129 if (error != 0) { 4130 zfs_dbgmsg("reflow: error %d reading scratch location", 4131 error); 4132 goto io_error_exit; 4133 } 4134 goto overwrite; 4135 } 4136 4137 /* 4138 * Read from original location. 4139 */ 4140 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4141 for (int i = 0; i < raidvd->vdev_children - 1; i++) { 4142 ASSERT0(vdev_is_dead(raidvd->vdev_child[i])); 4143 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4144 0, abds[i], read_size, ZIO_TYPE_READ, 4145 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, 4146 raidz_scratch_child_done, pio)); 4147 } 4148 error = zio_wait(pio); 4149 if (error != 0) { 4150 zfs_dbgmsg("reflow: error %d reading original location", error); 4151 io_error_exit: 4152 for (int i = 0; i < raidvd->vdev_children; i++) 4153 abd_free(abds[i]); 4154 kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); 4155 zfs_rangelock_exit(lr); 4156 spa_config_exit(spa, SCL_STATE, FTAG); 4157 return; 4158 } 4159 4160 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2); 4161 4162 /* 4163 * Reflow in memory. 
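 * For example, assuming raidvd->vdev_children = 5 (so 4 old data
 * children), logical sector i = 7 is read from old child 7 % 4 = 3 at
 * old offset (7 / 4) << ashift and written to new child 7 % 5 = 2 at
 * new offset (7 / 5) << ashift.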
4164 */ 4165 uint64_t logical_sectors = logical_size >> ashift; 4166 for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) { 4167 int oldchild = i % (raidvd->vdev_children - 1); 4168 uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift; 4169 4170 int newchild = i % raidvd->vdev_children; 4171 uint64_t newoff = (i / raidvd->vdev_children) << ashift; 4172 4173 /* a single sector should not be copying over itself */ 4174 ASSERT(!(newchild == oldchild && newoff == oldoff)); 4175 4176 abd_copy_off(abds[newchild], abds[oldchild], 4177 newoff, oldoff, 1 << ashift); 4178 } 4179 4180 /* 4181 * Verify that we filled in everything we intended to (write_size on 4182 * each child). 4183 */ 4184 VERIFY0(logical_sectors % raidvd->vdev_children); 4185 VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==, 4186 write_size); 4187 4188 /* 4189 * Write to scratch location (boot area). 4190 */ 4191 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4192 for (int i = 0; i < raidvd->vdev_children; i++) { 4193 /* 4194 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to 4195 * the offset to calculate the physical offset to write to. 4196 * Passing in a negative offset lets us access the boot area. 4197 */ 4198 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4199 VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], 4200 write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, 4201 ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); 4202 } 4203 error = zio_wait(pio); 4204 if (error != 0) { 4205 zfs_dbgmsg("reflow: error %d writing scratch location", error); 4206 goto io_error_exit; 4207 } 4208 pio = zio_root(spa, NULL, NULL, 0); 4209 zio_flush(pio, raidvd); 4210 zio_wait(pio); 4211 4212 zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area", 4213 (long long)logical_size); 4214 4215 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3); 4216 4217 /* 4218 * Update uberblock to indicate that scratch space is valid. This is 4219 * needed because after this point, the real location may be 4220 * overwritten. If we crash, we need to get the data from the 4221 * scratch space, rather than the real location. 4222 * 4223 * Note: ub_timestamp is bumped so that vdev_uberblock_compare() 4224 * will prefer this uberblock. 4225 */ 4226 RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size); 4227 spa->spa_ubsync.ub_timestamp++; 4228 ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, 4229 &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); 4230 if (spa_multihost(spa)) 4231 mmp_update_uberblock(spa, &spa->spa_ubsync); 4232 4233 zfs_dbgmsg("reflow: uberblock updated " 4234 "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)", 4235 (long long)spa->spa_ubsync.ub_txg, 4236 (long long)logical_size, 4237 (long long)spa->spa_ubsync.ub_timestamp); 4238 4239 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID); 4240 4241 /* 4242 * Overwrite with reflow'ed data. 4243 */ 4244 overwrite: 4245 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4246 for (int i = 0; i < raidvd->vdev_children; i++) { 4247 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4248 0, abds[i], write_size, ZIO_TYPE_WRITE, 4249 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, 4250 raidz_scratch_child_done, pio)); 4251 } 4252 error = zio_wait(pio); 4253 if (error != 0) { 4254 /* 4255 * When we exit early here and drop the range lock, new 4256 * writes will go into the scratch area so we'll need to 4257 * read from there when we return after pausing. 
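 * (Hence the uberblock below is left in the RRSS_SCRATCH_VALID state
 * before bailing out.)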
4258 */ 4259 zfs_dbgmsg("reflow: error %d writing real location", error); 4260 /* 4261 * Update the uberblock that is written when this txg completes. 4262 */ 4263 RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID, 4264 logical_size); 4265 goto io_error_exit; 4266 } 4267 pio = zio_root(spa, NULL, NULL, 0); 4268 zio_flush(pio, raidvd); 4269 zio_wait(pio); 4270 4271 zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location", 4272 (long long)logical_size); 4273 for (int i = 0; i < raidvd->vdev_children; i++) 4274 abd_free(abds[i]); 4275 kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); 4276 4277 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED); 4278 4279 /* 4280 * Update uberblock to indicate that the initial part has been 4281 * reflow'ed. This is needed because after this point (when we exit 4282 * the rangelock), we allow regular writes to this region, which will 4283 * be written to the new location only (because reflow_offset_next == 4284 * reflow_offset_synced). If we crashed and re-copied from the 4285 * scratch space, we would lose the regular writes. 4286 */ 4287 RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED, 4288 logical_size); 4289 spa->spa_ubsync.ub_timestamp++; 4290 ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, 4291 &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); 4292 if (spa_multihost(spa)) 4293 mmp_update_uberblock(spa, &spa->spa_ubsync); 4294 4295 zfs_dbgmsg("reflow: uberblock updated " 4296 "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", 4297 (long long)spa->spa_ubsync.ub_txg, 4298 (long long)logical_size, 4299 (long long)spa->spa_ubsync.ub_timestamp); 4300 4301 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1); 4302 4303 /* 4304 * Update progress. 4305 */ 4306 vre->vre_offset = logical_size; 4307 zfs_rangelock_exit(lr); 4308 spa_config_exit(spa, SCL_STATE, FTAG); 4309 4310 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 4311 vre->vre_offset_pertxg[txgoff] = vre->vre_offset; 4312 vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; 4313 /* 4314 * Note - raidz_reflow_sync() will update the uberblock state to 4315 * RRSS_SCRATCH_INVALID_SYNCED_REFLOW 4316 */ 4317 raidz_reflow_sync(spa, tx); 4318 4319 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2); 4320 } 4321 4322 /* 4323 * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work 4324 * here. No other i/o can be in progress, so we don't need the vre_rangelock. 4325 */ 4326 void 4327 vdev_raidz_reflow_copy_scratch(spa_t *spa) 4328 { 4329 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4330 uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock); 4331 ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID); 4332 4333 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4334 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4335 ASSERT0(logical_size % raidvd->vdev_children); 4336 uint64_t write_size = logical_size / raidvd->vdev_children; 4337 4338 zio_t *pio; 4339 4340 /* 4341 * Read from scratch space. 4342 */ 4343 abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), 4344 KM_SLEEP); 4345 for (int i = 0; i < raidvd->vdev_children; i++) { 4346 abds[i] = abd_alloc_linear(write_size, B_FALSE); 4347 } 4348 4349 pio = zio_root(spa, NULL, NULL, 0); 4350 for (int i = 0; i < raidvd->vdev_children; i++) { 4351 /* 4352 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to 4353 * the offset to calculate the physical offset to write to. 
4354 * Passing in a negative offset lets us access the boot area. 4355 */ 4356 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4357 VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], 4358 write_size, ZIO_TYPE_READ, 4359 ZIO_PRIORITY_ASYNC_READ, 0, 4360 raidz_scratch_child_done, pio)); 4361 } 4362 zio_wait(pio); 4363 4364 /* 4365 * Overwrite real location with reflow'ed data. 4366 */ 4367 pio = zio_root(spa, NULL, NULL, 0); 4368 for (int i = 0; i < raidvd->vdev_children; i++) { 4369 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4370 0, abds[i], write_size, ZIO_TYPE_WRITE, 4371 ZIO_PRIORITY_ASYNC_WRITE, 0, 4372 raidz_scratch_child_done, pio)); 4373 } 4374 zio_wait(pio); 4375 pio = zio_root(spa, NULL, NULL, 0); 4376 zio_flush(pio, raidvd); 4377 zio_wait(pio); 4378 4379 zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) " 4380 "to real location", (long long)logical_size); 4381 4382 for (int i = 0; i < raidvd->vdev_children; i++) 4383 abd_free(abds[i]); 4384 kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); 4385 4386 /* 4387 * Update uberblock. 4388 */ 4389 RAIDZ_REFLOW_SET(&spa->spa_ubsync, 4390 RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size); 4391 spa->spa_ubsync.ub_timestamp++; 4392 VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, 4393 &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); 4394 if (spa_multihost(spa)) 4395 mmp_update_uberblock(spa, &spa->spa_ubsync); 4396 4397 zfs_dbgmsg("reflow recovery: uberblock updated " 4398 "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", 4399 (long long)spa->spa_ubsync.ub_txg, 4400 (long long)logical_size, 4401 (long long)spa->spa_ubsync.ub_timestamp); 4402 4403 dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, 4404 spa_first_txg(spa)); 4405 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 4406 vre->vre_offset = logical_size; 4407 vre->vre_offset_pertxg[txgoff] = vre->vre_offset; 4408 vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; 4409 /* 4410 * Note that raidz_reflow_sync() will update the uberblock once more 4411 */ 4412 raidz_reflow_sync(spa, tx); 4413 4414 dmu_tx_commit(tx); 4415 4416 spa_config_exit(spa, SCL_STATE, FTAG); 4417 } 4418 4419 static boolean_t 4420 spa_raidz_expand_thread_check(void *arg, zthr_t *zthr) 4421 { 4422 (void) zthr; 4423 spa_t *spa = arg; 4424 4425 return (spa->spa_raidz_expand != NULL && 4426 !spa->spa_raidz_expand->vre_waiting_for_resilver); 4427 } 4428 4429 /* 4430 * RAIDZ expansion background thread 4431 * 4432 * Can be called multiple times if the reflow is paused 4433 */ 4434 static void 4435 spa_raidz_expand_thread(void *arg, zthr_t *zthr) 4436 { 4437 spa_t *spa = arg; 4438 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4439 4440 if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) 4441 vre->vre_offset = 0; 4442 else 4443 vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync); 4444 4445 /* Reflow the begining portion using the scratch area */ 4446 if (vre->vre_offset == 0) { 4447 VERIFY0(dsl_sync_task(spa_name(spa), 4448 NULL, raidz_reflow_scratch_sync, 4449 vre, 0, ZFS_SPACE_CHECK_NONE)); 4450 4451 /* if we encountered errors then pause */ 4452 if (vre->vre_offset == 0) { 4453 mutex_enter(&vre->vre_lock); 4454 vre->vre_waiting_for_resilver = B_TRUE; 4455 mutex_exit(&vre->vre_lock); 4456 return; 4457 } 4458 } 4459 4460 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4461 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4462 4463 uint64_t guid = raidvd->vdev_guid; 4464 4465 /* Iterate over all the remaining metaslabs */ 4466 
for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift; 4467 i < raidvd->vdev_ms_count && 4468 !zthr_iscancelled(zthr) && 4469 vre->vre_failed_offset == UINT64_MAX; i++) { 4470 metaslab_t *msp = raidvd->vdev_ms[i]; 4471 4472 metaslab_disable(msp); 4473 mutex_enter(&msp->ms_lock); 4474 4475 /* 4476 * The metaslab may be newly created (for the expanded 4477 * space), in which case its trees won't exist yet, 4478 * so we need to bail out early. 4479 */ 4480 if (msp->ms_new) { 4481 mutex_exit(&msp->ms_lock); 4482 metaslab_enable(msp, B_FALSE, B_FALSE); 4483 continue; 4484 } 4485 4486 VERIFY0(metaslab_load(msp)); 4487 4488 /* 4489 * We want to copy everything except the free (allocatable) 4490 * space. Note that there may be a little bit more free 4491 * space (e.g. in ms_defer), and it's fine to copy that too. 4492 */ 4493 range_tree_t *rt = range_tree_create(NULL, RANGE_SEG64, 4494 NULL, 0, 0); 4495 range_tree_add(rt, msp->ms_start, msp->ms_size); 4496 range_tree_walk(msp->ms_allocatable, range_tree_remove, rt); 4497 mutex_exit(&msp->ms_lock); 4498 4499 /* 4500 * Force the last sector of each metaslab to be copied. This 4501 * ensures that we advance the on-disk progress to the end of 4502 * this metaslab while the metaslab is disabled. Otherwise, we 4503 * could move past this metaslab without advancing the on-disk 4504 * progress, and then an allocation to this metaslab would not 4505 * be copied. 4506 */ 4507 int sectorsz = 1 << raidvd->vdev_ashift; 4508 uint64_t ms_last_offset = msp->ms_start + 4509 msp->ms_size - sectorsz; 4510 if (!range_tree_contains(rt, ms_last_offset, sectorsz)) { 4511 range_tree_add(rt, ms_last_offset, sectorsz); 4512 } 4513 4514 /* 4515 * When we are resuming from a paused expansion (i.e. 4516 * when importing a pool with a expansion in progress), 4517 * discard any state that we have already processed. 4518 */ 4519 range_tree_clear(rt, 0, vre->vre_offset); 4520 4521 while (!zthr_iscancelled(zthr) && 4522 !range_tree_is_empty(rt) && 4523 vre->vre_failed_offset == UINT64_MAX) { 4524 4525 /* 4526 * We need to periodically drop the config lock so that 4527 * writers can get in. Additionally, we can't wait 4528 * for a txg to sync while holding a config lock 4529 * (since a waiting writer could cause a 3-way deadlock 4530 * with the sync thread, which also gets a config 4531 * lock for reader). So we can't hold the config lock 4532 * while calling dmu_tx_assign(). 4533 */ 4534 spa_config_exit(spa, SCL_CONFIG, FTAG); 4535 4536 /* 4537 * If requested, pause the reflow when the amount 4538 * specified by raidz_expand_max_reflow_bytes is reached 4539 * 4540 * This pause is only used during testing or debugging. 4541 */ 4542 while (raidz_expand_max_reflow_bytes != 0 && 4543 raidz_expand_max_reflow_bytes <= 4544 vre->vre_bytes_copied && !zthr_iscancelled(zthr)) { 4545 delay(hz); 4546 } 4547 4548 mutex_enter(&vre->vre_lock); 4549 while (vre->vre_outstanding_bytes > 4550 raidz_expand_max_copy_bytes) { 4551 cv_wait(&vre->vre_cv, &vre->vre_lock); 4552 } 4553 mutex_exit(&vre->vre_lock); 4554 4555 dmu_tx_t *tx = 4556 dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 4557 4558 VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); 4559 uint64_t txg = dmu_tx_get_txg(tx); 4560 4561 /* 4562 * Reacquire the vdev_config lock. Theoretically, the 4563 * vdev_t that we're expanding may have changed. 
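 * (The config lock was dropped across dmu_tx_assign() above, so it is
 * looked up again by vre_vdev_id.)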
4564 */ 4565 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4566 raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4567 4568 boolean_t needsync = 4569 raidz_reflow_impl(raidvd, vre, rt, tx); 4570 4571 dmu_tx_commit(tx); 4572 4573 if (needsync) { 4574 spa_config_exit(spa, SCL_CONFIG, FTAG); 4575 txg_wait_synced(spa->spa_dsl_pool, txg); 4576 spa_config_enter(spa, SCL_CONFIG, FTAG, 4577 RW_READER); 4578 } 4579 } 4580 4581 spa_config_exit(spa, SCL_CONFIG, FTAG); 4582 4583 metaslab_enable(msp, B_FALSE, B_FALSE); 4584 range_tree_vacate(rt, NULL, NULL); 4585 range_tree_destroy(rt); 4586 4587 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4588 raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4589 } 4590 4591 spa_config_exit(spa, SCL_CONFIG, FTAG); 4592 4593 /* 4594 * The txg_wait_synced() here ensures that all reflow zio's have 4595 * completed, and vre_failed_offset has been set if necessary. It 4596 * also ensures that the progress of the last raidz_reflow_sync() is 4597 * written to disk before raidz_reflow_complete_sync() changes the 4598 * in-memory vre_state. vdev_raidz_io_start() uses vre_state to 4599 * determine if a reflow is in progress, in which case we may need to 4600 * write to both old and new locations. Therefore we can only change 4601 * vre_state once this is not necessary, which is once the on-disk 4602 * progress (in spa_ubsync) has been set past any possible writes (to 4603 * the end of the last metaslab). 4604 */ 4605 txg_wait_synced(spa->spa_dsl_pool, 0); 4606 4607 if (!zthr_iscancelled(zthr) && 4608 vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) { 4609 /* 4610 * We are not being canceled or paused, so the reflow must be 4611 * complete. In that case also mark it as completed on disk. 4612 */ 4613 ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX); 4614 VERIFY0(dsl_sync_task(spa_name(spa), NULL, 4615 raidz_reflow_complete_sync, spa, 4616 0, ZFS_SPACE_CHECK_NONE)); 4617 (void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL); 4618 } else { 4619 /* 4620 * Wait for all copy zio's to complete and for all the 4621 * raidz_reflow_sync() synctasks to be run. 4622 */ 4623 spa_history_log_internal(spa, "reflow pause", 4624 NULL, "offset=%llu failed_offset=%lld", 4625 (long long)vre->vre_offset, 4626 (long long)vre->vre_failed_offset); 4627 mutex_enter(&vre->vre_lock); 4628 if (vre->vre_failed_offset != UINT64_MAX) { 4629 /* 4630 * Reset progress so that we will retry everything 4631 * after the point that something failed. 
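 * Expansion remains paused (vre_waiting_for_resilver) until
 * raidz_dtl_reassessed() wakes the zthr once the DTLs have been
 * reassessed.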
4632 */ 4633 vre->vre_offset = vre->vre_failed_offset; 4634 vre->vre_failed_offset = UINT64_MAX; 4635 vre->vre_waiting_for_resilver = B_TRUE; 4636 } 4637 mutex_exit(&vre->vre_lock); 4638 } 4639 } 4640 4641 void 4642 spa_start_raidz_expansion_thread(spa_t *spa) 4643 { 4644 ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL); 4645 spa->spa_raidz_expand_zthr = zthr_create("raidz_expand", 4646 spa_raidz_expand_thread_check, spa_raidz_expand_thread, 4647 spa, defclsyspri); 4648 } 4649 4650 void 4651 raidz_dtl_reassessed(vdev_t *vd) 4652 { 4653 spa_t *spa = vd->vdev_spa; 4654 if (spa->spa_raidz_expand != NULL) { 4655 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4656 /* 4657 * we get called often from vdev_dtl_reassess() so make 4658 * sure it's our vdev and any replacing is complete 4659 */ 4660 if (vd->vdev_top->vdev_id == vre->vre_vdev_id && 4661 !vdev_raidz_expand_child_replacing(vd->vdev_top)) { 4662 mutex_enter(&vre->vre_lock); 4663 if (vre->vre_waiting_for_resilver) { 4664 vdev_dbgmsg(vd, "DTL reassessed, " 4665 "continuing raidz expansion"); 4666 vre->vre_waiting_for_resilver = B_FALSE; 4667 zthr_wakeup(spa->spa_raidz_expand_zthr); 4668 } 4669 mutex_exit(&vre->vre_lock); 4670 } 4671 } 4672 } 4673 4674 int 4675 vdev_raidz_attach_check(vdev_t *new_child) 4676 { 4677 vdev_t *raidvd = new_child->vdev_parent; 4678 uint64_t new_children = raidvd->vdev_children; 4679 4680 /* 4681 * We use the "boot" space as scratch space to handle overwriting the 4682 * initial part of the vdev. If it is too small, then this expansion 4683 * is not allowed. This would be very unusual (e.g. ashift > 13 and 4684 * >200 children). 4685 */ 4686 if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) { 4687 return (EINVAL); 4688 } 4689 return (0); 4690 } 4691 4692 void 4693 vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx) 4694 { 4695 vdev_t *new_child = arg; 4696 spa_t *spa = new_child->vdev_spa; 4697 vdev_t *raidvd = new_child->vdev_parent; 4698 vdev_raidz_t *vdrz = raidvd->vdev_tsd; 4699 ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops); 4700 ASSERT3P(raidvd->vdev_top, ==, raidvd); 4701 ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width); 4702 ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1); 4703 ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==, 4704 new_child); 4705 4706 spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx); 4707 4708 vdrz->vd_physical_width++; 4709 4710 VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info); 4711 vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id; 4712 vdrz->vn_vre.vre_offset = 0; 4713 vdrz->vn_vre.vre_failed_offset = UINT64_MAX; 4714 spa->spa_raidz_expand = &vdrz->vn_vre; 4715 zthr_wakeup(spa->spa_raidz_expand_zthr); 4716 4717 /* 4718 * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get 4719 * written to the config. 
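 * (vdev_raidz_config_generate() adds that entry while vre_state is
 * DSS_SCANNING.)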
4720 */ 4721 vdev_config_dirty(raidvd); 4722 4723 vdrz->vn_vre.vre_start_time = gethrestime_sec(); 4724 vdrz->vn_vre.vre_end_time = 0; 4725 vdrz->vn_vre.vre_state = DSS_SCANNING; 4726 vdrz->vn_vre.vre_bytes_copied = 0; 4727 4728 uint64_t state = vdrz->vn_vre.vre_state; 4729 VERIFY0(zap_update(spa->spa_meta_objset, 4730 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, 4731 sizeof (state), 1, &state, tx)); 4732 4733 uint64_t start_time = vdrz->vn_vre.vre_start_time; 4734 VERIFY0(zap_update(spa->spa_meta_objset, 4735 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, 4736 sizeof (start_time), 1, &start_time, tx)); 4737 4738 (void) zap_remove(spa->spa_meta_objset, 4739 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx); 4740 (void) zap_remove(spa->spa_meta_objset, 4741 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx); 4742 4743 spa_history_log_internal(spa, "raidz vdev expansion started", tx, 4744 "%s vdev %llu new width %llu", spa_name(spa), 4745 (unsigned long long)raidvd->vdev_id, 4746 (unsigned long long)raidvd->vdev_children); 4747 } 4748 4749 int 4750 vdev_raidz_load(vdev_t *vd) 4751 { 4752 vdev_raidz_t *vdrz = vd->vdev_tsd; 4753 int err; 4754 4755 uint64_t state = DSS_NONE; 4756 uint64_t start_time = 0; 4757 uint64_t end_time = 0; 4758 uint64_t bytes_copied = 0; 4759 4760 if (vd->vdev_top_zap != 0) { 4761 err = zap_lookup(vd->vdev_spa->spa_meta_objset, 4762 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, 4763 sizeof (state), 1, &state); 4764 if (err != 0 && err != ENOENT) 4765 return (err); 4766 4767 err = zap_lookup(vd->vdev_spa->spa_meta_objset, 4768 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, 4769 sizeof (start_time), 1, &start_time); 4770 if (err != 0 && err != ENOENT) 4771 return (err); 4772 4773 err = zap_lookup(vd->vdev_spa->spa_meta_objset, 4774 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, 4775 sizeof (end_time), 1, &end_time); 4776 if (err != 0 && err != ENOENT) 4777 return (err); 4778 4779 err = zap_lookup(vd->vdev_spa->spa_meta_objset, 4780 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, 4781 sizeof (bytes_copied), 1, &bytes_copied); 4782 if (err != 0 && err != ENOENT) 4783 return (err); 4784 } 4785 4786 /* 4787 * If we are in the middle of expansion, vre_state should have 4788 * already been set by vdev_raidz_init(). 
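 * (vdev_raidz_init() sets it to DSS_SCANNING when
 * ZPOOL_CONFIG_RAIDZ_EXPANDING is present in the config, which the
 * EQUIV() below cross-checks against the on-disk state.)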
4789 */ 4790 EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING); 4791 vdrz->vn_vre.vre_state = (dsl_scan_state_t)state; 4792 vdrz->vn_vre.vre_start_time = start_time; 4793 vdrz->vn_vre.vre_end_time = end_time; 4794 vdrz->vn_vre.vre_bytes_copied = bytes_copied; 4795 4796 return (0); 4797 } 4798 4799 int 4800 spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres) 4801 { 4802 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4803 4804 if (vre == NULL) { 4805 /* no removal in progress; find most recent completed */ 4806 for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { 4807 vdev_t *vd = spa->spa_root_vdev->vdev_child[c]; 4808 if (vd->vdev_ops == &vdev_raidz_ops) { 4809 vdev_raidz_t *vdrz = vd->vdev_tsd; 4810 4811 if (vdrz->vn_vre.vre_end_time != 0 && 4812 (vre == NULL || 4813 vdrz->vn_vre.vre_end_time > 4814 vre->vre_end_time)) { 4815 vre = &vdrz->vn_vre; 4816 } 4817 } 4818 } 4819 } 4820 4821 if (vre == NULL) { 4822 return (SET_ERROR(ENOENT)); 4823 } 4824 4825 pres->pres_state = vre->vre_state; 4826 pres->pres_expanding_vdev = vre->vre_vdev_id; 4827 4828 vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); 4829 pres->pres_to_reflow = vd->vdev_stat.vs_alloc; 4830 4831 mutex_enter(&vre->vre_lock); 4832 pres->pres_reflowed = vre->vre_bytes_copied; 4833 for (int i = 0; i < TXG_SIZE; i++) 4834 pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i]; 4835 mutex_exit(&vre->vre_lock); 4836 4837 pres->pres_start_time = vre->vre_start_time; 4838 pres->pres_end_time = vre->vre_end_time; 4839 pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver; 4840 4841 return (0); 4842 } 4843 4844 /* 4845 * Initialize private RAIDZ specific fields from the nvlist. 4846 */ 4847 static int 4848 vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) 4849 { 4850 uint_t children; 4851 nvlist_t **child; 4852 int error = nvlist_lookup_nvlist_array(nv, 4853 ZPOOL_CONFIG_CHILDREN, &child, &children); 4854 if (error != 0) 4855 return (SET_ERROR(EINVAL)); 4856 4857 uint64_t nparity; 4858 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { 4859 if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) 4860 return (SET_ERROR(EINVAL)); 4861 4862 /* 4863 * Previous versions could only support 1 or 2 parity 4864 * device. 4865 */ 4866 if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2) 4867 return (SET_ERROR(EINVAL)); 4868 else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3) 4869 return (SET_ERROR(EINVAL)); 4870 } else { 4871 /* 4872 * We require the parity to be specified for SPAs that 4873 * support multiple parity levels. 4874 */ 4875 if (spa_version(spa) >= SPA_VERSION_RAIDZ2) 4876 return (SET_ERROR(EINVAL)); 4877 4878 /* 4879 * Otherwise, we default to 1 parity device for RAID-Z. 
4880 */ 4881 nparity = 1; 4882 } 4883 4884 vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP); 4885 vdrz->vn_vre.vre_vdev_id = -1; 4886 vdrz->vn_vre.vre_offset = UINT64_MAX; 4887 vdrz->vn_vre.vre_failed_offset = UINT64_MAX; 4888 mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL); 4889 cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL); 4890 zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL); 4891 mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL); 4892 avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare, 4893 sizeof (reflow_node_t), offsetof(reflow_node_t, re_link)); 4894 4895 vdrz->vd_physical_width = children; 4896 vdrz->vd_nparity = nparity; 4897 4898 /* note, the ID does not exist when creating a pool */ 4899 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, 4900 &vdrz->vn_vre.vre_vdev_id); 4901 4902 boolean_t reflow_in_progress = 4903 nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); 4904 if (reflow_in_progress) { 4905 spa->spa_raidz_expand = &vdrz->vn_vre; 4906 vdrz->vn_vre.vre_state = DSS_SCANNING; 4907 } 4908 4909 vdrz->vd_original_width = children; 4910 uint64_t *txgs; 4911 unsigned int txgs_size = 0; 4912 error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, 4913 &txgs, &txgs_size); 4914 if (error == 0) { 4915 for (int i = 0; i < txgs_size; i++) { 4916 reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); 4917 re->re_txg = txgs[txgs_size - i - 1]; 4918 re->re_logical_width = vdrz->vd_physical_width - i; 4919 4920 if (reflow_in_progress) 4921 re->re_logical_width--; 4922 4923 avl_add(&vdrz->vd_expand_txgs, re); 4924 } 4925 4926 vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size; 4927 } 4928 if (reflow_in_progress) { 4929 vdrz->vd_original_width--; 4930 zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions", 4931 children, txgs_size); 4932 } 4933 4934 *tsd = vdrz; 4935 4936 return (0); 4937 } 4938 4939 static void 4940 vdev_raidz_fini(vdev_t *vd) 4941 { 4942 vdev_raidz_t *vdrz = vd->vdev_tsd; 4943 if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre) 4944 vd->vdev_spa->spa_raidz_expand = NULL; 4945 reflow_node_t *re; 4946 void *cookie = NULL; 4947 avl_tree_t *tree = &vdrz->vd_expand_txgs; 4948 while ((re = avl_destroy_nodes(tree, &cookie)) != NULL) 4949 kmem_free(re, sizeof (*re)); 4950 avl_destroy(&vdrz->vd_expand_txgs); 4951 mutex_destroy(&vdrz->vd_expand_lock); 4952 mutex_destroy(&vdrz->vn_vre.vre_lock); 4953 cv_destroy(&vdrz->vn_vre.vre_cv); 4954 zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock); 4955 kmem_free(vdrz, sizeof (*vdrz)); 4956 } 4957 4958 /* 4959 * Add RAIDZ specific fields to the config nvlist. 4960 */ 4961 static void 4962 vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) 4963 { 4964 ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops); 4965 vdev_raidz_t *vdrz = vd->vdev_tsd; 4966 4967 /* 4968 * Make sure someone hasn't managed to sneak a fancy new vdev 4969 * into a crufty old storage pool. 4970 */ 4971 ASSERT(vdrz->vd_nparity == 1 || 4972 (vdrz->vd_nparity <= 2 && 4973 spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) || 4974 (vdrz->vd_nparity <= 3 && 4975 spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3)); 4976 4977 /* 4978 * Note that we'll add these even on storage pools where they 4979 * aren't strictly required -- older software will just ignore 4980 * it. 
4981 */ 4982 fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity); 4983 4984 if (vdrz->vn_vre.vre_state == DSS_SCANNING) { 4985 fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); 4986 } 4987 4988 mutex_enter(&vdrz->vd_expand_lock); 4989 if (!avl_is_empty(&vdrz->vd_expand_txgs)) { 4990 uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs); 4991 uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count, 4992 KM_SLEEP); 4993 uint64_t i = 0; 4994 4995 for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs); 4996 re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) { 4997 txgs[i++] = re->re_txg; 4998 } 4999 5000 fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, 5001 txgs, count); 5002 5003 kmem_free(txgs, sizeof (uint64_t) * count); 5004 } 5005 mutex_exit(&vdrz->vd_expand_lock); 5006 } 5007 5008 static uint64_t 5009 vdev_raidz_nparity(vdev_t *vd) 5010 { 5011 vdev_raidz_t *vdrz = vd->vdev_tsd; 5012 return (vdrz->vd_nparity); 5013 } 5014 5015 static uint64_t 5016 vdev_raidz_ndisks(vdev_t *vd) 5017 { 5018 return (vd->vdev_children); 5019 } 5020 5021 vdev_ops_t vdev_raidz_ops = { 5022 .vdev_op_init = vdev_raidz_init, 5023 .vdev_op_fini = vdev_raidz_fini, 5024 .vdev_op_open = vdev_raidz_open, 5025 .vdev_op_close = vdev_raidz_close, 5026 .vdev_op_asize = vdev_raidz_asize, 5027 .vdev_op_min_asize = vdev_raidz_min_asize, 5028 .vdev_op_min_alloc = NULL, 5029 .vdev_op_io_start = vdev_raidz_io_start, 5030 .vdev_op_io_done = vdev_raidz_io_done, 5031 .vdev_op_state_change = vdev_raidz_state_change, 5032 .vdev_op_need_resilver = vdev_raidz_need_resilver, 5033 .vdev_op_hold = NULL, 5034 .vdev_op_rele = NULL, 5035 .vdev_op_remap = NULL, 5036 .vdev_op_xlate = vdev_raidz_xlate, 5037 .vdev_op_rebuild_asize = NULL, 5038 .vdev_op_metaslab_init = NULL, 5039 .vdev_op_config_generate = vdev_raidz_config_generate, 5040 .vdev_op_nparity = vdev_raidz_nparity, 5041 .vdev_op_ndisks = vdev_raidz_ndisks, 5042 .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */ 5043 .vdev_op_leaf = B_FALSE /* not a leaf vdev */ 5044 }; 5045 5046 /* BEGIN CSTYLED */ 5047 ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW, 5048 "For testing, pause RAIDZ expansion after reflowing this many bytes"); 5049 ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW, 5050 "Max amount of concurrent i/o for RAIDZ expansion"); 5051 ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW, 5052 "For expanded RAIDZ, aggregate reads that have more rows than this"); 5053 ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW, 5054 "For expanded RAIDZ, automatically start a pool scrub when expansion " 5055 "completes"); 5056 /* END CSTYLED */ 5057