// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
 * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zap.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu_tx.h>
#include <sys/abd.h>
#include <sys/zfs_rlock.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
#include <sys/vdev_raidz.h>
#include <sys/vdev_raidz_impl.h>
#include <sys/vdev_draid.h>
#include <sys/uberblock_impl.h>
#include <sys/dsl_scan.h>

#ifdef ZFS_DEBUG
#include <sys/vdev.h>	/* For vdev_xlate() in vdev_raidz_io_verify() */
#endif

/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field with 2^N elements,
 * GF(2^N). In our case we choose N=8, i.e. GF(2^8), so that all elements
 * can be expressed with a single byte. Briefly, the operations on the
 * field are defined as follows:
 *
 *	o addition (+) is represented by a bitwise XOR
 *	o subtraction (-) is therefore identical to addition: A + B = A - B
 *	o multiplication of A by 2 is defined by the following bitwise expression:
 *
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick lookups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). The inverse of a field element A (A^-1) is therefore
 * A ^ (255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q, R over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property, which is why the uncorrected Plank method breaks down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}


/*
 * Big Theory Statement for how a RAIDZ VDEV is expanded
 *
 * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion
 * works with all three RAIDZ parity choices (RAIDZ1, 2, or 3). VDEVs
 * that have been previously expanded can be expanded again.
 *
 * The RAIDZ VDEV must be healthy (must be able to write to all the drives in
 * the VDEV) when an expansion starts. The expansion will pause if any
 * disk in the VDEV fails, and resume once the VDEV is healthy again.
All other 157 * operations on the pool can continue while an expansion is in progress (e.g. 158 * read/write, snapshot, zpool add, etc). Except zpool checkpoint, zpool trim, 159 * and zpool initialize which can't be run during an expansion. Following a 160 * reboot or export/import, the expansion resumes where it left off. 161 * 162 * == Reflowing the Data == 163 * 164 * The expansion involves reflowing (copying) the data from the current set 165 * of disks to spread it across the new set which now has one more disk. This 166 * reflow operation is similar to reflowing text when the column width of a 167 * text editor window is expanded. The text doesn’t change but the location of 168 * the text changes to accommodate the new width. An example reflow result for 169 * a 4-wide RAIDZ1 to a 5-wide is shown below. 170 * 171 * Reflow End State 172 * Each letter indicates a parity group (logical stripe) 173 * 174 * Before expansion After Expansion 175 * D1 D2 D3 D4 D1 D2 D3 D4 D5 176 * +------+------+------+------+ +------+------+------+------+------+ 177 * | | | | | | | | | | | 178 * | A | A | A | A | | A | A | A | A | B | 179 * | 1| 2| 3| 4| | 1| 2| 3| 4| 5| 180 * +------+------+------+------+ +------+------+------+------+------+ 181 * | | | | | | | | | | | 182 * | B | B | C | C | | B | C | C | C | C | 183 * | 5| 6| 7| 8| | 6| 7| 8| 9| 10| 184 * +------+------+------+------+ +------+------+------+------+------+ 185 * | | | | | | | | | | | 186 * | C | C | D | D | | D | D | E | E | E | 187 * | 9| 10| 11| 12| | 11| 12| 13| 14| 15| 188 * +------+------+------+------+ +------+------+------+------+------+ 189 * | | | | | | | | | | | 190 * | E | E | E | E | --> | E | F | F | G | G | 191 * | 13| 14| 15| 16| | 16| 17| 18|p 19| 20| 192 * +------+------+------+------+ +------+------+------+------+------+ 193 * | | | | | | | | | | | 194 * | F | F | G | G | | G | G | H | H | H | 195 * | 17| 18| 19| 20| | 21| 22| 23| 24| 25| 196 * +------+------+------+------+ +------+------+------+------+------+ 197 * | | | | | | | | | | | 198 * | G | G | H | H | | H | I | I | J | J | 199 * | 21| 22| 23| 24| | 26| 27| 28| 29| 30| 200 * +------+------+------+------+ +------+------+------+------+------+ 201 * | | | | | | | | | | | 202 * | H | H | I | I | | J | J | | | K | 203 * | 25| 26| 27| 28| | 31| 32| 33| 34| 35| 204 * +------+------+------+------+ +------+------+------+------+------+ 205 * 206 * This reflow approach has several advantages. There is no need to read or 207 * modify the block pointers or recompute any block checksums. The reflow 208 * doesn’t need to know where the parity sectors reside. We can read and write 209 * data sequentially and the copy can occur in a background thread in open 210 * context. The design also allows for fast discovery of what data to copy. 211 * 212 * The VDEV metaslabs are processed, one at a time, to copy the block data to 213 * have it flow across all the disks. The metaslab is disabled for allocations 214 * during the copy. As an optimization, we only copy the allocated data which 215 * can be determined by looking at the metaslab range tree. During the copy we 216 * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still 217 * need to be able to survive losing parity count disks). This means we 218 * cannot overwrite data during the reflow that would be needed if a disk is 219 * lost. 
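 *
 * Put another way, the reflow is just a re-addressing of the same linear
 * sequence of sectors: on a W-wide RAIDZ, logical sector n (numbered from 1
 * as in the diagram above) lives on child ((n - 1) % W) at row
 * ((n - 1) / W), so the copy re-derives each sector's location using the
 * new width. A minimal sketch of that mapping (illustrative only; the
 * actual reflow operates on ranges of allocated space rather than
 * individual sectors):
 *
 *	old_child = (n - 1) % old_width;   old_row = (n - 1) / old_width;
 *	new_child = (n - 1) % new_width;   new_row = (n - 1) / new_width;
 *
 * In the 4-wide to 5-wide example, sector 6 moves from (D2, row 2) to
 * (D1, row 2) -- which is exactly the old location of sector 5. This is
 * the kind of overlap that motivates the scratch area described below.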
 *
 * After the reflow completes, all newly-written blocks will have the new
 * layout, i.e., they will have the parity to data ratio implied by the new
 * number of disks in the RAIDZ group. Even though the reflow copies all of
 * the allocated space (data and parity), it is only rearranged, not changed.
 *
 * This act of reflowing the data has a few implications about blocks
 * that were written before the reflow completes:
 *
 * - Old blocks will still use the same amount of space (i.e., they will have
 *   the parity to data ratio implied by the old number of disks in the RAIDZ
 *   group).
 * - Reading old blocks will be slightly slower than before the reflow, for
 *   two reasons. First, we will have to read from all disks in the RAIDZ
 *   VDEV, rather than being able to skip the children that contain only
 *   parity of this block (because the data of a single block is now spread
 *   out across all the disks). Second, in most cases there will be an extra
 *   bcopy, needed to rearrange the data back to its original layout in memory.
 *
 * == Scratch Area ==
 *
 * As we copy the block data, we can only progress to the point that writes
 * will not overlap with blocks whose progress has not yet been recorded on
 * disk. Since partially-copied rows are always read from the old location,
 * we need to stop one row before the sector-wise overlap, to prevent any
 * row-wise overlap. For example, in the diagram above, when we reflow sector
 * B6 it will overwrite the original location for B5.
 *
 * To get around this, a scratch space is used so that we can start copying
 * without risking data loss by overlapping the row. As an added benefit, it
 * improves performance at the beginning of the reflow, but that small perf
 * boost wouldn't be worth the complexity on its own.
 *
 * Ideally we want to copy at least 2 * (new_width)^2 so that we have a
 * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max
 * RAIDZ width of 255 and 4K sectors this would be 2MB per disk (2 * 255^2 is
 * roughly 130,000 4K sectors, or about 500MB of copy spread across the 255
 * disks). In practice the widths will likely be single digits so we can get
 * a substantial chunk size using only a few MB of scratch per disk.
 *
 * The scratch area is persisted to disk, and holds a large amount of reflowed
 * state. We can always read the partially written stripes when a disk fails or
 * the copy is interrupted (crash) during the initial copying phase, and also
 * get past a small chunk size restriction. At a minimum, the scratch space
 * must be large enough to get us to the point that one row does not overlap
 * itself when moved (i.e., new_width^2). But going larger is even better. We
 * use the 3.5 MiB reserved "boot" space that resides after the ZFS disk labels
 * as our scratch space to handle overwriting the initial part of the VDEV.
 *
 *	0     256K   512K                    4M
 *	+------+------+-----------------------+-------------------------------
 *	| VDEV | VDEV |   Boot Block (3.5M)   |  Allocatable space ...
 *	|  L0  |  L1  |       Reserved        |      (Metaslabs)
 *	+------+------+-----------------------+-------------------------------
 *	                  Scratch Area
 *
 * == Reflow Progress Updates ==
 *
 * After the initial scratch-based reflow, the expansion process works
 * similarly to device removal. We create a new open context thread which
 * reflows the data, and periodically kicks off sync tasks to update logical
 * state. In this case, state is the committed progress (offset of next data
 * to copy). We need to persist the completed offset on disk, so that if we
 * crash we know which format each VDEV offset is in.
 *
 * == Time Dependent Geometry ==
 *
 * In non-expanded RAIDZ, blocks are read from disk in a column by column
 * fashion. For a multi-row block, the second sector is in the first column,
 * not in the second column. This allows us to issue full reads for each
 * column directly into the request buffer. The block data is thus laid out
 * sequentially in a column-by-column fashion.
 *
 * For example, in the before expansion diagram above, one logical block might
 * be sectors G19-H26. The parity is in G19,H23; and the data is in
 * G20,H24,G21,H25,G22,H26.
 *
 * After a block is reflowed, the sectors that were all in the same original
 * column can now reside in different columns. When reading from an expanded
 * VDEV, we need to know the logical stripe width for each block so we can
 * reconstitute the block's data after the reads are completed. Likewise,
 * when we perform the combinatorial reconstruction we need to know the
 * original width so we can retry combinations from the past layouts.
 *
 * Time dependent geometry is what we call having blocks with different layouts
 * (stripe widths) in the same VDEV. This time-dependent geometry uses the
 * block's birth time (+ the time expansion ended) to establish the correct
 * width for a given block. After an expansion completes, we record the time
 * for blocks written with a particular width (geometry).
 *
 * == On Disk Format Changes ==
 *
 * A new pool feature flag, 'raidz_expansion', is added; its reference count
 * is the number of RAIDZ VDEVs that have been expanded.
 *
 * The blocks on an expanded RAIDZ VDEV can have different logical stripe
 * widths.
 *
 * Since the uberblock can point to arbitrary blocks, which might be on the
 * expanding RAIDZ and might or might not have been expanded, we need to know
 * which way a block is laid out before reading it. This info is the next
 * offset that needs to be reflowed and we persist that in the uberblock, in
 * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev label.
 * After the expansion is complete, we then use the raidz_expand_txgs array
 * (see below) to determine how to read a block and the ub_raidz_reflow_info
 * field is no longer required.
 *
 * The uberblock's ub_raidz_reflow_info field also holds the scratch space
 * state (i.e., active or not) which is also required before reading a block
 * during the initial phase of reflowing the data.
 *
 * The top-level RAIDZ VDEV has two new entries in the nvlist:
 *
 * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here
 *                            and used after the expansion is complete to
 *                            determine how to read a raidz block
 * 'raidz_expanding' boolean: present during reflow and removed after
 *                            completion; used during a spa import to resume
 *                            an unfinished expansion
 *
 * And finally the VDEV's top ZAP adds the following informational entries:
 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE
 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME
 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME
 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED
 */

/*
 * For testing only: pause the raidz expansion after reflowing this amount.
346 * (accessed by ZTS and ztest) 347 */ 348 #ifdef _KERNEL 349 static 350 #endif /* _KERNEL */ 351 unsigned long raidz_expand_max_reflow_bytes = 0; 352 353 /* 354 * For testing only: pause the raidz expansion at a certain point. 355 */ 356 uint_t raidz_expand_pause_point = 0; 357 358 /* 359 * Maximum amount of copy io's outstanding at once. 360 */ 361 #ifdef _ILP32 362 static unsigned long raidz_expand_max_copy_bytes = SPA_MAXBLOCKSIZE; 363 #else 364 static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE; 365 #endif 366 367 /* 368 * Apply raidz map abds aggregation if the number of rows in the map is equal 369 * or greater than the value below. 370 */ 371 static unsigned long raidz_io_aggregate_rows = 4; 372 373 /* 374 * Automatically start a pool scrub when a RAIDZ expansion completes in 375 * order to verify the checksums of all blocks which have been copied 376 * during the expansion. Automatic scrubbing is enabled by default and 377 * is strongly recommended. 378 */ 379 static int zfs_scrub_after_expand = 1; 380 381 static void 382 vdev_raidz_row_free(raidz_row_t *rr) 383 { 384 for (int c = 0; c < rr->rr_cols; c++) { 385 raidz_col_t *rc = &rr->rr_col[c]; 386 387 if (rc->rc_size != 0) 388 abd_free(rc->rc_abd); 389 if (rc->rc_orig_data != NULL) 390 abd_free(rc->rc_orig_data); 391 } 392 393 if (rr->rr_abd_empty != NULL) 394 abd_free(rr->rr_abd_empty); 395 396 kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols])); 397 } 398 399 void 400 vdev_raidz_map_free(raidz_map_t *rm) 401 { 402 for (int i = 0; i < rm->rm_nrows; i++) 403 vdev_raidz_row_free(rm->rm_row[i]); 404 405 if (rm->rm_nphys_cols) { 406 for (int i = 0; i < rm->rm_nphys_cols; i++) { 407 if (rm->rm_phys_col[i].rc_abd != NULL) 408 abd_free(rm->rm_phys_col[i].rc_abd); 409 } 410 411 kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) * 412 rm->rm_nphys_cols); 413 } 414 415 ASSERT3P(rm->rm_lr, ==, NULL); 416 kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows])); 417 } 418 419 static void 420 vdev_raidz_map_free_vsd(zio_t *zio) 421 { 422 raidz_map_t *rm = zio->io_vsd; 423 424 vdev_raidz_map_free(rm); 425 } 426 427 static int 428 vdev_raidz_reflow_compare(const void *x1, const void *x2) 429 { 430 const reflow_node_t *l = x1; 431 const reflow_node_t *r = x2; 432 433 return (TREE_CMP(l->re_txg, r->re_txg)); 434 } 435 436 const zio_vsd_ops_t vdev_raidz_vsd_ops = { 437 .vsd_free = vdev_raidz_map_free_vsd, 438 }; 439 440 raidz_row_t * 441 vdev_raidz_row_alloc(int cols, zio_t *zio) 442 { 443 raidz_row_t *rr = 444 kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP); 445 446 rr->rr_cols = cols; 447 rr->rr_scols = cols; 448 449 for (int c = 0; c < cols; c++) { 450 raidz_col_t *rc = &rr->rr_col[c]; 451 rc->rc_shadow_devidx = INT_MAX; 452 rc->rc_shadow_offset = UINT64_MAX; 453 /* 454 * We can not allow self healing to take place for Direct I/O 455 * reads. There is nothing that stops the buffer contents from 456 * being manipulated while the I/O is in flight. It is possible 457 * that the checksum could be verified on the buffer and then 458 * the contents of that buffer are manipulated afterwards. This 459 * could lead to bad data being written out during self 460 * healing. 
461 */ 462 if (!(zio->io_flags & ZIO_FLAG_DIO_READ)) 463 rc->rc_allow_repair = 1; 464 } 465 return (rr); 466 } 467 468 static void 469 vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift) 470 { 471 int c; 472 int nwrapped = 0; 473 uint64_t off = 0; 474 raidz_row_t *rr = rm->rm_row[0]; 475 476 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); 477 ASSERT3U(rm->rm_nrows, ==, 1); 478 479 /* 480 * Pad any parity columns with additional space to account for skip 481 * sectors. 482 */ 483 if (rm->rm_skipstart < rr->rr_firstdatacol) { 484 ASSERT0(rm->rm_skipstart); 485 nwrapped = rm->rm_nskip; 486 } else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) { 487 nwrapped = 488 (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols; 489 } 490 491 /* 492 * Optional single skip sectors (rc_size == 0) will be handled in 493 * vdev_raidz_io_start_write(). 494 */ 495 int skipped = rr->rr_scols - rr->rr_cols; 496 497 /* Allocate buffers for the parity columns */ 498 for (c = 0; c < rr->rr_firstdatacol; c++) { 499 raidz_col_t *rc = &rr->rr_col[c]; 500 501 /* 502 * Parity columns will pad out a linear ABD to account for 503 * the skip sector. A linear ABD is used here because 504 * parity calculations use the ABD buffer directly to calculate 505 * parity. This avoids doing a memcpy back to the ABD after the 506 * parity has been calculated. By issuing the parity column 507 * with the skip sector we can reduce contention on the child 508 * VDEV queue locks (vq_lock). 509 */ 510 if (c < nwrapped) { 511 rc->rc_abd = abd_alloc_linear( 512 rc->rc_size + (1ULL << ashift), B_FALSE); 513 abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift); 514 skipped++; 515 } else { 516 rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); 517 } 518 } 519 520 for (off = 0; c < rr->rr_cols; c++) { 521 raidz_col_t *rc = &rr->rr_col[c]; 522 abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct, 523 zio->io_abd, off, rc->rc_size); 524 525 /* 526 * Generate I/O for skip sectors to improve aggregation 527 * continuity. We will use gang ABD's to reduce contention 528 * on the child VDEV queue locks (vq_lock) by issuing 529 * a single I/O that contains the data and skip sector. 530 * 531 * It is important to make sure that rc_size is not updated 532 * even though we are adding a skip sector to the ABD. When 533 * calculating the parity in vdev_raidz_generate_parity_row() 534 * the rc_size is used to iterate through the ABD's. We can 535 * not have zero'd out skip sectors used for calculating 536 * parity for raidz, because those same sectors are not used 537 * during reconstruction. 
538 */ 539 if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) { 540 rc->rc_abd = abd_alloc_gang(); 541 abd_gang_add(rc->rc_abd, abd, B_TRUE); 542 abd_gang_add(rc->rc_abd, 543 abd_get_zeros(1ULL << ashift), B_TRUE); 544 skipped++; 545 } else { 546 rc->rc_abd = abd; 547 } 548 off += rc->rc_size; 549 } 550 551 ASSERT3U(off, ==, zio->io_size); 552 ASSERT3S(skipped, ==, rm->rm_nskip); 553 } 554 555 static void 556 vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm) 557 { 558 int c; 559 raidz_row_t *rr = rm->rm_row[0]; 560 561 ASSERT3U(rm->rm_nrows, ==, 1); 562 563 /* Allocate buffers for the parity columns */ 564 for (c = 0; c < rr->rr_firstdatacol; c++) 565 rr->rr_col[c].rc_abd = 566 abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE); 567 568 for (uint64_t off = 0; c < rr->rr_cols; c++) { 569 raidz_col_t *rc = &rr->rr_col[c]; 570 rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct, 571 zio->io_abd, off, rc->rc_size); 572 off += rc->rc_size; 573 } 574 } 575 576 /* 577 * Divides the IO evenly across all child vdevs; usually, dcols is 578 * the number of children in the target vdev. 579 * 580 * Avoid inlining the function to keep vdev_raidz_io_start(), which 581 * is this functions only caller, as small as possible on the stack. 582 */ 583 noinline raidz_map_t * 584 vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, 585 uint64_t nparity) 586 { 587 raidz_row_t *rr; 588 /* The starting RAIDZ (parent) vdev sector of the block. */ 589 uint64_t b = zio->io_offset >> ashift; 590 /* The zio's size in units of the vdev's minimum sector size. */ 591 uint64_t s = zio->io_size >> ashift; 592 /* The first column for this stripe. */ 593 uint64_t f = b % dcols; 594 /* The starting byte offset on each child vdev. */ 595 uint64_t o = (b / dcols) << ashift; 596 uint64_t acols, scols; 597 598 raidz_map_t *rm = 599 kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP); 600 rm->rm_nrows = 1; 601 602 /* 603 * "Quotient": The number of data sectors for this stripe on all but 604 * the "big column" child vdevs that also contain "remainder" data. 605 */ 606 uint64_t q = s / (dcols - nparity); 607 608 /* 609 * "Remainder": The number of partial stripe data sectors in this I/O. 610 * This will add a sector to some, but not all, child vdevs. 611 */ 612 uint64_t r = s - q * (dcols - nparity); 613 614 /* The number of "big columns" - those which contain remainder data. */ 615 uint64_t bc = (r == 0 ? 0 : r + nparity); 616 617 /* 618 * The total number of data and parity sectors associated with 619 * this I/O. 620 */ 621 uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1)); 622 623 /* 624 * acols: The columns that will be accessed. 625 * scols: The columns that will be accessed or skipped. 626 */ 627 if (q == 0) { 628 /* Our I/O request doesn't span all child vdevs. 
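 * For example, a single 4K-sector write (s = 1) to a 5-wide raidz1
 * (dcols = 5, nparity = 1) gives q = 0, r = 1, bc = 2 and tot = 2, so
 * acols = scols = 2: one parity and one data column, touching only two of
 * the five children, with no skip sectors since tot is already a multiple
 * of nparity + 1.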
*/ 629 acols = bc; 630 scols = MIN(dcols, roundup(bc, nparity + 1)); 631 } else { 632 acols = dcols; 633 scols = dcols; 634 } 635 636 ASSERT3U(acols, <=, scols); 637 rr = vdev_raidz_row_alloc(scols, zio); 638 rm->rm_row[0] = rr; 639 rr->rr_cols = acols; 640 rr->rr_bigcols = bc; 641 rr->rr_firstdatacol = nparity; 642 #ifdef ZFS_DEBUG 643 rr->rr_offset = zio->io_offset; 644 rr->rr_size = zio->io_size; 645 #endif 646 647 uint64_t asize = 0; 648 649 for (uint64_t c = 0; c < scols; c++) { 650 raidz_col_t *rc = &rr->rr_col[c]; 651 uint64_t col = f + c; 652 uint64_t coff = o; 653 if (col >= dcols) { 654 col -= dcols; 655 coff += 1ULL << ashift; 656 } 657 rc->rc_devidx = col; 658 rc->rc_offset = coff; 659 660 if (c >= acols) 661 rc->rc_size = 0; 662 else if (c < bc) 663 rc->rc_size = (q + 1) << ashift; 664 else 665 rc->rc_size = q << ashift; 666 667 asize += rc->rc_size; 668 } 669 670 ASSERT3U(asize, ==, tot << ashift); 671 rm->rm_nskip = roundup(tot, nparity + 1) - tot; 672 rm->rm_skipstart = bc; 673 674 /* 675 * If all data stored spans all columns, there's a danger that parity 676 * will always be on the same device and, since parity isn't read 677 * during normal operation, that device's I/O bandwidth won't be 678 * used effectively. We therefore switch the parity every 1MB. 679 * 680 * ... at least that was, ostensibly, the theory. As a practical 681 * matter unless we juggle the parity between all devices evenly, we 682 * won't see any benefit. Further, occasional writes that aren't a 683 * multiple of the LCM of the number of children and the minimum 684 * stripe width are sufficient to avoid pessimal behavior. 685 * Unfortunately, this decision created an implicit on-disk format 686 * requirement that we need to support for all eternity, but only 687 * for single-parity RAID-Z. 688 * 689 * If we intend to skip a sector in the zeroth column for padding 690 * we must make sure to note this swap. We will never intend to 691 * skip the first column since at least one data and one parity 692 * column must appear in each row. 693 */ 694 ASSERT(rr->rr_cols >= 2); 695 ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); 696 697 if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { 698 uint64_t devidx = rr->rr_col[0].rc_devidx; 699 o = rr->rr_col[0].rc_offset; 700 rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; 701 rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; 702 rr->rr_col[1].rc_devidx = devidx; 703 rr->rr_col[1].rc_offset = o; 704 if (rm->rm_skipstart == 0) 705 rm->rm_skipstart = 1; 706 } 707 708 if (zio->io_type == ZIO_TYPE_WRITE) { 709 vdev_raidz_map_alloc_write(zio, rm, ashift); 710 } else { 711 vdev_raidz_map_alloc_read(zio, rm); 712 } 713 /* init RAIDZ parity ops */ 714 rm->rm_ops = vdev_raidz_math_get_ops(); 715 716 return (rm); 717 } 718 719 /* 720 * Everything before reflow_offset_synced should have been moved to the new 721 * location (read and write completed). However, this may not yet be reflected 722 * in the on-disk format (e.g. raidz_reflow_sync() has been called but the 723 * uberblock has not yet been written). If reflow is not in progress, 724 * reflow_offset_synced should be UINT64_MAX. For each row, if the row is 725 * entirely before reflow_offset_synced, it will come from the new location. 726 * Otherwise this row will come from the old location. Therefore, rows that 727 * straddle the reflow_offset_synced will come from the old location. 728 * 729 * For writes, reflow_offset_next is the next offset to copy. 
If a sector has 730 * been copied, but not yet reflected in the on-disk progress 731 * (reflow_offset_synced), it will also be written to the new (already copied) 732 * offset. 733 */ 734 noinline raidz_map_t * 735 vdev_raidz_map_alloc_expanded(zio_t *zio, 736 uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols, 737 uint64_t nparity, uint64_t reflow_offset_synced, 738 uint64_t reflow_offset_next, boolean_t use_scratch) 739 { 740 abd_t *abd = zio->io_abd; 741 uint64_t offset = zio->io_offset; 742 uint64_t size = zio->io_size; 743 744 /* The zio's size in units of the vdev's minimum sector size. */ 745 uint64_t s = size >> ashift; 746 747 /* 748 * "Quotient": The number of data sectors for this stripe on all but 749 * the "big column" child vdevs that also contain "remainder" data. 750 * AKA "full rows" 751 */ 752 uint64_t q = s / (logical_cols - nparity); 753 754 /* 755 * "Remainder": The number of partial stripe data sectors in this I/O. 756 * This will add a sector to some, but not all, child vdevs. 757 */ 758 uint64_t r = s - q * (logical_cols - nparity); 759 760 /* The number of "big columns" - those which contain remainder data. */ 761 uint64_t bc = (r == 0 ? 0 : r + nparity); 762 763 /* 764 * The total number of data and parity sectors associated with 765 * this I/O. 766 */ 767 uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1)); 768 769 /* How many rows contain data (not skip) */ 770 uint64_t rows = howmany(tot, logical_cols); 771 int cols = MIN(tot, logical_cols); 772 773 raidz_map_t *rm = 774 kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]), 775 KM_SLEEP); 776 rm->rm_nrows = rows; 777 rm->rm_nskip = roundup(tot, nparity + 1) - tot; 778 rm->rm_skipstart = bc; 779 uint64_t asize = 0; 780 781 for (uint64_t row = 0; row < rows; row++) { 782 boolean_t row_use_scratch = B_FALSE; 783 raidz_row_t *rr = vdev_raidz_row_alloc(cols, zio); 784 rm->rm_row[row] = rr; 785 786 /* The starting RAIDZ (parent) vdev sector of the row. */ 787 uint64_t b = (offset >> ashift) + row * logical_cols; 788 789 /* 790 * If we are in the middle of a reflow, and the copying has 791 * not yet completed for any part of this row, then use the 792 * old location of this row. Note that reflow_offset_synced 793 * reflects the i/o that's been completed, because it's 794 * updated by a synctask, after zio_wait(spa_txg_zio[]). 795 * This is sufficient for our check, even if that progress 796 * has not yet been recorded to disk (reflected in 797 * spa_ubsync). Also note that we consider the last row to 798 * be "full width" (`cols`-wide rather than `bc`-wide) for 799 * this calculation. This causes a tiny bit of unnecessary 800 * double-writes but is safe and simpler to calculate. 801 */ 802 int row_phys_cols = physical_cols; 803 if (b + cols > reflow_offset_synced >> ashift) 804 row_phys_cols--; 805 else if (use_scratch) 806 row_use_scratch = B_TRUE; 807 808 /* starting child of this row */ 809 uint64_t child_id = b % row_phys_cols; 810 /* The starting byte offset on each child vdev. */ 811 uint64_t child_offset = (b / row_phys_cols) << ashift; 812 813 /* 814 * Note, rr_cols is the entire width of the block, even 815 * if this row is shorter. This is needed because parity 816 * generation (for Q and R) needs to know the entire width, 817 * because it treats the short row as though it was 818 * full-width (and the "phantom" sectors were zero-filled). 
819 * 820 * Another approach to this would be to set cols shorter 821 * (to just the number of columns that we might do i/o to) 822 * and have another mechanism to tell the parity generation 823 * about the "entire width". Reconstruction (at least 824 * vdev_raidz_reconstruct_general()) would also need to 825 * know about the "entire width". 826 */ 827 rr->rr_firstdatacol = nparity; 828 #ifdef ZFS_DEBUG 829 /* 830 * note: rr_size is PSIZE, not ASIZE 831 */ 832 rr->rr_offset = b << ashift; 833 rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift; 834 #endif 835 836 for (int c = 0; c < rr->rr_cols; c++, child_id++) { 837 if (child_id >= row_phys_cols) { 838 child_id -= row_phys_cols; 839 child_offset += 1ULL << ashift; 840 } 841 raidz_col_t *rc = &rr->rr_col[c]; 842 rc->rc_devidx = child_id; 843 rc->rc_offset = child_offset; 844 845 /* 846 * Get this from the scratch space if appropriate. 847 * This only happens if we crashed in the middle of 848 * raidz_reflow_scratch_sync() (while it's running, 849 * the rangelock prevents us from doing concurrent 850 * io), and even then only during zpool import or 851 * when the pool is imported readonly. 852 */ 853 if (row_use_scratch) 854 rc->rc_offset -= VDEV_BOOT_SIZE; 855 856 uint64_t dc = c - rr->rr_firstdatacol; 857 if (c < rr->rr_firstdatacol) { 858 rc->rc_size = 1ULL << ashift; 859 860 /* 861 * Parity sectors' rc_abd's are set below 862 * after determining if this is an aggregation. 863 */ 864 } else if (row == rows - 1 && bc != 0 && c >= bc) { 865 /* 866 * Past the end of the block (even including 867 * skip sectors). This sector is part of the 868 * map so that we have full rows for p/q parity 869 * generation. 870 */ 871 rc->rc_size = 0; 872 rc->rc_abd = NULL; 873 } else { 874 /* "data column" (col excluding parity) */ 875 uint64_t off; 876 877 if (c < bc || r == 0) { 878 off = dc * rows + row; 879 } else { 880 off = r * rows + 881 (dc - r) * (rows - 1) + row; 882 } 883 rc->rc_size = 1ULL << ashift; 884 rc->rc_abd = abd_get_offset_struct( 885 &rc->rc_abdstruct, abd, off << ashift, 886 rc->rc_size); 887 } 888 889 if (rc->rc_size == 0) 890 continue; 891 892 /* 893 * If any part of this row is in both old and new 894 * locations, the primary location is the old 895 * location. If this sector was already copied to the 896 * new location, we need to also write to the new, 897 * "shadow" location. 898 * 899 * Note, `row_phys_cols != physical_cols` indicates 900 * that the primary location is the old location. 901 * `b+c < reflow_offset_next` indicates that the copy 902 * to the new location has been initiated. We know 903 * that the copy has completed because we have the 904 * rangelock, which is held exclusively while the 905 * copy is in progress. 
906 */ 907 if (row_use_scratch || 908 (row_phys_cols != physical_cols && 909 b + c < reflow_offset_next >> ashift)) { 910 rc->rc_shadow_devidx = (b + c) % physical_cols; 911 rc->rc_shadow_offset = 912 ((b + c) / physical_cols) << ashift; 913 if (row_use_scratch) 914 rc->rc_shadow_offset -= VDEV_BOOT_SIZE; 915 } 916 917 asize += rc->rc_size; 918 } 919 920 /* 921 * See comment in vdev_raidz_map_alloc() 922 */ 923 if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 && 924 (offset & (1ULL << 20))) { 925 ASSERT(rr->rr_cols >= 2); 926 ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); 927 928 int devidx0 = rr->rr_col[0].rc_devidx; 929 uint64_t offset0 = rr->rr_col[0].rc_offset; 930 int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx; 931 uint64_t shadow_offset0 = 932 rr->rr_col[0].rc_shadow_offset; 933 934 rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; 935 rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; 936 rr->rr_col[0].rc_shadow_devidx = 937 rr->rr_col[1].rc_shadow_devidx; 938 rr->rr_col[0].rc_shadow_offset = 939 rr->rr_col[1].rc_shadow_offset; 940 941 rr->rr_col[1].rc_devidx = devidx0; 942 rr->rr_col[1].rc_offset = offset0; 943 rr->rr_col[1].rc_shadow_devidx = shadow_devidx0; 944 rr->rr_col[1].rc_shadow_offset = shadow_offset0; 945 } 946 } 947 ASSERT3U(asize, ==, tot << ashift); 948 949 /* 950 * Determine if the block is contiguous, in which case we can use 951 * an aggregation. 952 */ 953 if (rows >= raidz_io_aggregate_rows) { 954 rm->rm_nphys_cols = physical_cols; 955 rm->rm_phys_col = 956 kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols, 957 KM_SLEEP); 958 959 /* 960 * Determine the aggregate io's offset and size, and check 961 * that the io is contiguous. 962 */ 963 for (int i = 0; 964 i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) { 965 raidz_row_t *rr = rm->rm_row[i]; 966 for (int c = 0; c < rr->rr_cols; c++) { 967 raidz_col_t *rc = &rr->rr_col[c]; 968 raidz_col_t *prc = 969 &rm->rm_phys_col[rc->rc_devidx]; 970 971 if (rc->rc_size == 0) 972 continue; 973 974 if (prc->rc_size == 0) { 975 ASSERT0(prc->rc_offset); 976 prc->rc_offset = rc->rc_offset; 977 } else if (prc->rc_offset + prc->rc_size != 978 rc->rc_offset) { 979 /* 980 * This block is not contiguous and 981 * therefore can't be aggregated. 982 * This is expected to be rare, so 983 * the cost of allocating and then 984 * freeing rm_phys_col is not 985 * significant. 986 */ 987 kmem_free(rm->rm_phys_col, 988 sizeof (raidz_col_t) * 989 rm->rm_nphys_cols); 990 rm->rm_phys_col = NULL; 991 rm->rm_nphys_cols = 0; 992 break; 993 } 994 prc->rc_size += rc->rc_size; 995 } 996 } 997 } 998 if (rm->rm_phys_col != NULL) { 999 /* 1000 * Allocate aggregate ABD's. 1001 */ 1002 for (int i = 0; i < rm->rm_nphys_cols; i++) { 1003 raidz_col_t *prc = &rm->rm_phys_col[i]; 1004 1005 prc->rc_devidx = i; 1006 1007 if (prc->rc_size == 0) 1008 continue; 1009 1010 prc->rc_abd = 1011 abd_alloc_linear(rm->rm_phys_col[i].rc_size, 1012 B_FALSE); 1013 } 1014 1015 /* 1016 * Point the parity abd's into the aggregate abd's. 1017 */ 1018 for (int i = 0; i < rm->rm_nrows; i++) { 1019 raidz_row_t *rr = rm->rm_row[i]; 1020 for (int c = 0; c < rr->rr_firstdatacol; c++) { 1021 raidz_col_t *rc = &rr->rr_col[c]; 1022 raidz_col_t *prc = 1023 &rm->rm_phys_col[rc->rc_devidx]; 1024 rc->rc_abd = 1025 abd_get_offset_struct(&rc->rc_abdstruct, 1026 prc->rc_abd, 1027 rc->rc_offset - prc->rc_offset, 1028 rc->rc_size); 1029 } 1030 } 1031 } else { 1032 /* 1033 * Allocate new abd's for the parity sectors. 
1034 */ 1035 for (int i = 0; i < rm->rm_nrows; i++) { 1036 raidz_row_t *rr = rm->rm_row[i]; 1037 for (int c = 0; c < rr->rr_firstdatacol; c++) { 1038 raidz_col_t *rc = &rr->rr_col[c]; 1039 rc->rc_abd = 1040 abd_alloc_linear(rc->rc_size, 1041 B_TRUE); 1042 } 1043 } 1044 } 1045 /* init RAIDZ parity ops */ 1046 rm->rm_ops = vdev_raidz_math_get_ops(); 1047 1048 return (rm); 1049 } 1050 1051 struct pqr_struct { 1052 uint64_t *p; 1053 uint64_t *q; 1054 uint64_t *r; 1055 }; 1056 1057 static int 1058 vdev_raidz_p_func(void *buf, size_t size, void *private) 1059 { 1060 struct pqr_struct *pqr = private; 1061 const uint64_t *src = buf; 1062 int cnt = size / sizeof (src[0]); 1063 1064 ASSERT(pqr->p && !pqr->q && !pqr->r); 1065 1066 for (int i = 0; i < cnt; i++, src++, pqr->p++) 1067 *pqr->p ^= *src; 1068 1069 return (0); 1070 } 1071 1072 static int 1073 vdev_raidz_pq_func(void *buf, size_t size, void *private) 1074 { 1075 struct pqr_struct *pqr = private; 1076 const uint64_t *src = buf; 1077 uint64_t mask; 1078 int cnt = size / sizeof (src[0]); 1079 1080 ASSERT(pqr->p && pqr->q && !pqr->r); 1081 1082 for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { 1083 *pqr->p ^= *src; 1084 VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 1085 *pqr->q ^= *src; 1086 } 1087 1088 return (0); 1089 } 1090 1091 static int 1092 vdev_raidz_pqr_func(void *buf, size_t size, void *private) 1093 { 1094 struct pqr_struct *pqr = private; 1095 const uint64_t *src = buf; 1096 uint64_t mask; 1097 int cnt = size / sizeof (src[0]); 1098 1099 ASSERT(pqr->p && pqr->q && pqr->r); 1100 1101 for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { 1102 *pqr->p ^= *src; 1103 VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 1104 *pqr->q ^= *src; 1105 VDEV_RAIDZ_64MUL_4(*pqr->r, mask); 1106 *pqr->r ^= *src; 1107 } 1108 1109 return (0); 1110 } 1111 1112 static void 1113 vdev_raidz_generate_parity_p(raidz_row_t *rr) 1114 { 1115 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 1116 1117 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1118 abd_t *src = rr->rr_col[c].rc_abd; 1119 1120 if (c == rr->rr_firstdatacol) { 1121 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 1122 } else { 1123 struct pqr_struct pqr = { p, NULL, NULL }; 1124 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 1125 vdev_raidz_p_func, &pqr); 1126 } 1127 } 1128 } 1129 1130 static void 1131 vdev_raidz_generate_parity_pq(raidz_row_t *rr) 1132 { 1133 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 1134 uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 1135 uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 1136 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 1137 rr->rr_col[VDEV_RAIDZ_Q].rc_size); 1138 1139 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1140 abd_t *src = rr->rr_col[c].rc_abd; 1141 1142 uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); 1143 1144 if (c == rr->rr_firstdatacol) { 1145 ASSERT(ccnt == pcnt || ccnt == 0); 1146 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 1147 (void) memcpy(q, p, rr->rr_col[c].rc_size); 1148 1149 for (uint64_t i = ccnt; i < pcnt; i++) { 1150 p[i] = 0; 1151 q[i] = 0; 1152 } 1153 } else { 1154 struct pqr_struct pqr = { p, q, NULL }; 1155 1156 ASSERT(ccnt <= pcnt); 1157 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 1158 vdev_raidz_pq_func, &pqr); 1159 1160 /* 1161 * Treat short columns as though they are full of 0s. 1162 * Note that there's therefore nothing needed for P. 
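 * For Q, however, every column -- including a short, all-zero one --
 * still contributes a multiply-by-2 step, because Q is evaluated
 * Horner-style (see the top of this file):
 *
 *	Q = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *
 * so the loop below advances the tail of Q by a factor of 2 for each
 * missing word while XORing in nothing.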
1163 */ 1164 uint64_t mask; 1165 for (uint64_t i = ccnt; i < pcnt; i++) { 1166 VDEV_RAIDZ_64MUL_2(q[i], mask); 1167 } 1168 } 1169 } 1170 } 1171 1172 static void 1173 vdev_raidz_generate_parity_pqr(raidz_row_t *rr) 1174 { 1175 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 1176 uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 1177 uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd); 1178 uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 1179 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 1180 rr->rr_col[VDEV_RAIDZ_Q].rc_size); 1181 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 1182 rr->rr_col[VDEV_RAIDZ_R].rc_size); 1183 1184 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1185 abd_t *src = rr->rr_col[c].rc_abd; 1186 1187 uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); 1188 1189 if (c == rr->rr_firstdatacol) { 1190 ASSERT(ccnt == pcnt || ccnt == 0); 1191 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 1192 (void) memcpy(q, p, rr->rr_col[c].rc_size); 1193 (void) memcpy(r, p, rr->rr_col[c].rc_size); 1194 1195 for (uint64_t i = ccnt; i < pcnt; i++) { 1196 p[i] = 0; 1197 q[i] = 0; 1198 r[i] = 0; 1199 } 1200 } else { 1201 struct pqr_struct pqr = { p, q, r }; 1202 1203 ASSERT(ccnt <= pcnt); 1204 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 1205 vdev_raidz_pqr_func, &pqr); 1206 1207 /* 1208 * Treat short columns as though they are full of 0s. 1209 * Note that there's therefore nothing needed for P. 1210 */ 1211 uint64_t mask; 1212 for (uint64_t i = ccnt; i < pcnt; i++) { 1213 VDEV_RAIDZ_64MUL_2(q[i], mask); 1214 VDEV_RAIDZ_64MUL_4(r[i], mask); 1215 } 1216 } 1217 } 1218 } 1219 1220 /* 1221 * Generate RAID parity in the first virtual columns according to the number of 1222 * parity columns available. 1223 */ 1224 void 1225 vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr) 1226 { 1227 if (rr->rr_cols == 0) { 1228 /* 1229 * We are handling this block one row at a time (because 1230 * this block has a different logical vs physical width, 1231 * due to RAIDZ expansion), and this is a pad-only row, 1232 * which has no parity. 
1233 */ 1234 return; 1235 } 1236 1237 /* Generate using the new math implementation */ 1238 if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL) 1239 return; 1240 1241 switch (rr->rr_firstdatacol) { 1242 case 1: 1243 vdev_raidz_generate_parity_p(rr); 1244 break; 1245 case 2: 1246 vdev_raidz_generate_parity_pq(rr); 1247 break; 1248 case 3: 1249 vdev_raidz_generate_parity_pqr(rr); 1250 break; 1251 default: 1252 cmn_err(CE_PANIC, "invalid RAID-Z configuration"); 1253 } 1254 } 1255 1256 void 1257 vdev_raidz_generate_parity(raidz_map_t *rm) 1258 { 1259 for (int i = 0; i < rm->rm_nrows; i++) { 1260 raidz_row_t *rr = rm->rm_row[i]; 1261 vdev_raidz_generate_parity_row(rm, rr); 1262 } 1263 } 1264 1265 static int 1266 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) 1267 { 1268 (void) private; 1269 uint64_t *dst = dbuf; 1270 uint64_t *src = sbuf; 1271 int cnt = size / sizeof (src[0]); 1272 1273 for (int i = 0; i < cnt; i++) { 1274 dst[i] ^= src[i]; 1275 } 1276 1277 return (0); 1278 } 1279 1280 static int 1281 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size, 1282 void *private) 1283 { 1284 (void) private; 1285 uint64_t *dst = dbuf; 1286 uint64_t *src = sbuf; 1287 uint64_t mask; 1288 int cnt = size / sizeof (dst[0]); 1289 1290 for (int i = 0; i < cnt; i++, dst++, src++) { 1291 VDEV_RAIDZ_64MUL_2(*dst, mask); 1292 *dst ^= *src; 1293 } 1294 1295 return (0); 1296 } 1297 1298 static int 1299 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private) 1300 { 1301 (void) private; 1302 uint64_t *dst = buf; 1303 uint64_t mask; 1304 int cnt = size / sizeof (dst[0]); 1305 1306 for (int i = 0; i < cnt; i++, dst++) { 1307 /* same operation as vdev_raidz_reconst_q_pre_func() on dst */ 1308 VDEV_RAIDZ_64MUL_2(*dst, mask); 1309 } 1310 1311 return (0); 1312 } 1313 1314 struct reconst_q_struct { 1315 uint64_t *q; 1316 int exp; 1317 }; 1318 1319 static int 1320 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private) 1321 { 1322 struct reconst_q_struct *rq = private; 1323 uint64_t *dst = buf; 1324 int cnt = size / sizeof (dst[0]); 1325 1326 for (int i = 0; i < cnt; i++, dst++, rq->q++) { 1327 int j; 1328 uint8_t *b; 1329 1330 *dst ^= *rq->q; 1331 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 1332 *b = vdev_raidz_exp2(*b, rq->exp); 1333 } 1334 } 1335 1336 return (0); 1337 } 1338 1339 struct reconst_pq_struct { 1340 uint8_t *p; 1341 uint8_t *q; 1342 uint8_t *pxy; 1343 uint8_t *qxy; 1344 int aexp; 1345 int bexp; 1346 }; 1347 1348 static int 1349 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private) 1350 { 1351 struct reconst_pq_struct *rpq = private; 1352 uint8_t *xd = xbuf; 1353 uint8_t *yd = ybuf; 1354 1355 for (int i = 0; i < size; 1356 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) { 1357 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 1358 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 1359 *yd = *rpq->p ^ *rpq->pxy ^ *xd; 1360 } 1361 1362 return (0); 1363 } 1364 1365 static int 1366 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) 1367 { 1368 struct reconst_pq_struct *rpq = private; 1369 uint8_t *xd = xbuf; 1370 1371 for (int i = 0; i < size; 1372 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) { 1373 /* same operation as vdev_raidz_reconst_pq_func() on xd */ 1374 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 1375 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 1376 } 1377 1378 return (0); 1379 } 1380 1381 static void 1382 
vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts) 1383 { 1384 int x = tgts[0]; 1385 abd_t *dst, *src; 1386 1387 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1388 zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x); 1389 1390 ASSERT3U(ntgts, ==, 1); 1391 ASSERT3U(x, >=, rr->rr_firstdatacol); 1392 ASSERT3U(x, <, rr->rr_cols); 1393 1394 ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size); 1395 1396 src = rr->rr_col[VDEV_RAIDZ_P].rc_abd; 1397 dst = rr->rr_col[x].rc_abd; 1398 1399 abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size); 1400 1401 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1402 uint64_t size = MIN(rr->rr_col[x].rc_size, 1403 rr->rr_col[c].rc_size); 1404 1405 src = rr->rr_col[c].rc_abd; 1406 1407 if (c == x) 1408 continue; 1409 1410 (void) abd_iterate_func2(dst, src, 0, 0, size, 1411 vdev_raidz_reconst_p_func, NULL); 1412 } 1413 } 1414 1415 static void 1416 vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts) 1417 { 1418 int x = tgts[0]; 1419 int c, exp; 1420 abd_t *dst, *src; 1421 1422 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1423 zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x); 1424 1425 ASSERT(ntgts == 1); 1426 1427 ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size); 1428 1429 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1430 uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size, 1431 rr->rr_col[c].rc_size); 1432 1433 src = rr->rr_col[c].rc_abd; 1434 dst = rr->rr_col[x].rc_abd; 1435 1436 if (c == rr->rr_firstdatacol) { 1437 abd_copy(dst, src, size); 1438 if (rr->rr_col[x].rc_size > size) { 1439 abd_zero_off(dst, size, 1440 rr->rr_col[x].rc_size - size); 1441 } 1442 } else { 1443 ASSERT3U(size, <=, rr->rr_col[x].rc_size); 1444 (void) abd_iterate_func2(dst, src, 0, 0, size, 1445 vdev_raidz_reconst_q_pre_func, NULL); 1446 (void) abd_iterate_func(dst, 1447 size, rr->rr_col[x].rc_size - size, 1448 vdev_raidz_reconst_q_pre_tail_func, NULL); 1449 } 1450 } 1451 1452 src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; 1453 dst = rr->rr_col[x].rc_abd; 1454 exp = 255 - (rr->rr_cols - 1 - x); 1455 1456 struct reconst_q_struct rq = { abd_to_buf(src), exp }; 1457 (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size, 1458 vdev_raidz_reconst_q_post_func, &rq); 1459 } 1460 1461 static void 1462 vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) 1463 { 1464 uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; 1465 abd_t *pdata, *qdata; 1466 uint64_t xsize, ysize; 1467 int x = tgts[0]; 1468 int y = tgts[1]; 1469 abd_t *xd, *yd; 1470 1471 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1472 zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y); 1473 1474 ASSERT(ntgts == 2); 1475 ASSERT(x < y); 1476 ASSERT(x >= rr->rr_firstdatacol); 1477 ASSERT(y < rr->rr_cols); 1478 1479 ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size); 1480 1481 /* 1482 * Move the parity data aside -- we're going to compute parity as 1483 * though columns x and y were full of zeros -- Pxy and Qxy. We want to 1484 * reuse the parity generation mechanism without trashing the actual 1485 * parity so we make those columns appear to be full of zeros by 1486 * setting their lengths to zero. 
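 * (Because field addition is XOR, removing a column's contribution is the
 * same operation as adding it back in; regenerating parity with x and y
 * forced to zero therefore yields the parity of every column except x and
 * y, which the algebra below combines with the saved P and Q to isolate
 * D_x and D_y.)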
1487 */ 1488 pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd; 1489 qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; 1490 xsize = rr->rr_col[x].rc_size; 1491 ysize = rr->rr_col[y].rc_size; 1492 1493 rr->rr_col[VDEV_RAIDZ_P].rc_abd = 1494 abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE); 1495 rr->rr_col[VDEV_RAIDZ_Q].rc_abd = 1496 abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); 1497 rr->rr_col[x].rc_size = 0; 1498 rr->rr_col[y].rc_size = 0; 1499 1500 vdev_raidz_generate_parity_pq(rr); 1501 1502 rr->rr_col[x].rc_size = xsize; 1503 rr->rr_col[y].rc_size = ysize; 1504 1505 p = abd_to_buf(pdata); 1506 q = abd_to_buf(qdata); 1507 pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 1508 qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 1509 xd = rr->rr_col[x].rc_abd; 1510 yd = rr->rr_col[y].rc_abd; 1511 1512 /* 1513 * We now have: 1514 * Pxy = P + D_x + D_y 1515 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 1516 * 1517 * We can then solve for D_x: 1518 * D_x = A * (P + Pxy) + B * (Q + Qxy) 1519 * where 1520 * A = 2^(x - y) * (2^(x - y) + 1)^-1 1521 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 1522 * 1523 * With D_x in hand, we can easily solve for D_y: 1524 * D_y = P + Pxy + D_x 1525 */ 1526 1527 a = vdev_raidz_pow2[255 + x - y]; 1528 b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)]; 1529 tmp = 255 - vdev_raidz_log2[a ^ 1]; 1530 1531 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 1532 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 1533 1534 ASSERT3U(xsize, >=, ysize); 1535 struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp }; 1536 1537 (void) abd_iterate_func2(xd, yd, 0, 0, ysize, 1538 vdev_raidz_reconst_pq_func, &rpq); 1539 (void) abd_iterate_func(xd, ysize, xsize - ysize, 1540 vdev_raidz_reconst_pq_tail_func, &rpq); 1541 1542 abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 1543 abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 1544 1545 /* 1546 * Restore the saved parity data. 1547 */ 1548 rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata; 1549 rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata; 1550 } 1551 1552 /* 1553 * In the general case of reconstruction, we must solve the system of linear 1554 * equations defined by the coefficients used to generate parity as well as 1555 * the contents of the data and parity disks. This can be expressed with 1556 * vectors for the original data (D) and the actual data (d) and parity (p) 1557 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 1558 * 1559 * __ __ __ __ 1560 * | | __ __ | p_0 | 1561 * | V | | D_0 | | p_m-1 | 1562 * | | x | : | = | d_0 | 1563 * | I | | D_n-1 | | : | 1564 * | | ~~ ~~ | d_n-1 | 1565 * ~~ ~~ ~~ ~~ 1566 * 1567 * I is simply a square identity matrix of size n, and V is a vandermonde 1568 * matrix defined by the coefficients we chose for the various parity columns 1569 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 1570 * computation as well as linear separability. 1571 * 1572 * __ __ __ __ 1573 * | 1 .. 1 1 1 | | p_0 | 1574 * | 2^n-1 .. 4 2 1 | __ __ | : | 1575 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 1576 * | 1 .. 0 0 0 | | D_1 | | d_0 | 1577 * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 1578 * | : : : : | | : | | d_2 | 1579 * | 0 .. 1 0 0 | | D_n-1 | | : | 1580 * | 0 .. 0 1 0 | ~~ ~~ | : | 1581 * | 0 .. 0 0 1 | | d_n-1 | 1582 * ~~ ~~ ~~ ~~ 1583 * 1584 * Note that I, V, d, and p are known. To compute D, we must invert the 1585 * matrix and use the known data and parity values to reconstruct the unknown 1586 * data values. 
We begin by removing the rows in V|I and d|p that correspond 1587 * to failed or missing columns; we then make V|I square (n x n) and d|p 1588 * sized n by removing rows corresponding to unused parity from the bottom up 1589 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 1590 * using Gauss-Jordan elimination. In the example below we use m=3 parity 1591 * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 1592 * __ __ 1593 * | 1 1 1 1 1 1 1 1 | 1594 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 1595 * | 19 205 116 29 64 16 4 1 | / / 1596 * | 1 0 0 0 0 0 0 0 | / / 1597 * | 0 1 0 0 0 0 0 0 | <--' / 1598 * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 1599 * | 0 0 0 1 0 0 0 0 | 1600 * | 0 0 0 0 1 0 0 0 | 1601 * | 0 0 0 0 0 1 0 0 | 1602 * | 0 0 0 0 0 0 1 0 | 1603 * | 0 0 0 0 0 0 0 1 | 1604 * ~~ ~~ 1605 * __ __ 1606 * | 1 1 1 1 1 1 1 1 | 1607 * | 128 64 32 16 8 4 2 1 | 1608 * | 19 205 116 29 64 16 4 1 | 1609 * | 1 0 0 0 0 0 0 0 | 1610 * | 0 1 0 0 0 0 0 0 | 1611 * (V|I)' = | 0 0 1 0 0 0 0 0 | 1612 * | 0 0 0 1 0 0 0 0 | 1613 * | 0 0 0 0 1 0 0 0 | 1614 * | 0 0 0 0 0 1 0 0 | 1615 * | 0 0 0 0 0 0 1 0 | 1616 * | 0 0 0 0 0 0 0 1 | 1617 * ~~ ~~ 1618 * 1619 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 1620 * have carefully chosen the seed values 1, 2, and 4 to ensure that this 1621 * matrix is not singular. 1622 * __ __ 1623 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1624 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1625 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1626 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1627 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1628 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1629 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1630 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1631 * ~~ ~~ 1632 * __ __ 1633 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1634 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1635 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1636 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1637 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1638 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1639 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1640 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1641 * ~~ ~~ 1642 * __ __ 1643 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1644 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1645 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 1646 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1647 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1648 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1649 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1650 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1651 * ~~ ~~ 1652 * __ __ 1653 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1654 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1655 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 1656 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1657 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1658 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1659 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1660 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1661 * ~~ ~~ 1662 * __ __ 1663 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1664 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1665 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1666 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1667 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1668 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1669 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1670 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1671 * ~~ ~~ 1672 * __ __ 1673 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1674 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 1675 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1676 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1677 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1678 * | 0 0 0 0 0 
1 0 0 0 0 0 0 0 1 0 0 | 1679 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1680 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1681 * ~~ ~~ 1682 * __ __ 1683 * | 0 0 1 0 0 0 0 0 | 1684 * | 167 100 5 41 159 169 217 208 | 1685 * | 166 100 4 40 158 168 216 209 | 1686 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 1687 * | 0 0 0 0 1 0 0 0 | 1688 * | 0 0 0 0 0 1 0 0 | 1689 * | 0 0 0 0 0 0 1 0 | 1690 * | 0 0 0 0 0 0 0 1 | 1691 * ~~ ~~ 1692 * 1693 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 1694 * of the missing data. 1695 * 1696 * As is apparent from the example above, the only non-trivial rows in the 1697 * inverse matrix correspond to the data disks that we're trying to 1698 * reconstruct. Indeed, those are the only rows we need as the others would 1699 * only be useful for reconstructing data known or assumed to be valid. For 1700 * that reason, we only build the coefficients in the rows that correspond to 1701 * targeted columns. 1702 */ 1703 1704 static void 1705 vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map, 1706 uint8_t **rows) 1707 { 1708 int i, j; 1709 int pow; 1710 1711 ASSERT(n == rr->rr_cols - rr->rr_firstdatacol); 1712 1713 /* 1714 * Fill in the missing rows of interest. 1715 */ 1716 for (i = 0; i < nmap; i++) { 1717 ASSERT3S(0, <=, map[i]); 1718 ASSERT3S(map[i], <=, 2); 1719 1720 pow = map[i] * n; 1721 if (pow > 255) 1722 pow -= 255; 1723 ASSERT(pow <= 255); 1724 1725 for (j = 0; j < n; j++) { 1726 pow -= map[i]; 1727 if (pow < 0) 1728 pow += 255; 1729 rows[i][j] = vdev_raidz_pow2[pow]; 1730 } 1731 } 1732 } 1733 1734 static void 1735 vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing, 1736 uint8_t **rows, uint8_t **invrows, const uint8_t *used) 1737 { 1738 int i, j, ii, jj; 1739 uint8_t log; 1740 1741 /* 1742 * Assert that the first nmissing entries from the array of used 1743 * columns correspond to parity columns and that subsequent entries 1744 * correspond to data columns. 1745 */ 1746 for (i = 0; i < nmissing; i++) { 1747 ASSERT3S(used[i], <, rr->rr_firstdatacol); 1748 } 1749 for (; i < n; i++) { 1750 ASSERT3S(used[i], >=, rr->rr_firstdatacol); 1751 } 1752 1753 /* 1754 * First initialize the storage where we'll compute the inverse rows. 1755 */ 1756 for (i = 0; i < nmissing; i++) { 1757 for (j = 0; j < n; j++) { 1758 invrows[i][j] = (i == j) ? 1 : 0; 1759 } 1760 } 1761 1762 /* 1763 * Subtract all trivial rows from the rows of consequence. 1764 */ 1765 for (i = 0; i < nmissing; i++) { 1766 for (j = nmissing; j < n; j++) { 1767 ASSERT3U(used[j], >=, rr->rr_firstdatacol); 1768 jj = used[j] - rr->rr_firstdatacol; 1769 ASSERT3S(jj, <, n); 1770 invrows[i][j] = rows[i][jj]; 1771 rows[i][jj] = 0; 1772 } 1773 } 1774 1775 /* 1776 * For each of the rows of interest, we must normalize it and subtract 1777 * a multiple of it from the other rows. 1778 */ 1779 for (i = 0; i < nmissing; i++) { 1780 for (j = 0; j < missing[i]; j++) { 1781 ASSERT0(rows[i][j]); 1782 } 1783 ASSERT3U(rows[i][missing[i]], !=, 0); 1784 1785 /* 1786 * Compute the inverse of the first element and multiply each 1787 * element in the row by that value. 
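 *
 * (Illustrative note: since the nonzero field elements form a cyclic
 * multiplicative group of order 255, "255 - log2(pivot)" below is the
 * log of the pivot's inverse, so vdev_raidz_exp2(x, log) effectively
 * divides x by the pivot.  E.g. a pivot of 2 scales the row by
 * pow2[254], and 2 * 2^254 = 2^255 = 2^0 = 1, normalizing the pivot
 * element of this row to 1.)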
1788 */ 1789 log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 1790 1791 for (j = 0; j < n; j++) { 1792 rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 1793 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 1794 } 1795 1796 for (ii = 0; ii < nmissing; ii++) { 1797 if (i == ii) 1798 continue; 1799 1800 ASSERT3U(rows[ii][missing[i]], !=, 0); 1801 1802 log = vdev_raidz_log2[rows[ii][missing[i]]]; 1803 1804 for (j = 0; j < n; j++) { 1805 rows[ii][j] ^= 1806 vdev_raidz_exp2(rows[i][j], log); 1807 invrows[ii][j] ^= 1808 vdev_raidz_exp2(invrows[i][j], log); 1809 } 1810 } 1811 } 1812 1813 /* 1814 * Verify that the data that is left in the rows are properly part of 1815 * an identity matrix. 1816 */ 1817 for (i = 0; i < nmissing; i++) { 1818 for (j = 0; j < n; j++) { 1819 if (j == missing[i]) { 1820 ASSERT3U(rows[i][j], ==, 1); 1821 } else { 1822 ASSERT0(rows[i][j]); 1823 } 1824 } 1825 } 1826 } 1827 1828 static void 1829 vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing, 1830 int *missing, uint8_t **invrows, const uint8_t *used) 1831 { 1832 int i, j, x, cc, c; 1833 uint8_t *src; 1834 uint64_t ccount; 1835 uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL }; 1836 uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 }; 1837 uint8_t log = 0; 1838 uint8_t val; 1839 int ll; 1840 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 1841 uint8_t *p, *pp; 1842 size_t psize; 1843 1844 psize = sizeof (invlog[0][0]) * n * nmissing; 1845 p = kmem_alloc(psize, KM_SLEEP); 1846 1847 for (pp = p, i = 0; i < nmissing; i++) { 1848 invlog[i] = pp; 1849 pp += n; 1850 } 1851 1852 for (i = 0; i < nmissing; i++) { 1853 for (j = 0; j < n; j++) { 1854 ASSERT3U(invrows[i][j], !=, 0); 1855 invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 1856 } 1857 } 1858 1859 for (i = 0; i < n; i++) { 1860 c = used[i]; 1861 ASSERT3U(c, <, rr->rr_cols); 1862 1863 ccount = rr->rr_col[c].rc_size; 1864 ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0); 1865 if (ccount == 0) 1866 continue; 1867 src = abd_to_buf(rr->rr_col[c].rc_abd); 1868 for (j = 0; j < nmissing; j++) { 1869 cc = missing[j] + rr->rr_firstdatacol; 1870 ASSERT3U(cc, >=, rr->rr_firstdatacol); 1871 ASSERT3U(cc, <, rr->rr_cols); 1872 ASSERT3U(cc, !=, c); 1873 1874 dcount[j] = rr->rr_col[cc].rc_size; 1875 if (dcount[j] != 0) 1876 dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd); 1877 } 1878 1879 for (x = 0; x < ccount; x++, src++) { 1880 if (*src != 0) 1881 log = vdev_raidz_log2[*src]; 1882 1883 for (cc = 0; cc < nmissing; cc++) { 1884 if (x >= dcount[cc]) 1885 continue; 1886 1887 if (*src == 0) { 1888 val = 0; 1889 } else { 1890 if ((ll = log + invlog[cc][i]) >= 255) 1891 ll -= 255; 1892 val = vdev_raidz_pow2[ll]; 1893 } 1894 1895 if (i == 0) 1896 dst[cc][x] = val; 1897 else 1898 dst[cc][x] ^= val; 1899 } 1900 } 1901 } 1902 1903 kmem_free(p, psize); 1904 } 1905 1906 static void 1907 vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) 1908 { 1909 int i, c, t, tt; 1910 unsigned int n; 1911 unsigned int nmissing_rows; 1912 int missing_rows[VDEV_RAIDZ_MAXPARITY]; 1913 int parity_map[VDEV_RAIDZ_MAXPARITY]; 1914 uint8_t *p, *pp; 1915 size_t psize; 1916 uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 1917 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 1918 uint8_t *used; 1919 1920 abd_t **bufs = NULL; 1921 1922 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1923 zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts); 1924 /* 1925 * Matrix reconstruction can't use scatter ABDs yet, so we allocate 1926 * temporary linear ABDs if any non-linear ABDs are found. 
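 * The conversion is transparent to the math: the original ABDs are
 * stashed in a temporary array, their contents are copied into freshly
 * allocated linear ABDs for the duration of the reconstruction, and
 * the results are copied back (and the temporaries freed) at the end
 * of this function.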
1927 */ 1928 for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) { 1929 ASSERT(rr->rr_col[i].rc_abd != NULL); 1930 if (!abd_is_linear(rr->rr_col[i].rc_abd)) { 1931 bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *), 1932 KM_PUSHPAGE); 1933 1934 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1935 raidz_col_t *col = &rr->rr_col[c]; 1936 1937 bufs[c] = col->rc_abd; 1938 if (bufs[c] != NULL) { 1939 col->rc_abd = abd_alloc_linear( 1940 col->rc_size, B_TRUE); 1941 abd_copy(col->rc_abd, bufs[c], 1942 col->rc_size); 1943 } 1944 } 1945 1946 break; 1947 } 1948 } 1949 1950 n = rr->rr_cols - rr->rr_firstdatacol; 1951 1952 /* 1953 * Figure out which data columns are missing. 1954 */ 1955 nmissing_rows = 0; 1956 for (t = 0; t < ntgts; t++) { 1957 if (tgts[t] >= rr->rr_firstdatacol) { 1958 missing_rows[nmissing_rows++] = 1959 tgts[t] - rr->rr_firstdatacol; 1960 } 1961 } 1962 1963 /* 1964 * Figure out which parity columns to use to help generate the missing 1965 * data columns. 1966 */ 1967 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 1968 ASSERT(tt < ntgts); 1969 ASSERT(c < rr->rr_firstdatacol); 1970 1971 /* 1972 * Skip any targeted parity columns. 1973 */ 1974 if (c == tgts[tt]) { 1975 tt++; 1976 continue; 1977 } 1978 1979 parity_map[i] = c; 1980 i++; 1981 } 1982 1983 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1984 nmissing_rows * n + sizeof (used[0]) * n; 1985 p = kmem_alloc(psize, KM_SLEEP); 1986 1987 for (pp = p, i = 0; i < nmissing_rows; i++) { 1988 rows[i] = pp; 1989 pp += n; 1990 invrows[i] = pp; 1991 pp += n; 1992 } 1993 used = pp; 1994 1995 for (i = 0; i < nmissing_rows; i++) { 1996 used[i] = parity_map[i]; 1997 } 1998 1999 for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 2000 if (tt < nmissing_rows && 2001 c == missing_rows[tt] + rr->rr_firstdatacol) { 2002 tt++; 2003 continue; 2004 } 2005 2006 ASSERT3S(i, <, n); 2007 used[i] = c; 2008 i++; 2009 } 2010 2011 /* 2012 * Initialize the interesting rows of the matrix. 2013 */ 2014 vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows); 2015 2016 /* 2017 * Invert the matrix. 2018 */ 2019 vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows, 2020 invrows, used); 2021 2022 /* 2023 * Reconstruct the missing data using the generated matrix. 
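 * Conceptually this is the D = (V|I)'^-1 x (d|p)' step from the
 * comment block above, evaluated a byte at a time in the log domain;
 * only the inverse rows corresponding to the targeted data columns
 * are actually computed and applied.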
2024 */ 2025 vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows, 2026 invrows, used); 2027 2028 kmem_free(p, psize); 2029 2030 /* 2031 * copy back from temporary linear abds and free them 2032 */ 2033 if (bufs) { 2034 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 2035 raidz_col_t *col = &rr->rr_col[c]; 2036 2037 if (bufs[c] != NULL) { 2038 abd_copy(bufs[c], col->rc_abd, col->rc_size); 2039 abd_free(col->rc_abd); 2040 } 2041 col->rc_abd = bufs[c]; 2042 } 2043 kmem_free(bufs, rr->rr_cols * sizeof (abd_t *)); 2044 } 2045 } 2046 2047 static void 2048 vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr, 2049 const int *t, int nt) 2050 { 2051 int tgts[VDEV_RAIDZ_MAXPARITY], *dt; 2052 int ntgts; 2053 int i, c, ret; 2054 int nbadparity, nbaddata; 2055 int parity_valid[VDEV_RAIDZ_MAXPARITY]; 2056 2057 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { 2058 zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)", 2059 rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata, 2060 (int)rr->rr_missingparity); 2061 } 2062 2063 nbadparity = rr->rr_firstdatacol; 2064 nbaddata = rr->rr_cols - nbadparity; 2065 ntgts = 0; 2066 for (i = 0, c = 0; c < rr->rr_cols; c++) { 2067 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { 2068 zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u " 2069 "offset=%llx error=%u)", 2070 rr, c, (int)rr->rr_col[c].rc_devidx, 2071 (long long)rr->rr_col[c].rc_offset, 2072 (int)rr->rr_col[c].rc_error); 2073 } 2074 if (c < rr->rr_firstdatacol) 2075 parity_valid[c] = B_FALSE; 2076 2077 if (i < nt && c == t[i]) { 2078 tgts[ntgts++] = c; 2079 i++; 2080 } else if (rr->rr_col[c].rc_error != 0) { 2081 tgts[ntgts++] = c; 2082 } else if (c >= rr->rr_firstdatacol) { 2083 nbaddata--; 2084 } else { 2085 parity_valid[c] = B_TRUE; 2086 nbadparity--; 2087 } 2088 } 2089 2090 ASSERT(ntgts >= nt); 2091 ASSERT(nbaddata >= 0); 2092 ASSERT(nbaddata + nbadparity == ntgts); 2093 2094 dt = &tgts[nbadparity]; 2095 2096 /* Reconstruct using the new math implementation */ 2097 ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata); 2098 if (ret != RAIDZ_ORIGINAL_IMPL) 2099 return; 2100 2101 /* 2102 * See if we can use any of our optimized reconstruction routines. 
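 * The fast paths below handle one missing data column via P (plain
 * XOR) or, failing that, via Q, and two missing data columns when
 * both P and Q are intact.  Anything else (for example, a rebuild
 * that must involve the R parity) falls through to the general
 * matrix inversion in vdev_raidz_reconstruct_general().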
2103 */ 2104 switch (nbaddata) { 2105 case 1: 2106 if (parity_valid[VDEV_RAIDZ_P]) { 2107 vdev_raidz_reconstruct_p(rr, dt, 1); 2108 return; 2109 } 2110 2111 ASSERT(rr->rr_firstdatacol > 1); 2112 2113 if (parity_valid[VDEV_RAIDZ_Q]) { 2114 vdev_raidz_reconstruct_q(rr, dt, 1); 2115 return; 2116 } 2117 2118 ASSERT(rr->rr_firstdatacol > 2); 2119 break; 2120 2121 case 2: 2122 ASSERT(rr->rr_firstdatacol > 1); 2123 2124 if (parity_valid[VDEV_RAIDZ_P] && 2125 parity_valid[VDEV_RAIDZ_Q]) { 2126 vdev_raidz_reconstruct_pq(rr, dt, 2); 2127 return; 2128 } 2129 2130 ASSERT(rr->rr_firstdatacol > 2); 2131 2132 break; 2133 } 2134 2135 vdev_raidz_reconstruct_general(rr, tgts, ntgts); 2136 } 2137 2138 static int 2139 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, 2140 uint64_t *logical_ashift, uint64_t *physical_ashift) 2141 { 2142 vdev_raidz_t *vdrz = vd->vdev_tsd; 2143 uint64_t nparity = vdrz->vd_nparity; 2144 int c; 2145 int lasterror = 0; 2146 int numerrors = 0; 2147 2148 ASSERT(nparity > 0); 2149 2150 if (nparity > VDEV_RAIDZ_MAXPARITY || 2151 vd->vdev_children < nparity + 1) { 2152 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 2153 return (SET_ERROR(EINVAL)); 2154 } 2155 2156 vdev_open_children(vd); 2157 2158 for (c = 0; c < vd->vdev_children; c++) { 2159 vdev_t *cvd = vd->vdev_child[c]; 2160 2161 if (cvd->vdev_open_error != 0) { 2162 lasterror = cvd->vdev_open_error; 2163 numerrors++; 2164 continue; 2165 } 2166 2167 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 2168 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; 2169 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); 2170 } 2171 for (c = 0; c < vd->vdev_children; c++) { 2172 vdev_t *cvd = vd->vdev_child[c]; 2173 2174 if (cvd->vdev_open_error != 0) 2175 continue; 2176 *physical_ashift = vdev_best_ashift(*logical_ashift, 2177 *physical_ashift, cvd->vdev_physical_ashift); 2178 } 2179 2180 if (vd->vdev_rz_expanding) { 2181 *asize *= vd->vdev_children - 1; 2182 *max_asize *= vd->vdev_children - 1; 2183 2184 vd->vdev_min_asize = *asize; 2185 } else { 2186 *asize *= vd->vdev_children; 2187 *max_asize *= vd->vdev_children; 2188 } 2189 2190 if (numerrors > nparity) { 2191 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 2192 return (lasterror); 2193 } 2194 2195 return (0); 2196 } 2197 2198 static void 2199 vdev_raidz_close(vdev_t *vd) 2200 { 2201 for (int c = 0; c < vd->vdev_children; c++) { 2202 if (vd->vdev_child[c] != NULL) 2203 vdev_close(vd->vdev_child[c]); 2204 } 2205 } 2206 2207 /* 2208 * Return the logical width to use, given the txg in which the allocation 2209 * happened. Note that BP_GET_BIRTH() is usually the txg in which the 2210 * BP was allocated. Remapped BP's (that were relocated due to device 2211 * removal, see remap_blkptr_cb()), will have a more recent physical birth 2212 * which reflects when the BP was relocated, but we can ignore these because 2213 * they can't be on RAIDZ (device removal doesn't support RAIDZ). 
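 * As a sketch of the lookup below (txg numbers are hypothetical): if
 * an expansion was recorded with re_txg = 1000, a block born in txg
 * 1100 is laid out with the expanded width recorded in that node,
 * while a block born in txg 900, with no recorded expansion at or
 * before its birth txg, falls back to vd_original_width.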
2214 */ 2215 static uint64_t 2216 vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg) 2217 { 2218 reflow_node_t lookup = { 2219 .re_txg = txg, 2220 }; 2221 avl_index_t where; 2222 2223 uint64_t width; 2224 mutex_enter(&vdrz->vd_expand_lock); 2225 reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where); 2226 if (re != NULL) { 2227 width = re->re_logical_width; 2228 } else { 2229 re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE); 2230 if (re != NULL) 2231 width = re->re_logical_width; 2232 else 2233 width = vdrz->vd_original_width; 2234 } 2235 mutex_exit(&vdrz->vd_expand_lock); 2236 return (width); 2237 } 2238 /* 2239 * This code converts an asize into the largest psize that can safely be written 2240 * to an allocation of that size for this vdev. 2241 * 2242 * Note that this function will not take into account the effect of gang 2243 * headers, which also modify the ASIZE of the DVAs. It is purely a reverse of 2244 * the psize_to_asize function. 2245 */ 2246 static uint64_t 2247 vdev_raidz_asize_to_psize(vdev_t *vd, uint64_t asize, uint64_t txg) 2248 { 2249 vdev_raidz_t *vdrz = vd->vdev_tsd; 2250 uint64_t psize; 2251 uint64_t ashift = vd->vdev_top->vdev_ashift; 2252 uint64_t cols = vdrz->vd_original_width; 2253 uint64_t nparity = vdrz->vd_nparity; 2254 2255 cols = vdev_raidz_get_logical_width(vdrz, txg); 2256 2257 ASSERT0(asize % (1 << ashift)); 2258 2259 psize = (asize >> ashift); 2260 /* 2261 * If the roundup to nparity + 1 caused us to spill into a new row, we 2262 * need to ignore that row entirely (since it can't store data or 2263 * parity). 2264 */ 2265 uint64_t rows = psize / cols; 2266 psize = psize - (rows * cols) <= nparity ? rows * cols : psize; 2267 /* Subtract out parity sectors for each row storing data. */ 2268 psize -= nparity * DIV_ROUND_UP(psize, cols); 2269 psize <<= ashift; 2270 2271 return (psize); 2272 } 2273 2274 /* 2275 * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated 2276 * more space due to the lower data-to-parity ratio. In this case it's 2277 * important to pass in the correct txg. Note that vdev_gang_header_asize() 2278 * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE, 2279 * regardless of txg. This is assured because for a single data sector, we 2280 * allocate P+1 sectors regardless of width ("cols", which is at least P+1). 2281 */ 2282 static uint64_t 2283 vdev_raidz_psize_to_asize(vdev_t *vd, uint64_t psize, uint64_t txg) 2284 { 2285 vdev_raidz_t *vdrz = vd->vdev_tsd; 2286 uint64_t asize; 2287 uint64_t ashift = vd->vdev_top->vdev_ashift; 2288 uint64_t cols = vdrz->vd_original_width; 2289 uint64_t nparity = vdrz->vd_nparity; 2290 2291 cols = vdev_raidz_get_logical_width(vdrz, txg); 2292 2293 asize = ((psize - 1) >> ashift) + 1; 2294 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 2295 asize = roundup(asize, nparity + 1) << ashift; 2296 2297 #ifdef ZFS_DEBUG 2298 uint64_t asize_new = ((psize - 1) >> ashift) + 1; 2299 uint64_t ncols_new = vdrz->vd_physical_width; 2300 asize_new += nparity * ((asize_new + ncols_new - nparity - 1) / 2301 (ncols_new - nparity)); 2302 asize_new = roundup(asize_new, nparity + 1) << ashift; 2303 VERIFY3U(asize_new, <=, asize); 2304 #endif 2305 2306 return (asize); 2307 } 2308 2309 /* 2310 * The allocatable space for a raidz vdev is N * sizeof(smallest child) 2311 * so each child must provide at least 1/Nth of its asize. 
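 * For example (illustrative numbers): a 6-child raidz vdev with
 * vdev_min_asize = 6T needs at least 1T from every child; the
 * round-up below only avoids losing a sector to integer truncation
 * when vdev_min_asize is not an exact multiple of the child count.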
2312 */ 2313 static uint64_t 2314 vdev_raidz_min_asize(vdev_t *vd) 2315 { 2316 return ((vd->vdev_min_asize + vd->vdev_children - 1) / 2317 vd->vdev_children); 2318 } 2319 2320 void 2321 vdev_raidz_child_done(zio_t *zio) 2322 { 2323 raidz_col_t *rc = zio->io_private; 2324 2325 ASSERT3P(rc->rc_abd, !=, NULL); 2326 rc->rc_error = zio->io_error; 2327 rc->rc_tried = 1; 2328 rc->rc_skipped = 0; 2329 } 2330 2331 static void 2332 vdev_raidz_shadow_child_done(zio_t *zio) 2333 { 2334 raidz_col_t *rc = zio->io_private; 2335 2336 rc->rc_shadow_error = zio->io_error; 2337 } 2338 2339 static void 2340 vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col) 2341 { 2342 (void) rm; 2343 #ifdef ZFS_DEBUG 2344 zfs_range_seg64_t logical_rs, physical_rs, remain_rs; 2345 logical_rs.rs_start = rr->rr_offset; 2346 logical_rs.rs_end = logical_rs.rs_start + 2347 vdev_raidz_psize_to_asize(zio->io_vd, rr->rr_size, 2348 BP_GET_BIRTH(zio->io_bp)); 2349 2350 raidz_col_t *rc = &rr->rr_col[col]; 2351 vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; 2352 2353 vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs); 2354 ASSERT(vdev_xlate_is_empty(&remain_rs)); 2355 if (vdev_xlate_is_empty(&physical_rs)) { 2356 /* 2357 * If we are in the middle of expansion, the 2358 * physical->logical mapping is changing so vdev_xlate() 2359 * can't give us a reliable answer. 2360 */ 2361 return; 2362 } 2363 ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); 2364 ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); 2365 /* 2366 * It would be nice to assert that rs_end is equal 2367 * to rc_offset + rc_size but there might be an 2368 * optional I/O at the end that is not accounted in 2369 * rc_size. 2370 */ 2371 if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { 2372 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + 2373 rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift)); 2374 } else { 2375 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); 2376 } 2377 #endif 2378 } 2379 2380 static void 2381 vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr) 2382 { 2383 vdev_t *vd = zio->io_vd; 2384 raidz_map_t *rm = zio->io_vsd; 2385 2386 vdev_raidz_generate_parity_row(rm, rr); 2387 2388 for (int c = 0; c < rr->rr_scols; c++) { 2389 raidz_col_t *rc = &rr->rr_col[c]; 2390 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 2391 2392 /* Verify physical to logical translation */ 2393 vdev_raidz_io_verify(zio, rm, rr, c); 2394 2395 if (rc->rc_size == 0) 2396 continue; 2397 2398 ASSERT3U(rc->rc_offset + rc->rc_size, <, 2399 cvd->vdev_psize - VDEV_LABEL_END_SIZE); 2400 2401 ASSERT3P(rc->rc_abd, !=, NULL); 2402 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2403 rc->rc_offset, rc->rc_abd, 2404 abd_get_size(rc->rc_abd), zio->io_type, 2405 zio->io_priority, 0, vdev_raidz_child_done, rc)); 2406 2407 if (rc->rc_shadow_devidx != INT_MAX) { 2408 vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx]; 2409 2410 ASSERT3U( 2411 rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <, 2412 cvd2->vdev_psize - VDEV_LABEL_END_SIZE); 2413 2414 zio_nowait(zio_vdev_child_io(zio, NULL, cvd2, 2415 rc->rc_shadow_offset, rc->rc_abd, 2416 abd_get_size(rc->rc_abd), 2417 zio->io_type, zio->io_priority, 0, 2418 vdev_raidz_shadow_child_done, rc)); 2419 } 2420 } 2421 } 2422 2423 /* 2424 * Generate optional I/Os for skip sectors to improve aggregation contiguity. 2425 * This only works for vdev_raidz_map_alloc() (not _expanded()). 
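 * Skip sectors are the padding that rounds each allocation up to a
 * multiple of (nparity + 1) sectors; they carry no data or parity.
 * The single-sector NODATA|OPTIONAL child writes issued below give
 * the I/O aggregation code permission to fill those gaps, letting
 * adjacent writes to the same child be merged into one larger I/O.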
2426 */ 2427 static void 2428 raidz_start_skip_writes(zio_t *zio) 2429 { 2430 vdev_t *vd = zio->io_vd; 2431 uint64_t ashift = vd->vdev_top->vdev_ashift; 2432 raidz_map_t *rm = zio->io_vsd; 2433 ASSERT3U(rm->rm_nrows, ==, 1); 2434 raidz_row_t *rr = rm->rm_row[0]; 2435 for (int c = 0; c < rr->rr_scols; c++) { 2436 raidz_col_t *rc = &rr->rr_col[c]; 2437 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 2438 if (rc->rc_size != 0) 2439 continue; 2440 ASSERT3P(rc->rc_abd, ==, NULL); 2441 2442 ASSERT3U(rc->rc_offset, <, 2443 cvd->vdev_psize - VDEV_LABEL_END_SIZE); 2444 2445 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, 2446 NULL, 1ULL << ashift, zio->io_type, zio->io_priority, 2447 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); 2448 } 2449 } 2450 2451 static void 2452 vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) 2453 { 2454 vdev_t *vd = zio->io_vd; 2455 2456 /* 2457 * Iterate over the columns in reverse order so that we hit the parity 2458 * last -- any errors along the way will force us to read the parity. 2459 */ 2460 for (int c = rr->rr_cols - 1; c >= 0; c--) { 2461 raidz_col_t *rc = &rr->rr_col[c]; 2462 if (rc->rc_size == 0) 2463 continue; 2464 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 2465 if (!vdev_readable(cvd)) { 2466 if (c >= rr->rr_firstdatacol) 2467 rr->rr_missingdata++; 2468 else 2469 rr->rr_missingparity++; 2470 rc->rc_error = SET_ERROR(ENXIO); 2471 rc->rc_tried = 1; /* don't even try */ 2472 rc->rc_skipped = 1; 2473 continue; 2474 } 2475 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 2476 if (c >= rr->rr_firstdatacol) 2477 rr->rr_missingdata++; 2478 else 2479 rr->rr_missingparity++; 2480 rc->rc_error = SET_ERROR(ESTALE); 2481 rc->rc_skipped = 1; 2482 continue; 2483 } 2484 if (forceparity || 2485 c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || 2486 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { 2487 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2488 rc->rc_offset, rc->rc_abd, rc->rc_size, 2489 zio->io_type, zio->io_priority, 0, 2490 vdev_raidz_child_done, rc)); 2491 } 2492 } 2493 } 2494 2495 static void 2496 vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm) 2497 { 2498 vdev_t *vd = zio->io_vd; 2499 2500 for (int i = 0; i < rm->rm_nphys_cols; i++) { 2501 raidz_col_t *prc = &rm->rm_phys_col[i]; 2502 if (prc->rc_size == 0) 2503 continue; 2504 2505 ASSERT3U(prc->rc_devidx, ==, i); 2506 vdev_t *cvd = vd->vdev_child[i]; 2507 if (!vdev_readable(cvd)) { 2508 prc->rc_error = SET_ERROR(ENXIO); 2509 prc->rc_tried = 1; /* don't even try */ 2510 prc->rc_skipped = 1; 2511 continue; 2512 } 2513 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 2514 prc->rc_error = SET_ERROR(ESTALE); 2515 prc->rc_skipped = 1; 2516 continue; 2517 } 2518 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2519 prc->rc_offset, prc->rc_abd, prc->rc_size, 2520 zio->io_type, zio->io_priority, 0, 2521 vdev_raidz_child_done, prc)); 2522 } 2523 } 2524 2525 static void 2526 vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm) 2527 { 2528 /* 2529 * If there are multiple rows, we will be hitting 2530 * all disks, so go ahead and read the parity so 2531 * that we are reading in decent size chunks. 
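 * (A multi-row map only comes from the expanded allocation path,
 * where a single zio spans several physical rows.  With the parity
 * read up front, a data column that later proves bad can usually be
 * reconstructed without issuing another round of small reads.)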
2532 */ 2533 boolean_t forceparity = rm->rm_nrows > 1; 2534 2535 if (rm->rm_phys_col) { 2536 vdev_raidz_io_start_read_phys_cols(zio, rm); 2537 } else { 2538 for (int i = 0; i < rm->rm_nrows; i++) { 2539 raidz_row_t *rr = rm->rm_row[i]; 2540 vdev_raidz_io_start_read_row(zio, rr, forceparity); 2541 } 2542 } 2543 } 2544 2545 /* 2546 * Start an IO operation on a RAIDZ VDev 2547 * 2548 * Outline: 2549 * - For write operations: 2550 * 1. Generate the parity data 2551 * 2. Create child zio write operations to each column's vdev, for both 2552 * data and parity. 2553 * 3. If the column skips any sectors for padding, create optional dummy 2554 * write zio children for those areas to improve aggregation continuity. 2555 * - For read operations: 2556 * 1. Create child zio read operations to each data column's vdev to read 2557 * the range of data required for zio. 2558 * 2. If this is a scrub or resilver operation, or if any of the data 2559 * vdevs have had errors, then create zio read operations to the parity 2560 * columns' VDevs as well. 2561 */ 2562 static void 2563 vdev_raidz_io_start(zio_t *zio) 2564 { 2565 vdev_t *vd = zio->io_vd; 2566 vdev_t *tvd = vd->vdev_top; 2567 vdev_raidz_t *vdrz = vd->vdev_tsd; 2568 raidz_map_t *rm; 2569 2570 uint64_t logical_width = vdev_raidz_get_logical_width(vdrz, 2571 BP_GET_BIRTH(zio->io_bp)); 2572 if (logical_width != vdrz->vd_physical_width) { 2573 zfs_locked_range_t *lr = NULL; 2574 uint64_t synced_offset = UINT64_MAX; 2575 uint64_t next_offset = UINT64_MAX; 2576 boolean_t use_scratch = B_FALSE; 2577 /* 2578 * Note: when the expansion is completing, we set 2579 * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync()) 2580 * in a later txg than when we last update spa_ubsync's state 2581 * (see the end of spa_raidz_expand_thread()). Therefore we 2582 * may see vre_state!=SCANNING before 2583 * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected 2584 * on disk, but the copying progress has been synced to disk 2585 * (and reflected in spa_ubsync). In this case it's fine to 2586 * treat the expansion as completed, since if we crash there's 2587 * no additional copying to do. 2588 */ 2589 if (vdrz->vn_vre.vre_state == DSS_SCANNING) { 2590 ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==, 2591 &vdrz->vn_vre); 2592 lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock, 2593 zio->io_offset, zio->io_size, RL_READER); 2594 use_scratch = 2595 (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) == 2596 RRSS_SCRATCH_VALID); 2597 synced_offset = 2598 RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync); 2599 next_offset = vdrz->vn_vre.vre_offset; 2600 /* 2601 * If we haven't resumed expanding since importing the 2602 * pool, vre_offset won't have been set yet. In 2603 * this case the next offset to be copied is the same 2604 * as what was synced. 2605 */ 2606 if (next_offset == UINT64_MAX) { 2607 next_offset = synced_offset; 2608 } 2609 } 2610 if (use_scratch) { 2611 zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced=" 2612 "%lld next_offset=%lld use_scratch=%u", 2613 zio, 2614 zio->io_type == ZIO_TYPE_WRITE ? 
"WRITE" : "READ", 2615 (long long)zio->io_offset, 2616 (long long)synced_offset, 2617 (long long)next_offset, 2618 use_scratch); 2619 } 2620 2621 rm = vdev_raidz_map_alloc_expanded(zio, 2622 tvd->vdev_ashift, vdrz->vd_physical_width, 2623 logical_width, vdrz->vd_nparity, 2624 synced_offset, next_offset, use_scratch); 2625 rm->rm_lr = lr; 2626 } else { 2627 rm = vdev_raidz_map_alloc(zio, 2628 tvd->vdev_ashift, logical_width, vdrz->vd_nparity); 2629 } 2630 rm->rm_original_width = vdrz->vd_original_width; 2631 2632 zio->io_vsd = rm; 2633 zio->io_vsd_ops = &vdev_raidz_vsd_ops; 2634 if (zio->io_type == ZIO_TYPE_WRITE) { 2635 for (int i = 0; i < rm->rm_nrows; i++) { 2636 vdev_raidz_io_start_write(zio, rm->rm_row[i]); 2637 } 2638 2639 if (logical_width == vdrz->vd_physical_width) { 2640 raidz_start_skip_writes(zio); 2641 } 2642 } else { 2643 ASSERT(zio->io_type == ZIO_TYPE_READ); 2644 vdev_raidz_io_start_read(zio, rm); 2645 } 2646 2647 zio_execute(zio); 2648 } 2649 2650 /* 2651 * Report a checksum error for a child of a RAID-Z device. 2652 */ 2653 void 2654 vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) 2655 { 2656 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 2657 2658 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) && 2659 zio->io_priority != ZIO_PRIORITY_REBUILD) { 2660 zio_bad_cksum_t zbc; 2661 raidz_map_t *rm = zio->io_vsd; 2662 2663 zbc.zbc_has_cksum = 0; 2664 zbc.zbc_injected = rm->rm_ecksuminjected; 2665 2666 mutex_enter(&vd->vdev_stat_lock); 2667 vd->vdev_stat.vs_checksum_errors++; 2668 mutex_exit(&vd->vdev_stat_lock); 2669 (void) zfs_ereport_post_checksum(zio->io_spa, vd, 2670 &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, 2671 rc->rc_abd, bad_data, &zbc); 2672 } 2673 } 2674 2675 /* 2676 * We keep track of whether or not there were any injected errors, so that 2677 * any ereports we generate can note it. 2678 */ 2679 static int 2680 raidz_checksum_verify(zio_t *zio) 2681 { 2682 zio_bad_cksum_t zbc = {0}; 2683 raidz_map_t *rm = zio->io_vsd; 2684 2685 int ret = zio_checksum_error(zio, &zbc); 2686 /* 2687 * Any Direct I/O read that has a checksum error must be treated as 2688 * suspicious as the contents of the buffer could be getting 2689 * manipulated while the I/O is taking place. The checksum verify error 2690 * will be reported to the top-level RAIDZ VDEV. 2691 */ 2692 if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) { 2693 zio->io_error = ret; 2694 zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; 2695 zio_dio_chksum_verify_error_report(zio); 2696 zio_checksum_verified(zio); 2697 return (0); 2698 } 2699 2700 if (ret != 0 && zbc.zbc_injected != 0) 2701 rm->rm_ecksuminjected = 1; 2702 2703 return (ret); 2704 } 2705 2706 /* 2707 * Generate the parity from the data columns. If we tried and were able to 2708 * read the parity without error, verify that the generated parity matches the 2709 * data we read. If it doesn't, we fire off a checksum error. Return the 2710 * number of such failures. 2711 */ 2712 static int 2713 raidz_parity_verify(zio_t *zio, raidz_row_t *rr) 2714 { 2715 abd_t *orig[VDEV_RAIDZ_MAXPARITY]; 2716 int c, ret = 0; 2717 raidz_map_t *rm = zio->io_vsd; 2718 raidz_col_t *rc; 2719 2720 blkptr_t *bp = zio->io_bp; 2721 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : 2722 (BP_IS_GANG(bp) ? 
ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); 2723 2724 if (checksum == ZIO_CHECKSUM_NOPARITY) 2725 return (ret); 2726 2727 for (c = 0; c < rr->rr_firstdatacol; c++) { 2728 rc = &rr->rr_col[c]; 2729 if (!rc->rc_tried || rc->rc_error != 0) 2730 continue; 2731 2732 orig[c] = rc->rc_abd; 2733 ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size); 2734 rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); 2735 } 2736 2737 /* 2738 * Verify any empty sectors are zero filled to ensure the parity 2739 * is calculated correctly even if these non-data sectors are damaged. 2740 */ 2741 if (rr->rr_nempty && rr->rr_abd_empty != NULL) 2742 ret += vdev_draid_map_verify_empty(zio, rr); 2743 2744 /* 2745 * Regenerates parity even for !tried||rc_error!=0 columns. This 2746 * isn't harmful but it does have the side effect of fixing stuff 2747 * we didn't realize was necessary (i.e. even if we return 0). 2748 */ 2749 vdev_raidz_generate_parity_row(rm, rr); 2750 2751 for (c = 0; c < rr->rr_firstdatacol; c++) { 2752 rc = &rr->rr_col[c]; 2753 2754 if (!rc->rc_tried || rc->rc_error != 0) 2755 continue; 2756 2757 if (abd_cmp(orig[c], rc->rc_abd) != 0) { 2758 zfs_dbgmsg("found error on col=%u devidx=%u off %llx", 2759 c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset); 2760 vdev_raidz_checksum_error(zio, rc, orig[c]); 2761 rc->rc_error = SET_ERROR(ECKSUM); 2762 ret++; 2763 } 2764 abd_free(orig[c]); 2765 } 2766 2767 return (ret); 2768 } 2769 2770 static int 2771 vdev_raidz_worst_error(raidz_row_t *rr) 2772 { 2773 int error = 0; 2774 2775 for (int c = 0; c < rr->rr_cols; c++) { 2776 error = zio_worst_error(error, rr->rr_col[c].rc_error); 2777 error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error); 2778 } 2779 2780 return (error); 2781 } 2782 2783 static void 2784 vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) 2785 { 2786 int unexpected_errors = 0; 2787 int parity_errors = 0; 2788 int parity_untried = 0; 2789 int data_errors = 0; 2790 2791 ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); 2792 2793 for (int c = 0; c < rr->rr_cols; c++) { 2794 raidz_col_t *rc = &rr->rr_col[c]; 2795 2796 if (rc->rc_error) { 2797 if (c < rr->rr_firstdatacol) 2798 parity_errors++; 2799 else 2800 data_errors++; 2801 2802 if (!rc->rc_skipped) 2803 unexpected_errors++; 2804 } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { 2805 parity_untried++; 2806 } 2807 2808 if (rc->rc_force_repair) 2809 unexpected_errors++; 2810 } 2811 2812 /* 2813 * If we read more parity disks than were used for 2814 * reconstruction, confirm that the other parity disks produced 2815 * correct data. 2816 * 2817 * Note that we also regenerate parity when resilvering so we 2818 * can write it out to failed devices later. 2819 */ 2820 if (parity_errors + parity_untried < 2821 rr->rr_firstdatacol - data_errors || 2822 (zio->io_flags & ZIO_FLAG_RESILVER)) { 2823 int n = raidz_parity_verify(zio, rr); 2824 unexpected_errors += n; 2825 } 2826 2827 if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 2828 (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) { 2829 /* 2830 * Use the good data we have in hand to repair damaged children. 2831 */ 2832 for (int c = 0; c < rr->rr_cols; c++) { 2833 raidz_col_t *rc = &rr->rr_col[c]; 2834 vdev_t *vd = zio->io_vd; 2835 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 2836 2837 if (!rc->rc_allow_repair) { 2838 continue; 2839 } else if (!rc->rc_force_repair && 2840 (rc->rc_error == 0 || rc->rc_size == 0)) { 2841 continue; 2842 } 2843 /* 2844 * We do not allow self healing for Direct I/O reads. 
2845 * See comment in vdev_raid_row_alloc(). 2846 */ 2847 ASSERT0(zio->io_flags & ZIO_FLAG_DIO_READ); 2848 2849 zfs_dbgmsg("zio=%px repairing c=%u devidx=%u " 2850 "offset=%llx", 2851 zio, c, rc->rc_devidx, (long long)rc->rc_offset); 2852 2853 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2854 rc->rc_offset, rc->rc_abd, rc->rc_size, 2855 ZIO_TYPE_WRITE, 2856 zio->io_priority == ZIO_PRIORITY_REBUILD ? 2857 ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE, 2858 ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 2859 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); 2860 } 2861 } 2862 2863 /* 2864 * Scrub or resilver i/o's: overwrite any shadow locations with the 2865 * good data. This ensures that if we've already copied this sector, 2866 * it will be corrected if it was damaged. This writes more than is 2867 * necessary, but since expansion is paused during scrub/resilver, at 2868 * most a single row will have a shadow location. 2869 */ 2870 if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 2871 (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) { 2872 for (int c = 0; c < rr->rr_cols; c++) { 2873 raidz_col_t *rc = &rr->rr_col[c]; 2874 vdev_t *vd = zio->io_vd; 2875 2876 if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0) 2877 continue; 2878 vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx]; 2879 2880 /* 2881 * Note: We don't want to update the repair stats 2882 * because that would incorrectly indicate that there 2883 * was bad data to repair, which we aren't sure about. 2884 * By clearing the SCAN_THREAD flag, we prevent this 2885 * from happening, despite having the REPAIR flag set. 2886 * We need to set SELF_HEAL so that this i/o can't be 2887 * bypassed by zio_vdev_io_start(). 2888 */ 2889 zio_t *cio = zio_vdev_child_io(zio, NULL, cvd, 2890 rc->rc_shadow_offset, rc->rc_abd, rc->rc_size, 2891 ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, 2892 ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, 2893 NULL, NULL); 2894 cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD; 2895 zio_nowait(cio); 2896 } 2897 } 2898 } 2899 2900 static void 2901 raidz_restore_orig_data(raidz_map_t *rm) 2902 { 2903 for (int i = 0; i < rm->rm_nrows; i++) { 2904 raidz_row_t *rr = rm->rm_row[i]; 2905 for (int c = 0; c < rr->rr_cols; c++) { 2906 raidz_col_t *rc = &rr->rr_col[c]; 2907 if (rc->rc_need_orig_restore) { 2908 abd_copy(rc->rc_abd, 2909 rc->rc_orig_data, rc->rc_size); 2910 rc->rc_need_orig_restore = B_FALSE; 2911 } 2912 } 2913 } 2914 } 2915 2916 /* 2917 * During raidz_reconstruct() for expanded VDEV, we need special consideration 2918 * failure simulations. See note in raidz_reconstruct() on simulating failure 2919 * of a pre-expansion device. 2920 * 2921 * Treating logical child i as failed, return TRUE if the given column should 2922 * be treated as failed. The idea of logical children allows us to imagine 2923 * that a disk silently failed before a RAIDZ expansion (reads from this disk 2924 * succeed but return the wrong data). Since the expansion doesn't verify 2925 * checksums, the incorrect data will be moved to new locations spread among 2926 * the children (going diagonally across them). 2927 * 2928 * Higher "logical child failures" (values of `i`) indicate these 2929 * "pre-expansion failures". The first physical_width values imagine that a 2930 * current child failed; the next physical_width-1 values imagine that a 2931 * child failed before the most recent expansion; the next physical_width-2 2932 * values imagine a child failed in the expansion before that, etc. 
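 * For example, with physical_width = 6 and original_width = 4 there
 * are 6 + 5 + 4 = 15 logical children: i = 0..5 name the current
 * children, i = 6..10 imagine a child that failed when the vdev was
 * still 5 wide, and i = 11..14 imagine a failure from when it was
 * only 4 wide.  (This is the same enumeration shown in the
 * expanded-reconstruction comment in vdev_raidz_io_done().)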
2933 */ 2934 static boolean_t 2935 raidz_simulate_failure(int physical_width, int original_width, int ashift, 2936 int i, raidz_col_t *rc) 2937 { 2938 uint64_t sector_id = 2939 physical_width * (rc->rc_offset >> ashift) + 2940 rc->rc_devidx; 2941 2942 for (int w = physical_width; w >= original_width; w--) { 2943 if (i < w) { 2944 return (sector_id % w == i); 2945 } else { 2946 i -= w; 2947 } 2948 } 2949 ASSERT(!"invalid logical child id"); 2950 return (B_FALSE); 2951 } 2952 2953 /* 2954 * returns EINVAL if reconstruction of the block will not be possible 2955 * returns ECKSUM if this specific reconstruction failed 2956 * returns 0 on successful reconstruction 2957 */ 2958 static int 2959 raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) 2960 { 2961 raidz_map_t *rm = zio->io_vsd; 2962 int physical_width = zio->io_vd->vdev_children; 2963 int original_width = (rm->rm_original_width != 0) ? 2964 rm->rm_original_width : physical_width; 2965 int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT; 2966 2967 if (dbgmsg) { 2968 zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u " 2969 "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts); 2970 } 2971 2972 /* Reconstruct each row */ 2973 for (int r = 0; r < rm->rm_nrows; r++) { 2974 raidz_row_t *rr = rm->rm_row[r]; 2975 int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */ 2976 int t = 0; 2977 int dead = 0; 2978 int dead_data = 0; 2979 2980 if (dbgmsg) 2981 zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r); 2982 2983 for (int c = 0; c < rr->rr_cols; c++) { 2984 raidz_col_t *rc = &rr->rr_col[c]; 2985 ASSERT0(rc->rc_need_orig_restore); 2986 if (rc->rc_error != 0) { 2987 dead++; 2988 if (c >= nparity) 2989 dead_data++; 2990 continue; 2991 } 2992 if (rc->rc_size == 0) 2993 continue; 2994 for (int lt = 0; lt < ntgts; lt++) { 2995 if (raidz_simulate_failure(physical_width, 2996 original_width, 2997 zio->io_vd->vdev_top->vdev_ashift, 2998 ltgts[lt], rc)) { 2999 if (rc->rc_orig_data == NULL) { 3000 rc->rc_orig_data = 3001 abd_alloc_linear( 3002 rc->rc_size, B_TRUE); 3003 abd_copy(rc->rc_orig_data, 3004 rc->rc_abd, rc->rc_size); 3005 } 3006 rc->rc_need_orig_restore = B_TRUE; 3007 3008 dead++; 3009 if (c >= nparity) 3010 dead_data++; 3011 /* 3012 * Note: simulating failure of a 3013 * pre-expansion device can hit more 3014 * than one column, in which case we 3015 * might try to simulate more failures 3016 * than can be reconstructed, which is 3017 * also more than the size of my_tgts. 3018 * This check prevents accessing past 3019 * the end of my_tgts. The "dead > 3020 * nparity" check below will fail this 3021 * reconstruction attempt. 
3022 */ 3023 if (t < VDEV_RAIDZ_MAXPARITY) { 3024 my_tgts[t++] = c; 3025 if (dbgmsg) { 3026 zfs_dbgmsg("simulating " 3027 "failure of col %u " 3028 "devidx %u", c, 3029 (int)rc->rc_devidx); 3030 } 3031 } 3032 break; 3033 } 3034 } 3035 } 3036 if (dead > nparity) { 3037 /* reconstruction not possible */ 3038 if (dbgmsg) { 3039 zfs_dbgmsg("reconstruction not possible; " 3040 "too many failures"); 3041 } 3042 raidz_restore_orig_data(rm); 3043 return (EINVAL); 3044 } 3045 if (dead_data > 0) 3046 vdev_raidz_reconstruct_row(rm, rr, my_tgts, t); 3047 } 3048 3049 /* Check for success */ 3050 if (raidz_checksum_verify(zio) == 0) { 3051 if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) 3052 return (0); 3053 3054 /* Reconstruction succeeded - report errors */ 3055 for (int i = 0; i < rm->rm_nrows; i++) { 3056 raidz_row_t *rr = rm->rm_row[i]; 3057 3058 for (int c = 0; c < rr->rr_cols; c++) { 3059 raidz_col_t *rc = &rr->rr_col[c]; 3060 if (rc->rc_need_orig_restore) { 3061 /* 3062 * Note: if this is a parity column, 3063 * we don't really know if it's wrong. 3064 * We need to let 3065 * vdev_raidz_io_done_verified() check 3066 * it, and if we set rc_error, it will 3067 * think that it is a "known" error 3068 * that doesn't need to be checked 3069 * or corrected. 3070 */ 3071 if (rc->rc_error == 0 && 3072 c >= rr->rr_firstdatacol) { 3073 vdev_raidz_checksum_error(zio, 3074 rc, rc->rc_orig_data); 3075 rc->rc_error = 3076 SET_ERROR(ECKSUM); 3077 } 3078 rc->rc_need_orig_restore = B_FALSE; 3079 } 3080 } 3081 3082 vdev_raidz_io_done_verified(zio, rr); 3083 } 3084 3085 zio_checksum_verified(zio); 3086 3087 if (dbgmsg) { 3088 zfs_dbgmsg("reconstruction successful " 3089 "(checksum verified)"); 3090 } 3091 return (0); 3092 } 3093 3094 /* Reconstruction failed - restore original data */ 3095 raidz_restore_orig_data(rm); 3096 if (dbgmsg) { 3097 zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum " 3098 "failed", zio); 3099 } 3100 return (ECKSUM); 3101 } 3102 3103 /* 3104 * Iterate over all combinations of N bad vdevs and attempt a reconstruction. 3105 * Note that the algorithm below is non-optimal because it doesn't take into 3106 * account how reconstruction is actually performed. For example, with 3107 * triple-parity RAID-Z the reconstruction procedure is the same if column 4 3108 * is targeted as invalid as if columns 1 and 4 are targeted since in both 3109 * cases we'd only use parity information in column 0. 
3110 * 3111 * The order that we find the various possible combinations of failed 3112 * disks is dictated by these rules: 3113 * - Examine each "slot" (the "i" in tgts[i]) 3114 * - Try to increment this slot (tgts[i] += 1) 3115 * - if we can't increment because it runs into the next slot, 3116 * reset our slot to the minimum, and examine the next slot 3117 * 3118 * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose 3119 * 3 columns to reconstruct), we will generate the following sequence: 3120 * 3121 * STATE ACTION 3122 * 0 1 2 special case: skip since these are all parity 3123 * 0 1 3 first slot: reset to 0; middle slot: increment to 2 3124 * 0 2 3 first slot: increment to 1 3125 * 1 2 3 first: reset to 0; middle: reset to 1; last: increment to 4 3126 * 0 1 4 first: reset to 0; middle: increment to 2 3127 * 0 2 4 first: increment to 1 3128 * 1 2 4 first: reset to 0; middle: increment to 3 3129 * 0 3 4 first: increment to 1 3130 * 1 3 4 first: increment to 2 3131 * 2 3 4 first: reset to 0; middle: reset to 1; last: increment to 5 3132 * 0 1 5 first: reset to 0; middle: increment to 2 3133 * 0 2 5 first: increment to 1 3134 * 1 2 5 first: reset to 0; middle: increment to 3 3135 * 0 3 5 first: increment to 1 3136 * 1 3 5 first: increment to 2 3137 * 2 3 5 first: reset to 0; middle: increment to 4 3138 * 0 4 5 first: increment to 1 3139 * 1 4 5 first: increment to 2 3140 * 2 4 5 first: increment to 3 3141 * 3 4 5 done 3142 * 3143 * This strategy works for dRAID but is less efficient when there are a large 3144 * number of child vdevs and therefore permutations to check. Furthermore, 3145 * since the raidz_map_t rows likely do not overlap, reconstruction would be 3146 * possible as long as there are no more than nparity data errors per row. 3147 * These additional permutations are not currently checked but could be as 3148 * a future improvement. 3149 * 3150 * Returns 0 on success, ECKSUM on failure. 3151 */ 3152 static int 3153 vdev_raidz_combrec(zio_t *zio) 3154 { 3155 int nparity = vdev_get_nparity(zio->io_vd); 3156 raidz_map_t *rm = zio->io_vsd; 3157 int physical_width = zio->io_vd->vdev_children; 3158 int original_width = (rm->rm_original_width != 0) ? 3159 rm->rm_original_width : physical_width; 3160 3161 for (int i = 0; i < rm->rm_nrows; i++) { 3162 raidz_row_t *rr = rm->rm_row[i]; 3163 int total_errors = 0; 3164 3165 for (int c = 0; c < rr->rr_cols; c++) { 3166 if (rr->rr_col[c].rc_error) 3167 total_errors++; 3168 } 3169 3170 if (total_errors > nparity) 3171 return (vdev_raidz_worst_error(rr)); 3172 } 3173 3174 for (int num_failures = 1; num_failures <= nparity; num_failures++) { 3175 int tstore[VDEV_RAIDZ_MAXPARITY + 2]; 3176 int *ltgts = &tstore[1]; /* value is logical child ID */ 3177 3178 3179 /* 3180 * Determine number of logical children, n. See comment 3181 * above raidz_simulate_failure(). 3182 */ 3183 int n = 0; 3184 for (int w = physical_width; 3185 w >= original_width; w--) { 3186 n += w; 3187 } 3188 3189 ASSERT3U(num_failures, <=, nparity); 3190 ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY); 3191 3192 /* Handle corner cases in combrec logic */ 3193 ltgts[-1] = -1; 3194 for (int i = 0; i < num_failures; i++) { 3195 ltgts[i] = i; 3196 } 3197 ltgts[num_failures] = n; 3198 3199 for (;;) { 3200 int err = raidz_reconstruct(zio, ltgts, num_failures, 3201 nparity); 3202 if (err == EINVAL) { 3203 /* 3204 * Reconstruction not possible with this # 3205 * failures; try more failures. 
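 * (Breaking out of the inner for (;;) loop below abandons the
 * remaining combinations at this num_failures and lets the outer
 * loop move on to simulating one more failure.)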
3206 */ 3207 break; 3208 } else if (err == 0) 3209 return (0); 3210 3211 /* Compute next targets to try */ 3212 for (int t = 0; ; t++) { 3213 ASSERT3U(t, <, num_failures); 3214 ltgts[t]++; 3215 if (ltgts[t] == n) { 3216 /* try more failures */ 3217 ASSERT3U(t, ==, num_failures - 1); 3218 if (zfs_flags & 3219 ZFS_DEBUG_RAIDZ_RECONSTRUCT) { 3220 zfs_dbgmsg("reconstruction " 3221 "failed for num_failures=" 3222 "%u; tried all " 3223 "combinations", 3224 num_failures); 3225 } 3226 break; 3227 } 3228 3229 ASSERT3U(ltgts[t], <, n); 3230 ASSERT3U(ltgts[t], <=, ltgts[t + 1]); 3231 3232 /* 3233 * If that spot is available, we're done here. 3234 * Try the next combination. 3235 */ 3236 if (ltgts[t] != ltgts[t + 1]) 3237 break; // found next combination 3238 3239 /* 3240 * Otherwise, reset this tgt to the minimum, 3241 * and move on to the next tgt. 3242 */ 3243 ltgts[t] = ltgts[t - 1] + 1; 3244 ASSERT3U(ltgts[t], ==, t); 3245 } 3246 3247 /* Increase the number of failures and keep trying. */ 3248 if (ltgts[num_failures - 1] == n) 3249 break; 3250 } 3251 } 3252 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 3253 zfs_dbgmsg("reconstruction failed for all num_failures"); 3254 return (ECKSUM); 3255 } 3256 3257 void 3258 vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) 3259 { 3260 for (uint64_t row = 0; row < rm->rm_nrows; row++) { 3261 raidz_row_t *rr = rm->rm_row[row]; 3262 vdev_raidz_reconstruct_row(rm, rr, t, nt); 3263 } 3264 } 3265 3266 /* 3267 * Complete a write IO operation on a RAIDZ VDev 3268 * 3269 * Outline: 3270 * 1. Check for errors on the child IOs. 3271 * 2. Return, setting an error code if too few child VDevs were written 3272 * to reconstruct the data later. Note that partial writes are 3273 * considered successful if they can be reconstructed at all. 3274 */ 3275 static void 3276 vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) 3277 { 3278 int normal_errors = 0; 3279 int shadow_errors = 0; 3280 3281 ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); 3282 ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); 3283 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); 3284 3285 for (int c = 0; c < rr->rr_cols; c++) { 3286 raidz_col_t *rc = &rr->rr_col[c]; 3287 3288 if (rc->rc_error != 0) { 3289 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 3290 normal_errors++; 3291 } 3292 if (rc->rc_shadow_error != 0) { 3293 ASSERT(rc->rc_shadow_error != ECKSUM); 3294 shadow_errors++; 3295 } 3296 } 3297 3298 /* 3299 * Treat partial writes as a success. If we couldn't write enough 3300 * columns to reconstruct the data, the I/O failed. Otherwise, good 3301 * enough. Note that in the case of a shadow write (during raidz 3302 * expansion), depending on if we crash, either the normal (old) or 3303 * shadow (new) location may become the "real" version of the block, 3304 * so both locations must have sufficient redundancy. 3305 * 3306 * Now that we support write reallocation, it would be better 3307 * to treat partial failure as real failure unless there are 3308 * no non-degraded top-level vdevs left, and not update DTLs 3309 * if we intend to reallocate. 
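 * Concretely, for a raidz2 row (rr_firstdatacol == 2) up to two
 * failed child writes are tolerated below; three or more failures,
 * counted separately for the normal and the shadow locations, mean
 * the row lacks the redundancy needed to reconstruct it later, so
 * the zio inherits the worst child error.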
3310 */ 3311 if (normal_errors > rr->rr_firstdatacol || 3312 shadow_errors > rr->rr_firstdatacol) { 3313 zio->io_error = zio_worst_error(zio->io_error, 3314 vdev_raidz_worst_error(rr)); 3315 } 3316 } 3317 3318 static void 3319 vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm, 3320 raidz_row_t *rr) 3321 { 3322 int parity_errors = 0; 3323 int parity_untried = 0; 3324 int data_errors = 0; 3325 int total_errors = 0; 3326 3327 ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); 3328 ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); 3329 3330 for (int c = 0; c < rr->rr_cols; c++) { 3331 raidz_col_t *rc = &rr->rr_col[c]; 3332 3333 /* 3334 * If scrubbing and a replacing/sparing child vdev determined 3335 * that not all of its children have an identical copy of the 3336 * data, then clear the error so the column is treated like 3337 * any other read and force a repair to correct the damage. 3338 */ 3339 if (rc->rc_error == ECKSUM) { 3340 ASSERT(zio->io_flags & ZIO_FLAG_SCRUB); 3341 vdev_raidz_checksum_error(zio, rc, rc->rc_abd); 3342 rc->rc_force_repair = 1; 3343 rc->rc_error = 0; 3344 } 3345 3346 if (rc->rc_error) { 3347 if (c < rr->rr_firstdatacol) 3348 parity_errors++; 3349 else 3350 data_errors++; 3351 3352 total_errors++; 3353 } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { 3354 parity_untried++; 3355 } 3356 } 3357 3358 /* 3359 * If there were data errors and the number of errors we saw was 3360 * correctable -- less than or equal to the number of parity disks read 3361 * -- reconstruct based on the missing data. 3362 */ 3363 if (data_errors != 0 && 3364 total_errors <= rr->rr_firstdatacol - parity_untried) { 3365 /* 3366 * We either attempt to read all the parity columns or 3367 * none of them. If we didn't try to read parity, we 3368 * wouldn't be here in the correctable case. There must 3369 * also have been fewer parity errors than parity 3370 * columns or, again, we wouldn't be in this code path. 3371 */ 3372 ASSERT(parity_untried == 0); 3373 ASSERT(parity_errors < rr->rr_firstdatacol); 3374 3375 /* 3376 * Identify the data columns that reported an error. 3377 */ 3378 int n = 0; 3379 int tgts[VDEV_RAIDZ_MAXPARITY]; 3380 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 3381 raidz_col_t *rc = &rr->rr_col[c]; 3382 if (rc->rc_error != 0) { 3383 ASSERT(n < VDEV_RAIDZ_MAXPARITY); 3384 tgts[n++] = c; 3385 } 3386 } 3387 3388 ASSERT(rr->rr_firstdatacol >= n); 3389 3390 vdev_raidz_reconstruct_row(rm, rr, tgts, n); 3391 } 3392 } 3393 3394 /* 3395 * Return the number of reads issued. 3396 */ 3397 static int 3398 vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr) 3399 { 3400 vdev_t *vd = zio->io_vd; 3401 int nread = 0; 3402 3403 rr->rr_missingdata = 0; 3404 rr->rr_missingparity = 0; 3405 3406 /* 3407 * If this rows contains empty sectors which are not required 3408 * for a normal read then allocate an ABD for them now so they 3409 * may be read, verified, and any needed repairs performed. 
3410 */ 3411 if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL) 3412 vdev_draid_map_alloc_empty(zio, rr); 3413 3414 for (int c = 0; c < rr->rr_cols; c++) { 3415 raidz_col_t *rc = &rr->rr_col[c]; 3416 if (rc->rc_tried || rc->rc_size == 0) 3417 continue; 3418 3419 zio_nowait(zio_vdev_child_io(zio, NULL, 3420 vd->vdev_child[rc->rc_devidx], 3421 rc->rc_offset, rc->rc_abd, rc->rc_size, 3422 zio->io_type, zio->io_priority, 0, 3423 vdev_raidz_child_done, rc)); 3424 nread++; 3425 } 3426 return (nread); 3427 } 3428 3429 /* 3430 * We're here because either there were too many errors to even attempt 3431 * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec() 3432 * failed. In either case, there is enough bad data to prevent reconstruction. 3433 * Start checksum ereports for all children which haven't failed. 3434 */ 3435 static void 3436 vdev_raidz_io_done_unrecoverable(zio_t *zio) 3437 { 3438 raidz_map_t *rm = zio->io_vsd; 3439 3440 for (int i = 0; i < rm->rm_nrows; i++) { 3441 raidz_row_t *rr = rm->rm_row[i]; 3442 3443 for (int c = 0; c < rr->rr_cols; c++) { 3444 raidz_col_t *rc = &rr->rr_col[c]; 3445 vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; 3446 3447 if (rc->rc_error != 0) 3448 continue; 3449 3450 zio_bad_cksum_t zbc; 3451 zbc.zbc_has_cksum = 0; 3452 zbc.zbc_injected = rm->rm_ecksuminjected; 3453 mutex_enter(&cvd->vdev_stat_lock); 3454 cvd->vdev_stat.vs_checksum_errors++; 3455 mutex_exit(&cvd->vdev_stat_lock); 3456 (void) zfs_ereport_start_checksum(zio->io_spa, 3457 cvd, &zio->io_bookmark, zio, rc->rc_offset, 3458 rc->rc_size, &zbc); 3459 } 3460 } 3461 } 3462 3463 void 3464 vdev_raidz_io_done(zio_t *zio) 3465 { 3466 raidz_map_t *rm = zio->io_vsd; 3467 3468 ASSERT(zio->io_bp != NULL); 3469 if (zio->io_type == ZIO_TYPE_WRITE) { 3470 for (int i = 0; i < rm->rm_nrows; i++) { 3471 vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]); 3472 } 3473 } else { 3474 if (rm->rm_phys_col) { 3475 /* 3476 * This is an aggregated read. Copy the data and status 3477 * from the aggregate abd's to the individual rows. 3478 */ 3479 for (int i = 0; i < rm->rm_nrows; i++) { 3480 raidz_row_t *rr = rm->rm_row[i]; 3481 3482 for (int c = 0; c < rr->rr_cols; c++) { 3483 raidz_col_t *rc = &rr->rr_col[c]; 3484 if (rc->rc_tried || rc->rc_size == 0) 3485 continue; 3486 3487 raidz_col_t *prc = 3488 &rm->rm_phys_col[rc->rc_devidx]; 3489 rc->rc_error = prc->rc_error; 3490 rc->rc_tried = prc->rc_tried; 3491 rc->rc_skipped = prc->rc_skipped; 3492 if (c >= rr->rr_firstdatacol) { 3493 /* 3494 * Note: this is slightly faster 3495 * than using abd_copy_off(). 3496 */ 3497 char *physbuf = abd_to_buf( 3498 prc->rc_abd); 3499 void *physloc = physbuf + 3500 rc->rc_offset - 3501 prc->rc_offset; 3502 3503 abd_copy_from_buf(rc->rc_abd, 3504 physloc, rc->rc_size); 3505 } 3506 } 3507 } 3508 } 3509 3510 for (int i = 0; i < rm->rm_nrows; i++) { 3511 raidz_row_t *rr = rm->rm_row[i]; 3512 vdev_raidz_io_done_reconstruct_known_missing(zio, 3513 rm, rr); 3514 } 3515 3516 if (raidz_checksum_verify(zio) == 0) { 3517 if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) 3518 goto done; 3519 3520 for (int i = 0; i < rm->rm_nrows; i++) { 3521 raidz_row_t *rr = rm->rm_row[i]; 3522 vdev_raidz_io_done_verified(zio, rr); 3523 } 3524 zio_checksum_verified(zio); 3525 } else { 3526 /* 3527 * A sequential resilver has no checksum which makes 3528 * combinatoral reconstruction impossible. This code 3529 * path is unreachable since raidz_checksum_verify() 3530 * has no checksum to verify and must succeed. 
3531 */ 3532 ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD); 3533 3534 /* 3535 * This isn't a typical situation -- either we got a 3536 * read error or a child silently returned bad data. 3537 * Read every block so we can try again with as much 3538 * data and parity as we can track down. If we've 3539 * already been through once before, all children will 3540 * be marked as tried so we'll proceed to combinatorial 3541 * reconstruction. 3542 */ 3543 int nread = 0; 3544 for (int i = 0; i < rm->rm_nrows; i++) { 3545 nread += vdev_raidz_read_all(zio, 3546 rm->rm_row[i]); 3547 } 3548 if (nread != 0) { 3549 /* 3550 * Normally our stage is VDEV_IO_DONE, but if 3551 * we've already called redone(), it will have 3552 * changed to VDEV_IO_START, in which case we 3553 * don't want to call redone() again. 3554 */ 3555 if (zio->io_stage != ZIO_STAGE_VDEV_IO_START) 3556 zio_vdev_io_redone(zio); 3557 return; 3558 } 3559 /* 3560 * It would be too expensive to try every possible 3561 * combination of failed sectors in every row, so 3562 * instead we try every combination of failed current or 3563 * past physical disk. This means that if the incorrect 3564 * sectors were all on Nparity disks at any point in the 3565 * past, we will find the correct data. The only known 3566 * case where this is less durable than a non-expanded 3567 * RAIDZ, is if we have a silent failure during 3568 * expansion. In that case, one block could be 3569 * partially in the old format and partially in the 3570 * new format, so we'd lost some sectors from the old 3571 * format and some from the new format. 3572 * 3573 * e.g. logical_width=4 physical_width=6 3574 * the 15 (6+5+4) possible failed disks are: 3575 * width=6 child=0 3576 * width=6 child=1 3577 * width=6 child=2 3578 * width=6 child=3 3579 * width=6 child=4 3580 * width=6 child=5 3581 * width=5 child=0 3582 * width=5 child=1 3583 * width=5 child=2 3584 * width=5 child=3 3585 * width=5 child=4 3586 * width=4 child=0 3587 * width=4 child=1 3588 * width=4 child=2 3589 * width=4 child=3 3590 * And we will try every combination of Nparity of these 3591 * failing. 3592 * 3593 * As a first pass, we can generate every combo, 3594 * and try reconstructing, ignoring any known 3595 * failures. If any row has too many known + simulated 3596 * failures, then we bail on reconstructing with this 3597 * number of simulated failures. As an improvement, 3598 * we could detect the number of whole known failures 3599 * (i.e. we have known failures on these disks for 3600 * every row; the disks never succeeded), and 3601 * subtract that from the max # failures to simulate. 3602 * We could go even further like the current 3603 * combrec code, but that doesn't seem like it 3604 * gains us very much. If we simulate a failure 3605 * that is also a known failure, that's fine. 
3606 */ 3607 zio->io_error = vdev_raidz_combrec(zio); 3608 if (zio->io_error == ECKSUM && 3609 !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 3610 vdev_raidz_io_done_unrecoverable(zio); 3611 } 3612 } 3613 } 3614 done: 3615 if (rm->rm_lr != NULL) { 3616 zfs_rangelock_exit(rm->rm_lr); 3617 rm->rm_lr = NULL; 3618 } 3619 } 3620 3621 static void 3622 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 3623 { 3624 vdev_raidz_t *vdrz = vd->vdev_tsd; 3625 if (faulted > vdrz->vd_nparity) 3626 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 3627 VDEV_AUX_NO_REPLICAS); 3628 else if (degraded + faulted != 0) 3629 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 3630 else 3631 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 3632 } 3633 3634 /* 3635 * Determine if any portion of the provided block resides on a child vdev 3636 * with a dirty DTL and therefore needs to be resilvered. The function 3637 * assumes that at least one DTL is dirty which implies that full stripe 3638 * width blocks must be resilvered. 3639 */ 3640 static boolean_t 3641 vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, 3642 uint64_t phys_birth) 3643 { 3644 vdev_raidz_t *vdrz = vd->vdev_tsd; 3645 3646 /* 3647 * If we're in the middle of a RAIDZ expansion, this block may be in 3648 * the old and/or new location. For simplicity, always resilver it. 3649 */ 3650 if (vdrz->vn_vre.vre_state == DSS_SCANNING) 3651 return (B_TRUE); 3652 3653 uint64_t dcols = vd->vdev_children; 3654 uint64_t nparity = vdrz->vd_nparity; 3655 uint64_t ashift = vd->vdev_top->vdev_ashift; 3656 /* The starting RAIDZ (parent) vdev sector of the block. */ 3657 uint64_t b = DVA_GET_OFFSET(dva) >> ashift; 3658 /* The zio's size in units of the vdev's minimum sector size. */ 3659 uint64_t s = ((psize - 1) >> ashift) + 1; 3660 /* The first column for this stripe. */ 3661 uint64_t f = b % dcols; 3662 3663 /* Unreachable by sequential resilver. */ 3664 ASSERT3U(phys_birth, !=, TXG_UNKNOWN); 3665 3666 if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) 3667 return (B_FALSE); 3668 3669 if (s + nparity >= dcols) 3670 return (B_TRUE); 3671 3672 for (uint64_t c = 0; c < s + nparity; c++) { 3673 uint64_t devidx = (f + c) % dcols; 3674 vdev_t *cvd = vd->vdev_child[devidx]; 3675 3676 /* 3677 * dsl_scan_need_resilver() already checked vd with 3678 * vdev_dtl_contains(). So here just check cvd with 3679 * vdev_dtl_empty(), cheaper and a good approximation. 3680 */ 3681 if (!vdev_dtl_empty(cvd, DTL_PARTIAL)) 3682 return (B_TRUE); 3683 } 3684 3685 return (B_FALSE); 3686 } 3687 3688 static void 3689 vdev_raidz_xlate(vdev_t *cvd, const zfs_range_seg64_t *logical_rs, 3690 zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs) 3691 { 3692 (void) remain_rs; 3693 3694 vdev_t *raidvd = cvd->vdev_parent; 3695 ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); 3696 3697 vdev_raidz_t *vdrz = raidvd->vdev_tsd; 3698 3699 if (vdrz->vn_vre.vre_state == DSS_SCANNING) { 3700 /* 3701 * We're in the middle of expansion, in which case the 3702 * translation is in flux. Any answer we give may be wrong 3703 * by the time we return, so it isn't safe for the caller to 3704 * act on it. Therefore we say that this range isn't present 3705 * on any children. The only consumers of this are "zpool 3706 * initialize" and trimming, both of which are "best effort" 3707 * anyway. 
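 *
 * (When no expansion is active, the translation below is exact. As an
 * illustrative example with invented numbers: for physical_width = 4,
 * ashift = 9 and cvd->vdev_id = 2, a logical range starting at sector
 * b_start = 10 yields start_row = ((10 - 2 - 1) / 4) + 1 = 2, i.e. the
 * range begins at physical offset 2 << 9 = 1024 on that child, because
 * only two of this child's sectors (logical sectors 2 and 6) lie below
 * logical sector 10.)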
3708 */ 3709 physical_rs->rs_start = physical_rs->rs_end = 0; 3710 remain_rs->rs_start = remain_rs->rs_end = 0; 3711 return; 3712 } 3713 3714 uint64_t width = vdrz->vd_physical_width; 3715 uint64_t tgt_col = cvd->vdev_id; 3716 uint64_t ashift = raidvd->vdev_top->vdev_ashift; 3717 3718 /* make sure the offsets are block-aligned */ 3719 ASSERT0(logical_rs->rs_start % (1 << ashift)); 3720 ASSERT0(logical_rs->rs_end % (1 << ashift)); 3721 uint64_t b_start = logical_rs->rs_start >> ashift; 3722 uint64_t b_end = logical_rs->rs_end >> ashift; 3723 3724 uint64_t start_row = 0; 3725 if (b_start > tgt_col) /* avoid underflow */ 3726 start_row = ((b_start - tgt_col - 1) / width) + 1; 3727 3728 uint64_t end_row = 0; 3729 if (b_end > tgt_col) 3730 end_row = ((b_end - tgt_col - 1) / width) + 1; 3731 3732 physical_rs->rs_start = start_row << ashift; 3733 physical_rs->rs_end = end_row << ashift; 3734 3735 ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start); 3736 ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=, 3737 logical_rs->rs_end - logical_rs->rs_start); 3738 } 3739 3740 static void 3741 raidz_reflow_sync(void *arg, dmu_tx_t *tx) 3742 { 3743 spa_t *spa = arg; 3744 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 3745 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 3746 3747 /* 3748 * Ensure there are no i/os to the range that is being committed. 3749 */ 3750 uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock); 3751 ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset); 3752 3753 mutex_enter(&vre->vre_lock); 3754 uint64_t new_offset = 3755 MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset); 3756 /* 3757 * We should not have committed anything that failed. 3758 */ 3759 VERIFY3U(vre->vre_failed_offset, >=, old_offset); 3760 mutex_exit(&vre->vre_lock); 3761 3762 zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, 3763 old_offset, new_offset - old_offset, 3764 RL_WRITER); 3765 3766 /* 3767 * Update the uberblock that will be written when this txg completes. 3768 */ 3769 RAIDZ_REFLOW_SET(&spa->spa_uberblock, 3770 RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset); 3771 vre->vre_offset_pertxg[txgoff] = 0; 3772 zfs_rangelock_exit(lr); 3773 3774 mutex_enter(&vre->vre_lock); 3775 vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff]; 3776 vre->vre_bytes_copied_pertxg[txgoff] = 0; 3777 mutex_exit(&vre->vre_lock); 3778 3779 vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); 3780 VERIFY0(zap_update(spa->spa_meta_objset, 3781 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, 3782 sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx)); 3783 } 3784 3785 static void 3786 raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx) 3787 { 3788 spa_t *spa = arg; 3789 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 3790 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 3791 vdev_raidz_t *vdrz = raidvd->vdev_tsd; 3792 3793 for (int i = 0; i < TXG_SIZE; i++) 3794 VERIFY0(vre->vre_offset_pertxg[i]); 3795 3796 reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); 3797 re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES; 3798 re->re_logical_width = vdrz->vd_physical_width; 3799 mutex_enter(&vdrz->vd_expand_lock); 3800 avl_add(&vdrz->vd_expand_txgs, re); 3801 mutex_exit(&vdrz->vd_expand_lock); 3802 3803 vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); 3804 3805 /* 3806 * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS 3807 * will get written (based on vd_expand_txgs). 
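 *
 * (The re_txg recorded above is tx->tx_txg + TXG_CONCURRENT_STATES
 * rather than the current txg, presumably so that blocks born in txgs
 * that were already open when the expansion completed are still mapped
 * using the previous logical width.)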
3808 */
3809 vdev_config_dirty(vd);
3810
3811 /*
3812 * Before we change vre_state, the on-disk state must reflect that we
3813 * have completed all copying, so that vdev_raidz_io_start() can use
3814 * vre_state to determine if the reflow is in progress. See also the
3815 * end of spa_raidz_expand_thread().
3816 */
3817 VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==,
3818 raidvd->vdev_ms_count << raidvd->vdev_ms_shift);
3819
3820 vre->vre_end_time = gethrestime_sec();
3821 vre->vre_state = DSS_FINISHED;
3822
3823 uint64_t state = vre->vre_state;
3824 VERIFY0(zap_update(spa->spa_meta_objset,
3825 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
3826 sizeof (state), 1, &state, tx));
3827
3828 uint64_t end_time = vre->vre_end_time;
3829 VERIFY0(zap_update(spa->spa_meta_objset,
3830 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
3831 sizeof (end_time), 1, &end_time, tx));
3832
3833 spa->spa_uberblock.ub_raidz_reflow_info = 0;
3834
3835 spa_history_log_internal(spa, "raidz vdev expansion completed", tx,
3836 "%s vdev %llu new width %llu", spa_name(spa),
3837 (unsigned long long)vd->vdev_id,
3838 (unsigned long long)vd->vdev_children);
3839
3840 spa->spa_raidz_expand = NULL;
3841 raidvd->vdev_rz_expanding = B_FALSE;
3842
3843 spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
3844 spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
3845 spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
3846
3847 spa_notify_waiters(spa);
3848
3849 /*
3850 * While we're in syncing context, take the opportunity to
3851 * set up a scrub. All the data has been successfully copied
3852 * but we have not validated any checksums.
3853 */
3854 setup_sync_arg_t setup_sync_arg = {
3855 .func = POOL_SCAN_SCRUB,
3856 .txgstart = 0,
3857 .txgend = 0,
3858 };
3859 if (zfs_scrub_after_expand &&
3860 dsl_scan_setup_check(&setup_sync_arg.func, tx) == 0) {
3861 dsl_scan_setup_sync(&setup_sync_arg, tx);
3862 }
3863 }
3864
3865 /*
3866 * State of one copy batch.
3867 */
3868 typedef struct raidz_reflow_arg {
3869 vdev_raidz_expand_t *rra_vre; /* Global expansion state. */
3870 zfs_locked_range_t *rra_lr; /* Range lock of this batch. */
3871 uint64_t rra_txg; /* TXG of this batch. */
3872 uint_t rra_ashift; /* Ashift of the vdev. */
3873 uint32_t rra_tbd; /* Number of in-flight ZIOs. */
3874 uint32_t rra_writes; /* Number of write ZIOs. */
3875 zio_t *rra_zio[]; /* Write ZIO pointers. */
3876 } raidz_reflow_arg_t;
3877
3878 /*
3879 * A write to the new location on one child is done. Once all of them are
3880 * done, we can unlock and free everything.
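 *
 * Note that rra_tbd does double duty: it first counts the outstanding
 * reads, and once the last read completes raidz_reflow_read_done()
 * resets it to rra_writes so that the write completions here can count
 * it back down to zero before everything is freed.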
3881 */
3882 static void
3883 raidz_reflow_write_done(zio_t *zio)
3884 {
3885 raidz_reflow_arg_t *rra = zio->io_private;
3886 vdev_raidz_expand_t *vre = rra->rra_vre;
3887
3888 abd_free(zio->io_abd);
3889
3890 mutex_enter(&vre->vre_lock);
3891 if (zio->io_error != 0) {
3892 /* Force a reflow pause on errors */
3893 vre->vre_failed_offset =
3894 MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
3895 }
3896 ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size);
3897 vre->vre_outstanding_bytes -= zio->io_size;
3898 if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length <
3899 vre->vre_failed_offset) {
3900 vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] +=
3901 zio->io_size;
3902 }
3903 cv_signal(&vre->vre_cv);
3904 boolean_t done = (--rra->rra_tbd == 0);
3905 mutex_exit(&vre->vre_lock);
3906
3907 if (!done)
3908 return;
3909 spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
3910 zfs_rangelock_exit(rra->rra_lr);
3911 kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * rra->rra_writes);
3912 }
3913
3914 /*
3915 * A read from the old location on one child is done. Once all of them are
3916 * done, the write ABDs have all the data and we can issue the writes.
3917 */
3918 static void
3919 raidz_reflow_read_done(zio_t *zio)
3920 {
3921 raidz_reflow_arg_t *rra = zio->io_private;
3922 vdev_raidz_expand_t *vre = rra->rra_vre;
3923
3924 /* Single-block reads borrowed a write ABD; bigger reads used a gang ABD, freed here. */
3925 if (zio->io_size > (1 << rra->rra_ashift))
3926 abd_free(zio->io_abd);
3927
3928 /*
3929 * If the read failed, or if it was done on a vdev that is not fully
3930 * healthy (e.g. a child that has a resilver in progress), we may not
3931 * have the correct data. Note that it's OK if the write proceeds.
3932 * It may write garbage but the location is otherwise unused and we
3933 * will retry later due to vre_failed_offset.
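 *
 * Everything at or beyond vre_failed_offset will be retried:
 * raidz_reflow_sync() only commits progress up to
 * MIN(per-txg offset, vre_failed_offset), and spa_raidz_expand_thread()
 * later pauses and rewinds its progress to vre_failed_offset before
 * waiting for the vdev to become healthy again.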
3934 */
3935 if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) {
3936 zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu "
3937 "err=%u partial_dtl_empty=%u missing_dtl_empty=%u",
3938 (long long)rra->rra_lr->lr_offset,
3939 (long long)rra->rra_lr->lr_length,
3940 (long long)rra->rra_txg,
3941 zio->io_error,
3942 vdev_dtl_empty(zio->io_vd, DTL_PARTIAL),
3943 vdev_dtl_empty(zio->io_vd, DTL_MISSING));
3944 mutex_enter(&vre->vre_lock);
3945 /* Force a reflow pause on errors */
3946 vre->vre_failed_offset =
3947 MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
3948 mutex_exit(&vre->vre_lock);
3949 }
3950
3951 if (atomic_dec_32_nv(&rra->rra_tbd) > 0)
3952 return;
3953 uint32_t writes = rra->rra_tbd = rra->rra_writes;
3954 for (uint64_t i = 0; i < writes; i++)
3955 zio_nowait(rra->rra_zio[i]);
3956 }
3957
3958 static void
3959 raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset,
3960 dmu_tx_t *tx)
3961 {
3962 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
3963 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
3964
3965 if (offset == 0)
3966 return;
3967
3968 mutex_enter(&vre->vre_lock);
3969 ASSERT3U(vre->vre_offset, <=, offset);
3970 vre->vre_offset = offset;
3971 mutex_exit(&vre->vre_lock);
3972
3973 if (vre->vre_offset_pertxg[txgoff] == 0) {
3974 dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync,
3975 spa, tx);
3976 }
3977 vre->vre_offset_pertxg[txgoff] = offset;
3978 }
3979
3980 static boolean_t
3981 vdev_raidz_expand_child_replacing(vdev_t *raidz_vd)
3982 {
3983 for (int i = 0; i < raidz_vd->vdev_children; i++) {
3984 /* Quick check if a child is being replaced */
3985 if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf)
3986 return (B_TRUE);
3987 }
3988 return (B_FALSE);
3989 }
3990
3991 static boolean_t
3992 raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, zfs_range_tree_t *rt,
3993 dmu_tx_t *tx)
3994 {
3995 spa_t *spa = vd->vdev_spa;
3996 uint_t ashift = vd->vdev_top->vdev_ashift;
3997
3998 zfs_range_seg_t *rs = zfs_range_tree_first(rt);
3999 if (rs == NULL)
4000 return (B_FALSE);
4001 uint64_t offset = zfs_rs_get_start(rs, rt);
4002 ASSERT(IS_P2ALIGNED(offset, 1 << ashift));
4003 uint64_t size = zfs_rs_get_end(rs, rt) - offset;
4004 ASSERT3U(size, >=, 1 << ashift);
4005 ASSERT(IS_P2ALIGNED(size, 1 << ashift));
4006
4007 uint64_t blkid = offset >> ashift;
4008 uint_t old_children = vd->vdev_children - 1;
4009
4010 /*
4011 * We can only progress to the point that writes will not overlap
4012 * with blocks whose progress has not yet been recorded on disk.
4013 * Since partially-copied rows are still read from the old location,
4014 * we need to stop one row before the sector-wise overlap, to prevent
4015 * row-wise overlap.
4016 *
4017 * Note that even if we are skipping over a large unallocated region,
4018 * we can't move the on-disk progress to `offset`, because concurrent
4019 * writes/allocations could still use the currently-unallocated
4020 * region.
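 *
 * As a purely illustrative evaluation of the formula below (numbers
 * invented): with old_children = 4 and on-disk synced progress of
 * ubsync_blkid = 1000 sectors, next_overwrite_blkid =
 * 1000 + 1000 / 4 - 4 = 1246, so this pass may copy up through sector
 * 1245 and must then wait for the recorded progress to sync out before
 * copying any further.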
4021 */ 4022 uint64_t ubsync_blkid = 4023 RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift; 4024 uint64_t next_overwrite_blkid = ubsync_blkid + 4025 ubsync_blkid / old_children - old_children; 4026 VERIFY3U(next_overwrite_blkid, >, ubsync_blkid); 4027 if (blkid >= next_overwrite_blkid) { 4028 raidz_reflow_record_progress(vre, 4029 next_overwrite_blkid << ashift, tx); 4030 return (B_TRUE); 4031 } 4032 4033 size = MIN(size, raidz_expand_max_copy_bytes); 4034 size = MIN(size, (uint64_t)old_children * 4035 MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE)); 4036 size = MAX(size, 1 << ashift); 4037 uint_t blocks = MIN(size >> ashift, next_overwrite_blkid - blkid); 4038 size = (uint64_t)blocks << ashift; 4039 4040 zfs_range_tree_remove(rt, offset, size); 4041 4042 uint_t reads = MIN(blocks, old_children); 4043 uint_t writes = MIN(blocks, vd->vdev_children); 4044 raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra) + 4045 sizeof (zio_t *) * writes, KM_SLEEP); 4046 rra->rra_vre = vre; 4047 rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock, 4048 offset, size, RL_WRITER); 4049 rra->rra_txg = dmu_tx_get_txg(tx); 4050 rra->rra_ashift = ashift; 4051 rra->rra_tbd = reads; 4052 rra->rra_writes = writes; 4053 4054 raidz_reflow_record_progress(vre, offset + size, tx); 4055 4056 /* 4057 * SCL_STATE will be released when the read and write are done, 4058 * by raidz_reflow_write_done(). 4059 */ 4060 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4061 4062 /* check if a replacing vdev was added, if so treat it as an error */ 4063 if (vdev_raidz_expand_child_replacing(vd)) { 4064 zfs_dbgmsg("replacing vdev encountered, reflow paused at " 4065 "offset=%llu txg=%llu", 4066 (long long)rra->rra_lr->lr_offset, 4067 (long long)rra->rra_txg); 4068 4069 mutex_enter(&vre->vre_lock); 4070 vre->vre_failed_offset = 4071 MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); 4072 cv_signal(&vre->vre_cv); 4073 mutex_exit(&vre->vre_lock); 4074 4075 /* drop everything we acquired */ 4076 spa_config_exit(spa, SCL_STATE, spa); 4077 zfs_rangelock_exit(rra->rra_lr); 4078 kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * writes); 4079 return (B_TRUE); 4080 } 4081 4082 mutex_enter(&vre->vre_lock); 4083 vre->vre_outstanding_bytes += size; 4084 mutex_exit(&vre->vre_lock); 4085 4086 /* Allocate ABD and ZIO for each child we write. */ 4087 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 4088 zio_t *pio = spa->spa_txg_zio[txgoff]; 4089 uint_t b = blocks / vd->vdev_children; 4090 uint_t bb = blocks % vd->vdev_children; 4091 for (uint_t i = 0; i < writes; i++) { 4092 uint_t n = b + (i < bb); 4093 abd_t *abd = abd_alloc_for_io(n << ashift, B_FALSE); 4094 rra->rra_zio[i] = zio_vdev_child_io(pio, NULL, 4095 vd->vdev_child[(blkid + i) % vd->vdev_children], 4096 ((blkid + i) / vd->vdev_children) << ashift, 4097 abd, n << ashift, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, 4098 ZIO_FLAG_CANFAIL, raidz_reflow_write_done, rra); 4099 } 4100 4101 /* 4102 * Allocate and issue ZIO for each child we read. For reads of only 4103 * one block we can use respective writer ABDs, since they will also 4104 * have only one block. For bigger reads create gang ABDs and fill 4105 * them with respective blocks from writer ABDs. 
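 *
 * Sketch of the interleaving with invented numbers: for
 * old_children = 2, vdev_children = 3 and blocks = 6 starting at blkid,
 * read 0 returns the old child's rows holding relative blocks 0, 2 and
 * 4; block 0 belongs at the start of write 0's ABD, block 2 at the
 * start of write 2's ABD, and block 4 at offset 1 << ashift within
 * write 1's ABD, which is exactly where the gang children constructed
 * below point.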
4106 */ 4107 b = blocks / old_children; 4108 bb = blocks % old_children; 4109 for (uint_t i = 0; i < reads; i++) { 4110 uint_t n = b + (i < bb); 4111 abd_t *abd; 4112 if (n > 1) { 4113 abd = abd_alloc_gang(); 4114 for (uint_t j = 0; j < n; j++) { 4115 uint_t b = j * old_children + i; 4116 abd_t *cabd = abd_get_offset_size( 4117 rra->rra_zio[b % vd->vdev_children]->io_abd, 4118 (b / vd->vdev_children) << ashift, 4119 1 << ashift); 4120 abd_gang_add(abd, cabd, B_TRUE); 4121 } 4122 } else { 4123 abd = rra->rra_zio[i]->io_abd; 4124 } 4125 zio_nowait(zio_vdev_child_io(pio, NULL, 4126 vd->vdev_child[(blkid + i) % old_children], 4127 ((blkid + i) / old_children) << ashift, abd, 4128 n << ashift, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 4129 ZIO_FLAG_CANFAIL, raidz_reflow_read_done, rra)); 4130 } 4131 4132 return (B_FALSE); 4133 } 4134 4135 /* 4136 * For testing (ztest specific) 4137 */ 4138 static void 4139 raidz_expand_pause(uint_t pause_point) 4140 { 4141 while (raidz_expand_pause_point != 0 && 4142 raidz_expand_pause_point <= pause_point) 4143 delay(hz); 4144 } 4145 4146 static void 4147 raidz_scratch_child_done(zio_t *zio) 4148 { 4149 zio_t *pio = zio->io_private; 4150 4151 mutex_enter(&pio->io_lock); 4152 pio->io_error = zio_worst_error(pio->io_error, zio->io_error); 4153 mutex_exit(&pio->io_lock); 4154 } 4155 4156 /* 4157 * Reflow the beginning portion of the vdev into an intermediate scratch area 4158 * in memory and on disk. This operation must be persisted on disk before we 4159 * proceed to overwrite the beginning portion with the reflowed data. 4160 * 4161 * This multi-step task can fail to complete if disk errors are encountered 4162 * and we can return here after a pause (waiting for disk to become healthy). 4163 */ 4164 static void 4165 raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) 4166 { 4167 vdev_raidz_expand_t *vre = arg; 4168 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 4169 zio_t *pio; 4170 int error; 4171 4172 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4173 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4174 int ashift = raidvd->vdev_ashift; 4175 uint64_t write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << ashift, 4176 uint64_t); 4177 uint64_t logical_size = write_size * raidvd->vdev_children; 4178 uint64_t read_size = 4179 P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)), 4180 1 << ashift); 4181 4182 /* 4183 * The scratch space must be large enough to get us to the point 4184 * that one row does not overlap itself when moved. This is checked 4185 * by vdev_raidz_attach_check(). 4186 */ 4187 VERIFY3U(write_size, >=, raidvd->vdev_children << ashift); 4188 VERIFY3U(write_size, <=, VDEV_BOOT_SIZE); 4189 VERIFY3U(write_size, <=, read_size); 4190 4191 zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, 4192 0, logical_size, RL_WRITER); 4193 4194 abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), 4195 KM_SLEEP); 4196 for (int i = 0; i < raidvd->vdev_children; i++) { 4197 abds[i] = abd_alloc_linear(read_size, B_FALSE); 4198 } 4199 4200 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1); 4201 4202 /* 4203 * If we have already written the scratch area then we must read from 4204 * there, since new writes were redirected there while we were paused 4205 * or the original location may have been partially overwritten with 4206 * reflowed data. 4207 */ 4208 if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) { 4209 VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size); 4210 /* 4211 * Read from scratch space. 
4212 */ 4213 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4214 for (int i = 0; i < raidvd->vdev_children; i++) { 4215 /* 4216 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE 4217 * to the offset to calculate the physical offset to 4218 * write to. Passing in a negative offset makes us 4219 * access the scratch area. 4220 */ 4221 zio_nowait(zio_vdev_child_io(pio, NULL, 4222 raidvd->vdev_child[i], 4223 VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], 4224 write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 4225 ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); 4226 } 4227 error = zio_wait(pio); 4228 if (error != 0) { 4229 zfs_dbgmsg("reflow: error %d reading scratch location", 4230 error); 4231 goto io_error_exit; 4232 } 4233 goto overwrite; 4234 } 4235 4236 /* 4237 * Read from original location. 4238 */ 4239 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4240 for (int i = 0; i < raidvd->vdev_children - 1; i++) { 4241 ASSERT0(vdev_is_dead(raidvd->vdev_child[i])); 4242 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4243 0, abds[i], read_size, ZIO_TYPE_READ, 4244 ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, 4245 raidz_scratch_child_done, pio)); 4246 } 4247 error = zio_wait(pio); 4248 if (error != 0) { 4249 zfs_dbgmsg("reflow: error %d reading original location", error); 4250 io_error_exit: 4251 for (int i = 0; i < raidvd->vdev_children; i++) 4252 abd_free(abds[i]); 4253 kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); 4254 zfs_rangelock_exit(lr); 4255 spa_config_exit(spa, SCL_STATE, FTAG); 4256 return; 4257 } 4258 4259 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2); 4260 4261 /* 4262 * Reflow in memory. 4263 */ 4264 uint64_t logical_sectors = logical_size >> ashift; 4265 for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) { 4266 int oldchild = i % (raidvd->vdev_children - 1); 4267 uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift; 4268 4269 int newchild = i % raidvd->vdev_children; 4270 uint64_t newoff = (i / raidvd->vdev_children) << ashift; 4271 4272 /* a single sector should not be copying over itself */ 4273 ASSERT(!(newchild == oldchild && newoff == oldoff)); 4274 4275 abd_copy_off(abds[newchild], abds[oldchild], 4276 newoff, oldoff, 1 << ashift); 4277 } 4278 4279 /* 4280 * Verify that we filled in everything we intended to (write_size on 4281 * each child). 4282 */ 4283 VERIFY0(logical_sectors % raidvd->vdev_children); 4284 VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==, 4285 write_size); 4286 4287 /* 4288 * Write to scratch location (boot area). 4289 */ 4290 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4291 for (int i = 0; i < raidvd->vdev_children; i++) { 4292 /* 4293 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to 4294 * the offset to calculate the physical offset to write to. 4295 * Passing in a negative offset lets us access the boot area. 
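 *
 * In other words, the VDEV_LABEL_START_SIZE added by
 * zio_vdev_child_io() cancels the one subtracted here, so the child
 * I/O lands at physical offset VDEV_BOOT_OFFSET, the start of the
 * reserved boot region that sits below the vdev's normal allocatable
 * space.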
4296 */ 4297 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4298 VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], 4299 write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, 4300 ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); 4301 } 4302 error = zio_wait(pio); 4303 if (error != 0) { 4304 zfs_dbgmsg("reflow: error %d writing scratch location", error); 4305 goto io_error_exit; 4306 } 4307 pio = zio_root(spa, NULL, NULL, 0); 4308 zio_flush(pio, raidvd); 4309 zio_wait(pio); 4310 4311 zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area", 4312 (long long)logical_size); 4313 4314 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3); 4315 4316 /* 4317 * Update uberblock to indicate that scratch space is valid. This is 4318 * needed because after this point, the real location may be 4319 * overwritten. If we crash, we need to get the data from the 4320 * scratch space, rather than the real location. 4321 * 4322 * Note: ub_timestamp is bumped so that vdev_uberblock_compare() 4323 * will prefer this uberblock. 4324 */ 4325 RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size); 4326 spa->spa_ubsync.ub_timestamp++; 4327 ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, 4328 &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); 4329 if (spa_multihost(spa)) 4330 mmp_update_uberblock(spa, &spa->spa_ubsync); 4331 4332 zfs_dbgmsg("reflow: uberblock updated " 4333 "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)", 4334 (long long)spa->spa_ubsync.ub_txg, 4335 (long long)logical_size, 4336 (long long)spa->spa_ubsync.ub_timestamp); 4337 4338 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID); 4339 4340 /* 4341 * Overwrite with reflow'ed data. 4342 */ 4343 overwrite: 4344 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4345 for (int i = 0; i < raidvd->vdev_children; i++) { 4346 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4347 0, abds[i], write_size, ZIO_TYPE_WRITE, 4348 ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, 4349 raidz_scratch_child_done, pio)); 4350 } 4351 error = zio_wait(pio); 4352 if (error != 0) { 4353 /* 4354 * When we exit early here and drop the range lock, new 4355 * writes will go into the scratch area so we'll need to 4356 * read from there when we return after pausing. 4357 */ 4358 zfs_dbgmsg("reflow: error %d writing real location", error); 4359 /* 4360 * Update the uberblock that is written when this txg completes. 4361 */ 4362 RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID, 4363 logical_size); 4364 goto io_error_exit; 4365 } 4366 pio = zio_root(spa, NULL, NULL, 0); 4367 zio_flush(pio, raidvd); 4368 zio_wait(pio); 4369 4370 zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location", 4371 (long long)logical_size); 4372 for (int i = 0; i < raidvd->vdev_children; i++) 4373 abd_free(abds[i]); 4374 kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); 4375 4376 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED); 4377 4378 /* 4379 * Update uberblock to indicate that the initial part has been 4380 * reflow'ed. This is needed because after this point (when we exit 4381 * the rangelock), we allow regular writes to this region, which will 4382 * be written to the new location only (because reflow_offset_next == 4383 * reflow_offset_synced). If we crashed and re-copied from the 4384 * scratch space, we would lose the regular writes. 
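 *
 * Taken together, the scratch-related uberblock states move through
 * RRSS_SCRATCH_VALID (the scratch copy is authoritative) ->
 * RRSS_SCRATCH_INVALID_SYNCED (set just below, once the real location
 * has been rewritten) -> RRSS_SCRATCH_INVALID_SYNCED_REFLOW (set by
 * raidz_reflow_sync() as regular reflow progress is committed), with
 * RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT used instead when the copy is
 * completed during import by vdev_raidz_reflow_copy_scratch().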
4385 */ 4386 RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED, 4387 logical_size); 4388 spa->spa_ubsync.ub_timestamp++; 4389 ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, 4390 &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); 4391 if (spa_multihost(spa)) 4392 mmp_update_uberblock(spa, &spa->spa_ubsync); 4393 4394 zfs_dbgmsg("reflow: uberblock updated " 4395 "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", 4396 (long long)spa->spa_ubsync.ub_txg, 4397 (long long)logical_size, 4398 (long long)spa->spa_ubsync.ub_timestamp); 4399 4400 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1); 4401 4402 /* 4403 * Update progress. 4404 */ 4405 vre->vre_offset = logical_size; 4406 zfs_rangelock_exit(lr); 4407 spa_config_exit(spa, SCL_STATE, FTAG); 4408 4409 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 4410 vre->vre_offset_pertxg[txgoff] = vre->vre_offset; 4411 vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; 4412 /* 4413 * Note - raidz_reflow_sync() will update the uberblock state to 4414 * RRSS_SCRATCH_INVALID_SYNCED_REFLOW 4415 */ 4416 raidz_reflow_sync(spa, tx); 4417 4418 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2); 4419 } 4420 4421 /* 4422 * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work 4423 * here. No other i/o can be in progress, so we don't need the vre_rangelock. 4424 */ 4425 void 4426 vdev_raidz_reflow_copy_scratch(spa_t *spa) 4427 { 4428 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4429 uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock); 4430 ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID); 4431 4432 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4433 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4434 ASSERT0(logical_size % raidvd->vdev_children); 4435 uint64_t write_size = logical_size / raidvd->vdev_children; 4436 4437 zio_t *pio; 4438 4439 /* 4440 * Read from scratch space. 4441 */ 4442 abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), 4443 KM_SLEEP); 4444 for (int i = 0; i < raidvd->vdev_children; i++) { 4445 abds[i] = abd_alloc_linear(write_size, B_FALSE); 4446 } 4447 4448 pio = zio_root(spa, NULL, NULL, 0); 4449 for (int i = 0; i < raidvd->vdev_children; i++) { 4450 /* 4451 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to 4452 * the offset to calculate the physical offset to write to. 4453 * Passing in a negative offset lets us access the boot area. 4454 */ 4455 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4456 VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], 4457 write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 0, 4458 raidz_scratch_child_done, pio)); 4459 } 4460 zio_wait(pio); 4461 4462 /* 4463 * Overwrite real location with reflow'ed data. 4464 */ 4465 pio = zio_root(spa, NULL, NULL, 0); 4466 for (int i = 0; i < raidvd->vdev_children; i++) { 4467 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4468 0, abds[i], write_size, ZIO_TYPE_WRITE, 4469 ZIO_PRIORITY_REMOVAL, 0, 4470 raidz_scratch_child_done, pio)); 4471 } 4472 zio_wait(pio); 4473 pio = zio_root(spa, NULL, NULL, 0); 4474 zio_flush(pio, raidvd); 4475 zio_wait(pio); 4476 4477 zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) " 4478 "to real location", (long long)logical_size); 4479 4480 for (int i = 0; i < raidvd->vdev_children; i++) 4481 abd_free(abds[i]); 4482 kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); 4483 4484 /* 4485 * Update uberblock. 
4486 */ 4487 RAIDZ_REFLOW_SET(&spa->spa_ubsync, 4488 RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size); 4489 spa->spa_ubsync.ub_timestamp++; 4490 VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, 4491 &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); 4492 if (spa_multihost(spa)) 4493 mmp_update_uberblock(spa, &spa->spa_ubsync); 4494 4495 zfs_dbgmsg("reflow recovery: uberblock updated " 4496 "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", 4497 (long long)spa->spa_ubsync.ub_txg, 4498 (long long)logical_size, 4499 (long long)spa->spa_ubsync.ub_timestamp); 4500 4501 dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, 4502 spa_first_txg(spa)); 4503 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 4504 vre->vre_offset = logical_size; 4505 vre->vre_offset_pertxg[txgoff] = vre->vre_offset; 4506 vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; 4507 /* 4508 * Note that raidz_reflow_sync() will update the uberblock once more 4509 */ 4510 raidz_reflow_sync(spa, tx); 4511 4512 dmu_tx_commit(tx); 4513 4514 spa_config_exit(spa, SCL_STATE, FTAG); 4515 } 4516 4517 static boolean_t 4518 spa_raidz_expand_thread_check(void *arg, zthr_t *zthr) 4519 { 4520 (void) zthr; 4521 spa_t *spa = arg; 4522 4523 return (spa->spa_raidz_expand != NULL && 4524 !spa->spa_raidz_expand->vre_waiting_for_resilver); 4525 } 4526 4527 /* 4528 * RAIDZ expansion background thread 4529 * 4530 * Can be called multiple times if the reflow is paused 4531 */ 4532 static void 4533 spa_raidz_expand_thread(void *arg, zthr_t *zthr) 4534 { 4535 spa_t *spa = arg; 4536 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4537 4538 if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) 4539 vre->vre_offset = 0; 4540 else 4541 vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync); 4542 4543 /* Reflow the begining portion using the scratch area */ 4544 if (vre->vre_offset == 0) { 4545 VERIFY0(dsl_sync_task(spa_name(spa), 4546 NULL, raidz_reflow_scratch_sync, 4547 vre, 0, ZFS_SPACE_CHECK_NONE)); 4548 4549 /* if we encountered errors then pause */ 4550 if (vre->vre_offset == 0) { 4551 mutex_enter(&vre->vre_lock); 4552 vre->vre_waiting_for_resilver = B_TRUE; 4553 mutex_exit(&vre->vre_lock); 4554 return; 4555 } 4556 } 4557 4558 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4559 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4560 4561 uint64_t guid = raidvd->vdev_guid; 4562 4563 /* Iterate over all the remaining metaslabs */ 4564 for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift; 4565 i < raidvd->vdev_ms_count && 4566 !zthr_iscancelled(zthr) && 4567 vre->vre_failed_offset == UINT64_MAX; i++) { 4568 metaslab_t *msp = raidvd->vdev_ms[i]; 4569 4570 metaslab_disable(msp); 4571 mutex_enter(&msp->ms_lock); 4572 4573 /* 4574 * The metaslab may be newly created (for the expanded 4575 * space), in which case its trees won't exist yet, 4576 * so we need to bail out early. 4577 */ 4578 if (msp->ms_new) { 4579 mutex_exit(&msp->ms_lock); 4580 metaslab_enable(msp, B_FALSE, B_FALSE); 4581 continue; 4582 } 4583 4584 VERIFY0(metaslab_load(msp)); 4585 4586 /* 4587 * We want to copy everything except the free (allocatable) 4588 * space. Note that there may be a little bit more free 4589 * space (e.g. in ms_defer), and it's fine to copy that too. 
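 *
 * Concretely, the range tree built below starts out covering the whole
 * metaslab [ms_start, ms_start + ms_size) and each ms_allocatable
 * segment is then removed from it, leaving only the allocated (plus
 * some deferred-free) space to be copied.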
4590 */ 4591 uint64_t shift, start; 4592 zfs_range_seg_type_t type = metaslab_calculate_range_tree_type( 4593 raidvd, msp, &start, &shift); 4594 zfs_range_tree_t *rt = zfs_range_tree_create(NULL, type, NULL, 4595 start, shift); 4596 zfs_range_tree_add(rt, msp->ms_start, msp->ms_size); 4597 zfs_range_tree_walk(msp->ms_allocatable, zfs_range_tree_remove, 4598 rt); 4599 mutex_exit(&msp->ms_lock); 4600 4601 /* 4602 * Force the last sector of each metaslab to be copied. This 4603 * ensures that we advance the on-disk progress to the end of 4604 * this metaslab while the metaslab is disabled. Otherwise, we 4605 * could move past this metaslab without advancing the on-disk 4606 * progress, and then an allocation to this metaslab would not 4607 * be copied. 4608 */ 4609 int sectorsz = 1 << raidvd->vdev_ashift; 4610 uint64_t ms_last_offset = msp->ms_start + 4611 msp->ms_size - sectorsz; 4612 if (!zfs_range_tree_contains(rt, ms_last_offset, sectorsz)) { 4613 zfs_range_tree_add(rt, ms_last_offset, sectorsz); 4614 } 4615 4616 /* 4617 * When we are resuming from a paused expansion (i.e. 4618 * when importing a pool with a expansion in progress), 4619 * discard any state that we have already processed. 4620 */ 4621 if (vre->vre_offset > msp->ms_start) { 4622 zfs_range_tree_clear(rt, msp->ms_start, 4623 vre->vre_offset - msp->ms_start); 4624 } 4625 4626 while (!zthr_iscancelled(zthr) && 4627 !zfs_range_tree_is_empty(rt) && 4628 vre->vre_failed_offset == UINT64_MAX) { 4629 4630 /* 4631 * We need to periodically drop the config lock so that 4632 * writers can get in. Additionally, we can't wait 4633 * for a txg to sync while holding a config lock 4634 * (since a waiting writer could cause a 3-way deadlock 4635 * with the sync thread, which also gets a config 4636 * lock for reader). So we can't hold the config lock 4637 * while calling dmu_tx_assign(). 4638 */ 4639 spa_config_exit(spa, SCL_CONFIG, FTAG); 4640 4641 /* 4642 * If requested, pause the reflow when the amount 4643 * specified by raidz_expand_max_reflow_bytes is reached 4644 * 4645 * This pause is only used during testing or debugging. 4646 */ 4647 while (raidz_expand_max_reflow_bytes != 0 && 4648 raidz_expand_max_reflow_bytes <= 4649 vre->vre_bytes_copied && !zthr_iscancelled(zthr)) { 4650 delay(hz); 4651 } 4652 4653 mutex_enter(&vre->vre_lock); 4654 while (vre->vre_outstanding_bytes > 4655 raidz_expand_max_copy_bytes) { 4656 cv_wait(&vre->vre_cv, &vre->vre_lock); 4657 } 4658 mutex_exit(&vre->vre_lock); 4659 4660 dmu_tx_t *tx = 4661 dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 4662 4663 VERIFY0(dmu_tx_assign(tx, 4664 DMU_TX_WAIT | DMU_TX_SUSPEND)); 4665 uint64_t txg = dmu_tx_get_txg(tx); 4666 4667 /* 4668 * Reacquire the vdev_config lock. Theoretically, the 4669 * vdev_t that we're expanding may have changed. 
4670 */ 4671 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4672 raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4673 4674 boolean_t needsync = 4675 raidz_reflow_impl(raidvd, vre, rt, tx); 4676 4677 dmu_tx_commit(tx); 4678 4679 if (needsync) { 4680 spa_config_exit(spa, SCL_CONFIG, FTAG); 4681 txg_wait_synced(spa->spa_dsl_pool, txg); 4682 spa_config_enter(spa, SCL_CONFIG, FTAG, 4683 RW_READER); 4684 } 4685 } 4686 4687 spa_config_exit(spa, SCL_CONFIG, FTAG); 4688 4689 metaslab_enable(msp, B_FALSE, B_FALSE); 4690 zfs_range_tree_vacate(rt, NULL, NULL); 4691 zfs_range_tree_destroy(rt); 4692 4693 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4694 raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4695 } 4696 4697 spa_config_exit(spa, SCL_CONFIG, FTAG); 4698 4699 /* 4700 * The txg_wait_synced() here ensures that all reflow zio's have 4701 * completed, and vre_failed_offset has been set if necessary. It 4702 * also ensures that the progress of the last raidz_reflow_sync() is 4703 * written to disk before raidz_reflow_complete_sync() changes the 4704 * in-memory vre_state. vdev_raidz_io_start() uses vre_state to 4705 * determine if a reflow is in progress, in which case we may need to 4706 * write to both old and new locations. Therefore we can only change 4707 * vre_state once this is not necessary, which is once the on-disk 4708 * progress (in spa_ubsync) has been set past any possible writes (to 4709 * the end of the last metaslab). 4710 */ 4711 txg_wait_synced(spa->spa_dsl_pool, 0); 4712 4713 if (!zthr_iscancelled(zthr) && 4714 vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) { 4715 /* 4716 * We are not being canceled or paused, so the reflow must be 4717 * complete. In that case also mark it as completed on disk. 4718 */ 4719 ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX); 4720 VERIFY0(dsl_sync_task(spa_name(spa), NULL, 4721 raidz_reflow_complete_sync, spa, 4722 0, ZFS_SPACE_CHECK_NONE)); 4723 (void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL); 4724 } else { 4725 /* 4726 * Wait for all copy zio's to complete and for all the 4727 * raidz_reflow_sync() synctasks to be run. 4728 */ 4729 spa_history_log_internal(spa, "reflow pause", 4730 NULL, "offset=%llu failed_offset=%lld", 4731 (long long)vre->vre_offset, 4732 (long long)vre->vre_failed_offset); 4733 mutex_enter(&vre->vre_lock); 4734 if (vre->vre_failed_offset != UINT64_MAX) { 4735 /* 4736 * Reset progress so that we will retry everything 4737 * after the point that something failed. 
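 *
 * While vre_waiting_for_resilver is set, spa_raidz_expand_thread_check()
 * keeps this zthr idle; raidz_dtl_reassessed() clears the flag and
 * wakes the thread once the DTLs show the vdev is healthy again, and
 * the next pass restarts from the progress recorded in the synced
 * uberblock, which raidz_reflow_sync() never advanced past the failed
 * offset.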
4738 */ 4739 vre->vre_offset = vre->vre_failed_offset; 4740 vre->vre_failed_offset = UINT64_MAX; 4741 vre->vre_waiting_for_resilver = B_TRUE; 4742 } 4743 mutex_exit(&vre->vre_lock); 4744 } 4745 } 4746 4747 void 4748 spa_start_raidz_expansion_thread(spa_t *spa) 4749 { 4750 ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL); 4751 spa->spa_raidz_expand_zthr = zthr_create("raidz_expand", 4752 spa_raidz_expand_thread_check, spa_raidz_expand_thread, 4753 spa, defclsyspri); 4754 } 4755 4756 void 4757 raidz_dtl_reassessed(vdev_t *vd) 4758 { 4759 spa_t *spa = vd->vdev_spa; 4760 if (spa->spa_raidz_expand != NULL) { 4761 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4762 /* 4763 * we get called often from vdev_dtl_reassess() so make 4764 * sure it's our vdev and any replacing is complete 4765 */ 4766 if (vd->vdev_top->vdev_id == vre->vre_vdev_id && 4767 !vdev_raidz_expand_child_replacing(vd->vdev_top)) { 4768 mutex_enter(&vre->vre_lock); 4769 if (vre->vre_waiting_for_resilver) { 4770 vdev_dbgmsg(vd, "DTL reassessed, " 4771 "continuing raidz expansion"); 4772 vre->vre_waiting_for_resilver = B_FALSE; 4773 zthr_wakeup(spa->spa_raidz_expand_zthr); 4774 } 4775 mutex_exit(&vre->vre_lock); 4776 } 4777 } 4778 } 4779 4780 int 4781 vdev_raidz_attach_check(vdev_t *new_child) 4782 { 4783 vdev_t *raidvd = new_child->vdev_parent; 4784 uint64_t new_children = raidvd->vdev_children; 4785 4786 /* 4787 * We use the "boot" space as scratch space to handle overwriting the 4788 * initial part of the vdev. If it is too small, then this expansion 4789 * is not allowed. This would be very unusual (e.g. ashift > 13 and 4790 * >200 children). 4791 */ 4792 if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) { 4793 return (EINVAL); 4794 } 4795 return (0); 4796 } 4797 4798 void 4799 vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx) 4800 { 4801 vdev_t *new_child = arg; 4802 spa_t *spa = new_child->vdev_spa; 4803 vdev_t *raidvd = new_child->vdev_parent; 4804 vdev_raidz_t *vdrz = raidvd->vdev_tsd; 4805 ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops); 4806 ASSERT3P(raidvd->vdev_top, ==, raidvd); 4807 ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width); 4808 ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1); 4809 ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==, 4810 new_child); 4811 4812 spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx); 4813 4814 vdrz->vd_physical_width++; 4815 4816 VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info); 4817 vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id; 4818 vdrz->vn_vre.vre_offset = 0; 4819 vdrz->vn_vre.vre_failed_offset = UINT64_MAX; 4820 spa->spa_raidz_expand = &vdrz->vn_vre; 4821 zthr_wakeup(spa->spa_raidz_expand_zthr); 4822 4823 /* 4824 * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get 4825 * written to the config. 
4826 */ 4827 vdev_config_dirty(raidvd); 4828 4829 vdrz->vn_vre.vre_start_time = gethrestime_sec(); 4830 vdrz->vn_vre.vre_end_time = 0; 4831 vdrz->vn_vre.vre_state = DSS_SCANNING; 4832 vdrz->vn_vre.vre_bytes_copied = 0; 4833 4834 uint64_t state = vdrz->vn_vre.vre_state; 4835 VERIFY0(zap_update(spa->spa_meta_objset, 4836 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, 4837 sizeof (state), 1, &state, tx)); 4838 4839 uint64_t start_time = vdrz->vn_vre.vre_start_time; 4840 VERIFY0(zap_update(spa->spa_meta_objset, 4841 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, 4842 sizeof (start_time), 1, &start_time, tx)); 4843 4844 (void) zap_remove(spa->spa_meta_objset, 4845 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx); 4846 (void) zap_remove(spa->spa_meta_objset, 4847 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx); 4848 4849 spa_history_log_internal(spa, "raidz vdev expansion started", tx, 4850 "%s vdev %llu new width %llu", spa_name(spa), 4851 (unsigned long long)raidvd->vdev_id, 4852 (unsigned long long)raidvd->vdev_children); 4853 } 4854 4855 int 4856 vdev_raidz_load(vdev_t *vd) 4857 { 4858 vdev_raidz_t *vdrz = vd->vdev_tsd; 4859 int err; 4860 4861 uint64_t state = DSS_NONE; 4862 uint64_t start_time = 0; 4863 uint64_t end_time = 0; 4864 uint64_t bytes_copied = 0; 4865 4866 if (vd->vdev_top_zap != 0) { 4867 err = zap_lookup(vd->vdev_spa->spa_meta_objset, 4868 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, 4869 sizeof (state), 1, &state); 4870 if (err != 0 && err != ENOENT) 4871 return (err); 4872 4873 err = zap_lookup(vd->vdev_spa->spa_meta_objset, 4874 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, 4875 sizeof (start_time), 1, &start_time); 4876 if (err != 0 && err != ENOENT) 4877 return (err); 4878 4879 err = zap_lookup(vd->vdev_spa->spa_meta_objset, 4880 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, 4881 sizeof (end_time), 1, &end_time); 4882 if (err != 0 && err != ENOENT) 4883 return (err); 4884 4885 err = zap_lookup(vd->vdev_spa->spa_meta_objset, 4886 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, 4887 sizeof (bytes_copied), 1, &bytes_copied); 4888 if (err != 0 && err != ENOENT) 4889 return (err); 4890 } 4891 4892 /* 4893 * If we are in the middle of expansion, vre_state should have 4894 * already been set by vdev_raidz_init(). 
4895 */ 4896 EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING); 4897 vdrz->vn_vre.vre_state = (dsl_scan_state_t)state; 4898 vdrz->vn_vre.vre_start_time = start_time; 4899 vdrz->vn_vre.vre_end_time = end_time; 4900 vdrz->vn_vre.vre_bytes_copied = bytes_copied; 4901 4902 return (0); 4903 } 4904 4905 int 4906 spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres) 4907 { 4908 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4909 4910 if (vre == NULL) { 4911 /* no removal in progress; find most recent completed */ 4912 for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { 4913 vdev_t *vd = spa->spa_root_vdev->vdev_child[c]; 4914 if (vd->vdev_ops == &vdev_raidz_ops) { 4915 vdev_raidz_t *vdrz = vd->vdev_tsd; 4916 4917 if (vdrz->vn_vre.vre_end_time != 0 && 4918 (vre == NULL || 4919 vdrz->vn_vre.vre_end_time > 4920 vre->vre_end_time)) { 4921 vre = &vdrz->vn_vre; 4922 } 4923 } 4924 } 4925 } 4926 4927 if (vre == NULL) { 4928 return (SET_ERROR(ENOENT)); 4929 } 4930 4931 pres->pres_state = vre->vre_state; 4932 pres->pres_expanding_vdev = vre->vre_vdev_id; 4933 4934 vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); 4935 pres->pres_to_reflow = vd->vdev_stat.vs_alloc; 4936 4937 mutex_enter(&vre->vre_lock); 4938 pres->pres_reflowed = vre->vre_bytes_copied; 4939 for (int i = 0; i < TXG_SIZE; i++) 4940 pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i]; 4941 mutex_exit(&vre->vre_lock); 4942 4943 pres->pres_start_time = vre->vre_start_time; 4944 pres->pres_end_time = vre->vre_end_time; 4945 pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver; 4946 4947 return (0); 4948 } 4949 4950 /* 4951 * Initialize private RAIDZ specific fields from the nvlist. 4952 */ 4953 static int 4954 vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) 4955 { 4956 uint_t children; 4957 nvlist_t **child; 4958 int error = nvlist_lookup_nvlist_array(nv, 4959 ZPOOL_CONFIG_CHILDREN, &child, &children); 4960 if (error != 0) 4961 return (SET_ERROR(EINVAL)); 4962 4963 uint64_t nparity; 4964 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { 4965 if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) 4966 return (SET_ERROR(EINVAL)); 4967 4968 /* 4969 * Previous versions could only support 1 or 2 parity 4970 * device. 4971 */ 4972 if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2) 4973 return (SET_ERROR(EINVAL)); 4974 else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3) 4975 return (SET_ERROR(EINVAL)); 4976 } else { 4977 /* 4978 * We require the parity to be specified for SPAs that 4979 * support multiple parity levels. 4980 */ 4981 if (spa_version(spa) >= SPA_VERSION_RAIDZ2) 4982 return (SET_ERROR(EINVAL)); 4983 4984 /* 4985 * Otherwise, we default to 1 parity device for RAID-Z. 
4986 */ 4987 nparity = 1; 4988 } 4989 4990 vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP); 4991 vdrz->vn_vre.vre_vdev_id = -1; 4992 vdrz->vn_vre.vre_offset = UINT64_MAX; 4993 vdrz->vn_vre.vre_failed_offset = UINT64_MAX; 4994 mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL); 4995 cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL); 4996 zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL); 4997 mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL); 4998 avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare, 4999 sizeof (reflow_node_t), offsetof(reflow_node_t, re_link)); 5000 5001 vdrz->vd_physical_width = children; 5002 vdrz->vd_nparity = nparity; 5003 5004 /* note, the ID does not exist when creating a pool */ 5005 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, 5006 &vdrz->vn_vre.vre_vdev_id); 5007 5008 boolean_t reflow_in_progress = 5009 nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); 5010 if (reflow_in_progress) { 5011 spa->spa_raidz_expand = &vdrz->vn_vre; 5012 vdrz->vn_vre.vre_state = DSS_SCANNING; 5013 } 5014 5015 vdrz->vd_original_width = children; 5016 uint64_t *txgs; 5017 unsigned int txgs_size = 0; 5018 error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, 5019 &txgs, &txgs_size); 5020 if (error == 0) { 5021 for (int i = 0; i < txgs_size; i++) { 5022 reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); 5023 re->re_txg = txgs[txgs_size - i - 1]; 5024 re->re_logical_width = vdrz->vd_physical_width - i; 5025 5026 if (reflow_in_progress) 5027 re->re_logical_width--; 5028 5029 avl_add(&vdrz->vd_expand_txgs, re); 5030 } 5031 5032 vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size; 5033 } 5034 if (reflow_in_progress) { 5035 vdrz->vd_original_width--; 5036 zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions", 5037 children, txgs_size); 5038 } 5039 5040 *tsd = vdrz; 5041 5042 return (0); 5043 } 5044 5045 static void 5046 vdev_raidz_fini(vdev_t *vd) 5047 { 5048 vdev_raidz_t *vdrz = vd->vdev_tsd; 5049 if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre) 5050 vd->vdev_spa->spa_raidz_expand = NULL; 5051 reflow_node_t *re; 5052 void *cookie = NULL; 5053 avl_tree_t *tree = &vdrz->vd_expand_txgs; 5054 while ((re = avl_destroy_nodes(tree, &cookie)) != NULL) 5055 kmem_free(re, sizeof (*re)); 5056 avl_destroy(&vdrz->vd_expand_txgs); 5057 mutex_destroy(&vdrz->vd_expand_lock); 5058 mutex_destroy(&vdrz->vn_vre.vre_lock); 5059 cv_destroy(&vdrz->vn_vre.vre_cv); 5060 zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock); 5061 kmem_free(vdrz, sizeof (*vdrz)); 5062 } 5063 5064 /* 5065 * Add RAIDZ specific fields to the config nvlist. 5066 */ 5067 static void 5068 vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) 5069 { 5070 ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops); 5071 vdev_raidz_t *vdrz = vd->vdev_tsd; 5072 5073 /* 5074 * Make sure someone hasn't managed to sneak a fancy new vdev 5075 * into a crufty old storage pool. 5076 */ 5077 ASSERT(vdrz->vd_nparity == 1 || 5078 (vdrz->vd_nparity <= 2 && 5079 spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) || 5080 (vdrz->vd_nparity <= 3 && 5081 spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3)); 5082 5083 /* 5084 * Note that we'll add these even on storage pools where they 5085 * aren't strictly required -- older software will just ignore 5086 * it. 
5087 */ 5088 fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity); 5089 5090 if (vdrz->vn_vre.vre_state == DSS_SCANNING) { 5091 fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); 5092 } 5093 5094 mutex_enter(&vdrz->vd_expand_lock); 5095 if (!avl_is_empty(&vdrz->vd_expand_txgs)) { 5096 uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs); 5097 uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count, 5098 KM_SLEEP); 5099 uint64_t i = 0; 5100 5101 for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs); 5102 re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) { 5103 txgs[i++] = re->re_txg; 5104 } 5105 5106 fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, 5107 txgs, count); 5108 5109 kmem_free(txgs, sizeof (uint64_t) * count); 5110 } 5111 mutex_exit(&vdrz->vd_expand_lock); 5112 } 5113 5114 static uint64_t 5115 vdev_raidz_nparity(vdev_t *vd) 5116 { 5117 vdev_raidz_t *vdrz = vd->vdev_tsd; 5118 return (vdrz->vd_nparity); 5119 } 5120 5121 static uint64_t 5122 vdev_raidz_ndisks(vdev_t *vd) 5123 { 5124 return (vd->vdev_children); 5125 } 5126 5127 vdev_ops_t vdev_raidz_ops = { 5128 .vdev_op_init = vdev_raidz_init, 5129 .vdev_op_fini = vdev_raidz_fini, 5130 .vdev_op_open = vdev_raidz_open, 5131 .vdev_op_close = vdev_raidz_close, 5132 .vdev_op_psize_to_asize = vdev_raidz_psize_to_asize, 5133 .vdev_op_asize_to_psize = vdev_raidz_asize_to_psize, 5134 .vdev_op_min_asize = vdev_raidz_min_asize, 5135 .vdev_op_min_alloc = NULL, 5136 .vdev_op_io_start = vdev_raidz_io_start, 5137 .vdev_op_io_done = vdev_raidz_io_done, 5138 .vdev_op_state_change = vdev_raidz_state_change, 5139 .vdev_op_need_resilver = vdev_raidz_need_resilver, 5140 .vdev_op_hold = NULL, 5141 .vdev_op_rele = NULL, 5142 .vdev_op_remap = NULL, 5143 .vdev_op_xlate = vdev_raidz_xlate, 5144 .vdev_op_rebuild_asize = NULL, 5145 .vdev_op_metaslab_init = NULL, 5146 .vdev_op_config_generate = vdev_raidz_config_generate, 5147 .vdev_op_nparity = vdev_raidz_nparity, 5148 .vdev_op_ndisks = vdev_raidz_ndisks, 5149 .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */ 5150 .vdev_op_leaf = B_FALSE /* not a leaf vdev */ 5151 }; 5152 5153 ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW, 5154 "For testing, pause RAIDZ expansion after reflowing this many bytes"); 5155 ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW, 5156 "Max amount of concurrent i/o for RAIDZ expansion"); 5157 ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW, 5158 "For expanded RAIDZ, aggregate reads that have more rows than this"); 5159 ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW, 5160 "For expanded RAIDZ, automatically start a pool scrub when expansion " 5161 "completes"); 5162
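
/*
 * Illustrative usage, not part of the code above (device names, pool
 * names and paths here are examples only): an existing raidz vdev is
 * expanded from user space with e.g. `zpool attach tank raidz1-0 sdf`,
 * which eventually reaches vdev_raidz_attach_sync(), and the reflow's
 * progress can be watched with `zpool status`. On Linux the module
 * parameters declared above appear under /sys/module/zfs/parameters/
 * (e.g. raidz_expand_max_copy_bytes).
 */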