/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
 * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zap.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu_tx.h>
#include <sys/abd.h>
#include <sys/zfs_rlock.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
#include <sys/vdev_raidz.h>
#include <sys/vdev_raidz_impl.h>
#include <sys/vdev_draid.h>
#include <sys/uberblock_impl.h>
#include <sys/dsl_scan.h>

#ifdef ZFS_DEBUG
#include <sys/vdev.h>	/* For vdev_xlate() in vdev_raidz_io_verify() */
#endif

/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field with 2^N elements,
 * GF(2^N). In our case we choose N=8, i.e. GF(2^8), so that all elements
 * can be expressed with a single byte. Briefly, the operations on the
 * field are defined as follows:
 *
 * o addition (+) is represented by a bitwise XOR
 * o subtraction (-) is therefore identical to addition: A + B = A - B
 * o multiplication of A by 2 is defined by the following bitwise expression:
 *
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). The inverse of a field element A (A^-1) is therefore
 * A ^ (255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q, R over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property which is why the uncorrected Plank method breaks down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}


/*
 * Big Theory Statement for how a RAIDZ VDEV is expanded
 *
 * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion
 * works with all three RAIDZ parity choices: RAIDZ1, RAIDZ2, and RAIDZ3.
 * VDEVs that have been previously expanded can be expanded again.
 *
 * The RAIDZ VDEV must be healthy (must be able to write to all the drives in
 * the VDEV) when an expansion starts. The expansion will pause if any disk
 * in the VDEV fails, and resume once the VDEV is healthy again. All other
 * operations on the pool can continue while an expansion is in progress
 * (e.g. read/write, snapshot, zpool add, etc), except for zpool checkpoint,
 * zpool trim, and zpool initialize, which can't be run during an expansion.
 * Following a reboot or export/import, the expansion resumes where it left
 * off.
 *
 * == Reflowing the Data ==
 *
 * The expansion involves reflowing (copying) the data from the current set
 * of disks to spread it across the new set which now has one more disk.
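 *
 * To make the copy concrete: a child sector address is just the logical
 * vdev sector number split by the current width, so reflowing a sector
 * amounts to re-evaluating that split with one more column. A minimal
 * sketch of the mapping (illustration only; the real work is done by the
 * reflow and map-allocation code later in this file):
 *
 *	old_child  = sector % old_width;
 *	old_offset = (sector / old_width) << ashift;
 *	new_child  = sector % (old_width + 1);
 *	new_offset = (sector / (old_width + 1)) << ashift;
 *
 * The reflow copies each allocated sector from (old_child, old_offset) to
 * (new_child, new_offset); parity sectors move like any other sector.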
This 165 * reflow operation is similar to reflowing text when the column width of a 166 * text editor window is expanded. The text doesn’t change but the location of 167 * the text changes to accommodate the new width. An example reflow result for 168 * a 4-wide RAIDZ1 to a 5-wide is shown below. 169 * 170 * Reflow End State 171 * Each letter indicates a parity group (logical stripe) 172 * 173 * Before expansion After Expansion 174 * D1 D2 D3 D4 D1 D2 D3 D4 D5 175 * +------+------+------+------+ +------+------+------+------+------+ 176 * | | | | | | | | | | | 177 * | A | A | A | A | | A | A | A | A | B | 178 * | 1| 2| 3| 4| | 1| 2| 3| 4| 5| 179 * +------+------+------+------+ +------+------+------+------+------+ 180 * | | | | | | | | | | | 181 * | B | B | C | C | | B | C | C | C | C | 182 * | 5| 6| 7| 8| | 6| 7| 8| 9| 10| 183 * +------+------+------+------+ +------+------+------+------+------+ 184 * | | | | | | | | | | | 185 * | C | C | D | D | | D | D | E | E | E | 186 * | 9| 10| 11| 12| | 11| 12| 13| 14| 15| 187 * +------+------+------+------+ +------+------+------+------+------+ 188 * | | | | | | | | | | | 189 * | E | E | E | E | --> | E | F | F | G | G | 190 * | 13| 14| 15| 16| | 16| 17| 18|p 19| 20| 191 * +------+------+------+------+ +------+------+------+------+------+ 192 * | | | | | | | | | | | 193 * | F | F | G | G | | G | G | H | H | H | 194 * | 17| 18| 19| 20| | 21| 22| 23| 24| 25| 195 * +------+------+------+------+ +------+------+------+------+------+ 196 * | | | | | | | | | | | 197 * | G | G | H | H | | H | I | I | J | J | 198 * | 21| 22| 23| 24| | 26| 27| 28| 29| 30| 199 * +------+------+------+------+ +------+------+------+------+------+ 200 * | | | | | | | | | | | 201 * | H | H | I | I | | J | J | | | K | 202 * | 25| 26| 27| 28| | 31| 32| 33| 34| 35| 203 * +------+------+------+------+ +------+------+------+------+------+ 204 * 205 * This reflow approach has several advantages. There is no need to read or 206 * modify the block pointers or recompute any block checksums. The reflow 207 * doesn’t need to know where the parity sectors reside. We can read and write 208 * data sequentially and the copy can occur in a background thread in open 209 * context. The design also allows for fast discovery of what data to copy. 210 * 211 * The VDEV metaslabs are processed, one at a time, to copy the block data to 212 * have it flow across all the disks. The metaslab is disabled for allocations 213 * during the copy. As an optimization, we only copy the allocated data which 214 * can be determined by looking at the metaslab range tree. During the copy we 215 * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still 216 * need to be able to survive losing parity count disks). This means we 217 * cannot overwrite data during the reflow that would be needed if a disk is 218 * lost. 219 * 220 * After the reflow completes, all newly-written blocks will have the new 221 * layout, i.e., they will have the parity to data ratio implied by the new 222 * number of disks in the RAIDZ group. Even though the reflow copies all of 223 * the allocated space (data and parity), it is only rearranged, not changed. 224 * 225 * This act of reflowing the data has a few implications about blocks 226 * that were written before the reflow completes: 227 * 228 * - Old blocks will still use the same amount of space (i.e., they will have 229 * the parity to data ratio implied by the old number of disks in the RAIDZ 230 * group). 
 * - Reading old blocks will be slightly slower than before the reflow, for
 *   two reasons. First, we will have to read from all disks in the RAIDZ
 *   VDEV, rather than being able to skip the children that contain only
 *   parity of this block (because the data of a single block is now spread
 *   out across all the disks). Second, in most cases there will be an extra
 *   bcopy, needed to rearrange the data back to its original layout in memory.
 *
 * == Scratch Area ==
 *
 * As we copy the block data, we can only progress to the point that writes
 * will not overlap with blocks whose progress has not yet been recorded on
 * disk. Since partially-copied rows are always read from the old location,
 * we need to stop one row before the sector-wise overlap, to prevent any
 * row-wise overlap. For example, in the diagram above, when we reflow sector
 * B6 it will overwrite the original location for B5.
 *
 * To get around this, a scratch space is used so that we can start copying
 * without risking data loss by overlapping the row. As an added benefit, it
 * improves performance at the beginning of the reflow, but that small
 * performance boost wouldn't be worth the complexity on its own.
 *
 * Ideally we want to copy at least 2 * (new_width)^2 so that we have a
 * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max
 * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice
 * the widths will likely be single digits so we can get a substantial chunk
 * size using only a few MB of scratch per disk.
 *
 * The scratch area is persisted to disk and holds a large amount of reflowed
 * state. We can always read the partially written stripes when a disk fails or
 * the copy is interrupted (crash) during the initial copying phase and also
 * get past a small chunk size restriction. At a minimum, the scratch space
 * must be large enough to get us to the point that one row does not overlap
 * itself when moved (i.e., new_width^2). But going larger is even better. We
 * use the 3.5 MiB reserved "boot" space that resides after the ZFS disk labels
 * as our scratch space to handle overwriting the initial part of the VDEV.
 *
 *	0     256K   512K                    4M
 *	+------+------+-----------------------+-----------------------------
 *	| VDEV | VDEV | Boot Block (3.5M)     | Allocatable space ...
 *	|  L0  |  L1  |       Reserved        | (Metaslabs)
 *	+------+------+-----------------------+-------------------------------
 *	              Scratch Area
 *
 * == Reflow Progress Updates ==
 * After the initial scratch-based reflow, the expansion process works
 * similarly to device removal. We create a new open context thread which
 * reflows the data, and periodically kicks off sync tasks to update logical
 * state. In this case, state is the committed progress (offset of next data
 * to copy). We need to persist the completed offset on disk, so that if we
 * crash we know which format each VDEV offset is in.
 *
 * == Time Dependent Geometry ==
 *
 * In non-expanded RAIDZ, blocks are read from disk in a column by column
 * fashion. For a multi-row block, the second sector is in the first column,
 * not in the second column. This allows us to issue full reads for each
 * column directly into the request buffer. The block data is thus laid out
 * sequentially in a column-by-column fashion.
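 *
 * As a sketch of that layout (ignoring the "remainder" sectors and using
 * the variable names from vdev_raidz_map_alloc() below): with d data
 * columns (d = dcols - nparity) and q = s / d full rows, data column dc is
 * a single contiguous read covering buffer sectors
 *
 *	[ dc * q, (dc + 1) * q )
 *
 * so buffer sector i lives at row (i % q) of data column (i / q), i.e. the
 * block runs down each column before moving on to the next one.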
 *
 * For example, in the before expansion diagram above, one logical block might
 * be sectors G19-H26. The parity is in G19,H23; and the data is in
 * G20,H24,G21,H25,G22,H26.
 *
 * After a block is reflowed, the sectors that were all in the same original
 * column can now reside in different columns. When reading from an expanded
 * VDEV, we need to know the logical stripe width for each block so we can
 * reconstitute the block's data after the reads are completed. Likewise,
 * when we perform the combinatorial reconstruction we need to know the
 * original width so we can retry combinations from the past layouts.
 *
 * Time dependent geometry is what we call having blocks with different layouts
 * (stripe widths) in the same VDEV. This time-dependent geometry uses the
 * block's birth time, together with the recorded time at which each expansion
 * ended, to establish the correct width for a given block. After an expansion
 * completes, we record the time for blocks written with a particular width
 * (geometry).
 *
 * == On Disk Format Changes ==
 *
 * A new pool feature flag, 'raidz_expansion', is added; its reference count
 * is the number of RAIDZ VDEVs that have been expanded.
 *
 * Blocks on an expanded RAIDZ VDEV can have different logical stripe widths.
 *
 * Since the uberblock can point to arbitrary blocks, which might be on the
 * expanding RAIDZ and might or might not have been expanded, we need to know
 * which way a block is laid out before reading it. This info is the next
 * offset that needs to be reflowed, and we persist it in the uberblock, in
 * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev label.
 * After the expansion is complete, we then use the raidz_expand_txgs array
 * (see below) to determine how to read a block and the ub_raidz_reflow_info
 * field is no longer required.
 *
 * The uberblock's ub_raidz_reflow_info field also holds the scratch space
 * state (i.e., active or not), which is also required before reading a block
 * during the initial phase of reflowing the data.
 *
 * The top-level RAIDZ VDEV has two new entries in the nvlist:
 *
 * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here
 *                            and used after the expansion is complete to
 *                            determine how to read a raidz block
 * 'raidz_expanding' boolean: present during reflow and removed after
 *                            completion; used during a spa import to resume
 *                            an unfinished expansion
 *
 * Finally, the VDEV's top zap adds the following informational entries:
 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE
 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME
 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME
 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED
 */

/*
 * For testing only: pause the raidz expansion after reflowing this amount.
 * (accessed by ZTS and ztest)
 */
#ifdef	_KERNEL
static
#endif	/* _KERNEL */
unsigned long raidz_expand_max_reflow_bytes = 0;

/*
 * For testing only: pause the raidz expansion at a certain point.
 */
uint_t raidz_expand_pause_point = 0;

/*
 * Maximum amount of copy I/O outstanding at once.
 */
#ifdef	_ILP32
static unsigned long raidz_expand_max_copy_bytes = SPA_MAXBLOCKSIZE;
#else
static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE;
#endif

/*
 * Apply raidz map abds aggregation if the number of rows in the map is equal
 * to or greater than the value below.
369 */ 370 static unsigned long raidz_io_aggregate_rows = 4; 371 372 /* 373 * Automatically start a pool scrub when a RAIDZ expansion completes in 374 * order to verify the checksums of all blocks which have been copied 375 * during the expansion. Automatic scrubbing is enabled by default and 376 * is strongly recommended. 377 */ 378 static int zfs_scrub_after_expand = 1; 379 380 static void 381 vdev_raidz_row_free(raidz_row_t *rr) 382 { 383 for (int c = 0; c < rr->rr_cols; c++) { 384 raidz_col_t *rc = &rr->rr_col[c]; 385 386 if (rc->rc_size != 0) 387 abd_free(rc->rc_abd); 388 if (rc->rc_orig_data != NULL) 389 abd_free(rc->rc_orig_data); 390 } 391 392 if (rr->rr_abd_empty != NULL) 393 abd_free(rr->rr_abd_empty); 394 395 kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols])); 396 } 397 398 void 399 vdev_raidz_map_free(raidz_map_t *rm) 400 { 401 for (int i = 0; i < rm->rm_nrows; i++) 402 vdev_raidz_row_free(rm->rm_row[i]); 403 404 if (rm->rm_nphys_cols) { 405 for (int i = 0; i < rm->rm_nphys_cols; i++) { 406 if (rm->rm_phys_col[i].rc_abd != NULL) 407 abd_free(rm->rm_phys_col[i].rc_abd); 408 } 409 410 kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) * 411 rm->rm_nphys_cols); 412 } 413 414 ASSERT3P(rm->rm_lr, ==, NULL); 415 kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows])); 416 } 417 418 static void 419 vdev_raidz_map_free_vsd(zio_t *zio) 420 { 421 raidz_map_t *rm = zio->io_vsd; 422 423 vdev_raidz_map_free(rm); 424 } 425 426 static int 427 vdev_raidz_reflow_compare(const void *x1, const void *x2) 428 { 429 const reflow_node_t *l = x1; 430 const reflow_node_t *r = x2; 431 432 return (TREE_CMP(l->re_txg, r->re_txg)); 433 } 434 435 const zio_vsd_ops_t vdev_raidz_vsd_ops = { 436 .vsd_free = vdev_raidz_map_free_vsd, 437 }; 438 439 raidz_row_t * 440 vdev_raidz_row_alloc(int cols, zio_t *zio) 441 { 442 raidz_row_t *rr = 443 kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP); 444 445 rr->rr_cols = cols; 446 rr->rr_scols = cols; 447 448 for (int c = 0; c < cols; c++) { 449 raidz_col_t *rc = &rr->rr_col[c]; 450 rc->rc_shadow_devidx = INT_MAX; 451 rc->rc_shadow_offset = UINT64_MAX; 452 /* 453 * We can not allow self healing to take place for Direct I/O 454 * reads. There is nothing that stops the buffer contents from 455 * being manipulated while the I/O is in flight. It is possible 456 * that the checksum could be verified on the buffer and then 457 * the contents of that buffer are manipulated afterwards. This 458 * could lead to bad data being written out during self 459 * healing. 460 */ 461 if (!(zio->io_flags & ZIO_FLAG_DIO_READ)) 462 rc->rc_allow_repair = 1; 463 } 464 return (rr); 465 } 466 467 static void 468 vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift) 469 { 470 int c; 471 int nwrapped = 0; 472 uint64_t off = 0; 473 raidz_row_t *rr = rm->rm_row[0]; 474 475 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); 476 ASSERT3U(rm->rm_nrows, ==, 1); 477 478 /* 479 * Pad any parity columns with additional space to account for skip 480 * sectors. 481 */ 482 if (rm->rm_skipstart < rr->rr_firstdatacol) { 483 ASSERT0(rm->rm_skipstart); 484 nwrapped = rm->rm_nskip; 485 } else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) { 486 nwrapped = 487 (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols; 488 } 489 490 /* 491 * Optional single skip sectors (rc_size == 0) will be handled in 492 * vdev_raidz_io_start_write(). 
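 *
 * As a worked example of the skip-sector padding above (the values follow
 * from the geometry math in vdev_raidz_map_alloc() below): for a 5-wide
 * RAIDZ1 and a 4-sector write, q = 1, r = 0 and bc = 0, so
 *
 *	tot       = 4 + 1 * 1 = 5 sectors (data + parity)
 *	nskip     = roundup(5, nparity + 1) - 5 = 1
 *	skipstart = bc = 0
 *
 * Since skipstart (0) < firstdatacol (1), the lone skip sector wraps onto
 * the parity column (nwrapped = 1): the P column's linear ABD is allocated
 * one sector larger and its tail zeroed, so parity and skip sector can be
 * issued as a single child I/O.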
493 */ 494 int skipped = rr->rr_scols - rr->rr_cols; 495 496 /* Allocate buffers for the parity columns */ 497 for (c = 0; c < rr->rr_firstdatacol; c++) { 498 raidz_col_t *rc = &rr->rr_col[c]; 499 500 /* 501 * Parity columns will pad out a linear ABD to account for 502 * the skip sector. A linear ABD is used here because 503 * parity calculations use the ABD buffer directly to calculate 504 * parity. This avoids doing a memcpy back to the ABD after the 505 * parity has been calculated. By issuing the parity column 506 * with the skip sector we can reduce contention on the child 507 * VDEV queue locks (vq_lock). 508 */ 509 if (c < nwrapped) { 510 rc->rc_abd = abd_alloc_linear( 511 rc->rc_size + (1ULL << ashift), B_FALSE); 512 abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift); 513 skipped++; 514 } else { 515 rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); 516 } 517 } 518 519 for (off = 0; c < rr->rr_cols; c++) { 520 raidz_col_t *rc = &rr->rr_col[c]; 521 abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct, 522 zio->io_abd, off, rc->rc_size); 523 524 /* 525 * Generate I/O for skip sectors to improve aggregation 526 * continuity. We will use gang ABD's to reduce contention 527 * on the child VDEV queue locks (vq_lock) by issuing 528 * a single I/O that contains the data and skip sector. 529 * 530 * It is important to make sure that rc_size is not updated 531 * even though we are adding a skip sector to the ABD. When 532 * calculating the parity in vdev_raidz_generate_parity_row() 533 * the rc_size is used to iterate through the ABD's. We can 534 * not have zero'd out skip sectors used for calculating 535 * parity for raidz, because those same sectors are not used 536 * during reconstruction. 537 */ 538 if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) { 539 rc->rc_abd = abd_alloc_gang(); 540 abd_gang_add(rc->rc_abd, abd, B_TRUE); 541 abd_gang_add(rc->rc_abd, 542 abd_get_zeros(1ULL << ashift), B_TRUE); 543 skipped++; 544 } else { 545 rc->rc_abd = abd; 546 } 547 off += rc->rc_size; 548 } 549 550 ASSERT3U(off, ==, zio->io_size); 551 ASSERT3S(skipped, ==, rm->rm_nskip); 552 } 553 554 static void 555 vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm) 556 { 557 int c; 558 raidz_row_t *rr = rm->rm_row[0]; 559 560 ASSERT3U(rm->rm_nrows, ==, 1); 561 562 /* Allocate buffers for the parity columns */ 563 for (c = 0; c < rr->rr_firstdatacol; c++) 564 rr->rr_col[c].rc_abd = 565 abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE); 566 567 for (uint64_t off = 0; c < rr->rr_cols; c++) { 568 raidz_col_t *rc = &rr->rr_col[c]; 569 rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct, 570 zio->io_abd, off, rc->rc_size); 571 off += rc->rc_size; 572 } 573 } 574 575 /* 576 * Divides the IO evenly across all child vdevs; usually, dcols is 577 * the number of children in the target vdev. 578 * 579 * Avoid inlining the function to keep vdev_raidz_io_start(), which 580 * is this functions only caller, as small as possible on the stack. 581 */ 582 noinline raidz_map_t * 583 vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, 584 uint64_t nparity) 585 { 586 raidz_row_t *rr; 587 /* The starting RAIDZ (parent) vdev sector of the block. */ 588 uint64_t b = zio->io_offset >> ashift; 589 /* The zio's size in units of the vdev's minimum sector size. */ 590 uint64_t s = zio->io_size >> ashift; 591 /* The first column for this stripe. */ 592 uint64_t f = b % dcols; 593 /* The starting byte offset on each child vdev. 
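 *
 * For example (purely illustrative numbers): with ashift = 9, dcols = 5
 * and a 3 KiB write at offset 10 KiB, b = 20, s = 6, f = 20 % 5 = 0 and
 * o = (20 / 5) << 9 = 2048, i.e. the block starts on the first child at a
 * byte offset of 2 KiB.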
*/ 594 uint64_t o = (b / dcols) << ashift; 595 uint64_t acols, scols; 596 597 raidz_map_t *rm = 598 kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP); 599 rm->rm_nrows = 1; 600 601 /* 602 * "Quotient": The number of data sectors for this stripe on all but 603 * the "big column" child vdevs that also contain "remainder" data. 604 */ 605 uint64_t q = s / (dcols - nparity); 606 607 /* 608 * "Remainder": The number of partial stripe data sectors in this I/O. 609 * This will add a sector to some, but not all, child vdevs. 610 */ 611 uint64_t r = s - q * (dcols - nparity); 612 613 /* The number of "big columns" - those which contain remainder data. */ 614 uint64_t bc = (r == 0 ? 0 : r + nparity); 615 616 /* 617 * The total number of data and parity sectors associated with 618 * this I/O. 619 */ 620 uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1)); 621 622 /* 623 * acols: The columns that will be accessed. 624 * scols: The columns that will be accessed or skipped. 625 */ 626 if (q == 0) { 627 /* Our I/O request doesn't span all child vdevs. */ 628 acols = bc; 629 scols = MIN(dcols, roundup(bc, nparity + 1)); 630 } else { 631 acols = dcols; 632 scols = dcols; 633 } 634 635 ASSERT3U(acols, <=, scols); 636 rr = vdev_raidz_row_alloc(scols, zio); 637 rm->rm_row[0] = rr; 638 rr->rr_cols = acols; 639 rr->rr_bigcols = bc; 640 rr->rr_firstdatacol = nparity; 641 #ifdef ZFS_DEBUG 642 rr->rr_offset = zio->io_offset; 643 rr->rr_size = zio->io_size; 644 #endif 645 646 uint64_t asize = 0; 647 648 for (uint64_t c = 0; c < scols; c++) { 649 raidz_col_t *rc = &rr->rr_col[c]; 650 uint64_t col = f + c; 651 uint64_t coff = o; 652 if (col >= dcols) { 653 col -= dcols; 654 coff += 1ULL << ashift; 655 } 656 rc->rc_devidx = col; 657 rc->rc_offset = coff; 658 659 if (c >= acols) 660 rc->rc_size = 0; 661 else if (c < bc) 662 rc->rc_size = (q + 1) << ashift; 663 else 664 rc->rc_size = q << ashift; 665 666 asize += rc->rc_size; 667 } 668 669 ASSERT3U(asize, ==, tot << ashift); 670 rm->rm_nskip = roundup(tot, nparity + 1) - tot; 671 rm->rm_skipstart = bc; 672 673 /* 674 * If all data stored spans all columns, there's a danger that parity 675 * will always be on the same device and, since parity isn't read 676 * during normal operation, that device's I/O bandwidth won't be 677 * used effectively. We therefore switch the parity every 1MB. 678 * 679 * ... at least that was, ostensibly, the theory. As a practical 680 * matter unless we juggle the parity between all devices evenly, we 681 * won't see any benefit. Further, occasional writes that aren't a 682 * multiple of the LCM of the number of children and the minimum 683 * stripe width are sufficient to avoid pessimal behavior. 684 * Unfortunately, this decision created an implicit on-disk format 685 * requirement that we need to support for all eternity, but only 686 * for single-parity RAID-Z. 687 * 688 * If we intend to skip a sector in the zeroth column for padding 689 * we must make sure to note this swap. We will never intend to 690 * skip the first column since at least one data and one parity 691 * column must appear in each row. 
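 *
 * For example (illustrative offsets): single-parity rows allocated in the
 * 1MB region starting at offset 0x100000 have bit 20 set and therefore
 * swap columns 0 and 1, so P lands on the second rather than the first of
 * the children the row touches; rows in the region starting at 0x200000
 * (bit 20 clear) do not swap. The net effect is the per-1MB parity
 * rotation described above.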
692 */ 693 ASSERT(rr->rr_cols >= 2); 694 ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); 695 696 if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { 697 uint64_t devidx = rr->rr_col[0].rc_devidx; 698 o = rr->rr_col[0].rc_offset; 699 rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; 700 rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; 701 rr->rr_col[1].rc_devidx = devidx; 702 rr->rr_col[1].rc_offset = o; 703 if (rm->rm_skipstart == 0) 704 rm->rm_skipstart = 1; 705 } 706 707 if (zio->io_type == ZIO_TYPE_WRITE) { 708 vdev_raidz_map_alloc_write(zio, rm, ashift); 709 } else { 710 vdev_raidz_map_alloc_read(zio, rm); 711 } 712 /* init RAIDZ parity ops */ 713 rm->rm_ops = vdev_raidz_math_get_ops(); 714 715 return (rm); 716 } 717 718 /* 719 * Everything before reflow_offset_synced should have been moved to the new 720 * location (read and write completed). However, this may not yet be reflected 721 * in the on-disk format (e.g. raidz_reflow_sync() has been called but the 722 * uberblock has not yet been written). If reflow is not in progress, 723 * reflow_offset_synced should be UINT64_MAX. For each row, if the row is 724 * entirely before reflow_offset_synced, it will come from the new location. 725 * Otherwise this row will come from the old location. Therefore, rows that 726 * straddle the reflow_offset_synced will come from the old location. 727 * 728 * For writes, reflow_offset_next is the next offset to copy. If a sector has 729 * been copied, but not yet reflected in the on-disk progress 730 * (reflow_offset_synced), it will also be written to the new (already copied) 731 * offset. 732 */ 733 noinline raidz_map_t * 734 vdev_raidz_map_alloc_expanded(zio_t *zio, 735 uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols, 736 uint64_t nparity, uint64_t reflow_offset_synced, 737 uint64_t reflow_offset_next, boolean_t use_scratch) 738 { 739 abd_t *abd = zio->io_abd; 740 uint64_t offset = zio->io_offset; 741 uint64_t size = zio->io_size; 742 743 /* The zio's size in units of the vdev's minimum sector size. */ 744 uint64_t s = size >> ashift; 745 746 /* 747 * "Quotient": The number of data sectors for this stripe on all but 748 * the "big column" child vdevs that also contain "remainder" data. 749 * AKA "full rows" 750 */ 751 uint64_t q = s / (logical_cols - nparity); 752 753 /* 754 * "Remainder": The number of partial stripe data sectors in this I/O. 755 * This will add a sector to some, but not all, child vdevs. 756 */ 757 uint64_t r = s - q * (logical_cols - nparity); 758 759 /* The number of "big columns" - those which contain remainder data. */ 760 uint64_t bc = (r == 0 ? 0 : r + nparity); 761 762 /* 763 * The total number of data and parity sectors associated with 764 * this I/O. 765 */ 766 uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1)); 767 768 /* How many rows contain data (not skip) */ 769 uint64_t rows = howmany(tot, logical_cols); 770 int cols = MIN(tot, logical_cols); 771 772 raidz_map_t *rm = 773 kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]), 774 KM_SLEEP); 775 rm->rm_nrows = rows; 776 rm->rm_nskip = roundup(tot, nparity + 1) - tot; 777 rm->rm_skipstart = bc; 778 uint64_t asize = 0; 779 780 for (uint64_t row = 0; row < rows; row++) { 781 boolean_t row_use_scratch = B_FALSE; 782 raidz_row_t *rr = vdev_raidz_row_alloc(cols, zio); 783 rm->rm_row[row] = rr; 784 785 /* The starting RAIDZ (parent) vdev sector of the row. 
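 *
 * For example (illustrative numbers): while expanding from 4 to 5
 * children with reflow_offset_synced covering the first 1000 sectors, a
 * row whose first sector is b = 996 with cols = 5 straddles the boundary
 * (996 + 5 > 1000) and is therefore read at the old width of 4 children,
 * while a row at b = 990 lies entirely below the boundary and is read at
 * the new width of 5.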
*/ 786 uint64_t b = (offset >> ashift) + row * logical_cols; 787 788 /* 789 * If we are in the middle of a reflow, and the copying has 790 * not yet completed for any part of this row, then use the 791 * old location of this row. Note that reflow_offset_synced 792 * reflects the i/o that's been completed, because it's 793 * updated by a synctask, after zio_wait(spa_txg_zio[]). 794 * This is sufficient for our check, even if that progress 795 * has not yet been recorded to disk (reflected in 796 * spa_ubsync). Also note that we consider the last row to 797 * be "full width" (`cols`-wide rather than `bc`-wide) for 798 * this calculation. This causes a tiny bit of unnecessary 799 * double-writes but is safe and simpler to calculate. 800 */ 801 int row_phys_cols = physical_cols; 802 if (b + cols > reflow_offset_synced >> ashift) 803 row_phys_cols--; 804 else if (use_scratch) 805 row_use_scratch = B_TRUE; 806 807 /* starting child of this row */ 808 uint64_t child_id = b % row_phys_cols; 809 /* The starting byte offset on each child vdev. */ 810 uint64_t child_offset = (b / row_phys_cols) << ashift; 811 812 /* 813 * Note, rr_cols is the entire width of the block, even 814 * if this row is shorter. This is needed because parity 815 * generation (for Q and R) needs to know the entire width, 816 * because it treats the short row as though it was 817 * full-width (and the "phantom" sectors were zero-filled). 818 * 819 * Another approach to this would be to set cols shorter 820 * (to just the number of columns that we might do i/o to) 821 * and have another mechanism to tell the parity generation 822 * about the "entire width". Reconstruction (at least 823 * vdev_raidz_reconstruct_general()) would also need to 824 * know about the "entire width". 825 */ 826 rr->rr_firstdatacol = nparity; 827 #ifdef ZFS_DEBUG 828 /* 829 * note: rr_size is PSIZE, not ASIZE 830 */ 831 rr->rr_offset = b << ashift; 832 rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift; 833 #endif 834 835 for (int c = 0; c < rr->rr_cols; c++, child_id++) { 836 if (child_id >= row_phys_cols) { 837 child_id -= row_phys_cols; 838 child_offset += 1ULL << ashift; 839 } 840 raidz_col_t *rc = &rr->rr_col[c]; 841 rc->rc_devidx = child_id; 842 rc->rc_offset = child_offset; 843 844 /* 845 * Get this from the scratch space if appropriate. 846 * This only happens if we crashed in the middle of 847 * raidz_reflow_scratch_sync() (while it's running, 848 * the rangelock prevents us from doing concurrent 849 * io), and even then only during zpool import or 850 * when the pool is imported readonly. 851 */ 852 if (row_use_scratch) 853 rc->rc_offset -= VDEV_BOOT_SIZE; 854 855 uint64_t dc = c - rr->rr_firstdatacol; 856 if (c < rr->rr_firstdatacol) { 857 rc->rc_size = 1ULL << ashift; 858 859 /* 860 * Parity sectors' rc_abd's are set below 861 * after determining if this is an aggregation. 862 */ 863 } else if (row == rows - 1 && bc != 0 && c >= bc) { 864 /* 865 * Past the end of the block (even including 866 * skip sectors). This sector is part of the 867 * map so that we have full rows for p/q parity 868 * generation. 
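 *
 * To illustrate the data-column offset math used just below (numbers are
 * illustrative): a 7-sector block on a 5-wide logical RAIDZ1 has q = 1,
 * r = 3, tot = 9, rows = 2 and bc = 4. Data columns 0-2 then hold two
 * sectors each and data column 3 holds one, so the buffer sector offsets
 * are
 *
 *	dc < r:   off = dc * rows + row           (e.g. dc = 1, row = 1 -> 3)
 *	dc >= r:  off = r * rows + (dc - r) * (rows - 1) + row
 *	                                          (e.g. dc = 3, row = 0 -> 6)
 *
 * which is exactly the column-by-column layout described in the Time
 * Dependent Geometry section above.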
869 */ 870 rc->rc_size = 0; 871 rc->rc_abd = NULL; 872 } else { 873 /* "data column" (col excluding parity) */ 874 uint64_t off; 875 876 if (c < bc || r == 0) { 877 off = dc * rows + row; 878 } else { 879 off = r * rows + 880 (dc - r) * (rows - 1) + row; 881 } 882 rc->rc_size = 1ULL << ashift; 883 rc->rc_abd = abd_get_offset_struct( 884 &rc->rc_abdstruct, abd, off << ashift, 885 rc->rc_size); 886 } 887 888 if (rc->rc_size == 0) 889 continue; 890 891 /* 892 * If any part of this row is in both old and new 893 * locations, the primary location is the old 894 * location. If this sector was already copied to the 895 * new location, we need to also write to the new, 896 * "shadow" location. 897 * 898 * Note, `row_phys_cols != physical_cols` indicates 899 * that the primary location is the old location. 900 * `b+c < reflow_offset_next` indicates that the copy 901 * to the new location has been initiated. We know 902 * that the copy has completed because we have the 903 * rangelock, which is held exclusively while the 904 * copy is in progress. 905 */ 906 if (row_use_scratch || 907 (row_phys_cols != physical_cols && 908 b + c < reflow_offset_next >> ashift)) { 909 rc->rc_shadow_devidx = (b + c) % physical_cols; 910 rc->rc_shadow_offset = 911 ((b + c) / physical_cols) << ashift; 912 if (row_use_scratch) 913 rc->rc_shadow_offset -= VDEV_BOOT_SIZE; 914 } 915 916 asize += rc->rc_size; 917 } 918 919 /* 920 * See comment in vdev_raidz_map_alloc() 921 */ 922 if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 && 923 (offset & (1ULL << 20))) { 924 ASSERT(rr->rr_cols >= 2); 925 ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); 926 927 int devidx0 = rr->rr_col[0].rc_devidx; 928 uint64_t offset0 = rr->rr_col[0].rc_offset; 929 int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx; 930 uint64_t shadow_offset0 = 931 rr->rr_col[0].rc_shadow_offset; 932 933 rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; 934 rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; 935 rr->rr_col[0].rc_shadow_devidx = 936 rr->rr_col[1].rc_shadow_devidx; 937 rr->rr_col[0].rc_shadow_offset = 938 rr->rr_col[1].rc_shadow_offset; 939 940 rr->rr_col[1].rc_devidx = devidx0; 941 rr->rr_col[1].rc_offset = offset0; 942 rr->rr_col[1].rc_shadow_devidx = shadow_devidx0; 943 rr->rr_col[1].rc_shadow_offset = shadow_offset0; 944 } 945 } 946 ASSERT3U(asize, ==, tot << ashift); 947 948 /* 949 * Determine if the block is contiguous, in which case we can use 950 * an aggregation. 951 */ 952 if (rows >= raidz_io_aggregate_rows) { 953 rm->rm_nphys_cols = physical_cols; 954 rm->rm_phys_col = 955 kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols, 956 KM_SLEEP); 957 958 /* 959 * Determine the aggregate io's offset and size, and check 960 * that the io is contiguous. 961 */ 962 for (int i = 0; 963 i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) { 964 raidz_row_t *rr = rm->rm_row[i]; 965 for (int c = 0; c < rr->rr_cols; c++) { 966 raidz_col_t *rc = &rr->rr_col[c]; 967 raidz_col_t *prc = 968 &rm->rm_phys_col[rc->rc_devidx]; 969 970 if (rc->rc_size == 0) 971 continue; 972 973 if (prc->rc_size == 0) { 974 ASSERT0(prc->rc_offset); 975 prc->rc_offset = rc->rc_offset; 976 } else if (prc->rc_offset + prc->rc_size != 977 rc->rc_offset) { 978 /* 979 * This block is not contiguous and 980 * therefore can't be aggregated. 981 * This is expected to be rare, so 982 * the cost of allocating and then 983 * freeing rm_phys_col is not 984 * significant. 
985 */ 986 kmem_free(rm->rm_phys_col, 987 sizeof (raidz_col_t) * 988 rm->rm_nphys_cols); 989 rm->rm_phys_col = NULL; 990 rm->rm_nphys_cols = 0; 991 break; 992 } 993 prc->rc_size += rc->rc_size; 994 } 995 } 996 } 997 if (rm->rm_phys_col != NULL) { 998 /* 999 * Allocate aggregate ABD's. 1000 */ 1001 for (int i = 0; i < rm->rm_nphys_cols; i++) { 1002 raidz_col_t *prc = &rm->rm_phys_col[i]; 1003 1004 prc->rc_devidx = i; 1005 1006 if (prc->rc_size == 0) 1007 continue; 1008 1009 prc->rc_abd = 1010 abd_alloc_linear(rm->rm_phys_col[i].rc_size, 1011 B_FALSE); 1012 } 1013 1014 /* 1015 * Point the parity abd's into the aggregate abd's. 1016 */ 1017 for (int i = 0; i < rm->rm_nrows; i++) { 1018 raidz_row_t *rr = rm->rm_row[i]; 1019 for (int c = 0; c < rr->rr_firstdatacol; c++) { 1020 raidz_col_t *rc = &rr->rr_col[c]; 1021 raidz_col_t *prc = 1022 &rm->rm_phys_col[rc->rc_devidx]; 1023 rc->rc_abd = 1024 abd_get_offset_struct(&rc->rc_abdstruct, 1025 prc->rc_abd, 1026 rc->rc_offset - prc->rc_offset, 1027 rc->rc_size); 1028 } 1029 } 1030 } else { 1031 /* 1032 * Allocate new abd's for the parity sectors. 1033 */ 1034 for (int i = 0; i < rm->rm_nrows; i++) { 1035 raidz_row_t *rr = rm->rm_row[i]; 1036 for (int c = 0; c < rr->rr_firstdatacol; c++) { 1037 raidz_col_t *rc = &rr->rr_col[c]; 1038 rc->rc_abd = 1039 abd_alloc_linear(rc->rc_size, 1040 B_TRUE); 1041 } 1042 } 1043 } 1044 /* init RAIDZ parity ops */ 1045 rm->rm_ops = vdev_raidz_math_get_ops(); 1046 1047 return (rm); 1048 } 1049 1050 struct pqr_struct { 1051 uint64_t *p; 1052 uint64_t *q; 1053 uint64_t *r; 1054 }; 1055 1056 static int 1057 vdev_raidz_p_func(void *buf, size_t size, void *private) 1058 { 1059 struct pqr_struct *pqr = private; 1060 const uint64_t *src = buf; 1061 int cnt = size / sizeof (src[0]); 1062 1063 ASSERT(pqr->p && !pqr->q && !pqr->r); 1064 1065 for (int i = 0; i < cnt; i++, src++, pqr->p++) 1066 *pqr->p ^= *src; 1067 1068 return (0); 1069 } 1070 1071 static int 1072 vdev_raidz_pq_func(void *buf, size_t size, void *private) 1073 { 1074 struct pqr_struct *pqr = private; 1075 const uint64_t *src = buf; 1076 uint64_t mask; 1077 int cnt = size / sizeof (src[0]); 1078 1079 ASSERT(pqr->p && pqr->q && !pqr->r); 1080 1081 for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { 1082 *pqr->p ^= *src; 1083 VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 1084 *pqr->q ^= *src; 1085 } 1086 1087 return (0); 1088 } 1089 1090 static int 1091 vdev_raidz_pqr_func(void *buf, size_t size, void *private) 1092 { 1093 struct pqr_struct *pqr = private; 1094 const uint64_t *src = buf; 1095 uint64_t mask; 1096 int cnt = size / sizeof (src[0]); 1097 1098 ASSERT(pqr->p && pqr->q && pqr->r); 1099 1100 for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { 1101 *pqr->p ^= *src; 1102 VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 1103 *pqr->q ^= *src; 1104 VDEV_RAIDZ_64MUL_4(*pqr->r, mask); 1105 *pqr->r ^= *src; 1106 } 1107 1108 return (0); 1109 } 1110 1111 static void 1112 vdev_raidz_generate_parity_p(raidz_row_t *rr) 1113 { 1114 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 1115 1116 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1117 abd_t *src = rr->rr_col[c].rc_abd; 1118 1119 if (c == rr->rr_firstdatacol) { 1120 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 1121 } else { 1122 struct pqr_struct pqr = { p, NULL, NULL }; 1123 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 1124 vdev_raidz_p_func, &pqr); 1125 } 1126 } 1127 } 1128 1129 static void 1130 vdev_raidz_generate_parity_pq(raidz_row_t *rr) 1131 { 1132 uint64_t *p = 
abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 1133 uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 1134 uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 1135 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 1136 rr->rr_col[VDEV_RAIDZ_Q].rc_size); 1137 1138 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1139 abd_t *src = rr->rr_col[c].rc_abd; 1140 1141 uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); 1142 1143 if (c == rr->rr_firstdatacol) { 1144 ASSERT(ccnt == pcnt || ccnt == 0); 1145 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 1146 (void) memcpy(q, p, rr->rr_col[c].rc_size); 1147 1148 for (uint64_t i = ccnt; i < pcnt; i++) { 1149 p[i] = 0; 1150 q[i] = 0; 1151 } 1152 } else { 1153 struct pqr_struct pqr = { p, q, NULL }; 1154 1155 ASSERT(ccnt <= pcnt); 1156 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 1157 vdev_raidz_pq_func, &pqr); 1158 1159 /* 1160 * Treat short columns as though they are full of 0s. 1161 * Note that there's therefore nothing needed for P. 1162 */ 1163 uint64_t mask; 1164 for (uint64_t i = ccnt; i < pcnt; i++) { 1165 VDEV_RAIDZ_64MUL_2(q[i], mask); 1166 } 1167 } 1168 } 1169 } 1170 1171 static void 1172 vdev_raidz_generate_parity_pqr(raidz_row_t *rr) 1173 { 1174 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 1175 uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 1176 uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd); 1177 uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 1178 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 1179 rr->rr_col[VDEV_RAIDZ_Q].rc_size); 1180 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 1181 rr->rr_col[VDEV_RAIDZ_R].rc_size); 1182 1183 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1184 abd_t *src = rr->rr_col[c].rc_abd; 1185 1186 uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); 1187 1188 if (c == rr->rr_firstdatacol) { 1189 ASSERT(ccnt == pcnt || ccnt == 0); 1190 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 1191 (void) memcpy(q, p, rr->rr_col[c].rc_size); 1192 (void) memcpy(r, p, rr->rr_col[c].rc_size); 1193 1194 for (uint64_t i = ccnt; i < pcnt; i++) { 1195 p[i] = 0; 1196 q[i] = 0; 1197 r[i] = 0; 1198 } 1199 } else { 1200 struct pqr_struct pqr = { p, q, r }; 1201 1202 ASSERT(ccnt <= pcnt); 1203 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 1204 vdev_raidz_pqr_func, &pqr); 1205 1206 /* 1207 * Treat short columns as though they are full of 0s. 1208 * Note that there's therefore nothing needed for P. 1209 */ 1210 uint64_t mask; 1211 for (uint64_t i = ccnt; i < pcnt; i++) { 1212 VDEV_RAIDZ_64MUL_2(q[i], mask); 1213 VDEV_RAIDZ_64MUL_4(r[i], mask); 1214 } 1215 } 1216 } 1217 } 1218 1219 /* 1220 * Generate RAID parity in the first virtual columns according to the number of 1221 * parity columns available. 1222 */ 1223 void 1224 vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr) 1225 { 1226 if (rr->rr_cols == 0) { 1227 /* 1228 * We are handling this block one row at a time (because 1229 * this block has a different logical vs physical width, 1230 * due to RAIDZ expansion), and this is a pad-only row, 1231 * which has no parity. 
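 *
 * As a quick sanity example of the parity math implemented above (byte
 * values are illustrative): for data bytes D_0 = 0x01, D_1 = 0x02 and
 * D_2 = 0x03,
 *
 *	P = 0x01 ^ 0x02 ^ 0x03 = 0x00
 *	Q = (((0x01 * 2) ^ 0x02) * 2) ^ 0x03
 *	  = ((0x02 ^ 0x02) * 2) ^ 0x03 = 0x03
 *
 * where "* 2" is VDEV_RAIDZ_MUL_2(). Losing any single column leaves the
 * remaining two plus P (or Q) sufficient to recover it.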
1232 */ 1233 return; 1234 } 1235 1236 /* Generate using the new math implementation */ 1237 if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL) 1238 return; 1239 1240 switch (rr->rr_firstdatacol) { 1241 case 1: 1242 vdev_raidz_generate_parity_p(rr); 1243 break; 1244 case 2: 1245 vdev_raidz_generate_parity_pq(rr); 1246 break; 1247 case 3: 1248 vdev_raidz_generate_parity_pqr(rr); 1249 break; 1250 default: 1251 cmn_err(CE_PANIC, "invalid RAID-Z configuration"); 1252 } 1253 } 1254 1255 void 1256 vdev_raidz_generate_parity(raidz_map_t *rm) 1257 { 1258 for (int i = 0; i < rm->rm_nrows; i++) { 1259 raidz_row_t *rr = rm->rm_row[i]; 1260 vdev_raidz_generate_parity_row(rm, rr); 1261 } 1262 } 1263 1264 static int 1265 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) 1266 { 1267 (void) private; 1268 uint64_t *dst = dbuf; 1269 uint64_t *src = sbuf; 1270 int cnt = size / sizeof (src[0]); 1271 1272 for (int i = 0; i < cnt; i++) { 1273 dst[i] ^= src[i]; 1274 } 1275 1276 return (0); 1277 } 1278 1279 static int 1280 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size, 1281 void *private) 1282 { 1283 (void) private; 1284 uint64_t *dst = dbuf; 1285 uint64_t *src = sbuf; 1286 uint64_t mask; 1287 int cnt = size / sizeof (dst[0]); 1288 1289 for (int i = 0; i < cnt; i++, dst++, src++) { 1290 VDEV_RAIDZ_64MUL_2(*dst, mask); 1291 *dst ^= *src; 1292 } 1293 1294 return (0); 1295 } 1296 1297 static int 1298 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private) 1299 { 1300 (void) private; 1301 uint64_t *dst = buf; 1302 uint64_t mask; 1303 int cnt = size / sizeof (dst[0]); 1304 1305 for (int i = 0; i < cnt; i++, dst++) { 1306 /* same operation as vdev_raidz_reconst_q_pre_func() on dst */ 1307 VDEV_RAIDZ_64MUL_2(*dst, mask); 1308 } 1309 1310 return (0); 1311 } 1312 1313 struct reconst_q_struct { 1314 uint64_t *q; 1315 int exp; 1316 }; 1317 1318 static int 1319 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private) 1320 { 1321 struct reconst_q_struct *rq = private; 1322 uint64_t *dst = buf; 1323 int cnt = size / sizeof (dst[0]); 1324 1325 for (int i = 0; i < cnt; i++, dst++, rq->q++) { 1326 int j; 1327 uint8_t *b; 1328 1329 *dst ^= *rq->q; 1330 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 1331 *b = vdev_raidz_exp2(*b, rq->exp); 1332 } 1333 } 1334 1335 return (0); 1336 } 1337 1338 struct reconst_pq_struct { 1339 uint8_t *p; 1340 uint8_t *q; 1341 uint8_t *pxy; 1342 uint8_t *qxy; 1343 int aexp; 1344 int bexp; 1345 }; 1346 1347 static int 1348 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private) 1349 { 1350 struct reconst_pq_struct *rpq = private; 1351 uint8_t *xd = xbuf; 1352 uint8_t *yd = ybuf; 1353 1354 for (int i = 0; i < size; 1355 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) { 1356 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 1357 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 1358 *yd = *rpq->p ^ *rpq->pxy ^ *xd; 1359 } 1360 1361 return (0); 1362 } 1363 1364 static int 1365 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) 1366 { 1367 struct reconst_pq_struct *rpq = private; 1368 uint8_t *xd = xbuf; 1369 1370 for (int i = 0; i < size; 1371 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) { 1372 /* same operation as vdev_raidz_reconst_pq_func() on xd */ 1373 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 1374 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 1375 } 1376 1377 return (0); 1378 } 1379 1380 static void 1381 
vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts) 1382 { 1383 int x = tgts[0]; 1384 abd_t *dst, *src; 1385 1386 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1387 zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x); 1388 1389 ASSERT3U(ntgts, ==, 1); 1390 ASSERT3U(x, >=, rr->rr_firstdatacol); 1391 ASSERT3U(x, <, rr->rr_cols); 1392 1393 ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size); 1394 1395 src = rr->rr_col[VDEV_RAIDZ_P].rc_abd; 1396 dst = rr->rr_col[x].rc_abd; 1397 1398 abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size); 1399 1400 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1401 uint64_t size = MIN(rr->rr_col[x].rc_size, 1402 rr->rr_col[c].rc_size); 1403 1404 src = rr->rr_col[c].rc_abd; 1405 1406 if (c == x) 1407 continue; 1408 1409 (void) abd_iterate_func2(dst, src, 0, 0, size, 1410 vdev_raidz_reconst_p_func, NULL); 1411 } 1412 } 1413 1414 static void 1415 vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts) 1416 { 1417 int x = tgts[0]; 1418 int c, exp; 1419 abd_t *dst, *src; 1420 1421 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1422 zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x); 1423 1424 ASSERT(ntgts == 1); 1425 1426 ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size); 1427 1428 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1429 uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size, 1430 rr->rr_col[c].rc_size); 1431 1432 src = rr->rr_col[c].rc_abd; 1433 dst = rr->rr_col[x].rc_abd; 1434 1435 if (c == rr->rr_firstdatacol) { 1436 abd_copy(dst, src, size); 1437 if (rr->rr_col[x].rc_size > size) { 1438 abd_zero_off(dst, size, 1439 rr->rr_col[x].rc_size - size); 1440 } 1441 } else { 1442 ASSERT3U(size, <=, rr->rr_col[x].rc_size); 1443 (void) abd_iterate_func2(dst, src, 0, 0, size, 1444 vdev_raidz_reconst_q_pre_func, NULL); 1445 (void) abd_iterate_func(dst, 1446 size, rr->rr_col[x].rc_size - size, 1447 vdev_raidz_reconst_q_pre_tail_func, NULL); 1448 } 1449 } 1450 1451 src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; 1452 dst = rr->rr_col[x].rc_abd; 1453 exp = 255 - (rr->rr_cols - 1 - x); 1454 1455 struct reconst_q_struct rq = { abd_to_buf(src), exp }; 1456 (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size, 1457 vdev_raidz_reconst_q_post_func, &rq); 1458 } 1459 1460 static void 1461 vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) 1462 { 1463 uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; 1464 abd_t *pdata, *qdata; 1465 uint64_t xsize, ysize; 1466 int x = tgts[0]; 1467 int y = tgts[1]; 1468 abd_t *xd, *yd; 1469 1470 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1471 zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y); 1472 1473 ASSERT(ntgts == 2); 1474 ASSERT(x < y); 1475 ASSERT(x >= rr->rr_firstdatacol); 1476 ASSERT(y < rr->rr_cols); 1477 1478 ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size); 1479 1480 /* 1481 * Move the parity data aside -- we're going to compute parity as 1482 * though columns x and y were full of zeros -- Pxy and Qxy. We want to 1483 * reuse the parity generation mechanism without trashing the actual 1484 * parity so we make those columns appear to be full of zeros by 1485 * setting their lengths to zero. 
1486 */ 1487 pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd; 1488 qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; 1489 xsize = rr->rr_col[x].rc_size; 1490 ysize = rr->rr_col[y].rc_size; 1491 1492 rr->rr_col[VDEV_RAIDZ_P].rc_abd = 1493 abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE); 1494 rr->rr_col[VDEV_RAIDZ_Q].rc_abd = 1495 abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); 1496 rr->rr_col[x].rc_size = 0; 1497 rr->rr_col[y].rc_size = 0; 1498 1499 vdev_raidz_generate_parity_pq(rr); 1500 1501 rr->rr_col[x].rc_size = xsize; 1502 rr->rr_col[y].rc_size = ysize; 1503 1504 p = abd_to_buf(pdata); 1505 q = abd_to_buf(qdata); 1506 pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 1507 qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 1508 xd = rr->rr_col[x].rc_abd; 1509 yd = rr->rr_col[y].rc_abd; 1510 1511 /* 1512 * We now have: 1513 * Pxy = P + D_x + D_y 1514 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 1515 * 1516 * We can then solve for D_x: 1517 * D_x = A * (P + Pxy) + B * (Q + Qxy) 1518 * where 1519 * A = 2^(x - y) * (2^(x - y) + 1)^-1 1520 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 1521 * 1522 * With D_x in hand, we can easily solve for D_y: 1523 * D_y = P + Pxy + D_x 1524 */ 1525 1526 a = vdev_raidz_pow2[255 + x - y]; 1527 b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)]; 1528 tmp = 255 - vdev_raidz_log2[a ^ 1]; 1529 1530 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 1531 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 1532 1533 ASSERT3U(xsize, >=, ysize); 1534 struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp }; 1535 1536 (void) abd_iterate_func2(xd, yd, 0, 0, ysize, 1537 vdev_raidz_reconst_pq_func, &rpq); 1538 (void) abd_iterate_func(xd, ysize, xsize - ysize, 1539 vdev_raidz_reconst_pq_tail_func, &rpq); 1540 1541 abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 1542 abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 1543 1544 /* 1545 * Restore the saved parity data. 1546 */ 1547 rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata; 1548 rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata; 1549 } 1550 1551 /* 1552 * In the general case of reconstruction, we must solve the system of linear 1553 * equations defined by the coefficients used to generate parity as well as 1554 * the contents of the data and parity disks. This can be expressed with 1555 * vectors for the original data (D) and the actual data (d) and parity (p) 1556 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 1557 * 1558 * __ __ __ __ 1559 * | | __ __ | p_0 | 1560 * | V | | D_0 | | p_m-1 | 1561 * | | x | : | = | d_0 | 1562 * | I | | D_n-1 | | : | 1563 * | | ~~ ~~ | d_n-1 | 1564 * ~~ ~~ ~~ ~~ 1565 * 1566 * I is simply a square identity matrix of size n, and V is a vandermonde 1567 * matrix defined by the coefficients we chose for the various parity columns 1568 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 1569 * computation as well as linear separability. 1570 * 1571 * __ __ __ __ 1572 * | 1 .. 1 1 1 | | p_0 | 1573 * | 2^n-1 .. 4 2 1 | __ __ | : | 1574 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 1575 * | 1 .. 0 0 0 | | D_1 | | d_0 | 1576 * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 1577 * | : : : : | | : | | d_2 | 1578 * | 0 .. 1 0 0 | | D_n-1 | | : | 1579 * | 0 .. 0 1 0 | ~~ ~~ | : | 1580 * | 0 .. 0 0 1 | | d_n-1 | 1581 * ~~ ~~ ~~ ~~ 1582 * 1583 * Note that I, V, d, and p are known. To compute D, we must invert the 1584 * matrix and use the known data and parity values to reconstruct the unknown 1585 * data values. 
We begin by removing the rows in V|I and d|p that correspond 1586 * to failed or missing columns; we then make V|I square (n x n) and d|p 1587 * sized n by removing rows corresponding to unused parity from the bottom up 1588 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 1589 * using Gauss-Jordan elimination. In the example below we use m=3 parity 1590 * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 1591 * __ __ 1592 * | 1 1 1 1 1 1 1 1 | 1593 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 1594 * | 19 205 116 29 64 16 4 1 | / / 1595 * | 1 0 0 0 0 0 0 0 | / / 1596 * | 0 1 0 0 0 0 0 0 | <--' / 1597 * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 1598 * | 0 0 0 1 0 0 0 0 | 1599 * | 0 0 0 0 1 0 0 0 | 1600 * | 0 0 0 0 0 1 0 0 | 1601 * | 0 0 0 0 0 0 1 0 | 1602 * | 0 0 0 0 0 0 0 1 | 1603 * ~~ ~~ 1604 * __ __ 1605 * | 1 1 1 1 1 1 1 1 | 1606 * | 128 64 32 16 8 4 2 1 | 1607 * | 19 205 116 29 64 16 4 1 | 1608 * | 1 0 0 0 0 0 0 0 | 1609 * | 0 1 0 0 0 0 0 0 | 1610 * (V|I)' = | 0 0 1 0 0 0 0 0 | 1611 * | 0 0 0 1 0 0 0 0 | 1612 * | 0 0 0 0 1 0 0 0 | 1613 * | 0 0 0 0 0 1 0 0 | 1614 * | 0 0 0 0 0 0 1 0 | 1615 * | 0 0 0 0 0 0 0 1 | 1616 * ~~ ~~ 1617 * 1618 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 1619 * have carefully chosen the seed values 1, 2, and 4 to ensure that this 1620 * matrix is not singular. 1621 * __ __ 1622 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1623 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1624 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1625 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1626 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1627 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1628 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1629 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1630 * ~~ ~~ 1631 * __ __ 1632 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1633 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1634 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1635 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1636 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1637 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1638 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1639 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1640 * ~~ ~~ 1641 * __ __ 1642 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1643 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1644 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 1645 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1646 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1647 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1648 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1649 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1650 * ~~ ~~ 1651 * __ __ 1652 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1653 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1654 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 1655 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1656 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1657 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1658 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1659 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1660 * ~~ ~~ 1661 * __ __ 1662 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1663 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1664 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1665 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1666 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1667 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1668 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1669 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1670 * ~~ ~~ 1671 * __ __ 1672 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1673 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 1674 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1675 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1676 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1677 * | 0 0 0 0 0 
1 0 0 0 0 0 0 0 1 0 0 | 1678 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1679 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1680 * ~~ ~~ 1681 * __ __ 1682 * | 0 0 1 0 0 0 0 0 | 1683 * | 167 100 5 41 159 169 217 208 | 1684 * | 166 100 4 40 158 168 216 209 | 1685 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 1686 * | 0 0 0 0 1 0 0 0 | 1687 * | 0 0 0 0 0 1 0 0 | 1688 * | 0 0 0 0 0 0 1 0 | 1689 * | 0 0 0 0 0 0 0 1 | 1690 * ~~ ~~ 1691 * 1692 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 1693 * of the missing data. 1694 * 1695 * As is apparent from the example above, the only non-trivial rows in the 1696 * inverse matrix correspond to the data disks that we're trying to 1697 * reconstruct. Indeed, those are the only rows we need as the others would 1698 * only be useful for reconstructing data known or assumed to be valid. For 1699 * that reason, we only build the coefficients in the rows that correspond to 1700 * targeted columns. 1701 */ 1702 1703 static void 1704 vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map, 1705 uint8_t **rows) 1706 { 1707 int i, j; 1708 int pow; 1709 1710 ASSERT(n == rr->rr_cols - rr->rr_firstdatacol); 1711 1712 /* 1713 * Fill in the missing rows of interest. 1714 */ 1715 for (i = 0; i < nmap; i++) { 1716 ASSERT3S(0, <=, map[i]); 1717 ASSERT3S(map[i], <=, 2); 1718 1719 pow = map[i] * n; 1720 if (pow > 255) 1721 pow -= 255; 1722 ASSERT(pow <= 255); 1723 1724 for (j = 0; j < n; j++) { 1725 pow -= map[i]; 1726 if (pow < 0) 1727 pow += 255; 1728 rows[i][j] = vdev_raidz_pow2[pow]; 1729 } 1730 } 1731 } 1732 1733 static void 1734 vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing, 1735 uint8_t **rows, uint8_t **invrows, const uint8_t *used) 1736 { 1737 int i, j, ii, jj; 1738 uint8_t log; 1739 1740 /* 1741 * Assert that the first nmissing entries from the array of used 1742 * columns correspond to parity columns and that subsequent entries 1743 * correspond to data columns. 1744 */ 1745 for (i = 0; i < nmissing; i++) { 1746 ASSERT3S(used[i], <, rr->rr_firstdatacol); 1747 } 1748 for (; i < n; i++) { 1749 ASSERT3S(used[i], >=, rr->rr_firstdatacol); 1750 } 1751 1752 /* 1753 * First initialize the storage where we'll compute the inverse rows. 1754 */ 1755 for (i = 0; i < nmissing; i++) { 1756 for (j = 0; j < n; j++) { 1757 invrows[i][j] = (i == j) ? 1 : 0; 1758 } 1759 } 1760 1761 /* 1762 * Subtract all trivial rows from the rows of consequence. 1763 */ 1764 for (i = 0; i < nmissing; i++) { 1765 for (j = nmissing; j < n; j++) { 1766 ASSERT3U(used[j], >=, rr->rr_firstdatacol); 1767 jj = used[j] - rr->rr_firstdatacol; 1768 ASSERT3S(jj, <, n); 1769 invrows[i][j] = rows[i][jj]; 1770 rows[i][jj] = 0; 1771 } 1772 } 1773 1774 /* 1775 * For each of the rows of interest, we must normalize it and subtract 1776 * a multiple of it from the other rows. 1777 */ 1778 for (i = 0; i < nmissing; i++) { 1779 for (j = 0; j < missing[i]; j++) { 1780 ASSERT0(rows[i][j]); 1781 } 1782 ASSERT3U(rows[i][missing[i]], !=, 0); 1783 1784 /* 1785 * Compute the inverse of the first element and multiply each 1786 * element in the row by that value. 
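 *
 * For reference: the nonzero elements of GF(2^8) form a multiplicative
 * group of order 255, so the inverse of the pivot A is A^254, which is
 * the same as 2^(255 - log_2(A)). For example, the inverse of A = 2 is
 * 2^254 = 0x8e, and indeed (0x8e << 1) ^ 0x11d = 1. The loop below
 * performs this division in the exponent domain by handing
 * log = 255 - log_2(A) to vdev_raidz_exp2().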
1787 */ 1788 log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 1789 1790 for (j = 0; j < n; j++) { 1791 rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 1792 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 1793 } 1794 1795 for (ii = 0; ii < nmissing; ii++) { 1796 if (i == ii) 1797 continue; 1798 1799 ASSERT3U(rows[ii][missing[i]], !=, 0); 1800 1801 log = vdev_raidz_log2[rows[ii][missing[i]]]; 1802 1803 for (j = 0; j < n; j++) { 1804 rows[ii][j] ^= 1805 vdev_raidz_exp2(rows[i][j], log); 1806 invrows[ii][j] ^= 1807 vdev_raidz_exp2(invrows[i][j], log); 1808 } 1809 } 1810 } 1811 1812 /* 1813 * Verify that the data that is left in the rows are properly part of 1814 * an identity matrix. 1815 */ 1816 for (i = 0; i < nmissing; i++) { 1817 for (j = 0; j < n; j++) { 1818 if (j == missing[i]) { 1819 ASSERT3U(rows[i][j], ==, 1); 1820 } else { 1821 ASSERT0(rows[i][j]); 1822 } 1823 } 1824 } 1825 } 1826 1827 static void 1828 vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing, 1829 int *missing, uint8_t **invrows, const uint8_t *used) 1830 { 1831 int i, j, x, cc, c; 1832 uint8_t *src; 1833 uint64_t ccount; 1834 uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL }; 1835 uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 }; 1836 uint8_t log = 0; 1837 uint8_t val; 1838 int ll; 1839 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 1840 uint8_t *p, *pp; 1841 size_t psize; 1842 1843 psize = sizeof (invlog[0][0]) * n * nmissing; 1844 p = kmem_alloc(psize, KM_SLEEP); 1845 1846 for (pp = p, i = 0; i < nmissing; i++) { 1847 invlog[i] = pp; 1848 pp += n; 1849 } 1850 1851 for (i = 0; i < nmissing; i++) { 1852 for (j = 0; j < n; j++) { 1853 ASSERT3U(invrows[i][j], !=, 0); 1854 invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 1855 } 1856 } 1857 1858 for (i = 0; i < n; i++) { 1859 c = used[i]; 1860 ASSERT3U(c, <, rr->rr_cols); 1861 1862 ccount = rr->rr_col[c].rc_size; 1863 ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0); 1864 if (ccount == 0) 1865 continue; 1866 src = abd_to_buf(rr->rr_col[c].rc_abd); 1867 for (j = 0; j < nmissing; j++) { 1868 cc = missing[j] + rr->rr_firstdatacol; 1869 ASSERT3U(cc, >=, rr->rr_firstdatacol); 1870 ASSERT3U(cc, <, rr->rr_cols); 1871 ASSERT3U(cc, !=, c); 1872 1873 dcount[j] = rr->rr_col[cc].rc_size; 1874 if (dcount[j] != 0) 1875 dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd); 1876 } 1877 1878 for (x = 0; x < ccount; x++, src++) { 1879 if (*src != 0) 1880 log = vdev_raidz_log2[*src]; 1881 1882 for (cc = 0; cc < nmissing; cc++) { 1883 if (x >= dcount[cc]) 1884 continue; 1885 1886 if (*src == 0) { 1887 val = 0; 1888 } else { 1889 if ((ll = log + invlog[cc][i]) >= 255) 1890 ll -= 255; 1891 val = vdev_raidz_pow2[ll]; 1892 } 1893 1894 if (i == 0) 1895 dst[cc][x] = val; 1896 else 1897 dst[cc][x] ^= val; 1898 } 1899 } 1900 } 1901 1902 kmem_free(p, psize); 1903 } 1904 1905 static void 1906 vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) 1907 { 1908 int i, c, t, tt; 1909 unsigned int n; 1910 unsigned int nmissing_rows; 1911 int missing_rows[VDEV_RAIDZ_MAXPARITY]; 1912 int parity_map[VDEV_RAIDZ_MAXPARITY]; 1913 uint8_t *p, *pp; 1914 size_t psize; 1915 uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 1916 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 1917 uint8_t *used; 1918 1919 abd_t **bufs = NULL; 1920 1921 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1922 zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts); 1923 /* 1924 * Matrix reconstruction can't use scatter ABDs yet, so we allocate 1925 * temporary linear ABDs if any non-linear ABDs are found. 
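 * (abd_to_buf(), used by vdev_raidz_matrix_reconstruct() below, requires
 * a flat linear buffer, hence the copy into linear ABDs here and the
 * copy back at the end of this function.)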
1926 */ 1927 for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) { 1928 ASSERT(rr->rr_col[i].rc_abd != NULL); 1929 if (!abd_is_linear(rr->rr_col[i].rc_abd)) { 1930 bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *), 1931 KM_PUSHPAGE); 1932 1933 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1934 raidz_col_t *col = &rr->rr_col[c]; 1935 1936 bufs[c] = col->rc_abd; 1937 if (bufs[c] != NULL) { 1938 col->rc_abd = abd_alloc_linear( 1939 col->rc_size, B_TRUE); 1940 abd_copy(col->rc_abd, bufs[c], 1941 col->rc_size); 1942 } 1943 } 1944 1945 break; 1946 } 1947 } 1948 1949 n = rr->rr_cols - rr->rr_firstdatacol; 1950 1951 /* 1952 * Figure out which data columns are missing. 1953 */ 1954 nmissing_rows = 0; 1955 for (t = 0; t < ntgts; t++) { 1956 if (tgts[t] >= rr->rr_firstdatacol) { 1957 missing_rows[nmissing_rows++] = 1958 tgts[t] - rr->rr_firstdatacol; 1959 } 1960 } 1961 1962 /* 1963 * Figure out which parity columns to use to help generate the missing 1964 * data columns. 1965 */ 1966 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 1967 ASSERT(tt < ntgts); 1968 ASSERT(c < rr->rr_firstdatacol); 1969 1970 /* 1971 * Skip any targeted parity columns. 1972 */ 1973 if (c == tgts[tt]) { 1974 tt++; 1975 continue; 1976 } 1977 1978 parity_map[i] = c; 1979 i++; 1980 } 1981 1982 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1983 nmissing_rows * n + sizeof (used[0]) * n; 1984 p = kmem_alloc(psize, KM_SLEEP); 1985 1986 for (pp = p, i = 0; i < nmissing_rows; i++) { 1987 rows[i] = pp; 1988 pp += n; 1989 invrows[i] = pp; 1990 pp += n; 1991 } 1992 used = pp; 1993 1994 for (i = 0; i < nmissing_rows; i++) { 1995 used[i] = parity_map[i]; 1996 } 1997 1998 for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1999 if (tt < nmissing_rows && 2000 c == missing_rows[tt] + rr->rr_firstdatacol) { 2001 tt++; 2002 continue; 2003 } 2004 2005 ASSERT3S(i, <, n); 2006 used[i] = c; 2007 i++; 2008 } 2009 2010 /* 2011 * Initialize the interesting rows of the matrix. 2012 */ 2013 vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows); 2014 2015 /* 2016 * Invert the matrix. 2017 */ 2018 vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows, 2019 invrows, used); 2020 2021 /* 2022 * Reconstruct the missing data using the generated matrix. 
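 *
 * As a sanity check on the math: if only a single data column were
 * missing and the P parity were used, the single inverted row would be
 * all ones, and the multiply-accumulate loop in
 * vdev_raidz_matrix_reconstruct() would reduce to XORing P with the
 * surviving data columns.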
2023 */ 2024 vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows, 2025 invrows, used); 2026 2027 kmem_free(p, psize); 2028 2029 /* 2030 * copy back from temporary linear abds and free them 2031 */ 2032 if (bufs) { 2033 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 2034 raidz_col_t *col = &rr->rr_col[c]; 2035 2036 if (bufs[c] != NULL) { 2037 abd_copy(bufs[c], col->rc_abd, col->rc_size); 2038 abd_free(col->rc_abd); 2039 } 2040 col->rc_abd = bufs[c]; 2041 } 2042 kmem_free(bufs, rr->rr_cols * sizeof (abd_t *)); 2043 } 2044 } 2045 2046 static void 2047 vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr, 2048 const int *t, int nt) 2049 { 2050 int tgts[VDEV_RAIDZ_MAXPARITY], *dt; 2051 int ntgts; 2052 int i, c, ret; 2053 int nbadparity, nbaddata; 2054 int parity_valid[VDEV_RAIDZ_MAXPARITY]; 2055 2056 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { 2057 zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)", 2058 rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata, 2059 (int)rr->rr_missingparity); 2060 } 2061 2062 nbadparity = rr->rr_firstdatacol; 2063 nbaddata = rr->rr_cols - nbadparity; 2064 ntgts = 0; 2065 for (i = 0, c = 0; c < rr->rr_cols; c++) { 2066 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { 2067 zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u " 2068 "offset=%llx error=%u)", 2069 rr, c, (int)rr->rr_col[c].rc_devidx, 2070 (long long)rr->rr_col[c].rc_offset, 2071 (int)rr->rr_col[c].rc_error); 2072 } 2073 if (c < rr->rr_firstdatacol) 2074 parity_valid[c] = B_FALSE; 2075 2076 if (i < nt && c == t[i]) { 2077 tgts[ntgts++] = c; 2078 i++; 2079 } else if (rr->rr_col[c].rc_error != 0) { 2080 tgts[ntgts++] = c; 2081 } else if (c >= rr->rr_firstdatacol) { 2082 nbaddata--; 2083 } else { 2084 parity_valid[c] = B_TRUE; 2085 nbadparity--; 2086 } 2087 } 2088 2089 ASSERT(ntgts >= nt); 2090 ASSERT(nbaddata >= 0); 2091 ASSERT(nbaddata + nbadparity == ntgts); 2092 2093 dt = &tgts[nbadparity]; 2094 2095 /* Reconstruct using the new math implementation */ 2096 ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata); 2097 if (ret != RAIDZ_ORIGINAL_IMPL) 2098 return; 2099 2100 /* 2101 * See if we can use any of our optimized reconstruction routines. 
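 * The special cases below cover a single missing data column repaired
 * from P or Q, and a pair of missing data columns repaired from P and Q
 * together. Anything else (including any reconstruction that needs R)
 * falls through to vdev_raidz_reconstruct_general().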
2102 */ 2103 switch (nbaddata) { 2104 case 1: 2105 if (parity_valid[VDEV_RAIDZ_P]) { 2106 vdev_raidz_reconstruct_p(rr, dt, 1); 2107 return; 2108 } 2109 2110 ASSERT(rr->rr_firstdatacol > 1); 2111 2112 if (parity_valid[VDEV_RAIDZ_Q]) { 2113 vdev_raidz_reconstruct_q(rr, dt, 1); 2114 return; 2115 } 2116 2117 ASSERT(rr->rr_firstdatacol > 2); 2118 break; 2119 2120 case 2: 2121 ASSERT(rr->rr_firstdatacol > 1); 2122 2123 if (parity_valid[VDEV_RAIDZ_P] && 2124 parity_valid[VDEV_RAIDZ_Q]) { 2125 vdev_raidz_reconstruct_pq(rr, dt, 2); 2126 return; 2127 } 2128 2129 ASSERT(rr->rr_firstdatacol > 2); 2130 2131 break; 2132 } 2133 2134 vdev_raidz_reconstruct_general(rr, tgts, ntgts); 2135 } 2136 2137 static int 2138 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, 2139 uint64_t *logical_ashift, uint64_t *physical_ashift) 2140 { 2141 vdev_raidz_t *vdrz = vd->vdev_tsd; 2142 uint64_t nparity = vdrz->vd_nparity; 2143 int c; 2144 int lasterror = 0; 2145 int numerrors = 0; 2146 2147 ASSERT(nparity > 0); 2148 2149 if (nparity > VDEV_RAIDZ_MAXPARITY || 2150 vd->vdev_children < nparity + 1) { 2151 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 2152 return (SET_ERROR(EINVAL)); 2153 } 2154 2155 vdev_open_children(vd); 2156 2157 for (c = 0; c < vd->vdev_children; c++) { 2158 vdev_t *cvd = vd->vdev_child[c]; 2159 2160 if (cvd->vdev_open_error != 0) { 2161 lasterror = cvd->vdev_open_error; 2162 numerrors++; 2163 continue; 2164 } 2165 2166 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 2167 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; 2168 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); 2169 } 2170 for (c = 0; c < vd->vdev_children; c++) { 2171 vdev_t *cvd = vd->vdev_child[c]; 2172 2173 if (cvd->vdev_open_error != 0) 2174 continue; 2175 *physical_ashift = vdev_best_ashift(*logical_ashift, 2176 *physical_ashift, cvd->vdev_physical_ashift); 2177 } 2178 2179 if (vd->vdev_rz_expanding) { 2180 *asize *= vd->vdev_children - 1; 2181 *max_asize *= vd->vdev_children - 1; 2182 2183 vd->vdev_min_asize = *asize; 2184 } else { 2185 *asize *= vd->vdev_children; 2186 *max_asize *= vd->vdev_children; 2187 } 2188 2189 if (numerrors > nparity) { 2190 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 2191 return (lasterror); 2192 } 2193 2194 return (0); 2195 } 2196 2197 static void 2198 vdev_raidz_close(vdev_t *vd) 2199 { 2200 for (int c = 0; c < vd->vdev_children; c++) { 2201 if (vd->vdev_child[c] != NULL) 2202 vdev_close(vd->vdev_child[c]); 2203 } 2204 } 2205 2206 /* 2207 * Return the logical width to use, given the txg in which the allocation 2208 * happened. Note that BP_GET_BIRTH() is usually the txg in which the 2209 * BP was allocated. Remapped BP's (that were relocated due to device 2210 * removal, see remap_blkptr_cb()), will have a more recent physical birth 2211 * which reflects when the BP was relocated, but we can ignore these because 2212 * they can't be on RAIDZ (device removal doesn't support RAIDZ). 
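 *
 * For example, if the vdev was originally 4 wide and an expansion to 5
 * children was recorded in vd_expand_txgs with re_txg 1000, a BP born in
 * txg 900 is treated as 4 wide, while one born in txg 1000 or later is
 * treated as 5 wide.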
2213 */ 2214 static uint64_t 2215 vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg) 2216 { 2217 reflow_node_t lookup = { 2218 .re_txg = txg, 2219 }; 2220 avl_index_t where; 2221 2222 uint64_t width; 2223 mutex_enter(&vdrz->vd_expand_lock); 2224 reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where); 2225 if (re != NULL) { 2226 width = re->re_logical_width; 2227 } else { 2228 re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE); 2229 if (re != NULL) 2230 width = re->re_logical_width; 2231 else 2232 width = vdrz->vd_original_width; 2233 } 2234 mutex_exit(&vdrz->vd_expand_lock); 2235 return (width); 2236 } 2237 2238 /* 2239 * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated 2240 * more space due to the lower data-to-parity ratio. In this case it's 2241 * important to pass in the correct txg. Note that vdev_gang_header_asize() 2242 * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE, 2243 * regardless of txg. This is assured because for a single data sector, we 2244 * allocate P+1 sectors regardless of width ("cols", which is at least P+1). 2245 */ 2246 static uint64_t 2247 vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg) 2248 { 2249 vdev_raidz_t *vdrz = vd->vdev_tsd; 2250 uint64_t asize; 2251 uint64_t ashift = vd->vdev_top->vdev_ashift; 2252 uint64_t cols = vdrz->vd_original_width; 2253 uint64_t nparity = vdrz->vd_nparity; 2254 2255 cols = vdev_raidz_get_logical_width(vdrz, txg); 2256 2257 asize = ((psize - 1) >> ashift) + 1; 2258 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 2259 asize = roundup(asize, nparity + 1) << ashift; 2260 2261 #ifdef ZFS_DEBUG 2262 uint64_t asize_new = ((psize - 1) >> ashift) + 1; 2263 uint64_t ncols_new = vdrz->vd_physical_width; 2264 asize_new += nparity * ((asize_new + ncols_new - nparity - 1) / 2265 (ncols_new - nparity)); 2266 asize_new = roundup(asize_new, nparity + 1) << ashift; 2267 VERIFY3U(asize_new, <=, asize); 2268 #endif 2269 2270 return (asize); 2271 } 2272 2273 /* 2274 * The allocatable space for a raidz vdev is N * sizeof(smallest child) 2275 * so each child must provide at least 1/Nth of its asize. 2276 */ 2277 static uint64_t 2278 vdev_raidz_min_asize(vdev_t *vd) 2279 { 2280 return ((vd->vdev_min_asize + vd->vdev_children - 1) / 2281 vd->vdev_children); 2282 } 2283 2284 void 2285 vdev_raidz_child_done(zio_t *zio) 2286 { 2287 raidz_col_t *rc = zio->io_private; 2288 2289 ASSERT3P(rc->rc_abd, !=, NULL); 2290 rc->rc_error = zio->io_error; 2291 rc->rc_tried = 1; 2292 rc->rc_skipped = 0; 2293 } 2294 2295 static void 2296 vdev_raidz_shadow_child_done(zio_t *zio) 2297 { 2298 raidz_col_t *rc = zio->io_private; 2299 2300 rc->rc_shadow_error = zio->io_error; 2301 } 2302 2303 static void 2304 vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col) 2305 { 2306 (void) rm; 2307 #ifdef ZFS_DEBUG 2308 range_seg64_t logical_rs, physical_rs, remain_rs; 2309 logical_rs.rs_start = rr->rr_offset; 2310 logical_rs.rs_end = logical_rs.rs_start + 2311 vdev_raidz_asize(zio->io_vd, rr->rr_size, 2312 BP_GET_BIRTH(zio->io_bp)); 2313 2314 raidz_col_t *rc = &rr->rr_col[col]; 2315 vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; 2316 2317 vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs); 2318 ASSERT(vdev_xlate_is_empty(&remain_rs)); 2319 if (vdev_xlate_is_empty(&physical_rs)) { 2320 /* 2321 * If we are in the middle of expansion, the 2322 * physical->logical mapping is changing so vdev_xlate() 2323 * can't give us a reliable answer. 
2324 */ 2325 return; 2326 } 2327 ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); 2328 ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); 2329 /* 2330 * It would be nice to assert that rs_end is equal 2331 * to rc_offset + rc_size but there might be an 2332 * optional I/O at the end that is not accounted in 2333 * rc_size. 2334 */ 2335 if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { 2336 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + 2337 rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift)); 2338 } else { 2339 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); 2340 } 2341 #endif 2342 } 2343 2344 static void 2345 vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr) 2346 { 2347 vdev_t *vd = zio->io_vd; 2348 raidz_map_t *rm = zio->io_vsd; 2349 2350 vdev_raidz_generate_parity_row(rm, rr); 2351 2352 for (int c = 0; c < rr->rr_scols; c++) { 2353 raidz_col_t *rc = &rr->rr_col[c]; 2354 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 2355 2356 /* Verify physical to logical translation */ 2357 vdev_raidz_io_verify(zio, rm, rr, c); 2358 2359 if (rc->rc_size == 0) 2360 continue; 2361 2362 ASSERT3U(rc->rc_offset + rc->rc_size, <, 2363 cvd->vdev_psize - VDEV_LABEL_END_SIZE); 2364 2365 ASSERT3P(rc->rc_abd, !=, NULL); 2366 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2367 rc->rc_offset, rc->rc_abd, 2368 abd_get_size(rc->rc_abd), zio->io_type, 2369 zio->io_priority, 0, vdev_raidz_child_done, rc)); 2370 2371 if (rc->rc_shadow_devidx != INT_MAX) { 2372 vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx]; 2373 2374 ASSERT3U( 2375 rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <, 2376 cvd2->vdev_psize - VDEV_LABEL_END_SIZE); 2377 2378 zio_nowait(zio_vdev_child_io(zio, NULL, cvd2, 2379 rc->rc_shadow_offset, rc->rc_abd, 2380 abd_get_size(rc->rc_abd), 2381 zio->io_type, zio->io_priority, 0, 2382 vdev_raidz_shadow_child_done, rc)); 2383 } 2384 } 2385 } 2386 2387 /* 2388 * Generate optional I/Os for skip sectors to improve aggregation contiguity. 2389 * This only works for vdev_raidz_map_alloc() (not _expanded()). 2390 */ 2391 static void 2392 raidz_start_skip_writes(zio_t *zio) 2393 { 2394 vdev_t *vd = zio->io_vd; 2395 uint64_t ashift = vd->vdev_top->vdev_ashift; 2396 raidz_map_t *rm = zio->io_vsd; 2397 ASSERT3U(rm->rm_nrows, ==, 1); 2398 raidz_row_t *rr = rm->rm_row[0]; 2399 for (int c = 0; c < rr->rr_scols; c++) { 2400 raidz_col_t *rc = &rr->rr_col[c]; 2401 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 2402 if (rc->rc_size != 0) 2403 continue; 2404 ASSERT3P(rc->rc_abd, ==, NULL); 2405 2406 ASSERT3U(rc->rc_offset, <, 2407 cvd->vdev_psize - VDEV_LABEL_END_SIZE); 2408 2409 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, 2410 NULL, 1ULL << ashift, zio->io_type, zio->io_priority, 2411 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); 2412 } 2413 } 2414 2415 static void 2416 vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) 2417 { 2418 vdev_t *vd = zio->io_vd; 2419 2420 /* 2421 * Iterate over the columns in reverse order so that we hit the parity 2422 * last -- any errors along the way will force us to read the parity. 
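 * For example, with RAIDZ2 columns 0 and 1 hold P and Q and the remaining
 * columns hold data, so the reverse walk issues the data reads first; if
 * any data child turns out to be unreadable or stale, rr_missingdata is
 * already nonzero by the time we reach Q and P, and the parity reads are
 * issued as well.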
2423 */ 2424 for (int c = rr->rr_cols - 1; c >= 0; c--) { 2425 raidz_col_t *rc = &rr->rr_col[c]; 2426 if (rc->rc_size == 0) 2427 continue; 2428 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 2429 if (!vdev_readable(cvd)) { 2430 if (c >= rr->rr_firstdatacol) 2431 rr->rr_missingdata++; 2432 else 2433 rr->rr_missingparity++; 2434 rc->rc_error = SET_ERROR(ENXIO); 2435 rc->rc_tried = 1; /* don't even try */ 2436 rc->rc_skipped = 1; 2437 continue; 2438 } 2439 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 2440 if (c >= rr->rr_firstdatacol) 2441 rr->rr_missingdata++; 2442 else 2443 rr->rr_missingparity++; 2444 rc->rc_error = SET_ERROR(ESTALE); 2445 rc->rc_skipped = 1; 2446 continue; 2447 } 2448 if (forceparity || 2449 c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || 2450 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { 2451 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2452 rc->rc_offset, rc->rc_abd, rc->rc_size, 2453 zio->io_type, zio->io_priority, 0, 2454 vdev_raidz_child_done, rc)); 2455 } 2456 } 2457 } 2458 2459 static void 2460 vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm) 2461 { 2462 vdev_t *vd = zio->io_vd; 2463 2464 for (int i = 0; i < rm->rm_nphys_cols; i++) { 2465 raidz_col_t *prc = &rm->rm_phys_col[i]; 2466 if (prc->rc_size == 0) 2467 continue; 2468 2469 ASSERT3U(prc->rc_devidx, ==, i); 2470 vdev_t *cvd = vd->vdev_child[i]; 2471 if (!vdev_readable(cvd)) { 2472 prc->rc_error = SET_ERROR(ENXIO); 2473 prc->rc_tried = 1; /* don't even try */ 2474 prc->rc_skipped = 1; 2475 continue; 2476 } 2477 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 2478 prc->rc_error = SET_ERROR(ESTALE); 2479 prc->rc_skipped = 1; 2480 continue; 2481 } 2482 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2483 prc->rc_offset, prc->rc_abd, prc->rc_size, 2484 zio->io_type, zio->io_priority, 0, 2485 vdev_raidz_child_done, prc)); 2486 } 2487 } 2488 2489 static void 2490 vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm) 2491 { 2492 /* 2493 * If there are multiple rows, we will be hitting 2494 * all disks, so go ahead and read the parity so 2495 * that we are reading in decent size chunks. 2496 */ 2497 boolean_t forceparity = rm->rm_nrows > 1; 2498 2499 if (rm->rm_phys_col) { 2500 vdev_raidz_io_start_read_phys_cols(zio, rm); 2501 } else { 2502 for (int i = 0; i < rm->rm_nrows; i++) { 2503 raidz_row_t *rr = rm->rm_row[i]; 2504 vdev_raidz_io_start_read_row(zio, rr, forceparity); 2505 } 2506 } 2507 } 2508 2509 /* 2510 * Start an IO operation on a RAIDZ VDev 2511 * 2512 * Outline: 2513 * - For write operations: 2514 * 1. Generate the parity data 2515 * 2. Create child zio write operations to each column's vdev, for both 2516 * data and parity. 2517 * 3. If the column skips any sectors for padding, create optional dummy 2518 * write zio children for those areas to improve aggregation continuity. 2519 * - For read operations: 2520 * 1. Create child zio read operations to each data column's vdev to read 2521 * the range of data required for zio. 2522 * 2. If this is a scrub or resilver operation, or if any of the data 2523 * vdevs have had errors, then create zio read operations to the parity 2524 * columns' VDevs as well. 
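 *
 * In either case, if the block's logical width differs from the vdev's
 * current physical width (i.e. it was written before a RAIDZ expansion
 * completed), the raidz_map_t is built by vdev_raidz_map_alloc_expanded()
 * rather than vdev_raidz_map_alloc(); see the width check at the top of
 * the function.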
2525 */ 2526 static void 2527 vdev_raidz_io_start(zio_t *zio) 2528 { 2529 vdev_t *vd = zio->io_vd; 2530 vdev_t *tvd = vd->vdev_top; 2531 vdev_raidz_t *vdrz = vd->vdev_tsd; 2532 raidz_map_t *rm; 2533 2534 uint64_t logical_width = vdev_raidz_get_logical_width(vdrz, 2535 BP_GET_BIRTH(zio->io_bp)); 2536 if (logical_width != vdrz->vd_physical_width) { 2537 zfs_locked_range_t *lr = NULL; 2538 uint64_t synced_offset = UINT64_MAX; 2539 uint64_t next_offset = UINT64_MAX; 2540 boolean_t use_scratch = B_FALSE; 2541 /* 2542 * Note: when the expansion is completing, we set 2543 * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync()) 2544 * in a later txg than when we last update spa_ubsync's state 2545 * (see the end of spa_raidz_expand_thread()). Therefore we 2546 * may see vre_state!=SCANNING before 2547 * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected 2548 * on disk, but the copying progress has been synced to disk 2549 * (and reflected in spa_ubsync). In this case it's fine to 2550 * treat the expansion as completed, since if we crash there's 2551 * no additional copying to do. 2552 */ 2553 if (vdrz->vn_vre.vre_state == DSS_SCANNING) { 2554 ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==, 2555 &vdrz->vn_vre); 2556 lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock, 2557 zio->io_offset, zio->io_size, RL_READER); 2558 use_scratch = 2559 (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) == 2560 RRSS_SCRATCH_VALID); 2561 synced_offset = 2562 RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync); 2563 next_offset = vdrz->vn_vre.vre_offset; 2564 /* 2565 * If we haven't resumed expanding since importing the 2566 * pool, vre_offset won't have been set yet. In 2567 * this case the next offset to be copied is the same 2568 * as what was synced. 2569 */ 2570 if (next_offset == UINT64_MAX) { 2571 next_offset = synced_offset; 2572 } 2573 } 2574 if (use_scratch) { 2575 zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced=" 2576 "%lld next_offset=%lld use_scratch=%u", 2577 zio, 2578 zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ", 2579 (long long)zio->io_offset, 2580 (long long)synced_offset, 2581 (long long)next_offset, 2582 use_scratch); 2583 } 2584 2585 rm = vdev_raidz_map_alloc_expanded(zio, 2586 tvd->vdev_ashift, vdrz->vd_physical_width, 2587 logical_width, vdrz->vd_nparity, 2588 synced_offset, next_offset, use_scratch); 2589 rm->rm_lr = lr; 2590 } else { 2591 rm = vdev_raidz_map_alloc(zio, 2592 tvd->vdev_ashift, logical_width, vdrz->vd_nparity); 2593 } 2594 rm->rm_original_width = vdrz->vd_original_width; 2595 2596 zio->io_vsd = rm; 2597 zio->io_vsd_ops = &vdev_raidz_vsd_ops; 2598 if (zio->io_type == ZIO_TYPE_WRITE) { 2599 for (int i = 0; i < rm->rm_nrows; i++) { 2600 vdev_raidz_io_start_write(zio, rm->rm_row[i]); 2601 } 2602 2603 if (logical_width == vdrz->vd_physical_width) { 2604 raidz_start_skip_writes(zio); 2605 } 2606 } else { 2607 ASSERT(zio->io_type == ZIO_TYPE_READ); 2608 vdev_raidz_io_start_read(zio, rm); 2609 } 2610 2611 zio_execute(zio); 2612 } 2613 2614 /* 2615 * Report a checksum error for a child of a RAID-Z device. 
2616 */ 2617 void 2618 vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) 2619 { 2620 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 2621 2622 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) && 2623 zio->io_priority != ZIO_PRIORITY_REBUILD) { 2624 zio_bad_cksum_t zbc; 2625 raidz_map_t *rm = zio->io_vsd; 2626 2627 zbc.zbc_has_cksum = 0; 2628 zbc.zbc_injected = rm->rm_ecksuminjected; 2629 2630 mutex_enter(&vd->vdev_stat_lock); 2631 vd->vdev_stat.vs_checksum_errors++; 2632 mutex_exit(&vd->vdev_stat_lock); 2633 (void) zfs_ereport_post_checksum(zio->io_spa, vd, 2634 &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, 2635 rc->rc_abd, bad_data, &zbc); 2636 } 2637 } 2638 2639 /* 2640 * We keep track of whether or not there were any injected errors, so that 2641 * any ereports we generate can note it. 2642 */ 2643 static int 2644 raidz_checksum_verify(zio_t *zio) 2645 { 2646 zio_bad_cksum_t zbc = {0}; 2647 raidz_map_t *rm = zio->io_vsd; 2648 2649 int ret = zio_checksum_error(zio, &zbc); 2650 /* 2651 * Any Direct I/O read that has a checksum error must be treated as 2652 * suspicious as the contents of the buffer could be getting 2653 * manipulated while the I/O is taking place. The checksum verify error 2654 * will be reported to the top-level RAIDZ VDEV. 2655 */ 2656 if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) { 2657 zio->io_error = ret; 2658 zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; 2659 zio_dio_chksum_verify_error_report(zio); 2660 zio_checksum_verified(zio); 2661 return (0); 2662 } 2663 2664 if (ret != 0 && zbc.zbc_injected != 0) 2665 rm->rm_ecksuminjected = 1; 2666 2667 return (ret); 2668 } 2669 2670 /* 2671 * Generate the parity from the data columns. If we tried and were able to 2672 * read the parity without error, verify that the generated parity matches the 2673 * data we read. If it doesn't, we fire off a checksum error. Return the 2674 * number of such failures. 2675 */ 2676 static int 2677 raidz_parity_verify(zio_t *zio, raidz_row_t *rr) 2678 { 2679 abd_t *orig[VDEV_RAIDZ_MAXPARITY]; 2680 int c, ret = 0; 2681 raidz_map_t *rm = zio->io_vsd; 2682 raidz_col_t *rc; 2683 2684 blkptr_t *bp = zio->io_bp; 2685 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : 2686 (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); 2687 2688 if (checksum == ZIO_CHECKSUM_NOPARITY) 2689 return (ret); 2690 2691 for (c = 0; c < rr->rr_firstdatacol; c++) { 2692 rc = &rr->rr_col[c]; 2693 if (!rc->rc_tried || rc->rc_error != 0) 2694 continue; 2695 2696 orig[c] = rc->rc_abd; 2697 ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size); 2698 rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); 2699 } 2700 2701 /* 2702 * Verify any empty sectors are zero filled to ensure the parity 2703 * is calculated correctly even if these non-data sectors are damaged. 2704 */ 2705 if (rr->rr_nempty && rr->rr_abd_empty != NULL) 2706 ret += vdev_draid_map_verify_empty(zio, rr); 2707 2708 /* 2709 * Regenerates parity even for !tried||rc_error!=0 columns. This 2710 * isn't harmful but it does have the side effect of fixing stuff 2711 * we didn't realize was necessary (i.e. even if we return 0). 
2712 */ 2713 vdev_raidz_generate_parity_row(rm, rr); 2714 2715 for (c = 0; c < rr->rr_firstdatacol; c++) { 2716 rc = &rr->rr_col[c]; 2717 2718 if (!rc->rc_tried || rc->rc_error != 0) 2719 continue; 2720 2721 if (abd_cmp(orig[c], rc->rc_abd) != 0) { 2722 zfs_dbgmsg("found error on col=%u devidx=%u off %llx", 2723 c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset); 2724 vdev_raidz_checksum_error(zio, rc, orig[c]); 2725 rc->rc_error = SET_ERROR(ECKSUM); 2726 ret++; 2727 } 2728 abd_free(orig[c]); 2729 } 2730 2731 return (ret); 2732 } 2733 2734 static int 2735 vdev_raidz_worst_error(raidz_row_t *rr) 2736 { 2737 int error = 0; 2738 2739 for (int c = 0; c < rr->rr_cols; c++) { 2740 error = zio_worst_error(error, rr->rr_col[c].rc_error); 2741 error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error); 2742 } 2743 2744 return (error); 2745 } 2746 2747 static void 2748 vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) 2749 { 2750 int unexpected_errors = 0; 2751 int parity_errors = 0; 2752 int parity_untried = 0; 2753 int data_errors = 0; 2754 2755 ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); 2756 2757 for (int c = 0; c < rr->rr_cols; c++) { 2758 raidz_col_t *rc = &rr->rr_col[c]; 2759 2760 if (rc->rc_error) { 2761 if (c < rr->rr_firstdatacol) 2762 parity_errors++; 2763 else 2764 data_errors++; 2765 2766 if (!rc->rc_skipped) 2767 unexpected_errors++; 2768 } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { 2769 parity_untried++; 2770 } 2771 2772 if (rc->rc_force_repair) 2773 unexpected_errors++; 2774 } 2775 2776 /* 2777 * If we read more parity disks than were used for 2778 * reconstruction, confirm that the other parity disks produced 2779 * correct data. 2780 * 2781 * Note that we also regenerate parity when resilvering so we 2782 * can write it out to failed devices later. 2783 */ 2784 if (parity_errors + parity_untried < 2785 rr->rr_firstdatacol - data_errors || 2786 (zio->io_flags & ZIO_FLAG_RESILVER)) { 2787 int n = raidz_parity_verify(zio, rr); 2788 unexpected_errors += n; 2789 } 2790 2791 if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 2792 (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) { 2793 /* 2794 * Use the good data we have in hand to repair damaged children. 2795 */ 2796 for (int c = 0; c < rr->rr_cols; c++) { 2797 raidz_col_t *rc = &rr->rr_col[c]; 2798 vdev_t *vd = zio->io_vd; 2799 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 2800 2801 if (!rc->rc_allow_repair) { 2802 continue; 2803 } else if (!rc->rc_force_repair && 2804 (rc->rc_error == 0 || rc->rc_size == 0)) { 2805 continue; 2806 } 2807 /* 2808 * We do not allow self healing for Direct I/O reads. 2809 * See comment in vdev_raid_row_alloc(). 2810 */ 2811 ASSERT0(zio->io_flags & ZIO_FLAG_DIO_READ); 2812 2813 zfs_dbgmsg("zio=%px repairing c=%u devidx=%u " 2814 "offset=%llx", 2815 zio, c, rc->rc_devidx, (long long)rc->rc_offset); 2816 2817 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2818 rc->rc_offset, rc->rc_abd, rc->rc_size, 2819 ZIO_TYPE_WRITE, 2820 zio->io_priority == ZIO_PRIORITY_REBUILD ? 2821 ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE, 2822 ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 2823 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); 2824 } 2825 } 2826 2827 /* 2828 * Scrub or resilver i/o's: overwrite any shadow locations with the 2829 * good data. This ensures that if we've already copied this sector, 2830 * it will be corrected if it was damaged. 
This writes more than is 2831 * necessary, but since expansion is paused during scrub/resilver, at 2832 * most a single row will have a shadow location. 2833 */ 2834 if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 2835 (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) { 2836 for (int c = 0; c < rr->rr_cols; c++) { 2837 raidz_col_t *rc = &rr->rr_col[c]; 2838 vdev_t *vd = zio->io_vd; 2839 2840 if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0) 2841 continue; 2842 vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx]; 2843 2844 /* 2845 * Note: We don't want to update the repair stats 2846 * because that would incorrectly indicate that there 2847 * was bad data to repair, which we aren't sure about. 2848 * By clearing the SCAN_THREAD flag, we prevent this 2849 * from happening, despite having the REPAIR flag set. 2850 * We need to set SELF_HEAL so that this i/o can't be 2851 * bypassed by zio_vdev_io_start(). 2852 */ 2853 zio_t *cio = zio_vdev_child_io(zio, NULL, cvd, 2854 rc->rc_shadow_offset, rc->rc_abd, rc->rc_size, 2855 ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, 2856 ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, 2857 NULL, NULL); 2858 cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD; 2859 zio_nowait(cio); 2860 } 2861 } 2862 } 2863 2864 static void 2865 raidz_restore_orig_data(raidz_map_t *rm) 2866 { 2867 for (int i = 0; i < rm->rm_nrows; i++) { 2868 raidz_row_t *rr = rm->rm_row[i]; 2869 for (int c = 0; c < rr->rr_cols; c++) { 2870 raidz_col_t *rc = &rr->rr_col[c]; 2871 if (rc->rc_need_orig_restore) { 2872 abd_copy(rc->rc_abd, 2873 rc->rc_orig_data, rc->rc_size); 2874 rc->rc_need_orig_restore = B_FALSE; 2875 } 2876 } 2877 } 2878 } 2879 2880 /* 2881 * During raidz_reconstruct() for an expanded VDEV, we need to give special 2882 * consideration to failure simulations. See note in raidz_reconstruct() on 2883 * simulating failure of a pre-expansion device. 2884 * 2885 * Treating logical child i as failed, return TRUE if the given column should 2886 * be treated as failed. The idea of logical children allows us to imagine 2887 * that a disk silently failed before a RAIDZ expansion (reads from this disk 2888 * succeed but return the wrong data). Since the expansion doesn't verify 2889 * checksums, the incorrect data will be moved to new locations spread among 2890 * the children (going diagonally across them). 2891 * 2892 * Higher "logical child failures" (values of `i`) indicate these 2893 * "pre-expansion failures". The first physical_width values imagine that a 2894 * current child failed; the next physical_width-1 values imagine that a 2895 * child failed before the most recent expansion; the next physical_width-2 2896 * values imagine a child failed in the expansion before that, etc.
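 *
 * For example, with physical_width 4 and original_width 3 there are
 * 4 + 3 = 7 logical children: i = 0 .. 3 name the current children, and
 * i = 4 .. 6 imagine that one of the three pre-expansion children had
 * silently failed. For those last three values, a column at offset "off"
 * on physical child "d" is treated as failed when
 *
 *	(4 * (off >> ashift) + d) % 3 == i - 4
 *
 * matching the diagonal pattern in which the reflow spread the old
 * child's sectors across the new layout.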
2897 */ 2898 static boolean_t 2899 raidz_simulate_failure(int physical_width, int original_width, int ashift, 2900 int i, raidz_col_t *rc) 2901 { 2902 uint64_t sector_id = 2903 physical_width * (rc->rc_offset >> ashift) + 2904 rc->rc_devidx; 2905 2906 for (int w = physical_width; w >= original_width; w--) { 2907 if (i < w) { 2908 return (sector_id % w == i); 2909 } else { 2910 i -= w; 2911 } 2912 } 2913 ASSERT(!"invalid logical child id"); 2914 return (B_FALSE); 2915 } 2916 2917 /* 2918 * returns EINVAL if reconstruction of the block will not be possible 2919 * returns ECKSUM if this specific reconstruction failed 2920 * returns 0 on successful reconstruction 2921 */ 2922 static int 2923 raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) 2924 { 2925 raidz_map_t *rm = zio->io_vsd; 2926 int physical_width = zio->io_vd->vdev_children; 2927 int original_width = (rm->rm_original_width != 0) ? 2928 rm->rm_original_width : physical_width; 2929 int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT; 2930 2931 if (dbgmsg) { 2932 zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u " 2933 "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts); 2934 } 2935 2936 /* Reconstruct each row */ 2937 for (int r = 0; r < rm->rm_nrows; r++) { 2938 raidz_row_t *rr = rm->rm_row[r]; 2939 int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */ 2940 int t = 0; 2941 int dead = 0; 2942 int dead_data = 0; 2943 2944 if (dbgmsg) 2945 zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r); 2946 2947 for (int c = 0; c < rr->rr_cols; c++) { 2948 raidz_col_t *rc = &rr->rr_col[c]; 2949 ASSERT0(rc->rc_need_orig_restore); 2950 if (rc->rc_error != 0) { 2951 dead++; 2952 if (c >= nparity) 2953 dead_data++; 2954 continue; 2955 } 2956 if (rc->rc_size == 0) 2957 continue; 2958 for (int lt = 0; lt < ntgts; lt++) { 2959 if (raidz_simulate_failure(physical_width, 2960 original_width, 2961 zio->io_vd->vdev_top->vdev_ashift, 2962 ltgts[lt], rc)) { 2963 if (rc->rc_orig_data == NULL) { 2964 rc->rc_orig_data = 2965 abd_alloc_linear( 2966 rc->rc_size, B_TRUE); 2967 abd_copy(rc->rc_orig_data, 2968 rc->rc_abd, rc->rc_size); 2969 } 2970 rc->rc_need_orig_restore = B_TRUE; 2971 2972 dead++; 2973 if (c >= nparity) 2974 dead_data++; 2975 /* 2976 * Note: simulating failure of a 2977 * pre-expansion device can hit more 2978 * than one column, in which case we 2979 * might try to simulate more failures 2980 * than can be reconstructed, which is 2981 * also more than the size of my_tgts. 2982 * This check prevents accessing past 2983 * the end of my_tgts. The "dead > 2984 * nparity" check below will fail this 2985 * reconstruction attempt. 
2986 */ 2987 if (t < VDEV_RAIDZ_MAXPARITY) { 2988 my_tgts[t++] = c; 2989 if (dbgmsg) { 2990 zfs_dbgmsg("simulating " 2991 "failure of col %u " 2992 "devidx %u", c, 2993 (int)rc->rc_devidx); 2994 } 2995 } 2996 break; 2997 } 2998 } 2999 } 3000 if (dead > nparity) { 3001 /* reconstruction not possible */ 3002 if (dbgmsg) { 3003 zfs_dbgmsg("reconstruction not possible; " 3004 "too many failures"); 3005 } 3006 raidz_restore_orig_data(rm); 3007 return (EINVAL); 3008 } 3009 if (dead_data > 0) 3010 vdev_raidz_reconstruct_row(rm, rr, my_tgts, t); 3011 } 3012 3013 /* Check for success */ 3014 if (raidz_checksum_verify(zio) == 0) { 3015 if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) 3016 return (0); 3017 3018 /* Reconstruction succeeded - report errors */ 3019 for (int i = 0; i < rm->rm_nrows; i++) { 3020 raidz_row_t *rr = rm->rm_row[i]; 3021 3022 for (int c = 0; c < rr->rr_cols; c++) { 3023 raidz_col_t *rc = &rr->rr_col[c]; 3024 if (rc->rc_need_orig_restore) { 3025 /* 3026 * Note: if this is a parity column, 3027 * we don't really know if it's wrong. 3028 * We need to let 3029 * vdev_raidz_io_done_verified() check 3030 * it, and if we set rc_error, it will 3031 * think that it is a "known" error 3032 * that doesn't need to be checked 3033 * or corrected. 3034 */ 3035 if (rc->rc_error == 0 && 3036 c >= rr->rr_firstdatacol) { 3037 vdev_raidz_checksum_error(zio, 3038 rc, rc->rc_orig_data); 3039 rc->rc_error = 3040 SET_ERROR(ECKSUM); 3041 } 3042 rc->rc_need_orig_restore = B_FALSE; 3043 } 3044 } 3045 3046 vdev_raidz_io_done_verified(zio, rr); 3047 } 3048 3049 zio_checksum_verified(zio); 3050 3051 if (dbgmsg) { 3052 zfs_dbgmsg("reconstruction successful " 3053 "(checksum verified)"); 3054 } 3055 return (0); 3056 } 3057 3058 /* Reconstruction failed - restore original data */ 3059 raidz_restore_orig_data(rm); 3060 if (dbgmsg) { 3061 zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum " 3062 "failed", zio); 3063 } 3064 return (ECKSUM); 3065 } 3066 3067 /* 3068 * Iterate over all combinations of N bad vdevs and attempt a reconstruction. 3069 * Note that the algorithm below is non-optimal because it doesn't take into 3070 * account how reconstruction is actually performed. For example, with 3071 * triple-parity RAID-Z the reconstruction procedure is the same if column 4 3072 * is targeted as invalid as if columns 1 and 4 are targeted since in both 3073 * cases we'd only use parity information in column 0. 
3074 * 3075 * The order that we find the various possible combinations of failed 3076 * disks is dictated by these rules: 3077 * - Examine each "slot" (the "i" in tgts[i]) 3078 * - Try to increment this slot (tgts[i] += 1) 3079 * - if we can't increment because it runs into the next slot, 3080 * reset our slot to the minimum, and examine the next slot 3081 * 3082 * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose 3083 * 3 columns to reconstruct), we will generate the following sequence: 3084 * 3085 * STATE ACTION 3086 * 0 1 2 special case: skip since these are all parity 3087 * 0 1 3 first slot: reset to 0; middle slot: increment to 2 3088 * 0 2 3 first slot: increment to 1 3089 * 1 2 3 first: reset to 0; middle: reset to 1; last: increment to 4 3090 * 0 1 4 first: reset to 0; middle: increment to 2 3091 * 0 2 4 first: increment to 1 3092 * 1 2 4 first: reset to 0; middle: increment to 3 3093 * 0 3 4 first: increment to 1 3094 * 1 3 4 first: increment to 2 3095 * 2 3 4 first: reset to 0; middle: reset to 1; last: increment to 5 3096 * 0 1 5 first: reset to 0; middle: increment to 2 3097 * 0 2 5 first: increment to 1 3098 * 1 2 5 first: reset to 0; middle: increment to 3 3099 * 0 3 5 first: increment to 1 3100 * 1 3 5 first: increment to 2 3101 * 2 3 5 first: reset to 0; middle: increment to 4 3102 * 0 4 5 first: increment to 1 3103 * 1 4 5 first: increment to 2 3104 * 2 4 5 first: increment to 3 3105 * 3 4 5 done 3106 * 3107 * This strategy works for dRAID but is less efficient when there are a large 3108 * number of child vdevs and therefore permutations to check. Furthermore, 3109 * since the raidz_map_t rows likely do not overlap, reconstruction would be 3110 * possible as long as there are no more than nparity data errors per row. 3111 * These additional permutations are not currently checked but could be as 3112 * a future improvement. 3113 * 3114 * Returns 0 on success, ECKSUM on failure. 3115 */ 3116 static int 3117 vdev_raidz_combrec(zio_t *zio) 3118 { 3119 int nparity = vdev_get_nparity(zio->io_vd); 3120 raidz_map_t *rm = zio->io_vsd; 3121 int physical_width = zio->io_vd->vdev_children; 3122 int original_width = (rm->rm_original_width != 0) ? 3123 rm->rm_original_width : physical_width; 3124 3125 for (int i = 0; i < rm->rm_nrows; i++) { 3126 raidz_row_t *rr = rm->rm_row[i]; 3127 int total_errors = 0; 3128 3129 for (int c = 0; c < rr->rr_cols; c++) { 3130 if (rr->rr_col[c].rc_error) 3131 total_errors++; 3132 } 3133 3134 if (total_errors > nparity) 3135 return (vdev_raidz_worst_error(rr)); 3136 } 3137 3138 for (int num_failures = 1; num_failures <= nparity; num_failures++) { 3139 int tstore[VDEV_RAIDZ_MAXPARITY + 2]; 3140 int *ltgts = &tstore[1]; /* value is logical child ID */ 3141 3142 3143 /* 3144 * Determine number of logical children, n. See comment 3145 * above raidz_simulate_failure(). 3146 */ 3147 int n = 0; 3148 for (int w = physical_width; 3149 w >= original_width; w--) { 3150 n += w; 3151 } 3152 3153 ASSERT3U(num_failures, <=, nparity); 3154 ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY); 3155 3156 /* Handle corner cases in combrec logic */ 3157 ltgts[-1] = -1; 3158 for (int i = 0; i < num_failures; i++) { 3159 ltgts[i] = i; 3160 } 3161 ltgts[num_failures] = n; 3162 3163 for (;;) { 3164 int err = raidz_reconstruct(zio, ltgts, num_failures, 3165 nparity); 3166 if (err == EINVAL) { 3167 /* 3168 * Reconstruction not possible with this # 3169 * failures; try more failures. 
3170 */ 3171 break; 3172 } else if (err == 0) 3173 return (0); 3174 3175 /* Compute next targets to try */ 3176 for (int t = 0; ; t++) { 3177 ASSERT3U(t, <, num_failures); 3178 ltgts[t]++; 3179 if (ltgts[t] == n) { 3180 /* try more failures */ 3181 ASSERT3U(t, ==, num_failures - 1); 3182 if (zfs_flags & 3183 ZFS_DEBUG_RAIDZ_RECONSTRUCT) { 3184 zfs_dbgmsg("reconstruction " 3185 "failed for num_failures=" 3186 "%u; tried all " 3187 "combinations", 3188 num_failures); 3189 } 3190 break; 3191 } 3192 3193 ASSERT3U(ltgts[t], <, n); 3194 ASSERT3U(ltgts[t], <=, ltgts[t + 1]); 3195 3196 /* 3197 * If that spot is available, we're done here. 3198 * Try the next combination. 3199 */ 3200 if (ltgts[t] != ltgts[t + 1]) 3201 break; // found next combination 3202 3203 /* 3204 * Otherwise, reset this tgt to the minimum, 3205 * and move on to the next tgt. 3206 */ 3207 ltgts[t] = ltgts[t - 1] + 1; 3208 ASSERT3U(ltgts[t], ==, t); 3209 } 3210 3211 /* Increase the number of failures and keep trying. */ 3212 if (ltgts[num_failures - 1] == n) 3213 break; 3214 } 3215 } 3216 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 3217 zfs_dbgmsg("reconstruction failed for all num_failures"); 3218 return (ECKSUM); 3219 } 3220 3221 void 3222 vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) 3223 { 3224 for (uint64_t row = 0; row < rm->rm_nrows; row++) { 3225 raidz_row_t *rr = rm->rm_row[row]; 3226 vdev_raidz_reconstruct_row(rm, rr, t, nt); 3227 } 3228 } 3229 3230 /* 3231 * Complete a write IO operation on a RAIDZ VDev 3232 * 3233 * Outline: 3234 * 1. Check for errors on the child IOs. 3235 * 2. Return, setting an error code if too few child VDevs were written 3236 * to reconstruct the data later. Note that partial writes are 3237 * considered successful if they can be reconstructed at all. 3238 */ 3239 static void 3240 vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) 3241 { 3242 int normal_errors = 0; 3243 int shadow_errors = 0; 3244 3245 ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); 3246 ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); 3247 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); 3248 3249 for (int c = 0; c < rr->rr_cols; c++) { 3250 raidz_col_t *rc = &rr->rr_col[c]; 3251 3252 if (rc->rc_error != 0) { 3253 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 3254 normal_errors++; 3255 } 3256 if (rc->rc_shadow_error != 0) { 3257 ASSERT(rc->rc_shadow_error != ECKSUM); 3258 shadow_errors++; 3259 } 3260 } 3261 3262 /* 3263 * Treat partial writes as a success. If we couldn't write enough 3264 * columns to reconstruct the data, the I/O failed. Otherwise, good 3265 * enough. Note that in the case of a shadow write (during raidz 3266 * expansion), depending on if we crash, either the normal (old) or 3267 * shadow (new) location may become the "real" version of the block, 3268 * so both locations must have sufficient redundancy. 3269 * 3270 * Now that we support write reallocation, it would be better 3271 * to treat partial failure as real failure unless there are 3272 * no non-degraded top-level vdevs left, and not update DTLs 3273 * if we intend to reallocate. 
3274 */ 3275 if (normal_errors > rr->rr_firstdatacol || 3276 shadow_errors > rr->rr_firstdatacol) { 3277 zio->io_error = zio_worst_error(zio->io_error, 3278 vdev_raidz_worst_error(rr)); 3279 } 3280 } 3281 3282 static void 3283 vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm, 3284 raidz_row_t *rr) 3285 { 3286 int parity_errors = 0; 3287 int parity_untried = 0; 3288 int data_errors = 0; 3289 int total_errors = 0; 3290 3291 ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); 3292 ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); 3293 3294 for (int c = 0; c < rr->rr_cols; c++) { 3295 raidz_col_t *rc = &rr->rr_col[c]; 3296 3297 /* 3298 * If scrubbing and a replacing/sparing child vdev determined 3299 * that not all of its children have an identical copy of the 3300 * data, then clear the error so the column is treated like 3301 * any other read and force a repair to correct the damage. 3302 */ 3303 if (rc->rc_error == ECKSUM) { 3304 ASSERT(zio->io_flags & ZIO_FLAG_SCRUB); 3305 vdev_raidz_checksum_error(zio, rc, rc->rc_abd); 3306 rc->rc_force_repair = 1; 3307 rc->rc_error = 0; 3308 } 3309 3310 if (rc->rc_error) { 3311 if (c < rr->rr_firstdatacol) 3312 parity_errors++; 3313 else 3314 data_errors++; 3315 3316 total_errors++; 3317 } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { 3318 parity_untried++; 3319 } 3320 } 3321 3322 /* 3323 * If there were data errors and the number of errors we saw was 3324 * correctable -- less than or equal to the number of parity disks read 3325 * -- reconstruct based on the missing data. 3326 */ 3327 if (data_errors != 0 && 3328 total_errors <= rr->rr_firstdatacol - parity_untried) { 3329 /* 3330 * We either attempt to read all the parity columns or 3331 * none of them. If we didn't try to read parity, we 3332 * wouldn't be here in the correctable case. There must 3333 * also have been fewer parity errors than parity 3334 * columns or, again, we wouldn't be in this code path. 3335 */ 3336 ASSERT(parity_untried == 0); 3337 ASSERT(parity_errors < rr->rr_firstdatacol); 3338 3339 /* 3340 * Identify the data columns that reported an error. 3341 */ 3342 int n = 0; 3343 int tgts[VDEV_RAIDZ_MAXPARITY]; 3344 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 3345 raidz_col_t *rc = &rr->rr_col[c]; 3346 if (rc->rc_error != 0) { 3347 ASSERT(n < VDEV_RAIDZ_MAXPARITY); 3348 tgts[n++] = c; 3349 } 3350 } 3351 3352 ASSERT(rr->rr_firstdatacol >= n); 3353 3354 vdev_raidz_reconstruct_row(rm, rr, tgts, n); 3355 } 3356 } 3357 3358 /* 3359 * Return the number of reads issued. 3360 */ 3361 static int 3362 vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr) 3363 { 3364 vdev_t *vd = zio->io_vd; 3365 int nread = 0; 3366 3367 rr->rr_missingdata = 0; 3368 rr->rr_missingparity = 0; 3369 3370 /* 3371 * If this row contains empty sectors which are not required 3372 * for a normal read, then allocate an ABD for them now so they 3373 * may be read, verified, and any needed repairs performed.
3374 */ 3375 if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL) 3376 vdev_draid_map_alloc_empty(zio, rr); 3377 3378 for (int c = 0; c < rr->rr_cols; c++) { 3379 raidz_col_t *rc = &rr->rr_col[c]; 3380 if (rc->rc_tried || rc->rc_size == 0) 3381 continue; 3382 3383 zio_nowait(zio_vdev_child_io(zio, NULL, 3384 vd->vdev_child[rc->rc_devidx], 3385 rc->rc_offset, rc->rc_abd, rc->rc_size, 3386 zio->io_type, zio->io_priority, 0, 3387 vdev_raidz_child_done, rc)); 3388 nread++; 3389 } 3390 return (nread); 3391 } 3392 3393 /* 3394 * We're here because either there were too many errors to even attempt 3395 * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec() 3396 * failed. In either case, there is enough bad data to prevent reconstruction. 3397 * Start checksum ereports for all children which haven't failed. 3398 */ 3399 static void 3400 vdev_raidz_io_done_unrecoverable(zio_t *zio) 3401 { 3402 raidz_map_t *rm = zio->io_vsd; 3403 3404 for (int i = 0; i < rm->rm_nrows; i++) { 3405 raidz_row_t *rr = rm->rm_row[i]; 3406 3407 for (int c = 0; c < rr->rr_cols; c++) { 3408 raidz_col_t *rc = &rr->rr_col[c]; 3409 vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; 3410 3411 if (rc->rc_error != 0) 3412 continue; 3413 3414 zio_bad_cksum_t zbc; 3415 zbc.zbc_has_cksum = 0; 3416 zbc.zbc_injected = rm->rm_ecksuminjected; 3417 mutex_enter(&cvd->vdev_stat_lock); 3418 cvd->vdev_stat.vs_checksum_errors++; 3419 mutex_exit(&cvd->vdev_stat_lock); 3420 (void) zfs_ereport_start_checksum(zio->io_spa, 3421 cvd, &zio->io_bookmark, zio, rc->rc_offset, 3422 rc->rc_size, &zbc); 3423 } 3424 } 3425 } 3426 3427 void 3428 vdev_raidz_io_done(zio_t *zio) 3429 { 3430 raidz_map_t *rm = zio->io_vsd; 3431 3432 ASSERT(zio->io_bp != NULL); 3433 if (zio->io_type == ZIO_TYPE_WRITE) { 3434 for (int i = 0; i < rm->rm_nrows; i++) { 3435 vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]); 3436 } 3437 } else { 3438 if (rm->rm_phys_col) { 3439 /* 3440 * This is an aggregated read. Copy the data and status 3441 * from the aggregate abd's to the individual rows. 3442 */ 3443 for (int i = 0; i < rm->rm_nrows; i++) { 3444 raidz_row_t *rr = rm->rm_row[i]; 3445 3446 for (int c = 0; c < rr->rr_cols; c++) { 3447 raidz_col_t *rc = &rr->rr_col[c]; 3448 if (rc->rc_tried || rc->rc_size == 0) 3449 continue; 3450 3451 raidz_col_t *prc = 3452 &rm->rm_phys_col[rc->rc_devidx]; 3453 rc->rc_error = prc->rc_error; 3454 rc->rc_tried = prc->rc_tried; 3455 rc->rc_skipped = prc->rc_skipped; 3456 if (c >= rr->rr_firstdatacol) { 3457 /* 3458 * Note: this is slightly faster 3459 * than using abd_copy_off(). 3460 */ 3461 char *physbuf = abd_to_buf( 3462 prc->rc_abd); 3463 void *physloc = physbuf + 3464 rc->rc_offset - 3465 prc->rc_offset; 3466 3467 abd_copy_from_buf(rc->rc_abd, 3468 physloc, rc->rc_size); 3469 } 3470 } 3471 } 3472 } 3473 3474 for (int i = 0; i < rm->rm_nrows; i++) { 3475 raidz_row_t *rr = rm->rm_row[i]; 3476 vdev_raidz_io_done_reconstruct_known_missing(zio, 3477 rm, rr); 3478 } 3479 3480 if (raidz_checksum_verify(zio) == 0) { 3481 if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) 3482 goto done; 3483 3484 for (int i = 0; i < rm->rm_nrows; i++) { 3485 raidz_row_t *rr = rm->rm_row[i]; 3486 vdev_raidz_io_done_verified(zio, rr); 3487 } 3488 zio_checksum_verified(zio); 3489 } else { 3490 /* 3491 * A sequential resilver has no checksum, which makes 3492 * combinatorial reconstruction impossible. This code 3493 * path is unreachable since raidz_checksum_verify() 3494 * has no checksum to verify and must succeed.
3495 */ 3496 ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD); 3497 3498 /* 3499 * This isn't a typical situation -- either we got a 3500 * read error or a child silently returned bad data. 3501 * Read every block so we can try again with as much 3502 * data and parity as we can track down. If we've 3503 * already been through once before, all children will 3504 * be marked as tried so we'll proceed to combinatorial 3505 * reconstruction. 3506 */ 3507 int nread = 0; 3508 for (int i = 0; i < rm->rm_nrows; i++) { 3509 nread += vdev_raidz_read_all(zio, 3510 rm->rm_row[i]); 3511 } 3512 if (nread != 0) { 3513 /* 3514 * Normally our stage is VDEV_IO_DONE, but if 3515 * we've already called redone(), it will have 3516 * changed to VDEV_IO_START, in which case we 3517 * don't want to call redone() again. 3518 */ 3519 if (zio->io_stage != ZIO_STAGE_VDEV_IO_START) 3520 zio_vdev_io_redone(zio); 3521 return; 3522 } 3523 /* 3524 * It would be too expensive to try every possible 3525 * combination of failed sectors in every row, so 3526 * instead we try every combination of failed current or 3527 * past physical disks. This means that if the incorrect 3528 * sectors were all on Nparity disks at any point in the 3529 * past, we will find the correct data. The only known 3530 * case where this is less durable than a non-expanded 3531 * RAIDZ is if we have a silent failure during 3532 * expansion. In that case, one block could be 3533 * partially in the old format and partially in the 3534 * new format, so we'd lose some sectors from the old 3535 * format and some from the new format. 3536 * 3537 * e.g. logical_width=4 physical_width=6 3538 * the 15 (6+5+4) possible failed disks are: 3539 * width=6 child=0 3540 * width=6 child=1 3541 * width=6 child=2 3542 * width=6 child=3 3543 * width=6 child=4 3544 * width=6 child=5 3545 * width=5 child=0 3546 * width=5 child=1 3547 * width=5 child=2 3548 * width=5 child=3 3549 * width=5 child=4 3550 * width=4 child=0 3551 * width=4 child=1 3552 * width=4 child=2 3553 * width=4 child=3 3554 * And we will try every combination of Nparity of these 3555 * failing. 3556 * 3557 * As a first pass, we can generate every combo, 3558 * and try reconstructing, ignoring any known 3559 * failures. If any row has too many known + simulated 3560 * failures, then we bail on reconstructing with this 3561 * number of simulated failures. As an improvement, 3562 * we could detect the number of whole known failures 3563 * (i.e. we have known failures on these disks for 3564 * every row; the disks never succeeded), and 3565 * subtract that from the max # failures to simulate. 3566 * We could go even further like the current 3567 * combrec code, but that doesn't seem like it 3568 * gains us very much. If we simulate a failure 3569 * that is also a known failure, that's fine.
3570 */ 3571 zio->io_error = vdev_raidz_combrec(zio); 3572 if (zio->io_error == ECKSUM && 3573 !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 3574 vdev_raidz_io_done_unrecoverable(zio); 3575 } 3576 } 3577 } 3578 done: 3579 if (rm->rm_lr != NULL) { 3580 zfs_rangelock_exit(rm->rm_lr); 3581 rm->rm_lr = NULL; 3582 } 3583 } 3584 3585 static void 3586 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 3587 { 3588 vdev_raidz_t *vdrz = vd->vdev_tsd; 3589 if (faulted > vdrz->vd_nparity) 3590 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 3591 VDEV_AUX_NO_REPLICAS); 3592 else if (degraded + faulted != 0) 3593 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 3594 else 3595 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 3596 } 3597 3598 /* 3599 * Determine if any portion of the provided block resides on a child vdev 3600 * with a dirty DTL and therefore needs to be resilvered. The function 3601 * assumes that at least one DTL is dirty which implies that full stripe 3602 * width blocks must be resilvered. 3603 */ 3604 static boolean_t 3605 vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, 3606 uint64_t phys_birth) 3607 { 3608 vdev_raidz_t *vdrz = vd->vdev_tsd; 3609 3610 /* 3611 * If we're in the middle of a RAIDZ expansion, this block may be in 3612 * the old and/or new location. For simplicity, always resilver it. 3613 */ 3614 if (vdrz->vn_vre.vre_state == DSS_SCANNING) 3615 return (B_TRUE); 3616 3617 uint64_t dcols = vd->vdev_children; 3618 uint64_t nparity = vdrz->vd_nparity; 3619 uint64_t ashift = vd->vdev_top->vdev_ashift; 3620 /* The starting RAIDZ (parent) vdev sector of the block. */ 3621 uint64_t b = DVA_GET_OFFSET(dva) >> ashift; 3622 /* The zio's size in units of the vdev's minimum sector size. */ 3623 uint64_t s = ((psize - 1) >> ashift) + 1; 3624 /* The first column for this stripe. */ 3625 uint64_t f = b % dcols; 3626 3627 /* Unreachable by sequential resilver. */ 3628 ASSERT3U(phys_birth, !=, TXG_UNKNOWN); 3629 3630 if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) 3631 return (B_FALSE); 3632 3633 if (s + nparity >= dcols) 3634 return (B_TRUE); 3635 3636 for (uint64_t c = 0; c < s + nparity; c++) { 3637 uint64_t devidx = (f + c) % dcols; 3638 vdev_t *cvd = vd->vdev_child[devidx]; 3639 3640 /* 3641 * dsl_scan_need_resilver() already checked vd with 3642 * vdev_dtl_contains(). So here just check cvd with 3643 * vdev_dtl_empty(), cheaper and a good approximation. 3644 */ 3645 if (!vdev_dtl_empty(cvd, DTL_PARTIAL)) 3646 return (B_TRUE); 3647 } 3648 3649 return (B_FALSE); 3650 } 3651 3652 static void 3653 vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs, 3654 range_seg64_t *physical_rs, range_seg64_t *remain_rs) 3655 { 3656 (void) remain_rs; 3657 3658 vdev_t *raidvd = cvd->vdev_parent; 3659 ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); 3660 3661 vdev_raidz_t *vdrz = raidvd->vdev_tsd; 3662 3663 if (vdrz->vn_vre.vre_state == DSS_SCANNING) { 3664 /* 3665 * We're in the middle of expansion, in which case the 3666 * translation is in flux. Any answer we give may be wrong 3667 * by the time we return, so it isn't safe for the caller to 3668 * act on it. Therefore we say that this range isn't present 3669 * on any children. The only consumers of this are "zpool 3670 * initialize" and trimming, both of which are "best effort" 3671 * anyway. 
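		 *
		 * Once the expansion completes, the translation below is
		 * simple per-child arithmetic.  As a worked example
		 * (illustrative numbers): with width = 5, tgt_col = 2 and a
		 * logical range spanning sectors [0, 10), start_row = 0 and
		 * end_row = ((10 - 2 - 1) / 5) + 1 = 2, i.e. this child holds
		 * two of those ten sectors (logical sectors 2 and 7).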
3672 */ 3673 physical_rs->rs_start = physical_rs->rs_end = 0; 3674 remain_rs->rs_start = remain_rs->rs_end = 0; 3675 return; 3676 } 3677 3678 uint64_t width = vdrz->vd_physical_width; 3679 uint64_t tgt_col = cvd->vdev_id; 3680 uint64_t ashift = raidvd->vdev_top->vdev_ashift; 3681 3682 /* make sure the offsets are block-aligned */ 3683 ASSERT0(logical_rs->rs_start % (1 << ashift)); 3684 ASSERT0(logical_rs->rs_end % (1 << ashift)); 3685 uint64_t b_start = logical_rs->rs_start >> ashift; 3686 uint64_t b_end = logical_rs->rs_end >> ashift; 3687 3688 uint64_t start_row = 0; 3689 if (b_start > tgt_col) /* avoid underflow */ 3690 start_row = ((b_start - tgt_col - 1) / width) + 1; 3691 3692 uint64_t end_row = 0; 3693 if (b_end > tgt_col) 3694 end_row = ((b_end - tgt_col - 1) / width) + 1; 3695 3696 physical_rs->rs_start = start_row << ashift; 3697 physical_rs->rs_end = end_row << ashift; 3698 3699 ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start); 3700 ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=, 3701 logical_rs->rs_end - logical_rs->rs_start); 3702 } 3703 3704 static void 3705 raidz_reflow_sync(void *arg, dmu_tx_t *tx) 3706 { 3707 spa_t *spa = arg; 3708 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 3709 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 3710 3711 /* 3712 * Ensure there are no i/os to the range that is being committed. 3713 */ 3714 uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock); 3715 ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset); 3716 3717 mutex_enter(&vre->vre_lock); 3718 uint64_t new_offset = 3719 MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset); 3720 /* 3721 * We should not have committed anything that failed. 3722 */ 3723 VERIFY3U(vre->vre_failed_offset, >=, old_offset); 3724 mutex_exit(&vre->vre_lock); 3725 3726 zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, 3727 old_offset, new_offset - old_offset, 3728 RL_WRITER); 3729 3730 /* 3731 * Update the uberblock that will be written when this txg completes. 3732 */ 3733 RAIDZ_REFLOW_SET(&spa->spa_uberblock, 3734 RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset); 3735 vre->vre_offset_pertxg[txgoff] = 0; 3736 zfs_rangelock_exit(lr); 3737 3738 mutex_enter(&vre->vre_lock); 3739 vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff]; 3740 vre->vre_bytes_copied_pertxg[txgoff] = 0; 3741 mutex_exit(&vre->vre_lock); 3742 3743 vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); 3744 VERIFY0(zap_update(spa->spa_meta_objset, 3745 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, 3746 sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx)); 3747 } 3748 3749 static void 3750 raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx) 3751 { 3752 spa_t *spa = arg; 3753 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 3754 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 3755 vdev_raidz_t *vdrz = raidvd->vdev_tsd; 3756 3757 for (int i = 0; i < TXG_SIZE; i++) 3758 VERIFY0(vre->vre_offset_pertxg[i]); 3759 3760 reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); 3761 re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES; 3762 re->re_logical_width = vdrz->vd_physical_width; 3763 mutex_enter(&vdrz->vd_expand_lock); 3764 avl_add(&vdrz->vd_expand_txgs, re); 3765 mutex_exit(&vdrz->vd_expand_lock); 3766 3767 vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); 3768 3769 /* 3770 * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS 3771 * will get written (based on vd_expand_txgs). 
	 */
	vdev_config_dirty(vd);

	/*
	 * Before we change vre_state, the on-disk state must reflect that we
	 * have completed all copying, so that vdev_raidz_io_start() can use
	 * vre_state to determine if the reflow is in progress.  See also the
	 * end of spa_raidz_expand_thread().
	 */
	VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==,
	    raidvd->vdev_ms_count << raidvd->vdev_ms_shift);

	vre->vre_end_time = gethrestime_sec();
	vre->vre_state = DSS_FINISHED;

	uint64_t state = vre->vre_state;
	VERIFY0(zap_update(spa->spa_meta_objset,
	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
	    sizeof (state), 1, &state, tx));

	uint64_t end_time = vre->vre_end_time;
	VERIFY0(zap_update(spa->spa_meta_objset,
	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
	    sizeof (end_time), 1, &end_time, tx));

	spa->spa_uberblock.ub_raidz_reflow_info = 0;

	spa_history_log_internal(spa, "raidz vdev expansion completed", tx,
	    "%s vdev %llu new width %llu", spa_name(spa),
	    (unsigned long long)vd->vdev_id,
	    (unsigned long long)vd->vdev_children);

	spa->spa_raidz_expand = NULL;
	raidvd->vdev_rz_expanding = B_FALSE;

	spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
	spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
	spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);

	spa_notify_waiters(spa);

	/*
	 * While we're in syncing context, take the opportunity to set up a
	 * scrub.  All the data has been successfully copied but we have not
	 * validated any checksums.
	 */
	setup_sync_arg_t setup_sync_arg = {
		.func = POOL_SCAN_SCRUB,
		.txgstart = 0,
		.txgend = 0,
	};
	if (zfs_scrub_after_expand &&
	    dsl_scan_setup_check(&setup_sync_arg.func, tx) == 0) {
		dsl_scan_setup_sync(&setup_sync_arg, tx);
	}
}

/*
 * State of one copy batch.
 */
typedef struct raidz_reflow_arg {
	vdev_raidz_expand_t *rra_vre;	/* Global expansion state. */
	zfs_locked_range_t *rra_lr;	/* Range lock of this batch. */
	uint64_t rra_txg;		/* TXG of this batch. */
	uint_t rra_ashift;		/* Ashift of the vdev. */
	uint32_t rra_tbd;		/* Number of in-flight ZIOs. */
	uint32_t rra_writes;		/* Number of write ZIOs. */
	zio_t *rra_zio[];		/* Write ZIO pointers. */
} raidz_reflow_arg_t;

/*
 * Write of the new location on one child is done.  Once all of them are done
 * we can unlock and free everything.
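 *
 * rra_tbd does double duty: raidz_reflow_impl() initializes it to the number
 * of reads, raidz_reflow_read_done() resets it to rra_writes once the last
 * read completes, and it then counts down the writes here.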
 */
static void
raidz_reflow_write_done(zio_t *zio)
{
	raidz_reflow_arg_t *rra = zio->io_private;
	vdev_raidz_expand_t *vre = rra->rra_vre;

	abd_free(zio->io_abd);

	mutex_enter(&vre->vre_lock);
	if (zio->io_error != 0) {
		/* Force a reflow pause on errors */
		vre->vre_failed_offset =
		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
	}
	ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size);
	vre->vre_outstanding_bytes -= zio->io_size;
	if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length <
	    vre->vre_failed_offset) {
		vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] +=
		    zio->io_size;
	}
	cv_signal(&vre->vre_cv);
	boolean_t done = (--rra->rra_tbd == 0);
	mutex_exit(&vre->vre_lock);

	if (!done)
		return;
	spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
	zfs_rangelock_exit(rra->rra_lr);
	kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * rra->rra_writes);
}

/*
 * Read of the old location on one child is done.  Once all of them are done,
 * the writes have all of their data and we can issue them.
 */
static void
raidz_reflow_read_done(zio_t *zio)
{
	raidz_reflow_arg_t *rra = zio->io_private;
	vdev_raidz_expand_t *vre = rra->rra_vre;

	/*
	 * Single-block reads borrowed the corresponding write ABD, so there
	 * is nothing to free.  Larger reads used gang ABDs, which we free
	 * here.
	 */
	if (zio->io_size > (1 << rra->rra_ashift))
		abd_free(zio->io_abd);

	/*
	 * If the read failed, or if it was done on a vdev that is not fully
	 * healthy (e.g. a child that has a resilver in progress), we may not
	 * have the correct data.  Note that it's OK if the write proceeds.
	 * It may write garbage but the location is otherwise unused and we
	 * will retry later due to vre_failed_offset.
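	 *
	 * Recording vre_failed_offset below is what eventually causes
	 * spa_raidz_expand_thread() to pause, reset vre_offset back to the
	 * failed offset, and retry that range once the vdev is healthy again.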
	 */
	if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) {
		zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu "
		    "err=%u partial_dtl_empty=%u missing_dtl_empty=%u",
		    (long long)rra->rra_lr->lr_offset,
		    (long long)rra->rra_lr->lr_length,
		    (long long)rra->rra_txg,
		    zio->io_error,
		    vdev_dtl_empty(zio->io_vd, DTL_PARTIAL),
		    vdev_dtl_empty(zio->io_vd, DTL_MISSING));
		mutex_enter(&vre->vre_lock);
		/* Force a reflow pause on errors */
		vre->vre_failed_offset =
		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
		mutex_exit(&vre->vre_lock);
	}

	if (atomic_dec_32_nv(&rra->rra_tbd) > 0)
		return;
	rra->rra_tbd = rra->rra_writes;
	for (uint64_t i = 0; i < rra->rra_writes; i++)
		zio_nowait(rra->rra_zio[i]);
}

static void
raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset,
    dmu_tx_t *tx)
{
	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;

	if (offset == 0)
		return;

	mutex_enter(&vre->vre_lock);
	ASSERT3U(vre->vre_offset, <=, offset);
	vre->vre_offset = offset;
	mutex_exit(&vre->vre_lock);

	if (vre->vre_offset_pertxg[txgoff] == 0) {
		dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync,
		    spa, tx);
	}
	vre->vre_offset_pertxg[txgoff] = offset;
}

static boolean_t
vdev_raidz_expand_child_replacing(vdev_t *raidz_vd)
{
	for (int i = 0; i < raidz_vd->vdev_children; i++) {
		/* Quick check if a child is being replaced */
		if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf)
			return (B_TRUE);
	}
	return (B_FALSE);
}

static boolean_t
raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt,
    dmu_tx_t *tx)
{
	spa_t *spa = vd->vdev_spa;
	uint_t ashift = vd->vdev_top->vdev_ashift;

	range_seg_t *rs = range_tree_first(rt);
	if (rs == NULL)
		return (B_FALSE);
	uint64_t offset = rs_get_start(rs, rt);
	ASSERT(IS_P2ALIGNED(offset, 1 << ashift));
	uint64_t size = rs_get_end(rs, rt) - offset;
	ASSERT3U(size, >=, 1 << ashift);
	ASSERT(IS_P2ALIGNED(size, 1 << ashift));

	uint64_t blkid = offset >> ashift;
	uint_t old_children = vd->vdev_children - 1;

	/*
	 * We can only progress to the point that writes will not overlap
	 * with blocks whose progress has not yet been recorded on disk.
	 * Since partially-copied rows are still read from the old location,
	 * we need to stop one row before the sector-wise overlap, to prevent
	 * row-wise overlap.
	 *
	 * Note that even if we are skipping over a large unallocated region,
	 * we can't move the on-disk progress to `offset`, because concurrent
	 * writes/allocations could still use the currently-unallocated
	 * region.
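	 *
	 * As a worked example (illustrative numbers): expanding from 4 to 5
	 * children (old_children = 4) with 1000 sectors of progress already
	 * synced, next_overwrite_blkid = 1000 + 1000 / 4 - 4 = 1246, so this
	 * pass only copies sectors below block 1246.  Block 1245 lands in
	 * new-layout row 1245 / 5 = 249, which stays clear of old-layout row
	 * 1000 / 4 = 250, the first row whose copy has not yet been recorded
	 * on disk.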
3985 */ 3986 uint64_t ubsync_blkid = 3987 RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift; 3988 uint64_t next_overwrite_blkid = ubsync_blkid + 3989 ubsync_blkid / old_children - old_children; 3990 VERIFY3U(next_overwrite_blkid, >, ubsync_blkid); 3991 if (blkid >= next_overwrite_blkid) { 3992 raidz_reflow_record_progress(vre, 3993 next_overwrite_blkid << ashift, tx); 3994 return (B_TRUE); 3995 } 3996 3997 size = MIN(size, raidz_expand_max_copy_bytes); 3998 size = MIN(size, (uint64_t)old_children * 3999 MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE)); 4000 size = MAX(size, 1 << ashift); 4001 uint_t blocks = MIN(size >> ashift, next_overwrite_blkid - blkid); 4002 size = (uint64_t)blocks << ashift; 4003 4004 range_tree_remove(rt, offset, size); 4005 4006 uint_t reads = MIN(blocks, old_children); 4007 uint_t writes = MIN(blocks, vd->vdev_children); 4008 raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra) + 4009 sizeof (zio_t *) * writes, KM_SLEEP); 4010 rra->rra_vre = vre; 4011 rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock, 4012 offset, size, RL_WRITER); 4013 rra->rra_txg = dmu_tx_get_txg(tx); 4014 rra->rra_ashift = ashift; 4015 rra->rra_tbd = reads; 4016 rra->rra_writes = writes; 4017 4018 raidz_reflow_record_progress(vre, offset + size, tx); 4019 4020 /* 4021 * SCL_STATE will be released when the read and write are done, 4022 * by raidz_reflow_write_done(). 4023 */ 4024 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4025 4026 /* check if a replacing vdev was added, if so treat it as an error */ 4027 if (vdev_raidz_expand_child_replacing(vd)) { 4028 zfs_dbgmsg("replacing vdev encountered, reflow paused at " 4029 "offset=%llu txg=%llu", 4030 (long long)rra->rra_lr->lr_offset, 4031 (long long)rra->rra_txg); 4032 4033 mutex_enter(&vre->vre_lock); 4034 vre->vre_failed_offset = 4035 MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); 4036 cv_signal(&vre->vre_cv); 4037 mutex_exit(&vre->vre_lock); 4038 4039 /* drop everything we acquired */ 4040 spa_config_exit(spa, SCL_STATE, spa); 4041 zfs_rangelock_exit(rra->rra_lr); 4042 kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * writes); 4043 return (B_TRUE); 4044 } 4045 4046 mutex_enter(&vre->vre_lock); 4047 vre->vre_outstanding_bytes += size; 4048 mutex_exit(&vre->vre_lock); 4049 4050 /* Allocate ABD and ZIO for each child we write. */ 4051 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 4052 zio_t *pio = spa->spa_txg_zio[txgoff]; 4053 uint_t b = blocks / vd->vdev_children; 4054 uint_t bb = blocks % vd->vdev_children; 4055 for (uint_t i = 0; i < writes; i++) { 4056 uint_t n = b + (i < bb); 4057 abd_t *abd = abd_alloc_for_io(n << ashift, B_FALSE); 4058 rra->rra_zio[i] = zio_vdev_child_io(pio, NULL, 4059 vd->vdev_child[(blkid + i) % vd->vdev_children], 4060 ((blkid + i) / vd->vdev_children) << ashift, 4061 abd, n << ashift, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, 4062 ZIO_FLAG_CANFAIL, raidz_reflow_write_done, rra); 4063 } 4064 4065 /* 4066 * Allocate and issue ZIO for each child we read. For reads of only 4067 * one block we can use respective writer ABDs, since they will also 4068 * have only one block. For bigger reads create gang ABDs and fill 4069 * them with respective blocks from writer ABDs. 
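	 *
	 * For example (illustrative numbers): with blocks = 10,
	 * old_children = 4 and vdev_children = 5, read 0 covers batch blocks
	 * 0, 4 and 8; its gang ABD takes block 0 from writer 0 at offset 0,
	 * block 4 from writer 4 at offset 0, and block 8 from writer 3 at
	 * offset 1, matching the (b % vdev_children, b / vdev_children)
	 * indexing below.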
4070 */ 4071 b = blocks / old_children; 4072 bb = blocks % old_children; 4073 for (uint_t i = 0; i < reads; i++) { 4074 uint_t n = b + (i < bb); 4075 abd_t *abd; 4076 if (n > 1) { 4077 abd = abd_alloc_gang(); 4078 for (uint_t j = 0; j < n; j++) { 4079 uint_t b = j * old_children + i; 4080 abd_t *cabd = abd_get_offset_size( 4081 rra->rra_zio[b % vd->vdev_children]->io_abd, 4082 (b / vd->vdev_children) << ashift, 4083 1 << ashift); 4084 abd_gang_add(abd, cabd, B_TRUE); 4085 } 4086 } else { 4087 abd = rra->rra_zio[i]->io_abd; 4088 } 4089 zio_nowait(zio_vdev_child_io(pio, NULL, 4090 vd->vdev_child[(blkid + i) % old_children], 4091 ((blkid + i) / old_children) << ashift, abd, 4092 n << ashift, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 4093 ZIO_FLAG_CANFAIL, raidz_reflow_read_done, rra)); 4094 } 4095 4096 return (B_FALSE); 4097 } 4098 4099 /* 4100 * For testing (ztest specific) 4101 */ 4102 static void 4103 raidz_expand_pause(uint_t pause_point) 4104 { 4105 while (raidz_expand_pause_point != 0 && 4106 raidz_expand_pause_point <= pause_point) 4107 delay(hz); 4108 } 4109 4110 static void 4111 raidz_scratch_child_done(zio_t *zio) 4112 { 4113 zio_t *pio = zio->io_private; 4114 4115 mutex_enter(&pio->io_lock); 4116 pio->io_error = zio_worst_error(pio->io_error, zio->io_error); 4117 mutex_exit(&pio->io_lock); 4118 } 4119 4120 /* 4121 * Reflow the beginning portion of the vdev into an intermediate scratch area 4122 * in memory and on disk. This operation must be persisted on disk before we 4123 * proceed to overwrite the beginning portion with the reflowed data. 4124 * 4125 * This multi-step task can fail to complete if disk errors are encountered 4126 * and we can return here after a pause (waiting for disk to become healthy). 4127 */ 4128 static void 4129 raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) 4130 { 4131 vdev_raidz_expand_t *vre = arg; 4132 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 4133 zio_t *pio; 4134 int error; 4135 4136 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4137 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4138 int ashift = raidvd->vdev_ashift; 4139 uint64_t write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << ashift, 4140 uint64_t); 4141 uint64_t logical_size = write_size * raidvd->vdev_children; 4142 uint64_t read_size = 4143 P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)), 4144 1 << ashift); 4145 4146 /* 4147 * The scratch space must be large enough to get us to the point 4148 * that one row does not overlap itself when moved. This is checked 4149 * by vdev_raidz_attach_check(). 4150 */ 4151 VERIFY3U(write_size, >=, raidvd->vdev_children << ashift); 4152 VERIFY3U(write_size, <=, VDEV_BOOT_SIZE); 4153 VERIFY3U(write_size, <=, read_size); 4154 4155 zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, 4156 0, logical_size, RL_WRITER); 4157 4158 abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), 4159 KM_SLEEP); 4160 for (int i = 0; i < raidvd->vdev_children; i++) { 4161 abds[i] = abd_alloc_linear(read_size, B_FALSE); 4162 } 4163 4164 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1); 4165 4166 /* 4167 * If we have already written the scratch area then we must read from 4168 * there, since new writes were redirected there while we were paused 4169 * or the original location may have been partially overwritten with 4170 * reflowed data. 4171 */ 4172 if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) { 4173 VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size); 4174 /* 4175 * Read from scratch space. 
4176 */ 4177 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4178 for (int i = 0; i < raidvd->vdev_children; i++) { 4179 /* 4180 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE 4181 * to the offset to calculate the physical offset to 4182 * write to. Passing in a negative offset makes us 4183 * access the scratch area. 4184 */ 4185 zio_nowait(zio_vdev_child_io(pio, NULL, 4186 raidvd->vdev_child[i], 4187 VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], 4188 write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 4189 ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); 4190 } 4191 error = zio_wait(pio); 4192 if (error != 0) { 4193 zfs_dbgmsg("reflow: error %d reading scratch location", 4194 error); 4195 goto io_error_exit; 4196 } 4197 goto overwrite; 4198 } 4199 4200 /* 4201 * Read from original location. 4202 */ 4203 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4204 for (int i = 0; i < raidvd->vdev_children - 1; i++) { 4205 ASSERT0(vdev_is_dead(raidvd->vdev_child[i])); 4206 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4207 0, abds[i], read_size, ZIO_TYPE_READ, 4208 ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, 4209 raidz_scratch_child_done, pio)); 4210 } 4211 error = zio_wait(pio); 4212 if (error != 0) { 4213 zfs_dbgmsg("reflow: error %d reading original location", error); 4214 io_error_exit: 4215 for (int i = 0; i < raidvd->vdev_children; i++) 4216 abd_free(abds[i]); 4217 kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); 4218 zfs_rangelock_exit(lr); 4219 spa_config_exit(spa, SCL_STATE, FTAG); 4220 return; 4221 } 4222 4223 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2); 4224 4225 /* 4226 * Reflow in memory. 4227 */ 4228 uint64_t logical_sectors = logical_size >> ashift; 4229 for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) { 4230 int oldchild = i % (raidvd->vdev_children - 1); 4231 uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift; 4232 4233 int newchild = i % raidvd->vdev_children; 4234 uint64_t newoff = (i / raidvd->vdev_children) << ashift; 4235 4236 /* a single sector should not be copying over itself */ 4237 ASSERT(!(newchild == oldchild && newoff == oldoff)); 4238 4239 abd_copy_off(abds[newchild], abds[oldchild], 4240 newoff, oldoff, 1 << ashift); 4241 } 4242 4243 /* 4244 * Verify that we filled in everything we intended to (write_size on 4245 * each child). 4246 */ 4247 VERIFY0(logical_sectors % raidvd->vdev_children); 4248 VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==, 4249 write_size); 4250 4251 /* 4252 * Write to scratch location (boot area). 4253 */ 4254 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4255 for (int i = 0; i < raidvd->vdev_children; i++) { 4256 /* 4257 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to 4258 * the offset to calculate the physical offset to write to. 4259 * Passing in a negative offset lets us access the boot area. 
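		 * (VDEV_BOOT_OFFSET is smaller than VDEV_LABEL_START_SIZE, so
		 * once the label size is added back the I/O lands at
		 * VDEV_BOOT_OFFSET, in the boot area just past the two front
		 * labels.)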
4260 */ 4261 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4262 VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], 4263 write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, 4264 ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); 4265 } 4266 error = zio_wait(pio); 4267 if (error != 0) { 4268 zfs_dbgmsg("reflow: error %d writing scratch location", error); 4269 goto io_error_exit; 4270 } 4271 pio = zio_root(spa, NULL, NULL, 0); 4272 zio_flush(pio, raidvd); 4273 zio_wait(pio); 4274 4275 zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area", 4276 (long long)logical_size); 4277 4278 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3); 4279 4280 /* 4281 * Update uberblock to indicate that scratch space is valid. This is 4282 * needed because after this point, the real location may be 4283 * overwritten. If we crash, we need to get the data from the 4284 * scratch space, rather than the real location. 4285 * 4286 * Note: ub_timestamp is bumped so that vdev_uberblock_compare() 4287 * will prefer this uberblock. 4288 */ 4289 RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size); 4290 spa->spa_ubsync.ub_timestamp++; 4291 ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, 4292 &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); 4293 if (spa_multihost(spa)) 4294 mmp_update_uberblock(spa, &spa->spa_ubsync); 4295 4296 zfs_dbgmsg("reflow: uberblock updated " 4297 "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)", 4298 (long long)spa->spa_ubsync.ub_txg, 4299 (long long)logical_size, 4300 (long long)spa->spa_ubsync.ub_timestamp); 4301 4302 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID); 4303 4304 /* 4305 * Overwrite with reflow'ed data. 4306 */ 4307 overwrite: 4308 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4309 for (int i = 0; i < raidvd->vdev_children; i++) { 4310 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4311 0, abds[i], write_size, ZIO_TYPE_WRITE, 4312 ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, 4313 raidz_scratch_child_done, pio)); 4314 } 4315 error = zio_wait(pio); 4316 if (error != 0) { 4317 /* 4318 * When we exit early here and drop the range lock, new 4319 * writes will go into the scratch area so we'll need to 4320 * read from there when we return after pausing. 4321 */ 4322 zfs_dbgmsg("reflow: error %d writing real location", error); 4323 /* 4324 * Update the uberblock that is written when this txg completes. 4325 */ 4326 RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID, 4327 logical_size); 4328 goto io_error_exit; 4329 } 4330 pio = zio_root(spa, NULL, NULL, 0); 4331 zio_flush(pio, raidvd); 4332 zio_wait(pio); 4333 4334 zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location", 4335 (long long)logical_size); 4336 for (int i = 0; i < raidvd->vdev_children; i++) 4337 abd_free(abds[i]); 4338 kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); 4339 4340 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED); 4341 4342 /* 4343 * Update uberblock to indicate that the initial part has been 4344 * reflow'ed. This is needed because after this point (when we exit 4345 * the rangelock), we allow regular writes to this region, which will 4346 * be written to the new location only (because reflow_offset_next == 4347 * reflow_offset_synced). If we crashed and re-copied from the 4348 * scratch space, we would lose the regular writes. 
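	 *
	 * Summarizing the scratch-state progression seen in this file:
	 * RRSS_SCRATCH_VALID (scratch copy written; the real location may be
	 * partially overwritten, so recovery must read the scratch) ->
	 * RRSS_SCRATCH_INVALID_SYNCED (real location overwritten, set just
	 * below) -> RRSS_SCRATCH_INVALID_SYNCED_REFLOW (normal per-txg
	 * progress updates from raidz_reflow_sync()), with
	 * RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT used when the copy-back is
	 * redone during import by vdev_raidz_reflow_copy_scratch().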
4349 */ 4350 RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED, 4351 logical_size); 4352 spa->spa_ubsync.ub_timestamp++; 4353 ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, 4354 &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); 4355 if (spa_multihost(spa)) 4356 mmp_update_uberblock(spa, &spa->spa_ubsync); 4357 4358 zfs_dbgmsg("reflow: uberblock updated " 4359 "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", 4360 (long long)spa->spa_ubsync.ub_txg, 4361 (long long)logical_size, 4362 (long long)spa->spa_ubsync.ub_timestamp); 4363 4364 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1); 4365 4366 /* 4367 * Update progress. 4368 */ 4369 vre->vre_offset = logical_size; 4370 zfs_rangelock_exit(lr); 4371 spa_config_exit(spa, SCL_STATE, FTAG); 4372 4373 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 4374 vre->vre_offset_pertxg[txgoff] = vre->vre_offset; 4375 vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; 4376 /* 4377 * Note - raidz_reflow_sync() will update the uberblock state to 4378 * RRSS_SCRATCH_INVALID_SYNCED_REFLOW 4379 */ 4380 raidz_reflow_sync(spa, tx); 4381 4382 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2); 4383 } 4384 4385 /* 4386 * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work 4387 * here. No other i/o can be in progress, so we don't need the vre_rangelock. 4388 */ 4389 void 4390 vdev_raidz_reflow_copy_scratch(spa_t *spa) 4391 { 4392 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4393 uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock); 4394 ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID); 4395 4396 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4397 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4398 ASSERT0(logical_size % raidvd->vdev_children); 4399 uint64_t write_size = logical_size / raidvd->vdev_children; 4400 4401 zio_t *pio; 4402 4403 /* 4404 * Read from scratch space. 4405 */ 4406 abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), 4407 KM_SLEEP); 4408 for (int i = 0; i < raidvd->vdev_children; i++) { 4409 abds[i] = abd_alloc_linear(write_size, B_FALSE); 4410 } 4411 4412 pio = zio_root(spa, NULL, NULL, 0); 4413 for (int i = 0; i < raidvd->vdev_children; i++) { 4414 /* 4415 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to 4416 * the offset to calculate the physical offset to write to. 4417 * Passing in a negative offset lets us access the boot area. 4418 */ 4419 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4420 VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], 4421 write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 0, 4422 raidz_scratch_child_done, pio)); 4423 } 4424 zio_wait(pio); 4425 4426 /* 4427 * Overwrite real location with reflow'ed data. 4428 */ 4429 pio = zio_root(spa, NULL, NULL, 0); 4430 for (int i = 0; i < raidvd->vdev_children; i++) { 4431 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4432 0, abds[i], write_size, ZIO_TYPE_WRITE, 4433 ZIO_PRIORITY_REMOVAL, 0, 4434 raidz_scratch_child_done, pio)); 4435 } 4436 zio_wait(pio); 4437 pio = zio_root(spa, NULL, NULL, 0); 4438 zio_flush(pio, raidvd); 4439 zio_wait(pio); 4440 4441 zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) " 4442 "to real location", (long long)logical_size); 4443 4444 for (int i = 0; i < raidvd->vdev_children; i++) 4445 abd_free(abds[i]); 4446 kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); 4447 4448 /* 4449 * Update uberblock. 
4450 */ 4451 RAIDZ_REFLOW_SET(&spa->spa_ubsync, 4452 RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size); 4453 spa->spa_ubsync.ub_timestamp++; 4454 VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, 4455 &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); 4456 if (spa_multihost(spa)) 4457 mmp_update_uberblock(spa, &spa->spa_ubsync); 4458 4459 zfs_dbgmsg("reflow recovery: uberblock updated " 4460 "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", 4461 (long long)spa->spa_ubsync.ub_txg, 4462 (long long)logical_size, 4463 (long long)spa->spa_ubsync.ub_timestamp); 4464 4465 dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, 4466 spa_first_txg(spa)); 4467 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 4468 vre->vre_offset = logical_size; 4469 vre->vre_offset_pertxg[txgoff] = vre->vre_offset; 4470 vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; 4471 /* 4472 * Note that raidz_reflow_sync() will update the uberblock once more 4473 */ 4474 raidz_reflow_sync(spa, tx); 4475 4476 dmu_tx_commit(tx); 4477 4478 spa_config_exit(spa, SCL_STATE, FTAG); 4479 } 4480 4481 static boolean_t 4482 spa_raidz_expand_thread_check(void *arg, zthr_t *zthr) 4483 { 4484 (void) zthr; 4485 spa_t *spa = arg; 4486 4487 return (spa->spa_raidz_expand != NULL && 4488 !spa->spa_raidz_expand->vre_waiting_for_resilver); 4489 } 4490 4491 /* 4492 * RAIDZ expansion background thread 4493 * 4494 * Can be called multiple times if the reflow is paused 4495 */ 4496 static void 4497 spa_raidz_expand_thread(void *arg, zthr_t *zthr) 4498 { 4499 spa_t *spa = arg; 4500 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4501 4502 if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) 4503 vre->vre_offset = 0; 4504 else 4505 vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync); 4506 4507 /* Reflow the begining portion using the scratch area */ 4508 if (vre->vre_offset == 0) { 4509 VERIFY0(dsl_sync_task(spa_name(spa), 4510 NULL, raidz_reflow_scratch_sync, 4511 vre, 0, ZFS_SPACE_CHECK_NONE)); 4512 4513 /* if we encountered errors then pause */ 4514 if (vre->vre_offset == 0) { 4515 mutex_enter(&vre->vre_lock); 4516 vre->vre_waiting_for_resilver = B_TRUE; 4517 mutex_exit(&vre->vre_lock); 4518 return; 4519 } 4520 } 4521 4522 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4523 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4524 4525 uint64_t guid = raidvd->vdev_guid; 4526 4527 /* Iterate over all the remaining metaslabs */ 4528 for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift; 4529 i < raidvd->vdev_ms_count && 4530 !zthr_iscancelled(zthr) && 4531 vre->vre_failed_offset == UINT64_MAX; i++) { 4532 metaslab_t *msp = raidvd->vdev_ms[i]; 4533 4534 metaslab_disable(msp); 4535 mutex_enter(&msp->ms_lock); 4536 4537 /* 4538 * The metaslab may be newly created (for the expanded 4539 * space), in which case its trees won't exist yet, 4540 * so we need to bail out early. 4541 */ 4542 if (msp->ms_new) { 4543 mutex_exit(&msp->ms_lock); 4544 metaslab_enable(msp, B_FALSE, B_FALSE); 4545 continue; 4546 } 4547 4548 VERIFY0(metaslab_load(msp)); 4549 4550 /* 4551 * We want to copy everything except the free (allocatable) 4552 * space. Note that there may be a little bit more free 4553 * space (e.g. in ms_defer), and it's fine to copy that too. 
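		 * Copying a little extra free space is harmless: nothing
		 * references those sectors, so reflowing them can't corrupt
		 * anything; it only costs some extra I/O.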
4554 */ 4555 uint64_t shift, start; 4556 range_seg_type_t type = metaslab_calculate_range_tree_type( 4557 raidvd, msp, &start, &shift); 4558 range_tree_t *rt = range_tree_create(NULL, type, NULL, 4559 start, shift); 4560 range_tree_add(rt, msp->ms_start, msp->ms_size); 4561 range_tree_walk(msp->ms_allocatable, range_tree_remove, rt); 4562 mutex_exit(&msp->ms_lock); 4563 4564 /* 4565 * Force the last sector of each metaslab to be copied. This 4566 * ensures that we advance the on-disk progress to the end of 4567 * this metaslab while the metaslab is disabled. Otherwise, we 4568 * could move past this metaslab without advancing the on-disk 4569 * progress, and then an allocation to this metaslab would not 4570 * be copied. 4571 */ 4572 int sectorsz = 1 << raidvd->vdev_ashift; 4573 uint64_t ms_last_offset = msp->ms_start + 4574 msp->ms_size - sectorsz; 4575 if (!range_tree_contains(rt, ms_last_offset, sectorsz)) { 4576 range_tree_add(rt, ms_last_offset, sectorsz); 4577 } 4578 4579 /* 4580 * When we are resuming from a paused expansion (i.e. 4581 * when importing a pool with a expansion in progress), 4582 * discard any state that we have already processed. 4583 */ 4584 if (vre->vre_offset > msp->ms_start) { 4585 range_tree_clear(rt, msp->ms_start, 4586 vre->vre_offset - msp->ms_start); 4587 } 4588 4589 while (!zthr_iscancelled(zthr) && 4590 !range_tree_is_empty(rt) && 4591 vre->vre_failed_offset == UINT64_MAX) { 4592 4593 /* 4594 * We need to periodically drop the config lock so that 4595 * writers can get in. Additionally, we can't wait 4596 * for a txg to sync while holding a config lock 4597 * (since a waiting writer could cause a 3-way deadlock 4598 * with the sync thread, which also gets a config 4599 * lock for reader). So we can't hold the config lock 4600 * while calling dmu_tx_assign(). 4601 */ 4602 spa_config_exit(spa, SCL_CONFIG, FTAG); 4603 4604 /* 4605 * If requested, pause the reflow when the amount 4606 * specified by raidz_expand_max_reflow_bytes is reached 4607 * 4608 * This pause is only used during testing or debugging. 4609 */ 4610 while (raidz_expand_max_reflow_bytes != 0 && 4611 raidz_expand_max_reflow_bytes <= 4612 vre->vre_bytes_copied && !zthr_iscancelled(zthr)) { 4613 delay(hz); 4614 } 4615 4616 mutex_enter(&vre->vre_lock); 4617 while (vre->vre_outstanding_bytes > 4618 raidz_expand_max_copy_bytes) { 4619 cv_wait(&vre->vre_cv, &vre->vre_lock); 4620 } 4621 mutex_exit(&vre->vre_lock); 4622 4623 dmu_tx_t *tx = 4624 dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 4625 4626 VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); 4627 uint64_t txg = dmu_tx_get_txg(tx); 4628 4629 /* 4630 * Reacquire the vdev_config lock. Theoretically, the 4631 * vdev_t that we're expanding may have changed. 
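			 * That is why raidvd is looked up again by
			 * vre_vdev_id below instead of relying on the pointer
			 * cached before the lock was dropped.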
4632 */ 4633 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4634 raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4635 4636 boolean_t needsync = 4637 raidz_reflow_impl(raidvd, vre, rt, tx); 4638 4639 dmu_tx_commit(tx); 4640 4641 if (needsync) { 4642 spa_config_exit(spa, SCL_CONFIG, FTAG); 4643 txg_wait_synced(spa->spa_dsl_pool, txg); 4644 spa_config_enter(spa, SCL_CONFIG, FTAG, 4645 RW_READER); 4646 } 4647 } 4648 4649 spa_config_exit(spa, SCL_CONFIG, FTAG); 4650 4651 metaslab_enable(msp, B_FALSE, B_FALSE); 4652 range_tree_vacate(rt, NULL, NULL); 4653 range_tree_destroy(rt); 4654 4655 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4656 raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4657 } 4658 4659 spa_config_exit(spa, SCL_CONFIG, FTAG); 4660 4661 /* 4662 * The txg_wait_synced() here ensures that all reflow zio's have 4663 * completed, and vre_failed_offset has been set if necessary. It 4664 * also ensures that the progress of the last raidz_reflow_sync() is 4665 * written to disk before raidz_reflow_complete_sync() changes the 4666 * in-memory vre_state. vdev_raidz_io_start() uses vre_state to 4667 * determine if a reflow is in progress, in which case we may need to 4668 * write to both old and new locations. Therefore we can only change 4669 * vre_state once this is not necessary, which is once the on-disk 4670 * progress (in spa_ubsync) has been set past any possible writes (to 4671 * the end of the last metaslab). 4672 */ 4673 txg_wait_synced(spa->spa_dsl_pool, 0); 4674 4675 if (!zthr_iscancelled(zthr) && 4676 vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) { 4677 /* 4678 * We are not being canceled or paused, so the reflow must be 4679 * complete. In that case also mark it as completed on disk. 4680 */ 4681 ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX); 4682 VERIFY0(dsl_sync_task(spa_name(spa), NULL, 4683 raidz_reflow_complete_sync, spa, 4684 0, ZFS_SPACE_CHECK_NONE)); 4685 (void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL); 4686 } else { 4687 /* 4688 * Wait for all copy zio's to complete and for all the 4689 * raidz_reflow_sync() synctasks to be run. 4690 */ 4691 spa_history_log_internal(spa, "reflow pause", 4692 NULL, "offset=%llu failed_offset=%lld", 4693 (long long)vre->vre_offset, 4694 (long long)vre->vre_failed_offset); 4695 mutex_enter(&vre->vre_lock); 4696 if (vre->vre_failed_offset != UINT64_MAX) { 4697 /* 4698 * Reset progress so that we will retry everything 4699 * after the point that something failed. 
4700 */ 4701 vre->vre_offset = vre->vre_failed_offset; 4702 vre->vre_failed_offset = UINT64_MAX; 4703 vre->vre_waiting_for_resilver = B_TRUE; 4704 } 4705 mutex_exit(&vre->vre_lock); 4706 } 4707 } 4708 4709 void 4710 spa_start_raidz_expansion_thread(spa_t *spa) 4711 { 4712 ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL); 4713 spa->spa_raidz_expand_zthr = zthr_create("raidz_expand", 4714 spa_raidz_expand_thread_check, spa_raidz_expand_thread, 4715 spa, defclsyspri); 4716 } 4717 4718 void 4719 raidz_dtl_reassessed(vdev_t *vd) 4720 { 4721 spa_t *spa = vd->vdev_spa; 4722 if (spa->spa_raidz_expand != NULL) { 4723 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4724 /* 4725 * we get called often from vdev_dtl_reassess() so make 4726 * sure it's our vdev and any replacing is complete 4727 */ 4728 if (vd->vdev_top->vdev_id == vre->vre_vdev_id && 4729 !vdev_raidz_expand_child_replacing(vd->vdev_top)) { 4730 mutex_enter(&vre->vre_lock); 4731 if (vre->vre_waiting_for_resilver) { 4732 vdev_dbgmsg(vd, "DTL reassessed, " 4733 "continuing raidz expansion"); 4734 vre->vre_waiting_for_resilver = B_FALSE; 4735 zthr_wakeup(spa->spa_raidz_expand_zthr); 4736 } 4737 mutex_exit(&vre->vre_lock); 4738 } 4739 } 4740 } 4741 4742 int 4743 vdev_raidz_attach_check(vdev_t *new_child) 4744 { 4745 vdev_t *raidvd = new_child->vdev_parent; 4746 uint64_t new_children = raidvd->vdev_children; 4747 4748 /* 4749 * We use the "boot" space as scratch space to handle overwriting the 4750 * initial part of the vdev. If it is too small, then this expansion 4751 * is not allowed. This would be very unusual (e.g. ashift > 13 and 4752 * >200 children). 4753 */ 4754 if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) { 4755 return (EINVAL); 4756 } 4757 return (0); 4758 } 4759 4760 void 4761 vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx) 4762 { 4763 vdev_t *new_child = arg; 4764 spa_t *spa = new_child->vdev_spa; 4765 vdev_t *raidvd = new_child->vdev_parent; 4766 vdev_raidz_t *vdrz = raidvd->vdev_tsd; 4767 ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops); 4768 ASSERT3P(raidvd->vdev_top, ==, raidvd); 4769 ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width); 4770 ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1); 4771 ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==, 4772 new_child); 4773 4774 spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx); 4775 4776 vdrz->vd_physical_width++; 4777 4778 VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info); 4779 vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id; 4780 vdrz->vn_vre.vre_offset = 0; 4781 vdrz->vn_vre.vre_failed_offset = UINT64_MAX; 4782 spa->spa_raidz_expand = &vdrz->vn_vre; 4783 zthr_wakeup(spa->spa_raidz_expand_zthr); 4784 4785 /* 4786 * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get 4787 * written to the config. 
4788 */ 4789 vdev_config_dirty(raidvd); 4790 4791 vdrz->vn_vre.vre_start_time = gethrestime_sec(); 4792 vdrz->vn_vre.vre_end_time = 0; 4793 vdrz->vn_vre.vre_state = DSS_SCANNING; 4794 vdrz->vn_vre.vre_bytes_copied = 0; 4795 4796 uint64_t state = vdrz->vn_vre.vre_state; 4797 VERIFY0(zap_update(spa->spa_meta_objset, 4798 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, 4799 sizeof (state), 1, &state, tx)); 4800 4801 uint64_t start_time = vdrz->vn_vre.vre_start_time; 4802 VERIFY0(zap_update(spa->spa_meta_objset, 4803 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, 4804 sizeof (start_time), 1, &start_time, tx)); 4805 4806 (void) zap_remove(spa->spa_meta_objset, 4807 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx); 4808 (void) zap_remove(spa->spa_meta_objset, 4809 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx); 4810 4811 spa_history_log_internal(spa, "raidz vdev expansion started", tx, 4812 "%s vdev %llu new width %llu", spa_name(spa), 4813 (unsigned long long)raidvd->vdev_id, 4814 (unsigned long long)raidvd->vdev_children); 4815 } 4816 4817 int 4818 vdev_raidz_load(vdev_t *vd) 4819 { 4820 vdev_raidz_t *vdrz = vd->vdev_tsd; 4821 int err; 4822 4823 uint64_t state = DSS_NONE; 4824 uint64_t start_time = 0; 4825 uint64_t end_time = 0; 4826 uint64_t bytes_copied = 0; 4827 4828 if (vd->vdev_top_zap != 0) { 4829 err = zap_lookup(vd->vdev_spa->spa_meta_objset, 4830 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, 4831 sizeof (state), 1, &state); 4832 if (err != 0 && err != ENOENT) 4833 return (err); 4834 4835 err = zap_lookup(vd->vdev_spa->spa_meta_objset, 4836 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, 4837 sizeof (start_time), 1, &start_time); 4838 if (err != 0 && err != ENOENT) 4839 return (err); 4840 4841 err = zap_lookup(vd->vdev_spa->spa_meta_objset, 4842 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, 4843 sizeof (end_time), 1, &end_time); 4844 if (err != 0 && err != ENOENT) 4845 return (err); 4846 4847 err = zap_lookup(vd->vdev_spa->spa_meta_objset, 4848 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, 4849 sizeof (bytes_copied), 1, &bytes_copied); 4850 if (err != 0 && err != ENOENT) 4851 return (err); 4852 } 4853 4854 /* 4855 * If we are in the middle of expansion, vre_state should have 4856 * already been set by vdev_raidz_init(). 
4857 */ 4858 EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING); 4859 vdrz->vn_vre.vre_state = (dsl_scan_state_t)state; 4860 vdrz->vn_vre.vre_start_time = start_time; 4861 vdrz->vn_vre.vre_end_time = end_time; 4862 vdrz->vn_vre.vre_bytes_copied = bytes_copied; 4863 4864 return (0); 4865 } 4866 4867 int 4868 spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres) 4869 { 4870 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4871 4872 if (vre == NULL) { 4873 /* no removal in progress; find most recent completed */ 4874 for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { 4875 vdev_t *vd = spa->spa_root_vdev->vdev_child[c]; 4876 if (vd->vdev_ops == &vdev_raidz_ops) { 4877 vdev_raidz_t *vdrz = vd->vdev_tsd; 4878 4879 if (vdrz->vn_vre.vre_end_time != 0 && 4880 (vre == NULL || 4881 vdrz->vn_vre.vre_end_time > 4882 vre->vre_end_time)) { 4883 vre = &vdrz->vn_vre; 4884 } 4885 } 4886 } 4887 } 4888 4889 if (vre == NULL) { 4890 return (SET_ERROR(ENOENT)); 4891 } 4892 4893 pres->pres_state = vre->vre_state; 4894 pres->pres_expanding_vdev = vre->vre_vdev_id; 4895 4896 vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); 4897 pres->pres_to_reflow = vd->vdev_stat.vs_alloc; 4898 4899 mutex_enter(&vre->vre_lock); 4900 pres->pres_reflowed = vre->vre_bytes_copied; 4901 for (int i = 0; i < TXG_SIZE; i++) 4902 pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i]; 4903 mutex_exit(&vre->vre_lock); 4904 4905 pres->pres_start_time = vre->vre_start_time; 4906 pres->pres_end_time = vre->vre_end_time; 4907 pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver; 4908 4909 return (0); 4910 } 4911 4912 /* 4913 * Initialize private RAIDZ specific fields from the nvlist. 4914 */ 4915 static int 4916 vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) 4917 { 4918 uint_t children; 4919 nvlist_t **child; 4920 int error = nvlist_lookup_nvlist_array(nv, 4921 ZPOOL_CONFIG_CHILDREN, &child, &children); 4922 if (error != 0) 4923 return (SET_ERROR(EINVAL)); 4924 4925 uint64_t nparity; 4926 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { 4927 if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) 4928 return (SET_ERROR(EINVAL)); 4929 4930 /* 4931 * Previous versions could only support 1 or 2 parity 4932 * device. 4933 */ 4934 if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2) 4935 return (SET_ERROR(EINVAL)); 4936 else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3) 4937 return (SET_ERROR(EINVAL)); 4938 } else { 4939 /* 4940 * We require the parity to be specified for SPAs that 4941 * support multiple parity levels. 4942 */ 4943 if (spa_version(spa) >= SPA_VERSION_RAIDZ2) 4944 return (SET_ERROR(EINVAL)); 4945 4946 /* 4947 * Otherwise, we default to 1 parity device for RAID-Z. 
4948 */ 4949 nparity = 1; 4950 } 4951 4952 vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP); 4953 vdrz->vn_vre.vre_vdev_id = -1; 4954 vdrz->vn_vre.vre_offset = UINT64_MAX; 4955 vdrz->vn_vre.vre_failed_offset = UINT64_MAX; 4956 mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL); 4957 cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL); 4958 zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL); 4959 mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL); 4960 avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare, 4961 sizeof (reflow_node_t), offsetof(reflow_node_t, re_link)); 4962 4963 vdrz->vd_physical_width = children; 4964 vdrz->vd_nparity = nparity; 4965 4966 /* note, the ID does not exist when creating a pool */ 4967 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, 4968 &vdrz->vn_vre.vre_vdev_id); 4969 4970 boolean_t reflow_in_progress = 4971 nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); 4972 if (reflow_in_progress) { 4973 spa->spa_raidz_expand = &vdrz->vn_vre; 4974 vdrz->vn_vre.vre_state = DSS_SCANNING; 4975 } 4976 4977 vdrz->vd_original_width = children; 4978 uint64_t *txgs; 4979 unsigned int txgs_size = 0; 4980 error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, 4981 &txgs, &txgs_size); 4982 if (error == 0) { 4983 for (int i = 0; i < txgs_size; i++) { 4984 reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); 4985 re->re_txg = txgs[txgs_size - i - 1]; 4986 re->re_logical_width = vdrz->vd_physical_width - i; 4987 4988 if (reflow_in_progress) 4989 re->re_logical_width--; 4990 4991 avl_add(&vdrz->vd_expand_txgs, re); 4992 } 4993 4994 vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size; 4995 } 4996 if (reflow_in_progress) { 4997 vdrz->vd_original_width--; 4998 zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions", 4999 children, txgs_size); 5000 } 5001 5002 *tsd = vdrz; 5003 5004 return (0); 5005 } 5006 5007 static void 5008 vdev_raidz_fini(vdev_t *vd) 5009 { 5010 vdev_raidz_t *vdrz = vd->vdev_tsd; 5011 if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre) 5012 vd->vdev_spa->spa_raidz_expand = NULL; 5013 reflow_node_t *re; 5014 void *cookie = NULL; 5015 avl_tree_t *tree = &vdrz->vd_expand_txgs; 5016 while ((re = avl_destroy_nodes(tree, &cookie)) != NULL) 5017 kmem_free(re, sizeof (*re)); 5018 avl_destroy(&vdrz->vd_expand_txgs); 5019 mutex_destroy(&vdrz->vd_expand_lock); 5020 mutex_destroy(&vdrz->vn_vre.vre_lock); 5021 cv_destroy(&vdrz->vn_vre.vre_cv); 5022 zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock); 5023 kmem_free(vdrz, sizeof (*vdrz)); 5024 } 5025 5026 /* 5027 * Add RAIDZ specific fields to the config nvlist. 5028 */ 5029 static void 5030 vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) 5031 { 5032 ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops); 5033 vdev_raidz_t *vdrz = vd->vdev_tsd; 5034 5035 /* 5036 * Make sure someone hasn't managed to sneak a fancy new vdev 5037 * into a crufty old storage pool. 5038 */ 5039 ASSERT(vdrz->vd_nparity == 1 || 5040 (vdrz->vd_nparity <= 2 && 5041 spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) || 5042 (vdrz->vd_nparity <= 3 && 5043 spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3)); 5044 5045 /* 5046 * Note that we'll add these even on storage pools where they 5047 * aren't strictly required -- older software will just ignore 5048 * it. 
5049 */ 5050 fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity); 5051 5052 if (vdrz->vn_vre.vre_state == DSS_SCANNING) { 5053 fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); 5054 } 5055 5056 mutex_enter(&vdrz->vd_expand_lock); 5057 if (!avl_is_empty(&vdrz->vd_expand_txgs)) { 5058 uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs); 5059 uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count, 5060 KM_SLEEP); 5061 uint64_t i = 0; 5062 5063 for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs); 5064 re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) { 5065 txgs[i++] = re->re_txg; 5066 } 5067 5068 fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, 5069 txgs, count); 5070 5071 kmem_free(txgs, sizeof (uint64_t) * count); 5072 } 5073 mutex_exit(&vdrz->vd_expand_lock); 5074 } 5075 5076 static uint64_t 5077 vdev_raidz_nparity(vdev_t *vd) 5078 { 5079 vdev_raidz_t *vdrz = vd->vdev_tsd; 5080 return (vdrz->vd_nparity); 5081 } 5082 5083 static uint64_t 5084 vdev_raidz_ndisks(vdev_t *vd) 5085 { 5086 return (vd->vdev_children); 5087 } 5088 5089 vdev_ops_t vdev_raidz_ops = { 5090 .vdev_op_init = vdev_raidz_init, 5091 .vdev_op_fini = vdev_raidz_fini, 5092 .vdev_op_open = vdev_raidz_open, 5093 .vdev_op_close = vdev_raidz_close, 5094 .vdev_op_asize = vdev_raidz_asize, 5095 .vdev_op_min_asize = vdev_raidz_min_asize, 5096 .vdev_op_min_alloc = NULL, 5097 .vdev_op_io_start = vdev_raidz_io_start, 5098 .vdev_op_io_done = vdev_raidz_io_done, 5099 .vdev_op_state_change = vdev_raidz_state_change, 5100 .vdev_op_need_resilver = vdev_raidz_need_resilver, 5101 .vdev_op_hold = NULL, 5102 .vdev_op_rele = NULL, 5103 .vdev_op_remap = NULL, 5104 .vdev_op_xlate = vdev_raidz_xlate, 5105 .vdev_op_rebuild_asize = NULL, 5106 .vdev_op_metaslab_init = NULL, 5107 .vdev_op_config_generate = vdev_raidz_config_generate, 5108 .vdev_op_nparity = vdev_raidz_nparity, 5109 .vdev_op_ndisks = vdev_raidz_ndisks, 5110 .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */ 5111 .vdev_op_leaf = B_FALSE /* not a leaf vdev */ 5112 }; 5113 5114 ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW, 5115 "For testing, pause RAIDZ expansion after reflowing this many bytes"); 5116 ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW, 5117 "Max amount of concurrent i/o for RAIDZ expansion"); 5118 ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW, 5119 "For expanded RAIDZ, aggregate reads that have more rows than this"); 5120 ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW, 5121 "For expanded RAIDZ, automatically start a pool scrub when expansion " 5122 "completes"); 5123
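/*
 * For reference (an assumption about the build environment, not derived from
 * this file): on Linux these tunables are typically exposed under
 * /sys/module/zfs/parameters/ as raidz_expand_max_reflow_bytes,
 * raidz_expand_max_copy_bytes, raidz_io_aggregate_rows and
 * zfs_scrub_after_expand, and can be adjusted at runtime for testing.
 */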