/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
 * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zap.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu_tx.h>
#include <sys/abd.h>
#include <sys/zfs_rlock.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
#include <sys/vdev_raidz.h>
#include <sys/vdev_raidz_impl.h>
#include <sys/vdev_draid.h>
#include <sys/uberblock_impl.h>
#include <sys/dsl_scan.h>

#ifdef ZFS_DEBUG
#include <sys/vdev.h>	/* For vdev_xlate() in vdev_raidz_io_verify() */
#endif

/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field with 2^N elements,
 * GF(2^N). In our case we choose N=8, i.e. GF(2^8), so that all elements
 * can be expressed with a single byte. Briefly, the operations on the
 * field are defined as follows:
 *
 * o addition (+) is represented by a bitwise XOR
 * o subtraction (-) is therefore identical to addition: A + B = A - B
 * o multiplication of A by 2 is defined by the following bitwise expression:
 *
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). The inverse of a field element A (A^-1) is therefore
 * A ^ (255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q, R, over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property which is why the uncorrected Plank method breaks down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
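
/*
 * As an illustrative sketch (hypothetical helpers, not used by the driver),
 * the field operations above can be spelled out byte-at-a-time in plain C.
 * gf_mul2() matches VDEV_RAIDZ_MUL_2(), and gf_mul() is the table form
 * A * B = 2^(log_2(A) + log_2(B)), written in terms of the
 * vdev_raidz_pow2[] and vdev_raidz_log2[] tables used elsewhere in this
 * file:
 *
 *	static inline uint8_t
 *	gf_mul2(uint8_t a)
 *	{
 *		return ((uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1d : 0)));
 *	}
 *
 *	static inline uint8_t
 *	gf_mul(uint8_t a, uint8_t b)
 *	{
 *		if (a == 0 || b == 0)
 *			return (0);
 *		return (vdev_raidz_pow2[(vdev_raidz_log2[a] +
 *		    vdev_raidz_log2[b]) % 255]);
 *	}
 *
 * The Q parity of a row can then be accumulated Horner-style, one data
 * column at a time, exactly as the formula above suggests:
 *
 *	q = 0;
 *	for (c = 0; c < ndata; c++)
 *		q = gf_mul2(q) ^ d[c];
 */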

/*
 * Big Theory Statement for how a RAIDZ VDEV is expanded
 *
 * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion
 * works with all three RAIDZ parity choices (RAIDZ1, 2, or 3). VDEVs that
 * have been previously expanded can be expanded again.
 *
 * The RAIDZ VDEV must be healthy (able to write to all the drives in the
 * VDEV) when an expansion starts. The expansion will pause if any disk in
 * the VDEV fails, and resume once the VDEV is healthy again. All other
 * operations on the pool can continue while an expansion is in progress
 * (e.g. read/write, snapshot, zpool add, etc.), except for zpool checkpoint,
 * zpool trim, and zpool initialize, which can't be run during an expansion.
 * Following a reboot or export/import, the expansion resumes where it left
 * off.
 *
 * == Reflowing the Data ==
 *
 * The expansion involves reflowing (copying) the data from the current set
 * of disks to spread it across the new set which now has one more disk. This
 * reflow operation is similar to reflowing text when the column width of a
 * text editor window is expanded. The text doesn't change but the location
 * of the text changes to accommodate the new width. An example reflow result
 * for a 4-wide RAIDZ1 to a 5-wide is shown below.
 *
 *                            Reflow End State
 *          Each letter indicates a parity group (logical stripe)
 *
 *       Before expansion                          After Expansion
 *    D1     D2     D3     D4             D1     D2     D3     D4     D5
 * +------+------+------+------+      +------+------+------+------+------+
 * |      |      |      |      |      |      |      |      |      |      |
 * |  A   |  A   |  A   |  A   |      |  A   |  A   |  A   |  A   |  B   |
 * |     1|     2|     3|     4|      |     1|     2|     3|     4|     5|
 * +------+------+------+------+      +------+------+------+------+------+
 * |      |      |      |      |      |      |      |      |      |      |
 * |  B   |  B   |  C   |  C   |      |  B   |  C   |  C   |  C   |  C   |
 * |     5|     6|     7|     8|      |     6|     7|     8|     9|    10|
 * +------+------+------+------+      +------+------+------+------+------+
 * |      |      |      |      |      |      |      |      |      |      |
 * |  C   |  C   |  D   |  D   |      |  D   |  D   |  E   |  E   |  E   |
 * |     9|    10|    11|    12|      |    11|    12|    13|    14|    15|
 * +------+------+------+------+      +------+------+------+------+------+
 * |      |      |      |      |      |      |      |      |      |      |
 * |  E   |  E   |  E   |  E   | -->  |  E   |  F   |  F   |  G   |  G   |
 * |    13|    14|    15|    16|      |    16|    17|    18|    19|    20|
 * +------+------+------+------+      +------+------+------+------+------+
 * |      |      |      |      |      |      |      |      |      |      |
 * |  F   |  F   |  G   |  G   |      |  G   |  G   |  H   |  H   |  H   |
 * |    17|    18|    19|    20|      |    21|    22|    23|    24|    25|
 * +------+------+------+------+      +------+------+------+------+------+
 * |      |      |      |      |      |      |      |      |      |      |
 * |  G   |  G   |  H   |  H   |      |  H   |  I   |  I   |  J   |  J   |
 * |    21|    22|    23|    24|      |    26|    27|    28|    29|    30|
 * +------+------+------+------+      +------+------+------+------+------+
 * |      |      |      |      |      |      |      |      |      |      |
 * |  H   |  H   |  I   |  I   |      |  J   |  J   |      |      |  K   |
 * |    25|    26|    27|    28|      |    31|    32|    33|    34|    35|
 * +------+------+------+------+      +------+------+------+------+------+
 *
 * This reflow approach has several advantages. There is no need to read or
 * modify the block pointers or recompute any block checksums. The reflow
 * doesn't need to know where the parity sectors reside. We can read and
 * write data sequentially and the copy can occur in a background thread in
 * open context. The design also allows for fast discovery of what data to
 * copy.
 *
 * The VDEV metaslabs are processed, one at a time, to copy the block data to
 * have it flow across all the disks. The metaslab is disabled for allocations
 * during the copy. As an optimization, we only copy the allocated data which
 * can be determined by looking at the metaslab range tree. During the copy we
 * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still
 * need to be able to survive losing parity count disks). This means we
 * cannot overwrite data during the reflow that would be needed if a disk is
 * lost.
 *
 * After the reflow completes, all newly-written blocks will have the new
 * layout, i.e., they will have the parity to data ratio implied by the new
 * number of disks in the RAIDZ group. Even though the reflow copies all of
 * the allocated space (data and parity), it is only rearranged, not changed.
 *
 * This act of reflowing the data has a few implications about blocks
 * that were written before the reflow completes:
 *
 * - Old blocks will still use the same amount of space (i.e., they will have
 *   the parity to data ratio implied by the old number of disks in the RAIDZ
 *   group).
 * - Reading old blocks will be slightly slower than before the reflow, for
 *   two reasons. First, we will have to read from all disks in the RAIDZ
 *   VDEV, rather than being able to skip the children that contain only
 *   parity of this block (because the data of a single block is now spread
 *   out across all the disks). Second, in most cases there will be an extra
 *   bcopy, needed to rearrange the data back to its original layout in
 *   memory.
 *
 * == Scratch Area ==
 *
 * As we copy the block data, we can only progress to the point that writes
 * will not overlap with blocks whose progress has not yet been recorded on
 * disk. Since partially-copied rows are always read from the old location,
 * we need to stop one row before the sector-wise overlap, to prevent any
 * row-wise overlap. For example, in the diagram above, when we reflow sector
 * B6 it will overwrite the original location for B5.
 *
 * To get around this, a scratch space is used so that we can start copying
 * without risking data loss by overlapping the row. As an added benefit, it
 * improves performance at the beginning of the reflow, but that small perf
 * boost wouldn't be worth the complexity on its own.
 *
 * Ideally we want to copy at least 2 * (new_width)^2 so that we have a
 * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max
 * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice
 * the widths will likely be single digits so we can get a substantial chunk
 * size using only a few MB of scratch per disk.
 *
 * The scratch area is persisted to disk and holds a large amount of reflowed
 * state. We can always read the partially written stripes when a disk fails
 * or the copy is interrupted (crash) during the initial copying phase, and
 * we also get past a small chunk size restriction. At a minimum, the scratch
 * space must be large enough to get us to the point that one row does not
 * overlap itself when moved (i.e., new_width^2). But going larger is even
 * better. We use the 3.5 MiB reserved "boot" space that resides after the
 * ZFS disk labels as our scratch space to handle overwriting the initial
 * part of the VDEV.
 *
 *	0     256K   512K                    4M
 *	+------+------+-----------------------+-----------------------------
 *	| VDEV | VDEV |   Boot Block (3.5M)   |  Allocatable space ...
 *	|  L0  |  L1  |       Reserved        |     (Metaslabs)
 *	+------+------+-----------------------+-------------------------------
 *	                 Scratch Area
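 *
 * As a worked example of the sizing above: with the maximum RAIDZ width of
 * 255 children and 4 KiB sectors, the ideal 2 * (new_width)^2 sectors come
 * to 2 * 255 sectors per disk, i.e. 2 * 255 * 4 KiB = 2040 KiB, or roughly
 * 2 MiB of scratch per disk; the bare minimum of new_width^2 sectors is half
 * that. A more typical 10-wide VDEV with 4 KiB sectors needs only
 * 2 * 10 * 4 KiB = 80 KiB per disk, far below the 3.5 MiB reserved boot
 * space.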
 *
 * == Reflow Progress Updates ==
 *
 * After the initial scratch-based reflow, the expansion process works
 * similarly to device removal. We create a new open context thread which
 * reflows the data, and periodically kicks off sync tasks to update logical
 * state. In this case, state is the committed progress (offset of next data
 * to copy). We need to persist the completed offset on disk, so that if we
 * crash we know which format each VDEV offset is in.
 *
 * == Time Dependent Geometry ==
 *
 * In non-expanded RAIDZ, blocks are read from disk in a column by column
 * fashion. For a multi-row block, the second sector is in the first column
 * not in the second column. This allows us to issue full reads for each
 * column directly into the request buffer. The block data is thus laid out
 * sequentially in a column-by-column fashion.
 *
 * For example, in the before expansion diagram above, one logical block might
 * be sectors G19-H26. The parity is in G19,H23; and the data is in
 * G20,H24,G21,H25,G22,H26.
 *
 * After a block is reflowed, the sectors that were all in the original column
 * data can now reside in different columns. When reading from an expanded
 * VDEV, we need to know the logical stripe width for each block so we can
 * reconstitute the block's data after the reads are completed. Likewise,
 * when we perform the combinatorial reconstruction we need to know the
 * original width so we can retry combinations from the past layouts.
 *
 * Time-dependent geometry is what we call having blocks with different
 * layouts (stripe widths) in the same VDEV. This time-dependent geometry
 * uses the block's birth time (+ the time expansion ended) to establish the
 * correct width for a given block. After an expansion completes, we record
 * the time for blocks written with a particular width (geometry).
 *
 * == On Disk Format Changes ==
 *
 * A new pool feature flag, 'raidz_expansion', whose reference count is the
 * number of RAIDZ VDEVs that have been expanded.
 *
 * The blocks on an expanded RAIDZ VDEV can have different logical stripe
 * widths.
 *
 * Since the uberblock can point to arbitrary blocks, which might be on the
 * expanding RAIDZ and might or might not have been expanded, we need to know
 * which way a block is laid out before reading it. This info is the next
 * offset that needs to be reflowed and we persist that in the uberblock, in
 * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev
 * label. After the expansion is complete, we then use the raidz_expand_txgs
 * array (see below) to determine how to read a block and the
 * ub_raidz_reflow_info field is no longer required.
 *
 * The uberblock's ub_raidz_reflow_info field also holds the scratch space
 * state (i.e., active or not) which is also required before reading a block
 * during the initial phase of reflowing the data.
 *
 * The top-level RAIDZ VDEV has two new entries in the nvlist:
 *
 * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here
 *                            and used after the expansion is complete to
 *                            determine how to read a raidz block
 * 'raidz_expanding' boolean: present during reflow and removed after
 *                            completion; used during a spa import to resume
 *                            an unfinished expansion
 *
 * Finally, the VDEV's top-level ZAP gains the following informational
 * entries:
 *	VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE
 *	VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME
 *	VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME
 *	VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED
 */
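
/*
 * Illustrative sketch of the raidz_expand_txgs lookup described above.
 * The helper and the re_logical_width field name are assumptions for
 * illustration only (only re_txg is visible in this file): assuming each
 * reflow_node_t entry records the first txg at which its logical width
 * took effect, a block's width is that of the entry with the largest
 * re_txg <= the block's birth txg, falling back to the pre-expansion width:
 *
 *	static uint64_t
 *	example_logical_width(avl_tree_t *expand_txgs, uint64_t birth_txg,
 *	    uint64_t width_before_expansions)
 *	{
 *		reflow_node_t search = { .re_txg = birth_txg };
 *		avl_index_t where;
 *		reflow_node_t *re = avl_find(expand_txgs, &search, &where);
 *
 *		if (re == NULL)
 *			re = avl_nearest(expand_txgs, where, AVL_BEFORE);
 *		return (re == NULL ? width_before_expansions :
 *		    re->re_logical_width);
 *	}
 */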

/*
 * For testing only: pause the raidz expansion after reflowing this amount.
 * (accessed by ZTS and ztest)
 */
#ifdef _KERNEL
static
#endif	/* _KERNEL */
unsigned long raidz_expand_max_reflow_bytes = 0;

/*
 * For testing only: pause the raidz expansion at a certain point.
 */
uint_t raidz_expand_pause_point = 0;

/*
 * Maximum amount of copy I/O that may be outstanding at once.
 */
static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE;

/*
 * Apply raidz map abds aggregation if the number of rows in the map is equal
 * to or greater than the value below.
 */
static unsigned long raidz_io_aggregate_rows = 4;

/*
 * Automatically start a pool scrub when a RAIDZ expansion completes in
 * order to verify the checksums of all blocks which have been copied
 * during the expansion. Automatic scrubbing is enabled by default and
 * is strongly recommended.
 */
static int zfs_scrub_after_expand = 1;

static void
vdev_raidz_row_free(raidz_row_t *rr)
{
	for (int c = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		if (rc->rc_size != 0)
			abd_free(rc->rc_abd);
		if (rc->rc_orig_data != NULL)
			abd_free(rc->rc_orig_data);
	}

	if (rr->rr_abd_empty != NULL)
		abd_free(rr->rr_abd_empty);

	kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols]));
}

void
vdev_raidz_map_free(raidz_map_t *rm)
{
	for (int i = 0; i < rm->rm_nrows; i++)
		vdev_raidz_row_free(rm->rm_row[i]);

	if (rm->rm_nphys_cols) {
		for (int i = 0; i < rm->rm_nphys_cols; i++) {
			if (rm->rm_phys_col[i].rc_abd != NULL)
				abd_free(rm->rm_phys_col[i].rc_abd);
		}

		kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) *
		    rm->rm_nphys_cols);
	}

	ASSERT3P(rm->rm_lr, ==, NULL);
	kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
}

static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	vdev_raidz_map_free(rm);
}

static int
vdev_raidz_reflow_compare(const void *x1, const void *x2)
{
	const reflow_node_t *l = x1;
	const reflow_node_t *r = x2;

	return (TREE_CMP(l->re_txg, r->re_txg));
}

const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	.vsd_free = vdev_raidz_map_free_vsd,
};

raidz_row_t *
vdev_raidz_row_alloc(int cols)
{
	raidz_row_t *rr =
	    kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP);

	rr->rr_cols = cols;
	rr->rr_scols = cols;

	for (int c = 0; c < cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		rc->rc_shadow_devidx = INT_MAX;
		rc->rc_shadow_offset = UINT64_MAX;
		rc->rc_allow_repair = 1;
	}
	return (rr);
}

static void
vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift)
{
	int c;
	int nwrapped = 0;
	uint64_t off = 0;
	raidz_row_t *rr = rm->rm_row[0];

	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
	ASSERT3U(rm->rm_nrows, ==, 1);

	/*
	 * Pad any parity columns with additional space to account for skip
	 * sectors.
	 */
	if (rm->rm_skipstart < rr->rr_firstdatacol) {
		ASSERT0(rm->rm_skipstart);
		nwrapped = rm->rm_nskip;
	} else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) {
		nwrapped =
		    (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols;
	}
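
	/*
	 * For example, a 4-wide RAIDZ2 row with rm_skipstart = 3 and
	 * rm_nskip = 2 has skipstart + nskip = 5 > rr_scols = 4, so one
	 * skip sector wraps back onto the leading parity columns and
	 * nwrapped = 5 % 4 = 1; that wrapped sector is the one padded onto
	 * a parity column below.
	 */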

	/*
	 * Optional single skip sectors (rc_size == 0) will be handled in
	 * vdev_raidz_io_start_write().
	 */
	int skipped = rr->rr_scols - rr->rr_cols;

	/* Allocate buffers for the parity columns */
	for (c = 0; c < rr->rr_firstdatacol; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		/*
		 * Parity columns will pad out a linear ABD to account for
		 * the skip sector. A linear ABD is used here because
		 * parity calculations use the ABD buffer directly to calculate
		 * parity. This avoids doing a memcpy back to the ABD after the
		 * parity has been calculated. By issuing the parity column
		 * with the skip sector we can reduce contention on the child
		 * VDEV queue locks (vq_lock).
		 */
		if (c < nwrapped) {
			rc->rc_abd = abd_alloc_linear(
			    rc->rc_size + (1ULL << ashift), B_FALSE);
			abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift);
			skipped++;
		} else {
			rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
		}
	}

	for (off = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct,
		    zio->io_abd, off, rc->rc_size);

		/*
		 * Generate I/O for skip sectors to improve aggregation
		 * continuity. We will use gang ABD's to reduce contention
		 * on the child VDEV queue locks (vq_lock) by issuing
		 * a single I/O that contains the data and skip sector.
		 *
		 * It is important to make sure that rc_size is not updated
		 * even though we are adding a skip sector to the ABD. When
		 * calculating the parity in vdev_raidz_generate_parity_row()
		 * the rc_size is used to iterate through the ABD's. We can
		 * not have zero'd out skip sectors used for calculating
		 * parity for raidz, because those same sectors are not used
		 * during reconstruction.
		 */
		if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) {
			rc->rc_abd = abd_alloc_gang();
			abd_gang_add(rc->rc_abd, abd, B_TRUE);
			abd_gang_add(rc->rc_abd,
			    abd_get_zeros(1ULL << ashift), B_TRUE);
			skipped++;
		} else {
			rc->rc_abd = abd;
		}
		off += rc->rc_size;
	}

	ASSERT3U(off, ==, zio->io_size);
	ASSERT3S(skipped, ==, rm->rm_nskip);
}

static void
vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm)
{
	int c;
	raidz_row_t *rr = rm->rm_row[0];

	ASSERT3U(rm->rm_nrows, ==, 1);

	/* Allocate buffers for the parity columns */
	for (c = 0; c < rr->rr_firstdatacol; c++)
		rr->rr_col[c].rc_abd =
		    abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);

	for (uint64_t off = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
		    zio->io_abd, off, rc->rc_size);
		off += rc->rc_size;
	}
}

/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 *
 * Avoid inlining the function to keep vdev_raidz_io_start(), which
 * is this function's only caller, as small as possible on the stack.
 */
noinline raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
    uint64_t nparity)
{
	raidz_row_t *rr;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = zio->io_offset >> ashift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = zio->io_size >> ashift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << ashift;
	uint64_t acols, scols;

	raidz_map_t *rm =
	    kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
	rm->rm_nrows = 1;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	uint64_t q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	uint64_t r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	uint64_t bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/*
	 * acols: The columns that will be accessed.
	 * scols: The columns that will be accessed or skipped.
	 */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);
	rr = vdev_raidz_row_alloc(scols);
	rm->rm_row[0] = rr;
	rr->rr_cols = acols;
	rr->rr_bigcols = bc;
	rr->rr_firstdatacol = nparity;
#ifdef ZFS_DEBUG
	rr->rr_offset = zio->io_offset;
	rr->rr_size = zio->io_size;
#endif

	uint64_t asize = 0;

	for (uint64_t c = 0; c < scols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		uint64_t col = f + c;
		uint64_t coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << ashift;
		}
		rc->rc_devidx = col;
		rc->rc_offset = coff;

		if (c >= acols)
			rc->rc_size = 0;
		else if (c < bc)
			rc->rc_size = (q + 1) << ashift;
		else
			rc->rc_size = q << ashift;

		asize += rc->rc_size;
	}

	ASSERT3U(asize, ==, tot << ashift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	rm->rm_skipstart = bc;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rr->rr_cols >= 2);
	ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);

	if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
		uint64_t devidx = rr->rr_col[0].rc_devidx;
		o = rr->rr_col[0].rc_offset;
		rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
		rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
		rr->rr_col[1].rc_devidx = devidx;
		rr->rr_col[1].rc_offset = o;
		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_map_alloc_write(zio, rm, ashift);
	} else {
		vdev_raidz_map_alloc_read(zio, rm);
	}
	/* init RAIDZ parity ops */
	rm->rm_ops = vdev_raidz_math_get_ops();

	return (rm);
}
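
/*
 * Worked example of the geometry above: a 3.5K write (s = 7 sectors at
 * ashift = 9) to a 5-wide RAIDZ1 gives q = 7 / 4 = 1, r = 3, bc = 4 and
 * tot = 7 + 1 * 2 = 9, so acols = scols = 5, rm_nskip = roundup(9, 2) - 9 = 1
 * and rm_skipstart = 4. The four "big" columns (one parity plus three data)
 * get q + 1 = 2 sectors each and the remaining column gets q = 1 sector,
 * for asize = 4 * 2 + 1 = 9 sectors = tot.
 */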

/*
 * Everything before reflow_offset_synced should have been moved to the new
 * location (read and write completed). However, this may not yet be reflected
 * in the on-disk format (e.g. raidz_reflow_sync() has been called but the
 * uberblock has not yet been written). If reflow is not in progress,
 * reflow_offset_synced should be UINT64_MAX. For each row, if the row is
 * entirely before reflow_offset_synced, it will come from the new location.
 * Otherwise this row will come from the old location. Therefore, rows that
 * straddle the reflow_offset_synced will come from the old location.
 *
 * For writes, reflow_offset_next is the next offset to copy. If a sector has
 * been copied, but not yet reflected in the on-disk progress
 * (reflow_offset_synced), it will also be written to the new (already copied)
 * offset.
 */
noinline raidz_map_t *
vdev_raidz_map_alloc_expanded(zio_t *zio,
    uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
    uint64_t nparity, uint64_t reflow_offset_synced,
    uint64_t reflow_offset_next, boolean_t use_scratch)
{
	abd_t *abd = zio->io_abd;
	uint64_t offset = zio->io_offset;
	uint64_t size = zio->io_size;

	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> ashift;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 * AKA "full rows"
	 */
	uint64_t q = s / (logical_cols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	uint64_t r = s - q * (logical_cols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	uint64_t bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* How many rows contain data (not skip) */
	uint64_t rows = howmany(tot, logical_cols);
	int cols = MIN(tot, logical_cols);

	raidz_map_t *rm =
	    kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
	    KM_SLEEP);
	rm->rm_nrows = rows;
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	rm->rm_skipstart = bc;
	uint64_t asize = 0;

	for (uint64_t row = 0; row < rows; row++) {
		boolean_t row_use_scratch = B_FALSE;
		raidz_row_t *rr = vdev_raidz_row_alloc(cols);
		rm->rm_row[row] = rr;

		/* The starting RAIDZ (parent) vdev sector of the row. */
		uint64_t b = (offset >> ashift) + row * logical_cols;

		/*
		 * If we are in the middle of a reflow, and the copying has
		 * not yet completed for any part of this row, then use the
		 * old location of this row. Note that reflow_offset_synced
		 * reflects the i/o that's been completed, because it's
		 * updated by a synctask, after zio_wait(spa_txg_zio[]).
		 * This is sufficient for our check, even if that progress
		 * has not yet been recorded to disk (reflected in
		 * spa_ubsync). Also note that we consider the last row to
		 * be "full width" (`cols`-wide rather than `bc`-wide) for
		 * this calculation. This causes a tiny bit of unnecessary
		 * double-writes but is safe and simpler to calculate.
		 */
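		/*
		 * For example, if reflow_offset_synced >> ashift is 1000 and
		 * this row occupies parent sectors 998 through 1002, the row
		 * straddles the copied region and is read from the old
		 * (physical_cols - 1)-wide layout; a row that ends at or
		 * before sector 1000 is instead read from the new layout.
		 */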
		int row_phys_cols = physical_cols;
		if (b + cols > reflow_offset_synced >> ashift)
			row_phys_cols--;
		else if (use_scratch)
			row_use_scratch = B_TRUE;

		/* starting child of this row */
		uint64_t child_id = b % row_phys_cols;
		/* The starting byte offset on each child vdev. */
		uint64_t child_offset = (b / row_phys_cols) << ashift;

		/*
		 * Note, rr_cols is the entire width of the block, even
		 * if this row is shorter. This is needed because parity
		 * generation (for Q and R) needs to know the entire width,
		 * because it treats the short row as though it was
		 * full-width (and the "phantom" sectors were zero-filled).
		 *
		 * Another approach to this would be to set cols shorter
		 * (to just the number of columns that we might do i/o to)
		 * and have another mechanism to tell the parity generation
		 * about the "entire width". Reconstruction (at least
		 * vdev_raidz_reconstruct_general()) would also need to
		 * know about the "entire width".
		 */
		rr->rr_firstdatacol = nparity;
#ifdef ZFS_DEBUG
		/*
		 * note: rr_size is PSIZE, not ASIZE
		 */
		rr->rr_offset = b << ashift;
		rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift;
#endif

		for (int c = 0; c < rr->rr_cols; c++, child_id++) {
			if (child_id >= row_phys_cols) {
				child_id -= row_phys_cols;
				child_offset += 1ULL << ashift;
			}
			raidz_col_t *rc = &rr->rr_col[c];
			rc->rc_devidx = child_id;
			rc->rc_offset = child_offset;

			/*
			 * Get this from the scratch space if appropriate.
			 * This only happens if we crashed in the middle of
			 * raidz_reflow_scratch_sync() (while it's running,
			 * the rangelock prevents us from doing concurrent
			 * io), and even then only during zpool import or
			 * when the pool is imported readonly.
			 */
			if (row_use_scratch)
				rc->rc_offset -= VDEV_BOOT_SIZE;

			uint64_t dc = c - rr->rr_firstdatacol;
			if (c < rr->rr_firstdatacol) {
				rc->rc_size = 1ULL << ashift;

				/*
				 * Parity sectors' rc_abd's are set below
				 * after determining if this is an aggregation.
				 */
			} else if (row == rows - 1 && bc != 0 && c >= bc) {
				/*
				 * Past the end of the block (even including
				 * skip sectors). This sector is part of the
				 * map so that we have full rows for p/q parity
				 * generation.
				 */
				rc->rc_size = 0;
				rc->rc_abd = NULL;
			} else {
				/* "data column" (col excluding parity) */
				uint64_t off;

				if (c < bc || r == 0) {
					off = dc * rows + row;
				} else {
					off = r * rows +
					    (dc - r) * (rows - 1) + row;
				}
				rc->rc_size = 1ULL << ashift;
				rc->rc_abd = abd_get_offset_struct(
				    &rc->rc_abdstruct, abd, off << ashift,
				    rc->rc_size);
			}

			if (rc->rc_size == 0)
				continue;

			/*
			 * If any part of this row is in both old and new
			 * locations, the primary location is the old
			 * location. If this sector was already copied to the
			 * new location, we need to also write to the new,
			 * "shadow" location.
			 *
			 * Note, `row_phys_cols != physical_cols` indicates
			 * that the primary location is the old location.
			 * `b+c < reflow_offset_next` indicates that the copy
			 * to the new location has been initiated. We know
			 * that the copy has completed because we have the
			 * rangelock, which is held exclusively while the
			 * copy is in progress.
			 */
			if (row_use_scratch ||
			    (row_phys_cols != physical_cols &&
			    b + c < reflow_offset_next >> ashift)) {
				rc->rc_shadow_devidx = (b + c) % physical_cols;
				rc->rc_shadow_offset =
				    ((b + c) / physical_cols) << ashift;
				if (row_use_scratch)
					rc->rc_shadow_offset -= VDEV_BOOT_SIZE;
			}

			asize += rc->rc_size;
		}

		/*
		 * See comment in vdev_raidz_map_alloc()
		 */
		if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
		    (offset & (1ULL << 20))) {
			ASSERT(rr->rr_cols >= 2);
			ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);

			int devidx0 = rr->rr_col[0].rc_devidx;
			uint64_t offset0 = rr->rr_col[0].rc_offset;
			int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx;
			uint64_t shadow_offset0 =
			    rr->rr_col[0].rc_shadow_offset;

			rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
			rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
			rr->rr_col[0].rc_shadow_devidx =
			    rr->rr_col[1].rc_shadow_devidx;
			rr->rr_col[0].rc_shadow_offset =
			    rr->rr_col[1].rc_shadow_offset;

			rr->rr_col[1].rc_devidx = devidx0;
			rr->rr_col[1].rc_offset = offset0;
			rr->rr_col[1].rc_shadow_devidx = shadow_devidx0;
			rr->rr_col[1].rc_shadow_offset = shadow_offset0;
		}
	}
	ASSERT3U(asize, ==, tot << ashift);

	/*
	 * Determine if the block is contiguous, in which case we can use
	 * an aggregation.
	 */
	if (rows >= raidz_io_aggregate_rows) {
		rm->rm_nphys_cols = physical_cols;
		rm->rm_phys_col =
		    kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols,
		    KM_SLEEP);

		/*
		 * Determine the aggregate io's offset and size, and check
		 * that the io is contiguous.
		 */
		for (int i = 0;
		    i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) {
			raidz_row_t *rr = rm->rm_row[i];
			for (int c = 0; c < rr->rr_cols; c++) {
				raidz_col_t *rc = &rr->rr_col[c];
				raidz_col_t *prc =
				    &rm->rm_phys_col[rc->rc_devidx];

				if (rc->rc_size == 0)
					continue;

				if (prc->rc_size == 0) {
					ASSERT0(prc->rc_offset);
					prc->rc_offset = rc->rc_offset;
				} else if (prc->rc_offset + prc->rc_size !=
				    rc->rc_offset) {
					/*
					 * This block is not contiguous and
					 * therefore can't be aggregated.
					 * This is expected to be rare, so
					 * the cost of allocating and then
					 * freeing rm_phys_col is not
					 * significant.
					 */
					kmem_free(rm->rm_phys_col,
					    sizeof (raidz_col_t) *
					    rm->rm_nphys_cols);
					rm->rm_phys_col = NULL;
					rm->rm_nphys_cols = 0;
					break;
				}
				prc->rc_size += rc->rc_size;
			}
		}
	}
	if (rm->rm_phys_col != NULL) {
		/*
		 * Allocate aggregate ABD's.
		 */
		for (int i = 0; i < rm->rm_nphys_cols; i++) {
			raidz_col_t *prc = &rm->rm_phys_col[i];

			prc->rc_devidx = i;

			if (prc->rc_size == 0)
				continue;

			prc->rc_abd =
			    abd_alloc_linear(rm->rm_phys_col[i].rc_size,
			    B_FALSE);
		}

		/*
		 * Point the parity abd's into the aggregate abd's.
		 */
		for (int i = 0; i < rm->rm_nrows; i++) {
			raidz_row_t *rr = rm->rm_row[i];
			for (int c = 0; c < rr->rr_firstdatacol; c++) {
				raidz_col_t *rc = &rr->rr_col[c];
				raidz_col_t *prc =
				    &rm->rm_phys_col[rc->rc_devidx];
				rc->rc_abd =
				    abd_get_offset_struct(&rc->rc_abdstruct,
				    prc->rc_abd,
				    rc->rc_offset - prc->rc_offset,
				    rc->rc_size);
			}
		}
	} else {
		/*
		 * Allocate new abd's for the parity sectors.
		 */
		for (int i = 0; i < rm->rm_nrows; i++) {
			raidz_row_t *rr = rm->rm_row[i];
			for (int c = 0; c < rr->rr_firstdatacol; c++) {
				raidz_col_t *rc = &rr->rr_col[c];
				rc->rc_abd =
				    abd_alloc_linear(rc->rc_size,
				    B_TRUE);
			}
		}
	}
	/* init RAIDZ parity ops */
	rm->rm_ops = vdev_raidz_math_get_ops();

	return (rm);
}
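
/*
 * Worked example of the data layout above: a 7-sector block with
 * logical_cols = 5 and nparity = 1 maps onto rows = 2 rows of cols = 5
 * columns (q = 1, r = 3, bc = 4, tot = 9). Applying off = dc * rows + row
 * to the "big" data columns and off = r * rows + (dc - r) * (rows - 1) + row
 * to the rest, the zio ABD sector offsets come out column by column:
 *
 *	data column (c):   1  2  3  4
 *	row 0 offset:      0  2  4  6
 *	row 1 offset:      1  3  5  -	(column 4 is past the end in row 1)
 *
 * i.e. the block's data is stored column-major, matching the column-by-column
 * layout described in the "Time Dependent Geometry" comment above.
 */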

struct pqr_struct {
	uint64_t *p;
	uint64_t *q;
	uint64_t *r;
};

static int
vdev_raidz_p_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	int cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && !pqr->q && !pqr->r);

	for (int i = 0; i < cnt; i++, src++, pqr->p++)
		*pqr->p ^= *src;

	return (0);
}

static int
vdev_raidz_pq_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	uint64_t mask;
	int cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && pqr->q && !pqr->r);

	for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
		*pqr->p ^= *src;
		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
		*pqr->q ^= *src;
	}

	return (0);
}

static int
vdev_raidz_pqr_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	uint64_t mask;
	int cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && pqr->q && pqr->r);

	for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
		*pqr->p ^= *src;
		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
		*pqr->q ^= *src;
		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
		*pqr->r ^= *src;
	}

	return (0);
}

static void
vdev_raidz_generate_parity_p(raidz_row_t *rr)
{
	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);

	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
		abd_t *src = rr->rr_col[c].rc_abd;

		if (c == rr->rr_firstdatacol) {
			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, NULL, NULL };
			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
			    vdev_raidz_p_func, &pqr);
		}
	}
}

static void
vdev_raidz_generate_parity_pq(raidz_row_t *rr)
{
	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
	uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
	uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
	    rr->rr_col[VDEV_RAIDZ_Q].rc_size);

	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
		abd_t *src = rr->rr_col[c].rc_abd;

		uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);

		if (c == rr->rr_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
			(void) memcpy(q, p, rr->rr_col[c].rc_size);

			for (uint64_t i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
			}
		} else {
			struct pqr_struct pqr = { p, q, NULL };

			ASSERT(ccnt <= pcnt);
			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
			    vdev_raidz_pq_func, &pqr);

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			uint64_t mask;
			for (uint64_t i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pqr(raidz_row_t *rr)
{
	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
	uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
	uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd);
	uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
	    rr->rr_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
	    rr->rr_col[VDEV_RAIDZ_R].rc_size);

	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
		abd_t *src = rr->rr_col[c].rc_abd;

		uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);

		if (c == rr->rr_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
			(void) memcpy(q, p, rr->rr_col[c].rc_size);
			(void) memcpy(r, p, rr->rr_col[c].rc_size);

			for (uint64_t i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
				r[i] = 0;
			}
		} else {
			struct pqr_struct pqr = { p, q, r };

			ASSERT(ccnt <= pcnt);
			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
			    vdev_raidz_pqr_func, &pqr);

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			uint64_t mask;
			for (uint64_t i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
				VDEV_RAIDZ_64MUL_4(r[i], mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
void
vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr)
{
	if (rr->rr_cols == 0) {
		/*
		 * We are handling this block one row at a time (because
		 * this block has a different logical vs physical width,
		 * due to RAIDZ expansion), and this is a pad-only row,
		 * which has no parity.
		 */
		return;
	}

	/* Generate using the new math implementation */
	if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL)
		return;

	switch (rr->rr_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rr);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rr);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rr);
		break;
	default:
		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
	}
}

void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	for (int i = 0; i < rm->rm_nrows; i++) {
		raidz_row_t *rr = rm->rm_row[i];
		vdev_raidz_generate_parity_row(rm, rr);
	}
}

static int
vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
{
	(void) private;
	uint64_t *dst = dbuf;
	uint64_t *src = sbuf;
	int cnt = size / sizeof (src[0]);

	for (int i = 0; i < cnt; i++) {
		dst[i] ^= src[i];
	}

	return (0);
}

static int
vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
    void *private)
{
	(void) private;
	uint64_t *dst = dbuf;
	uint64_t *src = sbuf;
	uint64_t mask;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++, src++) {
		VDEV_RAIDZ_64MUL_2(*dst, mask);
		*dst ^= *src;
	}

	return (0);
}

static int
vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
{
	(void) private;
	uint64_t *dst = buf;
	uint64_t mask;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++) {
		/* same operation as vdev_raidz_reconst_q_pre_func() on dst */
		VDEV_RAIDZ_64MUL_2(*dst, mask);
	}

	return (0);
}

struct reconst_q_struct {
	uint64_t *q;
	int exp;
};

static int
vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
{
	struct reconst_q_struct *rq = private;
	uint64_t *dst = buf;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++, rq->q++) {
		int j;
		uint8_t *b;

		*dst ^= *rq->q;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, rq->exp);
		}
	}

	return (0);
}

struct reconst_pq_struct {
	uint8_t *p;
	uint8_t *q;
	uint8_t *pxy;
	uint8_t *qxy;
	int aexp;
	int bexp;
};

static int
vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
{
	struct reconst_pq_struct *rpq = private;
	uint8_t *xd = xbuf;
	uint8_t *yd = ybuf;

	for (int i = 0; i < size;
	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
	}

	return (0);
}

static int
vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
{
	struct reconst_pq_struct *rpq = private;
	uint8_t *xd = xbuf;

	for (int i = 0; i < size;
	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
		/* same operation as vdev_raidz_reconst_pq_func() on xd */
		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
	}

	return (0);
}

static void
vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts)
{
	int x = tgts[0];
	abd_t *dst, *src;

	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
		zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x);

	ASSERT3U(ntgts, ==, 1);
	ASSERT3U(x, >=, rr->rr_firstdatacol);
	ASSERT3U(x, <, rr->rr_cols);

	ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size);

	src = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
	dst = rr->rr_col[x].rc_abd;

	abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size);

	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
		uint64_t size = MIN(rr->rr_col[x].rc_size,
		    rr->rr_col[c].rc_size);

		src = rr->rr_col[c].rc_abd;

		if (c == x)
			continue;

		(void) abd_iterate_func2(dst, src, 0, 0, size,
		    vdev_raidz_reconst_p_func, NULL);
	}
}

static void
vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
{
	int x = tgts[0];
	int c, exp;
	abd_t *dst, *src;

	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
		zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x);

	ASSERT(ntgts == 1);

	ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
		uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size,
		    rr->rr_col[c].rc_size);

		src = rr->rr_col[c].rc_abd;
		dst = rr->rr_col[x].rc_abd;

		if (c == rr->rr_firstdatacol) {
			abd_copy(dst, src, size);
			if (rr->rr_col[x].rc_size > size) {
				abd_zero_off(dst, size,
				    rr->rr_col[x].rc_size - size);
			}
		} else {
			ASSERT3U(size, <=, rr->rr_col[x].rc_size);
			(void) abd_iterate_func2(dst, src, 0, 0, size,
			    vdev_raidz_reconst_q_pre_func, NULL);
			(void) abd_iterate_func(dst,
			    size, rr->rr_col[x].rc_size - size,
			    vdev_raidz_reconst_q_pre_tail_func, NULL);
		}
	}

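	/*
	 * At this point dst holds the Q parity recomputed with column x
	 * treated as zero, i.e. the sum of 2^(cols - 1 - c) * D_c over the
	 * other data columns. XORing in the stored Q below leaves
	 * 2^(cols - 1 - x) * D_x, so multiplying by 2^exp with
	 * exp = 255 - (cols - 1 - x) (the inverse of 2^(cols - 1 - x), since
	 * 2^255 = 1 in GF(2^8)) recovers D_x.
	 */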
	src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
	dst = rr->rr_col[x].rc_abd;
	exp = 255 - (rr->rr_cols - 1 - x);

	struct reconst_q_struct rq = { abd_to_buf(src), exp };
	(void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size,
	    vdev_raidz_reconst_q_post_func, &rq);
}

static void
vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
	abd_t *pdata, *qdata;
	uint64_t xsize, ysize;
	int x = tgts[0];
	int y = tgts[1];
	abd_t *xd, *yd;

	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
		zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y);

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rr->rr_firstdatacol);
	ASSERT(y < rr->rr_cols);

	ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
	qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
	xsize = rr->rr_col[x].rc_size;
	ysize = rr->rr_col[y].rc_size;

	rr->rr_col[VDEV_RAIDZ_P].rc_abd =
	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
	rr->rr_col[VDEV_RAIDZ_Q].rc_abd =
	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
	rr->rr_col[x].rc_size = 0;
	rr->rr_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rr);

	rr->rr_col[x].rc_size = xsize;
	rr->rr_col[y].rc_size = ysize;

	p = abd_to_buf(pdata);
	q = abd_to_buf(qdata);
	pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
	qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
	xd = rr->rr_col[x].rc_abd;
	yd = rr->rr_col[y].rc_abd;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	ASSERT3U(xsize, >=, ysize);
	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };

	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
	    vdev_raidz_reconst_pq_func, &rpq);
	(void) abd_iterate_func(xd, ysize, xsize - ysize,
	    vdev_raidz_reconst_pq_tail_func, &rpq);

	abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
	abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);

	/*
	 * Restore the saved parity data.
	 */
	rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata;
	rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;
}

/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |       __     __     |  p_0  |
 *            |  V  |       |  D_0  |     | p_m-1 |
 *            |     |   x   |   :   |  =  |  d_0  |
 *            |  I  |       | D_n-1 |     |   :   |
 *            |     |       ~~     ~~     | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen both for simplicity and
 * speedy computation, as well as linear separability.
 *
 *      __               __               __     __
 *      |   1   ..  1 1 1 |               |  p_0  |
 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
 *      |   :       : : : |   |   :   |   |  d_2  |
 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
 *      |   0   ..  0 0 1 |               | d_n-1 |
 *      ~~               ~~               ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values.
 * We begin by removing the rows in V|I and d|p that correspond to failed or
 * missing columns; we then make V|I square (n x n) and d|p sized n by
 * removing rows corresponding to unused parity from the bottom up to
 * generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *            __                               __
 *            |   1   1   1   1   1   1   1   1 |
 *            | 128  64  32  16   8   4   2   1 | <-----+-+-- missing disks
 *            |  19 205 116  29  64  16   4   1 |      / /
 *            |   1   0   0   0   0   0   0   0 |     / /
 *            |   0   1   0   0   0   0   0   0 | <--'  /
 *  (V|I)  =  |   0   0   1   0   0   0   0   0 | <---'
 *            |   0   0   0   1   0   0   0   0 |
 *            |   0   0   0   0   1   0   0   0 |
 *            |   0   0   0   0   0   1   0   0 |
 *            |   0   0   0   0   0   0   1   0 |
 *            |   0   0   0   0   0   0   0   1 |
 *            ~~                               ~~
 *            __                               __
 *            |   1   1   1   1   1   1   1   1 |
 *            |  19 205 116  29  64  16   4   1 |
 *            |   1   0   0   0   0   0   0   0 |
 *  (V|I)' =  |   0   0   0   1   0   0   0   0 |
 *            |   0   0   0   0   1   0   0   0 |
 *            |   0   0   0   0   0   1   0   0 |
 *            |   0   0   0   0   0   0   1   0 |
 *            |   0   0   0   0   0   0   0   1 |
 *            ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 *  __                                                                   __
 *  |   1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 *  |  19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0  |
 *  |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *  |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *  |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *  |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *  |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *  |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *  ~~                                                                   ~~
 *  __                                                                   __
 *  |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *  |   1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 *  |  19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0  |
 *  |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *  |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *  |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *  |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *  |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *  ~~                                                                   ~~
 *  __                                                                   __
 *  |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *  |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 *  |   0 205 116   0   0   0   0   0     0   1  19  29  64  16   4   1  |
 *  |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *  |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *  |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *  |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *  |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *  ~~                                                                   ~~
 *  __                                                                   __
 *  |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *  |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 *  |   0   0 185   0   0   0   0   0   205   1 222 208 141 221 201 204  |
 *  |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *  |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *  |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *  |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *  |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *  ~~                                                                   ~~
 *  __                                                                   __
 *  |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *  |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 *  |   0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209  |
 *  |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *  |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *  |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *  |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *  |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *  ~~                                                                   ~~
 *  __                                                                   __
 *  |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *  |   0   1   0   0   0   0   0   0   167 100   5  41 159 169 217 208  |
 *  |   0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209  |
 *  |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *  |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *  |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *  |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *  |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *  ~~                                                                   ~~
 *                   __                               __
 *                   |   0   0   1   0   0   0   0   0 |
 *                   | 167 100   5  41 159 169 217 208 |
 *                   | 166 100   4  40 158 168 216 209 |
 *  (V|I)'^-1 =      |   0   0   0   1   0   0   0   0 |
 *                   |   0   0   0   0   1   0   0   0 |
 *                   |   0   0   0   0   0   1   0   0 |
 *                   |   0   0   0   0   0   0   1   0 |
 *                   |   0   0   0   0   0   0   0   1 |
 *                   ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
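
/*
 * The core of the reconstruction below is a GF(2^8) matrix-vector product:
 * each missing data column is a linear combination, with the inverse-row
 * coefficients, of the surviving parity and data columns. A byte-at-a-time
 * sketch (hypothetical names; vdev_raidz_matrix_reconstruct() below streams
 * the same computation using precomputed logs), where coeff[i] is the
 * nonzero inverse-row entry for surviving column i:
 *
 *	for (x = 0; x < nbytes; x++) {
 *		uint8_t v = 0;
 *		for (i = 0; i < n; i++)
 *			v ^= vdev_raidz_exp2(surviving[i][x],
 *			    vdev_raidz_log2[coeff[i]]);
 *		missing[x] = v;
 *	}
 */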
1 0 0 0 0 0 0 0 1 0 0 | 1664 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1665 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1666 * ~~ ~~ 1667 * __ __ 1668 * | 0 0 1 0 0 0 0 0 | 1669 * | 167 100 5 41 159 169 217 208 | 1670 * | 166 100 4 40 158 168 216 209 | 1671 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 1672 * | 0 0 0 0 1 0 0 0 | 1673 * | 0 0 0 0 0 1 0 0 | 1674 * | 0 0 0 0 0 0 1 0 | 1675 * | 0 0 0 0 0 0 0 1 | 1676 * ~~ ~~ 1677 * 1678 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 1679 * of the missing data. 1680 * 1681 * As is apparent from the example above, the only non-trivial rows in the 1682 * inverse matrix correspond to the data disks that we're trying to 1683 * reconstruct. Indeed, those are the only rows we need as the others would 1684 * only be useful for reconstructing data known or assumed to be valid. For 1685 * that reason, we only build the coefficients in the rows that correspond to 1686 * targeted columns. 1687 */ 1688 1689 static void 1690 vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map, 1691 uint8_t **rows) 1692 { 1693 int i, j; 1694 int pow; 1695 1696 ASSERT(n == rr->rr_cols - rr->rr_firstdatacol); 1697 1698 /* 1699 * Fill in the missing rows of interest. 1700 */ 1701 for (i = 0; i < nmap; i++) { 1702 ASSERT3S(0, <=, map[i]); 1703 ASSERT3S(map[i], <=, 2); 1704 1705 pow = map[i] * n; 1706 if (pow > 255) 1707 pow -= 255; 1708 ASSERT(pow <= 255); 1709 1710 for (j = 0; j < n; j++) { 1711 pow -= map[i]; 1712 if (pow < 0) 1713 pow += 255; 1714 rows[i][j] = vdev_raidz_pow2[pow]; 1715 } 1716 } 1717 } 1718 1719 static void 1720 vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing, 1721 uint8_t **rows, uint8_t **invrows, const uint8_t *used) 1722 { 1723 int i, j, ii, jj; 1724 uint8_t log; 1725 1726 /* 1727 * Assert that the first nmissing entries from the array of used 1728 * columns correspond to parity columns and that subsequent entries 1729 * correspond to data columns. 1730 */ 1731 for (i = 0; i < nmissing; i++) { 1732 ASSERT3S(used[i], <, rr->rr_firstdatacol); 1733 } 1734 for (; i < n; i++) { 1735 ASSERT3S(used[i], >=, rr->rr_firstdatacol); 1736 } 1737 1738 /* 1739 * First initialize the storage where we'll compute the inverse rows. 1740 */ 1741 for (i = 0; i < nmissing; i++) { 1742 for (j = 0; j < n; j++) { 1743 invrows[i][j] = (i == j) ? 1 : 0; 1744 } 1745 } 1746 1747 /* 1748 * Subtract all trivial rows from the rows of consequence. 1749 */ 1750 for (i = 0; i < nmissing; i++) { 1751 for (j = nmissing; j < n; j++) { 1752 ASSERT3U(used[j], >=, rr->rr_firstdatacol); 1753 jj = used[j] - rr->rr_firstdatacol; 1754 ASSERT3S(jj, <, n); 1755 invrows[i][j] = rows[i][jj]; 1756 rows[i][jj] = 0; 1757 } 1758 } 1759 1760 /* 1761 * For each of the rows of interest, we must normalize it and subtract 1762 * a multiple of it from the other rows. 1763 */ 1764 for (i = 0; i < nmissing; i++) { 1765 for (j = 0; j < missing[i]; j++) { 1766 ASSERT0(rows[i][j]); 1767 } 1768 ASSERT3U(rows[i][missing[i]], !=, 0); 1769 1770 /* 1771 * Compute the inverse of the first element and multiply each 1772 * element in the row by that value. 
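 * Taking 255 - log2(pivot) gives the exponent of the pivot's
 * multiplicative inverse, so the vdev_raidz_exp2() calls below scale
 * both the row and its invrows[] counterpart by that inverse,
 * leaving a 1 in the pivot position.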
1773 */ 1774 log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 1775 1776 for (j = 0; j < n; j++) { 1777 rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 1778 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 1779 } 1780 1781 for (ii = 0; ii < nmissing; ii++) { 1782 if (i == ii) 1783 continue; 1784 1785 ASSERT3U(rows[ii][missing[i]], !=, 0); 1786 1787 log = vdev_raidz_log2[rows[ii][missing[i]]]; 1788 1789 for (j = 0; j < n; j++) { 1790 rows[ii][j] ^= 1791 vdev_raidz_exp2(rows[i][j], log); 1792 invrows[ii][j] ^= 1793 vdev_raidz_exp2(invrows[i][j], log); 1794 } 1795 } 1796 } 1797 1798 /* 1799 * Verify that the data that is left in the rows are properly part of 1800 * an identity matrix. 1801 */ 1802 for (i = 0; i < nmissing; i++) { 1803 for (j = 0; j < n; j++) { 1804 if (j == missing[i]) { 1805 ASSERT3U(rows[i][j], ==, 1); 1806 } else { 1807 ASSERT0(rows[i][j]); 1808 } 1809 } 1810 } 1811 } 1812 1813 static void 1814 vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing, 1815 int *missing, uint8_t **invrows, const uint8_t *used) 1816 { 1817 int i, j, x, cc, c; 1818 uint8_t *src; 1819 uint64_t ccount; 1820 uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL }; 1821 uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 }; 1822 uint8_t log = 0; 1823 uint8_t val; 1824 int ll; 1825 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 1826 uint8_t *p, *pp; 1827 size_t psize; 1828 1829 psize = sizeof (invlog[0][0]) * n * nmissing; 1830 p = kmem_alloc(psize, KM_SLEEP); 1831 1832 for (pp = p, i = 0; i < nmissing; i++) { 1833 invlog[i] = pp; 1834 pp += n; 1835 } 1836 1837 for (i = 0; i < nmissing; i++) { 1838 for (j = 0; j < n; j++) { 1839 ASSERT3U(invrows[i][j], !=, 0); 1840 invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 1841 } 1842 } 1843 1844 for (i = 0; i < n; i++) { 1845 c = used[i]; 1846 ASSERT3U(c, <, rr->rr_cols); 1847 1848 ccount = rr->rr_col[c].rc_size; 1849 ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0); 1850 if (ccount == 0) 1851 continue; 1852 src = abd_to_buf(rr->rr_col[c].rc_abd); 1853 for (j = 0; j < nmissing; j++) { 1854 cc = missing[j] + rr->rr_firstdatacol; 1855 ASSERT3U(cc, >=, rr->rr_firstdatacol); 1856 ASSERT3U(cc, <, rr->rr_cols); 1857 ASSERT3U(cc, !=, c); 1858 1859 dcount[j] = rr->rr_col[cc].rc_size; 1860 if (dcount[j] != 0) 1861 dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd); 1862 } 1863 1864 for (x = 0; x < ccount; x++, src++) { 1865 if (*src != 0) 1866 log = vdev_raidz_log2[*src]; 1867 1868 for (cc = 0; cc < nmissing; cc++) { 1869 if (x >= dcount[cc]) 1870 continue; 1871 1872 if (*src == 0) { 1873 val = 0; 1874 } else { 1875 if ((ll = log + invlog[cc][i]) >= 255) 1876 ll -= 255; 1877 val = vdev_raidz_pow2[ll]; 1878 } 1879 1880 if (i == 0) 1881 dst[cc][x] = val; 1882 else 1883 dst[cc][x] ^= val; 1884 } 1885 } 1886 } 1887 1888 kmem_free(p, psize); 1889 } 1890 1891 static void 1892 vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) 1893 { 1894 int i, c, t, tt; 1895 unsigned int n; 1896 unsigned int nmissing_rows; 1897 int missing_rows[VDEV_RAIDZ_MAXPARITY]; 1898 int parity_map[VDEV_RAIDZ_MAXPARITY]; 1899 uint8_t *p, *pp; 1900 size_t psize; 1901 uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 1902 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 1903 uint8_t *used; 1904 1905 abd_t **bufs = NULL; 1906 1907 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1908 zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts); 1909 /* 1910 * Matrix reconstruction can't use scatter ABDs yet, so we allocate 1911 * temporary linear ABDs if any non-linear ABDs are found. 
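 * The original ABDs are stashed in bufs[] so that, once the matrix
 * reconstruction is finished, the rebuilt data can be copied back
 * into them and the temporary linear copies freed.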
1912 */ 1913 for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) { 1914 ASSERT(rr->rr_col[i].rc_abd != NULL); 1915 if (!abd_is_linear(rr->rr_col[i].rc_abd)) { 1916 bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *), 1917 KM_PUSHPAGE); 1918 1919 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1920 raidz_col_t *col = &rr->rr_col[c]; 1921 1922 bufs[c] = col->rc_abd; 1923 if (bufs[c] != NULL) { 1924 col->rc_abd = abd_alloc_linear( 1925 col->rc_size, B_TRUE); 1926 abd_copy(col->rc_abd, bufs[c], 1927 col->rc_size); 1928 } 1929 } 1930 1931 break; 1932 } 1933 } 1934 1935 n = rr->rr_cols - rr->rr_firstdatacol; 1936 1937 /* 1938 * Figure out which data columns are missing. 1939 */ 1940 nmissing_rows = 0; 1941 for (t = 0; t < ntgts; t++) { 1942 if (tgts[t] >= rr->rr_firstdatacol) { 1943 missing_rows[nmissing_rows++] = 1944 tgts[t] - rr->rr_firstdatacol; 1945 } 1946 } 1947 1948 /* 1949 * Figure out which parity columns to use to help generate the missing 1950 * data columns. 1951 */ 1952 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 1953 ASSERT(tt < ntgts); 1954 ASSERT(c < rr->rr_firstdatacol); 1955 1956 /* 1957 * Skip any targeted parity columns. 1958 */ 1959 if (c == tgts[tt]) { 1960 tt++; 1961 continue; 1962 } 1963 1964 parity_map[i] = c; 1965 i++; 1966 } 1967 1968 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1969 nmissing_rows * n + sizeof (used[0]) * n; 1970 p = kmem_alloc(psize, KM_SLEEP); 1971 1972 for (pp = p, i = 0; i < nmissing_rows; i++) { 1973 rows[i] = pp; 1974 pp += n; 1975 invrows[i] = pp; 1976 pp += n; 1977 } 1978 used = pp; 1979 1980 for (i = 0; i < nmissing_rows; i++) { 1981 used[i] = parity_map[i]; 1982 } 1983 1984 for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1985 if (tt < nmissing_rows && 1986 c == missing_rows[tt] + rr->rr_firstdatacol) { 1987 tt++; 1988 continue; 1989 } 1990 1991 ASSERT3S(i, <, n); 1992 used[i] = c; 1993 i++; 1994 } 1995 1996 /* 1997 * Initialize the interesting rows of the matrix. 1998 */ 1999 vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows); 2000 2001 /* 2002 * Invert the matrix. 2003 */ 2004 vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows, 2005 invrows, used); 2006 2007 /* 2008 * Reconstruct the missing data using the generated matrix. 
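 * This applies the non-trivial rows of the inverse, byte by byte,
 * to the surviving data and parity columns listed in used[].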
2009 */ 2010 vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows, 2011 invrows, used); 2012 2013 kmem_free(p, psize); 2014 2015 /* 2016 * copy back from temporary linear abds and free them 2017 */ 2018 if (bufs) { 2019 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 2020 raidz_col_t *col = &rr->rr_col[c]; 2021 2022 if (bufs[c] != NULL) { 2023 abd_copy(bufs[c], col->rc_abd, col->rc_size); 2024 abd_free(col->rc_abd); 2025 } 2026 col->rc_abd = bufs[c]; 2027 } 2028 kmem_free(bufs, rr->rr_cols * sizeof (abd_t *)); 2029 } 2030 } 2031 2032 static void 2033 vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr, 2034 const int *t, int nt) 2035 { 2036 int tgts[VDEV_RAIDZ_MAXPARITY], *dt; 2037 int ntgts; 2038 int i, c, ret; 2039 int nbadparity, nbaddata; 2040 int parity_valid[VDEV_RAIDZ_MAXPARITY]; 2041 2042 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { 2043 zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)", 2044 rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata, 2045 (int)rr->rr_missingparity); 2046 } 2047 2048 nbadparity = rr->rr_firstdatacol; 2049 nbaddata = rr->rr_cols - nbadparity; 2050 ntgts = 0; 2051 for (i = 0, c = 0; c < rr->rr_cols; c++) { 2052 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { 2053 zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u " 2054 "offset=%llx error=%u)", 2055 rr, c, (int)rr->rr_col[c].rc_devidx, 2056 (long long)rr->rr_col[c].rc_offset, 2057 (int)rr->rr_col[c].rc_error); 2058 } 2059 if (c < rr->rr_firstdatacol) 2060 parity_valid[c] = B_FALSE; 2061 2062 if (i < nt && c == t[i]) { 2063 tgts[ntgts++] = c; 2064 i++; 2065 } else if (rr->rr_col[c].rc_error != 0) { 2066 tgts[ntgts++] = c; 2067 } else if (c >= rr->rr_firstdatacol) { 2068 nbaddata--; 2069 } else { 2070 parity_valid[c] = B_TRUE; 2071 nbadparity--; 2072 } 2073 } 2074 2075 ASSERT(ntgts >= nt); 2076 ASSERT(nbaddata >= 0); 2077 ASSERT(nbaddata + nbadparity == ntgts); 2078 2079 dt = &tgts[nbadparity]; 2080 2081 /* Reconstruct using the new math implementation */ 2082 ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata); 2083 if (ret != RAIDZ_ORIGINAL_IMPL) 2084 return; 2085 2086 /* 2087 * See if we can use any of our optimized reconstruction routines. 
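 * A single missing data column can be rebuilt from P alone (or from
 * Q when P is also damaged); two missing data columns need both P
 * and Q.  Anything else, including any case that must rely on R,
 * falls through to the general matrix reconstruction below.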
2088 */ 2089 switch (nbaddata) { 2090 case 1: 2091 if (parity_valid[VDEV_RAIDZ_P]) { 2092 vdev_raidz_reconstruct_p(rr, dt, 1); 2093 return; 2094 } 2095 2096 ASSERT(rr->rr_firstdatacol > 1); 2097 2098 if (parity_valid[VDEV_RAIDZ_Q]) { 2099 vdev_raidz_reconstruct_q(rr, dt, 1); 2100 return; 2101 } 2102 2103 ASSERT(rr->rr_firstdatacol > 2); 2104 break; 2105 2106 case 2: 2107 ASSERT(rr->rr_firstdatacol > 1); 2108 2109 if (parity_valid[VDEV_RAIDZ_P] && 2110 parity_valid[VDEV_RAIDZ_Q]) { 2111 vdev_raidz_reconstruct_pq(rr, dt, 2); 2112 return; 2113 } 2114 2115 ASSERT(rr->rr_firstdatacol > 2); 2116 2117 break; 2118 } 2119 2120 vdev_raidz_reconstruct_general(rr, tgts, ntgts); 2121 } 2122 2123 static int 2124 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, 2125 uint64_t *logical_ashift, uint64_t *physical_ashift) 2126 { 2127 vdev_raidz_t *vdrz = vd->vdev_tsd; 2128 uint64_t nparity = vdrz->vd_nparity; 2129 int c; 2130 int lasterror = 0; 2131 int numerrors = 0; 2132 2133 ASSERT(nparity > 0); 2134 2135 if (nparity > VDEV_RAIDZ_MAXPARITY || 2136 vd->vdev_children < nparity + 1) { 2137 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 2138 return (SET_ERROR(EINVAL)); 2139 } 2140 2141 vdev_open_children(vd); 2142 2143 for (c = 0; c < vd->vdev_children; c++) { 2144 vdev_t *cvd = vd->vdev_child[c]; 2145 2146 if (cvd->vdev_open_error != 0) { 2147 lasterror = cvd->vdev_open_error; 2148 numerrors++; 2149 continue; 2150 } 2151 2152 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 2153 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; 2154 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); 2155 } 2156 for (c = 0; c < vd->vdev_children; c++) { 2157 vdev_t *cvd = vd->vdev_child[c]; 2158 2159 if (cvd->vdev_open_error != 0) 2160 continue; 2161 *physical_ashift = vdev_best_ashift(*logical_ashift, 2162 *physical_ashift, cvd->vdev_physical_ashift); 2163 } 2164 2165 if (vd->vdev_rz_expanding) { 2166 *asize *= vd->vdev_children - 1; 2167 *max_asize *= vd->vdev_children - 1; 2168 2169 vd->vdev_min_asize = *asize; 2170 } else { 2171 *asize *= vd->vdev_children; 2172 *max_asize *= vd->vdev_children; 2173 } 2174 2175 if (numerrors > nparity) { 2176 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 2177 return (lasterror); 2178 } 2179 2180 return (0); 2181 } 2182 2183 static void 2184 vdev_raidz_close(vdev_t *vd) 2185 { 2186 for (int c = 0; c < vd->vdev_children; c++) { 2187 if (vd->vdev_child[c] != NULL) 2188 vdev_close(vd->vdev_child[c]); 2189 } 2190 } 2191 2192 /* 2193 * Return the logical width to use, given the txg in which the allocation 2194 * happened. Note that BP_GET_BIRTH() is usually the txg in which the 2195 * BP was allocated. Remapped BP's (that were relocated due to device 2196 * removal, see remap_blkptr_cb()), will have a more recent physical birth 2197 * which reflects when the BP was relocated, but we can ignore these because 2198 * they can't be on RAIDZ (device removal doesn't support RAIDZ). 
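 * Each node in vd_expand_txgs records the txg at which a completed
 * expansion's new logical width took effect, so we want the entry
 * with the largest re_txg that is <= txg.  Blocks born before any
 * recorded expansion get the original (pre-expansion) width.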
2199 */ 2200 static uint64_t 2201 vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg) 2202 { 2203 reflow_node_t lookup = { 2204 .re_txg = txg, 2205 }; 2206 avl_index_t where; 2207 2208 uint64_t width; 2209 mutex_enter(&vdrz->vd_expand_lock); 2210 reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where); 2211 if (re != NULL) { 2212 width = re->re_logical_width; 2213 } else { 2214 re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE); 2215 if (re != NULL) 2216 width = re->re_logical_width; 2217 else 2218 width = vdrz->vd_original_width; 2219 } 2220 mutex_exit(&vdrz->vd_expand_lock); 2221 return (width); 2222 } 2223 2224 /* 2225 * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated 2226 * more space due to the lower data-to-parity ratio. In this case it's 2227 * important to pass in the correct txg. Note that vdev_gang_header_asize() 2228 * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE, 2229 * regardless of txg. This is assured because for a single data sector, we 2230 * allocate P+1 sectors regardless of width ("cols", which is at least P+1). 2231 */ 2232 static uint64_t 2233 vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg) 2234 { 2235 vdev_raidz_t *vdrz = vd->vdev_tsd; 2236 uint64_t asize; 2237 uint64_t ashift = vd->vdev_top->vdev_ashift; 2238 uint64_t cols = vdrz->vd_original_width; 2239 uint64_t nparity = vdrz->vd_nparity; 2240 2241 cols = vdev_raidz_get_logical_width(vdrz, txg); 2242 2243 asize = ((psize - 1) >> ashift) + 1; 2244 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 2245 asize = roundup(asize, nparity + 1) << ashift; 2246 2247 #ifdef ZFS_DEBUG 2248 uint64_t asize_new = ((psize - 1) >> ashift) + 1; 2249 uint64_t ncols_new = vdrz->vd_physical_width; 2250 asize_new += nparity * ((asize_new + ncols_new - nparity - 1) / 2251 (ncols_new - nparity)); 2252 asize_new = roundup(asize_new, nparity + 1) << ashift; 2253 VERIFY3U(asize_new, <=, asize); 2254 #endif 2255 2256 return (asize); 2257 } 2258 2259 /* 2260 * The allocatable space for a raidz vdev is N * sizeof(smallest child) 2261 * so each child must provide at least 1/Nth of its asize. 2262 */ 2263 static uint64_t 2264 vdev_raidz_min_asize(vdev_t *vd) 2265 { 2266 return ((vd->vdev_min_asize + vd->vdev_children - 1) / 2267 vd->vdev_children); 2268 } 2269 2270 void 2271 vdev_raidz_child_done(zio_t *zio) 2272 { 2273 raidz_col_t *rc = zio->io_private; 2274 2275 ASSERT3P(rc->rc_abd, !=, NULL); 2276 rc->rc_error = zio->io_error; 2277 rc->rc_tried = 1; 2278 rc->rc_skipped = 0; 2279 } 2280 2281 static void 2282 vdev_raidz_shadow_child_done(zio_t *zio) 2283 { 2284 raidz_col_t *rc = zio->io_private; 2285 2286 rc->rc_shadow_error = zio->io_error; 2287 } 2288 2289 static void 2290 vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col) 2291 { 2292 (void) rm; 2293 #ifdef ZFS_DEBUG 2294 range_seg64_t logical_rs, physical_rs, remain_rs; 2295 logical_rs.rs_start = rr->rr_offset; 2296 logical_rs.rs_end = logical_rs.rs_start + 2297 vdev_raidz_asize(zio->io_vd, rr->rr_size, 2298 BP_GET_BIRTH(zio->io_bp)); 2299 2300 raidz_col_t *rc = &rr->rr_col[col]; 2301 vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; 2302 2303 vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs); 2304 ASSERT(vdev_xlate_is_empty(&remain_rs)); 2305 if (vdev_xlate_is_empty(&physical_rs)) { 2306 /* 2307 * If we are in the middle of expansion, the 2308 * physical->logical mapping is changing so vdev_xlate() 2309 * can't give us a reliable answer. 
2310 */ 2311 return; 2312 } 2313 ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); 2314 ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); 2315 /* 2316 * It would be nice to assert that rs_end is equal 2317 * to rc_offset + rc_size but there might be an 2318 * optional I/O at the end that is not accounted in 2319 * rc_size. 2320 */ 2321 if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { 2322 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + 2323 rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift)); 2324 } else { 2325 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); 2326 } 2327 #endif 2328 } 2329 2330 static void 2331 vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr) 2332 { 2333 vdev_t *vd = zio->io_vd; 2334 raidz_map_t *rm = zio->io_vsd; 2335 2336 vdev_raidz_generate_parity_row(rm, rr); 2337 2338 for (int c = 0; c < rr->rr_scols; c++) { 2339 raidz_col_t *rc = &rr->rr_col[c]; 2340 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 2341 2342 /* Verify physical to logical translation */ 2343 vdev_raidz_io_verify(zio, rm, rr, c); 2344 2345 if (rc->rc_size == 0) 2346 continue; 2347 2348 ASSERT3U(rc->rc_offset + rc->rc_size, <, 2349 cvd->vdev_psize - VDEV_LABEL_END_SIZE); 2350 2351 ASSERT3P(rc->rc_abd, !=, NULL); 2352 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2353 rc->rc_offset, rc->rc_abd, 2354 abd_get_size(rc->rc_abd), zio->io_type, 2355 zio->io_priority, 0, vdev_raidz_child_done, rc)); 2356 2357 if (rc->rc_shadow_devidx != INT_MAX) { 2358 vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx]; 2359 2360 ASSERT3U( 2361 rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <, 2362 cvd2->vdev_psize - VDEV_LABEL_END_SIZE); 2363 2364 zio_nowait(zio_vdev_child_io(zio, NULL, cvd2, 2365 rc->rc_shadow_offset, rc->rc_abd, 2366 abd_get_size(rc->rc_abd), 2367 zio->io_type, zio->io_priority, 0, 2368 vdev_raidz_shadow_child_done, rc)); 2369 } 2370 } 2371 } 2372 2373 /* 2374 * Generate optional I/Os for skip sectors to improve aggregation contiguity. 2375 * This only works for vdev_raidz_map_alloc() (not _expanded()). 2376 */ 2377 static void 2378 raidz_start_skip_writes(zio_t *zio) 2379 { 2380 vdev_t *vd = zio->io_vd; 2381 uint64_t ashift = vd->vdev_top->vdev_ashift; 2382 raidz_map_t *rm = zio->io_vsd; 2383 ASSERT3U(rm->rm_nrows, ==, 1); 2384 raidz_row_t *rr = rm->rm_row[0]; 2385 for (int c = 0; c < rr->rr_scols; c++) { 2386 raidz_col_t *rc = &rr->rr_col[c]; 2387 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 2388 if (rc->rc_size != 0) 2389 continue; 2390 ASSERT3P(rc->rc_abd, ==, NULL); 2391 2392 ASSERT3U(rc->rc_offset, <, 2393 cvd->vdev_psize - VDEV_LABEL_END_SIZE); 2394 2395 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, 2396 NULL, 1ULL << ashift, zio->io_type, zio->io_priority, 2397 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); 2398 } 2399 } 2400 2401 static void 2402 vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) 2403 { 2404 vdev_t *vd = zio->io_vd; 2405 2406 /* 2407 * Iterate over the columns in reverse order so that we hit the parity 2408 * last -- any errors along the way will force us to read the parity. 
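 * Columns 0 .. rr_firstdatacol-1 hold parity, so walking from
 * rr_cols-1 down to 0 visits every data column before any parity
 * column.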
2409 */ 2410 for (int c = rr->rr_cols - 1; c >= 0; c--) { 2411 raidz_col_t *rc = &rr->rr_col[c]; 2412 if (rc->rc_size == 0) 2413 continue; 2414 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 2415 if (!vdev_readable(cvd)) { 2416 if (c >= rr->rr_firstdatacol) 2417 rr->rr_missingdata++; 2418 else 2419 rr->rr_missingparity++; 2420 rc->rc_error = SET_ERROR(ENXIO); 2421 rc->rc_tried = 1; /* don't even try */ 2422 rc->rc_skipped = 1; 2423 continue; 2424 } 2425 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 2426 if (c >= rr->rr_firstdatacol) 2427 rr->rr_missingdata++; 2428 else 2429 rr->rr_missingparity++; 2430 rc->rc_error = SET_ERROR(ESTALE); 2431 rc->rc_skipped = 1; 2432 continue; 2433 } 2434 if (forceparity || 2435 c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || 2436 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { 2437 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2438 rc->rc_offset, rc->rc_abd, rc->rc_size, 2439 zio->io_type, zio->io_priority, 0, 2440 vdev_raidz_child_done, rc)); 2441 } 2442 } 2443 } 2444 2445 static void 2446 vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm) 2447 { 2448 vdev_t *vd = zio->io_vd; 2449 2450 for (int i = 0; i < rm->rm_nphys_cols; i++) { 2451 raidz_col_t *prc = &rm->rm_phys_col[i]; 2452 if (prc->rc_size == 0) 2453 continue; 2454 2455 ASSERT3U(prc->rc_devidx, ==, i); 2456 vdev_t *cvd = vd->vdev_child[i]; 2457 if (!vdev_readable(cvd)) { 2458 prc->rc_error = SET_ERROR(ENXIO); 2459 prc->rc_tried = 1; /* don't even try */ 2460 prc->rc_skipped = 1; 2461 continue; 2462 } 2463 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 2464 prc->rc_error = SET_ERROR(ESTALE); 2465 prc->rc_skipped = 1; 2466 continue; 2467 } 2468 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2469 prc->rc_offset, prc->rc_abd, prc->rc_size, 2470 zio->io_type, zio->io_priority, 0, 2471 vdev_raidz_child_done, prc)); 2472 } 2473 } 2474 2475 static void 2476 vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm) 2477 { 2478 /* 2479 * If there are multiple rows, we will be hitting 2480 * all disks, so go ahead and read the parity so 2481 * that we are reading in decent size chunks. 2482 */ 2483 boolean_t forceparity = rm->rm_nrows > 1; 2484 2485 if (rm->rm_phys_col) { 2486 vdev_raidz_io_start_read_phys_cols(zio, rm); 2487 } else { 2488 for (int i = 0; i < rm->rm_nrows; i++) { 2489 raidz_row_t *rr = rm->rm_row[i]; 2490 vdev_raidz_io_start_read_row(zio, rr, forceparity); 2491 } 2492 } 2493 } 2494 2495 /* 2496 * Start an IO operation on a RAIDZ VDev 2497 * 2498 * Outline: 2499 * - For write operations: 2500 * 1. Generate the parity data 2501 * 2. Create child zio write operations to each column's vdev, for both 2502 * data and parity. 2503 * 3. If the column skips any sectors for padding, create optional dummy 2504 * write zio children for those areas to improve aggregation continuity. 2505 * - For read operations: 2506 * 1. Create child zio read operations to each data column's vdev to read 2507 * the range of data required for zio. 2508 * 2. If this is a scrub or resilver operation, or if any of the data 2509 * vdevs have had errors, then create zio read operations to the parity 2510 * columns' VDevs as well. 
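 * For both reads and writes, if the block predates the most recent
 * expansion, the raidz map is built by
 * vdev_raidz_map_alloc_expanded(); while the reflow is still
 * copying, the affected range is also held under the expansion
 * rangelock, which is released in vdev_raidz_io_done().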
2511 */ 2512 static void 2513 vdev_raidz_io_start(zio_t *zio) 2514 { 2515 vdev_t *vd = zio->io_vd; 2516 vdev_t *tvd = vd->vdev_top; 2517 vdev_raidz_t *vdrz = vd->vdev_tsd; 2518 raidz_map_t *rm; 2519 2520 uint64_t logical_width = vdev_raidz_get_logical_width(vdrz, 2521 BP_GET_BIRTH(zio->io_bp)); 2522 if (logical_width != vdrz->vd_physical_width) { 2523 zfs_locked_range_t *lr = NULL; 2524 uint64_t synced_offset = UINT64_MAX; 2525 uint64_t next_offset = UINT64_MAX; 2526 boolean_t use_scratch = B_FALSE; 2527 /* 2528 * Note: when the expansion is completing, we set 2529 * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync()) 2530 * in a later txg than when we last update spa_ubsync's state 2531 * (see the end of spa_raidz_expand_thread()). Therefore we 2532 * may see vre_state!=SCANNING before 2533 * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected 2534 * on disk, but the copying progress has been synced to disk 2535 * (and reflected in spa_ubsync). In this case it's fine to 2536 * treat the expansion as completed, since if we crash there's 2537 * no additional copying to do. 2538 */ 2539 if (vdrz->vn_vre.vre_state == DSS_SCANNING) { 2540 ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==, 2541 &vdrz->vn_vre); 2542 lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock, 2543 zio->io_offset, zio->io_size, RL_READER); 2544 use_scratch = 2545 (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) == 2546 RRSS_SCRATCH_VALID); 2547 synced_offset = 2548 RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync); 2549 next_offset = vdrz->vn_vre.vre_offset; 2550 /* 2551 * If we haven't resumed expanding since importing the 2552 * pool, vre_offset won't have been set yet. In 2553 * this case the next offset to be copied is the same 2554 * as what was synced. 2555 */ 2556 if (next_offset == UINT64_MAX) { 2557 next_offset = synced_offset; 2558 } 2559 } 2560 if (use_scratch) { 2561 zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced=" 2562 "%lld next_offset=%lld use_scratch=%u", 2563 zio, 2564 zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ", 2565 (long long)zio->io_offset, 2566 (long long)synced_offset, 2567 (long long)next_offset, 2568 use_scratch); 2569 } 2570 2571 rm = vdev_raidz_map_alloc_expanded(zio, 2572 tvd->vdev_ashift, vdrz->vd_physical_width, 2573 logical_width, vdrz->vd_nparity, 2574 synced_offset, next_offset, use_scratch); 2575 rm->rm_lr = lr; 2576 } else { 2577 rm = vdev_raidz_map_alloc(zio, 2578 tvd->vdev_ashift, logical_width, vdrz->vd_nparity); 2579 } 2580 rm->rm_original_width = vdrz->vd_original_width; 2581 2582 zio->io_vsd = rm; 2583 zio->io_vsd_ops = &vdev_raidz_vsd_ops; 2584 if (zio->io_type == ZIO_TYPE_WRITE) { 2585 for (int i = 0; i < rm->rm_nrows; i++) { 2586 vdev_raidz_io_start_write(zio, rm->rm_row[i]); 2587 } 2588 2589 if (logical_width == vdrz->vd_physical_width) { 2590 raidz_start_skip_writes(zio); 2591 } 2592 } else { 2593 ASSERT(zio->io_type == ZIO_TYPE_READ); 2594 vdev_raidz_io_start_read(zio, rm); 2595 } 2596 2597 zio_execute(zio); 2598 } 2599 2600 /* 2601 * Report a checksum error for a child of a RAID-Z device. 
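 * The child's checksum error count is incremented and an ereport is
 * posted, unless the i/o is speculative or is part of a sequential
 * rebuild (which has no block checksum to compare against).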
2602 */ 2603 void 2604 vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) 2605 { 2606 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 2607 2608 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) && 2609 zio->io_priority != ZIO_PRIORITY_REBUILD) { 2610 zio_bad_cksum_t zbc; 2611 raidz_map_t *rm = zio->io_vsd; 2612 2613 zbc.zbc_has_cksum = 0; 2614 zbc.zbc_injected = rm->rm_ecksuminjected; 2615 2616 mutex_enter(&vd->vdev_stat_lock); 2617 vd->vdev_stat.vs_checksum_errors++; 2618 mutex_exit(&vd->vdev_stat_lock); 2619 (void) zfs_ereport_post_checksum(zio->io_spa, vd, 2620 &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, 2621 rc->rc_abd, bad_data, &zbc); 2622 } 2623 } 2624 2625 /* 2626 * We keep track of whether or not there were any injected errors, so that 2627 * any ereports we generate can note it. 2628 */ 2629 static int 2630 raidz_checksum_verify(zio_t *zio) 2631 { 2632 zio_bad_cksum_t zbc = {0}; 2633 raidz_map_t *rm = zio->io_vsd; 2634 2635 int ret = zio_checksum_error(zio, &zbc); 2636 if (ret != 0 && zbc.zbc_injected != 0) 2637 rm->rm_ecksuminjected = 1; 2638 2639 return (ret); 2640 } 2641 2642 /* 2643 * Generate the parity from the data columns. If we tried and were able to 2644 * read the parity without error, verify that the generated parity matches the 2645 * data we read. If it doesn't, we fire off a checksum error. Return the 2646 * number of such failures. 2647 */ 2648 static int 2649 raidz_parity_verify(zio_t *zio, raidz_row_t *rr) 2650 { 2651 abd_t *orig[VDEV_RAIDZ_MAXPARITY]; 2652 int c, ret = 0; 2653 raidz_map_t *rm = zio->io_vsd; 2654 raidz_col_t *rc; 2655 2656 blkptr_t *bp = zio->io_bp; 2657 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : 2658 (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); 2659 2660 if (checksum == ZIO_CHECKSUM_NOPARITY) 2661 return (ret); 2662 2663 for (c = 0; c < rr->rr_firstdatacol; c++) { 2664 rc = &rr->rr_col[c]; 2665 if (!rc->rc_tried || rc->rc_error != 0) 2666 continue; 2667 2668 orig[c] = rc->rc_abd; 2669 ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size); 2670 rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); 2671 } 2672 2673 /* 2674 * Verify any empty sectors are zero filled to ensure the parity 2675 * is calculated correctly even if these non-data sectors are damaged. 2676 */ 2677 if (rr->rr_nempty && rr->rr_abd_empty != NULL) 2678 ret += vdev_draid_map_verify_empty(zio, rr); 2679 2680 /* 2681 * Regenerates parity even for !tried||rc_error!=0 columns. This 2682 * isn't harmful but it does have the side effect of fixing stuff 2683 * we didn't realize was necessary (i.e. even if we return 0). 
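 * The saved copies in orig[] are compared against the regenerated
 * parity below; every mismatch is reported through
 * vdev_raidz_checksum_error() and counted in the return value.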
2684 */ 2685 vdev_raidz_generate_parity_row(rm, rr); 2686 2687 for (c = 0; c < rr->rr_firstdatacol; c++) { 2688 rc = &rr->rr_col[c]; 2689 2690 if (!rc->rc_tried || rc->rc_error != 0) 2691 continue; 2692 2693 if (abd_cmp(orig[c], rc->rc_abd) != 0) { 2694 zfs_dbgmsg("found error on col=%u devidx=%u off %llx", 2695 c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset); 2696 vdev_raidz_checksum_error(zio, rc, orig[c]); 2697 rc->rc_error = SET_ERROR(ECKSUM); 2698 ret++; 2699 } 2700 abd_free(orig[c]); 2701 } 2702 2703 return (ret); 2704 } 2705 2706 static int 2707 vdev_raidz_worst_error(raidz_row_t *rr) 2708 { 2709 int error = 0; 2710 2711 for (int c = 0; c < rr->rr_cols; c++) { 2712 error = zio_worst_error(error, rr->rr_col[c].rc_error); 2713 error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error); 2714 } 2715 2716 return (error); 2717 } 2718 2719 static void 2720 vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) 2721 { 2722 int unexpected_errors = 0; 2723 int parity_errors = 0; 2724 int parity_untried = 0; 2725 int data_errors = 0; 2726 2727 ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); 2728 2729 for (int c = 0; c < rr->rr_cols; c++) { 2730 raidz_col_t *rc = &rr->rr_col[c]; 2731 2732 if (rc->rc_error) { 2733 if (c < rr->rr_firstdatacol) 2734 parity_errors++; 2735 else 2736 data_errors++; 2737 2738 if (!rc->rc_skipped) 2739 unexpected_errors++; 2740 } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { 2741 parity_untried++; 2742 } 2743 2744 if (rc->rc_force_repair) 2745 unexpected_errors++; 2746 } 2747 2748 /* 2749 * If we read more parity disks than were used for 2750 * reconstruction, confirm that the other parity disks produced 2751 * correct data. 2752 * 2753 * Note that we also regenerate parity when resilvering so we 2754 * can write it out to failed devices later. 2755 */ 2756 if (parity_errors + parity_untried < 2757 rr->rr_firstdatacol - data_errors || 2758 (zio->io_flags & ZIO_FLAG_RESILVER)) { 2759 int n = raidz_parity_verify(zio, rr); 2760 unexpected_errors += n; 2761 } 2762 2763 if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 2764 (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) { 2765 /* 2766 * Use the good data we have in hand to repair damaged children. 2767 */ 2768 for (int c = 0; c < rr->rr_cols; c++) { 2769 raidz_col_t *rc = &rr->rr_col[c]; 2770 vdev_t *vd = zio->io_vd; 2771 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 2772 2773 if (!rc->rc_allow_repair) { 2774 continue; 2775 } else if (!rc->rc_force_repair && 2776 (rc->rc_error == 0 || rc->rc_size == 0)) { 2777 continue; 2778 } 2779 2780 zfs_dbgmsg("zio=%px repairing c=%u devidx=%u " 2781 "offset=%llx", 2782 zio, c, rc->rc_devidx, (long long)rc->rc_offset); 2783 2784 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2785 rc->rc_offset, rc->rc_abd, rc->rc_size, 2786 ZIO_TYPE_WRITE, 2787 zio->io_priority == ZIO_PRIORITY_REBUILD ? 2788 ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE, 2789 ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 2790 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); 2791 } 2792 } 2793 2794 /* 2795 * Scrub or resilver i/o's: overwrite any shadow locations with the 2796 * good data. This ensures that if we've already copied this sector, 2797 * it will be corrected if it was damaged. This writes more than is 2798 * necessary, but since expansion is paused during scrub/resilver, at 2799 * most a single row will have a shadow location. 
2800 */ 2801 if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 2802 (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) { 2803 for (int c = 0; c < rr->rr_cols; c++) { 2804 raidz_col_t *rc = &rr->rr_col[c]; 2805 vdev_t *vd = zio->io_vd; 2806 2807 if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0) 2808 continue; 2809 vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx]; 2810 2811 /* 2812 * Note: We don't want to update the repair stats 2813 * because that would incorrectly indicate that there 2814 * was bad data to repair, which we aren't sure about. 2815 * By clearing the SCAN_THREAD flag, we prevent this 2816 * from happening, despite having the REPAIR flag set. 2817 * We need to set SELF_HEAL so that this i/o can't be 2818 * bypassed by zio_vdev_io_start(). 2819 */ 2820 zio_t *cio = zio_vdev_child_io(zio, NULL, cvd, 2821 rc->rc_shadow_offset, rc->rc_abd, rc->rc_size, 2822 ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, 2823 ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, 2824 NULL, NULL); 2825 cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD; 2826 zio_nowait(cio); 2827 } 2828 } 2829 } 2830 2831 static void 2832 raidz_restore_orig_data(raidz_map_t *rm) 2833 { 2834 for (int i = 0; i < rm->rm_nrows; i++) { 2835 raidz_row_t *rr = rm->rm_row[i]; 2836 for (int c = 0; c < rr->rr_cols; c++) { 2837 raidz_col_t *rc = &rr->rr_col[c]; 2838 if (rc->rc_need_orig_restore) { 2839 abd_copy(rc->rc_abd, 2840 rc->rc_orig_data, rc->rc_size); 2841 rc->rc_need_orig_restore = B_FALSE; 2842 } 2843 } 2844 } 2845 } 2846 2847 /* 2848 * During raidz_reconstruct() for expanded VDEV, we need special consideration 2849 * failure simulations. See note in raidz_reconstruct() on simulating failure 2850 * of a pre-expansion device. 2851 * 2852 * Treating logical child i as failed, return TRUE if the given column should 2853 * be treated as failed. The idea of logical children allows us to imagine 2854 * that a disk silently failed before a RAIDZ expansion (reads from this disk 2855 * succeed but return the wrong data). Since the expansion doesn't verify 2856 * checksums, the incorrect data will be moved to new locations spread among 2857 * the children (going diagonally across them). 2858 * 2859 * Higher "logical child failures" (values of `i`) indicate these 2860 * "pre-expansion failures". The first physical_width values imagine that a 2861 * current child failed; the next physical_width-1 values imagine that a 2862 * child failed before the most recent expansion; the next physical_width-2 2863 * values imagine a child failed in the expansion before that, etc. 2864 */ 2865 static boolean_t 2866 raidz_simulate_failure(int physical_width, int original_width, int ashift, 2867 int i, raidz_col_t *rc) 2868 { 2869 uint64_t sector_id = 2870 physical_width * (rc->rc_offset >> ashift) + 2871 rc->rc_devidx; 2872 2873 for (int w = physical_width; w >= original_width; w--) { 2874 if (i < w) { 2875 return (sector_id % w == i); 2876 } else { 2877 i -= w; 2878 } 2879 } 2880 ASSERT(!"invalid logical child id"); 2881 return (B_FALSE); 2882 } 2883 2884 /* 2885 * returns EINVAL if reconstruction of the block will not be possible 2886 * returns ECKSUM if this specific reconstruction failed 2887 * returns 0 on successful reconstruction 2888 */ 2889 static int 2890 raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) 2891 { 2892 raidz_map_t *rm = zio->io_vsd; 2893 int physical_width = zio->io_vd->vdev_children; 2894 int original_width = (rm->rm_original_width != 0) ? 
2895 rm->rm_original_width : physical_width; 2896 int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT; 2897 2898 if (dbgmsg) { 2899 zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u " 2900 "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts); 2901 } 2902 2903 /* Reconstruct each row */ 2904 for (int r = 0; r < rm->rm_nrows; r++) { 2905 raidz_row_t *rr = rm->rm_row[r]; 2906 int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */ 2907 int t = 0; 2908 int dead = 0; 2909 int dead_data = 0; 2910 2911 if (dbgmsg) 2912 zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r); 2913 2914 for (int c = 0; c < rr->rr_cols; c++) { 2915 raidz_col_t *rc = &rr->rr_col[c]; 2916 ASSERT0(rc->rc_need_orig_restore); 2917 if (rc->rc_error != 0) { 2918 dead++; 2919 if (c >= nparity) 2920 dead_data++; 2921 continue; 2922 } 2923 if (rc->rc_size == 0) 2924 continue; 2925 for (int lt = 0; lt < ntgts; lt++) { 2926 if (raidz_simulate_failure(physical_width, 2927 original_width, 2928 zio->io_vd->vdev_top->vdev_ashift, 2929 ltgts[lt], rc)) { 2930 if (rc->rc_orig_data == NULL) { 2931 rc->rc_orig_data = 2932 abd_alloc_linear( 2933 rc->rc_size, B_TRUE); 2934 abd_copy(rc->rc_orig_data, 2935 rc->rc_abd, rc->rc_size); 2936 } 2937 rc->rc_need_orig_restore = B_TRUE; 2938 2939 dead++; 2940 if (c >= nparity) 2941 dead_data++; 2942 /* 2943 * Note: simulating failure of a 2944 * pre-expansion device can hit more 2945 * than one column, in which case we 2946 * might try to simulate more failures 2947 * than can be reconstructed, which is 2948 * also more than the size of my_tgts. 2949 * This check prevents accessing past 2950 * the end of my_tgts. The "dead > 2951 * nparity" check below will fail this 2952 * reconstruction attempt. 2953 */ 2954 if (t < VDEV_RAIDZ_MAXPARITY) { 2955 my_tgts[t++] = c; 2956 if (dbgmsg) { 2957 zfs_dbgmsg("simulating " 2958 "failure of col %u " 2959 "devidx %u", c, 2960 (int)rc->rc_devidx); 2961 } 2962 } 2963 break; 2964 } 2965 } 2966 } 2967 if (dead > nparity) { 2968 /* reconstruction not possible */ 2969 if (dbgmsg) { 2970 zfs_dbgmsg("reconstruction not possible; " 2971 "too many failures"); 2972 } 2973 raidz_restore_orig_data(rm); 2974 return (EINVAL); 2975 } 2976 if (dead_data > 0) 2977 vdev_raidz_reconstruct_row(rm, rr, my_tgts, t); 2978 } 2979 2980 /* Check for success */ 2981 if (raidz_checksum_verify(zio) == 0) { 2982 2983 /* Reconstruction succeeded - report errors */ 2984 for (int i = 0; i < rm->rm_nrows; i++) { 2985 raidz_row_t *rr = rm->rm_row[i]; 2986 2987 for (int c = 0; c < rr->rr_cols; c++) { 2988 raidz_col_t *rc = &rr->rr_col[c]; 2989 if (rc->rc_need_orig_restore) { 2990 /* 2991 * Note: if this is a parity column, 2992 * we don't really know if it's wrong. 2993 * We need to let 2994 * vdev_raidz_io_done_verified() check 2995 * it, and if we set rc_error, it will 2996 * think that it is a "known" error 2997 * that doesn't need to be checked 2998 * or corrected. 
2999 */ 3000 if (rc->rc_error == 0 && 3001 c >= rr->rr_firstdatacol) { 3002 vdev_raidz_checksum_error(zio, 3003 rc, rc->rc_orig_data); 3004 rc->rc_error = 3005 SET_ERROR(ECKSUM); 3006 } 3007 rc->rc_need_orig_restore = B_FALSE; 3008 } 3009 } 3010 3011 vdev_raidz_io_done_verified(zio, rr); 3012 } 3013 3014 zio_checksum_verified(zio); 3015 3016 if (dbgmsg) { 3017 zfs_dbgmsg("reconstruction successful " 3018 "(checksum verified)"); 3019 } 3020 return (0); 3021 } 3022 3023 /* Reconstruction failed - restore original data */ 3024 raidz_restore_orig_data(rm); 3025 if (dbgmsg) { 3026 zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum " 3027 "failed", zio); 3028 } 3029 return (ECKSUM); 3030 } 3031 3032 /* 3033 * Iterate over all combinations of N bad vdevs and attempt a reconstruction. 3034 * Note that the algorithm below is non-optimal because it doesn't take into 3035 * account how reconstruction is actually performed. For example, with 3036 * triple-parity RAID-Z the reconstruction procedure is the same if column 4 3037 * is targeted as invalid as if columns 1 and 4 are targeted since in both 3038 * cases we'd only use parity information in column 0. 3039 * 3040 * The order that we find the various possible combinations of failed 3041 * disks is dictated by these rules: 3042 * - Examine each "slot" (the "i" in tgts[i]) 3043 * - Try to increment this slot (tgts[i] += 1) 3044 * - if we can't increment because it runs into the next slot, 3045 * reset our slot to the minimum, and examine the next slot 3046 * 3047 * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose 3048 * 3 columns to reconstruct), we will generate the following sequence: 3049 * 3050 * STATE ACTION 3051 * 0 1 2 special case: skip since these are all parity 3052 * 0 1 3 first slot: reset to 0; middle slot: increment to 2 3053 * 0 2 3 first slot: increment to 1 3054 * 1 2 3 first: reset to 0; middle: reset to 1; last: increment to 4 3055 * 0 1 4 first: reset to 0; middle: increment to 2 3056 * 0 2 4 first: increment to 1 3057 * 1 2 4 first: reset to 0; middle: increment to 3 3058 * 0 3 4 first: increment to 1 3059 * 1 3 4 first: increment to 2 3060 * 2 3 4 first: reset to 0; middle: reset to 1; last: increment to 5 3061 * 0 1 5 first: reset to 0; middle: increment to 2 3062 * 0 2 5 first: increment to 1 3063 * 1 2 5 first: reset to 0; middle: increment to 3 3064 * 0 3 5 first: increment to 1 3065 * 1 3 5 first: increment to 2 3066 * 2 3 5 first: reset to 0; middle: increment to 4 3067 * 0 4 5 first: increment to 1 3068 * 1 4 5 first: increment to 2 3069 * 2 4 5 first: increment to 3 3070 * 3 4 5 done 3071 * 3072 * This strategy works for dRAID but is less efficient when there are a large 3073 * number of child vdevs and therefore permutations to check. Furthermore, 3074 * since the raidz_map_t rows likely do not overlap, reconstruction would be 3075 * possible as long as there are no more than nparity data errors per row. 3076 * These additional permutations are not currently checked but could be as 3077 * a future improvement. 3078 * 3079 * Returns 0 on success, ECKSUM on failure. 3080 */ 3081 static int 3082 vdev_raidz_combrec(zio_t *zio) 3083 { 3084 int nparity = vdev_get_nparity(zio->io_vd); 3085 raidz_map_t *rm = zio->io_vsd; 3086 int physical_width = zio->io_vd->vdev_children; 3087 int original_width = (rm->rm_original_width != 0) ? 
3088 rm->rm_original_width : physical_width; 3089 3090 for (int i = 0; i < rm->rm_nrows; i++) { 3091 raidz_row_t *rr = rm->rm_row[i]; 3092 int total_errors = 0; 3093 3094 for (int c = 0; c < rr->rr_cols; c++) { 3095 if (rr->rr_col[c].rc_error) 3096 total_errors++; 3097 } 3098 3099 if (total_errors > nparity) 3100 return (vdev_raidz_worst_error(rr)); 3101 } 3102 3103 for (int num_failures = 1; num_failures <= nparity; num_failures++) { 3104 int tstore[VDEV_RAIDZ_MAXPARITY + 2]; 3105 int *ltgts = &tstore[1]; /* value is logical child ID */ 3106 3107 3108 /* 3109 * Determine number of logical children, n. See comment 3110 * above raidz_simulate_failure(). 3111 */ 3112 int n = 0; 3113 for (int w = physical_width; 3114 w >= original_width; w--) { 3115 n += w; 3116 } 3117 3118 ASSERT3U(num_failures, <=, nparity); 3119 ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY); 3120 3121 /* Handle corner cases in combrec logic */ 3122 ltgts[-1] = -1; 3123 for (int i = 0; i < num_failures; i++) { 3124 ltgts[i] = i; 3125 } 3126 ltgts[num_failures] = n; 3127 3128 for (;;) { 3129 int err = raidz_reconstruct(zio, ltgts, num_failures, 3130 nparity); 3131 if (err == EINVAL) { 3132 /* 3133 * Reconstruction not possible with this # 3134 * failures; try more failures. 3135 */ 3136 break; 3137 } else if (err == 0) 3138 return (0); 3139 3140 /* Compute next targets to try */ 3141 for (int t = 0; ; t++) { 3142 ASSERT3U(t, <, num_failures); 3143 ltgts[t]++; 3144 if (ltgts[t] == n) { 3145 /* try more failures */ 3146 ASSERT3U(t, ==, num_failures - 1); 3147 if (zfs_flags & 3148 ZFS_DEBUG_RAIDZ_RECONSTRUCT) { 3149 zfs_dbgmsg("reconstruction " 3150 "failed for num_failures=" 3151 "%u; tried all " 3152 "combinations", 3153 num_failures); 3154 } 3155 break; 3156 } 3157 3158 ASSERT3U(ltgts[t], <, n); 3159 ASSERT3U(ltgts[t], <=, ltgts[t + 1]); 3160 3161 /* 3162 * If that spot is available, we're done here. 3163 * Try the next combination. 3164 */ 3165 if (ltgts[t] != ltgts[t + 1]) 3166 break; // found next combination 3167 3168 /* 3169 * Otherwise, reset this tgt to the minimum, 3170 * and move on to the next tgt. 3171 */ 3172 ltgts[t] = ltgts[t - 1] + 1; 3173 ASSERT3U(ltgts[t], ==, t); 3174 } 3175 3176 /* Increase the number of failures and keep trying. */ 3177 if (ltgts[num_failures - 1] == n) 3178 break; 3179 } 3180 } 3181 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 3182 zfs_dbgmsg("reconstruction failed for all num_failures"); 3183 return (ECKSUM); 3184 } 3185 3186 void 3187 vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) 3188 { 3189 for (uint64_t row = 0; row < rm->rm_nrows; row++) { 3190 raidz_row_t *rr = rm->rm_row[row]; 3191 vdev_raidz_reconstruct_row(rm, rr, t, nt); 3192 } 3193 } 3194 3195 /* 3196 * Complete a write IO operation on a RAIDZ VDev 3197 * 3198 * Outline: 3199 * 1. Check for errors on the child IOs. 3200 * 2. Return, setting an error code if too few child VDevs were written 3201 * to reconstruct the data later. Note that partial writes are 3202 * considered successful if they can be reconstructed at all. 
3203 */ 3204 static void 3205 vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) 3206 { 3207 int normal_errors = 0; 3208 int shadow_errors = 0; 3209 3210 ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); 3211 ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); 3212 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); 3213 3214 for (int c = 0; c < rr->rr_cols; c++) { 3215 raidz_col_t *rc = &rr->rr_col[c]; 3216 3217 if (rc->rc_error != 0) { 3218 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 3219 normal_errors++; 3220 } 3221 if (rc->rc_shadow_error != 0) { 3222 ASSERT(rc->rc_shadow_error != ECKSUM); 3223 shadow_errors++; 3224 } 3225 } 3226 3227 /* 3228 * Treat partial writes as a success. If we couldn't write enough 3229 * columns to reconstruct the data, the I/O failed. Otherwise, good 3230 * enough. Note that in the case of a shadow write (during raidz 3231 * expansion), depending on if we crash, either the normal (old) or 3232 * shadow (new) location may become the "real" version of the block, 3233 * so both locations must have sufficient redundancy. 3234 * 3235 * Now that we support write reallocation, it would be better 3236 * to treat partial failure as real failure unless there are 3237 * no non-degraded top-level vdevs left, and not update DTLs 3238 * if we intend to reallocate. 3239 */ 3240 if (normal_errors > rr->rr_firstdatacol || 3241 shadow_errors > rr->rr_firstdatacol) { 3242 zio->io_error = zio_worst_error(zio->io_error, 3243 vdev_raidz_worst_error(rr)); 3244 } 3245 } 3246 3247 static void 3248 vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm, 3249 raidz_row_t *rr) 3250 { 3251 int parity_errors = 0; 3252 int parity_untried = 0; 3253 int data_errors = 0; 3254 int total_errors = 0; 3255 3256 ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); 3257 ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); 3258 3259 for (int c = 0; c < rr->rr_cols; c++) { 3260 raidz_col_t *rc = &rr->rr_col[c]; 3261 3262 /* 3263 * If scrubbing and a replacing/sparing child vdev determined 3264 * that not all of its children have an identical copy of the 3265 * data, then clear the error so the column is treated like 3266 * any other read and force a repair to correct the damage. 3267 */ 3268 if (rc->rc_error == ECKSUM) { 3269 ASSERT(zio->io_flags & ZIO_FLAG_SCRUB); 3270 vdev_raidz_checksum_error(zio, rc, rc->rc_abd); 3271 rc->rc_force_repair = 1; 3272 rc->rc_error = 0; 3273 } 3274 3275 if (rc->rc_error) { 3276 if (c < rr->rr_firstdatacol) 3277 parity_errors++; 3278 else 3279 data_errors++; 3280 3281 total_errors++; 3282 } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { 3283 parity_untried++; 3284 } 3285 } 3286 3287 /* 3288 * If there were data errors and the number of errors we saw was 3289 * correctable -- less than or equal to the number of parity disks read 3290 * -- reconstruct based on the missing data. 3291 */ 3292 if (data_errors != 0 && 3293 total_errors <= rr->rr_firstdatacol - parity_untried) { 3294 /* 3295 * We either attempt to read all the parity columns or 3296 * none of them. If we didn't try to read parity, we 3297 * wouldn't be here in the correctable case. There must 3298 * also have been fewer parity errors than parity 3299 * columns or, again, we wouldn't be in this code path. 3300 */ 3301 ASSERT(parity_untried == 0); 3302 ASSERT(parity_errors < rr->rr_firstdatacol); 3303 3304 /* 3305 * Identify the data columns that reported an error. 
3306 */ 3307 int n = 0; 3308 int tgts[VDEV_RAIDZ_MAXPARITY]; 3309 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 3310 raidz_col_t *rc = &rr->rr_col[c]; 3311 if (rc->rc_error != 0) { 3312 ASSERT(n < VDEV_RAIDZ_MAXPARITY); 3313 tgts[n++] = c; 3314 } 3315 } 3316 3317 ASSERT(rr->rr_firstdatacol >= n); 3318 3319 vdev_raidz_reconstruct_row(rm, rr, tgts, n); 3320 } 3321 } 3322 3323 /* 3324 * Return the number of reads issued. 3325 */ 3326 static int 3327 vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr) 3328 { 3329 vdev_t *vd = zio->io_vd; 3330 int nread = 0; 3331 3332 rr->rr_missingdata = 0; 3333 rr->rr_missingparity = 0; 3334 3335 /* 3336 * If this rows contains empty sectors which are not required 3337 * for a normal read then allocate an ABD for them now so they 3338 * may be read, verified, and any needed repairs performed. 3339 */ 3340 if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL) 3341 vdev_draid_map_alloc_empty(zio, rr); 3342 3343 for (int c = 0; c < rr->rr_cols; c++) { 3344 raidz_col_t *rc = &rr->rr_col[c]; 3345 if (rc->rc_tried || rc->rc_size == 0) 3346 continue; 3347 3348 zio_nowait(zio_vdev_child_io(zio, NULL, 3349 vd->vdev_child[rc->rc_devidx], 3350 rc->rc_offset, rc->rc_abd, rc->rc_size, 3351 zio->io_type, zio->io_priority, 0, 3352 vdev_raidz_child_done, rc)); 3353 nread++; 3354 } 3355 return (nread); 3356 } 3357 3358 /* 3359 * We're here because either there were too many errors to even attempt 3360 * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec() 3361 * failed. In either case, there is enough bad data to prevent reconstruction. 3362 * Start checksum ereports for all children which haven't failed. 3363 */ 3364 static void 3365 vdev_raidz_io_done_unrecoverable(zio_t *zio) 3366 { 3367 raidz_map_t *rm = zio->io_vsd; 3368 3369 for (int i = 0; i < rm->rm_nrows; i++) { 3370 raidz_row_t *rr = rm->rm_row[i]; 3371 3372 for (int c = 0; c < rr->rr_cols; c++) { 3373 raidz_col_t *rc = &rr->rr_col[c]; 3374 vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; 3375 3376 if (rc->rc_error != 0) 3377 continue; 3378 3379 zio_bad_cksum_t zbc; 3380 zbc.zbc_has_cksum = 0; 3381 zbc.zbc_injected = rm->rm_ecksuminjected; 3382 3383 mutex_enter(&cvd->vdev_stat_lock); 3384 cvd->vdev_stat.vs_checksum_errors++; 3385 mutex_exit(&cvd->vdev_stat_lock); 3386 (void) zfs_ereport_start_checksum(zio->io_spa, 3387 cvd, &zio->io_bookmark, zio, rc->rc_offset, 3388 rc->rc_size, &zbc); 3389 } 3390 } 3391 } 3392 3393 void 3394 vdev_raidz_io_done(zio_t *zio) 3395 { 3396 raidz_map_t *rm = zio->io_vsd; 3397 3398 ASSERT(zio->io_bp != NULL); 3399 if (zio->io_type == ZIO_TYPE_WRITE) { 3400 for (int i = 0; i < rm->rm_nrows; i++) { 3401 vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]); 3402 } 3403 } else { 3404 if (rm->rm_phys_col) { 3405 /* 3406 * This is an aggregated read. Copy the data and status 3407 * from the aggregate abd's to the individual rows. 3408 */ 3409 for (int i = 0; i < rm->rm_nrows; i++) { 3410 raidz_row_t *rr = rm->rm_row[i]; 3411 3412 for (int c = 0; c < rr->rr_cols; c++) { 3413 raidz_col_t *rc = &rr->rr_col[c]; 3414 if (rc->rc_tried || rc->rc_size == 0) 3415 continue; 3416 3417 raidz_col_t *prc = 3418 &rm->rm_phys_col[rc->rc_devidx]; 3419 rc->rc_error = prc->rc_error; 3420 rc->rc_tried = prc->rc_tried; 3421 rc->rc_skipped = prc->rc_skipped; 3422 if (c >= rr->rr_firstdatacol) { 3423 /* 3424 * Note: this is slightly faster 3425 * than using abd_copy_off(). 
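 * rc_offset and prc_offset are offsets on the same
 * child vdev, so their difference locates this
 * column's data within the aggregated physical
 * column's buffer.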
3426 */ 3427 char *physbuf = abd_to_buf( 3428 prc->rc_abd); 3429 void *physloc = physbuf + 3430 rc->rc_offset - 3431 prc->rc_offset; 3432 3433 abd_copy_from_buf(rc->rc_abd, 3434 physloc, rc->rc_size); 3435 } 3436 } 3437 } 3438 } 3439 3440 for (int i = 0; i < rm->rm_nrows; i++) { 3441 raidz_row_t *rr = rm->rm_row[i]; 3442 vdev_raidz_io_done_reconstruct_known_missing(zio, 3443 rm, rr); 3444 } 3445 3446 if (raidz_checksum_verify(zio) == 0) { 3447 for (int i = 0; i < rm->rm_nrows; i++) { 3448 raidz_row_t *rr = rm->rm_row[i]; 3449 vdev_raidz_io_done_verified(zio, rr); 3450 } 3451 zio_checksum_verified(zio); 3452 } else { 3453 /* 3454 * A sequential resilver has no checksum which makes 3455 * combinatoral reconstruction impossible. This code 3456 * path is unreachable since raidz_checksum_verify() 3457 * has no checksum to verify and must succeed. 3458 */ 3459 ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD); 3460 3461 /* 3462 * This isn't a typical situation -- either we got a 3463 * read error or a child silently returned bad data. 3464 * Read every block so we can try again with as much 3465 * data and parity as we can track down. If we've 3466 * already been through once before, all children will 3467 * be marked as tried so we'll proceed to combinatorial 3468 * reconstruction. 3469 */ 3470 int nread = 0; 3471 for (int i = 0; i < rm->rm_nrows; i++) { 3472 nread += vdev_raidz_read_all(zio, 3473 rm->rm_row[i]); 3474 } 3475 if (nread != 0) { 3476 /* 3477 * Normally our stage is VDEV_IO_DONE, but if 3478 * we've already called redone(), it will have 3479 * changed to VDEV_IO_START, in which case we 3480 * don't want to call redone() again. 3481 */ 3482 if (zio->io_stage != ZIO_STAGE_VDEV_IO_START) 3483 zio_vdev_io_redone(zio); 3484 return; 3485 } 3486 /* 3487 * It would be too expensive to try every possible 3488 * combination of failed sectors in every row, so 3489 * instead we try every combination of failed current or 3490 * past physical disk. This means that if the incorrect 3491 * sectors were all on Nparity disks at any point in the 3492 * past, we will find the correct data. The only known 3493 * case where this is less durable than a non-expanded 3494 * RAIDZ, is if we have a silent failure during 3495 * expansion. In that case, one block could be 3496 * partially in the old format and partially in the 3497 * new format, so we'd lost some sectors from the old 3498 * format and some from the new format. 3499 * 3500 * e.g. logical_width=4 physical_width=6 3501 * the 15 (6+5+4) possible failed disks are: 3502 * width=6 child=0 3503 * width=6 child=1 3504 * width=6 child=2 3505 * width=6 child=3 3506 * width=6 child=4 3507 * width=6 child=5 3508 * width=5 child=0 3509 * width=5 child=1 3510 * width=5 child=2 3511 * width=5 child=3 3512 * width=5 child=4 3513 * width=4 child=0 3514 * width=4 child=1 3515 * width=4 child=2 3516 * width=4 child=3 3517 * And we will try every combination of Nparity of these 3518 * failing. 3519 * 3520 * As a first pass, we can generate every combo, 3521 * and try reconstructing, ignoring any known 3522 * failures. If any row has too many known + simulated 3523 * failures, then we bail on reconstructing with this 3524 * number of simulated failures. As an improvement, 3525 * we could detect the number of whole known failures 3526 * (i.e. we have known failures on these disks for 3527 * every row; the disks never succeeded), and 3528 * subtract that from the max # failures to simulate. 
3529 * We could go even further like the current 3530 * combrec code, but that doesn't seem like it 3531 * gains us very much. If we simulate a failure 3532 * that is also a known failure, that's fine. 3533 */ 3534 zio->io_error = vdev_raidz_combrec(zio); 3535 if (zio->io_error == ECKSUM && 3536 !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 3537 vdev_raidz_io_done_unrecoverable(zio); 3538 } 3539 } 3540 } 3541 if (rm->rm_lr != NULL) { 3542 zfs_rangelock_exit(rm->rm_lr); 3543 rm->rm_lr = NULL; 3544 } 3545 } 3546 3547 static void 3548 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 3549 { 3550 vdev_raidz_t *vdrz = vd->vdev_tsd; 3551 if (faulted > vdrz->vd_nparity) 3552 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 3553 VDEV_AUX_NO_REPLICAS); 3554 else if (degraded + faulted != 0) 3555 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 3556 else 3557 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 3558 } 3559 3560 /* 3561 * Determine if any portion of the provided block resides on a child vdev 3562 * with a dirty DTL and therefore needs to be resilvered. The function 3563 * assumes that at least one DTL is dirty which implies that full stripe 3564 * width blocks must be resilvered. 3565 */ 3566 static boolean_t 3567 vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, 3568 uint64_t phys_birth) 3569 { 3570 vdev_raidz_t *vdrz = vd->vdev_tsd; 3571 3572 /* 3573 * If we're in the middle of a RAIDZ expansion, this block may be in 3574 * the old and/or new location. For simplicity, always resilver it. 3575 */ 3576 if (vdrz->vn_vre.vre_state == DSS_SCANNING) 3577 return (B_TRUE); 3578 3579 uint64_t dcols = vd->vdev_children; 3580 uint64_t nparity = vdrz->vd_nparity; 3581 uint64_t ashift = vd->vdev_top->vdev_ashift; 3582 /* The starting RAIDZ (parent) vdev sector of the block. */ 3583 uint64_t b = DVA_GET_OFFSET(dva) >> ashift; 3584 /* The zio's size in units of the vdev's minimum sector size. */ 3585 uint64_t s = ((psize - 1) >> ashift) + 1; 3586 /* The first column for this stripe. */ 3587 uint64_t f = b % dcols; 3588 3589 /* Unreachable by sequential resilver. */ 3590 ASSERT3U(phys_birth, !=, TXG_UNKNOWN); 3591 3592 if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) 3593 return (B_FALSE); 3594 3595 if (s + nparity >= dcols) 3596 return (B_TRUE); 3597 3598 for (uint64_t c = 0; c < s + nparity; c++) { 3599 uint64_t devidx = (f + c) % dcols; 3600 vdev_t *cvd = vd->vdev_child[devidx]; 3601 3602 /* 3603 * dsl_scan_need_resilver() already checked vd with 3604 * vdev_dtl_contains(). So here just check cvd with 3605 * vdev_dtl_empty(), cheaper and a good approximation. 3606 */ 3607 if (!vdev_dtl_empty(cvd, DTL_PARTIAL)) 3608 return (B_TRUE); 3609 } 3610 3611 return (B_FALSE); 3612 } 3613 3614 static void 3615 vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs, 3616 range_seg64_t *physical_rs, range_seg64_t *remain_rs) 3617 { 3618 (void) remain_rs; 3619 3620 vdev_t *raidvd = cvd->vdev_parent; 3621 ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); 3622 3623 vdev_raidz_t *vdrz = raidvd->vdev_tsd; 3624 3625 if (vdrz->vn_vre.vre_state == DSS_SCANNING) { 3626 /* 3627 * We're in the middle of expansion, in which case the 3628 * translation is in flux. Any answer we give may be wrong 3629 * by the time we return, so it isn't safe for the caller to 3630 * act on it. Therefore we say that this range isn't present 3631 * on any children. 
The only consumers of this are "zpool 3632 * initialize" and trimming, both of which are "best effort" 3633 * anyway. 3634 */ 3635 physical_rs->rs_start = physical_rs->rs_end = 0; 3636 remain_rs->rs_start = remain_rs->rs_end = 0; 3637 return; 3638 } 3639 3640 uint64_t width = vdrz->vd_physical_width; 3641 uint64_t tgt_col = cvd->vdev_id; 3642 uint64_t ashift = raidvd->vdev_top->vdev_ashift; 3643 3644 /* make sure the offsets are block-aligned */ 3645 ASSERT0(logical_rs->rs_start % (1 << ashift)); 3646 ASSERT0(logical_rs->rs_end % (1 << ashift)); 3647 uint64_t b_start = logical_rs->rs_start >> ashift; 3648 uint64_t b_end = logical_rs->rs_end >> ashift; 3649 3650 uint64_t start_row = 0; 3651 if (b_start > tgt_col) /* avoid underflow */ 3652 start_row = ((b_start - tgt_col - 1) / width) + 1; 3653 3654 uint64_t end_row = 0; 3655 if (b_end > tgt_col) 3656 end_row = ((b_end - tgt_col - 1) / width) + 1; 3657 3658 physical_rs->rs_start = start_row << ashift; 3659 physical_rs->rs_end = end_row << ashift; 3660 3661 ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start); 3662 ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=, 3663 logical_rs->rs_end - logical_rs->rs_start); 3664 } 3665 3666 static void 3667 raidz_reflow_sync(void *arg, dmu_tx_t *tx) 3668 { 3669 spa_t *spa = arg; 3670 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 3671 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 3672 3673 /* 3674 * Ensure there are no i/os to the range that is being committed. 3675 */ 3676 uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock); 3677 ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset); 3678 3679 mutex_enter(&vre->vre_lock); 3680 uint64_t new_offset = 3681 MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset); 3682 /* 3683 * We should not have committed anything that failed. 3684 */ 3685 VERIFY3U(vre->vre_failed_offset, >=, old_offset); 3686 mutex_exit(&vre->vre_lock); 3687 3688 zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, 3689 old_offset, new_offset - old_offset, 3690 RL_WRITER); 3691 3692 /* 3693 * Update the uberblock that will be written when this txg completes. 
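 * For example, if the copies issued in this txg got as far as offset
 * X but a copy at offset Y < X failed, the MIN() with
 * vre_failed_offset above clamps what we record here to Y, so
 * everything at and past the failure is retried when the reflow
 * resumes.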
3694 */
3695 RAIDZ_REFLOW_SET(&spa->spa_uberblock,
3696 RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset);
3697 vre->vre_offset_pertxg[txgoff] = 0;
3698 zfs_rangelock_exit(lr);
3699
3700 mutex_enter(&vre->vre_lock);
3701 vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff];
3702 vre->vre_bytes_copied_pertxg[txgoff] = 0;
3703 mutex_exit(&vre->vre_lock);
3704
3705 vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
3706 VERIFY0(zap_update(spa->spa_meta_objset,
3707 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
3708 sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx));
3709 }
3710
3711 static void
3712 raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx)
3713 {
3714 spa_t *spa = arg;
3715 vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
3716 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
3717 vdev_raidz_t *vdrz = raidvd->vdev_tsd;
3718
3719 for (int i = 0; i < TXG_SIZE; i++)
3720 VERIFY0(vre->vre_offset_pertxg[i]);
3721
3722 reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
3723 re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES;
3724 re->re_logical_width = vdrz->vd_physical_width;
3725 mutex_enter(&vdrz->vd_expand_lock);
3726 avl_add(&vdrz->vd_expand_txgs, re);
3727 mutex_exit(&vdrz->vd_expand_lock);
3728
3729 vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
3730
3731 /*
3732 * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS
3733 * will get written (based on vd_expand_txgs).
3734 */
3735 vdev_config_dirty(vd);
3736
3737 /*
3738 * Before we change vre_state, the on-disk state must reflect that we
3739 * have completed all copying, so that vdev_raidz_io_start() can use
3740 * vre_state to determine if the reflow is in progress. See also the
3741 * end of spa_raidz_expand_thread().
3742 */
3743 VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==,
3744 raidvd->vdev_ms_count << raidvd->vdev_ms_shift);
3745
3746 vre->vre_end_time = gethrestime_sec();
3747 vre->vre_state = DSS_FINISHED;
3748
3749 uint64_t state = vre->vre_state;
3750 VERIFY0(zap_update(spa->spa_meta_objset,
3751 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
3752 sizeof (state), 1, &state, tx));
3753
3754 uint64_t end_time = vre->vre_end_time;
3755 VERIFY0(zap_update(spa->spa_meta_objset,
3756 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
3757 sizeof (end_time), 1, &end_time, tx));
3758
3759 spa->spa_uberblock.ub_raidz_reflow_info = 0;
3760
3761 spa_history_log_internal(spa, "raidz vdev expansion completed", tx,
3762 "%s vdev %llu new width %llu", spa_name(spa),
3763 (unsigned long long)vd->vdev_id,
3764 (unsigned long long)vd->vdev_children);
3765
3766 spa->spa_raidz_expand = NULL;
3767 raidvd->vdev_rz_expanding = B_FALSE;
3768
3769 spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
3770 spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
3771 spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
3772
3773 spa_notify_waiters(spa);
3774
3775 /*
3776 * While we're in syncing context take the opportunity to
3777 * set up a scrub. All the data has been successfully copied
3778 * but we have not validated any checksums.
3779 */
3780 pool_scan_func_t func = POOL_SCAN_SCRUB;
3781 if (zfs_scrub_after_expand && dsl_scan_setup_check(&func, tx) == 0)
3782 dsl_scan_setup_sync(&func, tx);
3783 }
3784
3785 /*
3786 * Struct for one copy zio.
3787 */
3788 typedef struct raidz_reflow_arg {
3789 vdev_raidz_expand_t *rra_vre;
3790 zfs_locked_range_t *rra_lr;
3791 uint64_t rra_txg;
3792 } raidz_reflow_arg_t;
3793
3794 /*
3795 * The write of the new location is done.
3796 */ 3797 static void 3798 raidz_reflow_write_done(zio_t *zio) 3799 { 3800 raidz_reflow_arg_t *rra = zio->io_private; 3801 vdev_raidz_expand_t *vre = rra->rra_vre; 3802 3803 abd_free(zio->io_abd); 3804 3805 mutex_enter(&vre->vre_lock); 3806 if (zio->io_error != 0) { 3807 /* Force a reflow pause on errors */ 3808 vre->vre_failed_offset = 3809 MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); 3810 } 3811 ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size); 3812 vre->vre_outstanding_bytes -= zio->io_size; 3813 if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length < 3814 vre->vre_failed_offset) { 3815 vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] += 3816 zio->io_size; 3817 } 3818 cv_signal(&vre->vre_cv); 3819 mutex_exit(&vre->vre_lock); 3820 3821 zfs_rangelock_exit(rra->rra_lr); 3822 3823 kmem_free(rra, sizeof (*rra)); 3824 spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa); 3825 } 3826 3827 /* 3828 * The read of the old location is done. The parent zio is the write to 3829 * the new location. Allow it to start. 3830 */ 3831 static void 3832 raidz_reflow_read_done(zio_t *zio) 3833 { 3834 raidz_reflow_arg_t *rra = zio->io_private; 3835 vdev_raidz_expand_t *vre = rra->rra_vre; 3836 3837 /* 3838 * If the read failed, or if it was done on a vdev that is not fully 3839 * healthy (e.g. a child that has a resilver in progress), we may not 3840 * have the correct data. Note that it's OK if the write proceeds. 3841 * It may write garbage but the location is otherwise unused and we 3842 * will retry later due to vre_failed_offset. 3843 */ 3844 if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) { 3845 zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu " 3846 "err=%u partial_dtl_empty=%u missing_dtl_empty=%u", 3847 (long long)rra->rra_lr->lr_offset, 3848 (long long)rra->rra_lr->lr_length, 3849 (long long)rra->rra_txg, 3850 zio->io_error, 3851 vdev_dtl_empty(zio->io_vd, DTL_PARTIAL), 3852 vdev_dtl_empty(zio->io_vd, DTL_MISSING)); 3853 mutex_enter(&vre->vre_lock); 3854 /* Force a reflow pause on errors */ 3855 vre->vre_failed_offset = 3856 MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); 3857 mutex_exit(&vre->vre_lock); 3858 } 3859 3860 zio_nowait(zio_unique_parent(zio)); 3861 } 3862 3863 static void 3864 raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset, 3865 dmu_tx_t *tx) 3866 { 3867 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 3868 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 3869 3870 if (offset == 0) 3871 return; 3872 3873 mutex_enter(&vre->vre_lock); 3874 ASSERT3U(vre->vre_offset, <=, offset); 3875 vre->vre_offset = offset; 3876 mutex_exit(&vre->vre_lock); 3877 3878 if (vre->vre_offset_pertxg[txgoff] == 0) { 3879 dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync, 3880 spa, tx); 3881 } 3882 vre->vre_offset_pertxg[txgoff] = offset; 3883 } 3884 3885 static boolean_t 3886 vdev_raidz_expand_child_replacing(vdev_t *raidz_vd) 3887 { 3888 for (int i = 0; i < raidz_vd->vdev_children; i++) { 3889 /* Quick check if a child is being replaced */ 3890 if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf) 3891 return (B_TRUE); 3892 } 3893 return (B_FALSE); 3894 } 3895 3896 static boolean_t 3897 raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt, 3898 dmu_tx_t *tx) 3899 { 3900 spa_t *spa = vd->vdev_spa; 3901 int ashift = vd->vdev_top->vdev_ashift; 3902 uint64_t offset, size; 3903 3904 if (!range_tree_find_in(rt, 0, vd->vdev_top->vdev_asize, 3905 &offset, &size)) { 3906 return (B_FALSE); 3907 } 3908 ASSERT(IS_P2ALIGNED(offset, 1 << 
ashift)); 3909 ASSERT3U(size, >=, 1 << ashift); 3910 uint64_t length = 1 << ashift; 3911 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 3912 3913 uint64_t blkid = offset >> ashift; 3914 3915 int old_children = vd->vdev_children - 1; 3916 3917 /* 3918 * We can only progress to the point that writes will not overlap 3919 * with blocks whose progress has not yet been recorded on disk. 3920 * Since partially-copied rows are still read from the old location, 3921 * we need to stop one row before the sector-wise overlap, to prevent 3922 * row-wise overlap. 3923 * 3924 * Note that even if we are skipping over a large unallocated region, 3925 * we can't move the on-disk progress to `offset`, because concurrent 3926 * writes/allocations could still use the currently-unallocated 3927 * region. 3928 */ 3929 uint64_t ubsync_blkid = 3930 RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift; 3931 uint64_t next_overwrite_blkid = ubsync_blkid + 3932 ubsync_blkid / old_children - old_children; 3933 VERIFY3U(next_overwrite_blkid, >, ubsync_blkid); 3934 3935 if (blkid >= next_overwrite_blkid) { 3936 raidz_reflow_record_progress(vre, 3937 next_overwrite_blkid << ashift, tx); 3938 return (B_TRUE); 3939 } 3940 3941 range_tree_remove(rt, offset, length); 3942 3943 raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra), KM_SLEEP); 3944 rra->rra_vre = vre; 3945 rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock, 3946 offset, length, RL_WRITER); 3947 rra->rra_txg = dmu_tx_get_txg(tx); 3948 3949 raidz_reflow_record_progress(vre, offset + length, tx); 3950 3951 mutex_enter(&vre->vre_lock); 3952 vre->vre_outstanding_bytes += length; 3953 mutex_exit(&vre->vre_lock); 3954 3955 /* 3956 * SCL_STATE will be released when the read and write are done, 3957 * by raidz_reflow_write_done(). 3958 */ 3959 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 3960 3961 /* check if a replacing vdev was added, if so treat it as an error */ 3962 if (vdev_raidz_expand_child_replacing(vd)) { 3963 zfs_dbgmsg("replacing vdev encountered, reflow paused at " 3964 "offset=%llu txg=%llu", 3965 (long long)rra->rra_lr->lr_offset, 3966 (long long)rra->rra_txg); 3967 3968 mutex_enter(&vre->vre_lock); 3969 vre->vre_failed_offset = 3970 MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); 3971 cv_signal(&vre->vre_cv); 3972 mutex_exit(&vre->vre_lock); 3973 3974 /* drop everything we acquired */ 3975 zfs_rangelock_exit(rra->rra_lr); 3976 kmem_free(rra, sizeof (*rra)); 3977 spa_config_exit(spa, SCL_STATE, spa); 3978 return (B_TRUE); 3979 } 3980 3981 zio_t *pio = spa->spa_txg_zio[txgoff]; 3982 abd_t *abd = abd_alloc_for_io(length, B_FALSE); 3983 zio_t *write_zio = zio_vdev_child_io(pio, NULL, 3984 vd->vdev_child[blkid % vd->vdev_children], 3985 (blkid / vd->vdev_children) << ashift, 3986 abd, length, 3987 ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, 3988 ZIO_FLAG_CANFAIL, 3989 raidz_reflow_write_done, rra); 3990 3991 zio_nowait(zio_vdev_child_io(write_zio, NULL, 3992 vd->vdev_child[blkid % old_children], 3993 (blkid / old_children) << ashift, 3994 abd, length, 3995 ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 3996 ZIO_FLAG_CANFAIL, 3997 raidz_reflow_read_done, rra)); 3998 3999 return (B_FALSE); 4000 } 4001 4002 /* 4003 * For testing (ztest specific) 4004 */ 4005 static void 4006 raidz_expand_pause(uint_t pause_point) 4007 { 4008 while (raidz_expand_pause_point != 0 && 4009 raidz_expand_pause_point <= pause_point) 4010 delay(hz); 4011 } 4012 4013 static void 4014 raidz_scratch_child_done(zio_t *zio) 4015 { 4016 zio_t *pio = zio->io_private; 4017 4018 mutex_enter(&pio->io_lock); 4019 
pio->io_error = zio_worst_error(pio->io_error, zio->io_error); 4020 mutex_exit(&pio->io_lock); 4021 } 4022 4023 /* 4024 * Reflow the beginning portion of the vdev into an intermediate scratch area 4025 * in memory and on disk. This operation must be persisted on disk before we 4026 * proceed to overwrite the beginning portion with the reflowed data. 4027 * 4028 * This multi-step task can fail to complete if disk errors are encountered 4029 * and we can return here after a pause (waiting for disk to become healthy). 4030 */ 4031 static void 4032 raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) 4033 { 4034 vdev_raidz_expand_t *vre = arg; 4035 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 4036 zio_t *pio; 4037 int error; 4038 4039 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4040 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4041 int ashift = raidvd->vdev_ashift; 4042 uint64_t write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << ashift, 4043 uint64_t); 4044 uint64_t logical_size = write_size * raidvd->vdev_children; 4045 uint64_t read_size = 4046 P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)), 4047 1 << ashift); 4048 4049 /* 4050 * The scratch space must be large enough to get us to the point 4051 * that one row does not overlap itself when moved. This is checked 4052 * by vdev_raidz_attach_check(). 4053 */ 4054 VERIFY3U(write_size, >=, raidvd->vdev_children << ashift); 4055 VERIFY3U(write_size, <=, VDEV_BOOT_SIZE); 4056 VERIFY3U(write_size, <=, read_size); 4057 4058 zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, 4059 0, logical_size, RL_WRITER); 4060 4061 abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), 4062 KM_SLEEP); 4063 for (int i = 0; i < raidvd->vdev_children; i++) { 4064 abds[i] = abd_alloc_linear(read_size, B_FALSE); 4065 } 4066 4067 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1); 4068 4069 /* 4070 * If we have already written the scratch area then we must read from 4071 * there, since new writes were redirected there while we were paused 4072 * or the original location may have been partially overwritten with 4073 * reflowed data. 4074 */ 4075 if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) { 4076 VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size); 4077 /* 4078 * Read from scratch space. 4079 */ 4080 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4081 for (int i = 0; i < raidvd->vdev_children; i++) { 4082 /* 4083 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE 4084 * to the offset to calculate the physical offset to 4085 * write to. Passing in a negative offset makes us 4086 * access the scratch area. 4087 */ 4088 zio_nowait(zio_vdev_child_io(pio, NULL, 4089 raidvd->vdev_child[i], 4090 VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], 4091 write_size, ZIO_TYPE_READ, ZIO_PRIORITY_ASYNC_READ, 4092 ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); 4093 } 4094 error = zio_wait(pio); 4095 if (error != 0) { 4096 zfs_dbgmsg("reflow: error %d reading scratch location", 4097 error); 4098 goto io_error_exit; 4099 } 4100 goto overwrite; 4101 } 4102 4103 /* 4104 * Read from original location. 
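 * As a rough worked example (illustrative numbers, and assuming a
 * boot area of 3.5 MiB): expanding a 4-wide raidz to 5 children with
 * ashift=12 gives write_size = 3.5 MiB, logical_size = 5 * 3.5 MiB =
 * 17.5 MiB, and read_size = roundup(17.5 MiB / 4, 4 KiB) = 4.375 MiB
 * read from each of the 4 original children here.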
4105 */ 4106 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4107 for (int i = 0; i < raidvd->vdev_children - 1; i++) { 4108 ASSERT0(vdev_is_dead(raidvd->vdev_child[i])); 4109 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4110 0, abds[i], read_size, ZIO_TYPE_READ, 4111 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, 4112 raidz_scratch_child_done, pio)); 4113 } 4114 error = zio_wait(pio); 4115 if (error != 0) { 4116 zfs_dbgmsg("reflow: error %d reading original location", error); 4117 io_error_exit: 4118 for (int i = 0; i < raidvd->vdev_children; i++) 4119 abd_free(abds[i]); 4120 kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); 4121 zfs_rangelock_exit(lr); 4122 spa_config_exit(spa, SCL_STATE, FTAG); 4123 return; 4124 } 4125 4126 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2); 4127 4128 /* 4129 * Reflow in memory. 4130 */ 4131 uint64_t logical_sectors = logical_size >> ashift; 4132 for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) { 4133 int oldchild = i % (raidvd->vdev_children - 1); 4134 uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift; 4135 4136 int newchild = i % raidvd->vdev_children; 4137 uint64_t newoff = (i / raidvd->vdev_children) << ashift; 4138 4139 /* a single sector should not be copying over itself */ 4140 ASSERT(!(newchild == oldchild && newoff == oldoff)); 4141 4142 abd_copy_off(abds[newchild], abds[oldchild], 4143 newoff, oldoff, 1 << ashift); 4144 } 4145 4146 /* 4147 * Verify that we filled in everything we intended to (write_size on 4148 * each child). 4149 */ 4150 VERIFY0(logical_sectors % raidvd->vdev_children); 4151 VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==, 4152 write_size); 4153 4154 /* 4155 * Write to scratch location (boot area). 4156 */ 4157 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4158 for (int i = 0; i < raidvd->vdev_children; i++) { 4159 /* 4160 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to 4161 * the offset to calculate the physical offset to write to. 4162 * Passing in a negative offset lets us access the boot area. 4163 */ 4164 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4165 VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], 4166 write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, 4167 ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); 4168 } 4169 error = zio_wait(pio); 4170 if (error != 0) { 4171 zfs_dbgmsg("reflow: error %d writing scratch location", error); 4172 goto io_error_exit; 4173 } 4174 pio = zio_root(spa, NULL, NULL, 0); 4175 zio_flush(pio, raidvd); 4176 zio_wait(pio); 4177 4178 zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area", 4179 (long long)logical_size); 4180 4181 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3); 4182 4183 /* 4184 * Update uberblock to indicate that scratch space is valid. This is 4185 * needed because after this point, the real location may be 4186 * overwritten. If we crash, we need to get the data from the 4187 * scratch space, rather than the real location. 4188 * 4189 * Note: ub_timestamp is bumped so that vdev_uberblock_compare() 4190 * will prefer this uberblock. 
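 * If we do crash after this point, a subsequent import can see
 * RRSS_SCRATCH_VALID in the uberblock and finish the copy from the
 * scratch area back over the beginning of the vdev (see
 * vdev_raidz_reflow_copy_scratch() below) before the reflow
 * continues.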
4191 */ 4192 RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size); 4193 spa->spa_ubsync.ub_timestamp++; 4194 ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, 4195 &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); 4196 if (spa_multihost(spa)) 4197 mmp_update_uberblock(spa, &spa->spa_ubsync); 4198 4199 zfs_dbgmsg("reflow: uberblock updated " 4200 "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)", 4201 (long long)spa->spa_ubsync.ub_txg, 4202 (long long)logical_size, 4203 (long long)spa->spa_ubsync.ub_timestamp); 4204 4205 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID); 4206 4207 /* 4208 * Overwrite with reflow'ed data. 4209 */ 4210 overwrite: 4211 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4212 for (int i = 0; i < raidvd->vdev_children; i++) { 4213 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4214 0, abds[i], write_size, ZIO_TYPE_WRITE, 4215 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, 4216 raidz_scratch_child_done, pio)); 4217 } 4218 error = zio_wait(pio); 4219 if (error != 0) { 4220 /* 4221 * When we exit early here and drop the range lock, new 4222 * writes will go into the scratch area so we'll need to 4223 * read from there when we return after pausing. 4224 */ 4225 zfs_dbgmsg("reflow: error %d writing real location", error); 4226 /* 4227 * Update the uberblock that is written when this txg completes. 4228 */ 4229 RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID, 4230 logical_size); 4231 goto io_error_exit; 4232 } 4233 pio = zio_root(spa, NULL, NULL, 0); 4234 zio_flush(pio, raidvd); 4235 zio_wait(pio); 4236 4237 zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location", 4238 (long long)logical_size); 4239 for (int i = 0; i < raidvd->vdev_children; i++) 4240 abd_free(abds[i]); 4241 kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); 4242 4243 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED); 4244 4245 /* 4246 * Update uberblock to indicate that the initial part has been 4247 * reflow'ed. This is needed because after this point (when we exit 4248 * the rangelock), we allow regular writes to this region, which will 4249 * be written to the new location only (because reflow_offset_next == 4250 * reflow_offset_synced). If we crashed and re-copied from the 4251 * scratch space, we would lose the regular writes. 4252 */ 4253 RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED, 4254 logical_size); 4255 spa->spa_ubsync.ub_timestamp++; 4256 ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, 4257 &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); 4258 if (spa_multihost(spa)) 4259 mmp_update_uberblock(spa, &spa->spa_ubsync); 4260 4261 zfs_dbgmsg("reflow: uberblock updated " 4262 "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", 4263 (long long)spa->spa_ubsync.ub_txg, 4264 (long long)logical_size, 4265 (long long)spa->spa_ubsync.ub_timestamp); 4266 4267 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1); 4268 4269 /* 4270 * Update progress. 
4271 */ 4272 vre->vre_offset = logical_size; 4273 zfs_rangelock_exit(lr); 4274 spa_config_exit(spa, SCL_STATE, FTAG); 4275 4276 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 4277 vre->vre_offset_pertxg[txgoff] = vre->vre_offset; 4278 vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; 4279 /* 4280 * Note - raidz_reflow_sync() will update the uberblock state to 4281 * RRSS_SCRATCH_INVALID_SYNCED_REFLOW 4282 */ 4283 raidz_reflow_sync(spa, tx); 4284 4285 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2); 4286 } 4287 4288 /* 4289 * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work 4290 * here. No other i/o can be in progress, so we don't need the vre_rangelock. 4291 */ 4292 void 4293 vdev_raidz_reflow_copy_scratch(spa_t *spa) 4294 { 4295 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4296 uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock); 4297 ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID); 4298 4299 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4300 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4301 ASSERT0(logical_size % raidvd->vdev_children); 4302 uint64_t write_size = logical_size / raidvd->vdev_children; 4303 4304 zio_t *pio; 4305 4306 /* 4307 * Read from scratch space. 4308 */ 4309 abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), 4310 KM_SLEEP); 4311 for (int i = 0; i < raidvd->vdev_children; i++) { 4312 abds[i] = abd_alloc_linear(write_size, B_FALSE); 4313 } 4314 4315 pio = zio_root(spa, NULL, NULL, 0); 4316 for (int i = 0; i < raidvd->vdev_children; i++) { 4317 /* 4318 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to 4319 * the offset to calculate the physical offset to write to. 4320 * Passing in a negative offset lets us access the boot area. 4321 */ 4322 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4323 VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], 4324 write_size, ZIO_TYPE_READ, 4325 ZIO_PRIORITY_ASYNC_READ, 0, 4326 raidz_scratch_child_done, pio)); 4327 } 4328 zio_wait(pio); 4329 4330 /* 4331 * Overwrite real location with reflow'ed data. 4332 */ 4333 pio = zio_root(spa, NULL, NULL, 0); 4334 for (int i = 0; i < raidvd->vdev_children; i++) { 4335 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4336 0, abds[i], write_size, ZIO_TYPE_WRITE, 4337 ZIO_PRIORITY_ASYNC_WRITE, 0, 4338 raidz_scratch_child_done, pio)); 4339 } 4340 zio_wait(pio); 4341 pio = zio_root(spa, NULL, NULL, 0); 4342 zio_flush(pio, raidvd); 4343 zio_wait(pio); 4344 4345 zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) " 4346 "to real location", (long long)logical_size); 4347 4348 for (int i = 0; i < raidvd->vdev_children; i++) 4349 abd_free(abds[i]); 4350 kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); 4351 4352 /* 4353 * Update uberblock. 
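 * Until this uberblock lands, the on-disk state remains
 * RRSS_SCRATCH_VALID, so if we crash yet again the copy above is
 * simply repeated on the next import. The scratch area itself is
 * never modified here, which is what makes that retry safe.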
4354 */
4355 RAIDZ_REFLOW_SET(&spa->spa_ubsync,
4356 RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size);
4357 spa->spa_ubsync.ub_timestamp++;
4358 VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4359 &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4360 if (spa_multihost(spa))
4361 mmp_update_uberblock(spa, &spa->spa_ubsync);
4362
4363 zfs_dbgmsg("reflow recovery: uberblock updated "
4364 "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
4365 (long long)spa->spa_ubsync.ub_txg,
4366 (long long)logical_size,
4367 (long long)spa->spa_ubsync.ub_timestamp);
4368
4369 dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool,
4370 spa_first_txg(spa));
4371 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4372 vre->vre_offset = logical_size;
4373 vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
4374 vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
4375 /*
4376 * Note that raidz_reflow_sync() will update the uberblock once more
4377 */
4378 raidz_reflow_sync(spa, tx);
4379
4380 dmu_tx_commit(tx);
4381
4382 spa_config_exit(spa, SCL_STATE, FTAG);
4383 }
4384
4385 static boolean_t
4386 spa_raidz_expand_thread_check(void *arg, zthr_t *zthr)
4387 {
4388 (void) zthr;
4389 spa_t *spa = arg;
4390
4391 return (spa->spa_raidz_expand != NULL &&
4392 !spa->spa_raidz_expand->vre_waiting_for_resilver);
4393 }
4394
4395 /*
4396 * RAIDZ expansion background thread
4397 *
4398 * Can be called multiple times if the reflow is paused
4399 */
4400 static void
4401 spa_raidz_expand_thread(void *arg, zthr_t *zthr)
4402 {
4403 spa_t *spa = arg;
4404 vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4405
4406 if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID)
4407 vre->vre_offset = 0;
4408 else
4409 vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync);
4410
4411 /* Reflow the beginning portion using the scratch area */
4412 if (vre->vre_offset == 0) {
4413 VERIFY0(dsl_sync_task(spa_name(spa),
4414 NULL, raidz_reflow_scratch_sync,
4415 vre, 0, ZFS_SPACE_CHECK_NONE));
4416
4417 /* if we encountered errors then pause */
4418 if (vre->vre_offset == 0) {
4419 mutex_enter(&vre->vre_lock);
4420 vre->vre_waiting_for_resilver = B_TRUE;
4421 mutex_exit(&vre->vre_lock);
4422 return;
4423 }
4424 }
4425
4426 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4427 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4428
4429 uint64_t guid = raidvd->vdev_guid;
4430
4431 /* Iterate over all the remaining metaslabs */
4432 for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift;
4433 i < raidvd->vdev_ms_count &&
4434 !zthr_iscancelled(zthr) &&
4435 vre->vre_failed_offset == UINT64_MAX; i++) {
4436 metaslab_t *msp = raidvd->vdev_ms[i];
4437
4438 metaslab_disable(msp);
4439 mutex_enter(&msp->ms_lock);
4440
4441 /*
4442 * The metaslab may be newly created (for the expanded
4443 * space), in which case its trees won't exist yet,
4444 * so we need to bail out early.
4445 */
4446 if (msp->ms_new) {
4447 mutex_exit(&msp->ms_lock);
4448 metaslab_enable(msp, B_FALSE, B_FALSE);
4449 continue;
4450 }
4451
4452 VERIFY0(metaslab_load(msp));
4453
4454 /*
4455 * We want to copy everything except the free (allocatable)
4456 * space. Note that there may be a little bit more free
4457 * space (e.g. in ms_defer), and it's fine to copy that too.
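 * Concretely, the range tree built below starts out as the whole
 * metaslab, [ms_start, ms_start + ms_size), and each segment of
 * ms_allocatable is then removed from it, leaving just the allocated
 * space that needs to be reflowed.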
4458 */
4459 range_tree_t *rt = range_tree_create(NULL, RANGE_SEG64,
4460 NULL, 0, 0);
4461 range_tree_add(rt, msp->ms_start, msp->ms_size);
4462 range_tree_walk(msp->ms_allocatable, range_tree_remove, rt);
4463 mutex_exit(&msp->ms_lock);
4464
4465 /*
4466 * Force the last sector of each metaslab to be copied. This
4467 * ensures that we advance the on-disk progress to the end of
4468 * this metaslab while the metaslab is disabled. Otherwise, we
4469 * could move past this metaslab without advancing the on-disk
4470 * progress, and then an allocation to this metaslab would not
4471 * be copied.
4472 */
4473 int sectorsz = 1 << raidvd->vdev_ashift;
4474 uint64_t ms_last_offset = msp->ms_start +
4475 msp->ms_size - sectorsz;
4476 if (!range_tree_contains(rt, ms_last_offset, sectorsz)) {
4477 range_tree_add(rt, ms_last_offset, sectorsz);
4478 }
4479
4480 /*
4481 * When we are resuming from a paused expansion (i.e.
4482 * when importing a pool with an expansion in progress),
4483 * discard any state that we have already processed.
4484 */
4485 range_tree_clear(rt, 0, vre->vre_offset);
4486
4487 while (!zthr_iscancelled(zthr) &&
4488 !range_tree_is_empty(rt) &&
4489 vre->vre_failed_offset == UINT64_MAX) {
4490
4491 /*
4492 * We need to periodically drop the config lock so that
4493 * writers can get in. Additionally, we can't wait
4494 * for a txg to sync while holding a config lock
4495 * (since a waiting writer could cause a 3-way deadlock
4496 * with the sync thread, which also gets a config
4497 * lock for reader). So we can't hold the config lock
4498 * while calling dmu_tx_assign().
4499 */
4500 spa_config_exit(spa, SCL_CONFIG, FTAG);
4501
4502 /*
4503 * If requested, pause the reflow when the amount
4504 * specified by raidz_expand_max_reflow_bytes is reached.
4505 *
4506 * This pause is only used during testing or debugging.
4507 */
4508 while (raidz_expand_max_reflow_bytes != 0 &&
4509 raidz_expand_max_reflow_bytes <=
4510 vre->vre_bytes_copied && !zthr_iscancelled(zthr)) {
4511 delay(hz);
4512 }
4513
4514 mutex_enter(&vre->vre_lock);
4515 while (vre->vre_outstanding_bytes >
4516 raidz_expand_max_copy_bytes) {
4517 cv_wait(&vre->vre_cv, &vre->vre_lock);
4518 }
4519 mutex_exit(&vre->vre_lock);
4520
4521 dmu_tx_t *tx =
4522 dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
4523
4524 VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
4525 uint64_t txg = dmu_tx_get_txg(tx);
4526
4527 /*
4528 * Reacquire the vdev_config lock. Theoretically, the
4529 * vdev_t that we're expanding may have changed.
4530 */
4531 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4532 raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4533
4534 boolean_t needsync =
4535 raidz_reflow_impl(raidvd, vre, rt, tx);
4536
4537 dmu_tx_commit(tx);
4538
4539 if (needsync) {
4540 spa_config_exit(spa, SCL_CONFIG, FTAG);
4541 txg_wait_synced(spa->spa_dsl_pool, txg);
4542 spa_config_enter(spa, SCL_CONFIG, FTAG,
4543 RW_READER);
4544 }
4545 }
4546
4547 spa_config_exit(spa, SCL_CONFIG, FTAG);
4548
4549 metaslab_enable(msp, B_FALSE, B_FALSE);
4550 range_tree_vacate(rt, NULL, NULL);
4551 range_tree_destroy(rt);
4552
4553 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4554 raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4555 }
4556
4557 spa_config_exit(spa, SCL_CONFIG, FTAG);
4558
4559 /*
4560 * The txg_wait_synced() here ensures that all reflow zio's have
4561 * completed, and vre_failed_offset has been set if necessary.
It 4562 * also ensures that the progress of the last raidz_reflow_sync() is 4563 * written to disk before raidz_reflow_complete_sync() changes the 4564 * in-memory vre_state. vdev_raidz_io_start() uses vre_state to 4565 * determine if a reflow is in progress, in which case we may need to 4566 * write to both old and new locations. Therefore we can only change 4567 * vre_state once this is not necessary, which is once the on-disk 4568 * progress (in spa_ubsync) has been set past any possible writes (to 4569 * the end of the last metaslab). 4570 */ 4571 txg_wait_synced(spa->spa_dsl_pool, 0); 4572 4573 if (!zthr_iscancelled(zthr) && 4574 vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) { 4575 /* 4576 * We are not being canceled or paused, so the reflow must be 4577 * complete. In that case also mark it as completed on disk. 4578 */ 4579 ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX); 4580 VERIFY0(dsl_sync_task(spa_name(spa), NULL, 4581 raidz_reflow_complete_sync, spa, 4582 0, ZFS_SPACE_CHECK_NONE)); 4583 (void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL); 4584 } else { 4585 /* 4586 * Wait for all copy zio's to complete and for all the 4587 * raidz_reflow_sync() synctasks to be run. 4588 */ 4589 spa_history_log_internal(spa, "reflow pause", 4590 NULL, "offset=%llu failed_offset=%lld", 4591 (long long)vre->vre_offset, 4592 (long long)vre->vre_failed_offset); 4593 mutex_enter(&vre->vre_lock); 4594 if (vre->vre_failed_offset != UINT64_MAX) { 4595 /* 4596 * Reset progress so that we will retry everything 4597 * after the point that something failed. 4598 */ 4599 vre->vre_offset = vre->vre_failed_offset; 4600 vre->vre_failed_offset = UINT64_MAX; 4601 vre->vre_waiting_for_resilver = B_TRUE; 4602 } 4603 mutex_exit(&vre->vre_lock); 4604 } 4605 } 4606 4607 void 4608 spa_start_raidz_expansion_thread(spa_t *spa) 4609 { 4610 ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL); 4611 spa->spa_raidz_expand_zthr = zthr_create("raidz_expand", 4612 spa_raidz_expand_thread_check, spa_raidz_expand_thread, 4613 spa, defclsyspri); 4614 } 4615 4616 void 4617 raidz_dtl_reassessed(vdev_t *vd) 4618 { 4619 spa_t *spa = vd->vdev_spa; 4620 if (spa->spa_raidz_expand != NULL) { 4621 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4622 /* 4623 * we get called often from vdev_dtl_reassess() so make 4624 * sure it's our vdev and any replacing is complete 4625 */ 4626 if (vd->vdev_top->vdev_id == vre->vre_vdev_id && 4627 !vdev_raidz_expand_child_replacing(vd->vdev_top)) { 4628 mutex_enter(&vre->vre_lock); 4629 if (vre->vre_waiting_for_resilver) { 4630 vdev_dbgmsg(vd, "DTL reassessed, " 4631 "continuing raidz expansion"); 4632 vre->vre_waiting_for_resilver = B_FALSE; 4633 zthr_wakeup(spa->spa_raidz_expand_zthr); 4634 } 4635 mutex_exit(&vre->vre_lock); 4636 } 4637 } 4638 } 4639 4640 int 4641 vdev_raidz_attach_check(vdev_t *new_child) 4642 { 4643 vdev_t *raidvd = new_child->vdev_parent; 4644 uint64_t new_children = raidvd->vdev_children; 4645 4646 /* 4647 * We use the "boot" space as scratch space to handle overwriting the 4648 * initial part of the vdev. If it is too small, then this expansion 4649 * is not allowed. This would be very unusual (e.g. ashift > 13 and 4650 * >200 children). 
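 * For example, assuming the boot area is 3.5 MiB: with ashift=14
 * (16 KiB sectors) the check below allows at most
 * 3.5 MiB / 16 KiB = 224 children, and with ashift=12 it allows 896.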
4651 */ 4652 if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) { 4653 return (EINVAL); 4654 } 4655 return (0); 4656 } 4657 4658 void 4659 vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx) 4660 { 4661 vdev_t *new_child = arg; 4662 spa_t *spa = new_child->vdev_spa; 4663 vdev_t *raidvd = new_child->vdev_parent; 4664 vdev_raidz_t *vdrz = raidvd->vdev_tsd; 4665 ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops); 4666 ASSERT3P(raidvd->vdev_top, ==, raidvd); 4667 ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width); 4668 ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1); 4669 ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==, 4670 new_child); 4671 4672 spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx); 4673 4674 vdrz->vd_physical_width++; 4675 4676 VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info); 4677 vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id; 4678 vdrz->vn_vre.vre_offset = 0; 4679 vdrz->vn_vre.vre_failed_offset = UINT64_MAX; 4680 spa->spa_raidz_expand = &vdrz->vn_vre; 4681 zthr_wakeup(spa->spa_raidz_expand_zthr); 4682 4683 /* 4684 * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get 4685 * written to the config. 4686 */ 4687 vdev_config_dirty(raidvd); 4688 4689 vdrz->vn_vre.vre_start_time = gethrestime_sec(); 4690 vdrz->vn_vre.vre_end_time = 0; 4691 vdrz->vn_vre.vre_state = DSS_SCANNING; 4692 vdrz->vn_vre.vre_bytes_copied = 0; 4693 4694 uint64_t state = vdrz->vn_vre.vre_state; 4695 VERIFY0(zap_update(spa->spa_meta_objset, 4696 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, 4697 sizeof (state), 1, &state, tx)); 4698 4699 uint64_t start_time = vdrz->vn_vre.vre_start_time; 4700 VERIFY0(zap_update(spa->spa_meta_objset, 4701 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, 4702 sizeof (start_time), 1, &start_time, tx)); 4703 4704 (void) zap_remove(spa->spa_meta_objset, 4705 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx); 4706 (void) zap_remove(spa->spa_meta_objset, 4707 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx); 4708 4709 spa_history_log_internal(spa, "raidz vdev expansion started", tx, 4710 "%s vdev %llu new width %llu", spa_name(spa), 4711 (unsigned long long)raidvd->vdev_id, 4712 (unsigned long long)raidvd->vdev_children); 4713 } 4714 4715 int 4716 vdev_raidz_load(vdev_t *vd) 4717 { 4718 vdev_raidz_t *vdrz = vd->vdev_tsd; 4719 int err; 4720 4721 uint64_t state = DSS_NONE; 4722 uint64_t start_time = 0; 4723 uint64_t end_time = 0; 4724 uint64_t bytes_copied = 0; 4725 4726 if (vd->vdev_top_zap != 0) { 4727 err = zap_lookup(vd->vdev_spa->spa_meta_objset, 4728 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, 4729 sizeof (state), 1, &state); 4730 if (err != 0 && err != ENOENT) 4731 return (err); 4732 4733 err = zap_lookup(vd->vdev_spa->spa_meta_objset, 4734 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, 4735 sizeof (start_time), 1, &start_time); 4736 if (err != 0 && err != ENOENT) 4737 return (err); 4738 4739 err = zap_lookup(vd->vdev_spa->spa_meta_objset, 4740 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, 4741 sizeof (end_time), 1, &end_time); 4742 if (err != 0 && err != ENOENT) 4743 return (err); 4744 4745 err = zap_lookup(vd->vdev_spa->spa_meta_objset, 4746 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, 4747 sizeof (bytes_copied), 1, &bytes_copied); 4748 if (err != 0 && err != ENOENT) 4749 return (err); 4750 } 4751 4752 /* 4753 * If we are in the middle of expansion, vre_state should have 4754 * already been set by vdev_raidz_init(). 
4755 */ 4756 EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING); 4757 vdrz->vn_vre.vre_state = (dsl_scan_state_t)state; 4758 vdrz->vn_vre.vre_start_time = start_time; 4759 vdrz->vn_vre.vre_end_time = end_time; 4760 vdrz->vn_vre.vre_bytes_copied = bytes_copied; 4761 4762 return (0); 4763 } 4764 4765 int 4766 spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres) 4767 { 4768 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4769 4770 if (vre == NULL) { 4771 /* no removal in progress; find most recent completed */ 4772 for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { 4773 vdev_t *vd = spa->spa_root_vdev->vdev_child[c]; 4774 if (vd->vdev_ops == &vdev_raidz_ops) { 4775 vdev_raidz_t *vdrz = vd->vdev_tsd; 4776 4777 if (vdrz->vn_vre.vre_end_time != 0 && 4778 (vre == NULL || 4779 vdrz->vn_vre.vre_end_time > 4780 vre->vre_end_time)) { 4781 vre = &vdrz->vn_vre; 4782 } 4783 } 4784 } 4785 } 4786 4787 if (vre == NULL) { 4788 return (SET_ERROR(ENOENT)); 4789 } 4790 4791 pres->pres_state = vre->vre_state; 4792 pres->pres_expanding_vdev = vre->vre_vdev_id; 4793 4794 vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); 4795 pres->pres_to_reflow = vd->vdev_stat.vs_alloc; 4796 4797 mutex_enter(&vre->vre_lock); 4798 pres->pres_reflowed = vre->vre_bytes_copied; 4799 for (int i = 0; i < TXG_SIZE; i++) 4800 pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i]; 4801 mutex_exit(&vre->vre_lock); 4802 4803 pres->pres_start_time = vre->vre_start_time; 4804 pres->pres_end_time = vre->vre_end_time; 4805 pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver; 4806 4807 return (0); 4808 } 4809 4810 /* 4811 * Initialize private RAIDZ specific fields from the nvlist. 4812 */ 4813 static int 4814 vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) 4815 { 4816 uint_t children; 4817 nvlist_t **child; 4818 int error = nvlist_lookup_nvlist_array(nv, 4819 ZPOOL_CONFIG_CHILDREN, &child, &children); 4820 if (error != 0) 4821 return (SET_ERROR(EINVAL)); 4822 4823 uint64_t nparity; 4824 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { 4825 if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) 4826 return (SET_ERROR(EINVAL)); 4827 4828 /* 4829 * Previous versions could only support 1 or 2 parity 4830 * device. 4831 */ 4832 if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2) 4833 return (SET_ERROR(EINVAL)); 4834 else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3) 4835 return (SET_ERROR(EINVAL)); 4836 } else { 4837 /* 4838 * We require the parity to be specified for SPAs that 4839 * support multiple parity levels. 4840 */ 4841 if (spa_version(spa) >= SPA_VERSION_RAIDZ2) 4842 return (SET_ERROR(EINVAL)); 4843 4844 /* 4845 * Otherwise, we default to 1 parity device for RAID-Z. 
4846 */ 4847 nparity = 1; 4848 } 4849 4850 vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP); 4851 vdrz->vn_vre.vre_vdev_id = -1; 4852 vdrz->vn_vre.vre_offset = UINT64_MAX; 4853 vdrz->vn_vre.vre_failed_offset = UINT64_MAX; 4854 mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL); 4855 cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL); 4856 zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL); 4857 mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL); 4858 avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare, 4859 sizeof (reflow_node_t), offsetof(reflow_node_t, re_link)); 4860 4861 vdrz->vd_physical_width = children; 4862 vdrz->vd_nparity = nparity; 4863 4864 /* note, the ID does not exist when creating a pool */ 4865 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, 4866 &vdrz->vn_vre.vre_vdev_id); 4867 4868 boolean_t reflow_in_progress = 4869 nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); 4870 if (reflow_in_progress) { 4871 spa->spa_raidz_expand = &vdrz->vn_vre; 4872 vdrz->vn_vre.vre_state = DSS_SCANNING; 4873 } 4874 4875 vdrz->vd_original_width = children; 4876 uint64_t *txgs; 4877 unsigned int txgs_size = 0; 4878 error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, 4879 &txgs, &txgs_size); 4880 if (error == 0) { 4881 for (int i = 0; i < txgs_size; i++) { 4882 reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); 4883 re->re_txg = txgs[txgs_size - i - 1]; 4884 re->re_logical_width = vdrz->vd_physical_width - i; 4885 4886 if (reflow_in_progress) 4887 re->re_logical_width--; 4888 4889 avl_add(&vdrz->vd_expand_txgs, re); 4890 } 4891 4892 vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size; 4893 } 4894 if (reflow_in_progress) { 4895 vdrz->vd_original_width--; 4896 zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions", 4897 children, txgs_size); 4898 } 4899 4900 *tsd = vdrz; 4901 4902 return (0); 4903 } 4904 4905 static void 4906 vdev_raidz_fini(vdev_t *vd) 4907 { 4908 vdev_raidz_t *vdrz = vd->vdev_tsd; 4909 if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre) 4910 vd->vdev_spa->spa_raidz_expand = NULL; 4911 reflow_node_t *re; 4912 void *cookie = NULL; 4913 avl_tree_t *tree = &vdrz->vd_expand_txgs; 4914 while ((re = avl_destroy_nodes(tree, &cookie)) != NULL) 4915 kmem_free(re, sizeof (*re)); 4916 avl_destroy(&vdrz->vd_expand_txgs); 4917 mutex_destroy(&vdrz->vd_expand_lock); 4918 mutex_destroy(&vdrz->vn_vre.vre_lock); 4919 cv_destroy(&vdrz->vn_vre.vre_cv); 4920 zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock); 4921 kmem_free(vdrz, sizeof (*vdrz)); 4922 } 4923 4924 /* 4925 * Add RAIDZ specific fields to the config nvlist. 4926 */ 4927 static void 4928 vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) 4929 { 4930 ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops); 4931 vdev_raidz_t *vdrz = vd->vdev_tsd; 4932 4933 /* 4934 * Make sure someone hasn't managed to sneak a fancy new vdev 4935 * into a crufty old storage pool. 4936 */ 4937 ASSERT(vdrz->vd_nparity == 1 || 4938 (vdrz->vd_nparity <= 2 && 4939 spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) || 4940 (vdrz->vd_nparity <= 3 && 4941 spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3)); 4942 4943 /* 4944 * Note that we'll add these even on storage pools where they 4945 * aren't strictly required -- older software will just ignore 4946 * it. 
4947 */ 4948 fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity); 4949 4950 if (vdrz->vn_vre.vre_state == DSS_SCANNING) { 4951 fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); 4952 } 4953 4954 mutex_enter(&vdrz->vd_expand_lock); 4955 if (!avl_is_empty(&vdrz->vd_expand_txgs)) { 4956 uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs); 4957 uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count, 4958 KM_SLEEP); 4959 uint64_t i = 0; 4960 4961 for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs); 4962 re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) { 4963 txgs[i++] = re->re_txg; 4964 } 4965 4966 fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, 4967 txgs, count); 4968 4969 kmem_free(txgs, sizeof (uint64_t) * count); 4970 } 4971 mutex_exit(&vdrz->vd_expand_lock); 4972 } 4973 4974 static uint64_t 4975 vdev_raidz_nparity(vdev_t *vd) 4976 { 4977 vdev_raidz_t *vdrz = vd->vdev_tsd; 4978 return (vdrz->vd_nparity); 4979 } 4980 4981 static uint64_t 4982 vdev_raidz_ndisks(vdev_t *vd) 4983 { 4984 return (vd->vdev_children); 4985 } 4986 4987 vdev_ops_t vdev_raidz_ops = { 4988 .vdev_op_init = vdev_raidz_init, 4989 .vdev_op_fini = vdev_raidz_fini, 4990 .vdev_op_open = vdev_raidz_open, 4991 .vdev_op_close = vdev_raidz_close, 4992 .vdev_op_asize = vdev_raidz_asize, 4993 .vdev_op_min_asize = vdev_raidz_min_asize, 4994 .vdev_op_min_alloc = NULL, 4995 .vdev_op_io_start = vdev_raidz_io_start, 4996 .vdev_op_io_done = vdev_raidz_io_done, 4997 .vdev_op_state_change = vdev_raidz_state_change, 4998 .vdev_op_need_resilver = vdev_raidz_need_resilver, 4999 .vdev_op_hold = NULL, 5000 .vdev_op_rele = NULL, 5001 .vdev_op_remap = NULL, 5002 .vdev_op_xlate = vdev_raidz_xlate, 5003 .vdev_op_rebuild_asize = NULL, 5004 .vdev_op_metaslab_init = NULL, 5005 .vdev_op_config_generate = vdev_raidz_config_generate, 5006 .vdev_op_nparity = vdev_raidz_nparity, 5007 .vdev_op_ndisks = vdev_raidz_ndisks, 5008 .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */ 5009 .vdev_op_leaf = B_FALSE /* not a leaf vdev */ 5010 }; 5011 5012 /* BEGIN CSTYLED */ 5013 ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW, 5014 "For testing, pause RAIDZ expansion after reflowing this many bytes"); 5015 ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW, 5016 "Max amount of concurrent i/o for RAIDZ expansion"); 5017 ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW, 5018 "For expanded RAIDZ, aggregate reads that have more rows than this"); 5019 ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW, 5020 "For expanded RAIDZ, automatically start a pool scrub when expansion " 5021 "completes"); 5022 /* END CSTYLED */ 5023