1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 23 /* 24 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 25 * Copyright (c) 2012, 2020 by Delphix. All rights reserved. 26 * Copyright (c) 2016 Gvozden Nešković. All rights reserved. 27 * Copyright (c) 2025, Klara, Inc. 28 */ 29 30 #include <sys/zfs_context.h> 31 #include <sys/spa.h> 32 #include <sys/spa_impl.h> 33 #include <sys/zap.h> 34 #include <sys/vdev_impl.h> 35 #include <sys/metaslab_impl.h> 36 #include <sys/zio.h> 37 #include <sys/zio_checksum.h> 38 #include <sys/dmu_tx.h> 39 #include <sys/abd.h> 40 #include <sys/zfs_rlock.h> 41 #include <sys/fs/zfs.h> 42 #include <sys/fm/fs/zfs.h> 43 #include <sys/vdev_raidz.h> 44 #include <sys/vdev_raidz_impl.h> 45 #include <sys/vdev_draid.h> 46 #include <sys/uberblock_impl.h> 47 #include <sys/dsl_scan.h> 48 49 #ifdef ZFS_DEBUG 50 #include <sys/vdev.h> /* For vdev_xlate() in vdev_raidz_io_verify() */ 51 #endif 52 53 /* 54 * Virtual device vector for RAID-Z. 55 * 56 * This vdev supports single, double, and triple parity. For single parity, 57 * we use a simple XOR of all the data columns. For double or triple parity, 58 * we use a special case of Reed-Solomon coding. This extends the 59 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by 60 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for 61 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the 62 * former is also based. The latter is designed to provide higher performance 63 * for writes. 64 * 65 * Note that the Plank paper claimed to support arbitrary N+M, but was then 66 * amended six years later identifying a critical flaw that invalidates its 67 * claims. Nevertheless, the technique can be adapted to work for up to 68 * triple parity. For additional parity, the amendment "Note: Correction to 69 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding 70 * is viable, but the additional complexity means that write performance will 71 * suffer. 72 * 73 * All of the methods above operate on a Galois field, defined over the 74 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements 75 * can be expressed with a single byte.
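 * (Equivalently, GF(2^8) is the field with 256 elements; its 255 non-zero
 * elements form a cyclic multiplicative group, so exponents of the generator
 * are taken modulo 255 throughout the discussion below.)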
Briefly, the operations on the 76 * field are defined as follows: 77 * 78 * o addition (+) is represented by a bitwise XOR 79 * o subtraction (-) is therefore identical to addition: A + B = A - B 80 * o multiplication of A by 2 is defined by the following bitwise expression: 81 * 82 * (A * 2)_7 = A_6 83 * (A * 2)_6 = A_5 84 * (A * 2)_5 = A_4 85 * (A * 2)_4 = A_3 + A_7 86 * (A * 2)_3 = A_2 + A_7 87 * (A * 2)_2 = A_1 + A_7 88 * (A * 2)_1 = A_0 89 * (A * 2)_0 = A_7 90 * 91 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). 92 * As an aside, this multiplication is derived from the error correcting 93 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. 94 * 95 * Observe that any number in the field (except for 0) can be expressed as a 96 * power of 2 -- a generator for the field. We store a table of the powers of 97 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can 98 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather 99 * than field addition). The inverse of a field element A (A^-1) is therefore 100 * A ^ (255 - 1) = A^254. 101 * 102 * The up-to-three parity columns, P, Q, R, over several data columns, 103 * D_0, ... D_n-1, can be expressed by field operations: 104 * 105 * P = D_0 + D_1 + ... + D_n-2 + D_n-1 106 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 107 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 108 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 109 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 110 * 111 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial 112 * XOR operation, and 2 and 4 can be computed quickly and generate linearly- 113 * independent coefficients. (There are no additional coefficients that have 114 * this property which is why the uncorrected Plank method breaks down.) 115 * 116 * See the reconstruction code below for how P, Q and R can be used individually 117 * or in concert to recover missing data columns. 118 */ 119 120 #define VDEV_RAIDZ_P 0 121 #define VDEV_RAIDZ_Q 1 122 #define VDEV_RAIDZ_R 2 123 124 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) 125 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) 126 127 /* 128 * We provide a mechanism to perform the field multiplication operation on a 129 * 64-bit value all at once rather than a byte at a time. This works by 130 * creating a mask from the top bit in each byte and using that to 131 * conditionally apply the XOR of 0x1d. 132 */ 133 #define VDEV_RAIDZ_64MUL_2(x, mask) \ 134 { \ 135 (mask) = (x) & 0x8080808080808080ULL; \ 136 (mask) = ((mask) << 1) - ((mask) >> 7); \ 137 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ 138 ((mask) & 0x1d1d1d1d1d1d1d1dULL); \ 139 } 140 141 #define VDEV_RAIDZ_64MUL_4(x, mask) \ 142 { \ 143 VDEV_RAIDZ_64MUL_2((x), mask); \ 144 VDEV_RAIDZ_64MUL_2((x), mask); \ 145 } 146 147 148 /* 149 * Big Theory Statement for how a RAIDZ VDEV is expanded 150 * 151 * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion 152 * works with all three RAIDZ parity choices: RAIDZ1, 2, and 3. VDEVs 153 * that have been previously expanded can be expanded again. 154 * 155 * The RAIDZ VDEV must be healthy (must be able to write to all the drives in 156 * the VDEV) when an expansion starts. The expansion will pause if any 157 * disk in the VDEV fails, and resume once the VDEV is healthy again.
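 *
 * For example (illustrative only; the pool and device names are made up),
 * an administrator grows a 4-wide raidz1 vdev to 5 disks by attaching the
 * new child to the top-level raidz vdev:
 *
 *	zpool attach tank raidz1-0 sde
 *
 * The reflow described below then runs in the background until complete.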
All other 158 * operations on the pool can continue while an expansion is in progress (e.g. 159 * read/write, snapshot, zpool add, etc.), except for zpool checkpoint, zpool trim, 160 * and zpool initialize, which can't be run during an expansion. Following a 161 * reboot or export/import, the expansion resumes where it left off. 162 * 163 * == Reflowing the Data == 164 * 165 * The expansion involves reflowing (copying) the data from the current set 166 * of disks to spread it across the new set which now has one more disk. This 167 * reflow operation is similar to reflowing text when the column width of a 168 * text editor window is expanded. The text doesn't change but the location of 169 * the text changes to accommodate the new width. An example reflow result for 170 * a 4-wide RAIDZ1 to a 5-wide is shown below. 171 * 172 * Reflow End State 173 * Each letter indicates a parity group (logical stripe) 174 * 175 * Before expansion After Expansion 176 * D1 D2 D3 D4 D1 D2 D3 D4 D5 177 * +------+------+------+------+ +------+------+------+------+------+ 178 * | | | | | | | | | | | 179 * | A | A | A | A | | A | A | A | A | B | 180 * | 1| 2| 3| 4| | 1| 2| 3| 4| 5| 181 * +------+------+------+------+ +------+------+------+------+------+ 182 * | | | | | | | | | | | 183 * | B | B | C | C | | B | C | C | C | C | 184 * | 5| 6| 7| 8| | 6| 7| 8| 9| 10| 185 * +------+------+------+------+ +------+------+------+------+------+ 186 * | | | | | | | | | | | 187 * | C | C | D | D | | D | D | E | E | E | 188 * | 9| 10| 11| 12| | 11| 12| 13| 14| 15| 189 * +------+------+------+------+ +------+------+------+------+------+ 190 * | | | | | | | | | | | 191 * | E | E | E | E | --> | E | F | F | G | G | 192 * | 13| 14| 15| 16| | 16| 17| 18| 19| 20| 193 * +------+------+------+------+ +------+------+------+------+------+ 194 * | | | | | | | | | | | 195 * | F | F | G | G | | G | G | H | H | H | 196 * | 17| 18| 19| 20| | 21| 22| 23| 24| 25| 197 * +------+------+------+------+ +------+------+------+------+------+ 198 * | | | | | | | | | | | 199 * | G | G | H | H | | H | I | I | J | J | 200 * | 21| 22| 23| 24| | 26| 27| 28| 29| 30| 201 * +------+------+------+------+ +------+------+------+------+------+ 202 * | | | | | | | | | | | 203 * | H | H | I | I | | J | J | | | K | 204 * | 25| 26| 27| 28| | 31| 32| 33| 34| 35| 205 * +------+------+------+------+ +------+------+------+------+------+ 206 * 207 * This reflow approach has several advantages. There is no need to read or 208 * modify the block pointers or recompute any block checksums. The reflow 209 * doesn't need to know where the parity sectors reside. We can read and write 210 * data sequentially and the copy can occur in a background thread in open 211 * context. The design also allows for fast discovery of what data to copy. 212 * 213 * The VDEV metaslabs are processed, one at a time, to copy the block data to 214 * have it flow across all the disks. The metaslab is disabled for allocations 215 * during the copy. As an optimization, we only copy the allocated data, which 216 * can be determined by looking at the metaslab range tree. During the copy we 217 * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still 218 * need to be able to survive losing up to nparity disks). This means we 219 * cannot overwrite data during the reflow that would be needed if a disk is 220 * lost.
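 *
 * As a purely illustrative userland sketch (not part of the driver), the old
 * and new locations of each logical sector can be computed with simple
 * modular arithmetic, which makes the overlap hazard described under
 * "Scratch Area" below easy to see. Numbering sectors from 0, for the
 * 4-wide to 5-wide example above:
 *
 *	static void
 *	reflow_mapping_example(void)
 *	{
 *		const int old_w = 4, new_w = 5;
 *
 *		for (int i = 0; i < 8; i++) {
 *			printf("sector %d: old (disk %d, row %d) -> "
 *			    "new (disk %d, row %d)\n", i,
 *			    i % old_w, i / old_w, i % new_w, i / new_w);
 *		}
 *	}
 *
 * Sector 5 (B6 in the diagram) lands on (disk 0, row 1) in the new layout,
 * which is exactly where sector 4 (B5) lives in the old layout -- hence the
 * need to stay one row behind the copy point, or to use the scratch area.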
221 * 222 * After the reflow completes, all newly-written blocks will have the new 223 * layout, i.e., they will have the parity to data ratio implied by the new 224 * number of disks in the RAIDZ group. Even though the reflow copies all of 225 * the allocated space (data and parity), it is only rearranged, not changed. 226 * 227 * This act of reflowing the data has a few implications about blocks 228 * that were written before the reflow completes: 229 * 230 * - Old blocks will still use the same amount of space (i.e., they will have 231 * the parity to data ratio implied by the old number of disks in the RAIDZ 232 * group). 233 * - Reading old blocks will be slightly slower than before the reflow, for 234 * two reasons. First, we will have to read from all disks in the RAIDZ 235 * VDEV, rather than being able to skip the children that contain only 236 * parity of this block (because the data of a single block is now spread 237 * out across all the disks). Second, in most cases there will be an extra 238 * bcopy, needed to rearrange the data back to its original layout in memory. 239 * 240 * == Scratch Area == 241 * 242 * As we copy the block data, we can only progress to the point that writes 243 * will not overlap with blocks whose progress has not yet been recorded on 244 * disk. Since partially-copied rows are always read from the old location, 245 * we need to stop one row before the sector-wise overlap, to prevent any 246 * row-wise overlap. For example, in the diagram above, when we reflow sector 247 * B6 it will overwrite the original location for B5. 248 * 249 * To get around this, a scratch space is used so that we can start copying 250 * without risking data loss by overlapping the row. As an added benefit, it 251 * improves performance at the beginning of the reflow, but that small perf 252 * boost wouldn't be worth the complexity on its own. 253 * 254 * Ideally we want to copy at least 2 * (new_width)^2 so that we have a 255 * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max 256 * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice 257 * the widths will likely be single digits so we can get a substantial chunk 258 * size using only a few MB of scratch per disk. 259 * 260 * The scratch area is persisted to disk, which lets it hold a large amount of reflowed 261 * state. We can always read the partially written stripes when a disk fails or 262 * the copy is interrupted (crash) during the initial copying phase, and it also 263 * lets us get past a small chunk size restriction. At a minimum, the scratch space 264 * must be large enough to get us to the point that one row does not overlap 265 * itself when moved (i.e., new_width^2), but going larger is even better. We 266 * use the 3.5 MiB reserved "boot" space that resides after the ZFS disk labels 267 * as our scratch space to handle overwriting the initial part of the VDEV. 268 * 269 * 0 256K 512K 4M 270 * +------+------+-----------------------+----------------------------- 271 * | VDEV | VDEV | Boot Block (3.5M) | Allocatable space ... 272 * | L0 | L1 | Reserved | (Metaslabs) 273 * +------+------+-----------------------+------------------------------- 274 * Scratch Area 275 * 276 * == Reflow Progress Updates == 277 * After the initial scratch-based reflow, the expansion process works 278 * similarly to device removal. We create a new open context thread which 279 * reflows the data, and periodically kicks off sync tasks to update logical 280 * state.
In this case, state is the committed progress (offset of next data 281 * to copy). We need to persist the completed offset on disk, so that if we 282 * crash we know which format each VDEV offset is in. 283 * 284 * == Time Dependent Geometry == 285 * 286 * In non-expanded RAIDZ, blocks are read from disk in a column-by-column 287 * fashion. For a multi-row block, the second sector is in the first column, 288 * not in the second column. This allows us to issue full reads for each 289 * column directly into the request buffer. The block data is thus laid out 290 * sequentially in a column-by-column fashion. 291 * 292 * For example, in the before-expansion diagram above, one logical block might 293 * be sectors G19-H26. The parity is in G19,H23; and the data is in 294 * G20,H24,G21,H25,G22,H26. 295 * 296 * After a block is reflowed, the sectors that were all in the original data 297 * column can now reside in different columns. When reading from an expanded 298 * VDEV, we need to know the logical stripe width for each block so we can 299 * reconstitute the block's data after the reads are completed. Likewise, 300 * when we perform the combinatorial reconstruction we need to know the 301 * original width so we can retry combinations from the past layouts. 302 * 303 * Time dependent geometry is what we call having blocks with different layouts 304 * (stripe widths) in the same VDEV. This time-dependent geometry uses the 305 * block's birth time (compared with the time each expansion ended) to establish the correct 306 * width for a given block. After an expansion completes, we record the completion time, so 307 * blocks written with a particular width (geometry) can be identified. 308 * 309 * == On Disk Format Changes == 310 * 311 * A new pool feature flag, 'raidz_expansion', whose reference count is the number 312 * of RAIDZ VDEVs that have been expanded. 313 * 314 * The blocks on an expanded RAIDZ VDEV can have different logical stripe widths. 315 * 316 * Since the uberblock can point to arbitrary blocks, which might be on the 317 * expanding RAIDZ and might or might not have been expanded, we need to know 318 * which way a block is laid out before reading it. This info is the next 319 * offset that needs to be reflowed, and we persist that in the uberblock, in 320 * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev label. 321 * After the expansion is complete, we then use the raidz_expand_txgs array 322 * (see below) to determine how to read a block, and the ub_raidz_reflow_info 323 * field is no longer required. 324 * 325 * The uberblock's ub_raidz_reflow_info field also holds the scratch space 326 * state (i.e., active or not), which is also required before reading a block 327 * during the initial phase of reflowing the data. 328 * 329 * The top-level RAIDZ VDEV has two new entries in the nvlist: 330 * 331 * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here 332 * and used after the expansion is complete to 333 * determine how to read a raidz block 334 * 'raidz_expanding' boolean: present during reflow and removed after completion 335 * used during a spa import to resume an unfinished 336 * expansion 337 * 338 * Finally, the VDEV's top zap adds the following informational entries: 339 * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE 340 * VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME 341 * VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME 342 * VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED 343 */ 344 345 /* 346 * For testing only: pause the raidz expansion after reflowing this amount.
(accessed by ZTS and ztest) 348 */ 349 #ifdef _KERNEL 350 static 351 #endif /* _KERNEL */ 352 unsigned long raidz_expand_max_reflow_bytes = 0; 353 354 /* 355 * For testing only: pause the raidz expansion at a certain point. 356 */ 357 uint_t raidz_expand_pause_point = 0; 358 359 /* 360 * How long (in seconds) a slow drive will sit out reads after it has been 361 * detected as an outlier. 362 */ 363 static unsigned long vdev_read_sit_out_secs = 600; 364 /* 365 * How often each RAID-Z and dRAID vdev will check for slow disk outliers. 366 * Increasing this interval will reduce the sensitivity of detection (since all 367 * I/Os since the last check are included in the statistics), but will slow the 368 * response to a disk developing a problem. 369 * 370 * Defaults to once per second; setting extremely small values may cause 371 * negative performance effects. 372 */ 373 static hrtime_t vdev_raidz_outlier_check_interval_ms = 1000; 374 375 /* 376 * When performing slow outlier checks for RAID-Z and dRAID vdevs, this value is 377 * used to determine how far out an outlier must be before it counts as an event 378 * worth considering. 379 * 380 * Smaller values will result in more aggressive sitting out of disks that may 381 * have problems, but may significantly increase the rate of spurious sit-outs. 382 */ 383 static uint32_t vdev_raidz_outlier_insensitivity = 50; 384 385 /* 386 * Maximum amount of copy I/O outstanding at once. 387 */ 388 #ifdef _ILP32 389 static unsigned long raidz_expand_max_copy_bytes = SPA_MAXBLOCKSIZE; 390 #else 391 static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE; 392 #endif 393 394 /* 395 * Apply raidz map abds aggregation if the number of rows in the map is equal 396 * to or greater than the value below. 397 */ 398 static unsigned long raidz_io_aggregate_rows = 4; 399 400 /* 401 * Automatically start a pool scrub when a RAIDZ expansion completes in 402 * order to verify the checksums of all blocks which have been copied 403 * during the expansion. Automatic scrubbing is enabled by default and 404 * is strongly recommended.
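 *
 * On Linux this tunable is typically exposed as a module parameter of the
 * same name, so it can usually be inspected or changed at runtime, e.g.
 * (illustrative; the exact tunable interface varies by platform):
 *
 *	echo 0 > /sys/module/zfs/parameters/zfs_scrub_after_expand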
405 */ 406 static int zfs_scrub_after_expand = 1; 407 408 static void 409 vdev_raidz_row_free(raidz_row_t *rr) 410 { 411 for (int c = 0; c < rr->rr_cols; c++) { 412 raidz_col_t *rc = &rr->rr_col[c]; 413 414 if (rc->rc_size != 0) 415 abd_free(rc->rc_abd); 416 if (rc->rc_orig_data != NULL) 417 abd_free(rc->rc_orig_data); 418 } 419 420 if (rr->rr_abd_empty != NULL) 421 abd_free(rr->rr_abd_empty); 422 423 kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols])); 424 } 425 426 void 427 vdev_raidz_map_free(raidz_map_t *rm) 428 { 429 for (int i = 0; i < rm->rm_nrows; i++) 430 vdev_raidz_row_free(rm->rm_row[i]); 431 432 if (rm->rm_nphys_cols) { 433 for (int i = 0; i < rm->rm_nphys_cols; i++) { 434 if (rm->rm_phys_col[i].rc_abd != NULL) 435 abd_free(rm->rm_phys_col[i].rc_abd); 436 } 437 438 kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) * 439 rm->rm_nphys_cols); 440 } 441 442 ASSERT0P(rm->rm_lr); 443 kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows])); 444 } 445 446 static void 447 vdev_raidz_map_free_vsd(zio_t *zio) 448 { 449 raidz_map_t *rm = zio->io_vsd; 450 451 vdev_raidz_map_free(rm); 452 } 453 454 static int 455 vdev_raidz_reflow_compare(const void *x1, const void *x2) 456 { 457 const reflow_node_t *l = x1; 458 const reflow_node_t *r = x2; 459 460 return (TREE_CMP(l->re_txg, r->re_txg)); 461 } 462 463 const zio_vsd_ops_t vdev_raidz_vsd_ops = { 464 .vsd_free = vdev_raidz_map_free_vsd, 465 }; 466 467 raidz_row_t * 468 vdev_raidz_row_alloc(int cols, zio_t *zio) 469 { 470 raidz_row_t *rr = 471 kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP); 472 473 rr->rr_cols = cols; 474 rr->rr_scols = cols; 475 476 for (int c = 0; c < cols; c++) { 477 raidz_col_t *rc = &rr->rr_col[c]; 478 rc->rc_shadow_devidx = INT_MAX; 479 rc->rc_shadow_offset = UINT64_MAX; 480 /* 481 * We can not allow self healing to take place for Direct I/O 482 * reads. There is nothing that stops the buffer contents from 483 * being manipulated while the I/O is in flight. It is possible 484 * that the checksum could be verified on the buffer and then 485 * the contents of that buffer are manipulated afterwards. This 486 * could lead to bad data being written out during self 487 * healing. 488 */ 489 if (!(zio->io_flags & ZIO_FLAG_DIO_READ)) 490 rc->rc_allow_repair = 1; 491 } 492 return (rr); 493 } 494 495 static void 496 vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift) 497 { 498 int c; 499 int nwrapped = 0; 500 uint64_t off = 0; 501 raidz_row_t *rr = rm->rm_row[0]; 502 503 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); 504 ASSERT3U(rm->rm_nrows, ==, 1); 505 506 /* 507 * Pad any parity columns with additional space to account for skip 508 * sectors. 509 */ 510 if (rm->rm_skipstart < rr->rr_firstdatacol) { 511 ASSERT0(rm->rm_skipstart); 512 nwrapped = rm->rm_nskip; 513 } else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) { 514 nwrapped = 515 (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols; 516 } 517 518 /* 519 * Optional single skip sectors (rc_size == 0) will be handled in 520 * vdev_raidz_io_start_write(). 521 */ 522 int skipped = rr->rr_scols - rr->rr_cols; 523 524 /* Allocate buffers for the parity columns */ 525 for (c = 0; c < rr->rr_firstdatacol; c++) { 526 raidz_col_t *rc = &rr->rr_col[c]; 527 528 /* 529 * Parity columns will pad out a linear ABD to account for 530 * the skip sector. A linear ABD is used here because 531 * parity calculations use the ABD buffer directly to calculate 532 * parity. This avoids doing a memcpy back to the ABD after the 533 * parity has been calculated. 
By issuing the parity column 534 * with the skip sector we can reduce contention on the child 535 * VDEV queue locks (vq_lock). 536 */ 537 if (c < nwrapped) { 538 rc->rc_abd = abd_alloc_linear( 539 rc->rc_size + (1ULL << ashift), B_FALSE); 540 abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift); 541 skipped++; 542 } else { 543 rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); 544 } 545 } 546 547 for (off = 0; c < rr->rr_cols; c++) { 548 raidz_col_t *rc = &rr->rr_col[c]; 549 abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct, 550 zio->io_abd, off, rc->rc_size); 551 552 /* 553 * Generate I/O for skip sectors to improve aggregation 554 * continuity. We will use gang ABD's to reduce contention 555 * on the child VDEV queue locks (vq_lock) by issuing 556 * a single I/O that contains the data and skip sector. 557 * 558 * It is important to make sure that rc_size is not updated 559 * even though we are adding a skip sector to the ABD. When 560 * calculating the parity in vdev_raidz_generate_parity_row() 561 * the rc_size is used to iterate through the ABD's. We can 562 * not have zero'd out skip sectors used for calculating 563 * parity for raidz, because those same sectors are not used 564 * during reconstruction. 565 */ 566 if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) { 567 rc->rc_abd = abd_alloc_gang(); 568 abd_gang_add(rc->rc_abd, abd, B_TRUE); 569 abd_gang_add(rc->rc_abd, 570 abd_get_zeros(1ULL << ashift), B_TRUE); 571 skipped++; 572 } else { 573 rc->rc_abd = abd; 574 } 575 off += rc->rc_size; 576 } 577 578 ASSERT3U(off, ==, zio->io_size); 579 ASSERT3S(skipped, ==, rm->rm_nskip); 580 } 581 582 static void 583 vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm) 584 { 585 int c; 586 raidz_row_t *rr = rm->rm_row[0]; 587 588 ASSERT3U(rm->rm_nrows, ==, 1); 589 590 /* Allocate buffers for the parity columns */ 591 for (c = 0; c < rr->rr_firstdatacol; c++) 592 rr->rr_col[c].rc_abd = 593 abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE); 594 595 for (uint64_t off = 0; c < rr->rr_cols; c++) { 596 raidz_col_t *rc = &rr->rr_col[c]; 597 rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct, 598 zio->io_abd, off, rc->rc_size); 599 off += rc->rc_size; 600 } 601 } 602 603 /* 604 * Divides the IO evenly across all child vdevs; usually, dcols is 605 * the number of children in the target vdev. 606 * 607 * Avoid inlining the function to keep vdev_raidz_io_start(), which 608 * is this function's only caller, as small as possible on the stack. 609 */ 610 noinline raidz_map_t * 611 vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, 612 uint64_t nparity) 613 { 614 raidz_row_t *rr; 615 /* The starting RAIDZ (parent) vdev sector of the block. */ 616 uint64_t b = zio->io_offset >> ashift; 617 /* The zio's size in units of the vdev's minimum sector size. */ 618 uint64_t s = zio->io_size >> ashift; 619 /* The first column for this stripe. */ 620 uint64_t f = b % dcols; 621 /* The starting byte offset on each child vdev. */ 622 uint64_t o = (b / dcols) << ashift; 623 uint64_t acols, scols; 624 625 raidz_map_t *rm = 626 kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP); 627 rm->rm_nrows = 1; 628 629 /* 630 * "Quotient": The number of data sectors for this stripe on all but 631 * the "big column" child vdevs that also contain "remainder" data. 632 */ 633 uint64_t q = s / (dcols - nparity); 634 635 /* 636 * "Remainder": The number of partial stripe data sectors in this I/O. 637 * This will add a sector to some, but not all, child vdevs.
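 *
 * As a concrete (hypothetical) illustration of the geometry computed here
 * and below: an 8 KiB write (s = 2 sectors at ashift = 12) to a 4-wide
 * raidz1 (dcols = 4, nparity = 1) gives q = 0, r = 2, bc = 3 and tot = 3,
 * so acols = 3 (one parity plus two data sectors are written), scols = 4,
 * and rm_nskip = 1: a single skip sector pads the allocation out to a
 * multiple of (nparity + 1) sectors.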
638 */ 639 uint64_t r = s - q * (dcols - nparity); 640 641 /* The number of "big columns" - those which contain remainder data. */ 642 uint64_t bc = (r == 0 ? 0 : r + nparity); 643 644 /* 645 * The total number of data and parity sectors associated with 646 * this I/O. 647 */ 648 uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1)); 649 650 /* 651 * acols: The columns that will be accessed. 652 * scols: The columns that will be accessed or skipped. 653 */ 654 if (q == 0) { 655 /* Our I/O request doesn't span all child vdevs. */ 656 acols = bc; 657 scols = MIN(dcols, roundup(bc, nparity + 1)); 658 } else { 659 acols = dcols; 660 scols = dcols; 661 } 662 663 ASSERT3U(acols, <=, scols); 664 rr = vdev_raidz_row_alloc(scols, zio); 665 rm->rm_row[0] = rr; 666 rr->rr_cols = acols; 667 rr->rr_bigcols = bc; 668 rr->rr_firstdatacol = nparity; 669 #ifdef ZFS_DEBUG 670 rr->rr_offset = zio->io_offset; 671 rr->rr_size = zio->io_size; 672 #endif 673 674 uint64_t asize = 0; 675 676 for (uint64_t c = 0; c < scols; c++) { 677 raidz_col_t *rc = &rr->rr_col[c]; 678 uint64_t col = f + c; 679 uint64_t coff = o; 680 if (col >= dcols) { 681 col -= dcols; 682 coff += 1ULL << ashift; 683 } 684 rc->rc_devidx = col; 685 rc->rc_offset = coff; 686 687 if (c >= acols) 688 rc->rc_size = 0; 689 else if (c < bc) 690 rc->rc_size = (q + 1) << ashift; 691 else 692 rc->rc_size = q << ashift; 693 694 asize += rc->rc_size; 695 } 696 697 ASSERT3U(asize, ==, tot << ashift); 698 rm->rm_nskip = roundup(tot, nparity + 1) - tot; 699 rm->rm_skipstart = bc; 700 701 /* 702 * If all data stored spans all columns, there's a danger that parity 703 * will always be on the same device and, since parity isn't read 704 * during normal operation, that device's I/O bandwidth won't be 705 * used effectively. We therefore switch the parity every 1MB. 706 * 707 * ... at least that was, ostensibly, the theory. As a practical 708 * matter unless we juggle the parity between all devices evenly, we 709 * won't see any benefit. Further, occasional writes that aren't a 710 * multiple of the LCM of the number of children and the minimum 711 * stripe width are sufficient to avoid pessimal behavior. 712 * Unfortunately, this decision created an implicit on-disk format 713 * requirement that we need to support for all eternity, but only 714 * for single-parity RAID-Z. 715 * 716 * If we intend to skip a sector in the zeroth column for padding 717 * we must make sure to note this swap. We will never intend to 718 * skip the first column since at least one data and one parity 719 * column must appear in each row. 720 */ 721 ASSERT(rr->rr_cols >= 2); 722 ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); 723 724 if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { 725 uint64_t devidx = rr->rr_col[0].rc_devidx; 726 o = rr->rr_col[0].rc_offset; 727 rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; 728 rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; 729 rr->rr_col[1].rc_devidx = devidx; 730 rr->rr_col[1].rc_offset = o; 731 if (rm->rm_skipstart == 0) 732 rm->rm_skipstart = 1; 733 } 734 735 if (zio->io_type == ZIO_TYPE_WRITE) { 736 vdev_raidz_map_alloc_write(zio, rm, ashift); 737 } else { 738 vdev_raidz_map_alloc_read(zio, rm); 739 } 740 /* init RAIDZ parity ops */ 741 rm->rm_ops = vdev_raidz_math_get_ops(); 742 743 return (rm); 744 } 745 746 /* 747 * Everything before reflow_offset_synced should have been moved to the new 748 * location (read and write completed). However, this may not yet be reflected 749 * in the on-disk format (e.g. 
raidz_reflow_sync() has been called but the 750 * uberblock has not yet been written). If reflow is not in progress, 751 * reflow_offset_synced should be UINT64_MAX. For each row, if the row is 752 * entirely before reflow_offset_synced, it will come from the new location. 753 * Otherwise this row will come from the old location. Therefore, rows that 754 * straddle the reflow_offset_synced will come from the old location. 755 * 756 * For writes, reflow_offset_next is the next offset to copy. If a sector has 757 * been copied, but not yet reflected in the on-disk progress 758 * (reflow_offset_synced), it will also be written to the new (already copied) 759 * offset. 760 */ 761 noinline raidz_map_t * 762 vdev_raidz_map_alloc_expanded(zio_t *zio, 763 uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols, 764 uint64_t nparity, uint64_t reflow_offset_synced, 765 uint64_t reflow_offset_next, boolean_t use_scratch) 766 { 767 abd_t *abd = zio->io_abd; 768 uint64_t offset = zio->io_offset; 769 uint64_t size = zio->io_size; 770 771 /* The zio's size in units of the vdev's minimum sector size. */ 772 uint64_t s = size >> ashift; 773 774 /* 775 * "Quotient": The number of data sectors for this stripe on all but 776 * the "big column" child vdevs that also contain "remainder" data. 777 * AKA "full rows" 778 */ 779 uint64_t q = s / (logical_cols - nparity); 780 781 /* 782 * "Remainder": The number of partial stripe data sectors in this I/O. 783 * This will add a sector to some, but not all, child vdevs. 784 */ 785 uint64_t r = s - q * (logical_cols - nparity); 786 787 /* The number of "big columns" - those which contain remainder data. */ 788 uint64_t bc = (r == 0 ? 0 : r + nparity); 789 790 /* 791 * The total number of data and parity sectors associated with 792 * this I/O. 793 */ 794 uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1)); 795 796 /* How many rows contain data (not skip) */ 797 uint64_t rows = howmany(tot, logical_cols); 798 int cols = MIN(tot, logical_cols); 799 800 raidz_map_t *rm = 801 kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]), 802 KM_SLEEP); 803 rm->rm_nrows = rows; 804 rm->rm_nskip = roundup(tot, nparity + 1) - tot; 805 rm->rm_skipstart = bc; 806 uint64_t asize = 0; 807 808 for (uint64_t row = 0; row < rows; row++) { 809 boolean_t row_use_scratch = B_FALSE; 810 raidz_row_t *rr = vdev_raidz_row_alloc(cols, zio); 811 rm->rm_row[row] = rr; 812 813 /* The starting RAIDZ (parent) vdev sector of the row. */ 814 uint64_t b = (offset >> ashift) + row * logical_cols; 815 816 /* 817 * If we are in the middle of a reflow, and the copying has 818 * not yet completed for any part of this row, then use the 819 * old location of this row. Note that reflow_offset_synced 820 * reflects the i/o that's been completed, because it's 821 * updated by a synctask, after zio_wait(spa_txg_zio[]). 822 * This is sufficient for our check, even if that progress 823 * has not yet been recorded to disk (reflected in 824 * spa_ubsync). Also note that we consider the last row to 825 * be "full width" (`cols`-wide rather than `bc`-wide) for 826 * this calculation. This causes a tiny bit of unnecessary 827 * double-writes but is safe and simpler to calculate. 828 */ 829 int row_phys_cols = physical_cols; 830 if (b + cols > reflow_offset_synced >> ashift) 831 row_phys_cols--; 832 else if (use_scratch) 833 row_use_scratch = B_TRUE; 834 835 /* starting child of this row */ 836 uint64_t child_id = b % row_phys_cols; 837 /* The starting byte offset on each child vdev. 
*/ 838 uint64_t child_offset = (b / row_phys_cols) << ashift; 839 840 /* 841 * Note, rr_cols is the entire width of the block, even 842 * if this row is shorter. This is needed because parity 843 * generation (for Q and R) needs to know the entire width, 844 * because it treats the short row as though it was 845 * full-width (and the "phantom" sectors were zero-filled). 846 * 847 * Another approach to this would be to set cols shorter 848 * (to just the number of columns that we might do i/o to) 849 * and have another mechanism to tell the parity generation 850 * about the "entire width". Reconstruction (at least 851 * vdev_raidz_reconstruct_general()) would also need to 852 * know about the "entire width". 853 */ 854 rr->rr_firstdatacol = nparity; 855 #ifdef ZFS_DEBUG 856 /* 857 * note: rr_size is PSIZE, not ASIZE 858 */ 859 rr->rr_offset = b << ashift; 860 rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift; 861 #endif 862 863 for (int c = 0; c < rr->rr_cols; c++, child_id++) { 864 if (child_id >= row_phys_cols) { 865 child_id -= row_phys_cols; 866 child_offset += 1ULL << ashift; 867 } 868 raidz_col_t *rc = &rr->rr_col[c]; 869 rc->rc_devidx = child_id; 870 rc->rc_offset = child_offset; 871 872 /* 873 * Get this from the scratch space if appropriate. 874 * This only happens if we crashed in the middle of 875 * raidz_reflow_scratch_sync() (while it's running, 876 * the rangelock prevents us from doing concurrent 877 * io), and even then only during zpool import or 878 * when the pool is imported readonly. 879 */ 880 if (row_use_scratch) 881 rc->rc_offset -= VDEV_BOOT_SIZE; 882 883 uint64_t dc = c - rr->rr_firstdatacol; 884 if (c < rr->rr_firstdatacol) { 885 rc->rc_size = 1ULL << ashift; 886 887 /* 888 * Parity sectors' rc_abd's are set below 889 * after determining if this is an aggregation. 890 */ 891 } else if (row == rows - 1 && bc != 0 && c >= bc) { 892 /* 893 * Past the end of the block (even including 894 * skip sectors). This sector is part of the 895 * map so that we have full rows for p/q parity 896 * generation. 897 */ 898 rc->rc_size = 0; 899 rc->rc_abd = NULL; 900 } else { 901 /* "data column" (col excluding parity) */ 902 uint64_t off; 903 904 if (c < bc || r == 0) { 905 off = dc * rows + row; 906 } else { 907 off = r * rows + 908 (dc - r) * (rows - 1) + row; 909 } 910 rc->rc_size = 1ULL << ashift; 911 rc->rc_abd = abd_get_offset_struct( 912 &rc->rc_abdstruct, abd, off << ashift, 913 rc->rc_size); 914 } 915 916 if (rc->rc_size == 0) 917 continue; 918 919 /* 920 * If any part of this row is in both old and new 921 * locations, the primary location is the old 922 * location. If this sector was already copied to the 923 * new location, we need to also write to the new, 924 * "shadow" location. 925 * 926 * Note, `row_phys_cols != physical_cols` indicates 927 * that the primary location is the old location. 928 * `b+c < reflow_offset_next` indicates that the copy 929 * to the new location has been initiated. We know 930 * that the copy has completed because we have the 931 * rangelock, which is held exclusively while the 932 * copy is in progress. 
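 *
 * Setting the scratch case aside, the check below boils down to (a
 * restatement of the above, not additional policy):
 *
 *  - row entirely reflowed and synced: only the new location is used
 *    (row_phys_cols == physical_cols), so no shadow write;
 *  - row not yet fully synced, but this sector already copied
 *    (b + c < reflow_offset_next): the old location is primary and the
 *    new location gets a shadow write;
 *  - otherwise: only the old location is used.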
933 */ 934 if (row_use_scratch || 935 (row_phys_cols != physical_cols && 936 b + c < reflow_offset_next >> ashift)) { 937 rc->rc_shadow_devidx = (b + c) % physical_cols; 938 rc->rc_shadow_offset = 939 ((b + c) / physical_cols) << ashift; 940 if (row_use_scratch) 941 rc->rc_shadow_offset -= VDEV_BOOT_SIZE; 942 } 943 944 asize += rc->rc_size; 945 } 946 947 /* 948 * See comment in vdev_raidz_map_alloc() 949 */ 950 if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 && 951 (offset & (1ULL << 20))) { 952 ASSERT(rr->rr_cols >= 2); 953 ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); 954 955 int devidx0 = rr->rr_col[0].rc_devidx; 956 uint64_t offset0 = rr->rr_col[0].rc_offset; 957 int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx; 958 uint64_t shadow_offset0 = 959 rr->rr_col[0].rc_shadow_offset; 960 961 rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; 962 rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; 963 rr->rr_col[0].rc_shadow_devidx = 964 rr->rr_col[1].rc_shadow_devidx; 965 rr->rr_col[0].rc_shadow_offset = 966 rr->rr_col[1].rc_shadow_offset; 967 968 rr->rr_col[1].rc_devidx = devidx0; 969 rr->rr_col[1].rc_offset = offset0; 970 rr->rr_col[1].rc_shadow_devidx = shadow_devidx0; 971 rr->rr_col[1].rc_shadow_offset = shadow_offset0; 972 } 973 } 974 ASSERT3U(asize, ==, tot << ashift); 975 976 /* 977 * Determine if the block is contiguous, in which case we can use 978 * an aggregation. 979 */ 980 if (rows >= raidz_io_aggregate_rows) { 981 rm->rm_nphys_cols = physical_cols; 982 rm->rm_phys_col = 983 kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols, 984 KM_SLEEP); 985 986 /* 987 * Determine the aggregate io's offset and size, and check 988 * that the io is contiguous. 989 */ 990 for (int i = 0; 991 i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) { 992 raidz_row_t *rr = rm->rm_row[i]; 993 for (int c = 0; c < rr->rr_cols; c++) { 994 raidz_col_t *rc = &rr->rr_col[c]; 995 raidz_col_t *prc = 996 &rm->rm_phys_col[rc->rc_devidx]; 997 998 if (rc->rc_size == 0) 999 continue; 1000 1001 if (prc->rc_size == 0) { 1002 ASSERT0(prc->rc_offset); 1003 prc->rc_offset = rc->rc_offset; 1004 } else if (prc->rc_offset + prc->rc_size != 1005 rc->rc_offset) { 1006 /* 1007 * This block is not contiguous and 1008 * therefore can't be aggregated. 1009 * This is expected to be rare, so 1010 * the cost of allocating and then 1011 * freeing rm_phys_col is not 1012 * significant. 1013 */ 1014 kmem_free(rm->rm_phys_col, 1015 sizeof (raidz_col_t) * 1016 rm->rm_nphys_cols); 1017 rm->rm_phys_col = NULL; 1018 rm->rm_nphys_cols = 0; 1019 break; 1020 } 1021 prc->rc_size += rc->rc_size; 1022 } 1023 } 1024 } 1025 if (rm->rm_phys_col != NULL) { 1026 /* 1027 * Allocate aggregate ABD's. 1028 */ 1029 for (int i = 0; i < rm->rm_nphys_cols; i++) { 1030 raidz_col_t *prc = &rm->rm_phys_col[i]; 1031 1032 prc->rc_devidx = i; 1033 1034 if (prc->rc_size == 0) 1035 continue; 1036 1037 prc->rc_abd = 1038 abd_alloc_linear(rm->rm_phys_col[i].rc_size, 1039 B_FALSE); 1040 } 1041 1042 /* 1043 * Point the parity abd's into the aggregate abd's. 1044 */ 1045 for (int i = 0; i < rm->rm_nrows; i++) { 1046 raidz_row_t *rr = rm->rm_row[i]; 1047 for (int c = 0; c < rr->rr_firstdatacol; c++) { 1048 raidz_col_t *rc = &rr->rr_col[c]; 1049 raidz_col_t *prc = 1050 &rm->rm_phys_col[rc->rc_devidx]; 1051 rc->rc_abd = 1052 abd_get_offset_struct(&rc->rc_abdstruct, 1053 prc->rc_abd, 1054 rc->rc_offset - prc->rc_offset, 1055 rc->rc_size); 1056 } 1057 } 1058 } else { 1059 /* 1060 * Allocate new abd's for the parity sectors. 
1061 */ 1062 for (int i = 0; i < rm->rm_nrows; i++) { 1063 raidz_row_t *rr = rm->rm_row[i]; 1064 for (int c = 0; c < rr->rr_firstdatacol; c++) { 1065 raidz_col_t *rc = &rr->rr_col[c]; 1066 rc->rc_abd = 1067 abd_alloc_linear(rc->rc_size, 1068 B_TRUE); 1069 } 1070 } 1071 } 1072 /* init RAIDZ parity ops */ 1073 rm->rm_ops = vdev_raidz_math_get_ops(); 1074 1075 return (rm); 1076 } 1077 1078 struct pqr_struct { 1079 uint64_t *p; 1080 uint64_t *q; 1081 uint64_t *r; 1082 }; 1083 1084 static int 1085 vdev_raidz_p_func(void *buf, size_t size, void *private) 1086 { 1087 struct pqr_struct *pqr = private; 1088 const uint64_t *src = buf; 1089 int cnt = size / sizeof (src[0]); 1090 1091 ASSERT(pqr->p && !pqr->q && !pqr->r); 1092 1093 for (int i = 0; i < cnt; i++, src++, pqr->p++) 1094 *pqr->p ^= *src; 1095 1096 return (0); 1097 } 1098 1099 static int 1100 vdev_raidz_pq_func(void *buf, size_t size, void *private) 1101 { 1102 struct pqr_struct *pqr = private; 1103 const uint64_t *src = buf; 1104 uint64_t mask; 1105 int cnt = size / sizeof (src[0]); 1106 1107 ASSERT(pqr->p && pqr->q && !pqr->r); 1108 1109 for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { 1110 *pqr->p ^= *src; 1111 VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 1112 *pqr->q ^= *src; 1113 } 1114 1115 return (0); 1116 } 1117 1118 static int 1119 vdev_raidz_pqr_func(void *buf, size_t size, void *private) 1120 { 1121 struct pqr_struct *pqr = private; 1122 const uint64_t *src = buf; 1123 uint64_t mask; 1124 int cnt = size / sizeof (src[0]); 1125 1126 ASSERT(pqr->p && pqr->q && pqr->r); 1127 1128 for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { 1129 *pqr->p ^= *src; 1130 VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 1131 *pqr->q ^= *src; 1132 VDEV_RAIDZ_64MUL_4(*pqr->r, mask); 1133 *pqr->r ^= *src; 1134 } 1135 1136 return (0); 1137 } 1138 1139 static void 1140 vdev_raidz_generate_parity_p(raidz_row_t *rr) 1141 { 1142 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 1143 1144 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1145 abd_t *src = rr->rr_col[c].rc_abd; 1146 1147 if (c == rr->rr_firstdatacol) { 1148 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 1149 } else { 1150 struct pqr_struct pqr = { p, NULL, NULL }; 1151 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 1152 vdev_raidz_p_func, &pqr); 1153 } 1154 } 1155 } 1156 1157 static void 1158 vdev_raidz_generate_parity_pq(raidz_row_t *rr) 1159 { 1160 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 1161 uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 1162 uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 1163 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 1164 rr->rr_col[VDEV_RAIDZ_Q].rc_size); 1165 1166 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1167 abd_t *src = rr->rr_col[c].rc_abd; 1168 1169 uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); 1170 1171 if (c == rr->rr_firstdatacol) { 1172 ASSERT(ccnt == pcnt || ccnt == 0); 1173 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 1174 (void) memcpy(q, p, rr->rr_col[c].rc_size); 1175 1176 for (uint64_t i = ccnt; i < pcnt; i++) { 1177 p[i] = 0; 1178 q[i] = 0; 1179 } 1180 } else { 1181 struct pqr_struct pqr = { p, q, NULL }; 1182 1183 ASSERT(ccnt <= pcnt); 1184 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 1185 vdev_raidz_pq_func, &pqr); 1186 1187 /* 1188 * Treat short columns as though they are full of 0s. 1189 * Note that there's therefore nothing needed for P. 
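 * For Q, each phantom zero sector still contributes the multiply-by-2
 * step of the Horner-style accumulation (Q = Q * 2 + 0), which is why the
 * loop below only applies VDEV_RAIDZ_64MUL_2 and skips the XOR.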
1190 */ 1191 uint64_t mask; 1192 for (uint64_t i = ccnt; i < pcnt; i++) { 1193 VDEV_RAIDZ_64MUL_2(q[i], mask); 1194 } 1195 } 1196 } 1197 } 1198 1199 static void 1200 vdev_raidz_generate_parity_pqr(raidz_row_t *rr) 1201 { 1202 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 1203 uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 1204 uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd); 1205 uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 1206 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 1207 rr->rr_col[VDEV_RAIDZ_Q].rc_size); 1208 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 1209 rr->rr_col[VDEV_RAIDZ_R].rc_size); 1210 1211 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1212 abd_t *src = rr->rr_col[c].rc_abd; 1213 1214 uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); 1215 1216 if (c == rr->rr_firstdatacol) { 1217 ASSERT(ccnt == pcnt || ccnt == 0); 1218 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 1219 (void) memcpy(q, p, rr->rr_col[c].rc_size); 1220 (void) memcpy(r, p, rr->rr_col[c].rc_size); 1221 1222 for (uint64_t i = ccnt; i < pcnt; i++) { 1223 p[i] = 0; 1224 q[i] = 0; 1225 r[i] = 0; 1226 } 1227 } else { 1228 struct pqr_struct pqr = { p, q, r }; 1229 1230 ASSERT(ccnt <= pcnt); 1231 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 1232 vdev_raidz_pqr_func, &pqr); 1233 1234 /* 1235 * Treat short columns as though they are full of 0s. 1236 * Note that there's therefore nothing needed for P. 1237 */ 1238 uint64_t mask; 1239 for (uint64_t i = ccnt; i < pcnt; i++) { 1240 VDEV_RAIDZ_64MUL_2(q[i], mask); 1241 VDEV_RAIDZ_64MUL_4(r[i], mask); 1242 } 1243 } 1244 } 1245 } 1246 1247 /* 1248 * Generate RAID parity in the first virtual columns according to the number of 1249 * parity columns available. 1250 */ 1251 void 1252 vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr) 1253 { 1254 if (rr->rr_cols == 0) { 1255 /* 1256 * We are handling this block one row at a time (because 1257 * this block has a different logical vs physical width, 1258 * due to RAIDZ expansion), and this is a pad-only row, 1259 * which has no parity. 
1260 */ 1261 return; 1262 } 1263 1264 /* Generate using the new math implementation */ 1265 if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL) 1266 return; 1267 1268 switch (rr->rr_firstdatacol) { 1269 case 1: 1270 vdev_raidz_generate_parity_p(rr); 1271 break; 1272 case 2: 1273 vdev_raidz_generate_parity_pq(rr); 1274 break; 1275 case 3: 1276 vdev_raidz_generate_parity_pqr(rr); 1277 break; 1278 default: 1279 cmn_err(CE_PANIC, "invalid RAID-Z configuration"); 1280 } 1281 } 1282 1283 void 1284 vdev_raidz_generate_parity(raidz_map_t *rm) 1285 { 1286 for (int i = 0; i < rm->rm_nrows; i++) { 1287 raidz_row_t *rr = rm->rm_row[i]; 1288 vdev_raidz_generate_parity_row(rm, rr); 1289 } 1290 } 1291 1292 static int 1293 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) 1294 { 1295 (void) private; 1296 uint64_t *dst = dbuf; 1297 uint64_t *src = sbuf; 1298 int cnt = size / sizeof (src[0]); 1299 1300 for (int i = 0; i < cnt; i++) { 1301 dst[i] ^= src[i]; 1302 } 1303 1304 return (0); 1305 } 1306 1307 static int 1308 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size, 1309 void *private) 1310 { 1311 (void) private; 1312 uint64_t *dst = dbuf; 1313 uint64_t *src = sbuf; 1314 uint64_t mask; 1315 int cnt = size / sizeof (dst[0]); 1316 1317 for (int i = 0; i < cnt; i++, dst++, src++) { 1318 VDEV_RAIDZ_64MUL_2(*dst, mask); 1319 *dst ^= *src; 1320 } 1321 1322 return (0); 1323 } 1324 1325 static int 1326 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private) 1327 { 1328 (void) private; 1329 uint64_t *dst = buf; 1330 uint64_t mask; 1331 int cnt = size / sizeof (dst[0]); 1332 1333 for (int i = 0; i < cnt; i++, dst++) { 1334 /* same operation as vdev_raidz_reconst_q_pre_func() on dst */ 1335 VDEV_RAIDZ_64MUL_2(*dst, mask); 1336 } 1337 1338 return (0); 1339 } 1340 1341 struct reconst_q_struct { 1342 uint64_t *q; 1343 int exp; 1344 }; 1345 1346 static int 1347 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private) 1348 { 1349 struct reconst_q_struct *rq = private; 1350 uint64_t *dst = buf; 1351 int cnt = size / sizeof (dst[0]); 1352 1353 for (int i = 0; i < cnt; i++, dst++, rq->q++) { 1354 int j; 1355 uint8_t *b; 1356 1357 *dst ^= *rq->q; 1358 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 1359 *b = vdev_raidz_exp2(*b, rq->exp); 1360 } 1361 } 1362 1363 return (0); 1364 } 1365 1366 struct reconst_pq_struct { 1367 uint8_t *p; 1368 uint8_t *q; 1369 uint8_t *pxy; 1370 uint8_t *qxy; 1371 int aexp; 1372 int bexp; 1373 }; 1374 1375 static int 1376 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private) 1377 { 1378 struct reconst_pq_struct *rpq = private; 1379 uint8_t *xd = xbuf; 1380 uint8_t *yd = ybuf; 1381 1382 for (int i = 0; i < size; 1383 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) { 1384 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 1385 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 1386 *yd = *rpq->p ^ *rpq->pxy ^ *xd; 1387 } 1388 1389 return (0); 1390 } 1391 1392 static int 1393 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) 1394 { 1395 struct reconst_pq_struct *rpq = private; 1396 uint8_t *xd = xbuf; 1397 1398 for (int i = 0; i < size; 1399 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) { 1400 /* same operation as vdev_raidz_reconst_pq_func() on xd */ 1401 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 1402 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 1403 } 1404 1405 return (0); 1406 } 1407 1408 static void 1409 
vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts) 1410 { 1411 int x = tgts[0]; 1412 abd_t *dst, *src; 1413 1414 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1415 zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x); 1416 1417 ASSERT3U(ntgts, ==, 1); 1418 ASSERT3U(x, >=, rr->rr_firstdatacol); 1419 ASSERT3U(x, <, rr->rr_cols); 1420 1421 ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size); 1422 1423 src = rr->rr_col[VDEV_RAIDZ_P].rc_abd; 1424 dst = rr->rr_col[x].rc_abd; 1425 1426 abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size); 1427 1428 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1429 uint64_t size = MIN(rr->rr_col[x].rc_size, 1430 rr->rr_col[c].rc_size); 1431 1432 src = rr->rr_col[c].rc_abd; 1433 1434 if (c == x) 1435 continue; 1436 1437 (void) abd_iterate_func2(dst, src, 0, 0, size, 1438 vdev_raidz_reconst_p_func, NULL); 1439 } 1440 } 1441 1442 static void 1443 vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts) 1444 { 1445 int x = tgts[0]; 1446 int c, exp; 1447 abd_t *dst, *src; 1448 1449 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1450 zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x); 1451 1452 ASSERT(ntgts == 1); 1453 1454 ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size); 1455 1456 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1457 uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size, 1458 rr->rr_col[c].rc_size); 1459 1460 src = rr->rr_col[c].rc_abd; 1461 dst = rr->rr_col[x].rc_abd; 1462 1463 if (c == rr->rr_firstdatacol) { 1464 abd_copy(dst, src, size); 1465 if (rr->rr_col[x].rc_size > size) { 1466 abd_zero_off(dst, size, 1467 rr->rr_col[x].rc_size - size); 1468 } 1469 } else { 1470 ASSERT3U(size, <=, rr->rr_col[x].rc_size); 1471 (void) abd_iterate_func2(dst, src, 0, 0, size, 1472 vdev_raidz_reconst_q_pre_func, NULL); 1473 (void) abd_iterate_func(dst, 1474 size, rr->rr_col[x].rc_size - size, 1475 vdev_raidz_reconst_q_pre_tail_func, NULL); 1476 } 1477 } 1478 1479 src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; 1480 dst = rr->rr_col[x].rc_abd; 1481 exp = 255 - (rr->rr_cols - 1 - x); 1482 1483 struct reconst_q_struct rq = { abd_to_buf(src), exp }; 1484 (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size, 1485 vdev_raidz_reconst_q_post_func, &rq); 1486 } 1487 1488 static void 1489 vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) 1490 { 1491 uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; 1492 abd_t *pdata, *qdata; 1493 uint64_t xsize, ysize; 1494 int x = tgts[0]; 1495 int y = tgts[1]; 1496 abd_t *xd, *yd; 1497 1498 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1499 zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y); 1500 1501 ASSERT(ntgts == 2); 1502 ASSERT(x < y); 1503 ASSERT(x >= rr->rr_firstdatacol); 1504 ASSERT(y < rr->rr_cols); 1505 1506 ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size); 1507 1508 /* 1509 * Move the parity data aside -- we're going to compute parity as 1510 * though columns x and y were full of zeros -- Pxy and Qxy. We want to 1511 * reuse the parity generation mechanism without trashing the actual 1512 * parity so we make those columns appear to be full of zeros by 1513 * setting their lengths to zero. 
1514 */ 1515 pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd; 1516 qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; 1517 xsize = rr->rr_col[x].rc_size; 1518 ysize = rr->rr_col[y].rc_size; 1519 1520 rr->rr_col[VDEV_RAIDZ_P].rc_abd = 1521 abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE); 1522 rr->rr_col[VDEV_RAIDZ_Q].rc_abd = 1523 abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); 1524 rr->rr_col[x].rc_size = 0; 1525 rr->rr_col[y].rc_size = 0; 1526 1527 vdev_raidz_generate_parity_pq(rr); 1528 1529 rr->rr_col[x].rc_size = xsize; 1530 rr->rr_col[y].rc_size = ysize; 1531 1532 p = abd_to_buf(pdata); 1533 q = abd_to_buf(qdata); 1534 pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 1535 qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 1536 xd = rr->rr_col[x].rc_abd; 1537 yd = rr->rr_col[y].rc_abd; 1538 1539 /* 1540 * We now have: 1541 * Pxy = P + D_x + D_y 1542 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 1543 * 1544 * We can then solve for D_x: 1545 * D_x = A * (P + Pxy) + B * (Q + Qxy) 1546 * where 1547 * A = 2^(x - y) * (2^(x - y) + 1)^-1 1548 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 1549 * 1550 * With D_x in hand, we can easily solve for D_y: 1551 * D_y = P + Pxy + D_x 1552 */ 1553 1554 a = vdev_raidz_pow2[255 + x - y]; 1555 b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)]; 1556 tmp = 255 - vdev_raidz_log2[a ^ 1]; 1557 1558 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 1559 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 1560 1561 ASSERT3U(xsize, >=, ysize); 1562 struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp }; 1563 1564 (void) abd_iterate_func2(xd, yd, 0, 0, ysize, 1565 vdev_raidz_reconst_pq_func, &rpq); 1566 (void) abd_iterate_func(xd, ysize, xsize - ysize, 1567 vdev_raidz_reconst_pq_tail_func, &rpq); 1568 1569 abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 1570 abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 1571 1572 /* 1573 * Restore the saved parity data. 1574 */ 1575 rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata; 1576 rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata; 1577 } 1578 1579 /* 1580 * In the general case of reconstruction, we must solve the system of linear 1581 * equations defined by the coefficients used to generate parity as well as 1582 * the contents of the data and parity disks. This can be expressed with 1583 * vectors for the original data (D) and the actual data (d) and parity (p) 1584 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 1585 * 1586 * __ __ __ __ 1587 * | | __ __ | p_0 | 1588 * | V | | D_0 | | p_m-1 | 1589 * | | x | : | = | d_0 | 1590 * | I | | D_n-1 | | : | 1591 * | | ~~ ~~ | d_n-1 | 1592 * ~~ ~~ ~~ ~~ 1593 * 1594 * I is simply a square identity matrix of size n, and V is a vandermonde 1595 * matrix defined by the coefficients we chose for the various parity columns 1596 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 1597 * computation as well as linear separability. 1598 * 1599 * __ __ __ __ 1600 * | 1 .. 1 1 1 | | p_0 | 1601 * | 2^n-1 .. 4 2 1 | __ __ | : | 1602 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 1603 * | 1 .. 0 0 0 | | D_1 | | d_0 | 1604 * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 1605 * | : : : : | | : | | d_2 | 1606 * | 0 .. 1 0 0 | | D_n-1 | | : | 1607 * | 0 .. 0 1 0 | ~~ ~~ | : | 1608 * | 0 .. 0 0 1 | | d_n-1 | 1609 * ~~ ~~ ~~ ~~ 1610 * 1611 * Note that I, V, d, and p are known. To compute D, we must invert the 1612 * matrix and use the known data and parity values to reconstruct the unknown 1613 * data values. 
We begin by removing the rows in V|I and d|p that correspond 1614 * to failed or missing columns; we then make V|I square (n x n) and d|p 1615 * sized n by removing rows corresponding to unused parity from the bottom up 1616 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 1617 * using Gauss-Jordan elimination. In the example below we use m=3 parity 1618 * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 1619 * __ __ 1620 * | 1 1 1 1 1 1 1 1 | 1621 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 1622 * | 19 205 116 29 64 16 4 1 | / / 1623 * | 1 0 0 0 0 0 0 0 | / / 1624 * | 0 1 0 0 0 0 0 0 | <--' / 1625 * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 1626 * | 0 0 0 1 0 0 0 0 | 1627 * | 0 0 0 0 1 0 0 0 | 1628 * | 0 0 0 0 0 1 0 0 | 1629 * | 0 0 0 0 0 0 1 0 | 1630 * | 0 0 0 0 0 0 0 1 | 1631 * ~~ ~~ 1632 * __ __ 1633 * | 1 1 1 1 1 1 1 1 | 1634 * | 128 64 32 16 8 4 2 1 | 1635 * | 19 205 116 29 64 16 4 1 | 1636 * | 1 0 0 0 0 0 0 0 | 1637 * | 0 1 0 0 0 0 0 0 | 1638 * (V|I)' = | 0 0 1 0 0 0 0 0 | 1639 * | 0 0 0 1 0 0 0 0 | 1640 * | 0 0 0 0 1 0 0 0 | 1641 * | 0 0 0 0 0 1 0 0 | 1642 * | 0 0 0 0 0 0 1 0 | 1643 * | 0 0 0 0 0 0 0 1 | 1644 * ~~ ~~ 1645 * 1646 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 1647 * have carefully chosen the seed values 1, 2, and 4 to ensure that this 1648 * matrix is not singular. 1649 * __ __ 1650 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1651 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1652 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1653 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1654 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1655 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1656 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1657 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1658 * ~~ ~~ 1659 * __ __ 1660 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1661 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1662 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1663 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1664 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1665 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1666 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1667 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1668 * ~~ ~~ 1669 * __ __ 1670 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1671 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1672 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 1673 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1674 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1675 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1676 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1677 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1678 * ~~ ~~ 1679 * __ __ 1680 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1681 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1682 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 1683 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1684 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1685 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1686 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1687 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1688 * ~~ ~~ 1689 * __ __ 1690 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1691 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1692 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1693 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1694 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1695 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1696 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1697 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1698 * ~~ ~~ 1699 * __ __ 1700 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1701 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 1702 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1703 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1704 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1705 * | 0 0 0 0 0 
1 0 0 0 0 0 0 0 1 0 0 | 1706 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1707 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1708 * ~~ ~~ 1709 * __ __ 1710 * | 0 0 1 0 0 0 0 0 | 1711 * | 167 100 5 41 159 169 217 208 | 1712 * | 166 100 4 40 158 168 216 209 | 1713 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 1714 * | 0 0 0 0 1 0 0 0 | 1715 * | 0 0 0 0 0 1 0 0 | 1716 * | 0 0 0 0 0 0 1 0 | 1717 * | 0 0 0 0 0 0 0 1 | 1718 * ~~ ~~ 1719 * 1720 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 1721 * of the missing data. 1722 * 1723 * As is apparent from the example above, the only non-trivial rows in the 1724 * inverse matrix correspond to the data disks that we're trying to 1725 * reconstruct. Indeed, those are the only rows we need as the others would 1726 * only be useful for reconstructing data known or assumed to be valid. For 1727 * that reason, we only build the coefficients in the rows that correspond to 1728 * targeted columns. 1729 */ 1730 1731 static void 1732 vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map, 1733 uint8_t **rows) 1734 { 1735 int i, j; 1736 int pow; 1737 1738 ASSERT(n == rr->rr_cols - rr->rr_firstdatacol); 1739 1740 /* 1741 * Fill in the missing rows of interest. 1742 */ 1743 for (i = 0; i < nmap; i++) { 1744 ASSERT3S(0, <=, map[i]); 1745 ASSERT3S(map[i], <=, 2); 1746 1747 pow = map[i] * n; 1748 if (pow > 255) 1749 pow -= 255; 1750 ASSERT(pow <= 255); 1751 1752 for (j = 0; j < n; j++) { 1753 pow -= map[i]; 1754 if (pow < 0) 1755 pow += 255; 1756 rows[i][j] = vdev_raidz_pow2[pow]; 1757 } 1758 } 1759 } 1760 1761 static void 1762 vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing, 1763 uint8_t **rows, uint8_t **invrows, const uint8_t *used) 1764 { 1765 int i, j, ii, jj; 1766 uint8_t log; 1767 1768 /* 1769 * Assert that the first nmissing entries from the array of used 1770 * columns correspond to parity columns and that subsequent entries 1771 * correspond to data columns. 1772 */ 1773 for (i = 0; i < nmissing; i++) { 1774 ASSERT3S(used[i], <, rr->rr_firstdatacol); 1775 } 1776 for (; i < n; i++) { 1777 ASSERT3S(used[i], >=, rr->rr_firstdatacol); 1778 } 1779 1780 /* 1781 * First initialize the storage where we'll compute the inverse rows. 1782 */ 1783 for (i = 0; i < nmissing; i++) { 1784 for (j = 0; j < n; j++) { 1785 invrows[i][j] = (i == j) ? 1 : 0; 1786 } 1787 } 1788 1789 /* 1790 * Subtract all trivial rows from the rows of consequence. 1791 */ 1792 for (i = 0; i < nmissing; i++) { 1793 for (j = nmissing; j < n; j++) { 1794 ASSERT3U(used[j], >=, rr->rr_firstdatacol); 1795 jj = used[j] - rr->rr_firstdatacol; 1796 ASSERT3S(jj, <, n); 1797 invrows[i][j] = rows[i][jj]; 1798 rows[i][jj] = 0; 1799 } 1800 } 1801 1802 /* 1803 * For each of the rows of interest, we must normalize it and subtract 1804 * a multiple of it from the other rows. 1805 */ 1806 for (i = 0; i < nmissing; i++) { 1807 for (j = 0; j < missing[i]; j++) { 1808 ASSERT0(rows[i][j]); 1809 } 1810 ASSERT3U(rows[i][missing[i]], !=, 0); 1811 1812 /* 1813 * Compute the inverse of the first element and multiply each 1814 * element in the row by that value. 
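 *
 * In GF(2^8) every nonzero element is a power of the generator 2, so the
 * multiplicative inverse of the pivot is 2^(255 - log2(pivot)) and this
 * normalization step is just an exponent addition in the log domain. A
 * minimal standalone sketch of the per-element scaling (illustrative
 * only: gf_scale(), log2tab and pow2tab are stand-ins for the driver's
 * table-driven helpers):
 *
 *	static uint8_t
 *	gf_scale(uint8_t elem, uint8_t pivot,
 *	    const uint8_t *log2tab, const uint8_t *pow2tab)
 *	{
 *		if (elem == 0)
 *			return (0);
 *		int e = log2tab[elem] + (255 - log2tab[pivot]);
 *		if (e >= 255)
 *			e -= 255;
 *		return (pow2tab[e]);
 *	}
 *
 * The code below does the same thing with vdev_raidz_exp2(), computing
 * the exponent 255 - log2(pivot) once per row and applying it to both
 * the work row and its inverse-accumulator row.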
1815 */ 1816 log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 1817 1818 for (j = 0; j < n; j++) { 1819 rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 1820 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 1821 } 1822 1823 for (ii = 0; ii < nmissing; ii++) { 1824 if (i == ii) 1825 continue; 1826 1827 ASSERT3U(rows[ii][missing[i]], !=, 0); 1828 1829 log = vdev_raidz_log2[rows[ii][missing[i]]]; 1830 1831 for (j = 0; j < n; j++) { 1832 rows[ii][j] ^= 1833 vdev_raidz_exp2(rows[i][j], log); 1834 invrows[ii][j] ^= 1835 vdev_raidz_exp2(invrows[i][j], log); 1836 } 1837 } 1838 } 1839 1840 /* 1841 * Verify that the data that is left in the rows are properly part of 1842 * an identity matrix. 1843 */ 1844 for (i = 0; i < nmissing; i++) { 1845 for (j = 0; j < n; j++) { 1846 if (j == missing[i]) { 1847 ASSERT3U(rows[i][j], ==, 1); 1848 } else { 1849 ASSERT0(rows[i][j]); 1850 } 1851 } 1852 } 1853 } 1854 1855 static void 1856 vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing, 1857 int *missing, uint8_t **invrows, const uint8_t *used) 1858 { 1859 int i, j, x, cc, c; 1860 uint8_t *src; 1861 uint64_t ccount; 1862 uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL }; 1863 uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 }; 1864 uint8_t log = 0; 1865 uint8_t val; 1866 int ll; 1867 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 1868 uint8_t *p, *pp; 1869 size_t psize; 1870 1871 psize = sizeof (invlog[0][0]) * n * nmissing; 1872 p = kmem_alloc(psize, KM_SLEEP); 1873 1874 for (pp = p, i = 0; i < nmissing; i++) { 1875 invlog[i] = pp; 1876 pp += n; 1877 } 1878 1879 for (i = 0; i < nmissing; i++) { 1880 for (j = 0; j < n; j++) { 1881 ASSERT3U(invrows[i][j], !=, 0); 1882 invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 1883 } 1884 } 1885 1886 for (i = 0; i < n; i++) { 1887 c = used[i]; 1888 ASSERT3U(c, <, rr->rr_cols); 1889 1890 ccount = rr->rr_col[c].rc_size; 1891 ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0); 1892 if (ccount == 0) 1893 continue; 1894 src = abd_to_buf(rr->rr_col[c].rc_abd); 1895 for (j = 0; j < nmissing; j++) { 1896 cc = missing[j] + rr->rr_firstdatacol; 1897 ASSERT3U(cc, >=, rr->rr_firstdatacol); 1898 ASSERT3U(cc, <, rr->rr_cols); 1899 ASSERT3U(cc, !=, c); 1900 1901 dcount[j] = rr->rr_col[cc].rc_size; 1902 if (dcount[j] != 0) 1903 dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd); 1904 } 1905 1906 for (x = 0; x < ccount; x++, src++) { 1907 if (*src != 0) 1908 log = vdev_raidz_log2[*src]; 1909 1910 for (cc = 0; cc < nmissing; cc++) { 1911 if (x >= dcount[cc]) 1912 continue; 1913 1914 if (*src == 0) { 1915 val = 0; 1916 } else { 1917 if ((ll = log + invlog[cc][i]) >= 255) 1918 ll -= 255; 1919 val = vdev_raidz_pow2[ll]; 1920 } 1921 1922 if (i == 0) 1923 dst[cc][x] = val; 1924 else 1925 dst[cc][x] ^= val; 1926 } 1927 } 1928 } 1929 1930 kmem_free(p, psize); 1931 } 1932 1933 static void 1934 vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) 1935 { 1936 int i, c, t, tt; 1937 unsigned int n; 1938 unsigned int nmissing_rows; 1939 int missing_rows[VDEV_RAIDZ_MAXPARITY]; 1940 int parity_map[VDEV_RAIDZ_MAXPARITY]; 1941 uint8_t *p, *pp; 1942 size_t psize; 1943 uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 1944 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 1945 uint8_t *used; 1946 1947 abd_t **bufs = NULL; 1948 1949 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1950 zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts); 1951 /* 1952 * Matrix reconstruction can't use scatter ABDs yet, so we allocate 1953 * temporary linear ABDs if any non-linear ABDs are found. 
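 *
 * For illustration, the staging below is equivalent to bracketing the
 * reconstruction with a pair of helpers like these (hypothetical names;
 * the driver open-codes the same steps and remembers the original ABDs
 * in a bufs[] array instead):
 *
 *	static abd_t *
 *	raidz_col_stage_linear(raidz_col_t *col)
 *	{
 *		abd_t *orig = col->rc_abd;
 *		if (!abd_is_linear(orig)) {
 *			col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
 *			abd_copy(col->rc_abd, orig, col->rc_size);
 *		}
 *		return (orig);
 *	}
 *
 *	static void
 *	raidz_col_unstage_linear(raidz_col_t *col, abd_t *orig)
 *	{
 *		if (col->rc_abd != orig) {
 *			abd_copy(orig, col->rc_abd, col->rc_size);
 *			abd_free(col->rc_abd);
 *			col->rc_abd = orig;
 *		}
 *	}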
1954 */ 1955 for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) { 1956 ASSERT(rr->rr_col[i].rc_abd != NULL); 1957 if (!abd_is_linear(rr->rr_col[i].rc_abd)) { 1958 bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *), 1959 KM_PUSHPAGE); 1960 1961 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1962 raidz_col_t *col = &rr->rr_col[c]; 1963 1964 bufs[c] = col->rc_abd; 1965 if (bufs[c] != NULL) { 1966 col->rc_abd = abd_alloc_linear( 1967 col->rc_size, B_TRUE); 1968 abd_copy(col->rc_abd, bufs[c], 1969 col->rc_size); 1970 } 1971 } 1972 1973 break; 1974 } 1975 } 1976 1977 n = rr->rr_cols - rr->rr_firstdatacol; 1978 1979 /* 1980 * Figure out which data columns are missing. 1981 */ 1982 nmissing_rows = 0; 1983 for (t = 0; t < ntgts; t++) { 1984 if (tgts[t] >= rr->rr_firstdatacol) { 1985 missing_rows[nmissing_rows++] = 1986 tgts[t] - rr->rr_firstdatacol; 1987 } 1988 } 1989 1990 /* 1991 * Figure out which parity columns to use to help generate the missing 1992 * data columns. 1993 */ 1994 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 1995 ASSERT(tt < ntgts); 1996 ASSERT(c < rr->rr_firstdatacol); 1997 1998 /* 1999 * Skip any targeted parity columns. 2000 */ 2001 if (c == tgts[tt]) { 2002 tt++; 2003 continue; 2004 } 2005 2006 parity_map[i] = c; 2007 i++; 2008 } 2009 2010 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 2011 nmissing_rows * n + sizeof (used[0]) * n; 2012 p = kmem_alloc(psize, KM_SLEEP); 2013 2014 for (pp = p, i = 0; i < nmissing_rows; i++) { 2015 rows[i] = pp; 2016 pp += n; 2017 invrows[i] = pp; 2018 pp += n; 2019 } 2020 used = pp; 2021 2022 for (i = 0; i < nmissing_rows; i++) { 2023 used[i] = parity_map[i]; 2024 } 2025 2026 for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 2027 if (tt < nmissing_rows && 2028 c == missing_rows[tt] + rr->rr_firstdatacol) { 2029 tt++; 2030 continue; 2031 } 2032 2033 ASSERT3S(i, <, n); 2034 used[i] = c; 2035 i++; 2036 } 2037 2038 /* 2039 * Initialize the interesting rows of the matrix. 2040 */ 2041 vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows); 2042 2043 /* 2044 * Invert the matrix. 2045 */ 2046 vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows, 2047 invrows, used); 2048 2049 /* 2050 * Reconstruct the missing data using the generated matrix. 
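 *
 * Conceptually the call below evaluates D = (V|I)'^-1 x (d|p)' from the
 * comment above, one byte at a time. For a single missing column it
 * reduces to the inner product sketched here (illustrative only; gf_mul()
 * stands in for the table-driven GF(2^8) multiply, and the real code
 * interleaves all missing columns in one pass over each source column,
 * working in the log domain):
 *
 *	for (uint64_t b = 0; b < size; b++) {
 *		uint8_t acc = 0;
 *		for (int k = 0; k < n; k++)
 *			acc ^= gf_mul(invrow[k], src[k][b]);
 *		dst[b] = acc;
 *	}
 *
 * XOR is the field addition, so the running XOR is the "sum" of the
 * inner product.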
2051 */ 2052 vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows, 2053 invrows, used); 2054 2055 kmem_free(p, psize); 2056 2057 /* 2058 * copy back from temporary linear abds and free them 2059 */ 2060 if (bufs) { 2061 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 2062 raidz_col_t *col = &rr->rr_col[c]; 2063 2064 if (bufs[c] != NULL) { 2065 abd_copy(bufs[c], col->rc_abd, col->rc_size); 2066 abd_free(col->rc_abd); 2067 } 2068 col->rc_abd = bufs[c]; 2069 } 2070 kmem_free(bufs, rr->rr_cols * sizeof (abd_t *)); 2071 } 2072 } 2073 2074 static void 2075 vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr, 2076 const int *t, int nt) 2077 { 2078 int tgts[VDEV_RAIDZ_MAXPARITY], *dt; 2079 int ntgts; 2080 int i, c, ret; 2081 int nbadparity, nbaddata; 2082 int parity_valid[VDEV_RAIDZ_MAXPARITY]; 2083 2084 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { 2085 zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)", 2086 rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata, 2087 (int)rr->rr_missingparity); 2088 } 2089 2090 nbadparity = rr->rr_firstdatacol; 2091 nbaddata = rr->rr_cols - nbadparity; 2092 ntgts = 0; 2093 for (i = 0, c = 0; c < rr->rr_cols; c++) { 2094 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { 2095 zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u " 2096 "offset=%llx error=%u)", 2097 rr, c, (int)rr->rr_col[c].rc_devidx, 2098 (long long)rr->rr_col[c].rc_offset, 2099 (int)rr->rr_col[c].rc_error); 2100 } 2101 if (c < rr->rr_firstdatacol) 2102 parity_valid[c] = B_FALSE; 2103 2104 if (i < nt && c == t[i]) { 2105 tgts[ntgts++] = c; 2106 i++; 2107 } else if (rr->rr_col[c].rc_error != 0) { 2108 tgts[ntgts++] = c; 2109 } else if (c >= rr->rr_firstdatacol) { 2110 nbaddata--; 2111 } else { 2112 parity_valid[c] = B_TRUE; 2113 nbadparity--; 2114 } 2115 } 2116 2117 ASSERT(ntgts >= nt); 2118 ASSERT(nbaddata >= 0); 2119 ASSERT(nbaddata + nbadparity == ntgts); 2120 2121 dt = &tgts[nbadparity]; 2122 2123 /* Reconstruct using the new math implementation */ 2124 ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata); 2125 if (ret != RAIDZ_ORIGINAL_IMPL) 2126 return; 2127 2128 /* 2129 * See if we can use any of our optimized reconstruction routines. 
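 *
 * For example, with exactly one missing data column and a valid P column,
 * recovery is plain field addition: the missing column is the XOR of P
 * and every surviving data column. A byte-wise sketch (names are
 * illustrative):
 *
 *	for (uint64_t b = 0; b < size; b++) {
 *		uint8_t x = p[b];
 *		for (int c = 0; c < nsurvivors; c++)
 *			x ^= surv[c][b];
 *		missing[b] = x;
 *	}
 *
 * vdev_raidz_reconstruct_p() below is that computation expressed over
 * ABDs; the Q and P+Q cases additionally have to undo the per-column
 * powers of 2 in GF(2^8).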
2130 */ 2131 switch (nbaddata) { 2132 case 1: 2133 if (parity_valid[VDEV_RAIDZ_P]) { 2134 vdev_raidz_reconstruct_p(rr, dt, 1); 2135 return; 2136 } 2137 2138 ASSERT(rr->rr_firstdatacol > 1); 2139 2140 if (parity_valid[VDEV_RAIDZ_Q]) { 2141 vdev_raidz_reconstruct_q(rr, dt, 1); 2142 return; 2143 } 2144 2145 ASSERT(rr->rr_firstdatacol > 2); 2146 break; 2147 2148 case 2: 2149 ASSERT(rr->rr_firstdatacol > 1); 2150 2151 if (parity_valid[VDEV_RAIDZ_P] && 2152 parity_valid[VDEV_RAIDZ_Q]) { 2153 vdev_raidz_reconstruct_pq(rr, dt, 2); 2154 return; 2155 } 2156 2157 ASSERT(rr->rr_firstdatacol > 2); 2158 2159 break; 2160 } 2161 2162 vdev_raidz_reconstruct_general(rr, tgts, ntgts); 2163 } 2164 2165 static int 2166 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, 2167 uint64_t *logical_ashift, uint64_t *physical_ashift) 2168 { 2169 vdev_raidz_t *vdrz = vd->vdev_tsd; 2170 uint64_t nparity = vdrz->vd_nparity; 2171 int c; 2172 int lasterror = 0; 2173 int numerrors = 0; 2174 2175 ASSERT(nparity > 0); 2176 2177 if (nparity > VDEV_RAIDZ_MAXPARITY || 2178 vd->vdev_children < nparity + 1) { 2179 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 2180 return (SET_ERROR(EINVAL)); 2181 } 2182 2183 vdev_open_children(vd); 2184 2185 for (c = 0; c < vd->vdev_children; c++) { 2186 vdev_t *cvd = vd->vdev_child[c]; 2187 2188 if (cvd->vdev_open_error != 0) { 2189 lasterror = cvd->vdev_open_error; 2190 numerrors++; 2191 continue; 2192 } 2193 2194 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 2195 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; 2196 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); 2197 } 2198 for (c = 0; c < vd->vdev_children; c++) { 2199 vdev_t *cvd = vd->vdev_child[c]; 2200 2201 if (cvd->vdev_open_error != 0) 2202 continue; 2203 *physical_ashift = vdev_best_ashift(*logical_ashift, 2204 *physical_ashift, cvd->vdev_physical_ashift); 2205 } 2206 2207 if (vd->vdev_rz_expanding) { 2208 *asize *= vd->vdev_children - 1; 2209 *max_asize *= vd->vdev_children - 1; 2210 2211 vd->vdev_min_asize = *asize; 2212 } else { 2213 *asize *= vd->vdev_children; 2214 *max_asize *= vd->vdev_children; 2215 } 2216 2217 if (numerrors > nparity) { 2218 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 2219 return (lasterror); 2220 } 2221 2222 return (0); 2223 } 2224 2225 static void 2226 vdev_raidz_close(vdev_t *vd) 2227 { 2228 for (int c = 0; c < vd->vdev_children; c++) { 2229 if (vd->vdev_child[c] != NULL) 2230 vdev_close(vd->vdev_child[c]); 2231 } 2232 } 2233 2234 /* 2235 * Return the logical width to use, given the txg in which the allocation 2236 * happened. 2237 */ 2238 static uint64_t 2239 vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg) 2240 { 2241 reflow_node_t lookup = { 2242 .re_txg = txg, 2243 }; 2244 avl_index_t where; 2245 2246 uint64_t width; 2247 mutex_enter(&vdrz->vd_expand_lock); 2248 reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where); 2249 if (re != NULL) { 2250 width = re->re_logical_width; 2251 } else { 2252 re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE); 2253 if (re != NULL) 2254 width = re->re_logical_width; 2255 else 2256 width = vdrz->vd_original_width; 2257 } 2258 mutex_exit(&vdrz->vd_expand_lock); 2259 return (width); 2260 } 2261 /* 2262 * This code converts an asize into the largest psize that can safely be written 2263 * to an allocation of that size for this vdev. 2264 * 2265 * Note that this function will not take into account the effect of gang 2266 * headers, which also modify the ASIZE of the DVAs. 
It is purely a reverse of 2267 * the psize_to_asize function. 2268 */ 2269 static uint64_t 2270 vdev_raidz_asize_to_psize(vdev_t *vd, uint64_t asize, uint64_t txg) 2271 { 2272 vdev_raidz_t *vdrz = vd->vdev_tsd; 2273 uint64_t psize; 2274 uint64_t ashift = vd->vdev_top->vdev_ashift; 2275 uint64_t nparity = vdrz->vd_nparity; 2276 2277 uint64_t cols = vdev_raidz_get_logical_width(vdrz, txg); 2278 2279 ASSERT0(asize % (1 << ashift)); 2280 2281 psize = (asize >> ashift); 2282 /* 2283 * If the roundup to nparity + 1 caused us to spill into a new row, we 2284 * need to ignore that row entirely (since it can't store data or 2285 * parity). 2286 */ 2287 uint64_t rows = psize / cols; 2288 psize = psize - (rows * cols) <= nparity ? rows * cols : psize; 2289 /* Subtract out parity sectors for each row storing data. */ 2290 psize -= nparity * DIV_ROUND_UP(psize, cols); 2291 psize <<= ashift; 2292 2293 return (psize); 2294 } 2295 2296 /* 2297 * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated 2298 * more space due to the lower data-to-parity ratio. In this case it's 2299 * important to pass in the correct txg. Note that vdev_gang_header_asize() 2300 * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE, 2301 * regardless of txg. This is assured because for a single data sector, we 2302 * allocate P+1 sectors regardless of width ("cols", which is at least P+1). 2303 */ 2304 static uint64_t 2305 vdev_raidz_psize_to_asize(vdev_t *vd, uint64_t psize, uint64_t txg) 2306 { 2307 vdev_raidz_t *vdrz = vd->vdev_tsd; 2308 uint64_t asize; 2309 uint64_t ashift = vd->vdev_top->vdev_ashift; 2310 uint64_t nparity = vdrz->vd_nparity; 2311 2312 uint64_t cols = vdev_raidz_get_logical_width(vdrz, txg); 2313 2314 asize = ((psize - 1) >> ashift) + 1; 2315 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 2316 asize = roundup(asize, nparity + 1) << ashift; 2317 2318 #ifdef ZFS_DEBUG 2319 uint64_t asize_new = ((psize - 1) >> ashift) + 1; 2320 uint64_t ncols_new = vdrz->vd_physical_width; 2321 asize_new += nparity * ((asize_new + ncols_new - nparity - 1) / 2322 (ncols_new - nparity)); 2323 asize_new = roundup(asize_new, nparity + 1) << ashift; 2324 VERIFY3U(asize_new, <=, asize); 2325 #endif 2326 2327 return (asize); 2328 } 2329 2330 /* 2331 * The allocatable space for a raidz vdev is N * sizeof(smallest child) 2332 * so each child must provide at least 1/Nth of its asize. 2333 */ 2334 static uint64_t 2335 vdev_raidz_min_asize(vdev_t *vd) 2336 { 2337 return ((vd->vdev_min_asize + vd->vdev_children - 1) / 2338 vd->vdev_children); 2339 } 2340 2341 /* 2342 * return B_TRUE if a read should be skipped due to being too slow. 2343 * 2344 * In vdev_child_slow_outlier() it looks for outliers based on disk 2345 * latency from the most recent child reads. Here we're checking if, 2346 * over time, a disk has has been an outlier too many times and is 2347 * now in a sit out period. 
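 *
 * Usage sketch (illustrative): the read path below consults this before
 * issuing a child read, roughly
 *
 *	if (vdev_sit_out_reads(cvd, zio->io_flags))
 *		rc->rc_latency_outlier = 1;
 *
 * and then skips a flagged column only when enough healthy parity remains
 * to reconstruct it. For interior children (e.g. a replacing or spare
 * vdev) the check recurses and reports whether any leaf underneath is
 * sitting out; scrub reads never honor sit outs, so every copy still gets
 * verified.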
2348 */ 2349 boolean_t 2350 vdev_sit_out_reads(vdev_t *vd, zio_flag_t io_flags) 2351 { 2352 if (vdev_read_sit_out_secs == 0) 2353 return (B_FALSE); 2354 2355 /* Avoid skipping a data column read when scrubbing */ 2356 if (io_flags & ZIO_FLAG_SCRUB) 2357 return (B_FALSE); 2358 2359 if (!vd->vdev_ops->vdev_op_leaf) { 2360 boolean_t sitting = B_FALSE; 2361 for (int c = 0; c < vd->vdev_children; c++) { 2362 sitting |= vdev_sit_out_reads(vd->vdev_child[c], 2363 io_flags); 2364 } 2365 return (sitting); 2366 } 2367 2368 if (vd->vdev_read_sit_out_expire >= gethrestime_sec()) 2369 return (B_TRUE); 2370 2371 vd->vdev_read_sit_out_expire = 0; 2372 2373 return (B_FALSE); 2374 } 2375 2376 void 2377 vdev_raidz_child_done(zio_t *zio) 2378 { 2379 raidz_col_t *rc = zio->io_private; 2380 2381 ASSERT3P(rc->rc_abd, !=, NULL); 2382 rc->rc_error = zio->io_error; 2383 rc->rc_tried = 1; 2384 rc->rc_skipped = 0; 2385 } 2386 2387 static void 2388 vdev_raidz_shadow_child_done(zio_t *zio) 2389 { 2390 raidz_col_t *rc = zio->io_private; 2391 2392 rc->rc_shadow_error = zio->io_error; 2393 } 2394 2395 static void 2396 vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col) 2397 { 2398 (void) rm; 2399 #ifdef ZFS_DEBUG 2400 zfs_range_seg64_t logical_rs, physical_rs, remain_rs; 2401 logical_rs.rs_start = rr->rr_offset; 2402 logical_rs.rs_end = logical_rs.rs_start + 2403 vdev_raidz_psize_to_asize(zio->io_vd, rr->rr_size, 2404 BP_GET_PHYSICAL_BIRTH(zio->io_bp)); 2405 2406 raidz_col_t *rc = &rr->rr_col[col]; 2407 vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; 2408 2409 vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs); 2410 ASSERT(vdev_xlate_is_empty(&remain_rs)); 2411 if (vdev_xlate_is_empty(&physical_rs)) { 2412 /* 2413 * If we are in the middle of expansion, the 2414 * physical->logical mapping is changing so vdev_xlate() 2415 * can't give us a reliable answer. 2416 */ 2417 return; 2418 } 2419 ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); 2420 ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); 2421 /* 2422 * It would be nice to assert that rs_end is equal 2423 * to rc_offset + rc_size but there might be an 2424 * optional I/O at the end that is not accounted in 2425 * rc_size. 
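 *
 * Concretely, with ashift=12 the optional skip-sector writes issued by
 * raidz_start_skip_writes() mean the translated extent for a child may
 * end one sector (1 << ashift, 4K here) past rc_offset + rc_size;
 * otherwise it must end at exactly rc_offset + rc_size. The asserts
 * below accept those two cases and nothing else.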
2426 */ 2427 if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { 2428 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + 2429 rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift)); 2430 } else { 2431 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); 2432 } 2433 #endif 2434 } 2435 2436 static void 2437 vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr) 2438 { 2439 vdev_t *vd = zio->io_vd; 2440 raidz_map_t *rm = zio->io_vsd; 2441 2442 vdev_raidz_generate_parity_row(rm, rr); 2443 2444 for (int c = 0; c < rr->rr_scols; c++) { 2445 raidz_col_t *rc = &rr->rr_col[c]; 2446 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 2447 2448 /* Verify physical to logical translation */ 2449 vdev_raidz_io_verify(zio, rm, rr, c); 2450 2451 if (rc->rc_size == 0) 2452 continue; 2453 2454 ASSERT3U(rc->rc_offset + rc->rc_size, <, 2455 cvd->vdev_psize - VDEV_LABEL_END_SIZE); 2456 2457 ASSERT3P(rc->rc_abd, !=, NULL); 2458 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2459 rc->rc_offset, rc->rc_abd, 2460 abd_get_size(rc->rc_abd), zio->io_type, 2461 zio->io_priority, 0, vdev_raidz_child_done, rc)); 2462 2463 if (rc->rc_shadow_devidx != INT_MAX) { 2464 vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx]; 2465 2466 ASSERT3U( 2467 rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <, 2468 cvd2->vdev_psize - VDEV_LABEL_END_SIZE); 2469 2470 zio_nowait(zio_vdev_child_io(zio, NULL, cvd2, 2471 rc->rc_shadow_offset, rc->rc_abd, 2472 abd_get_size(rc->rc_abd), 2473 zio->io_type, zio->io_priority, 0, 2474 vdev_raidz_shadow_child_done, rc)); 2475 } 2476 } 2477 } 2478 2479 /* 2480 * Generate optional I/Os for skip sectors to improve aggregation contiguity. 2481 * This only works for vdev_raidz_map_alloc() (not _expanded()). 2482 */ 2483 static void 2484 raidz_start_skip_writes(zio_t *zio) 2485 { 2486 vdev_t *vd = zio->io_vd; 2487 uint64_t ashift = vd->vdev_top->vdev_ashift; 2488 raidz_map_t *rm = zio->io_vsd; 2489 ASSERT3U(rm->rm_nrows, ==, 1); 2490 raidz_row_t *rr = rm->rm_row[0]; 2491 for (int c = 0; c < rr->rr_scols; c++) { 2492 raidz_col_t *rc = &rr->rr_col[c]; 2493 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 2494 if (rc->rc_size != 0) 2495 continue; 2496 ASSERT0P(rc->rc_abd); 2497 2498 ASSERT3U(rc->rc_offset, <, 2499 cvd->vdev_psize - VDEV_LABEL_END_SIZE); 2500 2501 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, 2502 NULL, 1ULL << ashift, zio->io_type, zio->io_priority, 2503 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); 2504 } 2505 } 2506 2507 static void 2508 vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) 2509 { 2510 vdev_t *vd = zio->io_vd; 2511 2512 /* 2513 * Iterate over the columns in reverse order so that we hit the parity 2514 * last -- any errors along the way will force us to read the parity. 
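 *
 * For example, in a healthy 4+2 row the issue loop at the bottom of this
 * function reads only the four data columns and skips P and Q. If one
 * data column had been flagged unreadable above, rr_missingdata would
 * already be nonzero by the time the parity columns are considered, so
 * the same loop would issue the P and Q reads as well.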
2515 */ 2516 for (int c = rr->rr_cols - 1; c >= 0; c--) { 2517 raidz_col_t *rc = &rr->rr_col[c]; 2518 if (rc->rc_size == 0) 2519 continue; 2520 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 2521 if (!vdev_readable(cvd)) { 2522 if (c >= rr->rr_firstdatacol) 2523 rr->rr_missingdata++; 2524 else 2525 rr->rr_missingparity++; 2526 rc->rc_error = SET_ERROR(ENXIO); 2527 rc->rc_tried = 1; /* don't even try */ 2528 rc->rc_skipped = 1; 2529 continue; 2530 } 2531 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 2532 if (c >= rr->rr_firstdatacol) 2533 rr->rr_missingdata++; 2534 else 2535 rr->rr_missingparity++; 2536 rc->rc_error = SET_ERROR(ESTALE); 2537 rc->rc_skipped = 1; 2538 continue; 2539 } 2540 2541 if (vdev_sit_out_reads(cvd, zio->io_flags)) { 2542 rr->rr_outlier_cnt++; 2543 ASSERT0(rc->rc_latency_outlier); 2544 rc->rc_latency_outlier = 1; 2545 } 2546 } 2547 2548 /* 2549 * When the row contains a latency outlier and sufficient parity 2550 * exists to reconstruct the column data, then skip reading the 2551 * known slow child vdev as a performance optimization. 2552 */ 2553 if (rr->rr_outlier_cnt > 0 && 2554 (rr->rr_firstdatacol - rr->rr_missingparity) >= 2555 (rr->rr_missingdata + 1)) { 2556 2557 for (int c = rr->rr_cols - 1; c >= 0; c--) { 2558 raidz_col_t *rc = &rr->rr_col[c]; 2559 2560 if (rc->rc_error == 0 && rc->rc_latency_outlier) { 2561 if (c >= rr->rr_firstdatacol) 2562 rr->rr_missingdata++; 2563 else 2564 rr->rr_missingparity++; 2565 rc->rc_error = SET_ERROR(EAGAIN); 2566 rc->rc_skipped = 1; 2567 break; 2568 } 2569 } 2570 } 2571 2572 for (int c = rr->rr_cols - 1; c >= 0; c--) { 2573 raidz_col_t *rc = &rr->rr_col[c]; 2574 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 2575 2576 if (rc->rc_error || rc->rc_size == 0) 2577 continue; 2578 2579 if (forceparity || 2580 c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || 2581 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { 2582 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2583 rc->rc_offset, rc->rc_abd, rc->rc_size, 2584 zio->io_type, zio->io_priority, 0, 2585 vdev_raidz_child_done, rc)); 2586 } 2587 } 2588 } 2589 2590 static void 2591 vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm) 2592 { 2593 vdev_t *vd = zio->io_vd; 2594 2595 for (int i = 0; i < rm->rm_nphys_cols; i++) { 2596 raidz_col_t *prc = &rm->rm_phys_col[i]; 2597 if (prc->rc_size == 0) 2598 continue; 2599 2600 ASSERT3U(prc->rc_devidx, ==, i); 2601 vdev_t *cvd = vd->vdev_child[i]; 2602 2603 if (!vdev_readable(cvd)) { 2604 prc->rc_error = SET_ERROR(ENXIO); 2605 prc->rc_tried = 1; /* don't even try */ 2606 prc->rc_skipped = 1; 2607 continue; 2608 } 2609 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 2610 prc->rc_error = SET_ERROR(ESTALE); 2611 prc->rc_skipped = 1; 2612 continue; 2613 } 2614 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2615 prc->rc_offset, prc->rc_abd, prc->rc_size, 2616 zio->io_type, zio->io_priority, 0, 2617 vdev_raidz_child_done, prc)); 2618 } 2619 } 2620 2621 static void 2622 vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm) 2623 { 2624 /* 2625 * If there are multiple rows, we will be hitting 2626 * all disks, so go ahead and read the parity so 2627 * that we are reading in decent size chunks. 
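 *
 * For example, a block whose map spans two rows of an expanded vdev
 * already touches every child disk, so also issuing the parity reads adds
 * no extra devices and keeps each child's reads contiguous enough to
 * aggregate into reasonably sized chunks.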
2628 */ 2629 boolean_t forceparity = rm->rm_nrows > 1; 2630 2631 if (rm->rm_phys_col) { 2632 vdev_raidz_io_start_read_phys_cols(zio, rm); 2633 } else { 2634 for (int i = 0; i < rm->rm_nrows; i++) { 2635 raidz_row_t *rr = rm->rm_row[i]; 2636 vdev_raidz_io_start_read_row(zio, rr, forceparity); 2637 } 2638 } 2639 } 2640 2641 /* 2642 * Start an IO operation on a RAIDZ VDev 2643 * 2644 * Outline: 2645 * - For write operations: 2646 * 1. Generate the parity data 2647 * 2. Create child zio write operations to each column's vdev, for both 2648 * data and parity. 2649 * 3. If the column skips any sectors for padding, create optional dummy 2650 * write zio children for those areas to improve aggregation continuity. 2651 * - For read operations: 2652 * 1. Create child zio read operations to each data column's vdev to read 2653 * the range of data required for zio. 2654 * 2. If this is a scrub or resilver operation, or if any of the data 2655 * vdevs have had errors, then create zio read operations to the parity 2656 * columns' VDevs as well. 2657 */ 2658 static void 2659 vdev_raidz_io_start(zio_t *zio) 2660 { 2661 vdev_t *vd = zio->io_vd; 2662 vdev_t *tvd = vd->vdev_top; 2663 vdev_raidz_t *vdrz = vd->vdev_tsd; 2664 raidz_map_t *rm; 2665 2666 uint64_t logical_width = vdev_raidz_get_logical_width(vdrz, 2667 BP_GET_PHYSICAL_BIRTH(zio->io_bp)); 2668 if (logical_width != vdrz->vd_physical_width) { 2669 zfs_locked_range_t *lr = NULL; 2670 uint64_t synced_offset = UINT64_MAX; 2671 uint64_t next_offset = UINT64_MAX; 2672 boolean_t use_scratch = B_FALSE; 2673 /* 2674 * Note: when the expansion is completing, we set 2675 * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync()) 2676 * in a later txg than when we last update spa_ubsync's state 2677 * (see the end of spa_raidz_expand_thread()). Therefore we 2678 * may see vre_state!=SCANNING before 2679 * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected 2680 * on disk, but the copying progress has been synced to disk 2681 * (and reflected in spa_ubsync). In this case it's fine to 2682 * treat the expansion as completed, since if we crash there's 2683 * no additional copying to do. 2684 */ 2685 if (vdrz->vn_vre.vre_state == DSS_SCANNING) { 2686 ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==, 2687 &vdrz->vn_vre); 2688 lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock, 2689 zio->io_offset, zio->io_size, RL_READER); 2690 use_scratch = 2691 (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) == 2692 RRSS_SCRATCH_VALID); 2693 synced_offset = 2694 RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync); 2695 next_offset = vdrz->vn_vre.vre_offset; 2696 /* 2697 * If we haven't resumed expanding since importing the 2698 * pool, vre_offset won't have been set yet. In 2699 * this case the next offset to be copied is the same 2700 * as what was synced. 2701 */ 2702 if (next_offset == UINT64_MAX) { 2703 next_offset = synced_offset; 2704 } 2705 } 2706 if (use_scratch) { 2707 zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced=" 2708 "%lld next_offset=%lld use_scratch=%u", 2709 zio, 2710 zio->io_type == ZIO_TYPE_WRITE ? 
"WRITE" : "READ", 2711 (long long)zio->io_offset, 2712 (long long)synced_offset, 2713 (long long)next_offset, 2714 use_scratch); 2715 } 2716 2717 rm = vdev_raidz_map_alloc_expanded(zio, 2718 tvd->vdev_ashift, vdrz->vd_physical_width, 2719 logical_width, vdrz->vd_nparity, 2720 synced_offset, next_offset, use_scratch); 2721 rm->rm_lr = lr; 2722 } else { 2723 rm = vdev_raidz_map_alloc(zio, 2724 tvd->vdev_ashift, logical_width, vdrz->vd_nparity); 2725 } 2726 rm->rm_original_width = vdrz->vd_original_width; 2727 2728 zio->io_vsd = rm; 2729 zio->io_vsd_ops = &vdev_raidz_vsd_ops; 2730 if (zio->io_type == ZIO_TYPE_WRITE) { 2731 for (int i = 0; i < rm->rm_nrows; i++) { 2732 vdev_raidz_io_start_write(zio, rm->rm_row[i]); 2733 } 2734 2735 if (logical_width == vdrz->vd_physical_width) { 2736 raidz_start_skip_writes(zio); 2737 } 2738 } else { 2739 ASSERT(zio->io_type == ZIO_TYPE_READ); 2740 vdev_raidz_io_start_read(zio, rm); 2741 } 2742 2743 zio_execute(zio); 2744 } 2745 2746 /* 2747 * Report a checksum error for a child of a RAID-Z device. 2748 */ 2749 void 2750 vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) 2751 { 2752 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 2753 2754 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) && 2755 zio->io_priority != ZIO_PRIORITY_REBUILD) { 2756 zio_bad_cksum_t zbc; 2757 raidz_map_t *rm = zio->io_vsd; 2758 2759 zbc.zbc_has_cksum = 0; 2760 zbc.zbc_injected = rm->rm_ecksuminjected; 2761 2762 mutex_enter(&vd->vdev_stat_lock); 2763 vd->vdev_stat.vs_checksum_errors++; 2764 mutex_exit(&vd->vdev_stat_lock); 2765 (void) zfs_ereport_post_checksum(zio->io_spa, vd, 2766 &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, 2767 rc->rc_abd, bad_data, &zbc); 2768 } 2769 } 2770 2771 /* 2772 * We keep track of whether or not there were any injected errors, so that 2773 * any ereports we generate can note it. 2774 */ 2775 static int 2776 raidz_checksum_verify(zio_t *zio) 2777 { 2778 zio_bad_cksum_t zbc = {0}; 2779 raidz_map_t *rm = zio->io_vsd; 2780 2781 int ret = zio_checksum_error(zio, &zbc); 2782 /* 2783 * Any Direct I/O read that has a checksum error must be treated as 2784 * suspicious as the contents of the buffer could be getting 2785 * manipulated while the I/O is taking place. The checksum verify error 2786 * will be reported to the top-level RAIDZ VDEV. 2787 */ 2788 if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) { 2789 zio->io_error = ret; 2790 zio->io_post |= ZIO_POST_DIO_CHKSUM_ERR; 2791 zio_dio_chksum_verify_error_report(zio); 2792 zio_checksum_verified(zio); 2793 return (0); 2794 } 2795 2796 if (ret != 0 && zbc.zbc_injected != 0) 2797 rm->rm_ecksuminjected = 1; 2798 2799 return (ret); 2800 } 2801 2802 /* 2803 * Generate the parity from the data columns. If we tried and were able to 2804 * read the parity without error, verify that the generated parity matches the 2805 * data we read. If it doesn't, we fire off a checksum error. Return the 2806 * number of such failures. 2807 */ 2808 static int 2809 raidz_parity_verify(zio_t *zio, raidz_row_t *rr) 2810 { 2811 abd_t *orig[VDEV_RAIDZ_MAXPARITY]; 2812 int c, ret = 0; 2813 raidz_map_t *rm = zio->io_vsd; 2814 raidz_col_t *rc; 2815 2816 blkptr_t *bp = zio->io_bp; 2817 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : 2818 (BP_IS_GANG(bp) ? 
ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); 2819 2820 if (checksum == ZIO_CHECKSUM_NOPARITY) 2821 return (ret); 2822 2823 for (c = 0; c < rr->rr_firstdatacol; c++) { 2824 rc = &rr->rr_col[c]; 2825 if (!rc->rc_tried || rc->rc_error != 0) 2826 continue; 2827 2828 orig[c] = rc->rc_abd; 2829 ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size); 2830 rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); 2831 } 2832 2833 /* 2834 * Verify any empty sectors are zero filled to ensure the parity 2835 * is calculated correctly even if these non-data sectors are damaged. 2836 */ 2837 if (rr->rr_nempty && rr->rr_abd_empty != NULL) 2838 ret += vdev_draid_map_verify_empty(zio, rr); 2839 2840 /* 2841 * Regenerates parity even for !tried||rc_error!=0 columns. This 2842 * isn't harmful but it does have the side effect of fixing stuff 2843 * we didn't realize was necessary (i.e. even if we return 0). 2844 */ 2845 vdev_raidz_generate_parity_row(rm, rr); 2846 2847 for (c = 0; c < rr->rr_firstdatacol; c++) { 2848 rc = &rr->rr_col[c]; 2849 2850 if (!rc->rc_tried || rc->rc_error != 0) 2851 continue; 2852 2853 if (abd_cmp(orig[c], rc->rc_abd) != 0) { 2854 zfs_dbgmsg("found error on col=%u devidx=%u off %llx", 2855 c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset); 2856 vdev_raidz_checksum_error(zio, rc, orig[c]); 2857 rc->rc_error = SET_ERROR(ECKSUM); 2858 ret++; 2859 } 2860 abd_free(orig[c]); 2861 } 2862 2863 return (ret); 2864 } 2865 2866 static int 2867 vdev_raidz_worst_error(raidz_row_t *rr) 2868 { 2869 int error = 0; 2870 2871 for (int c = 0; c < rr->rr_cols; c++) { 2872 error = zio_worst_error(error, rr->rr_col[c].rc_error); 2873 error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error); 2874 } 2875 2876 return (error); 2877 } 2878 2879 /* 2880 * Find the median value from a set of n values 2881 */ 2882 static uint64_t 2883 latency_median_value(const uint64_t *data, size_t n) 2884 { 2885 uint64_t m; 2886 2887 if (n % 2 == 0) 2888 m = (data[(n >> 1) - 1] + data[n >> 1]) >> 1; 2889 else 2890 m = data[((n + 1) >> 1) - 1]; 2891 2892 return (m); 2893 } 2894 2895 /* 2896 * Calculate the outlier fence from a set of n latency values 2897 * 2898 * fence = Q3 + vdev_raidz_outlier_insensitivity x (Q3 - Q1) 2899 */ 2900 static uint64_t 2901 latency_quartiles_fence(const uint64_t *data, size_t n, uint64_t *iqr) 2902 { 2903 uint64_t q1 = latency_median_value(&data[0], n >> 1); 2904 uint64_t q3 = latency_median_value(&data[(n + 1) >> 1], n >> 1); 2905 2906 /* 2907 * To avoid detecting false positive outliers when N is small and 2908 * and the latencies values are very close, make sure the IQR 2909 * is at least 25% larger than Q1. 
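 *
 * Worked example (made-up numbers): eight children whose average read
 * latencies over the last interval, already sorted, are
 *
 *	{ 1.0, 1.0, 1.1, 1.1, 1.2, 1.2, 1.3, 50.0 }	milliseconds
 *
 * Q1 = (1.0 + 1.1) / 2 = 1.05ms and Q3 = (1.2 + 1.3) / 2 = 1.25ms, so the
 * raw IQR of 0.2ms is below the Q1 / 4 = 0.2625ms floor and the floor is
 * used instead. With vdev_raidz_outlier_insensitivity = 50 the fence is
 * 1.25ms + 50 * 0.2625ms = 14.375ms, and only the 50ms child lands above
 * it.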
2910 */ 2911 *iqr = MAX(q3 - q1, q1 / 4); 2912 2913 return (q3 + (*iqr * vdev_raidz_outlier_insensitivity)); 2914 } 2915 #define LAT_CHILDREN_MIN 5 2916 #define LAT_OUTLIER_LIMIT 20 2917 2918 static int 2919 latency_compare(const void *arg1, const void *arg2) 2920 { 2921 const uint64_t *l1 = (uint64_t *)arg1; 2922 const uint64_t *l2 = (uint64_t *)arg2; 2923 2924 return (TREE_CMP(*l1, *l2)); 2925 } 2926 2927 void 2928 vdev_raidz_sit_child(vdev_t *svd, uint64_t secs) 2929 { 2930 for (int c = 0; c < svd->vdev_children; c++) 2931 vdev_raidz_sit_child(svd->vdev_child[c], secs); 2932 2933 if (!svd->vdev_ops->vdev_op_leaf) 2934 return; 2935 2936 /* Begin a sit out period for this slow drive */ 2937 svd->vdev_read_sit_out_expire = gethrestime_sec() + 2938 secs; 2939 2940 /* Count each slow io period */ 2941 mutex_enter(&svd->vdev_stat_lock); 2942 svd->vdev_stat.vs_slow_ios++; 2943 mutex_exit(&svd->vdev_stat_lock); 2944 } 2945 2946 void 2947 vdev_raidz_unsit_child(vdev_t *vd) 2948 { 2949 for (int c = 0; c < vd->vdev_children; c++) 2950 vdev_raidz_unsit_child(vd->vdev_child[c]); 2951 2952 if (!vd->vdev_ops->vdev_op_leaf) 2953 return; 2954 2955 vd->vdev_read_sit_out_expire = 0; 2956 } 2957 2958 /* 2959 * Check for any latency outlier from latest set of child reads. 2960 * 2961 * Uses a Tukey's fence, with K = 50, for detecting extreme outliers. This 2962 * rule defines extreme outliers as data points outside the fence of the 2963 * third quartile plus fifty times the Interquartile Range (IQR). This range 2964 * is the distance between the first and third quartile. 2965 * 2966 * Fifty is an extremely large value for Tukey's fence, but the outliers we're 2967 * attempting to detect here are orders of magnitude times larger than the 2968 * median. This large value should capture any truly fault disk quickly, 2969 * without causing spurious sit-outs. 2970 * 2971 * To further avoid spurious sit-outs, vdevs must be detected multiple times 2972 * as an outlier before they are sat, and outlier counts will gradually decay. 2973 * Every nchildren times we have detected an outlier, we subtract 2 from the 2974 * outlier count of all children. If detected outliers are close to uniformly 2975 * distributed, this will result in the outlier count remaining close to 0 2976 * (in expectation; over long enough time-scales, spurious sit-outs are still 2977 * possible). 2978 */ 2979 static void 2980 vdev_child_slow_outlier(zio_t *zio) 2981 { 2982 vdev_t *vd = zio->io_vd; 2983 if (!vd->vdev_autosit || vdev_read_sit_out_secs == 0 || 2984 vd->vdev_children < LAT_CHILDREN_MIN) 2985 return; 2986 2987 hrtime_t now = getlrtime(); 2988 uint64_t last = atomic_load_64(&vd->vdev_last_latency_check); 2989 2990 if ((now - last) < MSEC2NSEC(vdev_raidz_outlier_check_interval_ms)) 2991 return; 2992 2993 /* Allow a single winner when there are racing callers. 
*/ 2994 if (atomic_cas_64(&vd->vdev_last_latency_check, last, now) != last) 2995 return; 2996 2997 int children = vd->vdev_children; 2998 uint64_t *lat_data = kmem_alloc(sizeof (uint64_t) * children, KM_SLEEP); 2999 3000 for (int c = 0; c < children; c++) { 3001 vdev_t *cvd = vd->vdev_child[c]; 3002 if (cvd->vdev_prev_histo == NULL) { 3003 mutex_enter(&cvd->vdev_stat_lock); 3004 size_t size = 3005 sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]); 3006 cvd->vdev_prev_histo = kmem_zalloc(size, KM_SLEEP); 3007 memcpy(cvd->vdev_prev_histo, 3008 cvd->vdev_stat_ex.vsx_disk_histo[ZIO_TYPE_READ], 3009 size); 3010 mutex_exit(&cvd->vdev_stat_lock); 3011 } 3012 } 3013 uint64_t max = 0; 3014 vdev_t *svd = NULL; 3015 uint_t sitouts = 0; 3016 boolean_t skip = B_FALSE, svd_sitting = B_FALSE; 3017 for (int c = 0; c < children; c++) { 3018 vdev_t *cvd = vd->vdev_child[c]; 3019 boolean_t sitting = vdev_sit_out_reads(cvd, 0) || 3020 cvd->vdev_state != VDEV_STATE_HEALTHY; 3021 3022 /* We can't sit out more disks than we have parity */ 3023 if (sitting && ++sitouts >= vdev_get_nparity(vd)) 3024 skip = B_TRUE; 3025 3026 mutex_enter(&cvd->vdev_stat_lock); 3027 3028 uint64_t *prev_histo = cvd->vdev_prev_histo; 3029 uint64_t *histo = 3030 cvd->vdev_stat_ex.vsx_disk_histo[ZIO_TYPE_READ]; 3031 if (skip) { 3032 size_t size = 3033 sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]); 3034 memcpy(prev_histo, histo, size); 3035 mutex_exit(&cvd->vdev_stat_lock); 3036 continue; 3037 } 3038 uint64_t count = 0; 3039 lat_data[c] = 0; 3040 for (int i = 0; i < VDEV_L_HISTO_BUCKETS; i++) { 3041 uint64_t this_count = histo[i] - prev_histo[i]; 3042 lat_data[c] += (1ULL << i) * this_count; 3043 count += this_count; 3044 } 3045 size_t size = sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]); 3046 memcpy(prev_histo, histo, size); 3047 mutex_exit(&cvd->vdev_stat_lock); 3048 lat_data[c] /= MAX(1, count); 3049 3050 /* Wait until all disks have been read from */ 3051 if (lat_data[c] == 0 && !sitting) { 3052 skip = B_TRUE; 3053 continue; 3054 } 3055 3056 /* Keep track of the vdev with largest value */ 3057 if (lat_data[c] > max) { 3058 max = lat_data[c]; 3059 svd = cvd; 3060 svd_sitting = sitting; 3061 } 3062 } 3063 3064 if (skip) { 3065 kmem_free(lat_data, sizeof (uint64_t) * children); 3066 return; 3067 } 3068 3069 qsort((void *)lat_data, children, sizeof (uint64_t), latency_compare); 3070 3071 uint64_t iqr; 3072 uint64_t fence = latency_quartiles_fence(lat_data, children, &iqr); 3073 3074 ASSERT3U(lat_data[children - 1], ==, max); 3075 if (max > fence && !svd_sitting) { 3076 ASSERT3U(iqr, >, 0); 3077 uint64_t incr = MAX(1, MIN((max - fence) / iqr, 3078 LAT_OUTLIER_LIMIT / 4)); 3079 vd->vdev_outlier_count += incr; 3080 if (vd->vdev_outlier_count >= children) { 3081 for (int c = 0; c < children; c++) { 3082 vdev_t *cvd = vd->vdev_child[c]; 3083 cvd->vdev_outlier_count -= 2; 3084 cvd->vdev_outlier_count = MAX(0, 3085 cvd->vdev_outlier_count); 3086 } 3087 vd->vdev_outlier_count = 0; 3088 } 3089 /* 3090 * Keep track of how many times this child has had 3091 * an outlier read. A disk that persitently has a 3092 * higher than peers outlier count will be considered 3093 * a slow disk. 
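 *
 * Continuing the worked example above: each detection adds
 * incr = MAX(1, MIN((max - fence) / iqr, LAT_OUTLIER_LIMIT / 4)),
 * i.e. between 1 and 5, to the slow child's count, so even a grossly
 * slow disk needs at least five consecutive detection intervals
 * (5 * 5 = 25 > 20) before it crosses LAT_OUTLIER_LIMIT and begins a
 * sit out, while the periodic "subtract 2 from every child" decay above
 * keeps a healthy disk from drifting toward the limit by chance.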
3094 */ 3095 svd->vdev_outlier_count += incr; 3096 if (svd->vdev_outlier_count > LAT_OUTLIER_LIMIT) { 3097 ASSERT0(svd->vdev_read_sit_out_expire); 3098 vdev_raidz_sit_child(svd, vdev_read_sit_out_secs); 3099 (void) zfs_ereport_post(FM_EREPORT_ZFS_SITOUT, 3100 zio->io_spa, svd, NULL, NULL, 0); 3101 vdev_dbgmsg(svd, "begin read sit out for %d secs", 3102 (int)vdev_read_sit_out_secs); 3103 3104 for (int c = 0; c < vd->vdev_children; c++) 3105 vd->vdev_child[c]->vdev_outlier_count = 0; 3106 } 3107 } 3108 3109 kmem_free(lat_data, sizeof (uint64_t) * children); 3110 } 3111 3112 static void 3113 vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) 3114 { 3115 int unexpected_errors = 0; 3116 int parity_errors = 0; 3117 int parity_untried = 0; 3118 int data_errors = 0; 3119 3120 ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); 3121 3122 for (int c = 0; c < rr->rr_cols; c++) { 3123 raidz_col_t *rc = &rr->rr_col[c]; 3124 3125 if (rc->rc_error) { 3126 if (c < rr->rr_firstdatacol) 3127 parity_errors++; 3128 else 3129 data_errors++; 3130 3131 if (!rc->rc_skipped) 3132 unexpected_errors++; 3133 } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { 3134 parity_untried++; 3135 } 3136 3137 if (rc->rc_force_repair) 3138 unexpected_errors++; 3139 } 3140 3141 /* 3142 * If we read more parity disks than were used for 3143 * reconstruction, confirm that the other parity disks produced 3144 * correct data. 3145 * 3146 * Note that we also regenerate parity when resilvering so we 3147 * can write it out to failed devices later. 3148 */ 3149 if (parity_errors + parity_untried < 3150 rr->rr_firstdatacol - data_errors || 3151 (zio->io_flags & ZIO_FLAG_RESILVER)) { 3152 int n = raidz_parity_verify(zio, rr); 3153 unexpected_errors += n; 3154 } 3155 3156 if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 3157 (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) { 3158 /* 3159 * Use the good data we have in hand to repair damaged children. 3160 */ 3161 for (int c = 0; c < rr->rr_cols; c++) { 3162 raidz_col_t *rc = &rr->rr_col[c]; 3163 vdev_t *vd = zio->io_vd; 3164 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 3165 3166 if (!rc->rc_allow_repair) { 3167 continue; 3168 } else if (!rc->rc_force_repair && 3169 (rc->rc_error == 0 || rc->rc_size == 0)) { 3170 continue; 3171 } 3172 /* 3173 * We do not allow self healing for Direct I/O reads. 3174 * See comment in vdev_raid_row_alloc(). 3175 */ 3176 ASSERT0(zio->io_flags & ZIO_FLAG_DIO_READ); 3177 3178 zfs_dbgmsg("zio=%px repairing c=%u devidx=%u " 3179 "offset=%llx", 3180 zio, c, rc->rc_devidx, (long long)rc->rc_offset); 3181 3182 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 3183 rc->rc_offset, rc->rc_abd, rc->rc_size, 3184 ZIO_TYPE_WRITE, 3185 zio->io_priority == ZIO_PRIORITY_REBUILD ? 3186 ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE, 3187 ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 3188 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); 3189 } 3190 } 3191 3192 /* 3193 * Scrub or resilver i/o's: overwrite any shadow locations with the 3194 * good data. This ensures that if we've already copied this sector, 3195 * it will be corrected if it was damaged. This writes more than is 3196 * necessary, but since expansion is paused during scrub/resilver, at 3197 * most a single row will have a shadow location. 
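 *
 * To make that concrete: while a row is being reflowed it exists at both
 * its old location (rc_offset on rc_devidx) and its shadow location
 * (rc_shadow_offset on rc_shadow_devidx), and after a crash either copy
 * may end up being the one the pool keeps. Rewriting the verified data
 * to the shadow location below keeps both copies correct whichever way
 * that goes.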
3198 */ 3199 if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 3200 (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) { 3201 for (int c = 0; c < rr->rr_cols; c++) { 3202 raidz_col_t *rc = &rr->rr_col[c]; 3203 vdev_t *vd = zio->io_vd; 3204 3205 if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0) 3206 continue; 3207 vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx]; 3208 3209 /* 3210 * Note: We don't want to update the repair stats 3211 * because that would incorrectly indicate that there 3212 * was bad data to repair, which we aren't sure about. 3213 * By clearing the SCAN_THREAD flag, we prevent this 3214 * from happening, despite having the REPAIR flag set. 3215 * We need to set SELF_HEAL so that this i/o can't be 3216 * bypassed by zio_vdev_io_start(). 3217 */ 3218 zio_t *cio = zio_vdev_child_io(zio, NULL, cvd, 3219 rc->rc_shadow_offset, rc->rc_abd, rc->rc_size, 3220 ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, 3221 ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, 3222 NULL, NULL); 3223 cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD; 3224 zio_nowait(cio); 3225 } 3226 } 3227 } 3228 3229 static void 3230 raidz_restore_orig_data(raidz_map_t *rm) 3231 { 3232 for (int i = 0; i < rm->rm_nrows; i++) { 3233 raidz_row_t *rr = rm->rm_row[i]; 3234 for (int c = 0; c < rr->rr_cols; c++) { 3235 raidz_col_t *rc = &rr->rr_col[c]; 3236 if (rc->rc_need_orig_restore) { 3237 abd_copy(rc->rc_abd, 3238 rc->rc_orig_data, rc->rc_size); 3239 rc->rc_need_orig_restore = B_FALSE; 3240 } 3241 } 3242 } 3243 } 3244 3245 /* 3246 * During raidz_reconstruct() for expanded VDEV, we need special consideration 3247 * failure simulations. See note in raidz_reconstruct() on simulating failure 3248 * of a pre-expansion device. 3249 * 3250 * Treating logical child i as failed, return TRUE if the given column should 3251 * be treated as failed. The idea of logical children allows us to imagine 3252 * that a disk silently failed before a RAIDZ expansion (reads from this disk 3253 * succeed but return the wrong data). Since the expansion doesn't verify 3254 * checksums, the incorrect data will be moved to new locations spread among 3255 * the children (going diagonally across them). 3256 * 3257 * Higher "logical child failures" (values of `i`) indicate these 3258 * "pre-expansion failures". The first physical_width values imagine that a 3259 * current child failed; the next physical_width-1 values imagine that a 3260 * child failed before the most recent expansion; the next physical_width-2 3261 * values imagine a child failed in the expansion before that, etc. 3262 */ 3263 static boolean_t 3264 raidz_simulate_failure(int physical_width, int original_width, int ashift, 3265 int i, raidz_col_t *rc) 3266 { 3267 uint64_t sector_id = 3268 physical_width * (rc->rc_offset >> ashift) + 3269 rc->rc_devidx; 3270 3271 for (int w = physical_width; w >= original_width; w--) { 3272 if (i < w) { 3273 return (sector_id % w == i); 3274 } else { 3275 i -= w; 3276 } 3277 } 3278 ASSERT(!"invalid logical child id"); 3279 return (B_FALSE); 3280 } 3281 3282 /* 3283 * returns EINVAL if reconstruction of the block will not be possible 3284 * returns ECKSUM if this specific reconstruction failed 3285 * returns 0 on successful reconstruction 3286 */ 3287 static int 3288 raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) 3289 { 3290 raidz_map_t *rm = zio->io_vsd; 3291 int physical_width = zio->io_vd->vdev_children; 3292 int original_width = (rm->rm_original_width != 0) ? 
3293 rm->rm_original_width : physical_width; 3294 int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT; 3295 3296 if (dbgmsg) { 3297 zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u " 3298 "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts); 3299 } 3300 3301 /* Reconstruct each row */ 3302 for (int r = 0; r < rm->rm_nrows; r++) { 3303 raidz_row_t *rr = rm->rm_row[r]; 3304 int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */ 3305 int t = 0; 3306 int dead = 0; 3307 int dead_data = 0; 3308 3309 if (dbgmsg) 3310 zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r); 3311 3312 for (int c = 0; c < rr->rr_cols; c++) { 3313 raidz_col_t *rc = &rr->rr_col[c]; 3314 ASSERT0(rc->rc_need_orig_restore); 3315 if (rc->rc_error != 0) { 3316 dead++; 3317 if (c >= nparity) 3318 dead_data++; 3319 continue; 3320 } 3321 if (rc->rc_size == 0) 3322 continue; 3323 for (int lt = 0; lt < ntgts; lt++) { 3324 if (raidz_simulate_failure(physical_width, 3325 original_width, 3326 zio->io_vd->vdev_top->vdev_ashift, 3327 ltgts[lt], rc)) { 3328 if (rc->rc_orig_data == NULL) { 3329 rc->rc_orig_data = 3330 abd_alloc_linear( 3331 rc->rc_size, B_TRUE); 3332 abd_copy(rc->rc_orig_data, 3333 rc->rc_abd, rc->rc_size); 3334 } 3335 rc->rc_need_orig_restore = B_TRUE; 3336 3337 dead++; 3338 if (c >= nparity) 3339 dead_data++; 3340 /* 3341 * Note: simulating failure of a 3342 * pre-expansion device can hit more 3343 * than one column, in which case we 3344 * might try to simulate more failures 3345 * than can be reconstructed, which is 3346 * also more than the size of my_tgts. 3347 * This check prevents accessing past 3348 * the end of my_tgts. The "dead > 3349 * nparity" check below will fail this 3350 * reconstruction attempt. 3351 */ 3352 if (t < VDEV_RAIDZ_MAXPARITY) { 3353 my_tgts[t++] = c; 3354 if (dbgmsg) { 3355 zfs_dbgmsg("simulating " 3356 "failure of col %u " 3357 "devidx %u", c, 3358 (int)rc->rc_devidx); 3359 } 3360 } 3361 break; 3362 } 3363 } 3364 } 3365 if (dead > nparity) { 3366 /* reconstruction not possible */ 3367 if (dbgmsg) { 3368 zfs_dbgmsg("reconstruction not possible; " 3369 "too many failures"); 3370 } 3371 raidz_restore_orig_data(rm); 3372 return (EINVAL); 3373 } 3374 if (dead_data > 0) 3375 vdev_raidz_reconstruct_row(rm, rr, my_tgts, t); 3376 } 3377 3378 /* Check for success */ 3379 if (raidz_checksum_verify(zio) == 0) { 3380 if (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR) 3381 return (0); 3382 3383 /* Reconstruction succeeded - report errors */ 3384 for (int i = 0; i < rm->rm_nrows; i++) { 3385 raidz_row_t *rr = rm->rm_row[i]; 3386 3387 for (int c = 0; c < rr->rr_cols; c++) { 3388 raidz_col_t *rc = &rr->rr_col[c]; 3389 if (rc->rc_need_orig_restore) { 3390 /* 3391 * Note: if this is a parity column, 3392 * we don't really know if it's wrong. 3393 * We need to let 3394 * vdev_raidz_io_done_verified() check 3395 * it, and if we set rc_error, it will 3396 * think that it is a "known" error 3397 * that doesn't need to be checked 3398 * or corrected. 
3399 */ 3400 if (rc->rc_error == 0 && 3401 c >= rr->rr_firstdatacol) { 3402 vdev_raidz_checksum_error(zio, 3403 rc, rc->rc_orig_data); 3404 rc->rc_error = 3405 SET_ERROR(ECKSUM); 3406 } 3407 rc->rc_need_orig_restore = B_FALSE; 3408 } 3409 } 3410 3411 vdev_raidz_io_done_verified(zio, rr); 3412 } 3413 3414 zio_checksum_verified(zio); 3415 3416 if (dbgmsg) { 3417 zfs_dbgmsg("reconstruction successful " 3418 "(checksum verified)"); 3419 } 3420 return (0); 3421 } 3422 3423 /* Reconstruction failed - restore original data */ 3424 raidz_restore_orig_data(rm); 3425 if (dbgmsg) { 3426 zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum " 3427 "failed", zio); 3428 } 3429 return (ECKSUM); 3430 } 3431 3432 /* 3433 * Iterate over all combinations of N bad vdevs and attempt a reconstruction. 3434 * Note that the algorithm below is non-optimal because it doesn't take into 3435 * account how reconstruction is actually performed. For example, with 3436 * triple-parity RAID-Z the reconstruction procedure is the same if column 4 3437 * is targeted as invalid as if columns 1 and 4 are targeted since in both 3438 * cases we'd only use parity information in column 0. 3439 * 3440 * The order that we find the various possible combinations of failed 3441 * disks is dictated by these rules: 3442 * - Examine each "slot" (the "i" in tgts[i]) 3443 * - Try to increment this slot (tgts[i] += 1) 3444 * - if we can't increment because it runs into the next slot, 3445 * reset our slot to the minimum, and examine the next slot 3446 * 3447 * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose 3448 * 3 columns to reconstruct), we will generate the following sequence: 3449 * 3450 * STATE ACTION 3451 * 0 1 2 special case: skip since these are all parity 3452 * 0 1 3 first slot: reset to 0; middle slot: increment to 2 3453 * 0 2 3 first slot: increment to 1 3454 * 1 2 3 first: reset to 0; middle: reset to 1; last: increment to 4 3455 * 0 1 4 first: reset to 0; middle: increment to 2 3456 * 0 2 4 first: increment to 1 3457 * 1 2 4 first: reset to 0; middle: increment to 3 3458 * 0 3 4 first: increment to 1 3459 * 1 3 4 first: increment to 2 3460 * 2 3 4 first: reset to 0; middle: reset to 1; last: increment to 5 3461 * 0 1 5 first: reset to 0; middle: increment to 2 3462 * 0 2 5 first: increment to 1 3463 * 1 2 5 first: reset to 0; middle: increment to 3 3464 * 0 3 5 first: increment to 1 3465 * 1 3 5 first: increment to 2 3466 * 2 3 5 first: reset to 0; middle: increment to 4 3467 * 0 4 5 first: increment to 1 3468 * 1 4 5 first: increment to 2 3469 * 2 4 5 first: increment to 3 3470 * 3 4 5 done 3471 * 3472 * This strategy works for dRAID but is less efficient when there are a large 3473 * number of child vdevs and therefore permutations to check. Furthermore, 3474 * since the raidz_map_t rows likely do not overlap, reconstruction would be 3475 * possible as long as there are no more than nparity data errors per row. 3476 * These additional permutations are not currently checked but could be as 3477 * a future improvement. 3478 * 3479 * Returns 0 on success, ECKSUM on failure. 3480 */ 3481 static int 3482 vdev_raidz_combrec(zio_t *zio) 3483 { 3484 int nparity = vdev_get_nparity(zio->io_vd); 3485 raidz_map_t *rm = zio->io_vsd; 3486 int physical_width = zio->io_vd->vdev_children; 3487 int original_width = (rm->rm_original_width != 0) ? 
3488 rm->rm_original_width : physical_width; 3489 3490 for (int i = 0; i < rm->rm_nrows; i++) { 3491 raidz_row_t *rr = rm->rm_row[i]; 3492 int total_errors = 0; 3493 3494 for (int c = 0; c < rr->rr_cols; c++) { 3495 if (rr->rr_col[c].rc_error) 3496 total_errors++; 3497 } 3498 3499 if (total_errors > nparity) 3500 return (vdev_raidz_worst_error(rr)); 3501 } 3502 3503 for (int num_failures = 1; num_failures <= nparity; num_failures++) { 3504 int tstore[VDEV_RAIDZ_MAXPARITY + 2]; 3505 int *ltgts = &tstore[1]; /* value is logical child ID */ 3506 3507 3508 /* 3509 * Determine number of logical children, n. See comment 3510 * above raidz_simulate_failure(). 3511 */ 3512 int n = 0; 3513 for (int w = physical_width; 3514 w >= original_width; w--) { 3515 n += w; 3516 } 3517 3518 ASSERT3U(num_failures, <=, nparity); 3519 ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY); 3520 3521 /* Handle corner cases in combrec logic */ 3522 ltgts[-1] = -1; 3523 for (int i = 0; i < num_failures; i++) { 3524 ltgts[i] = i; 3525 } 3526 ltgts[num_failures] = n; 3527 3528 for (;;) { 3529 int err = raidz_reconstruct(zio, ltgts, num_failures, 3530 nparity); 3531 if (err == EINVAL) { 3532 /* 3533 * Reconstruction not possible with this # 3534 * failures; try more failures. 3535 */ 3536 break; 3537 } else if (err == 0) 3538 return (0); 3539 3540 /* Compute next targets to try */ 3541 for (int t = 0; ; t++) { 3542 ASSERT3U(t, <, num_failures); 3543 ltgts[t]++; 3544 if (ltgts[t] == n) { 3545 /* try more failures */ 3546 ASSERT3U(t, ==, num_failures - 1); 3547 if (zfs_flags & 3548 ZFS_DEBUG_RAIDZ_RECONSTRUCT) { 3549 zfs_dbgmsg("reconstruction " 3550 "failed for num_failures=" 3551 "%u; tried all " 3552 "combinations", 3553 num_failures); 3554 } 3555 break; 3556 } 3557 3558 ASSERT3U(ltgts[t], <, n); 3559 ASSERT3U(ltgts[t], <=, ltgts[t + 1]); 3560 3561 /* 3562 * If that spot is available, we're done here. 3563 * Try the next combination. 3564 */ 3565 if (ltgts[t] != ltgts[t + 1]) 3566 break; // found next combination 3567 3568 /* 3569 * Otherwise, reset this tgt to the minimum, 3570 * and move on to the next tgt. 3571 */ 3572 ltgts[t] = ltgts[t - 1] + 1; 3573 ASSERT3U(ltgts[t], ==, t); 3574 } 3575 3576 /* Increase the number of failures and keep trying. */ 3577 if (ltgts[num_failures - 1] == n) 3578 break; 3579 } 3580 } 3581 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 3582 zfs_dbgmsg("reconstruction failed for all num_failures"); 3583 return (ECKSUM); 3584 } 3585 3586 void 3587 vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) 3588 { 3589 for (uint64_t row = 0; row < rm->rm_nrows; row++) { 3590 raidz_row_t *rr = rm->rm_row[row]; 3591 vdev_raidz_reconstruct_row(rm, rr, t, nt); 3592 } 3593 } 3594 3595 /* 3596 * Complete a write IO operation on a RAIDZ VDev 3597 * 3598 * Outline: 3599 * 1. Check for errors on the child IOs. 3600 * 2. Return, setting an error code if too few child VDevs were written 3601 * to reconstruct the data later. Note that partial writes are 3602 * considered successful if they can be reconstructed at all. 
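 *
 * For example, on a raidz2 row (rr_firstdatacol == 2) up to two failed
 * child writes -- counted separately for the normal and, during
 * expansion, the shadow locations -- still leave the row reconstructible
 * and the zio succeeds; a third failure in either set propagates the
 * worst child error into zio->io_error.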
3603 */ 3604 static void 3605 vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) 3606 { 3607 int normal_errors = 0; 3608 int shadow_errors = 0; 3609 3610 ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); 3611 ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); 3612 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); 3613 3614 for (int c = 0; c < rr->rr_cols; c++) { 3615 raidz_col_t *rc = &rr->rr_col[c]; 3616 3617 if (rc->rc_error != 0) { 3618 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 3619 normal_errors++; 3620 } 3621 if (rc->rc_shadow_error != 0) { 3622 ASSERT(rc->rc_shadow_error != ECKSUM); 3623 shadow_errors++; 3624 } 3625 } 3626 3627 /* 3628 * Treat partial writes as a success. If we couldn't write enough 3629 * columns to reconstruct the data, the I/O failed. Otherwise, good 3630 * enough. Note that in the case of a shadow write (during raidz 3631 * expansion), depending on if we crash, either the normal (old) or 3632 * shadow (new) location may become the "real" version of the block, 3633 * so both locations must have sufficient redundancy. 3634 * 3635 * Now that we support write reallocation, it would be better 3636 * to treat partial failure as real failure unless there are 3637 * no non-degraded top-level vdevs left, and not update DTLs 3638 * if we intend to reallocate. 3639 */ 3640 if (normal_errors > rr->rr_firstdatacol || 3641 shadow_errors > rr->rr_firstdatacol) { 3642 zio->io_error = zio_worst_error(zio->io_error, 3643 vdev_raidz_worst_error(rr)); 3644 } 3645 } 3646 3647 static void 3648 vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm, 3649 raidz_row_t *rr) 3650 { 3651 int parity_errors = 0; 3652 int parity_untried = 0; 3653 int data_errors = 0; 3654 int total_errors = 0; 3655 3656 ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); 3657 ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); 3658 3659 for (int c = 0; c < rr->rr_cols; c++) { 3660 raidz_col_t *rc = &rr->rr_col[c]; 3661 3662 /* 3663 * If scrubbing and a replacing/sparing child vdev determined 3664 * that not all of its children have an identical copy of the 3665 * data, then clear the error so the column is treated like 3666 * any other read and force a repair to correct the damage. 3667 */ 3668 if (rc->rc_error == ECKSUM) { 3669 ASSERT(zio->io_flags & ZIO_FLAG_SCRUB); 3670 vdev_raidz_checksum_error(zio, rc, rc->rc_abd); 3671 rc->rc_force_repair = 1; 3672 rc->rc_error = 0; 3673 } 3674 3675 if (rc->rc_error) { 3676 if (c < rr->rr_firstdatacol) 3677 parity_errors++; 3678 else 3679 data_errors++; 3680 3681 total_errors++; 3682 } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { 3683 parity_untried++; 3684 } 3685 } 3686 3687 /* 3688 * If there were data errors and the number of errors we saw was 3689 * correctable -- less than or equal to the number of parity disks read 3690 * -- reconstruct based on the missing data. 3691 */ 3692 if (data_errors != 0 && 3693 total_errors <= rr->rr_firstdatacol - parity_untried) { 3694 /* 3695 * We either attempt to read all the parity columns or 3696 * none of them. If we didn't try to read parity, we 3697 * wouldn't be here in the correctable case. There must 3698 * also have been fewer parity errors than parity 3699 * columns or, again, we wouldn't be in this code path. 3700 */ 3701 ASSERT0(parity_untried); 3702 ASSERT(parity_errors < rr->rr_firstdatacol); 3703 3704 /* 3705 * Identify the data columns that reported an error. 
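*
* Hypothetical example: in a raidz2 row laid out as [P Q D0 D1 D2]
* (rr_firstdatacol == 2), an error on D1 alone yields tgts[] = { 3 }
* and n == 1, and vdev_raidz_reconstruct_row() rebuilds that single
* column from the surviving data plus parity.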
3706 */ 3707 int n = 0; 3708 int tgts[VDEV_RAIDZ_MAXPARITY]; 3709 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 3710 raidz_col_t *rc = &rr->rr_col[c]; 3711 if (rc->rc_error != 0) { 3712 ASSERT(n < VDEV_RAIDZ_MAXPARITY); 3713 tgts[n++] = c; 3714 } 3715 } 3716 3717 ASSERT(rr->rr_firstdatacol >= n); 3718 3719 vdev_raidz_reconstruct_row(rm, rr, tgts, n); 3720 } 3721 } 3722 3723 /* 3724 * Return the number of reads issued. 3725 */ 3726 static int 3727 vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr) 3728 { 3729 vdev_t *vd = zio->io_vd; 3730 int nread = 0; 3731 3732 rr->rr_missingdata = 0; 3733 rr->rr_missingparity = 0; 3734 3735 /* 3736 * If this rows contains empty sectors which are not required 3737 * for a normal read then allocate an ABD for them now so they 3738 * may be read, verified, and any needed repairs performed. 3739 */ 3740 if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL) 3741 vdev_draid_map_alloc_empty(zio, rr); 3742 3743 for (int c = 0; c < rr->rr_cols; c++) { 3744 raidz_col_t *rc = &rr->rr_col[c]; 3745 if (rc->rc_tried || rc->rc_size == 0) 3746 continue; 3747 3748 zio_nowait(zio_vdev_child_io(zio, NULL, 3749 vd->vdev_child[rc->rc_devidx], 3750 rc->rc_offset, rc->rc_abd, rc->rc_size, 3751 zio->io_type, zio->io_priority, 0, 3752 vdev_raidz_child_done, rc)); 3753 nread++; 3754 } 3755 return (nread); 3756 } 3757 3758 /* 3759 * We're here because either there were too many errors to even attempt 3760 * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec() 3761 * failed. In either case, there is enough bad data to prevent reconstruction. 3762 * Start checksum ereports for all children which haven't failed. 3763 */ 3764 static void 3765 vdev_raidz_io_done_unrecoverable(zio_t *zio) 3766 { 3767 raidz_map_t *rm = zio->io_vsd; 3768 3769 for (int i = 0; i < rm->rm_nrows; i++) { 3770 raidz_row_t *rr = rm->rm_row[i]; 3771 3772 for (int c = 0; c < rr->rr_cols; c++) { 3773 raidz_col_t *rc = &rr->rr_col[c]; 3774 vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; 3775 3776 if (rc->rc_error != 0) 3777 continue; 3778 3779 zio_bad_cksum_t zbc; 3780 zbc.zbc_has_cksum = 0; 3781 zbc.zbc_injected = rm->rm_ecksuminjected; 3782 mutex_enter(&cvd->vdev_stat_lock); 3783 cvd->vdev_stat.vs_checksum_errors++; 3784 mutex_exit(&cvd->vdev_stat_lock); 3785 (void) zfs_ereport_start_checksum(zio->io_spa, 3786 cvd, &zio->io_bookmark, zio, rc->rc_offset, 3787 rc->rc_size, &zbc); 3788 } 3789 } 3790 } 3791 3792 void 3793 vdev_raidz_io_done(zio_t *zio) 3794 { 3795 raidz_map_t *rm = zio->io_vsd; 3796 3797 ASSERT(zio->io_bp != NULL); 3798 if (zio->io_type == ZIO_TYPE_WRITE) { 3799 for (int i = 0; i < rm->rm_nrows; i++) { 3800 vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]); 3801 } 3802 } else { 3803 if (rm->rm_phys_col) { 3804 /* 3805 * This is an aggregated read. Copy the data and status 3806 * from the aggregate abd's to the individual rows. 3807 */ 3808 for (int i = 0; i < rm->rm_nrows; i++) { 3809 raidz_row_t *rr = rm->rm_row[i]; 3810 3811 for (int c = 0; c < rr->rr_cols; c++) { 3812 raidz_col_t *rc = &rr->rr_col[c]; 3813 if (rc->rc_tried || rc->rc_size == 0) 3814 continue; 3815 3816 raidz_col_t *prc = 3817 &rm->rm_phys_col[rc->rc_devidx]; 3818 rc->rc_error = prc->rc_error; 3819 rc->rc_tried = prc->rc_tried; 3820 rc->rc_skipped = prc->rc_skipped; 3821 if (c >= rr->rr_firstdatacol) { 3822 /* 3823 * Note: this is slightly faster 3824 * than using abd_copy_off(). 
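*
* For reference, the roughly equivalent (but slower) call would be
* something like:
*
*	abd_copy_off(rc->rc_abd, prc->rc_abd, 0,
*	    rc->rc_offset - prc->rc_offset, rc->rc_size);
*
* which is what the abd_to_buf()/abd_copy_from_buf() pair below
* hand-optimizes.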
3825 */ 3826 char *physbuf = abd_to_buf( 3827 prc->rc_abd); 3828 void *physloc = physbuf + 3829 rc->rc_offset - 3830 prc->rc_offset; 3831 3832 abd_copy_from_buf(rc->rc_abd, 3833 physloc, rc->rc_size); 3834 } 3835 } 3836 } 3837 } 3838 3839 for (int i = 0; i < rm->rm_nrows; i++) { 3840 raidz_row_t *rr = rm->rm_row[i]; 3841 vdev_raidz_io_done_reconstruct_known_missing(zio, 3842 rm, rr); 3843 } 3844 3845 if (raidz_checksum_verify(zio) == 0) { 3846 if (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR) 3847 goto done; 3848 3849 for (int i = 0; i < rm->rm_nrows; i++) { 3850 raidz_row_t *rr = rm->rm_row[i]; 3851 vdev_raidz_io_done_verified(zio, rr); 3852 } 3853 /* Periodically check for a read outlier */ 3854 if (zio->io_type == ZIO_TYPE_READ) 3855 vdev_child_slow_outlier(zio); 3856 zio_checksum_verified(zio); 3857 } else { 3858 /* 3859 * A sequential resilver has no checksum which makes 3860 * combinatoral reconstruction impossible. This code 3861 * path is unreachable since raidz_checksum_verify() 3862 * has no checksum to verify and must succeed. 3863 */ 3864 ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD); 3865 3866 /* 3867 * This isn't a typical situation -- either we got a 3868 * read error or a child silently returned bad data. 3869 * Read every block so we can try again with as much 3870 * data and parity as we can track down. If we've 3871 * already been through once before, all children will 3872 * be marked as tried so we'll proceed to combinatorial 3873 * reconstruction. 3874 */ 3875 int nread = 0; 3876 for (int i = 0; i < rm->rm_nrows; i++) { 3877 nread += vdev_raidz_read_all(zio, 3878 rm->rm_row[i]); 3879 } 3880 if (nread != 0) { 3881 /* 3882 * Normally our stage is VDEV_IO_DONE, but if 3883 * we've already called redone(), it will have 3884 * changed to VDEV_IO_START, in which case we 3885 * don't want to call redone() again. 3886 */ 3887 if (zio->io_stage != ZIO_STAGE_VDEV_IO_START) 3888 zio_vdev_io_redone(zio); 3889 return; 3890 } 3891 /* 3892 * It would be too expensive to try every possible 3893 * combination of failed sectors in every row, so 3894 * instead we try every combination of failed current or 3895 * past physical disk. This means that if the incorrect 3896 * sectors were all on Nparity disks at any point in the 3897 * past, we will find the correct data. The only known 3898 * case where this is less durable than a non-expanded 3899 * RAIDZ, is if we have a silent failure during 3900 * expansion. In that case, one block could be 3901 * partially in the old format and partially in the 3902 * new format, so we'd lost some sectors from the old 3903 * format and some from the new format. 3904 * 3905 * e.g. logical_width=4 physical_width=6 3906 * the 15 (6+5+4) possible failed disks are: 3907 * width=6 child=0 3908 * width=6 child=1 3909 * width=6 child=2 3910 * width=6 child=3 3911 * width=6 child=4 3912 * width=6 child=5 3913 * width=5 child=0 3914 * width=5 child=1 3915 * width=5 child=2 3916 * width=5 child=3 3917 * width=5 child=4 3918 * width=4 child=0 3919 * width=4 child=1 3920 * width=4 child=2 3921 * width=4 child=3 3922 * And we will try every combination of Nparity of these 3923 * failing. 3924 * 3925 * As a first pass, we can generate every combo, 3926 * and try reconstructing, ignoring any known 3927 * failures. If any row has too many known + simulated 3928 * failures, then we bail on reconstructing with this 3929 * number of simulated failures. As an improvement, 3930 * we could detect the number of whole known failures 3931 * (i.e. 
we have known failures on these disks for 3932 * every row; the disks never succeeded), and 3933 * subtract that from the max # failures to simulate. 3934 * We could go even further like the current 3935 * combrec code, but that doesn't seem like it 3936 * gains us very much. If we simulate a failure 3937 * that is also a known failure, that's fine. 3938 */ 3939 zio->io_error = vdev_raidz_combrec(zio); 3940 if (zio->io_error == ECKSUM && 3941 !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 3942 vdev_raidz_io_done_unrecoverable(zio); 3943 } 3944 } 3945 } 3946 done: 3947 if (rm->rm_lr != NULL) { 3948 zfs_rangelock_exit(rm->rm_lr); 3949 rm->rm_lr = NULL; 3950 } 3951 } 3952 3953 static void 3954 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 3955 { 3956 vdev_raidz_t *vdrz = vd->vdev_tsd; 3957 if (faulted > vdrz->vd_nparity) 3958 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 3959 VDEV_AUX_NO_REPLICAS); 3960 else if (degraded + faulted != 0) 3961 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 3962 else 3963 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 3964 } 3965 3966 /* 3967 * Determine if any portion of the provided block resides on a child vdev 3968 * with a dirty DTL and therefore needs to be resilvered. The function 3969 * assumes that at least one DTL is dirty which implies that full stripe 3970 * width blocks must be resilvered. 3971 */ 3972 static boolean_t 3973 vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, 3974 uint64_t phys_birth) 3975 { 3976 vdev_raidz_t *vdrz = vd->vdev_tsd; 3977 3978 /* 3979 * If we're in the middle of a RAIDZ expansion, this block may be in 3980 * the old and/or new location. For simplicity, always resilver it. 3981 */ 3982 if (vdrz->vn_vre.vre_state == DSS_SCANNING) 3983 return (B_TRUE); 3984 3985 uint64_t dcols = vd->vdev_children; 3986 uint64_t nparity = vdrz->vd_nparity; 3987 uint64_t ashift = vd->vdev_top->vdev_ashift; 3988 /* The starting RAIDZ (parent) vdev sector of the block. */ 3989 uint64_t b = DVA_GET_OFFSET(dva) >> ashift; 3990 /* The zio's size in units of the vdev's minimum sector size. */ 3991 uint64_t s = ((psize - 1) >> ashift) + 1; 3992 /* The first column for this stripe. */ 3993 uint64_t f = b % dcols; 3994 3995 /* Unreachable by sequential resilver. */ 3996 ASSERT3U(phys_birth, !=, TXG_UNKNOWN); 3997 3998 if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) 3999 return (B_FALSE); 4000 4001 if (s + nparity >= dcols) 4002 return (B_TRUE); 4003 4004 for (uint64_t c = 0; c < s + nparity; c++) { 4005 uint64_t devidx = (f + c) % dcols; 4006 vdev_t *cvd = vd->vdev_child[devidx]; 4007 4008 /* 4009 * dsl_scan_need_resilver() already checked vd with 4010 * vdev_dtl_contains(). So here just check cvd with 4011 * vdev_dtl_empty(), cheaper and a good approximation. 4012 */ 4013 if (!vdev_dtl_empty(cvd, DTL_PARTIAL)) 4014 return (B_TRUE); 4015 } 4016 4017 return (B_FALSE); 4018 } 4019 4020 static void 4021 vdev_raidz_xlate(vdev_t *cvd, const zfs_range_seg64_t *logical_rs, 4022 zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs) 4023 { 4024 (void) remain_rs; 4025 4026 vdev_t *raidvd = cvd->vdev_parent; 4027 ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); 4028 4029 vdev_raidz_t *vdrz = raidvd->vdev_tsd; 4030 4031 if (vdrz->vn_vre.vre_state == DSS_SCANNING) { 4032 /* 4033 * We're in the middle of expansion, in which case the 4034 * translation is in flux. Any answer we give may be wrong 4035 * by the time we return, so it isn't safe for the caller to 4036 * act on it. 
Therefore we say that this range isn't present 4037 * on any children. The only consumers of this are "zpool 4038 * initialize" and trimming, both of which are "best effort" 4039 * anyway. 4040 */ 4041 physical_rs->rs_start = physical_rs->rs_end = 0; 4042 remain_rs->rs_start = remain_rs->rs_end = 0; 4043 return; 4044 } 4045 4046 uint64_t width = vdrz->vd_physical_width; 4047 uint64_t tgt_col = cvd->vdev_id; 4048 uint64_t ashift = raidvd->vdev_top->vdev_ashift; 4049 4050 /* make sure the offsets are block-aligned */ 4051 ASSERT0(logical_rs->rs_start % (1 << ashift)); 4052 ASSERT0(logical_rs->rs_end % (1 << ashift)); 4053 uint64_t b_start = logical_rs->rs_start >> ashift; 4054 uint64_t b_end = logical_rs->rs_end >> ashift; 4055 4056 uint64_t start_row = 0; 4057 if (b_start > tgt_col) /* avoid underflow */ 4058 start_row = ((b_start - tgt_col - 1) / width) + 1; 4059 4060 uint64_t end_row = 0; 4061 if (b_end > tgt_col) 4062 end_row = ((b_end - tgt_col - 1) / width) + 1; 4063 4064 physical_rs->rs_start = start_row << ashift; 4065 physical_rs->rs_end = end_row << ashift; 4066 4067 ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start); 4068 ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=, 4069 logical_rs->rs_end - logical_rs->rs_start); 4070 } 4071 4072 static void 4073 raidz_reflow_sync(void *arg, dmu_tx_t *tx) 4074 { 4075 spa_t *spa = arg; 4076 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 4077 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4078 4079 /* 4080 * Ensure there are no i/os to the range that is being committed. 4081 */ 4082 uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock); 4083 ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset); 4084 4085 mutex_enter(&vre->vre_lock); 4086 uint64_t new_offset = 4087 MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset); 4088 /* 4089 * We should not have committed anything that failed. 4090 */ 4091 VERIFY3U(vre->vre_failed_offset, >=, old_offset); 4092 mutex_exit(&vre->vre_lock); 4093 4094 zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, 4095 old_offset, new_offset - old_offset, 4096 RL_WRITER); 4097 4098 /* 4099 * Update the uberblock that will be written when this txg completes. 
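*
* As a hypothetical illustration: if this txg copied through offset
* 0x100000 and nothing failed, new_offset == 0x100000 and the reflow
* info packed below records RRSS_SCRATCH_INVALID_SYNCED_REFLOW together
* with that offset, which the RRSS_GET_OFFSET() call above will then
* see as old_offset on the next invocation.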
4100 */ 4101 RAIDZ_REFLOW_SET(&spa->spa_uberblock, 4102 RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset); 4103 vre->vre_offset_pertxg[txgoff] = 0; 4104 zfs_rangelock_exit(lr); 4105 4106 mutex_enter(&vre->vre_lock); 4107 vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff]; 4108 vre->vre_bytes_copied_pertxg[txgoff] = 0; 4109 mutex_exit(&vre->vre_lock); 4110 4111 vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); 4112 VERIFY0(zap_update(spa->spa_meta_objset, 4113 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, 4114 sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx)); 4115 } 4116 4117 static void 4118 raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx) 4119 { 4120 spa_t *spa = arg; 4121 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4122 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4123 vdev_raidz_t *vdrz = raidvd->vdev_tsd; 4124 4125 for (int i = 0; i < TXG_SIZE; i++) 4126 VERIFY0(vre->vre_offset_pertxg[i]); 4127 4128 reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); 4129 re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES; 4130 re->re_logical_width = vdrz->vd_physical_width; 4131 mutex_enter(&vdrz->vd_expand_lock); 4132 avl_add(&vdrz->vd_expand_txgs, re); 4133 mutex_exit(&vdrz->vd_expand_lock); 4134 4135 vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); 4136 4137 /* 4138 * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS 4139 * will get written (based on vd_expand_txgs). 4140 */ 4141 vdev_config_dirty(vd); 4142 4143 /* 4144 * Before we change vre_state, the on-disk state must reflect that we 4145 * have completed all copying, so that vdev_raidz_io_start() can use 4146 * vre_state to determine if the reflow is in progress. See also the 4147 * end of spa_raidz_expand_thread(). 4148 */ 4149 VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, 4150 raidvd->vdev_ms_count << raidvd->vdev_ms_shift); 4151 4152 vre->vre_end_time = gethrestime_sec(); 4153 vre->vre_state = DSS_FINISHED; 4154 4155 uint64_t state = vre->vre_state; 4156 VERIFY0(zap_update(spa->spa_meta_objset, 4157 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, 4158 sizeof (state), 1, &state, tx)); 4159 4160 uint64_t end_time = vre->vre_end_time; 4161 VERIFY0(zap_update(spa->spa_meta_objset, 4162 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, 4163 sizeof (end_time), 1, &end_time, tx)); 4164 4165 spa->spa_uberblock.ub_raidz_reflow_info = 0; 4166 4167 spa_history_log_internal(spa, "raidz vdev expansion completed", tx, 4168 "%s vdev %llu new width %llu", spa_name(spa), 4169 (unsigned long long)vd->vdev_id, 4170 (unsigned long long)vd->vdev_children); 4171 4172 spa->spa_raidz_expand = NULL; 4173 raidvd->vdev_rz_expanding = B_FALSE; 4174 4175 spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); 4176 spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); 4177 spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); 4178 4179 spa_notify_waiters(spa); 4180 4181 /* 4182 * While we're in syncing context take the opportunity to 4183 * set up a scrub. All the data has been successfully copied 4184 * but we have not validated any checksums. 4185 */ 4186 setup_sync_arg_t setup_sync_arg = { 4187 .func = POOL_SCAN_SCRUB, 4188 .txgstart = 0, 4189 .txgend = 0, 4190 }; 4191 if (zfs_scrub_after_expand && 4192 dsl_scan_setup_check(&setup_sync_arg.func, tx) == 0) { 4193 dsl_scan_setup_sync(&setup_sync_arg, tx); 4194 } 4195 } 4196 4197 /* 4198 * State of one copy batch. 4199 */ 4200 typedef struct raidz_reflow_arg { 4201 vdev_raidz_expand_t *rra_vre; /* Global expansion state. 
*/ 4202 zfs_locked_range_t *rra_lr; /* Range lock of this batch. */ 4203 uint64_t rra_txg; /* TXG of this batch. */ 4204 uint_t rra_ashift; /* Ashift of the vdev. */ 4205 uint32_t rra_tbd; /* Number of in-flight ZIOs. */ 4206 uint32_t rra_writes; /* Number of write ZIOs. */ 4207 zio_t *rra_zio[]; /* Write ZIO pointers. */ 4208 } raidz_reflow_arg_t; 4209 4210 /* 4211 * Write of the new location on one child is done. Once all of them are done 4212 * we can unlock and free everything. 4213 */ 4214 static void 4215 raidz_reflow_write_done(zio_t *zio) 4216 { 4217 raidz_reflow_arg_t *rra = zio->io_private; 4218 vdev_raidz_expand_t *vre = rra->rra_vre; 4219 4220 abd_free(zio->io_abd); 4221 4222 mutex_enter(&vre->vre_lock); 4223 if (zio->io_error != 0) { 4224 /* Force a reflow pause on errors */ 4225 vre->vre_failed_offset = 4226 MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); 4227 } 4228 ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size); 4229 vre->vre_outstanding_bytes -= zio->io_size; 4230 if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length < 4231 vre->vre_failed_offset) { 4232 vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] += 4233 zio->io_size; 4234 } 4235 cv_signal(&vre->vre_cv); 4236 boolean_t done = (--rra->rra_tbd == 0); 4237 mutex_exit(&vre->vre_lock); 4238 4239 if (!done) 4240 return; 4241 spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa); 4242 zfs_rangelock_exit(rra->rra_lr); 4243 kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * rra->rra_writes); 4244 } 4245 4246 /* 4247 * Read of the old location on one child is done. Once all of them are done 4248 * writes should have all the data and we can issue them. 4249 */ 4250 static void 4251 raidz_reflow_read_done(zio_t *zio) 4252 { 4253 raidz_reflow_arg_t *rra = zio->io_private; 4254 vdev_raidz_expand_t *vre = rra->rra_vre; 4255 4256 /* Single-block reads borrowed a write ABD; bigger reads used a gang ABD, which is freed here. */ 4257 if (zio->io_size > (1 << rra->rra_ashift)) 4258 abd_free(zio->io_abd); 4259 4260 /* 4261 * If the read failed, or if it was done on a vdev that is not fully 4262 * healthy (e.g. a child that has a resilver in progress), we may not 4263 * have the correct data. Note that it's OK if the write proceeds. 4264 * It may write garbage but the location is otherwise unused and we 4265 * will retry later due to vre_failed_offset. 
4266 */ 4267 if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) { 4268 zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu " 4269 "err=%u partial_dtl_empty=%u missing_dtl_empty=%u", 4270 (long long)rra->rra_lr->lr_offset, 4271 (long long)rra->rra_lr->lr_length, 4272 (long long)rra->rra_txg, 4273 zio->io_error, 4274 vdev_dtl_empty(zio->io_vd, DTL_PARTIAL), 4275 vdev_dtl_empty(zio->io_vd, DTL_MISSING)); 4276 mutex_enter(&vre->vre_lock); 4277 /* Force a reflow pause on errors */ 4278 vre->vre_failed_offset = 4279 MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); 4280 mutex_exit(&vre->vre_lock); 4281 } 4282 4283 if (atomic_dec_32_nv(&rra->rra_tbd) > 0) 4284 return; 4285 uint32_t writes = rra->rra_tbd = rra->rra_writes; 4286 for (uint64_t i = 0; i < writes; i++) 4287 zio_nowait(rra->rra_zio[i]); 4288 } 4289 4290 static void 4291 raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset, 4292 dmu_tx_t *tx) 4293 { 4294 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 4295 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 4296 4297 if (offset == 0) 4298 return; 4299 4300 mutex_enter(&vre->vre_lock); 4301 ASSERT3U(vre->vre_offset, <=, offset); 4302 vre->vre_offset = offset; 4303 mutex_exit(&vre->vre_lock); 4304 4305 if (vre->vre_offset_pertxg[txgoff] == 0) { 4306 dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync, 4307 spa, tx); 4308 } 4309 vre->vre_offset_pertxg[txgoff] = offset; 4310 } 4311 4312 static boolean_t 4313 vdev_raidz_expand_child_replacing(vdev_t *raidz_vd) 4314 { 4315 for (int i = 0; i < raidz_vd->vdev_children; i++) { 4316 /* Quick check if a child is being replaced */ 4317 if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf) 4318 return (B_TRUE); 4319 } 4320 return (B_FALSE); 4321 } 4322 4323 static boolean_t 4324 raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, zfs_range_tree_t *rt, 4325 dmu_tx_t *tx) 4326 { 4327 spa_t *spa = vd->vdev_spa; 4328 uint_t ashift = vd->vdev_top->vdev_ashift; 4329 4330 zfs_range_seg_t *rs = zfs_range_tree_first(rt); 4331 if (rs == NULL) 4332 return (B_FALSE); 4333 uint64_t offset = zfs_rs_get_start(rs, rt); 4334 ASSERT(IS_P2ALIGNED(offset, 1 << ashift)); 4335 uint64_t size = zfs_rs_get_end(rs, rt) - offset; 4336 ASSERT3U(size, >=, 1 << ashift); 4337 ASSERT(IS_P2ALIGNED(size, 1 << ashift)); 4338 4339 uint64_t blkid = offset >> ashift; 4340 uint_t old_children = vd->vdev_children - 1; 4341 4342 /* 4343 * We can only progress to the point that writes will not overlap 4344 * with blocks whose progress has not yet been recorded on disk. 4345 * Since partially-copied rows are still read from the old location, 4346 * we need to stop one row before the sector-wise overlap, to prevent 4347 * row-wise overlap. 4348 * 4349 * Note that even if we are skipping over a large unallocated region, 4350 * we can't move the on-disk progress to `offset`, because concurrent 4351 * writes/allocations could still use the currently-unallocated 4352 * region. 
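*
* A hypothetical numeric sketch of the limit computed below: expanding
* a 4-wide raidz to 5-wide (old_children == 4) with on-disk progress at
* ubsync_blkid == 1000 gives
*
*	next_overwrite_blkid = 1000 + 1000 / 4 - 4 = 1246,
*
* so this batch may copy sectors up to (but not including) blkid 1246
* before it would risk overwriting old-layout data whose progress has
* not yet been recorded.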
4353 */ 4354 uint64_t ubsync_blkid = 4355 RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift; 4356 uint64_t next_overwrite_blkid = ubsync_blkid + 4357 ubsync_blkid / old_children - old_children; 4358 VERIFY3U(next_overwrite_blkid, >, ubsync_blkid); 4359 if (blkid >= next_overwrite_blkid) { 4360 raidz_reflow_record_progress(vre, 4361 next_overwrite_blkid << ashift, tx); 4362 return (B_TRUE); 4363 } 4364 4365 size = MIN(size, raidz_expand_max_copy_bytes); 4366 size = MIN(size, (uint64_t)old_children * 4367 MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE)); 4368 size = MAX(size, 1 << ashift); 4369 uint_t blocks = MIN(size >> ashift, next_overwrite_blkid - blkid); 4370 size = (uint64_t)blocks << ashift; 4371 4372 zfs_range_tree_remove(rt, offset, size); 4373 4374 uint_t reads = MIN(blocks, old_children); 4375 uint_t writes = MIN(blocks, vd->vdev_children); 4376 raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra) + 4377 sizeof (zio_t *) * writes, KM_SLEEP); 4378 rra->rra_vre = vre; 4379 rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock, 4380 offset, size, RL_WRITER); 4381 rra->rra_txg = dmu_tx_get_txg(tx); 4382 rra->rra_ashift = ashift; 4383 rra->rra_tbd = reads; 4384 rra->rra_writes = writes; 4385 4386 raidz_reflow_record_progress(vre, offset + size, tx); 4387 4388 /* 4389 * SCL_STATE will be released when the read and write are done, 4390 * by raidz_reflow_write_done(). 4391 */ 4392 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4393 4394 /* check if a replacing vdev was added, if so treat it as an error */ 4395 if (vdev_raidz_expand_child_replacing(vd)) { 4396 zfs_dbgmsg("replacing vdev encountered, reflow paused at " 4397 "offset=%llu txg=%llu", 4398 (long long)rra->rra_lr->lr_offset, 4399 (long long)rra->rra_txg); 4400 4401 mutex_enter(&vre->vre_lock); 4402 vre->vre_failed_offset = 4403 MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); 4404 cv_signal(&vre->vre_cv); 4405 mutex_exit(&vre->vre_lock); 4406 4407 /* drop everything we acquired */ 4408 spa_config_exit(spa, SCL_STATE, spa); 4409 zfs_rangelock_exit(rra->rra_lr); 4410 kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * writes); 4411 return (B_TRUE); 4412 } 4413 4414 mutex_enter(&vre->vre_lock); 4415 vre->vre_outstanding_bytes += size; 4416 mutex_exit(&vre->vre_lock); 4417 4418 /* Allocate ABD and ZIO for each child we write. */ 4419 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 4420 zio_t *pio = spa->spa_txg_zio[txgoff]; 4421 uint_t b = blocks / vd->vdev_children; 4422 uint_t bb = blocks % vd->vdev_children; 4423 for (uint_t i = 0; i < writes; i++) { 4424 uint_t n = b + (i < bb); 4425 abd_t *abd = abd_alloc_for_io(n << ashift, B_FALSE); 4426 rra->rra_zio[i] = zio_vdev_child_io(pio, NULL, 4427 vd->vdev_child[(blkid + i) % vd->vdev_children], 4428 ((blkid + i) / vd->vdev_children) << ashift, 4429 abd, n << ashift, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, 4430 ZIO_FLAG_CANFAIL, raidz_reflow_write_done, rra); 4431 } 4432 4433 /* 4434 * Allocate and issue ZIO for each child we read. For reads of only 4435 * one block we can use respective writer ABDs, since they will also 4436 * have only one block. For bigger reads create gang ABDs and fill 4437 * them with respective blocks from writer ABDs. 
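*
* Hypothetical example: blocks == 10, old_children == 4,
* vdev_children == 5. Each of the 5 writers gets 2 sectors; readers 0
* and 1 get 3 sectors, readers 2 and 3 get 2. Reader i's j-th sector is
* the batch's sector b = j * old_children + i, which lives in writer
* (b % 5)'s ABD at block (b / 5); e.g. reader 1's third sector (j == 2)
* is b == 9, taken from writer 4 at block 1.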
4438 */ 4439 b = blocks / old_children; 4440 bb = blocks % old_children; 4441 for (uint_t i = 0; i < reads; i++) { 4442 uint_t n = b + (i < bb); 4443 abd_t *abd; 4444 if (n > 1) { 4445 abd = abd_alloc_gang(); 4446 for (uint_t j = 0; j < n; j++) { 4447 uint_t b = j * old_children + i; 4448 abd_t *cabd = abd_get_offset_size( 4449 rra->rra_zio[b % vd->vdev_children]->io_abd, 4450 (b / vd->vdev_children) << ashift, 4451 1 << ashift); 4452 abd_gang_add(abd, cabd, B_TRUE); 4453 } 4454 } else { 4455 abd = rra->rra_zio[i]->io_abd; 4456 } 4457 zio_nowait(zio_vdev_child_io(pio, NULL, 4458 vd->vdev_child[(blkid + i) % old_children], 4459 ((blkid + i) / old_children) << ashift, abd, 4460 n << ashift, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 4461 ZIO_FLAG_CANFAIL, raidz_reflow_read_done, rra)); 4462 } 4463 4464 return (B_FALSE); 4465 } 4466 4467 /* 4468 * For testing (ztest specific) 4469 */ 4470 static void 4471 raidz_expand_pause(uint_t pause_point) 4472 { 4473 while (raidz_expand_pause_point != 0 && 4474 raidz_expand_pause_point <= pause_point) 4475 delay(hz); 4476 } 4477 4478 static void 4479 raidz_scratch_child_done(zio_t *zio) 4480 { 4481 zio_t *pio = zio->io_private; 4482 4483 mutex_enter(&pio->io_lock); 4484 pio->io_error = zio_worst_error(pio->io_error, zio->io_error); 4485 mutex_exit(&pio->io_lock); 4486 } 4487 4488 /* 4489 * Reflow the beginning portion of the vdev into an intermediate scratch area 4490 * in memory and on disk. This operation must be persisted on disk before we 4491 * proceed to overwrite the beginning portion with the reflowed data. 4492 * 4493 * This multi-step task can fail to complete if disk errors are encountered 4494 * and we can return here after a pause (waiting for disk to become healthy). 4495 */ 4496 static void 4497 raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) 4498 { 4499 vdev_raidz_expand_t *vre = arg; 4500 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 4501 zio_t *pio; 4502 int error; 4503 4504 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4505 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4506 int ashift = raidvd->vdev_ashift; 4507 uint64_t write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << ashift, 4508 uint64_t); 4509 uint64_t logical_size = write_size * raidvd->vdev_children; 4510 uint64_t read_size = 4511 P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)), 4512 1 << ashift); 4513 4514 /* 4515 * The scratch space must be large enough to get us to the point 4516 * that one row does not overlap itself when moved. This is checked 4517 * by vdev_raidz_attach_check(). 4518 */ 4519 VERIFY3U(write_size, >=, raidvd->vdev_children << ashift); 4520 VERIFY3U(write_size, <=, VDEV_BOOT_SIZE); 4521 VERIFY3U(write_size, <=, read_size); 4522 4523 zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, 4524 0, logical_size, RL_WRITER); 4525 4526 abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), 4527 KM_SLEEP); 4528 for (int i = 0; i < raidvd->vdev_children; i++) { 4529 abds[i] = abd_alloc_linear(read_size, B_FALSE); 4530 } 4531 4532 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1); 4533 4534 /* 4535 * If we have already written the scratch area then we must read from 4536 * there, since new writes were redirected there while we were paused 4537 * or the original location may have been partially overwritten with 4538 * reflowed data. 4539 */ 4540 if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) { 4541 VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size); 4542 /* 4543 * Read from scratch space. 
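*
* (Hypothetical sizing sketch, assuming ashift == 12 and 5 children:
* write_size is VDEV_BOOT_SIZE rounded down to a 4 KiB multiple,
* logical_size == 5 * write_size, and each child below returns its
* write_size bytes of already-reflowed data from the scratch area.)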
4544 */ 4545 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4546 for (int i = 0; i < raidvd->vdev_children; i++) { 4547 /* 4548 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE 4549 * to the offset to calculate the physical offset to 4550 * write to. Passing in a negative offset makes us 4551 * access the scratch area. 4552 */ 4553 zio_nowait(zio_vdev_child_io(pio, NULL, 4554 raidvd->vdev_child[i], 4555 VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], 4556 write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 4557 ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); 4558 } 4559 error = zio_wait(pio); 4560 if (error != 0) { 4561 zfs_dbgmsg("reflow: error %d reading scratch location", 4562 error); 4563 goto io_error_exit; 4564 } 4565 goto overwrite; 4566 } 4567 4568 /* 4569 * Read from original location. 4570 */ 4571 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4572 for (int i = 0; i < raidvd->vdev_children - 1; i++) { 4573 ASSERT0(vdev_is_dead(raidvd->vdev_child[i])); 4574 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4575 0, abds[i], read_size, ZIO_TYPE_READ, 4576 ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, 4577 raidz_scratch_child_done, pio)); 4578 } 4579 error = zio_wait(pio); 4580 if (error != 0) { 4581 zfs_dbgmsg("reflow: error %d reading original location", error); 4582 io_error_exit: 4583 for (int i = 0; i < raidvd->vdev_children; i++) 4584 abd_free(abds[i]); 4585 kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); 4586 zfs_rangelock_exit(lr); 4587 spa_config_exit(spa, SCL_STATE, FTAG); 4588 return; 4589 } 4590 4591 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2); 4592 4593 /* 4594 * Reflow in memory. 4595 */ 4596 uint64_t logical_sectors = logical_size >> ashift; 4597 for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) { 4598 int oldchild = i % (raidvd->vdev_children - 1); 4599 uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift; 4600 4601 int newchild = i % raidvd->vdev_children; 4602 uint64_t newoff = (i / raidvd->vdev_children) << ashift; 4603 4604 /* a single sector should not be copying over itself */ 4605 ASSERT(!(newchild == oldchild && newoff == oldoff)); 4606 4607 abd_copy_off(abds[newchild], abds[oldchild], 4608 newoff, oldoff, 1 << ashift); 4609 } 4610 4611 /* 4612 * Verify that we filled in everything we intended to (write_size on 4613 * each child). 4614 */ 4615 VERIFY0(logical_sectors % raidvd->vdev_children); 4616 VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==, 4617 write_size); 4618 4619 /* 4620 * Write to scratch location (boot area). 4621 */ 4622 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4623 for (int i = 0; i < raidvd->vdev_children; i++) { 4624 /* 4625 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to 4626 * the offset to calculate the physical offset to write to. 4627 * Passing in a negative offset lets us access the boot area. 
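*
* Sketch of the offset arithmetic (assuming the usual label layout):
*
*	VDEV_LABEL_START_SIZE + (VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE)
*	    == VDEV_BOOT_OFFSET
*
* i.e. the child I/O lands exactly at the start of the boot region that
* sits between the two front labels and the allocatable space.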
4628 */ 4629 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4630 VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], 4631 write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, 4632 ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); 4633 } 4634 error = zio_wait(pio); 4635 if (error != 0) { 4636 zfs_dbgmsg("reflow: error %d writing scratch location", error); 4637 goto io_error_exit; 4638 } 4639 pio = zio_root(spa, NULL, NULL, 0); 4640 zio_flush(pio, raidvd); 4641 zio_wait(pio); 4642 4643 zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area", 4644 (long long)logical_size); 4645 4646 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3); 4647 4648 /* 4649 * Update uberblock to indicate that scratch space is valid. This is 4650 * needed because after this point, the real location may be 4651 * overwritten. If we crash, we need to get the data from the 4652 * scratch space, rather than the real location. 4653 * 4654 * Note: ub_timestamp is bumped so that vdev_uberblock_compare() 4655 * will prefer this uberblock. 4656 */ 4657 RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size); 4658 spa->spa_ubsync.ub_timestamp++; 4659 ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, 4660 &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); 4661 if (spa_multihost(spa)) 4662 mmp_update_uberblock(spa, &spa->spa_ubsync); 4663 4664 zfs_dbgmsg("reflow: uberblock updated " 4665 "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)", 4666 (long long)spa->spa_ubsync.ub_txg, 4667 (long long)logical_size, 4668 (long long)spa->spa_ubsync.ub_timestamp); 4669 4670 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID); 4671 4672 /* 4673 * Overwrite with reflow'ed data. 4674 */ 4675 overwrite: 4676 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4677 for (int i = 0; i < raidvd->vdev_children; i++) { 4678 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4679 0, abds[i], write_size, ZIO_TYPE_WRITE, 4680 ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, 4681 raidz_scratch_child_done, pio)); 4682 } 4683 error = zio_wait(pio); 4684 if (error != 0) { 4685 /* 4686 * When we exit early here and drop the range lock, new 4687 * writes will go into the scratch area so we'll need to 4688 * read from there when we return after pausing. 4689 */ 4690 zfs_dbgmsg("reflow: error %d writing real location", error); 4691 /* 4692 * Update the uberblock that is written when this txg completes. 4693 */ 4694 RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID, 4695 logical_size); 4696 goto io_error_exit; 4697 } 4698 pio = zio_root(spa, NULL, NULL, 0); 4699 zio_flush(pio, raidvd); 4700 zio_wait(pio); 4701 4702 zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location", 4703 (long long)logical_size); 4704 for (int i = 0; i < raidvd->vdev_children; i++) 4705 abd_free(abds[i]); 4706 kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); 4707 4708 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED); 4709 4710 /* 4711 * Update uberblock to indicate that the initial part has been 4712 * reflow'ed. This is needed because after this point (when we exit 4713 * the rangelock), we allow regular writes to this region, which will 4714 * be written to the new location only (because reflow_offset_next == 4715 * reflow_offset_synced). If we crashed and re-copied from the 4716 * scratch space, we would lose the regular writes. 
4717 */ 4718 RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED, 4719 logical_size); 4720 spa->spa_ubsync.ub_timestamp++; 4721 ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, 4722 &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); 4723 if (spa_multihost(spa)) 4724 mmp_update_uberblock(spa, &spa->spa_ubsync); 4725 4726 zfs_dbgmsg("reflow: uberblock updated " 4727 "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", 4728 (long long)spa->spa_ubsync.ub_txg, 4729 (long long)logical_size, 4730 (long long)spa->spa_ubsync.ub_timestamp); 4731 4732 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1); 4733 4734 /* 4735 * Update progress. 4736 */ 4737 vre->vre_offset = logical_size; 4738 zfs_rangelock_exit(lr); 4739 spa_config_exit(spa, SCL_STATE, FTAG); 4740 4741 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 4742 vre->vre_offset_pertxg[txgoff] = vre->vre_offset; 4743 vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; 4744 /* 4745 * Note - raidz_reflow_sync() will update the uberblock state to 4746 * RRSS_SCRATCH_INVALID_SYNCED_REFLOW 4747 */ 4748 raidz_reflow_sync(spa, tx); 4749 4750 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2); 4751 } 4752 4753 /* 4754 * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work 4755 * here. No other i/o can be in progress, so we don't need the vre_rangelock. 4756 */ 4757 void 4758 vdev_raidz_reflow_copy_scratch(spa_t *spa) 4759 { 4760 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4761 uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock); 4762 ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID); 4763 4764 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4765 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4766 ASSERT0(logical_size % raidvd->vdev_children); 4767 uint64_t write_size = logical_size / raidvd->vdev_children; 4768 4769 zio_t *pio; 4770 4771 /* 4772 * Read from scratch space. 4773 */ 4774 abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), 4775 KM_SLEEP); 4776 for (int i = 0; i < raidvd->vdev_children; i++) { 4777 abds[i] = abd_alloc_linear(write_size, B_FALSE); 4778 } 4779 4780 pio = zio_root(spa, NULL, NULL, 0); 4781 for (int i = 0; i < raidvd->vdev_children; i++) { 4782 /* 4783 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to 4784 * the offset to calculate the physical offset to write to. 4785 * Passing in a negative offset lets us access the boot area. 4786 */ 4787 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4788 VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], 4789 write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 0, 4790 raidz_scratch_child_done, pio)); 4791 } 4792 zio_wait(pio); 4793 4794 /* 4795 * Overwrite real location with reflow'ed data. 4796 */ 4797 pio = zio_root(spa, NULL, NULL, 0); 4798 for (int i = 0; i < raidvd->vdev_children; i++) { 4799 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4800 0, abds[i], write_size, ZIO_TYPE_WRITE, 4801 ZIO_PRIORITY_REMOVAL, 0, 4802 raidz_scratch_child_done, pio)); 4803 } 4804 zio_wait(pio); 4805 pio = zio_root(spa, NULL, NULL, 0); 4806 zio_flush(pio, raidvd); 4807 zio_wait(pio); 4808 4809 zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) " 4810 "to real location", (long long)logical_size); 4811 4812 for (int i = 0; i < raidvd->vdev_children; i++) 4813 abd_free(abds[i]); 4814 kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); 4815 4816 /* 4817 * Update uberblock. 
4818 */ 4819 RAIDZ_REFLOW_SET(&spa->spa_ubsync, 4820 RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size); 4821 spa->spa_ubsync.ub_timestamp++; 4822 VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, 4823 &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); 4824 if (spa_multihost(spa)) 4825 mmp_update_uberblock(spa, &spa->spa_ubsync); 4826 4827 zfs_dbgmsg("reflow recovery: uberblock updated " 4828 "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", 4829 (long long)spa->spa_ubsync.ub_txg, 4830 (long long)logical_size, 4831 (long long)spa->spa_ubsync.ub_timestamp); 4832 4833 dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, 4834 spa_first_txg(spa)); 4835 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 4836 vre->vre_offset = logical_size; 4837 vre->vre_offset_pertxg[txgoff] = vre->vre_offset; 4838 vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; 4839 /* 4840 * Note that raidz_reflow_sync() will update the uberblock once more 4841 */ 4842 raidz_reflow_sync(spa, tx); 4843 4844 dmu_tx_commit(tx); 4845 4846 spa_config_exit(spa, SCL_STATE, FTAG); 4847 } 4848 4849 static boolean_t 4850 spa_raidz_expand_thread_check(void *arg, zthr_t *zthr) 4851 { 4852 (void) zthr; 4853 spa_t *spa = arg; 4854 4855 return (spa->spa_raidz_expand != NULL && 4856 !spa->spa_raidz_expand->vre_waiting_for_resilver); 4857 } 4858 4859 /* 4860 * RAIDZ expansion background thread 4861 * 4862 * Can be called multiple times if the reflow is paused 4863 */ 4864 static void 4865 spa_raidz_expand_thread(void *arg, zthr_t *zthr) 4866 { 4867 spa_t *spa = arg; 4868 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4869 4870 if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) 4871 vre->vre_offset = 0; 4872 else 4873 vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync); 4874 4875 /* Reflow the begining portion using the scratch area */ 4876 if (vre->vre_offset == 0) { 4877 VERIFY0(dsl_sync_task(spa_name(spa), 4878 NULL, raidz_reflow_scratch_sync, 4879 vre, 0, ZFS_SPACE_CHECK_NONE)); 4880 4881 /* if we encountered errors then pause */ 4882 if (vre->vre_offset == 0) { 4883 mutex_enter(&vre->vre_lock); 4884 vre->vre_waiting_for_resilver = B_TRUE; 4885 mutex_exit(&vre->vre_lock); 4886 return; 4887 } 4888 } 4889 4890 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4891 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4892 4893 uint64_t guid = raidvd->vdev_guid; 4894 4895 /* Iterate over all the remaining metaslabs */ 4896 for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift; 4897 i < raidvd->vdev_ms_count && 4898 !zthr_iscancelled(zthr) && 4899 vre->vre_failed_offset == UINT64_MAX; i++) { 4900 metaslab_t *msp = raidvd->vdev_ms[i]; 4901 4902 metaslab_disable(msp); 4903 mutex_enter(&msp->ms_lock); 4904 4905 /* 4906 * The metaslab may be newly created (for the expanded 4907 * space), in which case its trees won't exist yet, 4908 * so we need to bail out early. 4909 */ 4910 if (msp->ms_new) { 4911 mutex_exit(&msp->ms_lock); 4912 metaslab_enable(msp, B_FALSE, B_FALSE); 4913 continue; 4914 } 4915 4916 VERIFY0(metaslab_load(msp)); 4917 4918 /* 4919 * We want to copy everything except the free (allocatable) 4920 * space. Note that there may be a little bit more free 4921 * space (e.g. in ms_defer), and it's fine to copy that too. 
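*
* In set terms the code below builds, roughly,
*
*	rt = [ms_start, ms_start + ms_size) \ ms_allocatable
*
* by adding the whole metaslab range and then walking ms_allocatable
* with zfs_range_tree_remove().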
4922 */ 4923 uint64_t shift, start; 4924 zfs_range_seg_type_t type = metaslab_calculate_range_tree_type( 4925 raidvd, msp, &start, &shift); 4926 zfs_range_tree_t *rt = zfs_range_tree_create_flags( 4927 NULL, type, NULL, start, shift, ZFS_RT_F_DYN_NAME, 4928 metaslab_rt_name(msp->ms_group, msp, 4929 "spa_raidz_expand_thread:rt")); 4930 zfs_range_tree_add(rt, msp->ms_start, msp->ms_size); 4931 zfs_range_tree_walk(msp->ms_allocatable, zfs_range_tree_remove, 4932 rt); 4933 mutex_exit(&msp->ms_lock); 4934 4935 /* 4936 * Force the last sector of each metaslab to be copied. This 4937 * ensures that we advance the on-disk progress to the end of 4938 * this metaslab while the metaslab is disabled. Otherwise, we 4939 * could move past this metaslab without advancing the on-disk 4940 * progress, and then an allocation to this metaslab would not 4941 * be copied. 4942 */ 4943 int sectorsz = 1 << raidvd->vdev_ashift; 4944 uint64_t ms_last_offset = msp->ms_start + 4945 msp->ms_size - sectorsz; 4946 if (!zfs_range_tree_contains(rt, ms_last_offset, sectorsz)) { 4947 zfs_range_tree_add(rt, ms_last_offset, sectorsz); 4948 } 4949 4950 /* 4951 * When we are resuming from a paused expansion (i.e. 4952 * when importing a pool with a expansion in progress), 4953 * discard any state that we have already processed. 4954 */ 4955 if (vre->vre_offset > msp->ms_start) { 4956 zfs_range_tree_clear(rt, msp->ms_start, 4957 vre->vre_offset - msp->ms_start); 4958 } 4959 4960 while (!zthr_iscancelled(zthr) && 4961 !zfs_range_tree_is_empty(rt) && 4962 vre->vre_failed_offset == UINT64_MAX) { 4963 4964 /* 4965 * We need to periodically drop the config lock so that 4966 * writers can get in. Additionally, we can't wait 4967 * for a txg to sync while holding a config lock 4968 * (since a waiting writer could cause a 3-way deadlock 4969 * with the sync thread, which also gets a config 4970 * lock for reader). So we can't hold the config lock 4971 * while calling dmu_tx_assign(). 4972 */ 4973 spa_config_exit(spa, SCL_CONFIG, FTAG); 4974 4975 /* 4976 * If requested, pause the reflow when the amount 4977 * specified by raidz_expand_max_reflow_bytes is reached 4978 * 4979 * This pause is only used during testing or debugging. 4980 */ 4981 while (raidz_expand_max_reflow_bytes != 0 && 4982 raidz_expand_max_reflow_bytes <= 4983 vre->vre_bytes_copied && !zthr_iscancelled(zthr)) { 4984 delay(hz); 4985 } 4986 4987 mutex_enter(&vre->vre_lock); 4988 while (vre->vre_outstanding_bytes > 4989 raidz_expand_max_copy_bytes) { 4990 cv_wait(&vre->vre_cv, &vre->vre_lock); 4991 } 4992 mutex_exit(&vre->vre_lock); 4993 4994 dmu_tx_t *tx = 4995 dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 4996 4997 VERIFY0(dmu_tx_assign(tx, 4998 DMU_TX_WAIT | DMU_TX_SUSPEND)); 4999 uint64_t txg = dmu_tx_get_txg(tx); 5000 5001 /* 5002 * Reacquire the vdev_config lock. Theoretically, the 5003 * vdev_t that we're expanding may have changed. 
5004 */ 5005 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5006 raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 5007 5008 boolean_t needsync = 5009 raidz_reflow_impl(raidvd, vre, rt, tx); 5010 5011 dmu_tx_commit(tx); 5012 5013 if (needsync) { 5014 spa_config_exit(spa, SCL_CONFIG, FTAG); 5015 txg_wait_synced(spa->spa_dsl_pool, txg); 5016 spa_config_enter(spa, SCL_CONFIG, FTAG, 5017 RW_READER); 5018 } 5019 } 5020 5021 spa_config_exit(spa, SCL_CONFIG, FTAG); 5022 5023 metaslab_enable(msp, B_FALSE, B_FALSE); 5024 zfs_range_tree_vacate(rt, NULL, NULL); 5025 zfs_range_tree_destroy(rt); 5026 5027 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5028 raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 5029 } 5030 5031 spa_config_exit(spa, SCL_CONFIG, FTAG); 5032 5033 /* 5034 * The txg_wait_synced() here ensures that all reflow zio's have 5035 * completed, and vre_failed_offset has been set if necessary. It 5036 * also ensures that the progress of the last raidz_reflow_sync() is 5037 * written to disk before raidz_reflow_complete_sync() changes the 5038 * in-memory vre_state. vdev_raidz_io_start() uses vre_state to 5039 * determine if a reflow is in progress, in which case we may need to 5040 * write to both old and new locations. Therefore we can only change 5041 * vre_state once this is not necessary, which is once the on-disk 5042 * progress (in spa_ubsync) has been set past any possible writes (to 5043 * the end of the last metaslab). 5044 */ 5045 txg_wait_synced(spa->spa_dsl_pool, 0); 5046 5047 if (!zthr_iscancelled(zthr) && 5048 vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) { 5049 /* 5050 * We are not being canceled or paused, so the reflow must be 5051 * complete. In that case also mark it as completed on disk. 5052 */ 5053 ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX); 5054 VERIFY0(dsl_sync_task(spa_name(spa), NULL, 5055 raidz_reflow_complete_sync, spa, 5056 0, ZFS_SPACE_CHECK_NONE)); 5057 (void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL); 5058 } else { 5059 /* 5060 * Wait for all copy zio's to complete and for all the 5061 * raidz_reflow_sync() synctasks to be run. 5062 */ 5063 spa_history_log_internal(spa, "reflow pause", 5064 NULL, "offset=%llu failed_offset=%lld", 5065 (long long)vre->vre_offset, 5066 (long long)vre->vre_failed_offset); 5067 mutex_enter(&vre->vre_lock); 5068 if (vre->vre_failed_offset != UINT64_MAX) { 5069 /* 5070 * Reset progress so that we will retry everything 5071 * after the point that something failed. 
5072 */ 5073 vre->vre_offset = vre->vre_failed_offset; 5074 vre->vre_failed_offset = UINT64_MAX; 5075 vre->vre_waiting_for_resilver = B_TRUE; 5076 } 5077 mutex_exit(&vre->vre_lock); 5078 } 5079 } 5080 5081 void 5082 spa_start_raidz_expansion_thread(spa_t *spa) 5083 { 5084 ASSERT0P(spa->spa_raidz_expand_zthr); 5085 spa->spa_raidz_expand_zthr = zthr_create("raidz_expand", 5086 spa_raidz_expand_thread_check, spa_raidz_expand_thread, 5087 spa, defclsyspri); 5088 } 5089 5090 void 5091 raidz_dtl_reassessed(vdev_t *vd) 5092 { 5093 spa_t *spa = vd->vdev_spa; 5094 if (spa->spa_raidz_expand != NULL) { 5095 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 5096 /* 5097 * we get called often from vdev_dtl_reassess() so make 5098 * sure it's our vdev and any replacing is complete 5099 */ 5100 if (vd->vdev_top->vdev_id == vre->vre_vdev_id && 5101 !vdev_raidz_expand_child_replacing(vd->vdev_top)) { 5102 mutex_enter(&vre->vre_lock); 5103 if (vre->vre_waiting_for_resilver) { 5104 vdev_dbgmsg(vd, "DTL reassessed, " 5105 "continuing raidz expansion"); 5106 vre->vre_waiting_for_resilver = B_FALSE; 5107 zthr_wakeup(spa->spa_raidz_expand_zthr); 5108 } 5109 mutex_exit(&vre->vre_lock); 5110 } 5111 } 5112 } 5113 5114 int 5115 vdev_raidz_attach_check(vdev_t *new_child) 5116 { 5117 vdev_t *raidvd = new_child->vdev_parent; 5118 uint64_t new_children = raidvd->vdev_children; 5119 5120 /* 5121 * We use the "boot" space as scratch space to handle overwriting the 5122 * initial part of the vdev. If it is too small, then this expansion 5123 * is not allowed. This would be very unusual (e.g. ashift > 13 and 5124 * >200 children). 5125 */ 5126 if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) { 5127 return (EINVAL); 5128 } 5129 return (0); 5130 } 5131 5132 void 5133 vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx) 5134 { 5135 vdev_t *new_child = arg; 5136 spa_t *spa = new_child->vdev_spa; 5137 vdev_t *raidvd = new_child->vdev_parent; 5138 vdev_raidz_t *vdrz = raidvd->vdev_tsd; 5139 ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops); 5140 ASSERT3P(raidvd->vdev_top, ==, raidvd); 5141 ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width); 5142 ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1); 5143 ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==, 5144 new_child); 5145 5146 spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx); 5147 5148 vdrz->vd_physical_width++; 5149 5150 VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info); 5151 vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id; 5152 vdrz->vn_vre.vre_offset = 0; 5153 vdrz->vn_vre.vre_failed_offset = UINT64_MAX; 5154 spa->spa_raidz_expand = &vdrz->vn_vre; 5155 zthr_wakeup(spa->spa_raidz_expand_zthr); 5156 5157 /* 5158 * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get 5159 * written to the config. 
5160 */ 5161 vdev_config_dirty(raidvd); 5162 5163 vdrz->vn_vre.vre_start_time = gethrestime_sec(); 5164 vdrz->vn_vre.vre_end_time = 0; 5165 vdrz->vn_vre.vre_state = DSS_SCANNING; 5166 vdrz->vn_vre.vre_bytes_copied = 0; 5167 5168 uint64_t state = vdrz->vn_vre.vre_state; 5169 VERIFY0(zap_update(spa->spa_meta_objset, 5170 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, 5171 sizeof (state), 1, &state, tx)); 5172 5173 uint64_t start_time = vdrz->vn_vre.vre_start_time; 5174 VERIFY0(zap_update(spa->spa_meta_objset, 5175 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, 5176 sizeof (start_time), 1, &start_time, tx)); 5177 5178 (void) zap_remove(spa->spa_meta_objset, 5179 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx); 5180 (void) zap_remove(spa->spa_meta_objset, 5181 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx); 5182 5183 spa_history_log_internal(spa, "raidz vdev expansion started", tx, 5184 "%s vdev %llu new width %llu", spa_name(spa), 5185 (unsigned long long)raidvd->vdev_id, 5186 (unsigned long long)raidvd->vdev_children); 5187 } 5188 5189 int 5190 vdev_raidz_load(vdev_t *vd) 5191 { 5192 vdev_raidz_t *vdrz = vd->vdev_tsd; 5193 int err; 5194 5195 uint64_t state = DSS_NONE; 5196 uint64_t start_time = 0; 5197 uint64_t end_time = 0; 5198 uint64_t bytes_copied = 0; 5199 5200 if (vd->vdev_top_zap != 0) { 5201 err = zap_lookup(vd->vdev_spa->spa_meta_objset, 5202 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, 5203 sizeof (state), 1, &state); 5204 if (err != 0 && err != ENOENT) 5205 return (err); 5206 5207 err = zap_lookup(vd->vdev_spa->spa_meta_objset, 5208 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, 5209 sizeof (start_time), 1, &start_time); 5210 if (err != 0 && err != ENOENT) 5211 return (err); 5212 5213 err = zap_lookup(vd->vdev_spa->spa_meta_objset, 5214 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, 5215 sizeof (end_time), 1, &end_time); 5216 if (err != 0 && err != ENOENT) 5217 return (err); 5218 5219 err = zap_lookup(vd->vdev_spa->spa_meta_objset, 5220 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, 5221 sizeof (bytes_copied), 1, &bytes_copied); 5222 if (err != 0 && err != ENOENT) 5223 return (err); 5224 } 5225 5226 /* 5227 * If we are in the middle of expansion, vre_state should have 5228 * already been set by vdev_raidz_init(). 
5229 */ 5230 EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING); 5231 vdrz->vn_vre.vre_state = (dsl_scan_state_t)state; 5232 vdrz->vn_vre.vre_start_time = start_time; 5233 vdrz->vn_vre.vre_end_time = end_time; 5234 vdrz->vn_vre.vre_bytes_copied = bytes_copied; 5235 5236 return (0); 5237 } 5238 5239 int 5240 spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres) 5241 { 5242 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 5243 5244 if (vre == NULL) { 5245 /* no removal in progress; find most recent completed */ 5246 for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { 5247 vdev_t *vd = spa->spa_root_vdev->vdev_child[c]; 5248 if (vd->vdev_ops == &vdev_raidz_ops) { 5249 vdev_raidz_t *vdrz = vd->vdev_tsd; 5250 5251 if (vdrz->vn_vre.vre_end_time != 0 && 5252 (vre == NULL || 5253 vdrz->vn_vre.vre_end_time > 5254 vre->vre_end_time)) { 5255 vre = &vdrz->vn_vre; 5256 } 5257 } 5258 } 5259 } 5260 5261 if (vre == NULL) { 5262 return (SET_ERROR(ENOENT)); 5263 } 5264 5265 pres->pres_state = vre->vre_state; 5266 pres->pres_expanding_vdev = vre->vre_vdev_id; 5267 5268 vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); 5269 pres->pres_to_reflow = vd->vdev_stat.vs_alloc; 5270 5271 mutex_enter(&vre->vre_lock); 5272 pres->pres_reflowed = vre->vre_bytes_copied; 5273 for (int i = 0; i < TXG_SIZE; i++) 5274 pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i]; 5275 mutex_exit(&vre->vre_lock); 5276 5277 pres->pres_start_time = vre->vre_start_time; 5278 pres->pres_end_time = vre->vre_end_time; 5279 pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver; 5280 5281 return (0); 5282 } 5283 5284 /* 5285 * Initialize private RAIDZ specific fields from the nvlist. 5286 */ 5287 static int 5288 vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) 5289 { 5290 uint_t children; 5291 nvlist_t **child; 5292 int error = nvlist_lookup_nvlist_array(nv, 5293 ZPOOL_CONFIG_CHILDREN, &child, &children); 5294 if (error != 0) 5295 return (SET_ERROR(EINVAL)); 5296 5297 uint64_t nparity; 5298 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { 5299 if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) 5300 return (SET_ERROR(EINVAL)); 5301 5302 /* 5303 * Previous versions could only support 1 or 2 parity 5304 * device. 5305 */ 5306 if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2) 5307 return (SET_ERROR(EINVAL)); 5308 else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3) 5309 return (SET_ERROR(EINVAL)); 5310 } else { 5311 /* 5312 * We require the parity to be specified for SPAs that 5313 * support multiple parity levels. 5314 */ 5315 if (spa_version(spa) >= SPA_VERSION_RAIDZ2) 5316 return (SET_ERROR(EINVAL)); 5317 5318 /* 5319 * Otherwise, we default to 1 parity device for RAID-Z. 

/*
 * Initialize private RAIDZ specific fields from the nvlist.
 */
static int
vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
{
	uint_t children;
	nvlist_t **child;
	int error = nvlist_lookup_nvlist_array(nv,
	    ZPOOL_CONFIG_CHILDREN, &child, &children);
	if (error != 0)
		return (SET_ERROR(EINVAL));

	uint64_t nparity;
	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) {
		if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
			return (SET_ERROR(EINVAL));

		/*
		 * Previous versions could only support 1 or 2 parity
		 * devices.
		 */
		if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2)
			return (SET_ERROR(EINVAL));
		else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3)
			return (SET_ERROR(EINVAL));
	} else {
		/*
		 * We require the parity to be specified for SPAs that
		 * support multiple parity levels.
		 */
		if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
			return (SET_ERROR(EINVAL));

		/*
		 * Otherwise, we default to 1 parity device for RAID-Z.
		 */
		nparity = 1;
	}

	vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
	vdrz->vn_vre.vre_vdev_id = -1;
	vdrz->vn_vre.vre_offset = UINT64_MAX;
	vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
	mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL);
	zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL);
	mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare,
	    sizeof (reflow_node_t), offsetof(reflow_node_t, re_link));

	vdrz->vd_physical_width = children;
	vdrz->vd_nparity = nparity;

	/* note, the ID does not exist when creating a pool */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
	    &vdrz->vn_vre.vre_vdev_id);

	boolean_t reflow_in_progress =
	    nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
	if (reflow_in_progress) {
		spa->spa_raidz_expand = &vdrz->vn_vre;
		vdrz->vn_vre.vre_state = DSS_SCANNING;
	}

	vdrz->vd_original_width = children;
	uint64_t *txgs;
	unsigned int txgs_size = 0;
	error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
	    &txgs, &txgs_size);
	if (error == 0) {
		for (int i = 0; i < txgs_size; i++) {
			reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
			re->re_txg = txgs[txgs_size - i - 1];
			re->re_logical_width = vdrz->vd_physical_width - i;

			if (reflow_in_progress)
				re->re_logical_width--;

			avl_add(&vdrz->vd_expand_txgs, re);
		}

		vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size;
	}
	if (reflow_in_progress) {
		vdrz->vd_original_width--;
		zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions",
		    children, txgs_size);
	}

	*tsd = vdrz;

	return (0);
}

static void
vdev_raidz_fini(vdev_t *vd)
{
	vdev_raidz_t *vdrz = vd->vdev_tsd;
	if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre)
		vd->vdev_spa->spa_raidz_expand = NULL;
	reflow_node_t *re;
	void *cookie = NULL;
	avl_tree_t *tree = &vdrz->vd_expand_txgs;
	while ((re = avl_destroy_nodes(tree, &cookie)) != NULL)
		kmem_free(re, sizeof (*re));
	avl_destroy(&vdrz->vd_expand_txgs);
	mutex_destroy(&vdrz->vd_expand_lock);
	mutex_destroy(&vdrz->vn_vre.vre_lock);
	cv_destroy(&vdrz->vn_vre.vre_cv);
	zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock);
	kmem_free(vdrz, sizeof (*vdrz));
}
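
/*
 * Worked example of the expansion history encoding (the widths and txgs
 * here are illustrative, not taken from any real pool): a raidz1 vdev
 * created 4 wide and later expanded twice, with the expansions completing
 * in txgs 1000 and 2000, has vdev_raidz_config_generate() below store
 * ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS = { 1000, 2000 } (oldest completion
 * first).  On import, vdev_raidz_init() above walks that array
 * newest-first and rebuilds vd_expand_txgs as { (2000, width 6),
 * (1000, width 5) }, and derives vd_original_width = 6 - 2 = 4.  If a
 * reflow is still running (ZPOOL_CONFIG_RAIDZ_EXPANDING is set), each
 * reconstructed width and the original width are reduced by one, since
 * the newly attached disk is not yet part of any completed logical
 * width.  These per-txg widths let later reads interpret a block with
 * the logical width that was in effect at its birth txg.
 */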

/*
 * Add RAIDZ specific fields to the config nvlist.
 */
static void
vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
{
	ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
	vdev_raidz_t *vdrz = vd->vdev_tsd;

	/*
	 * Make sure someone hasn't managed to sneak a fancy new vdev
	 * into a crufty old storage pool.
	 */
	ASSERT(vdrz->vd_nparity == 1 ||
	    (vdrz->vd_nparity <= 2 &&
	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) ||
	    (vdrz->vd_nparity <= 3 &&
	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3));

	/*
	 * Note that we'll add these even on storage pools where they
	 * aren't strictly required -- older software will just ignore
	 * them.
	 */
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);

	if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
		fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
	}

	mutex_enter(&vdrz->vd_expand_lock);
	if (!avl_is_empty(&vdrz->vd_expand_txgs)) {
		uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs);
		uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count,
		    KM_SLEEP);
		uint64_t i = 0;

		for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs);
		    re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) {
			txgs[i++] = re->re_txg;
		}

		fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
		    txgs, count);

		kmem_free(txgs, sizeof (uint64_t) * count);
	}
	mutex_exit(&vdrz->vd_expand_lock);
}

static uint64_t
vdev_raidz_nparity(vdev_t *vd)
{
	vdev_raidz_t *vdrz = vd->vdev_tsd;
	return (vdrz->vd_nparity);
}

static uint64_t
vdev_raidz_ndisks(vdev_t *vd)
{
	return (vd->vdev_children);
}

vdev_ops_t vdev_raidz_ops = {
	.vdev_op_init = vdev_raidz_init,
	.vdev_op_fini = vdev_raidz_fini,
	.vdev_op_open = vdev_raidz_open,
	.vdev_op_close = vdev_raidz_close,
	.vdev_op_psize_to_asize = vdev_raidz_psize_to_asize,
	.vdev_op_asize_to_psize = vdev_raidz_asize_to_psize,
	.vdev_op_min_asize = vdev_raidz_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_raidz_io_start,
	.vdev_op_io_done = vdev_raidz_io_done,
	.vdev_op_state_change = vdev_raidz_state_change,
	.vdev_op_need_resilver = vdev_raidz_need_resilver,
	.vdev_op_hold = NULL,
	.vdev_op_rele = NULL,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_raidz_xlate,
	.vdev_op_rebuild_asize = NULL,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = vdev_raidz_config_generate,
	.vdev_op_nparity = vdev_raidz_nparity,
	.vdev_op_ndisks = vdev_raidz_ndisks,
	.vdev_op_type = VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
};

ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW,
	"For testing, pause RAIDZ expansion after reflowing this many bytes");
ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW,
	"Max amount of concurrent i/o for RAIDZ expansion");
ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW,
	"For expanded RAIDZ, aggregate reads that have more rows than this");
ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW,
	"For expanded RAIDZ, automatically start a pool scrub when expansion "
	"completes");
ZFS_MODULE_PARAM(zfs_vdev, vdev_, read_sit_out_secs, ULONG, ZMOD_RW,
	"Raidz/draid slow disk sit out time period in seconds");
ZFS_MODULE_PARAM(zfs_vdev, vdev_, raidz_outlier_check_interval_ms, U64,
	ZMOD_RW, "Interval to check for slow raidz/draid children");
ZFS_MODULE_PARAM(zfs_vdev, vdev_, raidz_outlier_insensitivity, UINT,
	ZMOD_RW, "How insensitive the slow raidz/draid child check should be");
/* END CSTYLED */