1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 23 /* 24 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 25 * Copyright (c) 2012, 2020 by Delphix. All rights reserved. 26 * Copyright (c) 2016 Gvozden Nešković. All rights reserved. 27 * Copyright (c) 2025, Klara, Inc. 28 */ 29 30 #include <sys/zfs_context.h> 31 #include <sys/spa.h> 32 #include <sys/spa_impl.h> 33 #include <sys/zap.h> 34 #include <sys/vdev_impl.h> 35 #include <sys/metaslab_impl.h> 36 #include <sys/zio.h> 37 #include <sys/zio_checksum.h> 38 #include <sys/dmu_tx.h> 39 #include <sys/abd.h> 40 #include <sys/zfs_rlock.h> 41 #include <sys/fs/zfs.h> 42 #include <sys/fm/fs/zfs.h> 43 #include <sys/vdev_raidz.h> 44 #include <sys/vdev_raidz_impl.h> 45 #include <sys/vdev_draid.h> 46 #include <sys/uberblock_impl.h> 47 #include <sys/dsl_scan.h> 48 49 #ifdef ZFS_DEBUG 50 #include <sys/vdev.h> /* For vdev_xlate() in vdev_raidz_io_verify() */ 51 #endif 52 53 /* 54 * Virtual device vector for RAID-Z. 55 * 56 * This vdev supports single, double, and triple parity. For single parity, 57 * we use a simple XOR of all the data columns. For double or triple parity, 58 * we use a special case of Reed-Solomon coding. This extends the 59 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by 60 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for 61 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the 62 * former is also based. The latter is designed to provide higher performance 63 * for writes. 64 * 65 * Note that the Plank paper claimed to support arbitrary N+M, but was then 66 * amended six years later identifying a critical flaw that invalidates its 67 * claims. Nevertheless, the technique can be adapted to work for up to 68 * triple parity. For additional parity, the amendment "Note: Correction to 69 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding 70 * is viable, but the additional complexity means that write performance will 71 * suffer. 72 * 73 * All of the methods above operate on a Galois field, defined over the 74 * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements 75 * can be expressed with a single byte. 
Briefly, the operations on the 76 * field are defined as follows: 77 * 78 * o addition (+) is represented by a bitwise XOR 79 * o subtraction (-) is therefore identical to addition: A + B = A - B 80 * o multiplication of A by 2 is defined by the following bitwise expression: 81 * 82 * (A * 2)_7 = A_6 83 * (A * 2)_6 = A_5 84 * (A * 2)_5 = A_4 85 * (A * 2)_4 = A_3 + A_7 86 * (A * 2)_3 = A_2 + A_7 87 * (A * 2)_2 = A_1 + A_7 88 * (A * 2)_1 = A_0 89 * (A * 2)_0 = A_7 90 * 91 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). 92 * As an aside, this multiplication is derived from the error correcting 93 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. 94 * 95 * Observe that any number in the field (except for 0) can be expressed as a 96 * power of 2 -- a generator for the field. We store a table of the powers of 97 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can 98 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather 99 * than field addition). The inverse of a field element A (A^-1) is therefore 100 * A ^ (255 - 1) = A^254. 101 * 102 * The up-to-three parity columns, P, Q, R over several data columns, 103 * D_0, ... D_n-1, can be expressed by field operations: 104 * 105 * P = D_0 + D_1 + ... + D_n-2 + D_n-1 106 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 107 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 108 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 109 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 110 * 111 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial 112 * XOR operation, and 2 and 4 can be computed quickly and generate linearly- 113 * independent coefficients. (There are no additional coefficients that have 114 * this property which is why the uncorrected Plank method breaks down.) 115 * 116 * See the reconstruction code below for how P, Q and R can used individually 117 * or in concert to recover missing data columns. 118 */ 119 120 #define VDEV_RAIDZ_P 0 121 #define VDEV_RAIDZ_Q 1 122 #define VDEV_RAIDZ_R 2 123 124 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) 125 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) 126 127 /* 128 * We provide a mechanism to perform the field multiplication operation on a 129 * 64-bit value all at once rather than a byte at a time. This works by 130 * creating a mask from the top bit in each byte and using that to 131 * conditionally apply the XOR of 0x1d. 132 */ 133 #define VDEV_RAIDZ_64MUL_2(x, mask) \ 134 { \ 135 (mask) = (x) & 0x8080808080808080ULL; \ 136 (mask) = ((mask) << 1) - ((mask) >> 7); \ 137 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ 138 ((mask) & 0x1d1d1d1d1d1d1d1dULL); \ 139 } 140 141 #define VDEV_RAIDZ_64MUL_4(x, mask) \ 142 { \ 143 VDEV_RAIDZ_64MUL_2((x), mask); \ 144 VDEV_RAIDZ_64MUL_2((x), mask); \ 145 } 146 147 148 /* 149 * Big Theory Statement for how a RAIDZ VDEV is expanded 150 * 151 * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion 152 * works with all three RAIDZ parity choices, including RAIDZ1, 2, or 3. VDEVs 153 * that have been previously expanded can be expanded again. 154 * 155 * The RAIDZ VDEV must be healthy (must be able to write to all the drives in 156 * the VDEV) when an expansion starts. And the expansion will pause if any 157 * disk in the VDEV fails, and resume once the VDEV is healthy again. 
All other 158 * operations on the pool can continue while an expansion is in progress (e.g. 159 * read/write, snapshot, zpool add, etc). Except zpool checkpoint, zpool trim, 160 * and zpool initialize which can't be run during an expansion. Following a 161 * reboot or export/import, the expansion resumes where it left off. 162 * 163 * == Reflowing the Data == 164 * 165 * The expansion involves reflowing (copying) the data from the current set 166 * of disks to spread it across the new set which now has one more disk. This 167 * reflow operation is similar to reflowing text when the column width of a 168 * text editor window is expanded. The text doesn’t change but the location of 169 * the text changes to accommodate the new width. An example reflow result for 170 * a 4-wide RAIDZ1 to a 5-wide is shown below. 171 * 172 * Reflow End State 173 * Each letter indicates a parity group (logical stripe) 174 * 175 * Before expansion After Expansion 176 * D1 D2 D3 D4 D1 D2 D3 D4 D5 177 * +------+------+------+------+ +------+------+------+------+------+ 178 * | | | | | | | | | | | 179 * | A | A | A | A | | A | A | A | A | B | 180 * | 1| 2| 3| 4| | 1| 2| 3| 4| 5| 181 * +------+------+------+------+ +------+------+------+------+------+ 182 * | | | | | | | | | | | 183 * | B | B | C | C | | B | C | C | C | C | 184 * | 5| 6| 7| 8| | 6| 7| 8| 9| 10| 185 * +------+------+------+------+ +------+------+------+------+------+ 186 * | | | | | | | | | | | 187 * | C | C | D | D | | D | D | E | E | E | 188 * | 9| 10| 11| 12| | 11| 12| 13| 14| 15| 189 * +------+------+------+------+ +------+------+------+------+------+ 190 * | | | | | | | | | | | 191 * | E | E | E | E | --> | E | F | F | G | G | 192 * | 13| 14| 15| 16| | 16| 17| 18|p 19| 20| 193 * +------+------+------+------+ +------+------+------+------+------+ 194 * | | | | | | | | | | | 195 * | F | F | G | G | | G | G | H | H | H | 196 * | 17| 18| 19| 20| | 21| 22| 23| 24| 25| 197 * +------+------+------+------+ +------+------+------+------+------+ 198 * | | | | | | | | | | | 199 * | G | G | H | H | | H | I | I | J | J | 200 * | 21| 22| 23| 24| | 26| 27| 28| 29| 30| 201 * +------+------+------+------+ +------+------+------+------+------+ 202 * | | | | | | | | | | | 203 * | H | H | I | I | | J | J | | | K | 204 * | 25| 26| 27| 28| | 31| 32| 33| 34| 35| 205 * +------+------+------+------+ +------+------+------+------+------+ 206 * 207 * This reflow approach has several advantages. There is no need to read or 208 * modify the block pointers or recompute any block checksums. The reflow 209 * doesn’t need to know where the parity sectors reside. We can read and write 210 * data sequentially and the copy can occur in a background thread in open 211 * context. The design also allows for fast discovery of what data to copy. 212 * 213 * The VDEV metaslabs are processed, one at a time, to copy the block data to 214 * have it flow across all the disks. The metaslab is disabled for allocations 215 * during the copy. As an optimization, we only copy the allocated data which 216 * can be determined by looking at the metaslab range tree. During the copy we 217 * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still 218 * need to be able to survive losing parity count disks). This means we 219 * cannot overwrite data during the reflow that would be needed if a disk is 220 * lost. 
221 * 222 * After the reflow completes, all newly-written blocks will have the new 223 * layout, i.e., they will have the parity to data ratio implied by the new 224 * number of disks in the RAIDZ group. Even though the reflow copies all of 225 * the allocated space (data and parity), it is only rearranged, not changed. 226 * 227 * This act of reflowing the data has a few implications about blocks 228 * that were written before the reflow completes: 229 * 230 * - Old blocks will still use the same amount of space (i.e., they will have 231 * the parity to data ratio implied by the old number of disks in the RAIDZ 232 * group). 233 * - Reading old blocks will be slightly slower than before the reflow, for 234 * two reasons. First, we will have to read from all disks in the RAIDZ 235 * VDEV, rather than being able to skip the children that contain only 236 * parity of this block (because the data of a single block is now spread 237 * out across all the disks). Second, in most cases there will be an extra 238 * bcopy, needed to rearrange the data back to its original layout in memory. 239 * 240 * == Scratch Area == 241 * 242 * As we copy the block data, we can only progress to the point that writes 243 * will not overlap with blocks whose progress has not yet been recorded on 244 * disk. Since partially-copied rows are always read from the old location, 245 * we need to stop one row before the sector-wise overlap, to prevent any 246 * row-wise overlap. For example, in the diagram above, when we reflow sector 247 * B6 it will overwite the original location for B5. 248 * 249 * To get around this, a scratch space is used so that we can start copying 250 * without risking data loss by overlapping the row. As an added benefit, it 251 * improves performance at the beginning of the reflow, but that small perf 252 * boost wouldn't be worth the complexity on its own. 253 * 254 * Ideally we want to copy at least 2 * (new_width)^2 so that we have a 255 * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max 256 * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice 257 * the widths will likely be single digits so we can get a substantial chuck 258 * size using only a few MB of scratch per disk. 259 * 260 * The scratch area is persisted to disk which holds a large amount of reflowed 261 * state. We can always read the partially written stripes when a disk fails or 262 * the copy is interrupted (crash) during the initial copying phase and also 263 * get past a small chunk size restriction. At a minimum, the scratch space 264 * must be large enough to get us to the point that one row does not overlap 265 * itself when moved (i.e new_width^2). But going larger is even better. We 266 * use the 3.5 MiB reserved "boot" space that resides after the ZFS disk labels 267 * as our scratch space to handle overwriting the initial part of the VDEV. 268 * 269 * 0 256K 512K 4M 270 * +------+------+-----------------------+----------------------------- 271 * | VDEV | VDEV | Boot Block (3.5M) | Allocatable space ... 272 * | L0 | L1 | Reserved | (Metaslabs) 273 * +------+------+-----------------------+------------------------------- 274 * Scratch Area 275 * 276 * == Reflow Progress Updates == 277 * After the initial scratch-based reflow, the expansion process works 278 * similarly to device removal. We create a new open context thread which 279 * reflows the data, and periodically kicks off sync tasks to update logical 280 * state. 
In this case, state is the committed progress (offset of next data 281 * to copy). We need to persist the completed offset on disk, so that if we 282 * crash we know which format each VDEV offset is in. 283 * 284 * == Time Dependent Geometry == 285 * 286 * In non-expanded RAIDZ, blocks are read from disk in a column by column 287 * fashion. For a multi-row block, the second sector is in the first column 288 * not in the second column. This allows us to issue full reads for each 289 * column directly into the request buffer. The block data is thus laid out 290 * sequentially in a column-by-column fashion. 291 * 292 * For example, in the before expansion diagram above, one logical block might 293 * be sectors G19-H26. The parity is in G19,H23; and the data is in 294 * G20,H24,G21,H25,G22,H26. 295 * 296 * After a block is reflowed, the sectors that were all in the original column 297 * data can now reside in different columns. When reading from an expanded 298 * VDEV, we need to know the logical stripe width for each block so we can 299 * reconstitute the block’s data after the reads are completed. Likewise, 300 * when we perform the combinatorial reconstruction we need to know the 301 * original width so we can retry combinations from the past layouts. 302 * 303 * Time dependent geometry is what we call having blocks with different layouts 304 * (stripe widths) in the same VDEV. This time-dependent geometry uses the 305 * block’s birth time (+ the time expansion ended) to establish the correct 306 * width for a given block. After an expansion completes, we record the time 307 * for blocks written with a particular width (geometry). 308 * 309 * == On Disk Format Changes == 310 * 311 * New pool feature flag, 'raidz_expansion' whose reference count is the number 312 * of RAIDZ VDEVs that have been expanded. 313 * 314 * The blocks on expanded RAIDZ VDEV can have different logical stripe widths. 315 * 316 * Since the uberblock can point to arbitrary blocks, which might be on the 317 * expanding RAIDZ, and might or might not have been expanded. We need to know 318 * which way a block is laid out before reading it. This info is the next 319 * offset that needs to be reflowed and we persist that in the uberblock, in 320 * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev label. 321 * After the expansion is complete, we then use the raidz_expand_txgs array 322 * (see below) to determine how to read a block and the ub_raidz_reflow_info 323 * field no longer required. 324 * 325 * The uberblock's ub_raidz_reflow_info field also holds the scratch space 326 * state (i.e., active or not) which is also required before reading a block 327 * during the initial phase of reflowing the data. 328 * 329 * The top-level RAIDZ VDEV has two new entries in the nvlist: 330 * 331 * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here 332 * and used after the expansion is complete to 333 * determine how to read a raidz block 334 * 'raidz_expanding' boolean: present during reflow and removed after completion 335 * used during a spa import to resume an unfinished 336 * expansion 337 * 338 * And finally the VDEVs top zap adds the following informational entries: 339 * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE 340 * VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME 341 * VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME 342 * VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED 343 */ 344 345 /* 346 * For testing only: pause the raidz expansion after reflowing this amount. 
347 * (accessed by ZTS and ztest) 348 */ 349 #ifdef _KERNEL 350 static 351 #endif /* _KERNEL */ 352 unsigned long raidz_expand_max_reflow_bytes = 0; 353 354 /* 355 * For testing only: pause the raidz expansion at a certain point. 356 */ 357 uint_t raidz_expand_pause_point = 0; 358 359 /* 360 * This represents the duration for a slow drive read sit out. 361 */ 362 static unsigned long vdev_read_sit_out_secs = 600; 363 364 /* 365 * How often each RAID-Z and dRAID vdev will check for slow disk outliers. 366 * Increasing this interval will reduce the sensitivity of detection (since all 367 * I/Os since the last check are included in the statistics), but will slow the 368 * response to a disk developing a problem. 369 * 370 * Defaults to once per second; setting extremely small values may cause 371 * negative performance effects. 372 */ 373 static hrtime_t vdev_raidz_outlier_check_interval_ms = 1000; 374 375 /* 376 * When performing slow outlier checks for RAID-Z and dRAID vdevs, this value is 377 * used to determine how far out an outlier must be before it counts as an event 378 * worth consdering. 379 * 380 * Smaller values will result in more aggressive sitting out of disks that may 381 * have problems, but may significantly increase the rate of spurious sit-outs. 382 */ 383 static uint32_t vdev_raidz_outlier_insensitivity = 50; 384 385 /* 386 * Maximum amount of copy io's outstanding at once. 387 */ 388 #ifdef _ILP32 389 static unsigned long raidz_expand_max_copy_bytes = SPA_MAXBLOCKSIZE; 390 #else 391 static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE; 392 #endif 393 394 /* 395 * Apply raidz map abds aggregation if the number of rows in the map is equal 396 * or greater than the value below. 397 */ 398 static unsigned long raidz_io_aggregate_rows = 4; 399 400 /* 401 * Automatically start a pool scrub when a RAIDZ expansion completes in 402 * order to verify the checksums of all blocks which have been copied 403 * during the expansion. Automatic scrubbing is enabled by default and 404 * is strongly recommended. 
405 */ 406 static int zfs_scrub_after_expand = 1; 407 408 static void 409 vdev_raidz_row_free(raidz_row_t *rr) 410 { 411 for (int c = 0; c < rr->rr_cols; c++) { 412 raidz_col_t *rc = &rr->rr_col[c]; 413 414 if (rc->rc_size != 0) 415 abd_free(rc->rc_abd); 416 if (rc->rc_orig_data != NULL) 417 abd_free(rc->rc_orig_data); 418 } 419 420 if (rr->rr_abd_empty != NULL) 421 abd_free(rr->rr_abd_empty); 422 423 kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols])); 424 } 425 426 void 427 vdev_raidz_map_free(raidz_map_t *rm) 428 { 429 for (int i = 0; i < rm->rm_nrows; i++) 430 vdev_raidz_row_free(rm->rm_row[i]); 431 432 if (rm->rm_nphys_cols) { 433 for (int i = 0; i < rm->rm_nphys_cols; i++) { 434 if (rm->rm_phys_col[i].rc_abd != NULL) 435 abd_free(rm->rm_phys_col[i].rc_abd); 436 } 437 438 kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) * 439 rm->rm_nphys_cols); 440 } 441 442 ASSERT0P(rm->rm_lr); 443 kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows])); 444 } 445 446 static void 447 vdev_raidz_map_free_vsd(zio_t *zio) 448 { 449 raidz_map_t *rm = zio->io_vsd; 450 451 vdev_raidz_map_free(rm); 452 } 453 454 static int 455 vdev_raidz_reflow_compare(const void *x1, const void *x2) 456 { 457 const reflow_node_t *l = x1; 458 const reflow_node_t *r = x2; 459 460 return (TREE_CMP(l->re_txg, r->re_txg)); 461 } 462 463 const zio_vsd_ops_t vdev_raidz_vsd_ops = { 464 .vsd_free = vdev_raidz_map_free_vsd, 465 }; 466 467 raidz_row_t * 468 vdev_raidz_row_alloc(int cols, zio_t *zio) 469 { 470 raidz_row_t *rr = 471 kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP); 472 473 rr->rr_cols = cols; 474 rr->rr_scols = cols; 475 476 for (int c = 0; c < cols; c++) { 477 raidz_col_t *rc = &rr->rr_col[c]; 478 rc->rc_shadow_devidx = INT_MAX; 479 rc->rc_shadow_offset = UINT64_MAX; 480 /* 481 * We can not allow self healing to take place for Direct I/O 482 * reads. There is nothing that stops the buffer contents from 483 * being manipulated while the I/O is in flight. It is possible 484 * that the checksum could be verified on the buffer and then 485 * the contents of that buffer are manipulated afterwards. This 486 * could lead to bad data being written out during self 487 * healing. 488 */ 489 if (!(zio->io_flags & ZIO_FLAG_DIO_READ)) 490 rc->rc_allow_repair = 1; 491 } 492 return (rr); 493 } 494 495 static void 496 vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift) 497 { 498 int c; 499 int nwrapped = 0; 500 uint64_t off = 0; 501 raidz_row_t *rr = rm->rm_row[0]; 502 503 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); 504 ASSERT3U(rm->rm_nrows, ==, 1); 505 506 /* 507 * Pad any parity columns with additional space to account for skip 508 * sectors. 509 */ 510 if (rm->rm_skipstart < rr->rr_firstdatacol) { 511 ASSERT0(rm->rm_skipstart); 512 nwrapped = rm->rm_nskip; 513 } else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) { 514 nwrapped = 515 (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols; 516 } 517 518 /* 519 * Optional single skip sectors (rc_size == 0) will be handled in 520 * vdev_raidz_io_start_write(). 521 */ 522 int skipped = rr->rr_scols - rr->rr_cols; 523 524 /* Allocate buffers for the parity columns */ 525 for (c = 0; c < rr->rr_firstdatacol; c++) { 526 raidz_col_t *rc = &rr->rr_col[c]; 527 528 /* 529 * Parity columns will pad out a linear ABD to account for 530 * the skip sector. A linear ABD is used here because 531 * parity calculations use the ABD buffer directly to calculate 532 * parity. This avoids doing a memcpy back to the ABD after the 533 * parity has been calculated. 
By issuing the parity column 534 * with the skip sector we can reduce contention on the child 535 * VDEV queue locks (vq_lock). 536 */ 537 if (c < nwrapped) { 538 rc->rc_abd = abd_alloc_linear( 539 rc->rc_size + (1ULL << ashift), B_FALSE); 540 abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift); 541 skipped++; 542 } else { 543 rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); 544 } 545 } 546 547 for (off = 0; c < rr->rr_cols; c++) { 548 raidz_col_t *rc = &rr->rr_col[c]; 549 abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct, 550 zio->io_abd, off, rc->rc_size); 551 552 /* 553 * Generate I/O for skip sectors to improve aggregation 554 * continuity. We will use gang ABD's to reduce contention 555 * on the child VDEV queue locks (vq_lock) by issuing 556 * a single I/O that contains the data and skip sector. 557 * 558 * It is important to make sure that rc_size is not updated 559 * even though we are adding a skip sector to the ABD. When 560 * calculating the parity in vdev_raidz_generate_parity_row() 561 * the rc_size is used to iterate through the ABD's. We can 562 * not have zero'd out skip sectors used for calculating 563 * parity for raidz, because those same sectors are not used 564 * during reconstruction. 565 */ 566 if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) { 567 rc->rc_abd = abd_alloc_gang(); 568 abd_gang_add(rc->rc_abd, abd, B_TRUE); 569 abd_gang_add(rc->rc_abd, 570 abd_get_zeros(1ULL << ashift), B_TRUE); 571 skipped++; 572 } else { 573 rc->rc_abd = abd; 574 } 575 off += rc->rc_size; 576 } 577 578 ASSERT3U(off, ==, zio->io_size); 579 ASSERT3S(skipped, ==, rm->rm_nskip); 580 } 581 582 static void 583 vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm) 584 { 585 int c; 586 raidz_row_t *rr = rm->rm_row[0]; 587 588 ASSERT3U(rm->rm_nrows, ==, 1); 589 590 /* Allocate buffers for the parity columns */ 591 for (c = 0; c < rr->rr_firstdatacol; c++) 592 rr->rr_col[c].rc_abd = 593 abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE); 594 595 for (uint64_t off = 0; c < rr->rr_cols; c++) { 596 raidz_col_t *rc = &rr->rr_col[c]; 597 rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct, 598 zio->io_abd, off, rc->rc_size); 599 off += rc->rc_size; 600 } 601 } 602 603 /* 604 * Divides the IO evenly across all child vdevs; usually, dcols is 605 * the number of children in the target vdev. 606 * 607 * Avoid inlining the function to keep vdev_raidz_io_start(), which 608 * is this functions only caller, as small as possible on the stack. 609 */ 610 noinline raidz_map_t * 611 vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, 612 uint64_t nparity) 613 { 614 raidz_row_t *rr; 615 /* The starting RAIDZ (parent) vdev sector of the block. */ 616 uint64_t b = zio->io_offset >> ashift; 617 /* The zio's size in units of the vdev's minimum sector size. */ 618 uint64_t s = zio->io_size >> ashift; 619 /* The first column for this stripe. */ 620 uint64_t f = b % dcols; 621 /* The starting byte offset on each child vdev. */ 622 uint64_t o = (b / dcols) << ashift; 623 uint64_t acols, scols; 624 625 raidz_map_t *rm = 626 kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP); 627 rm->rm_nrows = 1; 628 629 /* 630 * "Quotient": The number of data sectors for this stripe on all but 631 * the "big column" child vdevs that also contain "remainder" data. 632 */ 633 uint64_t q = s / (dcols - nparity); 634 635 /* 636 * "Remainder": The number of partial stripe data sectors in this I/O. 637 * This will add a sector to some, but not all, child vdevs. 
638 */ 639 uint64_t r = s - q * (dcols - nparity); 640 641 /* The number of "big columns" - those which contain remainder data. */ 642 uint64_t bc = (r == 0 ? 0 : r + nparity); 643 644 /* 645 * The total number of data and parity sectors associated with 646 * this I/O. 647 */ 648 uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1)); 649 650 /* 651 * acols: The columns that will be accessed. 652 * scols: The columns that will be accessed or skipped. 653 */ 654 if (q == 0) { 655 /* Our I/O request doesn't span all child vdevs. */ 656 acols = bc; 657 scols = MIN(dcols, roundup(bc, nparity + 1)); 658 } else { 659 acols = dcols; 660 scols = dcols; 661 } 662 663 ASSERT3U(acols, <=, scols); 664 rr = vdev_raidz_row_alloc(scols, zio); 665 rm->rm_row[0] = rr; 666 rr->rr_cols = acols; 667 rr->rr_bigcols = bc; 668 rr->rr_firstdatacol = nparity; 669 #ifdef ZFS_DEBUG 670 rr->rr_offset = zio->io_offset; 671 rr->rr_size = zio->io_size; 672 #endif 673 674 uint64_t asize = 0; 675 676 for (uint64_t c = 0; c < scols; c++) { 677 raidz_col_t *rc = &rr->rr_col[c]; 678 uint64_t col = f + c; 679 uint64_t coff = o; 680 if (col >= dcols) { 681 col -= dcols; 682 coff += 1ULL << ashift; 683 } 684 rc->rc_devidx = col; 685 rc->rc_offset = coff; 686 687 if (c >= acols) 688 rc->rc_size = 0; 689 else if (c < bc) 690 rc->rc_size = (q + 1) << ashift; 691 else 692 rc->rc_size = q << ashift; 693 694 asize += rc->rc_size; 695 } 696 697 ASSERT3U(asize, ==, tot << ashift); 698 rm->rm_nskip = roundup(tot, nparity + 1) - tot; 699 rm->rm_skipstart = bc; 700 701 /* 702 * If all data stored spans all columns, there's a danger that parity 703 * will always be on the same device and, since parity isn't read 704 * during normal operation, that device's I/O bandwidth won't be 705 * used effectively. We therefore switch the parity every 1MB. 706 * 707 * ... at least that was, ostensibly, the theory. As a practical 708 * matter unless we juggle the parity between all devices evenly, we 709 * won't see any benefit. Further, occasional writes that aren't a 710 * multiple of the LCM of the number of children and the minimum 711 * stripe width are sufficient to avoid pessimal behavior. 712 * Unfortunately, this decision created an implicit on-disk format 713 * requirement that we need to support for all eternity, but only 714 * for single-parity RAID-Z. 715 * 716 * If we intend to skip a sector in the zeroth column for padding 717 * we must make sure to note this swap. We will never intend to 718 * skip the first column since at least one data and one parity 719 * column must appear in each row. 720 */ 721 ASSERT(rr->rr_cols >= 2); 722 ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); 723 724 if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { 725 uint64_t devidx = rr->rr_col[0].rc_devidx; 726 o = rr->rr_col[0].rc_offset; 727 rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; 728 rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; 729 rr->rr_col[1].rc_devidx = devidx; 730 rr->rr_col[1].rc_offset = o; 731 if (rm->rm_skipstart == 0) 732 rm->rm_skipstart = 1; 733 } 734 735 if (zio->io_type == ZIO_TYPE_WRITE) { 736 vdev_raidz_map_alloc_write(zio, rm, ashift); 737 } else { 738 vdev_raidz_map_alloc_read(zio, rm); 739 } 740 /* init RAIDZ parity ops */ 741 rm->rm_ops = vdev_raidz_math_get_ops(); 742 743 return (rm); 744 } 745 746 /* 747 * Everything before reflow_offset_synced should have been moved to the new 748 * location (read and write completed). However, this may not yet be reflected 749 * in the on-disk format (e.g. 
raidz_reflow_sync() has been called but the 750 * uberblock has not yet been written). If reflow is not in progress, 751 * reflow_offset_synced should be UINT64_MAX. For each row, if the row is 752 * entirely before reflow_offset_synced, it will come from the new location. 753 * Otherwise this row will come from the old location. Therefore, rows that 754 * straddle the reflow_offset_synced will come from the old location. 755 * 756 * For writes, reflow_offset_next is the next offset to copy. If a sector has 757 * been copied, but not yet reflected in the on-disk progress 758 * (reflow_offset_synced), it will also be written to the new (already copied) 759 * offset. 760 */ 761 noinline raidz_map_t * 762 vdev_raidz_map_alloc_expanded(zio_t *zio, 763 uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols, 764 uint64_t nparity, uint64_t reflow_offset_synced, 765 uint64_t reflow_offset_next, boolean_t use_scratch) 766 { 767 abd_t *abd = zio->io_abd; 768 uint64_t offset = zio->io_offset; 769 uint64_t size = zio->io_size; 770 771 /* The zio's size in units of the vdev's minimum sector size. */ 772 uint64_t s = size >> ashift; 773 774 /* 775 * "Quotient": The number of data sectors for this stripe on all but 776 * the "big column" child vdevs that also contain "remainder" data. 777 * AKA "full rows" 778 */ 779 uint64_t q = s / (logical_cols - nparity); 780 781 /* 782 * "Remainder": The number of partial stripe data sectors in this I/O. 783 * This will add a sector to some, but not all, child vdevs. 784 */ 785 uint64_t r = s - q * (logical_cols - nparity); 786 787 /* The number of "big columns" - those which contain remainder data. */ 788 uint64_t bc = (r == 0 ? 0 : r + nparity); 789 790 /* 791 * The total number of data and parity sectors associated with 792 * this I/O. 793 */ 794 uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1)); 795 796 /* How many rows contain data (not skip) */ 797 uint64_t rows = howmany(tot, logical_cols); 798 int cols = MIN(tot, logical_cols); 799 800 raidz_map_t *rm = 801 kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]), 802 KM_SLEEP); 803 rm->rm_nrows = rows; 804 rm->rm_nskip = roundup(tot, nparity + 1) - tot; 805 rm->rm_skipstart = bc; 806 uint64_t asize = 0; 807 808 for (uint64_t row = 0; row < rows; row++) { 809 boolean_t row_use_scratch = B_FALSE; 810 raidz_row_t *rr = vdev_raidz_row_alloc(cols, zio); 811 rm->rm_row[row] = rr; 812 813 /* The starting RAIDZ (parent) vdev sector of the row. */ 814 uint64_t b = (offset >> ashift) + row * logical_cols; 815 816 /* 817 * If we are in the middle of a reflow, and the copying has 818 * not yet completed for any part of this row, then use the 819 * old location of this row. Note that reflow_offset_synced 820 * reflects the i/o that's been completed, because it's 821 * updated by a synctask, after zio_wait(spa_txg_zio[]). 822 * This is sufficient for our check, even if that progress 823 * has not yet been recorded to disk (reflected in 824 * spa_ubsync). Also note that we consider the last row to 825 * be "full width" (`cols`-wide rather than `bc`-wide) for 826 * this calculation. This causes a tiny bit of unnecessary 827 * double-writes but is safe and simpler to calculate. 828 */ 829 int row_phys_cols = physical_cols; 830 if (b + cols > reflow_offset_synced >> ashift) 831 row_phys_cols--; 832 else if (use_scratch) 833 row_use_scratch = B_TRUE; 834 835 /* starting child of this row */ 836 uint64_t child_id = b % row_phys_cols; 837 /* The starting byte offset on each child vdev. 
*/ 838 uint64_t child_offset = (b / row_phys_cols) << ashift; 839 840 /* 841 * Note, rr_cols is the entire width of the block, even 842 * if this row is shorter. This is needed because parity 843 * generation (for Q and R) needs to know the entire width, 844 * because it treats the short row as though it was 845 * full-width (and the "phantom" sectors were zero-filled). 846 * 847 * Another approach to this would be to set cols shorter 848 * (to just the number of columns that we might do i/o to) 849 * and have another mechanism to tell the parity generation 850 * about the "entire width". Reconstruction (at least 851 * vdev_raidz_reconstruct_general()) would also need to 852 * know about the "entire width". 853 */ 854 rr->rr_firstdatacol = nparity; 855 #ifdef ZFS_DEBUG 856 /* 857 * note: rr_size is PSIZE, not ASIZE 858 */ 859 rr->rr_offset = b << ashift; 860 rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift; 861 #endif 862 863 for (int c = 0; c < rr->rr_cols; c++, child_id++) { 864 if (child_id >= row_phys_cols) { 865 child_id -= row_phys_cols; 866 child_offset += 1ULL << ashift; 867 } 868 raidz_col_t *rc = &rr->rr_col[c]; 869 rc->rc_devidx = child_id; 870 rc->rc_offset = child_offset; 871 872 /* 873 * Get this from the scratch space if appropriate. 874 * This only happens if we crashed in the middle of 875 * raidz_reflow_scratch_sync() (while it's running, 876 * the rangelock prevents us from doing concurrent 877 * io), and even then only during zpool import or 878 * when the pool is imported readonly. 879 */ 880 if (row_use_scratch) 881 rc->rc_offset -= VDEV_BOOT_SIZE; 882 883 uint64_t dc = c - rr->rr_firstdatacol; 884 if (c < rr->rr_firstdatacol) { 885 rc->rc_size = 1ULL << ashift; 886 887 /* 888 * Parity sectors' rc_abd's are set below 889 * after determining if this is an aggregation. 890 */ 891 } else if (row == rows - 1 && bc != 0 && c >= bc) { 892 /* 893 * Past the end of the block (even including 894 * skip sectors). This sector is part of the 895 * map so that we have full rows for p/q parity 896 * generation. 897 */ 898 rc->rc_size = 0; 899 rc->rc_abd = NULL; 900 } else { 901 /* "data column" (col excluding parity) */ 902 uint64_t off; 903 904 if (c < bc || r == 0) { 905 off = dc * rows + row; 906 } else { 907 off = r * rows + 908 (dc - r) * (rows - 1) + row; 909 } 910 rc->rc_size = 1ULL << ashift; 911 rc->rc_abd = abd_get_offset_struct( 912 &rc->rc_abdstruct, abd, off << ashift, 913 rc->rc_size); 914 } 915 916 if (rc->rc_size == 0) 917 continue; 918 919 /* 920 * If any part of this row is in both old and new 921 * locations, the primary location is the old 922 * location. If this sector was already copied to the 923 * new location, we need to also write to the new, 924 * "shadow" location. 925 * 926 * Note, `row_phys_cols != physical_cols` indicates 927 * that the primary location is the old location. 928 * `b+c < reflow_offset_next` indicates that the copy 929 * to the new location has been initiated. We know 930 * that the copy has completed because we have the 931 * rangelock, which is held exclusively while the 932 * copy is in progress. 
933 */ 934 if (row_use_scratch || 935 (row_phys_cols != physical_cols && 936 b + c < reflow_offset_next >> ashift)) { 937 rc->rc_shadow_devidx = (b + c) % physical_cols; 938 rc->rc_shadow_offset = 939 ((b + c) / physical_cols) << ashift; 940 if (row_use_scratch) 941 rc->rc_shadow_offset -= VDEV_BOOT_SIZE; 942 } 943 944 asize += rc->rc_size; 945 } 946 947 /* 948 * See comment in vdev_raidz_map_alloc() 949 */ 950 if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 && 951 (offset & (1ULL << 20))) { 952 ASSERT(rr->rr_cols >= 2); 953 ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); 954 955 int devidx0 = rr->rr_col[0].rc_devidx; 956 uint64_t offset0 = rr->rr_col[0].rc_offset; 957 int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx; 958 uint64_t shadow_offset0 = 959 rr->rr_col[0].rc_shadow_offset; 960 961 rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; 962 rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; 963 rr->rr_col[0].rc_shadow_devidx = 964 rr->rr_col[1].rc_shadow_devidx; 965 rr->rr_col[0].rc_shadow_offset = 966 rr->rr_col[1].rc_shadow_offset; 967 968 rr->rr_col[1].rc_devidx = devidx0; 969 rr->rr_col[1].rc_offset = offset0; 970 rr->rr_col[1].rc_shadow_devidx = shadow_devidx0; 971 rr->rr_col[1].rc_shadow_offset = shadow_offset0; 972 } 973 } 974 ASSERT3U(asize, ==, tot << ashift); 975 976 /* 977 * Determine if the block is contiguous, in which case we can use 978 * an aggregation. 979 */ 980 if (rows >= raidz_io_aggregate_rows) { 981 rm->rm_nphys_cols = physical_cols; 982 rm->rm_phys_col = 983 kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols, 984 KM_SLEEP); 985 986 /* 987 * Determine the aggregate io's offset and size, and check 988 * that the io is contiguous. 989 */ 990 for (int i = 0; 991 i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) { 992 raidz_row_t *rr = rm->rm_row[i]; 993 for (int c = 0; c < rr->rr_cols; c++) { 994 raidz_col_t *rc = &rr->rr_col[c]; 995 raidz_col_t *prc = 996 &rm->rm_phys_col[rc->rc_devidx]; 997 998 if (rc->rc_size == 0) 999 continue; 1000 1001 if (prc->rc_size == 0) { 1002 ASSERT0(prc->rc_offset); 1003 prc->rc_offset = rc->rc_offset; 1004 } else if (prc->rc_offset + prc->rc_size != 1005 rc->rc_offset) { 1006 /* 1007 * This block is not contiguous and 1008 * therefore can't be aggregated. 1009 * This is expected to be rare, so 1010 * the cost of allocating and then 1011 * freeing rm_phys_col is not 1012 * significant. 1013 */ 1014 kmem_free(rm->rm_phys_col, 1015 sizeof (raidz_col_t) * 1016 rm->rm_nphys_cols); 1017 rm->rm_phys_col = NULL; 1018 rm->rm_nphys_cols = 0; 1019 break; 1020 } 1021 prc->rc_size += rc->rc_size; 1022 } 1023 } 1024 } 1025 if (rm->rm_phys_col != NULL) { 1026 /* 1027 * Allocate aggregate ABD's. 1028 */ 1029 for (int i = 0; i < rm->rm_nphys_cols; i++) { 1030 raidz_col_t *prc = &rm->rm_phys_col[i]; 1031 1032 prc->rc_devidx = i; 1033 1034 if (prc->rc_size == 0) 1035 continue; 1036 1037 prc->rc_abd = 1038 abd_alloc_linear(rm->rm_phys_col[i].rc_size, 1039 B_FALSE); 1040 } 1041 1042 /* 1043 * Point the parity abd's into the aggregate abd's. 1044 */ 1045 for (int i = 0; i < rm->rm_nrows; i++) { 1046 raidz_row_t *rr = rm->rm_row[i]; 1047 for (int c = 0; c < rr->rr_firstdatacol; c++) { 1048 raidz_col_t *rc = &rr->rr_col[c]; 1049 raidz_col_t *prc = 1050 &rm->rm_phys_col[rc->rc_devidx]; 1051 rc->rc_abd = 1052 abd_get_offset_struct(&rc->rc_abdstruct, 1053 prc->rc_abd, 1054 rc->rc_offset - prc->rc_offset, 1055 rc->rc_size); 1056 } 1057 } 1058 } else { 1059 /* 1060 * Allocate new abd's for the parity sectors. 
1061 */ 1062 for (int i = 0; i < rm->rm_nrows; i++) { 1063 raidz_row_t *rr = rm->rm_row[i]; 1064 for (int c = 0; c < rr->rr_firstdatacol; c++) { 1065 raidz_col_t *rc = &rr->rr_col[c]; 1066 rc->rc_abd = 1067 abd_alloc_linear(rc->rc_size, 1068 B_TRUE); 1069 } 1070 } 1071 } 1072 /* init RAIDZ parity ops */ 1073 rm->rm_ops = vdev_raidz_math_get_ops(); 1074 1075 return (rm); 1076 } 1077 1078 struct pqr_struct { 1079 uint64_t *p; 1080 uint64_t *q; 1081 uint64_t *r; 1082 }; 1083 1084 static int 1085 vdev_raidz_p_func(void *buf, size_t size, void *private) 1086 { 1087 struct pqr_struct *pqr = private; 1088 const uint64_t *src = buf; 1089 int cnt = size / sizeof (src[0]); 1090 1091 ASSERT(pqr->p && !pqr->q && !pqr->r); 1092 1093 for (int i = 0; i < cnt; i++, src++, pqr->p++) 1094 *pqr->p ^= *src; 1095 1096 return (0); 1097 } 1098 1099 static int 1100 vdev_raidz_pq_func(void *buf, size_t size, void *private) 1101 { 1102 struct pqr_struct *pqr = private; 1103 const uint64_t *src = buf; 1104 uint64_t mask; 1105 int cnt = size / sizeof (src[0]); 1106 1107 ASSERT(pqr->p && pqr->q && !pqr->r); 1108 1109 for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { 1110 *pqr->p ^= *src; 1111 VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 1112 *pqr->q ^= *src; 1113 } 1114 1115 return (0); 1116 } 1117 1118 static int 1119 vdev_raidz_pqr_func(void *buf, size_t size, void *private) 1120 { 1121 struct pqr_struct *pqr = private; 1122 const uint64_t *src = buf; 1123 uint64_t mask; 1124 int cnt = size / sizeof (src[0]); 1125 1126 ASSERT(pqr->p && pqr->q && pqr->r); 1127 1128 for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { 1129 *pqr->p ^= *src; 1130 VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 1131 *pqr->q ^= *src; 1132 VDEV_RAIDZ_64MUL_4(*pqr->r, mask); 1133 *pqr->r ^= *src; 1134 } 1135 1136 return (0); 1137 } 1138 1139 static void 1140 vdev_raidz_generate_parity_p(raidz_row_t *rr) 1141 { 1142 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 1143 1144 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1145 abd_t *src = rr->rr_col[c].rc_abd; 1146 1147 if (c == rr->rr_firstdatacol) { 1148 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 1149 } else { 1150 struct pqr_struct pqr = { p, NULL, NULL }; 1151 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 1152 vdev_raidz_p_func, &pqr); 1153 } 1154 } 1155 } 1156 1157 static void 1158 vdev_raidz_generate_parity_pq(raidz_row_t *rr) 1159 { 1160 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 1161 uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 1162 uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 1163 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 1164 rr->rr_col[VDEV_RAIDZ_Q].rc_size); 1165 1166 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1167 abd_t *src = rr->rr_col[c].rc_abd; 1168 1169 uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); 1170 1171 if (c == rr->rr_firstdatacol) { 1172 ASSERT(ccnt == pcnt || ccnt == 0); 1173 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 1174 (void) memcpy(q, p, rr->rr_col[c].rc_size); 1175 1176 for (uint64_t i = ccnt; i < pcnt; i++) { 1177 p[i] = 0; 1178 q[i] = 0; 1179 } 1180 } else { 1181 struct pqr_struct pqr = { p, q, NULL }; 1182 1183 ASSERT(ccnt <= pcnt); 1184 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 1185 vdev_raidz_pq_func, &pqr); 1186 1187 /* 1188 * Treat short columns as though they are full of 0s. 1189 * Note that there's therefore nothing needed for P. 
1190 */ 1191 uint64_t mask; 1192 for (uint64_t i = ccnt; i < pcnt; i++) { 1193 VDEV_RAIDZ_64MUL_2(q[i], mask); 1194 } 1195 } 1196 } 1197 } 1198 1199 static void 1200 vdev_raidz_generate_parity_pqr(raidz_row_t *rr) 1201 { 1202 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 1203 uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 1204 uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd); 1205 uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 1206 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 1207 rr->rr_col[VDEV_RAIDZ_Q].rc_size); 1208 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 1209 rr->rr_col[VDEV_RAIDZ_R].rc_size); 1210 1211 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1212 abd_t *src = rr->rr_col[c].rc_abd; 1213 1214 uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); 1215 1216 if (c == rr->rr_firstdatacol) { 1217 ASSERT(ccnt == pcnt || ccnt == 0); 1218 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 1219 (void) memcpy(q, p, rr->rr_col[c].rc_size); 1220 (void) memcpy(r, p, rr->rr_col[c].rc_size); 1221 1222 for (uint64_t i = ccnt; i < pcnt; i++) { 1223 p[i] = 0; 1224 q[i] = 0; 1225 r[i] = 0; 1226 } 1227 } else { 1228 struct pqr_struct pqr = { p, q, r }; 1229 1230 ASSERT(ccnt <= pcnt); 1231 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 1232 vdev_raidz_pqr_func, &pqr); 1233 1234 /* 1235 * Treat short columns as though they are full of 0s. 1236 * Note that there's therefore nothing needed for P. 1237 */ 1238 uint64_t mask; 1239 for (uint64_t i = ccnt; i < pcnt; i++) { 1240 VDEV_RAIDZ_64MUL_2(q[i], mask); 1241 VDEV_RAIDZ_64MUL_4(r[i], mask); 1242 } 1243 } 1244 } 1245 } 1246 1247 /* 1248 * Generate RAID parity in the first virtual columns according to the number of 1249 * parity columns available. 1250 */ 1251 void 1252 vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr) 1253 { 1254 if (rr->rr_cols == 0) { 1255 /* 1256 * We are handling this block one row at a time (because 1257 * this block has a different logical vs physical width, 1258 * due to RAIDZ expansion), and this is a pad-only row, 1259 * which has no parity. 
1260 */ 1261 return; 1262 } 1263 1264 /* Generate using the new math implementation */ 1265 if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL) 1266 return; 1267 1268 switch (rr->rr_firstdatacol) { 1269 case 1: 1270 vdev_raidz_generate_parity_p(rr); 1271 break; 1272 case 2: 1273 vdev_raidz_generate_parity_pq(rr); 1274 break; 1275 case 3: 1276 vdev_raidz_generate_parity_pqr(rr); 1277 break; 1278 default: 1279 cmn_err(CE_PANIC, "invalid RAID-Z configuration"); 1280 } 1281 } 1282 1283 void 1284 vdev_raidz_generate_parity(raidz_map_t *rm) 1285 { 1286 for (int i = 0; i < rm->rm_nrows; i++) { 1287 raidz_row_t *rr = rm->rm_row[i]; 1288 vdev_raidz_generate_parity_row(rm, rr); 1289 } 1290 } 1291 1292 static int 1293 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) 1294 { 1295 (void) private; 1296 uint64_t *dst = dbuf; 1297 uint64_t *src = sbuf; 1298 int cnt = size / sizeof (src[0]); 1299 1300 for (int i = 0; i < cnt; i++) { 1301 dst[i] ^= src[i]; 1302 } 1303 1304 return (0); 1305 } 1306 1307 static int 1308 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size, 1309 void *private) 1310 { 1311 (void) private; 1312 uint64_t *dst = dbuf; 1313 uint64_t *src = sbuf; 1314 uint64_t mask; 1315 int cnt = size / sizeof (dst[0]); 1316 1317 for (int i = 0; i < cnt; i++, dst++, src++) { 1318 VDEV_RAIDZ_64MUL_2(*dst, mask); 1319 *dst ^= *src; 1320 } 1321 1322 return (0); 1323 } 1324 1325 static int 1326 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private) 1327 { 1328 (void) private; 1329 uint64_t *dst = buf; 1330 uint64_t mask; 1331 int cnt = size / sizeof (dst[0]); 1332 1333 for (int i = 0; i < cnt; i++, dst++) { 1334 /* same operation as vdev_raidz_reconst_q_pre_func() on dst */ 1335 VDEV_RAIDZ_64MUL_2(*dst, mask); 1336 } 1337 1338 return (0); 1339 } 1340 1341 struct reconst_q_struct { 1342 uint64_t *q; 1343 int exp; 1344 }; 1345 1346 static int 1347 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private) 1348 { 1349 struct reconst_q_struct *rq = private; 1350 uint64_t *dst = buf; 1351 int cnt = size / sizeof (dst[0]); 1352 1353 for (int i = 0; i < cnt; i++, dst++, rq->q++) { 1354 int j; 1355 uint8_t *b; 1356 1357 *dst ^= *rq->q; 1358 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 1359 *b = vdev_raidz_exp2(*b, rq->exp); 1360 } 1361 } 1362 1363 return (0); 1364 } 1365 1366 struct reconst_pq_struct { 1367 uint8_t *p; 1368 uint8_t *q; 1369 uint8_t *pxy; 1370 uint8_t *qxy; 1371 int aexp; 1372 int bexp; 1373 }; 1374 1375 static int 1376 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private) 1377 { 1378 struct reconst_pq_struct *rpq = private; 1379 uint8_t *xd = xbuf; 1380 uint8_t *yd = ybuf; 1381 1382 for (int i = 0; i < size; 1383 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) { 1384 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 1385 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 1386 *yd = *rpq->p ^ *rpq->pxy ^ *xd; 1387 } 1388 1389 return (0); 1390 } 1391 1392 static int 1393 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) 1394 { 1395 struct reconst_pq_struct *rpq = private; 1396 uint8_t *xd = xbuf; 1397 1398 for (int i = 0; i < size; 1399 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) { 1400 /* same operation as vdev_raidz_reconst_pq_func() on xd */ 1401 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 1402 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 1403 } 1404 1405 return (0); 1406 } 1407 1408 static void 1409 
vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts) 1410 { 1411 int x = tgts[0]; 1412 abd_t *dst, *src; 1413 1414 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1415 zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x); 1416 1417 ASSERT3U(ntgts, ==, 1); 1418 ASSERT3U(x, >=, rr->rr_firstdatacol); 1419 ASSERT3U(x, <, rr->rr_cols); 1420 1421 ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size); 1422 1423 src = rr->rr_col[VDEV_RAIDZ_P].rc_abd; 1424 dst = rr->rr_col[x].rc_abd; 1425 1426 abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size); 1427 1428 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1429 uint64_t size = MIN(rr->rr_col[x].rc_size, 1430 rr->rr_col[c].rc_size); 1431 1432 src = rr->rr_col[c].rc_abd; 1433 1434 if (c == x) 1435 continue; 1436 1437 (void) abd_iterate_func2(dst, src, 0, 0, size, 1438 vdev_raidz_reconst_p_func, NULL); 1439 } 1440 } 1441 1442 static void 1443 vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts) 1444 { 1445 int x = tgts[0]; 1446 int c, exp; 1447 abd_t *dst, *src; 1448 1449 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1450 zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x); 1451 1452 ASSERT(ntgts == 1); 1453 1454 ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size); 1455 1456 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1457 uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size, 1458 rr->rr_col[c].rc_size); 1459 1460 src = rr->rr_col[c].rc_abd; 1461 dst = rr->rr_col[x].rc_abd; 1462 1463 if (c == rr->rr_firstdatacol) { 1464 abd_copy(dst, src, size); 1465 if (rr->rr_col[x].rc_size > size) { 1466 abd_zero_off(dst, size, 1467 rr->rr_col[x].rc_size - size); 1468 } 1469 } else { 1470 ASSERT3U(size, <=, rr->rr_col[x].rc_size); 1471 (void) abd_iterate_func2(dst, src, 0, 0, size, 1472 vdev_raidz_reconst_q_pre_func, NULL); 1473 (void) abd_iterate_func(dst, 1474 size, rr->rr_col[x].rc_size - size, 1475 vdev_raidz_reconst_q_pre_tail_func, NULL); 1476 } 1477 } 1478 1479 src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; 1480 dst = rr->rr_col[x].rc_abd; 1481 exp = 255 - (rr->rr_cols - 1 - x); 1482 1483 struct reconst_q_struct rq = { abd_to_buf(src), exp }; 1484 (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size, 1485 vdev_raidz_reconst_q_post_func, &rq); 1486 } 1487 1488 static void 1489 vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) 1490 { 1491 uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; 1492 abd_t *pdata, *qdata; 1493 uint64_t xsize, ysize; 1494 int x = tgts[0]; 1495 int y = tgts[1]; 1496 abd_t *xd, *yd; 1497 1498 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1499 zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y); 1500 1501 ASSERT(ntgts == 2); 1502 ASSERT(x < y); 1503 ASSERT(x >= rr->rr_firstdatacol); 1504 ASSERT(y < rr->rr_cols); 1505 1506 ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size); 1507 1508 /* 1509 * Move the parity data aside -- we're going to compute parity as 1510 * though columns x and y were full of zeros -- Pxy and Qxy. We want to 1511 * reuse the parity generation mechanism without trashing the actual 1512 * parity so we make those columns appear to be full of zeros by 1513 * setting their lengths to zero. 
1514 */ 1515 pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd; 1516 qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; 1517 xsize = rr->rr_col[x].rc_size; 1518 ysize = rr->rr_col[y].rc_size; 1519 1520 rr->rr_col[VDEV_RAIDZ_P].rc_abd = 1521 abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE); 1522 rr->rr_col[VDEV_RAIDZ_Q].rc_abd = 1523 abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); 1524 rr->rr_col[x].rc_size = 0; 1525 rr->rr_col[y].rc_size = 0; 1526 1527 vdev_raidz_generate_parity_pq(rr); 1528 1529 rr->rr_col[x].rc_size = xsize; 1530 rr->rr_col[y].rc_size = ysize; 1531 1532 p = abd_to_buf(pdata); 1533 q = abd_to_buf(qdata); 1534 pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 1535 qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 1536 xd = rr->rr_col[x].rc_abd; 1537 yd = rr->rr_col[y].rc_abd; 1538 1539 /* 1540 * We now have: 1541 * Pxy = P + D_x + D_y 1542 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 1543 * 1544 * We can then solve for D_x: 1545 * D_x = A * (P + Pxy) + B * (Q + Qxy) 1546 * where 1547 * A = 2^(x - y) * (2^(x - y) + 1)^-1 1548 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 1549 * 1550 * With D_x in hand, we can easily solve for D_y: 1551 * D_y = P + Pxy + D_x 1552 */ 1553 1554 a = vdev_raidz_pow2[255 + x - y]; 1555 b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)]; 1556 tmp = 255 - vdev_raidz_log2[a ^ 1]; 1557 1558 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 1559 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 1560 1561 ASSERT3U(xsize, >=, ysize); 1562 struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp }; 1563 1564 (void) abd_iterate_func2(xd, yd, 0, 0, ysize, 1565 vdev_raidz_reconst_pq_func, &rpq); 1566 (void) abd_iterate_func(xd, ysize, xsize - ysize, 1567 vdev_raidz_reconst_pq_tail_func, &rpq); 1568 1569 abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 1570 abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 1571 1572 /* 1573 * Restore the saved parity data. 1574 */ 1575 rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata; 1576 rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata; 1577 } 1578 1579 /* 1580 * In the general case of reconstruction, we must solve the system of linear 1581 * equations defined by the coefficients used to generate parity as well as 1582 * the contents of the data and parity disks. This can be expressed with 1583 * vectors for the original data (D) and the actual data (d) and parity (p) 1584 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 1585 * 1586 * __ __ __ __ 1587 * | | __ __ | p_0 | 1588 * | V | | D_0 | | p_m-1 | 1589 * | | x | : | = | d_0 | 1590 * | I | | D_n-1 | | : | 1591 * | | ~~ ~~ | d_n-1 | 1592 * ~~ ~~ ~~ ~~ 1593 * 1594 * I is simply a square identity matrix of size n, and V is a vandermonde 1595 * matrix defined by the coefficients we chose for the various parity columns 1596 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 1597 * computation as well as linear separability. 1598 * 1599 * __ __ __ __ 1600 * | 1 .. 1 1 1 | | p_0 | 1601 * | 2^n-1 .. 4 2 1 | __ __ | : | 1602 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 1603 * | 1 .. 0 0 0 | | D_1 | | d_0 | 1604 * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 1605 * | : : : : | | : | | d_2 | 1606 * | 0 .. 1 0 0 | | D_n-1 | | : | 1607 * | 0 .. 0 1 0 | ~~ ~~ | : | 1608 * | 0 .. 0 0 1 | | d_n-1 | 1609 * ~~ ~~ ~~ ~~ 1610 * 1611 * Note that I, V, d, and p are known. To compute D, we must invert the 1612 * matrix and use the known data and parity values to reconstruct the unknown 1613 * data values. 
We begin by removing the rows in V|I and d|p that correspond 1614 * to failed or missing columns; we then make V|I square (n x n) and d|p 1615 * sized n by removing rows corresponding to unused parity from the bottom up 1616 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 1617 * using Gauss-Jordan elimination. In the example below we use m=3 parity 1618 * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 1619 * __ __ 1620 * | 1 1 1 1 1 1 1 1 | 1621 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 1622 * | 19 205 116 29 64 16 4 1 | / / 1623 * | 1 0 0 0 0 0 0 0 | / / 1624 * | 0 1 0 0 0 0 0 0 | <--' / 1625 * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 1626 * | 0 0 0 1 0 0 0 0 | 1627 * | 0 0 0 0 1 0 0 0 | 1628 * | 0 0 0 0 0 1 0 0 | 1629 * | 0 0 0 0 0 0 1 0 | 1630 * | 0 0 0 0 0 0 0 1 | 1631 * ~~ ~~ 1632 * __ __ 1633 * | 1 1 1 1 1 1 1 1 | 1634 * | 128 64 32 16 8 4 2 1 | 1635 * | 19 205 116 29 64 16 4 1 | 1636 * | 1 0 0 0 0 0 0 0 | 1637 * | 0 1 0 0 0 0 0 0 | 1638 * (V|I)' = | 0 0 1 0 0 0 0 0 | 1639 * | 0 0 0 1 0 0 0 0 | 1640 * | 0 0 0 0 1 0 0 0 | 1641 * | 0 0 0 0 0 1 0 0 | 1642 * | 0 0 0 0 0 0 1 0 | 1643 * | 0 0 0 0 0 0 0 1 | 1644 * ~~ ~~ 1645 * 1646 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 1647 * have carefully chosen the seed values 1, 2, and 4 to ensure that this 1648 * matrix is not singular. 1649 * __ __ 1650 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1651 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1652 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1653 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1654 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1655 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1656 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1657 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1658 * ~~ ~~ 1659 * __ __ 1660 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1661 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1662 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1663 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1664 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1665 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1666 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1667 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1668 * ~~ ~~ 1669 * __ __ 1670 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1671 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1672 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 1673 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1674 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1675 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1676 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1677 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1678 * ~~ ~~ 1679 * __ __ 1680 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1681 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1682 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 1683 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1684 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1685 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1686 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1687 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1688 * ~~ ~~ 1689 * __ __ 1690 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1691 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1692 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1693 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1694 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1695 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1696 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1697 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1698 * ~~ ~~ 1699 * __ __ 1700 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1701 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 1702 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1703 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1704 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1705 * | 0 0 0 0 0 
1 0 0 0 0 0 0 0 1 0 0 | 1706 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1707 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1708 * ~~ ~~ 1709 * __ __ 1710 * | 0 0 1 0 0 0 0 0 | 1711 * | 167 100 5 41 159 169 217 208 | 1712 * | 166 100 4 40 158 168 216 209 | 1713 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 1714 * | 0 0 0 0 1 0 0 0 | 1715 * | 0 0 0 0 0 1 0 0 | 1716 * | 0 0 0 0 0 0 1 0 | 1717 * | 0 0 0 0 0 0 0 1 | 1718 * ~~ ~~ 1719 * 1720 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 1721 * of the missing data. 1722 * 1723 * As is apparent from the example above, the only non-trivial rows in the 1724 * inverse matrix correspond to the data disks that we're trying to 1725 * reconstruct. Indeed, those are the only rows we need as the others would 1726 * only be useful for reconstructing data known or assumed to be valid. For 1727 * that reason, we only build the coefficients in the rows that correspond to 1728 * targeted columns. 1729 */ 1730 1731 static void 1732 vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map, 1733 uint8_t **rows) 1734 { 1735 int i, j; 1736 int pow; 1737 1738 ASSERT(n == rr->rr_cols - rr->rr_firstdatacol); 1739 1740 /* 1741 * Fill in the missing rows of interest. 1742 */ 1743 for (i = 0; i < nmap; i++) { 1744 ASSERT3S(0, <=, map[i]); 1745 ASSERT3S(map[i], <=, 2); 1746 1747 pow = map[i] * n; 1748 if (pow > 255) 1749 pow -= 255; 1750 ASSERT(pow <= 255); 1751 1752 for (j = 0; j < n; j++) { 1753 pow -= map[i]; 1754 if (pow < 0) 1755 pow += 255; 1756 rows[i][j] = vdev_raidz_pow2[pow]; 1757 } 1758 } 1759 } 1760 1761 static void 1762 vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing, 1763 uint8_t **rows, uint8_t **invrows, const uint8_t *used) 1764 { 1765 int i, j, ii, jj; 1766 uint8_t log; 1767 1768 /* 1769 * Assert that the first nmissing entries from the array of used 1770 * columns correspond to parity columns and that subsequent entries 1771 * correspond to data columns. 1772 */ 1773 for (i = 0; i < nmissing; i++) { 1774 ASSERT3S(used[i], <, rr->rr_firstdatacol); 1775 } 1776 for (; i < n; i++) { 1777 ASSERT3S(used[i], >=, rr->rr_firstdatacol); 1778 } 1779 1780 /* 1781 * First initialize the storage where we'll compute the inverse rows. 1782 */ 1783 for (i = 0; i < nmissing; i++) { 1784 for (j = 0; j < n; j++) { 1785 invrows[i][j] = (i == j) ? 1 : 0; 1786 } 1787 } 1788 1789 /* 1790 * Subtract all trivial rows from the rows of consequence. 1791 */ 1792 for (i = 0; i < nmissing; i++) { 1793 for (j = nmissing; j < n; j++) { 1794 ASSERT3U(used[j], >=, rr->rr_firstdatacol); 1795 jj = used[j] - rr->rr_firstdatacol; 1796 ASSERT3S(jj, <, n); 1797 invrows[i][j] = rows[i][jj]; 1798 rows[i][jj] = 0; 1799 } 1800 } 1801 1802 /* 1803 * For each of the rows of interest, we must normalize it and subtract 1804 * a multiple of it from the other rows. 1805 */ 1806 for (i = 0; i < nmissing; i++) { 1807 for (j = 0; j < missing[i]; j++) { 1808 ASSERT0(rows[i][j]); 1809 } 1810 ASSERT3U(rows[i][missing[i]], !=, 0); 1811 1812 /* 1813 * Compute the inverse of the first element and multiply each 1814 * element in the row by that value. 
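 *
 * (The "invert and multiply" is done entirely in the log domain: every
 * nonzero field element is a power of 2, so the pivot's inverse is
 * 2^(255 - vdev_raidz_log2[pivot]). As a purely illustrative example,
 * if the pivot element were 2 (log2 == 1), then log == 254 and
 *
 *	vdev_raidz_exp2(x, 254) == x * 2^254 == x * 2^-1
 *
 * i.e. every entry in the row is divided by the pivot, leaving a 1 in
 * the pivot position.)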
1815 */ 1816 log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 1817 1818 for (j = 0; j < n; j++) { 1819 rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 1820 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 1821 } 1822 1823 for (ii = 0; ii < nmissing; ii++) { 1824 if (i == ii) 1825 continue; 1826 1827 ASSERT3U(rows[ii][missing[i]], !=, 0); 1828 1829 log = vdev_raidz_log2[rows[ii][missing[i]]]; 1830 1831 for (j = 0; j < n; j++) { 1832 rows[ii][j] ^= 1833 vdev_raidz_exp2(rows[i][j], log); 1834 invrows[ii][j] ^= 1835 vdev_raidz_exp2(invrows[i][j], log); 1836 } 1837 } 1838 } 1839 1840 /* 1841 * Verify that the data that is left in the rows are properly part of 1842 * an identity matrix. 1843 */ 1844 for (i = 0; i < nmissing; i++) { 1845 for (j = 0; j < n; j++) { 1846 if (j == missing[i]) { 1847 ASSERT3U(rows[i][j], ==, 1); 1848 } else { 1849 ASSERT0(rows[i][j]); 1850 } 1851 } 1852 } 1853 } 1854 1855 static void 1856 vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing, 1857 int *missing, uint8_t **invrows, const uint8_t *used) 1858 { 1859 int i, j, x, cc, c; 1860 uint8_t *src; 1861 uint64_t ccount; 1862 uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL }; 1863 uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 }; 1864 uint8_t log = 0; 1865 uint8_t val; 1866 int ll; 1867 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 1868 uint8_t *p, *pp; 1869 size_t psize; 1870 1871 psize = sizeof (invlog[0][0]) * n * nmissing; 1872 p = kmem_alloc(psize, KM_SLEEP); 1873 1874 for (pp = p, i = 0; i < nmissing; i++) { 1875 invlog[i] = pp; 1876 pp += n; 1877 } 1878 1879 for (i = 0; i < nmissing; i++) { 1880 for (j = 0; j < n; j++) { 1881 ASSERT3U(invrows[i][j], !=, 0); 1882 invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 1883 } 1884 } 1885 1886 for (i = 0; i < n; i++) { 1887 c = used[i]; 1888 ASSERT3U(c, <, rr->rr_cols); 1889 1890 ccount = rr->rr_col[c].rc_size; 1891 ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0); 1892 if (ccount == 0) 1893 continue; 1894 src = abd_to_buf(rr->rr_col[c].rc_abd); 1895 for (j = 0; j < nmissing; j++) { 1896 cc = missing[j] + rr->rr_firstdatacol; 1897 ASSERT3U(cc, >=, rr->rr_firstdatacol); 1898 ASSERT3U(cc, <, rr->rr_cols); 1899 ASSERT3U(cc, !=, c); 1900 1901 dcount[j] = rr->rr_col[cc].rc_size; 1902 if (dcount[j] != 0) 1903 dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd); 1904 } 1905 1906 for (x = 0; x < ccount; x++, src++) { 1907 if (*src != 0) 1908 log = vdev_raidz_log2[*src]; 1909 1910 for (cc = 0; cc < nmissing; cc++) { 1911 if (x >= dcount[cc]) 1912 continue; 1913 1914 if (*src == 0) { 1915 val = 0; 1916 } else { 1917 if ((ll = log + invlog[cc][i]) >= 255) 1918 ll -= 255; 1919 val = vdev_raidz_pow2[ll]; 1920 } 1921 1922 if (i == 0) 1923 dst[cc][x] = val; 1924 else 1925 dst[cc][x] ^= val; 1926 } 1927 } 1928 } 1929 1930 kmem_free(p, psize); 1931 } 1932 1933 static void 1934 vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) 1935 { 1936 int i, c, t, tt; 1937 unsigned int n; 1938 unsigned int nmissing_rows; 1939 int missing_rows[VDEV_RAIDZ_MAXPARITY]; 1940 int parity_map[VDEV_RAIDZ_MAXPARITY]; 1941 uint8_t *p, *pp; 1942 size_t psize; 1943 uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 1944 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 1945 uint8_t *used; 1946 1947 abd_t **bufs = NULL; 1948 1949 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1950 zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts); 1951 /* 1952 * Matrix reconstruction can't use scatter ABDs yet, so we allocate 1953 * temporary linear ABDs if any non-linear ABDs are found. 
1954 */ 1955 for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) { 1956 ASSERT(rr->rr_col[i].rc_abd != NULL); 1957 if (!abd_is_linear(rr->rr_col[i].rc_abd)) { 1958 bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *), 1959 KM_PUSHPAGE); 1960 1961 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1962 raidz_col_t *col = &rr->rr_col[c]; 1963 1964 bufs[c] = col->rc_abd; 1965 if (bufs[c] != NULL) { 1966 col->rc_abd = abd_alloc_linear( 1967 col->rc_size, B_TRUE); 1968 abd_copy(col->rc_abd, bufs[c], 1969 col->rc_size); 1970 } 1971 } 1972 1973 break; 1974 } 1975 } 1976 1977 n = rr->rr_cols - rr->rr_firstdatacol; 1978 1979 /* 1980 * Figure out which data columns are missing. 1981 */ 1982 nmissing_rows = 0; 1983 for (t = 0; t < ntgts; t++) { 1984 if (tgts[t] >= rr->rr_firstdatacol) { 1985 missing_rows[nmissing_rows++] = 1986 tgts[t] - rr->rr_firstdatacol; 1987 } 1988 } 1989 1990 /* 1991 * Figure out which parity columns to use to help generate the missing 1992 * data columns. 1993 */ 1994 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 1995 ASSERT(tt < ntgts); 1996 ASSERT(c < rr->rr_firstdatacol); 1997 1998 /* 1999 * Skip any targeted parity columns. 2000 */ 2001 if (c == tgts[tt]) { 2002 tt++; 2003 continue; 2004 } 2005 2006 parity_map[i] = c; 2007 i++; 2008 } 2009 2010 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 2011 nmissing_rows * n + sizeof (used[0]) * n; 2012 p = kmem_alloc(psize, KM_SLEEP); 2013 2014 for (pp = p, i = 0; i < nmissing_rows; i++) { 2015 rows[i] = pp; 2016 pp += n; 2017 invrows[i] = pp; 2018 pp += n; 2019 } 2020 used = pp; 2021 2022 for (i = 0; i < nmissing_rows; i++) { 2023 used[i] = parity_map[i]; 2024 } 2025 2026 for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 2027 if (tt < nmissing_rows && 2028 c == missing_rows[tt] + rr->rr_firstdatacol) { 2029 tt++; 2030 continue; 2031 } 2032 2033 ASSERT3S(i, <, n); 2034 used[i] = c; 2035 i++; 2036 } 2037 2038 /* 2039 * Initialize the interesting rows of the matrix. 2040 */ 2041 vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows); 2042 2043 /* 2044 * Invert the matrix. 2045 */ 2046 vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows, 2047 invrows, used); 2048 2049 /* 2050 * Reconstruct the missing data using the generated matrix. 
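 *
 * (This is the D = (V|I)'^-1 x (d|p)' step from the worked example
 * above, applied independently at every byte offset of the column
 * buffers.)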
2051 */ 2052 vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows, 2053 invrows, used); 2054 2055 kmem_free(p, psize); 2056 2057 /* 2058 * copy back from temporary linear abds and free them 2059 */ 2060 if (bufs) { 2061 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 2062 raidz_col_t *col = &rr->rr_col[c]; 2063 2064 if (bufs[c] != NULL) { 2065 abd_copy(bufs[c], col->rc_abd, col->rc_size); 2066 abd_free(col->rc_abd); 2067 } 2068 col->rc_abd = bufs[c]; 2069 } 2070 kmem_free(bufs, rr->rr_cols * sizeof (abd_t *)); 2071 } 2072 } 2073 2074 static void 2075 vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr, 2076 const int *t, int nt) 2077 { 2078 int tgts[VDEV_RAIDZ_MAXPARITY], *dt; 2079 int ntgts; 2080 int i, c, ret; 2081 int nbadparity, nbaddata; 2082 int parity_valid[VDEV_RAIDZ_MAXPARITY]; 2083 2084 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { 2085 zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)", 2086 rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata, 2087 (int)rr->rr_missingparity); 2088 } 2089 2090 nbadparity = rr->rr_firstdatacol; 2091 nbaddata = rr->rr_cols - nbadparity; 2092 ntgts = 0; 2093 for (i = 0, c = 0; c < rr->rr_cols; c++) { 2094 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { 2095 zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u " 2096 "offset=%llx error=%u)", 2097 rr, c, (int)rr->rr_col[c].rc_devidx, 2098 (long long)rr->rr_col[c].rc_offset, 2099 (int)rr->rr_col[c].rc_error); 2100 } 2101 if (c < rr->rr_firstdatacol) 2102 parity_valid[c] = B_FALSE; 2103 2104 if (i < nt && c == t[i]) { 2105 tgts[ntgts++] = c; 2106 i++; 2107 } else if (rr->rr_col[c].rc_error != 0) { 2108 tgts[ntgts++] = c; 2109 } else if (c >= rr->rr_firstdatacol) { 2110 nbaddata--; 2111 } else { 2112 parity_valid[c] = B_TRUE; 2113 nbadparity--; 2114 } 2115 } 2116 2117 ASSERT(ntgts >= nt); 2118 ASSERT(nbaddata >= 0); 2119 ASSERT(nbaddata + nbadparity == ntgts); 2120 2121 dt = &tgts[nbadparity]; 2122 2123 /* Reconstruct using the new math implementation */ 2124 ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata); 2125 if (ret != RAIDZ_ORIGINAL_IMPL) 2126 return; 2127 2128 /* 2129 * See if we can use any of our optimized reconstruction routines. 
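 *
 * Roughly, the dispatch below is:
 *
 *	1 missing data column, P valid             -> vdev_raidz_reconstruct_p()
 *	1 missing data column, P invalid, Q valid  -> vdev_raidz_reconstruct_q()
 *	2 missing data columns, P and Q valid      -> vdev_raidz_reconstruct_pq()
 *	anything else (e.g. R is required)         -> vdev_raidz_reconstruct_general()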
2130 */ 2131 switch (nbaddata) { 2132 case 1: 2133 if (parity_valid[VDEV_RAIDZ_P]) { 2134 vdev_raidz_reconstruct_p(rr, dt, 1); 2135 return; 2136 } 2137 2138 ASSERT(rr->rr_firstdatacol > 1); 2139 2140 if (parity_valid[VDEV_RAIDZ_Q]) { 2141 vdev_raidz_reconstruct_q(rr, dt, 1); 2142 return; 2143 } 2144 2145 ASSERT(rr->rr_firstdatacol > 2); 2146 break; 2147 2148 case 2: 2149 ASSERT(rr->rr_firstdatacol > 1); 2150 2151 if (parity_valid[VDEV_RAIDZ_P] && 2152 parity_valid[VDEV_RAIDZ_Q]) { 2153 vdev_raidz_reconstruct_pq(rr, dt, 2); 2154 return; 2155 } 2156 2157 ASSERT(rr->rr_firstdatacol > 2); 2158 2159 break; 2160 } 2161 2162 vdev_raidz_reconstruct_general(rr, tgts, ntgts); 2163 } 2164 2165 static int 2166 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, 2167 uint64_t *logical_ashift, uint64_t *physical_ashift) 2168 { 2169 vdev_raidz_t *vdrz = vd->vdev_tsd; 2170 uint64_t nparity = vdrz->vd_nparity; 2171 int c; 2172 int lasterror = 0; 2173 int numerrors = 0; 2174 2175 ASSERT(nparity > 0); 2176 2177 if (nparity > VDEV_RAIDZ_MAXPARITY || 2178 vd->vdev_children < nparity + 1) { 2179 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 2180 return (SET_ERROR(EINVAL)); 2181 } 2182 2183 vdev_open_children(vd); 2184 2185 for (c = 0; c < vd->vdev_children; c++) { 2186 vdev_t *cvd = vd->vdev_child[c]; 2187 2188 if (cvd->vdev_open_error != 0) { 2189 lasterror = cvd->vdev_open_error; 2190 numerrors++; 2191 continue; 2192 } 2193 2194 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 2195 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; 2196 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); 2197 } 2198 for (c = 0; c < vd->vdev_children; c++) { 2199 vdev_t *cvd = vd->vdev_child[c]; 2200 2201 if (cvd->vdev_open_error != 0) 2202 continue; 2203 *physical_ashift = vdev_best_ashift(*logical_ashift, 2204 *physical_ashift, cvd->vdev_physical_ashift); 2205 } 2206 2207 if (vd->vdev_rz_expanding) { 2208 *asize *= vd->vdev_children - 1; 2209 *max_asize *= vd->vdev_children - 1; 2210 2211 vd->vdev_min_asize = *asize; 2212 } else { 2213 *asize *= vd->vdev_children; 2214 *max_asize *= vd->vdev_children; 2215 } 2216 2217 if (numerrors > nparity) { 2218 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 2219 return (lasterror); 2220 } 2221 2222 return (0); 2223 } 2224 2225 static void 2226 vdev_raidz_close(vdev_t *vd) 2227 { 2228 for (int c = 0; c < vd->vdev_children; c++) { 2229 if (vd->vdev_child[c] != NULL) 2230 vdev_close(vd->vdev_child[c]); 2231 } 2232 } 2233 2234 /* 2235 * Return the logical width to use, given the txg in which the allocation 2236 * happened. 2237 */ 2238 static uint64_t 2239 vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg) 2240 { 2241 reflow_node_t lookup = { 2242 .re_txg = txg, 2243 }; 2244 avl_index_t where; 2245 2246 uint64_t width; 2247 mutex_enter(&vdrz->vd_expand_lock); 2248 reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where); 2249 if (re != NULL) { 2250 width = re->re_logical_width; 2251 } else { 2252 re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE); 2253 if (re != NULL) 2254 width = re->re_logical_width; 2255 else 2256 width = vdrz->vd_original_width; 2257 } 2258 mutex_exit(&vdrz->vd_expand_lock); 2259 return (width); 2260 } 2261 /* 2262 * This code converts an asize into the largest psize that can safely be written 2263 * to an allocation of that size for this vdev. 2264 * 2265 * Note that this function will not take into account the effect of gang 2266 * headers, which also modify the ASIZE of the DVAs. 
It is purely a reverse of 2267 * the psize_to_asize function. 2268 */ 2269 static uint64_t 2270 vdev_raidz_asize_to_psize(vdev_t *vd, uint64_t asize, uint64_t txg) 2271 { 2272 vdev_raidz_t *vdrz = vd->vdev_tsd; 2273 uint64_t psize; 2274 uint64_t ashift = vd->vdev_top->vdev_ashift; 2275 uint64_t nparity = vdrz->vd_nparity; 2276 2277 uint64_t cols = vdev_raidz_get_logical_width(vdrz, txg); 2278 2279 ASSERT0(asize % (1 << ashift)); 2280 2281 psize = (asize >> ashift); 2282 /* 2283 * If the roundup to nparity + 1 caused us to spill into a new row, we 2284 * need to ignore that row entirely (since it can't store data or 2285 * parity). 2286 */ 2287 uint64_t rows = psize / cols; 2288 psize = psize - (rows * cols) <= nparity ? rows * cols : psize; 2289 /* Subtract out parity sectors for each row storing data. */ 2290 psize -= nparity * DIV_ROUND_UP(psize, cols); 2291 psize <<= ashift; 2292 2293 return (psize); 2294 } 2295 2296 /* 2297 * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated 2298 * more space due to the lower data-to-parity ratio. In this case it's 2299 * important to pass in the correct txg. Note that vdev_gang_header_asize() 2300 * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE, 2301 * regardless of txg. This is assured because for a single data sector, we 2302 * allocate P+1 sectors regardless of width ("cols", which is at least P+1). 2303 */ 2304 static uint64_t 2305 vdev_raidz_psize_to_asize(vdev_t *vd, uint64_t psize, uint64_t txg) 2306 { 2307 vdev_raidz_t *vdrz = vd->vdev_tsd; 2308 uint64_t asize; 2309 uint64_t ashift = vd->vdev_top->vdev_ashift; 2310 uint64_t nparity = vdrz->vd_nparity; 2311 2312 uint64_t cols = vdev_raidz_get_logical_width(vdrz, txg); 2313 2314 asize = ((psize - 1) >> ashift) + 1; 2315 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 2316 asize = roundup(asize, nparity + 1) << ashift; 2317 2318 #ifdef ZFS_DEBUG 2319 uint64_t asize_new = ((psize - 1) >> ashift) + 1; 2320 uint64_t ncols_new = vdrz->vd_physical_width; 2321 asize_new += nparity * ((asize_new + ncols_new - nparity - 1) / 2322 (ncols_new - nparity)); 2323 asize_new = roundup(asize_new, nparity + 1) << ashift; 2324 VERIFY3U(asize_new, <=, asize); 2325 #endif 2326 2327 return (asize); 2328 } 2329 2330 /* 2331 * The allocatable space for a raidz vdev is N * sizeof(smallest child) 2332 * so each child must provide at least 1/Nth of its asize. 2333 */ 2334 static uint64_t 2335 vdev_raidz_min_asize(vdev_t *vd) 2336 { 2337 return ((vd->vdev_min_asize + vd->vdev_children - 1) / 2338 vd->vdev_children); 2339 } 2340 2341 /* 2342 * return B_TRUE if a read should be skipped due to being too slow. 2343 * 2344 * In vdev_child_slow_outlier() it looks for outliers based on disk 2345 * latency from the most recent child reads. Here we're checking if, 2346 * over time, a disk has has been an outlier too many times and is 2347 * now in a sit out period. 
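 *
 * A typical caller (see vdev_raidz_io_start_read_row() below) only marks
 * the column at this point rather than skipping it outright:
 *
 *	if (vdev_sit_out_reads(cvd, zio->io_flags)) {
 *		rr->rr_outlier_cnt++;
 *		rc->rc_latency_outlier = 1;
 *	}
 *
 * The read is actually skipped later, and only if enough parity remains
 * in the row to reconstruct the column.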
2348 */ 2349 boolean_t 2350 vdev_sit_out_reads(vdev_t *vd, zio_flag_t io_flags) 2351 { 2352 if (vdev_read_sit_out_secs == 0) 2353 return (B_FALSE); 2354 2355 /* Avoid skipping a data column read when scrubbing */ 2356 if (io_flags & ZIO_FLAG_SCRUB) 2357 return (B_FALSE); 2358 2359 if (!vd->vdev_ops->vdev_op_leaf) { 2360 boolean_t sitting = B_FALSE; 2361 for (int c = 0; c < vd->vdev_children; c++) { 2362 sitting |= vdev_sit_out_reads(vd->vdev_child[c], 2363 io_flags); 2364 } 2365 return (sitting); 2366 } 2367 2368 if (vd->vdev_read_sit_out_expire >= gethrestime_sec()) 2369 return (B_TRUE); 2370 2371 vd->vdev_read_sit_out_expire = 0; 2372 2373 return (B_FALSE); 2374 } 2375 2376 void 2377 vdev_raidz_child_done(zio_t *zio) 2378 { 2379 raidz_col_t *rc = zio->io_private; 2380 2381 ASSERT3P(rc->rc_abd, !=, NULL); 2382 rc->rc_error = zio->io_error; 2383 rc->rc_tried = 1; 2384 rc->rc_skipped = 0; 2385 } 2386 2387 static void 2388 vdev_raidz_shadow_child_done(zio_t *zio) 2389 { 2390 raidz_col_t *rc = zio->io_private; 2391 2392 rc->rc_shadow_error = zio->io_error; 2393 } 2394 2395 static void 2396 vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col) 2397 { 2398 (void) rm; 2399 #ifdef ZFS_DEBUG 2400 zfs_range_seg64_t logical_rs, physical_rs, remain_rs; 2401 logical_rs.rs_start = rr->rr_offset; 2402 logical_rs.rs_end = logical_rs.rs_start + 2403 vdev_raidz_psize_to_asize(zio->io_vd, rr->rr_size, 2404 BP_GET_PHYSICAL_BIRTH(zio->io_bp)); 2405 2406 raidz_col_t *rc = &rr->rr_col[col]; 2407 vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; 2408 2409 vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs); 2410 ASSERT(vdev_xlate_is_empty(&remain_rs)); 2411 if (vdev_xlate_is_empty(&physical_rs)) { 2412 /* 2413 * If we are in the middle of expansion, the 2414 * physical->logical mapping is changing so vdev_xlate() 2415 * can't give us a reliable answer. 2416 */ 2417 return; 2418 } 2419 ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); 2420 ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); 2421 /* 2422 * It would be nice to assert that rs_end is equal 2423 * to rc_offset + rc_size but there might be an 2424 * optional I/O at the end that is not accounted in 2425 * rc_size. 
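 *
 * The optional I/O is a single skip sector, so the slack is at most one
 * sector. As an illustrative example, with ashift = 12 (4 KiB sectors)
 * a column that is followed by a skip sector satisfies
 *
 *	physical_rs.rs_end == rc->rc_offset + rc->rc_size + 4096
 *
 * which is exactly what the first assertion below checks.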
2426 */ 2427 if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { 2428 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + 2429 rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift)); 2430 } else { 2431 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); 2432 } 2433 #endif 2434 } 2435 2436 static void 2437 vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr) 2438 { 2439 vdev_t *vd = zio->io_vd; 2440 raidz_map_t *rm = zio->io_vsd; 2441 2442 vdev_raidz_generate_parity_row(rm, rr); 2443 2444 for (int c = 0; c < rr->rr_scols; c++) { 2445 raidz_col_t *rc = &rr->rr_col[c]; 2446 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 2447 2448 /* Verify physical to logical translation */ 2449 vdev_raidz_io_verify(zio, rm, rr, c); 2450 2451 if (rc->rc_size == 0) 2452 continue; 2453 2454 ASSERT3U(rc->rc_offset + rc->rc_size, <, 2455 cvd->vdev_psize - VDEV_LABEL_END_SIZE); 2456 2457 ASSERT3P(rc->rc_abd, !=, NULL); 2458 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2459 rc->rc_offset, rc->rc_abd, 2460 abd_get_size(rc->rc_abd), zio->io_type, 2461 zio->io_priority, 0, vdev_raidz_child_done, rc)); 2462 2463 if (rc->rc_shadow_devidx != INT_MAX) { 2464 vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx]; 2465 2466 ASSERT3U( 2467 rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <, 2468 cvd2->vdev_psize - VDEV_LABEL_END_SIZE); 2469 2470 zio_nowait(zio_vdev_child_io(zio, NULL, cvd2, 2471 rc->rc_shadow_offset, rc->rc_abd, 2472 abd_get_size(rc->rc_abd), 2473 zio->io_type, zio->io_priority, 0, 2474 vdev_raidz_shadow_child_done, rc)); 2475 } 2476 } 2477 } 2478 2479 /* 2480 * Generate optional I/Os for skip sectors to improve aggregation contiguity. 2481 * This only works for vdev_raidz_map_alloc() (not _expanded()). 2482 */ 2483 static void 2484 raidz_start_skip_writes(zio_t *zio) 2485 { 2486 vdev_t *vd = zio->io_vd; 2487 uint64_t ashift = vd->vdev_top->vdev_ashift; 2488 raidz_map_t *rm = zio->io_vsd; 2489 ASSERT3U(rm->rm_nrows, ==, 1); 2490 raidz_row_t *rr = rm->rm_row[0]; 2491 for (int c = 0; c < rr->rr_scols; c++) { 2492 raidz_col_t *rc = &rr->rr_col[c]; 2493 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 2494 if (rc->rc_size != 0) 2495 continue; 2496 ASSERT0P(rc->rc_abd); 2497 2498 ASSERT3U(rc->rc_offset, <, 2499 cvd->vdev_psize - VDEV_LABEL_END_SIZE); 2500 2501 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, 2502 NULL, 1ULL << ashift, zio->io_type, zio->io_priority, 2503 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); 2504 } 2505 } 2506 2507 static void 2508 vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) 2509 { 2510 vdev_t *vd = zio->io_vd; 2511 2512 /* 2513 * Iterate over the columns in reverse order so that we hit the parity 2514 * last -- any errors along the way will force us to read the parity. 
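 *
 * (Columns [0, rr_firstdatacol) hold the parity -- P, then Q, then R --
 * and the remaining columns hold data, so walking from rr_cols - 1 down
 * to 0 visits every data column before any parity column.)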
2515 */ 2516 for (int c = rr->rr_cols - 1; c >= 0; c--) { 2517 raidz_col_t *rc = &rr->rr_col[c]; 2518 if (rc->rc_size == 0) 2519 continue; 2520 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 2521 if (!vdev_readable(cvd)) { 2522 if (c >= rr->rr_firstdatacol) 2523 rr->rr_missingdata++; 2524 else 2525 rr->rr_missingparity++; 2526 rc->rc_error = SET_ERROR(ENXIO); 2527 rc->rc_tried = 1; /* don't even try */ 2528 rc->rc_skipped = 1; 2529 continue; 2530 } 2531 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 2532 if (c >= rr->rr_firstdatacol) 2533 rr->rr_missingdata++; 2534 else 2535 rr->rr_missingparity++; 2536 rc->rc_error = SET_ERROR(ESTALE); 2537 rc->rc_skipped = 1; 2538 continue; 2539 } 2540 2541 if (vdev_sit_out_reads(cvd, zio->io_flags)) { 2542 rr->rr_outlier_cnt++; 2543 ASSERT0(rc->rc_latency_outlier); 2544 rc->rc_latency_outlier = 1; 2545 } 2546 } 2547 2548 /* 2549 * When the row contains a latency outlier and sufficient parity 2550 * exists to reconstruct the column data, then skip reading the 2551 * known slow child vdev as a performance optimization. 2552 */ 2553 if (rr->rr_outlier_cnt > 0 && 2554 (rr->rr_firstdatacol - rr->rr_missingparity) >= 2555 (rr->rr_missingdata + 1)) { 2556 2557 for (int c = rr->rr_cols - 1; c >= 0; c--) { 2558 raidz_col_t *rc = &rr->rr_col[c]; 2559 2560 if (rc->rc_error == 0 && rc->rc_latency_outlier) { 2561 if (c >= rr->rr_firstdatacol) 2562 rr->rr_missingdata++; 2563 else 2564 rr->rr_missingparity++; 2565 rc->rc_error = SET_ERROR(EAGAIN); 2566 rc->rc_skipped = 1; 2567 break; 2568 } 2569 } 2570 } 2571 2572 for (int c = rr->rr_cols - 1; c >= 0; c--) { 2573 raidz_col_t *rc = &rr->rr_col[c]; 2574 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 2575 2576 if (rc->rc_error || rc->rc_size == 0) 2577 continue; 2578 2579 if (forceparity || 2580 c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || 2581 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { 2582 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2583 rc->rc_offset, rc->rc_abd, rc->rc_size, 2584 zio->io_type, zio->io_priority, 0, 2585 vdev_raidz_child_done, rc)); 2586 } 2587 } 2588 } 2589 2590 static void 2591 vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm) 2592 { 2593 vdev_t *vd = zio->io_vd; 2594 2595 for (int i = 0; i < rm->rm_nphys_cols; i++) { 2596 raidz_col_t *prc = &rm->rm_phys_col[i]; 2597 if (prc->rc_size == 0) 2598 continue; 2599 2600 ASSERT3U(prc->rc_devidx, ==, i); 2601 vdev_t *cvd = vd->vdev_child[i]; 2602 2603 if (!vdev_readable(cvd)) { 2604 prc->rc_error = SET_ERROR(ENXIO); 2605 prc->rc_tried = 1; /* don't even try */ 2606 prc->rc_skipped = 1; 2607 continue; 2608 } 2609 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 2610 prc->rc_error = SET_ERROR(ESTALE); 2611 prc->rc_skipped = 1; 2612 continue; 2613 } 2614 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2615 prc->rc_offset, prc->rc_abd, prc->rc_size, 2616 zio->io_type, zio->io_priority, 0, 2617 vdev_raidz_child_done, prc)); 2618 } 2619 } 2620 2621 static void 2622 vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm) 2623 { 2624 /* 2625 * If there are multiple rows, we will be hitting 2626 * all disks, so go ahead and read the parity so 2627 * that we are reading in decent size chunks. 
2628 */ 2629 boolean_t forceparity = rm->rm_nrows > 1; 2630 2631 if (rm->rm_phys_col) { 2632 vdev_raidz_io_start_read_phys_cols(zio, rm); 2633 } else { 2634 for (int i = 0; i < rm->rm_nrows; i++) { 2635 raidz_row_t *rr = rm->rm_row[i]; 2636 vdev_raidz_io_start_read_row(zio, rr, forceparity); 2637 } 2638 } 2639 } 2640 2641 /* 2642 * Start an IO operation on a RAIDZ VDev 2643 * 2644 * Outline: 2645 * - For write operations: 2646 * 1. Generate the parity data 2647 * 2. Create child zio write operations to each column's vdev, for both 2648 * data and parity. 2649 * 3. If the column skips any sectors for padding, create optional dummy 2650 * write zio children for those areas to improve aggregation continuity. 2651 * - For read operations: 2652 * 1. Create child zio read operations to each data column's vdev to read 2653 * the range of data required for zio. 2654 * 2. If this is a scrub or resilver operation, or if any of the data 2655 * vdevs have had errors, then create zio read operations to the parity 2656 * columns' VDevs as well. 2657 */ 2658 static void 2659 vdev_raidz_io_start(zio_t *zio) 2660 { 2661 vdev_t *vd = zio->io_vd; 2662 vdev_t *tvd = vd->vdev_top; 2663 vdev_raidz_t *vdrz = vd->vdev_tsd; 2664 raidz_map_t *rm; 2665 2666 uint64_t logical_width = vdev_raidz_get_logical_width(vdrz, 2667 BP_GET_PHYSICAL_BIRTH(zio->io_bp)); 2668 if (logical_width != vdrz->vd_physical_width) { 2669 zfs_locked_range_t *lr = NULL; 2670 uint64_t synced_offset = UINT64_MAX; 2671 uint64_t next_offset = UINT64_MAX; 2672 boolean_t use_scratch = B_FALSE; 2673 /* 2674 * Note: when the expansion is completing, we set 2675 * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync()) 2676 * in a later txg than when we last update spa_ubsync's state 2677 * (see the end of spa_raidz_expand_thread()). Therefore we 2678 * may see vre_state!=SCANNING before 2679 * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected 2680 * on disk, but the copying progress has been synced to disk 2681 * (and reflected in spa_ubsync). In this case it's fine to 2682 * treat the expansion as completed, since if we crash there's 2683 * no additional copying to do. 2684 */ 2685 if (vdrz->vn_vre.vre_state == DSS_SCANNING) { 2686 ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==, 2687 &vdrz->vn_vre); 2688 lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock, 2689 zio->io_offset, zio->io_size, RL_READER); 2690 use_scratch = 2691 (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) == 2692 RRSS_SCRATCH_VALID); 2693 synced_offset = 2694 RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync); 2695 next_offset = vdrz->vn_vre.vre_offset; 2696 /* 2697 * If we haven't resumed expanding since importing the 2698 * pool, vre_offset won't have been set yet. In 2699 * this case the next offset to be copied is the same 2700 * as what was synced. 
2701 */ 2702 if (next_offset == UINT64_MAX) { 2703 next_offset = synced_offset; 2704 } 2705 } 2706 2707 rm = vdev_raidz_map_alloc_expanded(zio, 2708 tvd->vdev_ashift, vdrz->vd_physical_width, 2709 logical_width, vdrz->vd_nparity, 2710 synced_offset, next_offset, use_scratch); 2711 rm->rm_lr = lr; 2712 } else { 2713 rm = vdev_raidz_map_alloc(zio, 2714 tvd->vdev_ashift, logical_width, vdrz->vd_nparity); 2715 } 2716 rm->rm_original_width = vdrz->vd_original_width; 2717 2718 zio->io_vsd = rm; 2719 zio->io_vsd_ops = &vdev_raidz_vsd_ops; 2720 if (zio->io_type == ZIO_TYPE_WRITE) { 2721 for (int i = 0; i < rm->rm_nrows; i++) { 2722 vdev_raidz_io_start_write(zio, rm->rm_row[i]); 2723 } 2724 2725 if (logical_width == vdrz->vd_physical_width) { 2726 raidz_start_skip_writes(zio); 2727 } 2728 } else { 2729 ASSERT(zio->io_type == ZIO_TYPE_READ); 2730 vdev_raidz_io_start_read(zio, rm); 2731 } 2732 2733 zio_execute(zio); 2734 } 2735 2736 /* 2737 * Report a checksum error for a child of a RAID-Z device. 2738 */ 2739 void 2740 vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) 2741 { 2742 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 2743 2744 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) && 2745 zio->io_priority != ZIO_PRIORITY_REBUILD) { 2746 zio_bad_cksum_t zbc; 2747 raidz_map_t *rm = zio->io_vsd; 2748 2749 zbc.zbc_has_cksum = 0; 2750 zbc.zbc_injected = rm->rm_ecksuminjected; 2751 2752 mutex_enter(&vd->vdev_stat_lock); 2753 vd->vdev_stat.vs_checksum_errors++; 2754 mutex_exit(&vd->vdev_stat_lock); 2755 (void) zfs_ereport_post_checksum(zio->io_spa, vd, 2756 &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, 2757 rc->rc_abd, bad_data, &zbc); 2758 } 2759 } 2760 2761 /* 2762 * We keep track of whether or not there were any injected errors, so that 2763 * any ereports we generate can note it. 2764 */ 2765 static int 2766 raidz_checksum_verify(zio_t *zio) 2767 { 2768 zio_bad_cksum_t zbc = {0}; 2769 raidz_map_t *rm = zio->io_vsd; 2770 2771 int ret = zio_checksum_error(zio, &zbc); 2772 /* 2773 * Any Direct I/O read that has a checksum error must be treated as 2774 * suspicious as the contents of the buffer could be getting 2775 * manipulated while the I/O is taking place. The checksum verify error 2776 * will be reported to the top-level RAIDZ VDEV. 2777 */ 2778 if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) { 2779 zio->io_error = ret; 2780 zio->io_post |= ZIO_POST_DIO_CHKSUM_ERR; 2781 zio_dio_chksum_verify_error_report(zio); 2782 zio_checksum_verified(zio); 2783 return (0); 2784 } 2785 2786 if (ret != 0 && zbc.zbc_injected != 0) 2787 rm->rm_ecksuminjected = 1; 2788 2789 return (ret); 2790 } 2791 2792 /* 2793 * Generate the parity from the data columns. If we tried and were able to 2794 * read the parity without error, verify that the generated parity matches the 2795 * data we read. If it doesn't, we fire off a checksum error. Return the 2796 * number of such failures. 2797 */ 2798 static int 2799 raidz_parity_verify(zio_t *zio, raidz_row_t *rr) 2800 { 2801 abd_t *orig[VDEV_RAIDZ_MAXPARITY]; 2802 int c, ret = 0; 2803 raidz_map_t *rm = zio->io_vsd; 2804 raidz_col_t *rc; 2805 2806 blkptr_t *bp = zio->io_bp; 2807 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : 2808 (BP_IS_GANG(bp) ? 
ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); 2809 2810 if (checksum == ZIO_CHECKSUM_NOPARITY) 2811 return (ret); 2812 2813 for (c = 0; c < rr->rr_firstdatacol; c++) { 2814 rc = &rr->rr_col[c]; 2815 if (!rc->rc_tried || rc->rc_error != 0) 2816 continue; 2817 2818 orig[c] = rc->rc_abd; 2819 ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size); 2820 rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); 2821 } 2822 2823 /* 2824 * Verify any empty sectors are zero filled to ensure the parity 2825 * is calculated correctly even if these non-data sectors are damaged. 2826 */ 2827 if (rr->rr_nempty && rr->rr_abd_empty != NULL) 2828 ret += vdev_draid_map_verify_empty(zio, rr); 2829 2830 /* 2831 * Regenerates parity even for !tried||rc_error!=0 columns. This 2832 * isn't harmful but it does have the side effect of fixing stuff 2833 * we didn't realize was necessary (i.e. even if we return 0). 2834 */ 2835 vdev_raidz_generate_parity_row(rm, rr); 2836 2837 for (c = 0; c < rr->rr_firstdatacol; c++) { 2838 rc = &rr->rr_col[c]; 2839 2840 if (!rc->rc_tried || rc->rc_error != 0) 2841 continue; 2842 2843 if (abd_cmp(orig[c], rc->rc_abd) != 0) { 2844 vdev_raidz_checksum_error(zio, rc, orig[c]); 2845 rc->rc_error = SET_ERROR(ECKSUM); 2846 ret++; 2847 } 2848 abd_free(orig[c]); 2849 } 2850 2851 return (ret); 2852 } 2853 2854 static int 2855 vdev_raidz_worst_error(raidz_row_t *rr) 2856 { 2857 int error = 0; 2858 2859 for (int c = 0; c < rr->rr_cols; c++) { 2860 error = zio_worst_error(error, rr->rr_col[c].rc_error); 2861 error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error); 2862 } 2863 2864 return (error); 2865 } 2866 2867 /* 2868 * Find the median value from a set of n values 2869 */ 2870 static uint64_t 2871 latency_median_value(const uint64_t *data, size_t n) 2872 { 2873 uint64_t m; 2874 2875 if (n % 2 == 0) 2876 m = (data[(n >> 1) - 1] + data[n >> 1]) >> 1; 2877 else 2878 m = data[((n + 1) >> 1) - 1]; 2879 2880 return (m); 2881 } 2882 2883 /* 2884 * Calculate the outlier fence from a set of n latency values 2885 * 2886 * fence = Q3 + vdev_raidz_outlier_insensitivity x (Q3 - Q1) 2887 */ 2888 static uint64_t 2889 latency_quartiles_fence(const uint64_t *data, size_t n, uint64_t *iqr) 2890 { 2891 uint64_t q1 = latency_median_value(&data[0], n >> 1); 2892 uint64_t q3 = latency_median_value(&data[(n + 1) >> 1], n >> 1); 2893 2894 /* 2895 * To avoid detecting false positive outliers when N is small and 2896 * and the latencies values are very close, make sure the IQR 2897 * is at least 25% larger than Q1. 
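 *
 * In other words, the IQR is clamped to be no smaller than a quarter of
 * Q1 (the q1 / 4 term below). As a purely illustrative example, with six
 * sorted per-child average latencies of
 *
 *	{ 100, 110, 120, 130, 140, 10000 }
 *
 * we get Q1 = 110 and Q3 = 140, so *iqr = MAX(140 - 110, 110 / 4) = 30
 * and, with an insensitivity of 50, fence = 140 + 30 * 50 = 1640 --
 * only the 10000 entry lands beyond the fence.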
2898 */ 2899 *iqr = MAX(q3 - q1, q1 / 4); 2900 2901 return (q3 + (*iqr * vdev_raidz_outlier_insensitivity)); 2902 } 2903 #define LAT_CHILDREN_MIN 5 2904 #define LAT_OUTLIER_LIMIT 20 2905 2906 static int 2907 latency_compare(const void *arg1, const void *arg2) 2908 { 2909 const uint64_t *l1 = (uint64_t *)arg1; 2910 const uint64_t *l2 = (uint64_t *)arg2; 2911 2912 return (TREE_CMP(*l1, *l2)); 2913 } 2914 2915 void 2916 vdev_raidz_sit_child(vdev_t *svd, uint64_t secs) 2917 { 2918 for (int c = 0; c < svd->vdev_children; c++) 2919 vdev_raidz_sit_child(svd->vdev_child[c], secs); 2920 2921 if (!svd->vdev_ops->vdev_op_leaf) 2922 return; 2923 2924 /* Begin a sit out period for this slow drive */ 2925 svd->vdev_read_sit_out_expire = gethrestime_sec() + 2926 secs; 2927 2928 /* Count each slow io period */ 2929 mutex_enter(&svd->vdev_stat_lock); 2930 svd->vdev_stat.vs_slow_ios++; 2931 mutex_exit(&svd->vdev_stat_lock); 2932 } 2933 2934 void 2935 vdev_raidz_unsit_child(vdev_t *vd) 2936 { 2937 for (int c = 0; c < vd->vdev_children; c++) 2938 vdev_raidz_unsit_child(vd->vdev_child[c]); 2939 2940 if (!vd->vdev_ops->vdev_op_leaf) 2941 return; 2942 2943 vd->vdev_read_sit_out_expire = 0; 2944 } 2945 2946 /* 2947 * Check for any latency outlier from latest set of child reads. 2948 * 2949 * Uses a Tukey's fence, with K = 50, for detecting extreme outliers. This 2950 * rule defines extreme outliers as data points outside the fence of the 2951 * third quartile plus fifty times the Interquartile Range (IQR). This range 2952 * is the distance between the first and third quartile. 2953 * 2954 * Fifty is an extremely large value for Tukey's fence, but the outliers we're 2955 * attempting to detect here are orders of magnitude times larger than the 2956 * median. This large value should capture any truly fault disk quickly, 2957 * without causing spurious sit-outs. 2958 * 2959 * To further avoid spurious sit-outs, vdevs must be detected multiple times 2960 * as an outlier before they are sat, and outlier counts will gradually decay. 2961 * Every nchildren times we have detected an outlier, we subtract 2 from the 2962 * outlier count of all children. If detected outliers are close to uniformly 2963 * distributed, this will result in the outlier count remaining close to 0 2964 * (in expectation; over long enough time-scales, spurious sit-outs are still 2965 * possible). 2966 */ 2967 static void 2968 vdev_child_slow_outlier(zio_t *zio) 2969 { 2970 vdev_t *vd = zio->io_vd; 2971 if (!vd->vdev_autosit || vdev_read_sit_out_secs == 0 || 2972 vd->vdev_children < LAT_CHILDREN_MIN) 2973 return; 2974 2975 hrtime_t now = getlrtime(); 2976 uint64_t last = atomic_load_64(&vd->vdev_last_latency_check); 2977 2978 if ((now - last) < MSEC2NSEC(vdev_raidz_outlier_check_interval_ms)) 2979 return; 2980 2981 /* Allow a single winner when there are racing callers. 
*/ 2982 if (atomic_cas_64(&vd->vdev_last_latency_check, last, now) != last) 2983 return; 2984 2985 int children = vd->vdev_children; 2986 uint64_t *lat_data = kmem_alloc(sizeof (uint64_t) * children, KM_SLEEP); 2987 2988 for (int c = 0; c < children; c++) { 2989 vdev_t *cvd = vd->vdev_child[c]; 2990 if (cvd->vdev_prev_histo == NULL) { 2991 mutex_enter(&cvd->vdev_stat_lock); 2992 size_t size = 2993 sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]); 2994 cvd->vdev_prev_histo = kmem_zalloc(size, KM_SLEEP); 2995 memcpy(cvd->vdev_prev_histo, 2996 cvd->vdev_stat_ex.vsx_disk_histo[ZIO_TYPE_READ], 2997 size); 2998 mutex_exit(&cvd->vdev_stat_lock); 2999 } 3000 } 3001 uint64_t max = 0; 3002 vdev_t *svd = NULL; 3003 uint_t sitouts = 0; 3004 boolean_t skip = B_FALSE, svd_sitting = B_FALSE; 3005 for (int c = 0; c < children; c++) { 3006 vdev_t *cvd = vd->vdev_child[c]; 3007 boolean_t sitting = vdev_sit_out_reads(cvd, 0) || 3008 cvd->vdev_state != VDEV_STATE_HEALTHY; 3009 3010 /* We can't sit out more disks than we have parity */ 3011 if (sitting && ++sitouts >= vdev_get_nparity(vd)) 3012 skip = B_TRUE; 3013 3014 mutex_enter(&cvd->vdev_stat_lock); 3015 3016 uint64_t *prev_histo = cvd->vdev_prev_histo; 3017 uint64_t *histo = 3018 cvd->vdev_stat_ex.vsx_disk_histo[ZIO_TYPE_READ]; 3019 if (skip) { 3020 size_t size = 3021 sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]); 3022 memcpy(prev_histo, histo, size); 3023 mutex_exit(&cvd->vdev_stat_lock); 3024 continue; 3025 } 3026 uint64_t count = 0; 3027 lat_data[c] = 0; 3028 for (int i = 0; i < VDEV_L_HISTO_BUCKETS; i++) { 3029 uint64_t this_count = histo[i] - prev_histo[i]; 3030 lat_data[c] += (1ULL << i) * this_count; 3031 count += this_count; 3032 } 3033 size_t size = sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]); 3034 memcpy(prev_histo, histo, size); 3035 mutex_exit(&cvd->vdev_stat_lock); 3036 lat_data[c] /= MAX(1, count); 3037 3038 /* Wait until all disks have been read from */ 3039 if (lat_data[c] == 0 && !sitting) { 3040 skip = B_TRUE; 3041 continue; 3042 } 3043 3044 /* Keep track of the vdev with largest value */ 3045 if (lat_data[c] > max) { 3046 max = lat_data[c]; 3047 svd = cvd; 3048 svd_sitting = sitting; 3049 } 3050 } 3051 3052 if (skip) { 3053 kmem_free(lat_data, sizeof (uint64_t) * children); 3054 return; 3055 } 3056 3057 qsort((void *)lat_data, children, sizeof (uint64_t), latency_compare); 3058 3059 uint64_t iqr; 3060 uint64_t fence = latency_quartiles_fence(lat_data, children, &iqr); 3061 3062 ASSERT3U(lat_data[children - 1], ==, max); 3063 if (max > fence && !svd_sitting) { 3064 ASSERT3U(iqr, >, 0); 3065 uint64_t incr = MAX(1, MIN((max - fence) / iqr, 3066 LAT_OUTLIER_LIMIT / 4)); 3067 vd->vdev_outlier_count += incr; 3068 if (vd->vdev_outlier_count >= children) { 3069 for (int c = 0; c < children; c++) { 3070 vdev_t *cvd = vd->vdev_child[c]; 3071 cvd->vdev_outlier_count -= 2; 3072 cvd->vdev_outlier_count = MAX(0, 3073 cvd->vdev_outlier_count); 3074 } 3075 vd->vdev_outlier_count = 0; 3076 } 3077 /* 3078 * Keep track of how many times this child has had 3079 * an outlier read. A disk that persitently has a 3080 * higher than peers outlier count will be considered 3081 * a slow disk. 
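 *
 * For scale (illustrative, and ignoring the decay applied above): the
 * increment is clamped to LAT_OUTLIER_LIMIT / 4 == 5, so even a child
 * whose latency is far beyond the fence must be flagged on at least
 * five separate checks before its count exceeds LAT_OUTLIER_LIMIT (20)
 * and a sit out begins.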
3082 */ 3083 svd->vdev_outlier_count += incr; 3084 if (svd->vdev_outlier_count > LAT_OUTLIER_LIMIT) { 3085 ASSERT0(svd->vdev_read_sit_out_expire); 3086 vdev_raidz_sit_child(svd, vdev_read_sit_out_secs); 3087 (void) zfs_ereport_post(FM_EREPORT_ZFS_SITOUT, 3088 zio->io_spa, svd, NULL, NULL, 0); 3089 vdev_dbgmsg(svd, "begin read sit out for %d secs", 3090 (int)vdev_read_sit_out_secs); 3091 3092 for (int c = 0; c < vd->vdev_children; c++) 3093 vd->vdev_child[c]->vdev_outlier_count = 0; 3094 } 3095 } 3096 3097 kmem_free(lat_data, sizeof (uint64_t) * children); 3098 } 3099 3100 static void 3101 vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) 3102 { 3103 int unexpected_errors = 0; 3104 int parity_errors = 0; 3105 int parity_untried = 0; 3106 int data_errors = 0; 3107 3108 ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); 3109 3110 for (int c = 0; c < rr->rr_cols; c++) { 3111 raidz_col_t *rc = &rr->rr_col[c]; 3112 3113 if (rc->rc_error) { 3114 if (c < rr->rr_firstdatacol) 3115 parity_errors++; 3116 else 3117 data_errors++; 3118 3119 if (!rc->rc_skipped) 3120 unexpected_errors++; 3121 } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { 3122 parity_untried++; 3123 } 3124 3125 if (rc->rc_force_repair) 3126 unexpected_errors++; 3127 } 3128 3129 /* 3130 * If we read more parity disks than were used for 3131 * reconstruction, confirm that the other parity disks produced 3132 * correct data. 3133 * 3134 * Note that we also regenerate parity when resilvering so we 3135 * can write it out to failed devices later. 3136 */ 3137 if (parity_errors + parity_untried < 3138 rr->rr_firstdatacol - data_errors || 3139 (zio->io_flags & ZIO_FLAG_RESILVER)) { 3140 int n = raidz_parity_verify(zio, rr); 3141 unexpected_errors += n; 3142 } 3143 3144 if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 3145 (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) { 3146 /* 3147 * Use the good data we have in hand to repair damaged children. 3148 */ 3149 for (int c = 0; c < rr->rr_cols; c++) { 3150 raidz_col_t *rc = &rr->rr_col[c]; 3151 vdev_t *vd = zio->io_vd; 3152 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 3153 3154 if (!rc->rc_allow_repair) { 3155 continue; 3156 } else if (!rc->rc_force_repair && 3157 (rc->rc_error == 0 || rc->rc_size == 0)) { 3158 continue; 3159 } 3160 /* 3161 * We do not allow self healing for Direct I/O reads. 3162 * See comment in vdev_raid_row_alloc(). 3163 */ 3164 ASSERT0(zio->io_flags & ZIO_FLAG_DIO_READ); 3165 3166 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 3167 rc->rc_offset, rc->rc_abd, rc->rc_size, 3168 ZIO_TYPE_WRITE, 3169 zio->io_priority == ZIO_PRIORITY_REBUILD ? 3170 ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE, 3171 ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 3172 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); 3173 } 3174 } 3175 3176 /* 3177 * Scrub or resilver i/o's: overwrite any shadow locations with the 3178 * good data. This ensures that if we've already copied this sector, 3179 * it will be corrected if it was damaged. This writes more than is 3180 * necessary, but since expansion is paused during scrub/resilver, at 3181 * most a single row will have a shadow location. 
3182 */ 3183 if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 3184 (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) { 3185 for (int c = 0; c < rr->rr_cols; c++) { 3186 raidz_col_t *rc = &rr->rr_col[c]; 3187 vdev_t *vd = zio->io_vd; 3188 3189 if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0) 3190 continue; 3191 vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx]; 3192 3193 /* 3194 * Note: We don't want to update the repair stats 3195 * because that would incorrectly indicate that there 3196 * was bad data to repair, which we aren't sure about. 3197 * By clearing the SCAN_THREAD flag, we prevent this 3198 * from happening, despite having the REPAIR flag set. 3199 * We need to set SELF_HEAL so that this i/o can't be 3200 * bypassed by zio_vdev_io_start(). 3201 */ 3202 zio_t *cio = zio_vdev_child_io(zio, NULL, cvd, 3203 rc->rc_shadow_offset, rc->rc_abd, rc->rc_size, 3204 ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, 3205 ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, 3206 NULL, NULL); 3207 cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD; 3208 zio_nowait(cio); 3209 } 3210 } 3211 } 3212 3213 static void 3214 raidz_restore_orig_data(raidz_map_t *rm) 3215 { 3216 for (int i = 0; i < rm->rm_nrows; i++) { 3217 raidz_row_t *rr = rm->rm_row[i]; 3218 for (int c = 0; c < rr->rr_cols; c++) { 3219 raidz_col_t *rc = &rr->rr_col[c]; 3220 if (rc->rc_need_orig_restore) { 3221 abd_copy(rc->rc_abd, 3222 rc->rc_orig_data, rc->rc_size); 3223 rc->rc_need_orig_restore = B_FALSE; 3224 } 3225 } 3226 } 3227 } 3228 3229 /* 3230 * During raidz_reconstruct() for expanded VDEV, we need special consideration 3231 * failure simulations. See note in raidz_reconstruct() on simulating failure 3232 * of a pre-expansion device. 3233 * 3234 * Treating logical child i as failed, return TRUE if the given column should 3235 * be treated as failed. The idea of logical children allows us to imagine 3236 * that a disk silently failed before a RAIDZ expansion (reads from this disk 3237 * succeed but return the wrong data). Since the expansion doesn't verify 3238 * checksums, the incorrect data will be moved to new locations spread among 3239 * the children (going diagonally across them). 3240 * 3241 * Higher "logical child failures" (values of `i`) indicate these 3242 * "pre-expansion failures". The first physical_width values imagine that a 3243 * current child failed; the next physical_width-1 values imagine that a 3244 * child failed before the most recent expansion; the next physical_width-2 3245 * values imagine a child failed in the expansion before that, etc. 3246 */ 3247 static boolean_t 3248 raidz_simulate_failure(int physical_width, int original_width, int ashift, 3249 int i, raidz_col_t *rc) 3250 { 3251 uint64_t sector_id = 3252 physical_width * (rc->rc_offset >> ashift) + 3253 rc->rc_devidx; 3254 3255 for (int w = physical_width; w >= original_width; w--) { 3256 if (i < w) { 3257 return (sector_id % w == i); 3258 } else { 3259 i -= w; 3260 } 3261 } 3262 ASSERT(!"invalid logical child id"); 3263 return (B_FALSE); 3264 } 3265 3266 /* 3267 * returns EINVAL if reconstruction of the block will not be possible 3268 * returns ECKSUM if this specific reconstruction failed 3269 * returns 0 on successful reconstruction 3270 */ 3271 static int 3272 raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) 3273 { 3274 raidz_map_t *rm = zio->io_vsd; 3275 int physical_width = zio->io_vd->vdev_children; 3276 int original_width = (rm->rm_original_width != 0) ? 
3277 rm->rm_original_width : physical_width; 3278 int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT; 3279 3280 if (dbgmsg) { 3281 zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u " 3282 "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts); 3283 } 3284 3285 /* Reconstruct each row */ 3286 for (int r = 0; r < rm->rm_nrows; r++) { 3287 raidz_row_t *rr = rm->rm_row[r]; 3288 int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */ 3289 int t = 0; 3290 int dead = 0; 3291 int dead_data = 0; 3292 3293 if (dbgmsg) 3294 zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r); 3295 3296 for (int c = 0; c < rr->rr_cols; c++) { 3297 raidz_col_t *rc = &rr->rr_col[c]; 3298 ASSERT0(rc->rc_need_orig_restore); 3299 if (rc->rc_error != 0) { 3300 dead++; 3301 if (c >= nparity) 3302 dead_data++; 3303 continue; 3304 } 3305 if (rc->rc_size == 0) 3306 continue; 3307 for (int lt = 0; lt < ntgts; lt++) { 3308 if (raidz_simulate_failure(physical_width, 3309 original_width, 3310 zio->io_vd->vdev_top->vdev_ashift, 3311 ltgts[lt], rc)) { 3312 if (rc->rc_orig_data == NULL) { 3313 rc->rc_orig_data = 3314 abd_alloc_linear( 3315 rc->rc_size, B_TRUE); 3316 abd_copy(rc->rc_orig_data, 3317 rc->rc_abd, rc->rc_size); 3318 } 3319 rc->rc_need_orig_restore = B_TRUE; 3320 3321 dead++; 3322 if (c >= nparity) 3323 dead_data++; 3324 /* 3325 * Note: simulating failure of a 3326 * pre-expansion device can hit more 3327 * than one column, in which case we 3328 * might try to simulate more failures 3329 * than can be reconstructed, which is 3330 * also more than the size of my_tgts. 3331 * This check prevents accessing past 3332 * the end of my_tgts. The "dead > 3333 * nparity" check below will fail this 3334 * reconstruction attempt. 3335 */ 3336 if (t < VDEV_RAIDZ_MAXPARITY) { 3337 my_tgts[t++] = c; 3338 if (dbgmsg) { 3339 zfs_dbgmsg("simulating " 3340 "failure of col %u " 3341 "devidx %u", c, 3342 (int)rc->rc_devidx); 3343 } 3344 } 3345 break; 3346 } 3347 } 3348 } 3349 if (dead > nparity) { 3350 /* reconstruction not possible */ 3351 if (dbgmsg) { 3352 zfs_dbgmsg("reconstruction not possible; " 3353 "too many failures"); 3354 } 3355 raidz_restore_orig_data(rm); 3356 return (EINVAL); 3357 } 3358 if (dead_data > 0) 3359 vdev_raidz_reconstruct_row(rm, rr, my_tgts, t); 3360 } 3361 3362 /* Check for success */ 3363 if (raidz_checksum_verify(zio) == 0) { 3364 if (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR) 3365 return (0); 3366 3367 /* Reconstruction succeeded - report errors */ 3368 for (int i = 0; i < rm->rm_nrows; i++) { 3369 raidz_row_t *rr = rm->rm_row[i]; 3370 3371 for (int c = 0; c < rr->rr_cols; c++) { 3372 raidz_col_t *rc = &rr->rr_col[c]; 3373 if (rc->rc_need_orig_restore) { 3374 /* 3375 * Note: if this is a parity column, 3376 * we don't really know if it's wrong. 3377 * We need to let 3378 * vdev_raidz_io_done_verified() check 3379 * it, and if we set rc_error, it will 3380 * think that it is a "known" error 3381 * that doesn't need to be checked 3382 * or corrected. 
3383 */ 3384 if (rc->rc_error == 0 && 3385 c >= rr->rr_firstdatacol) { 3386 vdev_raidz_checksum_error(zio, 3387 rc, rc->rc_orig_data); 3388 rc->rc_error = 3389 SET_ERROR(ECKSUM); 3390 } 3391 rc->rc_need_orig_restore = B_FALSE; 3392 } 3393 } 3394 3395 vdev_raidz_io_done_verified(zio, rr); 3396 } 3397 3398 zio_checksum_verified(zio); 3399 3400 if (dbgmsg) { 3401 zfs_dbgmsg("reconstruction successful " 3402 "(checksum verified)"); 3403 } 3404 return (0); 3405 } 3406 3407 /* Reconstruction failed - restore original data */ 3408 raidz_restore_orig_data(rm); 3409 if (dbgmsg) { 3410 zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum " 3411 "failed", zio); 3412 } 3413 return (ECKSUM); 3414 } 3415 3416 /* 3417 * Iterate over all combinations of N bad vdevs and attempt a reconstruction. 3418 * Note that the algorithm below is non-optimal because it doesn't take into 3419 * account how reconstruction is actually performed. For example, with 3420 * triple-parity RAID-Z the reconstruction procedure is the same if column 4 3421 * is targeted as invalid as if columns 1 and 4 are targeted since in both 3422 * cases we'd only use parity information in column 0. 3423 * 3424 * The order that we find the various possible combinations of failed 3425 * disks is dictated by these rules: 3426 * - Examine each "slot" (the "i" in tgts[i]) 3427 * - Try to increment this slot (tgts[i] += 1) 3428 * - if we can't increment because it runs into the next slot, 3429 * reset our slot to the minimum, and examine the next slot 3430 * 3431 * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose 3432 * 3 columns to reconstruct), we will generate the following sequence: 3433 * 3434 * STATE ACTION 3435 * 0 1 2 special case: skip since these are all parity 3436 * 0 1 3 first slot: reset to 0; middle slot: increment to 2 3437 * 0 2 3 first slot: increment to 1 3438 * 1 2 3 first: reset to 0; middle: reset to 1; last: increment to 4 3439 * 0 1 4 first: reset to 0; middle: increment to 2 3440 * 0 2 4 first: increment to 1 3441 * 1 2 4 first: reset to 0; middle: increment to 3 3442 * 0 3 4 first: increment to 1 3443 * 1 3 4 first: increment to 2 3444 * 2 3 4 first: reset to 0; middle: reset to 1; last: increment to 5 3445 * 0 1 5 first: reset to 0; middle: increment to 2 3446 * 0 2 5 first: increment to 1 3447 * 1 2 5 first: reset to 0; middle: increment to 3 3448 * 0 3 5 first: increment to 1 3449 * 1 3 5 first: increment to 2 3450 * 2 3 5 first: reset to 0; middle: increment to 4 3451 * 0 4 5 first: increment to 1 3452 * 1 4 5 first: increment to 2 3453 * 2 4 5 first: increment to 3 3454 * 3 4 5 done 3455 * 3456 * This strategy works for dRAID but is less efficient when there are a large 3457 * number of child vdevs and therefore permutations to check. Furthermore, 3458 * since the raidz_map_t rows likely do not overlap, reconstruction would be 3459 * possible as long as there are no more than nparity data errors per row. 3460 * These additional permutations are not currently checked but could be as 3461 * a future improvement. 3462 * 3463 * Returns 0 on success, ECKSUM on failure. 3464 */ 3465 static int 3466 vdev_raidz_combrec(zio_t *zio) 3467 { 3468 int nparity = vdev_get_nparity(zio->io_vd); 3469 raidz_map_t *rm = zio->io_vsd; 3470 int physical_width = zio->io_vd->vdev_children; 3471 int original_width = (rm->rm_original_width != 0) ? 
3472 rm->rm_original_width : physical_width; 3473 3474 for (int i = 0; i < rm->rm_nrows; i++) { 3475 raidz_row_t *rr = rm->rm_row[i]; 3476 int total_errors = 0; 3477 3478 for (int c = 0; c < rr->rr_cols; c++) { 3479 if (rr->rr_col[c].rc_error) 3480 total_errors++; 3481 } 3482 3483 if (total_errors > nparity) 3484 return (vdev_raidz_worst_error(rr)); 3485 } 3486 3487 for (int num_failures = 1; num_failures <= nparity; num_failures++) { 3488 int tstore[VDEV_RAIDZ_MAXPARITY + 2]; 3489 int *ltgts = &tstore[1]; /* value is logical child ID */ 3490 3491 3492 /* 3493 * Determine number of logical children, n. See comment 3494 * above raidz_simulate_failure(). 3495 */ 3496 int n = 0; 3497 for (int w = physical_width; 3498 w >= original_width; w--) { 3499 n += w; 3500 } 3501 3502 ASSERT3U(num_failures, <=, nparity); 3503 ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY); 3504 3505 /* Handle corner cases in combrec logic */ 3506 ltgts[-1] = -1; 3507 for (int i = 0; i < num_failures; i++) { 3508 ltgts[i] = i; 3509 } 3510 ltgts[num_failures] = n; 3511 3512 for (;;) { 3513 int err = raidz_reconstruct(zio, ltgts, num_failures, 3514 nparity); 3515 if (err == EINVAL) { 3516 /* 3517 * Reconstruction not possible with this # 3518 * failures; try more failures. 3519 */ 3520 break; 3521 } else if (err == 0) 3522 return (0); 3523 3524 /* Compute next targets to try */ 3525 for (int t = 0; ; t++) { 3526 ASSERT3U(t, <, num_failures); 3527 ltgts[t]++; 3528 if (ltgts[t] == n) { 3529 /* try more failures */ 3530 ASSERT3U(t, ==, num_failures - 1); 3531 if (zfs_flags & 3532 ZFS_DEBUG_RAIDZ_RECONSTRUCT) { 3533 zfs_dbgmsg("reconstruction " 3534 "failed for num_failures=" 3535 "%u; tried all " 3536 "combinations", 3537 num_failures); 3538 } 3539 break; 3540 } 3541 3542 ASSERT3U(ltgts[t], <, n); 3543 ASSERT3U(ltgts[t], <=, ltgts[t + 1]); 3544 3545 /* 3546 * If that spot is available, we're done here. 3547 * Try the next combination. 3548 */ 3549 if (ltgts[t] != ltgts[t + 1]) 3550 break; // found next combination 3551 3552 /* 3553 * Otherwise, reset this tgt to the minimum, 3554 * and move on to the next tgt. 3555 */ 3556 ltgts[t] = ltgts[t - 1] + 1; 3557 ASSERT3U(ltgts[t], ==, t); 3558 } 3559 3560 /* Increase the number of failures and keep trying. */ 3561 if (ltgts[num_failures - 1] == n) 3562 break; 3563 } 3564 } 3565 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 3566 zfs_dbgmsg("reconstruction failed for all num_failures"); 3567 return (ECKSUM); 3568 } 3569 3570 void 3571 vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) 3572 { 3573 for (uint64_t row = 0; row < rm->rm_nrows; row++) { 3574 raidz_row_t *rr = rm->rm_row[row]; 3575 vdev_raidz_reconstruct_row(rm, rr, t, nt); 3576 } 3577 } 3578 3579 /* 3580 * Complete a write IO operation on a RAIDZ VDev 3581 * 3582 * Outline: 3583 * 1. Check for errors on the child IOs. 3584 * 2. Return, setting an error code if too few child VDevs were written 3585 * to reconstruct the data later. Note that partial writes are 3586 * considered successful if they can be reconstructed at all. 
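 *
 * For example, a raidz2 row (rr_firstdatacol == 2) that fails to
 * write two of its columns is still reported as a successful write,
 * since the surviving columns are sufficient to reconstruct the data;
 * a third failed column makes the zio report an error.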
3587 */ 3588 static void 3589 vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) 3590 { 3591 int normal_errors = 0; 3592 int shadow_errors = 0; 3593 3594 ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); 3595 ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); 3596 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); 3597 3598 for (int c = 0; c < rr->rr_cols; c++) { 3599 raidz_col_t *rc = &rr->rr_col[c]; 3600 3601 if (rc->rc_error != 0) { 3602 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 3603 normal_errors++; 3604 } 3605 if (rc->rc_shadow_error != 0) { 3606 ASSERT(rc->rc_shadow_error != ECKSUM); 3607 shadow_errors++; 3608 } 3609 } 3610 3611 /* 3612 * Treat partial writes as a success. If we couldn't write enough 3613 * columns to reconstruct the data, the I/O failed. Otherwise, good 3614 * enough. Note that in the case of a shadow write (during raidz 3615 * expansion), depending on if we crash, either the normal (old) or 3616 * shadow (new) location may become the "real" version of the block, 3617 * so both locations must have sufficient redundancy. 3618 * 3619 * Now that we support write reallocation, it would be better 3620 * to treat partial failure as real failure unless there are 3621 * no non-degraded top-level vdevs left, and not update DTLs 3622 * if we intend to reallocate. 3623 */ 3624 if (normal_errors > rr->rr_firstdatacol || 3625 shadow_errors > rr->rr_firstdatacol) { 3626 zio->io_error = zio_worst_error(zio->io_error, 3627 vdev_raidz_worst_error(rr)); 3628 } 3629 } 3630 3631 static void 3632 vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm, 3633 raidz_row_t *rr) 3634 { 3635 int parity_errors = 0; 3636 int parity_untried = 0; 3637 int data_errors = 0; 3638 int total_errors = 0; 3639 3640 ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); 3641 ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); 3642 3643 for (int c = 0; c < rr->rr_cols; c++) { 3644 raidz_col_t *rc = &rr->rr_col[c]; 3645 3646 /* 3647 * If scrubbing and a replacing/sparing child vdev determined 3648 * that not all of its children have an identical copy of the 3649 * data, then clear the error so the column is treated like 3650 * any other read and force a repair to correct the damage. 3651 */ 3652 if (rc->rc_error == ECKSUM) { 3653 ASSERT(zio->io_flags & ZIO_FLAG_SCRUB); 3654 vdev_raidz_checksum_error(zio, rc, rc->rc_abd); 3655 rc->rc_force_repair = 1; 3656 rc->rc_error = 0; 3657 } 3658 3659 if (rc->rc_error) { 3660 if (c < rr->rr_firstdatacol) 3661 parity_errors++; 3662 else 3663 data_errors++; 3664 3665 total_errors++; 3666 } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { 3667 parity_untried++; 3668 } 3669 } 3670 3671 /* 3672 * If there were data errors and the number of errors we saw was 3673 * correctable -- less than or equal to the number of parity disks read 3674 * -- reconstruct based on the missing data. 3675 */ 3676 if (data_errors != 0 && 3677 total_errors <= rr->rr_firstdatacol - parity_untried) { 3678 /* 3679 * We either attempt to read all the parity columns or 3680 * none of them. If we didn't try to read parity, we 3681 * wouldn't be here in the correctable case. There must 3682 * also have been fewer parity errors than parity 3683 * columns or, again, we wouldn't be in this code path. 3684 */ 3685 ASSERT0(parity_untried); 3686 ASSERT(parity_errors < rr->rr_firstdatacol); 3687 3688 /* 3689 * Identify the data columns that reported an error. 
3690 */ 3691 int n = 0; 3692 int tgts[VDEV_RAIDZ_MAXPARITY]; 3693 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 3694 raidz_col_t *rc = &rr->rr_col[c]; 3695 if (rc->rc_error != 0) { 3696 ASSERT(n < VDEV_RAIDZ_MAXPARITY); 3697 tgts[n++] = c; 3698 } 3699 } 3700 3701 ASSERT(rr->rr_firstdatacol >= n); 3702 3703 vdev_raidz_reconstruct_row(rm, rr, tgts, n); 3704 } 3705 } 3706 3707 /* 3708 * Return the number of reads issued. 3709 */ 3710 static int 3711 vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr) 3712 { 3713 vdev_t *vd = zio->io_vd; 3714 int nread = 0; 3715 3716 rr->rr_missingdata = 0; 3717 rr->rr_missingparity = 0; 3718 3719 /* 3720 * If this rows contains empty sectors which are not required 3721 * for a normal read then allocate an ABD for them now so they 3722 * may be read, verified, and any needed repairs performed. 3723 */ 3724 if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL) 3725 vdev_draid_map_alloc_empty(zio, rr); 3726 3727 for (int c = 0; c < rr->rr_cols; c++) { 3728 raidz_col_t *rc = &rr->rr_col[c]; 3729 if (rc->rc_tried || rc->rc_size == 0) 3730 continue; 3731 3732 zio_nowait(zio_vdev_child_io(zio, NULL, 3733 vd->vdev_child[rc->rc_devidx], 3734 rc->rc_offset, rc->rc_abd, rc->rc_size, 3735 zio->io_type, zio->io_priority, 0, 3736 vdev_raidz_child_done, rc)); 3737 nread++; 3738 } 3739 return (nread); 3740 } 3741 3742 /* 3743 * We're here because either there were too many errors to even attempt 3744 * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec() 3745 * failed. In either case, there is enough bad data to prevent reconstruction. 3746 * Start checksum ereports for all children which haven't failed. 3747 */ 3748 static void 3749 vdev_raidz_io_done_unrecoverable(zio_t *zio) 3750 { 3751 raidz_map_t *rm = zio->io_vsd; 3752 3753 for (int i = 0; i < rm->rm_nrows; i++) { 3754 raidz_row_t *rr = rm->rm_row[i]; 3755 3756 for (int c = 0; c < rr->rr_cols; c++) { 3757 raidz_col_t *rc = &rr->rr_col[c]; 3758 vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; 3759 3760 if (rc->rc_error != 0) 3761 continue; 3762 3763 zio_bad_cksum_t zbc; 3764 zbc.zbc_has_cksum = 0; 3765 zbc.zbc_injected = rm->rm_ecksuminjected; 3766 mutex_enter(&cvd->vdev_stat_lock); 3767 cvd->vdev_stat.vs_checksum_errors++; 3768 mutex_exit(&cvd->vdev_stat_lock); 3769 (void) zfs_ereport_start_checksum(zio->io_spa, 3770 cvd, &zio->io_bookmark, zio, rc->rc_offset, 3771 rc->rc_size, &zbc); 3772 } 3773 } 3774 } 3775 3776 void 3777 vdev_raidz_io_done(zio_t *zio) 3778 { 3779 raidz_map_t *rm = zio->io_vsd; 3780 3781 ASSERT(zio->io_bp != NULL); 3782 if (zio->io_type == ZIO_TYPE_WRITE) { 3783 for (int i = 0; i < rm->rm_nrows; i++) { 3784 vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]); 3785 } 3786 } else { 3787 if (rm->rm_phys_col) { 3788 /* 3789 * This is an aggregated read. Copy the data and status 3790 * from the aggregate abd's to the individual rows. 3791 */ 3792 for (int i = 0; i < rm->rm_nrows; i++) { 3793 raidz_row_t *rr = rm->rm_row[i]; 3794 3795 for (int c = 0; c < rr->rr_cols; c++) { 3796 raidz_col_t *rc = &rr->rr_col[c]; 3797 if (rc->rc_tried || rc->rc_size == 0) 3798 continue; 3799 3800 raidz_col_t *prc = 3801 &rm->rm_phys_col[rc->rc_devidx]; 3802 rc->rc_error = prc->rc_error; 3803 rc->rc_tried = prc->rc_tried; 3804 rc->rc_skipped = prc->rc_skipped; 3805 if (c >= rr->rr_firstdatacol) { 3806 /* 3807 * Note: this is slightly faster 3808 * than using abd_copy_off(). 
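 * (abd_to_buf() requires the aggregated physical ABD to be linear, so
 * we can compute the source address directly instead of iterating ABD
 * chunks for the offset.)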
3809 */ 3810 char *physbuf = abd_to_buf( 3811 prc->rc_abd); 3812 void *physloc = physbuf + 3813 rc->rc_offset - 3814 prc->rc_offset; 3815 3816 abd_copy_from_buf(rc->rc_abd, 3817 physloc, rc->rc_size); 3818 } 3819 } 3820 } 3821 } 3822 3823 for (int i = 0; i < rm->rm_nrows; i++) { 3824 raidz_row_t *rr = rm->rm_row[i]; 3825 vdev_raidz_io_done_reconstruct_known_missing(zio, 3826 rm, rr); 3827 } 3828 3829 if (raidz_checksum_verify(zio) == 0) { 3830 if (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR) 3831 goto done; 3832 3833 for (int i = 0; i < rm->rm_nrows; i++) { 3834 raidz_row_t *rr = rm->rm_row[i]; 3835 vdev_raidz_io_done_verified(zio, rr); 3836 } 3837 /* Periodically check for a read outlier */ 3838 if (zio->io_type == ZIO_TYPE_READ) 3839 vdev_child_slow_outlier(zio); 3840 zio_checksum_verified(zio); 3841 } else { 3842 /* 3843 * A sequential resilver has no checksum which makes 3844 * combinatoral reconstruction impossible. This code 3845 * path is unreachable since raidz_checksum_verify() 3846 * has no checksum to verify and must succeed. 3847 */ 3848 ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD); 3849 3850 /* 3851 * This isn't a typical situation -- either we got a 3852 * read error or a child silently returned bad data. 3853 * Read every block so we can try again with as much 3854 * data and parity as we can track down. If we've 3855 * already been through once before, all children will 3856 * be marked as tried so we'll proceed to combinatorial 3857 * reconstruction. 3858 */ 3859 int nread = 0; 3860 for (int i = 0; i < rm->rm_nrows; i++) { 3861 nread += vdev_raidz_read_all(zio, 3862 rm->rm_row[i]); 3863 } 3864 if (nread != 0) { 3865 /* 3866 * Normally our stage is VDEV_IO_DONE, but if 3867 * we've already called redone(), it will have 3868 * changed to VDEV_IO_START, in which case we 3869 * don't want to call redone() again. 3870 */ 3871 if (zio->io_stage != ZIO_STAGE_VDEV_IO_START) 3872 zio_vdev_io_redone(zio); 3873 return; 3874 } 3875 /* 3876 * It would be too expensive to try every possible 3877 * combination of failed sectors in every row, so 3878 * instead we try every combination of failed current or 3879 * past physical disk. This means that if the incorrect 3880 * sectors were all on Nparity disks at any point in the 3881 * past, we will find the correct data. The only known 3882 * case where this is less durable than a non-expanded 3883 * RAIDZ, is if we have a silent failure during 3884 * expansion. In that case, one block could be 3885 * partially in the old format and partially in the 3886 * new format, so we'd lost some sectors from the old 3887 * format and some from the new format. 3888 * 3889 * e.g. logical_width=4 physical_width=6 3890 * the 15 (6+5+4) possible failed disks are: 3891 * width=6 child=0 3892 * width=6 child=1 3893 * width=6 child=2 3894 * width=6 child=3 3895 * width=6 child=4 3896 * width=6 child=5 3897 * width=5 child=0 3898 * width=5 child=1 3899 * width=5 child=2 3900 * width=5 child=3 3901 * width=5 child=4 3902 * width=4 child=0 3903 * width=4 child=1 3904 * width=4 child=2 3905 * width=4 child=3 3906 * And we will try every combination of Nparity of these 3907 * failing. 3908 * 3909 * As a first pass, we can generate every combo, 3910 * and try reconstructing, ignoring any known 3911 * failures. If any row has too many known + simulated 3912 * failures, then we bail on reconstructing with this 3913 * number of simulated failures. As an improvement, 3914 * we could detect the number of whole known failures 3915 * (i.e. 
we have known failures on these disks for 3916 * every row; the disks never succeeded), and 3917 * subtract that from the max # failures to simulate. 3918 * We could go even further like the current 3919 * combrec code, but that doesn't seem like it 3920 * gains us very much. If we simulate a failure 3921 * that is also a known failure, that's fine. 3922 */ 3923 zio->io_error = vdev_raidz_combrec(zio); 3924 if (zio->io_error == ECKSUM && 3925 !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 3926 vdev_raidz_io_done_unrecoverable(zio); 3927 } 3928 } 3929 } 3930 done: 3931 if (rm->rm_lr != NULL) { 3932 zfs_rangelock_exit(rm->rm_lr); 3933 rm->rm_lr = NULL; 3934 } 3935 } 3936 3937 static void 3938 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 3939 { 3940 vdev_raidz_t *vdrz = vd->vdev_tsd; 3941 if (faulted > vdrz->vd_nparity) 3942 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 3943 VDEV_AUX_NO_REPLICAS); 3944 else if (degraded + faulted != 0) 3945 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 3946 else 3947 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 3948 } 3949 3950 /* 3951 * Determine if any portion of the provided block resides on a child vdev 3952 * with a dirty DTL and therefore needs to be resilvered. The function 3953 * assumes that at least one DTL is dirty which implies that full stripe 3954 * width blocks must be resilvered. 3955 */ 3956 static boolean_t 3957 vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, 3958 uint64_t phys_birth) 3959 { 3960 vdev_raidz_t *vdrz = vd->vdev_tsd; 3961 3962 /* 3963 * If we're in the middle of a RAIDZ expansion, this block may be in 3964 * the old and/or new location. For simplicity, always resilver it. 3965 */ 3966 if (vdrz->vn_vre.vre_state == DSS_SCANNING) 3967 return (B_TRUE); 3968 3969 uint64_t dcols = vd->vdev_children; 3970 uint64_t nparity = vdrz->vd_nparity; 3971 uint64_t ashift = vd->vdev_top->vdev_ashift; 3972 /* The starting RAIDZ (parent) vdev sector of the block. */ 3973 uint64_t b = DVA_GET_OFFSET(dva) >> ashift; 3974 /* The zio's size in units of the vdev's minimum sector size. */ 3975 uint64_t s = ((psize - 1) >> ashift) + 1; 3976 /* The first column for this stripe. */ 3977 uint64_t f = b % dcols; 3978 3979 /* Unreachable by sequential resilver. */ 3980 ASSERT3U(phys_birth, !=, TXG_UNKNOWN); 3981 3982 if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) 3983 return (B_FALSE); 3984 3985 if (s + nparity >= dcols) 3986 return (B_TRUE); 3987 3988 for (uint64_t c = 0; c < s + nparity; c++) { 3989 uint64_t devidx = (f + c) % dcols; 3990 vdev_t *cvd = vd->vdev_child[devidx]; 3991 3992 /* 3993 * dsl_scan_need_resilver() already checked vd with 3994 * vdev_dtl_contains(). So here just check cvd with 3995 * vdev_dtl_empty(), cheaper and a good approximation. 3996 */ 3997 if (!vdev_dtl_empty(cvd, DTL_PARTIAL)) 3998 return (B_TRUE); 3999 } 4000 4001 return (B_FALSE); 4002 } 4003 4004 static void 4005 vdev_raidz_xlate(vdev_t *cvd, const zfs_range_seg64_t *logical_rs, 4006 zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs) 4007 { 4008 (void) remain_rs; 4009 4010 vdev_t *raidvd = cvd->vdev_parent; 4011 ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); 4012 4013 vdev_raidz_t *vdrz = raidvd->vdev_tsd; 4014 4015 if (vdrz->vn_vre.vre_state == DSS_SCANNING) { 4016 /* 4017 * We're in the middle of expansion, in which case the 4018 * translation is in flux. Any answer we give may be wrong 4019 * by the time we return, so it isn't safe for the caller to 4020 * act on it. 
Therefore we say that this range isn't present 4021 * on any children. The only consumers of this are "zpool 4022 * initialize" and trimming, both of which are "best effort" 4023 * anyway. 4024 */ 4025 physical_rs->rs_start = physical_rs->rs_end = 0; 4026 remain_rs->rs_start = remain_rs->rs_end = 0; 4027 return; 4028 } 4029 4030 uint64_t width = vdrz->vd_physical_width; 4031 uint64_t tgt_col = cvd->vdev_id; 4032 uint64_t ashift = raidvd->vdev_top->vdev_ashift; 4033 4034 /* make sure the offsets are block-aligned */ 4035 ASSERT0(logical_rs->rs_start % (1 << ashift)); 4036 ASSERT0(logical_rs->rs_end % (1 << ashift)); 4037 uint64_t b_start = logical_rs->rs_start >> ashift; 4038 uint64_t b_end = logical_rs->rs_end >> ashift; 4039 4040 uint64_t start_row = 0; 4041 if (b_start > tgt_col) /* avoid underflow */ 4042 start_row = ((b_start - tgt_col - 1) / width) + 1; 4043 4044 uint64_t end_row = 0; 4045 if (b_end > tgt_col) 4046 end_row = ((b_end - tgt_col - 1) / width) + 1; 4047 4048 physical_rs->rs_start = start_row << ashift; 4049 physical_rs->rs_end = end_row << ashift; 4050 4051 ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start); 4052 ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=, 4053 logical_rs->rs_end - logical_rs->rs_start); 4054 } 4055 4056 static void 4057 raidz_reflow_sync(void *arg, dmu_tx_t *tx) 4058 { 4059 spa_t *spa = arg; 4060 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 4061 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4062 4063 /* 4064 * Ensure there are no i/os to the range that is being committed. 4065 */ 4066 uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock); 4067 ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset); 4068 4069 mutex_enter(&vre->vre_lock); 4070 uint64_t new_offset = 4071 MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset); 4072 /* 4073 * We should not have committed anything that failed. 4074 */ 4075 VERIFY3U(vre->vre_failed_offset, >=, old_offset); 4076 mutex_exit(&vre->vre_lock); 4077 4078 zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, 4079 old_offset, new_offset - old_offset, 4080 RL_WRITER); 4081 4082 /* 4083 * Update the uberblock that will be written when this txg completes. 
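 * RAIDZ_REFLOW_SET() packs the reflow state and the new offset into
 * ub_raidz_reflow_info, so the persisted progress advances together
 * with this txg's uberblock.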
4084 */
4085 RAIDZ_REFLOW_SET(&spa->spa_uberblock,
4086 RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset);
4087 vre->vre_offset_pertxg[txgoff] = 0;
4088 zfs_rangelock_exit(lr);
4089
4090 mutex_enter(&vre->vre_lock);
4091 vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff];
4092 vre->vre_bytes_copied_pertxg[txgoff] = 0;
4093 mutex_exit(&vre->vre_lock);
4094
4095 vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
4096 VERIFY0(zap_update(spa->spa_meta_objset,
4097 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
4098 sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx));
4099 }
4100
4101 static void
4102 raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx)
4103 {
4104 spa_t *spa = arg;
4105 vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4106 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4107 vdev_raidz_t *vdrz = raidvd->vdev_tsd;
4108
4109 for (int i = 0; i < TXG_SIZE; i++)
4110 VERIFY0(vre->vre_offset_pertxg[i]);
4111
4112 reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
4113 re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES;
4114 re->re_logical_width = vdrz->vd_physical_width;
4115 mutex_enter(&vdrz->vd_expand_lock);
4116 avl_add(&vdrz->vd_expand_txgs, re);
4117 mutex_exit(&vdrz->vd_expand_lock);
4118
4119 vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
4120
4121 /*
4122 * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS
4123 * will get written (based on vd_expand_txgs).
4124 */
4125 vdev_config_dirty(vd);
4126
4127 /*
4128 * Before we change vre_state, the on-disk state must reflect that we
4129 * have completed all copying, so that vdev_raidz_io_start() can use
4130 * vre_state to determine if the reflow is in progress. See also the
4131 * end of spa_raidz_expand_thread().
4132 */
4133 VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==,
4134 raidvd->vdev_ms_count << raidvd->vdev_ms_shift);
4135
4136 vre->vre_end_time = gethrestime_sec();
4137 vre->vre_state = DSS_FINISHED;
4138
4139 uint64_t state = vre->vre_state;
4140 VERIFY0(zap_update(spa->spa_meta_objset,
4141 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
4142 sizeof (state), 1, &state, tx));
4143
4144 uint64_t end_time = vre->vre_end_time;
4145 VERIFY0(zap_update(spa->spa_meta_objset,
4146 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
4147 sizeof (end_time), 1, &end_time, tx));
4148
4149 spa->spa_uberblock.ub_raidz_reflow_info = 0;
4150
4151 spa_history_log_internal(spa, "raidz vdev expansion completed", tx,
4152 "%s vdev %llu new width %llu", spa_name(spa),
4153 (unsigned long long)vd->vdev_id,
4154 (unsigned long long)vd->vdev_children);
4155
4156 spa->spa_raidz_expand = NULL;
4157 raidvd->vdev_rz_expanding = B_FALSE;
4158
4159 spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
4160 spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
4161 spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
4162
4163 spa_notify_waiters(spa);
4164
4165 /*
4166 * While we're in syncing context, take the opportunity to
4167 * set up a scrub. All the data has been successfully copied
4168 * but we have not validated any checksums.
4169 */
4170 setup_sync_arg_t setup_sync_arg = {
4171 .func = POOL_SCAN_SCRUB,
4172 .txgstart = 0,
4173 .txgend = 0,
4174 };
4175 if (zfs_scrub_after_expand &&
4176 dsl_scan_setup_check(&setup_sync_arg.func, tx) == 0) {
4177 dsl_scan_setup_sync(&setup_sync_arg, tx);
4178 }
4179 }
4180
4181 /*
4182 * State of one copy batch.
4183 */
4184 typedef struct raidz_reflow_arg {
4185 vdev_raidz_expand_t *rra_vre; /* Global expansion state. */
4186 zfs_locked_range_t *rra_lr; /* Range lock of this batch. */
4187 uint64_t rra_txg; /* TXG of this batch. */
4188 uint_t rra_ashift; /* Ashift of the vdev. */
4189 uint32_t rra_tbd; /* Number of in-flight ZIOs. */
4190 uint32_t rra_writes; /* Number of write ZIOs. */
4191 zio_t *rra_zio[]; /* Write ZIO pointers. */
4192 } raidz_reflow_arg_t;
4193
4194 /*
4195 * Write of the new location on one child is done. Once all of them are done
4196 * we can unlock and free everything.
4197 */
4198 static void
4199 raidz_reflow_write_done(zio_t *zio)
4200 {
4201 raidz_reflow_arg_t *rra = zio->io_private;
4202 vdev_raidz_expand_t *vre = rra->rra_vre;
4203
4204 abd_free(zio->io_abd);
4205
4206 mutex_enter(&vre->vre_lock);
4207 if (zio->io_error != 0) {
4208 /* Force a reflow pause on errors */
4209 vre->vre_failed_offset =
4210 MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
4211 }
4212 ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size);
4213 vre->vre_outstanding_bytes -= zio->io_size;
4214 if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length <
4215 vre->vre_failed_offset) {
4216 vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] +=
4217 zio->io_size;
4218 }
4219 cv_signal(&vre->vre_cv);
4220 boolean_t done = (--rra->rra_tbd == 0);
4221 mutex_exit(&vre->vre_lock);
4222
4223 if (!done)
4224 return;
4225 spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
4226 zfs_rangelock_exit(rra->rra_lr);
4227 kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * rra->rra_writes);
4228 }
4229
4230 /*
4231 * Read of the old location on one child is done. Once all of them are done
4232 * writes should have all the data and we can issue them.
4233 */
4234 static void
4235 raidz_reflow_read_done(zio_t *zio)
4236 {
4237 raidz_reflow_arg_t *rra = zio->io_private;
4238 vdev_raidz_expand_t *vre = rra->rra_vre;
4239
4240 /* Single-block reads borrowed a write ABD; bigger reads used gang ABDs, freed here. */
4241 if (zio->io_size > (1 << rra->rra_ashift))
4242 abd_free(zio->io_abd);
4243
4244 /*
4245 * If the read failed, or if it was done on a vdev that is not fully
4246 * healthy (e.g. a child that has a resilver in progress), we may not
4247 * have the correct data. Note that it's OK if the write proceeds.
4248 * It may write garbage but the location is otherwise unused and we
4249 * will retry later due to vre_failed_offset.
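 * (The health check below uses vdev_dtl_empty(DTL_MISSING); a child
 * with missing ranges in its DTL may have returned stale data here.)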
4250 */
4251 if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) {
4252 zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu "
4253 "err=%u partial_dtl_empty=%u missing_dtl_empty=%u",
4254 (long long)rra->rra_lr->lr_offset,
4255 (long long)rra->rra_lr->lr_length,
4256 (long long)rra->rra_txg,
4257 zio->io_error,
4258 vdev_dtl_empty(zio->io_vd, DTL_PARTIAL),
4259 vdev_dtl_empty(zio->io_vd, DTL_MISSING));
4260 mutex_enter(&vre->vre_lock);
4261 /* Force a reflow pause on errors */
4262 vre->vre_failed_offset =
4263 MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
4264 mutex_exit(&vre->vre_lock);
4265 }
4266
4267 if (atomic_dec_32_nv(&rra->rra_tbd) > 0)
4268 return;
4269 uint32_t writes = rra->rra_tbd = rra->rra_writes;
4270 for (uint64_t i = 0; i < writes; i++)
4271 zio_nowait(rra->rra_zio[i]);
4272 }
4273
4274 static void
4275 raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset,
4276 dmu_tx_t *tx)
4277 {
4278 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4279 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
4280
4281 if (offset == 0)
4282 return;
4283
4284 mutex_enter(&vre->vre_lock);
4285 ASSERT3U(vre->vre_offset, <=, offset);
4286 vre->vre_offset = offset;
4287 mutex_exit(&vre->vre_lock);
4288
4289 if (vre->vre_offset_pertxg[txgoff] == 0) {
4290 dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync,
4291 spa, tx);
4292 }
4293 vre->vre_offset_pertxg[txgoff] = offset;
4294 }
4295
4296 static boolean_t
4297 vdev_raidz_expand_child_replacing(vdev_t *raidz_vd)
4298 {
4299 for (int i = 0; i < raidz_vd->vdev_children; i++) {
4300 /* Quick check if a child is being replaced */
4301 if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf)
4302 return (B_TRUE);
4303 }
4304 return (B_FALSE);
4305 }
4306
4307 static boolean_t
4308 raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, zfs_range_tree_t *rt,
4309 dmu_tx_t *tx)
4310 {
4311 spa_t *spa = vd->vdev_spa;
4312 uint_t ashift = vd->vdev_top->vdev_ashift;
4313
4314 zfs_range_seg_t *rs = zfs_range_tree_first(rt);
4315 if (rs == NULL)
4316 return (B_FALSE);
4317 uint64_t offset = zfs_rs_get_start(rs, rt);
4318 ASSERT(IS_P2ALIGNED(offset, 1 << ashift));
4319 uint64_t size = zfs_rs_get_end(rs, rt) - offset;
4320 ASSERT3U(size, >=, 1 << ashift);
4321 ASSERT(IS_P2ALIGNED(size, 1 << ashift));
4322
4323 uint64_t blkid = offset >> ashift;
4324 uint_t old_children = vd->vdev_children - 1;
4325
4326 /*
4327 * We can only progress to the point that writes will not overlap
4328 * with blocks whose progress has not yet been recorded on disk.
4329 * Since partially-copied rows are still read from the old location,
4330 * we need to stop one row before the sector-wise overlap, to prevent
4331 * row-wise overlap.
4332 *
4333 * Note that even if we are skipping over a large unallocated region,
4334 * we can't move the on-disk progress to `offset`, because concurrent
4335 * writes/allocations could still use the currently-unallocated
4336 * region.
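 * For example, with old_children = 4 and on-disk progress at block
 * 1000, next_overwrite_blkid below is 1000 + 1000 / 4 - 4 = 1246, so
 * this batch may copy blocks up to, but not including, block 1246.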
4337 */ 4338 uint64_t ubsync_blkid = 4339 RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift; 4340 uint64_t next_overwrite_blkid = ubsync_blkid + 4341 ubsync_blkid / old_children - old_children; 4342 VERIFY3U(next_overwrite_blkid, >, ubsync_blkid); 4343 if (blkid >= next_overwrite_blkid) { 4344 raidz_reflow_record_progress(vre, 4345 next_overwrite_blkid << ashift, tx); 4346 return (B_TRUE); 4347 } 4348 4349 size = MIN(size, raidz_expand_max_copy_bytes); 4350 size = MIN(size, (uint64_t)old_children * 4351 MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE)); 4352 size = MAX(size, 1 << ashift); 4353 uint_t blocks = MIN(size >> ashift, next_overwrite_blkid - blkid); 4354 size = (uint64_t)blocks << ashift; 4355 4356 zfs_range_tree_remove(rt, offset, size); 4357 4358 uint_t reads = MIN(blocks, old_children); 4359 uint_t writes = MIN(blocks, vd->vdev_children); 4360 raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra) + 4361 sizeof (zio_t *) * writes, KM_SLEEP); 4362 rra->rra_vre = vre; 4363 rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock, 4364 offset, size, RL_WRITER); 4365 rra->rra_txg = dmu_tx_get_txg(tx); 4366 rra->rra_ashift = ashift; 4367 rra->rra_tbd = reads; 4368 rra->rra_writes = writes; 4369 4370 raidz_reflow_record_progress(vre, offset + size, tx); 4371 4372 /* 4373 * SCL_STATE will be released when the read and write are done, 4374 * by raidz_reflow_write_done(). 4375 */ 4376 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4377 4378 /* check if a replacing vdev was added, if so treat it as an error */ 4379 if (vdev_raidz_expand_child_replacing(vd)) { 4380 zfs_dbgmsg("replacing vdev encountered, reflow paused at " 4381 "offset=%llu txg=%llu", 4382 (long long)rra->rra_lr->lr_offset, 4383 (long long)rra->rra_txg); 4384 4385 mutex_enter(&vre->vre_lock); 4386 vre->vre_failed_offset = 4387 MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); 4388 cv_signal(&vre->vre_cv); 4389 mutex_exit(&vre->vre_lock); 4390 4391 /* drop everything we acquired */ 4392 spa_config_exit(spa, SCL_STATE, spa); 4393 zfs_rangelock_exit(rra->rra_lr); 4394 kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * writes); 4395 return (B_TRUE); 4396 } 4397 4398 mutex_enter(&vre->vre_lock); 4399 vre->vre_outstanding_bytes += size; 4400 mutex_exit(&vre->vre_lock); 4401 4402 /* Allocate ABD and ZIO for each child we write. */ 4403 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 4404 zio_t *pio = spa->spa_txg_zio[txgoff]; 4405 uint_t b = blocks / vd->vdev_children; 4406 uint_t bb = blocks % vd->vdev_children; 4407 for (uint_t i = 0; i < writes; i++) { 4408 uint_t n = b + (i < bb); 4409 abd_t *abd = abd_alloc_for_io(n << ashift, B_FALSE); 4410 rra->rra_zio[i] = zio_vdev_child_io(pio, NULL, 4411 vd->vdev_child[(blkid + i) % vd->vdev_children], 4412 ((blkid + i) / vd->vdev_children) << ashift, 4413 abd, n << ashift, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, 4414 ZIO_FLAG_CANFAIL, raidz_reflow_write_done, rra); 4415 } 4416 4417 /* 4418 * Allocate and issue ZIO for each child we read. For reads of only 4419 * one block we can use respective writer ABDs, since they will also 4420 * have only one block. For bigger reads create gang ABDs and fill 4421 * them with respective blocks from writer ABDs. 
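 * (Relative to the first block of this batch, the j-th block read by
 * reader i has index b = j * old_children + i; it lives in writer
 * b % vd->vdev_children at block offset b / vd->vdev_children within
 * that writer's ABD, which is the mapping the gang fill below uses.)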
4422 */ 4423 b = blocks / old_children; 4424 bb = blocks % old_children; 4425 for (uint_t i = 0; i < reads; i++) { 4426 uint_t n = b + (i < bb); 4427 abd_t *abd; 4428 if (n > 1) { 4429 abd = abd_alloc_gang(); 4430 for (uint_t j = 0; j < n; j++) { 4431 uint_t b = j * old_children + i; 4432 abd_t *cabd = abd_get_offset_size( 4433 rra->rra_zio[b % vd->vdev_children]->io_abd, 4434 (b / vd->vdev_children) << ashift, 4435 1 << ashift); 4436 abd_gang_add(abd, cabd, B_TRUE); 4437 } 4438 } else { 4439 abd = rra->rra_zio[i]->io_abd; 4440 } 4441 zio_nowait(zio_vdev_child_io(pio, NULL, 4442 vd->vdev_child[(blkid + i) % old_children], 4443 ((blkid + i) / old_children) << ashift, abd, 4444 n << ashift, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 4445 ZIO_FLAG_CANFAIL, raidz_reflow_read_done, rra)); 4446 } 4447 4448 return (B_FALSE); 4449 } 4450 4451 /* 4452 * For testing (ztest specific) 4453 */ 4454 static void 4455 raidz_expand_pause(uint_t pause_point) 4456 { 4457 while (raidz_expand_pause_point != 0 && 4458 raidz_expand_pause_point <= pause_point) 4459 delay(hz); 4460 } 4461 4462 static void 4463 raidz_scratch_child_done(zio_t *zio) 4464 { 4465 zio_t *pio = zio->io_private; 4466 4467 mutex_enter(&pio->io_lock); 4468 pio->io_error = zio_worst_error(pio->io_error, zio->io_error); 4469 mutex_exit(&pio->io_lock); 4470 } 4471 4472 /* 4473 * Reflow the beginning portion of the vdev into an intermediate scratch area 4474 * in memory and on disk. This operation must be persisted on disk before we 4475 * proceed to overwrite the beginning portion with the reflowed data. 4476 * 4477 * This multi-step task can fail to complete if disk errors are encountered 4478 * and we can return here after a pause (waiting for disk to become healthy). 4479 */ 4480 static void 4481 raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) 4482 { 4483 vdev_raidz_expand_t *vre = arg; 4484 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 4485 zio_t *pio; 4486 int error; 4487 4488 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4489 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4490 int ashift = raidvd->vdev_ashift; 4491 uint64_t write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << ashift, 4492 uint64_t); 4493 uint64_t logical_size = write_size * raidvd->vdev_children; 4494 uint64_t read_size = 4495 P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)), 4496 1 << ashift); 4497 4498 /* 4499 * The scratch space must be large enough to get us to the point 4500 * that one row does not overlap itself when moved. This is checked 4501 * by vdev_raidz_attach_check(). 4502 */ 4503 VERIFY3U(write_size, >=, raidvd->vdev_children << ashift); 4504 VERIFY3U(write_size, <=, VDEV_BOOT_SIZE); 4505 VERIFY3U(write_size, <=, read_size); 4506 4507 zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, 4508 0, logical_size, RL_WRITER); 4509 4510 abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), 4511 KM_SLEEP); 4512 for (int i = 0; i < raidvd->vdev_children; i++) { 4513 abds[i] = abd_alloc_linear(read_size, B_FALSE); 4514 } 4515 4516 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1); 4517 4518 /* 4519 * If we have already written the scratch area then we must read from 4520 * there, since new writes were redirected there while we were paused 4521 * or the original location may have been partially overwritten with 4522 * reflowed data. 4523 */ 4524 if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) { 4525 VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size); 4526 /* 4527 * Read from scratch space. 
4528 */ 4529 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4530 for (int i = 0; i < raidvd->vdev_children; i++) { 4531 /* 4532 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE 4533 * to the offset to calculate the physical offset to 4534 * write to. Passing in a negative offset makes us 4535 * access the scratch area. 4536 */ 4537 zio_nowait(zio_vdev_child_io(pio, NULL, 4538 raidvd->vdev_child[i], 4539 VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], 4540 write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 4541 ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); 4542 } 4543 error = zio_wait(pio); 4544 if (error != 0) { 4545 zfs_dbgmsg("reflow: error %d reading scratch location", 4546 error); 4547 goto io_error_exit; 4548 } 4549 goto overwrite; 4550 } 4551 4552 /* 4553 * Read from original location. 4554 */ 4555 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4556 for (int i = 0; i < raidvd->vdev_children - 1; i++) { 4557 ASSERT0(vdev_is_dead(raidvd->vdev_child[i])); 4558 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4559 0, abds[i], read_size, ZIO_TYPE_READ, 4560 ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, 4561 raidz_scratch_child_done, pio)); 4562 } 4563 error = zio_wait(pio); 4564 if (error != 0) { 4565 zfs_dbgmsg("reflow: error %d reading original location", error); 4566 io_error_exit: 4567 for (int i = 0; i < raidvd->vdev_children; i++) 4568 abd_free(abds[i]); 4569 kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); 4570 zfs_rangelock_exit(lr); 4571 spa_config_exit(spa, SCL_STATE, FTAG); 4572 return; 4573 } 4574 4575 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2); 4576 4577 /* 4578 * Reflow in memory. 4579 */ 4580 uint64_t logical_sectors = logical_size >> ashift; 4581 for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) { 4582 int oldchild = i % (raidvd->vdev_children - 1); 4583 uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift; 4584 4585 int newchild = i % raidvd->vdev_children; 4586 uint64_t newoff = (i / raidvd->vdev_children) << ashift; 4587 4588 /* a single sector should not be copying over itself */ 4589 ASSERT(!(newchild == oldchild && newoff == oldoff)); 4590 4591 abd_copy_off(abds[newchild], abds[oldchild], 4592 newoff, oldoff, 1 << ashift); 4593 } 4594 4595 /* 4596 * Verify that we filled in everything we intended to (write_size on 4597 * each child). 4598 */ 4599 VERIFY0(logical_sectors % raidvd->vdev_children); 4600 VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==, 4601 write_size); 4602 4603 /* 4604 * Write to scratch location (boot area). 4605 */ 4606 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4607 for (int i = 0; i < raidvd->vdev_children; i++) { 4608 /* 4609 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to 4610 * the offset to calculate the physical offset to write to. 4611 * Passing in a negative offset lets us access the boot area. 
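 * (The net physical offset is therefore VDEV_BOOT_OFFSET, the start of
 * the reserved boot area that serves as the scratch space.)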
4612 */ 4613 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4614 VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], 4615 write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, 4616 ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); 4617 } 4618 error = zio_wait(pio); 4619 if (error != 0) { 4620 zfs_dbgmsg("reflow: error %d writing scratch location", error); 4621 goto io_error_exit; 4622 } 4623 pio = zio_root(spa, NULL, NULL, 0); 4624 zio_flush(pio, raidvd); 4625 zio_wait(pio); 4626 4627 zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area", 4628 (long long)logical_size); 4629 4630 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3); 4631 4632 /* 4633 * Update uberblock to indicate that scratch space is valid. This is 4634 * needed because after this point, the real location may be 4635 * overwritten. If we crash, we need to get the data from the 4636 * scratch space, rather than the real location. 4637 * 4638 * Note: ub_timestamp is bumped so that vdev_uberblock_compare() 4639 * will prefer this uberblock. 4640 */ 4641 RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size); 4642 spa->spa_ubsync.ub_timestamp++; 4643 ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, 4644 &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); 4645 if (spa_multihost(spa)) 4646 mmp_update_uberblock(spa, &spa->spa_ubsync); 4647 4648 zfs_dbgmsg("reflow: uberblock updated " 4649 "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)", 4650 (long long)spa->spa_ubsync.ub_txg, 4651 (long long)logical_size, 4652 (long long)spa->spa_ubsync.ub_timestamp); 4653 4654 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID); 4655 4656 /* 4657 * Overwrite with reflow'ed data. 4658 */ 4659 overwrite: 4660 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4661 for (int i = 0; i < raidvd->vdev_children; i++) { 4662 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4663 0, abds[i], write_size, ZIO_TYPE_WRITE, 4664 ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, 4665 raidz_scratch_child_done, pio)); 4666 } 4667 error = zio_wait(pio); 4668 if (error != 0) { 4669 /* 4670 * When we exit early here and drop the range lock, new 4671 * writes will go into the scratch area so we'll need to 4672 * read from there when we return after pausing. 4673 */ 4674 zfs_dbgmsg("reflow: error %d writing real location", error); 4675 /* 4676 * Update the uberblock that is written when this txg completes. 4677 */ 4678 RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID, 4679 logical_size); 4680 goto io_error_exit; 4681 } 4682 pio = zio_root(spa, NULL, NULL, 0); 4683 zio_flush(pio, raidvd); 4684 zio_wait(pio); 4685 4686 zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location", 4687 (long long)logical_size); 4688 for (int i = 0; i < raidvd->vdev_children; i++) 4689 abd_free(abds[i]); 4690 kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); 4691 4692 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED); 4693 4694 /* 4695 * Update uberblock to indicate that the initial part has been 4696 * reflow'ed. This is needed because after this point (when we exit 4697 * the rangelock), we allow regular writes to this region, which will 4698 * be written to the new location only (because reflow_offset_next == 4699 * reflow_offset_synced). If we crashed and re-copied from the 4700 * scratch space, we would lose the regular writes. 
4701 */ 4702 RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED, 4703 logical_size); 4704 spa->spa_ubsync.ub_timestamp++; 4705 ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, 4706 &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); 4707 if (spa_multihost(spa)) 4708 mmp_update_uberblock(spa, &spa->spa_ubsync); 4709 4710 zfs_dbgmsg("reflow: uberblock updated " 4711 "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", 4712 (long long)spa->spa_ubsync.ub_txg, 4713 (long long)logical_size, 4714 (long long)spa->spa_ubsync.ub_timestamp); 4715 4716 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1); 4717 4718 /* 4719 * Update progress. 4720 */ 4721 vre->vre_offset = logical_size; 4722 zfs_rangelock_exit(lr); 4723 spa_config_exit(spa, SCL_STATE, FTAG); 4724 4725 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 4726 vre->vre_offset_pertxg[txgoff] = vre->vre_offset; 4727 vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; 4728 /* 4729 * Note - raidz_reflow_sync() will update the uberblock state to 4730 * RRSS_SCRATCH_INVALID_SYNCED_REFLOW 4731 */ 4732 raidz_reflow_sync(spa, tx); 4733 4734 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2); 4735 } 4736 4737 /* 4738 * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work 4739 * here. No other i/o can be in progress, so we don't need the vre_rangelock. 4740 */ 4741 void 4742 vdev_raidz_reflow_copy_scratch(spa_t *spa) 4743 { 4744 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4745 uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock); 4746 ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID); 4747 4748 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4749 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4750 ASSERT0(logical_size % raidvd->vdev_children); 4751 uint64_t write_size = logical_size / raidvd->vdev_children; 4752 4753 zio_t *pio; 4754 4755 /* 4756 * Read from scratch space. 4757 */ 4758 abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), 4759 KM_SLEEP); 4760 for (int i = 0; i < raidvd->vdev_children; i++) { 4761 abds[i] = abd_alloc_linear(write_size, B_FALSE); 4762 } 4763 4764 pio = zio_root(spa, NULL, NULL, 0); 4765 for (int i = 0; i < raidvd->vdev_children; i++) { 4766 /* 4767 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to 4768 * the offset to calculate the physical offset to write to. 4769 * Passing in a negative offset lets us access the boot area. 4770 */ 4771 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4772 VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], 4773 write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 0, 4774 raidz_scratch_child_done, pio)); 4775 } 4776 zio_wait(pio); 4777 4778 /* 4779 * Overwrite real location with reflow'ed data. 4780 */ 4781 pio = zio_root(spa, NULL, NULL, 0); 4782 for (int i = 0; i < raidvd->vdev_children; i++) { 4783 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4784 0, abds[i], write_size, ZIO_TYPE_WRITE, 4785 ZIO_PRIORITY_REMOVAL, 0, 4786 raidz_scratch_child_done, pio)); 4787 } 4788 zio_wait(pio); 4789 pio = zio_root(spa, NULL, NULL, 0); 4790 zio_flush(pio, raidvd); 4791 zio_wait(pio); 4792 4793 zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) " 4794 "to real location", (long long)logical_size); 4795 4796 for (int i = 0; i < raidvd->vdev_children; i++) 4797 abd_free(abds[i]); 4798 kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); 4799 4800 /* 4801 * Update uberblock. 
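 * As in raidz_reflow_scratch_sync(), ub_timestamp is bumped so that
 * vdev_uberblock_compare() will prefer this uberblock.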
4802 */ 4803 RAIDZ_REFLOW_SET(&spa->spa_ubsync, 4804 RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size); 4805 spa->spa_ubsync.ub_timestamp++; 4806 VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, 4807 &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); 4808 if (spa_multihost(spa)) 4809 mmp_update_uberblock(spa, &spa->spa_ubsync); 4810 4811 zfs_dbgmsg("reflow recovery: uberblock updated " 4812 "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", 4813 (long long)spa->spa_ubsync.ub_txg, 4814 (long long)logical_size, 4815 (long long)spa->spa_ubsync.ub_timestamp); 4816 4817 dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, 4818 spa_first_txg(spa)); 4819 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 4820 vre->vre_offset = logical_size; 4821 vre->vre_offset_pertxg[txgoff] = vre->vre_offset; 4822 vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; 4823 /* 4824 * Note that raidz_reflow_sync() will update the uberblock once more 4825 */ 4826 raidz_reflow_sync(spa, tx); 4827 4828 dmu_tx_commit(tx); 4829 4830 spa_config_exit(spa, SCL_STATE, FTAG); 4831 } 4832 4833 static boolean_t 4834 spa_raidz_expand_thread_check(void *arg, zthr_t *zthr) 4835 { 4836 (void) zthr; 4837 spa_t *spa = arg; 4838 4839 return (spa->spa_raidz_expand != NULL && 4840 !spa->spa_raidz_expand->vre_waiting_for_resilver); 4841 } 4842 4843 /* 4844 * RAIDZ expansion background thread 4845 * 4846 * Can be called multiple times if the reflow is paused 4847 */ 4848 static void 4849 spa_raidz_expand_thread(void *arg, zthr_t *zthr) 4850 { 4851 spa_t *spa = arg; 4852 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4853 4854 if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) 4855 vre->vre_offset = 0; 4856 else 4857 vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync); 4858 4859 /* Reflow the beginning portion using the scratch area */ 4860 if (vre->vre_offset == 0) { 4861 VERIFY0(dsl_sync_task(spa_name(spa), 4862 NULL, raidz_reflow_scratch_sync, 4863 vre, 0, ZFS_SPACE_CHECK_NONE)); 4864 4865 /* if we encountered errors then pause */ 4866 if (vre->vre_offset == 0) { 4867 mutex_enter(&vre->vre_lock); 4868 vre->vre_waiting_for_resilver = B_TRUE; 4869 mutex_exit(&vre->vre_lock); 4870 return; 4871 } 4872 } 4873 4874 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4875 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4876 4877 uint64_t guid = raidvd->vdev_guid; 4878 4879 /* Iterate over all the remaining metaslabs */ 4880 for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift; 4881 i < raidvd->vdev_ms_count && 4882 !zthr_iscancelled(zthr) && 4883 vre->vre_failed_offset == UINT64_MAX; i++) { 4884 metaslab_t *msp = raidvd->vdev_ms[i]; 4885 4886 metaslab_disable(msp); 4887 mutex_enter(&msp->ms_lock); 4888 4889 /* 4890 * The metaslab may be newly created (for the expanded 4891 * space), in which case its trees won't exist yet, 4892 * so we need to bail out early. 4893 */ 4894 if (msp->ms_new) { 4895 mutex_exit(&msp->ms_lock); 4896 metaslab_enable(msp, B_FALSE, B_FALSE); 4897 continue; 4898 } 4899 4900 VERIFY0(metaslab_load(msp)); 4901 4902 /* 4903 * We want to copy everything except the free (allocatable) 4904 * space. Note that there may be a little bit more free 4905 * space (e.g. in ms_defer), and it's fine to copy that too. 
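 * That set is built below by adding the whole metaslab to a range tree
 * and then walking ms_allocatable to remove the free segments.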
4906 */ 4907 uint64_t shift, start; 4908 zfs_range_seg_type_t type = metaslab_calculate_range_tree_type( 4909 raidvd, msp, &start, &shift); 4910 zfs_range_tree_t *rt = zfs_range_tree_create_flags( 4911 NULL, type, NULL, start, shift, ZFS_RT_F_DYN_NAME, 4912 metaslab_rt_name(msp->ms_group, msp, 4913 "spa_raidz_expand_thread:rt")); 4914 zfs_range_tree_add(rt, msp->ms_start, msp->ms_size); 4915 zfs_range_tree_walk(msp->ms_allocatable, zfs_range_tree_remove, 4916 rt); 4917 mutex_exit(&msp->ms_lock); 4918 4919 /* 4920 * Force the last sector of each metaslab to be copied. This 4921 * ensures that we advance the on-disk progress to the end of 4922 * this metaslab while the metaslab is disabled. Otherwise, we 4923 * could move past this metaslab without advancing the on-disk 4924 * progress, and then an allocation to this metaslab would not 4925 * be copied. 4926 */ 4927 int sectorsz = 1 << raidvd->vdev_ashift; 4928 uint64_t ms_last_offset = msp->ms_start + 4929 msp->ms_size - sectorsz; 4930 if (!zfs_range_tree_contains(rt, ms_last_offset, sectorsz)) { 4931 zfs_range_tree_add(rt, ms_last_offset, sectorsz); 4932 } 4933 4934 /* 4935 * When we are resuming from a paused expansion (i.e. 4936 * when importing a pool with a expansion in progress), 4937 * discard any state that we have already processed. 4938 */ 4939 if (vre->vre_offset > msp->ms_start) { 4940 zfs_range_tree_clear(rt, msp->ms_start, 4941 vre->vre_offset - msp->ms_start); 4942 } 4943 4944 while (!zthr_iscancelled(zthr) && 4945 !zfs_range_tree_is_empty(rt) && 4946 vre->vre_failed_offset == UINT64_MAX) { 4947 4948 /* 4949 * We need to periodically drop the config lock so that 4950 * writers can get in. Additionally, we can't wait 4951 * for a txg to sync while holding a config lock 4952 * (since a waiting writer could cause a 3-way deadlock 4953 * with the sync thread, which also gets a config 4954 * lock for reader). So we can't hold the config lock 4955 * while calling dmu_tx_assign(). 4956 */ 4957 spa_config_exit(spa, SCL_CONFIG, FTAG); 4958 4959 /* 4960 * If requested, pause the reflow when the amount 4961 * specified by raidz_expand_max_reflow_bytes is reached 4962 * 4963 * This pause is only used during testing or debugging. 4964 */ 4965 while (raidz_expand_max_reflow_bytes != 0 && 4966 raidz_expand_max_reflow_bytes <= 4967 vre->vre_bytes_copied && !zthr_iscancelled(zthr)) { 4968 delay(hz); 4969 } 4970 4971 mutex_enter(&vre->vre_lock); 4972 while (vre->vre_outstanding_bytes > 4973 raidz_expand_max_copy_bytes) { 4974 cv_wait(&vre->vre_cv, &vre->vre_lock); 4975 } 4976 mutex_exit(&vre->vre_lock); 4977 4978 dmu_tx_t *tx = 4979 dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 4980 4981 VERIFY0(dmu_tx_assign(tx, 4982 DMU_TX_WAIT | DMU_TX_SUSPEND)); 4983 uint64_t txg = dmu_tx_get_txg(tx); 4984 4985 /* 4986 * Reacquire the vdev_config lock. Theoretically, the 4987 * vdev_t that we're expanding may have changed. 
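 * We therefore look it up again by vre_vdev_id rather than reusing a
 * pointer cached across the config-lock drop.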
4988 */ 4989 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4990 raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4991 4992 boolean_t needsync = 4993 raidz_reflow_impl(raidvd, vre, rt, tx); 4994 4995 dmu_tx_commit(tx); 4996 4997 if (needsync) { 4998 spa_config_exit(spa, SCL_CONFIG, FTAG); 4999 txg_wait_synced(spa->spa_dsl_pool, txg); 5000 spa_config_enter(spa, SCL_CONFIG, FTAG, 5001 RW_READER); 5002 } 5003 } 5004 5005 spa_config_exit(spa, SCL_CONFIG, FTAG); 5006 5007 metaslab_enable(msp, B_FALSE, B_FALSE); 5008 zfs_range_tree_vacate(rt, NULL, NULL); 5009 zfs_range_tree_destroy(rt); 5010 5011 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5012 raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 5013 } 5014 5015 spa_config_exit(spa, SCL_CONFIG, FTAG); 5016 5017 /* 5018 * The txg_wait_synced() here ensures that all reflow zio's have 5019 * completed, and vre_failed_offset has been set if necessary. It 5020 * also ensures that the progress of the last raidz_reflow_sync() is 5021 * written to disk before raidz_reflow_complete_sync() changes the 5022 * in-memory vre_state. vdev_raidz_io_start() uses vre_state to 5023 * determine if a reflow is in progress, in which case we may need to 5024 * write to both old and new locations. Therefore we can only change 5025 * vre_state once this is not necessary, which is once the on-disk 5026 * progress (in spa_ubsync) has been set past any possible writes (to 5027 * the end of the last metaslab). 5028 */ 5029 txg_wait_synced(spa->spa_dsl_pool, 0); 5030 5031 if (!zthr_iscancelled(zthr) && 5032 vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) { 5033 /* 5034 * We are not being canceled or paused, so the reflow must be 5035 * complete. In that case also mark it as completed on disk. 5036 */ 5037 ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX); 5038 VERIFY0(dsl_sync_task(spa_name(spa), NULL, 5039 raidz_reflow_complete_sync, spa, 5040 0, ZFS_SPACE_CHECK_NONE)); 5041 (void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL); 5042 } else { 5043 /* 5044 * Wait for all copy zio's to complete and for all the 5045 * raidz_reflow_sync() synctasks to be run. 5046 */ 5047 spa_history_log_internal(spa, "reflow pause", 5048 NULL, "offset=%llu failed_offset=%lld", 5049 (long long)vre->vre_offset, 5050 (long long)vre->vre_failed_offset); 5051 mutex_enter(&vre->vre_lock); 5052 if (vre->vre_failed_offset != UINT64_MAX) { 5053 /* 5054 * Reset progress so that we will retry everything 5055 * after the point that something failed. 
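 * raidz_dtl_reassessed() clears vre_waiting_for_resilver and wakes the
 * zthr once the DTLs have been reassessed and no child is being
 * replaced.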
5056 */ 5057 vre->vre_offset = vre->vre_failed_offset; 5058 vre->vre_failed_offset = UINT64_MAX; 5059 vre->vre_waiting_for_resilver = B_TRUE; 5060 } 5061 mutex_exit(&vre->vre_lock); 5062 } 5063 } 5064 5065 void 5066 spa_start_raidz_expansion_thread(spa_t *spa) 5067 { 5068 ASSERT0P(spa->spa_raidz_expand_zthr); 5069 spa->spa_raidz_expand_zthr = zthr_create("raidz_expand", 5070 spa_raidz_expand_thread_check, spa_raidz_expand_thread, 5071 spa, defclsyspri); 5072 } 5073 5074 void 5075 raidz_dtl_reassessed(vdev_t *vd) 5076 { 5077 spa_t *spa = vd->vdev_spa; 5078 if (spa->spa_raidz_expand != NULL) { 5079 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 5080 /* 5081 * we get called often from vdev_dtl_reassess() so make 5082 * sure it's our vdev and any replacing is complete 5083 */ 5084 if (vd->vdev_top->vdev_id == vre->vre_vdev_id && 5085 !vdev_raidz_expand_child_replacing(vd->vdev_top)) { 5086 mutex_enter(&vre->vre_lock); 5087 if (vre->vre_waiting_for_resilver) { 5088 vdev_dbgmsg(vd, "DTL reassessed, " 5089 "continuing raidz expansion"); 5090 vre->vre_waiting_for_resilver = B_FALSE; 5091 zthr_wakeup(spa->spa_raidz_expand_zthr); 5092 } 5093 mutex_exit(&vre->vre_lock); 5094 } 5095 } 5096 } 5097 5098 int 5099 vdev_raidz_attach_check(vdev_t *new_child) 5100 { 5101 vdev_t *raidvd = new_child->vdev_parent; 5102 uint64_t new_children = raidvd->vdev_children; 5103 5104 /* 5105 * We use the "boot" space as scratch space to handle overwriting the 5106 * initial part of the vdev. If it is too small, then this expansion 5107 * is not allowed. This would be very unusual (e.g. ashift > 13 and 5108 * >200 children). 5109 */ 5110 if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) { 5111 return (EINVAL); 5112 } 5113 return (0); 5114 } 5115 5116 void 5117 vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx) 5118 { 5119 vdev_t *new_child = arg; 5120 spa_t *spa = new_child->vdev_spa; 5121 vdev_t *raidvd = new_child->vdev_parent; 5122 vdev_raidz_t *vdrz = raidvd->vdev_tsd; 5123 ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops); 5124 ASSERT3P(raidvd->vdev_top, ==, raidvd); 5125 ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width); 5126 ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1); 5127 ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==, 5128 new_child); 5129 5130 spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx); 5131 5132 vdrz->vd_physical_width++; 5133 5134 VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info); 5135 vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id; 5136 vdrz->vn_vre.vre_offset = 0; 5137 vdrz->vn_vre.vre_failed_offset = UINT64_MAX; 5138 spa->spa_raidz_expand = &vdrz->vn_vre; 5139 zthr_wakeup(spa->spa_raidz_expand_zthr); 5140 5141 /* 5142 * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get 5143 * written to the config. 
5144 */ 5145 vdev_config_dirty(raidvd); 5146 5147 vdrz->vn_vre.vre_start_time = gethrestime_sec(); 5148 vdrz->vn_vre.vre_end_time = 0; 5149 vdrz->vn_vre.vre_state = DSS_SCANNING; 5150 vdrz->vn_vre.vre_bytes_copied = 0; 5151 5152 uint64_t state = vdrz->vn_vre.vre_state; 5153 VERIFY0(zap_update(spa->spa_meta_objset, 5154 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, 5155 sizeof (state), 1, &state, tx)); 5156 5157 uint64_t start_time = vdrz->vn_vre.vre_start_time; 5158 VERIFY0(zap_update(spa->spa_meta_objset, 5159 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, 5160 sizeof (start_time), 1, &start_time, tx)); 5161 5162 (void) zap_remove(spa->spa_meta_objset, 5163 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx); 5164 (void) zap_remove(spa->spa_meta_objset, 5165 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx); 5166 5167 spa_history_log_internal(spa, "raidz vdev expansion started", tx, 5168 "%s vdev %llu new width %llu", spa_name(spa), 5169 (unsigned long long)raidvd->vdev_id, 5170 (unsigned long long)raidvd->vdev_children); 5171 } 5172 5173 int 5174 vdev_raidz_load(vdev_t *vd) 5175 { 5176 vdev_raidz_t *vdrz = vd->vdev_tsd; 5177 int err; 5178 5179 uint64_t state = DSS_NONE; 5180 uint64_t start_time = 0; 5181 uint64_t end_time = 0; 5182 uint64_t bytes_copied = 0; 5183 5184 if (vd->vdev_top_zap != 0) { 5185 err = zap_lookup(vd->vdev_spa->spa_meta_objset, 5186 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, 5187 sizeof (state), 1, &state); 5188 if (err != 0 && err != ENOENT) 5189 return (err); 5190 5191 err = zap_lookup(vd->vdev_spa->spa_meta_objset, 5192 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, 5193 sizeof (start_time), 1, &start_time); 5194 if (err != 0 && err != ENOENT) 5195 return (err); 5196 5197 err = zap_lookup(vd->vdev_spa->spa_meta_objset, 5198 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, 5199 sizeof (end_time), 1, &end_time); 5200 if (err != 0 && err != ENOENT) 5201 return (err); 5202 5203 err = zap_lookup(vd->vdev_spa->spa_meta_objset, 5204 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, 5205 sizeof (bytes_copied), 1, &bytes_copied); 5206 if (err != 0 && err != ENOENT) 5207 return (err); 5208 } 5209 5210 /* 5211 * If we are in the middle of expansion, vre_state should have 5212 * already been set by vdev_raidz_init(). 
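 * (vdev_raidz_init() sets vre_state to DSS_SCANNING when
 * ZPOOL_CONFIG_RAIDZ_EXPANDING is present in the vdev config.)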
5213 */ 5214 EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING); 5215 vdrz->vn_vre.vre_state = (dsl_scan_state_t)state; 5216 vdrz->vn_vre.vre_start_time = start_time; 5217 vdrz->vn_vre.vre_end_time = end_time; 5218 vdrz->vn_vre.vre_bytes_copied = bytes_copied; 5219 5220 return (0); 5221 } 5222 5223 int 5224 spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres) 5225 { 5226 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 5227 5228 if (vre == NULL) { 5229 /* no removal in progress; find most recent completed */ 5230 for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { 5231 vdev_t *vd = spa->spa_root_vdev->vdev_child[c]; 5232 if (vd->vdev_ops == &vdev_raidz_ops) { 5233 vdev_raidz_t *vdrz = vd->vdev_tsd; 5234 5235 if (vdrz->vn_vre.vre_end_time != 0 && 5236 (vre == NULL || 5237 vdrz->vn_vre.vre_end_time > 5238 vre->vre_end_time)) { 5239 vre = &vdrz->vn_vre; 5240 } 5241 } 5242 } 5243 } 5244 5245 if (vre == NULL) { 5246 return (SET_ERROR(ENOENT)); 5247 } 5248 5249 pres->pres_state = vre->vre_state; 5250 pres->pres_expanding_vdev = vre->vre_vdev_id; 5251 5252 vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); 5253 pres->pres_to_reflow = vd->vdev_stat.vs_alloc; 5254 5255 mutex_enter(&vre->vre_lock); 5256 pres->pres_reflowed = vre->vre_bytes_copied; 5257 for (int i = 0; i < TXG_SIZE; i++) 5258 pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i]; 5259 mutex_exit(&vre->vre_lock); 5260 5261 pres->pres_start_time = vre->vre_start_time; 5262 pres->pres_end_time = vre->vre_end_time; 5263 pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver; 5264 5265 return (0); 5266 } 5267 5268 /* 5269 * Initialize private RAIDZ specific fields from the nvlist. 5270 */ 5271 static int 5272 vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) 5273 { 5274 uint_t children; 5275 nvlist_t **child; 5276 int error = nvlist_lookup_nvlist_array(nv, 5277 ZPOOL_CONFIG_CHILDREN, &child, &children); 5278 if (error != 0) 5279 return (SET_ERROR(EINVAL)); 5280 5281 uint64_t nparity; 5282 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { 5283 if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) 5284 return (SET_ERROR(EINVAL)); 5285 5286 /* 5287 * Previous versions could only support 1 or 2 parity 5288 * device. 5289 */ 5290 if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2) 5291 return (SET_ERROR(EINVAL)); 5292 else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3) 5293 return (SET_ERROR(EINVAL)); 5294 } else { 5295 /* 5296 * We require the parity to be specified for SPAs that 5297 * support multiple parity levels. 5298 */ 5299 if (spa_version(spa) >= SPA_VERSION_RAIDZ2) 5300 return (SET_ERROR(EINVAL)); 5301 5302 /* 5303 * Otherwise, we default to 1 parity device for RAID-Z. 
5304 */ 5305 nparity = 1; 5306 } 5307 5308 vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP); 5309 vdrz->vn_vre.vre_vdev_id = -1; 5310 vdrz->vn_vre.vre_offset = UINT64_MAX; 5311 vdrz->vn_vre.vre_failed_offset = UINT64_MAX; 5312 mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL); 5313 cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL); 5314 zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL); 5315 mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL); 5316 avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare, 5317 sizeof (reflow_node_t), offsetof(reflow_node_t, re_link)); 5318 5319 vdrz->vd_physical_width = children; 5320 vdrz->vd_nparity = nparity; 5321 5322 /* note, the ID does not exist when creating a pool */ 5323 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, 5324 &vdrz->vn_vre.vre_vdev_id); 5325 5326 boolean_t reflow_in_progress = 5327 nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); 5328 if (reflow_in_progress) { 5329 spa->spa_raidz_expand = &vdrz->vn_vre; 5330 vdrz->vn_vre.vre_state = DSS_SCANNING; 5331 } 5332 5333 vdrz->vd_original_width = children; 5334 uint64_t *txgs; 5335 unsigned int txgs_size = 0; 5336 error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, 5337 &txgs, &txgs_size); 5338 if (error == 0) { 5339 for (int i = 0; i < txgs_size; i++) { 5340 reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); 5341 re->re_txg = txgs[txgs_size - i - 1]; 5342 re->re_logical_width = vdrz->vd_physical_width - i; 5343 5344 if (reflow_in_progress) 5345 re->re_logical_width--; 5346 5347 avl_add(&vdrz->vd_expand_txgs, re); 5348 } 5349 5350 vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size; 5351 } 5352 if (reflow_in_progress) { 5353 vdrz->vd_original_width--; 5354 zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions", 5355 children, txgs_size); 5356 } 5357 5358 *tsd = vdrz; 5359 5360 return (0); 5361 } 5362 5363 static void 5364 vdev_raidz_fini(vdev_t *vd) 5365 { 5366 vdev_raidz_t *vdrz = vd->vdev_tsd; 5367 if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre) 5368 vd->vdev_spa->spa_raidz_expand = NULL; 5369 reflow_node_t *re; 5370 void *cookie = NULL; 5371 avl_tree_t *tree = &vdrz->vd_expand_txgs; 5372 while ((re = avl_destroy_nodes(tree, &cookie)) != NULL) 5373 kmem_free(re, sizeof (*re)); 5374 avl_destroy(&vdrz->vd_expand_txgs); 5375 mutex_destroy(&vdrz->vd_expand_lock); 5376 mutex_destroy(&vdrz->vn_vre.vre_lock); 5377 cv_destroy(&vdrz->vn_vre.vre_cv); 5378 zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock); 5379 kmem_free(vdrz, sizeof (*vdrz)); 5380 } 5381 5382 /* 5383 * Add RAIDZ specific fields to the config nvlist. 5384 */ 5385 static void 5386 vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) 5387 { 5388 ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops); 5389 vdev_raidz_t *vdrz = vd->vdev_tsd; 5390 5391 /* 5392 * Make sure someone hasn't managed to sneak a fancy new vdev 5393 * into a crufty old storage pool. 5394 */ 5395 ASSERT(vdrz->vd_nparity == 1 || 5396 (vdrz->vd_nparity <= 2 && 5397 spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) || 5398 (vdrz->vd_nparity <= 3 && 5399 spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3)); 5400 5401 /* 5402 * Note that we'll add these even on storage pools where they 5403 * aren't strictly required -- older software will just ignore 5404 * it. 
5405 */ 5406 fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity); 5407 5408 if (vdrz->vn_vre.vre_state == DSS_SCANNING) { 5409 fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); 5410 } 5411 5412 mutex_enter(&vdrz->vd_expand_lock); 5413 if (!avl_is_empty(&vdrz->vd_expand_txgs)) { 5414 uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs); 5415 uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count, 5416 KM_SLEEP); 5417 uint64_t i = 0; 5418 5419 for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs); 5420 re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) { 5421 txgs[i++] = re->re_txg; 5422 } 5423 5424 fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, 5425 txgs, count); 5426 5427 kmem_free(txgs, sizeof (uint64_t) * count); 5428 } 5429 mutex_exit(&vdrz->vd_expand_lock); 5430 } 5431 5432 static uint64_t 5433 vdev_raidz_nparity(vdev_t *vd) 5434 { 5435 vdev_raidz_t *vdrz = vd->vdev_tsd; 5436 return (vdrz->vd_nparity); 5437 } 5438 5439 static uint64_t 5440 vdev_raidz_ndisks(vdev_t *vd) 5441 { 5442 return (vd->vdev_children); 5443 } 5444 5445 vdev_ops_t vdev_raidz_ops = { 5446 .vdev_op_init = vdev_raidz_init, 5447 .vdev_op_fini = vdev_raidz_fini, 5448 .vdev_op_open = vdev_raidz_open, 5449 .vdev_op_close = vdev_raidz_close, 5450 .vdev_op_psize_to_asize = vdev_raidz_psize_to_asize, 5451 .vdev_op_asize_to_psize = vdev_raidz_asize_to_psize, 5452 .vdev_op_min_asize = vdev_raidz_min_asize, 5453 .vdev_op_min_alloc = NULL, 5454 .vdev_op_io_start = vdev_raidz_io_start, 5455 .vdev_op_io_done = vdev_raidz_io_done, 5456 .vdev_op_state_change = vdev_raidz_state_change, 5457 .vdev_op_need_resilver = vdev_raidz_need_resilver, 5458 .vdev_op_hold = NULL, 5459 .vdev_op_rele = NULL, 5460 .vdev_op_remap = NULL, 5461 .vdev_op_xlate = vdev_raidz_xlate, 5462 .vdev_op_rebuild_asize = NULL, 5463 .vdev_op_metaslab_init = NULL, 5464 .vdev_op_config_generate = vdev_raidz_config_generate, 5465 .vdev_op_nparity = vdev_raidz_nparity, 5466 .vdev_op_ndisks = vdev_raidz_ndisks, 5467 .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */ 5468 .vdev_op_leaf = B_FALSE /* not a leaf vdev */ 5469 }; 5470 5471 ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW, 5472 "For testing, pause RAIDZ expansion after reflowing this many bytes"); 5473 ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW, 5474 "Max amount of concurrent i/o for RAIDZ expansion"); 5475 ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW, 5476 "For expanded RAIDZ, aggregate reads that have more rows than this"); 5477 ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW, 5478 "For expanded RAIDZ, automatically start a pool scrub when expansion " 5479 "completes"); 5480 ZFS_MODULE_PARAM(zfs_vdev, vdev_, read_sit_out_secs, ULONG, ZMOD_RW, 5481 "Raidz/draid slow disk sit out time period in seconds"); 5482 ZFS_MODULE_PARAM(zfs_vdev, vdev_, raidz_outlier_check_interval_ms, U64, 5483 ZMOD_RW, "Interval to check for slow raidz/draid children"); 5484 ZFS_MODULE_PARAM(zfs_vdev, vdev_, raidz_outlier_insensitivity, UINT, 5485 ZMOD_RW, "How insensitive the slow raidz/draid child check should be"); 5486 /* END CSTYLED */ 5487