1eda14cbcSMatt Macy /* 2eda14cbcSMatt Macy * CDDL HEADER START 3eda14cbcSMatt Macy * 4eda14cbcSMatt Macy * The contents of this file are subject to the terms of the 5eda14cbcSMatt Macy * Common Development and Distribution License (the "License"). 6eda14cbcSMatt Macy * You may not use this file except in compliance with the License. 7eda14cbcSMatt Macy * 8eda14cbcSMatt Macy * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9271171e0SMartin Matuska * or https://opensource.org/licenses/CDDL-1.0. 10eda14cbcSMatt Macy * See the License for the specific language governing permissions 11eda14cbcSMatt Macy * and limitations under the License. 12eda14cbcSMatt Macy * 13eda14cbcSMatt Macy * When distributing Covered Code, include this CDDL HEADER in each 14eda14cbcSMatt Macy * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15eda14cbcSMatt Macy * If applicable, add the following below this CDDL HEADER, with the 16eda14cbcSMatt Macy * fields enclosed by brackets "[]" replaced with your own identifying 17eda14cbcSMatt Macy * information: Portions Copyright [yyyy] [name of copyright owner] 18eda14cbcSMatt Macy * 19eda14cbcSMatt Macy * CDDL HEADER END 20eda14cbcSMatt Macy */ 21eda14cbcSMatt Macy 22eda14cbcSMatt Macy /* 23eda14cbcSMatt Macy * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 242c48331dSMatt Macy * Copyright (c) 2012, 2020 by Delphix. All rights reserved. 25eda14cbcSMatt Macy * Copyright (c) 2016 Gvozden Nešković. All rights reserved. 26eda14cbcSMatt Macy */ 27eda14cbcSMatt Macy 28eda14cbcSMatt Macy #include <sys/zfs_context.h> 29eda14cbcSMatt Macy #include <sys/spa.h> 30e716630dSMartin Matuska #include <sys/spa_impl.h> 31e716630dSMartin Matuska #include <sys/zap.h> 32eda14cbcSMatt Macy #include <sys/vdev_impl.h> 33e716630dSMartin Matuska #include <sys/metaslab_impl.h> 34eda14cbcSMatt Macy #include <sys/zio.h> 35eda14cbcSMatt Macy #include <sys/zio_checksum.h> 36e716630dSMartin Matuska #include <sys/dmu_tx.h> 37eda14cbcSMatt Macy #include <sys/abd.h> 38e716630dSMartin Matuska #include <sys/zfs_rlock.h> 39eda14cbcSMatt Macy #include <sys/fs/zfs.h> 40eda14cbcSMatt Macy #include <sys/fm/fs/zfs.h> 41eda14cbcSMatt Macy #include <sys/vdev_raidz.h> 42eda14cbcSMatt Macy #include <sys/vdev_raidz_impl.h> 437877fdebSMatt Macy #include <sys/vdev_draid.h> 44e716630dSMartin Matuska #include <sys/uberblock_impl.h> 45e716630dSMartin Matuska #include <sys/dsl_scan.h> 46eda14cbcSMatt Macy 47eda14cbcSMatt Macy #ifdef ZFS_DEBUG 48eda14cbcSMatt Macy #include <sys/vdev.h> /* For vdev_xlate() in vdev_raidz_io_verify() */ 49eda14cbcSMatt Macy #endif 50eda14cbcSMatt Macy 51eda14cbcSMatt Macy /* 52eda14cbcSMatt Macy * Virtual device vector for RAID-Z. 53eda14cbcSMatt Macy * 54eda14cbcSMatt Macy * This vdev supports single, double, and triple parity. For single parity, 55eda14cbcSMatt Macy * we use a simple XOR of all the data columns. For double or triple parity, 56eda14cbcSMatt Macy * we use a special case of Reed-Solomon coding. This extends the 57eda14cbcSMatt Macy * technique described in "The mathematics of RAID-6" by H. Peter Anvin by 58eda14cbcSMatt Macy * drawing on the system described in "A Tutorial on Reed-Solomon Coding for 59eda14cbcSMatt Macy * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the 60eda14cbcSMatt Macy * former is also based. The latter is designed to provide higher performance 61eda14cbcSMatt Macy * for writes. 
62eda14cbcSMatt Macy  *
63eda14cbcSMatt Macy  * Note that the Plank paper claimed to support arbitrary N+M, but was then
64eda14cbcSMatt Macy  * amended six years later identifying a critical flaw that invalidates its
65eda14cbcSMatt Macy  * claims. Nevertheless, the technique can be adapted to work for up to
66eda14cbcSMatt Macy  * triple parity. For additional parity, the amendment "Note: Correction to
67eda14cbcSMatt Macy  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
68eda14cbcSMatt Macy  * is viable, but the additional complexity means that write performance will
69eda14cbcSMatt Macy  * suffer.
70eda14cbcSMatt Macy  *
71eda14cbcSMatt Macy  * All of the methods above operate on a Galois field with 2^N elements,
72eda14cbcSMatt Macy  * GF(2^N). In our case we choose N=8 for GF(2^8) so that all elements
73eda14cbcSMatt Macy  * can be expressed with a single byte. Briefly, the operations on the
74eda14cbcSMatt Macy  * field are defined as follows:
75eda14cbcSMatt Macy  *
76eda14cbcSMatt Macy  *   o addition (+) is represented by a bitwise XOR
77eda14cbcSMatt Macy  *   o subtraction (-) is therefore identical to addition: A + B = A - B
78eda14cbcSMatt Macy  *   o multiplication of A by 2 is defined by the following bitwise expression:
79eda14cbcSMatt Macy  *
80eda14cbcSMatt Macy  *	(A * 2)_7 = A_6
81eda14cbcSMatt Macy  *	(A * 2)_6 = A_5
82eda14cbcSMatt Macy  *	(A * 2)_5 = A_4
83eda14cbcSMatt Macy  *	(A * 2)_4 = A_3 + A_7
84eda14cbcSMatt Macy  *	(A * 2)_3 = A_2 + A_7
85eda14cbcSMatt Macy  *	(A * 2)_2 = A_1 + A_7
86eda14cbcSMatt Macy  *	(A * 2)_1 = A_0
87eda14cbcSMatt Macy  *	(A * 2)_0 = A_7
88eda14cbcSMatt Macy  *
89eda14cbcSMatt Macy  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
90eda14cbcSMatt Macy  * As an aside, this multiplication is derived from the error correcting
91eda14cbcSMatt Macy  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
92eda14cbcSMatt Macy  *
93eda14cbcSMatt Macy  * Observe that any number in the field (except for 0) can be expressed as a
94eda14cbcSMatt Macy  * power of 2 -- a generator for the field. We store a table of the powers of
95eda14cbcSMatt Macy  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
96eda14cbcSMatt Macy  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
97eda14cbcSMatt Macy  * than field addition). The inverse of a field element A (A^-1) is therefore
98eda14cbcSMatt Macy  * A ^ (255 - 1) = A^254.
99eda14cbcSMatt Macy  *
100eda14cbcSMatt Macy  * The up-to-three parity columns, P, Q, R over several data columns,
101eda14cbcSMatt Macy  * D_0, ... D_n-1, can be expressed by field operations:
102eda14cbcSMatt Macy  *
103eda14cbcSMatt Macy  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
104eda14cbcSMatt Macy  *	Q = 2^(n-1) * D_0 + 2^(n-2) * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
105eda14cbcSMatt Macy  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
106eda14cbcSMatt Macy  *	R = 4^(n-1) * D_0 + 4^(n-2) * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
107eda14cbcSMatt Macy  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
108eda14cbcSMatt Macy  *
109eda14cbcSMatt Macy  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
110eda14cbcSMatt Macy  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
111eda14cbcSMatt Macy  * independent coefficients. (There are no additional coefficients that have
112eda14cbcSMatt Macy  * this property, which is why the uncorrected Plank method breaks down.)
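 *
 * As a brief worked example of the field arithmetic above (illustrative only;
 * the value is arbitrary), take A = 0x53. Using the byte-wise rule for
 * multiplication by 2 (see VDEV_RAIDZ_MUL_2() below):
 *
 *	A * 2 = (0x53 << 1)                 = 0xa6   (top bit of A was clear)
 *	A * 4 = ((0xa6 << 1) & 0xff) ^ 0x1d = 0x51   (top bit of 0xa6 was set)
 *
 * A general product A * B is looked up via the powers-of-2 and log tables
 * mentioned above: A * B = 2^((log_2(A) + log_2(B)) mod 255).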
113eda14cbcSMatt Macy  *
114eda14cbcSMatt Macy  * See the reconstruction code below for how P, Q and R can be used
115eda14cbcSMatt Macy  * individually or in concert to recover missing data columns.
116eda14cbcSMatt Macy  */
117eda14cbcSMatt Macy 
118eda14cbcSMatt Macy #define VDEV_RAIDZ_P 0
119eda14cbcSMatt Macy #define VDEV_RAIDZ_Q 1
120eda14cbcSMatt Macy #define VDEV_RAIDZ_R 2
121eda14cbcSMatt Macy 
122eda14cbcSMatt Macy #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
123eda14cbcSMatt Macy #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
124eda14cbcSMatt Macy 
125eda14cbcSMatt Macy /*
126eda14cbcSMatt Macy  * We provide a mechanism to perform the field multiplication operation on a
127eda14cbcSMatt Macy  * 64-bit value all at once rather than a byte at a time. This works by
128eda14cbcSMatt Macy  * creating a mask from the top bit in each byte and using that to
129eda14cbcSMatt Macy  * conditionally apply the XOR of 0x1d.
130eda14cbcSMatt Macy  */
131eda14cbcSMatt Macy #define VDEV_RAIDZ_64MUL_2(x, mask) \
132eda14cbcSMatt Macy { \
133eda14cbcSMatt Macy 	(mask) = (x) & 0x8080808080808080ULL; \
134eda14cbcSMatt Macy 	(mask) = ((mask) << 1) - ((mask) >> 7); \
135eda14cbcSMatt Macy 	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
136eda14cbcSMatt Macy 	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
137eda14cbcSMatt Macy }
138eda14cbcSMatt Macy 
139eda14cbcSMatt Macy #define VDEV_RAIDZ_64MUL_4(x, mask) \
140eda14cbcSMatt Macy { \
141eda14cbcSMatt Macy 	VDEV_RAIDZ_64MUL_2((x), mask); \
142eda14cbcSMatt Macy 	VDEV_RAIDZ_64MUL_2((x), mask); \
143eda14cbcSMatt Macy }
144eda14cbcSMatt Macy 
145e716630dSMartin Matuska 
146e716630dSMartin Matuska /*
147e716630dSMartin Matuska  * Big Theory Statement for how a RAIDZ VDEV is expanded
148e716630dSMartin Matuska  *
149e716630dSMartin Matuska  * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion
150e716630dSMartin Matuska  * works with all three RAIDZ parity choices: RAIDZ1, 2, and 3. VDEVs
151e716630dSMartin Matuska  * that have been previously expanded can be expanded again.
152e716630dSMartin Matuska  *
153e716630dSMartin Matuska  * The RAIDZ VDEV must be healthy (must be able to write to all the drives in
154e716630dSMartin Matuska  * the VDEV) when an expansion starts. The expansion will pause if any
155e716630dSMartin Matuska  * disk in the VDEV fails, and resume once the VDEV is healthy again. All other
156e716630dSMartin Matuska  * operations on the pool can continue while an expansion is in progress (e.g.
157e716630dSMartin Matuska  * read/write, snapshot, zpool add, etc.), except for zpool checkpoint, zpool
158e716630dSMartin Matuska  * trim, and zpool initialize, which can't be run during an expansion. Following
159e716630dSMartin Matuska  * a reboot or export/import, the expansion resumes where it left off.
160e716630dSMartin Matuska  *
161e716630dSMartin Matuska  * == Reflowing the Data ==
162e716630dSMartin Matuska  *
163e716630dSMartin Matuska  * The expansion involves reflowing (copying) the data from the current set
164e716630dSMartin Matuska  * of disks to spread it across the new set, which now has one more disk. This
165e716630dSMartin Matuska  * reflow operation is similar to reflowing text when the column width of a
166e716630dSMartin Matuska  * text editor window is expanded. The text doesn't change, but its location
167e716630dSMartin Matuska  * changes to accommodate the new width. An example reflow result for
168e716630dSMartin Matuska  * a 4-wide RAIDZ1 expanded to a 5-wide is shown below.
169e716630dSMartin Matuska * 170e716630dSMartin Matuska * Reflow End State 171e716630dSMartin Matuska * Each letter indicates a parity group (logical stripe) 172e716630dSMartin Matuska * 173e716630dSMartin Matuska * Before expansion After Expansion 174e716630dSMartin Matuska * D1 D2 D3 D4 D1 D2 D3 D4 D5 175e716630dSMartin Matuska * +------+------+------+------+ +------+------+------+------+------+ 176e716630dSMartin Matuska * | | | | | | | | | | | 177e716630dSMartin Matuska * | A | A | A | A | | A | A | A | A | B | 178e716630dSMartin Matuska * | 1| 2| 3| 4| | 1| 2| 3| 4| 5| 179e716630dSMartin Matuska * +------+------+------+------+ +------+------+------+------+------+ 180e716630dSMartin Matuska * | | | | | | | | | | | 181e716630dSMartin Matuska * | B | B | C | C | | B | C | C | C | C | 182e716630dSMartin Matuska * | 5| 6| 7| 8| | 6| 7| 8| 9| 10| 183e716630dSMartin Matuska * +------+------+------+------+ +------+------+------+------+------+ 184e716630dSMartin Matuska * | | | | | | | | | | | 185e716630dSMartin Matuska * | C | C | D | D | | D | D | E | E | E | 186e716630dSMartin Matuska * | 9| 10| 11| 12| | 11| 12| 13| 14| 15| 187e716630dSMartin Matuska * +------+------+------+------+ +------+------+------+------+------+ 188e716630dSMartin Matuska * | | | | | | | | | | | 189e716630dSMartin Matuska * | E | E | E | E | --> | E | F | F | G | G | 190e716630dSMartin Matuska * | 13| 14| 15| 16| | 16| 17| 18|p 19| 20| 191e716630dSMartin Matuska * +------+------+------+------+ +------+------+------+------+------+ 192e716630dSMartin Matuska * | | | | | | | | | | | 193e716630dSMartin Matuska * | F | F | G | G | | G | G | H | H | H | 194e716630dSMartin Matuska * | 17| 18| 19| 20| | 21| 22| 23| 24| 25| 195e716630dSMartin Matuska * +------+------+------+------+ +------+------+------+------+------+ 196e716630dSMartin Matuska * | | | | | | | | | | | 197e716630dSMartin Matuska * | G | G | H | H | | H | I | I | J | J | 198e716630dSMartin Matuska * | 21| 22| 23| 24| | 26| 27| 28| 29| 30| 199e716630dSMartin Matuska * +------+------+------+------+ +------+------+------+------+------+ 200e716630dSMartin Matuska * | | | | | | | | | | | 201e716630dSMartin Matuska * | H | H | I | I | | J | J | | | K | 202e716630dSMartin Matuska * | 25| 26| 27| 28| | 31| 32| 33| 34| 35| 203e716630dSMartin Matuska * +------+------+------+------+ +------+------+------+------+------+ 204e716630dSMartin Matuska * 205e716630dSMartin Matuska * This reflow approach has several advantages. There is no need to read or 206e716630dSMartin Matuska * modify the block pointers or recompute any block checksums. The reflow 207e716630dSMartin Matuska * doesn’t need to know where the parity sectors reside. We can read and write 208e716630dSMartin Matuska * data sequentially and the copy can occur in a background thread in open 209e716630dSMartin Matuska * context. The design also allows for fast discovery of what data to copy. 210e716630dSMartin Matuska * 211e716630dSMartin Matuska * The VDEV metaslabs are processed, one at a time, to copy the block data to 212e716630dSMartin Matuska * have it flow across all the disks. The metaslab is disabled for allocations 213e716630dSMartin Matuska * during the copy. As an optimization, we only copy the allocated data which 214e716630dSMartin Matuska * can be determined by looking at the metaslab range tree. During the copy we 215e716630dSMartin Matuska * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still 216e716630dSMartin Matuska * need to be able to survive losing parity count disks). 
This means we
217e716630dSMartin Matuska  * cannot overwrite data during the reflow that would be needed if a disk is
218e716630dSMartin Matuska  * lost.
219e716630dSMartin Matuska  *
220e716630dSMartin Matuska  * After the reflow completes, all newly-written blocks will have the new
221e716630dSMartin Matuska  * layout, i.e., they will have the parity to data ratio implied by the new
222e716630dSMartin Matuska  * number of disks in the RAIDZ group. Even though the reflow copies all of
223e716630dSMartin Matuska  * the allocated space (data and parity), it is only rearranged, not changed.
224e716630dSMartin Matuska  *
225e716630dSMartin Matuska  * This act of reflowing the data has a few implications about blocks
226e716630dSMartin Matuska  * that were written before the reflow completes:
227e716630dSMartin Matuska  *
228e716630dSMartin Matuska  * - Old blocks will still use the same amount of space (i.e., they will have
229e716630dSMartin Matuska  *   the parity to data ratio implied by the old number of disks in the RAIDZ
230e716630dSMartin Matuska  *   group).
231e716630dSMartin Matuska  * - Reading old blocks will be slightly slower than before the reflow, for
232e716630dSMartin Matuska  *   two reasons. First, we will have to read from all disks in the RAIDZ
233e716630dSMartin Matuska  *   VDEV, rather than being able to skip the children that contain only
234e716630dSMartin Matuska  *   parity of this block (because the data of a single block is now spread
235e716630dSMartin Matuska  *   out across all the disks). Second, in most cases there will be an extra
236e716630dSMartin Matuska  *   bcopy, needed to rearrange the data back to its original layout in memory.
237e716630dSMartin Matuska  *
238e716630dSMartin Matuska  * == Scratch Area ==
239e716630dSMartin Matuska  *
240e716630dSMartin Matuska  * As we copy the block data, we can only progress to the point that writes
241e716630dSMartin Matuska  * will not overlap with blocks whose progress has not yet been recorded on
242e716630dSMartin Matuska  * disk. Since partially-copied rows are always read from the old location,
243e716630dSMartin Matuska  * we need to stop one row before the sector-wise overlap, to prevent any
244e716630dSMartin Matuska  * row-wise overlap. For example, in the diagram above, when we reflow sector
245e716630dSMartin Matuska  * B6, it will overwrite the original location of B5.
246e716630dSMartin Matuska  *
247e716630dSMartin Matuska  * To get around this, a scratch space is used so that we can start copying
248e716630dSMartin Matuska  * without risking data loss by overlapping the row. As an added benefit, it
249e716630dSMartin Matuska  * improves performance at the beginning of the reflow, but that small perf
250e716630dSMartin Matuska  * boost wouldn't be worth the complexity on its own.
251e716630dSMartin Matuska  *
252e716630dSMartin Matuska  * Ideally we want to copy at least 2 * (new_width)^2 so that we have a
253e716630dSMartin Matuska  * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max
254e716630dSMartin Matuska  * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice
255e716630dSMartin Matuska  * the widths will likely be single digits so we can get a substantial chunk
256e716630dSMartin Matuska  * size using only a few MB of scratch per disk.
257e716630dSMartin Matuska  *
258e716630dSMartin Matuska  * The scratch area, which is persisted to disk, holds a large amount of reflowed
We can always read the partially written stripes when a disk fails or 260e716630dSMartin Matuska * the copy is interrupted (crash) during the initial copying phase and also 261e716630dSMartin Matuska * get past a small chunk size restriction. At a minimum, the scratch space 262e716630dSMartin Matuska * must be large enough to get us to the point that one row does not overlap 263e716630dSMartin Matuska * itself when moved (i.e new_width^2). But going larger is even better. We 264e716630dSMartin Matuska * use the 3.5 MiB reserved "boot" space that resides after the ZFS disk labels 265e716630dSMartin Matuska * as our scratch space to handle overwriting the initial part of the VDEV. 266e716630dSMartin Matuska * 267e716630dSMartin Matuska * 0 256K 512K 4M 268e716630dSMartin Matuska * +------+------+-----------------------+----------------------------- 269e716630dSMartin Matuska * | VDEV | VDEV | Boot Block (3.5M) | Allocatable space ... 270e716630dSMartin Matuska * | L0 | L1 | Reserved | (Metaslabs) 271e716630dSMartin Matuska * +------+------+-----------------------+------------------------------- 272e716630dSMartin Matuska * Scratch Area 273e716630dSMartin Matuska * 274e716630dSMartin Matuska * == Reflow Progress Updates == 275e716630dSMartin Matuska * After the initial scratch-based reflow, the expansion process works 276e716630dSMartin Matuska * similarly to device removal. We create a new open context thread which 277e716630dSMartin Matuska * reflows the data, and periodically kicks off sync tasks to update logical 278e716630dSMartin Matuska * state. In this case, state is the committed progress (offset of next data 279e716630dSMartin Matuska * to copy). We need to persist the completed offset on disk, so that if we 280e716630dSMartin Matuska * crash we know which format each VDEV offset is in. 281e716630dSMartin Matuska * 282e716630dSMartin Matuska * == Time Dependent Geometry == 283e716630dSMartin Matuska * 284e716630dSMartin Matuska * In non-expanded RAIDZ, blocks are read from disk in a column by column 285e716630dSMartin Matuska * fashion. For a multi-row block, the second sector is in the first column 286e716630dSMartin Matuska * not in the second column. This allows us to issue full reads for each 287e716630dSMartin Matuska * column directly into the request buffer. The block data is thus laid out 288e716630dSMartin Matuska * sequentially in a column-by-column fashion. 289e716630dSMartin Matuska * 290e716630dSMartin Matuska * For example, in the before expansion diagram above, one logical block might 291e716630dSMartin Matuska * be sectors G19-H26. The parity is in G19,H23; and the data is in 292e716630dSMartin Matuska * G20,H24,G21,H25,G22,H26. 293e716630dSMartin Matuska * 294e716630dSMartin Matuska * After a block is reflowed, the sectors that were all in the original column 295e716630dSMartin Matuska * data can now reside in different columns. When reading from an expanded 296e716630dSMartin Matuska * VDEV, we need to know the logical stripe width for each block so we can 297e716630dSMartin Matuska * reconstitute the block’s data after the reads are completed. Likewise, 298e716630dSMartin Matuska * when we perform the combinatorial reconstruction we need to know the 299e716630dSMartin Matuska * original width so we can retry combinations from the past layouts. 300e716630dSMartin Matuska * 301e716630dSMartin Matuska * Time dependent geometry is what we call having blocks with different layouts 302e716630dSMartin Matuska * (stripe widths) in the same VDEV. 
This time-dependent geometry uses the
303e716630dSMartin Matuska  * block's birth time (together with the time each expansion ended) to
304e716630dSMartin Matuska  * establish the correct width for a given block. After an expansion
305e716630dSMartin Matuska  * completes, we record the time for blocks written with that width (geometry).
306e716630dSMartin Matuska  *
307e716630dSMartin Matuska  * == On Disk Format Changes ==
308e716630dSMartin Matuska  *
309e716630dSMartin Matuska  * A new pool feature flag, 'raidz_expansion', is added; its reference count is
310e716630dSMartin Matuska  * the number of RAIDZ VDEVs that have been expanded.
311e716630dSMartin Matuska  *
312e716630dSMartin Matuska  * The blocks on an expanded RAIDZ VDEV can have different logical stripe widths.
313e716630dSMartin Matuska  *
314e716630dSMartin Matuska  * Since the uberblock can point to arbitrary blocks, which might be on the
315e716630dSMartin Matuska  * expanding RAIDZ and might or might not have been expanded, we need to know
316e716630dSMartin Matuska  * which way a block is laid out before reading it. This info is the next
317e716630dSMartin Matuska  * offset that needs to be reflowed, and we persist it in the uberblock, in
318e716630dSMartin Matuska  * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev label.
319e716630dSMartin Matuska  * After the expansion is complete, we then use the raidz_expand_txgs array
320e716630dSMartin Matuska  * (see below) to determine how to read a block, and the ub_raidz_reflow_info
321e716630dSMartin Matuska  * field is no longer required.
322e716630dSMartin Matuska  *
323e716630dSMartin Matuska  * The uberblock's ub_raidz_reflow_info field also holds the scratch space
324e716630dSMartin Matuska  * state (i.e., active or not), which is also required before reading a block
325e716630dSMartin Matuska  * during the initial phase of reflowing the data.
326e716630dSMartin Matuska  *
327e716630dSMartin Matuska  * The top-level RAIDZ VDEV has two new entries in the nvlist:
328e716630dSMartin Matuska  *
329e716630dSMartin Matuska  * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here
330e716630dSMartin Matuska  *                            and used after the expansion is complete to
331e716630dSMartin Matuska  *                            determine how to read a raidz block
332e716630dSMartin Matuska  * 'raidz_expanding' boolean: present during reflow and removed after
333e716630dSMartin Matuska  *                            completion; used during a spa import to resume
334e716630dSMartin Matuska  *                            an unfinished expansion
335e716630dSMartin Matuska  *
336e716630dSMartin Matuska  * Finally, the VDEV's top zap adds the following informational entries:
337e716630dSMartin Matuska  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE
338e716630dSMartin Matuska  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME
339e716630dSMartin Matuska  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME
340e716630dSMartin Matuska  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED
341e716630dSMartin Matuska  */
342e716630dSMartin Matuska 
343e716630dSMartin Matuska /*
344e716630dSMartin Matuska  * For testing only: pause the raidz expansion after reflowing this amount.
345e716630dSMartin Matuska  * (accessed by ZTS and ztest)
346e716630dSMartin Matuska  */
347e716630dSMartin Matuska #ifdef _KERNEL
348e716630dSMartin Matuska static
349e716630dSMartin Matuska #endif /* _KERNEL */
350e716630dSMartin Matuska unsigned long raidz_expand_max_reflow_bytes = 0;
351e716630dSMartin Matuska 
352e716630dSMartin Matuska /*
353e716630dSMartin Matuska  * For testing only: pause the raidz expansion at a certain point.
354e716630dSMartin Matuska */ 355e716630dSMartin Matuska uint_t raidz_expand_pause_point = 0; 356e716630dSMartin Matuska 357e716630dSMartin Matuska /* 358e716630dSMartin Matuska * Maximum amount of copy io's outstanding at once. 359e716630dSMartin Matuska */ 360e716630dSMartin Matuska static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE; 361e716630dSMartin Matuska 362e716630dSMartin Matuska /* 363e716630dSMartin Matuska * Apply raidz map abds aggregation if the number of rows in the map is equal 364e716630dSMartin Matuska * or greater than the value below. 365e716630dSMartin Matuska */ 366e716630dSMartin Matuska static unsigned long raidz_io_aggregate_rows = 4; 367e716630dSMartin Matuska 368e716630dSMartin Matuska /* 369e716630dSMartin Matuska * Automatically start a pool scrub when a RAIDZ expansion completes in 370e716630dSMartin Matuska * order to verify the checksums of all blocks which have been copied 371e716630dSMartin Matuska * during the expansion. Automatic scrubbing is enabled by default and 372e716630dSMartin Matuska * is strongly recommended. 373e716630dSMartin Matuska */ 374e716630dSMartin Matuska static int zfs_scrub_after_expand = 1; 375e716630dSMartin Matuska 3767877fdebSMatt Macy static void 3777877fdebSMatt Macy vdev_raidz_row_free(raidz_row_t *rr) 378eda14cbcSMatt Macy { 379184c1b94SMartin Matuska for (int c = 0; c < rr->rr_cols; c++) { 380184c1b94SMartin Matuska raidz_col_t *rc = &rr->rr_col[c]; 381eda14cbcSMatt Macy 382184c1b94SMartin Matuska if (rc->rc_size != 0) 383184c1b94SMartin Matuska abd_free(rc->rc_abd); 384184c1b94SMartin Matuska if (rc->rc_orig_data != NULL) 385f9693befSMartin Matuska abd_free(rc->rc_orig_data); 386eda14cbcSMatt Macy } 387eda14cbcSMatt Macy 3887877fdebSMatt Macy if (rr->rr_abd_empty != NULL) 3897877fdebSMatt Macy abd_free(rr->rr_abd_empty); 390eda14cbcSMatt Macy 3917877fdebSMatt Macy kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols])); 3927877fdebSMatt Macy } 3937877fdebSMatt Macy 3947877fdebSMatt Macy void 3957877fdebSMatt Macy vdev_raidz_map_free(raidz_map_t *rm) 3967877fdebSMatt Macy { 3977877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) 3987877fdebSMatt Macy vdev_raidz_row_free(rm->rm_row[i]); 3997877fdebSMatt Macy 400e716630dSMartin Matuska if (rm->rm_nphys_cols) { 401e716630dSMartin Matuska for (int i = 0; i < rm->rm_nphys_cols; i++) { 402e716630dSMartin Matuska if (rm->rm_phys_col[i].rc_abd != NULL) 403e716630dSMartin Matuska abd_free(rm->rm_phys_col[i].rc_abd); 404e716630dSMartin Matuska } 405e716630dSMartin Matuska 406e716630dSMartin Matuska kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) * 407e716630dSMartin Matuska rm->rm_nphys_cols); 408e716630dSMartin Matuska } 409e716630dSMartin Matuska 410e716630dSMartin Matuska ASSERT3P(rm->rm_lr, ==, NULL); 4117877fdebSMatt Macy kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows])); 412eda14cbcSMatt Macy } 413eda14cbcSMatt Macy 414eda14cbcSMatt Macy static void 415eda14cbcSMatt Macy vdev_raidz_map_free_vsd(zio_t *zio) 416eda14cbcSMatt Macy { 417eda14cbcSMatt Macy raidz_map_t *rm = zio->io_vsd; 418eda14cbcSMatt Macy 419eda14cbcSMatt Macy vdev_raidz_map_free(rm); 420eda14cbcSMatt Macy } 421eda14cbcSMatt Macy 422e716630dSMartin Matuska static int 423e716630dSMartin Matuska vdev_raidz_reflow_compare(const void *x1, const void *x2) 424e716630dSMartin Matuska { 425e716630dSMartin Matuska const reflow_node_t *l = x1; 426e716630dSMartin Matuska const reflow_node_t *r = x2; 427e716630dSMartin Matuska 428e716630dSMartin Matuska return (TREE_CMP(l->re_txg, 
r->re_txg)); 429e716630dSMartin Matuska } 430e716630dSMartin Matuska 431f9693befSMartin Matuska const zio_vsd_ops_t vdev_raidz_vsd_ops = { 432eda14cbcSMatt Macy .vsd_free = vdev_raidz_map_free_vsd, 433eda14cbcSMatt Macy }; 434eda14cbcSMatt Macy 435e716630dSMartin Matuska raidz_row_t * 436e716630dSMartin Matuska vdev_raidz_row_alloc(int cols) 437e716630dSMartin Matuska { 438e716630dSMartin Matuska raidz_row_t *rr = 439e716630dSMartin Matuska kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP); 440e716630dSMartin Matuska 441e716630dSMartin Matuska rr->rr_cols = cols; 442e716630dSMartin Matuska rr->rr_scols = cols; 443e716630dSMartin Matuska 444e716630dSMartin Matuska for (int c = 0; c < cols; c++) { 445e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c]; 446e716630dSMartin Matuska rc->rc_shadow_devidx = INT_MAX; 447e716630dSMartin Matuska rc->rc_shadow_offset = UINT64_MAX; 448e716630dSMartin Matuska rc->rc_allow_repair = 1; 449e716630dSMartin Matuska } 450e716630dSMartin Matuska return (rr); 451e716630dSMartin Matuska } 452e716630dSMartin Matuska 45381b22a98SMartin Matuska static void 45481b22a98SMartin Matuska vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift) 45581b22a98SMartin Matuska { 45681b22a98SMartin Matuska int c; 45781b22a98SMartin Matuska int nwrapped = 0; 45881b22a98SMartin Matuska uint64_t off = 0; 45981b22a98SMartin Matuska raidz_row_t *rr = rm->rm_row[0]; 46081b22a98SMartin Matuska 46181b22a98SMartin Matuska ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); 46281b22a98SMartin Matuska ASSERT3U(rm->rm_nrows, ==, 1); 46381b22a98SMartin Matuska 46481b22a98SMartin Matuska /* 46581b22a98SMartin Matuska * Pad any parity columns with additional space to account for skip 46681b22a98SMartin Matuska * sectors. 46781b22a98SMartin Matuska */ 46881b22a98SMartin Matuska if (rm->rm_skipstart < rr->rr_firstdatacol) { 46981b22a98SMartin Matuska ASSERT0(rm->rm_skipstart); 47081b22a98SMartin Matuska nwrapped = rm->rm_nskip; 47181b22a98SMartin Matuska } else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) { 47281b22a98SMartin Matuska nwrapped = 47381b22a98SMartin Matuska (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols; 47481b22a98SMartin Matuska } 47581b22a98SMartin Matuska 47681b22a98SMartin Matuska /* 47781b22a98SMartin Matuska * Optional single skip sectors (rc_size == 0) will be handled in 47881b22a98SMartin Matuska * vdev_raidz_io_start_write(). 47981b22a98SMartin Matuska */ 48081b22a98SMartin Matuska int skipped = rr->rr_scols - rr->rr_cols; 48181b22a98SMartin Matuska 48281b22a98SMartin Matuska /* Allocate buffers for the parity columns */ 48381b22a98SMartin Matuska for (c = 0; c < rr->rr_firstdatacol; c++) { 48481b22a98SMartin Matuska raidz_col_t *rc = &rr->rr_col[c]; 48581b22a98SMartin Matuska 48681b22a98SMartin Matuska /* 48781b22a98SMartin Matuska * Parity columns will pad out a linear ABD to account for 48881b22a98SMartin Matuska * the skip sector. A linear ABD is used here because 48981b22a98SMartin Matuska * parity calculations use the ABD buffer directly to calculate 49081b22a98SMartin Matuska * parity. This avoids doing a memcpy back to the ABD after the 49181b22a98SMartin Matuska * parity has been calculated. By issuing the parity column 49281b22a98SMartin Matuska * with the skip sector we can reduce contention on the child 49381b22a98SMartin Matuska * VDEV queue locks (vq_lock). 
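	 *
	 * For example (illustrative, assuming ashift == 12, i.e. 4 KiB
	 * sectors): a wrapped parity column is allocated as rc_size + 4 KiB
	 * of linear ABD and the trailing 4 KiB is zeroed by the
	 * abd_zero_off() call below, so the skip sector is issued as part of
	 * the same child I/O.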
49481b22a98SMartin Matuska */ 49581b22a98SMartin Matuska if (c < nwrapped) { 49681b22a98SMartin Matuska rc->rc_abd = abd_alloc_linear( 49781b22a98SMartin Matuska rc->rc_size + (1ULL << ashift), B_FALSE); 49881b22a98SMartin Matuska abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift); 49981b22a98SMartin Matuska skipped++; 50081b22a98SMartin Matuska } else { 50181b22a98SMartin Matuska rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); 50281b22a98SMartin Matuska } 50381b22a98SMartin Matuska } 50481b22a98SMartin Matuska 50581b22a98SMartin Matuska for (off = 0; c < rr->rr_cols; c++) { 50681b22a98SMartin Matuska raidz_col_t *rc = &rr->rr_col[c]; 50781b22a98SMartin Matuska abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct, 50881b22a98SMartin Matuska zio->io_abd, off, rc->rc_size); 50981b22a98SMartin Matuska 51081b22a98SMartin Matuska /* 51181b22a98SMartin Matuska * Generate I/O for skip sectors to improve aggregation 51281b22a98SMartin Matuska * continuity. We will use gang ABD's to reduce contention 51381b22a98SMartin Matuska * on the child VDEV queue locks (vq_lock) by issuing 51481b22a98SMartin Matuska * a single I/O that contains the data and skip sector. 51581b22a98SMartin Matuska * 51681b22a98SMartin Matuska * It is important to make sure that rc_size is not updated 51781b22a98SMartin Matuska * even though we are adding a skip sector to the ABD. When 51881b22a98SMartin Matuska * calculating the parity in vdev_raidz_generate_parity_row() 51981b22a98SMartin Matuska * the rc_size is used to iterate through the ABD's. We can 52081b22a98SMartin Matuska * not have zero'd out skip sectors used for calculating 52181b22a98SMartin Matuska * parity for raidz, because those same sectors are not used 52281b22a98SMartin Matuska * during reconstruction. 52381b22a98SMartin Matuska */ 52481b22a98SMartin Matuska if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) { 52581b22a98SMartin Matuska rc->rc_abd = abd_alloc_gang(); 52681b22a98SMartin Matuska abd_gang_add(rc->rc_abd, abd, B_TRUE); 52781b22a98SMartin Matuska abd_gang_add(rc->rc_abd, 52881b22a98SMartin Matuska abd_get_zeros(1ULL << ashift), B_TRUE); 52981b22a98SMartin Matuska skipped++; 53081b22a98SMartin Matuska } else { 53181b22a98SMartin Matuska rc->rc_abd = abd; 53281b22a98SMartin Matuska } 53381b22a98SMartin Matuska off += rc->rc_size; 53481b22a98SMartin Matuska } 53581b22a98SMartin Matuska 53681b22a98SMartin Matuska ASSERT3U(off, ==, zio->io_size); 53781b22a98SMartin Matuska ASSERT3S(skipped, ==, rm->rm_nskip); 53881b22a98SMartin Matuska } 53981b22a98SMartin Matuska 54081b22a98SMartin Matuska static void 54181b22a98SMartin Matuska vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm) 54281b22a98SMartin Matuska { 54381b22a98SMartin Matuska int c; 54481b22a98SMartin Matuska raidz_row_t *rr = rm->rm_row[0]; 54581b22a98SMartin Matuska 54681b22a98SMartin Matuska ASSERT3U(rm->rm_nrows, ==, 1); 54781b22a98SMartin Matuska 54881b22a98SMartin Matuska /* Allocate buffers for the parity columns */ 54981b22a98SMartin Matuska for (c = 0; c < rr->rr_firstdatacol; c++) 55081b22a98SMartin Matuska rr->rr_col[c].rc_abd = 55181b22a98SMartin Matuska abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE); 55281b22a98SMartin Matuska 55381b22a98SMartin Matuska for (uint64_t off = 0; c < rr->rr_cols; c++) { 55481b22a98SMartin Matuska raidz_col_t *rc = &rr->rr_col[c]; 55581b22a98SMartin Matuska rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct, 55681b22a98SMartin Matuska zio->io_abd, off, rc->rc_size); 55781b22a98SMartin Matuska off += rc->rc_size; 55881b22a98SMartin 
Matuska } 55981b22a98SMartin Matuska } 56081b22a98SMartin Matuska 561eda14cbcSMatt Macy /* 562eda14cbcSMatt Macy * Divides the IO evenly across all child vdevs; usually, dcols is 563eda14cbcSMatt Macy * the number of children in the target vdev. 564eda14cbcSMatt Macy * 565eda14cbcSMatt Macy * Avoid inlining the function to keep vdev_raidz_io_start(), which 566eda14cbcSMatt Macy * is this functions only caller, as small as possible on the stack. 567eda14cbcSMatt Macy */ 568eda14cbcSMatt Macy noinline raidz_map_t * 569eda14cbcSMatt Macy vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, 570eda14cbcSMatt Macy uint64_t nparity) 571eda14cbcSMatt Macy { 5727877fdebSMatt Macy raidz_row_t *rr; 573eda14cbcSMatt Macy /* The starting RAIDZ (parent) vdev sector of the block. */ 574eda14cbcSMatt Macy uint64_t b = zio->io_offset >> ashift; 575eda14cbcSMatt Macy /* The zio's size in units of the vdev's minimum sector size. */ 576eda14cbcSMatt Macy uint64_t s = zio->io_size >> ashift; 577eda14cbcSMatt Macy /* The first column for this stripe. */ 578eda14cbcSMatt Macy uint64_t f = b % dcols; 579eda14cbcSMatt Macy /* The starting byte offset on each child vdev. */ 580eda14cbcSMatt Macy uint64_t o = (b / dcols) << ashift; 581e716630dSMartin Matuska uint64_t acols, scols; 582eda14cbcSMatt Macy 5837877fdebSMatt Macy raidz_map_t *rm = 5847877fdebSMatt Macy kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP); 5857877fdebSMatt Macy rm->rm_nrows = 1; 5867877fdebSMatt Macy 587eda14cbcSMatt Macy /* 588eda14cbcSMatt Macy * "Quotient": The number of data sectors for this stripe on all but 589eda14cbcSMatt Macy * the "big column" child vdevs that also contain "remainder" data. 590eda14cbcSMatt Macy */ 591e716630dSMartin Matuska uint64_t q = s / (dcols - nparity); 592eda14cbcSMatt Macy 593eda14cbcSMatt Macy /* 594eda14cbcSMatt Macy * "Remainder": The number of partial stripe data sectors in this I/O. 595eda14cbcSMatt Macy * This will add a sector to some, but not all, child vdevs. 596eda14cbcSMatt Macy */ 597e716630dSMartin Matuska uint64_t r = s - q * (dcols - nparity); 598eda14cbcSMatt Macy 599eda14cbcSMatt Macy /* The number of "big columns" - those which contain remainder data. */ 600e716630dSMartin Matuska uint64_t bc = (r == 0 ? 0 : r + nparity); 601eda14cbcSMatt Macy 602eda14cbcSMatt Macy /* 603eda14cbcSMatt Macy * The total number of data and parity sectors associated with 604eda14cbcSMatt Macy * this I/O. 605eda14cbcSMatt Macy */ 606e716630dSMartin Matuska uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1)); 607eda14cbcSMatt Macy 6087877fdebSMatt Macy /* 6097877fdebSMatt Macy * acols: The columns that will be accessed. 6107877fdebSMatt Macy * scols: The columns that will be accessed or skipped. 6117877fdebSMatt Macy */ 612eda14cbcSMatt Macy if (q == 0) { 613eda14cbcSMatt Macy /* Our I/O request doesn't span all child vdevs. 
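		 * For example (hypothetical numbers): a single-sector write to
		 * a 5-wide RAIDZ1 gives q = 0, r = 1, bc = 2, so only two
		 * children (one parity and one data column) are accessed.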
*/ 614eda14cbcSMatt Macy acols = bc; 615eda14cbcSMatt Macy scols = MIN(dcols, roundup(bc, nparity + 1)); 616eda14cbcSMatt Macy } else { 617eda14cbcSMatt Macy acols = dcols; 618eda14cbcSMatt Macy scols = dcols; 619eda14cbcSMatt Macy } 620eda14cbcSMatt Macy 621eda14cbcSMatt Macy ASSERT3U(acols, <=, scols); 622e716630dSMartin Matuska rr = vdev_raidz_row_alloc(scols); 6237877fdebSMatt Macy rm->rm_row[0] = rr; 6247877fdebSMatt Macy rr->rr_cols = acols; 6257877fdebSMatt Macy rr->rr_bigcols = bc; 6267877fdebSMatt Macy rr->rr_firstdatacol = nparity; 6277877fdebSMatt Macy #ifdef ZFS_DEBUG 6287877fdebSMatt Macy rr->rr_offset = zio->io_offset; 6297877fdebSMatt Macy rr->rr_size = zio->io_size; 6307877fdebSMatt Macy #endif 631eda14cbcSMatt Macy 632e716630dSMartin Matuska uint64_t asize = 0; 633eda14cbcSMatt Macy 634e716630dSMartin Matuska for (uint64_t c = 0; c < scols; c++) { 6357877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 636e716630dSMartin Matuska uint64_t col = f + c; 637e716630dSMartin Matuska uint64_t coff = o; 638eda14cbcSMatt Macy if (col >= dcols) { 639eda14cbcSMatt Macy col -= dcols; 640eda14cbcSMatt Macy coff += 1ULL << ashift; 641eda14cbcSMatt Macy } 6427877fdebSMatt Macy rc->rc_devidx = col; 6437877fdebSMatt Macy rc->rc_offset = coff; 644eda14cbcSMatt Macy 645eda14cbcSMatt Macy if (c >= acols) 6467877fdebSMatt Macy rc->rc_size = 0; 647eda14cbcSMatt Macy else if (c < bc) 6487877fdebSMatt Macy rc->rc_size = (q + 1) << ashift; 649eda14cbcSMatt Macy else 6507877fdebSMatt Macy rc->rc_size = q << ashift; 651eda14cbcSMatt Macy 6527877fdebSMatt Macy asize += rc->rc_size; 653eda14cbcSMatt Macy } 654eda14cbcSMatt Macy 655eda14cbcSMatt Macy ASSERT3U(asize, ==, tot << ashift); 656eda14cbcSMatt Macy rm->rm_nskip = roundup(tot, nparity + 1) - tot; 6577877fdebSMatt Macy rm->rm_skipstart = bc; 658eda14cbcSMatt Macy 659eda14cbcSMatt Macy /* 660eda14cbcSMatt Macy * If all data stored spans all columns, there's a danger that parity 661eda14cbcSMatt Macy * will always be on the same device and, since parity isn't read 662eda14cbcSMatt Macy * during normal operation, that device's I/O bandwidth won't be 663eda14cbcSMatt Macy * used effectively. We therefore switch the parity every 1MB. 664eda14cbcSMatt Macy * 665eda14cbcSMatt Macy * ... at least that was, ostensibly, the theory. As a practical 666eda14cbcSMatt Macy * matter unless we juggle the parity between all devices evenly, we 667eda14cbcSMatt Macy * won't see any benefit. Further, occasional writes that aren't a 668eda14cbcSMatt Macy * multiple of the LCM of the number of children and the minimum 669eda14cbcSMatt Macy * stripe width are sufficient to avoid pessimal behavior. 670eda14cbcSMatt Macy * Unfortunately, this decision created an implicit on-disk format 671eda14cbcSMatt Macy * requirement that we need to support for all eternity, but only 672eda14cbcSMatt Macy * for single-parity RAID-Z. 673eda14cbcSMatt Macy * 674eda14cbcSMatt Macy * If we intend to skip a sector in the zeroth column for padding 675eda14cbcSMatt Macy * we must make sure to note this swap. We will never intend to 676eda14cbcSMatt Macy * skip the first column since at least one data and one parity 677eda14cbcSMatt Macy * column must appear in each row. 
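	 *
	 * Concretely, as the code below shows, for single-parity maps the
	 * parity column (column 0) and the first data column (column 1) are
	 * swapped whenever bit 20 of the block's offset is set, so the parity
	 * location alternates every 1 MiB of vdev offset.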
678eda14cbcSMatt Macy */ 6797877fdebSMatt Macy ASSERT(rr->rr_cols >= 2); 6807877fdebSMatt Macy ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); 681eda14cbcSMatt Macy 6827877fdebSMatt Macy if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { 683e716630dSMartin Matuska uint64_t devidx = rr->rr_col[0].rc_devidx; 6847877fdebSMatt Macy o = rr->rr_col[0].rc_offset; 6857877fdebSMatt Macy rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; 6867877fdebSMatt Macy rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; 6877877fdebSMatt Macy rr->rr_col[1].rc_devidx = devidx; 6887877fdebSMatt Macy rr->rr_col[1].rc_offset = o; 689eda14cbcSMatt Macy if (rm->rm_skipstart == 0) 690eda14cbcSMatt Macy rm->rm_skipstart = 1; 691eda14cbcSMatt Macy } 692eda14cbcSMatt Macy 69381b22a98SMartin Matuska if (zio->io_type == ZIO_TYPE_WRITE) { 69481b22a98SMartin Matuska vdev_raidz_map_alloc_write(zio, rm, ashift); 69581b22a98SMartin Matuska } else { 69681b22a98SMartin Matuska vdev_raidz_map_alloc_read(zio, rm); 69781b22a98SMartin Matuska } 698e716630dSMartin Matuska /* init RAIDZ parity ops */ 699e716630dSMartin Matuska rm->rm_ops = vdev_raidz_math_get_ops(); 70081b22a98SMartin Matuska 701e716630dSMartin Matuska return (rm); 702e716630dSMartin Matuska } 703e716630dSMartin Matuska 704e716630dSMartin Matuska /* 705e716630dSMartin Matuska * Everything before reflow_offset_synced should have been moved to the new 706e716630dSMartin Matuska * location (read and write completed). However, this may not yet be reflected 707e716630dSMartin Matuska * in the on-disk format (e.g. raidz_reflow_sync() has been called but the 708e716630dSMartin Matuska * uberblock has not yet been written). If reflow is not in progress, 709e716630dSMartin Matuska * reflow_offset_synced should be UINT64_MAX. For each row, if the row is 710e716630dSMartin Matuska * entirely before reflow_offset_synced, it will come from the new location. 711e716630dSMartin Matuska * Otherwise this row will come from the old location. Therefore, rows that 712e716630dSMartin Matuska * straddle the reflow_offset_synced will come from the old location. 713e716630dSMartin Matuska * 714e716630dSMartin Matuska * For writes, reflow_offset_next is the next offset to copy. If a sector has 715e716630dSMartin Matuska * been copied, but not yet reflected in the on-disk progress 716e716630dSMartin Matuska * (reflow_offset_synced), it will also be written to the new (already copied) 717e716630dSMartin Matuska * offset. 718e716630dSMartin Matuska */ 719e716630dSMartin Matuska noinline raidz_map_t * 720e716630dSMartin Matuska vdev_raidz_map_alloc_expanded(zio_t *zio, 721e716630dSMartin Matuska uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols, 722e716630dSMartin Matuska uint64_t nparity, uint64_t reflow_offset_synced, 723e716630dSMartin Matuska uint64_t reflow_offset_next, boolean_t use_scratch) 724e716630dSMartin Matuska { 725e716630dSMartin Matuska abd_t *abd = zio->io_abd; 726e716630dSMartin Matuska uint64_t offset = zio->io_offset; 727e716630dSMartin Matuska uint64_t size = zio->io_size; 728e716630dSMartin Matuska 729e716630dSMartin Matuska /* The zio's size in units of the vdev's minimum sector size. */ 730e716630dSMartin Matuska uint64_t s = size >> ashift; 731e716630dSMartin Matuska 732e716630dSMartin Matuska /* 733e716630dSMartin Matuska * "Quotient": The number of data sectors for this stripe on all but 734e716630dSMartin Matuska * the "big column" child vdevs that also contain "remainder" data. 
735e716630dSMartin Matuska * AKA "full rows" 736e716630dSMartin Matuska */ 737e716630dSMartin Matuska uint64_t q = s / (logical_cols - nparity); 738e716630dSMartin Matuska 739e716630dSMartin Matuska /* 740e716630dSMartin Matuska * "Remainder": The number of partial stripe data sectors in this I/O. 741e716630dSMartin Matuska * This will add a sector to some, but not all, child vdevs. 742e716630dSMartin Matuska */ 743e716630dSMartin Matuska uint64_t r = s - q * (logical_cols - nparity); 744e716630dSMartin Matuska 745e716630dSMartin Matuska /* The number of "big columns" - those which contain remainder data. */ 746e716630dSMartin Matuska uint64_t bc = (r == 0 ? 0 : r + nparity); 747e716630dSMartin Matuska 748e716630dSMartin Matuska /* 749e716630dSMartin Matuska * The total number of data and parity sectors associated with 750e716630dSMartin Matuska * this I/O. 751e716630dSMartin Matuska */ 752e716630dSMartin Matuska uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1)); 753e716630dSMartin Matuska 754e716630dSMartin Matuska /* How many rows contain data (not skip) */ 755e716630dSMartin Matuska uint64_t rows = howmany(tot, logical_cols); 756e716630dSMartin Matuska int cols = MIN(tot, logical_cols); 757e716630dSMartin Matuska 758e716630dSMartin Matuska raidz_map_t *rm = 759e716630dSMartin Matuska kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]), 760e716630dSMartin Matuska KM_SLEEP); 761e716630dSMartin Matuska rm->rm_nrows = rows; 762e716630dSMartin Matuska rm->rm_nskip = roundup(tot, nparity + 1) - tot; 763e716630dSMartin Matuska rm->rm_skipstart = bc; 764e716630dSMartin Matuska uint64_t asize = 0; 765e716630dSMartin Matuska 766e716630dSMartin Matuska for (uint64_t row = 0; row < rows; row++) { 767e716630dSMartin Matuska boolean_t row_use_scratch = B_FALSE; 768e716630dSMartin Matuska raidz_row_t *rr = vdev_raidz_row_alloc(cols); 769e716630dSMartin Matuska rm->rm_row[row] = rr; 770e716630dSMartin Matuska 771e716630dSMartin Matuska /* The starting RAIDZ (parent) vdev sector of the row. */ 772e716630dSMartin Matuska uint64_t b = (offset >> ashift) + row * logical_cols; 773e716630dSMartin Matuska 774e716630dSMartin Matuska /* 775e716630dSMartin Matuska * If we are in the middle of a reflow, and the copying has 776e716630dSMartin Matuska * not yet completed for any part of this row, then use the 777e716630dSMartin Matuska * old location of this row. Note that reflow_offset_synced 778e716630dSMartin Matuska * reflects the i/o that's been completed, because it's 779e716630dSMartin Matuska * updated by a synctask, after zio_wait(spa_txg_zio[]). 780e716630dSMartin Matuska * This is sufficient for our check, even if that progress 781e716630dSMartin Matuska * has not yet been recorded to disk (reflected in 782e716630dSMartin Matuska * spa_ubsync). Also note that we consider the last row to 783e716630dSMartin Matuska * be "full width" (`cols`-wide rather than `bc`-wide) for 784e716630dSMartin Matuska * this calculation. This causes a tiny bit of unnecessary 785e716630dSMartin Matuska * double-writes but is safe and simpler to calculate. 
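		 *
		 * For example (hypothetical numbers): if reflow_offset_synced
		 * corresponds to sector 1000 and this row starts at sector 995
		 * with cols = 10, then b + cols > 1000, so the row straddles
		 * the copied region and is read from the old (one column
		 * narrower) layout.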
786e716630dSMartin Matuska */ 787e716630dSMartin Matuska int row_phys_cols = physical_cols; 788e716630dSMartin Matuska if (b + cols > reflow_offset_synced >> ashift) 789e716630dSMartin Matuska row_phys_cols--; 790e716630dSMartin Matuska else if (use_scratch) 791e716630dSMartin Matuska row_use_scratch = B_TRUE; 792e716630dSMartin Matuska 793e716630dSMartin Matuska /* starting child of this row */ 794e716630dSMartin Matuska uint64_t child_id = b % row_phys_cols; 795e716630dSMartin Matuska /* The starting byte offset on each child vdev. */ 796e716630dSMartin Matuska uint64_t child_offset = (b / row_phys_cols) << ashift; 797e716630dSMartin Matuska 798e716630dSMartin Matuska /* 799e716630dSMartin Matuska * Note, rr_cols is the entire width of the block, even 800e716630dSMartin Matuska * if this row is shorter. This is needed because parity 801e716630dSMartin Matuska * generation (for Q and R) needs to know the entire width, 802e716630dSMartin Matuska * because it treats the short row as though it was 803e716630dSMartin Matuska * full-width (and the "phantom" sectors were zero-filled). 804e716630dSMartin Matuska * 805e716630dSMartin Matuska * Another approach to this would be to set cols shorter 806e716630dSMartin Matuska * (to just the number of columns that we might do i/o to) 807e716630dSMartin Matuska * and have another mechanism to tell the parity generation 808e716630dSMartin Matuska * about the "entire width". Reconstruction (at least 809e716630dSMartin Matuska * vdev_raidz_reconstruct_general()) would also need to 810e716630dSMartin Matuska * know about the "entire width". 811e716630dSMartin Matuska */ 812e716630dSMartin Matuska rr->rr_firstdatacol = nparity; 813e716630dSMartin Matuska #ifdef ZFS_DEBUG 814e716630dSMartin Matuska /* 815e716630dSMartin Matuska * note: rr_size is PSIZE, not ASIZE 816e716630dSMartin Matuska */ 817e716630dSMartin Matuska rr->rr_offset = b << ashift; 818e716630dSMartin Matuska rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift; 819e716630dSMartin Matuska #endif 820e716630dSMartin Matuska 821e716630dSMartin Matuska for (int c = 0; c < rr->rr_cols; c++, child_id++) { 822e716630dSMartin Matuska if (child_id >= row_phys_cols) { 823e716630dSMartin Matuska child_id -= row_phys_cols; 824e716630dSMartin Matuska child_offset += 1ULL << ashift; 825e716630dSMartin Matuska } 826e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c]; 827e716630dSMartin Matuska rc->rc_devidx = child_id; 828e716630dSMartin Matuska rc->rc_offset = child_offset; 829e716630dSMartin Matuska 830e716630dSMartin Matuska /* 831e716630dSMartin Matuska * Get this from the scratch space if appropriate. 832e716630dSMartin Matuska * This only happens if we crashed in the middle of 833e716630dSMartin Matuska * raidz_reflow_scratch_sync() (while it's running, 834e716630dSMartin Matuska * the rangelock prevents us from doing concurrent 835e716630dSMartin Matuska * io), and even then only during zpool import or 836e716630dSMartin Matuska * when the pool is imported readonly. 
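			 * The scratch copy lives in the reserved boot area
			 * just before the allocatable space (see the "Scratch
			 * Area" diagram above), which is presumably why the
			 * offset is pulled back by VDEV_BOOT_SIZE below.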
837e716630dSMartin Matuska */ 838e716630dSMartin Matuska if (row_use_scratch) 839e716630dSMartin Matuska rc->rc_offset -= VDEV_BOOT_SIZE; 840e716630dSMartin Matuska 841e716630dSMartin Matuska uint64_t dc = c - rr->rr_firstdatacol; 842e716630dSMartin Matuska if (c < rr->rr_firstdatacol) { 843e716630dSMartin Matuska rc->rc_size = 1ULL << ashift; 844e716630dSMartin Matuska 845e716630dSMartin Matuska /* 846e716630dSMartin Matuska * Parity sectors' rc_abd's are set below 847e716630dSMartin Matuska * after determining if this is an aggregation. 848e716630dSMartin Matuska */ 849e716630dSMartin Matuska } else if (row == rows - 1 && bc != 0 && c >= bc) { 850e716630dSMartin Matuska /* 851e716630dSMartin Matuska * Past the end of the block (even including 852e716630dSMartin Matuska * skip sectors). This sector is part of the 853e716630dSMartin Matuska * map so that we have full rows for p/q parity 854e716630dSMartin Matuska * generation. 855e716630dSMartin Matuska */ 856e716630dSMartin Matuska rc->rc_size = 0; 857e716630dSMartin Matuska rc->rc_abd = NULL; 858e716630dSMartin Matuska } else { 859e716630dSMartin Matuska /* "data column" (col excluding parity) */ 860e716630dSMartin Matuska uint64_t off; 861e716630dSMartin Matuska 862e716630dSMartin Matuska if (c < bc || r == 0) { 863e716630dSMartin Matuska off = dc * rows + row; 864e716630dSMartin Matuska } else { 865e716630dSMartin Matuska off = r * rows + 866e716630dSMartin Matuska (dc - r) * (rows - 1) + row; 867e716630dSMartin Matuska } 868e716630dSMartin Matuska rc->rc_size = 1ULL << ashift; 869e716630dSMartin Matuska rc->rc_abd = abd_get_offset_struct( 870e716630dSMartin Matuska &rc->rc_abdstruct, abd, off << ashift, 871e716630dSMartin Matuska rc->rc_size); 872e716630dSMartin Matuska } 873e716630dSMartin Matuska 874e716630dSMartin Matuska if (rc->rc_size == 0) 875e716630dSMartin Matuska continue; 876e716630dSMartin Matuska 877e716630dSMartin Matuska /* 878e716630dSMartin Matuska * If any part of this row is in both old and new 879e716630dSMartin Matuska * locations, the primary location is the old 880e716630dSMartin Matuska * location. If this sector was already copied to the 881e716630dSMartin Matuska * new location, we need to also write to the new, 882e716630dSMartin Matuska * "shadow" location. 883e716630dSMartin Matuska * 884e716630dSMartin Matuska * Note, `row_phys_cols != physical_cols` indicates 885e716630dSMartin Matuska * that the primary location is the old location. 886e716630dSMartin Matuska * `b+c < reflow_offset_next` indicates that the copy 887e716630dSMartin Matuska * to the new location has been initiated. We know 888e716630dSMartin Matuska * that the copy has completed because we have the 889e716630dSMartin Matuska * rangelock, which is held exclusively while the 890e716630dSMartin Matuska * copy is in progress. 
891e716630dSMartin Matuska */ 892e716630dSMartin Matuska if (row_use_scratch || 893e716630dSMartin Matuska (row_phys_cols != physical_cols && 894e716630dSMartin Matuska b + c < reflow_offset_next >> ashift)) { 895e716630dSMartin Matuska rc->rc_shadow_devidx = (b + c) % physical_cols; 896e716630dSMartin Matuska rc->rc_shadow_offset = 897e716630dSMartin Matuska ((b + c) / physical_cols) << ashift; 898e716630dSMartin Matuska if (row_use_scratch) 899e716630dSMartin Matuska rc->rc_shadow_offset -= VDEV_BOOT_SIZE; 900e716630dSMartin Matuska } 901e716630dSMartin Matuska 902e716630dSMartin Matuska asize += rc->rc_size; 903e716630dSMartin Matuska } 904e716630dSMartin Matuska 905e716630dSMartin Matuska /* 906e716630dSMartin Matuska * See comment in vdev_raidz_map_alloc() 907e716630dSMartin Matuska */ 908e716630dSMartin Matuska if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 && 909e716630dSMartin Matuska (offset & (1ULL << 20))) { 910e716630dSMartin Matuska ASSERT(rr->rr_cols >= 2); 911e716630dSMartin Matuska ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); 912e716630dSMartin Matuska 913e716630dSMartin Matuska int devidx0 = rr->rr_col[0].rc_devidx; 914e716630dSMartin Matuska uint64_t offset0 = rr->rr_col[0].rc_offset; 915e716630dSMartin Matuska int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx; 916e716630dSMartin Matuska uint64_t shadow_offset0 = 917e716630dSMartin Matuska rr->rr_col[0].rc_shadow_offset; 918e716630dSMartin Matuska 919e716630dSMartin Matuska rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; 920e716630dSMartin Matuska rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; 921e716630dSMartin Matuska rr->rr_col[0].rc_shadow_devidx = 922e716630dSMartin Matuska rr->rr_col[1].rc_shadow_devidx; 923e716630dSMartin Matuska rr->rr_col[0].rc_shadow_offset = 924e716630dSMartin Matuska rr->rr_col[1].rc_shadow_offset; 925e716630dSMartin Matuska 926e716630dSMartin Matuska rr->rr_col[1].rc_devidx = devidx0; 927e716630dSMartin Matuska rr->rr_col[1].rc_offset = offset0; 928e716630dSMartin Matuska rr->rr_col[1].rc_shadow_devidx = shadow_devidx0; 929e716630dSMartin Matuska rr->rr_col[1].rc_shadow_offset = shadow_offset0; 930e716630dSMartin Matuska } 931e716630dSMartin Matuska } 932e716630dSMartin Matuska ASSERT3U(asize, ==, tot << ashift); 933e716630dSMartin Matuska 934e716630dSMartin Matuska /* 935e716630dSMartin Matuska * Determine if the block is contiguous, in which case we can use 936e716630dSMartin Matuska * an aggregation. 937e716630dSMartin Matuska */ 938e716630dSMartin Matuska if (rows >= raidz_io_aggregate_rows) { 939e716630dSMartin Matuska rm->rm_nphys_cols = physical_cols; 940e716630dSMartin Matuska rm->rm_phys_col = 941e716630dSMartin Matuska kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols, 942e716630dSMartin Matuska KM_SLEEP); 943e716630dSMartin Matuska 944e716630dSMartin Matuska /* 945e716630dSMartin Matuska * Determine the aggregate io's offset and size, and check 946e716630dSMartin Matuska * that the io is contiguous. 
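		 * A column is contiguous when each row's sector for that child
		 * starts exactly where the previous row's sector ended
		 * (prc->rc_offset + prc->rc_size == rc->rc_offset); otherwise
		 * the aggregation is abandoned below.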
947e716630dSMartin Matuska */ 948e716630dSMartin Matuska for (int i = 0; 949e716630dSMartin Matuska i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) { 950e716630dSMartin Matuska raidz_row_t *rr = rm->rm_row[i]; 951e716630dSMartin Matuska for (int c = 0; c < rr->rr_cols; c++) { 952e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c]; 953e716630dSMartin Matuska raidz_col_t *prc = 954e716630dSMartin Matuska &rm->rm_phys_col[rc->rc_devidx]; 955e716630dSMartin Matuska 956e716630dSMartin Matuska if (rc->rc_size == 0) 957e716630dSMartin Matuska continue; 958e716630dSMartin Matuska 959e716630dSMartin Matuska if (prc->rc_size == 0) { 960e716630dSMartin Matuska ASSERT0(prc->rc_offset); 961e716630dSMartin Matuska prc->rc_offset = rc->rc_offset; 962e716630dSMartin Matuska } else if (prc->rc_offset + prc->rc_size != 963e716630dSMartin Matuska rc->rc_offset) { 964e716630dSMartin Matuska /* 965e716630dSMartin Matuska * This block is not contiguous and 966e716630dSMartin Matuska * therefore can't be aggregated. 967e716630dSMartin Matuska * This is expected to be rare, so 968e716630dSMartin Matuska * the cost of allocating and then 969e716630dSMartin Matuska * freeing rm_phys_col is not 970e716630dSMartin Matuska * significant. 971e716630dSMartin Matuska */ 972e716630dSMartin Matuska kmem_free(rm->rm_phys_col, 973e716630dSMartin Matuska sizeof (raidz_col_t) * 974e716630dSMartin Matuska rm->rm_nphys_cols); 975e716630dSMartin Matuska rm->rm_phys_col = NULL; 976e716630dSMartin Matuska rm->rm_nphys_cols = 0; 977e716630dSMartin Matuska break; 978e716630dSMartin Matuska } 979e716630dSMartin Matuska prc->rc_size += rc->rc_size; 980e716630dSMartin Matuska } 981e716630dSMartin Matuska } 982e716630dSMartin Matuska } 983e716630dSMartin Matuska if (rm->rm_phys_col != NULL) { 984e716630dSMartin Matuska /* 985e716630dSMartin Matuska * Allocate aggregate ABD's. 986e716630dSMartin Matuska */ 987e716630dSMartin Matuska for (int i = 0; i < rm->rm_nphys_cols; i++) { 988e716630dSMartin Matuska raidz_col_t *prc = &rm->rm_phys_col[i]; 989e716630dSMartin Matuska 990e716630dSMartin Matuska prc->rc_devidx = i; 991e716630dSMartin Matuska 992e716630dSMartin Matuska if (prc->rc_size == 0) 993e716630dSMartin Matuska continue; 994e716630dSMartin Matuska 995e716630dSMartin Matuska prc->rc_abd = 996e716630dSMartin Matuska abd_alloc_linear(rm->rm_phys_col[i].rc_size, 997e716630dSMartin Matuska B_FALSE); 998e716630dSMartin Matuska } 999e716630dSMartin Matuska 1000e716630dSMartin Matuska /* 1001e716630dSMartin Matuska * Point the parity abd's into the aggregate abd's. 1002e716630dSMartin Matuska */ 1003e716630dSMartin Matuska for (int i = 0; i < rm->rm_nrows; i++) { 1004e716630dSMartin Matuska raidz_row_t *rr = rm->rm_row[i]; 1005e716630dSMartin Matuska for (int c = 0; c < rr->rr_firstdatacol; c++) { 1006e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c]; 1007e716630dSMartin Matuska raidz_col_t *prc = 1008e716630dSMartin Matuska &rm->rm_phys_col[rc->rc_devidx]; 1009e716630dSMartin Matuska rc->rc_abd = 1010e716630dSMartin Matuska abd_get_offset_struct(&rc->rc_abdstruct, 1011e716630dSMartin Matuska prc->rc_abd, 1012e716630dSMartin Matuska rc->rc_offset - prc->rc_offset, 1013e716630dSMartin Matuska rc->rc_size); 1014e716630dSMartin Matuska } 1015e716630dSMartin Matuska } 1016e716630dSMartin Matuska } else { 1017e716630dSMartin Matuska /* 1018e716630dSMartin Matuska * Allocate new abd's for the parity sectors. 
1019e716630dSMartin Matuska */ 1020e716630dSMartin Matuska for (int i = 0; i < rm->rm_nrows; i++) { 1021e716630dSMartin Matuska raidz_row_t *rr = rm->rm_row[i]; 1022e716630dSMartin Matuska for (int c = 0; c < rr->rr_firstdatacol; c++) { 1023e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c]; 1024e716630dSMartin Matuska rc->rc_abd = 1025e716630dSMartin Matuska abd_alloc_linear(rc->rc_size, 1026e716630dSMartin Matuska B_TRUE); 1027e716630dSMartin Matuska } 1028e716630dSMartin Matuska } 1029e716630dSMartin Matuska } 1030eda14cbcSMatt Macy /* init RAIDZ parity ops */ 1031eda14cbcSMatt Macy rm->rm_ops = vdev_raidz_math_get_ops(); 1032eda14cbcSMatt Macy 1033eda14cbcSMatt Macy return (rm); 1034eda14cbcSMatt Macy } 1035eda14cbcSMatt Macy 1036eda14cbcSMatt Macy struct pqr_struct { 1037eda14cbcSMatt Macy uint64_t *p; 1038eda14cbcSMatt Macy uint64_t *q; 1039eda14cbcSMatt Macy uint64_t *r; 1040eda14cbcSMatt Macy }; 1041eda14cbcSMatt Macy 1042eda14cbcSMatt Macy static int 1043eda14cbcSMatt Macy vdev_raidz_p_func(void *buf, size_t size, void *private) 1044eda14cbcSMatt Macy { 1045eda14cbcSMatt Macy struct pqr_struct *pqr = private; 1046eda14cbcSMatt Macy const uint64_t *src = buf; 1047e716630dSMartin Matuska int cnt = size / sizeof (src[0]); 1048eda14cbcSMatt Macy 1049eda14cbcSMatt Macy ASSERT(pqr->p && !pqr->q && !pqr->r); 1050eda14cbcSMatt Macy 1051e716630dSMartin Matuska for (int i = 0; i < cnt; i++, src++, pqr->p++) 1052eda14cbcSMatt Macy *pqr->p ^= *src; 1053eda14cbcSMatt Macy 1054eda14cbcSMatt Macy return (0); 1055eda14cbcSMatt Macy } 1056eda14cbcSMatt Macy 1057eda14cbcSMatt Macy static int 1058eda14cbcSMatt Macy vdev_raidz_pq_func(void *buf, size_t size, void *private) 1059eda14cbcSMatt Macy { 1060eda14cbcSMatt Macy struct pqr_struct *pqr = private; 1061eda14cbcSMatt Macy const uint64_t *src = buf; 1062eda14cbcSMatt Macy uint64_t mask; 1063e716630dSMartin Matuska int cnt = size / sizeof (src[0]); 1064eda14cbcSMatt Macy 1065eda14cbcSMatt Macy ASSERT(pqr->p && pqr->q && !pqr->r); 1066eda14cbcSMatt Macy 1067e716630dSMartin Matuska for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { 1068eda14cbcSMatt Macy *pqr->p ^= *src; 1069eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 1070eda14cbcSMatt Macy *pqr->q ^= *src; 1071eda14cbcSMatt Macy } 1072eda14cbcSMatt Macy 1073eda14cbcSMatt Macy return (0); 1074eda14cbcSMatt Macy } 1075eda14cbcSMatt Macy 1076eda14cbcSMatt Macy static int 1077eda14cbcSMatt Macy vdev_raidz_pqr_func(void *buf, size_t size, void *private) 1078eda14cbcSMatt Macy { 1079eda14cbcSMatt Macy struct pqr_struct *pqr = private; 1080eda14cbcSMatt Macy const uint64_t *src = buf; 1081eda14cbcSMatt Macy uint64_t mask; 1082e716630dSMartin Matuska int cnt = size / sizeof (src[0]); 1083eda14cbcSMatt Macy 1084eda14cbcSMatt Macy ASSERT(pqr->p && pqr->q && pqr->r); 1085eda14cbcSMatt Macy 1086e716630dSMartin Matuska for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { 1087eda14cbcSMatt Macy *pqr->p ^= *src; 1088eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 1089eda14cbcSMatt Macy *pqr->q ^= *src; 1090eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_4(*pqr->r, mask); 1091eda14cbcSMatt Macy *pqr->r ^= *src; 1092eda14cbcSMatt Macy } 1093eda14cbcSMatt Macy 1094eda14cbcSMatt Macy return (0); 1095eda14cbcSMatt Macy } 1096eda14cbcSMatt Macy 1097eda14cbcSMatt Macy static void 10987877fdebSMatt Macy vdev_raidz_generate_parity_p(raidz_row_t *rr) 1099eda14cbcSMatt Macy { 11007877fdebSMatt Macy uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 1101eda14cbcSMatt Macy 
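	/*
	 * The parity callbacks above work on the data eight bytes at a
	 * time: P is a running XOR, while Q and R first multiply the
	 * accumulated parity by 2 (respectively 4) in GF(2^8) across all
	 * eight packed bytes.  Conceptually, the multiply-by-2 step
	 * (VDEV_RAIDZ_64MUL_2) behaves like the sketch below; this is an
	 * illustration only, not the macro's actual definition:
	 *
	 *	mask = x & 0x8080808080808080ULL;	// bytes with bit 7 set
	 *	mask = (mask << 1) - (mask >> 7);	// 0x80 -> 0xff per byte
	 *	x = ((x << 1) & 0xfefefefefefefefeULL) ^
	 *	    (mask & 0x1d1d1d1d1d1d1d1dULL);	// conditional reduce
	 *
	 * i.e. each byte is doubled, and any byte that overflowed is
	 * reduced by the 0x1d constant -- the same per-byte rule described
	 * in the block comment at the top of this file.
	 */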
11027877fdebSMatt Macy for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 11037877fdebSMatt Macy abd_t *src = rr->rr_col[c].rc_abd; 1104eda14cbcSMatt Macy 11057877fdebSMatt Macy if (c == rr->rr_firstdatacol) { 11067877fdebSMatt Macy abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 1107eda14cbcSMatt Macy } else { 1108eda14cbcSMatt Macy struct pqr_struct pqr = { p, NULL, NULL }; 11097877fdebSMatt Macy (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 1110eda14cbcSMatt Macy vdev_raidz_p_func, &pqr); 1111eda14cbcSMatt Macy } 1112eda14cbcSMatt Macy } 1113eda14cbcSMatt Macy } 1114eda14cbcSMatt Macy 1115eda14cbcSMatt Macy static void 11167877fdebSMatt Macy vdev_raidz_generate_parity_pq(raidz_row_t *rr) 1117eda14cbcSMatt Macy { 11187877fdebSMatt Macy uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 11197877fdebSMatt Macy uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 11207877fdebSMatt Macy uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 11217877fdebSMatt Macy ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 11227877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_Q].rc_size); 1123eda14cbcSMatt Macy 11247877fdebSMatt Macy for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 11257877fdebSMatt Macy abd_t *src = rr->rr_col[c].rc_abd; 1126eda14cbcSMatt Macy 11277877fdebSMatt Macy uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); 1128eda14cbcSMatt Macy 11297877fdebSMatt Macy if (c == rr->rr_firstdatacol) { 1130eda14cbcSMatt Macy ASSERT(ccnt == pcnt || ccnt == 0); 11317877fdebSMatt Macy abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 11327877fdebSMatt Macy (void) memcpy(q, p, rr->rr_col[c].rc_size); 1133eda14cbcSMatt Macy 11347877fdebSMatt Macy for (uint64_t i = ccnt; i < pcnt; i++) { 1135eda14cbcSMatt Macy p[i] = 0; 1136eda14cbcSMatt Macy q[i] = 0; 1137eda14cbcSMatt Macy } 1138eda14cbcSMatt Macy } else { 1139eda14cbcSMatt Macy struct pqr_struct pqr = { p, q, NULL }; 1140eda14cbcSMatt Macy 1141eda14cbcSMatt Macy ASSERT(ccnt <= pcnt); 11427877fdebSMatt Macy (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 1143eda14cbcSMatt Macy vdev_raidz_pq_func, &pqr); 1144eda14cbcSMatt Macy 1145eda14cbcSMatt Macy /* 1146eda14cbcSMatt Macy * Treat short columns as though they are full of 0s. 1147eda14cbcSMatt Macy * Note that there's therefore nothing needed for P. 
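			 * The multiply-by-2 steps are still required,
			 * though: XORing in a zero leaves Q unchanged, but
			 * Q's coefficient for every earlier column depends
			 * on how many columns follow, so the loop below
			 * keeps doubling Q (with nothing XORed in) for the
			 * sectors the short column doesn't cover.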
1148eda14cbcSMatt Macy */ 11497877fdebSMatt Macy uint64_t mask; 11507877fdebSMatt Macy for (uint64_t i = ccnt; i < pcnt; i++) { 1151eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(q[i], mask); 1152eda14cbcSMatt Macy } 1153eda14cbcSMatt Macy } 1154eda14cbcSMatt Macy } 1155eda14cbcSMatt Macy } 1156eda14cbcSMatt Macy 1157eda14cbcSMatt Macy static void 11587877fdebSMatt Macy vdev_raidz_generate_parity_pqr(raidz_row_t *rr) 1159eda14cbcSMatt Macy { 11607877fdebSMatt Macy uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 11617877fdebSMatt Macy uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 11627877fdebSMatt Macy uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd); 11637877fdebSMatt Macy uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 11647877fdebSMatt Macy ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 11657877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_Q].rc_size); 11667877fdebSMatt Macy ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 11677877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_R].rc_size); 1168eda14cbcSMatt Macy 11697877fdebSMatt Macy for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 11707877fdebSMatt Macy abd_t *src = rr->rr_col[c].rc_abd; 1171eda14cbcSMatt Macy 11727877fdebSMatt Macy uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); 1173eda14cbcSMatt Macy 11747877fdebSMatt Macy if (c == rr->rr_firstdatacol) { 1175eda14cbcSMatt Macy ASSERT(ccnt == pcnt || ccnt == 0); 11767877fdebSMatt Macy abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 11777877fdebSMatt Macy (void) memcpy(q, p, rr->rr_col[c].rc_size); 11787877fdebSMatt Macy (void) memcpy(r, p, rr->rr_col[c].rc_size); 1179eda14cbcSMatt Macy 11807877fdebSMatt Macy for (uint64_t i = ccnt; i < pcnt; i++) { 1181eda14cbcSMatt Macy p[i] = 0; 1182eda14cbcSMatt Macy q[i] = 0; 1183eda14cbcSMatt Macy r[i] = 0; 1184eda14cbcSMatt Macy } 1185eda14cbcSMatt Macy } else { 1186eda14cbcSMatt Macy struct pqr_struct pqr = { p, q, r }; 1187eda14cbcSMatt Macy 1188eda14cbcSMatt Macy ASSERT(ccnt <= pcnt); 11897877fdebSMatt Macy (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 1190eda14cbcSMatt Macy vdev_raidz_pqr_func, &pqr); 1191eda14cbcSMatt Macy 1192eda14cbcSMatt Macy /* 1193eda14cbcSMatt Macy * Treat short columns as though they are full of 0s. 1194eda14cbcSMatt Macy * Note that there's therefore nothing needed for P. 1195eda14cbcSMatt Macy */ 11967877fdebSMatt Macy uint64_t mask; 11977877fdebSMatt Macy for (uint64_t i = ccnt; i < pcnt; i++) { 1198eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(q[i], mask); 1199eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_4(r[i], mask); 1200eda14cbcSMatt Macy } 1201eda14cbcSMatt Macy } 1202eda14cbcSMatt Macy } 1203eda14cbcSMatt Macy } 1204eda14cbcSMatt Macy 1205eda14cbcSMatt Macy /* 1206eda14cbcSMatt Macy * Generate RAID parity in the first virtual columns according to the number of 1207eda14cbcSMatt Macy * parity columns available. 1208eda14cbcSMatt Macy */ 1209eda14cbcSMatt Macy void 12107877fdebSMatt Macy vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr) 1211eda14cbcSMatt Macy { 1212e716630dSMartin Matuska if (rr->rr_cols == 0) { 1213e716630dSMartin Matuska /* 1214e716630dSMartin Matuska * We are handling this block one row at a time (because 1215e716630dSMartin Matuska * this block has a different logical vs physical width, 1216e716630dSMartin Matuska * due to RAIDZ expansion), and this is a pad-only row, 1217e716630dSMartin Matuska * which has no parity. 
1218e716630dSMartin Matuska */ 1219e716630dSMartin Matuska return; 1220e716630dSMartin Matuska } 12217877fdebSMatt Macy 1222eda14cbcSMatt Macy /* Generate using the new math implementation */ 12237877fdebSMatt Macy if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL) 1224eda14cbcSMatt Macy return; 1225eda14cbcSMatt Macy 12267877fdebSMatt Macy switch (rr->rr_firstdatacol) { 1227eda14cbcSMatt Macy case 1: 12287877fdebSMatt Macy vdev_raidz_generate_parity_p(rr); 1229eda14cbcSMatt Macy break; 1230eda14cbcSMatt Macy case 2: 12317877fdebSMatt Macy vdev_raidz_generate_parity_pq(rr); 1232eda14cbcSMatt Macy break; 1233eda14cbcSMatt Macy case 3: 12347877fdebSMatt Macy vdev_raidz_generate_parity_pqr(rr); 1235eda14cbcSMatt Macy break; 1236eda14cbcSMatt Macy default: 1237eda14cbcSMatt Macy cmn_err(CE_PANIC, "invalid RAID-Z configuration"); 1238eda14cbcSMatt Macy } 1239eda14cbcSMatt Macy } 1240eda14cbcSMatt Macy 12417877fdebSMatt Macy void 12427877fdebSMatt Macy vdev_raidz_generate_parity(raidz_map_t *rm) 12437877fdebSMatt Macy { 12447877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 12457877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i]; 12467877fdebSMatt Macy vdev_raidz_generate_parity_row(rm, rr); 12477877fdebSMatt Macy } 12487877fdebSMatt Macy } 12497877fdebSMatt Macy 1250eda14cbcSMatt Macy static int 1251eda14cbcSMatt Macy vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) 1252eda14cbcSMatt Macy { 1253e92ffd9bSMartin Matuska (void) private; 1254eda14cbcSMatt Macy uint64_t *dst = dbuf; 1255eda14cbcSMatt Macy uint64_t *src = sbuf; 1256eda14cbcSMatt Macy int cnt = size / sizeof (src[0]); 1257eda14cbcSMatt Macy 1258eda14cbcSMatt Macy for (int i = 0; i < cnt; i++) { 1259eda14cbcSMatt Macy dst[i] ^= src[i]; 1260eda14cbcSMatt Macy } 1261eda14cbcSMatt Macy 1262eda14cbcSMatt Macy return (0); 1263eda14cbcSMatt Macy } 1264eda14cbcSMatt Macy 1265eda14cbcSMatt Macy static int 1266eda14cbcSMatt Macy vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size, 1267eda14cbcSMatt Macy void *private) 1268eda14cbcSMatt Macy { 1269e92ffd9bSMartin Matuska (void) private; 1270eda14cbcSMatt Macy uint64_t *dst = dbuf; 1271eda14cbcSMatt Macy uint64_t *src = sbuf; 1272eda14cbcSMatt Macy uint64_t mask; 1273eda14cbcSMatt Macy int cnt = size / sizeof (dst[0]); 1274eda14cbcSMatt Macy 1275eda14cbcSMatt Macy for (int i = 0; i < cnt; i++, dst++, src++) { 1276eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(*dst, mask); 1277eda14cbcSMatt Macy *dst ^= *src; 1278eda14cbcSMatt Macy } 1279eda14cbcSMatt Macy 1280eda14cbcSMatt Macy return (0); 1281eda14cbcSMatt Macy } 1282eda14cbcSMatt Macy 1283eda14cbcSMatt Macy static int 1284eda14cbcSMatt Macy vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private) 1285eda14cbcSMatt Macy { 1286e92ffd9bSMartin Matuska (void) private; 1287eda14cbcSMatt Macy uint64_t *dst = buf; 1288eda14cbcSMatt Macy uint64_t mask; 1289eda14cbcSMatt Macy int cnt = size / sizeof (dst[0]); 1290eda14cbcSMatt Macy 1291eda14cbcSMatt Macy for (int i = 0; i < cnt; i++, dst++) { 1292eda14cbcSMatt Macy /* same operation as vdev_raidz_reconst_q_pre_func() on dst */ 1293eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(*dst, mask); 1294eda14cbcSMatt Macy } 1295eda14cbcSMatt Macy 1296eda14cbcSMatt Macy return (0); 1297eda14cbcSMatt Macy } 1298eda14cbcSMatt Macy 1299eda14cbcSMatt Macy struct reconst_q_struct { 1300eda14cbcSMatt Macy uint64_t *q; 1301eda14cbcSMatt Macy int exp; 1302eda14cbcSMatt Macy }; 1303eda14cbcSMatt Macy 1304eda14cbcSMatt Macy static int 1305eda14cbcSMatt Macy 
vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private) 1306eda14cbcSMatt Macy { 1307eda14cbcSMatt Macy struct reconst_q_struct *rq = private; 1308eda14cbcSMatt Macy uint64_t *dst = buf; 1309eda14cbcSMatt Macy int cnt = size / sizeof (dst[0]); 1310eda14cbcSMatt Macy 1311eda14cbcSMatt Macy for (int i = 0; i < cnt; i++, dst++, rq->q++) { 1312eda14cbcSMatt Macy int j; 1313eda14cbcSMatt Macy uint8_t *b; 1314eda14cbcSMatt Macy 1315eda14cbcSMatt Macy *dst ^= *rq->q; 1316eda14cbcSMatt Macy for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 1317eda14cbcSMatt Macy *b = vdev_raidz_exp2(*b, rq->exp); 1318eda14cbcSMatt Macy } 1319eda14cbcSMatt Macy } 1320eda14cbcSMatt Macy 1321eda14cbcSMatt Macy return (0); 1322eda14cbcSMatt Macy } 1323eda14cbcSMatt Macy 1324eda14cbcSMatt Macy struct reconst_pq_struct { 1325eda14cbcSMatt Macy uint8_t *p; 1326eda14cbcSMatt Macy uint8_t *q; 1327eda14cbcSMatt Macy uint8_t *pxy; 1328eda14cbcSMatt Macy uint8_t *qxy; 1329eda14cbcSMatt Macy int aexp; 1330eda14cbcSMatt Macy int bexp; 1331eda14cbcSMatt Macy }; 1332eda14cbcSMatt Macy 1333eda14cbcSMatt Macy static int 1334eda14cbcSMatt Macy vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private) 1335eda14cbcSMatt Macy { 1336eda14cbcSMatt Macy struct reconst_pq_struct *rpq = private; 1337eda14cbcSMatt Macy uint8_t *xd = xbuf; 1338eda14cbcSMatt Macy uint8_t *yd = ybuf; 1339eda14cbcSMatt Macy 1340eda14cbcSMatt Macy for (int i = 0; i < size; 1341eda14cbcSMatt Macy i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) { 1342eda14cbcSMatt Macy *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 1343eda14cbcSMatt Macy vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 1344eda14cbcSMatt Macy *yd = *rpq->p ^ *rpq->pxy ^ *xd; 1345eda14cbcSMatt Macy } 1346eda14cbcSMatt Macy 1347eda14cbcSMatt Macy return (0); 1348eda14cbcSMatt Macy } 1349eda14cbcSMatt Macy 1350eda14cbcSMatt Macy static int 1351eda14cbcSMatt Macy vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) 1352eda14cbcSMatt Macy { 1353eda14cbcSMatt Macy struct reconst_pq_struct *rpq = private; 1354eda14cbcSMatt Macy uint8_t *xd = xbuf; 1355eda14cbcSMatt Macy 1356eda14cbcSMatt Macy for (int i = 0; i < size; 1357eda14cbcSMatt Macy i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) { 1358eda14cbcSMatt Macy /* same operation as vdev_raidz_reconst_pq_func() on xd */ 1359eda14cbcSMatt Macy *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 1360eda14cbcSMatt Macy vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 1361eda14cbcSMatt Macy } 1362eda14cbcSMatt Macy 1363eda14cbcSMatt Macy return (0); 1364eda14cbcSMatt Macy } 1365eda14cbcSMatt Macy 1366f9693befSMartin Matuska static void 13677877fdebSMatt Macy vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts) 1368eda14cbcSMatt Macy { 1369eda14cbcSMatt Macy int x = tgts[0]; 1370eda14cbcSMatt Macy abd_t *dst, *src; 1371eda14cbcSMatt Macy 1372e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1373e716630dSMartin Matuska zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x); 1374e716630dSMartin Matuska 13757877fdebSMatt Macy ASSERT3U(ntgts, ==, 1); 13767877fdebSMatt Macy ASSERT3U(x, >=, rr->rr_firstdatacol); 13777877fdebSMatt Macy ASSERT3U(x, <, rr->rr_cols); 1378eda14cbcSMatt Macy 13797877fdebSMatt Macy ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size); 1380eda14cbcSMatt Macy 13817877fdebSMatt Macy src = rr->rr_col[VDEV_RAIDZ_P].rc_abd; 13827877fdebSMatt Macy dst = rr->rr_col[x].rc_abd; 1383eda14cbcSMatt Macy 13847877fdebSMatt Macy 
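	/*
	 * Single-parity reconstruction: P is the XOR of every data column,
	 * so the missing column x is just P XORed with all of the surviving
	 * data columns.  Start the destination as a copy of P, then fold in
	 * each remaining column below.
	 */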
abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size); 1385eda14cbcSMatt Macy 13867877fdebSMatt Macy for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 13877877fdebSMatt Macy uint64_t size = MIN(rr->rr_col[x].rc_size, 13887877fdebSMatt Macy rr->rr_col[c].rc_size); 1389eda14cbcSMatt Macy 13907877fdebSMatt Macy src = rr->rr_col[c].rc_abd; 1391eda14cbcSMatt Macy 1392eda14cbcSMatt Macy if (c == x) 1393eda14cbcSMatt Macy continue; 1394eda14cbcSMatt Macy 1395eda14cbcSMatt Macy (void) abd_iterate_func2(dst, src, 0, 0, size, 1396eda14cbcSMatt Macy vdev_raidz_reconst_p_func, NULL); 1397eda14cbcSMatt Macy } 1398eda14cbcSMatt Macy } 1399eda14cbcSMatt Macy 1400f9693befSMartin Matuska static void 14017877fdebSMatt Macy vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts) 1402eda14cbcSMatt Macy { 1403eda14cbcSMatt Macy int x = tgts[0]; 1404eda14cbcSMatt Macy int c, exp; 1405eda14cbcSMatt Macy abd_t *dst, *src; 1406eda14cbcSMatt Macy 1407e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1408e716630dSMartin Matuska zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x); 1409e716630dSMartin Matuska 1410eda14cbcSMatt Macy ASSERT(ntgts == 1); 1411eda14cbcSMatt Macy 14127877fdebSMatt Macy ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size); 1413eda14cbcSMatt Macy 14147877fdebSMatt Macy for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 14157877fdebSMatt Macy uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size, 14167877fdebSMatt Macy rr->rr_col[c].rc_size); 1417eda14cbcSMatt Macy 14187877fdebSMatt Macy src = rr->rr_col[c].rc_abd; 14197877fdebSMatt Macy dst = rr->rr_col[x].rc_abd; 1420eda14cbcSMatt Macy 14217877fdebSMatt Macy if (c == rr->rr_firstdatacol) { 1422eda14cbcSMatt Macy abd_copy(dst, src, size); 14237877fdebSMatt Macy if (rr->rr_col[x].rc_size > size) { 1424eda14cbcSMatt Macy abd_zero_off(dst, size, 14257877fdebSMatt Macy rr->rr_col[x].rc_size - size); 14267877fdebSMatt Macy } 1427eda14cbcSMatt Macy } else { 14287877fdebSMatt Macy ASSERT3U(size, <=, rr->rr_col[x].rc_size); 1429eda14cbcSMatt Macy (void) abd_iterate_func2(dst, src, 0, 0, size, 1430eda14cbcSMatt Macy vdev_raidz_reconst_q_pre_func, NULL); 1431eda14cbcSMatt Macy (void) abd_iterate_func(dst, 14327877fdebSMatt Macy size, rr->rr_col[x].rc_size - size, 1433eda14cbcSMatt Macy vdev_raidz_reconst_q_pre_tail_func, NULL); 1434eda14cbcSMatt Macy } 1435eda14cbcSMatt Macy } 1436eda14cbcSMatt Macy 14377877fdebSMatt Macy src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; 14387877fdebSMatt Macy dst = rr->rr_col[x].rc_abd; 14397877fdebSMatt Macy exp = 255 - (rr->rr_cols - 1 - x); 1440eda14cbcSMatt Macy 1441eda14cbcSMatt Macy struct reconst_q_struct rq = { abd_to_buf(src), exp }; 14427877fdebSMatt Macy (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size, 1443eda14cbcSMatt Macy vdev_raidz_reconst_q_post_func, &rq); 1444eda14cbcSMatt Macy } 1445eda14cbcSMatt Macy 1446f9693befSMartin Matuska static void 14477877fdebSMatt Macy vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) 1448eda14cbcSMatt Macy { 1449eda14cbcSMatt Macy uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; 1450eda14cbcSMatt Macy abd_t *pdata, *qdata; 1451eda14cbcSMatt Macy uint64_t xsize, ysize; 1452eda14cbcSMatt Macy int x = tgts[0]; 1453eda14cbcSMatt Macy int y = tgts[1]; 1454eda14cbcSMatt Macy abd_t *xd, *yd; 1455eda14cbcSMatt Macy 1456e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1457e716630dSMartin Matuska zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y); 1458e716630dSMartin Matuska 
1459eda14cbcSMatt Macy ASSERT(ntgts == 2); 1460eda14cbcSMatt Macy ASSERT(x < y); 14617877fdebSMatt Macy ASSERT(x >= rr->rr_firstdatacol); 14627877fdebSMatt Macy ASSERT(y < rr->rr_cols); 1463eda14cbcSMatt Macy 14647877fdebSMatt Macy ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size); 1465eda14cbcSMatt Macy 1466eda14cbcSMatt Macy /* 1467eda14cbcSMatt Macy * Move the parity data aside -- we're going to compute parity as 1468eda14cbcSMatt Macy * though columns x and y were full of zeros -- Pxy and Qxy. We want to 1469eda14cbcSMatt Macy * reuse the parity generation mechanism without trashing the actual 1470eda14cbcSMatt Macy * parity so we make those columns appear to be full of zeros by 1471eda14cbcSMatt Macy * setting their lengths to zero. 1472eda14cbcSMatt Macy */ 14737877fdebSMatt Macy pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd; 14747877fdebSMatt Macy qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; 14757877fdebSMatt Macy xsize = rr->rr_col[x].rc_size; 14767877fdebSMatt Macy ysize = rr->rr_col[y].rc_size; 1477eda14cbcSMatt Macy 14787877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_P].rc_abd = 14797877fdebSMatt Macy abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE); 14807877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_Q].rc_abd = 14817877fdebSMatt Macy abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); 14827877fdebSMatt Macy rr->rr_col[x].rc_size = 0; 14837877fdebSMatt Macy rr->rr_col[y].rc_size = 0; 1484eda14cbcSMatt Macy 14857877fdebSMatt Macy vdev_raidz_generate_parity_pq(rr); 1486eda14cbcSMatt Macy 14877877fdebSMatt Macy rr->rr_col[x].rc_size = xsize; 14887877fdebSMatt Macy rr->rr_col[y].rc_size = ysize; 1489eda14cbcSMatt Macy 1490eda14cbcSMatt Macy p = abd_to_buf(pdata); 1491eda14cbcSMatt Macy q = abd_to_buf(qdata); 14927877fdebSMatt Macy pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 14937877fdebSMatt Macy qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 14947877fdebSMatt Macy xd = rr->rr_col[x].rc_abd; 14957877fdebSMatt Macy yd = rr->rr_col[y].rc_abd; 1496eda14cbcSMatt Macy 1497eda14cbcSMatt Macy /* 1498eda14cbcSMatt Macy * We now have: 1499eda14cbcSMatt Macy * Pxy = P + D_x + D_y 1500eda14cbcSMatt Macy * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 1501eda14cbcSMatt Macy * 1502eda14cbcSMatt Macy * We can then solve for D_x: 1503eda14cbcSMatt Macy * D_x = A * (P + Pxy) + B * (Q + Qxy) 1504eda14cbcSMatt Macy * where 1505eda14cbcSMatt Macy * A = 2^(x - y) * (2^(x - y) + 1)^-1 1506eda14cbcSMatt Macy * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 1507eda14cbcSMatt Macy * 1508eda14cbcSMatt Macy * With D_x in hand, we can easily solve for D_y: 1509eda14cbcSMatt Macy * D_y = P + Pxy + D_x 1510eda14cbcSMatt Macy */ 1511eda14cbcSMatt Macy 1512eda14cbcSMatt Macy a = vdev_raidz_pow2[255 + x - y]; 15137877fdebSMatt Macy b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)]; 1514eda14cbcSMatt Macy tmp = 255 - vdev_raidz_log2[a ^ 1]; 1515eda14cbcSMatt Macy 1516eda14cbcSMatt Macy aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 1517eda14cbcSMatt Macy bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 1518eda14cbcSMatt Macy 1519eda14cbcSMatt Macy ASSERT3U(xsize, >=, ysize); 1520eda14cbcSMatt Macy struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp }; 1521eda14cbcSMatt Macy 1522eda14cbcSMatt Macy (void) abd_iterate_func2(xd, yd, 0, 0, ysize, 1523eda14cbcSMatt Macy vdev_raidz_reconst_pq_func, &rpq); 1524eda14cbcSMatt Macy (void) abd_iterate_func(xd, ysize, xsize - ysize, 1525eda14cbcSMatt Macy vdev_raidz_reconst_pq_tail_func, &rpq); 1526eda14cbcSMatt Macy 15277877fdebSMatt Macy 
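	/*
	 * A note on the coefficient math above: a and b come straight from
	 * the power table (the 255 offsets keep the exponents non-negative,
	 * since the field's multiplicative group has order 255), and
	 * tmp = 255 - log2(a ^ 1) is the log of the inverse of (a + 1) --
	 * in this field a ^ 1 is the same as a + 1.  aexp and bexp are
	 * therefore the logs of the per-byte multipliers A and B from the
	 * comment above, which the reconstruction callbacks apply with
	 * exp2 lookups.
	 */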
abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 15287877fdebSMatt Macy abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 1529eda14cbcSMatt Macy 1530eda14cbcSMatt Macy /* 1531eda14cbcSMatt Macy * Restore the saved parity data. 1532eda14cbcSMatt Macy */ 15337877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata; 15347877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata; 1535eda14cbcSMatt Macy } 1536eda14cbcSMatt Macy 1537eda14cbcSMatt Macy /* 1538eda14cbcSMatt Macy * In the general case of reconstruction, we must solve the system of linear 1539eda14cbcSMatt Macy * equations defined by the coefficients used to generate parity as well as 1540eda14cbcSMatt Macy * the contents of the data and parity disks. This can be expressed with 1541eda14cbcSMatt Macy * vectors for the original data (D) and the actual data (d) and parity (p) 1542eda14cbcSMatt Macy * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 1543eda14cbcSMatt Macy * 1544eda14cbcSMatt Macy * __ __ __ __ 1545eda14cbcSMatt Macy * | | __ __ | p_0 | 1546eda14cbcSMatt Macy * | V | | D_0 | | p_m-1 | 1547eda14cbcSMatt Macy * | | x | : | = | d_0 | 1548eda14cbcSMatt Macy * | I | | D_n-1 | | : | 1549eda14cbcSMatt Macy * | | ~~ ~~ | d_n-1 | 1550eda14cbcSMatt Macy * ~~ ~~ ~~ ~~ 1551eda14cbcSMatt Macy * 1552eda14cbcSMatt Macy * I is simply a square identity matrix of size n, and V is a vandermonde 1553eda14cbcSMatt Macy * matrix defined by the coefficients we chose for the various parity columns 1554eda14cbcSMatt Macy * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 1555eda14cbcSMatt Macy * computation as well as linear separability. 1556eda14cbcSMatt Macy * 1557eda14cbcSMatt Macy * __ __ __ __ 1558eda14cbcSMatt Macy * | 1 .. 1 1 1 | | p_0 | 1559eda14cbcSMatt Macy * | 2^n-1 .. 4 2 1 | __ __ | : | 1560eda14cbcSMatt Macy * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 1561eda14cbcSMatt Macy * | 1 .. 0 0 0 | | D_1 | | d_0 | 1562eda14cbcSMatt Macy * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 1563eda14cbcSMatt Macy * | : : : : | | : | | d_2 | 1564eda14cbcSMatt Macy * | 0 .. 1 0 0 | | D_n-1 | | : | 1565eda14cbcSMatt Macy * | 0 .. 0 1 0 | ~~ ~~ | : | 1566eda14cbcSMatt Macy * | 0 .. 0 0 1 | | d_n-1 | 1567eda14cbcSMatt Macy * ~~ ~~ ~~ ~~ 1568eda14cbcSMatt Macy * 1569eda14cbcSMatt Macy * Note that I, V, d, and p are known. To compute D, we must invert the 1570eda14cbcSMatt Macy * matrix and use the known data and parity values to reconstruct the unknown 1571eda14cbcSMatt Macy * data values. We begin by removing the rows in V|I and d|p that correspond 1572eda14cbcSMatt Macy * to failed or missing columns; we then make V|I square (n x n) and d|p 1573eda14cbcSMatt Macy * sized n by removing rows corresponding to unused parity from the bottom up 1574eda14cbcSMatt Macy * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 1575eda14cbcSMatt Macy * using Gauss-Jordan elimination. 
In the example below we use m=3 parity 1576eda14cbcSMatt Macy * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 1577eda14cbcSMatt Macy * __ __ 1578eda14cbcSMatt Macy * | 1 1 1 1 1 1 1 1 | 1579eda14cbcSMatt Macy * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 1580eda14cbcSMatt Macy * | 19 205 116 29 64 16 4 1 | / / 1581eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 | / / 1582eda14cbcSMatt Macy * | 0 1 0 0 0 0 0 0 | <--' / 1583eda14cbcSMatt Macy * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 1584eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 | 1585eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 | 1586eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 | 1587eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 | 1588eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 | 1589eda14cbcSMatt Macy * ~~ ~~ 1590eda14cbcSMatt Macy * __ __ 1591eda14cbcSMatt Macy * | 1 1 1 1 1 1 1 1 | 1592eda14cbcSMatt Macy * | 128 64 32 16 8 4 2 1 | 1593eda14cbcSMatt Macy * | 19 205 116 29 64 16 4 1 | 1594eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 | 1595eda14cbcSMatt Macy * | 0 1 0 0 0 0 0 0 | 1596eda14cbcSMatt Macy * (V|I)' = | 0 0 1 0 0 0 0 0 | 1597eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 | 1598eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 | 1599eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 | 1600eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 | 1601eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 | 1602eda14cbcSMatt Macy * ~~ ~~ 1603eda14cbcSMatt Macy * 1604eda14cbcSMatt Macy * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 1605eda14cbcSMatt Macy * have carefully chosen the seed values 1, 2, and 4 to ensure that this 1606eda14cbcSMatt Macy * matrix is not singular. 1607eda14cbcSMatt Macy * __ __ 1608eda14cbcSMatt Macy * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1609eda14cbcSMatt Macy * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1610eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1611eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1612eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1613eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1614eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1615eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1616eda14cbcSMatt Macy * ~~ ~~ 1617eda14cbcSMatt Macy * __ __ 1618eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1619eda14cbcSMatt Macy * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1620eda14cbcSMatt Macy * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1621eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1622eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1623eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1624eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1625eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1626eda14cbcSMatt Macy * ~~ ~~ 1627eda14cbcSMatt Macy * __ __ 1628eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1629eda14cbcSMatt Macy * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1630eda14cbcSMatt Macy * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 1631eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1632eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1633eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1634eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1635eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1636eda14cbcSMatt Macy * ~~ ~~ 1637eda14cbcSMatt Macy * __ __ 1638eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1639eda14cbcSMatt Macy * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1640eda14cbcSMatt Macy * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 1641eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 0 0 0 1 0 
0 0 0 | 1642eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1643eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1644eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1645eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1646eda14cbcSMatt Macy * ~~ ~~ 1647eda14cbcSMatt Macy * __ __ 1648eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1649eda14cbcSMatt Macy * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1650eda14cbcSMatt Macy * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1651eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1652eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1653eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1654eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1655eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1656eda14cbcSMatt Macy * ~~ ~~ 1657eda14cbcSMatt Macy * __ __ 1658eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1659eda14cbcSMatt Macy * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 1660eda14cbcSMatt Macy * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1661eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1662eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1663eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1664eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1665eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1666eda14cbcSMatt Macy * ~~ ~~ 1667eda14cbcSMatt Macy * __ __ 1668eda14cbcSMatt Macy * | 0 0 1 0 0 0 0 0 | 1669eda14cbcSMatt Macy * | 167 100 5 41 159 169 217 208 | 1670eda14cbcSMatt Macy * | 166 100 4 40 158 168 216 209 | 1671eda14cbcSMatt Macy * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 1672eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 | 1673eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 | 1674eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 | 1675eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 | 1676eda14cbcSMatt Macy * ~~ ~~ 1677eda14cbcSMatt Macy * 1678eda14cbcSMatt Macy * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 1679eda14cbcSMatt Macy * of the missing data. 1680eda14cbcSMatt Macy * 1681eda14cbcSMatt Macy * As is apparent from the example above, the only non-trivial rows in the 1682eda14cbcSMatt Macy * inverse matrix correspond to the data disks that we're trying to 1683eda14cbcSMatt Macy * reconstruct. Indeed, those are the only rows we need as the others would 1684eda14cbcSMatt Macy * only be useful for reconstructing data known or assumed to be valid. For 1685eda14cbcSMatt Macy * that reason, we only build the coefficients in the rows that correspond to 1686eda14cbcSMatt Macy * targeted columns. 1687eda14cbcSMatt Macy */ 1688eda14cbcSMatt Macy 1689eda14cbcSMatt Macy static void 16907877fdebSMatt Macy vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map, 1691eda14cbcSMatt Macy uint8_t **rows) 1692eda14cbcSMatt Macy { 1693eda14cbcSMatt Macy int i, j; 1694eda14cbcSMatt Macy int pow; 1695eda14cbcSMatt Macy 16967877fdebSMatt Macy ASSERT(n == rr->rr_cols - rr->rr_firstdatacol); 1697eda14cbcSMatt Macy 1698eda14cbcSMatt Macy /* 1699eda14cbcSMatt Macy * Fill in the missing rows of interest. 
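	 * Each selected row i is a geometric progression of its generator
	 * g = 2^map[i] (i.e. 1, 2 or 4): rows[i][j] = g^(n - 1 - j).  The
	 * loop below walks the exponent down by map[i] per column and wraps
	 * it mod 255 so the power-table lookup stays in range.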
1700eda14cbcSMatt Macy */ 1701eda14cbcSMatt Macy for (i = 0; i < nmap; i++) { 1702eda14cbcSMatt Macy ASSERT3S(0, <=, map[i]); 1703eda14cbcSMatt Macy ASSERT3S(map[i], <=, 2); 1704eda14cbcSMatt Macy 1705eda14cbcSMatt Macy pow = map[i] * n; 1706eda14cbcSMatt Macy if (pow > 255) 1707eda14cbcSMatt Macy pow -= 255; 1708eda14cbcSMatt Macy ASSERT(pow <= 255); 1709eda14cbcSMatt Macy 1710eda14cbcSMatt Macy for (j = 0; j < n; j++) { 1711eda14cbcSMatt Macy pow -= map[i]; 1712eda14cbcSMatt Macy if (pow < 0) 1713eda14cbcSMatt Macy pow += 255; 1714eda14cbcSMatt Macy rows[i][j] = vdev_raidz_pow2[pow]; 1715eda14cbcSMatt Macy } 1716eda14cbcSMatt Macy } 1717eda14cbcSMatt Macy } 1718eda14cbcSMatt Macy 1719eda14cbcSMatt Macy static void 17207877fdebSMatt Macy vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing, 1721eda14cbcSMatt Macy uint8_t **rows, uint8_t **invrows, const uint8_t *used) 1722eda14cbcSMatt Macy { 1723eda14cbcSMatt Macy int i, j, ii, jj; 1724eda14cbcSMatt Macy uint8_t log; 1725eda14cbcSMatt Macy 1726eda14cbcSMatt Macy /* 1727eda14cbcSMatt Macy * Assert that the first nmissing entries from the array of used 1728eda14cbcSMatt Macy * columns correspond to parity columns and that subsequent entries 1729eda14cbcSMatt Macy * correspond to data columns. 1730eda14cbcSMatt Macy */ 1731eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) { 17327877fdebSMatt Macy ASSERT3S(used[i], <, rr->rr_firstdatacol); 1733eda14cbcSMatt Macy } 1734eda14cbcSMatt Macy for (; i < n; i++) { 17357877fdebSMatt Macy ASSERT3S(used[i], >=, rr->rr_firstdatacol); 1736eda14cbcSMatt Macy } 1737eda14cbcSMatt Macy 1738eda14cbcSMatt Macy /* 1739eda14cbcSMatt Macy * First initialize the storage where we'll compute the inverse rows. 1740eda14cbcSMatt Macy */ 1741eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) { 1742eda14cbcSMatt Macy for (j = 0; j < n; j++) { 1743eda14cbcSMatt Macy invrows[i][j] = (i == j) ? 1 : 0; 1744eda14cbcSMatt Macy } 1745eda14cbcSMatt Macy } 1746eda14cbcSMatt Macy 1747eda14cbcSMatt Macy /* 1748eda14cbcSMatt Macy * Subtract all trivial rows from the rows of consequence. 1749eda14cbcSMatt Macy */ 1750eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) { 1751eda14cbcSMatt Macy for (j = nmissing; j < n; j++) { 17527877fdebSMatt Macy ASSERT3U(used[j], >=, rr->rr_firstdatacol); 17537877fdebSMatt Macy jj = used[j] - rr->rr_firstdatacol; 1754eda14cbcSMatt Macy ASSERT3S(jj, <, n); 1755eda14cbcSMatt Macy invrows[i][j] = rows[i][jj]; 1756eda14cbcSMatt Macy rows[i][jj] = 0; 1757eda14cbcSMatt Macy } 1758eda14cbcSMatt Macy } 1759eda14cbcSMatt Macy 1760eda14cbcSMatt Macy /* 1761eda14cbcSMatt Macy * For each of the rows of interest, we must normalize it and subtract 1762eda14cbcSMatt Macy * a multiple of it from the other rows. 1763eda14cbcSMatt Macy */ 1764eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) { 1765eda14cbcSMatt Macy for (j = 0; j < missing[i]; j++) { 1766eda14cbcSMatt Macy ASSERT0(rows[i][j]); 1767eda14cbcSMatt Macy } 1768eda14cbcSMatt Macy ASSERT3U(rows[i][missing[i]], !=, 0); 1769eda14cbcSMatt Macy 1770eda14cbcSMatt Macy /* 1771eda14cbcSMatt Macy * Compute the inverse of the first element and multiply each 1772eda14cbcSMatt Macy * element in the row by that value. 
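		 * In GF(2^8) the multiplicative inverse of the pivot is
		 * 2^(255 - log2(pivot)), so "log" below is the exponent
		 * that scales the pivot to 1; applying exp2(log) across the
		 * row (and the matching row of the inverse being built) is
		 * the normalization step of Gauss-Jordan elimination.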
1773eda14cbcSMatt Macy */ 1774eda14cbcSMatt Macy log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 1775eda14cbcSMatt Macy 1776eda14cbcSMatt Macy for (j = 0; j < n; j++) { 1777eda14cbcSMatt Macy rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 1778eda14cbcSMatt Macy invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 1779eda14cbcSMatt Macy } 1780eda14cbcSMatt Macy 1781eda14cbcSMatt Macy for (ii = 0; ii < nmissing; ii++) { 1782eda14cbcSMatt Macy if (i == ii) 1783eda14cbcSMatt Macy continue; 1784eda14cbcSMatt Macy 1785eda14cbcSMatt Macy ASSERT3U(rows[ii][missing[i]], !=, 0); 1786eda14cbcSMatt Macy 1787eda14cbcSMatt Macy log = vdev_raidz_log2[rows[ii][missing[i]]]; 1788eda14cbcSMatt Macy 1789eda14cbcSMatt Macy for (j = 0; j < n; j++) { 1790eda14cbcSMatt Macy rows[ii][j] ^= 1791eda14cbcSMatt Macy vdev_raidz_exp2(rows[i][j], log); 1792eda14cbcSMatt Macy invrows[ii][j] ^= 1793eda14cbcSMatt Macy vdev_raidz_exp2(invrows[i][j], log); 1794eda14cbcSMatt Macy } 1795eda14cbcSMatt Macy } 1796eda14cbcSMatt Macy } 1797eda14cbcSMatt Macy 1798eda14cbcSMatt Macy /* 1799eda14cbcSMatt Macy * Verify that the data that is left in the rows are properly part of 1800eda14cbcSMatt Macy * an identity matrix. 1801eda14cbcSMatt Macy */ 1802eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) { 1803eda14cbcSMatt Macy for (j = 0; j < n; j++) { 1804eda14cbcSMatt Macy if (j == missing[i]) { 1805eda14cbcSMatt Macy ASSERT3U(rows[i][j], ==, 1); 1806eda14cbcSMatt Macy } else { 1807eda14cbcSMatt Macy ASSERT0(rows[i][j]); 1808eda14cbcSMatt Macy } 1809eda14cbcSMatt Macy } 1810eda14cbcSMatt Macy } 1811eda14cbcSMatt Macy } 1812eda14cbcSMatt Macy 1813eda14cbcSMatt Macy static void 18147877fdebSMatt Macy vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing, 1815eda14cbcSMatt Macy int *missing, uint8_t **invrows, const uint8_t *used) 1816eda14cbcSMatt Macy { 1817eda14cbcSMatt Macy int i, j, x, cc, c; 1818eda14cbcSMatt Macy uint8_t *src; 1819eda14cbcSMatt Macy uint64_t ccount; 1820eda14cbcSMatt Macy uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL }; 1821eda14cbcSMatt Macy uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 }; 1822eda14cbcSMatt Macy uint8_t log = 0; 1823eda14cbcSMatt Macy uint8_t val; 1824eda14cbcSMatt Macy int ll; 1825eda14cbcSMatt Macy uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 1826eda14cbcSMatt Macy uint8_t *p, *pp; 1827eda14cbcSMatt Macy size_t psize; 1828eda14cbcSMatt Macy 1829eda14cbcSMatt Macy psize = sizeof (invlog[0][0]) * n * nmissing; 1830eda14cbcSMatt Macy p = kmem_alloc(psize, KM_SLEEP); 1831eda14cbcSMatt Macy 1832eda14cbcSMatt Macy for (pp = p, i = 0; i < nmissing; i++) { 1833eda14cbcSMatt Macy invlog[i] = pp; 1834eda14cbcSMatt Macy pp += n; 1835eda14cbcSMatt Macy } 1836eda14cbcSMatt Macy 1837eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) { 1838eda14cbcSMatt Macy for (j = 0; j < n; j++) { 1839eda14cbcSMatt Macy ASSERT3U(invrows[i][j], !=, 0); 1840eda14cbcSMatt Macy invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 1841eda14cbcSMatt Macy } 1842eda14cbcSMatt Macy } 1843eda14cbcSMatt Macy 1844eda14cbcSMatt Macy for (i = 0; i < n; i++) { 1845eda14cbcSMatt Macy c = used[i]; 18467877fdebSMatt Macy ASSERT3U(c, <, rr->rr_cols); 1847eda14cbcSMatt Macy 18487877fdebSMatt Macy ccount = rr->rr_col[c].rc_size; 18497877fdebSMatt Macy ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0); 18507877fdebSMatt Macy if (ccount == 0) 18517877fdebSMatt Macy continue; 18527877fdebSMatt Macy src = abd_to_buf(rr->rr_col[c].rc_abd); 1853eda14cbcSMatt Macy for (j = 0; j < nmissing; j++) { 18547877fdebSMatt Macy cc = missing[j] + 
rr->rr_firstdatacol; 18557877fdebSMatt Macy ASSERT3U(cc, >=, rr->rr_firstdatacol); 18567877fdebSMatt Macy ASSERT3U(cc, <, rr->rr_cols); 1857eda14cbcSMatt Macy ASSERT3U(cc, !=, c); 1858eda14cbcSMatt Macy 18597877fdebSMatt Macy dcount[j] = rr->rr_col[cc].rc_size; 18607877fdebSMatt Macy if (dcount[j] != 0) 18617877fdebSMatt Macy dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd); 1862eda14cbcSMatt Macy } 1863eda14cbcSMatt Macy 1864eda14cbcSMatt Macy for (x = 0; x < ccount; x++, src++) { 1865eda14cbcSMatt Macy if (*src != 0) 1866eda14cbcSMatt Macy log = vdev_raidz_log2[*src]; 1867eda14cbcSMatt Macy 1868eda14cbcSMatt Macy for (cc = 0; cc < nmissing; cc++) { 1869eda14cbcSMatt Macy if (x >= dcount[cc]) 1870eda14cbcSMatt Macy continue; 1871eda14cbcSMatt Macy 1872eda14cbcSMatt Macy if (*src == 0) { 1873eda14cbcSMatt Macy val = 0; 1874eda14cbcSMatt Macy } else { 1875eda14cbcSMatt Macy if ((ll = log + invlog[cc][i]) >= 255) 1876eda14cbcSMatt Macy ll -= 255; 1877eda14cbcSMatt Macy val = vdev_raidz_pow2[ll]; 1878eda14cbcSMatt Macy } 1879eda14cbcSMatt Macy 1880eda14cbcSMatt Macy if (i == 0) 1881eda14cbcSMatt Macy dst[cc][x] = val; 1882eda14cbcSMatt Macy else 1883eda14cbcSMatt Macy dst[cc][x] ^= val; 1884eda14cbcSMatt Macy } 1885eda14cbcSMatt Macy } 1886eda14cbcSMatt Macy } 1887eda14cbcSMatt Macy 1888eda14cbcSMatt Macy kmem_free(p, psize); 1889eda14cbcSMatt Macy } 1890eda14cbcSMatt Macy 1891f9693befSMartin Matuska static void 18927877fdebSMatt Macy vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) 1893eda14cbcSMatt Macy { 1894b985c9caSMartin Matuska int i, c, t, tt; 1895b985c9caSMartin Matuska unsigned int n; 1896b985c9caSMartin Matuska unsigned int nmissing_rows; 1897eda14cbcSMatt Macy int missing_rows[VDEV_RAIDZ_MAXPARITY]; 1898eda14cbcSMatt Macy int parity_map[VDEV_RAIDZ_MAXPARITY]; 1899eda14cbcSMatt Macy uint8_t *p, *pp; 1900eda14cbcSMatt Macy size_t psize; 1901eda14cbcSMatt Macy uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 1902eda14cbcSMatt Macy uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 1903eda14cbcSMatt Macy uint8_t *used; 1904eda14cbcSMatt Macy 1905eda14cbcSMatt Macy abd_t **bufs = NULL; 1906eda14cbcSMatt Macy 1907e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1908e716630dSMartin Matuska zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts); 1909eda14cbcSMatt Macy /* 1910eda14cbcSMatt Macy * Matrix reconstruction can't use scatter ABDs yet, so we allocate 19117877fdebSMatt Macy * temporary linear ABDs if any non-linear ABDs are found. 
1912eda14cbcSMatt Macy */ 19137877fdebSMatt Macy for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) { 1914e716630dSMartin Matuska ASSERT(rr->rr_col[i].rc_abd != NULL); 19157877fdebSMatt Macy if (!abd_is_linear(rr->rr_col[i].rc_abd)) { 19167877fdebSMatt Macy bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *), 19177877fdebSMatt Macy KM_PUSHPAGE); 1918eda14cbcSMatt Macy 19197877fdebSMatt Macy for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 19207877fdebSMatt Macy raidz_col_t *col = &rr->rr_col[c]; 1921eda14cbcSMatt Macy 1922eda14cbcSMatt Macy bufs[c] = col->rc_abd; 19237877fdebSMatt Macy if (bufs[c] != NULL) { 19247877fdebSMatt Macy col->rc_abd = abd_alloc_linear( 19257877fdebSMatt Macy col->rc_size, B_TRUE); 19267877fdebSMatt Macy abd_copy(col->rc_abd, bufs[c], 19277877fdebSMatt Macy col->rc_size); 1928eda14cbcSMatt Macy } 1929eda14cbcSMatt Macy } 1930eda14cbcSMatt Macy 19317877fdebSMatt Macy break; 19327877fdebSMatt Macy } 19337877fdebSMatt Macy } 19347877fdebSMatt Macy 19357877fdebSMatt Macy n = rr->rr_cols - rr->rr_firstdatacol; 1936eda14cbcSMatt Macy 1937eda14cbcSMatt Macy /* 1938eda14cbcSMatt Macy * Figure out which data columns are missing. 1939eda14cbcSMatt Macy */ 1940eda14cbcSMatt Macy nmissing_rows = 0; 1941eda14cbcSMatt Macy for (t = 0; t < ntgts; t++) { 19427877fdebSMatt Macy if (tgts[t] >= rr->rr_firstdatacol) { 1943eda14cbcSMatt Macy missing_rows[nmissing_rows++] = 19447877fdebSMatt Macy tgts[t] - rr->rr_firstdatacol; 1945eda14cbcSMatt Macy } 1946eda14cbcSMatt Macy } 1947eda14cbcSMatt Macy 1948eda14cbcSMatt Macy /* 1949eda14cbcSMatt Macy * Figure out which parity columns to use to help generate the missing 1950eda14cbcSMatt Macy * data columns. 1951eda14cbcSMatt Macy */ 1952eda14cbcSMatt Macy for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 1953eda14cbcSMatt Macy ASSERT(tt < ntgts); 19547877fdebSMatt Macy ASSERT(c < rr->rr_firstdatacol); 1955eda14cbcSMatt Macy 1956eda14cbcSMatt Macy /* 1957eda14cbcSMatt Macy * Skip any targeted parity columns. 1958eda14cbcSMatt Macy */ 1959eda14cbcSMatt Macy if (c == tgts[tt]) { 1960eda14cbcSMatt Macy tt++; 1961eda14cbcSMatt Macy continue; 1962eda14cbcSMatt Macy } 1963eda14cbcSMatt Macy 1964eda14cbcSMatt Macy parity_map[i] = c; 1965eda14cbcSMatt Macy i++; 1966eda14cbcSMatt Macy } 1967eda14cbcSMatt Macy 1968eda14cbcSMatt Macy psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1969eda14cbcSMatt Macy nmissing_rows * n + sizeof (used[0]) * n; 1970eda14cbcSMatt Macy p = kmem_alloc(psize, KM_SLEEP); 1971eda14cbcSMatt Macy 1972eda14cbcSMatt Macy for (pp = p, i = 0; i < nmissing_rows; i++) { 1973eda14cbcSMatt Macy rows[i] = pp; 1974eda14cbcSMatt Macy pp += n; 1975eda14cbcSMatt Macy invrows[i] = pp; 1976eda14cbcSMatt Macy pp += n; 1977eda14cbcSMatt Macy } 1978eda14cbcSMatt Macy used = pp; 1979eda14cbcSMatt Macy 1980eda14cbcSMatt Macy for (i = 0; i < nmissing_rows; i++) { 1981eda14cbcSMatt Macy used[i] = parity_map[i]; 1982eda14cbcSMatt Macy } 1983eda14cbcSMatt Macy 19847877fdebSMatt Macy for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1985eda14cbcSMatt Macy if (tt < nmissing_rows && 19867877fdebSMatt Macy c == missing_rows[tt] + rr->rr_firstdatacol) { 1987eda14cbcSMatt Macy tt++; 1988eda14cbcSMatt Macy continue; 1989eda14cbcSMatt Macy } 1990eda14cbcSMatt Macy 1991eda14cbcSMatt Macy ASSERT3S(i, <, n); 1992eda14cbcSMatt Macy used[i] = c; 1993eda14cbcSMatt Macy i++; 1994eda14cbcSMatt Macy } 1995eda14cbcSMatt Macy 1996eda14cbcSMatt Macy /* 1997eda14cbcSMatt Macy * Initialize the interesting rows of the matrix. 
1998eda14cbcSMatt Macy */ 19997877fdebSMatt Macy vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows); 2000eda14cbcSMatt Macy 2001eda14cbcSMatt Macy /* 2002eda14cbcSMatt Macy * Invert the matrix. 2003eda14cbcSMatt Macy */ 20047877fdebSMatt Macy vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows, 2005eda14cbcSMatt Macy invrows, used); 2006eda14cbcSMatt Macy 2007eda14cbcSMatt Macy /* 2008eda14cbcSMatt Macy * Reconstruct the missing data using the generated matrix. 2009eda14cbcSMatt Macy */ 20107877fdebSMatt Macy vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows, 2011eda14cbcSMatt Macy invrows, used); 2012eda14cbcSMatt Macy 2013eda14cbcSMatt Macy kmem_free(p, psize); 2014eda14cbcSMatt Macy 2015eda14cbcSMatt Macy /* 2016eda14cbcSMatt Macy * copy back from temporary linear abds and free them 2017eda14cbcSMatt Macy */ 2018eda14cbcSMatt Macy if (bufs) { 20197877fdebSMatt Macy for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 20207877fdebSMatt Macy raidz_col_t *col = &rr->rr_col[c]; 2021eda14cbcSMatt Macy 20227877fdebSMatt Macy if (bufs[c] != NULL) { 2023eda14cbcSMatt Macy abd_copy(bufs[c], col->rc_abd, col->rc_size); 2024eda14cbcSMatt Macy abd_free(col->rc_abd); 20257877fdebSMatt Macy } 2026eda14cbcSMatt Macy col->rc_abd = bufs[c]; 2027eda14cbcSMatt Macy } 20287877fdebSMatt Macy kmem_free(bufs, rr->rr_cols * sizeof (abd_t *)); 2029eda14cbcSMatt Macy } 2030eda14cbcSMatt Macy } 2031eda14cbcSMatt Macy 2032f9693befSMartin Matuska static void 20337877fdebSMatt Macy vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr, 20347877fdebSMatt Macy const int *t, int nt) 2035eda14cbcSMatt Macy { 2036eda14cbcSMatt Macy int tgts[VDEV_RAIDZ_MAXPARITY], *dt; 2037eda14cbcSMatt Macy int ntgts; 2038eda14cbcSMatt Macy int i, c, ret; 2039eda14cbcSMatt Macy int nbadparity, nbaddata; 2040eda14cbcSMatt Macy int parity_valid[VDEV_RAIDZ_MAXPARITY]; 2041eda14cbcSMatt Macy 2042e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { 2043e716630dSMartin Matuska zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)", 2044e716630dSMartin Matuska rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata, 2045e716630dSMartin Matuska (int)rr->rr_missingparity); 2046e716630dSMartin Matuska } 2047e716630dSMartin Matuska 20487877fdebSMatt Macy nbadparity = rr->rr_firstdatacol; 20497877fdebSMatt Macy nbaddata = rr->rr_cols - nbadparity; 2050eda14cbcSMatt Macy ntgts = 0; 20517877fdebSMatt Macy for (i = 0, c = 0; c < rr->rr_cols; c++) { 2052e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { 2053e716630dSMartin Matuska zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u " 2054e716630dSMartin Matuska "offset=%llx error=%u)", 2055e716630dSMartin Matuska rr, c, (int)rr->rr_col[c].rc_devidx, 2056e716630dSMartin Matuska (long long)rr->rr_col[c].rc_offset, 2057e716630dSMartin Matuska (int)rr->rr_col[c].rc_error); 2058e716630dSMartin Matuska } 20597877fdebSMatt Macy if (c < rr->rr_firstdatacol) 2060eda14cbcSMatt Macy parity_valid[c] = B_FALSE; 2061eda14cbcSMatt Macy 2062eda14cbcSMatt Macy if (i < nt && c == t[i]) { 2063eda14cbcSMatt Macy tgts[ntgts++] = c; 2064eda14cbcSMatt Macy i++; 20657877fdebSMatt Macy } else if (rr->rr_col[c].rc_error != 0) { 2066eda14cbcSMatt Macy tgts[ntgts++] = c; 20677877fdebSMatt Macy } else if (c >= rr->rr_firstdatacol) { 2068eda14cbcSMatt Macy nbaddata--; 2069eda14cbcSMatt Macy } else { 2070eda14cbcSMatt Macy parity_valid[c] = B_TRUE; 2071eda14cbcSMatt Macy nbadparity--; 2072eda14cbcSMatt Macy } 2073eda14cbcSMatt Macy } 2074eda14cbcSMatt Macy 
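	/*
	 * At this point tgts[] holds every column being reconstructed:
	 * the caller's explicit targets plus any column that already
	 * recorded an error.  Because columns were scanned in order, bad
	 * parity entries precede bad data entries, which is why dt below
	 * can simply point just past the parity portion of tgts[].
	 */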
2075eda14cbcSMatt Macy ASSERT(ntgts >= nt); 2076eda14cbcSMatt Macy ASSERT(nbaddata >= 0); 2077eda14cbcSMatt Macy ASSERT(nbaddata + nbadparity == ntgts); 2078eda14cbcSMatt Macy 2079eda14cbcSMatt Macy dt = &tgts[nbadparity]; 2080eda14cbcSMatt Macy 2081eda14cbcSMatt Macy /* Reconstruct using the new math implementation */ 20827877fdebSMatt Macy ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata); 2083eda14cbcSMatt Macy if (ret != RAIDZ_ORIGINAL_IMPL) 2084f9693befSMartin Matuska return; 2085eda14cbcSMatt Macy 2086eda14cbcSMatt Macy /* 2087eda14cbcSMatt Macy * See if we can use any of our optimized reconstruction routines. 2088eda14cbcSMatt Macy */ 2089eda14cbcSMatt Macy switch (nbaddata) { 2090eda14cbcSMatt Macy case 1: 2091f9693befSMartin Matuska if (parity_valid[VDEV_RAIDZ_P]) { 2092f9693befSMartin Matuska vdev_raidz_reconstruct_p(rr, dt, 1); 2093f9693befSMartin Matuska return; 2094f9693befSMartin Matuska } 2095eda14cbcSMatt Macy 20967877fdebSMatt Macy ASSERT(rr->rr_firstdatacol > 1); 2097eda14cbcSMatt Macy 2098f9693befSMartin Matuska if (parity_valid[VDEV_RAIDZ_Q]) { 2099f9693befSMartin Matuska vdev_raidz_reconstruct_q(rr, dt, 1); 2100f9693befSMartin Matuska return; 2101f9693befSMartin Matuska } 2102eda14cbcSMatt Macy 21037877fdebSMatt Macy ASSERT(rr->rr_firstdatacol > 2); 2104eda14cbcSMatt Macy break; 2105eda14cbcSMatt Macy 2106eda14cbcSMatt Macy case 2: 21077877fdebSMatt Macy ASSERT(rr->rr_firstdatacol > 1); 2108eda14cbcSMatt Macy 2109eda14cbcSMatt Macy if (parity_valid[VDEV_RAIDZ_P] && 2110f9693befSMartin Matuska parity_valid[VDEV_RAIDZ_Q]) { 2111f9693befSMartin Matuska vdev_raidz_reconstruct_pq(rr, dt, 2); 2112f9693befSMartin Matuska return; 2113f9693befSMartin Matuska } 2114eda14cbcSMatt Macy 21157877fdebSMatt Macy ASSERT(rr->rr_firstdatacol > 2); 2116eda14cbcSMatt Macy 2117eda14cbcSMatt Macy break; 2118eda14cbcSMatt Macy } 2119eda14cbcSMatt Macy 2120f9693befSMartin Matuska vdev_raidz_reconstruct_general(rr, tgts, ntgts); 2121eda14cbcSMatt Macy } 2122eda14cbcSMatt Macy 2123eda14cbcSMatt Macy static int 2124eda14cbcSMatt Macy vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, 2125eda14cbcSMatt Macy uint64_t *logical_ashift, uint64_t *physical_ashift) 2126eda14cbcSMatt Macy { 21277877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd; 21287877fdebSMatt Macy uint64_t nparity = vdrz->vd_nparity; 2129eda14cbcSMatt Macy int c; 2130eda14cbcSMatt Macy int lasterror = 0; 2131eda14cbcSMatt Macy int numerrors = 0; 2132eda14cbcSMatt Macy 2133eda14cbcSMatt Macy ASSERT(nparity > 0); 2134eda14cbcSMatt Macy 2135eda14cbcSMatt Macy if (nparity > VDEV_RAIDZ_MAXPARITY || 2136eda14cbcSMatt Macy vd->vdev_children < nparity + 1) { 2137eda14cbcSMatt Macy vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 2138eda14cbcSMatt Macy return (SET_ERROR(EINVAL)); 2139eda14cbcSMatt Macy } 2140eda14cbcSMatt Macy 2141eda14cbcSMatt Macy vdev_open_children(vd); 2142eda14cbcSMatt Macy 2143eda14cbcSMatt Macy for (c = 0; c < vd->vdev_children; c++) { 21447877fdebSMatt Macy vdev_t *cvd = vd->vdev_child[c]; 2145eda14cbcSMatt Macy 2146eda14cbcSMatt Macy if (cvd->vdev_open_error != 0) { 2147eda14cbcSMatt Macy lasterror = cvd->vdev_open_error; 2148eda14cbcSMatt Macy numerrors++; 2149eda14cbcSMatt Macy continue; 2150eda14cbcSMatt Macy } 2151eda14cbcSMatt Macy 2152eda14cbcSMatt Macy *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 2153eda14cbcSMatt Macy *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; 2154eda14cbcSMatt Macy *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); 
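		/*
		 * The MIN(x - 1, y - 1) + 1 form above lets an initial
		 * value of zero act as "no limit yet" (0 - 1 wraps to
		 * UINT64_MAX in unsigned arithmetic), so after this loop
		 * *asize and *max_asize reflect the smallest child that
		 * opened successfully.
		 */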
2155c7046f76SMartin Matuska } 2156c7046f76SMartin Matuska for (c = 0; c < vd->vdev_children; c++) { 2157c7046f76SMartin Matuska vdev_t *cvd = vd->vdev_child[c]; 2158c7046f76SMartin Matuska 2159c7046f76SMartin Matuska if (cvd->vdev_open_error != 0) 2160c7046f76SMartin Matuska continue; 2161c7046f76SMartin Matuska *physical_ashift = vdev_best_ashift(*logical_ashift, 2162c7046f76SMartin Matuska *physical_ashift, cvd->vdev_physical_ashift); 2163eda14cbcSMatt Macy } 2164eda14cbcSMatt Macy 2165e716630dSMartin Matuska if (vd->vdev_rz_expanding) { 2166e716630dSMartin Matuska *asize *= vd->vdev_children - 1; 2167e716630dSMartin Matuska *max_asize *= vd->vdev_children - 1; 2168e716630dSMartin Matuska 2169e716630dSMartin Matuska vd->vdev_min_asize = *asize; 2170e716630dSMartin Matuska } else { 2171eda14cbcSMatt Macy *asize *= vd->vdev_children; 2172eda14cbcSMatt Macy *max_asize *= vd->vdev_children; 2173e716630dSMartin Matuska } 2174eda14cbcSMatt Macy 2175eda14cbcSMatt Macy if (numerrors > nparity) { 2176eda14cbcSMatt Macy vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 2177eda14cbcSMatt Macy return (lasterror); 2178eda14cbcSMatt Macy } 2179eda14cbcSMatt Macy 2180eda14cbcSMatt Macy return (0); 2181eda14cbcSMatt Macy } 2182eda14cbcSMatt Macy 2183eda14cbcSMatt Macy static void 2184eda14cbcSMatt Macy vdev_raidz_close(vdev_t *vd) 2185eda14cbcSMatt Macy { 21867877fdebSMatt Macy for (int c = 0; c < vd->vdev_children; c++) { 21877877fdebSMatt Macy if (vd->vdev_child[c] != NULL) 2188eda14cbcSMatt Macy vdev_close(vd->vdev_child[c]); 2189eda14cbcSMatt Macy } 21907877fdebSMatt Macy } 2191eda14cbcSMatt Macy 2192e716630dSMartin Matuska /* 2193e716630dSMartin Matuska * Return the logical width to use, given the txg in which the allocation 2194783d3ff6SMartin Matuska * happened. Note that BP_GET_BIRTH() is usually the txg in which the 2195e716630dSMartin Matuska * BP was allocated. Remapped BP's (that were relocated due to device 2196783d3ff6SMartin Matuska * removal, see remap_blkptr_cb()), will have a more recent physical birth 2197783d3ff6SMartin Matuska * which reflects when the BP was relocated, but we can ignore these because 2198783d3ff6SMartin Matuska * they can't be on RAIDZ (device removal doesn't support RAIDZ). 
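 * The lookup below uses the expansion record with the greatest txg at or
 * before the allocation txg and returns the width stored there; if no
 * expansion precedes the txg, the original width applies.  For example
 * (txgs and widths hypothetical), on a vdev expanded from 5 to 6 children
 * at txg 120, a block born at txg 100 maps with width 5 and one born at
 * txg 130 maps with width 6.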
2199e716630dSMartin Matuska */ 2200eda14cbcSMatt Macy static uint64_t 2201e716630dSMartin Matuska vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg) 2202e716630dSMartin Matuska { 2203e716630dSMartin Matuska reflow_node_t lookup = { 2204e716630dSMartin Matuska .re_txg = txg, 2205e716630dSMartin Matuska }; 2206e716630dSMartin Matuska avl_index_t where; 2207e716630dSMartin Matuska 2208e716630dSMartin Matuska uint64_t width; 2209e716630dSMartin Matuska mutex_enter(&vdrz->vd_expand_lock); 2210e716630dSMartin Matuska reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where); 2211e716630dSMartin Matuska if (re != NULL) { 2212e716630dSMartin Matuska width = re->re_logical_width; 2213e716630dSMartin Matuska } else { 2214e716630dSMartin Matuska re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE); 2215e716630dSMartin Matuska if (re != NULL) 2216e716630dSMartin Matuska width = re->re_logical_width; 2217e716630dSMartin Matuska else 2218e716630dSMartin Matuska width = vdrz->vd_original_width; 2219e716630dSMartin Matuska } 2220e716630dSMartin Matuska mutex_exit(&vdrz->vd_expand_lock); 2221e716630dSMartin Matuska return (width); 2222e716630dSMartin Matuska } 2223e716630dSMartin Matuska 2224e716630dSMartin Matuska /* 2225e716630dSMartin Matuska * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated 2226e716630dSMartin Matuska * more space due to the lower data-to-parity ratio. In this case it's 2227e716630dSMartin Matuska * important to pass in the correct txg. Note that vdev_gang_header_asize() 2228e716630dSMartin Matuska * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE, 2229e716630dSMartin Matuska * regardless of txg. This is assured because for a single data sector, we 2230e716630dSMartin Matuska * allocate P+1 sectors regardless of width ("cols", which is at least P+1). 2231e716630dSMartin Matuska */ 2232e716630dSMartin Matuska static uint64_t 2233e716630dSMartin Matuska vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg) 2234eda14cbcSMatt Macy { 22357877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd; 2236eda14cbcSMatt Macy uint64_t asize; 2237eda14cbcSMatt Macy uint64_t ashift = vd->vdev_top->vdev_ashift; 2238e716630dSMartin Matuska uint64_t cols = vdrz->vd_original_width; 22397877fdebSMatt Macy uint64_t nparity = vdrz->vd_nparity; 2240eda14cbcSMatt Macy 2241e716630dSMartin Matuska cols = vdev_raidz_get_logical_width(vdrz, txg); 2242e716630dSMartin Matuska 2243eda14cbcSMatt Macy asize = ((psize - 1) >> ashift) + 1; 2244eda14cbcSMatt Macy asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 2245eda14cbcSMatt Macy asize = roundup(asize, nparity + 1) << ashift; 2246eda14cbcSMatt Macy 2247e716630dSMartin Matuska #ifdef ZFS_DEBUG 2248e716630dSMartin Matuska uint64_t asize_new = ((psize - 1) >> ashift) + 1; 2249e716630dSMartin Matuska uint64_t ncols_new = vdrz->vd_physical_width; 2250e716630dSMartin Matuska asize_new += nparity * ((asize_new + ncols_new - nparity - 1) / 2251e716630dSMartin Matuska (ncols_new - nparity)); 2252e716630dSMartin Matuska asize_new = roundup(asize_new, nparity + 1) << ashift; 2253e716630dSMartin Matuska VERIFY3U(asize_new, <=, asize); 2254e716630dSMartin Matuska #endif 2255e716630dSMartin Matuska 2256eda14cbcSMatt Macy return (asize); 2257eda14cbcSMatt Macy } 2258eda14cbcSMatt Macy 22597877fdebSMatt Macy /* 22607877fdebSMatt Macy * The allocatable space for a raidz vdev is N * sizeof(smallest child) 22617877fdebSMatt Macy * so each child must provide at least 1/Nth of its asize. 
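 * (The division below rounds up, so the per-child requirement is not
 * understated when vdev_min_asize is not an exact multiple of the child
 * count.)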
22627877fdebSMatt Macy */ 22637877fdebSMatt Macy static uint64_t 22647877fdebSMatt Macy vdev_raidz_min_asize(vdev_t *vd) 22657877fdebSMatt Macy { 22667877fdebSMatt Macy return ((vd->vdev_min_asize + vd->vdev_children - 1) / 22677877fdebSMatt Macy vd->vdev_children); 22687877fdebSMatt Macy } 22697877fdebSMatt Macy 22707877fdebSMatt Macy void 2271eda14cbcSMatt Macy vdev_raidz_child_done(zio_t *zio) 2272eda14cbcSMatt Macy { 2273eda14cbcSMatt Macy raidz_col_t *rc = zio->io_private; 2274eda14cbcSMatt Macy 227581b22a98SMartin Matuska ASSERT3P(rc->rc_abd, !=, NULL); 2276eda14cbcSMatt Macy rc->rc_error = zio->io_error; 2277eda14cbcSMatt Macy rc->rc_tried = 1; 2278eda14cbcSMatt Macy rc->rc_skipped = 0; 2279eda14cbcSMatt Macy } 2280eda14cbcSMatt Macy 2281eda14cbcSMatt Macy static void 2282e716630dSMartin Matuska vdev_raidz_shadow_child_done(zio_t *zio) 2283eda14cbcSMatt Macy { 2284e716630dSMartin Matuska raidz_col_t *rc = zio->io_private; 2285eda14cbcSMatt Macy 2286e716630dSMartin Matuska rc->rc_shadow_error = zio->io_error; 2287e716630dSMartin Matuska } 2288e716630dSMartin Matuska 2289e716630dSMartin Matuska static void 2290e716630dSMartin Matuska vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col) 2291e716630dSMartin Matuska { 2292e716630dSMartin Matuska (void) rm; 2293e716630dSMartin Matuska #ifdef ZFS_DEBUG 22947877fdebSMatt Macy range_seg64_t logical_rs, physical_rs, remain_rs; 22957877fdebSMatt Macy logical_rs.rs_start = rr->rr_offset; 2296eda14cbcSMatt Macy logical_rs.rs_end = logical_rs.rs_start + 2297e716630dSMartin Matuska vdev_raidz_asize(zio->io_vd, rr->rr_size, 2298783d3ff6SMartin Matuska BP_GET_BIRTH(zio->io_bp)); 2299eda14cbcSMatt Macy 23007877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[col]; 2301e716630dSMartin Matuska vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; 2302eda14cbcSMatt Macy 23037877fdebSMatt Macy vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs); 23047877fdebSMatt Macy ASSERT(vdev_xlate_is_empty(&remain_rs)); 2305e716630dSMartin Matuska if (vdev_xlate_is_empty(&physical_rs)) { 2306e716630dSMartin Matuska /* 2307e716630dSMartin Matuska * If we are in the middle of expansion, the 2308e716630dSMartin Matuska * physical->logical mapping is changing so vdev_xlate() 2309e716630dSMartin Matuska * can't give us a reliable answer. 2310e716630dSMartin Matuska */ 2311e716630dSMartin Matuska return; 2312e716630dSMartin Matuska } 2313eda14cbcSMatt Macy ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); 2314eda14cbcSMatt Macy ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); 2315eda14cbcSMatt Macy /* 2316eda14cbcSMatt Macy * It would be nice to assert that rs_end is equal 2317eda14cbcSMatt Macy * to rc_offset + rc_size but there might be an 2318eda14cbcSMatt Macy * optional I/O at the end that is not accounted in 2319eda14cbcSMatt Macy * rc_size. 
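 * When that optional I/O is present it is a single skip sector, so the
 * check below accepts rs_end exceeding rc_offset + rc_size by exactly
 * one 1 << ashift, and requires equality otherwise.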
2320eda14cbcSMatt Macy */ 2321eda14cbcSMatt Macy if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { 2322eda14cbcSMatt Macy ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + 2323e716630dSMartin Matuska rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift)); 2324eda14cbcSMatt Macy } else { 2325eda14cbcSMatt Macy ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); 2326eda14cbcSMatt Macy } 2327eda14cbcSMatt Macy #endif 2328eda14cbcSMatt Macy } 2329eda14cbcSMatt Macy 23307877fdebSMatt Macy static void 2331e716630dSMartin Matuska vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr) 23327877fdebSMatt Macy { 23337877fdebSMatt Macy vdev_t *vd = zio->io_vd; 23347877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd; 23357877fdebSMatt Macy 23367877fdebSMatt Macy vdev_raidz_generate_parity_row(rm, rr); 23377877fdebSMatt Macy 233881b22a98SMartin Matuska for (int c = 0; c < rr->rr_scols; c++) { 23397877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 234081b22a98SMartin Matuska vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 23417877fdebSMatt Macy 23427877fdebSMatt Macy /* Verify physical to logical translation */ 2343e716630dSMartin Matuska vdev_raidz_io_verify(zio, rm, rr, c); 23447877fdebSMatt Macy 2345e716630dSMartin Matuska if (rc->rc_size == 0) 2346e716630dSMartin Matuska continue; 2347e716630dSMartin Matuska 2348e716630dSMartin Matuska ASSERT3U(rc->rc_offset + rc->rc_size, <, 2349e716630dSMartin Matuska cvd->vdev_psize - VDEV_LABEL_END_SIZE); 2350e716630dSMartin Matuska 235181b22a98SMartin Matuska ASSERT3P(rc->rc_abd, !=, NULL); 23527877fdebSMatt Macy zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 235381b22a98SMartin Matuska rc->rc_offset, rc->rc_abd, 235481b22a98SMartin Matuska abd_get_size(rc->rc_abd), zio->io_type, 235581b22a98SMartin Matuska zio->io_priority, 0, vdev_raidz_child_done, rc)); 2356e716630dSMartin Matuska 2357e716630dSMartin Matuska if (rc->rc_shadow_devidx != INT_MAX) { 2358e716630dSMartin Matuska vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx]; 2359e716630dSMartin Matuska 2360e716630dSMartin Matuska ASSERT3U( 2361e716630dSMartin Matuska rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <, 2362e716630dSMartin Matuska cvd2->vdev_psize - VDEV_LABEL_END_SIZE); 2363e716630dSMartin Matuska 2364e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(zio, NULL, cvd2, 2365e716630dSMartin Matuska rc->rc_shadow_offset, rc->rc_abd, 2366e716630dSMartin Matuska abd_get_size(rc->rc_abd), 2367e716630dSMartin Matuska zio->io_type, zio->io_priority, 0, 2368e716630dSMartin Matuska vdev_raidz_shadow_child_done, rc)); 236981b22a98SMartin Matuska } 23707877fdebSMatt Macy } 23717877fdebSMatt Macy } 23727877fdebSMatt Macy 2373e716630dSMartin Matuska /* 2374e716630dSMartin Matuska * Generate optional I/Os for skip sectors to improve aggregation contiguity. 2375e716630dSMartin Matuska * This only works for vdev_raidz_map_alloc() (not _expanded()). 
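 * Each such column (rc_size == 0, no ABD attached) gets a one-sector
 * (1 << ashift) child write at rc_offset flagged ZIO_FLAG_NODATA |
 * ZIO_FLAG_OPTIONAL, so the aggregation code may use it to bridge
 * otherwise non-contiguous writes to the same child, but is free to
 * drop it.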
2376e716630dSMartin Matuska */ 23777877fdebSMatt Macy static void 2378e716630dSMartin Matuska raidz_start_skip_writes(zio_t *zio) 2379e716630dSMartin Matuska { 2380e716630dSMartin Matuska vdev_t *vd = zio->io_vd; 2381e716630dSMartin Matuska uint64_t ashift = vd->vdev_top->vdev_ashift; 2382e716630dSMartin Matuska raidz_map_t *rm = zio->io_vsd; 2383e716630dSMartin Matuska ASSERT3U(rm->rm_nrows, ==, 1); 2384e716630dSMartin Matuska raidz_row_t *rr = rm->rm_row[0]; 2385e716630dSMartin Matuska for (int c = 0; c < rr->rr_scols; c++) { 2386e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c]; 2387e716630dSMartin Matuska vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 2388e716630dSMartin Matuska if (rc->rc_size != 0) 2389e716630dSMartin Matuska continue; 2390e716630dSMartin Matuska ASSERT3P(rc->rc_abd, ==, NULL); 2391e716630dSMartin Matuska 2392e716630dSMartin Matuska ASSERT3U(rc->rc_offset, <, 2393e716630dSMartin Matuska cvd->vdev_psize - VDEV_LABEL_END_SIZE); 2394e716630dSMartin Matuska 2395e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, 2396e716630dSMartin Matuska NULL, 1ULL << ashift, zio->io_type, zio->io_priority, 2397e716630dSMartin Matuska ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); 2398e716630dSMartin Matuska } 2399e716630dSMartin Matuska } 2400e716630dSMartin Matuska 2401e716630dSMartin Matuska static void 2402e716630dSMartin Matuska vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) 24037877fdebSMatt Macy { 24047877fdebSMatt Macy vdev_t *vd = zio->io_vd; 24057877fdebSMatt Macy 24067877fdebSMatt Macy /* 24077877fdebSMatt Macy * Iterate over the columns in reverse order so that we hit the parity 24087877fdebSMatt Macy * last -- any errors along the way will force us to read the parity. 
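 * (Parity occupies columns 0 through rr_firstdatacol - 1, so walking
 * from rr_cols - 1 down to 0 visits every data column before any
 * parity column.)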
24097877fdebSMatt Macy */ 24107877fdebSMatt Macy for (int c = rr->rr_cols - 1; c >= 0; c--) { 24117877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 24127877fdebSMatt Macy if (rc->rc_size == 0) 24137877fdebSMatt Macy continue; 24147877fdebSMatt Macy vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 24157877fdebSMatt Macy if (!vdev_readable(cvd)) { 24167877fdebSMatt Macy if (c >= rr->rr_firstdatacol) 24177877fdebSMatt Macy rr->rr_missingdata++; 24187877fdebSMatt Macy else 24197877fdebSMatt Macy rr->rr_missingparity++; 24207877fdebSMatt Macy rc->rc_error = SET_ERROR(ENXIO); 24217877fdebSMatt Macy rc->rc_tried = 1; /* don't even try */ 24227877fdebSMatt Macy rc->rc_skipped = 1; 24237877fdebSMatt Macy continue; 24247877fdebSMatt Macy } 24257877fdebSMatt Macy if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 24267877fdebSMatt Macy if (c >= rr->rr_firstdatacol) 24277877fdebSMatt Macy rr->rr_missingdata++; 24287877fdebSMatt Macy else 24297877fdebSMatt Macy rr->rr_missingparity++; 24307877fdebSMatt Macy rc->rc_error = SET_ERROR(ESTALE); 24317877fdebSMatt Macy rc->rc_skipped = 1; 24327877fdebSMatt Macy continue; 24337877fdebSMatt Macy } 2434e716630dSMartin Matuska if (forceparity || 2435e716630dSMartin Matuska c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || 24367877fdebSMatt Macy (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { 24377877fdebSMatt Macy zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 24387877fdebSMatt Macy rc->rc_offset, rc->rc_abd, rc->rc_size, 24397877fdebSMatt Macy zio->io_type, zio->io_priority, 0, 24407877fdebSMatt Macy vdev_raidz_child_done, rc)); 24417877fdebSMatt Macy } 24427877fdebSMatt Macy } 24437877fdebSMatt Macy } 24447877fdebSMatt Macy 2445e716630dSMartin Matuska static void 2446e716630dSMartin Matuska vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm) 2447e716630dSMartin Matuska { 2448e716630dSMartin Matuska vdev_t *vd = zio->io_vd; 2449e716630dSMartin Matuska 2450e716630dSMartin Matuska for (int i = 0; i < rm->rm_nphys_cols; i++) { 2451e716630dSMartin Matuska raidz_col_t *prc = &rm->rm_phys_col[i]; 2452e716630dSMartin Matuska if (prc->rc_size == 0) 2453e716630dSMartin Matuska continue; 2454e716630dSMartin Matuska 2455e716630dSMartin Matuska ASSERT3U(prc->rc_devidx, ==, i); 2456e716630dSMartin Matuska vdev_t *cvd = vd->vdev_child[i]; 2457e716630dSMartin Matuska if (!vdev_readable(cvd)) { 2458e716630dSMartin Matuska prc->rc_error = SET_ERROR(ENXIO); 2459e716630dSMartin Matuska prc->rc_tried = 1; /* don't even try */ 2460e716630dSMartin Matuska prc->rc_skipped = 1; 2461e716630dSMartin Matuska continue; 2462e716630dSMartin Matuska } 2463e716630dSMartin Matuska if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 2464e716630dSMartin Matuska prc->rc_error = SET_ERROR(ESTALE); 2465e716630dSMartin Matuska prc->rc_skipped = 1; 2466e716630dSMartin Matuska continue; 2467e716630dSMartin Matuska } 2468e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2469e716630dSMartin Matuska prc->rc_offset, prc->rc_abd, prc->rc_size, 2470e716630dSMartin Matuska zio->io_type, zio->io_priority, 0, 2471e716630dSMartin Matuska vdev_raidz_child_done, prc)); 2472e716630dSMartin Matuska } 2473e716630dSMartin Matuska } 2474e716630dSMartin Matuska 2475e716630dSMartin Matuska static void 2476e716630dSMartin Matuska vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm) 2477e716630dSMartin Matuska { 2478e716630dSMartin Matuska /* 2479e716630dSMartin Matuska * If there are multiple rows, we will be hitting 2480e716630dSMartin Matuska * all disks, so go 
ahead and read the parity so 2481e716630dSMartin Matuska * that we are reading in decent size chunks. 2482e716630dSMartin Matuska */ 2483e716630dSMartin Matuska boolean_t forceparity = rm->rm_nrows > 1; 2484e716630dSMartin Matuska 2485e716630dSMartin Matuska if (rm->rm_phys_col) { 2486e716630dSMartin Matuska vdev_raidz_io_start_read_phys_cols(zio, rm); 2487e716630dSMartin Matuska } else { 2488e716630dSMartin Matuska for (int i = 0; i < rm->rm_nrows; i++) { 2489e716630dSMartin Matuska raidz_row_t *rr = rm->rm_row[i]; 2490e716630dSMartin Matuska vdev_raidz_io_start_read_row(zio, rr, forceparity); 2491e716630dSMartin Matuska } 2492e716630dSMartin Matuska } 2493e716630dSMartin Matuska } 2494e716630dSMartin Matuska 2495eda14cbcSMatt Macy /* 2496eda14cbcSMatt Macy * Start an IO operation on a RAIDZ VDev 2497eda14cbcSMatt Macy * 2498eda14cbcSMatt Macy * Outline: 2499eda14cbcSMatt Macy * - For write operations: 2500eda14cbcSMatt Macy * 1. Generate the parity data 2501eda14cbcSMatt Macy * 2. Create child zio write operations to each column's vdev, for both 2502eda14cbcSMatt Macy * data and parity. 2503eda14cbcSMatt Macy * 3. If the column skips any sectors for padding, create optional dummy 2504eda14cbcSMatt Macy * write zio children for those areas to improve aggregation continuity. 2505eda14cbcSMatt Macy * - For read operations: 2506eda14cbcSMatt Macy * 1. Create child zio read operations to each data column's vdev to read 2507eda14cbcSMatt Macy * the range of data required for zio. 2508eda14cbcSMatt Macy * 2. If this is a scrub or resilver operation, or if any of the data 2509eda14cbcSMatt Macy * vdevs have had errors, then create zio read operations to the parity 2510eda14cbcSMatt Macy * columns' VDevs as well. 2511eda14cbcSMatt Macy */ 2512eda14cbcSMatt Macy static void 2513eda14cbcSMatt Macy vdev_raidz_io_start(zio_t *zio) 2514eda14cbcSMatt Macy { 2515eda14cbcSMatt Macy vdev_t *vd = zio->io_vd; 2516eda14cbcSMatt Macy vdev_t *tvd = vd->vdev_top; 25177877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd; 2518e716630dSMartin Matuska raidz_map_t *rm; 2519eda14cbcSMatt Macy 2520e716630dSMartin Matuska uint64_t logical_width = vdev_raidz_get_logical_width(vdrz, 2521783d3ff6SMartin Matuska BP_GET_BIRTH(zio->io_bp)); 2522e716630dSMartin Matuska if (logical_width != vdrz->vd_physical_width) { 2523e716630dSMartin Matuska zfs_locked_range_t *lr = NULL; 2524e716630dSMartin Matuska uint64_t synced_offset = UINT64_MAX; 2525e716630dSMartin Matuska uint64_t next_offset = UINT64_MAX; 2526e716630dSMartin Matuska boolean_t use_scratch = B_FALSE; 2527e716630dSMartin Matuska /* 2528e716630dSMartin Matuska * Note: when the expansion is completing, we set 2529e716630dSMartin Matuska * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync()) 2530e716630dSMartin Matuska * in a later txg than when we last update spa_ubsync's state 2531e716630dSMartin Matuska * (see the end of spa_raidz_expand_thread()). Therefore we 2532e716630dSMartin Matuska * may see vre_state!=SCANNING before 2533e716630dSMartin Matuska * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected 2534e716630dSMartin Matuska * on disk, but the copying progress has been synced to disk 2535e716630dSMartin Matuska * (and reflected in spa_ubsync). In this case it's fine to 2536e716630dSMartin Matuska * treat the expansion as completed, since if we crash there's 2537e716630dSMartin Matuska * no additional copying to do. 
2538e716630dSMartin Matuska */ 2539e716630dSMartin Matuska if (vdrz->vn_vre.vre_state == DSS_SCANNING) { 2540e716630dSMartin Matuska ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==, 2541e716630dSMartin Matuska &vdrz->vn_vre); 2542e716630dSMartin Matuska lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock, 2543e716630dSMartin Matuska zio->io_offset, zio->io_size, RL_READER); 2544e716630dSMartin Matuska use_scratch = 2545e716630dSMartin Matuska (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) == 2546e716630dSMartin Matuska RRSS_SCRATCH_VALID); 2547e716630dSMartin Matuska synced_offset = 2548e716630dSMartin Matuska RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync); 2549e716630dSMartin Matuska next_offset = vdrz->vn_vre.vre_offset; 2550e716630dSMartin Matuska /* 2551e716630dSMartin Matuska * If we haven't resumed expanding since importing the 2552e716630dSMartin Matuska * pool, vre_offset won't have been set yet. In 2553e716630dSMartin Matuska * this case the next offset to be copied is the same 2554e716630dSMartin Matuska * as what was synced. 2555e716630dSMartin Matuska */ 2556e716630dSMartin Matuska if (next_offset == UINT64_MAX) { 2557e716630dSMartin Matuska next_offset = synced_offset; 2558e716630dSMartin Matuska } 2559e716630dSMartin Matuska } 2560e716630dSMartin Matuska if (use_scratch) { 2561e716630dSMartin Matuska zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced=" 2562e716630dSMartin Matuska "%lld next_offset=%lld use_scratch=%u", 2563e716630dSMartin Matuska zio, 2564e716630dSMartin Matuska zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ", 2565e716630dSMartin Matuska (long long)zio->io_offset, 2566e716630dSMartin Matuska (long long)synced_offset, 2567e716630dSMartin Matuska (long long)next_offset, 2568e716630dSMartin Matuska use_scratch); 2569e716630dSMartin Matuska } 2570e716630dSMartin Matuska 2571e716630dSMartin Matuska rm = vdev_raidz_map_alloc_expanded(zio, 2572e716630dSMartin Matuska tvd->vdev_ashift, vdrz->vd_physical_width, 2573e716630dSMartin Matuska logical_width, vdrz->vd_nparity, 2574e716630dSMartin Matuska synced_offset, next_offset, use_scratch); 2575e716630dSMartin Matuska rm->rm_lr = lr; 2576e716630dSMartin Matuska } else { 2577e716630dSMartin Matuska rm = vdev_raidz_map_alloc(zio, 2578e716630dSMartin Matuska tvd->vdev_ashift, logical_width, vdrz->vd_nparity); 2579e716630dSMartin Matuska } 2580e716630dSMartin Matuska rm->rm_original_width = vdrz->vd_original_width; 2581e716630dSMartin Matuska 2582f9693befSMartin Matuska zio->io_vsd = rm; 2583f9693befSMartin Matuska zio->io_vsd_ops = &vdev_raidz_vsd_ops; 2584eda14cbcSMatt Macy if (zio->io_type == ZIO_TYPE_WRITE) { 2585e716630dSMartin Matuska for (int i = 0; i < rm->rm_nrows; i++) { 2586e716630dSMartin Matuska vdev_raidz_io_start_write(zio, rm->rm_row[i]); 2587e716630dSMartin Matuska } 2588e716630dSMartin Matuska 2589e716630dSMartin Matuska if (logical_width == vdrz->vd_physical_width) { 2590e716630dSMartin Matuska raidz_start_skip_writes(zio); 2591e716630dSMartin Matuska } 25927877fdebSMatt Macy } else { 2593eda14cbcSMatt Macy ASSERT(zio->io_type == ZIO_TYPE_READ); 2594e716630dSMartin Matuska vdev_raidz_io_start_read(zio, rm); 2595eda14cbcSMatt Macy } 2596eda14cbcSMatt Macy 2597eda14cbcSMatt Macy zio_execute(zio); 2598eda14cbcSMatt Macy } 2599eda14cbcSMatt Macy 2600eda14cbcSMatt Macy /* 2601eda14cbcSMatt Macy * Report a checksum error for a child of a RAID-Z device. 
2602eda14cbcSMatt Macy */ 2603e92ffd9bSMartin Matuska void 2604e92ffd9bSMartin Matuska vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) 2605eda14cbcSMatt Macy { 2606eda14cbcSMatt Macy vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 2607eda14cbcSMatt Macy 26087877fdebSMatt Macy if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) && 26097877fdebSMatt Macy zio->io_priority != ZIO_PRIORITY_REBUILD) { 2610eda14cbcSMatt Macy zio_bad_cksum_t zbc; 2611eda14cbcSMatt Macy raidz_map_t *rm = zio->io_vsd; 2612eda14cbcSMatt Macy 2613eda14cbcSMatt Macy zbc.zbc_has_cksum = 0; 2614eda14cbcSMatt Macy zbc.zbc_injected = rm->rm_ecksuminjected; 2615eda14cbcSMatt Macy 26162c48331dSMatt Macy mutex_enter(&vd->vdev_stat_lock); 26172c48331dSMatt Macy vd->vdev_stat.vs_checksum_errors++; 26182c48331dSMatt Macy mutex_exit(&vd->vdev_stat_lock); 2619bb2d13b6SMartin Matuska (void) zfs_ereport_post_checksum(zio->io_spa, vd, 2620bb2d13b6SMartin Matuska &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, 2621bb2d13b6SMartin Matuska rc->rc_abd, bad_data, &zbc); 26222c48331dSMatt Macy } 2623eda14cbcSMatt Macy } 2624eda14cbcSMatt Macy 2625eda14cbcSMatt Macy /* 2626eda14cbcSMatt Macy * We keep track of whether or not there were any injected errors, so that 2627eda14cbcSMatt Macy * any ereports we generate can note it. 2628eda14cbcSMatt Macy */ 2629eda14cbcSMatt Macy static int 2630eda14cbcSMatt Macy raidz_checksum_verify(zio_t *zio) 2631eda14cbcSMatt Macy { 2632315ee00fSMartin Matuska zio_bad_cksum_t zbc = {0}; 2633eda14cbcSMatt Macy raidz_map_t *rm = zio->io_vsd; 2634eda14cbcSMatt Macy 2635eda14cbcSMatt Macy int ret = zio_checksum_error(zio, &zbc); 2636eda14cbcSMatt Macy if (ret != 0 && zbc.zbc_injected != 0) 2637eda14cbcSMatt Macy rm->rm_ecksuminjected = 1; 2638eda14cbcSMatt Macy 2639eda14cbcSMatt Macy return (ret); 2640eda14cbcSMatt Macy } 2641eda14cbcSMatt Macy 2642eda14cbcSMatt Macy /* 2643eda14cbcSMatt Macy * Generate the parity from the data columns. If we tried and were able to 2644eda14cbcSMatt Macy * read the parity without error, verify that the generated parity matches the 2645eda14cbcSMatt Macy * data we read. If it doesn't, we fire off a checksum error. Return the 26467877fdebSMatt Macy * number of such failures. 2647eda14cbcSMatt Macy */ 2648eda14cbcSMatt Macy static int 26497877fdebSMatt Macy raidz_parity_verify(zio_t *zio, raidz_row_t *rr) 2650eda14cbcSMatt Macy { 2651eda14cbcSMatt Macy abd_t *orig[VDEV_RAIDZ_MAXPARITY]; 2652eda14cbcSMatt Macy int c, ret = 0; 26537877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd; 2654eda14cbcSMatt Macy raidz_col_t *rc; 2655eda14cbcSMatt Macy 2656eda14cbcSMatt Macy blkptr_t *bp = zio->io_bp; 2657eda14cbcSMatt Macy enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : 2658eda14cbcSMatt Macy (BP_IS_GANG(bp) ? 
ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); 2659eda14cbcSMatt Macy 2660eda14cbcSMatt Macy if (checksum == ZIO_CHECKSUM_NOPARITY) 2661eda14cbcSMatt Macy return (ret); 2662eda14cbcSMatt Macy 26637877fdebSMatt Macy for (c = 0; c < rr->rr_firstdatacol; c++) { 26647877fdebSMatt Macy rc = &rr->rr_col[c]; 2665eda14cbcSMatt Macy if (!rc->rc_tried || rc->rc_error != 0) 2666eda14cbcSMatt Macy continue; 2667eda14cbcSMatt Macy 2668a0b956f5SMartin Matuska orig[c] = rc->rc_abd; 2669a0b956f5SMartin Matuska ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size); 2670a0b956f5SMartin Matuska rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); 2671eda14cbcSMatt Macy } 2672eda14cbcSMatt Macy 26737877fdebSMatt Macy /* 2674e92ffd9bSMartin Matuska * Verify any empty sectors are zero filled to ensure the parity 2675e92ffd9bSMartin Matuska * is calculated correctly even if these non-data sectors are damaged. 2676e92ffd9bSMartin Matuska */ 2677e92ffd9bSMartin Matuska if (rr->rr_nempty && rr->rr_abd_empty != NULL) 2678e92ffd9bSMartin Matuska ret += vdev_draid_map_verify_empty(zio, rr); 2679e92ffd9bSMartin Matuska 2680e92ffd9bSMartin Matuska /* 26817877fdebSMatt Macy * Regenerates parity even for !tried||rc_error!=0 columns. This 26827877fdebSMatt Macy * isn't harmful but it does have the side effect of fixing stuff 26837877fdebSMatt Macy * we didn't realize was necessary (i.e. even if we return 0). 26847877fdebSMatt Macy */ 26857877fdebSMatt Macy vdev_raidz_generate_parity_row(rm, rr); 2686eda14cbcSMatt Macy 26877877fdebSMatt Macy for (c = 0; c < rr->rr_firstdatacol; c++) { 26887877fdebSMatt Macy rc = &rr->rr_col[c]; 26897877fdebSMatt Macy 2690eda14cbcSMatt Macy if (!rc->rc_tried || rc->rc_error != 0) 2691eda14cbcSMatt Macy continue; 26927877fdebSMatt Macy 2693eda14cbcSMatt Macy if (abd_cmp(orig[c], rc->rc_abd) != 0) { 2694e716630dSMartin Matuska zfs_dbgmsg("found error on col=%u devidx=%u off %llx", 2695e716630dSMartin Matuska c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset); 2696e92ffd9bSMartin Matuska vdev_raidz_checksum_error(zio, rc, orig[c]); 2697eda14cbcSMatt Macy rc->rc_error = SET_ERROR(ECKSUM); 2698eda14cbcSMatt Macy ret++; 2699eda14cbcSMatt Macy } 2700eda14cbcSMatt Macy abd_free(orig[c]); 2701eda14cbcSMatt Macy } 2702eda14cbcSMatt Macy 2703eda14cbcSMatt Macy return (ret); 2704eda14cbcSMatt Macy } 2705eda14cbcSMatt Macy 2706eda14cbcSMatt Macy static int 27077877fdebSMatt Macy vdev_raidz_worst_error(raidz_row_t *rr) 2708eda14cbcSMatt Macy { 2709eda14cbcSMatt Macy int error = 0; 2710eda14cbcSMatt Macy 2711e716630dSMartin Matuska for (int c = 0; c < rr->rr_cols; c++) { 27127877fdebSMatt Macy error = zio_worst_error(error, rr->rr_col[c].rc_error); 2713e716630dSMartin Matuska error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error); 2714e716630dSMartin Matuska } 2715eda14cbcSMatt Macy 2716eda14cbcSMatt Macy return (error); 2717eda14cbcSMatt Macy } 2718eda14cbcSMatt Macy 2719eda14cbcSMatt Macy static void 27207877fdebSMatt Macy vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) 2721eda14cbcSMatt Macy { 2722eda14cbcSMatt Macy int unexpected_errors = 0; 2723eda14cbcSMatt Macy int parity_errors = 0; 2724eda14cbcSMatt Macy int parity_untried = 0; 2725eda14cbcSMatt Macy int data_errors = 0; 2726eda14cbcSMatt Macy 27277877fdebSMatt Macy ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); 2728eda14cbcSMatt Macy 27297877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 27307877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 2731eda14cbcSMatt Macy 2732eda14cbcSMatt Macy if (rc->rc_error) { 
27337877fdebSMatt Macy if (c < rr->rr_firstdatacol) 2734eda14cbcSMatt Macy parity_errors++; 2735eda14cbcSMatt Macy else 2736eda14cbcSMatt Macy data_errors++; 2737eda14cbcSMatt Macy 2738eda14cbcSMatt Macy if (!rc->rc_skipped) 2739eda14cbcSMatt Macy unexpected_errors++; 27407877fdebSMatt Macy } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { 2741eda14cbcSMatt Macy parity_untried++; 2742eda14cbcSMatt Macy } 2743a0b956f5SMartin Matuska 2744a0b956f5SMartin Matuska if (rc->rc_force_repair) 2745a0b956f5SMartin Matuska unexpected_errors++; 2746eda14cbcSMatt Macy } 2747eda14cbcSMatt Macy 2748eda14cbcSMatt Macy /* 27497877fdebSMatt Macy * If we read more parity disks than were used for 27507877fdebSMatt Macy * reconstruction, confirm that the other parity disks produced 27517877fdebSMatt Macy * correct data. 27527877fdebSMatt Macy * 27537877fdebSMatt Macy * Note that we also regenerate parity when resilvering so we 27547877fdebSMatt Macy * can write it out to failed devices later. 27557877fdebSMatt Macy */ 27567877fdebSMatt Macy if (parity_errors + parity_untried < 27577877fdebSMatt Macy rr->rr_firstdatacol - data_errors || 27587877fdebSMatt Macy (zio->io_flags & ZIO_FLAG_RESILVER)) { 27597877fdebSMatt Macy int n = raidz_parity_verify(zio, rr); 27607877fdebSMatt Macy unexpected_errors += n; 27617877fdebSMatt Macy } 27627877fdebSMatt Macy 27637877fdebSMatt Macy if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 27647877fdebSMatt Macy (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) { 27657877fdebSMatt Macy /* 27667877fdebSMatt Macy * Use the good data we have in hand to repair damaged children. 27677877fdebSMatt Macy */ 27687877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 27697877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 27707877fdebSMatt Macy vdev_t *vd = zio->io_vd; 27717877fdebSMatt Macy vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 27727877fdebSMatt Macy 277316038816SMartin Matuska if (!rc->rc_allow_repair) { 277416038816SMartin Matuska continue; 277516038816SMartin Matuska } else if (!rc->rc_force_repair && 277616038816SMartin Matuska (rc->rc_error == 0 || rc->rc_size == 0)) { 27777877fdebSMatt Macy continue; 27787877fdebSMatt Macy } 27797877fdebSMatt Macy 2780e716630dSMartin Matuska zfs_dbgmsg("zio=%px repairing c=%u devidx=%u " 2781e716630dSMartin Matuska "offset=%llx", 2782e716630dSMartin Matuska zio, c, rc->rc_devidx, (long long)rc->rc_offset); 2783e716630dSMartin Matuska 27847877fdebSMatt Macy zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 27857877fdebSMatt Macy rc->rc_offset, rc->rc_abd, rc->rc_size, 27867877fdebSMatt Macy ZIO_TYPE_WRITE, 27877877fdebSMatt Macy zio->io_priority == ZIO_PRIORITY_REBUILD ? 27887877fdebSMatt Macy ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE, 27897877fdebSMatt Macy ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 27907877fdebSMatt Macy ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); 27917877fdebSMatt Macy } 27927877fdebSMatt Macy } 2793e716630dSMartin Matuska 2794e716630dSMartin Matuska /* 2795e716630dSMartin Matuska * Scrub or resilver i/o's: overwrite any shadow locations with the 2796e716630dSMartin Matuska * good data. This ensures that if we've already copied this sector, 2797e716630dSMartin Matuska * it will be corrected if it was damaged. This writes more than is 2798e716630dSMartin Matuska * necessary, but since expansion is paused during scrub/resilver, at 2799e716630dSMartin Matuska * most a single row will have a shadow location. 
2800e716630dSMartin Matuska */ 2801e716630dSMartin Matuska if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 2802e716630dSMartin Matuska (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) { 2803e716630dSMartin Matuska for (int c = 0; c < rr->rr_cols; c++) { 2804e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c]; 2805e716630dSMartin Matuska vdev_t *vd = zio->io_vd; 2806e716630dSMartin Matuska 2807e716630dSMartin Matuska if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0) 2808e716630dSMartin Matuska continue; 2809e716630dSMartin Matuska vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx]; 2810e716630dSMartin Matuska 2811e716630dSMartin Matuska /* 2812e716630dSMartin Matuska * Note: We don't want to update the repair stats 2813e716630dSMartin Matuska * because that would incorrectly indicate that there 2814e716630dSMartin Matuska * was bad data to repair, which we aren't sure about. 2815e716630dSMartin Matuska * By clearing the SCAN_THREAD flag, we prevent this 2816e716630dSMartin Matuska * from happening, despite having the REPAIR flag set. 2817e716630dSMartin Matuska * We need to set SELF_HEAL so that this i/o can't be 2818e716630dSMartin Matuska * bypassed by zio_vdev_io_start(). 2819e716630dSMartin Matuska */ 2820e716630dSMartin Matuska zio_t *cio = zio_vdev_child_io(zio, NULL, cvd, 2821e716630dSMartin Matuska rc->rc_shadow_offset, rc->rc_abd, rc->rc_size, 2822e716630dSMartin Matuska ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, 2823e716630dSMartin Matuska ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, 2824e716630dSMartin Matuska NULL, NULL); 2825e716630dSMartin Matuska cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD; 2826e716630dSMartin Matuska zio_nowait(cio); 2827e716630dSMartin Matuska } 2828e716630dSMartin Matuska } 28297877fdebSMatt Macy } 28307877fdebSMatt Macy 28317877fdebSMatt Macy static void 28327877fdebSMatt Macy raidz_restore_orig_data(raidz_map_t *rm) 28337877fdebSMatt Macy { 28347877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 28357877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i]; 28367877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 28377877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 28387877fdebSMatt Macy if (rc->rc_need_orig_restore) { 2839f9693befSMartin Matuska abd_copy(rc->rc_abd, 28407877fdebSMatt Macy rc->rc_orig_data, rc->rc_size); 28417877fdebSMatt Macy rc->rc_need_orig_restore = B_FALSE; 28427877fdebSMatt Macy } 28437877fdebSMatt Macy } 28447877fdebSMatt Macy } 28457877fdebSMatt Macy } 28467877fdebSMatt Macy 28477877fdebSMatt Macy /* 2848e716630dSMartin Matuska * During raidz_reconstruct() for an expanded VDEV, failure simulations need 2849e716630dSMartin Matuska * special consideration. See note in raidz_reconstruct() on simulating failure 2850e716630dSMartin Matuska * of a pre-expansion device. 2851e716630dSMartin Matuska * 2852e716630dSMartin Matuska * Treating logical child i as failed, return TRUE if the given column should 2853e716630dSMartin Matuska * be treated as failed. The idea of logical children allows us to imagine 2854e716630dSMartin Matuska * that a disk silently failed before a RAIDZ expansion (reads from this disk 2855e716630dSMartin Matuska * succeed but return the wrong data). Since the expansion doesn't verify 2856e716630dSMartin Matuska * checksums, the incorrect data will be moved to new locations spread among 2857e716630dSMartin Matuska * the children (going diagonally across them).
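 *
 * For example (hypothetical widths): with physical_width = 5 and
 * original_width = 4 there are 5 + 4 = 9 logical children; ids 0-4
 * simulate the failure of one of the current children, while ids 5-8
 * simulate a child that failed back when the vdev was only 4 wide.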
2858e716630dSMartin Matuska * 2859e716630dSMartin Matuska * Higher "logical child failures" (values of `i`) indicate these 2860e716630dSMartin Matuska * "pre-expansion failures". The first physical_width values imagine that a 2861e716630dSMartin Matuska * current child failed; the next physical_width-1 values imagine that a 2862e716630dSMartin Matuska * child failed before the most recent expansion; the next physical_width-2 2863e716630dSMartin Matuska * values imagine a child failed in the expansion before that, etc. 2864e716630dSMartin Matuska */ 2865e716630dSMartin Matuska static boolean_t 2866e716630dSMartin Matuska raidz_simulate_failure(int physical_width, int original_width, int ashift, 2867e716630dSMartin Matuska int i, raidz_col_t *rc) 2868e716630dSMartin Matuska { 2869e716630dSMartin Matuska uint64_t sector_id = 2870e716630dSMartin Matuska physical_width * (rc->rc_offset >> ashift) + 2871e716630dSMartin Matuska rc->rc_devidx; 2872e716630dSMartin Matuska 2873e716630dSMartin Matuska for (int w = physical_width; w >= original_width; w--) { 2874e716630dSMartin Matuska if (i < w) { 2875e716630dSMartin Matuska return (sector_id % w == i); 2876e716630dSMartin Matuska } else { 2877e716630dSMartin Matuska i -= w; 2878e716630dSMartin Matuska } 2879e716630dSMartin Matuska } 2880e716630dSMartin Matuska ASSERT(!"invalid logical child id"); 2881e716630dSMartin Matuska return (B_FALSE); 2882e716630dSMartin Matuska } 2883e716630dSMartin Matuska 2884e716630dSMartin Matuska /* 28857877fdebSMatt Macy * returns EINVAL if reconstruction of the block will not be possible 28867877fdebSMatt Macy * returns ECKSUM if this specific reconstruction failed 28877877fdebSMatt Macy * returns 0 on successful reconstruction 28887877fdebSMatt Macy */ 28897877fdebSMatt Macy static int 28907877fdebSMatt Macy raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) 28917877fdebSMatt Macy { 28927877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd; 2893e716630dSMartin Matuska int physical_width = zio->io_vd->vdev_children; 2894e716630dSMartin Matuska int original_width = (rm->rm_original_width != 0) ? 
2895e716630dSMartin Matuska rm->rm_original_width : physical_width; 2896e716630dSMartin Matuska int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT; 2897e716630dSMartin Matuska 2898e716630dSMartin Matuska if (dbgmsg) { 2899e716630dSMartin Matuska zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u " 2900e716630dSMartin Matuska "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts); 2901e716630dSMartin Matuska } 29027877fdebSMatt Macy 29037877fdebSMatt Macy /* Reconstruct each row */ 29047877fdebSMatt Macy for (int r = 0; r < rm->rm_nrows; r++) { 29057877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[r]; 29067877fdebSMatt Macy int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */ 29077877fdebSMatt Macy int t = 0; 29087877fdebSMatt Macy int dead = 0; 29097877fdebSMatt Macy int dead_data = 0; 29107877fdebSMatt Macy 2911e716630dSMartin Matuska if (dbgmsg) 2912e716630dSMartin Matuska zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r); 2913e716630dSMartin Matuska 29147877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 29157877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 29167877fdebSMatt Macy ASSERT0(rc->rc_need_orig_restore); 29177877fdebSMatt Macy if (rc->rc_error != 0) { 29187877fdebSMatt Macy dead++; 29197877fdebSMatt Macy if (c >= nparity) 29207877fdebSMatt Macy dead_data++; 29217877fdebSMatt Macy continue; 29227877fdebSMatt Macy } 29237877fdebSMatt Macy if (rc->rc_size == 0) 29247877fdebSMatt Macy continue; 29257877fdebSMatt Macy for (int lt = 0; lt < ntgts; lt++) { 2926e716630dSMartin Matuska if (raidz_simulate_failure(physical_width, 2927e716630dSMartin Matuska original_width, 2928e716630dSMartin Matuska zio->io_vd->vdev_top->vdev_ashift, 2929e716630dSMartin Matuska ltgts[lt], rc)) { 29307877fdebSMatt Macy if (rc->rc_orig_data == NULL) { 29317877fdebSMatt Macy rc->rc_orig_data = 2932f9693befSMartin Matuska abd_alloc_linear( 2933f9693befSMartin Matuska rc->rc_size, B_TRUE); 2934f9693befSMartin Matuska abd_copy(rc->rc_orig_data, 29357877fdebSMatt Macy rc->rc_abd, rc->rc_size); 29367877fdebSMatt Macy } 29377877fdebSMatt Macy rc->rc_need_orig_restore = B_TRUE; 29387877fdebSMatt Macy 29397877fdebSMatt Macy dead++; 29407877fdebSMatt Macy if (c >= nparity) 29417877fdebSMatt Macy dead_data++; 2942e716630dSMartin Matuska /* 2943e716630dSMartin Matuska * Note: simulating failure of a 2944e716630dSMartin Matuska * pre-expansion device can hit more 2945e716630dSMartin Matuska * than one column, in which case we 2946e716630dSMartin Matuska * might try to simulate more failures 2947e716630dSMartin Matuska * than can be reconstructed, which is 2948e716630dSMartin Matuska * also more than the size of my_tgts. 2949e716630dSMartin Matuska * This check prevents accessing past 2950e716630dSMartin Matuska * the end of my_tgts. The "dead > 2951e716630dSMartin Matuska * nparity" check below will fail this 2952e716630dSMartin Matuska * reconstruction attempt. 
2953e716630dSMartin Matuska */ 2954e716630dSMartin Matuska if (t < VDEV_RAIDZ_MAXPARITY) { 29557877fdebSMatt Macy my_tgts[t++] = c; 2956e716630dSMartin Matuska if (dbgmsg) { 2957e716630dSMartin Matuska zfs_dbgmsg("simulating " 2958e716630dSMartin Matuska "failure of col %u " 2959e716630dSMartin Matuska "devidx %u", c, 2960e716630dSMartin Matuska (int)rc->rc_devidx); 2961e716630dSMartin Matuska } 2962e716630dSMartin Matuska } 29637877fdebSMatt Macy break; 29647877fdebSMatt Macy } 29657877fdebSMatt Macy } 29667877fdebSMatt Macy } 29677877fdebSMatt Macy if (dead > nparity) { 29687877fdebSMatt Macy /* reconstruction not possible */ 2969e716630dSMartin Matuska if (dbgmsg) { 2970e716630dSMartin Matuska zfs_dbgmsg("reconstruction not possible; " 2971e716630dSMartin Matuska "too many failures"); 2972e716630dSMartin Matuska } 29737877fdebSMatt Macy raidz_restore_orig_data(rm); 29747877fdebSMatt Macy return (EINVAL); 29757877fdebSMatt Macy } 29767877fdebSMatt Macy if (dead_data > 0) 2977f9693befSMartin Matuska vdev_raidz_reconstruct_row(rm, rr, my_tgts, t); 29787877fdebSMatt Macy } 29797877fdebSMatt Macy 29807877fdebSMatt Macy /* Check for success */ 29817877fdebSMatt Macy if (raidz_checksum_verify(zio) == 0) { 29827877fdebSMatt Macy 29837877fdebSMatt Macy /* Reconstruction succeeded - report errors */ 29847877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 29857877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i]; 29867877fdebSMatt Macy 29877877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 29887877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 29897877fdebSMatt Macy if (rc->rc_need_orig_restore) { 29907877fdebSMatt Macy /* 29917877fdebSMatt Macy * Note: if this is a parity column, 29927877fdebSMatt Macy * we don't really know if it's wrong. 29937877fdebSMatt Macy * We need to let 29947877fdebSMatt Macy * vdev_raidz_io_done_verified() check 29957877fdebSMatt Macy * it, and if we set rc_error, it will 29967877fdebSMatt Macy * think that it is a "known" error 29977877fdebSMatt Macy * that doesn't need to be checked 29987877fdebSMatt Macy * or corrected. 
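 * (Hence the check below only sets ECKSUM on data columns,
 * i.e. c >= rr_firstdatacol.)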
29997877fdebSMatt Macy */ 30007877fdebSMatt Macy if (rc->rc_error == 0 && 30017877fdebSMatt Macy c >= rr->rr_firstdatacol) { 3002e92ffd9bSMartin Matuska vdev_raidz_checksum_error(zio, 3003f9693befSMartin Matuska rc, rc->rc_orig_data); 30047877fdebSMatt Macy rc->rc_error = 30057877fdebSMatt Macy SET_ERROR(ECKSUM); 30067877fdebSMatt Macy } 30077877fdebSMatt Macy rc->rc_need_orig_restore = B_FALSE; 30087877fdebSMatt Macy } 30097877fdebSMatt Macy } 30107877fdebSMatt Macy 30117877fdebSMatt Macy vdev_raidz_io_done_verified(zio, rr); 30127877fdebSMatt Macy } 30137877fdebSMatt Macy 30147877fdebSMatt Macy zio_checksum_verified(zio); 30157877fdebSMatt Macy 3016e716630dSMartin Matuska if (dbgmsg) { 3017e716630dSMartin Matuska zfs_dbgmsg("reconstruction successful " 3018e716630dSMartin Matuska "(checksum verified)"); 3019e716630dSMartin Matuska } 30207877fdebSMatt Macy return (0); 30217877fdebSMatt Macy } 30227877fdebSMatt Macy 30237877fdebSMatt Macy /* Reconstruction failed - restore original data */ 30247877fdebSMatt Macy raidz_restore_orig_data(rm); 3025e716630dSMartin Matuska if (dbgmsg) { 3026e716630dSMartin Matuska zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum " 3027e716630dSMartin Matuska "failed", zio); 3028e716630dSMartin Matuska } 30297877fdebSMatt Macy return (ECKSUM); 30307877fdebSMatt Macy } 30317877fdebSMatt Macy 30327877fdebSMatt Macy /* 30337877fdebSMatt Macy * Iterate over all combinations of N bad vdevs and attempt a reconstruction. 30347877fdebSMatt Macy * Note that the algorithm below is non-optimal because it doesn't take into 30357877fdebSMatt Macy * account how reconstruction is actually performed. For example, with 30367877fdebSMatt Macy * triple-parity RAID-Z the reconstruction procedure is the same if column 4 30377877fdebSMatt Macy * is targeted as invalid as if columns 1 and 4 are targeted since in both 30387877fdebSMatt Macy * cases we'd only use parity information in column 0. 
30397877fdebSMatt Macy * 30407877fdebSMatt Macy * The order that we find the various possible combinations of failed 30417877fdebSMatt Macy * disks is dictated by these rules: 30427877fdebSMatt Macy * - Examine each "slot" (the "i" in tgts[i]) 3043e716630dSMartin Matuska * - Try to increment this slot (tgts[i] += 1) 30447877fdebSMatt Macy * - if we can't increment because it runs into the next slot, 30457877fdebSMatt Macy * reset our slot to the minimum, and examine the next slot 30467877fdebSMatt Macy * 30477877fdebSMatt Macy * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose 30487877fdebSMatt Macy * 3 columns to reconstruct), we will generate the following sequence: 30497877fdebSMatt Macy * 30507877fdebSMatt Macy * STATE ACTION 30517877fdebSMatt Macy * 0 1 2 special case: skip since these are all parity 30527877fdebSMatt Macy * 0 1 3 first slot: reset to 0; middle slot: increment to 2 30537877fdebSMatt Macy * 0 2 3 first slot: increment to 1 30547877fdebSMatt Macy * 1 2 3 first: reset to 0; middle: reset to 1; last: increment to 4 30557877fdebSMatt Macy * 0 1 4 first: reset to 0; middle: increment to 2 30567877fdebSMatt Macy * 0 2 4 first: increment to 1 30577877fdebSMatt Macy * 1 2 4 first: reset to 0; middle: increment to 3 30587877fdebSMatt Macy * 0 3 4 first: increment to 1 30597877fdebSMatt Macy * 1 3 4 first: increment to 2 30607877fdebSMatt Macy * 2 3 4 first: reset to 0; middle: reset to 1; last: increment to 5 30617877fdebSMatt Macy * 0 1 5 first: reset to 0; middle: increment to 2 30627877fdebSMatt Macy * 0 2 5 first: increment to 1 30637877fdebSMatt Macy * 1 2 5 first: reset to 0; middle: increment to 3 30647877fdebSMatt Macy * 0 3 5 first: increment to 1 30657877fdebSMatt Macy * 1 3 5 first: increment to 2 30667877fdebSMatt Macy * 2 3 5 first: reset to 0; middle: increment to 4 30677877fdebSMatt Macy * 0 4 5 first: increment to 1 30687877fdebSMatt Macy * 1 4 5 first: increment to 2 30697877fdebSMatt Macy * 2 4 5 first: increment to 3 30707877fdebSMatt Macy * 3 4 5 done 30717877fdebSMatt Macy * 307216038816SMartin Matuska * This strategy works for dRAID but is less efficient when there are a large 30737877fdebSMatt Macy * number of child vdevs and therefore permutations to check. Furthermore, 3074e716630dSMartin Matuska * since the raidz_map_t rows likely do not overlap, reconstruction would be 30757877fdebSMatt Macy * possible as long as there are no more than nparity data errors per row. 30767877fdebSMatt Macy * These additional permutations are not currently checked but could be as 30777877fdebSMatt Macy * a future improvement. 3078e716630dSMartin Matuska * 3079e716630dSMartin Matuska * Returns 0 on success, ECKSUM on failure. 30807877fdebSMatt Macy */ 30817877fdebSMatt Macy static int 30827877fdebSMatt Macy vdev_raidz_combrec(zio_t *zio) 30837877fdebSMatt Macy { 30847877fdebSMatt Macy int nparity = vdev_get_nparity(zio->io_vd); 30857877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd; 3086e716630dSMartin Matuska int physical_width = zio->io_vd->vdev_children; 3087e716630dSMartin Matuska int original_width = (rm->rm_original_width != 0) ? 
3088e716630dSMartin Matuska rm->rm_original_width : physical_width; 30897877fdebSMatt Macy 30907877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 30917877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i]; 30927877fdebSMatt Macy int total_errors = 0; 30937877fdebSMatt Macy 30947877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 30957877fdebSMatt Macy if (rr->rr_col[c].rc_error) 30967877fdebSMatt Macy total_errors++; 30977877fdebSMatt Macy } 30987877fdebSMatt Macy 30997877fdebSMatt Macy if (total_errors > nparity) 31007877fdebSMatt Macy return (vdev_raidz_worst_error(rr)); 31017877fdebSMatt Macy } 31027877fdebSMatt Macy 31037877fdebSMatt Macy for (int num_failures = 1; num_failures <= nparity; num_failures++) { 31047877fdebSMatt Macy int tstore[VDEV_RAIDZ_MAXPARITY + 2]; 31057877fdebSMatt Macy int *ltgts = &tstore[1]; /* value is logical child ID */ 31067877fdebSMatt Macy 3107e716630dSMartin Matuska 3108e716630dSMartin Matuska /* 3109e716630dSMartin Matuska * Determine number of logical children, n. See comment 3110e716630dSMartin Matuska * above raidz_simulate_failure(). 3111e716630dSMartin Matuska */ 3112e716630dSMartin Matuska int n = 0; 3113e716630dSMartin Matuska for (int w = physical_width; 3114e716630dSMartin Matuska w >= original_width; w--) { 3115e716630dSMartin Matuska n += w; 3116e716630dSMartin Matuska } 31177877fdebSMatt Macy 31187877fdebSMatt Macy ASSERT3U(num_failures, <=, nparity); 31197877fdebSMatt Macy ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY); 31207877fdebSMatt Macy 31217877fdebSMatt Macy /* Handle corner cases in combrec logic */ 31227877fdebSMatt Macy ltgts[-1] = -1; 31237877fdebSMatt Macy for (int i = 0; i < num_failures; i++) { 31247877fdebSMatt Macy ltgts[i] = i; 31257877fdebSMatt Macy } 31267877fdebSMatt Macy ltgts[num_failures] = n; 31277877fdebSMatt Macy 31287877fdebSMatt Macy for (;;) { 31297877fdebSMatt Macy int err = raidz_reconstruct(zio, ltgts, num_failures, 31307877fdebSMatt Macy nparity); 31317877fdebSMatt Macy if (err == EINVAL) { 31327877fdebSMatt Macy /* 31337877fdebSMatt Macy * Reconstruction not possible with this # 31347877fdebSMatt Macy * failures; try more failures. 31357877fdebSMatt Macy */ 31367877fdebSMatt Macy break; 31377877fdebSMatt Macy } else if (err == 0) 31387877fdebSMatt Macy return (0); 31397877fdebSMatt Macy 31407877fdebSMatt Macy /* Compute next targets to try */ 31417877fdebSMatt Macy for (int t = 0; ; t++) { 31427877fdebSMatt Macy ASSERT3U(t, <, num_failures); 31437877fdebSMatt Macy ltgts[t]++; 31447877fdebSMatt Macy if (ltgts[t] == n) { 31457877fdebSMatt Macy /* try more failures */ 31467877fdebSMatt Macy ASSERT3U(t, ==, num_failures - 1); 3147e716630dSMartin Matuska if (zfs_flags & 3148e716630dSMartin Matuska ZFS_DEBUG_RAIDZ_RECONSTRUCT) { 3149e716630dSMartin Matuska zfs_dbgmsg("reconstruction " 3150e716630dSMartin Matuska "failed for num_failures=" 3151e716630dSMartin Matuska "%u; tried all " 3152e716630dSMartin Matuska "combinations", 3153e716630dSMartin Matuska num_failures); 3154e716630dSMartin Matuska } 31557877fdebSMatt Macy break; 31567877fdebSMatt Macy } 31577877fdebSMatt Macy 31587877fdebSMatt Macy ASSERT3U(ltgts[t], <, n); 31597877fdebSMatt Macy ASSERT3U(ltgts[t], <=, ltgts[t + 1]); 31607877fdebSMatt Macy 31617877fdebSMatt Macy /* 31627877fdebSMatt Macy * If that spot is available, we're done here. 31637877fdebSMatt Macy * Try the next combination. 
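 * For example (hypothetical state): with num_failures = 2
 * and ltgts = {1, 3}, bumping slot 0 from 1 to 2 does not
 * collide with slot 1, so {2, 3} is the next combination
 * tried.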
31647877fdebSMatt Macy */ 31657877fdebSMatt Macy if (ltgts[t] != ltgts[t + 1]) 3166e716630dSMartin Matuska break; // found next combination 31677877fdebSMatt Macy 31687877fdebSMatt Macy /* 31697877fdebSMatt Macy * Otherwise, reset this tgt to the minimum, 31707877fdebSMatt Macy * and move on to the next tgt. 31717877fdebSMatt Macy */ 31727877fdebSMatt Macy ltgts[t] = ltgts[t - 1] + 1; 31737877fdebSMatt Macy ASSERT3U(ltgts[t], ==, t); 31747877fdebSMatt Macy } 31757877fdebSMatt Macy 31767877fdebSMatt Macy /* Increase the number of failures and keep trying. */ 31777877fdebSMatt Macy if (ltgts[num_failures - 1] == n) 31787877fdebSMatt Macy break; 31797877fdebSMatt Macy } 31807877fdebSMatt Macy } 3181e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 3182e716630dSMartin Matuska zfs_dbgmsg("reconstruction failed for all num_failures"); 31837877fdebSMatt Macy return (ECKSUM); 31847877fdebSMatt Macy } 31857877fdebSMatt Macy 31867877fdebSMatt Macy void 31877877fdebSMatt Macy vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) 31887877fdebSMatt Macy { 31897877fdebSMatt Macy for (uint64_t row = 0; row < rm->rm_nrows; row++) { 31907877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[row]; 31917877fdebSMatt Macy vdev_raidz_reconstruct_row(rm, rr, t, nt); 31927877fdebSMatt Macy } 31937877fdebSMatt Macy } 31947877fdebSMatt Macy 31957877fdebSMatt Macy /* 31967877fdebSMatt Macy * Complete a write IO operation on a RAIDZ VDev 31977877fdebSMatt Macy * 31987877fdebSMatt Macy * Outline: 31997877fdebSMatt Macy * 1. Check for errors on the child IOs. 32007877fdebSMatt Macy * 2. Return, setting an error code if too few child VDevs were written 32017877fdebSMatt Macy * to reconstruct the data later. Note that partial writes are 32027877fdebSMatt Macy * considered successful if they can be reconstructed at all. 32037877fdebSMatt Macy */ 32047877fdebSMatt Macy static void 32057877fdebSMatt Macy vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) 32067877fdebSMatt Macy { 3207e716630dSMartin Matuska int normal_errors = 0; 3208e716630dSMartin Matuska int shadow_errors = 0; 32097877fdebSMatt Macy 32107877fdebSMatt Macy ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); 32117877fdebSMatt Macy ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); 32127877fdebSMatt Macy ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); 32137877fdebSMatt Macy 32147877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 32157877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 32167877fdebSMatt Macy 3217e716630dSMartin Matuska if (rc->rc_error != 0) { 32187877fdebSMatt Macy ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 3219e716630dSMartin Matuska normal_errors++; 3220e716630dSMartin Matuska } 3221e716630dSMartin Matuska if (rc->rc_shadow_error != 0) { 3222e716630dSMartin Matuska ASSERT(rc->rc_shadow_error != ECKSUM); 3223e716630dSMartin Matuska shadow_errors++; 32247877fdebSMatt Macy } 32257877fdebSMatt Macy } 32267877fdebSMatt Macy 32277877fdebSMatt Macy /* 32287877fdebSMatt Macy * Treat partial writes as a success. If we couldn't write enough 3229e716630dSMartin Matuska * columns to reconstruct the data, the I/O failed. Otherwise, good 3230e716630dSMartin Matuska * enough. 
Note that in the case of a shadow write (during raidz 3231e716630dSMartin Matuska * expansion), depending on if we crash, either the normal (old) or 3232e716630dSMartin Matuska * shadow (new) location may become the "real" version of the block, 3233e716630dSMartin Matuska * so both locations must have sufficient redundancy. 3234eda14cbcSMatt Macy * 3235eda14cbcSMatt Macy * Now that we support write reallocation, it would be better 3236eda14cbcSMatt Macy * to treat partial failure as real failure unless there are 3237eda14cbcSMatt Macy * no non-degraded top-level vdevs left, and not update DTLs 3238eda14cbcSMatt Macy * if we intend to reallocate. 3239eda14cbcSMatt Macy */ 3240e716630dSMartin Matuska if (normal_errors > rr->rr_firstdatacol || 3241e716630dSMartin Matuska shadow_errors > rr->rr_firstdatacol) { 32427877fdebSMatt Macy zio->io_error = zio_worst_error(zio->io_error, 32437877fdebSMatt Macy vdev_raidz_worst_error(rr)); 32447877fdebSMatt Macy } 3245eda14cbcSMatt Macy } 3246eda14cbcSMatt Macy 3247f9693befSMartin Matuska static void 32487877fdebSMatt Macy vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm, 32497877fdebSMatt Macy raidz_row_t *rr) 32507877fdebSMatt Macy { 32517877fdebSMatt Macy int parity_errors = 0; 32527877fdebSMatt Macy int parity_untried = 0; 32537877fdebSMatt Macy int data_errors = 0; 32547877fdebSMatt Macy int total_errors = 0; 32557877fdebSMatt Macy 32567877fdebSMatt Macy ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); 32577877fdebSMatt Macy ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); 32587877fdebSMatt Macy 32597877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 32607877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 32617877fdebSMatt Macy 3262a0b956f5SMartin Matuska /* 3263a0b956f5SMartin Matuska * If scrubbing and a replacing/sparing child vdev determined 3264a0b956f5SMartin Matuska * that not all of its children have an identical copy of the 3265a0b956f5SMartin Matuska * data, then clear the error so the column is treated like 3266a0b956f5SMartin Matuska * any other read and force a repair to correct the damage. 3267a0b956f5SMartin Matuska */ 3268a0b956f5SMartin Matuska if (rc->rc_error == ECKSUM) { 3269a0b956f5SMartin Matuska ASSERT(zio->io_flags & ZIO_FLAG_SCRUB); 3270a0b956f5SMartin Matuska vdev_raidz_checksum_error(zio, rc, rc->rc_abd); 3271a0b956f5SMartin Matuska rc->rc_force_repair = 1; 3272a0b956f5SMartin Matuska rc->rc_error = 0; 3273a0b956f5SMartin Matuska } 32747877fdebSMatt Macy 3275a0b956f5SMartin Matuska if (rc->rc_error) { 32767877fdebSMatt Macy if (c < rr->rr_firstdatacol) 32777877fdebSMatt Macy parity_errors++; 32787877fdebSMatt Macy else 32797877fdebSMatt Macy data_errors++; 32807877fdebSMatt Macy 32817877fdebSMatt Macy total_errors++; 32827877fdebSMatt Macy } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { 32837877fdebSMatt Macy parity_untried++; 32847877fdebSMatt Macy } 32857877fdebSMatt Macy } 3286eda14cbcSMatt Macy 3287eda14cbcSMatt Macy /* 32887877fdebSMatt Macy * If there were data errors and the number of errors we saw was 32897877fdebSMatt Macy * correctable -- less than or equal to the number of parity disks read 32907877fdebSMatt Macy * -- reconstruct based on the missing data. 3291eda14cbcSMatt Macy */ 32927877fdebSMatt Macy if (data_errors != 0 && 32937877fdebSMatt Macy total_errors <= rr->rr_firstdatacol - parity_untried) { 3294eda14cbcSMatt Macy /* 3295eda14cbcSMatt Macy * We either attempt to read all the parity columns or 3296eda14cbcSMatt Macy * none of them. 
If we didn't try to read parity, we 3297eda14cbcSMatt Macy * wouldn't be here in the correctable case. There must 3298eda14cbcSMatt Macy * also have been fewer parity errors than parity 3299eda14cbcSMatt Macy * columns or, again, we wouldn't be in this code path. 3300eda14cbcSMatt Macy */ 3301eda14cbcSMatt Macy ASSERT(parity_untried == 0); 33027877fdebSMatt Macy ASSERT(parity_errors < rr->rr_firstdatacol); 3303eda14cbcSMatt Macy 3304eda14cbcSMatt Macy /* 3305eda14cbcSMatt Macy * Identify the data columns that reported an error. 3306eda14cbcSMatt Macy */ 33077877fdebSMatt Macy int n = 0; 33087877fdebSMatt Macy int tgts[VDEV_RAIDZ_MAXPARITY]; 33097877fdebSMatt Macy for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 33107877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 3311eda14cbcSMatt Macy if (rc->rc_error != 0) { 3312eda14cbcSMatt Macy ASSERT(n < VDEV_RAIDZ_MAXPARITY); 3313eda14cbcSMatt Macy tgts[n++] = c; 3314eda14cbcSMatt Macy } 3315eda14cbcSMatt Macy } 3316eda14cbcSMatt Macy 33177877fdebSMatt Macy ASSERT(rr->rr_firstdatacol >= n); 3318eda14cbcSMatt Macy 3319f9693befSMartin Matuska vdev_raidz_reconstruct_row(rm, rr, tgts, n); 3320eda14cbcSMatt Macy } 3321eda14cbcSMatt Macy } 3322eda14cbcSMatt Macy 3323eda14cbcSMatt Macy /* 33247877fdebSMatt Macy * Return the number of reads issued. 3325eda14cbcSMatt Macy */ 33267877fdebSMatt Macy static int 33277877fdebSMatt Macy vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr) 33287877fdebSMatt Macy { 33297877fdebSMatt Macy vdev_t *vd = zio->io_vd; 33307877fdebSMatt Macy int nread = 0; 3331eda14cbcSMatt Macy 33327877fdebSMatt Macy rr->rr_missingdata = 0; 33337877fdebSMatt Macy rr->rr_missingparity = 0; 33347877fdebSMatt Macy 33357877fdebSMatt Macy /* 33367877fdebSMatt Macy * If this row contains empty sectors which are not required 33377877fdebSMatt Macy * for a normal read, then allocate an ABD for them now so they 33387877fdebSMatt Macy * may be read, verified, and any needed repairs performed. 33397877fdebSMatt Macy */ 3340e716630dSMartin Matuska if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL) 33417877fdebSMatt Macy vdev_draid_map_alloc_empty(zio, rr); 33427877fdebSMatt Macy 33437877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 33447877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 33457877fdebSMatt Macy if (rc->rc_tried || rc->rc_size == 0) 3346eda14cbcSMatt Macy continue; 3347eda14cbcSMatt Macy 3348eda14cbcSMatt Macy zio_nowait(zio_vdev_child_io(zio, NULL, 3349eda14cbcSMatt Macy vd->vdev_child[rc->rc_devidx], 3350eda14cbcSMatt Macy rc->rc_offset, rc->rc_abd, rc->rc_size, 3351eda14cbcSMatt Macy zio->io_type, zio->io_priority, 0, 3352eda14cbcSMatt Macy vdev_raidz_child_done, rc)); 33537877fdebSMatt Macy nread++; 33547877fdebSMatt Macy } 33557877fdebSMatt Macy return (nread); 3356eda14cbcSMatt Macy } 3357eda14cbcSMatt Macy 3358eda14cbcSMatt Macy /* 33597877fdebSMatt Macy * We're here because either there were too many errors to even attempt 33607877fdebSMatt Macy * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec() 33617877fdebSMatt Macy * failed. In either case, there is enough bad data to prevent reconstruction. 33627877fdebSMatt Macy * Start checksum ereports for all children which haven't failed.
3363eda14cbcSMatt Macy */ 33647877fdebSMatt Macy static void 33657877fdebSMatt Macy vdev_raidz_io_done_unrecoverable(zio_t *zio) 33667877fdebSMatt Macy { 33677877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd; 3368eda14cbcSMatt Macy 33697877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 33707877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i]; 3371eda14cbcSMatt Macy 33727877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 33737877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 33747877fdebSMatt Macy vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; 33757877fdebSMatt Macy 33762c48331dSMatt Macy if (rc->rc_error != 0) 33772c48331dSMatt Macy continue; 33782c48331dSMatt Macy 3379eda14cbcSMatt Macy zio_bad_cksum_t zbc; 3380eda14cbcSMatt Macy zbc.zbc_has_cksum = 0; 33812c48331dSMatt Macy zbc.zbc_injected = rm->rm_ecksuminjected; 3382eda14cbcSMatt Macy 3383eda14cbcSMatt Macy mutex_enter(&cvd->vdev_stat_lock); 3384eda14cbcSMatt Macy cvd->vdev_stat.vs_checksum_errors++; 3385eda14cbcSMatt Macy mutex_exit(&cvd->vdev_stat_lock); 3386bb2d13b6SMartin Matuska (void) zfs_ereport_start_checksum(zio->io_spa, 3387bb2d13b6SMartin Matuska cvd, &zio->io_bookmark, zio, rc->rc_offset, 3388bb2d13b6SMartin Matuska rc->rc_size, &zbc); 3389eda14cbcSMatt Macy } 3390eda14cbcSMatt Macy } 3391eda14cbcSMatt Macy } 3392eda14cbcSMatt Macy 33937877fdebSMatt Macy void 33947877fdebSMatt Macy vdev_raidz_io_done(zio_t *zio) 33957877fdebSMatt Macy { 33967877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd; 33977877fdebSMatt Macy 3398e716630dSMartin Matuska ASSERT(zio->io_bp != NULL); 33997877fdebSMatt Macy if (zio->io_type == ZIO_TYPE_WRITE) { 34007877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 34017877fdebSMatt Macy vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]); 34027877fdebSMatt Macy } 34037877fdebSMatt Macy } else { 3404e716630dSMartin Matuska if (rm->rm_phys_col) { 3405e716630dSMartin Matuska /* 3406e716630dSMartin Matuska * This is an aggregated read. Copy the data and status 3407e716630dSMartin Matuska * from the aggregate abd's to the individual rows. 3408e716630dSMartin Matuska */ 3409e716630dSMartin Matuska for (int i = 0; i < rm->rm_nrows; i++) { 3410e716630dSMartin Matuska raidz_row_t *rr = rm->rm_row[i]; 3411e716630dSMartin Matuska 3412e716630dSMartin Matuska for (int c = 0; c < rr->rr_cols; c++) { 3413e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c]; 3414e716630dSMartin Matuska if (rc->rc_tried || rc->rc_size == 0) 3415e716630dSMartin Matuska continue; 3416e716630dSMartin Matuska 3417e716630dSMartin Matuska raidz_col_t *prc = 3418e716630dSMartin Matuska &rm->rm_phys_col[rc->rc_devidx]; 3419e716630dSMartin Matuska rc->rc_error = prc->rc_error; 3420e716630dSMartin Matuska rc->rc_tried = prc->rc_tried; 3421e716630dSMartin Matuska rc->rc_skipped = prc->rc_skipped; 3422e716630dSMartin Matuska if (c >= rr->rr_firstdatacol) { 3423e716630dSMartin Matuska /* 3424e716630dSMartin Matuska * Note: this is slightly faster 3425e716630dSMartin Matuska * than using abd_copy_off(). 
3426e716630dSMartin Matuska */ 3427e716630dSMartin Matuska char *physbuf = abd_to_buf( 3428e716630dSMartin Matuska prc->rc_abd); 3429e716630dSMartin Matuska void *physloc = physbuf + 3430e716630dSMartin Matuska rc->rc_offset - 3431e716630dSMartin Matuska prc->rc_offset; 3432e716630dSMartin Matuska 3433e716630dSMartin Matuska abd_copy_from_buf(rc->rc_abd, 3434e716630dSMartin Matuska physloc, rc->rc_size); 3435e716630dSMartin Matuska } 3436e716630dSMartin Matuska } 3437e716630dSMartin Matuska } 3438e716630dSMartin Matuska } 3439e716630dSMartin Matuska 34407877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 34417877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i]; 34427877fdebSMatt Macy vdev_raidz_io_done_reconstruct_known_missing(zio, 34437877fdebSMatt Macy rm, rr); 34447877fdebSMatt Macy } 34457877fdebSMatt Macy 34467877fdebSMatt Macy if (raidz_checksum_verify(zio) == 0) { 34477877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 34487877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i]; 34497877fdebSMatt Macy vdev_raidz_io_done_verified(zio, rr); 34507877fdebSMatt Macy } 3451eda14cbcSMatt Macy zio_checksum_verified(zio); 34527877fdebSMatt Macy } else { 3453eda14cbcSMatt Macy /* 34547877fdebSMatt Macy * A sequential resilver has no checksum which makes 34557877fdebSMatt Macy * combinatorial reconstruction impossible. This code 34567877fdebSMatt Macy * path is unreachable since raidz_checksum_verify() 34577877fdebSMatt Macy * has no checksum to verify and must succeed. 3458eda14cbcSMatt Macy */ 34597877fdebSMatt Macy ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD); 3460eda14cbcSMatt Macy 34617877fdebSMatt Macy /* 34627877fdebSMatt Macy * This isn't a typical situation -- either we got a 34637877fdebSMatt Macy * read error or a child silently returned bad data. 34647877fdebSMatt Macy * Read every block so we can try again with as much 34657877fdebSMatt Macy * data and parity as we can track down. If we've 34667877fdebSMatt Macy * already been through once before, all children will 34677877fdebSMatt Macy * be marked as tried so we'll proceed to combinatorial 34687877fdebSMatt Macy * reconstruction. 34697877fdebSMatt Macy */ 34707877fdebSMatt Macy int nread = 0; 34717877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 34727877fdebSMatt Macy nread += vdev_raidz_read_all(zio, 34737877fdebSMatt Macy rm->rm_row[i]); 34747877fdebSMatt Macy } 34757877fdebSMatt Macy if (nread != 0) { 34767877fdebSMatt Macy /* 34777877fdebSMatt Macy * Normally our stage is VDEV_IO_DONE, but if 34787877fdebSMatt Macy * we've already called redone(), it will have 34797877fdebSMatt Macy * changed to VDEV_IO_START, in which case we 34807877fdebSMatt Macy * don't want to call redone() again. 34817877fdebSMatt Macy */ 34827877fdebSMatt Macy if (zio->io_stage != ZIO_STAGE_VDEV_IO_START) 34837877fdebSMatt Macy zio_vdev_io_redone(zio); 34847877fdebSMatt Macy return; 34857877fdebSMatt Macy } 3486e716630dSMartin Matuska /* 3487e716630dSMartin Matuska * It would be too expensive to try every possible 3488e716630dSMartin Matuska * combination of failed sectors in every row, so 3489e716630dSMartin Matuska * instead we try every combination of failed current or 3490e716630dSMartin Matuska * past physical disks. This means that if the incorrect 3491e716630dSMartin Matuska * sectors were all on Nparity disks at any point in the 3492e716630dSMartin Matuska * past, we will find the correct data.
The only known 3493e716630dSMartin Matuska * case where this is less durable than a non-expanded 3494e716630dSMartin Matuska * RAIDZ is if we have a silent failure during 3495e716630dSMartin Matuska * expansion. In that case, one block could be 3496e716630dSMartin Matuska * partially in the old format and partially in the 3497e716630dSMartin Matuska * new format, so we'd lose some sectors from the old 3498e716630dSMartin Matuska * format and some from the new format. 3499e716630dSMartin Matuska * 3500e716630dSMartin Matuska * e.g. logical_width=4 physical_width=6 3501e716630dSMartin Matuska * the 15 (6+5+4) possible failed disks are: 3502e716630dSMartin Matuska * width=6 child=0 3503e716630dSMartin Matuska * width=6 child=1 3504e716630dSMartin Matuska * width=6 child=2 3505e716630dSMartin Matuska * width=6 child=3 3506e716630dSMartin Matuska * width=6 child=4 3507e716630dSMartin Matuska * width=6 child=5 3508e716630dSMartin Matuska * width=5 child=0 3509e716630dSMartin Matuska * width=5 child=1 3510e716630dSMartin Matuska * width=5 child=2 3511e716630dSMartin Matuska * width=5 child=3 3512e716630dSMartin Matuska * width=5 child=4 3513e716630dSMartin Matuska * width=4 child=0 3514e716630dSMartin Matuska * width=4 child=1 3515e716630dSMartin Matuska * width=4 child=2 3516e716630dSMartin Matuska * width=4 child=3 3517e716630dSMartin Matuska * And we will try every combination of Nparity of these 3518e716630dSMartin Matuska * failing. 3519e716630dSMartin Matuska * 3520e716630dSMartin Matuska * As a first pass, we can generate every combo, 3521e716630dSMartin Matuska * and try reconstructing, ignoring any known 3522e716630dSMartin Matuska * failures. If any row has too many known + simulated 3523e716630dSMartin Matuska * failures, then we bail on reconstructing with this 3524e716630dSMartin Matuska * number of simulated failures. As an improvement, 3525e716630dSMartin Matuska * we could detect the number of whole known failures 3526e716630dSMartin Matuska * (i.e. we have known failures on these disks for 3527e716630dSMartin Matuska * every row; the disks never succeeded), and 3528e716630dSMartin Matuska * subtract that from the max # failures to simulate. 3529e716630dSMartin Matuska * We could go even further like the current 3530e716630dSMartin Matuska * combrec code, but that doesn't seem like it 3531e716630dSMartin Matuska * gains us very much. If we simulate a failure 3532e716630dSMartin Matuska * that is also a known failure, that's fine.
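 *
 * For scale, a worked example using the hypothetical
 * geometry above (15 candidate failed disks): with
 * nparity = 2 that is C(15,2) = 105 reconstruction
 * attempts, and with nparity = 3 it is C(15,3) = 455 --
 * independent of how many rows the block spans, which is
 * what keeps this per-disk approach tractable compared to
 * per-row, per-sector combinations.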
3533e716630dSMartin Matuska */ 35347877fdebSMatt Macy zio->io_error = vdev_raidz_combrec(zio); 35357877fdebSMatt Macy if (zio->io_error == ECKSUM && 35367877fdebSMatt Macy !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 35377877fdebSMatt Macy vdev_raidz_io_done_unrecoverable(zio); 35387877fdebSMatt Macy } 3539eda14cbcSMatt Macy } 3540eda14cbcSMatt Macy } 3541e716630dSMartin Matuska if (rm->rm_lr != NULL) { 3542e716630dSMartin Matuska zfs_rangelock_exit(rm->rm_lr); 3543e716630dSMartin Matuska rm->rm_lr = NULL; 3544e716630dSMartin Matuska } 3545eda14cbcSMatt Macy } 3546eda14cbcSMatt Macy 3547eda14cbcSMatt Macy static void 3548eda14cbcSMatt Macy vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 3549eda14cbcSMatt Macy { 35507877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd; 35517877fdebSMatt Macy if (faulted > vdrz->vd_nparity) 3552eda14cbcSMatt Macy vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 3553eda14cbcSMatt Macy VDEV_AUX_NO_REPLICAS); 3554eda14cbcSMatt Macy else if (degraded + faulted != 0) 3555eda14cbcSMatt Macy vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 3556eda14cbcSMatt Macy else 3557eda14cbcSMatt Macy vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 3558eda14cbcSMatt Macy } 3559eda14cbcSMatt Macy 3560eda14cbcSMatt Macy /* 3561eda14cbcSMatt Macy * Determine if any portion of the provided block resides on a child vdev 3562eda14cbcSMatt Macy * with a dirty DTL and therefore needs to be resilvered. The function 3563eda14cbcSMatt Macy * assumes that at least one DTL is dirty which implies that full stripe 3564eda14cbcSMatt Macy * width blocks must be resilvered. 3565eda14cbcSMatt Macy */ 3566eda14cbcSMatt Macy static boolean_t 35677877fdebSMatt Macy vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, 35687877fdebSMatt Macy uint64_t phys_birth) 3569eda14cbcSMatt Macy { 35707877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd; 3571e716630dSMartin Matuska 3572e716630dSMartin Matuska /* 3573e716630dSMartin Matuska * If we're in the middle of a RAIDZ expansion, this block may be in 3574e716630dSMartin Matuska * the old and/or new location. For simplicity, always resilver it. 3575e716630dSMartin Matuska */ 3576e716630dSMartin Matuska if (vdrz->vn_vre.vre_state == DSS_SCANNING) 3577e716630dSMartin Matuska return (B_TRUE); 3578e716630dSMartin Matuska 3579eda14cbcSMatt Macy uint64_t dcols = vd->vdev_children; 35807877fdebSMatt Macy uint64_t nparity = vdrz->vd_nparity; 3581eda14cbcSMatt Macy uint64_t ashift = vd->vdev_top->vdev_ashift; 3582eda14cbcSMatt Macy /* The starting RAIDZ (parent) vdev sector of the block. */ 35837877fdebSMatt Macy uint64_t b = DVA_GET_OFFSET(dva) >> ashift; 3584eda14cbcSMatt Macy /* The zio's size in units of the vdev's minimum sector size. */ 3585eda14cbcSMatt Macy uint64_t s = ((psize - 1) >> ashift) + 1; 3586eda14cbcSMatt Macy /* The first column for this stripe. */ 3587eda14cbcSMatt Macy uint64_t f = b % dcols; 3588eda14cbcSMatt Macy 35897877fdebSMatt Macy /* Unreachable by sequential resilver. 
*/ 35907877fdebSMatt Macy ASSERT3U(phys_birth, !=, TXG_UNKNOWN); 35917877fdebSMatt Macy 35927877fdebSMatt Macy if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) 35937877fdebSMatt Macy return (B_FALSE); 35947877fdebSMatt Macy 3595eda14cbcSMatt Macy if (s + nparity >= dcols) 3596eda14cbcSMatt Macy return (B_TRUE); 3597eda14cbcSMatt Macy 3598eda14cbcSMatt Macy for (uint64_t c = 0; c < s + nparity; c++) { 3599eda14cbcSMatt Macy uint64_t devidx = (f + c) % dcols; 3600eda14cbcSMatt Macy vdev_t *cvd = vd->vdev_child[devidx]; 3601eda14cbcSMatt Macy 3602eda14cbcSMatt Macy /* 3603eda14cbcSMatt Macy * dsl_scan_need_resilver() already checked vd with 3604eda14cbcSMatt Macy * vdev_dtl_contains(). So here just check cvd with 3605eda14cbcSMatt Macy * vdev_dtl_empty(), cheaper and a good approximation. 3606eda14cbcSMatt Macy */ 3607eda14cbcSMatt Macy if (!vdev_dtl_empty(cvd, DTL_PARTIAL)) 3608eda14cbcSMatt Macy return (B_TRUE); 3609eda14cbcSMatt Macy } 3610eda14cbcSMatt Macy 3611eda14cbcSMatt Macy return (B_FALSE); 3612eda14cbcSMatt Macy } 3613eda14cbcSMatt Macy 3614eda14cbcSMatt Macy static void 36157877fdebSMatt Macy vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs, 36167877fdebSMatt Macy range_seg64_t *physical_rs, range_seg64_t *remain_rs) 3617eda14cbcSMatt Macy { 3618e92ffd9bSMartin Matuska (void) remain_rs; 3619e92ffd9bSMartin Matuska 3620eda14cbcSMatt Macy vdev_t *raidvd = cvd->vdev_parent; 3621eda14cbcSMatt Macy ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); 3622eda14cbcSMatt Macy 3623e716630dSMartin Matuska vdev_raidz_t *vdrz = raidvd->vdev_tsd; 3624e716630dSMartin Matuska 3625e716630dSMartin Matuska if (vdrz->vn_vre.vre_state == DSS_SCANNING) { 3626e716630dSMartin Matuska /* 3627e716630dSMartin Matuska * We're in the middle of expansion, in which case the 3628e716630dSMartin Matuska * translation is in flux. Any answer we give may be wrong 3629e716630dSMartin Matuska * by the time we return, so it isn't safe for the caller to 3630e716630dSMartin Matuska * act on it. Therefore we say that this range isn't present 3631e716630dSMartin Matuska * on any children. The only consumers of this are "zpool 3632e716630dSMartin Matuska * initialize" and trimming, both of which are "best effort" 3633e716630dSMartin Matuska * anyway. 
3634e716630dSMartin Matuska */ 3635e716630dSMartin Matuska physical_rs->rs_start = physical_rs->rs_end = 0; 3636e716630dSMartin Matuska remain_rs->rs_start = remain_rs->rs_end = 0; 3637e716630dSMartin Matuska return; 3638e716630dSMartin Matuska } 3639e716630dSMartin Matuska 3640e716630dSMartin Matuska uint64_t width = vdrz->vd_physical_width; 3641eda14cbcSMatt Macy uint64_t tgt_col = cvd->vdev_id; 3642eda14cbcSMatt Macy uint64_t ashift = raidvd->vdev_top->vdev_ashift; 3643eda14cbcSMatt Macy 3644eda14cbcSMatt Macy /* make sure the offsets are block-aligned */ 36457877fdebSMatt Macy ASSERT0(logical_rs->rs_start % (1 << ashift)); 36467877fdebSMatt Macy ASSERT0(logical_rs->rs_end % (1 << ashift)); 36477877fdebSMatt Macy uint64_t b_start = logical_rs->rs_start >> ashift; 36487877fdebSMatt Macy uint64_t b_end = logical_rs->rs_end >> ashift; 3649eda14cbcSMatt Macy 3650eda14cbcSMatt Macy uint64_t start_row = 0; 3651eda14cbcSMatt Macy if (b_start > tgt_col) /* avoid underflow */ 3652eda14cbcSMatt Macy start_row = ((b_start - tgt_col - 1) / width) + 1; 3653eda14cbcSMatt Macy 3654eda14cbcSMatt Macy uint64_t end_row = 0; 3655eda14cbcSMatt Macy if (b_end > tgt_col) 3656eda14cbcSMatt Macy end_row = ((b_end - tgt_col - 1) / width) + 1; 3657eda14cbcSMatt Macy 36587877fdebSMatt Macy physical_rs->rs_start = start_row << ashift; 36597877fdebSMatt Macy physical_rs->rs_end = end_row << ashift; 3660eda14cbcSMatt Macy 36617877fdebSMatt Macy ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start); 36627877fdebSMatt Macy ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=, 36637877fdebSMatt Macy logical_rs->rs_end - logical_rs->rs_start); 36647877fdebSMatt Macy } 36657877fdebSMatt Macy 3666e716630dSMartin Matuska static void 3667e716630dSMartin Matuska raidz_reflow_sync(void *arg, dmu_tx_t *tx) 3668e716630dSMartin Matuska { 3669e716630dSMartin Matuska spa_t *spa = arg; 3670e716630dSMartin Matuska int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 3671e716630dSMartin Matuska vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 3672e716630dSMartin Matuska 3673e716630dSMartin Matuska /* 3674e716630dSMartin Matuska * Ensure there are no i/os to the range that is being committed. 3675e716630dSMartin Matuska */ 3676e716630dSMartin Matuska uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock); 3677e716630dSMartin Matuska ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset); 3678e716630dSMartin Matuska 3679e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 3680e716630dSMartin Matuska uint64_t new_offset = 3681e716630dSMartin Matuska MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset); 3682e716630dSMartin Matuska /* 3683e716630dSMartin Matuska * We should not have committed anything that failed. 3684e716630dSMartin Matuska */ 3685e716630dSMartin Matuska VERIFY3U(vre->vre_failed_offset, >=, old_offset); 3686e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 3687e716630dSMartin Matuska 3688e716630dSMartin Matuska zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, 3689e716630dSMartin Matuska old_offset, new_offset - old_offset, 3690e716630dSMartin Matuska RL_WRITER); 3691e716630dSMartin Matuska 3692e716630dSMartin Matuska /* 3693e716630dSMartin Matuska * Update the uberblock that will be written when this txg completes. 
3694e716630dSMartin Matuska */ 3695e716630dSMartin Matuska RAIDZ_REFLOW_SET(&spa->spa_uberblock, 3696e716630dSMartin Matuska RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset); 3697e716630dSMartin Matuska vre->vre_offset_pertxg[txgoff] = 0; 3698e716630dSMartin Matuska zfs_rangelock_exit(lr); 3699e716630dSMartin Matuska 3700e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 3701e716630dSMartin Matuska vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff]; 3702e716630dSMartin Matuska vre->vre_bytes_copied_pertxg[txgoff] = 0; 3703e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 3704e716630dSMartin Matuska 3705e716630dSMartin Matuska vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); 3706e716630dSMartin Matuska VERIFY0(zap_update(spa->spa_meta_objset, 3707e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, 3708e716630dSMartin Matuska sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx)); 3709e716630dSMartin Matuska } 3710e716630dSMartin Matuska 3711e716630dSMartin Matuska static void 3712e716630dSMartin Matuska raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx) 3713e716630dSMartin Matuska { 3714e716630dSMartin Matuska spa_t *spa = arg; 3715e716630dSMartin Matuska vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 3716e716630dSMartin Matuska vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 3717e716630dSMartin Matuska vdev_raidz_t *vdrz = raidvd->vdev_tsd; 3718e716630dSMartin Matuska 3719e716630dSMartin Matuska for (int i = 0; i < TXG_SIZE; i++) 3720e716630dSMartin Matuska VERIFY0(vre->vre_offset_pertxg[i]); 3721e716630dSMartin Matuska 3722e716630dSMartin Matuska reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); 3723e716630dSMartin Matuska re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES; 3724e716630dSMartin Matuska re->re_logical_width = vdrz->vd_physical_width; 3725e716630dSMartin Matuska mutex_enter(&vdrz->vd_expand_lock); 3726e716630dSMartin Matuska avl_add(&vdrz->vd_expand_txgs, re); 3727e716630dSMartin Matuska mutex_exit(&vdrz->vd_expand_lock); 3728e716630dSMartin Matuska 3729e716630dSMartin Matuska vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); 3730e716630dSMartin Matuska 3731e716630dSMartin Matuska /* 3732e716630dSMartin Matuska * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS 3733e716630dSMartin Matuska * will get written (based on vd_expand_txgs). 3734e716630dSMartin Matuska */ 3735e716630dSMartin Matuska vdev_config_dirty(vd); 3736e716630dSMartin Matuska 3737e716630dSMartin Matuska /* 3738e716630dSMartin Matuska * Before we change vre_state, the on-disk state must reflect that we 3739e716630dSMartin Matuska * have completed all copying, so that vdev_raidz_io_start() can use 3740e716630dSMartin Matuska * vre_state to determine if the reflow is in progress. See also the 3741e716630dSMartin Matuska * end of spa_raidz_expand_thread(). 
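 *
 * Concretely, the VERIFY below checks that the reflow offset
 * recorded in the synced uberblock equals the vdev's full
 * metaslab-covered size (vdev_ms_count << vdev_ms_shift), i.e.
 * the copy reached the end of the device before we mark the
 * expansion finished.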
3742e716630dSMartin Matuska */ 3743e716630dSMartin Matuska VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, 3744e716630dSMartin Matuska raidvd->vdev_ms_count << raidvd->vdev_ms_shift); 3745e716630dSMartin Matuska 3746e716630dSMartin Matuska vre->vre_end_time = gethrestime_sec(); 3747e716630dSMartin Matuska vre->vre_state = DSS_FINISHED; 3748e716630dSMartin Matuska 3749e716630dSMartin Matuska uint64_t state = vre->vre_state; 3750e716630dSMartin Matuska VERIFY0(zap_update(spa->spa_meta_objset, 3751e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, 3752e716630dSMartin Matuska sizeof (state), 1, &state, tx)); 3753e716630dSMartin Matuska 3754e716630dSMartin Matuska uint64_t end_time = vre->vre_end_time; 3755e716630dSMartin Matuska VERIFY0(zap_update(spa->spa_meta_objset, 3756e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, 3757e716630dSMartin Matuska sizeof (end_time), 1, &end_time, tx)); 3758e716630dSMartin Matuska 3759e716630dSMartin Matuska spa->spa_uberblock.ub_raidz_reflow_info = 0; 3760e716630dSMartin Matuska 3761e716630dSMartin Matuska spa_history_log_internal(spa, "raidz vdev expansion completed", tx, 3762e716630dSMartin Matuska "%s vdev %llu new width %llu", spa_name(spa), 3763e716630dSMartin Matuska (unsigned long long)vd->vdev_id, 3764e716630dSMartin Matuska (unsigned long long)vd->vdev_children); 3765e716630dSMartin Matuska 3766e716630dSMartin Matuska spa->spa_raidz_expand = NULL; 3767e716630dSMartin Matuska raidvd->vdev_rz_expanding = B_FALSE; 3768e716630dSMartin Matuska 3769e716630dSMartin Matuska spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); 3770e716630dSMartin Matuska spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); 3771e716630dSMartin Matuska spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); 3772e716630dSMartin Matuska 3773e716630dSMartin Matuska spa_notify_waiters(spa); 3774e716630dSMartin Matuska 3775e716630dSMartin Matuska /* 3776e716630dSMartin Matuska * While we're in syncing context, take the opportunity to 3777e716630dSMartin Matuska * set up a scrub. All the data has been successfully copied 3778e716630dSMartin Matuska * but we have not validated any checksums. 3779e716630dSMartin Matuska */ 3780e716630dSMartin Matuska pool_scan_func_t func = POOL_SCAN_SCRUB; 3781e716630dSMartin Matuska if (zfs_scrub_after_expand && dsl_scan_setup_check(&func, tx) == 0) 3782e716630dSMartin Matuska dsl_scan_setup_sync(&func, tx); 3783e716630dSMartin Matuska } 3784e716630dSMartin Matuska 3785e716630dSMartin Matuska /* 3786e716630dSMartin Matuska * Struct for one copy zio. 3787e716630dSMartin Matuska */ 3788e716630dSMartin Matuska typedef struct raidz_reflow_arg { 3789e716630dSMartin Matuska vdev_raidz_expand_t *rra_vre; 3790e716630dSMartin Matuska zfs_locked_range_t *rra_lr; 3791e716630dSMartin Matuska uint64_t rra_txg; 3792e716630dSMartin Matuska } raidz_reflow_arg_t; 3793e716630dSMartin Matuska 3794e716630dSMartin Matuska /* 3795e716630dSMartin Matuska * The write of the new location is done.
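 * On completion we free the data abd, record vre_failed_offset on
 * error (which pauses the reflow), decrement the outstanding-bytes
 * throttle and wake any waiter, credit the copied bytes to this
 * txg if the range lies entirely before any failure, and finally
 * drop the range lock and the SCL_STATE hold taken by the issuer.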
3796e716630dSMartin Matuska */ 3797e716630dSMartin Matuska static void 3798e716630dSMartin Matuska raidz_reflow_write_done(zio_t *zio) 3799e716630dSMartin Matuska { 3800e716630dSMartin Matuska raidz_reflow_arg_t *rra = zio->io_private; 3801e716630dSMartin Matuska vdev_raidz_expand_t *vre = rra->rra_vre; 3802e716630dSMartin Matuska 3803e716630dSMartin Matuska abd_free(zio->io_abd); 3804e716630dSMartin Matuska 3805e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 3806e716630dSMartin Matuska if (zio->io_error != 0) { 3807e716630dSMartin Matuska /* Force a reflow pause on errors */ 3808e716630dSMartin Matuska vre->vre_failed_offset = 3809e716630dSMartin Matuska MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); 3810e716630dSMartin Matuska } 3811e716630dSMartin Matuska ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size); 3812e716630dSMartin Matuska vre->vre_outstanding_bytes -= zio->io_size; 3813e716630dSMartin Matuska if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length < 3814e716630dSMartin Matuska vre->vre_failed_offset) { 3815e716630dSMartin Matuska vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] += 3816e716630dSMartin Matuska zio->io_size; 3817e716630dSMartin Matuska } 3818e716630dSMartin Matuska cv_signal(&vre->vre_cv); 3819e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 3820e716630dSMartin Matuska 3821e716630dSMartin Matuska zfs_rangelock_exit(rra->rra_lr); 3822e716630dSMartin Matuska 3823e716630dSMartin Matuska kmem_free(rra, sizeof (*rra)); 3824e716630dSMartin Matuska spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa); 3825e716630dSMartin Matuska } 3826e716630dSMartin Matuska 3827e716630dSMartin Matuska /* 3828e716630dSMartin Matuska * The read of the old location is done. The parent zio is the write to 3829e716630dSMartin Matuska * the new location. Allow it to start. 3830e716630dSMartin Matuska */ 3831e716630dSMartin Matuska static void 3832e716630dSMartin Matuska raidz_reflow_read_done(zio_t *zio) 3833e716630dSMartin Matuska { 3834e716630dSMartin Matuska raidz_reflow_arg_t *rra = zio->io_private; 3835e716630dSMartin Matuska vdev_raidz_expand_t *vre = rra->rra_vre; 3836e716630dSMartin Matuska 3837e716630dSMartin Matuska /* 3838e716630dSMartin Matuska * If the read failed, or if it was done on a vdev that is not fully 3839e716630dSMartin Matuska * healthy (e.g. a child that has a resilver in progress), we may not 3840e716630dSMartin Matuska * have the correct data. Note that it's OK if the write proceeds. 3841e716630dSMartin Matuska * It may write garbage but the location is otherwise unused and we 3842e716630dSMartin Matuska * will retry later due to vre_failed_offset. 
3843e716630dSMartin Matuska */ 3844e716630dSMartin Matuska if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) { 3845e716630dSMartin Matuska zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu " 3846e716630dSMartin Matuska "err=%u partial_dtl_empty=%u missing_dtl_empty=%u", 3847e716630dSMartin Matuska (long long)rra->rra_lr->lr_offset, 3848e716630dSMartin Matuska (long long)rra->rra_lr->lr_length, 3849e716630dSMartin Matuska (long long)rra->rra_txg, 3850e716630dSMartin Matuska zio->io_error, 3851e716630dSMartin Matuska vdev_dtl_empty(zio->io_vd, DTL_PARTIAL), 3852e716630dSMartin Matuska vdev_dtl_empty(zio->io_vd, DTL_MISSING)); 3853e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 3854e716630dSMartin Matuska /* Force a reflow pause on errors */ 3855e716630dSMartin Matuska vre->vre_failed_offset = 3856e716630dSMartin Matuska MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); 3857e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 3858e716630dSMartin Matuska } 3859e716630dSMartin Matuska 3860e716630dSMartin Matuska zio_nowait(zio_unique_parent(zio)); 3861e716630dSMartin Matuska } 3862e716630dSMartin Matuska 3863e716630dSMartin Matuska static void 3864e716630dSMartin Matuska raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset, 3865e716630dSMartin Matuska dmu_tx_t *tx) 3866e716630dSMartin Matuska { 3867e716630dSMartin Matuska int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 3868e716630dSMartin Matuska spa_t *spa = dmu_tx_pool(tx)->dp_spa; 3869e716630dSMartin Matuska 3870e716630dSMartin Matuska if (offset == 0) 3871e716630dSMartin Matuska return; 3872e716630dSMartin Matuska 3873e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 3874e716630dSMartin Matuska ASSERT3U(vre->vre_offset, <=, offset); 3875e716630dSMartin Matuska vre->vre_offset = offset; 3876e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 3877e716630dSMartin Matuska 3878e716630dSMartin Matuska if (vre->vre_offset_pertxg[txgoff] == 0) { 3879e716630dSMartin Matuska dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync, 3880e716630dSMartin Matuska spa, tx); 3881e716630dSMartin Matuska } 3882e716630dSMartin Matuska vre->vre_offset_pertxg[txgoff] = offset; 3883e716630dSMartin Matuska } 3884e716630dSMartin Matuska 3885e716630dSMartin Matuska static boolean_t 3886e716630dSMartin Matuska vdev_raidz_expand_child_replacing(vdev_t *raidz_vd) 3887e716630dSMartin Matuska { 3888e716630dSMartin Matuska for (int i = 0; i < raidz_vd->vdev_children; i++) { 3889e716630dSMartin Matuska /* Quick check if a child is being replaced */ 3890e716630dSMartin Matuska if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf) 3891e716630dSMartin Matuska return (B_TRUE); 3892e716630dSMartin Matuska } 3893e716630dSMartin Matuska return (B_FALSE); 3894e716630dSMartin Matuska } 3895e716630dSMartin Matuska 3896e716630dSMartin Matuska static boolean_t 3897e716630dSMartin Matuska raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt, 3898e716630dSMartin Matuska dmu_tx_t *tx) 3899e716630dSMartin Matuska { 3900e716630dSMartin Matuska spa_t *spa = vd->vdev_spa; 3901e716630dSMartin Matuska int ashift = vd->vdev_top->vdev_ashift; 3902e716630dSMartin Matuska uint64_t offset, size; 3903e716630dSMartin Matuska 3904e716630dSMartin Matuska if (!range_tree_find_in(rt, 0, vd->vdev_top->vdev_asize, 3905e716630dSMartin Matuska &offset, &size)) { 3906e716630dSMartin Matuska return (B_FALSE); 3907e716630dSMartin Matuska } 3908e716630dSMartin Matuska ASSERT(IS_P2ALIGNED(offset, 1 << ashift)); 3909e716630dSMartin Matuska 
ASSERT3U(size, >=, 1 << ashift); 3910e716630dSMartin Matuska uint64_t length = 1 << ashift; 3911e716630dSMartin Matuska int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 3912e716630dSMartin Matuska 3913e716630dSMartin Matuska uint64_t blkid = offset >> ashift; 3914e716630dSMartin Matuska 3915e716630dSMartin Matuska int old_children = vd->vdev_children - 1; 3916e716630dSMartin Matuska 3917e716630dSMartin Matuska /* 3918e716630dSMartin Matuska * We can only progress to the point that writes will not overlap 3919e716630dSMartin Matuska * with blocks whose progress has not yet been recorded on disk. 3920e716630dSMartin Matuska * Since partially-copied rows are still read from the old location, 3921e716630dSMartin Matuska * we need to stop one row before the sector-wise overlap, to prevent 3922e716630dSMartin Matuska * row-wise overlap. 3923e716630dSMartin Matuska * 3924e716630dSMartin Matuska * Note that even if we are skipping over a large unallocated region, 3925e716630dSMartin Matuska * we can't move the on-disk progress to `offset`, because concurrent 3926e716630dSMartin Matuska * writes/allocations could still use the currently-unallocated 3927e716630dSMartin Matuska * region. 3928e716630dSMartin Matuska */ 3929e716630dSMartin Matuska uint64_t ubsync_blkid = 3930e716630dSMartin Matuska RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift; 3931e716630dSMartin Matuska uint64_t next_overwrite_blkid = ubsync_blkid + 3932e716630dSMartin Matuska ubsync_blkid / old_children - old_children; 3933e716630dSMartin Matuska VERIFY3U(next_overwrite_blkid, >, ubsync_blkid); 3934e716630dSMartin Matuska 3935e716630dSMartin Matuska if (blkid >= next_overwrite_blkid) { 3936e716630dSMartin Matuska raidz_reflow_record_progress(vre, 3937e716630dSMartin Matuska next_overwrite_blkid << ashift, tx); 3938e716630dSMartin Matuska return (B_TRUE); 3939e716630dSMartin Matuska } 3940e716630dSMartin Matuska 3941e716630dSMartin Matuska range_tree_remove(rt, offset, length); 3942e716630dSMartin Matuska 3943e716630dSMartin Matuska raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra), KM_SLEEP); 3944e716630dSMartin Matuska rra->rra_vre = vre; 3945e716630dSMartin Matuska rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock, 3946e716630dSMartin Matuska offset, length, RL_WRITER); 3947e716630dSMartin Matuska rra->rra_txg = dmu_tx_get_txg(tx); 3948e716630dSMartin Matuska 3949e716630dSMartin Matuska raidz_reflow_record_progress(vre, offset + length, tx); 3950e716630dSMartin Matuska 3951e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 3952e716630dSMartin Matuska vre->vre_outstanding_bytes += length; 3953e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 3954e716630dSMartin Matuska 3955e716630dSMartin Matuska /* 3956e716630dSMartin Matuska * SCL_STATE will be released when the read and write are done, 3957e716630dSMartin Matuska * by raidz_reflow_write_done(). 
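 *
 * As an aside, a worked example of the next_overwrite_blkid bound
 * computed above (hypothetical numbers): with old_children = 3
 * (i.e. 4 children after expansion) and synced progress
 * ubsync_blkid = 300, the bound is 300 + 300/3 - 3 = 397.  Logical
 * block 397 lands on new row 397/4 = 99, while the first
 * not-yet-copied data still lives at old row 300/3 = 100 and
 * beyond; the "- old_children" term is the extra old row of margin
 * ("stop one row before") described above.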
3958e716630dSMartin Matuska */ 3959e716630dSMartin Matuska spa_config_enter(spa, SCL_STATE, spa, RW_READER); 3960e716630dSMartin Matuska 3961e716630dSMartin Matuska /* check if a replacing vdev was added, if so treat it as an error */ 3962e716630dSMartin Matuska if (vdev_raidz_expand_child_replacing(vd)) { 3963e716630dSMartin Matuska zfs_dbgmsg("replacing vdev encountered, reflow paused at " 3964e716630dSMartin Matuska "offset=%llu txg=%llu", 3965e716630dSMartin Matuska (long long)rra->rra_lr->lr_offset, 3966e716630dSMartin Matuska (long long)rra->rra_txg); 3967e716630dSMartin Matuska 3968e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 3969e716630dSMartin Matuska vre->vre_failed_offset = 3970e716630dSMartin Matuska MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); 3971e716630dSMartin Matuska cv_signal(&vre->vre_cv); 3972e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 3973e716630dSMartin Matuska 3974e716630dSMartin Matuska /* drop everything we acquired */ 3975e716630dSMartin Matuska zfs_rangelock_exit(rra->rra_lr); 3976e716630dSMartin Matuska kmem_free(rra, sizeof (*rra)); 3977e716630dSMartin Matuska spa_config_exit(spa, SCL_STATE, spa); 3978e716630dSMartin Matuska return (B_TRUE); 3979e716630dSMartin Matuska } 3980e716630dSMartin Matuska 3981e716630dSMartin Matuska zio_t *pio = spa->spa_txg_zio[txgoff]; 3982e716630dSMartin Matuska abd_t *abd = abd_alloc_for_io(length, B_FALSE); 3983e716630dSMartin Matuska zio_t *write_zio = zio_vdev_child_io(pio, NULL, 3984e716630dSMartin Matuska vd->vdev_child[blkid % vd->vdev_children], 3985e716630dSMartin Matuska (blkid / vd->vdev_children) << ashift, 3986e716630dSMartin Matuska abd, length, 3987e716630dSMartin Matuska ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, 3988e716630dSMartin Matuska ZIO_FLAG_CANFAIL, 3989e716630dSMartin Matuska raidz_reflow_write_done, rra); 3990e716630dSMartin Matuska 3991e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(write_zio, NULL, 3992e716630dSMartin Matuska vd->vdev_child[blkid % old_children], 3993e716630dSMartin Matuska (blkid / old_children) << ashift, 3994e716630dSMartin Matuska abd, length, 3995e716630dSMartin Matuska ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 3996e716630dSMartin Matuska ZIO_FLAG_CANFAIL, 3997e716630dSMartin Matuska raidz_reflow_read_done, rra)); 3998e716630dSMartin Matuska 3999e716630dSMartin Matuska return (B_FALSE); 4000e716630dSMartin Matuska } 4001e716630dSMartin Matuska 4002e716630dSMartin Matuska /* 4003e716630dSMartin Matuska * For testing (ztest specific) 4004e716630dSMartin Matuska */ 4005e716630dSMartin Matuska static void 4006e716630dSMartin Matuska raidz_expand_pause(uint_t pause_point) 4007e716630dSMartin Matuska { 4008e716630dSMartin Matuska while (raidz_expand_pause_point != 0 && 4009e716630dSMartin Matuska raidz_expand_pause_point <= pause_point) 4010e716630dSMartin Matuska delay(hz); 4011e716630dSMartin Matuska } 4012e716630dSMartin Matuska 4013e716630dSMartin Matuska static void 4014e716630dSMartin Matuska raidz_scratch_child_done(zio_t *zio) 4015e716630dSMartin Matuska { 4016e716630dSMartin Matuska zio_t *pio = zio->io_private; 4017e716630dSMartin Matuska 4018e716630dSMartin Matuska mutex_enter(&pio->io_lock); 4019e716630dSMartin Matuska pio->io_error = zio_worst_error(pio->io_error, zio->io_error); 4020e716630dSMartin Matuska mutex_exit(&pio->io_lock); 4021e716630dSMartin Matuska } 4022e716630dSMartin Matuska 4023e716630dSMartin Matuska /* 4024e716630dSMartin Matuska * Reflow the beginning portion of the vdev into an intermediate scratch area 4025e716630dSMartin Matuska * in 
memory and on disk. This operation must be persisted on disk before we 4026e716630dSMartin Matuska * proceed to overwrite the beginning portion with the reflowed data. 4027e716630dSMartin Matuska * 4028e716630dSMartin Matuska * This multi-step task can fail to complete if disk errors are encountered 4029e716630dSMartin Matuska * and we can return here after a pause (waiting for disk to become healthy). 4030e716630dSMartin Matuska */ 4031e716630dSMartin Matuska static void 4032e716630dSMartin Matuska raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) 4033e716630dSMartin Matuska { 4034e716630dSMartin Matuska vdev_raidz_expand_t *vre = arg; 4035e716630dSMartin Matuska spa_t *spa = dmu_tx_pool(tx)->dp_spa; 4036e716630dSMartin Matuska zio_t *pio; 4037e716630dSMartin Matuska int error; 4038e716630dSMartin Matuska 4039e716630dSMartin Matuska spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4040e716630dSMartin Matuska vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4041e716630dSMartin Matuska int ashift = raidvd->vdev_ashift; 4042*aca928a5SMartin Matuska uint64_t write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << ashift, 4043*aca928a5SMartin Matuska uint64_t); 4044e716630dSMartin Matuska uint64_t logical_size = write_size * raidvd->vdev_children; 4045e716630dSMartin Matuska uint64_t read_size = 4046e716630dSMartin Matuska P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)), 4047e716630dSMartin Matuska 1 << ashift); 4048e716630dSMartin Matuska 4049e716630dSMartin Matuska /* 4050e716630dSMartin Matuska * The scratch space must be large enough to get us to the point 4051e716630dSMartin Matuska * that one row does not overlap itself when moved. This is checked 4052e716630dSMartin Matuska * by vdev_raidz_attach_check(). 4053e716630dSMartin Matuska */ 4054e716630dSMartin Matuska VERIFY3U(write_size, >=, raidvd->vdev_children << ashift); 4055e716630dSMartin Matuska VERIFY3U(write_size, <=, VDEV_BOOT_SIZE); 4056e716630dSMartin Matuska VERIFY3U(write_size, <=, read_size); 4057e716630dSMartin Matuska 4058e716630dSMartin Matuska zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, 4059e716630dSMartin Matuska 0, logical_size, RL_WRITER); 4060e716630dSMartin Matuska 4061e716630dSMartin Matuska abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), 4062e716630dSMartin Matuska KM_SLEEP); 4063e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) { 4064e716630dSMartin Matuska abds[i] = abd_alloc_linear(read_size, B_FALSE); 4065e716630dSMartin Matuska } 4066e716630dSMartin Matuska 4067e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1); 4068e716630dSMartin Matuska 4069e716630dSMartin Matuska /* 4070e716630dSMartin Matuska * If we have already written the scratch area then we must read from 4071e716630dSMartin Matuska * there, since new writes were redirected there while we were paused 4072e716630dSMartin Matuska * or the original location may have been partially overwritten with 4073e716630dSMartin Matuska * reflowed data. 4074e716630dSMartin Matuska */ 4075e716630dSMartin Matuska if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) { 4076e716630dSMartin Matuska VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size); 4077e716630dSMartin Matuska /* 4078e716630dSMartin Matuska * Read from scratch space. 
4079e716630dSMartin Matuska */ 4080e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4081e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) { 4082e716630dSMartin Matuska /* 4083e716630dSMartin Matuska * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE 4084e716630dSMartin Matuska * to the offset to calculate the physical offset to 4085e716630dSMartin Matuska * write to. Passing in a negative offset makes us 4086e716630dSMartin Matuska * access the scratch area. 4087e716630dSMartin Matuska */ 4088e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, 4089e716630dSMartin Matuska raidvd->vdev_child[i], 4090e716630dSMartin Matuska VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], 4091e716630dSMartin Matuska write_size, ZIO_TYPE_READ, ZIO_PRIORITY_ASYNC_READ, 4092e716630dSMartin Matuska ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); 4093e716630dSMartin Matuska } 4094e716630dSMartin Matuska error = zio_wait(pio); 4095e716630dSMartin Matuska if (error != 0) { 4096e716630dSMartin Matuska zfs_dbgmsg("reflow: error %d reading scratch location", 4097e716630dSMartin Matuska error); 4098e716630dSMartin Matuska goto io_error_exit; 4099e716630dSMartin Matuska } 4100e716630dSMartin Matuska goto overwrite; 4101e716630dSMartin Matuska } 4102e716630dSMartin Matuska 4103e716630dSMartin Matuska /* 4104e716630dSMartin Matuska * Read from original location. 4105e716630dSMartin Matuska */ 4106e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4107e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children - 1; i++) { 4108e716630dSMartin Matuska ASSERT0(vdev_is_dead(raidvd->vdev_child[i])); 4109e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4110e716630dSMartin Matuska 0, abds[i], read_size, ZIO_TYPE_READ, 4111e716630dSMartin Matuska ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, 4112e716630dSMartin Matuska raidz_scratch_child_done, pio)); 4113e716630dSMartin Matuska } 4114e716630dSMartin Matuska error = zio_wait(pio); 4115e716630dSMartin Matuska if (error != 0) { 4116e716630dSMartin Matuska zfs_dbgmsg("reflow: error %d reading original location", error); 4117e716630dSMartin Matuska io_error_exit: 4118e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) 4119e716630dSMartin Matuska abd_free(abds[i]); 4120e716630dSMartin Matuska kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); 4121e716630dSMartin Matuska zfs_rangelock_exit(lr); 4122e716630dSMartin Matuska spa_config_exit(spa, SCL_STATE, FTAG); 4123e716630dSMartin Matuska return; 4124e716630dSMartin Matuska } 4125e716630dSMartin Matuska 4126e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2); 4127e716630dSMartin Matuska 4128e716630dSMartin Matuska /* 4129e716630dSMartin Matuska * Reflow in memory. 
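 *
 * Sketch of the remapping below (hypothetical geometry): with
 * vdev_children = 4 (3 old children), logical sector i = 3 moves
 * from old (child 0, offset 1 << ashift) to new (child 3, offset
 * 0), i = 4 moves from old (child 1, 1 << ashift) to new
 * (child 0, 1 << ashift), and so on.  Sectors 0 through 2 already
 * sit where the new layout wants them, which is why the loop
 * starts at vdev_children - 1.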
4130e716630dSMartin Matuska */ 4131e716630dSMartin Matuska uint64_t logical_sectors = logical_size >> ashift; 4132e716630dSMartin Matuska for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) { 4133e716630dSMartin Matuska int oldchild = i % (raidvd->vdev_children - 1); 4134e716630dSMartin Matuska uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift; 4135e716630dSMartin Matuska 4136e716630dSMartin Matuska int newchild = i % raidvd->vdev_children; 4137e716630dSMartin Matuska uint64_t newoff = (i / raidvd->vdev_children) << ashift; 4138e716630dSMartin Matuska 4139e716630dSMartin Matuska /* a single sector should not be copying over itself */ 4140e716630dSMartin Matuska ASSERT(!(newchild == oldchild && newoff == oldoff)); 4141e716630dSMartin Matuska 4142e716630dSMartin Matuska abd_copy_off(abds[newchild], abds[oldchild], 4143e716630dSMartin Matuska newoff, oldoff, 1 << ashift); 4144e716630dSMartin Matuska } 4145e716630dSMartin Matuska 4146e716630dSMartin Matuska /* 4147e716630dSMartin Matuska * Verify that we filled in everything we intended to (write_size on 4148e716630dSMartin Matuska * each child). 4149e716630dSMartin Matuska */ 4150e716630dSMartin Matuska VERIFY0(logical_sectors % raidvd->vdev_children); 4151e716630dSMartin Matuska VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==, 4152e716630dSMartin Matuska write_size); 4153e716630dSMartin Matuska 4154e716630dSMartin Matuska /* 4155e716630dSMartin Matuska * Write to scratch location (boot area). 4156e716630dSMartin Matuska */ 4157e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4158e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) { 4159e716630dSMartin Matuska /* 4160e716630dSMartin Matuska * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to 4161e716630dSMartin Matuska * the offset to calculate the physical offset to write to. 4162e716630dSMartin Matuska * Passing in a negative offset lets us access the boot area. 4163e716630dSMartin Matuska */ 4164e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4165e716630dSMartin Matuska VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], 4166e716630dSMartin Matuska write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, 4167e716630dSMartin Matuska ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); 4168e716630dSMartin Matuska } 4169e716630dSMartin Matuska error = zio_wait(pio); 4170e716630dSMartin Matuska if (error != 0) { 4171e716630dSMartin Matuska zfs_dbgmsg("reflow: error %d writing scratch location", error); 4172e716630dSMartin Matuska goto io_error_exit; 4173e716630dSMartin Matuska } 4174e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, 0); 4175e716630dSMartin Matuska zio_flush(pio, raidvd); 4176e716630dSMartin Matuska zio_wait(pio); 4177e716630dSMartin Matuska 4178e716630dSMartin Matuska zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area", 4179e716630dSMartin Matuska (long long)logical_size); 4180e716630dSMartin Matuska 4181e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3); 4182e716630dSMartin Matuska 4183e716630dSMartin Matuska /* 4184e716630dSMartin Matuska * Update uberblock to indicate that scratch space is valid. This is 4185e716630dSMartin Matuska * needed because after this point, the real location may be 4186e716630dSMartin Matuska * overwritten. If we crash, we need to get the data from the 4187e716630dSMartin Matuska * scratch space, rather than the real location. 
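 *
 * For reference, the scratch-state progression used here:
 * RRSS_SCRATCH_VALID while the boot-area copy is the authoritative
 * copy of this region; RRSS_SCRATCH_INVALID_SYNCED once the real
 * location has been overwritten below;
 * RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT when the recovery path
 * (vdev_raidz_reflow_copy_scratch()) redoes that overwrite after a
 * crash; and RRSS_SCRATCH_INVALID_SYNCED_REFLOW as
 * raidz_reflow_sync() records further copy progress.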
4188e716630dSMartin Matuska * 4189e716630dSMartin Matuska * Note: ub_timestamp is bumped so that vdev_uberblock_compare() 4190e716630dSMartin Matuska * will prefer this uberblock. 4191e716630dSMartin Matuska */ 4192e716630dSMartin Matuska RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size); 4193e716630dSMartin Matuska spa->spa_ubsync.ub_timestamp++; 4194e716630dSMartin Matuska ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, 4195e716630dSMartin Matuska &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); 4196e716630dSMartin Matuska if (spa_multihost(spa)) 4197e716630dSMartin Matuska mmp_update_uberblock(spa, &spa->spa_ubsync); 4198e716630dSMartin Matuska 4199e716630dSMartin Matuska zfs_dbgmsg("reflow: uberblock updated " 4200e716630dSMartin Matuska "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)", 4201e716630dSMartin Matuska (long long)spa->spa_ubsync.ub_txg, 4202e716630dSMartin Matuska (long long)logical_size, 4203e716630dSMartin Matuska (long long)spa->spa_ubsync.ub_timestamp); 4204e716630dSMartin Matuska 4205e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID); 4206e716630dSMartin Matuska 4207e716630dSMartin Matuska /* 4208e716630dSMartin Matuska * Overwrite with reflow'ed data. 4209e716630dSMartin Matuska */ 4210e716630dSMartin Matuska overwrite: 4211e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4212e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) { 4213e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4214e716630dSMartin Matuska 0, abds[i], write_size, ZIO_TYPE_WRITE, 4215e716630dSMartin Matuska ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, 4216e716630dSMartin Matuska raidz_scratch_child_done, pio)); 4217e716630dSMartin Matuska } 4218e716630dSMartin Matuska error = zio_wait(pio); 4219e716630dSMartin Matuska if (error != 0) { 4220e716630dSMartin Matuska /* 4221e716630dSMartin Matuska * When we exit early here and drop the range lock, new 4222e716630dSMartin Matuska * writes will go into the scratch area so we'll need to 4223e716630dSMartin Matuska * read from there when we return after pausing. 4224e716630dSMartin Matuska */ 4225e716630dSMartin Matuska zfs_dbgmsg("reflow: error %d writing real location", error); 4226e716630dSMartin Matuska /* 4227e716630dSMartin Matuska * Update the uberblock that is written when this txg completes. 4228e716630dSMartin Matuska */ 4229e716630dSMartin Matuska RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID, 4230e716630dSMartin Matuska logical_size); 4231e716630dSMartin Matuska goto io_error_exit; 4232e716630dSMartin Matuska } 4233e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, 0); 4234e716630dSMartin Matuska zio_flush(pio, raidvd); 4235e716630dSMartin Matuska zio_wait(pio); 4236e716630dSMartin Matuska 4237e716630dSMartin Matuska zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location", 4238e716630dSMartin Matuska (long long)logical_size); 4239e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) 4240e716630dSMartin Matuska abd_free(abds[i]); 4241e716630dSMartin Matuska kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); 4242e716630dSMartin Matuska 4243e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED); 4244e716630dSMartin Matuska 4245e716630dSMartin Matuska /* 4246e716630dSMartin Matuska * Update uberblock to indicate that the initial part has been 4247e716630dSMartin Matuska * reflow'ed. 
This is needed because after this point (when we exit 4248e716630dSMartin Matuska * the rangelock), we allow regular writes to this region, which will 4249e716630dSMartin Matuska * be written to the new location only (because reflow_offset_next == 4250e716630dSMartin Matuska * reflow_offset_synced). If we crashed and re-copied from the 4251e716630dSMartin Matuska * scratch space, we would lose the regular writes. 4252e716630dSMartin Matuska */ 4253e716630dSMartin Matuska RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED, 4254e716630dSMartin Matuska logical_size); 4255e716630dSMartin Matuska spa->spa_ubsync.ub_timestamp++; 4256e716630dSMartin Matuska ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, 4257e716630dSMartin Matuska &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); 4258e716630dSMartin Matuska if (spa_multihost(spa)) 4259e716630dSMartin Matuska mmp_update_uberblock(spa, &spa->spa_ubsync); 4260e716630dSMartin Matuska 4261e716630dSMartin Matuska zfs_dbgmsg("reflow: uberblock updated " 4262e716630dSMartin Matuska "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", 4263e716630dSMartin Matuska (long long)spa->spa_ubsync.ub_txg, 4264e716630dSMartin Matuska (long long)logical_size, 4265e716630dSMartin Matuska (long long)spa->spa_ubsync.ub_timestamp); 4266e716630dSMartin Matuska 4267e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1); 4268e716630dSMartin Matuska 4269e716630dSMartin Matuska /* 4270e716630dSMartin Matuska * Update progress. 4271e716630dSMartin Matuska */ 4272e716630dSMartin Matuska vre->vre_offset = logical_size; 4273e716630dSMartin Matuska zfs_rangelock_exit(lr); 4274e716630dSMartin Matuska spa_config_exit(spa, SCL_STATE, FTAG); 4275e716630dSMartin Matuska 4276e716630dSMartin Matuska int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 4277e716630dSMartin Matuska vre->vre_offset_pertxg[txgoff] = vre->vre_offset; 4278e716630dSMartin Matuska vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; 4279e716630dSMartin Matuska /* 4280e716630dSMartin Matuska * Note - raidz_reflow_sync() will update the uberblock state to 4281e716630dSMartin Matuska * RRSS_SCRATCH_INVALID_SYNCED_REFLOW 4282e716630dSMartin Matuska */ 4283e716630dSMartin Matuska raidz_reflow_sync(spa, tx); 4284e716630dSMartin Matuska 4285e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2); 4286e716630dSMartin Matuska } 4287e716630dSMartin Matuska 4288e716630dSMartin Matuska /* 4289e716630dSMartin Matuska * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work 4290e716630dSMartin Matuska * here. No other i/o can be in progress, so we don't need the vre_rangelock. 
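 * In outline: re-read write_size bytes of reflowed data from each
 * child's boot area, overwrite the start of each child with it,
 * flush, mark the scratch copy invalid in the uberblock
 * (RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT), and then record the
 * recovered progress via raidz_reflow_sync() in an assigned txg.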
4291e716630dSMartin Matuska */ 4292e716630dSMartin Matuska void 4293e716630dSMartin Matuska vdev_raidz_reflow_copy_scratch(spa_t *spa) 4294e716630dSMartin Matuska { 4295e716630dSMartin Matuska vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4296e716630dSMartin Matuska uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock); 4297e716630dSMartin Matuska ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID); 4298e716630dSMartin Matuska 4299e716630dSMartin Matuska spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4300e716630dSMartin Matuska vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4301e716630dSMartin Matuska ASSERT0(logical_size % raidvd->vdev_children); 4302e716630dSMartin Matuska uint64_t write_size = logical_size / raidvd->vdev_children; 4303e716630dSMartin Matuska 4304e716630dSMartin Matuska zio_t *pio; 4305e716630dSMartin Matuska 4306e716630dSMartin Matuska /* 4307e716630dSMartin Matuska * Read from scratch space. 4308e716630dSMartin Matuska */ 4309e716630dSMartin Matuska abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), 4310e716630dSMartin Matuska KM_SLEEP); 4311e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) { 4312e716630dSMartin Matuska abds[i] = abd_alloc_linear(write_size, B_FALSE); 4313e716630dSMartin Matuska } 4314e716630dSMartin Matuska 4315e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, 0); 4316e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) { 4317e716630dSMartin Matuska /* 4318e716630dSMartin Matuska * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to 4319e716630dSMartin Matuska * the offset to calculate the physical offset to write to. 4320e716630dSMartin Matuska * Passing in a negative offset lets us access the boot area. 4321e716630dSMartin Matuska */ 4322e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4323e716630dSMartin Matuska VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], 4324e716630dSMartin Matuska write_size, ZIO_TYPE_READ, 4325e716630dSMartin Matuska ZIO_PRIORITY_ASYNC_READ, 0, 4326e716630dSMartin Matuska raidz_scratch_child_done, pio)); 4327e716630dSMartin Matuska } 4328e716630dSMartin Matuska zio_wait(pio); 4329e716630dSMartin Matuska 4330e716630dSMartin Matuska /* 4331e716630dSMartin Matuska * Overwrite real location with reflow'ed data. 
4332e716630dSMartin Matuska */ 4333e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, 0); 4334e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) { 4335e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4336e716630dSMartin Matuska 0, abds[i], write_size, ZIO_TYPE_WRITE, 4337e716630dSMartin Matuska ZIO_PRIORITY_ASYNC_WRITE, 0, 4338e716630dSMartin Matuska raidz_scratch_child_done, pio)); 4339e716630dSMartin Matuska } 4340e716630dSMartin Matuska zio_wait(pio); 4341e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, 0); 4342e716630dSMartin Matuska zio_flush(pio, raidvd); 4343e716630dSMartin Matuska zio_wait(pio); 4344e716630dSMartin Matuska 4345e716630dSMartin Matuska zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) " 4346e716630dSMartin Matuska "to real location", (long long)logical_size); 4347e716630dSMartin Matuska 4348e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) 4349e716630dSMartin Matuska abd_free(abds[i]); 4350e716630dSMartin Matuska kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); 4351e716630dSMartin Matuska 4352e716630dSMartin Matuska /* 4353e716630dSMartin Matuska * Update uberblock. 4354e716630dSMartin Matuska */ 4355e716630dSMartin Matuska RAIDZ_REFLOW_SET(&spa->spa_ubsync, 4356e716630dSMartin Matuska RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size); 4357e716630dSMartin Matuska spa->spa_ubsync.ub_timestamp++; 4358e716630dSMartin Matuska VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, 4359e716630dSMartin Matuska &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); 4360e716630dSMartin Matuska if (spa_multihost(spa)) 4361e716630dSMartin Matuska mmp_update_uberblock(spa, &spa->spa_ubsync); 4362e716630dSMartin Matuska 4363e716630dSMartin Matuska zfs_dbgmsg("reflow recovery: uberblock updated " 4364e716630dSMartin Matuska "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", 4365e716630dSMartin Matuska (long long)spa->spa_ubsync.ub_txg, 4366e716630dSMartin Matuska (long long)logical_size, 4367e716630dSMartin Matuska (long long)spa->spa_ubsync.ub_timestamp); 4368e716630dSMartin Matuska 4369e716630dSMartin Matuska dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, 4370e716630dSMartin Matuska spa_first_txg(spa)); 4371e716630dSMartin Matuska int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 4372e716630dSMartin Matuska vre->vre_offset = logical_size; 4373e716630dSMartin Matuska vre->vre_offset_pertxg[txgoff] = vre->vre_offset; 4374e716630dSMartin Matuska vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; 4375e716630dSMartin Matuska /* 4376e716630dSMartin Matuska * Note that raidz_reflow_sync() will update the uberblock once more 4377e716630dSMartin Matuska */ 4378e716630dSMartin Matuska raidz_reflow_sync(spa, tx); 4379e716630dSMartin Matuska 4380e716630dSMartin Matuska dmu_tx_commit(tx); 4381e716630dSMartin Matuska 4382e716630dSMartin Matuska spa_config_exit(spa, SCL_STATE, FTAG); 4383e716630dSMartin Matuska } 4384e716630dSMartin Matuska 4385e716630dSMartin Matuska static boolean_t 4386e716630dSMartin Matuska spa_raidz_expand_thread_check(void *arg, zthr_t *zthr) 4387e716630dSMartin Matuska { 4388e716630dSMartin Matuska (void) zthr; 4389e716630dSMartin Matuska spa_t *spa = arg; 4390e716630dSMartin Matuska 4391e716630dSMartin Matuska return (spa->spa_raidz_expand != NULL && 4392e716630dSMartin Matuska !spa->spa_raidz_expand->vre_waiting_for_resilver); 4393e716630dSMartin Matuska } 4394e716630dSMartin Matuska 4395e716630dSMartin Matuska /* 4396e716630dSMartin Matuska * 
RAIDZ expansion background thread 4397e716630dSMartin Matuska * 4398e716630dSMartin Matuska * Can be called multiple times if the reflow is paused 4399e716630dSMartin Matuska */ 4400e716630dSMartin Matuska static void 4401e716630dSMartin Matuska spa_raidz_expand_thread(void *arg, zthr_t *zthr) 4402e716630dSMartin Matuska { 4403e716630dSMartin Matuska spa_t *spa = arg; 4404e716630dSMartin Matuska vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4405e716630dSMartin Matuska 4406e716630dSMartin Matuska if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) 4407e716630dSMartin Matuska vre->vre_offset = 0; 4408e716630dSMartin Matuska else 4409e716630dSMartin Matuska vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync); 4410e716630dSMartin Matuska 4411e716630dSMartin Matuska /* Reflow the beginning portion using the scratch area */ 4412e716630dSMartin Matuska if (vre->vre_offset == 0) { 4413e716630dSMartin Matuska VERIFY0(dsl_sync_task(spa_name(spa), 4414e716630dSMartin Matuska NULL, raidz_reflow_scratch_sync, 4415e716630dSMartin Matuska vre, 0, ZFS_SPACE_CHECK_NONE)); 4416e716630dSMartin Matuska 4417e716630dSMartin Matuska /* if we encountered errors then pause */ 4418e716630dSMartin Matuska if (vre->vre_offset == 0) { 4419e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 4420e716630dSMartin Matuska vre->vre_waiting_for_resilver = B_TRUE; 4421e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 4422e716630dSMartin Matuska return; 4423e716630dSMartin Matuska } 4424e716630dSMartin Matuska } 4425e716630dSMartin Matuska 4426e716630dSMartin Matuska spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4427e716630dSMartin Matuska vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4428e716630dSMartin Matuska 4429e716630dSMartin Matuska uint64_t guid = raidvd->vdev_guid; 4430e716630dSMartin Matuska 4431e716630dSMartin Matuska /* Iterate over all the remaining metaslabs */ 4432e716630dSMartin Matuska for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift; 4433e716630dSMartin Matuska i < raidvd->vdev_ms_count && 4434e716630dSMartin Matuska !zthr_iscancelled(zthr) && 4435e716630dSMartin Matuska vre->vre_failed_offset == UINT64_MAX; i++) { 4436e716630dSMartin Matuska metaslab_t *msp = raidvd->vdev_ms[i]; 4437e716630dSMartin Matuska 4438e716630dSMartin Matuska metaslab_disable(msp); 4439e716630dSMartin Matuska mutex_enter(&msp->ms_lock); 4440e716630dSMartin Matuska 4441e716630dSMartin Matuska /* 4442e716630dSMartin Matuska * The metaslab may be newly created (for the expanded 4443e716630dSMartin Matuska * space), in which case its trees won't exist yet, 4444e716630dSMartin Matuska * so we need to bail out early. 4445e716630dSMartin Matuska */ 4446e716630dSMartin Matuska if (msp->ms_new) { 4447e716630dSMartin Matuska mutex_exit(&msp->ms_lock); 4448e716630dSMartin Matuska metaslab_enable(msp, B_FALSE, B_FALSE); 4449e716630dSMartin Matuska continue; 4450e716630dSMartin Matuska } 4451e716630dSMartin Matuska 4452e716630dSMartin Matuska VERIFY0(metaslab_load(msp)); 4453e716630dSMartin Matuska 4454e716630dSMartin Matuska /* 4455e716630dSMartin Matuska * We want to copy everything except the free (allocatable) 4456e716630dSMartin Matuska * space. Note that there may be a little bit more free 4457e716630dSMartin Matuska * space (e.g. in ms_defer), and it's fine to copy that too.
4458e716630dSMartin Matuska */ 4459e716630dSMartin Matuska range_tree_t *rt = range_tree_create(NULL, RANGE_SEG64, 4460e716630dSMartin Matuska NULL, 0, 0); 4461e716630dSMartin Matuska range_tree_add(rt, msp->ms_start, msp->ms_size); 4462e716630dSMartin Matuska range_tree_walk(msp->ms_allocatable, range_tree_remove, rt); 4463e716630dSMartin Matuska mutex_exit(&msp->ms_lock); 4464e716630dSMartin Matuska 4465e716630dSMartin Matuska /* 4466e716630dSMartin Matuska * Force the last sector of each metaslab to be copied. This 4467e716630dSMartin Matuska * ensures that we advance the on-disk progress to the end of 4468e716630dSMartin Matuska * this metaslab while the metaslab is disabled. Otherwise, we 4469e716630dSMartin Matuska * could move past this metaslab without advancing the on-disk 4470e716630dSMartin Matuska * progress, and then an allocation to this metaslab would not 4471e716630dSMartin Matuska * be copied. 4472e716630dSMartin Matuska */ 4473e716630dSMartin Matuska int sectorsz = 1 << raidvd->vdev_ashift; 4474e716630dSMartin Matuska uint64_t ms_last_offset = msp->ms_start + 4475e716630dSMartin Matuska msp->ms_size - sectorsz; 4476e716630dSMartin Matuska if (!range_tree_contains(rt, ms_last_offset, sectorsz)) { 4477e716630dSMartin Matuska range_tree_add(rt, ms_last_offset, sectorsz); 4478e716630dSMartin Matuska } 4479e716630dSMartin Matuska 4480e716630dSMartin Matuska /* 4481e716630dSMartin Matuska * When we are resuming from a paused expansion (i.e. 4482e716630dSMartin Matuska * when importing a pool with an expansion in progress), 4483e716630dSMartin Matuska * discard any state that we have already processed. 4484e716630dSMartin Matuska */ 4485e716630dSMartin Matuska range_tree_clear(rt, 0, vre->vre_offset); 4486e716630dSMartin Matuska 4487e716630dSMartin Matuska while (!zthr_iscancelled(zthr) && 4488e716630dSMartin Matuska !range_tree_is_empty(rt) && 4489e716630dSMartin Matuska vre->vre_failed_offset == UINT64_MAX) { 4490e716630dSMartin Matuska 4491e716630dSMartin Matuska /* 4492e716630dSMartin Matuska * We need to periodically drop the config lock so that 4493e716630dSMartin Matuska * writers can get in. Additionally, we can't wait 4494e716630dSMartin Matuska * for a txg to sync while holding a config lock 4495e716630dSMartin Matuska * (since a waiting writer could cause a 3-way deadlock 4496e716630dSMartin Matuska * with the sync thread, which also gets a config 4497e716630dSMartin Matuska * lock for reader). So we can't hold the config lock 4498e716630dSMartin Matuska * while calling dmu_tx_assign(). 4499e716630dSMartin Matuska */ 4500e716630dSMartin Matuska spa_config_exit(spa, SCL_CONFIG, FTAG); 4501e716630dSMartin Matuska 4502e716630dSMartin Matuska /* 4503e716630dSMartin Matuska * If requested, pause the reflow when the amount 4504e716630dSMartin Matuska * specified by raidz_expand_max_reflow_bytes is reached. 4505e716630dSMartin Matuska * 4506e716630dSMartin Matuska * This pause is only used during testing or debugging.
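 * A value of 0 disables this pause entirely; otherwise the loop below
 * simply sleeps, one tick at a time, until the tunable is raised or
 * cleared, or the zthr is cancelled.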
4507e716630dSMartin Matuska */ 4508e716630dSMartin Matuska while (raidz_expand_max_reflow_bytes != 0 && 4509e716630dSMartin Matuska raidz_expand_max_reflow_bytes <= 4510e716630dSMartin Matuska vre->vre_bytes_copied && !zthr_iscancelled(zthr)) { 4511e716630dSMartin Matuska delay(hz); 4512e716630dSMartin Matuska } 4513e716630dSMartin Matuska 4514e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 4515e716630dSMartin Matuska while (vre->vre_outstanding_bytes > 4516e716630dSMartin Matuska raidz_expand_max_copy_bytes) { 4517e716630dSMartin Matuska cv_wait(&vre->vre_cv, &vre->vre_lock); 4518e716630dSMartin Matuska } 4519e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 4520e716630dSMartin Matuska 4521e716630dSMartin Matuska dmu_tx_t *tx = 4522e716630dSMartin Matuska dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 4523e716630dSMartin Matuska 4524e716630dSMartin Matuska VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); 4525e716630dSMartin Matuska uint64_t txg = dmu_tx_get_txg(tx); 4526e716630dSMartin Matuska 4527e716630dSMartin Matuska /* 4528e716630dSMartin Matuska * Reacquire the vdev_config lock. Theoretically, the 4529e716630dSMartin Matuska * vdev_t that we're expanding may have changed. 4530e716630dSMartin Matuska */ 4531e716630dSMartin Matuska spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4532e716630dSMartin Matuska raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4533e716630dSMartin Matuska 4534e716630dSMartin Matuska boolean_t needsync = 4535e716630dSMartin Matuska raidz_reflow_impl(raidvd, vre, rt, tx); 4536e716630dSMartin Matuska 4537e716630dSMartin Matuska dmu_tx_commit(tx); 4538e716630dSMartin Matuska 4539e716630dSMartin Matuska if (needsync) { 4540e716630dSMartin Matuska spa_config_exit(spa, SCL_CONFIG, FTAG); 4541e716630dSMartin Matuska txg_wait_synced(spa->spa_dsl_pool, txg); 4542e716630dSMartin Matuska spa_config_enter(spa, SCL_CONFIG, FTAG, 4543e716630dSMartin Matuska RW_READER); 4544e716630dSMartin Matuska } 4545e716630dSMartin Matuska } 4546e716630dSMartin Matuska 4547e716630dSMartin Matuska spa_config_exit(spa, SCL_CONFIG, FTAG); 4548e716630dSMartin Matuska 4549e716630dSMartin Matuska metaslab_enable(msp, B_FALSE, B_FALSE); 4550e716630dSMartin Matuska range_tree_vacate(rt, NULL, NULL); 4551e716630dSMartin Matuska range_tree_destroy(rt); 4552e716630dSMartin Matuska 4553e716630dSMartin Matuska spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4554e716630dSMartin Matuska raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4555e716630dSMartin Matuska } 4556e716630dSMartin Matuska 4557e716630dSMartin Matuska spa_config_exit(spa, SCL_CONFIG, FTAG); 4558e716630dSMartin Matuska 4559e716630dSMartin Matuska /* 4560e716630dSMartin Matuska * The txg_wait_synced() here ensures that all reflow zio's have 4561e716630dSMartin Matuska * completed, and vre_failed_offset has been set if necessary. It 4562e716630dSMartin Matuska * also ensures that the progress of the last raidz_reflow_sync() is 4563e716630dSMartin Matuska * written to disk before raidz_reflow_complete_sync() changes the 4564e716630dSMartin Matuska * in-memory vre_state. vdev_raidz_io_start() uses vre_state to 4565e716630dSMartin Matuska * determine if a reflow is in progress, in which case we may need to 4566e716630dSMartin Matuska * write to both old and new locations. 
Therefore we can only change 4567e716630dSMartin Matuska * vre_state once this is not necessary, which is once the on-disk 4568e716630dSMartin Matuska * progress (in spa_ubsync) has been set past any possible writes (to 4569e716630dSMartin Matuska * the end of the last metaslab). 4570e716630dSMartin Matuska */ 4571e716630dSMartin Matuska txg_wait_synced(spa->spa_dsl_pool, 0); 4572e716630dSMartin Matuska 4573e716630dSMartin Matuska if (!zthr_iscancelled(zthr) && 4574e716630dSMartin Matuska vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) { 4575e716630dSMartin Matuska /* 4576e716630dSMartin Matuska * We are not being canceled or paused, so the reflow must be 4577e716630dSMartin Matuska * complete. In that case also mark it as completed on disk. 4578e716630dSMartin Matuska */ 4579e716630dSMartin Matuska ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX); 4580e716630dSMartin Matuska VERIFY0(dsl_sync_task(spa_name(spa), NULL, 4581e716630dSMartin Matuska raidz_reflow_complete_sync, spa, 4582e716630dSMartin Matuska 0, ZFS_SPACE_CHECK_NONE)); 4583e716630dSMartin Matuska (void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL); 4584e716630dSMartin Matuska } else { 4585e716630dSMartin Matuska /* 4586e716630dSMartin Matuska * Wait for all copy zio's to complete and for all the 4587e716630dSMartin Matuska * raidz_reflow_sync() synctasks to be run. 4588e716630dSMartin Matuska */ 4589e716630dSMartin Matuska spa_history_log_internal(spa, "reflow pause", 4590e716630dSMartin Matuska NULL, "offset=%llu failed_offset=%lld", 4591e716630dSMartin Matuska (long long)vre->vre_offset, 4592e716630dSMartin Matuska (long long)vre->vre_failed_offset); 4593e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 4594e716630dSMartin Matuska if (vre->vre_failed_offset != UINT64_MAX) { 4595e716630dSMartin Matuska /* 4596e716630dSMartin Matuska * Reset progress so that we will retry everything 4597e716630dSMartin Matuska * after the point that something failed. 
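 * Setting vre_waiting_for_resilver below also keeps
 * spa_raidz_expand_thread_check() from restarting this thread until
 * raidz_dtl_reassessed() observes that any child replacement has
 * completed and wakes the zthr.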
4598e716630dSMartin Matuska */ 4599e716630dSMartin Matuska vre->vre_offset = vre->vre_failed_offset; 4600e716630dSMartin Matuska vre->vre_failed_offset = UINT64_MAX; 4601e716630dSMartin Matuska vre->vre_waiting_for_resilver = B_TRUE; 4602e716630dSMartin Matuska } 4603e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 4604e716630dSMartin Matuska } 4605e716630dSMartin Matuska } 4606e716630dSMartin Matuska 4607e716630dSMartin Matuska void 4608e716630dSMartin Matuska spa_start_raidz_expansion_thread(spa_t *spa) 4609e716630dSMartin Matuska { 4610e716630dSMartin Matuska ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL); 4611e716630dSMartin Matuska spa->spa_raidz_expand_zthr = zthr_create("raidz_expand", 4612e716630dSMartin Matuska spa_raidz_expand_thread_check, spa_raidz_expand_thread, 4613e716630dSMartin Matuska spa, defclsyspri); 4614e716630dSMartin Matuska } 4615e716630dSMartin Matuska 4616e716630dSMartin Matuska void 4617e716630dSMartin Matuska raidz_dtl_reassessed(vdev_t *vd) 4618e716630dSMartin Matuska { 4619e716630dSMartin Matuska spa_t *spa = vd->vdev_spa; 4620e716630dSMartin Matuska if (spa->spa_raidz_expand != NULL) { 4621e716630dSMartin Matuska vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4622e716630dSMartin Matuska /* 4623e716630dSMartin Matuska * we get called often from vdev_dtl_reassess() so make 4624e716630dSMartin Matuska * sure it's our vdev and any replacing is complete 4625e716630dSMartin Matuska */ 4626e716630dSMartin Matuska if (vd->vdev_top->vdev_id == vre->vre_vdev_id && 4627e716630dSMartin Matuska !vdev_raidz_expand_child_replacing(vd->vdev_top)) { 4628e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 4629e716630dSMartin Matuska if (vre->vre_waiting_for_resilver) { 4630e716630dSMartin Matuska vdev_dbgmsg(vd, "DTL reassessed, " 4631e716630dSMartin Matuska "continuing raidz expansion"); 4632e716630dSMartin Matuska vre->vre_waiting_for_resilver = B_FALSE; 4633e716630dSMartin Matuska zthr_wakeup(spa->spa_raidz_expand_zthr); 4634e716630dSMartin Matuska } 4635e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 4636e716630dSMartin Matuska } 4637e716630dSMartin Matuska } 4638e716630dSMartin Matuska } 4639e716630dSMartin Matuska 4640e716630dSMartin Matuska int 4641e716630dSMartin Matuska vdev_raidz_attach_check(vdev_t *new_child) 4642e716630dSMartin Matuska { 4643e716630dSMartin Matuska vdev_t *raidvd = new_child->vdev_parent; 4644e716630dSMartin Matuska uint64_t new_children = raidvd->vdev_children; 4645e716630dSMartin Matuska 4646e716630dSMartin Matuska /* 4647e716630dSMartin Matuska * We use the "boot" space as scratch space to handle overwriting the 4648e716630dSMartin Matuska * initial part of the vdev. If it is too small, then this expansion 4649e716630dSMartin Matuska * is not allowed. This would be very unusual (e.g. ashift > 13 and 4650e716630dSMartin Matuska * >200 children). 
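 * For example, assuming VDEV_BOOT_SIZE is the usual 3.5 MiB boot
 * region, the check below permits up to 448 children at ashift=13
 * (8 KiB sectors) and up to 224 children at ashift=14.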
4651e716630dSMartin Matuska */ 4652e716630dSMartin Matuska if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) { 4653e716630dSMartin Matuska return (EINVAL); 4654e716630dSMartin Matuska } 4655e716630dSMartin Matuska return (0); 4656e716630dSMartin Matuska } 4657e716630dSMartin Matuska 4658e716630dSMartin Matuska void 4659e716630dSMartin Matuska vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx) 4660e716630dSMartin Matuska { 4661e716630dSMartin Matuska vdev_t *new_child = arg; 4662e716630dSMartin Matuska spa_t *spa = new_child->vdev_spa; 4663e716630dSMartin Matuska vdev_t *raidvd = new_child->vdev_parent; 4664e716630dSMartin Matuska vdev_raidz_t *vdrz = raidvd->vdev_tsd; 4665e716630dSMartin Matuska ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops); 4666e716630dSMartin Matuska ASSERT3P(raidvd->vdev_top, ==, raidvd); 4667e716630dSMartin Matuska ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width); 4668e716630dSMartin Matuska ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1); 4669e716630dSMartin Matuska ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==, 4670e716630dSMartin Matuska new_child); 4671e716630dSMartin Matuska 4672e716630dSMartin Matuska spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx); 4673e716630dSMartin Matuska 4674e716630dSMartin Matuska vdrz->vd_physical_width++; 4675e716630dSMartin Matuska 4676e716630dSMartin Matuska VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info); 4677e716630dSMartin Matuska vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id; 4678e716630dSMartin Matuska vdrz->vn_vre.vre_offset = 0; 4679e716630dSMartin Matuska vdrz->vn_vre.vre_failed_offset = UINT64_MAX; 4680e716630dSMartin Matuska spa->spa_raidz_expand = &vdrz->vn_vre; 4681e716630dSMartin Matuska zthr_wakeup(spa->spa_raidz_expand_zthr); 4682e716630dSMartin Matuska 4683e716630dSMartin Matuska /* 4684e716630dSMartin Matuska * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get 4685e716630dSMartin Matuska * written to the config. 
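 * vdev_raidz_config_generate() emits that flag for as long as
 * vre_state is DSS_SCANNING, which is set a few lines below.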
4686e716630dSMartin Matuska */ 4687e716630dSMartin Matuska vdev_config_dirty(raidvd); 4688e716630dSMartin Matuska 4689e716630dSMartin Matuska vdrz->vn_vre.vre_start_time = gethrestime_sec(); 4690e716630dSMartin Matuska vdrz->vn_vre.vre_end_time = 0; 4691e716630dSMartin Matuska vdrz->vn_vre.vre_state = DSS_SCANNING; 4692e716630dSMartin Matuska vdrz->vn_vre.vre_bytes_copied = 0; 4693e716630dSMartin Matuska 4694e716630dSMartin Matuska uint64_t state = vdrz->vn_vre.vre_state; 4695e716630dSMartin Matuska VERIFY0(zap_update(spa->spa_meta_objset, 4696e716630dSMartin Matuska raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, 4697e716630dSMartin Matuska sizeof (state), 1, &state, tx)); 4698e716630dSMartin Matuska 4699e716630dSMartin Matuska uint64_t start_time = vdrz->vn_vre.vre_start_time; 4700e716630dSMartin Matuska VERIFY0(zap_update(spa->spa_meta_objset, 4701e716630dSMartin Matuska raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, 4702e716630dSMartin Matuska sizeof (start_time), 1, &start_time, tx)); 4703e716630dSMartin Matuska 4704e716630dSMartin Matuska (void) zap_remove(spa->spa_meta_objset, 4705e716630dSMartin Matuska raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx); 4706e716630dSMartin Matuska (void) zap_remove(spa->spa_meta_objset, 4707e716630dSMartin Matuska raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx); 4708e716630dSMartin Matuska 4709e716630dSMartin Matuska spa_history_log_internal(spa, "raidz vdev expansion started", tx, 4710e716630dSMartin Matuska "%s vdev %llu new width %llu", spa_name(spa), 4711e716630dSMartin Matuska (unsigned long long)raidvd->vdev_id, 4712e716630dSMartin Matuska (unsigned long long)raidvd->vdev_children); 4713e716630dSMartin Matuska } 4714e716630dSMartin Matuska 4715e716630dSMartin Matuska int 4716e716630dSMartin Matuska vdev_raidz_load(vdev_t *vd) 4717e716630dSMartin Matuska { 4718e716630dSMartin Matuska vdev_raidz_t *vdrz = vd->vdev_tsd; 4719e716630dSMartin Matuska int err; 4720e716630dSMartin Matuska 4721e716630dSMartin Matuska uint64_t state = DSS_NONE; 4722e716630dSMartin Matuska uint64_t start_time = 0; 4723e716630dSMartin Matuska uint64_t end_time = 0; 4724e716630dSMartin Matuska uint64_t bytes_copied = 0; 4725e716630dSMartin Matuska 4726e716630dSMartin Matuska if (vd->vdev_top_zap != 0) { 4727e716630dSMartin Matuska err = zap_lookup(vd->vdev_spa->spa_meta_objset, 4728e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, 4729e716630dSMartin Matuska sizeof (state), 1, &state); 4730e716630dSMartin Matuska if (err != 0 && err != ENOENT) 4731e716630dSMartin Matuska return (err); 4732e716630dSMartin Matuska 4733e716630dSMartin Matuska err = zap_lookup(vd->vdev_spa->spa_meta_objset, 4734e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, 4735e716630dSMartin Matuska sizeof (start_time), 1, &start_time); 4736e716630dSMartin Matuska if (err != 0 && err != ENOENT) 4737e716630dSMartin Matuska return (err); 4738e716630dSMartin Matuska 4739e716630dSMartin Matuska err = zap_lookup(vd->vdev_spa->spa_meta_objset, 4740e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, 4741e716630dSMartin Matuska sizeof (end_time), 1, &end_time); 4742e716630dSMartin Matuska if (err != 0 && err != ENOENT) 4743e716630dSMartin Matuska return (err); 4744e716630dSMartin Matuska 4745e716630dSMartin Matuska err = zap_lookup(vd->vdev_spa->spa_meta_objset, 4746e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, 4747e716630dSMartin Matuska sizeof 
(bytes_copied), 1, &bytes_copied); 4748e716630dSMartin Matuska if (err != 0 && err != ENOENT) 4749e716630dSMartin Matuska return (err); 4750e716630dSMartin Matuska } 4751e716630dSMartin Matuska 4752e716630dSMartin Matuska /* 4753e716630dSMartin Matuska * If we are in the middle of expansion, vre_state should have 4754e716630dSMartin Matuska * already been set by vdev_raidz_init(). 4755e716630dSMartin Matuska */ 4756e716630dSMartin Matuska EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING); 4757e716630dSMartin Matuska vdrz->vn_vre.vre_state = (dsl_scan_state_t)state; 4758e716630dSMartin Matuska vdrz->vn_vre.vre_start_time = start_time; 4759e716630dSMartin Matuska vdrz->vn_vre.vre_end_time = end_time; 4760e716630dSMartin Matuska vdrz->vn_vre.vre_bytes_copied = bytes_copied; 4761e716630dSMartin Matuska 4762e716630dSMartin Matuska return (0); 4763e716630dSMartin Matuska } 4764e716630dSMartin Matuska 4765e716630dSMartin Matuska int 4766e716630dSMartin Matuska spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres) 4767e716630dSMartin Matuska { 4768e716630dSMartin Matuska vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4769e716630dSMartin Matuska 4770e716630dSMartin Matuska if (vre == NULL) { 4771e716630dSMartin Matuska /* no expansion in progress; find most recent completed */ 4772e716630dSMartin Matuska for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { 4773e716630dSMartin Matuska vdev_t *vd = spa->spa_root_vdev->vdev_child[c]; 4774e716630dSMartin Matuska if (vd->vdev_ops == &vdev_raidz_ops) { 4775e716630dSMartin Matuska vdev_raidz_t *vdrz = vd->vdev_tsd; 4776e716630dSMartin Matuska 4777e716630dSMartin Matuska if (vdrz->vn_vre.vre_end_time != 0 && 4778e716630dSMartin Matuska (vre == NULL || 4779e716630dSMartin Matuska vdrz->vn_vre.vre_end_time > 4780e716630dSMartin Matuska vre->vre_end_time)) { 4781e716630dSMartin Matuska vre = &vdrz->vn_vre; 4782e716630dSMartin Matuska } 4783e716630dSMartin Matuska } 4784e716630dSMartin Matuska } 4785e716630dSMartin Matuska } 4786e716630dSMartin Matuska 4787e716630dSMartin Matuska if (vre == NULL) { 4788e716630dSMartin Matuska return (SET_ERROR(ENOENT)); 4789e716630dSMartin Matuska } 4790e716630dSMartin Matuska 4791e716630dSMartin Matuska pres->pres_state = vre->vre_state; 4792e716630dSMartin Matuska pres->pres_expanding_vdev = vre->vre_vdev_id; 4793e716630dSMartin Matuska 4794e716630dSMartin Matuska vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); 4795e716630dSMartin Matuska pres->pres_to_reflow = vd->vdev_stat.vs_alloc; 4796e716630dSMartin Matuska 4797e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 4798e716630dSMartin Matuska pres->pres_reflowed = vre->vre_bytes_copied; 4799e716630dSMartin Matuska for (int i = 0; i < TXG_SIZE; i++) 4800e716630dSMartin Matuska pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i]; 4801e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 4802e716630dSMartin Matuska 4803e716630dSMartin Matuska pres->pres_start_time = vre->vre_start_time; 4804e716630dSMartin Matuska pres->pres_end_time = vre->vre_end_time; 4805e716630dSMartin Matuska pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver; 4806e716630dSMartin Matuska 4807e716630dSMartin Matuska return (0); 4808e716630dSMartin Matuska } 4809e716630dSMartin Matuska 48107877fdebSMatt Macy /* 48117877fdebSMatt Macy * Initialize private RAIDZ specific fields from the nvlist.
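 * This consumes ZPOOL_CONFIG_CHILDREN, ZPOOL_CONFIG_NPARITY,
 * ZPOOL_CONFIG_ID, ZPOOL_CONFIG_RAIDZ_EXPANDING, and
 * ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS.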
48127877fdebSMatt Macy */ 48137877fdebSMatt Macy static int 48147877fdebSMatt Macy vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) 48157877fdebSMatt Macy { 48167877fdebSMatt Macy uint_t children; 48177877fdebSMatt Macy nvlist_t **child; 48187877fdebSMatt Macy int error = nvlist_lookup_nvlist_array(nv, 48197877fdebSMatt Macy ZPOOL_CONFIG_CHILDREN, &child, &children); 48207877fdebSMatt Macy if (error != 0) 48217877fdebSMatt Macy return (SET_ERROR(EINVAL)); 48227877fdebSMatt Macy 4823e716630dSMartin Matuska uint64_t nparity; 48247877fdebSMatt Macy if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { 48257877fdebSMatt Macy if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) 48267877fdebSMatt Macy return (SET_ERROR(EINVAL)); 48277877fdebSMatt Macy 48287877fdebSMatt Macy /* 48297877fdebSMatt Macy * Previous versions could only support 1 or 2 parity 48307877fdebSMatt Macy * devices. 48317877fdebSMatt Macy */ 48327877fdebSMatt Macy if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2) 48337877fdebSMatt Macy return (SET_ERROR(EINVAL)); 48347877fdebSMatt Macy else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3) 48357877fdebSMatt Macy return (SET_ERROR(EINVAL)); 48367877fdebSMatt Macy } else { 48377877fdebSMatt Macy /* 48387877fdebSMatt Macy * We require the parity to be specified for SPAs that 48397877fdebSMatt Macy * support multiple parity levels. 48407877fdebSMatt Macy */ 48417877fdebSMatt Macy if (spa_version(spa) >= SPA_VERSION_RAIDZ2) 48427877fdebSMatt Macy return (SET_ERROR(EINVAL)); 48437877fdebSMatt Macy 48447877fdebSMatt Macy /* 48457877fdebSMatt Macy * Otherwise, we default to 1 parity device for RAID-Z. 48467877fdebSMatt Macy */ 48477877fdebSMatt Macy nparity = 1; 48487877fdebSMatt Macy } 48497877fdebSMatt Macy 4850e716630dSMartin Matuska vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP); 4851e716630dSMartin Matuska vdrz->vn_vre.vre_vdev_id = -1; 4852e716630dSMartin Matuska vdrz->vn_vre.vre_offset = UINT64_MAX; 4853e716630dSMartin Matuska vdrz->vn_vre.vre_failed_offset = UINT64_MAX; 4854e716630dSMartin Matuska mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL); 4855e716630dSMartin Matuska cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL); 4856e716630dSMartin Matuska zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL); 4857e716630dSMartin Matuska mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL); 4858e716630dSMartin Matuska avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare, 4859e716630dSMartin Matuska sizeof (reflow_node_t), offsetof(reflow_node_t, re_link)); 4860e716630dSMartin Matuska 4861e716630dSMartin Matuska vdrz->vd_physical_width = children; 48627877fdebSMatt Macy vdrz->vd_nparity = nparity; 48637877fdebSMatt Macy 4864e716630dSMartin Matuska /* note, the ID does not exist when creating a pool */ 4865e716630dSMartin Matuska (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, 4866e716630dSMartin Matuska &vdrz->vn_vre.vre_vdev_id); 4867e716630dSMartin Matuska 4868e716630dSMartin Matuska boolean_t reflow_in_progress = 4869e716630dSMartin Matuska nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); 4870e716630dSMartin Matuska if (reflow_in_progress) { 4871e716630dSMartin Matuska spa->spa_raidz_expand = &vdrz->vn_vre; 4872e716630dSMartin Matuska vdrz->vn_vre.vre_state = DSS_SCANNING; 4873e716630dSMartin Matuska } 4874e716630dSMartin Matuska 4875e716630dSMartin Matuska vdrz->vd_original_width = children; 4876e716630dSMartin Matuska uint64_t *txgs; 4877e716630dSMartin Matuska unsigned int txgs_size = 0;
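/*
 * Each entry in ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS is the txg in which a
 * previous expansion completed.  The loop below pairs the most recent
 * entry with the current physical width and each older entry with one
 * column fewer, recording the logical width in effect after each
 * expansion.
 */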
4878e716630dSMartin Matuska error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, 4879e716630dSMartin Matuska &txgs, &txgs_size); 4880e716630dSMartin Matuska if (error == 0) { 4881e716630dSMartin Matuska for (int i = 0; i < txgs_size; i++) { 4882e716630dSMartin Matuska reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); 4883e716630dSMartin Matuska re->re_txg = txgs[txgs_size - i - 1]; 4884e716630dSMartin Matuska re->re_logical_width = vdrz->vd_physical_width - i; 4885e716630dSMartin Matuska 4886e716630dSMartin Matuska if (reflow_in_progress) 4887e716630dSMartin Matuska re->re_logical_width--; 4888e716630dSMartin Matuska 4889e716630dSMartin Matuska avl_add(&vdrz->vd_expand_txgs, re); 4890e716630dSMartin Matuska } 4891e716630dSMartin Matuska 4892e716630dSMartin Matuska vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size; 4893e716630dSMartin Matuska } 4894e716630dSMartin Matuska if (reflow_in_progress) { 4895e716630dSMartin Matuska vdrz->vd_original_width--; 4896e716630dSMartin Matuska zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions", 4897e716630dSMartin Matuska children, txgs_size); 4898e716630dSMartin Matuska } 4899e716630dSMartin Matuska 49007877fdebSMatt Macy *tsd = vdrz; 49017877fdebSMatt Macy 49027877fdebSMatt Macy return (0); 49037877fdebSMatt Macy } 49047877fdebSMatt Macy 49057877fdebSMatt Macy static void 49067877fdebSMatt Macy vdev_raidz_fini(vdev_t *vd) 49077877fdebSMatt Macy { 4908e716630dSMartin Matuska vdev_raidz_t *vdrz = vd->vdev_tsd; 4909e716630dSMartin Matuska if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre) 4910e716630dSMartin Matuska vd->vdev_spa->spa_raidz_expand = NULL; 4911e716630dSMartin Matuska reflow_node_t *re; 4912e716630dSMartin Matuska void *cookie = NULL; 4913e716630dSMartin Matuska avl_tree_t *tree = &vdrz->vd_expand_txgs; 4914e716630dSMartin Matuska while ((re = avl_destroy_nodes(tree, &cookie)) != NULL) 4915e716630dSMartin Matuska kmem_free(re, sizeof (*re)); 4916e716630dSMartin Matuska avl_destroy(&vdrz->vd_expand_txgs); 4917e716630dSMartin Matuska mutex_destroy(&vdrz->vd_expand_lock); 4918e716630dSMartin Matuska mutex_destroy(&vdrz->vn_vre.vre_lock); 4919e716630dSMartin Matuska cv_destroy(&vdrz->vn_vre.vre_cv); 4920e716630dSMartin Matuska zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock); 4921e716630dSMartin Matuska kmem_free(vdrz, sizeof (*vdrz)); 49227877fdebSMatt Macy } 49237877fdebSMatt Macy 49247877fdebSMatt Macy /* 49257877fdebSMatt Macy * Add RAIDZ specific fields to the config nvlist. 49267877fdebSMatt Macy */ 49277877fdebSMatt Macy static void 49287877fdebSMatt Macy vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) 49297877fdebSMatt Macy { 49307877fdebSMatt Macy ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops); 49317877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd; 49327877fdebSMatt Macy 49337877fdebSMatt Macy /* 49347877fdebSMatt Macy * Make sure someone hasn't managed to sneak a fancy new vdev 49357877fdebSMatt Macy * into a crufty old storage pool. 
49367877fdebSMatt Macy */ 49377877fdebSMatt Macy ASSERT(vdrz->vd_nparity == 1 || 49387877fdebSMatt Macy (vdrz->vd_nparity <= 2 && 49397877fdebSMatt Macy spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) || 49407877fdebSMatt Macy (vdrz->vd_nparity <= 3 && 49417877fdebSMatt Macy spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3)); 49427877fdebSMatt Macy 49437877fdebSMatt Macy /* 49447877fdebSMatt Macy * Note that we'll add these even on storage pools where they 49457877fdebSMatt Macy * aren't strictly required -- older software will just ignore 49467877fdebSMatt Macy * it. 49477877fdebSMatt Macy */ 49487877fdebSMatt Macy fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity); 4949e716630dSMartin Matuska 4950e716630dSMartin Matuska if (vdrz->vn_vre.vre_state == DSS_SCANNING) { 4951e716630dSMartin Matuska fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); 4952e716630dSMartin Matuska } 4953e716630dSMartin Matuska 4954e716630dSMartin Matuska mutex_enter(&vdrz->vd_expand_lock); 4955e716630dSMartin Matuska if (!avl_is_empty(&vdrz->vd_expand_txgs)) { 4956e716630dSMartin Matuska uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs); 4957e716630dSMartin Matuska uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count, 4958e716630dSMartin Matuska KM_SLEEP); 4959e716630dSMartin Matuska uint64_t i = 0; 4960e716630dSMartin Matuska 4961e716630dSMartin Matuska for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs); 4962e716630dSMartin Matuska re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) { 4963e716630dSMartin Matuska txgs[i++] = re->re_txg; 4964e716630dSMartin Matuska } 4965e716630dSMartin Matuska 4966e716630dSMartin Matuska fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, 4967e716630dSMartin Matuska txgs, count); 4968e716630dSMartin Matuska 4969e716630dSMartin Matuska kmem_free(txgs, sizeof (uint64_t) * count); 4970e716630dSMartin Matuska } 4971e716630dSMartin Matuska mutex_exit(&vdrz->vd_expand_lock); 49727877fdebSMatt Macy } 49737877fdebSMatt Macy 49747877fdebSMatt Macy static uint64_t 49757877fdebSMatt Macy vdev_raidz_nparity(vdev_t *vd) 49767877fdebSMatt Macy { 49777877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd; 49787877fdebSMatt Macy return (vdrz->vd_nparity); 49797877fdebSMatt Macy } 49807877fdebSMatt Macy 49817877fdebSMatt Macy static uint64_t 49827877fdebSMatt Macy vdev_raidz_ndisks(vdev_t *vd) 49837877fdebSMatt Macy { 49847877fdebSMatt Macy return (vd->vdev_children); 4985eda14cbcSMatt Macy } 4986eda14cbcSMatt Macy 4987eda14cbcSMatt Macy vdev_ops_t vdev_raidz_ops = { 49887877fdebSMatt Macy .vdev_op_init = vdev_raidz_init, 49897877fdebSMatt Macy .vdev_op_fini = vdev_raidz_fini, 4990eda14cbcSMatt Macy .vdev_op_open = vdev_raidz_open, 4991eda14cbcSMatt Macy .vdev_op_close = vdev_raidz_close, 4992eda14cbcSMatt Macy .vdev_op_asize = vdev_raidz_asize, 49937877fdebSMatt Macy .vdev_op_min_asize = vdev_raidz_min_asize, 49947877fdebSMatt Macy .vdev_op_min_alloc = NULL, 4995eda14cbcSMatt Macy .vdev_op_io_start = vdev_raidz_io_start, 4996eda14cbcSMatt Macy .vdev_op_io_done = vdev_raidz_io_done, 4997eda14cbcSMatt Macy .vdev_op_state_change = vdev_raidz_state_change, 4998eda14cbcSMatt Macy .vdev_op_need_resilver = vdev_raidz_need_resilver, 4999eda14cbcSMatt Macy .vdev_op_hold = NULL, 5000eda14cbcSMatt Macy .vdev_op_rele = NULL, 5001eda14cbcSMatt Macy .vdev_op_remap = NULL, 5002eda14cbcSMatt Macy .vdev_op_xlate = vdev_raidz_xlate, 50037877fdebSMatt Macy .vdev_op_rebuild_asize = NULL, 50047877fdebSMatt Macy .vdev_op_metaslab_init = NULL, 50057877fdebSMatt Macy 
.vdev_op_config_generate = vdev_raidz_config_generate, 50067877fdebSMatt Macy .vdev_op_nparity = vdev_raidz_nparity, 50077877fdebSMatt Macy .vdev_op_ndisks = vdev_raidz_ndisks, 5008eda14cbcSMatt Macy .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */ 5009eda14cbcSMatt Macy .vdev_op_leaf = B_FALSE /* not a leaf vdev */ 5010eda14cbcSMatt Macy }; 5011e716630dSMartin Matuska 5012e716630dSMartin Matuska /* BEGIN CSTYLED */ 5013e716630dSMartin Matuska ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW, 5014e716630dSMartin Matuska "For testing, pause RAIDZ expansion after reflowing this many bytes"); 5015e716630dSMartin Matuska ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW, 5016e716630dSMartin Matuska "Max amount of concurrent i/o for RAIDZ expansion"); 5017e716630dSMartin Matuska ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW, 5018e716630dSMartin Matuska "For expanded RAIDZ, aggregate reads that have more rows than this"); 5019e716630dSMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW, 5020e716630dSMartin Matuska "For expanded RAIDZ, automatically start a pool scrub when expansion " 5021e716630dSMartin Matuska "completes"); 5022e716630dSMartin Matuska /* END CSTYLED */ 5023
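/*
 * For example, on Linux these typically appear as zfs module parameters,
 * e.g. /sys/module/zfs/parameters/raidz_expand_max_reflow_bytes; other
 * platforms expose them through their own ZFS_MODULE_PARAM plumbing.
 */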