/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
 * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zap.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu_tx.h>
#include <sys/abd.h>
#include <sys/zfs_rlock.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
#include <sys/vdev_raidz.h>
#include <sys/vdev_raidz_impl.h>
#include <sys/vdev_draid.h>
#include <sys/uberblock_impl.h>
#include <sys/dsl_scan.h>

#ifdef ZFS_DEBUG
#include <sys/vdev.h>	/* For vdev_xlate() in vdev_raidz_io_verify() */
#endif

/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all
 * elements can be expressed with a single byte. Briefly, the operations on
 * the field are defined as follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). The inverse of a field element A (A^-1) is therefore
 * A ^ (255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q, R over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property which is why the uncorrected Plank method breaks down.)
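 *
 * As an illustrative sketch only (not code used by this file), the byte-wise
 * arithmetic above could be modeled in userland C, assuming <stdint.h>, as:
 *
 *	static uint8_t
 *	gf_mul2(uint8_t a)
 *	{
 *		return ((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
 *	}
 *
 *	static void
 *	gf_gen_pq(const uint8_t *d, int n, uint8_t *p, uint8_t *q)
 *	{
 *		*p = *q = 0;
 *		for (int i = 0; i < n; i++) {
 *			*p ^= d[i];
 *			*q = gf_mul2(*q) ^ d[i];
 *		}
 *	}
 *
 * gf_gen_pq() is the Horner form of the Q equation above: each iteration
 * multiplies the running Q value by the generator 2 and then adds (XORs)
 * the next data byte, while P is simply the XOR of all the data bytes.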
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}


/*
 * Big Theory Statement for how a RAIDZ VDEV is expanded
 *
 * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion
 * works with all three RAIDZ parity choices: RAIDZ1, 2, or 3. VDEVs that
 * have been previously expanded can be expanded again.
 *
 * The RAIDZ VDEV must be healthy (able to write to all the drives in the
 * VDEV) when an expansion starts. The expansion will pause if any disk in
 * the VDEV fails, and will resume once the VDEV is healthy again. All other
 * operations on the pool can continue while an expansion is in progress
 * (e.g. read/write, snapshot, zpool add, etc), except for zpool checkpoint,
 * zpool trim, and zpool initialize, which cannot be run during an expansion.
 * Following a reboot or export/import, the expansion resumes where it left
 * off.
 *
 * == Reflowing the Data ==
 *
 * The expansion involves reflowing (copying) the data from the current set
 * of disks to spread it across the new set which now has one more disk. This
 * reflow operation is similar to reflowing text when the column width of a
 * text editor window is expanded. The text doesn’t change but the location of
 * the text changes to accommodate the new width. An example reflow result for
 * a 4-wide RAIDZ1 to a 5-wide is shown below.
 *
 *                               Reflow End State
 *            Each letter indicates a parity group (logical stripe)
 *
 *             Before expansion                         After Expansion
 *     D1     D2     D3     D4               D1     D2     D3     D4     D5
 *  +------+------+------+------+         +------+------+------+------+------+
 *  |      |      |      |      |         |      |      |      |      |      |
 *  |  A   |  A   |  A   |  A   |         |  A   |  A   |  A   |  A   |  B   |
 *  |     1|     2|     3|     4|         |     1|     2|     3|     4|     5|
 *  +------+------+------+------+         +------+------+------+------+------+
 *  |      |      |      |      |         |      |      |      |      |      |
 *  |  B   |  B   |  C   |  C   |         |  B   |  C   |  C   |  C   |  C   |
 *  |     5|     6|     7|     8|         |     6|     7|     8|     9|    10|
 *  +------+------+------+------+         +------+------+------+------+------+
 *  |      |      |      |      |         |      |      |      |      |      |
 *  |  C   |  C   |  D   |  D   |         |  D   |  D   |  E   |  E   |  E   |
 *  |     9|    10|    11|    12|         |    11|    12|    13|    14|    15|
 *  +------+------+------+------+         +------+------+------+------+------+
 *  |      |      |      |      |         |      |      |      |      |      |
 *  |  E   |  E   |  E   |  E   |   -->   |  E   |  F   |  F   |  G   |  G   |
 *  |    13|    14|    15|    16|         |    16|    17|    18|    19|    20|
 *  +------+------+------+------+         +------+------+------+------+------+
 *  |      |      |      |      |         |      |      |      |      |      |
 *  |  F   |  F   |  G   |  G   |         |  G   |  G   |  H   |  H   |  H   |
 *  |    17|    18|    19|    20|         |    21|    22|    23|    24|    25|
 *  +------+------+------+------+         +------+------+------+------+------+
 *  |      |      |      |      |         |      |      |      |      |      |
 *  |  G   |  G   |  H   |  H   |         |  H   |  I   |  I   |  J   |  J   |
 *  |    21|    22|    23|    24|         |    26|    27|    28|    29|    30|
 *  +------+------+------+------+         +------+------+------+------+------+
 *  |      |      |      |      |         |      |      |      |      |      |
 *  |  H   |  H   |  I   |  I   |         |  J   |  J   |      |      |  K   |
 *  |    25|    26|    27|    28|         |    31|    32|    33|    34|    35|
 *  +------+------+------+------+         +------+------+------+------+------+
 *
 * This reflow approach has several advantages. There is no need to read or
 * modify the block pointers or recompute any block checksums.  The reflow
 * doesn’t need to know where the parity sectors reside. We can read and write
 * data sequentially and the copy can occur in a background thread in open
 * context. The design also allows for fast discovery of what data to copy.
 *
 * The VDEV metaslabs are processed, one at a time, to copy the block data to
 * have it flow across all the disks. The metaslab is disabled for allocations
 * during the copy. As an optimization, we only copy the allocated data which
 * can be determined by looking at the metaslab range tree. During the copy we
 * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still
 * need to be able to survive losing parity count disks).
 * This means we cannot overwrite data during the reflow that would be needed
 * if a disk is lost.
 *
 * After the reflow completes, all newly-written blocks will have the new
 * layout, i.e., they will have the parity to data ratio implied by the new
 * number of disks in the RAIDZ group. Even though the reflow copies all of
 * the allocated space (data and parity), it is only rearranged, not changed.
 *
 * This act of reflowing the data has a few implications about blocks
 * that were written before the reflow completes:
 *
 *  - Old blocks will still use the same amount of space (i.e., they will have
 *    the parity to data ratio implied by the old number of disks in the RAIDZ
 *    group).
 *  - Reading old blocks will be slightly slower than before the reflow, for
 *    two reasons. First, we will have to read from all disks in the RAIDZ
 *    VDEV, rather than being able to skip the children that contain only
 *    parity of this block (because the data of a single block is now spread
 *    out across all the disks). Second, in most cases there will be an extra
 *    bcopy, needed to rearrange the data back to its original layout in
 *    memory.
 *
 * == Scratch Area ==
 *
 * As we copy the block data, we can only progress to the point that writes
 * will not overlap with blocks whose progress has not yet been recorded on
 * disk. Since partially-copied rows are always read from the old location,
 * we need to stop one row before the sector-wise overlap, to prevent any
 * row-wise overlap. For example, in the diagram above, when we reflow sector
 * B6 it will overwrite the original location of B5.
 *
 * To get around this, a scratch space is used so that we can start copying
 * without risking data loss by overlapping the row. As an added benefit, it
 * improves performance at the beginning of the reflow, but that small perf
 * boost wouldn't be worth the complexity on its own.
 *
 * Ideally we want to copy at least 2 * (new_width)^2 so that we have a
 * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max
 * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice
 * the widths will likely be single digits so we can get a substantial chunk
 * size using only a few MB of scratch per disk.
 *
 * The scratch area is persisted to disk and holds a large amount of reflowed
 * state.
 * We can always read the partially written stripes when a disk fails or the
 * copy is interrupted (crash) during the initial copying phase, and it also
 * lets us get past a small chunk size restriction. At a minimum, the scratch
 * space must be large enough to get us to the point that one row does not
 * overlap itself when moved (i.e., new_width^2). But going larger is even
 * better. We use the 3.5 MiB reserved "boot" space that resides after the
 * ZFS disk labels as our scratch space to handle overwriting the initial
 * part of the VDEV.
 *
 *	0     256K   512K                    4M
 *	+------+------+-----------------------+-----------------------------
 *	| VDEV | VDEV | Boot Block (3.5M)     | Allocatable space ...
 *	|  L0  |  L1  |       Reserved        |     (Metaslabs)
 *	+------+------+-----------------------+-------------------------------
 *	                 Scratch Area
 *
 * == Reflow Progress Updates ==
 *
 * After the initial scratch-based reflow, the expansion process works
 * similarly to device removal. We create a new open context thread which
 * reflows the data, and periodically kicks off sync tasks to update logical
 * state. In this case, state is the committed progress (offset of next data
 * to copy). We need to persist the completed offset on disk, so that if we
 * crash we know which format each VDEV offset is in.
 *
 * == Time Dependent Geometry ==
 *
 * In non-expanded RAIDZ, blocks are read from disk in a column by column
 * fashion. For a multi-row block, the second sector is in the first column
 * not in the second column. This allows us to issue full reads for each
 * column directly into the request buffer. The block data is thus laid out
 * sequentially in a column-by-column fashion.
 *
 * For example, in the before expansion diagram above, one logical block might
 * be sectors G19-H26. The parity is in G19,H23; and the data is in
 * G20,H24,G21,H25,G22,H26.
 *
 * After a block is reflowed, the sectors that were all in the original column
 * data can now reside in different columns. When reading from an expanded
 * VDEV, we need to know the logical stripe width for each block so we can
 * reconstitute the block’s data after the reads are completed. Likewise,
 * when we perform the combinatorial reconstruction we need to know the
 * original width so we can retry combinations from the past layouts.
 *
 * Time dependent geometry is what we call having blocks with different
 * layouts (stripe widths) in the same VDEV.
 * This time-dependent geometry uses the block’s birth time (+ the time
 * expansion ended) to establish the correct width for a given block. After
 * an expansion completes, we record the time for blocks written with a
 * particular width (geometry).
 *
 * == On Disk Format Changes ==
 *
 * A new pool feature flag, 'raidz_expansion', whose reference count is the
 * number of RAIDZ VDEVs that have been expanded.
 *
 * The blocks on an expanded RAIDZ VDEV can have different logical stripe
 * widths.
 *
 * Since the uberblock can point to arbitrary blocks, which might be on the
 * expanding RAIDZ and might or might not have been expanded, we need to know
 * which way a block is laid out before reading it. This info is the next
 * offset that needs to be reflowed and we persist that in the uberblock, in
 * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev
 * label. After the expansion is complete, we then use the raidz_expand_txgs
 * array (see below) to determine how to read a block and the
 * ub_raidz_reflow_info field is no longer required.
 *
 * The uberblock's ub_raidz_reflow_info field also holds the scratch space
 * state (i.e., active or not) which is also required before reading a block
 * during the initial phase of reflowing the data.
 *
 * The top-level RAIDZ VDEV has two new entries in the nvlist:
 *
 * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here
 *                            and used after the expansion is complete to
 *                            determine how to read a raidz block
 * 'raidz_expanding' boolean: present during reflow and removed after
 *                            completion; used during a spa import to resume
 *                            an unfinished expansion
 *
 * And finally the VDEV's top zap adds the following informational entries:
 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE
 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME
 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME
 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED
 */

/*
 * For testing only: pause the raidz expansion after reflowing this amount.
 * (accessed by ZTS and ztest)
 */
#ifdef	_KERNEL
static
#endif	/* _KERNEL */
unsigned long raidz_expand_max_reflow_bytes = 0;

/*
 * For testing only: pause the raidz expansion at a certain point.
 */
uint_t raidz_expand_pause_point = 0;

/*
 * Maximum amount of copy I/O outstanding at once.
 */
static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE;

/*
 * Apply raidz map abds aggregation if the number of rows in the map is equal
 * to or greater than the value below.
 */
static unsigned long raidz_io_aggregate_rows = 4;

/*
 * Automatically start a pool scrub when a RAIDZ expansion completes in
 * order to verify the checksums of all blocks which have been copied
 * during the expansion. Automatic scrubbing is enabled by default and
 * is strongly recommended.
 */
static int zfs_scrub_after_expand = 1;

static void
vdev_raidz_row_free(raidz_row_t *rr)
{
	for (int c = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		if (rc->rc_size != 0)
			abd_free(rc->rc_abd);
		if (rc->rc_orig_data != NULL)
			abd_free(rc->rc_orig_data);
	}

	if (rr->rr_abd_empty != NULL)
		abd_free(rr->rr_abd_empty);

	kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols]));
}

void
vdev_raidz_map_free(raidz_map_t *rm)
{
	for (int i = 0; i < rm->rm_nrows; i++)
		vdev_raidz_row_free(rm->rm_row[i]);

	if (rm->rm_nphys_cols) {
		for (int i = 0; i < rm->rm_nphys_cols; i++) {
			if (rm->rm_phys_col[i].rc_abd != NULL)
				abd_free(rm->rm_phys_col[i].rc_abd);
		}

		kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) *
		    rm->rm_nphys_cols);
	}

	ASSERT3P(rm->rm_lr, ==, NULL);
	kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
}

static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	vdev_raidz_map_free(rm);
}

static int
vdev_raidz_reflow_compare(const void *x1, const void *x2)
{
	const reflow_node_t *l = x1;
	const reflow_node_t *r = x2;

	return (TREE_CMP(l->re_txg,
	    r->re_txg));
}

const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	.vsd_free = vdev_raidz_map_free_vsd,
};

raidz_row_t *
vdev_raidz_row_alloc(int cols, zio_t *zio)
{
	raidz_row_t *rr =
	    kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP);

	rr->rr_cols = cols;
	rr->rr_scols = cols;

	for (int c = 0; c < cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		rc->rc_shadow_devidx = INT_MAX;
		rc->rc_shadow_offset = UINT64_MAX;
		/*
		 * We can not allow self healing to take place for Direct I/O
		 * reads. There is nothing that stops the buffer contents from
		 * being manipulated while the I/O is in flight. It is possible
		 * that the checksum could be verified on the buffer and then
		 * the contents of that buffer are manipulated afterwards. This
		 * could lead to bad data being written out during self
		 * healing.
		 */
		if (!(zio->io_flags & ZIO_FLAG_DIO_READ))
			rc->rc_allow_repair = 1;
	}
	return (rr);
}

static void
vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift)
{
	int c;
	int nwrapped = 0;
	uint64_t off = 0;
	raidz_row_t *rr = rm->rm_row[0];

	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
	ASSERT3U(rm->rm_nrows, ==, 1);

	/*
	 * Pad any parity columns with additional space to account for skip
	 * sectors.
	 */
	if (rm->rm_skipstart < rr->rr_firstdatacol) {
		ASSERT0(rm->rm_skipstart);
		nwrapped = rm->rm_nskip;
	} else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) {
		nwrapped =
		    (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols;
	}

	/*
	 * Optional single skip sectors (rc_size == 0) will be handled in
	 * vdev_raidz_io_start_write().
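	 *
	 * As a hypothetical worked example of the skip accounting in this
	 * function: a two-sector write to a 5-wide RAIDZ2 produces
	 * rr_scols = 5, rr_cols = 4, rm_skipstart = 4 and rm_nskip = 2.
	 * The skip sectors wrap past the end of the row, so nwrapped =
	 * (4 + 2) % 5 = 1 and the first parity column is padded with one
	 * extra sector below; the remaining skip sector is covered by the
	 * unused fifth column (rr_scols - rr_cols).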
	 */
	int skipped = rr->rr_scols - rr->rr_cols;

	/* Allocate buffers for the parity columns */
	for (c = 0; c < rr->rr_firstdatacol; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		/*
		 * Parity columns will pad out a linear ABD to account for
		 * the skip sector. A linear ABD is used here because
		 * parity calculations use the ABD buffer directly to calculate
		 * parity. This avoids doing a memcpy back to the ABD after the
		 * parity has been calculated. By issuing the parity column
		 * with the skip sector we can reduce contention on the child
		 * VDEV queue locks (vq_lock).
		 */
		if (c < nwrapped) {
			rc->rc_abd = abd_alloc_linear(
			    rc->rc_size + (1ULL << ashift), B_FALSE);
			abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift);
			skipped++;
		} else {
			rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
		}
	}

	for (off = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct,
		    zio->io_abd, off, rc->rc_size);

		/*
		 * Generate I/O for skip sectors to improve aggregation
		 * continuity. We will use gang ABD's to reduce contention
		 * on the child VDEV queue locks (vq_lock) by issuing
		 * a single I/O that contains the data and skip sector.
		 *
		 * It is important to make sure that rc_size is not updated
		 * even though we are adding a skip sector to the ABD. When
		 * calculating the parity in vdev_raidz_generate_parity_row()
		 * the rc_size is used to iterate through the ABD's. We can
		 * not have zero'd out skip sectors used for calculating
		 * parity for raidz, because those same sectors are not used
		 * during reconstruction.
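		 *
		 * As a hypothetical example with a 4 KiB sector size
		 * (ashift = 12), the gang ABD issued to the child below is
		 * rc_size + 4 KiB long, while parity generation continues to
		 * iterate over only the first rc_size bytes of the column.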
		 */
		if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) {
			rc->rc_abd = abd_alloc_gang();
			abd_gang_add(rc->rc_abd, abd, B_TRUE);
			abd_gang_add(rc->rc_abd,
			    abd_get_zeros(1ULL << ashift), B_TRUE);
			skipped++;
		} else {
			rc->rc_abd = abd;
		}
		off += rc->rc_size;
	}

	ASSERT3U(off, ==, zio->io_size);
	ASSERT3S(skipped, ==, rm->rm_nskip);
}

static void
vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm)
{
	int c;
	raidz_row_t *rr = rm->rm_row[0];

	ASSERT3U(rm->rm_nrows, ==, 1);

	/* Allocate buffers for the parity columns */
	for (c = 0; c < rr->rr_firstdatacol; c++)
		rr->rr_col[c].rc_abd =
		    abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);

	for (uint64_t off = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
		    zio->io_abd, off, rc->rc_size);
		off += rc->rc_size;
	}
}

/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 *
 * Avoid inlining the function to keep vdev_raidz_io_start(), which
 * is this function's only caller, as small as possible on the stack.
 */
noinline raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
    uint64_t nparity)
{
	raidz_row_t *rr;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = zio->io_offset >> ashift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = zio->io_size >> ashift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << ashift;
	uint64_t acols, scols;

	raidz_map_t *rm =
	    kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
	rm->rm_nrows = 1;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
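	 *
	 * As a hypothetical example, a 12-sector write to a 5-wide RAIDZ1
	 * (dcols = 5, nparity = 1) has q = 12 / 4 = 3; the "remainder" and
	 * "big column" terms are defined below.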
	 */
	uint64_t q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	uint64_t r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	uint64_t bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/*
	 * acols: The columns that will be accessed.
	 * scols: The columns that will be accessed or skipped.
	 */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);
	rr = vdev_raidz_row_alloc(scols, zio);
	rm->rm_row[0] = rr;
	rr->rr_cols = acols;
	rr->rr_bigcols = bc;
	rr->rr_firstdatacol = nparity;
#ifdef ZFS_DEBUG
	rr->rr_offset = zio->io_offset;
	rr->rr_size = zio->io_size;
#endif

	uint64_t asize = 0;

	for (uint64_t c = 0; c < scols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		uint64_t col = f + c;
		uint64_t coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << ashift;
		}
		rc->rc_devidx = col;
		rc->rc_offset = coff;

		if (c >= acols)
			rc->rc_size = 0;
		else if (c < bc)
			rc->rc_size = (q + 1) << ashift;
		else
			rc->rc_size = q << ashift;

		asize += rc->rc_size;
	}

	ASSERT3U(asize, ==, tot << ashift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	rm->rm_skipstart = bc;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory.
	 * As a practical matter unless we juggle the parity between all
	 * devices evenly, we won't see any benefit. Further, occasional
	 * writes that aren't a multiple of the LCM of the number of children
	 * and the minimum stripe width are sufficient to avoid pessimal
	 * behavior. Unfortunately, this decision created an implicit on-disk
	 * format requirement that we need to support for all eternity, but
	 * only for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rr->rr_cols >= 2);
	ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);

	if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
		uint64_t devidx = rr->rr_col[0].rc_devidx;
		o = rr->rr_col[0].rc_offset;
		rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
		rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
		rr->rr_col[1].rc_devidx = devidx;
		rr->rr_col[1].rc_offset = o;
		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_map_alloc_write(zio, rm, ashift);
	} else {
		vdev_raidz_map_alloc_read(zio, rm);
	}
	/* init RAIDZ parity ops */
	rm->rm_ops = vdev_raidz_math_get_ops();

	return (rm);
}

/*
 * Everything before reflow_offset_synced should have been moved to the new
 * location (read and write completed). However, this may not yet be reflected
 * in the on-disk format (e.g. raidz_reflow_sync() has been called but the
 * uberblock has not yet been written). If reflow is not in progress,
 * reflow_offset_synced should be UINT64_MAX. For each row, if the row is
 * entirely before reflow_offset_synced, it will come from the new location.
 * Otherwise this row will come from the old location. Therefore, rows that
 * straddle the reflow_offset_synced will come from the old location.
 *
 * For writes, reflow_offset_next is the next offset to copy. If a sector has
 * been copied, but not yet reflected in the on-disk progress
 * (reflow_offset_synced), it will also be written to the new (already copied)
 * offset.
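 *
 * As a hypothetical illustration of the read rule above: if
 * reflow_offset_synced >> ashift is 1000 and a row occupies parent sectors
 * 995-1004, the row straddles the boundary and is read from the old
 * location, while a row whose last sector is at or below 999 is read
 * entirely from the new location.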
 */
noinline raidz_map_t *
vdev_raidz_map_alloc_expanded(zio_t *zio,
    uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
    uint64_t nparity, uint64_t reflow_offset_synced,
    uint64_t reflow_offset_next, boolean_t use_scratch)
{
	abd_t *abd = zio->io_abd;
	uint64_t offset = zio->io_offset;
	uint64_t size = zio->io_size;

	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> ashift;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 * AKA "full rows"
	 */
	uint64_t q = s / (logical_cols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	uint64_t r = s - q * (logical_cols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	uint64_t bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* How many rows contain data (not skip) */
	uint64_t rows = howmany(tot, logical_cols);
	int cols = MIN(tot, logical_cols);

	raidz_map_t *rm =
	    kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
	    KM_SLEEP);
	rm->rm_nrows = rows;
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	rm->rm_skipstart = bc;
	uint64_t asize = 0;

	for (uint64_t row = 0; row < rows; row++) {
		boolean_t row_use_scratch = B_FALSE;
		raidz_row_t *rr = vdev_raidz_row_alloc(cols, zio);
		rm->rm_row[row] = rr;

		/* The starting RAIDZ (parent) vdev sector of the row. */
		uint64_t b = (offset >> ashift) + row * logical_cols;

		/*
		 * If we are in the middle of a reflow, and the copying has
		 * not yet completed for any part of this row, then use the
		 * old location of this row.
		 * Note that reflow_offset_synced reflects the i/o that's
		 * been completed, because it's updated by a synctask, after
		 * zio_wait(spa_txg_zio[]).  This is sufficient for our check,
		 * even if that progress has not yet been recorded to disk
		 * (reflected in spa_ubsync).  Also note that we consider the
		 * last row to be "full width" (`cols`-wide rather than
		 * `bc`-wide) for this calculation.  This causes a tiny bit
		 * of unnecessary double-writes but is safe and simpler to
		 * calculate.
		 */
		int row_phys_cols = physical_cols;
		if (b + cols > reflow_offset_synced >> ashift)
			row_phys_cols--;
		else if (use_scratch)
			row_use_scratch = B_TRUE;

		/* starting child of this row */
		uint64_t child_id = b % row_phys_cols;
		/* The starting byte offset on each child vdev. */
		uint64_t child_offset = (b / row_phys_cols) << ashift;

		/*
		 * Note, rr_cols is the entire width of the block, even
		 * if this row is shorter.  This is needed because parity
		 * generation (for Q and R) needs to know the entire width,
		 * because it treats the short row as though it was
		 * full-width (and the "phantom" sectors were zero-filled).
		 *
		 * Another approach to this would be to set cols shorter
		 * (to just the number of columns that we might do i/o to)
		 * and have another mechanism to tell the parity generation
		 * about the "entire width".  Reconstruction (at least
		 * vdev_raidz_reconstruct_general()) would also need to
		 * know about the "entire width".
		 */
		rr->rr_firstdatacol = nparity;
#ifdef ZFS_DEBUG
		/*
		 * note: rr_size is PSIZE, not ASIZE
		 */
		rr->rr_offset = b << ashift;
		rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift;
#endif

		for (int c = 0; c < rr->rr_cols; c++, child_id++) {
			if (child_id >= row_phys_cols) {
				child_id -= row_phys_cols;
				child_offset += 1ULL << ashift;
			}
			raidz_col_t *rc = &rr->rr_col[c];
			rc->rc_devidx = child_id;
			rc->rc_offset = child_offset;

			/*
			 * Get this from the scratch space if appropriate.
			 * This only happens if we crashed in the middle of
			 * raidz_reflow_scratch_sync() (while it's running,
			 * the rangelock prevents us from doing concurrent
			 * io), and even then only during zpool import or
			 * when the pool is imported readonly.
			 */
			if (row_use_scratch)
				rc->rc_offset -= VDEV_BOOT_SIZE;

			uint64_t dc = c - rr->rr_firstdatacol;
			if (c < rr->rr_firstdatacol) {
				rc->rc_size = 1ULL << ashift;

				/*
				 * Parity sectors' rc_abd's are set below
				 * after determining if this is an aggregation.
				 */
			} else if (row == rows - 1 && bc != 0 && c >= bc) {
				/*
				 * Past the end of the block (even including
				 * skip sectors).  This sector is part of the
				 * map so that we have full rows for p/q parity
				 * generation.
				 */
				rc->rc_size = 0;
				rc->rc_abd = NULL;
			} else {
				/* "data column" (col excluding parity) */
				uint64_t off;

				if (c < bc || r == 0) {
					off = dc * rows + row;
				} else {
					off = r * rows +
					    (dc - r) * (rows - 1) + row;
				}
				rc->rc_size = 1ULL << ashift;
				rc->rc_abd = abd_get_offset_struct(
				    &rc->rc_abdstruct, abd, off << ashift,
				    rc->rc_size);
			}

			if (rc->rc_size == 0)
				continue;

			/*
			 * If any part of this row is in both old and new
			 * locations, the primary location is the old
			 * location. If this sector was already copied to the
			 * new location, we need to also write to the new,
			 * "shadow" location.
			 *
			 * Note, `row_phys_cols != physical_cols` indicates
			 * that the primary location is the old location.
			 * `b+c < reflow_offset_next` indicates that the copy
			 * to the new location has been initiated. We know
			 * that the copy has completed because we have the
			 * rangelock, which is held exclusively while the
			 * copy is in progress.
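			 *
			 * As a hypothetical illustration: if
			 * reflow_offset_next >> ashift is 2000, the primary
			 * location is the old one (row_phys_cols !=
			 * physical_cols), and this sector's parent offset
			 * b + c is 1990, then the sector is also written to
			 * its shadow (new-location) column computed below.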
			 */
			if (row_use_scratch ||
			    (row_phys_cols != physical_cols &&
			    b + c < reflow_offset_next >> ashift)) {
				rc->rc_shadow_devidx = (b + c) % physical_cols;
				rc->rc_shadow_offset =
				    ((b + c) / physical_cols) << ashift;
				if (row_use_scratch)
					rc->rc_shadow_offset -= VDEV_BOOT_SIZE;
			}

			asize += rc->rc_size;
		}

		/*
		 * See comment in vdev_raidz_map_alloc()
		 */
		if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
		    (offset & (1ULL << 20))) {
			ASSERT(rr->rr_cols >= 2);
			ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);

			int devidx0 = rr->rr_col[0].rc_devidx;
			uint64_t offset0 = rr->rr_col[0].rc_offset;
			int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx;
			uint64_t shadow_offset0 =
			    rr->rr_col[0].rc_shadow_offset;

			rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
			rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
			rr->rr_col[0].rc_shadow_devidx =
			    rr->rr_col[1].rc_shadow_devidx;
			rr->rr_col[0].rc_shadow_offset =
			    rr->rr_col[1].rc_shadow_offset;

			rr->rr_col[1].rc_devidx = devidx0;
			rr->rr_col[1].rc_offset = offset0;
			rr->rr_col[1].rc_shadow_devidx = shadow_devidx0;
			rr->rr_col[1].rc_shadow_offset = shadow_offset0;
		}
	}
	ASSERT3U(asize, ==, tot << ashift);

	/*
	 * Determine if the block is contiguous, in which case we can use
	 * an aggregation.
	 */
	if (rows >= raidz_io_aggregate_rows) {
		rm->rm_nphys_cols = physical_cols;
		rm->rm_phys_col =
		    kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols,
		    KM_SLEEP);

		/*
		 * Determine the aggregate io's offset and size, and check
		 * that the io is contiguous.
957e716630dSMartin Matuska */ 958e716630dSMartin Matuska for (int i = 0; 959e716630dSMartin Matuska i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) { 960e716630dSMartin Matuska raidz_row_t *rr = rm->rm_row[i]; 961e716630dSMartin Matuska for (int c = 0; c < rr->rr_cols; c++) { 962e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c]; 963e716630dSMartin Matuska raidz_col_t *prc = 964e716630dSMartin Matuska &rm->rm_phys_col[rc->rc_devidx]; 965e716630dSMartin Matuska 966e716630dSMartin Matuska if (rc->rc_size == 0) 967e716630dSMartin Matuska continue; 968e716630dSMartin Matuska 969e716630dSMartin Matuska if (prc->rc_size == 0) { 970e716630dSMartin Matuska ASSERT0(prc->rc_offset); 971e716630dSMartin Matuska prc->rc_offset = rc->rc_offset; 972e716630dSMartin Matuska } else if (prc->rc_offset + prc->rc_size != 973e716630dSMartin Matuska rc->rc_offset) { 974e716630dSMartin Matuska /* 975e716630dSMartin Matuska * This block is not contiguous and 976e716630dSMartin Matuska * therefore can't be aggregated. 977e716630dSMartin Matuska * This is expected to be rare, so 978e716630dSMartin Matuska * the cost of allocating and then 979e716630dSMartin Matuska * freeing rm_phys_col is not 980e716630dSMartin Matuska * significant. 981e716630dSMartin Matuska */ 982e716630dSMartin Matuska kmem_free(rm->rm_phys_col, 983e716630dSMartin Matuska sizeof (raidz_col_t) * 984e716630dSMartin Matuska rm->rm_nphys_cols); 985e716630dSMartin Matuska rm->rm_phys_col = NULL; 986e716630dSMartin Matuska rm->rm_nphys_cols = 0; 987e716630dSMartin Matuska break; 988e716630dSMartin Matuska } 989e716630dSMartin Matuska prc->rc_size += rc->rc_size; 990e716630dSMartin Matuska } 991e716630dSMartin Matuska } 992e716630dSMartin Matuska } 993e716630dSMartin Matuska if (rm->rm_phys_col != NULL) { 994e716630dSMartin Matuska /* 995e716630dSMartin Matuska * Allocate aggregate ABD's. 996e716630dSMartin Matuska */ 997e716630dSMartin Matuska for (int i = 0; i < rm->rm_nphys_cols; i++) { 998e716630dSMartin Matuska raidz_col_t *prc = &rm->rm_phys_col[i]; 999e716630dSMartin Matuska 1000e716630dSMartin Matuska prc->rc_devidx = i; 1001e716630dSMartin Matuska 1002e716630dSMartin Matuska if (prc->rc_size == 0) 1003e716630dSMartin Matuska continue; 1004e716630dSMartin Matuska 1005e716630dSMartin Matuska prc->rc_abd = 1006e716630dSMartin Matuska abd_alloc_linear(rm->rm_phys_col[i].rc_size, 1007e716630dSMartin Matuska B_FALSE); 1008e716630dSMartin Matuska } 1009e716630dSMartin Matuska 1010e716630dSMartin Matuska /* 1011e716630dSMartin Matuska * Point the parity abd's into the aggregate abd's. 1012e716630dSMartin Matuska */ 1013e716630dSMartin Matuska for (int i = 0; i < rm->rm_nrows; i++) { 1014e716630dSMartin Matuska raidz_row_t *rr = rm->rm_row[i]; 1015e716630dSMartin Matuska for (int c = 0; c < rr->rr_firstdatacol; c++) { 1016e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c]; 1017e716630dSMartin Matuska raidz_col_t *prc = 1018e716630dSMartin Matuska &rm->rm_phys_col[rc->rc_devidx]; 1019e716630dSMartin Matuska rc->rc_abd = 1020e716630dSMartin Matuska abd_get_offset_struct(&rc->rc_abdstruct, 1021e716630dSMartin Matuska prc->rc_abd, 1022e716630dSMartin Matuska rc->rc_offset - prc->rc_offset, 1023e716630dSMartin Matuska rc->rc_size); 1024e716630dSMartin Matuska } 1025e716630dSMartin Matuska } 1026e716630dSMartin Matuska } else { 1027e716630dSMartin Matuska /* 1028e716630dSMartin Matuska * Allocate new abd's for the parity sectors. 
1029e716630dSMartin Matuska */ 1030e716630dSMartin Matuska for (int i = 0; i < rm->rm_nrows; i++) { 1031e716630dSMartin Matuska raidz_row_t *rr = rm->rm_row[i]; 1032e716630dSMartin Matuska for (int c = 0; c < rr->rr_firstdatacol; c++) { 1033e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c]; 1034e716630dSMartin Matuska rc->rc_abd = 1035e716630dSMartin Matuska abd_alloc_linear(rc->rc_size, 1036e716630dSMartin Matuska B_TRUE); 1037e716630dSMartin Matuska } 1038e716630dSMartin Matuska } 1039e716630dSMartin Matuska } 1040eda14cbcSMatt Macy /* init RAIDZ parity ops */ 1041eda14cbcSMatt Macy rm->rm_ops = vdev_raidz_math_get_ops(); 1042eda14cbcSMatt Macy 1043eda14cbcSMatt Macy return (rm); 1044eda14cbcSMatt Macy } 1045eda14cbcSMatt Macy 1046eda14cbcSMatt Macy struct pqr_struct { 1047eda14cbcSMatt Macy uint64_t *p; 1048eda14cbcSMatt Macy uint64_t *q; 1049eda14cbcSMatt Macy uint64_t *r; 1050eda14cbcSMatt Macy }; 1051eda14cbcSMatt Macy 1052eda14cbcSMatt Macy static int 1053eda14cbcSMatt Macy vdev_raidz_p_func(void *buf, size_t size, void *private) 1054eda14cbcSMatt Macy { 1055eda14cbcSMatt Macy struct pqr_struct *pqr = private; 1056eda14cbcSMatt Macy const uint64_t *src = buf; 1057e716630dSMartin Matuska int cnt = size / sizeof (src[0]); 1058eda14cbcSMatt Macy 1059eda14cbcSMatt Macy ASSERT(pqr->p && !pqr->q && !pqr->r); 1060eda14cbcSMatt Macy 1061e716630dSMartin Matuska for (int i = 0; i < cnt; i++, src++, pqr->p++) 1062eda14cbcSMatt Macy *pqr->p ^= *src; 1063eda14cbcSMatt Macy 1064eda14cbcSMatt Macy return (0); 1065eda14cbcSMatt Macy } 1066eda14cbcSMatt Macy 1067eda14cbcSMatt Macy static int 1068eda14cbcSMatt Macy vdev_raidz_pq_func(void *buf, size_t size, void *private) 1069eda14cbcSMatt Macy { 1070eda14cbcSMatt Macy struct pqr_struct *pqr = private; 1071eda14cbcSMatt Macy const uint64_t *src = buf; 1072eda14cbcSMatt Macy uint64_t mask; 1073e716630dSMartin Matuska int cnt = size / sizeof (src[0]); 1074eda14cbcSMatt Macy 1075eda14cbcSMatt Macy ASSERT(pqr->p && pqr->q && !pqr->r); 1076eda14cbcSMatt Macy 1077e716630dSMartin Matuska for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { 1078eda14cbcSMatt Macy *pqr->p ^= *src; 1079eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 1080eda14cbcSMatt Macy *pqr->q ^= *src; 1081eda14cbcSMatt Macy } 1082eda14cbcSMatt Macy 1083eda14cbcSMatt Macy return (0); 1084eda14cbcSMatt Macy } 1085eda14cbcSMatt Macy 1086eda14cbcSMatt Macy static int 1087eda14cbcSMatt Macy vdev_raidz_pqr_func(void *buf, size_t size, void *private) 1088eda14cbcSMatt Macy { 1089eda14cbcSMatt Macy struct pqr_struct *pqr = private; 1090eda14cbcSMatt Macy const uint64_t *src = buf; 1091eda14cbcSMatt Macy uint64_t mask; 1092e716630dSMartin Matuska int cnt = size / sizeof (src[0]); 1093eda14cbcSMatt Macy 1094eda14cbcSMatt Macy ASSERT(pqr->p && pqr->q && pqr->r); 1095eda14cbcSMatt Macy 1096e716630dSMartin Matuska for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { 1097eda14cbcSMatt Macy *pqr->p ^= *src; 1098eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 1099eda14cbcSMatt Macy *pqr->q ^= *src; 1100eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_4(*pqr->r, mask); 1101eda14cbcSMatt Macy *pqr->r ^= *src; 1102eda14cbcSMatt Macy } 1103eda14cbcSMatt Macy 1104eda14cbcSMatt Macy return (0); 1105eda14cbcSMatt Macy } 1106eda14cbcSMatt Macy 1107eda14cbcSMatt Macy static void 11087877fdebSMatt Macy vdev_raidz_generate_parity_p(raidz_row_t *rr) 1109eda14cbcSMatt Macy { 11107877fdebSMatt Macy uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 1111eda14cbcSMatt Macy 
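	/*
	 * P is the plain XOR of all data columns: the first data column
	 * is copied into the parity buffer and each remaining column is
	 * XORed in, one 64-bit word at a time, by vdev_raidz_p_func().
	 */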
11127877fdebSMatt Macy for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 11137877fdebSMatt Macy abd_t *src = rr->rr_col[c].rc_abd; 1114eda14cbcSMatt Macy 11157877fdebSMatt Macy if (c == rr->rr_firstdatacol) { 11167877fdebSMatt Macy abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 1117eda14cbcSMatt Macy } else { 1118eda14cbcSMatt Macy struct pqr_struct pqr = { p, NULL, NULL }; 11197877fdebSMatt Macy (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 1120eda14cbcSMatt Macy vdev_raidz_p_func, &pqr); 1121eda14cbcSMatt Macy } 1122eda14cbcSMatt Macy } 1123eda14cbcSMatt Macy } 1124eda14cbcSMatt Macy 1125eda14cbcSMatt Macy static void 11267877fdebSMatt Macy vdev_raidz_generate_parity_pq(raidz_row_t *rr) 1127eda14cbcSMatt Macy { 11287877fdebSMatt Macy uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 11297877fdebSMatt Macy uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 11307877fdebSMatt Macy uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 11317877fdebSMatt Macy ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 11327877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_Q].rc_size); 1133eda14cbcSMatt Macy 11347877fdebSMatt Macy for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 11357877fdebSMatt Macy abd_t *src = rr->rr_col[c].rc_abd; 1136eda14cbcSMatt Macy 11377877fdebSMatt Macy uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); 1138eda14cbcSMatt Macy 11397877fdebSMatt Macy if (c == rr->rr_firstdatacol) { 1140eda14cbcSMatt Macy ASSERT(ccnt == pcnt || ccnt == 0); 11417877fdebSMatt Macy abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 11427877fdebSMatt Macy (void) memcpy(q, p, rr->rr_col[c].rc_size); 1143eda14cbcSMatt Macy 11447877fdebSMatt Macy for (uint64_t i = ccnt; i < pcnt; i++) { 1145eda14cbcSMatt Macy p[i] = 0; 1146eda14cbcSMatt Macy q[i] = 0; 1147eda14cbcSMatt Macy } 1148eda14cbcSMatt Macy } else { 1149eda14cbcSMatt Macy struct pqr_struct pqr = { p, q, NULL }; 1150eda14cbcSMatt Macy 1151eda14cbcSMatt Macy ASSERT(ccnt <= pcnt); 11527877fdebSMatt Macy (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 1153eda14cbcSMatt Macy vdev_raidz_pq_func, &pqr); 1154eda14cbcSMatt Macy 1155eda14cbcSMatt Macy /* 1156eda14cbcSMatt Macy * Treat short columns as though they are full of 0s. 1157eda14cbcSMatt Macy * Note that there's therefore nothing needed for P. 
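 * For Q, every word past the end of a short column still requires
 * the running parity word to be multiplied by 2 (q = q * 2 + 0),
 * which is exactly what the loop below does.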
1158eda14cbcSMatt Macy */ 11597877fdebSMatt Macy uint64_t mask; 11607877fdebSMatt Macy for (uint64_t i = ccnt; i < pcnt; i++) { 1161eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(q[i], mask); 1162eda14cbcSMatt Macy } 1163eda14cbcSMatt Macy } 1164eda14cbcSMatt Macy } 1165eda14cbcSMatt Macy } 1166eda14cbcSMatt Macy 1167eda14cbcSMatt Macy static void 11687877fdebSMatt Macy vdev_raidz_generate_parity_pqr(raidz_row_t *rr) 1169eda14cbcSMatt Macy { 11707877fdebSMatt Macy uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 11717877fdebSMatt Macy uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 11727877fdebSMatt Macy uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd); 11737877fdebSMatt Macy uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 11747877fdebSMatt Macy ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 11757877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_Q].rc_size); 11767877fdebSMatt Macy ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 11777877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_R].rc_size); 1178eda14cbcSMatt Macy 11797877fdebSMatt Macy for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 11807877fdebSMatt Macy abd_t *src = rr->rr_col[c].rc_abd; 1181eda14cbcSMatt Macy 11827877fdebSMatt Macy uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); 1183eda14cbcSMatt Macy 11847877fdebSMatt Macy if (c == rr->rr_firstdatacol) { 1185eda14cbcSMatt Macy ASSERT(ccnt == pcnt || ccnt == 0); 11867877fdebSMatt Macy abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 11877877fdebSMatt Macy (void) memcpy(q, p, rr->rr_col[c].rc_size); 11887877fdebSMatt Macy (void) memcpy(r, p, rr->rr_col[c].rc_size); 1189eda14cbcSMatt Macy 11907877fdebSMatt Macy for (uint64_t i = ccnt; i < pcnt; i++) { 1191eda14cbcSMatt Macy p[i] = 0; 1192eda14cbcSMatt Macy q[i] = 0; 1193eda14cbcSMatt Macy r[i] = 0; 1194eda14cbcSMatt Macy } 1195eda14cbcSMatt Macy } else { 1196eda14cbcSMatt Macy struct pqr_struct pqr = { p, q, r }; 1197eda14cbcSMatt Macy 1198eda14cbcSMatt Macy ASSERT(ccnt <= pcnt); 11997877fdebSMatt Macy (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 1200eda14cbcSMatt Macy vdev_raidz_pqr_func, &pqr); 1201eda14cbcSMatt Macy 1202eda14cbcSMatt Macy /* 1203eda14cbcSMatt Macy * Treat short columns as though they are full of 0s. 1204eda14cbcSMatt Macy * Note that there's therefore nothing needed for P. 1205eda14cbcSMatt Macy */ 12067877fdebSMatt Macy uint64_t mask; 12077877fdebSMatt Macy for (uint64_t i = ccnt; i < pcnt; i++) { 1208eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(q[i], mask); 1209eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_4(r[i], mask); 1210eda14cbcSMatt Macy } 1211eda14cbcSMatt Macy } 1212eda14cbcSMatt Macy } 1213eda14cbcSMatt Macy } 1214eda14cbcSMatt Macy 1215eda14cbcSMatt Macy /* 1216eda14cbcSMatt Macy * Generate RAID parity in the first virtual columns according to the number of 1217eda14cbcSMatt Macy * parity columns available. 1218eda14cbcSMatt Macy */ 1219eda14cbcSMatt Macy void 12207877fdebSMatt Macy vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr) 1221eda14cbcSMatt Macy { 1222e716630dSMartin Matuska if (rr->rr_cols == 0) { 1223e716630dSMartin Matuska /* 1224e716630dSMartin Matuska * We are handling this block one row at a time (because 1225e716630dSMartin Matuska * this block has a different logical vs physical width, 1226e716630dSMartin Matuska * due to RAIDZ expansion), and this is a pad-only row, 1227e716630dSMartin Matuska * which has no parity. 
1228e716630dSMartin Matuska */ 1229e716630dSMartin Matuska return; 1230e716630dSMartin Matuska } 12317877fdebSMatt Macy 1232eda14cbcSMatt Macy /* Generate using the new math implementation */ 12337877fdebSMatt Macy if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL) 1234eda14cbcSMatt Macy return; 1235eda14cbcSMatt Macy 12367877fdebSMatt Macy switch (rr->rr_firstdatacol) { 1237eda14cbcSMatt Macy case 1: 12387877fdebSMatt Macy vdev_raidz_generate_parity_p(rr); 1239eda14cbcSMatt Macy break; 1240eda14cbcSMatt Macy case 2: 12417877fdebSMatt Macy vdev_raidz_generate_parity_pq(rr); 1242eda14cbcSMatt Macy break; 1243eda14cbcSMatt Macy case 3: 12447877fdebSMatt Macy vdev_raidz_generate_parity_pqr(rr); 1245eda14cbcSMatt Macy break; 1246eda14cbcSMatt Macy default: 1247eda14cbcSMatt Macy cmn_err(CE_PANIC, "invalid RAID-Z configuration"); 1248eda14cbcSMatt Macy } 1249eda14cbcSMatt Macy } 1250eda14cbcSMatt Macy 12517877fdebSMatt Macy void 12527877fdebSMatt Macy vdev_raidz_generate_parity(raidz_map_t *rm) 12537877fdebSMatt Macy { 12547877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 12557877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i]; 12567877fdebSMatt Macy vdev_raidz_generate_parity_row(rm, rr); 12577877fdebSMatt Macy } 12587877fdebSMatt Macy } 12597877fdebSMatt Macy 1260eda14cbcSMatt Macy static int 1261eda14cbcSMatt Macy vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) 1262eda14cbcSMatt Macy { 1263e92ffd9bSMartin Matuska (void) private; 1264eda14cbcSMatt Macy uint64_t *dst = dbuf; 1265eda14cbcSMatt Macy uint64_t *src = sbuf; 1266eda14cbcSMatt Macy int cnt = size / sizeof (src[0]); 1267eda14cbcSMatt Macy 1268eda14cbcSMatt Macy for (int i = 0; i < cnt; i++) { 1269eda14cbcSMatt Macy dst[i] ^= src[i]; 1270eda14cbcSMatt Macy } 1271eda14cbcSMatt Macy 1272eda14cbcSMatt Macy return (0); 1273eda14cbcSMatt Macy } 1274eda14cbcSMatt Macy 1275eda14cbcSMatt Macy static int 1276eda14cbcSMatt Macy vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size, 1277eda14cbcSMatt Macy void *private) 1278eda14cbcSMatt Macy { 1279e92ffd9bSMartin Matuska (void) private; 1280eda14cbcSMatt Macy uint64_t *dst = dbuf; 1281eda14cbcSMatt Macy uint64_t *src = sbuf; 1282eda14cbcSMatt Macy uint64_t mask; 1283eda14cbcSMatt Macy int cnt = size / sizeof (dst[0]); 1284eda14cbcSMatt Macy 1285eda14cbcSMatt Macy for (int i = 0; i < cnt; i++, dst++, src++) { 1286eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(*dst, mask); 1287eda14cbcSMatt Macy *dst ^= *src; 1288eda14cbcSMatt Macy } 1289eda14cbcSMatt Macy 1290eda14cbcSMatt Macy return (0); 1291eda14cbcSMatt Macy } 1292eda14cbcSMatt Macy 1293eda14cbcSMatt Macy static int 1294eda14cbcSMatt Macy vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private) 1295eda14cbcSMatt Macy { 1296e92ffd9bSMartin Matuska (void) private; 1297eda14cbcSMatt Macy uint64_t *dst = buf; 1298eda14cbcSMatt Macy uint64_t mask; 1299eda14cbcSMatt Macy int cnt = size / sizeof (dst[0]); 1300eda14cbcSMatt Macy 1301eda14cbcSMatt Macy for (int i = 0; i < cnt; i++, dst++) { 1302eda14cbcSMatt Macy /* same operation as vdev_raidz_reconst_q_pre_func() on dst */ 1303eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(*dst, mask); 1304eda14cbcSMatt Macy } 1305eda14cbcSMatt Macy 1306eda14cbcSMatt Macy return (0); 1307eda14cbcSMatt Macy } 1308eda14cbcSMatt Macy 1309eda14cbcSMatt Macy struct reconst_q_struct { 1310eda14cbcSMatt Macy uint64_t *q; 1311eda14cbcSMatt Macy int exp; 1312eda14cbcSMatt Macy }; 1313eda14cbcSMatt Macy 1314eda14cbcSMatt Macy static int 1315eda14cbcSMatt Macy 
vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private) 1316eda14cbcSMatt Macy { 1317eda14cbcSMatt Macy struct reconst_q_struct *rq = private; 1318eda14cbcSMatt Macy uint64_t *dst = buf; 1319eda14cbcSMatt Macy int cnt = size / sizeof (dst[0]); 1320eda14cbcSMatt Macy 1321eda14cbcSMatt Macy for (int i = 0; i < cnt; i++, dst++, rq->q++) { 1322eda14cbcSMatt Macy int j; 1323eda14cbcSMatt Macy uint8_t *b; 1324eda14cbcSMatt Macy 1325eda14cbcSMatt Macy *dst ^= *rq->q; 1326eda14cbcSMatt Macy for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 1327eda14cbcSMatt Macy *b = vdev_raidz_exp2(*b, rq->exp); 1328eda14cbcSMatt Macy } 1329eda14cbcSMatt Macy } 1330eda14cbcSMatt Macy 1331eda14cbcSMatt Macy return (0); 1332eda14cbcSMatt Macy } 1333eda14cbcSMatt Macy 1334eda14cbcSMatt Macy struct reconst_pq_struct { 1335eda14cbcSMatt Macy uint8_t *p; 1336eda14cbcSMatt Macy uint8_t *q; 1337eda14cbcSMatt Macy uint8_t *pxy; 1338eda14cbcSMatt Macy uint8_t *qxy; 1339eda14cbcSMatt Macy int aexp; 1340eda14cbcSMatt Macy int bexp; 1341eda14cbcSMatt Macy }; 1342eda14cbcSMatt Macy 1343eda14cbcSMatt Macy static int 1344eda14cbcSMatt Macy vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private) 1345eda14cbcSMatt Macy { 1346eda14cbcSMatt Macy struct reconst_pq_struct *rpq = private; 1347eda14cbcSMatt Macy uint8_t *xd = xbuf; 1348eda14cbcSMatt Macy uint8_t *yd = ybuf; 1349eda14cbcSMatt Macy 1350eda14cbcSMatt Macy for (int i = 0; i < size; 1351eda14cbcSMatt Macy i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) { 1352eda14cbcSMatt Macy *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 1353eda14cbcSMatt Macy vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 1354eda14cbcSMatt Macy *yd = *rpq->p ^ *rpq->pxy ^ *xd; 1355eda14cbcSMatt Macy } 1356eda14cbcSMatt Macy 1357eda14cbcSMatt Macy return (0); 1358eda14cbcSMatt Macy } 1359eda14cbcSMatt Macy 1360eda14cbcSMatt Macy static int 1361eda14cbcSMatt Macy vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) 1362eda14cbcSMatt Macy { 1363eda14cbcSMatt Macy struct reconst_pq_struct *rpq = private; 1364eda14cbcSMatt Macy uint8_t *xd = xbuf; 1365eda14cbcSMatt Macy 1366eda14cbcSMatt Macy for (int i = 0; i < size; 1367eda14cbcSMatt Macy i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) { 1368eda14cbcSMatt Macy /* same operation as vdev_raidz_reconst_pq_func() on xd */ 1369eda14cbcSMatt Macy *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 1370eda14cbcSMatt Macy vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 1371eda14cbcSMatt Macy } 1372eda14cbcSMatt Macy 1373eda14cbcSMatt Macy return (0); 1374eda14cbcSMatt Macy } 1375eda14cbcSMatt Macy 1376f9693befSMartin Matuska static void 13777877fdebSMatt Macy vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts) 1378eda14cbcSMatt Macy { 1379eda14cbcSMatt Macy int x = tgts[0]; 1380eda14cbcSMatt Macy abd_t *dst, *src; 1381eda14cbcSMatt Macy 1382e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1383e716630dSMartin Matuska zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x); 1384e716630dSMartin Matuska 13857877fdebSMatt Macy ASSERT3U(ntgts, ==, 1); 13867877fdebSMatt Macy ASSERT3U(x, >=, rr->rr_firstdatacol); 13877877fdebSMatt Macy ASSERT3U(x, <, rr->rr_cols); 1388eda14cbcSMatt Macy 13897877fdebSMatt Macy ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size); 1390eda14cbcSMatt Macy 13917877fdebSMatt Macy src = rr->rr_col[VDEV_RAIDZ_P].rc_abd; 13927877fdebSMatt Macy dst = rr->rr_col[x].rc_abd; 1393eda14cbcSMatt Macy 13947877fdebSMatt Macy 
abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size); 1395eda14cbcSMatt Macy 13967877fdebSMatt Macy for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 13977877fdebSMatt Macy uint64_t size = MIN(rr->rr_col[x].rc_size, 13987877fdebSMatt Macy rr->rr_col[c].rc_size); 1399eda14cbcSMatt Macy 14007877fdebSMatt Macy src = rr->rr_col[c].rc_abd; 1401eda14cbcSMatt Macy 1402eda14cbcSMatt Macy if (c == x) 1403eda14cbcSMatt Macy continue; 1404eda14cbcSMatt Macy 1405eda14cbcSMatt Macy (void) abd_iterate_func2(dst, src, 0, 0, size, 1406eda14cbcSMatt Macy vdev_raidz_reconst_p_func, NULL); 1407eda14cbcSMatt Macy } 1408eda14cbcSMatt Macy } 1409eda14cbcSMatt Macy 1410f9693befSMartin Matuska static void 14117877fdebSMatt Macy vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts) 1412eda14cbcSMatt Macy { 1413eda14cbcSMatt Macy int x = tgts[0]; 1414eda14cbcSMatt Macy int c, exp; 1415eda14cbcSMatt Macy abd_t *dst, *src; 1416eda14cbcSMatt Macy 1417e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1418e716630dSMartin Matuska zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x); 1419e716630dSMartin Matuska 1420eda14cbcSMatt Macy ASSERT(ntgts == 1); 1421eda14cbcSMatt Macy 14227877fdebSMatt Macy ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size); 1423eda14cbcSMatt Macy 14247877fdebSMatt Macy for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 14257877fdebSMatt Macy uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size, 14267877fdebSMatt Macy rr->rr_col[c].rc_size); 1427eda14cbcSMatt Macy 14287877fdebSMatt Macy src = rr->rr_col[c].rc_abd; 14297877fdebSMatt Macy dst = rr->rr_col[x].rc_abd; 1430eda14cbcSMatt Macy 14317877fdebSMatt Macy if (c == rr->rr_firstdatacol) { 1432eda14cbcSMatt Macy abd_copy(dst, src, size); 14337877fdebSMatt Macy if (rr->rr_col[x].rc_size > size) { 1434eda14cbcSMatt Macy abd_zero_off(dst, size, 14357877fdebSMatt Macy rr->rr_col[x].rc_size - size); 14367877fdebSMatt Macy } 1437eda14cbcSMatt Macy } else { 14387877fdebSMatt Macy ASSERT3U(size, <=, rr->rr_col[x].rc_size); 1439eda14cbcSMatt Macy (void) abd_iterate_func2(dst, src, 0, 0, size, 1440eda14cbcSMatt Macy vdev_raidz_reconst_q_pre_func, NULL); 1441eda14cbcSMatt Macy (void) abd_iterate_func(dst, 14427877fdebSMatt Macy size, rr->rr_col[x].rc_size - size, 1443eda14cbcSMatt Macy vdev_raidz_reconst_q_pre_tail_func, NULL); 1444eda14cbcSMatt Macy } 1445eda14cbcSMatt Macy } 1446eda14cbcSMatt Macy 14477877fdebSMatt Macy src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; 14487877fdebSMatt Macy dst = rr->rr_col[x].rc_abd; 14497877fdebSMatt Macy exp = 255 - (rr->rr_cols - 1 - x); 1450eda14cbcSMatt Macy 1451eda14cbcSMatt Macy struct reconst_q_struct rq = { abd_to_buf(src), exp }; 14527877fdebSMatt Macy (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size, 1453eda14cbcSMatt Macy vdev_raidz_reconst_q_post_func, &rq); 1454eda14cbcSMatt Macy } 1455eda14cbcSMatt Macy 1456f9693befSMartin Matuska static void 14577877fdebSMatt Macy vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) 1458eda14cbcSMatt Macy { 1459eda14cbcSMatt Macy uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; 1460eda14cbcSMatt Macy abd_t *pdata, *qdata; 1461eda14cbcSMatt Macy uint64_t xsize, ysize; 1462eda14cbcSMatt Macy int x = tgts[0]; 1463eda14cbcSMatt Macy int y = tgts[1]; 1464eda14cbcSMatt Macy abd_t *xd, *yd; 1465eda14cbcSMatt Macy 1466e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1467e716630dSMartin Matuska zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y); 1468e716630dSMartin Matuska 
1469eda14cbcSMatt Macy ASSERT(ntgts == 2); 1470eda14cbcSMatt Macy ASSERT(x < y); 14717877fdebSMatt Macy ASSERT(x >= rr->rr_firstdatacol); 14727877fdebSMatt Macy ASSERT(y < rr->rr_cols); 1473eda14cbcSMatt Macy 14747877fdebSMatt Macy ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size); 1475eda14cbcSMatt Macy 1476eda14cbcSMatt Macy /* 1477eda14cbcSMatt Macy * Move the parity data aside -- we're going to compute parity as 1478eda14cbcSMatt Macy * though columns x and y were full of zeros -- Pxy and Qxy. We want to 1479eda14cbcSMatt Macy * reuse the parity generation mechanism without trashing the actual 1480eda14cbcSMatt Macy * parity so we make those columns appear to be full of zeros by 1481eda14cbcSMatt Macy * setting their lengths to zero. 1482eda14cbcSMatt Macy */ 14837877fdebSMatt Macy pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd; 14847877fdebSMatt Macy qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; 14857877fdebSMatt Macy xsize = rr->rr_col[x].rc_size; 14867877fdebSMatt Macy ysize = rr->rr_col[y].rc_size; 1487eda14cbcSMatt Macy 14887877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_P].rc_abd = 14897877fdebSMatt Macy abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE); 14907877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_Q].rc_abd = 14917877fdebSMatt Macy abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); 14927877fdebSMatt Macy rr->rr_col[x].rc_size = 0; 14937877fdebSMatt Macy rr->rr_col[y].rc_size = 0; 1494eda14cbcSMatt Macy 14957877fdebSMatt Macy vdev_raidz_generate_parity_pq(rr); 1496eda14cbcSMatt Macy 14977877fdebSMatt Macy rr->rr_col[x].rc_size = xsize; 14987877fdebSMatt Macy rr->rr_col[y].rc_size = ysize; 1499eda14cbcSMatt Macy 1500eda14cbcSMatt Macy p = abd_to_buf(pdata); 1501eda14cbcSMatt Macy q = abd_to_buf(qdata); 15027877fdebSMatt Macy pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 15037877fdebSMatt Macy qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 15047877fdebSMatt Macy xd = rr->rr_col[x].rc_abd; 15057877fdebSMatt Macy yd = rr->rr_col[y].rc_abd; 1506eda14cbcSMatt Macy 1507eda14cbcSMatt Macy /* 1508eda14cbcSMatt Macy * We now have: 1509eda14cbcSMatt Macy * Pxy = P + D_x + D_y 1510eda14cbcSMatt Macy * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 1511eda14cbcSMatt Macy * 1512eda14cbcSMatt Macy * We can then solve for D_x: 1513eda14cbcSMatt Macy * D_x = A * (P + Pxy) + B * (Q + Qxy) 1514eda14cbcSMatt Macy * where 1515eda14cbcSMatt Macy * A = 2^(x - y) * (2^(x - y) + 1)^-1 1516eda14cbcSMatt Macy * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 1517eda14cbcSMatt Macy * 1518eda14cbcSMatt Macy * With D_x in hand, we can easily solve for D_y: 1519eda14cbcSMatt Macy * D_y = P + Pxy + D_x 1520eda14cbcSMatt Macy */ 1521eda14cbcSMatt Macy 1522eda14cbcSMatt Macy a = vdev_raidz_pow2[255 + x - y]; 15237877fdebSMatt Macy b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)]; 1524eda14cbcSMatt Macy tmp = 255 - vdev_raidz_log2[a ^ 1]; 1525eda14cbcSMatt Macy 1526eda14cbcSMatt Macy aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 1527eda14cbcSMatt Macy bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 1528eda14cbcSMatt Macy 1529eda14cbcSMatt Macy ASSERT3U(xsize, >=, ysize); 1530eda14cbcSMatt Macy struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp }; 1531eda14cbcSMatt Macy 1532eda14cbcSMatt Macy (void) abd_iterate_func2(xd, yd, 0, 0, ysize, 1533eda14cbcSMatt Macy vdev_raidz_reconst_pq_func, &rpq); 1534eda14cbcSMatt Macy (void) abd_iterate_func(xd, ysize, xsize - ysize, 1535eda14cbcSMatt Macy vdev_raidz_reconst_pq_tail_func, &rpq); 1536eda14cbcSMatt Macy 15377877fdebSMatt Macy 
abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 15387877fdebSMatt Macy abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 1539eda14cbcSMatt Macy 1540eda14cbcSMatt Macy /* 1541eda14cbcSMatt Macy * Restore the saved parity data. 1542eda14cbcSMatt Macy */ 15437877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata; 15447877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata; 1545eda14cbcSMatt Macy } 1546eda14cbcSMatt Macy 1547eda14cbcSMatt Macy /* 1548eda14cbcSMatt Macy * In the general case of reconstruction, we must solve the system of linear 1549eda14cbcSMatt Macy * equations defined by the coefficients used to generate parity as well as 1550eda14cbcSMatt Macy * the contents of the data and parity disks. This can be expressed with 1551eda14cbcSMatt Macy * vectors for the original data (D) and the actual data (d) and parity (p) 1552eda14cbcSMatt Macy * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 1553eda14cbcSMatt Macy * 1554eda14cbcSMatt Macy * __ __ __ __ 1555eda14cbcSMatt Macy * | | __ __ | p_0 | 1556eda14cbcSMatt Macy * | V | | D_0 | | p_m-1 | 1557eda14cbcSMatt Macy * | | x | : | = | d_0 | 1558eda14cbcSMatt Macy * | I | | D_n-1 | | : | 1559eda14cbcSMatt Macy * | | ~~ ~~ | d_n-1 | 1560eda14cbcSMatt Macy * ~~ ~~ ~~ ~~ 1561eda14cbcSMatt Macy * 1562eda14cbcSMatt Macy * I is simply a square identity matrix of size n, and V is a vandermonde 1563eda14cbcSMatt Macy * matrix defined by the coefficients we chose for the various parity columns 1564eda14cbcSMatt Macy * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 1565eda14cbcSMatt Macy * computation as well as linear separability. 1566eda14cbcSMatt Macy * 1567eda14cbcSMatt Macy * __ __ __ __ 1568eda14cbcSMatt Macy * | 1 .. 1 1 1 | | p_0 | 1569eda14cbcSMatt Macy * | 2^n-1 .. 4 2 1 | __ __ | : | 1570eda14cbcSMatt Macy * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 1571eda14cbcSMatt Macy * | 1 .. 0 0 0 | | D_1 | | d_0 | 1572eda14cbcSMatt Macy * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 1573eda14cbcSMatt Macy * | : : : : | | : | | d_2 | 1574eda14cbcSMatt Macy * | 0 .. 1 0 0 | | D_n-1 | | : | 1575eda14cbcSMatt Macy * | 0 .. 0 1 0 | ~~ ~~ | : | 1576eda14cbcSMatt Macy * | 0 .. 0 0 1 | | d_n-1 | 1577eda14cbcSMatt Macy * ~~ ~~ ~~ ~~ 1578eda14cbcSMatt Macy * 1579eda14cbcSMatt Macy * Note that I, V, d, and p are known. To compute D, we must invert the 1580eda14cbcSMatt Macy * matrix and use the known data and parity values to reconstruct the unknown 1581eda14cbcSMatt Macy * data values. We begin by removing the rows in V|I and d|p that correspond 1582eda14cbcSMatt Macy * to failed or missing columns; we then make V|I square (n x n) and d|p 1583eda14cbcSMatt Macy * sized n by removing rows corresponding to unused parity from the bottom up 1584eda14cbcSMatt Macy * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 1585eda14cbcSMatt Macy * using Gauss-Jordan elimination. 
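 * (These steps are carried out below by vdev_raidz_matrix_init(),
 * vdev_raidz_matrix_invert() and vdev_raidz_matrix_reconstruct().)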
In the example below we use m=3 parity 1586eda14cbcSMatt Macy * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 1587eda14cbcSMatt Macy * __ __ 1588eda14cbcSMatt Macy * | 1 1 1 1 1 1 1 1 | 1589eda14cbcSMatt Macy * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 1590eda14cbcSMatt Macy * | 19 205 116 29 64 16 4 1 | / / 1591eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 | / / 1592eda14cbcSMatt Macy * | 0 1 0 0 0 0 0 0 | <--' / 1593eda14cbcSMatt Macy * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 1594eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 | 1595eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 | 1596eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 | 1597eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 | 1598eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 | 1599eda14cbcSMatt Macy * ~~ ~~ 1600eda14cbcSMatt Macy * __ __ 1601eda14cbcSMatt Macy * | 1 1 1 1 1 1 1 1 | 1602eda14cbcSMatt Macy * | 128 64 32 16 8 4 2 1 | 1603eda14cbcSMatt Macy * | 19 205 116 29 64 16 4 1 | 1604eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 | 1605eda14cbcSMatt Macy * | 0 1 0 0 0 0 0 0 | 1606eda14cbcSMatt Macy * (V|I)' = | 0 0 1 0 0 0 0 0 | 1607eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 | 1608eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 | 1609eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 | 1610eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 | 1611eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 | 1612eda14cbcSMatt Macy * ~~ ~~ 1613eda14cbcSMatt Macy * 1614eda14cbcSMatt Macy * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 1615eda14cbcSMatt Macy * have carefully chosen the seed values 1, 2, and 4 to ensure that this 1616eda14cbcSMatt Macy * matrix is not singular. 1617eda14cbcSMatt Macy * __ __ 1618eda14cbcSMatt Macy * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1619eda14cbcSMatt Macy * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1620eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1621eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1622eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1623eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1624eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1625eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1626eda14cbcSMatt Macy * ~~ ~~ 1627eda14cbcSMatt Macy * __ __ 1628eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1629eda14cbcSMatt Macy * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1630eda14cbcSMatt Macy * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1631eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1632eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1633eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1634eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1635eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1636eda14cbcSMatt Macy * ~~ ~~ 1637eda14cbcSMatt Macy * __ __ 1638eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1639eda14cbcSMatt Macy * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1640eda14cbcSMatt Macy * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 1641eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1642eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1643eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1644eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1645eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1646eda14cbcSMatt Macy * ~~ ~~ 1647eda14cbcSMatt Macy * __ __ 1648eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1649eda14cbcSMatt Macy * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1650eda14cbcSMatt Macy * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 1651eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 0 0 0 1 0 
0 0 0 | 1652eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1653eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1654eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1655eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1656eda14cbcSMatt Macy * ~~ ~~ 1657eda14cbcSMatt Macy * __ __ 1658eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1659eda14cbcSMatt Macy * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1660eda14cbcSMatt Macy * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1661eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1662eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1663eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1664eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1665eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1666eda14cbcSMatt Macy * ~~ ~~ 1667eda14cbcSMatt Macy * __ __ 1668eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1669eda14cbcSMatt Macy * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 1670eda14cbcSMatt Macy * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1671eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1672eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1673eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1674eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1675eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1676eda14cbcSMatt Macy * ~~ ~~ 1677eda14cbcSMatt Macy * __ __ 1678eda14cbcSMatt Macy * | 0 0 1 0 0 0 0 0 | 1679eda14cbcSMatt Macy * | 167 100 5 41 159 169 217 208 | 1680eda14cbcSMatt Macy * | 166 100 4 40 158 168 216 209 | 1681eda14cbcSMatt Macy * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 1682eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 | 1683eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 | 1684eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 | 1685eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 | 1686eda14cbcSMatt Macy * ~~ ~~ 1687eda14cbcSMatt Macy * 1688eda14cbcSMatt Macy * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 1689eda14cbcSMatt Macy * of the missing data. 1690eda14cbcSMatt Macy * 1691eda14cbcSMatt Macy * As is apparent from the example above, the only non-trivial rows in the 1692eda14cbcSMatt Macy * inverse matrix correspond to the data disks that we're trying to 1693eda14cbcSMatt Macy * reconstruct. Indeed, those are the only rows we need as the others would 1694eda14cbcSMatt Macy * only be useful for reconstructing data known or assumed to be valid. For 1695eda14cbcSMatt Macy * that reason, we only build the coefficients in the rows that correspond to 1696eda14cbcSMatt Macy * targeted columns. 1697eda14cbcSMatt Macy */ 1698eda14cbcSMatt Macy 1699eda14cbcSMatt Macy static void 17007877fdebSMatt Macy vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map, 1701eda14cbcSMatt Macy uint8_t **rows) 1702eda14cbcSMatt Macy { 1703eda14cbcSMatt Macy int i, j; 1704eda14cbcSMatt Macy int pow; 1705eda14cbcSMatt Macy 17067877fdebSMatt Macy ASSERT(n == rr->rr_cols - rr->rr_firstdatacol); 1707eda14cbcSMatt Macy 1708eda14cbcSMatt Macy /* 1709eda14cbcSMatt Macy * Fill in the missing rows of interest. 
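 *
 * Row i is the row of V for parity column map[i]: with generator
 * g = 2^map[i] (1 for P, 2 for Q, 4 for R), rows[i][j] = g^(n-1-j)
 * in GF(2^8); the exponent arithmetic below is done mod 255.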
1710eda14cbcSMatt Macy */ 1711eda14cbcSMatt Macy for (i = 0; i < nmap; i++) { 1712eda14cbcSMatt Macy ASSERT3S(0, <=, map[i]); 1713eda14cbcSMatt Macy ASSERT3S(map[i], <=, 2); 1714eda14cbcSMatt Macy 1715eda14cbcSMatt Macy pow = map[i] * n; 1716eda14cbcSMatt Macy if (pow > 255) 1717eda14cbcSMatt Macy pow -= 255; 1718eda14cbcSMatt Macy ASSERT(pow <= 255); 1719eda14cbcSMatt Macy 1720eda14cbcSMatt Macy for (j = 0; j < n; j++) { 1721eda14cbcSMatt Macy pow -= map[i]; 1722eda14cbcSMatt Macy if (pow < 0) 1723eda14cbcSMatt Macy pow += 255; 1724eda14cbcSMatt Macy rows[i][j] = vdev_raidz_pow2[pow]; 1725eda14cbcSMatt Macy } 1726eda14cbcSMatt Macy } 1727eda14cbcSMatt Macy } 1728eda14cbcSMatt Macy 1729eda14cbcSMatt Macy static void 17307877fdebSMatt Macy vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing, 1731eda14cbcSMatt Macy uint8_t **rows, uint8_t **invrows, const uint8_t *used) 1732eda14cbcSMatt Macy { 1733eda14cbcSMatt Macy int i, j, ii, jj; 1734eda14cbcSMatt Macy uint8_t log; 1735eda14cbcSMatt Macy 1736eda14cbcSMatt Macy /* 1737eda14cbcSMatt Macy * Assert that the first nmissing entries from the array of used 1738eda14cbcSMatt Macy * columns correspond to parity columns and that subsequent entries 1739eda14cbcSMatt Macy * correspond to data columns. 1740eda14cbcSMatt Macy */ 1741eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) { 17427877fdebSMatt Macy ASSERT3S(used[i], <, rr->rr_firstdatacol); 1743eda14cbcSMatt Macy } 1744eda14cbcSMatt Macy for (; i < n; i++) { 17457877fdebSMatt Macy ASSERT3S(used[i], >=, rr->rr_firstdatacol); 1746eda14cbcSMatt Macy } 1747eda14cbcSMatt Macy 1748eda14cbcSMatt Macy /* 1749eda14cbcSMatt Macy * First initialize the storage where we'll compute the inverse rows. 1750eda14cbcSMatt Macy */ 1751eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) { 1752eda14cbcSMatt Macy for (j = 0; j < n; j++) { 1753eda14cbcSMatt Macy invrows[i][j] = (i == j) ? 1 : 0; 1754eda14cbcSMatt Macy } 1755eda14cbcSMatt Macy } 1756eda14cbcSMatt Macy 1757eda14cbcSMatt Macy /* 1758eda14cbcSMatt Macy * Subtract all trivial rows from the rows of consequence. 1759eda14cbcSMatt Macy */ 1760eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) { 1761eda14cbcSMatt Macy for (j = nmissing; j < n; j++) { 17627877fdebSMatt Macy ASSERT3U(used[j], >=, rr->rr_firstdatacol); 17637877fdebSMatt Macy jj = used[j] - rr->rr_firstdatacol; 1764eda14cbcSMatt Macy ASSERT3S(jj, <, n); 1765eda14cbcSMatt Macy invrows[i][j] = rows[i][jj]; 1766eda14cbcSMatt Macy rows[i][jj] = 0; 1767eda14cbcSMatt Macy } 1768eda14cbcSMatt Macy } 1769eda14cbcSMatt Macy 1770eda14cbcSMatt Macy /* 1771eda14cbcSMatt Macy * For each of the rows of interest, we must normalize it and subtract 1772eda14cbcSMatt Macy * a multiple of it from the other rows. 1773eda14cbcSMatt Macy */ 1774eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) { 1775eda14cbcSMatt Macy for (j = 0; j < missing[i]; j++) { 1776eda14cbcSMatt Macy ASSERT0(rows[i][j]); 1777eda14cbcSMatt Macy } 1778eda14cbcSMatt Macy ASSERT3U(rows[i][missing[i]], !=, 0); 1779eda14cbcSMatt Macy 1780eda14cbcSMatt Macy /* 1781eda14cbcSMatt Macy * Compute the inverse of the first element and multiply each 1782eda14cbcSMatt Macy * element in the row by that value. 
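 * Since the non-zero field elements form a multiplicative group of
 * order 255, the inverse of 2^k is 2^(255 - k); 'log' below is that
 * inverse expressed as an exponent of 2.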
1783eda14cbcSMatt Macy */ 1784eda14cbcSMatt Macy log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 1785eda14cbcSMatt Macy 1786eda14cbcSMatt Macy for (j = 0; j < n; j++) { 1787eda14cbcSMatt Macy rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 1788eda14cbcSMatt Macy invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 1789eda14cbcSMatt Macy } 1790eda14cbcSMatt Macy 1791eda14cbcSMatt Macy for (ii = 0; ii < nmissing; ii++) { 1792eda14cbcSMatt Macy if (i == ii) 1793eda14cbcSMatt Macy continue; 1794eda14cbcSMatt Macy 1795eda14cbcSMatt Macy ASSERT3U(rows[ii][missing[i]], !=, 0); 1796eda14cbcSMatt Macy 1797eda14cbcSMatt Macy log = vdev_raidz_log2[rows[ii][missing[i]]]; 1798eda14cbcSMatt Macy 1799eda14cbcSMatt Macy for (j = 0; j < n; j++) { 1800eda14cbcSMatt Macy rows[ii][j] ^= 1801eda14cbcSMatt Macy vdev_raidz_exp2(rows[i][j], log); 1802eda14cbcSMatt Macy invrows[ii][j] ^= 1803eda14cbcSMatt Macy vdev_raidz_exp2(invrows[i][j], log); 1804eda14cbcSMatt Macy } 1805eda14cbcSMatt Macy } 1806eda14cbcSMatt Macy } 1807eda14cbcSMatt Macy 1808eda14cbcSMatt Macy /* 1809eda14cbcSMatt Macy * Verify that the data that is left in the rows are properly part of 1810eda14cbcSMatt Macy * an identity matrix. 1811eda14cbcSMatt Macy */ 1812eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) { 1813eda14cbcSMatt Macy for (j = 0; j < n; j++) { 1814eda14cbcSMatt Macy if (j == missing[i]) { 1815eda14cbcSMatt Macy ASSERT3U(rows[i][j], ==, 1); 1816eda14cbcSMatt Macy } else { 1817eda14cbcSMatt Macy ASSERT0(rows[i][j]); 1818eda14cbcSMatt Macy } 1819eda14cbcSMatt Macy } 1820eda14cbcSMatt Macy } 1821eda14cbcSMatt Macy } 1822eda14cbcSMatt Macy 1823eda14cbcSMatt Macy static void 18247877fdebSMatt Macy vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing, 1825eda14cbcSMatt Macy int *missing, uint8_t **invrows, const uint8_t *used) 1826eda14cbcSMatt Macy { 1827eda14cbcSMatt Macy int i, j, x, cc, c; 1828eda14cbcSMatt Macy uint8_t *src; 1829eda14cbcSMatt Macy uint64_t ccount; 1830eda14cbcSMatt Macy uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL }; 1831eda14cbcSMatt Macy uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 }; 1832eda14cbcSMatt Macy uint8_t log = 0; 1833eda14cbcSMatt Macy uint8_t val; 1834eda14cbcSMatt Macy int ll; 1835eda14cbcSMatt Macy uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 1836eda14cbcSMatt Macy uint8_t *p, *pp; 1837eda14cbcSMatt Macy size_t psize; 1838eda14cbcSMatt Macy 1839eda14cbcSMatt Macy psize = sizeof (invlog[0][0]) * n * nmissing; 1840eda14cbcSMatt Macy p = kmem_alloc(psize, KM_SLEEP); 1841eda14cbcSMatt Macy 1842eda14cbcSMatt Macy for (pp = p, i = 0; i < nmissing; i++) { 1843eda14cbcSMatt Macy invlog[i] = pp; 1844eda14cbcSMatt Macy pp += n; 1845eda14cbcSMatt Macy } 1846eda14cbcSMatt Macy 1847eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) { 1848eda14cbcSMatt Macy for (j = 0; j < n; j++) { 1849eda14cbcSMatt Macy ASSERT3U(invrows[i][j], !=, 0); 1850eda14cbcSMatt Macy invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 1851eda14cbcSMatt Macy } 1852eda14cbcSMatt Macy } 1853eda14cbcSMatt Macy 1854eda14cbcSMatt Macy for (i = 0; i < n; i++) { 1855eda14cbcSMatt Macy c = used[i]; 18567877fdebSMatt Macy ASSERT3U(c, <, rr->rr_cols); 1857eda14cbcSMatt Macy 18587877fdebSMatt Macy ccount = rr->rr_col[c].rc_size; 18597877fdebSMatt Macy ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0); 18607877fdebSMatt Macy if (ccount == 0) 18617877fdebSMatt Macy continue; 18627877fdebSMatt Macy src = abd_to_buf(rr->rr_col[c].rc_abd); 1863eda14cbcSMatt Macy for (j = 0; j < nmissing; j++) { 18647877fdebSMatt Macy cc = missing[j] + 
rr->rr_firstdatacol; 18657877fdebSMatt Macy ASSERT3U(cc, >=, rr->rr_firstdatacol); 18667877fdebSMatt Macy ASSERT3U(cc, <, rr->rr_cols); 1867eda14cbcSMatt Macy ASSERT3U(cc, !=, c); 1868eda14cbcSMatt Macy 18697877fdebSMatt Macy dcount[j] = rr->rr_col[cc].rc_size; 18707877fdebSMatt Macy if (dcount[j] != 0) 18717877fdebSMatt Macy dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd); 1872eda14cbcSMatt Macy } 1873eda14cbcSMatt Macy 1874eda14cbcSMatt Macy for (x = 0; x < ccount; x++, src++) { 1875eda14cbcSMatt Macy if (*src != 0) 1876eda14cbcSMatt Macy log = vdev_raidz_log2[*src]; 1877eda14cbcSMatt Macy 1878eda14cbcSMatt Macy for (cc = 0; cc < nmissing; cc++) { 1879eda14cbcSMatt Macy if (x >= dcount[cc]) 1880eda14cbcSMatt Macy continue; 1881eda14cbcSMatt Macy 1882eda14cbcSMatt Macy if (*src == 0) { 1883eda14cbcSMatt Macy val = 0; 1884eda14cbcSMatt Macy } else { 1885eda14cbcSMatt Macy if ((ll = log + invlog[cc][i]) >= 255) 1886eda14cbcSMatt Macy ll -= 255; 1887eda14cbcSMatt Macy val = vdev_raidz_pow2[ll]; 1888eda14cbcSMatt Macy } 1889eda14cbcSMatt Macy 1890eda14cbcSMatt Macy if (i == 0) 1891eda14cbcSMatt Macy dst[cc][x] = val; 1892eda14cbcSMatt Macy else 1893eda14cbcSMatt Macy dst[cc][x] ^= val; 1894eda14cbcSMatt Macy } 1895eda14cbcSMatt Macy } 1896eda14cbcSMatt Macy } 1897eda14cbcSMatt Macy 1898eda14cbcSMatt Macy kmem_free(p, psize); 1899eda14cbcSMatt Macy } 1900eda14cbcSMatt Macy 1901f9693befSMartin Matuska static void 19027877fdebSMatt Macy vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) 1903eda14cbcSMatt Macy { 1904b985c9caSMartin Matuska int i, c, t, tt; 1905b985c9caSMartin Matuska unsigned int n; 1906b985c9caSMartin Matuska unsigned int nmissing_rows; 1907eda14cbcSMatt Macy int missing_rows[VDEV_RAIDZ_MAXPARITY]; 1908eda14cbcSMatt Macy int parity_map[VDEV_RAIDZ_MAXPARITY]; 1909eda14cbcSMatt Macy uint8_t *p, *pp; 1910eda14cbcSMatt Macy size_t psize; 1911eda14cbcSMatt Macy uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 1912eda14cbcSMatt Macy uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 1913eda14cbcSMatt Macy uint8_t *used; 1914eda14cbcSMatt Macy 1915eda14cbcSMatt Macy abd_t **bufs = NULL; 1916eda14cbcSMatt Macy 1917e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1918e716630dSMartin Matuska zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts); 1919eda14cbcSMatt Macy /* 1920eda14cbcSMatt Macy * Matrix reconstruction can't use scatter ABDs yet, so we allocate 19217877fdebSMatt Macy * temporary linear ABDs if any non-linear ABDs are found. 
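 * The original ABDs are stashed in bufs[] and copied back (and the
 * temporaries freed) at the end of this function.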
1922eda14cbcSMatt Macy */ 19237877fdebSMatt Macy for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) { 1924e716630dSMartin Matuska ASSERT(rr->rr_col[i].rc_abd != NULL); 19257877fdebSMatt Macy if (!abd_is_linear(rr->rr_col[i].rc_abd)) { 19267877fdebSMatt Macy bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *), 19277877fdebSMatt Macy KM_PUSHPAGE); 1928eda14cbcSMatt Macy 19297877fdebSMatt Macy for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 19307877fdebSMatt Macy raidz_col_t *col = &rr->rr_col[c]; 1931eda14cbcSMatt Macy 1932eda14cbcSMatt Macy bufs[c] = col->rc_abd; 19337877fdebSMatt Macy if (bufs[c] != NULL) { 19347877fdebSMatt Macy col->rc_abd = abd_alloc_linear( 19357877fdebSMatt Macy col->rc_size, B_TRUE); 19367877fdebSMatt Macy abd_copy(col->rc_abd, bufs[c], 19377877fdebSMatt Macy col->rc_size); 1938eda14cbcSMatt Macy } 1939eda14cbcSMatt Macy } 1940eda14cbcSMatt Macy 19417877fdebSMatt Macy break; 19427877fdebSMatt Macy } 19437877fdebSMatt Macy } 19447877fdebSMatt Macy 19457877fdebSMatt Macy n = rr->rr_cols - rr->rr_firstdatacol; 1946eda14cbcSMatt Macy 1947eda14cbcSMatt Macy /* 1948eda14cbcSMatt Macy * Figure out which data columns are missing. 1949eda14cbcSMatt Macy */ 1950eda14cbcSMatt Macy nmissing_rows = 0; 1951eda14cbcSMatt Macy for (t = 0; t < ntgts; t++) { 19527877fdebSMatt Macy if (tgts[t] >= rr->rr_firstdatacol) { 1953eda14cbcSMatt Macy missing_rows[nmissing_rows++] = 19547877fdebSMatt Macy tgts[t] - rr->rr_firstdatacol; 1955eda14cbcSMatt Macy } 1956eda14cbcSMatt Macy } 1957eda14cbcSMatt Macy 1958eda14cbcSMatt Macy /* 1959eda14cbcSMatt Macy * Figure out which parity columns to use to help generate the missing 1960eda14cbcSMatt Macy * data columns. 1961eda14cbcSMatt Macy */ 1962eda14cbcSMatt Macy for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 1963eda14cbcSMatt Macy ASSERT(tt < ntgts); 19647877fdebSMatt Macy ASSERT(c < rr->rr_firstdatacol); 1965eda14cbcSMatt Macy 1966eda14cbcSMatt Macy /* 1967eda14cbcSMatt Macy * Skip any targeted parity columns. 1968eda14cbcSMatt Macy */ 1969eda14cbcSMatt Macy if (c == tgts[tt]) { 1970eda14cbcSMatt Macy tt++; 1971eda14cbcSMatt Macy continue; 1972eda14cbcSMatt Macy } 1973eda14cbcSMatt Macy 1974eda14cbcSMatt Macy parity_map[i] = c; 1975eda14cbcSMatt Macy i++; 1976eda14cbcSMatt Macy } 1977eda14cbcSMatt Macy 1978eda14cbcSMatt Macy psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1979eda14cbcSMatt Macy nmissing_rows * n + sizeof (used[0]) * n; 1980eda14cbcSMatt Macy p = kmem_alloc(psize, KM_SLEEP); 1981eda14cbcSMatt Macy 1982eda14cbcSMatt Macy for (pp = p, i = 0; i < nmissing_rows; i++) { 1983eda14cbcSMatt Macy rows[i] = pp; 1984eda14cbcSMatt Macy pp += n; 1985eda14cbcSMatt Macy invrows[i] = pp; 1986eda14cbcSMatt Macy pp += n; 1987eda14cbcSMatt Macy } 1988eda14cbcSMatt Macy used = pp; 1989eda14cbcSMatt Macy 1990eda14cbcSMatt Macy for (i = 0; i < nmissing_rows; i++) { 1991eda14cbcSMatt Macy used[i] = parity_map[i]; 1992eda14cbcSMatt Macy } 1993eda14cbcSMatt Macy 19947877fdebSMatt Macy for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1995eda14cbcSMatt Macy if (tt < nmissing_rows && 19967877fdebSMatt Macy c == missing_rows[tt] + rr->rr_firstdatacol) { 1997eda14cbcSMatt Macy tt++; 1998eda14cbcSMatt Macy continue; 1999eda14cbcSMatt Macy } 2000eda14cbcSMatt Macy 2001eda14cbcSMatt Macy ASSERT3S(i, <, n); 2002eda14cbcSMatt Macy used[i] = c; 2003eda14cbcSMatt Macy i++; 2004eda14cbcSMatt Macy } 2005eda14cbcSMatt Macy 2006eda14cbcSMatt Macy /* 2007eda14cbcSMatt Macy * Initialize the interesting rows of the matrix. 
2008eda14cbcSMatt Macy */ 20097877fdebSMatt Macy vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows); 2010eda14cbcSMatt Macy 2011eda14cbcSMatt Macy /* 2012eda14cbcSMatt Macy * Invert the matrix. 2013eda14cbcSMatt Macy */ 20147877fdebSMatt Macy vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows, 2015eda14cbcSMatt Macy invrows, used); 2016eda14cbcSMatt Macy 2017eda14cbcSMatt Macy /* 2018eda14cbcSMatt Macy * Reconstruct the missing data using the generated matrix. 2019eda14cbcSMatt Macy */ 20207877fdebSMatt Macy vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows, 2021eda14cbcSMatt Macy invrows, used); 2022eda14cbcSMatt Macy 2023eda14cbcSMatt Macy kmem_free(p, psize); 2024eda14cbcSMatt Macy 2025eda14cbcSMatt Macy /* 2026eda14cbcSMatt Macy * copy back from temporary linear abds and free them 2027eda14cbcSMatt Macy */ 2028eda14cbcSMatt Macy if (bufs) { 20297877fdebSMatt Macy for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 20307877fdebSMatt Macy raidz_col_t *col = &rr->rr_col[c]; 2031eda14cbcSMatt Macy 20327877fdebSMatt Macy if (bufs[c] != NULL) { 2033eda14cbcSMatt Macy abd_copy(bufs[c], col->rc_abd, col->rc_size); 2034eda14cbcSMatt Macy abd_free(col->rc_abd); 20357877fdebSMatt Macy } 2036eda14cbcSMatt Macy col->rc_abd = bufs[c]; 2037eda14cbcSMatt Macy } 20387877fdebSMatt Macy kmem_free(bufs, rr->rr_cols * sizeof (abd_t *)); 2039eda14cbcSMatt Macy } 2040eda14cbcSMatt Macy } 2041eda14cbcSMatt Macy 2042f9693befSMartin Matuska static void 20437877fdebSMatt Macy vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr, 20447877fdebSMatt Macy const int *t, int nt) 2045eda14cbcSMatt Macy { 2046eda14cbcSMatt Macy int tgts[VDEV_RAIDZ_MAXPARITY], *dt; 2047eda14cbcSMatt Macy int ntgts; 2048eda14cbcSMatt Macy int i, c, ret; 2049eda14cbcSMatt Macy int nbadparity, nbaddata; 2050eda14cbcSMatt Macy int parity_valid[VDEV_RAIDZ_MAXPARITY]; 2051eda14cbcSMatt Macy 2052e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { 2053e716630dSMartin Matuska zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)", 2054e716630dSMartin Matuska rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata, 2055e716630dSMartin Matuska (int)rr->rr_missingparity); 2056e716630dSMartin Matuska } 2057e716630dSMartin Matuska 20587877fdebSMatt Macy nbadparity = rr->rr_firstdatacol; 20597877fdebSMatt Macy nbaddata = rr->rr_cols - nbadparity; 2060eda14cbcSMatt Macy ntgts = 0; 20617877fdebSMatt Macy for (i = 0, c = 0; c < rr->rr_cols; c++) { 2062e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { 2063e716630dSMartin Matuska zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u " 2064e716630dSMartin Matuska "offset=%llx error=%u)", 2065e716630dSMartin Matuska rr, c, (int)rr->rr_col[c].rc_devidx, 2066e716630dSMartin Matuska (long long)rr->rr_col[c].rc_offset, 2067e716630dSMartin Matuska (int)rr->rr_col[c].rc_error); 2068e716630dSMartin Matuska } 20697877fdebSMatt Macy if (c < rr->rr_firstdatacol) 2070eda14cbcSMatt Macy parity_valid[c] = B_FALSE; 2071eda14cbcSMatt Macy 2072eda14cbcSMatt Macy if (i < nt && c == t[i]) { 2073eda14cbcSMatt Macy tgts[ntgts++] = c; 2074eda14cbcSMatt Macy i++; 20757877fdebSMatt Macy } else if (rr->rr_col[c].rc_error != 0) { 2076eda14cbcSMatt Macy tgts[ntgts++] = c; 20777877fdebSMatt Macy } else if (c >= rr->rr_firstdatacol) { 2078eda14cbcSMatt Macy nbaddata--; 2079eda14cbcSMatt Macy } else { 2080eda14cbcSMatt Macy parity_valid[c] = B_TRUE; 2081eda14cbcSMatt Macy nbadparity--; 2082eda14cbcSMatt Macy } 2083eda14cbcSMatt Macy } 2084eda14cbcSMatt Macy 
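	/*
	 * tgts[] now lists, in column order, every column that was either
	 * an explicit target or found in error; the bad parity columns
	 * come first, so dt below points at the bad data columns.
	 */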
2085eda14cbcSMatt Macy ASSERT(ntgts >= nt); 2086eda14cbcSMatt Macy ASSERT(nbaddata >= 0); 2087eda14cbcSMatt Macy ASSERT(nbaddata + nbadparity == ntgts); 2088eda14cbcSMatt Macy 2089eda14cbcSMatt Macy dt = &tgts[nbadparity]; 2090eda14cbcSMatt Macy 2091eda14cbcSMatt Macy /* Reconstruct using the new math implementation */ 20927877fdebSMatt Macy ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata); 2093eda14cbcSMatt Macy if (ret != RAIDZ_ORIGINAL_IMPL) 2094f9693befSMartin Matuska return; 2095eda14cbcSMatt Macy 2096eda14cbcSMatt Macy /* 2097eda14cbcSMatt Macy * See if we can use any of our optimized reconstruction routines. 2098eda14cbcSMatt Macy */ 2099eda14cbcSMatt Macy switch (nbaddata) { 2100eda14cbcSMatt Macy case 1: 2101f9693befSMartin Matuska if (parity_valid[VDEV_RAIDZ_P]) { 2102f9693befSMartin Matuska vdev_raidz_reconstruct_p(rr, dt, 1); 2103f9693befSMartin Matuska return; 2104f9693befSMartin Matuska } 2105eda14cbcSMatt Macy 21067877fdebSMatt Macy ASSERT(rr->rr_firstdatacol > 1); 2107eda14cbcSMatt Macy 2108f9693befSMartin Matuska if (parity_valid[VDEV_RAIDZ_Q]) { 2109f9693befSMartin Matuska vdev_raidz_reconstruct_q(rr, dt, 1); 2110f9693befSMartin Matuska return; 2111f9693befSMartin Matuska } 2112eda14cbcSMatt Macy 21137877fdebSMatt Macy ASSERT(rr->rr_firstdatacol > 2); 2114eda14cbcSMatt Macy break; 2115eda14cbcSMatt Macy 2116eda14cbcSMatt Macy case 2: 21177877fdebSMatt Macy ASSERT(rr->rr_firstdatacol > 1); 2118eda14cbcSMatt Macy 2119eda14cbcSMatt Macy if (parity_valid[VDEV_RAIDZ_P] && 2120f9693befSMartin Matuska parity_valid[VDEV_RAIDZ_Q]) { 2121f9693befSMartin Matuska vdev_raidz_reconstruct_pq(rr, dt, 2); 2122f9693befSMartin Matuska return; 2123f9693befSMartin Matuska } 2124eda14cbcSMatt Macy 21257877fdebSMatt Macy ASSERT(rr->rr_firstdatacol > 2); 2126eda14cbcSMatt Macy 2127eda14cbcSMatt Macy break; 2128eda14cbcSMatt Macy } 2129eda14cbcSMatt Macy 2130f9693befSMartin Matuska vdev_raidz_reconstruct_general(rr, tgts, ntgts); 2131eda14cbcSMatt Macy } 2132eda14cbcSMatt Macy 2133eda14cbcSMatt Macy static int 2134eda14cbcSMatt Macy vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, 2135eda14cbcSMatt Macy uint64_t *logical_ashift, uint64_t *physical_ashift) 2136eda14cbcSMatt Macy { 21377877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd; 21387877fdebSMatt Macy uint64_t nparity = vdrz->vd_nparity; 2139eda14cbcSMatt Macy int c; 2140eda14cbcSMatt Macy int lasterror = 0; 2141eda14cbcSMatt Macy int numerrors = 0; 2142eda14cbcSMatt Macy 2143eda14cbcSMatt Macy ASSERT(nparity > 0); 2144eda14cbcSMatt Macy 2145eda14cbcSMatt Macy if (nparity > VDEV_RAIDZ_MAXPARITY || 2146eda14cbcSMatt Macy vd->vdev_children < nparity + 1) { 2147eda14cbcSMatt Macy vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 2148eda14cbcSMatt Macy return (SET_ERROR(EINVAL)); 2149eda14cbcSMatt Macy } 2150eda14cbcSMatt Macy 2151eda14cbcSMatt Macy vdev_open_children(vd); 2152eda14cbcSMatt Macy 2153eda14cbcSMatt Macy for (c = 0; c < vd->vdev_children; c++) { 21547877fdebSMatt Macy vdev_t *cvd = vd->vdev_child[c]; 2155eda14cbcSMatt Macy 2156eda14cbcSMatt Macy if (cvd->vdev_open_error != 0) { 2157eda14cbcSMatt Macy lasterror = cvd->vdev_open_error; 2158eda14cbcSMatt Macy numerrors++; 2159eda14cbcSMatt Macy continue; 2160eda14cbcSMatt Macy } 2161eda14cbcSMatt Macy 2162eda14cbcSMatt Macy *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 2163eda14cbcSMatt Macy *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; 2164eda14cbcSMatt Macy *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); 
2165c7046f76SMartin Matuska } 2166c7046f76SMartin Matuska for (c = 0; c < vd->vdev_children; c++) { 2167c7046f76SMartin Matuska vdev_t *cvd = vd->vdev_child[c]; 2168c7046f76SMartin Matuska 2169c7046f76SMartin Matuska if (cvd->vdev_open_error != 0) 2170c7046f76SMartin Matuska continue; 2171c7046f76SMartin Matuska *physical_ashift = vdev_best_ashift(*logical_ashift, 2172c7046f76SMartin Matuska *physical_ashift, cvd->vdev_physical_ashift); 2173eda14cbcSMatt Macy } 2174eda14cbcSMatt Macy 2175e716630dSMartin Matuska if (vd->vdev_rz_expanding) { 2176e716630dSMartin Matuska *asize *= vd->vdev_children - 1; 2177e716630dSMartin Matuska *max_asize *= vd->vdev_children - 1; 2178e716630dSMartin Matuska 2179e716630dSMartin Matuska vd->vdev_min_asize = *asize; 2180e716630dSMartin Matuska } else { 2181eda14cbcSMatt Macy *asize *= vd->vdev_children; 2182eda14cbcSMatt Macy *max_asize *= vd->vdev_children; 2183e716630dSMartin Matuska } 2184eda14cbcSMatt Macy 2185eda14cbcSMatt Macy if (numerrors > nparity) { 2186eda14cbcSMatt Macy vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 2187eda14cbcSMatt Macy return (lasterror); 2188eda14cbcSMatt Macy } 2189eda14cbcSMatt Macy 2190eda14cbcSMatt Macy return (0); 2191eda14cbcSMatt Macy } 2192eda14cbcSMatt Macy 2193eda14cbcSMatt Macy static void 2194eda14cbcSMatt Macy vdev_raidz_close(vdev_t *vd) 2195eda14cbcSMatt Macy { 21967877fdebSMatt Macy for (int c = 0; c < vd->vdev_children; c++) { 21977877fdebSMatt Macy if (vd->vdev_child[c] != NULL) 2198eda14cbcSMatt Macy vdev_close(vd->vdev_child[c]); 2199eda14cbcSMatt Macy } 22007877fdebSMatt Macy } 2201eda14cbcSMatt Macy 2202e716630dSMartin Matuska /* 2203e716630dSMartin Matuska * Return the logical width to use, given the txg in which the allocation 2204783d3ff6SMartin Matuska * happened. Note that BP_GET_BIRTH() is usually the txg in which the 2205e716630dSMartin Matuska * BP was allocated. Remapped BP's (that were relocated due to device 2206783d3ff6SMartin Matuska * removal, see remap_blkptr_cb()), will have a more recent physical birth 2207783d3ff6SMartin Matuska * which reflects when the BP was relocated, but we can ignore these because 2208783d3ff6SMartin Matuska * they can't be on RAIDZ (device removal doesn't support RAIDZ). 
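 *
 * For example (illustrative numbers only): if the expand-txg tree
 * records that a 5-wide raidz grew to 6 children at txg 100, a BP
 * born in txg 90 maps with a logical width of 5 while a BP born in
 * txg 200 maps with a width of 6.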
2209e716630dSMartin Matuska */ 2210eda14cbcSMatt Macy static uint64_t 2211e716630dSMartin Matuska vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg) 2212e716630dSMartin Matuska { 2213e716630dSMartin Matuska reflow_node_t lookup = { 2214e716630dSMartin Matuska .re_txg = txg, 2215e716630dSMartin Matuska }; 2216e716630dSMartin Matuska avl_index_t where; 2217e716630dSMartin Matuska 2218e716630dSMartin Matuska uint64_t width; 2219e716630dSMartin Matuska mutex_enter(&vdrz->vd_expand_lock); 2220e716630dSMartin Matuska reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where); 2221e716630dSMartin Matuska if (re != NULL) { 2222e716630dSMartin Matuska width = re->re_logical_width; 2223e716630dSMartin Matuska } else { 2224e716630dSMartin Matuska re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE); 2225e716630dSMartin Matuska if (re != NULL) 2226e716630dSMartin Matuska width = re->re_logical_width; 2227e716630dSMartin Matuska else 2228e716630dSMartin Matuska width = vdrz->vd_original_width; 2229e716630dSMartin Matuska } 2230e716630dSMartin Matuska mutex_exit(&vdrz->vd_expand_lock); 2231e716630dSMartin Matuska return (width); 2232e716630dSMartin Matuska } 2233e716630dSMartin Matuska 2234e716630dSMartin Matuska /* 2235e716630dSMartin Matuska * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated 2236e716630dSMartin Matuska * more space due to the lower data-to-parity ratio. In this case it's 2237e716630dSMartin Matuska * important to pass in the correct txg. Note that vdev_gang_header_asize() 2238e716630dSMartin Matuska * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE, 2239e716630dSMartin Matuska * regardless of txg. This is assured because for a single data sector, we 2240e716630dSMartin Matuska * allocate P+1 sectors regardless of width ("cols", which is at least P+1). 2241e716630dSMartin Matuska */ 2242e716630dSMartin Matuska static uint64_t 2243e716630dSMartin Matuska vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg) 2244eda14cbcSMatt Macy { 22457877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd; 2246eda14cbcSMatt Macy uint64_t asize; 2247eda14cbcSMatt Macy uint64_t ashift = vd->vdev_top->vdev_ashift; 2248e716630dSMartin Matuska uint64_t cols = vdrz->vd_original_width; 22497877fdebSMatt Macy uint64_t nparity = vdrz->vd_nparity; 2250eda14cbcSMatt Macy 2251e716630dSMartin Matuska cols = vdev_raidz_get_logical_width(vdrz, txg); 2252e716630dSMartin Matuska 2253eda14cbcSMatt Macy asize = ((psize - 1) >> ashift) + 1; 2254eda14cbcSMatt Macy asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 2255eda14cbcSMatt Macy asize = roundup(asize, nparity + 1) << ashift; 2256eda14cbcSMatt Macy 2257e716630dSMartin Matuska #ifdef ZFS_DEBUG 2258e716630dSMartin Matuska uint64_t asize_new = ((psize - 1) >> ashift) + 1; 2259e716630dSMartin Matuska uint64_t ncols_new = vdrz->vd_physical_width; 2260e716630dSMartin Matuska asize_new += nparity * ((asize_new + ncols_new - nparity - 1) / 2261e716630dSMartin Matuska (ncols_new - nparity)); 2262e716630dSMartin Matuska asize_new = roundup(asize_new, nparity + 1) << ashift; 2263e716630dSMartin Matuska VERIFY3U(asize_new, <=, asize); 2264e716630dSMartin Matuska #endif 2265e716630dSMartin Matuska 2266eda14cbcSMatt Macy return (asize); 2267eda14cbcSMatt Macy } 2268eda14cbcSMatt Macy 22697877fdebSMatt Macy /* 22707877fdebSMatt Macy * The allocatable space for a raidz vdev is N * sizeof(smallest child) 22717877fdebSMatt Macy * so each child must provide at least 1/Nth of its asize. 
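 * For illustration: with 6 children and a vdev_min_asize of 100 units,
 * the rounded-up division below requires ceil(100 / 6) = 17 units from
 * each child, so the N children together always cover the vdev's
 * minimum asize.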
22727877fdebSMatt Macy */ 22737877fdebSMatt Macy static uint64_t 22747877fdebSMatt Macy vdev_raidz_min_asize(vdev_t *vd) 22757877fdebSMatt Macy { 22767877fdebSMatt Macy return ((vd->vdev_min_asize + vd->vdev_children - 1) / 22777877fdebSMatt Macy vd->vdev_children); 22787877fdebSMatt Macy } 22797877fdebSMatt Macy 22807877fdebSMatt Macy void 2281eda14cbcSMatt Macy vdev_raidz_child_done(zio_t *zio) 2282eda14cbcSMatt Macy { 2283eda14cbcSMatt Macy raidz_col_t *rc = zio->io_private; 2284eda14cbcSMatt Macy 228581b22a98SMartin Matuska ASSERT3P(rc->rc_abd, !=, NULL); 2286eda14cbcSMatt Macy rc->rc_error = zio->io_error; 2287eda14cbcSMatt Macy rc->rc_tried = 1; 2288eda14cbcSMatt Macy rc->rc_skipped = 0; 2289eda14cbcSMatt Macy } 2290eda14cbcSMatt Macy 2291eda14cbcSMatt Macy static void 2292e716630dSMartin Matuska vdev_raidz_shadow_child_done(zio_t *zio) 2293eda14cbcSMatt Macy { 2294e716630dSMartin Matuska raidz_col_t *rc = zio->io_private; 2295eda14cbcSMatt Macy 2296e716630dSMartin Matuska rc->rc_shadow_error = zio->io_error; 2297e716630dSMartin Matuska } 2298e716630dSMartin Matuska 2299e716630dSMartin Matuska static void 2300e716630dSMartin Matuska vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col) 2301e716630dSMartin Matuska { 2302e716630dSMartin Matuska (void) rm; 2303e716630dSMartin Matuska #ifdef ZFS_DEBUG 23047877fdebSMatt Macy range_seg64_t logical_rs, physical_rs, remain_rs; 23057877fdebSMatt Macy logical_rs.rs_start = rr->rr_offset; 2306eda14cbcSMatt Macy logical_rs.rs_end = logical_rs.rs_start + 2307e716630dSMartin Matuska vdev_raidz_asize(zio->io_vd, rr->rr_size, 2308783d3ff6SMartin Matuska BP_GET_BIRTH(zio->io_bp)); 2309eda14cbcSMatt Macy 23107877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[col]; 2311e716630dSMartin Matuska vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; 2312eda14cbcSMatt Macy 23137877fdebSMatt Macy vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs); 23147877fdebSMatt Macy ASSERT(vdev_xlate_is_empty(&remain_rs)); 2315e716630dSMartin Matuska if (vdev_xlate_is_empty(&physical_rs)) { 2316e716630dSMartin Matuska /* 2317e716630dSMartin Matuska * If we are in the middle of expansion, the 2318e716630dSMartin Matuska * physical->logical mapping is changing so vdev_xlate() 2319e716630dSMartin Matuska * can't give us a reliable answer. 2320e716630dSMartin Matuska */ 2321e716630dSMartin Matuska return; 2322e716630dSMartin Matuska } 2323eda14cbcSMatt Macy ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); 2324eda14cbcSMatt Macy ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); 2325eda14cbcSMatt Macy /* 2326eda14cbcSMatt Macy * It would be nice to assert that rs_end is equal 2327eda14cbcSMatt Macy * to rc_offset + rc_size but there might be an 2328eda14cbcSMatt Macy * optional I/O at the end that is not accounted in 2329eda14cbcSMatt Macy * rc_size. 
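 * When present, that optional I/O is a single skip sector, which is why
 * the check below allows rs_end to exceed rc_offset + rc_size by exactly
 * one sector (1 << ashift), and no more.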
2330eda14cbcSMatt Macy */ 2331eda14cbcSMatt Macy if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { 2332eda14cbcSMatt Macy ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + 2333e716630dSMartin Matuska rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift)); 2334eda14cbcSMatt Macy } else { 2335eda14cbcSMatt Macy ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); 2336eda14cbcSMatt Macy } 2337eda14cbcSMatt Macy #endif 2338eda14cbcSMatt Macy } 2339eda14cbcSMatt Macy 23407877fdebSMatt Macy static void 2341e716630dSMartin Matuska vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr) 23427877fdebSMatt Macy { 23437877fdebSMatt Macy vdev_t *vd = zio->io_vd; 23447877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd; 23457877fdebSMatt Macy 23467877fdebSMatt Macy vdev_raidz_generate_parity_row(rm, rr); 23477877fdebSMatt Macy 234881b22a98SMartin Matuska for (int c = 0; c < rr->rr_scols; c++) { 23497877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 235081b22a98SMartin Matuska vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 23517877fdebSMatt Macy 23527877fdebSMatt Macy /* Verify physical to logical translation */ 2353e716630dSMartin Matuska vdev_raidz_io_verify(zio, rm, rr, c); 23547877fdebSMatt Macy 2355e716630dSMartin Matuska if (rc->rc_size == 0) 2356e716630dSMartin Matuska continue; 2357e716630dSMartin Matuska 2358e716630dSMartin Matuska ASSERT3U(rc->rc_offset + rc->rc_size, <, 2359e716630dSMartin Matuska cvd->vdev_psize - VDEV_LABEL_END_SIZE); 2360e716630dSMartin Matuska 236181b22a98SMartin Matuska ASSERT3P(rc->rc_abd, !=, NULL); 23627877fdebSMatt Macy zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 236381b22a98SMartin Matuska rc->rc_offset, rc->rc_abd, 236481b22a98SMartin Matuska abd_get_size(rc->rc_abd), zio->io_type, 236581b22a98SMartin Matuska zio->io_priority, 0, vdev_raidz_child_done, rc)); 2366e716630dSMartin Matuska 2367e716630dSMartin Matuska if (rc->rc_shadow_devidx != INT_MAX) { 2368e716630dSMartin Matuska vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx]; 2369e716630dSMartin Matuska 2370e716630dSMartin Matuska ASSERT3U( 2371e716630dSMartin Matuska rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <, 2372e716630dSMartin Matuska cvd2->vdev_psize - VDEV_LABEL_END_SIZE); 2373e716630dSMartin Matuska 2374e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(zio, NULL, cvd2, 2375e716630dSMartin Matuska rc->rc_shadow_offset, rc->rc_abd, 2376e716630dSMartin Matuska abd_get_size(rc->rc_abd), 2377e716630dSMartin Matuska zio->io_type, zio->io_priority, 0, 2378e716630dSMartin Matuska vdev_raidz_shadow_child_done, rc)); 237981b22a98SMartin Matuska } 23807877fdebSMatt Macy } 23817877fdebSMatt Macy } 23827877fdebSMatt Macy 2383e716630dSMartin Matuska /* 2384e716630dSMartin Matuska * Generate optional I/Os for skip sectors to improve aggregation contiguity. 2385e716630dSMartin Matuska * This only works for vdev_raidz_map_alloc() (not _expanded()). 
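 * For example, on a 5-wide raidz1 with ashift=12, an 8 KiB block uses two
 * data sectors plus one parity sector; the allocation is rounded up to a
 * multiple of nparity + 1 sectors (3 -> 4), so the remaining skip sector
 * gets a data-less, optional one-sector child I/O below, letting adjacent
 * writes still be aggregated into one larger I/O.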
2386e716630dSMartin Matuska */ 23877877fdebSMatt Macy static void 2388e716630dSMartin Matuska raidz_start_skip_writes(zio_t *zio) 2389e716630dSMartin Matuska { 2390e716630dSMartin Matuska vdev_t *vd = zio->io_vd; 2391e716630dSMartin Matuska uint64_t ashift = vd->vdev_top->vdev_ashift; 2392e716630dSMartin Matuska raidz_map_t *rm = zio->io_vsd; 2393e716630dSMartin Matuska ASSERT3U(rm->rm_nrows, ==, 1); 2394e716630dSMartin Matuska raidz_row_t *rr = rm->rm_row[0]; 2395e716630dSMartin Matuska for (int c = 0; c < rr->rr_scols; c++) { 2396e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c]; 2397e716630dSMartin Matuska vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 2398e716630dSMartin Matuska if (rc->rc_size != 0) 2399e716630dSMartin Matuska continue; 2400e716630dSMartin Matuska ASSERT3P(rc->rc_abd, ==, NULL); 2401e716630dSMartin Matuska 2402e716630dSMartin Matuska ASSERT3U(rc->rc_offset, <, 2403e716630dSMartin Matuska cvd->vdev_psize - VDEV_LABEL_END_SIZE); 2404e716630dSMartin Matuska 2405e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, 2406e716630dSMartin Matuska NULL, 1ULL << ashift, zio->io_type, zio->io_priority, 2407e716630dSMartin Matuska ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); 2408e716630dSMartin Matuska } 2409e716630dSMartin Matuska } 2410e716630dSMartin Matuska 2411e716630dSMartin Matuska static void 2412e716630dSMartin Matuska vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) 24137877fdebSMatt Macy { 24147877fdebSMatt Macy vdev_t *vd = zio->io_vd; 24157877fdebSMatt Macy 24167877fdebSMatt Macy /* 24177877fdebSMatt Macy * Iterate over the columns in reverse order so that we hit the parity 24187877fdebSMatt Macy * last -- any errors along the way will force us to read the parity. 
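 * If a data column turns out to be missing or unreadable, rr_missingdata
 * is already nonzero by the time the parity columns are reached, so the
 * check below issues reads for them as well.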
24197877fdebSMatt Macy */ 24207877fdebSMatt Macy for (int c = rr->rr_cols - 1; c >= 0; c--) { 24217877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 24227877fdebSMatt Macy if (rc->rc_size == 0) 24237877fdebSMatt Macy continue; 24247877fdebSMatt Macy vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 24257877fdebSMatt Macy if (!vdev_readable(cvd)) { 24267877fdebSMatt Macy if (c >= rr->rr_firstdatacol) 24277877fdebSMatt Macy rr->rr_missingdata++; 24287877fdebSMatt Macy else 24297877fdebSMatt Macy rr->rr_missingparity++; 24307877fdebSMatt Macy rc->rc_error = SET_ERROR(ENXIO); 24317877fdebSMatt Macy rc->rc_tried = 1; /* don't even try */ 24327877fdebSMatt Macy rc->rc_skipped = 1; 24337877fdebSMatt Macy continue; 24347877fdebSMatt Macy } 24357877fdebSMatt Macy if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 24367877fdebSMatt Macy if (c >= rr->rr_firstdatacol) 24377877fdebSMatt Macy rr->rr_missingdata++; 24387877fdebSMatt Macy else 24397877fdebSMatt Macy rr->rr_missingparity++; 24407877fdebSMatt Macy rc->rc_error = SET_ERROR(ESTALE); 24417877fdebSMatt Macy rc->rc_skipped = 1; 24427877fdebSMatt Macy continue; 24437877fdebSMatt Macy } 2444e716630dSMartin Matuska if (forceparity || 2445e716630dSMartin Matuska c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || 24467877fdebSMatt Macy (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { 24477877fdebSMatt Macy zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 24487877fdebSMatt Macy rc->rc_offset, rc->rc_abd, rc->rc_size, 24497877fdebSMatt Macy zio->io_type, zio->io_priority, 0, 24507877fdebSMatt Macy vdev_raidz_child_done, rc)); 24517877fdebSMatt Macy } 24527877fdebSMatt Macy } 24537877fdebSMatt Macy } 24547877fdebSMatt Macy 2455e716630dSMartin Matuska static void 2456e716630dSMartin Matuska vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm) 2457e716630dSMartin Matuska { 2458e716630dSMartin Matuska vdev_t *vd = zio->io_vd; 2459e716630dSMartin Matuska 2460e716630dSMartin Matuska for (int i = 0; i < rm->rm_nphys_cols; i++) { 2461e716630dSMartin Matuska raidz_col_t *prc = &rm->rm_phys_col[i]; 2462e716630dSMartin Matuska if (prc->rc_size == 0) 2463e716630dSMartin Matuska continue; 2464e716630dSMartin Matuska 2465e716630dSMartin Matuska ASSERT3U(prc->rc_devidx, ==, i); 2466e716630dSMartin Matuska vdev_t *cvd = vd->vdev_child[i]; 2467e716630dSMartin Matuska if (!vdev_readable(cvd)) { 2468e716630dSMartin Matuska prc->rc_error = SET_ERROR(ENXIO); 2469e716630dSMartin Matuska prc->rc_tried = 1; /* don't even try */ 2470e716630dSMartin Matuska prc->rc_skipped = 1; 2471e716630dSMartin Matuska continue; 2472e716630dSMartin Matuska } 2473e716630dSMartin Matuska if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 2474e716630dSMartin Matuska prc->rc_error = SET_ERROR(ESTALE); 2475e716630dSMartin Matuska prc->rc_skipped = 1; 2476e716630dSMartin Matuska continue; 2477e716630dSMartin Matuska } 2478e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2479e716630dSMartin Matuska prc->rc_offset, prc->rc_abd, prc->rc_size, 2480e716630dSMartin Matuska zio->io_type, zio->io_priority, 0, 2481e716630dSMartin Matuska vdev_raidz_child_done, prc)); 2482e716630dSMartin Matuska } 2483e716630dSMartin Matuska } 2484e716630dSMartin Matuska 2485e716630dSMartin Matuska static void 2486e716630dSMartin Matuska vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm) 2487e716630dSMartin Matuska { 2488e716630dSMartin Matuska /* 2489e716630dSMartin Matuska * If there are multiple rows, we will be hitting 2490e716630dSMartin Matuska * all disks, so go 
ahead and read the parity so 2491e716630dSMartin Matuska * that we are reading in decent size chunks. 2492e716630dSMartin Matuska */ 2493e716630dSMartin Matuska boolean_t forceparity = rm->rm_nrows > 1; 2494e716630dSMartin Matuska 2495e716630dSMartin Matuska if (rm->rm_phys_col) { 2496e716630dSMartin Matuska vdev_raidz_io_start_read_phys_cols(zio, rm); 2497e716630dSMartin Matuska } else { 2498e716630dSMartin Matuska for (int i = 0; i < rm->rm_nrows; i++) { 2499e716630dSMartin Matuska raidz_row_t *rr = rm->rm_row[i]; 2500e716630dSMartin Matuska vdev_raidz_io_start_read_row(zio, rr, forceparity); 2501e716630dSMartin Matuska } 2502e716630dSMartin Matuska } 2503e716630dSMartin Matuska } 2504e716630dSMartin Matuska 2505eda14cbcSMatt Macy /* 2506eda14cbcSMatt Macy * Start an IO operation on a RAIDZ VDev 2507eda14cbcSMatt Macy * 2508eda14cbcSMatt Macy * Outline: 2509eda14cbcSMatt Macy * - For write operations: 2510eda14cbcSMatt Macy * 1. Generate the parity data 2511eda14cbcSMatt Macy * 2. Create child zio write operations to each column's vdev, for both 2512eda14cbcSMatt Macy * data and parity. 2513eda14cbcSMatt Macy * 3. If the column skips any sectors for padding, create optional dummy 2514eda14cbcSMatt Macy * write zio children for those areas to improve aggregation continuity. 2515eda14cbcSMatt Macy * - For read operations: 2516eda14cbcSMatt Macy * 1. Create child zio read operations to each data column's vdev to read 2517eda14cbcSMatt Macy * the range of data required for zio. 2518eda14cbcSMatt Macy * 2. If this is a scrub or resilver operation, or if any of the data 2519eda14cbcSMatt Macy * vdevs have had errors, then create zio read operations to the parity 2520eda14cbcSMatt Macy * columns' VDevs as well. 2521eda14cbcSMatt Macy */ 2522eda14cbcSMatt Macy static void 2523eda14cbcSMatt Macy vdev_raidz_io_start(zio_t *zio) 2524eda14cbcSMatt Macy { 2525eda14cbcSMatt Macy vdev_t *vd = zio->io_vd; 2526eda14cbcSMatt Macy vdev_t *tvd = vd->vdev_top; 25277877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd; 2528e716630dSMartin Matuska raidz_map_t *rm; 2529eda14cbcSMatt Macy 2530e716630dSMartin Matuska uint64_t logical_width = vdev_raidz_get_logical_width(vdrz, 2531783d3ff6SMartin Matuska BP_GET_BIRTH(zio->io_bp)); 2532e716630dSMartin Matuska if (logical_width != vdrz->vd_physical_width) { 2533e716630dSMartin Matuska zfs_locked_range_t *lr = NULL; 2534e716630dSMartin Matuska uint64_t synced_offset = UINT64_MAX; 2535e716630dSMartin Matuska uint64_t next_offset = UINT64_MAX; 2536e716630dSMartin Matuska boolean_t use_scratch = B_FALSE; 2537e716630dSMartin Matuska /* 2538e716630dSMartin Matuska * Note: when the expansion is completing, we set 2539e716630dSMartin Matuska * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync()) 2540e716630dSMartin Matuska * in a later txg than when we last update spa_ubsync's state 2541e716630dSMartin Matuska * (see the end of spa_raidz_expand_thread()). Therefore we 2542e716630dSMartin Matuska * may see vre_state!=SCANNING before 2543e716630dSMartin Matuska * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected 2544e716630dSMartin Matuska * on disk, but the copying progress has been synced to disk 2545e716630dSMartin Matuska * (and reflected in spa_ubsync). In this case it's fine to 2546e716630dSMartin Matuska * treat the expansion as completed, since if we crash there's 2547e716630dSMartin Matuska * no additional copying to do. 
2548e716630dSMartin Matuska */ 2549e716630dSMartin Matuska if (vdrz->vn_vre.vre_state == DSS_SCANNING) { 2550e716630dSMartin Matuska ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==, 2551e716630dSMartin Matuska &vdrz->vn_vre); 2552e716630dSMartin Matuska lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock, 2553e716630dSMartin Matuska zio->io_offset, zio->io_size, RL_READER); 2554e716630dSMartin Matuska use_scratch = 2555e716630dSMartin Matuska (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) == 2556e716630dSMartin Matuska RRSS_SCRATCH_VALID); 2557e716630dSMartin Matuska synced_offset = 2558e716630dSMartin Matuska RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync); 2559e716630dSMartin Matuska next_offset = vdrz->vn_vre.vre_offset; 2560e716630dSMartin Matuska /* 2561e716630dSMartin Matuska * If we haven't resumed expanding since importing the 2562e716630dSMartin Matuska * pool, vre_offset won't have been set yet. In 2563e716630dSMartin Matuska * this case the next offset to be copied is the same 2564e716630dSMartin Matuska * as what was synced. 2565e716630dSMartin Matuska */ 2566e716630dSMartin Matuska if (next_offset == UINT64_MAX) { 2567e716630dSMartin Matuska next_offset = synced_offset; 2568e716630dSMartin Matuska } 2569e716630dSMartin Matuska } 2570e716630dSMartin Matuska if (use_scratch) { 2571e716630dSMartin Matuska zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced=" 2572e716630dSMartin Matuska "%lld next_offset=%lld use_scratch=%u", 2573e716630dSMartin Matuska zio, 2574e716630dSMartin Matuska zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ", 2575e716630dSMartin Matuska (long long)zio->io_offset, 2576e716630dSMartin Matuska (long long)synced_offset, 2577e716630dSMartin Matuska (long long)next_offset, 2578e716630dSMartin Matuska use_scratch); 2579e716630dSMartin Matuska } 2580e716630dSMartin Matuska 2581e716630dSMartin Matuska rm = vdev_raidz_map_alloc_expanded(zio, 2582e716630dSMartin Matuska tvd->vdev_ashift, vdrz->vd_physical_width, 2583e716630dSMartin Matuska logical_width, vdrz->vd_nparity, 2584e716630dSMartin Matuska synced_offset, next_offset, use_scratch); 2585e716630dSMartin Matuska rm->rm_lr = lr; 2586e716630dSMartin Matuska } else { 2587e716630dSMartin Matuska rm = vdev_raidz_map_alloc(zio, 2588e716630dSMartin Matuska tvd->vdev_ashift, logical_width, vdrz->vd_nparity); 2589e716630dSMartin Matuska } 2590e716630dSMartin Matuska rm->rm_original_width = vdrz->vd_original_width; 2591e716630dSMartin Matuska 2592f9693befSMartin Matuska zio->io_vsd = rm; 2593f9693befSMartin Matuska zio->io_vsd_ops = &vdev_raidz_vsd_ops; 2594eda14cbcSMatt Macy if (zio->io_type == ZIO_TYPE_WRITE) { 2595e716630dSMartin Matuska for (int i = 0; i < rm->rm_nrows; i++) { 2596e716630dSMartin Matuska vdev_raidz_io_start_write(zio, rm->rm_row[i]); 2597e716630dSMartin Matuska } 2598e716630dSMartin Matuska 2599e716630dSMartin Matuska if (logical_width == vdrz->vd_physical_width) { 2600e716630dSMartin Matuska raidz_start_skip_writes(zio); 2601e716630dSMartin Matuska } 26027877fdebSMatt Macy } else { 2603eda14cbcSMatt Macy ASSERT(zio->io_type == ZIO_TYPE_READ); 2604e716630dSMartin Matuska vdev_raidz_io_start_read(zio, rm); 2605eda14cbcSMatt Macy } 2606eda14cbcSMatt Macy 2607eda14cbcSMatt Macy zio_execute(zio); 2608eda14cbcSMatt Macy } 2609eda14cbcSMatt Macy 2610eda14cbcSMatt Macy /* 2611eda14cbcSMatt Macy * Report a checksum error for a child of a RAID-Z device. 
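 * The error is counted against the child vdev's checksum-error statistics
 * and a checksum ereport is posted, unless the I/O was speculative or was
 * issued at rebuild priority (sequential resilver).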
2612eda14cbcSMatt Macy */ 2613e92ffd9bSMartin Matuska void 2614e92ffd9bSMartin Matuska vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) 2615eda14cbcSMatt Macy { 2616eda14cbcSMatt Macy vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 2617eda14cbcSMatt Macy 26187877fdebSMatt Macy if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) && 26197877fdebSMatt Macy zio->io_priority != ZIO_PRIORITY_REBUILD) { 2620eda14cbcSMatt Macy zio_bad_cksum_t zbc; 2621eda14cbcSMatt Macy raidz_map_t *rm = zio->io_vsd; 2622eda14cbcSMatt Macy 2623eda14cbcSMatt Macy zbc.zbc_has_cksum = 0; 2624eda14cbcSMatt Macy zbc.zbc_injected = rm->rm_ecksuminjected; 2625eda14cbcSMatt Macy 26262c48331dSMatt Macy mutex_enter(&vd->vdev_stat_lock); 26272c48331dSMatt Macy vd->vdev_stat.vs_checksum_errors++; 26282c48331dSMatt Macy mutex_exit(&vd->vdev_stat_lock); 2629bb2d13b6SMartin Matuska (void) zfs_ereport_post_checksum(zio->io_spa, vd, 2630bb2d13b6SMartin Matuska &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, 2631bb2d13b6SMartin Matuska rc->rc_abd, bad_data, &zbc); 26322c48331dSMatt Macy } 2633eda14cbcSMatt Macy } 2634eda14cbcSMatt Macy 2635eda14cbcSMatt Macy /* 2636eda14cbcSMatt Macy * We keep track of whether or not there were any injected errors, so that 2637eda14cbcSMatt Macy * any ereports we generate can note it. 2638eda14cbcSMatt Macy */ 2639eda14cbcSMatt Macy static int 2640eda14cbcSMatt Macy raidz_checksum_verify(zio_t *zio) 2641eda14cbcSMatt Macy { 2642315ee00fSMartin Matuska zio_bad_cksum_t zbc = {0}; 2643eda14cbcSMatt Macy raidz_map_t *rm = zio->io_vsd; 2644eda14cbcSMatt Macy 2645eda14cbcSMatt Macy int ret = zio_checksum_error(zio, &zbc); 2646*87bf66d4SMartin Matuska /* 2647*87bf66d4SMartin Matuska * Any Direct I/O read that has a checksum error must be treated as 2648*87bf66d4SMartin Matuska * suspicious as the contents of the buffer could be getting 2649*87bf66d4SMartin Matuska * manipulated while the I/O is taking place. The checksum verify error 2650*87bf66d4SMartin Matuska * will be reported to the top-level RAIDZ VDEV. 2651*87bf66d4SMartin Matuska */ 2652*87bf66d4SMartin Matuska if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) { 2653*87bf66d4SMartin Matuska zio->io_error = ret; 2654*87bf66d4SMartin Matuska zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; 2655*87bf66d4SMartin Matuska zio_dio_chksum_verify_error_report(zio); 2656*87bf66d4SMartin Matuska zio_checksum_verified(zio); 2657*87bf66d4SMartin Matuska return (0); 2658*87bf66d4SMartin Matuska } 2659*87bf66d4SMartin Matuska 2660eda14cbcSMatt Macy if (ret != 0 && zbc.zbc_injected != 0) 2661eda14cbcSMatt Macy rm->rm_ecksuminjected = 1; 2662eda14cbcSMatt Macy 2663eda14cbcSMatt Macy return (ret); 2664eda14cbcSMatt Macy } 2665eda14cbcSMatt Macy 2666eda14cbcSMatt Macy /* 2667eda14cbcSMatt Macy * Generate the parity from the data columns. If we tried and were able to 2668eda14cbcSMatt Macy * read the parity without error, verify that the generated parity matches the 2669eda14cbcSMatt Macy * data we read. If it doesn't, we fire off a checksum error. Return the 26707877fdebSMatt Macy * number of such failures. 
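 * Each parity column whose on-disk contents differ from the regenerated
 * parity is marked with ECKSUM and counted as a failure.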
2671eda14cbcSMatt Macy */ 2672eda14cbcSMatt Macy static int 26737877fdebSMatt Macy raidz_parity_verify(zio_t *zio, raidz_row_t *rr) 2674eda14cbcSMatt Macy { 2675eda14cbcSMatt Macy abd_t *orig[VDEV_RAIDZ_MAXPARITY]; 2676eda14cbcSMatt Macy int c, ret = 0; 26777877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd; 2678eda14cbcSMatt Macy raidz_col_t *rc; 2679eda14cbcSMatt Macy 2680eda14cbcSMatt Macy blkptr_t *bp = zio->io_bp; 2681eda14cbcSMatt Macy enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : 2682eda14cbcSMatt Macy (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); 2683eda14cbcSMatt Macy 2684eda14cbcSMatt Macy if (checksum == ZIO_CHECKSUM_NOPARITY) 2685eda14cbcSMatt Macy return (ret); 2686eda14cbcSMatt Macy 26877877fdebSMatt Macy for (c = 0; c < rr->rr_firstdatacol; c++) { 26887877fdebSMatt Macy rc = &rr->rr_col[c]; 2689eda14cbcSMatt Macy if (!rc->rc_tried || rc->rc_error != 0) 2690eda14cbcSMatt Macy continue; 2691eda14cbcSMatt Macy 2692a0b956f5SMartin Matuska orig[c] = rc->rc_abd; 2693a0b956f5SMartin Matuska ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size); 2694a0b956f5SMartin Matuska rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); 2695eda14cbcSMatt Macy } 2696eda14cbcSMatt Macy 26977877fdebSMatt Macy /* 2698e92ffd9bSMartin Matuska * Verify any empty sectors are zero filled to ensure the parity 2699e92ffd9bSMartin Matuska * is calculated correctly even if these non-data sectors are damaged. 2700e92ffd9bSMartin Matuska */ 2701e92ffd9bSMartin Matuska if (rr->rr_nempty && rr->rr_abd_empty != NULL) 2702e92ffd9bSMartin Matuska ret += vdev_draid_map_verify_empty(zio, rr); 2703e92ffd9bSMartin Matuska 2704e92ffd9bSMartin Matuska /* 27057877fdebSMatt Macy * Regenerates parity even for !tried||rc_error!=0 columns. This 27067877fdebSMatt Macy * isn't harmful but it does have the side effect of fixing stuff 27077877fdebSMatt Macy * we didn't realize was necessary (i.e. even if we return 0). 
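 * In other words, the parity buffers of untried or errored columns are
 * also overwritten with freshly computed parity, even though they are
 * not compared against the on-disk contents or counted in the return
 * value.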
27087877fdebSMatt Macy */ 27097877fdebSMatt Macy vdev_raidz_generate_parity_row(rm, rr); 2710eda14cbcSMatt Macy 27117877fdebSMatt Macy for (c = 0; c < rr->rr_firstdatacol; c++) { 27127877fdebSMatt Macy rc = &rr->rr_col[c]; 27137877fdebSMatt Macy 2714eda14cbcSMatt Macy if (!rc->rc_tried || rc->rc_error != 0) 2715eda14cbcSMatt Macy continue; 27167877fdebSMatt Macy 2717eda14cbcSMatt Macy if (abd_cmp(orig[c], rc->rc_abd) != 0) { 2718e716630dSMartin Matuska zfs_dbgmsg("found error on col=%u devidx=%u off %llx", 2719e716630dSMartin Matuska c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset); 2720e92ffd9bSMartin Matuska vdev_raidz_checksum_error(zio, rc, orig[c]); 2721eda14cbcSMatt Macy rc->rc_error = SET_ERROR(ECKSUM); 2722eda14cbcSMatt Macy ret++; 2723eda14cbcSMatt Macy } 2724eda14cbcSMatt Macy abd_free(orig[c]); 2725eda14cbcSMatt Macy } 2726eda14cbcSMatt Macy 2727eda14cbcSMatt Macy return (ret); 2728eda14cbcSMatt Macy } 2729eda14cbcSMatt Macy 2730eda14cbcSMatt Macy static int 27317877fdebSMatt Macy vdev_raidz_worst_error(raidz_row_t *rr) 2732eda14cbcSMatt Macy { 2733eda14cbcSMatt Macy int error = 0; 2734eda14cbcSMatt Macy 2735e716630dSMartin Matuska for (int c = 0; c < rr->rr_cols; c++) { 27367877fdebSMatt Macy error = zio_worst_error(error, rr->rr_col[c].rc_error); 2737e716630dSMartin Matuska error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error); 2738e716630dSMartin Matuska } 2739eda14cbcSMatt Macy 2740eda14cbcSMatt Macy return (error); 2741eda14cbcSMatt Macy } 2742eda14cbcSMatt Macy 2743eda14cbcSMatt Macy static void 27447877fdebSMatt Macy vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) 2745eda14cbcSMatt Macy { 2746eda14cbcSMatt Macy int unexpected_errors = 0; 2747eda14cbcSMatt Macy int parity_errors = 0; 2748eda14cbcSMatt Macy int parity_untried = 0; 2749eda14cbcSMatt Macy int data_errors = 0; 2750eda14cbcSMatt Macy 27517877fdebSMatt Macy ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); 2752eda14cbcSMatt Macy 27537877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 27547877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 2755eda14cbcSMatt Macy 2756eda14cbcSMatt Macy if (rc->rc_error) { 27577877fdebSMatt Macy if (c < rr->rr_firstdatacol) 2758eda14cbcSMatt Macy parity_errors++; 2759eda14cbcSMatt Macy else 2760eda14cbcSMatt Macy data_errors++; 2761eda14cbcSMatt Macy 2762eda14cbcSMatt Macy if (!rc->rc_skipped) 2763eda14cbcSMatt Macy unexpected_errors++; 27647877fdebSMatt Macy } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { 2765eda14cbcSMatt Macy parity_untried++; 2766eda14cbcSMatt Macy } 2767a0b956f5SMartin Matuska 2768a0b956f5SMartin Matuska if (rc->rc_force_repair) 2769a0b956f5SMartin Matuska unexpected_errors++; 2770eda14cbcSMatt Macy } 2771eda14cbcSMatt Macy 2772eda14cbcSMatt Macy /* 27737877fdebSMatt Macy * If we read more parity disks than were used for 27747877fdebSMatt Macy * reconstruction, confirm that the other parity disks produced 27757877fdebSMatt Macy * correct data. 27767877fdebSMatt Macy * 27777877fdebSMatt Macy * Note that we also regenerate parity when resilvering so we 27787877fdebSMatt Macy * can write it out to failed devices later. 
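 * For example, if a raidz2 read reconstructed a single data column using
 * P but Q was also read successfully, Q is verified here against the
 * parity regenerated from the repaired data.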
27797877fdebSMatt Macy */ 27807877fdebSMatt Macy if (parity_errors + parity_untried < 27817877fdebSMatt Macy rr->rr_firstdatacol - data_errors || 27827877fdebSMatt Macy (zio->io_flags & ZIO_FLAG_RESILVER)) { 27837877fdebSMatt Macy int n = raidz_parity_verify(zio, rr); 27847877fdebSMatt Macy unexpected_errors += n; 27857877fdebSMatt Macy } 27867877fdebSMatt Macy 27877877fdebSMatt Macy if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 27887877fdebSMatt Macy (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) { 27897877fdebSMatt Macy /* 27907877fdebSMatt Macy * Use the good data we have in hand to repair damaged children. 27917877fdebSMatt Macy */ 27927877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 27937877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 27947877fdebSMatt Macy vdev_t *vd = zio->io_vd; 27957877fdebSMatt Macy vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 27967877fdebSMatt Macy 279716038816SMartin Matuska if (!rc->rc_allow_repair) { 279816038816SMartin Matuska continue; 279916038816SMartin Matuska } else if (!rc->rc_force_repair && 280016038816SMartin Matuska (rc->rc_error == 0 || rc->rc_size == 0)) { 28017877fdebSMatt Macy continue; 28027877fdebSMatt Macy } 2803*87bf66d4SMartin Matuska /* 2804*87bf66d4SMartin Matuska * We do not allow self healing for Direct I/O reads. 2805*87bf66d4SMartin Matuska * See comment in vdev_raid_row_alloc(). 2806*87bf66d4SMartin Matuska */ 2807*87bf66d4SMartin Matuska ASSERT0(zio->io_flags & ZIO_FLAG_DIO_READ); 28087877fdebSMatt Macy 2809e716630dSMartin Matuska zfs_dbgmsg("zio=%px repairing c=%u devidx=%u " 2810e716630dSMartin Matuska "offset=%llx", 2811e716630dSMartin Matuska zio, c, rc->rc_devidx, (long long)rc->rc_offset); 2812e716630dSMartin Matuska 28137877fdebSMatt Macy zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 28147877fdebSMatt Macy rc->rc_offset, rc->rc_abd, rc->rc_size, 28157877fdebSMatt Macy ZIO_TYPE_WRITE, 28167877fdebSMatt Macy zio->io_priority == ZIO_PRIORITY_REBUILD ? 28177877fdebSMatt Macy ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE, 28187877fdebSMatt Macy ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 28197877fdebSMatt Macy ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); 28207877fdebSMatt Macy } 28217877fdebSMatt Macy } 2822e716630dSMartin Matuska 2823e716630dSMartin Matuska /* 2824e716630dSMartin Matuska * Scrub or resilver i/o's: overwrite any shadow locations with the 2825e716630dSMartin Matuska * good data. This ensures that if we've already copied this sector, 2826e716630dSMartin Matuska * it will be corrected if it was damaged. This writes more than is 2827e716630dSMartin Matuska * necessary, but since expansion is paused during scrub/resilver, at 2828e716630dSMartin Matuska * most a single row will have a shadow location. 
2829e716630dSMartin Matuska */
2830e716630dSMartin Matuska if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2831e716630dSMartin Matuska (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) {
2832e716630dSMartin Matuska for (int c = 0; c < rr->rr_cols; c++) {
2833e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c];
2834e716630dSMartin Matuska vdev_t *vd = zio->io_vd;
2835e716630dSMartin Matuska 
2836e716630dSMartin Matuska if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0)
2837e716630dSMartin Matuska continue;
2838e716630dSMartin Matuska vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx];
2839e716630dSMartin Matuska 
2840e716630dSMartin Matuska /*
2841e716630dSMartin Matuska * Note: We don't want to update the repair stats
2842e716630dSMartin Matuska * because that would incorrectly indicate that there
2843e716630dSMartin Matuska * was bad data to repair, which we aren't sure about.
2844e716630dSMartin Matuska * By clearing the SCAN_THREAD flag, we prevent this
2845e716630dSMartin Matuska * from happening, despite having the REPAIR flag set.
2846e716630dSMartin Matuska * We need to set SELF_HEAL so that this i/o can't be
2847e716630dSMartin Matuska * bypassed by zio_vdev_io_start().
2848e716630dSMartin Matuska */
2849e716630dSMartin Matuska zio_t *cio = zio_vdev_child_io(zio, NULL, cvd,
2850e716630dSMartin Matuska rc->rc_shadow_offset, rc->rc_abd, rc->rc_size,
2851e716630dSMartin Matuska ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2852e716630dSMartin Matuska ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
2853e716630dSMartin Matuska NULL, NULL);
2854e716630dSMartin Matuska cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD;
2855e716630dSMartin Matuska zio_nowait(cio);
2856e716630dSMartin Matuska }
2857e716630dSMartin Matuska }
28587877fdebSMatt Macy }
28597877fdebSMatt Macy 
28607877fdebSMatt Macy static void
28617877fdebSMatt Macy raidz_restore_orig_data(raidz_map_t *rm)
28627877fdebSMatt Macy {
28637877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) {
28647877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i];
28657877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) {
28667877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c];
28677877fdebSMatt Macy if (rc->rc_need_orig_restore) {
2868f9693befSMartin Matuska abd_copy(rc->rc_abd,
28697877fdebSMatt Macy rc->rc_orig_data, rc->rc_size);
28707877fdebSMatt Macy rc->rc_need_orig_restore = B_FALSE;
28717877fdebSMatt Macy }
28727877fdebSMatt Macy }
28737877fdebSMatt Macy }
28747877fdebSMatt Macy }
28757877fdebSMatt Macy 
28767877fdebSMatt Macy /*
2877e716630dSMartin Matuska * During raidz_reconstruct() for an expanded VDEV, we need special consideration
2878e716630dSMartin Matuska * for failure simulations. See note in raidz_reconstruct() on simulating failure
2879e716630dSMartin Matuska * of a pre-expansion device.
2880e716630dSMartin Matuska *
2881e716630dSMartin Matuska * Treating logical child i as failed, return TRUE if the given column should
2882e716630dSMartin Matuska * be treated as failed. The idea of logical children allows us to imagine
2883e716630dSMartin Matuska * that a disk silently failed before a RAIDZ expansion (reads from this disk
2884e716630dSMartin Matuska * succeed but return the wrong data). Since the expansion doesn't verify
2885e716630dSMartin Matuska * checksums, the incorrect data will be moved to new locations spread among
2886e716630dSMartin Matuska * the children (going diagonally across them).
2887e716630dSMartin Matuska * 2888e716630dSMartin Matuska * Higher "logical child failures" (values of `i`) indicate these 2889e716630dSMartin Matuska * "pre-expansion failures". The first physical_width values imagine that a 2890e716630dSMartin Matuska * current child failed; the next physical_width-1 values imagine that a 2891e716630dSMartin Matuska * child failed before the most recent expansion; the next physical_width-2 2892e716630dSMartin Matuska * values imagine a child failed in the expansion before that, etc. 2893e716630dSMartin Matuska */ 2894e716630dSMartin Matuska static boolean_t 2895e716630dSMartin Matuska raidz_simulate_failure(int physical_width, int original_width, int ashift, 2896e716630dSMartin Matuska int i, raidz_col_t *rc) 2897e716630dSMartin Matuska { 2898e716630dSMartin Matuska uint64_t sector_id = 2899e716630dSMartin Matuska physical_width * (rc->rc_offset >> ashift) + 2900e716630dSMartin Matuska rc->rc_devidx; 2901e716630dSMartin Matuska 2902e716630dSMartin Matuska for (int w = physical_width; w >= original_width; w--) { 2903e716630dSMartin Matuska if (i < w) { 2904e716630dSMartin Matuska return (sector_id % w == i); 2905e716630dSMartin Matuska } else { 2906e716630dSMartin Matuska i -= w; 2907e716630dSMartin Matuska } 2908e716630dSMartin Matuska } 2909e716630dSMartin Matuska ASSERT(!"invalid logical child id"); 2910e716630dSMartin Matuska return (B_FALSE); 2911e716630dSMartin Matuska } 2912e716630dSMartin Matuska 2913e716630dSMartin Matuska /* 29147877fdebSMatt Macy * returns EINVAL if reconstruction of the block will not be possible 29157877fdebSMatt Macy * returns ECKSUM if this specific reconstruction failed 29167877fdebSMatt Macy * returns 0 on successful reconstruction 29177877fdebSMatt Macy */ 29187877fdebSMatt Macy static int 29197877fdebSMatt Macy raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) 29207877fdebSMatt Macy { 29217877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd; 2922e716630dSMartin Matuska int physical_width = zio->io_vd->vdev_children; 2923e716630dSMartin Matuska int original_width = (rm->rm_original_width != 0) ? 
2924e716630dSMartin Matuska rm->rm_original_width : physical_width; 2925e716630dSMartin Matuska int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT; 2926e716630dSMartin Matuska 2927e716630dSMartin Matuska if (dbgmsg) { 2928e716630dSMartin Matuska zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u " 2929e716630dSMartin Matuska "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts); 2930e716630dSMartin Matuska } 29317877fdebSMatt Macy 29327877fdebSMatt Macy /* Reconstruct each row */ 29337877fdebSMatt Macy for (int r = 0; r < rm->rm_nrows; r++) { 29347877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[r]; 29357877fdebSMatt Macy int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */ 29367877fdebSMatt Macy int t = 0; 29377877fdebSMatt Macy int dead = 0; 29387877fdebSMatt Macy int dead_data = 0; 29397877fdebSMatt Macy 2940e716630dSMartin Matuska if (dbgmsg) 2941e716630dSMartin Matuska zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r); 2942e716630dSMartin Matuska 29437877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 29447877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 29457877fdebSMatt Macy ASSERT0(rc->rc_need_orig_restore); 29467877fdebSMatt Macy if (rc->rc_error != 0) { 29477877fdebSMatt Macy dead++; 29487877fdebSMatt Macy if (c >= nparity) 29497877fdebSMatt Macy dead_data++; 29507877fdebSMatt Macy continue; 29517877fdebSMatt Macy } 29527877fdebSMatt Macy if (rc->rc_size == 0) 29537877fdebSMatt Macy continue; 29547877fdebSMatt Macy for (int lt = 0; lt < ntgts; lt++) { 2955e716630dSMartin Matuska if (raidz_simulate_failure(physical_width, 2956e716630dSMartin Matuska original_width, 2957e716630dSMartin Matuska zio->io_vd->vdev_top->vdev_ashift, 2958e716630dSMartin Matuska ltgts[lt], rc)) { 29597877fdebSMatt Macy if (rc->rc_orig_data == NULL) { 29607877fdebSMatt Macy rc->rc_orig_data = 2961f9693befSMartin Matuska abd_alloc_linear( 2962f9693befSMartin Matuska rc->rc_size, B_TRUE); 2963f9693befSMartin Matuska abd_copy(rc->rc_orig_data, 29647877fdebSMatt Macy rc->rc_abd, rc->rc_size); 29657877fdebSMatt Macy } 29667877fdebSMatt Macy rc->rc_need_orig_restore = B_TRUE; 29677877fdebSMatt Macy 29687877fdebSMatt Macy dead++; 29697877fdebSMatt Macy if (c >= nparity) 29707877fdebSMatt Macy dead_data++; 2971e716630dSMartin Matuska /* 2972e716630dSMartin Matuska * Note: simulating failure of a 2973e716630dSMartin Matuska * pre-expansion device can hit more 2974e716630dSMartin Matuska * than one column, in which case we 2975e716630dSMartin Matuska * might try to simulate more failures 2976e716630dSMartin Matuska * than can be reconstructed, which is 2977e716630dSMartin Matuska * also more than the size of my_tgts. 2978e716630dSMartin Matuska * This check prevents accessing past 2979e716630dSMartin Matuska * the end of my_tgts. The "dead > 2980e716630dSMartin Matuska * nparity" check below will fail this 2981e716630dSMartin Matuska * reconstruction attempt. 
2982e716630dSMartin Matuska */ 2983e716630dSMartin Matuska if (t < VDEV_RAIDZ_MAXPARITY) { 29847877fdebSMatt Macy my_tgts[t++] = c; 2985e716630dSMartin Matuska if (dbgmsg) { 2986e716630dSMartin Matuska zfs_dbgmsg("simulating " 2987e716630dSMartin Matuska "failure of col %u " 2988e716630dSMartin Matuska "devidx %u", c, 2989e716630dSMartin Matuska (int)rc->rc_devidx); 2990e716630dSMartin Matuska } 2991e716630dSMartin Matuska } 29927877fdebSMatt Macy break; 29937877fdebSMatt Macy } 29947877fdebSMatt Macy } 29957877fdebSMatt Macy } 29967877fdebSMatt Macy if (dead > nparity) { 29977877fdebSMatt Macy /* reconstruction not possible */ 2998e716630dSMartin Matuska if (dbgmsg) { 2999e716630dSMartin Matuska zfs_dbgmsg("reconstruction not possible; " 3000e716630dSMartin Matuska "too many failures"); 3001e716630dSMartin Matuska } 30027877fdebSMatt Macy raidz_restore_orig_data(rm); 30037877fdebSMatt Macy return (EINVAL); 30047877fdebSMatt Macy } 30057877fdebSMatt Macy if (dead_data > 0) 3006f9693befSMartin Matuska vdev_raidz_reconstruct_row(rm, rr, my_tgts, t); 30077877fdebSMatt Macy } 30087877fdebSMatt Macy 30097877fdebSMatt Macy /* Check for success */ 30107877fdebSMatt Macy if (raidz_checksum_verify(zio) == 0) { 3011*87bf66d4SMartin Matuska if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) 3012*87bf66d4SMartin Matuska return (0); 30137877fdebSMatt Macy 30147877fdebSMatt Macy /* Reconstruction succeeded - report errors */ 30157877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 30167877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i]; 30177877fdebSMatt Macy 30187877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 30197877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 30207877fdebSMatt Macy if (rc->rc_need_orig_restore) { 30217877fdebSMatt Macy /* 30227877fdebSMatt Macy * Note: if this is a parity column, 30237877fdebSMatt Macy * we don't really know if it's wrong. 30247877fdebSMatt Macy * We need to let 30257877fdebSMatt Macy * vdev_raidz_io_done_verified() check 30267877fdebSMatt Macy * it, and if we set rc_error, it will 30277877fdebSMatt Macy * think that it is a "known" error 30287877fdebSMatt Macy * that doesn't need to be checked 30297877fdebSMatt Macy * or corrected. 
30307877fdebSMatt Macy */ 30317877fdebSMatt Macy if (rc->rc_error == 0 && 30327877fdebSMatt Macy c >= rr->rr_firstdatacol) { 3033e92ffd9bSMartin Matuska vdev_raidz_checksum_error(zio, 3034f9693befSMartin Matuska rc, rc->rc_orig_data); 30357877fdebSMatt Macy rc->rc_error = 30367877fdebSMatt Macy SET_ERROR(ECKSUM); 30377877fdebSMatt Macy } 30387877fdebSMatt Macy rc->rc_need_orig_restore = B_FALSE; 30397877fdebSMatt Macy } 30407877fdebSMatt Macy } 30417877fdebSMatt Macy 30427877fdebSMatt Macy vdev_raidz_io_done_verified(zio, rr); 30437877fdebSMatt Macy } 30447877fdebSMatt Macy 30457877fdebSMatt Macy zio_checksum_verified(zio); 30467877fdebSMatt Macy 3047e716630dSMartin Matuska if (dbgmsg) { 3048e716630dSMartin Matuska zfs_dbgmsg("reconstruction successful " 3049e716630dSMartin Matuska "(checksum verified)"); 3050e716630dSMartin Matuska } 30517877fdebSMatt Macy return (0); 30527877fdebSMatt Macy } 30537877fdebSMatt Macy 30547877fdebSMatt Macy /* Reconstruction failed - restore original data */ 30557877fdebSMatt Macy raidz_restore_orig_data(rm); 3056e716630dSMartin Matuska if (dbgmsg) { 3057e716630dSMartin Matuska zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum " 3058e716630dSMartin Matuska "failed", zio); 3059e716630dSMartin Matuska } 30607877fdebSMatt Macy return (ECKSUM); 30617877fdebSMatt Macy } 30627877fdebSMatt Macy 30637877fdebSMatt Macy /* 30647877fdebSMatt Macy * Iterate over all combinations of N bad vdevs and attempt a reconstruction. 30657877fdebSMatt Macy * Note that the algorithm below is non-optimal because it doesn't take into 30667877fdebSMatt Macy * account how reconstruction is actually performed. For example, with 30677877fdebSMatt Macy * triple-parity RAID-Z the reconstruction procedure is the same if column 4 30687877fdebSMatt Macy * is targeted as invalid as if columns 1 and 4 are targeted since in both 30697877fdebSMatt Macy * cases we'd only use parity information in column 0. 
30707877fdebSMatt Macy * 30717877fdebSMatt Macy * The order that we find the various possible combinations of failed 30727877fdebSMatt Macy * disks is dictated by these rules: 30737877fdebSMatt Macy * - Examine each "slot" (the "i" in tgts[i]) 3074e716630dSMartin Matuska * - Try to increment this slot (tgts[i] += 1) 30757877fdebSMatt Macy * - if we can't increment because it runs into the next slot, 30767877fdebSMatt Macy * reset our slot to the minimum, and examine the next slot 30777877fdebSMatt Macy * 30787877fdebSMatt Macy * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose 30797877fdebSMatt Macy * 3 columns to reconstruct), we will generate the following sequence: 30807877fdebSMatt Macy * 30817877fdebSMatt Macy * STATE ACTION 30827877fdebSMatt Macy * 0 1 2 special case: skip since these are all parity 30837877fdebSMatt Macy * 0 1 3 first slot: reset to 0; middle slot: increment to 2 30847877fdebSMatt Macy * 0 2 3 first slot: increment to 1 30857877fdebSMatt Macy * 1 2 3 first: reset to 0; middle: reset to 1; last: increment to 4 30867877fdebSMatt Macy * 0 1 4 first: reset to 0; middle: increment to 2 30877877fdebSMatt Macy * 0 2 4 first: increment to 1 30887877fdebSMatt Macy * 1 2 4 first: reset to 0; middle: increment to 3 30897877fdebSMatt Macy * 0 3 4 first: increment to 1 30907877fdebSMatt Macy * 1 3 4 first: increment to 2 30917877fdebSMatt Macy * 2 3 4 first: reset to 0; middle: reset to 1; last: increment to 5 30927877fdebSMatt Macy * 0 1 5 first: reset to 0; middle: increment to 2 30937877fdebSMatt Macy * 0 2 5 first: increment to 1 30947877fdebSMatt Macy * 1 2 5 first: reset to 0; middle: increment to 3 30957877fdebSMatt Macy * 0 3 5 first: increment to 1 30967877fdebSMatt Macy * 1 3 5 first: increment to 2 30977877fdebSMatt Macy * 2 3 5 first: reset to 0; middle: increment to 4 30987877fdebSMatt Macy * 0 4 5 first: increment to 1 30997877fdebSMatt Macy * 1 4 5 first: increment to 2 31007877fdebSMatt Macy * 2 4 5 first: increment to 3 31017877fdebSMatt Macy * 3 4 5 done 31027877fdebSMatt Macy * 310316038816SMartin Matuska * This strategy works for dRAID but is less efficient when there are a large 31047877fdebSMatt Macy * number of child vdevs and therefore permutations to check. Furthermore, 3105e716630dSMartin Matuska * since the raidz_map_t rows likely do not overlap, reconstruction would be 31067877fdebSMatt Macy * possible as long as there are no more than nparity data errors per row. 31077877fdebSMatt Macy * These additional permutations are not currently checked but could be as 31087877fdebSMatt Macy * a future improvement. 3109e716630dSMartin Matuska * 3110e716630dSMartin Matuska * Returns 0 on success, ECKSUM on failure. 31117877fdebSMatt Macy */ 31127877fdebSMatt Macy static int 31137877fdebSMatt Macy vdev_raidz_combrec(zio_t *zio) 31147877fdebSMatt Macy { 31157877fdebSMatt Macy int nparity = vdev_get_nparity(zio->io_vd); 31167877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd; 3117e716630dSMartin Matuska int physical_width = zio->io_vd->vdev_children; 3118e716630dSMartin Matuska int original_width = (rm->rm_original_width != 0) ? 
3119e716630dSMartin Matuska rm->rm_original_width : physical_width; 31207877fdebSMatt Macy 31217877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 31227877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i]; 31237877fdebSMatt Macy int total_errors = 0; 31247877fdebSMatt Macy 31257877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 31267877fdebSMatt Macy if (rr->rr_col[c].rc_error) 31277877fdebSMatt Macy total_errors++; 31287877fdebSMatt Macy } 31297877fdebSMatt Macy 31307877fdebSMatt Macy if (total_errors > nparity) 31317877fdebSMatt Macy return (vdev_raidz_worst_error(rr)); 31327877fdebSMatt Macy } 31337877fdebSMatt Macy 31347877fdebSMatt Macy for (int num_failures = 1; num_failures <= nparity; num_failures++) { 31357877fdebSMatt Macy int tstore[VDEV_RAIDZ_MAXPARITY + 2]; 31367877fdebSMatt Macy int *ltgts = &tstore[1]; /* value is logical child ID */ 31377877fdebSMatt Macy 3138e716630dSMartin Matuska 3139e716630dSMartin Matuska /* 3140e716630dSMartin Matuska * Determine number of logical children, n. See comment 3141e716630dSMartin Matuska * above raidz_simulate_failure(). 3142e716630dSMartin Matuska */ 3143e716630dSMartin Matuska int n = 0; 3144e716630dSMartin Matuska for (int w = physical_width; 3145e716630dSMartin Matuska w >= original_width; w--) { 3146e716630dSMartin Matuska n += w; 3147e716630dSMartin Matuska } 31487877fdebSMatt Macy 31497877fdebSMatt Macy ASSERT3U(num_failures, <=, nparity); 31507877fdebSMatt Macy ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY); 31517877fdebSMatt Macy 31527877fdebSMatt Macy /* Handle corner cases in combrec logic */ 31537877fdebSMatt Macy ltgts[-1] = -1; 31547877fdebSMatt Macy for (int i = 0; i < num_failures; i++) { 31557877fdebSMatt Macy ltgts[i] = i; 31567877fdebSMatt Macy } 31577877fdebSMatt Macy ltgts[num_failures] = n; 31587877fdebSMatt Macy 31597877fdebSMatt Macy for (;;) { 31607877fdebSMatt Macy int err = raidz_reconstruct(zio, ltgts, num_failures, 31617877fdebSMatt Macy nparity); 31627877fdebSMatt Macy if (err == EINVAL) { 31637877fdebSMatt Macy /* 31647877fdebSMatt Macy * Reconstruction not possible with this # 31657877fdebSMatt Macy * failures; try more failures. 31667877fdebSMatt Macy */ 31677877fdebSMatt Macy break; 31687877fdebSMatt Macy } else if (err == 0) 31697877fdebSMatt Macy return (0); 31707877fdebSMatt Macy 31717877fdebSMatt Macy /* Compute next targets to try */ 31727877fdebSMatt Macy for (int t = 0; ; t++) { 31737877fdebSMatt Macy ASSERT3U(t, <, num_failures); 31747877fdebSMatt Macy ltgts[t]++; 31757877fdebSMatt Macy if (ltgts[t] == n) { 31767877fdebSMatt Macy /* try more failures */ 31777877fdebSMatt Macy ASSERT3U(t, ==, num_failures - 1); 3178e716630dSMartin Matuska if (zfs_flags & 3179e716630dSMartin Matuska ZFS_DEBUG_RAIDZ_RECONSTRUCT) { 3180e716630dSMartin Matuska zfs_dbgmsg("reconstruction " 3181e716630dSMartin Matuska "failed for num_failures=" 3182e716630dSMartin Matuska "%u; tried all " 3183e716630dSMartin Matuska "combinations", 3184e716630dSMartin Matuska num_failures); 3185e716630dSMartin Matuska } 31867877fdebSMatt Macy break; 31877877fdebSMatt Macy } 31887877fdebSMatt Macy 31897877fdebSMatt Macy ASSERT3U(ltgts[t], <, n); 31907877fdebSMatt Macy ASSERT3U(ltgts[t], <=, ltgts[t + 1]); 31917877fdebSMatt Macy 31927877fdebSMatt Macy /* 31937877fdebSMatt Macy * If that spot is available, we're done here. 31947877fdebSMatt Macy * Try the next combination. 
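 * For example, stepping from {1, 2, 3}: slot 0 increments but collides
 * with slot 1 and resets to 0, slot 1 collides with slot 2 and resets to
 * 1, and slot 2 advances to 4, giving {0, 1, 4} as in the sequence shown
 * above.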
31957877fdebSMatt Macy */ 31967877fdebSMatt Macy if (ltgts[t] != ltgts[t + 1]) 3197e716630dSMartin Matuska break; // found next combination 31987877fdebSMatt Macy 31997877fdebSMatt Macy /* 32007877fdebSMatt Macy * Otherwise, reset this tgt to the minimum, 32017877fdebSMatt Macy * and move on to the next tgt. 32027877fdebSMatt Macy */ 32037877fdebSMatt Macy ltgts[t] = ltgts[t - 1] + 1; 32047877fdebSMatt Macy ASSERT3U(ltgts[t], ==, t); 32057877fdebSMatt Macy } 32067877fdebSMatt Macy 32077877fdebSMatt Macy /* Increase the number of failures and keep trying. */ 32087877fdebSMatt Macy if (ltgts[num_failures - 1] == n) 32097877fdebSMatt Macy break; 32107877fdebSMatt Macy } 32117877fdebSMatt Macy } 3212e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 3213e716630dSMartin Matuska zfs_dbgmsg("reconstruction failed for all num_failures"); 32147877fdebSMatt Macy return (ECKSUM); 32157877fdebSMatt Macy } 32167877fdebSMatt Macy 32177877fdebSMatt Macy void 32187877fdebSMatt Macy vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) 32197877fdebSMatt Macy { 32207877fdebSMatt Macy for (uint64_t row = 0; row < rm->rm_nrows; row++) { 32217877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[row]; 32227877fdebSMatt Macy vdev_raidz_reconstruct_row(rm, rr, t, nt); 32237877fdebSMatt Macy } 32247877fdebSMatt Macy } 32257877fdebSMatt Macy 32267877fdebSMatt Macy /* 32277877fdebSMatt Macy * Complete a write IO operation on a RAIDZ VDev 32287877fdebSMatt Macy * 32297877fdebSMatt Macy * Outline: 32307877fdebSMatt Macy * 1. Check for errors on the child IOs. 32317877fdebSMatt Macy * 2. Return, setting an error code if too few child VDevs were written 32327877fdebSMatt Macy * to reconstruct the data later. Note that partial writes are 32337877fdebSMatt Macy * considered successful if they can be reconstructed at all. 32347877fdebSMatt Macy */ 32357877fdebSMatt Macy static void 32367877fdebSMatt Macy vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) 32377877fdebSMatt Macy { 3238e716630dSMartin Matuska int normal_errors = 0; 3239e716630dSMartin Matuska int shadow_errors = 0; 32407877fdebSMatt Macy 32417877fdebSMatt Macy ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); 32427877fdebSMatt Macy ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); 32437877fdebSMatt Macy ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); 32447877fdebSMatt Macy 32457877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 32467877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 32477877fdebSMatt Macy 3248e716630dSMartin Matuska if (rc->rc_error != 0) { 32497877fdebSMatt Macy ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 3250e716630dSMartin Matuska normal_errors++; 3251e716630dSMartin Matuska } 3252e716630dSMartin Matuska if (rc->rc_shadow_error != 0) { 3253e716630dSMartin Matuska ASSERT(rc->rc_shadow_error != ECKSUM); 3254e716630dSMartin Matuska shadow_errors++; 32557877fdebSMatt Macy } 32567877fdebSMatt Macy } 32577877fdebSMatt Macy 32587877fdebSMatt Macy /* 32597877fdebSMatt Macy * Treat partial writes as a success. If we couldn't write enough 3260e716630dSMartin Matuska * columns to reconstruct the data, the I/O failed. Otherwise, good 3261e716630dSMartin Matuska * enough. 
Note that in the case of a shadow write (during raidz 3262e716630dSMartin Matuska * expansion), depending on if we crash, either the normal (old) or 3263e716630dSMartin Matuska * shadow (new) location may become the "real" version of the block, 3264e716630dSMartin Matuska * so both locations must have sufficient redundancy. 3265eda14cbcSMatt Macy * 3266eda14cbcSMatt Macy * Now that we support write reallocation, it would be better 3267eda14cbcSMatt Macy * to treat partial failure as real failure unless there are 3268eda14cbcSMatt Macy * no non-degraded top-level vdevs left, and not update DTLs 3269eda14cbcSMatt Macy * if we intend to reallocate. 3270eda14cbcSMatt Macy */ 3271e716630dSMartin Matuska if (normal_errors > rr->rr_firstdatacol || 3272e716630dSMartin Matuska shadow_errors > rr->rr_firstdatacol) { 32737877fdebSMatt Macy zio->io_error = zio_worst_error(zio->io_error, 32747877fdebSMatt Macy vdev_raidz_worst_error(rr)); 32757877fdebSMatt Macy } 3276eda14cbcSMatt Macy } 3277eda14cbcSMatt Macy 3278f9693befSMartin Matuska static void 32797877fdebSMatt Macy vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm, 32807877fdebSMatt Macy raidz_row_t *rr) 32817877fdebSMatt Macy { 32827877fdebSMatt Macy int parity_errors = 0; 32837877fdebSMatt Macy int parity_untried = 0; 32847877fdebSMatt Macy int data_errors = 0; 32857877fdebSMatt Macy int total_errors = 0; 32867877fdebSMatt Macy 32877877fdebSMatt Macy ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); 32887877fdebSMatt Macy ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); 32897877fdebSMatt Macy 32907877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 32917877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 32927877fdebSMatt Macy 3293a0b956f5SMartin Matuska /* 3294a0b956f5SMartin Matuska * If scrubbing and a replacing/sparing child vdev determined 3295a0b956f5SMartin Matuska * that not all of its children have an identical copy of the 3296a0b956f5SMartin Matuska * data, then clear the error so the column is treated like 3297a0b956f5SMartin Matuska * any other read and force a repair to correct the damage. 3298a0b956f5SMartin Matuska */ 3299a0b956f5SMartin Matuska if (rc->rc_error == ECKSUM) { 3300a0b956f5SMartin Matuska ASSERT(zio->io_flags & ZIO_FLAG_SCRUB); 3301a0b956f5SMartin Matuska vdev_raidz_checksum_error(zio, rc, rc->rc_abd); 3302a0b956f5SMartin Matuska rc->rc_force_repair = 1; 3303a0b956f5SMartin Matuska rc->rc_error = 0; 3304a0b956f5SMartin Matuska } 33057877fdebSMatt Macy 3306a0b956f5SMartin Matuska if (rc->rc_error) { 33077877fdebSMatt Macy if (c < rr->rr_firstdatacol) 33087877fdebSMatt Macy parity_errors++; 33097877fdebSMatt Macy else 33107877fdebSMatt Macy data_errors++; 33117877fdebSMatt Macy 33127877fdebSMatt Macy total_errors++; 33137877fdebSMatt Macy } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { 33147877fdebSMatt Macy parity_untried++; 33157877fdebSMatt Macy } 33167877fdebSMatt Macy } 3317eda14cbcSMatt Macy 3318eda14cbcSMatt Macy /* 33197877fdebSMatt Macy * If there were data errors and the number of errors we saw was 33207877fdebSMatt Macy * correctable -- less than or equal to the number of parity disks read 33217877fdebSMatt Macy * -- reconstruct based on the missing data. 3322eda14cbcSMatt Macy */ 33237877fdebSMatt Macy if (data_errors != 0 && 33247877fdebSMatt Macy total_errors <= rr->rr_firstdatacol - parity_untried) { 3325eda14cbcSMatt Macy /* 3326eda14cbcSMatt Macy * We either attempt to read all the parity columns or 3327eda14cbcSMatt Macy * none of them. 
If we didn't try to read parity, we 3328eda14cbcSMatt Macy * wouldn't be here in the correctable case. There must 3329eda14cbcSMatt Macy * also have been fewer parity errors than parity 3330eda14cbcSMatt Macy * columns or, again, we wouldn't be in this code path. 3331eda14cbcSMatt Macy */ 3332eda14cbcSMatt Macy ASSERT(parity_untried == 0); 33337877fdebSMatt Macy ASSERT(parity_errors < rr->rr_firstdatacol); 3334eda14cbcSMatt Macy 3335eda14cbcSMatt Macy /* 3336eda14cbcSMatt Macy * Identify the data columns that reported an error. 3337eda14cbcSMatt Macy */ 33387877fdebSMatt Macy int n = 0; 33397877fdebSMatt Macy int tgts[VDEV_RAIDZ_MAXPARITY]; 33407877fdebSMatt Macy for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 33417877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 3342eda14cbcSMatt Macy if (rc->rc_error != 0) { 3343eda14cbcSMatt Macy ASSERT(n < VDEV_RAIDZ_MAXPARITY); 3344eda14cbcSMatt Macy tgts[n++] = c; 3345eda14cbcSMatt Macy } 3346eda14cbcSMatt Macy } 3347eda14cbcSMatt Macy 33487877fdebSMatt Macy ASSERT(rr->rr_firstdatacol >= n); 3349eda14cbcSMatt Macy 3350f9693befSMartin Matuska vdev_raidz_reconstruct_row(rm, rr, tgts, n); 3351eda14cbcSMatt Macy } 3352eda14cbcSMatt Macy } 3353eda14cbcSMatt Macy 3354eda14cbcSMatt Macy /* 33557877fdebSMatt Macy * Return the number of reads issued. 3356eda14cbcSMatt Macy */ 33577877fdebSMatt Macy static int 33587877fdebSMatt Macy vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr) 33597877fdebSMatt Macy { 33607877fdebSMatt Macy vdev_t *vd = zio->io_vd; 33617877fdebSMatt Macy int nread = 0; 3362eda14cbcSMatt Macy 33637877fdebSMatt Macy rr->rr_missingdata = 0; 33647877fdebSMatt Macy rr->rr_missingparity = 0; 33657877fdebSMatt Macy 33667877fdebSMatt Macy /* 33677877fdebSMatt Macy * If this row contains empty sectors which are not required 33687877fdebSMatt Macy * for a normal read then allocate an ABD for them now so they 33697877fdebSMatt Macy * may be read, verified, and any needed repairs performed. 33707877fdebSMatt Macy */ 3371e716630dSMartin Matuska if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL) 33727877fdebSMatt Macy vdev_draid_map_alloc_empty(zio, rr); 33737877fdebSMatt Macy 33747877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 33757877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 33767877fdebSMatt Macy if (rc->rc_tried || rc->rc_size == 0) 3377eda14cbcSMatt Macy continue; 3378eda14cbcSMatt Macy 3379eda14cbcSMatt Macy zio_nowait(zio_vdev_child_io(zio, NULL, 3380eda14cbcSMatt Macy vd->vdev_child[rc->rc_devidx], 3381eda14cbcSMatt Macy rc->rc_offset, rc->rc_abd, rc->rc_size, 3382eda14cbcSMatt Macy zio->io_type, zio->io_priority, 0, 3383eda14cbcSMatt Macy vdev_raidz_child_done, rc)); 33847877fdebSMatt Macy nread++; 33857877fdebSMatt Macy } 33867877fdebSMatt Macy return (nread); 3387eda14cbcSMatt Macy } 3388eda14cbcSMatt Macy 3389eda14cbcSMatt Macy /* 33907877fdebSMatt Macy * We're here because either there were too many errors to even attempt 33917877fdebSMatt Macy * reconstruction (total_errors == rr_firstdatacol), or vdev_*_combrec() 33927877fdebSMatt Macy * failed. In either case, there is enough bad data to prevent reconstruction. 33937877fdebSMatt Macy * Start checksum ereports for all children which haven't failed. 
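 *
 * As an illustrative scenario (the numbers here are invented, not taken
 * from the code above): a raidz2 row has rr_firstdatacol == 2, so it can
 * tolerate at most two bad columns. If three children return EIO, or if
 * every two-column reconstruction attempt still fails the block checksum,
 * we land here and issue checksum ereports against the children that did
 * not themselves report an error, since any of them may have silently
 * returned bad data.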
3394eda14cbcSMatt Macy */ 33957877fdebSMatt Macy static void 33967877fdebSMatt Macy vdev_raidz_io_done_unrecoverable(zio_t *zio) 33977877fdebSMatt Macy { 33987877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd; 3399eda14cbcSMatt Macy 34007877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 34017877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i]; 3402eda14cbcSMatt Macy 34037877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 34047877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 34057877fdebSMatt Macy vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; 34067877fdebSMatt Macy 34072c48331dSMatt Macy if (rc->rc_error != 0) 34082c48331dSMatt Macy continue; 34092c48331dSMatt Macy 3410eda14cbcSMatt Macy zio_bad_cksum_t zbc; 3411eda14cbcSMatt Macy zbc.zbc_has_cksum = 0; 34122c48331dSMatt Macy zbc.zbc_injected = rm->rm_ecksuminjected; 3413eda14cbcSMatt Macy mutex_enter(&cvd->vdev_stat_lock); 3414eda14cbcSMatt Macy cvd->vdev_stat.vs_checksum_errors++; 3415eda14cbcSMatt Macy mutex_exit(&cvd->vdev_stat_lock); 3416bb2d13b6SMartin Matuska (void) zfs_ereport_start_checksum(zio->io_spa, 3417bb2d13b6SMartin Matuska cvd, &zio->io_bookmark, zio, rc->rc_offset, 3418bb2d13b6SMartin Matuska rc->rc_size, &zbc); 3419eda14cbcSMatt Macy } 3420eda14cbcSMatt Macy } 3421eda14cbcSMatt Macy } 3422eda14cbcSMatt Macy 34237877fdebSMatt Macy void 34247877fdebSMatt Macy vdev_raidz_io_done(zio_t *zio) 34257877fdebSMatt Macy { 34267877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd; 34277877fdebSMatt Macy 3428e716630dSMartin Matuska ASSERT(zio->io_bp != NULL); 34297877fdebSMatt Macy if (zio->io_type == ZIO_TYPE_WRITE) { 34307877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 34317877fdebSMatt Macy vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]); 34327877fdebSMatt Macy } 34337877fdebSMatt Macy } else { 3434e716630dSMartin Matuska if (rm->rm_phys_col) { 3435e716630dSMartin Matuska /* 3436e716630dSMartin Matuska * This is an aggregated read. Copy the data and status 3437e716630dSMartin Matuska * from the aggregate abd's to the individual rows. 3438e716630dSMartin Matuska */ 3439e716630dSMartin Matuska for (int i = 0; i < rm->rm_nrows; i++) { 3440e716630dSMartin Matuska raidz_row_t *rr = rm->rm_row[i]; 3441e716630dSMartin Matuska 3442e716630dSMartin Matuska for (int c = 0; c < rr->rr_cols; c++) { 3443e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c]; 3444e716630dSMartin Matuska if (rc->rc_tried || rc->rc_size == 0) 3445e716630dSMartin Matuska continue; 3446e716630dSMartin Matuska 3447e716630dSMartin Matuska raidz_col_t *prc = 3448e716630dSMartin Matuska &rm->rm_phys_col[rc->rc_devidx]; 3449e716630dSMartin Matuska rc->rc_error = prc->rc_error; 3450e716630dSMartin Matuska rc->rc_tried = prc->rc_tried; 3451e716630dSMartin Matuska rc->rc_skipped = prc->rc_skipped; 3452e716630dSMartin Matuska if (c >= rr->rr_firstdatacol) { 3453e716630dSMartin Matuska /* 3454e716630dSMartin Matuska * Note: this is slightly faster 3455e716630dSMartin Matuska * than using abd_copy_off(). 
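 * The direct pointer arithmetic below relies on
 * prc->rc_abd being a linear buffer (abd_to_buf()
 * requires that), which is how the per-device
 * aggregate buffers used for this aggregated read
 * are expected to be allocated.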
3456e716630dSMartin Matuska */ 3457e716630dSMartin Matuska char *physbuf = abd_to_buf( 3458e716630dSMartin Matuska prc->rc_abd); 3459e716630dSMartin Matuska void *physloc = physbuf + 3460e716630dSMartin Matuska rc->rc_offset - 3461e716630dSMartin Matuska prc->rc_offset; 3462e716630dSMartin Matuska 3463e716630dSMartin Matuska abd_copy_from_buf(rc->rc_abd, 3464e716630dSMartin Matuska physloc, rc->rc_size); 3465e716630dSMartin Matuska } 3466e716630dSMartin Matuska } 3467e716630dSMartin Matuska } 3468e716630dSMartin Matuska } 3469e716630dSMartin Matuska 34707877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 34717877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i]; 34727877fdebSMatt Macy vdev_raidz_io_done_reconstruct_known_missing(zio, 34737877fdebSMatt Macy rm, rr); 34747877fdebSMatt Macy } 34757877fdebSMatt Macy 34767877fdebSMatt Macy if (raidz_checksum_verify(zio) == 0) { 3477*87bf66d4SMartin Matuska if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) 3478*87bf66d4SMartin Matuska goto done; 3479*87bf66d4SMartin Matuska 34807877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 34817877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i]; 34827877fdebSMatt Macy vdev_raidz_io_done_verified(zio, rr); 34837877fdebSMatt Macy } 3484eda14cbcSMatt Macy zio_checksum_verified(zio); 34857877fdebSMatt Macy } else { 3486eda14cbcSMatt Macy /* 34877877fdebSMatt Macy * A sequential resilver has no checksum which makes 34887877fdebSMatt Macy * combinatorial reconstruction impossible. This code 34897877fdebSMatt Macy * path is unreachable since raidz_checksum_verify() 34907877fdebSMatt Macy * has no checksum to verify and must succeed. 3491eda14cbcSMatt Macy */ 34927877fdebSMatt Macy ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD); 3493eda14cbcSMatt Macy 34947877fdebSMatt Macy /* 34957877fdebSMatt Macy * This isn't a typical situation -- either we got a 34967877fdebSMatt Macy * read error or a child silently returned bad data. 34977877fdebSMatt Macy * Read every block so we can try again with as much 34987877fdebSMatt Macy * data and parity as we can track down. If we've 34997877fdebSMatt Macy * already been through once before, all children will 35007877fdebSMatt Macy * be marked as tried so we'll proceed to combinatorial 35017877fdebSMatt Macy * reconstruction. 35027877fdebSMatt Macy */ 35037877fdebSMatt Macy int nread = 0; 35047877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 35057877fdebSMatt Macy nread += vdev_raidz_read_all(zio, 35067877fdebSMatt Macy rm->rm_row[i]); 35077877fdebSMatt Macy } 35087877fdebSMatt Macy if (nread != 0) { 35097877fdebSMatt Macy /* 35107877fdebSMatt Macy * Normally our stage is VDEV_IO_DONE, but if 35117877fdebSMatt Macy * we've already called redone(), it will have 35127877fdebSMatt Macy * changed to VDEV_IO_START, in which case we 35137877fdebSMatt Macy * don't want to call redone() again. 35147877fdebSMatt Macy */ 35157877fdebSMatt Macy if (zio->io_stage != ZIO_STAGE_VDEV_IO_START) 35167877fdebSMatt Macy zio_vdev_io_redone(zio); 35177877fdebSMatt Macy return; 35187877fdebSMatt Macy } 3519e716630dSMartin Matuska /* 3520e716630dSMartin Matuska * It would be too expensive to try every possible 3521e716630dSMartin Matuska * combination of failed sectors in every row, so 3522e716630dSMartin Matuska * instead we try every combination of failed current or 3523e716630dSMartin Matuska * past physical disk. 
This means that if the incorrect 3524e716630dSMartin Matuska * sectors were all on Nparity disks at any point in the 3525e716630dSMartin Matuska * past, we will find the correct data. The only known 3526e716630dSMartin Matuska * case where this is less durable than a non-expanded 3527e716630dSMartin Matuska * RAIDZ is if we have a silent failure during 3528e716630dSMartin Matuska * expansion. In that case, one block could be 3529e716630dSMartin Matuska * partially in the old format and partially in the 3530e716630dSMartin Matuska * new format, so we'd lose some sectors from the old 3531e716630dSMartin Matuska * format and some from the new format. 3532e716630dSMartin Matuska * 3533e716630dSMartin Matuska * e.g. logical_width=4 physical_width=6 3534e716630dSMartin Matuska * the 15 (6+5+4) possible failed disks are: 3535e716630dSMartin Matuska * width=6 child=0 3536e716630dSMartin Matuska * width=6 child=1 3537e716630dSMartin Matuska * width=6 child=2 3538e716630dSMartin Matuska * width=6 child=3 3539e716630dSMartin Matuska * width=6 child=4 3540e716630dSMartin Matuska * width=6 child=5 3541e716630dSMartin Matuska * width=5 child=0 3542e716630dSMartin Matuska * width=5 child=1 3543e716630dSMartin Matuska * width=5 child=2 3544e716630dSMartin Matuska * width=5 child=3 3545e716630dSMartin Matuska * width=5 child=4 3546e716630dSMartin Matuska * width=4 child=0 3547e716630dSMartin Matuska * width=4 child=1 3548e716630dSMartin Matuska * width=4 child=2 3549e716630dSMartin Matuska * width=4 child=3 3550e716630dSMartin Matuska * And we will try every combination of Nparity of these 3551e716630dSMartin Matuska * failing. 3552e716630dSMartin Matuska * 3553e716630dSMartin Matuska * As a first pass, we can generate every combo, 3554e716630dSMartin Matuska * and try reconstructing, ignoring any known 3555e716630dSMartin Matuska * failures. If any row has too many known + simulated 3556e716630dSMartin Matuska * failures, then we bail on reconstructing with this 3557e716630dSMartin Matuska * number of simulated failures. As an improvement, 3558e716630dSMartin Matuska * we could detect the number of whole known failures 3559e716630dSMartin Matuska * (i.e. we have known failures on these disks for 3560e716630dSMartin Matuska * every row; the disks never succeeded), and 3561e716630dSMartin Matuska * subtract that from the max # failures to simulate. 3562e716630dSMartin Matuska * We could go even further like the current 3563e716630dSMartin Matuska * combrec code, but that doesn't seem like it 3564e716630dSMartin Matuska * gains us very much. If we simulate a failure 3565e716630dSMartin Matuska * that is also a known failure, that's fine. 
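 *
 * To make the cost concrete (illustrative arithmetic only): with
 * the 15 candidate disks above, raidz1 tries at most 15
 * single-failure combinations, raidz2 at most C(15,2) = 105
 * pairs, and raidz3 at most C(15,3) = 455 triples, versus 6, 15,
 * and 20 combinations for the same 6-wide vdev with no expansion
 * history.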
3566e716630dSMartin Matuska */ 35677877fdebSMatt Macy zio->io_error = vdev_raidz_combrec(zio); 35687877fdebSMatt Macy if (zio->io_error == ECKSUM && 35697877fdebSMatt Macy !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 35707877fdebSMatt Macy vdev_raidz_io_done_unrecoverable(zio); 35717877fdebSMatt Macy } 3572eda14cbcSMatt Macy } 3573eda14cbcSMatt Macy } 3574*87bf66d4SMartin Matuska done: 3575e716630dSMartin Matuska if (rm->rm_lr != NULL) { 3576e716630dSMartin Matuska zfs_rangelock_exit(rm->rm_lr); 3577e716630dSMartin Matuska rm->rm_lr = NULL; 3578e716630dSMartin Matuska } 3579eda14cbcSMatt Macy } 3580eda14cbcSMatt Macy 3581eda14cbcSMatt Macy static void 3582eda14cbcSMatt Macy vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 3583eda14cbcSMatt Macy { 35847877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd; 35857877fdebSMatt Macy if (faulted > vdrz->vd_nparity) 3586eda14cbcSMatt Macy vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 3587eda14cbcSMatt Macy VDEV_AUX_NO_REPLICAS); 3588eda14cbcSMatt Macy else if (degraded + faulted != 0) 3589eda14cbcSMatt Macy vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 3590eda14cbcSMatt Macy else 3591eda14cbcSMatt Macy vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 3592eda14cbcSMatt Macy } 3593eda14cbcSMatt Macy 3594eda14cbcSMatt Macy /* 3595eda14cbcSMatt Macy * Determine if any portion of the provided block resides on a child vdev 3596eda14cbcSMatt Macy * with a dirty DTL and therefore needs to be resilvered. The function 3597eda14cbcSMatt Macy * assumes that at least one DTL is dirty which implies that full stripe 3598eda14cbcSMatt Macy * width blocks must be resilvered. 3599eda14cbcSMatt Macy */ 3600eda14cbcSMatt Macy static boolean_t 36017877fdebSMatt Macy vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, 36027877fdebSMatt Macy uint64_t phys_birth) 3603eda14cbcSMatt Macy { 36047877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd; 3605e716630dSMartin Matuska 3606e716630dSMartin Matuska /* 3607e716630dSMartin Matuska * If we're in the middle of a RAIDZ expansion, this block may be in 3608e716630dSMartin Matuska * the old and/or new location. For simplicity, always resilver it. 3609e716630dSMartin Matuska */ 3610e716630dSMartin Matuska if (vdrz->vn_vre.vre_state == DSS_SCANNING) 3611e716630dSMartin Matuska return (B_TRUE); 3612e716630dSMartin Matuska 3613eda14cbcSMatt Macy uint64_t dcols = vd->vdev_children; 36147877fdebSMatt Macy uint64_t nparity = vdrz->vd_nparity; 3615eda14cbcSMatt Macy uint64_t ashift = vd->vdev_top->vdev_ashift; 3616eda14cbcSMatt Macy /* The starting RAIDZ (parent) vdev sector of the block. */ 36177877fdebSMatt Macy uint64_t b = DVA_GET_OFFSET(dva) >> ashift; 3618eda14cbcSMatt Macy /* The zio's size in units of the vdev's minimum sector size. */ 3619eda14cbcSMatt Macy uint64_t s = ((psize - 1) >> ashift) + 1; 3620eda14cbcSMatt Macy /* The first column for this stripe. */ 3621eda14cbcSMatt Macy uint64_t f = b % dcols; 3622eda14cbcSMatt Macy 36237877fdebSMatt Macy /* Unreachable by sequential resilver. 
*/ 36247877fdebSMatt Macy ASSERT3U(phys_birth, !=, TXG_UNKNOWN); 36257877fdebSMatt Macy 36267877fdebSMatt Macy if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) 36277877fdebSMatt Macy return (B_FALSE); 36287877fdebSMatt Macy 3629eda14cbcSMatt Macy if (s + nparity >= dcols) 3630eda14cbcSMatt Macy return (B_TRUE); 3631eda14cbcSMatt Macy 3632eda14cbcSMatt Macy for (uint64_t c = 0; c < s + nparity; c++) { 3633eda14cbcSMatt Macy uint64_t devidx = (f + c) % dcols; 3634eda14cbcSMatt Macy vdev_t *cvd = vd->vdev_child[devidx]; 3635eda14cbcSMatt Macy 3636eda14cbcSMatt Macy /* 3637eda14cbcSMatt Macy * dsl_scan_need_resilver() already checked vd with 3638eda14cbcSMatt Macy * vdev_dtl_contains(). So here just check cvd with 3639eda14cbcSMatt Macy * vdev_dtl_empty(), cheaper and a good approximation. 3640eda14cbcSMatt Macy */ 3641eda14cbcSMatt Macy if (!vdev_dtl_empty(cvd, DTL_PARTIAL)) 3642eda14cbcSMatt Macy return (B_TRUE); 3643eda14cbcSMatt Macy } 3644eda14cbcSMatt Macy 3645eda14cbcSMatt Macy return (B_FALSE); 3646eda14cbcSMatt Macy } 3647eda14cbcSMatt Macy 3648eda14cbcSMatt Macy static void 36497877fdebSMatt Macy vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs, 36507877fdebSMatt Macy range_seg64_t *physical_rs, range_seg64_t *remain_rs) 3651eda14cbcSMatt Macy { 3652e92ffd9bSMartin Matuska (void) remain_rs; 3653e92ffd9bSMartin Matuska 3654eda14cbcSMatt Macy vdev_t *raidvd = cvd->vdev_parent; 3655eda14cbcSMatt Macy ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); 3656eda14cbcSMatt Macy 3657e716630dSMartin Matuska vdev_raidz_t *vdrz = raidvd->vdev_tsd; 3658e716630dSMartin Matuska 3659e716630dSMartin Matuska if (vdrz->vn_vre.vre_state == DSS_SCANNING) { 3660e716630dSMartin Matuska /* 3661e716630dSMartin Matuska * We're in the middle of expansion, in which case the 3662e716630dSMartin Matuska * translation is in flux. Any answer we give may be wrong 3663e716630dSMartin Matuska * by the time we return, so it isn't safe for the caller to 3664e716630dSMartin Matuska * act on it. Therefore we say that this range isn't present 3665e716630dSMartin Matuska * on any children. The only consumers of this are "zpool 3666e716630dSMartin Matuska * initialize" and trimming, both of which are "best effort" 3667e716630dSMartin Matuska * anyway. 
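 * (Both are restarted once the expansion completes; see the
 * SPA_ASYNC_INITIALIZE_RESTART and SPA_ASYNC_TRIM_RESTART
 * requests in raidz_reflow_complete_sync().)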
3668e716630dSMartin Matuska */ 3669e716630dSMartin Matuska physical_rs->rs_start = physical_rs->rs_end = 0; 3670e716630dSMartin Matuska remain_rs->rs_start = remain_rs->rs_end = 0; 3671e716630dSMartin Matuska return; 3672e716630dSMartin Matuska } 3673e716630dSMartin Matuska 3674e716630dSMartin Matuska uint64_t width = vdrz->vd_physical_width; 3675eda14cbcSMatt Macy uint64_t tgt_col = cvd->vdev_id; 3676eda14cbcSMatt Macy uint64_t ashift = raidvd->vdev_top->vdev_ashift; 3677eda14cbcSMatt Macy 3678eda14cbcSMatt Macy /* make sure the offsets are block-aligned */ 36797877fdebSMatt Macy ASSERT0(logical_rs->rs_start % (1 << ashift)); 36807877fdebSMatt Macy ASSERT0(logical_rs->rs_end % (1 << ashift)); 36817877fdebSMatt Macy uint64_t b_start = logical_rs->rs_start >> ashift; 36827877fdebSMatt Macy uint64_t b_end = logical_rs->rs_end >> ashift; 3683eda14cbcSMatt Macy 3684eda14cbcSMatt Macy uint64_t start_row = 0; 3685eda14cbcSMatt Macy if (b_start > tgt_col) /* avoid underflow */ 3686eda14cbcSMatt Macy start_row = ((b_start - tgt_col - 1) / width) + 1; 3687eda14cbcSMatt Macy 3688eda14cbcSMatt Macy uint64_t end_row = 0; 3689eda14cbcSMatt Macy if (b_end > tgt_col) 3690eda14cbcSMatt Macy end_row = ((b_end - tgt_col - 1) / width) + 1; 3691eda14cbcSMatt Macy 36927877fdebSMatt Macy physical_rs->rs_start = start_row << ashift; 36937877fdebSMatt Macy physical_rs->rs_end = end_row << ashift; 3694eda14cbcSMatt Macy 36957877fdebSMatt Macy ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start); 36967877fdebSMatt Macy ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=, 36977877fdebSMatt Macy logical_rs->rs_end - logical_rs->rs_start); 36987877fdebSMatt Macy } 36997877fdebSMatt Macy 3700e716630dSMartin Matuska static void 3701e716630dSMartin Matuska raidz_reflow_sync(void *arg, dmu_tx_t *tx) 3702e716630dSMartin Matuska { 3703e716630dSMartin Matuska spa_t *spa = arg; 3704e716630dSMartin Matuska int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 3705e716630dSMartin Matuska vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 3706e716630dSMartin Matuska 3707e716630dSMartin Matuska /* 3708e716630dSMartin Matuska * Ensure there are no i/os to the range that is being committed. 3709e716630dSMartin Matuska */ 3710e716630dSMartin Matuska uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock); 3711e716630dSMartin Matuska ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset); 3712e716630dSMartin Matuska 3713e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 3714e716630dSMartin Matuska uint64_t new_offset = 3715e716630dSMartin Matuska MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset); 3716e716630dSMartin Matuska /* 3717e716630dSMartin Matuska * We should not have committed anything that failed. 3718e716630dSMartin Matuska */ 3719e716630dSMartin Matuska VERIFY3U(vre->vre_failed_offset, >=, old_offset); 3720e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 3721e716630dSMartin Matuska 3722e716630dSMartin Matuska zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, 3723e716630dSMartin Matuska old_offset, new_offset - old_offset, 3724e716630dSMartin Matuska RL_WRITER); 3725e716630dSMartin Matuska 3726e716630dSMartin Matuska /* 3727e716630dSMartin Matuska * Update the uberblock that will be written when this txg completes. 
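 *
 * The reflow info in the uberblock packs a scratch-area state together
 * with an offset (see the RRSS_GET_STATE/RRSS_GET_OFFSET accessors used
 * elsewhere in this file). Here the scratch area is no longer in use,
 * so we record RRSS_SCRATCH_INVALID_SYNCED_REFLOW along with new_offset,
 * which becomes the synced reflow offset once this txg is on disk.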
3728e716630dSMartin Matuska */ 3729e716630dSMartin Matuska RAIDZ_REFLOW_SET(&spa->spa_uberblock, 3730e716630dSMartin Matuska RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset); 3731e716630dSMartin Matuska vre->vre_offset_pertxg[txgoff] = 0; 3732e716630dSMartin Matuska zfs_rangelock_exit(lr); 3733e716630dSMartin Matuska 3734e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 3735e716630dSMartin Matuska vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff]; 3736e716630dSMartin Matuska vre->vre_bytes_copied_pertxg[txgoff] = 0; 3737e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 3738e716630dSMartin Matuska 3739e716630dSMartin Matuska vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); 3740e716630dSMartin Matuska VERIFY0(zap_update(spa->spa_meta_objset, 3741e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, 3742e716630dSMartin Matuska sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx)); 3743e716630dSMartin Matuska } 3744e716630dSMartin Matuska 3745e716630dSMartin Matuska static void 3746e716630dSMartin Matuska raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx) 3747e716630dSMartin Matuska { 3748e716630dSMartin Matuska spa_t *spa = arg; 3749e716630dSMartin Matuska vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 3750e716630dSMartin Matuska vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 3751e716630dSMartin Matuska vdev_raidz_t *vdrz = raidvd->vdev_tsd; 3752e716630dSMartin Matuska 3753e716630dSMartin Matuska for (int i = 0; i < TXG_SIZE; i++) 3754e716630dSMartin Matuska VERIFY0(vre->vre_offset_pertxg[i]); 3755e716630dSMartin Matuska 3756e716630dSMartin Matuska reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); 3757e716630dSMartin Matuska re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES; 3758e716630dSMartin Matuska re->re_logical_width = vdrz->vd_physical_width; 3759e716630dSMartin Matuska mutex_enter(&vdrz->vd_expand_lock); 3760e716630dSMartin Matuska avl_add(&vdrz->vd_expand_txgs, re); 3761e716630dSMartin Matuska mutex_exit(&vdrz->vd_expand_lock); 3762e716630dSMartin Matuska 3763e716630dSMartin Matuska vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); 3764e716630dSMartin Matuska 3765e716630dSMartin Matuska /* 3766e716630dSMartin Matuska * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS 3767e716630dSMartin Matuska * will get written (based on vd_expand_txgs). 3768e716630dSMartin Matuska */ 3769e716630dSMartin Matuska vdev_config_dirty(vd); 3770e716630dSMartin Matuska 3771e716630dSMartin Matuska /* 3772e716630dSMartin Matuska * Before we change vre_state, the on-disk state must reflect that we 3773e716630dSMartin Matuska * have completed all copying, so that vdev_raidz_io_start() can use 3774e716630dSMartin Matuska * vre_state to determine if the reflow is in progress. See also the 3775e716630dSMartin Matuska * end of spa_raidz_expand_thread(). 
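 * The VERIFY below checks exactly that: the reflow offset already synced
 * to disk must cover the entire top-level vdev
 * (vdev_ms_count << vdev_ms_shift), i.e. every metaslab's worth of data
 * has been copied.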
3776e716630dSMartin Matuska */ 3777e716630dSMartin Matuska VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, 3778e716630dSMartin Matuska raidvd->vdev_ms_count << raidvd->vdev_ms_shift); 3779e716630dSMartin Matuska 3780e716630dSMartin Matuska vre->vre_end_time = gethrestime_sec(); 3781e716630dSMartin Matuska vre->vre_state = DSS_FINISHED; 3782e716630dSMartin Matuska 3783e716630dSMartin Matuska uint64_t state = vre->vre_state; 3784e716630dSMartin Matuska VERIFY0(zap_update(spa->spa_meta_objset, 3785e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, 3786e716630dSMartin Matuska sizeof (state), 1, &state, tx)); 3787e716630dSMartin Matuska 3788e716630dSMartin Matuska uint64_t end_time = vre->vre_end_time; 3789e716630dSMartin Matuska VERIFY0(zap_update(spa->spa_meta_objset, 3790e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, 3791e716630dSMartin Matuska sizeof (end_time), 1, &end_time, tx)); 3792e716630dSMartin Matuska 3793e716630dSMartin Matuska spa->spa_uberblock.ub_raidz_reflow_info = 0; 3794e716630dSMartin Matuska 3795e716630dSMartin Matuska spa_history_log_internal(spa, "raidz vdev expansion completed", tx, 3796e716630dSMartin Matuska "%s vdev %llu new width %llu", spa_name(spa), 3797e716630dSMartin Matuska (unsigned long long)vd->vdev_id, 3798e716630dSMartin Matuska (unsigned long long)vd->vdev_children); 3799e716630dSMartin Matuska 3800e716630dSMartin Matuska spa->spa_raidz_expand = NULL; 3801e716630dSMartin Matuska raidvd->vdev_rz_expanding = B_FALSE; 3802e716630dSMartin Matuska 3803e716630dSMartin Matuska spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); 3804e716630dSMartin Matuska spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); 3805e716630dSMartin Matuska spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); 3806e716630dSMartin Matuska 3807e716630dSMartin Matuska spa_notify_waiters(spa); 3808e716630dSMartin Matuska 3809e716630dSMartin Matuska /* 3810e716630dSMartin Matuska * While we're in syncing context take the opportunity to 3811e716630dSMartin Matuska * set up a scrub. All the data has been successfully copied 3812e716630dSMartin Matuska * but we have not validated any checksums. 3813e716630dSMartin Matuska */ 3814e716630dSMartin Matuska pool_scan_func_t func = POOL_SCAN_SCRUB; 3815e716630dSMartin Matuska if (zfs_scrub_after_expand && dsl_scan_setup_check(&func, tx) == 0) 3816e716630dSMartin Matuska dsl_scan_setup_sync(&func, tx); 3817e716630dSMartin Matuska } 3818e716630dSMartin Matuska 3819e716630dSMartin Matuska /* 3820e716630dSMartin Matuska * Struct for one copy zio. 3821e716630dSMartin Matuska */ 3822e716630dSMartin Matuska typedef struct raidz_reflow_arg { 3823e716630dSMartin Matuska vdev_raidz_expand_t *rra_vre; 3824e716630dSMartin Matuska zfs_locked_range_t *rra_lr; 3825e716630dSMartin Matuska uint64_t rra_txg; 3826e716630dSMartin Matuska } raidz_reflow_arg_t; 3827e716630dSMartin Matuska 3828e716630dSMartin Matuska /* 3829e716630dSMartin Matuska * The write of the new location is done. 
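 * If the copy succeeded and nothing at or before this offset has
 * failed, the bytes are credited to this txg's progress; on error,
 * vre_failed_offset is pulled back to this offset so the reflow
 * pauses and retries the region later.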
3830e716630dSMartin Matuska */ 3831e716630dSMartin Matuska static void 3832e716630dSMartin Matuska raidz_reflow_write_done(zio_t *zio) 3833e716630dSMartin Matuska { 3834e716630dSMartin Matuska raidz_reflow_arg_t *rra = zio->io_private; 3835e716630dSMartin Matuska vdev_raidz_expand_t *vre = rra->rra_vre; 3836e716630dSMartin Matuska 3837e716630dSMartin Matuska abd_free(zio->io_abd); 3838e716630dSMartin Matuska 3839e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 3840e716630dSMartin Matuska if (zio->io_error != 0) { 3841e716630dSMartin Matuska /* Force a reflow pause on errors */ 3842e716630dSMartin Matuska vre->vre_failed_offset = 3843e716630dSMartin Matuska MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); 3844e716630dSMartin Matuska } 3845e716630dSMartin Matuska ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size); 3846e716630dSMartin Matuska vre->vre_outstanding_bytes -= zio->io_size; 3847e716630dSMartin Matuska if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length < 3848e716630dSMartin Matuska vre->vre_failed_offset) { 3849e716630dSMartin Matuska vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] += 3850e716630dSMartin Matuska zio->io_size; 3851e716630dSMartin Matuska } 3852e716630dSMartin Matuska cv_signal(&vre->vre_cv); 3853e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 3854e716630dSMartin Matuska 3855e716630dSMartin Matuska zfs_rangelock_exit(rra->rra_lr); 3856e716630dSMartin Matuska 3857e716630dSMartin Matuska kmem_free(rra, sizeof (*rra)); 3858e716630dSMartin Matuska spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa); 3859e716630dSMartin Matuska } 3860e716630dSMartin Matuska 3861e716630dSMartin Matuska /* 3862e716630dSMartin Matuska * The read of the old location is done. The parent zio is the write to 3863e716630dSMartin Matuska * the new location. Allow it to start. 3864e716630dSMartin Matuska */ 3865e716630dSMartin Matuska static void 3866e716630dSMartin Matuska raidz_reflow_read_done(zio_t *zio) 3867e716630dSMartin Matuska { 3868e716630dSMartin Matuska raidz_reflow_arg_t *rra = zio->io_private; 3869e716630dSMartin Matuska vdev_raidz_expand_t *vre = rra->rra_vre; 3870e716630dSMartin Matuska 3871e716630dSMartin Matuska /* 3872e716630dSMartin Matuska * If the read failed, or if it was done on a vdev that is not fully 3873e716630dSMartin Matuska * healthy (e.g. a child that has a resilver in progress), we may not 3874e716630dSMartin Matuska * have the correct data. Note that it's OK if the write proceeds. 3875e716630dSMartin Matuska * It may write garbage but the location is otherwise unused and we 3876e716630dSMartin Matuska * will retry later due to vre_failed_offset. 
3877e716630dSMartin Matuska */ 3878e716630dSMartin Matuska if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) { 3879e716630dSMartin Matuska zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu " 3880e716630dSMartin Matuska "err=%u partial_dtl_empty=%u missing_dtl_empty=%u", 3881e716630dSMartin Matuska (long long)rra->rra_lr->lr_offset, 3882e716630dSMartin Matuska (long long)rra->rra_lr->lr_length, 3883e716630dSMartin Matuska (long long)rra->rra_txg, 3884e716630dSMartin Matuska zio->io_error, 3885e716630dSMartin Matuska vdev_dtl_empty(zio->io_vd, DTL_PARTIAL), 3886e716630dSMartin Matuska vdev_dtl_empty(zio->io_vd, DTL_MISSING)); 3887e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 3888e716630dSMartin Matuska /* Force a reflow pause on errors */ 3889e716630dSMartin Matuska vre->vre_failed_offset = 3890e716630dSMartin Matuska MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); 3891e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 3892e716630dSMartin Matuska } 3893e716630dSMartin Matuska 3894e716630dSMartin Matuska zio_nowait(zio_unique_parent(zio)); 3895e716630dSMartin Matuska } 3896e716630dSMartin Matuska 3897e716630dSMartin Matuska static void 3898e716630dSMartin Matuska raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset, 3899e716630dSMartin Matuska dmu_tx_t *tx) 3900e716630dSMartin Matuska { 3901e716630dSMartin Matuska int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 3902e716630dSMartin Matuska spa_t *spa = dmu_tx_pool(tx)->dp_spa; 3903e716630dSMartin Matuska 3904e716630dSMartin Matuska if (offset == 0) 3905e716630dSMartin Matuska return; 3906e716630dSMartin Matuska 3907e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 3908e716630dSMartin Matuska ASSERT3U(vre->vre_offset, <=, offset); 3909e716630dSMartin Matuska vre->vre_offset = offset; 3910e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 3911e716630dSMartin Matuska 3912e716630dSMartin Matuska if (vre->vre_offset_pertxg[txgoff] == 0) { 3913e716630dSMartin Matuska dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync, 3914e716630dSMartin Matuska spa, tx); 3915e716630dSMartin Matuska } 3916e716630dSMartin Matuska vre->vre_offset_pertxg[txgoff] = offset; 3917e716630dSMartin Matuska } 3918e716630dSMartin Matuska 3919e716630dSMartin Matuska static boolean_t 3920e716630dSMartin Matuska vdev_raidz_expand_child_replacing(vdev_t *raidz_vd) 3921e716630dSMartin Matuska { 3922e716630dSMartin Matuska for (int i = 0; i < raidz_vd->vdev_children; i++) { 3923e716630dSMartin Matuska /* Quick check if a child is being replaced */ 3924e716630dSMartin Matuska if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf) 3925e716630dSMartin Matuska return (B_TRUE); 3926e716630dSMartin Matuska } 3927e716630dSMartin Matuska return (B_FALSE); 3928e716630dSMartin Matuska } 3929e716630dSMartin Matuska 3930e716630dSMartin Matuska static boolean_t 3931e716630dSMartin Matuska raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt, 3932e716630dSMartin Matuska dmu_tx_t *tx) 3933e716630dSMartin Matuska { 3934e716630dSMartin Matuska spa_t *spa = vd->vdev_spa; 3935e716630dSMartin Matuska int ashift = vd->vdev_top->vdev_ashift; 3936e716630dSMartin Matuska uint64_t offset, size; 3937e716630dSMartin Matuska 3938e716630dSMartin Matuska if (!range_tree_find_in(rt, 0, vd->vdev_top->vdev_asize, 3939e716630dSMartin Matuska &offset, &size)) { 3940e716630dSMartin Matuska return (B_FALSE); 3941e716630dSMartin Matuska } 3942e716630dSMartin Matuska ASSERT(IS_P2ALIGNED(offset, 1 << ashift)); 3943e716630dSMartin Matuska 
ASSERT3U(size, >=, 1 << ashift); 3944e716630dSMartin Matuska uint64_t length = 1 << ashift; 3945e716630dSMartin Matuska int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 3946e716630dSMartin Matuska 3947e716630dSMartin Matuska uint64_t blkid = offset >> ashift; 3948e716630dSMartin Matuska 3949e716630dSMartin Matuska int old_children = vd->vdev_children - 1; 3950e716630dSMartin Matuska 3951e716630dSMartin Matuska /* 3952e716630dSMartin Matuska * We can only progress to the point that writes will not overlap 3953e716630dSMartin Matuska * with blocks whose progress has not yet been recorded on disk. 3954e716630dSMartin Matuska * Since partially-copied rows are still read from the old location, 3955e716630dSMartin Matuska * we need to stop one row before the sector-wise overlap, to prevent 3956e716630dSMartin Matuska * row-wise overlap. 3957e716630dSMartin Matuska * 3958e716630dSMartin Matuska * Note that even if we are skipping over a large unallocated region, 3959e716630dSMartin Matuska * we can't move the on-disk progress to `offset`, because concurrent 3960e716630dSMartin Matuska * writes/allocations could still use the currently-unallocated 3961e716630dSMartin Matuska * region. 3962e716630dSMartin Matuska */ 3963e716630dSMartin Matuska uint64_t ubsync_blkid = 3964e716630dSMartin Matuska RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift; 3965e716630dSMartin Matuska uint64_t next_overwrite_blkid = ubsync_blkid + 3966e716630dSMartin Matuska ubsync_blkid / old_children - old_children; 3967e716630dSMartin Matuska VERIFY3U(next_overwrite_blkid, >, ubsync_blkid); 3968e716630dSMartin Matuska 3969e716630dSMartin Matuska if (blkid >= next_overwrite_blkid) { 3970e716630dSMartin Matuska raidz_reflow_record_progress(vre, 3971e716630dSMartin Matuska next_overwrite_blkid << ashift, tx); 3972e716630dSMartin Matuska return (B_TRUE); 3973e716630dSMartin Matuska } 3974e716630dSMartin Matuska 3975e716630dSMartin Matuska range_tree_remove(rt, offset, length); 3976e716630dSMartin Matuska 3977e716630dSMartin Matuska raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra), KM_SLEEP); 3978e716630dSMartin Matuska rra->rra_vre = vre; 3979e716630dSMartin Matuska rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock, 3980e716630dSMartin Matuska offset, length, RL_WRITER); 3981e716630dSMartin Matuska rra->rra_txg = dmu_tx_get_txg(tx); 3982e716630dSMartin Matuska 3983e716630dSMartin Matuska raidz_reflow_record_progress(vre, offset + length, tx); 3984e716630dSMartin Matuska 3985e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 3986e716630dSMartin Matuska vre->vre_outstanding_bytes += length; 3987e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 3988e716630dSMartin Matuska 3989e716630dSMartin Matuska /* 3990e716630dSMartin Matuska * SCL_STATE will be released when the read and write are done, 3991e716630dSMartin Matuska * by raidz_reflow_write_done(). 
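 *
 * A worked example of the overlap limit computed above (illustrative
 * numbers only): with 5 children (old_children == 4) and on-disk
 * progress synced through blkid 1000, next_overwrite_blkid is
 * 1000 + 1000 / 4 - 4 = 1246, so this call will not copy blkid 1246 or
 * beyond until more progress has made it to disk. Each call that does
 * copy moves a single sector (length == 1 << ashift).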
3992e716630dSMartin Matuska */ 3993e716630dSMartin Matuska spa_config_enter(spa, SCL_STATE, spa, RW_READER); 3994e716630dSMartin Matuska 3995e716630dSMartin Matuska /* check if a replacing vdev was added, if so treat it as an error */ 3996e716630dSMartin Matuska if (vdev_raidz_expand_child_replacing(vd)) { 3997e716630dSMartin Matuska zfs_dbgmsg("replacing vdev encountered, reflow paused at " 3998e716630dSMartin Matuska "offset=%llu txg=%llu", 3999e716630dSMartin Matuska (long long)rra->rra_lr->lr_offset, 4000e716630dSMartin Matuska (long long)rra->rra_txg); 4001e716630dSMartin Matuska 4002e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 4003e716630dSMartin Matuska vre->vre_failed_offset = 4004e716630dSMartin Matuska MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); 4005e716630dSMartin Matuska cv_signal(&vre->vre_cv); 4006e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 4007e716630dSMartin Matuska 4008e716630dSMartin Matuska /* drop everything we acquired */ 4009e716630dSMartin Matuska zfs_rangelock_exit(rra->rra_lr); 4010e716630dSMartin Matuska kmem_free(rra, sizeof (*rra)); 4011e716630dSMartin Matuska spa_config_exit(spa, SCL_STATE, spa); 4012e716630dSMartin Matuska return (B_TRUE); 4013e716630dSMartin Matuska } 4014e716630dSMartin Matuska 4015e716630dSMartin Matuska zio_t *pio = spa->spa_txg_zio[txgoff]; 4016e716630dSMartin Matuska abd_t *abd = abd_alloc_for_io(length, B_FALSE); 4017e716630dSMartin Matuska zio_t *write_zio = zio_vdev_child_io(pio, NULL, 4018e716630dSMartin Matuska vd->vdev_child[blkid % vd->vdev_children], 4019e716630dSMartin Matuska (blkid / vd->vdev_children) << ashift, 4020e716630dSMartin Matuska abd, length, 4021e716630dSMartin Matuska ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, 4022e716630dSMartin Matuska ZIO_FLAG_CANFAIL, 4023e716630dSMartin Matuska raidz_reflow_write_done, rra); 4024e716630dSMartin Matuska 4025e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(write_zio, NULL, 4026e716630dSMartin Matuska vd->vdev_child[blkid % old_children], 4027e716630dSMartin Matuska (blkid / old_children) << ashift, 4028e716630dSMartin Matuska abd, length, 4029e716630dSMartin Matuska ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 4030e716630dSMartin Matuska ZIO_FLAG_CANFAIL, 4031e716630dSMartin Matuska raidz_reflow_read_done, rra)); 4032e716630dSMartin Matuska 4033e716630dSMartin Matuska return (B_FALSE); 4034e716630dSMartin Matuska } 4035e716630dSMartin Matuska 4036e716630dSMartin Matuska /* 4037e716630dSMartin Matuska * For testing (ztest specific) 4038e716630dSMartin Matuska */ 4039e716630dSMartin Matuska static void 4040e716630dSMartin Matuska raidz_expand_pause(uint_t pause_point) 4041e716630dSMartin Matuska { 4042e716630dSMartin Matuska while (raidz_expand_pause_point != 0 && 4043e716630dSMartin Matuska raidz_expand_pause_point <= pause_point) 4044e716630dSMartin Matuska delay(hz); 4045e716630dSMartin Matuska } 4046e716630dSMartin Matuska 4047e716630dSMartin Matuska static void 4048e716630dSMartin Matuska raidz_scratch_child_done(zio_t *zio) 4049e716630dSMartin Matuska { 4050e716630dSMartin Matuska zio_t *pio = zio->io_private; 4051e716630dSMartin Matuska 4052e716630dSMartin Matuska mutex_enter(&pio->io_lock); 4053e716630dSMartin Matuska pio->io_error = zio_worst_error(pio->io_error, zio->io_error); 4054e716630dSMartin Matuska mutex_exit(&pio->io_lock); 4055e716630dSMartin Matuska } 4056e716630dSMartin Matuska 4057e716630dSMartin Matuska /* 4058e716630dSMartin Matuska * Reflow the beginning portion of the vdev into an intermediate scratch area 4059e716630dSMartin Matuska * in 
memory and on disk. This operation must be persisted on disk before we 4060e716630dSMartin Matuska * proceed to overwrite the beginning portion with the reflowed data. 4061e716630dSMartin Matuska * 4062e716630dSMartin Matuska * This multi-step task can fail to complete if disk errors are encountered 4063e716630dSMartin Matuska * and we can return here after a pause (waiting for disk to become healthy). 4064e716630dSMartin Matuska */ 4065e716630dSMartin Matuska static void 4066e716630dSMartin Matuska raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) 4067e716630dSMartin Matuska { 4068e716630dSMartin Matuska vdev_raidz_expand_t *vre = arg; 4069e716630dSMartin Matuska spa_t *spa = dmu_tx_pool(tx)->dp_spa; 4070e716630dSMartin Matuska zio_t *pio; 4071e716630dSMartin Matuska int error; 4072e716630dSMartin Matuska 4073e716630dSMartin Matuska spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4074e716630dSMartin Matuska vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4075e716630dSMartin Matuska int ashift = raidvd->vdev_ashift; 4076aca928a5SMartin Matuska uint64_t write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << ashift, 4077aca928a5SMartin Matuska uint64_t); 4078e716630dSMartin Matuska uint64_t logical_size = write_size * raidvd->vdev_children; 4079e716630dSMartin Matuska uint64_t read_size = 4080e716630dSMartin Matuska P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)), 4081e716630dSMartin Matuska 1 << ashift); 4082e716630dSMartin Matuska 4083e716630dSMartin Matuska /* 4084e716630dSMartin Matuska * The scratch space must be large enough to get us to the point 4085e716630dSMartin Matuska * that one row does not overlap itself when moved. This is checked 4086e716630dSMartin Matuska * by vdev_raidz_attach_check(). 4087e716630dSMartin Matuska */ 4088e716630dSMartin Matuska VERIFY3U(write_size, >=, raidvd->vdev_children << ashift); 4089e716630dSMartin Matuska VERIFY3U(write_size, <=, VDEV_BOOT_SIZE); 4090e716630dSMartin Matuska VERIFY3U(write_size, <=, read_size); 4091e716630dSMartin Matuska 4092e716630dSMartin Matuska zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, 4093e716630dSMartin Matuska 0, logical_size, RL_WRITER); 4094e716630dSMartin Matuska 4095e716630dSMartin Matuska abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), 4096e716630dSMartin Matuska KM_SLEEP); 4097e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) { 4098e716630dSMartin Matuska abds[i] = abd_alloc_linear(read_size, B_FALSE); 4099e716630dSMartin Matuska } 4100e716630dSMartin Matuska 4101e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1); 4102e716630dSMartin Matuska 4103e716630dSMartin Matuska /* 4104e716630dSMartin Matuska * If we have already written the scratch area then we must read from 4105e716630dSMartin Matuska * there, since new writes were redirected there while we were paused 4106e716630dSMartin Matuska * or the original location may have been partially overwritten with 4107e716630dSMartin Matuska * reflowed data. 4108e716630dSMartin Matuska */ 4109e716630dSMartin Matuska if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) { 4110e716630dSMartin Matuska VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size); 4111e716630dSMartin Matuska /* 4112e716630dSMartin Matuska * Read from scratch space. 
4113e716630dSMartin Matuska */ 4114e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4115e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) { 4116e716630dSMartin Matuska /* 4117e716630dSMartin Matuska * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE 4118e716630dSMartin Matuska * to the offset to calculate the physical offset to 4119e716630dSMartin Matuska * write to. Passing in a negative offset makes us 4120e716630dSMartin Matuska * access the scratch area. 4121e716630dSMartin Matuska */ 4122e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, 4123e716630dSMartin Matuska raidvd->vdev_child[i], 4124e716630dSMartin Matuska VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], 4125e716630dSMartin Matuska write_size, ZIO_TYPE_READ, ZIO_PRIORITY_ASYNC_READ, 4126e716630dSMartin Matuska ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); 4127e716630dSMartin Matuska } 4128e716630dSMartin Matuska error = zio_wait(pio); 4129e716630dSMartin Matuska if (error != 0) { 4130e716630dSMartin Matuska zfs_dbgmsg("reflow: error %d reading scratch location", 4131e716630dSMartin Matuska error); 4132e716630dSMartin Matuska goto io_error_exit; 4133e716630dSMartin Matuska } 4134e716630dSMartin Matuska goto overwrite; 4135e716630dSMartin Matuska } 4136e716630dSMartin Matuska 4137e716630dSMartin Matuska /* 4138e716630dSMartin Matuska * Read from original location. 4139e716630dSMartin Matuska */ 4140e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4141e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children - 1; i++) { 4142e716630dSMartin Matuska ASSERT0(vdev_is_dead(raidvd->vdev_child[i])); 4143e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4144e716630dSMartin Matuska 0, abds[i], read_size, ZIO_TYPE_READ, 4145e716630dSMartin Matuska ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, 4146e716630dSMartin Matuska raidz_scratch_child_done, pio)); 4147e716630dSMartin Matuska } 4148e716630dSMartin Matuska error = zio_wait(pio); 4149e716630dSMartin Matuska if (error != 0) { 4150e716630dSMartin Matuska zfs_dbgmsg("reflow: error %d reading original location", error); 4151e716630dSMartin Matuska io_error_exit: 4152e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) 4153e716630dSMartin Matuska abd_free(abds[i]); 4154e716630dSMartin Matuska kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); 4155e716630dSMartin Matuska zfs_rangelock_exit(lr); 4156e716630dSMartin Matuska spa_config_exit(spa, SCL_STATE, FTAG); 4157e716630dSMartin Matuska return; 4158e716630dSMartin Matuska } 4159e716630dSMartin Matuska 4160e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2); 4161e716630dSMartin Matuska 4162e716630dSMartin Matuska /* 4163e716630dSMartin Matuska * Reflow in memory. 
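 *
 * For example (illustrative numbers): with 5 children, logical sector 7
 * moves from old child 7 % 4 = 3, old row offset (7 / 4) << ashift, to
 * new child 7 % 5 = 2, new row offset (7 / 5) << ashift. Sectors 0
 * through vdev_children - 2 already map to the same child and offset in
 * both layouts, which is why the loop below starts at vdev_children - 1.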
4164e716630dSMartin Matuska */ 4165e716630dSMartin Matuska uint64_t logical_sectors = logical_size >> ashift; 4166e716630dSMartin Matuska for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) { 4167e716630dSMartin Matuska int oldchild = i % (raidvd->vdev_children - 1); 4168e716630dSMartin Matuska uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift; 4169e716630dSMartin Matuska 4170e716630dSMartin Matuska int newchild = i % raidvd->vdev_children; 4171e716630dSMartin Matuska uint64_t newoff = (i / raidvd->vdev_children) << ashift; 4172e716630dSMartin Matuska 4173e716630dSMartin Matuska /* a single sector should not be copying over itself */ 4174e716630dSMartin Matuska ASSERT(!(newchild == oldchild && newoff == oldoff)); 4175e716630dSMartin Matuska 4176e716630dSMartin Matuska abd_copy_off(abds[newchild], abds[oldchild], 4177e716630dSMartin Matuska newoff, oldoff, 1 << ashift); 4178e716630dSMartin Matuska } 4179e716630dSMartin Matuska 4180e716630dSMartin Matuska /* 4181e716630dSMartin Matuska * Verify that we filled in everything we intended to (write_size on 4182e716630dSMartin Matuska * each child). 4183e716630dSMartin Matuska */ 4184e716630dSMartin Matuska VERIFY0(logical_sectors % raidvd->vdev_children); 4185e716630dSMartin Matuska VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==, 4186e716630dSMartin Matuska write_size); 4187e716630dSMartin Matuska 4188e716630dSMartin Matuska /* 4189e716630dSMartin Matuska * Write to scratch location (boot area). 4190e716630dSMartin Matuska */ 4191e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4192e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) { 4193e716630dSMartin Matuska /* 4194e716630dSMartin Matuska * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to 4195e716630dSMartin Matuska * the offset to calculate the physical offset to write to. 4196e716630dSMartin Matuska * Passing in a negative offset lets us access the boot area. 4197e716630dSMartin Matuska */ 4198e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4199e716630dSMartin Matuska VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], 4200e716630dSMartin Matuska write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, 4201e716630dSMartin Matuska ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); 4202e716630dSMartin Matuska } 4203e716630dSMartin Matuska error = zio_wait(pio); 4204e716630dSMartin Matuska if (error != 0) { 4205e716630dSMartin Matuska zfs_dbgmsg("reflow: error %d writing scratch location", error); 4206e716630dSMartin Matuska goto io_error_exit; 4207e716630dSMartin Matuska } 4208e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, 0); 4209e716630dSMartin Matuska zio_flush(pio, raidvd); 4210e716630dSMartin Matuska zio_wait(pio); 4211e716630dSMartin Matuska 4212e716630dSMartin Matuska zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area", 4213e716630dSMartin Matuska (long long)logical_size); 4214e716630dSMartin Matuska 4215e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3); 4216e716630dSMartin Matuska 4217e716630dSMartin Matuska /* 4218e716630dSMartin Matuska * Update uberblock to indicate that scratch space is valid. This is 4219e716630dSMartin Matuska * needed because after this point, the real location may be 4220e716630dSMartin Matuska * overwritten. If we crash, we need to get the data from the 4221e716630dSMartin Matuska * scratch space, rather than the real location. 
4222e716630dSMartin Matuska * 4223e716630dSMartin Matuska * Note: ub_timestamp is bumped so that vdev_uberblock_compare() 4224e716630dSMartin Matuska * will prefer this uberblock. 4225e716630dSMartin Matuska */ 4226e716630dSMartin Matuska RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size); 4227e716630dSMartin Matuska spa->spa_ubsync.ub_timestamp++; 4228e716630dSMartin Matuska ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, 4229e716630dSMartin Matuska &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); 4230e716630dSMartin Matuska if (spa_multihost(spa)) 4231e716630dSMartin Matuska mmp_update_uberblock(spa, &spa->spa_ubsync); 4232e716630dSMartin Matuska 4233e716630dSMartin Matuska zfs_dbgmsg("reflow: uberblock updated " 4234e716630dSMartin Matuska "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)", 4235e716630dSMartin Matuska (long long)spa->spa_ubsync.ub_txg, 4236e716630dSMartin Matuska (long long)logical_size, 4237e716630dSMartin Matuska (long long)spa->spa_ubsync.ub_timestamp); 4238e716630dSMartin Matuska 4239e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID); 4240e716630dSMartin Matuska 4241e716630dSMartin Matuska /* 4242e716630dSMartin Matuska * Overwrite with reflow'ed data. 4243e716630dSMartin Matuska */ 4244e716630dSMartin Matuska overwrite: 4245e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4246e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) { 4247e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4248e716630dSMartin Matuska 0, abds[i], write_size, ZIO_TYPE_WRITE, 4249e716630dSMartin Matuska ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, 4250e716630dSMartin Matuska raidz_scratch_child_done, pio)); 4251e716630dSMartin Matuska } 4252e716630dSMartin Matuska error = zio_wait(pio); 4253e716630dSMartin Matuska if (error != 0) { 4254e716630dSMartin Matuska /* 4255e716630dSMartin Matuska * When we exit early here and drop the range lock, new 4256e716630dSMartin Matuska * writes will go into the scratch area so we'll need to 4257e716630dSMartin Matuska * read from there when we return after pausing. 4258e716630dSMartin Matuska */ 4259e716630dSMartin Matuska zfs_dbgmsg("reflow: error %d writing real location", error); 4260e716630dSMartin Matuska /* 4261e716630dSMartin Matuska * Update the uberblock that is written when this txg completes. 4262e716630dSMartin Matuska */ 4263e716630dSMartin Matuska RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID, 4264e716630dSMartin Matuska logical_size); 4265e716630dSMartin Matuska goto io_error_exit; 4266e716630dSMartin Matuska } 4267e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, 0); 4268e716630dSMartin Matuska zio_flush(pio, raidvd); 4269e716630dSMartin Matuska zio_wait(pio); 4270e716630dSMartin Matuska 4271e716630dSMartin Matuska zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location", 4272e716630dSMartin Matuska (long long)logical_size); 4273e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) 4274e716630dSMartin Matuska abd_free(abds[i]); 4275e716630dSMartin Matuska kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); 4276e716630dSMartin Matuska 4277e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED); 4278e716630dSMartin Matuska 4279e716630dSMartin Matuska /* 4280e716630dSMartin Matuska * Update uberblock to indicate that the initial part has been 4281e716630dSMartin Matuska * reflow'ed. 
This is needed because after this point (when we exit 4282e716630dSMartin Matuska * the rangelock), we allow regular writes to this region, which will 4283e716630dSMartin Matuska * be written to the new location only (because reflow_offset_next == 4284e716630dSMartin Matuska * reflow_offset_synced). If we crashed and re-copied from the 4285e716630dSMartin Matuska * scratch space, we would lose the regular writes. 4286e716630dSMartin Matuska */ 4287e716630dSMartin Matuska RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED, 4288e716630dSMartin Matuska logical_size); 4289e716630dSMartin Matuska spa->spa_ubsync.ub_timestamp++; 4290e716630dSMartin Matuska ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, 4291e716630dSMartin Matuska &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); 4292e716630dSMartin Matuska if (spa_multihost(spa)) 4293e716630dSMartin Matuska mmp_update_uberblock(spa, &spa->spa_ubsync); 4294e716630dSMartin Matuska 4295e716630dSMartin Matuska zfs_dbgmsg("reflow: uberblock updated " 4296e716630dSMartin Matuska "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", 4297e716630dSMartin Matuska (long long)spa->spa_ubsync.ub_txg, 4298e716630dSMartin Matuska (long long)logical_size, 4299e716630dSMartin Matuska (long long)spa->spa_ubsync.ub_timestamp); 4300e716630dSMartin Matuska 4301e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1); 4302e716630dSMartin Matuska 4303e716630dSMartin Matuska /* 4304e716630dSMartin Matuska * Update progress. 4305e716630dSMartin Matuska */ 4306e716630dSMartin Matuska vre->vre_offset = logical_size; 4307e716630dSMartin Matuska zfs_rangelock_exit(lr); 4308e716630dSMartin Matuska spa_config_exit(spa, SCL_STATE, FTAG); 4309e716630dSMartin Matuska 4310e716630dSMartin Matuska int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 4311e716630dSMartin Matuska vre->vre_offset_pertxg[txgoff] = vre->vre_offset; 4312e716630dSMartin Matuska vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; 4313e716630dSMartin Matuska /* 4314e716630dSMartin Matuska * Note - raidz_reflow_sync() will update the uberblock state to 4315e716630dSMartin Matuska * RRSS_SCRATCH_INVALID_SYNCED_REFLOW 4316e716630dSMartin Matuska */ 4317e716630dSMartin Matuska raidz_reflow_sync(spa, tx); 4318e716630dSMartin Matuska 4319e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2); 4320e716630dSMartin Matuska } 4321e716630dSMartin Matuska 4322e716630dSMartin Matuska /* 4323e716630dSMartin Matuska * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work 4324e716630dSMartin Matuska * here. No other i/o can be in progress, so we don't need the vre_rangelock. 
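 *
 * This is expected to run during pool import (note the
 * dmu_tx_create_assigned() with spa_first_txg() below), while the
 * uberblock still says RRSS_SCRATCH_VALID, so the scratch copy in the
 * boot area is the authoritative version of the first logical_size
 * bytes of the vdev.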
4325e716630dSMartin Matuska */ 4326e716630dSMartin Matuska void 4327e716630dSMartin Matuska vdev_raidz_reflow_copy_scratch(spa_t *spa) 4328e716630dSMartin Matuska { 4329e716630dSMartin Matuska vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4330e716630dSMartin Matuska uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock); 4331e716630dSMartin Matuska ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID); 4332e716630dSMartin Matuska 4333e716630dSMartin Matuska spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4334e716630dSMartin Matuska vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4335e716630dSMartin Matuska ASSERT0(logical_size % raidvd->vdev_children); 4336e716630dSMartin Matuska uint64_t write_size = logical_size / raidvd->vdev_children; 4337e716630dSMartin Matuska 4338e716630dSMartin Matuska zio_t *pio; 4339e716630dSMartin Matuska 4340e716630dSMartin Matuska /* 4341e716630dSMartin Matuska * Read from scratch space. 4342e716630dSMartin Matuska */ 4343e716630dSMartin Matuska abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), 4344e716630dSMartin Matuska KM_SLEEP); 4345e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) { 4346e716630dSMartin Matuska abds[i] = abd_alloc_linear(write_size, B_FALSE); 4347e716630dSMartin Matuska } 4348e716630dSMartin Matuska 4349e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, 0); 4350e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) { 4351e716630dSMartin Matuska /* 4352e716630dSMartin Matuska * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to 4353e716630dSMartin Matuska * the offset to calculate the physical offset to write to. 4354e716630dSMartin Matuska * Passing in a negative offset lets us access the boot area. 4355e716630dSMartin Matuska */ 4356e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4357e716630dSMartin Matuska VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], 4358e716630dSMartin Matuska write_size, ZIO_TYPE_READ, 4359e716630dSMartin Matuska ZIO_PRIORITY_ASYNC_READ, 0, 4360e716630dSMartin Matuska raidz_scratch_child_done, pio)); 4361e716630dSMartin Matuska } 4362e716630dSMartin Matuska zio_wait(pio); 4363e716630dSMartin Matuska 4364e716630dSMartin Matuska /* 4365e716630dSMartin Matuska * Overwrite real location with reflow'ed data. 
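 * Unlike raidz_reflow_scratch_sync(), these zios are not marked
 * ZIO_FLAG_CANFAIL and their status is not examined; if we crash before
 * the uberblock update below, the state is still RRSS_SCRATCH_VALID and
 * this entire function simply runs again on the next import.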
4366e716630dSMartin Matuska */ 4367e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, 0); 4368e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) { 4369e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4370e716630dSMartin Matuska 0, abds[i], write_size, ZIO_TYPE_WRITE, 4371e716630dSMartin Matuska ZIO_PRIORITY_ASYNC_WRITE, 0, 4372e716630dSMartin Matuska raidz_scratch_child_done, pio)); 4373e716630dSMartin Matuska } 4374e716630dSMartin Matuska zio_wait(pio); 4375e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, 0); 4376e716630dSMartin Matuska zio_flush(pio, raidvd); 4377e716630dSMartin Matuska zio_wait(pio); 4378e716630dSMartin Matuska 4379e716630dSMartin Matuska zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) " 4380e716630dSMartin Matuska "to real location", (long long)logical_size); 4381e716630dSMartin Matuska 4382e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) 4383e716630dSMartin Matuska abd_free(abds[i]); 4384e716630dSMartin Matuska kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); 4385e716630dSMartin Matuska 4386e716630dSMartin Matuska /* 4387e716630dSMartin Matuska * Update uberblock. 4388e716630dSMartin Matuska */ 4389e716630dSMartin Matuska RAIDZ_REFLOW_SET(&spa->spa_ubsync, 4390e716630dSMartin Matuska RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size); 4391e716630dSMartin Matuska spa->spa_ubsync.ub_timestamp++; 4392e716630dSMartin Matuska VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, 4393e716630dSMartin Matuska &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); 4394e716630dSMartin Matuska if (spa_multihost(spa)) 4395e716630dSMartin Matuska mmp_update_uberblock(spa, &spa->spa_ubsync); 4396e716630dSMartin Matuska 4397e716630dSMartin Matuska zfs_dbgmsg("reflow recovery: uberblock updated " 4398e716630dSMartin Matuska "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", 4399e716630dSMartin Matuska (long long)spa->spa_ubsync.ub_txg, 4400e716630dSMartin Matuska (long long)logical_size, 4401e716630dSMartin Matuska (long long)spa->spa_ubsync.ub_timestamp); 4402e716630dSMartin Matuska 4403e716630dSMartin Matuska dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, 4404e716630dSMartin Matuska spa_first_txg(spa)); 4405e716630dSMartin Matuska int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 4406e716630dSMartin Matuska vre->vre_offset = logical_size; 4407e716630dSMartin Matuska vre->vre_offset_pertxg[txgoff] = vre->vre_offset; 4408e716630dSMartin Matuska vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; 4409e716630dSMartin Matuska /* 4410e716630dSMartin Matuska * Note that raidz_reflow_sync() will update the uberblock once more 4411e716630dSMartin Matuska */ 4412e716630dSMartin Matuska raidz_reflow_sync(spa, tx); 4413e716630dSMartin Matuska 4414e716630dSMartin Matuska dmu_tx_commit(tx); 4415e716630dSMartin Matuska 4416e716630dSMartin Matuska spa_config_exit(spa, SCL_STATE, FTAG); 4417e716630dSMartin Matuska } 4418e716630dSMartin Matuska 4419e716630dSMartin Matuska static boolean_t 4420e716630dSMartin Matuska spa_raidz_expand_thread_check(void *arg, zthr_t *zthr) 4421e716630dSMartin Matuska { 4422e716630dSMartin Matuska (void) zthr; 4423e716630dSMartin Matuska spa_t *spa = arg; 4424e716630dSMartin Matuska 4425e716630dSMartin Matuska return (spa->spa_raidz_expand != NULL && 4426e716630dSMartin Matuska !spa->spa_raidz_expand->vre_waiting_for_resilver); 4427e716630dSMartin Matuska } 4428e716630dSMartin Matuska 4429e716630dSMartin Matuska /* 4430e716630dSMartin Matuska * 
RAIDZ expansion background thread 4431e716630dSMartin Matuska * 4432e716630dSMartin Matuska * Can be called multiple times if the reflow is paused 4433e716630dSMartin Matuska */ 4434e716630dSMartin Matuska static void 4435e716630dSMartin Matuska spa_raidz_expand_thread(void *arg, zthr_t *zthr) 4436e716630dSMartin Matuska { 4437e716630dSMartin Matuska spa_t *spa = arg; 4438e716630dSMartin Matuska vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4439e716630dSMartin Matuska 4440e716630dSMartin Matuska if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) 4441e716630dSMartin Matuska vre->vre_offset = 0; 4442e716630dSMartin Matuska else 4443e716630dSMartin Matuska vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync); 4444e716630dSMartin Matuska 4445e716630dSMartin Matuska /* Reflow the beginning portion using the scratch area */ 4446e716630dSMartin Matuska if (vre->vre_offset == 0) { 4447e716630dSMartin Matuska VERIFY0(dsl_sync_task(spa_name(spa), 4448e716630dSMartin Matuska NULL, raidz_reflow_scratch_sync, 4449e716630dSMartin Matuska vre, 0, ZFS_SPACE_CHECK_NONE)); 4450e716630dSMartin Matuska 4451e716630dSMartin Matuska /* if we encountered errors then pause */ 4452e716630dSMartin Matuska if (vre->vre_offset == 0) { 4453e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 4454e716630dSMartin Matuska vre->vre_waiting_for_resilver = B_TRUE; 4455e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 4456e716630dSMartin Matuska return; 4457e716630dSMartin Matuska } 4458e716630dSMartin Matuska } 4459e716630dSMartin Matuska 4460e716630dSMartin Matuska spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4461e716630dSMartin Matuska vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4462e716630dSMartin Matuska 4463e716630dSMartin Matuska uint64_t guid = raidvd->vdev_guid; 4464e716630dSMartin Matuska 4465e716630dSMartin Matuska /* Iterate over all the remaining metaslabs */ 4466e716630dSMartin Matuska for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift; 4467e716630dSMartin Matuska i < raidvd->vdev_ms_count && 4468e716630dSMartin Matuska !zthr_iscancelled(zthr) && 4469e716630dSMartin Matuska vre->vre_failed_offset == UINT64_MAX; i++) { 4470e716630dSMartin Matuska metaslab_t *msp = raidvd->vdev_ms[i]; 4471e716630dSMartin Matuska 4472e716630dSMartin Matuska metaslab_disable(msp); 4473e716630dSMartin Matuska mutex_enter(&msp->ms_lock); 4474e716630dSMartin Matuska 4475e716630dSMartin Matuska /* 4476e716630dSMartin Matuska * The metaslab may be newly created (for the expanded 4477e716630dSMartin Matuska * space), in which case its trees won't exist yet, 4478e716630dSMartin Matuska * so we need to bail out early. 4479e716630dSMartin Matuska */ 4480e716630dSMartin Matuska if (msp->ms_new) { 4481e716630dSMartin Matuska mutex_exit(&msp->ms_lock); 4482e716630dSMartin Matuska metaslab_enable(msp, B_FALSE, B_FALSE); 4483e716630dSMartin Matuska continue; 4484e716630dSMartin Matuska } 4485e716630dSMartin Matuska 4486e716630dSMartin Matuska VERIFY0(metaslab_load(msp)); 4487e716630dSMartin Matuska 4488e716630dSMartin Matuska /* 4489e716630dSMartin Matuska * We want to copy everything except the free (allocatable) 4490e716630dSMartin Matuska * space. Note that there may be a little bit more free 4491e716630dSMartin Matuska * space (e.g. in ms_defer), and it's fine to copy that too.
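 *
 * In other words, the tree built below is
 *
 *	rt = [ms_start, ms_start + ms_size) minus ms_allocatable
 *
 * i.e. start from the whole metaslab and punch out whatever is
 * currently free, leaving the segments that may hold allocated data.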
4492e716630dSMartin Matuska */ 4493e716630dSMartin Matuska range_tree_t *rt = range_tree_create(NULL, RANGE_SEG64, 4494e716630dSMartin Matuska NULL, 0, 0); 4495e716630dSMartin Matuska range_tree_add(rt, msp->ms_start, msp->ms_size); 4496e716630dSMartin Matuska range_tree_walk(msp->ms_allocatable, range_tree_remove, rt); 4497e716630dSMartin Matuska mutex_exit(&msp->ms_lock); 4498e716630dSMartin Matuska 4499e716630dSMartin Matuska /* 4500e716630dSMartin Matuska * Force the last sector of each metaslab to be copied. This 4501e716630dSMartin Matuska * ensures that we advance the on-disk progress to the end of 4502e716630dSMartin Matuska * this metaslab while the metaslab is disabled. Otherwise, we 4503e716630dSMartin Matuska * could move past this metaslab without advancing the on-disk 4504e716630dSMartin Matuska * progress, and then an allocation to this metaslab would not 4505e716630dSMartin Matuska * be copied. 4506e716630dSMartin Matuska */ 4507e716630dSMartin Matuska int sectorsz = 1 << raidvd->vdev_ashift; 4508e716630dSMartin Matuska uint64_t ms_last_offset = msp->ms_start + 4509e716630dSMartin Matuska msp->ms_size - sectorsz; 4510e716630dSMartin Matuska if (!range_tree_contains(rt, ms_last_offset, sectorsz)) { 4511e716630dSMartin Matuska range_tree_add(rt, ms_last_offset, sectorsz); 4512e716630dSMartin Matuska } 4513e716630dSMartin Matuska 4514e716630dSMartin Matuska /* 4515e716630dSMartin Matuska * When we are resuming from a paused expansion (i.e. 4516e716630dSMartin Matuska * when importing a pool with an expansion in progress), 4517e716630dSMartin Matuska * discard any state that we have already processed. 4518e716630dSMartin Matuska */ 4519e716630dSMartin Matuska range_tree_clear(rt, 0, vre->vre_offset); 4520e716630dSMartin Matuska 4521e716630dSMartin Matuska while (!zthr_iscancelled(zthr) && 4522e716630dSMartin Matuska !range_tree_is_empty(rt) && 4523e716630dSMartin Matuska vre->vre_failed_offset == UINT64_MAX) { 4524e716630dSMartin Matuska 4525e716630dSMartin Matuska /* 4526e716630dSMartin Matuska * We need to periodically drop the config lock so that 4527e716630dSMartin Matuska * writers can get in. Additionally, we can't wait 4528e716630dSMartin Matuska * for a txg to sync while holding a config lock 4529e716630dSMartin Matuska * (since a waiting writer could cause a 3-way deadlock 4530e716630dSMartin Matuska * with the sync thread, which also gets a config 4531e716630dSMartin Matuska * lock for reader). So we can't hold the config lock 4532e716630dSMartin Matuska * while calling dmu_tx_assign(). 4533e716630dSMartin Matuska */ 4534e716630dSMartin Matuska spa_config_exit(spa, SCL_CONFIG, FTAG); 4535e716630dSMartin Matuska 4536e716630dSMartin Matuska /* 4537e716630dSMartin Matuska * If requested, pause the reflow when the amount 4538e716630dSMartin Matuska * specified by raidz_expand_max_reflow_bytes is reached 4539e716630dSMartin Matuska * 4540e716630dSMartin Matuska * This pause is only used during testing or debugging.
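 *
 * Right after this testing hook the loop also throttles itself: it
 * sleeps on vre_cv while vre_outstanding_bytes exceeds
 * raidz_expand_max_copy_bytes, so no more than that much copy i/o is
 * in flight at once; the cv is signalled as outstanding reflow writes
 * complete.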
4541e716630dSMartin Matuska */ 4542e716630dSMartin Matuska while (raidz_expand_max_reflow_bytes != 0 && 4543e716630dSMartin Matuska raidz_expand_max_reflow_bytes <= 4544e716630dSMartin Matuska vre->vre_bytes_copied && !zthr_iscancelled(zthr)) { 4545e716630dSMartin Matuska delay(hz); 4546e716630dSMartin Matuska } 4547e716630dSMartin Matuska 4548e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 4549e716630dSMartin Matuska while (vre->vre_outstanding_bytes > 4550e716630dSMartin Matuska raidz_expand_max_copy_bytes) { 4551e716630dSMartin Matuska cv_wait(&vre->vre_cv, &vre->vre_lock); 4552e716630dSMartin Matuska } 4553e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 4554e716630dSMartin Matuska 4555e716630dSMartin Matuska dmu_tx_t *tx = 4556e716630dSMartin Matuska dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 4557e716630dSMartin Matuska 4558e716630dSMartin Matuska VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); 4559e716630dSMartin Matuska uint64_t txg = dmu_tx_get_txg(tx); 4560e716630dSMartin Matuska 4561e716630dSMartin Matuska /* 4562e716630dSMartin Matuska * Reacquire the vdev_config lock. Theoretically, the 4563e716630dSMartin Matuska * vdev_t that we're expanding may have changed. 4564e716630dSMartin Matuska */ 4565e716630dSMartin Matuska spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4566e716630dSMartin Matuska raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4567e716630dSMartin Matuska 4568e716630dSMartin Matuska boolean_t needsync = 4569e716630dSMartin Matuska raidz_reflow_impl(raidvd, vre, rt, tx); 4570e716630dSMartin Matuska 4571e716630dSMartin Matuska dmu_tx_commit(tx); 4572e716630dSMartin Matuska 4573e716630dSMartin Matuska if (needsync) { 4574e716630dSMartin Matuska spa_config_exit(spa, SCL_CONFIG, FTAG); 4575e716630dSMartin Matuska txg_wait_synced(spa->spa_dsl_pool, txg); 4576e716630dSMartin Matuska spa_config_enter(spa, SCL_CONFIG, FTAG, 4577e716630dSMartin Matuska RW_READER); 4578e716630dSMartin Matuska } 4579e716630dSMartin Matuska } 4580e716630dSMartin Matuska 4581e716630dSMartin Matuska spa_config_exit(spa, SCL_CONFIG, FTAG); 4582e716630dSMartin Matuska 4583e716630dSMartin Matuska metaslab_enable(msp, B_FALSE, B_FALSE); 4584e716630dSMartin Matuska range_tree_vacate(rt, NULL, NULL); 4585e716630dSMartin Matuska range_tree_destroy(rt); 4586e716630dSMartin Matuska 4587e716630dSMartin Matuska spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4588e716630dSMartin Matuska raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4589e716630dSMartin Matuska } 4590e716630dSMartin Matuska 4591e716630dSMartin Matuska spa_config_exit(spa, SCL_CONFIG, FTAG); 4592e716630dSMartin Matuska 4593e716630dSMartin Matuska /* 4594e716630dSMartin Matuska * The txg_wait_synced() here ensures that all reflow zio's have 4595e716630dSMartin Matuska * completed, and vre_failed_offset has been set if necessary. It 4596e716630dSMartin Matuska * also ensures that the progress of the last raidz_reflow_sync() is 4597e716630dSMartin Matuska * written to disk before raidz_reflow_complete_sync() changes the 4598e716630dSMartin Matuska * in-memory vre_state. vdev_raidz_io_start() uses vre_state to 4599e716630dSMartin Matuska * determine if a reflow is in progress, in which case we may need to 4600e716630dSMartin Matuska * write to both old and new locations. 
Therefore we can only change 4601e716630dSMartin Matuska * vre_state once this is not necessary, which is once the on-disk 4602e716630dSMartin Matuska * progress (in spa_ubsync) has been set past any possible writes (to 4603e716630dSMartin Matuska * the end of the last metaslab). 4604e716630dSMartin Matuska */ 4605e716630dSMartin Matuska txg_wait_synced(spa->spa_dsl_pool, 0); 4606e716630dSMartin Matuska 4607e716630dSMartin Matuska if (!zthr_iscancelled(zthr) && 4608e716630dSMartin Matuska vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) { 4609e716630dSMartin Matuska /* 4610e716630dSMartin Matuska * We are not being canceled or paused, so the reflow must be 4611e716630dSMartin Matuska * complete. In that case also mark it as completed on disk. 4612e716630dSMartin Matuska */ 4613e716630dSMartin Matuska ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX); 4614e716630dSMartin Matuska VERIFY0(dsl_sync_task(spa_name(spa), NULL, 4615e716630dSMartin Matuska raidz_reflow_complete_sync, spa, 4616e716630dSMartin Matuska 0, ZFS_SPACE_CHECK_NONE)); 4617e716630dSMartin Matuska (void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL); 4618e716630dSMartin Matuska } else { 4619e716630dSMartin Matuska /* 4620e716630dSMartin Matuska * Wait for all copy zio's to complete and for all the 4621e716630dSMartin Matuska * raidz_reflow_sync() synctasks to be run. 4622e716630dSMartin Matuska */ 4623e716630dSMartin Matuska spa_history_log_internal(spa, "reflow pause", 4624e716630dSMartin Matuska NULL, "offset=%llu failed_offset=%lld", 4625e716630dSMartin Matuska (long long)vre->vre_offset, 4626e716630dSMartin Matuska (long long)vre->vre_failed_offset); 4627e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 4628e716630dSMartin Matuska if (vre->vre_failed_offset != UINT64_MAX) { 4629e716630dSMartin Matuska /* 4630e716630dSMartin Matuska * Reset progress so that we will retry everything 4631e716630dSMartin Matuska * after the point that something failed. 
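 *
 * Setting vre_waiting_for_resilver below also makes
 * spa_raidz_expand_thread_check() return B_FALSE, so this zthr stays
 * idle until raidz_dtl_reassessed() observes the affected child healed
 * (or its replacement completed) and wakes the thread back up.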
4632e716630dSMartin Matuska */ 4633e716630dSMartin Matuska vre->vre_offset = vre->vre_failed_offset; 4634e716630dSMartin Matuska vre->vre_failed_offset = UINT64_MAX; 4635e716630dSMartin Matuska vre->vre_waiting_for_resilver = B_TRUE; 4636e716630dSMartin Matuska } 4637e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 4638e716630dSMartin Matuska } 4639e716630dSMartin Matuska } 4640e716630dSMartin Matuska 4641e716630dSMartin Matuska void 4642e716630dSMartin Matuska spa_start_raidz_expansion_thread(spa_t *spa) 4643e716630dSMartin Matuska { 4644e716630dSMartin Matuska ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL); 4645e716630dSMartin Matuska spa->spa_raidz_expand_zthr = zthr_create("raidz_expand", 4646e716630dSMartin Matuska spa_raidz_expand_thread_check, spa_raidz_expand_thread, 4647e716630dSMartin Matuska spa, defclsyspri); 4648e716630dSMartin Matuska } 4649e716630dSMartin Matuska 4650e716630dSMartin Matuska void 4651e716630dSMartin Matuska raidz_dtl_reassessed(vdev_t *vd) 4652e716630dSMartin Matuska { 4653e716630dSMartin Matuska spa_t *spa = vd->vdev_spa; 4654e716630dSMartin Matuska if (spa->spa_raidz_expand != NULL) { 4655e716630dSMartin Matuska vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4656e716630dSMartin Matuska /* 4657e716630dSMartin Matuska * we get called often from vdev_dtl_reassess() so make 4658e716630dSMartin Matuska * sure it's our vdev and any replacing is complete 4659e716630dSMartin Matuska */ 4660e716630dSMartin Matuska if (vd->vdev_top->vdev_id == vre->vre_vdev_id && 4661e716630dSMartin Matuska !vdev_raidz_expand_child_replacing(vd->vdev_top)) { 4662e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 4663e716630dSMartin Matuska if (vre->vre_waiting_for_resilver) { 4664e716630dSMartin Matuska vdev_dbgmsg(vd, "DTL reassessed, " 4665e716630dSMartin Matuska "continuing raidz expansion"); 4666e716630dSMartin Matuska vre->vre_waiting_for_resilver = B_FALSE; 4667e716630dSMartin Matuska zthr_wakeup(spa->spa_raidz_expand_zthr); 4668e716630dSMartin Matuska } 4669e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 4670e716630dSMartin Matuska } 4671e716630dSMartin Matuska } 4672e716630dSMartin Matuska } 4673e716630dSMartin Matuska 4674e716630dSMartin Matuska int 4675e716630dSMartin Matuska vdev_raidz_attach_check(vdev_t *new_child) 4676e716630dSMartin Matuska { 4677e716630dSMartin Matuska vdev_t *raidvd = new_child->vdev_parent; 4678e716630dSMartin Matuska uint64_t new_children = raidvd->vdev_children; 4679e716630dSMartin Matuska 4680e716630dSMartin Matuska /* 4681e716630dSMartin Matuska * We use the "boot" space as scratch space to handle overwriting the 4682e716630dSMartin Matuska * initial part of the vdev. If it is too small, then this expansion 4683e716630dSMartin Matuska * is not allowed. This would be very unusual (e.g. ashift > 13 and 4684e716630dSMartin Matuska * >200 children). 
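 *
 * For example, assuming the current 3.5 MiB VDEV_BOOT_SIZE: ashift=12
 * (4 KiB sectors) allows up to 896 children, ashift=13 up to 448, and
 * ashift=14 up to 224, all far wider than any practical raidz vdev.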
4685e716630dSMartin Matuska */ 4686e716630dSMartin Matuska if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) { 4687e716630dSMartin Matuska return (EINVAL); 4688e716630dSMartin Matuska } 4689e716630dSMartin Matuska return (0); 4690e716630dSMartin Matuska } 4691e716630dSMartin Matuska 4692e716630dSMartin Matuska void 4693e716630dSMartin Matuska vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx) 4694e716630dSMartin Matuska { 4695e716630dSMartin Matuska vdev_t *new_child = arg; 4696e716630dSMartin Matuska spa_t *spa = new_child->vdev_spa; 4697e716630dSMartin Matuska vdev_t *raidvd = new_child->vdev_parent; 4698e716630dSMartin Matuska vdev_raidz_t *vdrz = raidvd->vdev_tsd; 4699e716630dSMartin Matuska ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops); 4700e716630dSMartin Matuska ASSERT3P(raidvd->vdev_top, ==, raidvd); 4701e716630dSMartin Matuska ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width); 4702e716630dSMartin Matuska ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1); 4703e716630dSMartin Matuska ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==, 4704e716630dSMartin Matuska new_child); 4705e716630dSMartin Matuska 4706e716630dSMartin Matuska spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx); 4707e716630dSMartin Matuska 4708e716630dSMartin Matuska vdrz->vd_physical_width++; 4709e716630dSMartin Matuska 4710e716630dSMartin Matuska VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info); 4711e716630dSMartin Matuska vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id; 4712e716630dSMartin Matuska vdrz->vn_vre.vre_offset = 0; 4713e716630dSMartin Matuska vdrz->vn_vre.vre_failed_offset = UINT64_MAX; 4714e716630dSMartin Matuska spa->spa_raidz_expand = &vdrz->vn_vre; 4715e716630dSMartin Matuska zthr_wakeup(spa->spa_raidz_expand_zthr); 4716e716630dSMartin Matuska 4717e716630dSMartin Matuska /* 4718e716630dSMartin Matuska * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get 4719e716630dSMartin Matuska * written to the config. 
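 *
 * vdev_raidz_config_generate() emits that flag for as long as
 * vre_state == DSS_SCANNING, and vdev_raidz_init() looks for it on
 * import so that spa_raidz_expand and the in-progress widths are
 * reconstructed.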
4720e716630dSMartin Matuska */ 4721e716630dSMartin Matuska vdev_config_dirty(raidvd); 4722e716630dSMartin Matuska 4723e716630dSMartin Matuska vdrz->vn_vre.vre_start_time = gethrestime_sec(); 4724e716630dSMartin Matuska vdrz->vn_vre.vre_end_time = 0; 4725e716630dSMartin Matuska vdrz->vn_vre.vre_state = DSS_SCANNING; 4726e716630dSMartin Matuska vdrz->vn_vre.vre_bytes_copied = 0; 4727e716630dSMartin Matuska 4728e716630dSMartin Matuska uint64_t state = vdrz->vn_vre.vre_state; 4729e716630dSMartin Matuska VERIFY0(zap_update(spa->spa_meta_objset, 4730e716630dSMartin Matuska raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, 4731e716630dSMartin Matuska sizeof (state), 1, &state, tx)); 4732e716630dSMartin Matuska 4733e716630dSMartin Matuska uint64_t start_time = vdrz->vn_vre.vre_start_time; 4734e716630dSMartin Matuska VERIFY0(zap_update(spa->spa_meta_objset, 4735e716630dSMartin Matuska raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, 4736e716630dSMartin Matuska sizeof (start_time), 1, &start_time, tx)); 4737e716630dSMartin Matuska 4738e716630dSMartin Matuska (void) zap_remove(spa->spa_meta_objset, 4739e716630dSMartin Matuska raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx); 4740e716630dSMartin Matuska (void) zap_remove(spa->spa_meta_objset, 4741e716630dSMartin Matuska raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx); 4742e716630dSMartin Matuska 4743e716630dSMartin Matuska spa_history_log_internal(spa, "raidz vdev expansion started", tx, 4744e716630dSMartin Matuska "%s vdev %llu new width %llu", spa_name(spa), 4745e716630dSMartin Matuska (unsigned long long)raidvd->vdev_id, 4746e716630dSMartin Matuska (unsigned long long)raidvd->vdev_children); 4747e716630dSMartin Matuska } 4748e716630dSMartin Matuska 4749e716630dSMartin Matuska int 4750e716630dSMartin Matuska vdev_raidz_load(vdev_t *vd) 4751e716630dSMartin Matuska { 4752e716630dSMartin Matuska vdev_raidz_t *vdrz = vd->vdev_tsd; 4753e716630dSMartin Matuska int err; 4754e716630dSMartin Matuska 4755e716630dSMartin Matuska uint64_t state = DSS_NONE; 4756e716630dSMartin Matuska uint64_t start_time = 0; 4757e716630dSMartin Matuska uint64_t end_time = 0; 4758e716630dSMartin Matuska uint64_t bytes_copied = 0; 4759e716630dSMartin Matuska 4760e716630dSMartin Matuska if (vd->vdev_top_zap != 0) { 4761e716630dSMartin Matuska err = zap_lookup(vd->vdev_spa->spa_meta_objset, 4762e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, 4763e716630dSMartin Matuska sizeof (state), 1, &state); 4764e716630dSMartin Matuska if (err != 0 && err != ENOENT) 4765e716630dSMartin Matuska return (err); 4766e716630dSMartin Matuska 4767e716630dSMartin Matuska err = zap_lookup(vd->vdev_spa->spa_meta_objset, 4768e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, 4769e716630dSMartin Matuska sizeof (start_time), 1, &start_time); 4770e716630dSMartin Matuska if (err != 0 && err != ENOENT) 4771e716630dSMartin Matuska return (err); 4772e716630dSMartin Matuska 4773e716630dSMartin Matuska err = zap_lookup(vd->vdev_spa->spa_meta_objset, 4774e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, 4775e716630dSMartin Matuska sizeof (end_time), 1, &end_time); 4776e716630dSMartin Matuska if (err != 0 && err != ENOENT) 4777e716630dSMartin Matuska return (err); 4778e716630dSMartin Matuska 4779e716630dSMartin Matuska err = zap_lookup(vd->vdev_spa->spa_meta_objset, 4780e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, 4781e716630dSMartin Matuska sizeof 
(bytes_copied), 1, &bytes_copied); 4782e716630dSMartin Matuska if (err != 0 && err != ENOENT) 4783e716630dSMartin Matuska return (err); 4784e716630dSMartin Matuska } 4785e716630dSMartin Matuska 4786e716630dSMartin Matuska /* 4787e716630dSMartin Matuska * If we are in the middle of expansion, vre_state should have 4788e716630dSMartin Matuska * already been set by vdev_raidz_init(). 4789e716630dSMartin Matuska */ 4790e716630dSMartin Matuska EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING); 4791e716630dSMartin Matuska vdrz->vn_vre.vre_state = (dsl_scan_state_t)state; 4792e716630dSMartin Matuska vdrz->vn_vre.vre_start_time = start_time; 4793e716630dSMartin Matuska vdrz->vn_vre.vre_end_time = end_time; 4794e716630dSMartin Matuska vdrz->vn_vre.vre_bytes_copied = bytes_copied; 4795e716630dSMartin Matuska 4796e716630dSMartin Matuska return (0); 4797e716630dSMartin Matuska } 4798e716630dSMartin Matuska 4799e716630dSMartin Matuska int 4800e716630dSMartin Matuska spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres) 4801e716630dSMartin Matuska { 4802e716630dSMartin Matuska vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4803e716630dSMartin Matuska 4804e716630dSMartin Matuska if (vre == NULL) { 4805e716630dSMartin Matuska /* no expansion in progress; find most recent completed */ 4806e716630dSMartin Matuska for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { 4807e716630dSMartin Matuska vdev_t *vd = spa->spa_root_vdev->vdev_child[c]; 4808e716630dSMartin Matuska if (vd->vdev_ops == &vdev_raidz_ops) { 4809e716630dSMartin Matuska vdev_raidz_t *vdrz = vd->vdev_tsd; 4810e716630dSMartin Matuska 4811e716630dSMartin Matuska if (vdrz->vn_vre.vre_end_time != 0 && 4812e716630dSMartin Matuska (vre == NULL || 4813e716630dSMartin Matuska vdrz->vn_vre.vre_end_time > 4814e716630dSMartin Matuska vre->vre_end_time)) { 4815e716630dSMartin Matuska vre = &vdrz->vn_vre; 4816e716630dSMartin Matuska } 4817e716630dSMartin Matuska } 4818e716630dSMartin Matuska } 4819e716630dSMartin Matuska } 4820e716630dSMartin Matuska 4821e716630dSMartin Matuska if (vre == NULL) { 4822e716630dSMartin Matuska return (SET_ERROR(ENOENT)); 4823e716630dSMartin Matuska } 4824e716630dSMartin Matuska 4825e716630dSMartin Matuska pres->pres_state = vre->vre_state; 4826e716630dSMartin Matuska pres->pres_expanding_vdev = vre->vre_vdev_id; 4827e716630dSMartin Matuska 4828e716630dSMartin Matuska vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); 4829e716630dSMartin Matuska pres->pres_to_reflow = vd->vdev_stat.vs_alloc; 4830e716630dSMartin Matuska 4831e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 4832e716630dSMartin Matuska pres->pres_reflowed = vre->vre_bytes_copied; 4833e716630dSMartin Matuska for (int i = 0; i < TXG_SIZE; i++) 4834e716630dSMartin Matuska pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i]; 4835e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 4836e716630dSMartin Matuska 4837e716630dSMartin Matuska pres->pres_start_time = vre->vre_start_time; 4838e716630dSMartin Matuska pres->pres_end_time = vre->vre_end_time; 4839e716630dSMartin Matuska pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver; 4840e716630dSMartin Matuska 4841e716630dSMartin Matuska return (0); 4842e716630dSMartin Matuska } 4843e716630dSMartin Matuska 48447877fdebSMatt Macy /* 48457877fdebSMatt Macy * Initialize private RAIDZ specific fields from the nvlist.
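 *
 * As a rough illustration (values made up): a raidz1 vdev that was
 * created 4 wide and has completed one expansion to 5 wide would carry
 *
 *	ZPOOL_CONFIG_NPARITY = 1
 *	ZPOOL_CONFIG_CHILDREN = <5 child nvlists>
 *	ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS = { <txg the expansion completed> }
 *
 * from which the code below derives vd_physical_width = 5,
 * vd_original_width = 4, and a single reflow_node_t pairing that txg
 * with the new 5-wide logical width.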
48467877fdebSMatt Macy */ 48477877fdebSMatt Macy static int 48487877fdebSMatt Macy vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) 48497877fdebSMatt Macy { 48507877fdebSMatt Macy uint_t children; 48517877fdebSMatt Macy nvlist_t **child; 48527877fdebSMatt Macy int error = nvlist_lookup_nvlist_array(nv, 48537877fdebSMatt Macy ZPOOL_CONFIG_CHILDREN, &child, &children); 48547877fdebSMatt Macy if (error != 0) 48557877fdebSMatt Macy return (SET_ERROR(EINVAL)); 48567877fdebSMatt Macy 4857e716630dSMartin Matuska uint64_t nparity; 48587877fdebSMatt Macy if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { 48597877fdebSMatt Macy if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) 48607877fdebSMatt Macy return (SET_ERROR(EINVAL)); 48617877fdebSMatt Macy 48627877fdebSMatt Macy /* 48637877fdebSMatt Macy * Previous versions could only support 1 or 2 parity 48647877fdebSMatt Macy * device. 48657877fdebSMatt Macy */ 48667877fdebSMatt Macy if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2) 48677877fdebSMatt Macy return (SET_ERROR(EINVAL)); 48687877fdebSMatt Macy else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3) 48697877fdebSMatt Macy return (SET_ERROR(EINVAL)); 48707877fdebSMatt Macy } else { 48717877fdebSMatt Macy /* 48727877fdebSMatt Macy * We require the parity to be specified for SPAs that 48737877fdebSMatt Macy * support multiple parity levels. 48747877fdebSMatt Macy */ 48757877fdebSMatt Macy if (spa_version(spa) >= SPA_VERSION_RAIDZ2) 48767877fdebSMatt Macy return (SET_ERROR(EINVAL)); 48777877fdebSMatt Macy 48787877fdebSMatt Macy /* 48797877fdebSMatt Macy * Otherwise, we default to 1 parity device for RAID-Z. 48807877fdebSMatt Macy */ 48817877fdebSMatt Macy nparity = 1; 48827877fdebSMatt Macy } 48837877fdebSMatt Macy 4884e716630dSMartin Matuska vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP); 4885e716630dSMartin Matuska vdrz->vn_vre.vre_vdev_id = -1; 4886e716630dSMartin Matuska vdrz->vn_vre.vre_offset = UINT64_MAX; 4887e716630dSMartin Matuska vdrz->vn_vre.vre_failed_offset = UINT64_MAX; 4888e716630dSMartin Matuska mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL); 4889e716630dSMartin Matuska cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL); 4890e716630dSMartin Matuska zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL); 4891e716630dSMartin Matuska mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL); 4892e716630dSMartin Matuska avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare, 4893e716630dSMartin Matuska sizeof (reflow_node_t), offsetof(reflow_node_t, re_link)); 4894e716630dSMartin Matuska 4895e716630dSMartin Matuska vdrz->vd_physical_width = children; 48967877fdebSMatt Macy vdrz->vd_nparity = nparity; 48977877fdebSMatt Macy 4898e716630dSMartin Matuska /* note, the ID does not exist when creating a pool */ 4899e716630dSMartin Matuska (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, 4900e716630dSMartin Matuska &vdrz->vn_vre.vre_vdev_id); 4901e716630dSMartin Matuska 4902e716630dSMartin Matuska boolean_t reflow_in_progress = 4903e716630dSMartin Matuska nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); 4904e716630dSMartin Matuska if (reflow_in_progress) { 4905e716630dSMartin Matuska spa->spa_raidz_expand = &vdrz->vn_vre; 4906e716630dSMartin Matuska vdrz->vn_vre.vre_state = DSS_SCANNING; 4907e716630dSMartin Matuska } 4908e716630dSMartin Matuska 4909e716630dSMartin Matuska vdrz->vd_original_width = children; 4910e716630dSMartin Matuska uint64_t *txgs; 4911e716630dSMartin Matuska unsigned int txgs_size = 0; 
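	/*
	 * The on-disk array read below (written by
	 * vdev_raidz_config_generate(), one completion txg per prior
	 * expansion, in ascending txg order) is walked newest-first:
	 * entry txgs[txgs_size - i - 1] corresponds to a logical width of
	 * vd_physical_width - i, less one more if a reflow is still in
	 * progress and the newest child is not yet part of the logical
	 * layout.
	 */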
4912e716630dSMartin Matuska error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, 4913e716630dSMartin Matuska &txgs, &txgs_size); 4914e716630dSMartin Matuska if (error == 0) { 4915e716630dSMartin Matuska for (int i = 0; i < txgs_size; i++) { 4916e716630dSMartin Matuska reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); 4917e716630dSMartin Matuska re->re_txg = txgs[txgs_size - i - 1]; 4918e716630dSMartin Matuska re->re_logical_width = vdrz->vd_physical_width - i; 4919e716630dSMartin Matuska 4920e716630dSMartin Matuska if (reflow_in_progress) 4921e716630dSMartin Matuska re->re_logical_width--; 4922e716630dSMartin Matuska 4923e716630dSMartin Matuska avl_add(&vdrz->vd_expand_txgs, re); 4924e716630dSMartin Matuska } 4925e716630dSMartin Matuska 4926e716630dSMartin Matuska vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size; 4927e716630dSMartin Matuska } 4928e716630dSMartin Matuska if (reflow_in_progress) { 4929e716630dSMartin Matuska vdrz->vd_original_width--; 4930e716630dSMartin Matuska zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions", 4931e716630dSMartin Matuska children, txgs_size); 4932e716630dSMartin Matuska } 4933e716630dSMartin Matuska 49347877fdebSMatt Macy *tsd = vdrz; 49357877fdebSMatt Macy 49367877fdebSMatt Macy return (0); 49377877fdebSMatt Macy } 49387877fdebSMatt Macy 49397877fdebSMatt Macy static void 49407877fdebSMatt Macy vdev_raidz_fini(vdev_t *vd) 49417877fdebSMatt Macy { 4942e716630dSMartin Matuska vdev_raidz_t *vdrz = vd->vdev_tsd; 4943e716630dSMartin Matuska if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre) 4944e716630dSMartin Matuska vd->vdev_spa->spa_raidz_expand = NULL; 4945e716630dSMartin Matuska reflow_node_t *re; 4946e716630dSMartin Matuska void *cookie = NULL; 4947e716630dSMartin Matuska avl_tree_t *tree = &vdrz->vd_expand_txgs; 4948e716630dSMartin Matuska while ((re = avl_destroy_nodes(tree, &cookie)) != NULL) 4949e716630dSMartin Matuska kmem_free(re, sizeof (*re)); 4950e716630dSMartin Matuska avl_destroy(&vdrz->vd_expand_txgs); 4951e716630dSMartin Matuska mutex_destroy(&vdrz->vd_expand_lock); 4952e716630dSMartin Matuska mutex_destroy(&vdrz->vn_vre.vre_lock); 4953e716630dSMartin Matuska cv_destroy(&vdrz->vn_vre.vre_cv); 4954e716630dSMartin Matuska zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock); 4955e716630dSMartin Matuska kmem_free(vdrz, sizeof (*vdrz)); 49567877fdebSMatt Macy } 49577877fdebSMatt Macy 49587877fdebSMatt Macy /* 49597877fdebSMatt Macy * Add RAIDZ specific fields to the config nvlist. 49607877fdebSMatt Macy */ 49617877fdebSMatt Macy static void 49627877fdebSMatt Macy vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) 49637877fdebSMatt Macy { 49647877fdebSMatt Macy ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops); 49657877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd; 49667877fdebSMatt Macy 49677877fdebSMatt Macy /* 49687877fdebSMatt Macy * Make sure someone hasn't managed to sneak a fancy new vdev 49697877fdebSMatt Macy * into a crufty old storage pool. 
49707877fdebSMatt Macy */ 49717877fdebSMatt Macy ASSERT(vdrz->vd_nparity == 1 || 49727877fdebSMatt Macy (vdrz->vd_nparity <= 2 && 49737877fdebSMatt Macy spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) || 49747877fdebSMatt Macy (vdrz->vd_nparity <= 3 && 49757877fdebSMatt Macy spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3)); 49767877fdebSMatt Macy 49777877fdebSMatt Macy /* 49787877fdebSMatt Macy * Note that we'll add these even on storage pools where they 49797877fdebSMatt Macy * aren't strictly required -- older software will just ignore 49807877fdebSMatt Macy * it. 49817877fdebSMatt Macy */ 49827877fdebSMatt Macy fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity); 4983e716630dSMartin Matuska 4984e716630dSMartin Matuska if (vdrz->vn_vre.vre_state == DSS_SCANNING) { 4985e716630dSMartin Matuska fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); 4986e716630dSMartin Matuska } 4987e716630dSMartin Matuska 4988e716630dSMartin Matuska mutex_enter(&vdrz->vd_expand_lock); 4989e716630dSMartin Matuska if (!avl_is_empty(&vdrz->vd_expand_txgs)) { 4990e716630dSMartin Matuska uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs); 4991e716630dSMartin Matuska uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count, 4992e716630dSMartin Matuska KM_SLEEP); 4993e716630dSMartin Matuska uint64_t i = 0; 4994e716630dSMartin Matuska 4995e716630dSMartin Matuska for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs); 4996e716630dSMartin Matuska re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) { 4997e716630dSMartin Matuska txgs[i++] = re->re_txg; 4998e716630dSMartin Matuska } 4999e716630dSMartin Matuska 5000e716630dSMartin Matuska fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, 5001e716630dSMartin Matuska txgs, count); 5002e716630dSMartin Matuska 5003e716630dSMartin Matuska kmem_free(txgs, sizeof (uint64_t) * count); 5004e716630dSMartin Matuska } 5005e716630dSMartin Matuska mutex_exit(&vdrz->vd_expand_lock); 50067877fdebSMatt Macy } 50077877fdebSMatt Macy 50087877fdebSMatt Macy static uint64_t 50097877fdebSMatt Macy vdev_raidz_nparity(vdev_t *vd) 50107877fdebSMatt Macy { 50117877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd; 50127877fdebSMatt Macy return (vdrz->vd_nparity); 50137877fdebSMatt Macy } 50147877fdebSMatt Macy 50157877fdebSMatt Macy static uint64_t 50167877fdebSMatt Macy vdev_raidz_ndisks(vdev_t *vd) 50177877fdebSMatt Macy { 50187877fdebSMatt Macy return (vd->vdev_children); 5019eda14cbcSMatt Macy } 5020eda14cbcSMatt Macy 5021eda14cbcSMatt Macy vdev_ops_t vdev_raidz_ops = { 50227877fdebSMatt Macy .vdev_op_init = vdev_raidz_init, 50237877fdebSMatt Macy .vdev_op_fini = vdev_raidz_fini, 5024eda14cbcSMatt Macy .vdev_op_open = vdev_raidz_open, 5025eda14cbcSMatt Macy .vdev_op_close = vdev_raidz_close, 5026eda14cbcSMatt Macy .vdev_op_asize = vdev_raidz_asize, 50277877fdebSMatt Macy .vdev_op_min_asize = vdev_raidz_min_asize, 50287877fdebSMatt Macy .vdev_op_min_alloc = NULL, 5029eda14cbcSMatt Macy .vdev_op_io_start = vdev_raidz_io_start, 5030eda14cbcSMatt Macy .vdev_op_io_done = vdev_raidz_io_done, 5031eda14cbcSMatt Macy .vdev_op_state_change = vdev_raidz_state_change, 5032eda14cbcSMatt Macy .vdev_op_need_resilver = vdev_raidz_need_resilver, 5033eda14cbcSMatt Macy .vdev_op_hold = NULL, 5034eda14cbcSMatt Macy .vdev_op_rele = NULL, 5035eda14cbcSMatt Macy .vdev_op_remap = NULL, 5036eda14cbcSMatt Macy .vdev_op_xlate = vdev_raidz_xlate, 50377877fdebSMatt Macy .vdev_op_rebuild_asize = NULL, 50387877fdebSMatt Macy .vdev_op_metaslab_init = NULL, 50397877fdebSMatt Macy 
.vdev_op_config_generate = vdev_raidz_config_generate, 50407877fdebSMatt Macy .vdev_op_nparity = vdev_raidz_nparity, 50417877fdebSMatt Macy .vdev_op_ndisks = vdev_raidz_ndisks, 5042eda14cbcSMatt Macy .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */ 5043eda14cbcSMatt Macy .vdev_op_leaf = B_FALSE /* not a leaf vdev */ 5044eda14cbcSMatt Macy }; 5045e716630dSMartin Matuska 5046e716630dSMartin Matuska /* BEGIN CSTYLED */ 5047e716630dSMartin Matuska ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW, 5048e716630dSMartin Matuska "For testing, pause RAIDZ expansion after reflowing this many bytes"); 5049e716630dSMartin Matuska ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW, 5050e716630dSMartin Matuska "Max amount of concurrent i/o for RAIDZ expansion"); 5051e716630dSMartin Matuska ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW, 5052e716630dSMartin Matuska "For expanded RAIDZ, aggregate reads that have more rows than this"); 5053e716630dSMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW, 5054e716630dSMartin Matuska "For expanded RAIDZ, automatically start a pool scrub when expansion " 5055e716630dSMartin Matuska "completes"); 5056e716630dSMartin Matuska /* END CSTYLED */ 5057
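/*
 * The tunables above surface through the normal ZFS module-parameter
 * mechanisms; for example, on Linux builds they are expected to appear as
 * /sys/module/zfs/parameters/raidz_expand_max_reflow_bytes,
 * raidz_expand_max_copy_bytes, raidz_io_aggregate_rows, and
 * zfs_scrub_after_expand (names assumed from the declarations above), and
 * on FreeBSD as the corresponding vfs.zfs.* sysctls.
 */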