/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
 * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zap.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu_tx.h>
#include <sys/abd.h>
#include <sys/zfs_rlock.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
#include <sys/vdev_raidz.h>
#include <sys/vdev_raidz_impl.h>
#include <sys/vdev_draid.h>
#include <sys/uberblock_impl.h>
#include <sys/dsl_scan.h>

#ifdef ZFS_DEBUG
#include <sys/vdev.h>	/* For vdev_xlate() in vdev_raidz_io_verify() */
#endif

/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all
 * elements can be expressed with a single byte. Briefly, the operations on
 * the field are defined as follows:
 *
 *	o addition (+) is represented by a bitwise XOR
 *	o subtraction (-) is therefore identical to addition: A + B = A - B
 *	o multiplication of A by 2 is defined by the following bitwise expression:
 *
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). The inverse of a field element A (A^-1) is therefore
 * A ^ (255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q, R over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property which is why the uncorrected Plank method breaks down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
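
/*
 * Illustrative sketch (not used by the driver, name is hypothetical): a
 * shift-and-add GF(2^8) multiply built from the doubling rule above, to make
 * the field arithmetic in the comment concrete.  The production code instead
 * uses precomputed power/log tables, so that A * B becomes a pair of table
 * lookups plus an addition of the exponents mod 255.
 */
static inline uint8_t
vdev_raidz_gf_mul_sketch(uint8_t a, uint8_t b)
{
	uint8_t p = 0;

	while (b != 0) {
		if (b & 1)
			p ^= a;			/* field addition is XOR */
		a = VDEV_RAIDZ_MUL_2(a);	/* multiply by the generator 2 */
		b >>= 1;
	}
	return (p);
}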

/*
 * Big Theory Statement for how a RAIDZ VDEV is expanded
 *
 * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion
 * works with all three RAIDZ parity choices (RAIDZ1, 2, or 3). VDEVs that
 * have been previously expanded can be expanded again.
 *
 * The RAIDZ VDEV must be healthy (must be able to write to all the drives in
 * the VDEV) when an expansion starts. The expansion will pause if any disk
 * in the VDEV fails, and will resume once the VDEV is healthy again. All
 * other operations on the pool can continue while an expansion is in
 * progress (e.g. read/write, snapshot, zpool add, etc), except for zpool
 * checkpoint, zpool trim, and zpool initialize, which can't be run during
 * an expansion. Following a reboot or export/import, the expansion resumes
 * where it left off.
 *
 * == Reflowing the Data ==
 *
 * The expansion involves reflowing (copying) the data from the current set
 * of disks to spread it across the new set which now has one more disk. This
 * reflow operation is similar to reflowing text when the column width of a
 * text editor window is expanded. The text doesn’t change but the location
 * of the text changes to accommodate the new width. An example reflow result
 * for a 4-wide RAIDZ1 to a 5-wide is shown below.
 *
 *                              Reflow End State
 *             Each letter indicates a parity group (logical stripe)
 *
 *      Before expansion                          After Expansion
 *    D1     D2     D3     D4             D1     D2     D3     D4     D5
 * +------+------+------+------+       +------+------+------+------+------+
 * |      |      |      |      |       |      |      |      |      |      |
 * |  A   |  A   |  A   |  A   |       |  A   |  A   |  A   |  A   |  B   |
 * |     1|     2|     3|     4|       |     1|     2|     3|     4|     5|
 * +------+------+------+------+       +------+------+------+------+------+
 * |      |      |      |      |       |      |      |      |      |      |
 * |  B   |  B   |  C   |  C   |       |  B   |  C   |  C   |  C   |  C   |
 * |     5|     6|     7|     8|       |     6|     7|     8|     9|    10|
 * +------+------+------+------+       +------+------+------+------+------+
 * |      |      |      |      |       |      |      |      |      |      |
 * |  C   |  C   |  D   |  D   |       |  D   |  D   |  E   |  E   |  E   |
 * |     9|    10|    11|    12|       |    11|    12|    13|    14|    15|
 * +------+------+------+------+       +------+------+------+------+------+
 * |      |      |      |      |       |      |      |      |      |      |
 * |  E   |  E   |  E   |  E   |  -->  |  E   |  F   |  F   |  G   |  G   |
 * |    13|    14|    15|    16|       |    16|    17|    18|    19|    20|
 * +------+------+------+------+       +------+------+------+------+------+
 * |      |      |      |      |       |      |      |      |      |      |
 * |  F   |  F   |  G   |  G   |       |  G   |  G   |  H   |  H   |  H   |
 * |    17|    18|    19|    20|       |    21|    22|    23|    24|    25|
 * +------+------+------+------+       +------+------+------+------+------+
 * |      |      |      |      |       |      |      |      |      |      |
 * |  G   |  G   |  H   |  H   |       |  H   |  I   |  I   |  J   |  J   |
 * |    21|    22|    23|    24|       |    26|    27|    28|    29|    30|
 * +------+------+------+------+       +------+------+------+------+------+
 * |      |      |      |      |       |      |      |      |      |      |
 * |  H   |  H   |  I   |  I   |       |  J   |  J   |      |      |  K   |
 * |    25|    26|    27|    28|       |    31|    32|    33|    34|    35|
 * +------+------+------+------+       +------+------+------+------+------+
 *
 * This reflow approach has several advantages. There is no need to read or
 * modify the block pointers or recompute any block checksums. The reflow
 * doesn’t need to know where the parity sectors reside. We can read and write
 * data sequentially and the copy can occur in a background thread in open
 * context. The design also allows for fast discovery of what data to copy.
 *
 * The VDEV metaslabs are processed, one at a time, to copy the block data to
 * have it flow across all the disks. The metaslab is disabled for allocations
 * during the copy. As an optimization, we only copy the allocated data which
 * can be determined by looking at the metaslab range tree.
 * During the copy we must maintain the redundancy guarantees of the RAIDZ
 * VDEV (i.e., we still need to be able to survive losing parity count
 * disks). This means we cannot overwrite data during the reflow that would
 * be needed if a disk is lost.
 *
 * After the reflow completes, all newly-written blocks will have the new
 * layout, i.e., they will have the parity to data ratio implied by the new
 * number of disks in the RAIDZ group. Even though the reflow copies all of
 * the allocated space (data and parity), it is only rearranged, not changed.
 *
 * This act of reflowing the data has a few implications about blocks
 * that were written before the reflow completes:
 *
 *  - Old blocks will still use the same amount of space (i.e., they will
 *    have the parity to data ratio implied by the old number of disks in the
 *    RAIDZ group).
 *  - Reading old blocks will be slightly slower than before the reflow, for
 *    two reasons. First, we will have to read from all disks in the RAIDZ
 *    VDEV, rather than being able to skip the children that contain only
 *    parity of this block (because the data of a single block is now spread
 *    out across all the disks). Second, in most cases there will be an extra
 *    bcopy, needed to rearrange the data back to its original layout in
 *    memory.
 *
 * == Scratch Area ==
 *
 * As we copy the block data, we can only progress to the point that writes
 * will not overlap with blocks whose progress has not yet been recorded on
 * disk. Since partially-copied rows are always read from the old location,
 * we need to stop one row before the sector-wise overlap, to prevent any
 * row-wise overlap. For example, in the diagram above, when we reflow sector
 * B6 it will overwrite the original location for B5.
 *
 * To get around this, a scratch space is used so that we can start copying
 * without risking data loss by overlapping the row. As an added benefit, it
 * improves performance at the beginning of the reflow, but that small perf
 * boost wouldn't be worth the complexity on its own.
 *
 * Ideally we want to copy at least 2 * (new_width)^2 so that we have a
 * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max
 * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice
 * the widths will likely be single digits so we can get a substantial chunk
 * size using only a few MB of scratch per disk.
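 *
 * For example (illustrative arithmetic): with new_width = 255, copying
 * 2 * 255^2 = 130,050 sectors in total works out to roughly 510 sectors per
 * disk, which at 4K per sector is about 2 MB of scratch per disk, matching
 * the figure above.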
 *
 * The scratch area, which is persisted to disk, holds a large amount of
 * reflowed state. We can always read the partially written stripes when a
 * disk fails or the copy is interrupted (crash) during the initial copying
 * phase, and we can also get past a small chunk size restriction. At a
 * minimum, the scratch space must be large enough to get us to the point
 * that one row does not overlap itself when moved (i.e., new_width^2). But
 * going larger is even better. We use the 3.5 MiB reserved "boot" space that
 * resides after the ZFS disk labels as our scratch space to handle
 * overwriting the initial part of the VDEV.
 *
 *	0     256K   512K                    4M
 *	+------+------+-----------------------+-------------------------------
 *	| VDEV | VDEV |   Boot Block (3.5M)   |  Allocatable space ...
 *	|  L0  |  L1  |       Reserved        |      (Metaslabs)
 *	+------+------+-----------------------+-------------------------------
 *	              Scratch Area
 *
 * == Reflow Progress Updates ==
 *
 * After the initial scratch-based reflow, the expansion process works
 * similarly to device removal. We create a new open context thread which
 * reflows the data, and periodically kicks off sync tasks to update logical
 * state. In this case, state is the committed progress (offset of next data
 * to copy). We need to persist the completed offset on disk, so that if we
 * crash we know which format each VDEV offset is in.
 *
 * == Time Dependent Geometry ==
 *
 * In non-expanded RAIDZ, blocks are read from disk in a column by column
 * fashion. For a multi-row block, the second sector is in the first column,
 * not in the second column. This allows us to issue full reads for each
 * column directly into the request buffer. The block data is thus laid out
 * sequentially in a column-by-column fashion.
 *
 * For example, in the before expansion diagram above, one logical block might
 * be sectors G19-H26. The parity is in G19,H23; and the data is in
 * G20,H24,G21,H25,G22,H26.
 *
 * After a block is reflowed, the sectors that were all in the original column
 * data can now reside in different columns. When reading from an expanded
 * VDEV, we need to know the logical stripe width for each block so we can
 * reconstitute the block’s data after the reads are completed. Likewise,
 * when we perform the combinatorial reconstruction we need to know the
 * original width so we can retry combinations from the past layouts.
 *
 * Time dependent geometry is what we call having blocks with different
 * layouts (stripe widths) in the same VDEV. This time-dependent geometry
 * uses the block’s birth time (+ the time expansion ended) to establish the
 * correct width for a given block. After an expansion completes, we record
 * the time for blocks written with a particular width (geometry).
 *
 * == On Disk Format Changes ==
 *
 * New pool feature flag, 'raidz_expansion', whose reference count is the
 * number of RAIDZ VDEVs that have been expanded.
 *
 * The blocks on an expanded RAIDZ VDEV can have different logical stripe
 * widths.
 *
 * Since the uberblock can point to arbitrary blocks, which might be on the
 * expanding RAIDZ and might or might not have been expanded, we need to know
 * which way a block is laid out before reading it. This info is the next
 * offset that needs to be reflowed, and we persist that in the uberblock, in
 * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev
 * label. After the expansion is complete, we then use the raidz_expand_txgs
 * array (see below) to determine how to read a block, and the
 * ub_raidz_reflow_info field is no longer required.
 *
 * The uberblock's ub_raidz_reflow_info field also holds the scratch space
 * state (i.e., active or not) which is also required before reading a block
 * during the initial phase of reflowing the data.
 *
 * The top-level RAIDZ VDEV has two new entries in the nvlist:
 *
 * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here
 *                            and used after the expansion is complete to
 *                            determine how to read a raidz block
 * 'raidz_expanding' boolean: present during reflow and removed after
 *                            completion; used during a spa import to resume
 *                            an unfinished expansion
 *
 * And finally the VDEV's top zap adds the following informational entries:
 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE
 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME
 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME
 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED
 */
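
/*
 * Illustrative sketch of the time dependent geometry lookup described above
 * (hypothetical helper, not the driver's actual interface): given the
 * completion txgs recorded in the 'raidz_expand_txgs' array and a block's
 * birth txg, the block's logical stripe width is the original width plus one
 * for every expansion that had already completed when the block was written.
 */
static inline uint64_t
raidz_logical_width_sketch(const uint64_t *expand_txgs, uint64_t nexpansions,
    uint64_t original_width, uint64_t birth_txg)
{
	uint64_t width = original_width;

	for (uint64_t i = 0; i < nexpansions; i++) {
		/* Blocks born after this expansion completed use the wider layout. */
		if (expand_txgs[i] <= birth_txg)
			width++;
	}
	return (width);
}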

/*
 * For testing only: pause the raidz expansion after reflowing this amount.
 * (accessed by ZTS and ztest)
 */
#ifdef	_KERNEL
static
#endif	/* _KERNEL */
unsigned long raidz_expand_max_reflow_bytes = 0;

/*
 * For testing only: pause the raidz expansion at a certain point.
 */
uint_t raidz_expand_pause_point = 0;

/*
 * Maximum amount of copy I/Os outstanding at once.
 */
static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE;

/*
 * Apply raidz map abds aggregation if the number of rows in the map is equal
 * to or greater than the value below.
 */
static unsigned long raidz_io_aggregate_rows = 4;

/*
 * Automatically start a pool scrub when a RAIDZ expansion completes in
 * order to verify the checksums of all blocks which have been copied
 * during the expansion. Automatic scrubbing is enabled by default and
 * is strongly recommended.
 */
static int zfs_scrub_after_expand = 1;

static void
vdev_raidz_row_free(raidz_row_t *rr)
{
	for (int c = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		if (rc->rc_size != 0)
			abd_free(rc->rc_abd);
		if (rc->rc_orig_data != NULL)
			abd_free(rc->rc_orig_data);
	}

	if (rr->rr_abd_empty != NULL)
		abd_free(rr->rr_abd_empty);

	kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols]));
}

void
vdev_raidz_map_free(raidz_map_t *rm)
{
	for (int i = 0; i < rm->rm_nrows; i++)
		vdev_raidz_row_free(rm->rm_row[i]);

	if (rm->rm_nphys_cols) {
		for (int i = 0; i < rm->rm_nphys_cols; i++) {
			if (rm->rm_phys_col[i].rc_abd != NULL)
				abd_free(rm->rm_phys_col[i].rc_abd);
		}

		kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) *
		    rm->rm_nphys_cols);
	}

	ASSERT3P(rm->rm_lr, ==, NULL);
	kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
}

static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	vdev_raidz_map_free(rm);
}

static int
vdev_raidz_reflow_compare(const void *x1, const void *x2)
{
	const reflow_node_t *l = x1;
	const reflow_node_t *r = x2;

	return (TREE_CMP(l->re_txg, r->re_txg));
}

const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	.vsd_free = vdev_raidz_map_free_vsd,
};

raidz_row_t *
vdev_raidz_row_alloc(int cols)
{
	raidz_row_t *rr =
	    kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP);

	rr->rr_cols = cols;
	rr->rr_scols = cols;

	for (int c = 0; c < cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		rc->rc_shadow_devidx = INT_MAX;
		rc->rc_shadow_offset = UINT64_MAX;
		rc->rc_allow_repair = 1;
	}
	return (rr);
}

static void
vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift)
{
	int c;
	int nwrapped = 0;
	uint64_t off = 0;
	raidz_row_t *rr = rm->rm_row[0];

	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
	ASSERT3U(rm->rm_nrows, ==, 1);

	/*
	 * Pad any parity columns with additional space to account for skip
	 * sectors.
	 */
	if (rm->rm_skipstart < rr->rr_firstdatacol) {
		ASSERT0(rm->rm_skipstart);
		nwrapped = rm->rm_nskip;
	} else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) {
		nwrapped =
		    (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols;
	}

	/*
	 * Optional single skip sectors (rc_size == 0) will be handled in
	 * vdev_raidz_io_start_write().
	 */
	int skipped = rr->rr_scols - rr->rr_cols;

	/* Allocate buffers for the parity columns */
	for (c = 0; c < rr->rr_firstdatacol; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		/*
		 * Parity columns will pad out a linear ABD to account for
		 * the skip sector.
		 * A linear ABD is used here because
		 * parity calculations use the ABD buffer directly to calculate
		 * parity. This avoids doing a memcpy back to the ABD after the
		 * parity has been calculated. By issuing the parity column
		 * with the skip sector we can reduce contention on the child
		 * VDEV queue locks (vq_lock).
		 */
		if (c < nwrapped) {
			rc->rc_abd = abd_alloc_linear(
			    rc->rc_size + (1ULL << ashift), B_FALSE);
			abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift);
			skipped++;
		} else {
			rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
		}
	}

	for (off = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct,
		    zio->io_abd, off, rc->rc_size);

		/*
		 * Generate I/O for skip sectors to improve aggregation
		 * continuity. We will use gang ABD's to reduce contention
		 * on the child VDEV queue locks (vq_lock) by issuing
		 * a single I/O that contains the data and skip sector.
		 *
		 * It is important to make sure that rc_size is not updated
		 * even though we are adding a skip sector to the ABD. When
		 * calculating the parity in vdev_raidz_generate_parity_row()
		 * the rc_size is used to iterate through the ABD's. We cannot
		 * have zeroed-out skip sectors used for calculating parity
		 * for raidz, because those same sectors are not used during
		 * reconstruction.
		 */
		if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) {
			rc->rc_abd = abd_alloc_gang();
			abd_gang_add(rc->rc_abd, abd, B_TRUE);
			abd_gang_add(rc->rc_abd,
			    abd_get_zeros(1ULL << ashift), B_TRUE);
			skipped++;
		} else {
			rc->rc_abd = abd;
		}
		off += rc->rc_size;
	}

	ASSERT3U(off, ==, zio->io_size);
	ASSERT3S(skipped, ==, rm->rm_nskip);
}

static void
vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm)
{
	int c;
	raidz_row_t *rr = rm->rm_row[0];

	ASSERT3U(rm->rm_nrows, ==, 1);

	/* Allocate buffers for the parity columns */
	for (c = 0; c < rr->rr_firstdatacol; c++)
		rr->rr_col[c].rc_abd =
		    abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);

	for (uint64_t off = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
		    zio->io_abd, off, rc->rc_size);
		off += rc->rc_size;
	}
}

/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 *
 * Avoid inlining the function to keep vdev_raidz_io_start(), which
 * is this function's only caller, as small as possible on the stack.
 */
noinline raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
    uint64_t nparity)
{
	raidz_row_t *rr;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = zio->io_offset >> ashift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = zio->io_size >> ashift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << ashift;
	uint64_t acols, scols;

	raidz_map_t *rm =
	    kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
	rm->rm_nrows = 1;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	uint64_t q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	uint64_t r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	uint64_t bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/*
	 * acols: The columns that will be accessed.
	 * scols: The columns that will be accessed or skipped.
	 */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);
	rr = vdev_raidz_row_alloc(scols);
	rm->rm_row[0] = rr;
	rr->rr_cols = acols;
	rr->rr_bigcols = bc;
	rr->rr_firstdatacol = nparity;
#ifdef ZFS_DEBUG
	rr->rr_offset = zio->io_offset;
	rr->rr_size = zio->io_size;
#endif

	uint64_t asize = 0;

	for (uint64_t c = 0; c < scols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		uint64_t col = f + c;
		uint64_t coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << ashift;
		}
		rc->rc_devidx = col;
		rc->rc_offset = coff;

		if (c >= acols)
			rc->rc_size = 0;
		else if (c < bc)
			rc->rc_size = (q + 1) << ashift;
		else
			rc->rc_size = q << ashift;

		asize += rc->rc_size;
	}

	ASSERT3U(asize, ==, tot << ashift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	rm->rm_skipstart = bc;
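
	/*
	 * Worked example of the geometry above (illustrative only): a
	 * 10-sector write (s = 10) to a 5-wide (dcols = 5) RAIDZ1
	 * (nparity = 1) gives q = 10 / 4 = 2, r = 10 - 2 * 4 = 2,
	 * bc = r + nparity = 3, and tot = 10 + 1 * (2 + 1) = 13.  The first
	 * bc = 3 columns get q + 1 = 3 sectors each and the remaining two
	 * get q = 2, for 3 * 3 + 2 * 2 = 13 sectors in all, with
	 * rm_nskip = roundup(13, 2) - 13 = 1 skip sector starting at
	 * column rm_skipstart = bc = 3.
	 */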

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rr->rr_cols >= 2);
	ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);

	if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
		uint64_t devidx = rr->rr_col[0].rc_devidx;
		o = rr->rr_col[0].rc_offset;
		rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
		rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
		rr->rr_col[1].rc_devidx = devidx;
		rr->rr_col[1].rc_offset = o;
		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_map_alloc_write(zio, rm, ashift);
	} else {
		vdev_raidz_map_alloc_read(zio, rm);
	}
	/* init RAIDZ parity ops */
	rm->rm_ops = vdev_raidz_math_get_ops();

	return (rm);
}

/*
 * Everything before reflow_offset_synced should have been moved to the new
 * location (read and write completed). However, this may not yet be reflected
 * in the on-disk format (e.g. raidz_reflow_sync() has been called but the
 * uberblock has not yet been written). If reflow is not in progress,
 * reflow_offset_synced should be UINT64_MAX. For each row, if the row is
 * entirely before reflow_offset_synced, it will come from the new location.
 * Otherwise this row will come from the old location. Therefore, rows that
 * straddle the reflow_offset_synced will come from the old location.
 *
 * For writes, reflow_offset_next is the next offset to copy. If a sector has
 * been copied, but not yet reflected in the on-disk progress
 * (reflow_offset_synced), it will also be written to the new (already copied)
 * offset.
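 *
 * For example (illustrative arithmetic): with ashift = 12 (4K sectors) and
 * reflow_offset_synced = 1 MiB (256 sectors), a 5-wide row starting at
 * sector 250 (covering sectors 250-254) lies entirely before the boundary
 * and is read from the new location, while a row starting at sector 252
 * (covering 252-256) straddles it and is read from the old location.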
 */
noinline raidz_map_t *
vdev_raidz_map_alloc_expanded(zio_t *zio,
    uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
    uint64_t nparity, uint64_t reflow_offset_synced,
    uint64_t reflow_offset_next, boolean_t use_scratch)
{
	abd_t *abd = zio->io_abd;
	uint64_t offset = zio->io_offset;
	uint64_t size = zio->io_size;

	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> ashift;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 * AKA "full rows"
	 */
	uint64_t q = s / (logical_cols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	uint64_t r = s - q * (logical_cols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	uint64_t bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* How many rows contain data (not skip) */
	uint64_t rows = howmany(tot, logical_cols);
	int cols = MIN(tot, logical_cols);

	raidz_map_t *rm =
	    kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
	    KM_SLEEP);
	rm->rm_nrows = rows;
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	rm->rm_skipstart = bc;
	uint64_t asize = 0;

	for (uint64_t row = 0; row < rows; row++) {
		boolean_t row_use_scratch = B_FALSE;
		raidz_row_t *rr = vdev_raidz_row_alloc(cols);
		rm->rm_row[row] = rr;

		/* The starting RAIDZ (parent) vdev sector of the row. */
		uint64_t b = (offset >> ashift) + row * logical_cols;

		/*
		 * If we are in the middle of a reflow, and the copying has
		 * not yet completed for any part of this row, then use the
		 * old location of this row.
		 * Note that reflow_offset_synced
		 * reflects the i/o that's been completed, because it's
		 * updated by a synctask, after zio_wait(spa_txg_zio[]).
		 * This is sufficient for our check, even if that progress
		 * has not yet been recorded to disk (reflected in
		 * spa_ubsync). Also note that we consider the last row to
		 * be "full width" (`cols`-wide rather than `bc`-wide) for
		 * this calculation. This causes a tiny bit of unnecessary
		 * double-writes but is safe and simpler to calculate.
		 */
		int row_phys_cols = physical_cols;
		if (b + cols > reflow_offset_synced >> ashift)
			row_phys_cols--;
		else if (use_scratch)
			row_use_scratch = B_TRUE;

		/* starting child of this row */
		uint64_t child_id = b % row_phys_cols;
		/* The starting byte offset on each child vdev. */
		uint64_t child_offset = (b / row_phys_cols) << ashift;

		/*
		 * Note, rr_cols is the entire width of the block, even
		 * if this row is shorter. This is needed because parity
		 * generation (for Q and R) needs to know the entire width,
		 * because it treats the short row as though it was
		 * full-width (and the "phantom" sectors were zero-filled).
		 *
		 * Another approach to this would be to set cols shorter
		 * (to just the number of columns that we might do i/o to)
		 * and have another mechanism to tell the parity generation
		 * about the "entire width". Reconstruction (at least
		 * vdev_raidz_reconstruct_general()) would also need to
		 * know about the "entire width".
		 */
		rr->rr_firstdatacol = nparity;
#ifdef ZFS_DEBUG
		/*
		 * note: rr_size is PSIZE, not ASIZE
		 */
		rr->rr_offset = b << ashift;
		rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift;
#endif

		for (int c = 0; c < rr->rr_cols; c++, child_id++) {
			if (child_id >= row_phys_cols) {
				child_id -= row_phys_cols;
				child_offset += 1ULL << ashift;
			}
			raidz_col_t *rc = &rr->rr_col[c];
			rc->rc_devidx = child_id;
			rc->rc_offset = child_offset;

			/*
			 * Get this from the scratch space if appropriate.
			 * This only happens if we crashed in the middle of
			 * raidz_reflow_scratch_sync() (while it's running,
			 * the rangelock prevents us from doing concurrent
			 * io), and even then only during zpool import or
			 * when the pool is imported readonly.
			 */
			if (row_use_scratch)
				rc->rc_offset -= VDEV_BOOT_SIZE;

			uint64_t dc = c - rr->rr_firstdatacol;
			if (c < rr->rr_firstdatacol) {
				rc->rc_size = 1ULL << ashift;

				/*
				 * Parity sectors' rc_abd's are set below
				 * after determining if this is an aggregation.
				 */
			} else if (row == rows - 1 && bc != 0 && c >= bc) {
				/*
				 * Past the end of the block (even including
				 * skip sectors). This sector is part of the
				 * map so that we have full rows for p/q parity
				 * generation.
				 */
				rc->rc_size = 0;
				rc->rc_abd = NULL;
			} else {
				/* "data column" (col excluding parity) */
				uint64_t off;

				if (c < bc || r == 0) {
					off = dc * rows + row;
				} else {
					off = r * rows +
					    (dc - r) * (rows - 1) + row;
				}
				rc->rc_size = 1ULL << ashift;
				rc->rc_abd = abd_get_offset_struct(
				    &rc->rc_abdstruct, abd, off << ashift,
				    rc->rc_size);
			}

			if (rc->rc_size == 0)
				continue;

			/*
			 * If any part of this row is in both old and new
			 * locations, the primary location is the old
			 * location. If this sector was already copied to the
			 * new location, we need to also write to the new,
			 * "shadow" location.
			 *
			 * Note, `row_phys_cols != physical_cols` indicates
			 * that the primary location is the old location.
			 * `b+c < reflow_offset_next` indicates that the copy
			 * to the new location has been initiated. We know
			 * that the copy has completed because we have the
			 * rangelock, which is held exclusively while the
			 * copy is in progress.
			 */
			if (row_use_scratch ||
			    (row_phys_cols != physical_cols &&
			    b + c < reflow_offset_next >> ashift)) {
				rc->rc_shadow_devidx = (b + c) % physical_cols;
				rc->rc_shadow_offset =
				    ((b + c) / physical_cols) << ashift;
				if (row_use_scratch)
					rc->rc_shadow_offset -= VDEV_BOOT_SIZE;
			}

			asize += rc->rc_size;
		}

		/*
		 * See comment in vdev_raidz_map_alloc()
		 */
		if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
		    (offset & (1ULL << 20))) {
			ASSERT(rr->rr_cols >= 2);
			ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);

			int devidx0 = rr->rr_col[0].rc_devidx;
			uint64_t offset0 = rr->rr_col[0].rc_offset;
			int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx;
			uint64_t shadow_offset0 =
			    rr->rr_col[0].rc_shadow_offset;

			rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
			rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
			rr->rr_col[0].rc_shadow_devidx =
			    rr->rr_col[1].rc_shadow_devidx;
			rr->rr_col[0].rc_shadow_offset =
			    rr->rr_col[1].rc_shadow_offset;

			rr->rr_col[1].rc_devidx = devidx0;
			rr->rr_col[1].rc_offset = offset0;
			rr->rr_col[1].rc_shadow_devidx = shadow_devidx0;
			rr->rr_col[1].rc_shadow_offset = shadow_offset0;
		}
	}
	ASSERT3U(asize, ==, tot << ashift);

	/*
	 * Determine if the block is contiguous, in which case we can use
	 * an aggregation.
	 */
	if (rows >= raidz_io_aggregate_rows) {
		rm->rm_nphys_cols = physical_cols;
		rm->rm_phys_col =
		    kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols,
		    KM_SLEEP);

		/*
		 * Determine the aggregate io's offset and size, and check
		 * that the io is contiguous.
947*e716630dSMartin Matuska */ 948*e716630dSMartin Matuska for (int i = 0; 949*e716630dSMartin Matuska i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) { 950*e716630dSMartin Matuska raidz_row_t *rr = rm->rm_row[i]; 951*e716630dSMartin Matuska for (int c = 0; c < rr->rr_cols; c++) { 952*e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c]; 953*e716630dSMartin Matuska raidz_col_t *prc = 954*e716630dSMartin Matuska &rm->rm_phys_col[rc->rc_devidx]; 955*e716630dSMartin Matuska 956*e716630dSMartin Matuska if (rc->rc_size == 0) 957*e716630dSMartin Matuska continue; 958*e716630dSMartin Matuska 959*e716630dSMartin Matuska if (prc->rc_size == 0) { 960*e716630dSMartin Matuska ASSERT0(prc->rc_offset); 961*e716630dSMartin Matuska prc->rc_offset = rc->rc_offset; 962*e716630dSMartin Matuska } else if (prc->rc_offset + prc->rc_size != 963*e716630dSMartin Matuska rc->rc_offset) { 964*e716630dSMartin Matuska /* 965*e716630dSMartin Matuska * This block is not contiguous and 966*e716630dSMartin Matuska * therefore can't be aggregated. 967*e716630dSMartin Matuska * This is expected to be rare, so 968*e716630dSMartin Matuska * the cost of allocating and then 969*e716630dSMartin Matuska * freeing rm_phys_col is not 970*e716630dSMartin Matuska * significant. 971*e716630dSMartin Matuska */ 972*e716630dSMartin Matuska kmem_free(rm->rm_phys_col, 973*e716630dSMartin Matuska sizeof (raidz_col_t) * 974*e716630dSMartin Matuska rm->rm_nphys_cols); 975*e716630dSMartin Matuska rm->rm_phys_col = NULL; 976*e716630dSMartin Matuska rm->rm_nphys_cols = 0; 977*e716630dSMartin Matuska break; 978*e716630dSMartin Matuska } 979*e716630dSMartin Matuska prc->rc_size += rc->rc_size; 980*e716630dSMartin Matuska } 981*e716630dSMartin Matuska } 982*e716630dSMartin Matuska } 983*e716630dSMartin Matuska if (rm->rm_phys_col != NULL) { 984*e716630dSMartin Matuska /* 985*e716630dSMartin Matuska * Allocate aggregate ABD's. 986*e716630dSMartin Matuska */ 987*e716630dSMartin Matuska for (int i = 0; i < rm->rm_nphys_cols; i++) { 988*e716630dSMartin Matuska raidz_col_t *prc = &rm->rm_phys_col[i]; 989*e716630dSMartin Matuska 990*e716630dSMartin Matuska prc->rc_devidx = i; 991*e716630dSMartin Matuska 992*e716630dSMartin Matuska if (prc->rc_size == 0) 993*e716630dSMartin Matuska continue; 994*e716630dSMartin Matuska 995*e716630dSMartin Matuska prc->rc_abd = 996*e716630dSMartin Matuska abd_alloc_linear(rm->rm_phys_col[i].rc_size, 997*e716630dSMartin Matuska B_FALSE); 998*e716630dSMartin Matuska } 999*e716630dSMartin Matuska 1000*e716630dSMartin Matuska /* 1001*e716630dSMartin Matuska * Point the parity abd's into the aggregate abd's. 1002*e716630dSMartin Matuska */ 1003*e716630dSMartin Matuska for (int i = 0; i < rm->rm_nrows; i++) { 1004*e716630dSMartin Matuska raidz_row_t *rr = rm->rm_row[i]; 1005*e716630dSMartin Matuska for (int c = 0; c < rr->rr_firstdatacol; c++) { 1006*e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c]; 1007*e716630dSMartin Matuska raidz_col_t *prc = 1008*e716630dSMartin Matuska &rm->rm_phys_col[rc->rc_devidx]; 1009*e716630dSMartin Matuska rc->rc_abd = 1010*e716630dSMartin Matuska abd_get_offset_struct(&rc->rc_abdstruct, 1011*e716630dSMartin Matuska prc->rc_abd, 1012*e716630dSMartin Matuska rc->rc_offset - prc->rc_offset, 1013*e716630dSMartin Matuska rc->rc_size); 1014*e716630dSMartin Matuska } 1015*e716630dSMartin Matuska } 1016*e716630dSMartin Matuska } else { 1017*e716630dSMartin Matuska /* 1018*e716630dSMartin Matuska * Allocate new abd's for the parity sectors. 
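 *
 * Unlike the data columns, which reference the caller's data abd
 * directly, the parity columns have no backing in the original
 * block, so each one gets its own linear buffer here.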
1019*e716630dSMartin Matuska */ 1020*e716630dSMartin Matuska for (int i = 0; i < rm->rm_nrows; i++) { 1021*e716630dSMartin Matuska raidz_row_t *rr = rm->rm_row[i]; 1022*e716630dSMartin Matuska for (int c = 0; c < rr->rr_firstdatacol; c++) { 1023*e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c]; 1024*e716630dSMartin Matuska rc->rc_abd = 1025*e716630dSMartin Matuska abd_alloc_linear(rc->rc_size, 1026*e716630dSMartin Matuska B_TRUE); 1027*e716630dSMartin Matuska } 1028*e716630dSMartin Matuska } 1029*e716630dSMartin Matuska } 1030eda14cbcSMatt Macy /* init RAIDZ parity ops */ 1031eda14cbcSMatt Macy rm->rm_ops = vdev_raidz_math_get_ops(); 1032eda14cbcSMatt Macy 1033eda14cbcSMatt Macy return (rm); 1034eda14cbcSMatt Macy } 1035eda14cbcSMatt Macy 1036eda14cbcSMatt Macy struct pqr_struct { 1037eda14cbcSMatt Macy uint64_t *p; 1038eda14cbcSMatt Macy uint64_t *q; 1039eda14cbcSMatt Macy uint64_t *r; 1040eda14cbcSMatt Macy }; 1041eda14cbcSMatt Macy 1042eda14cbcSMatt Macy static int 1043eda14cbcSMatt Macy vdev_raidz_p_func(void *buf, size_t size, void *private) 1044eda14cbcSMatt Macy { 1045eda14cbcSMatt Macy struct pqr_struct *pqr = private; 1046eda14cbcSMatt Macy const uint64_t *src = buf; 1047*e716630dSMartin Matuska int cnt = size / sizeof (src[0]); 1048eda14cbcSMatt Macy 1049eda14cbcSMatt Macy ASSERT(pqr->p && !pqr->q && !pqr->r); 1050eda14cbcSMatt Macy 1051*e716630dSMartin Matuska for (int i = 0; i < cnt; i++, src++, pqr->p++) 1052eda14cbcSMatt Macy *pqr->p ^= *src; 1053eda14cbcSMatt Macy 1054eda14cbcSMatt Macy return (0); 1055eda14cbcSMatt Macy } 1056eda14cbcSMatt Macy 1057eda14cbcSMatt Macy static int 1058eda14cbcSMatt Macy vdev_raidz_pq_func(void *buf, size_t size, void *private) 1059eda14cbcSMatt Macy { 1060eda14cbcSMatt Macy struct pqr_struct *pqr = private; 1061eda14cbcSMatt Macy const uint64_t *src = buf; 1062eda14cbcSMatt Macy uint64_t mask; 1063*e716630dSMartin Matuska int cnt = size / sizeof (src[0]); 1064eda14cbcSMatt Macy 1065eda14cbcSMatt Macy ASSERT(pqr->p && pqr->q && !pqr->r); 1066eda14cbcSMatt Macy 1067*e716630dSMartin Matuska for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { 1068eda14cbcSMatt Macy *pqr->p ^= *src; 1069eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 1070eda14cbcSMatt Macy *pqr->q ^= *src; 1071eda14cbcSMatt Macy } 1072eda14cbcSMatt Macy 1073eda14cbcSMatt Macy return (0); 1074eda14cbcSMatt Macy } 1075eda14cbcSMatt Macy 1076eda14cbcSMatt Macy static int 1077eda14cbcSMatt Macy vdev_raidz_pqr_func(void *buf, size_t size, void *private) 1078eda14cbcSMatt Macy { 1079eda14cbcSMatt Macy struct pqr_struct *pqr = private; 1080eda14cbcSMatt Macy const uint64_t *src = buf; 1081eda14cbcSMatt Macy uint64_t mask; 1082*e716630dSMartin Matuska int cnt = size / sizeof (src[0]); 1083eda14cbcSMatt Macy 1084eda14cbcSMatt Macy ASSERT(pqr->p && pqr->q && pqr->r); 1085eda14cbcSMatt Macy 1086*e716630dSMartin Matuska for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { 1087eda14cbcSMatt Macy *pqr->p ^= *src; 1088eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 1089eda14cbcSMatt Macy *pqr->q ^= *src; 1090eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_4(*pqr->r, mask); 1091eda14cbcSMatt Macy *pqr->r ^= *src; 1092eda14cbcSMatt Macy } 1093eda14cbcSMatt Macy 1094eda14cbcSMatt Macy return (0); 1095eda14cbcSMatt Macy } 1096eda14cbcSMatt Macy 1097eda14cbcSMatt Macy static void 10987877fdebSMatt Macy vdev_raidz_generate_parity_p(raidz_row_t *rr) 1099eda14cbcSMatt Macy { 11007877fdebSMatt Macy uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 1101eda14cbcSMatt 
Macy 11027877fdebSMatt Macy for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 11037877fdebSMatt Macy abd_t *src = rr->rr_col[c].rc_abd; 1104eda14cbcSMatt Macy 11057877fdebSMatt Macy if (c == rr->rr_firstdatacol) { 11067877fdebSMatt Macy abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 1107eda14cbcSMatt Macy } else { 1108eda14cbcSMatt Macy struct pqr_struct pqr = { p, NULL, NULL }; 11097877fdebSMatt Macy (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 1110eda14cbcSMatt Macy vdev_raidz_p_func, &pqr); 1111eda14cbcSMatt Macy } 1112eda14cbcSMatt Macy } 1113eda14cbcSMatt Macy } 1114eda14cbcSMatt Macy 1115eda14cbcSMatt Macy static void 11167877fdebSMatt Macy vdev_raidz_generate_parity_pq(raidz_row_t *rr) 1117eda14cbcSMatt Macy { 11187877fdebSMatt Macy uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 11197877fdebSMatt Macy uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 11207877fdebSMatt Macy uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 11217877fdebSMatt Macy ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 11227877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_Q].rc_size); 1123eda14cbcSMatt Macy 11247877fdebSMatt Macy for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 11257877fdebSMatt Macy abd_t *src = rr->rr_col[c].rc_abd; 1126eda14cbcSMatt Macy 11277877fdebSMatt Macy uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); 1128eda14cbcSMatt Macy 11297877fdebSMatt Macy if (c == rr->rr_firstdatacol) { 1130eda14cbcSMatt Macy ASSERT(ccnt == pcnt || ccnt == 0); 11317877fdebSMatt Macy abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 11327877fdebSMatt Macy (void) memcpy(q, p, rr->rr_col[c].rc_size); 1133eda14cbcSMatt Macy 11347877fdebSMatt Macy for (uint64_t i = ccnt; i < pcnt; i++) { 1135eda14cbcSMatt Macy p[i] = 0; 1136eda14cbcSMatt Macy q[i] = 0; 1137eda14cbcSMatt Macy } 1138eda14cbcSMatt Macy } else { 1139eda14cbcSMatt Macy struct pqr_struct pqr = { p, q, NULL }; 1140eda14cbcSMatt Macy 1141eda14cbcSMatt Macy ASSERT(ccnt <= pcnt); 11427877fdebSMatt Macy (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 1143eda14cbcSMatt Macy vdev_raidz_pq_func, &pqr); 1144eda14cbcSMatt Macy 1145eda14cbcSMatt Macy /* 1146eda14cbcSMatt Macy * Treat short columns as though they are full of 0s. 1147eda14cbcSMatt Macy * Note that there's therefore nothing needed for P. 
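 *
 * Q must still be updated, though: the running Q value is multiplied
 * by 2 once per data column, even where this column's (implicitly
 * zero) contribution is XORed in as a no-op; skipping the multiply
 * would leave the earlier columns with the wrong coefficients at
 * these word offsets.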
1148eda14cbcSMatt Macy */ 11497877fdebSMatt Macy uint64_t mask; 11507877fdebSMatt Macy for (uint64_t i = ccnt; i < pcnt; i++) { 1151eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(q[i], mask); 1152eda14cbcSMatt Macy } 1153eda14cbcSMatt Macy } 1154eda14cbcSMatt Macy } 1155eda14cbcSMatt Macy } 1156eda14cbcSMatt Macy 1157eda14cbcSMatt Macy static void 11587877fdebSMatt Macy vdev_raidz_generate_parity_pqr(raidz_row_t *rr) 1159eda14cbcSMatt Macy { 11607877fdebSMatt Macy uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 11617877fdebSMatt Macy uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 11627877fdebSMatt Macy uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd); 11637877fdebSMatt Macy uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 11647877fdebSMatt Macy ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 11657877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_Q].rc_size); 11667877fdebSMatt Macy ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 11677877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_R].rc_size); 1168eda14cbcSMatt Macy 11697877fdebSMatt Macy for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 11707877fdebSMatt Macy abd_t *src = rr->rr_col[c].rc_abd; 1171eda14cbcSMatt Macy 11727877fdebSMatt Macy uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); 1173eda14cbcSMatt Macy 11747877fdebSMatt Macy if (c == rr->rr_firstdatacol) { 1175eda14cbcSMatt Macy ASSERT(ccnt == pcnt || ccnt == 0); 11767877fdebSMatt Macy abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 11777877fdebSMatt Macy (void) memcpy(q, p, rr->rr_col[c].rc_size); 11787877fdebSMatt Macy (void) memcpy(r, p, rr->rr_col[c].rc_size); 1179eda14cbcSMatt Macy 11807877fdebSMatt Macy for (uint64_t i = ccnt; i < pcnt; i++) { 1181eda14cbcSMatt Macy p[i] = 0; 1182eda14cbcSMatt Macy q[i] = 0; 1183eda14cbcSMatt Macy r[i] = 0; 1184eda14cbcSMatt Macy } 1185eda14cbcSMatt Macy } else { 1186eda14cbcSMatt Macy struct pqr_struct pqr = { p, q, r }; 1187eda14cbcSMatt Macy 1188eda14cbcSMatt Macy ASSERT(ccnt <= pcnt); 11897877fdebSMatt Macy (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 1190eda14cbcSMatt Macy vdev_raidz_pqr_func, &pqr); 1191eda14cbcSMatt Macy 1192eda14cbcSMatt Macy /* 1193eda14cbcSMatt Macy * Treat short columns as though they are full of 0s. 1194eda14cbcSMatt Macy * Note that there's therefore nothing needed for P. 1195eda14cbcSMatt Macy */ 11967877fdebSMatt Macy uint64_t mask; 11977877fdebSMatt Macy for (uint64_t i = ccnt; i < pcnt; i++) { 1198eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(q[i], mask); 1199eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_4(r[i], mask); 1200eda14cbcSMatt Macy } 1201eda14cbcSMatt Macy } 1202eda14cbcSMatt Macy } 1203eda14cbcSMatt Macy } 1204eda14cbcSMatt Macy 1205eda14cbcSMatt Macy /* 1206eda14cbcSMatt Macy * Generate RAID parity in the first virtual columns according to the number of 1207eda14cbcSMatt Macy * parity columns available. 1208eda14cbcSMatt Macy */ 1209eda14cbcSMatt Macy void 12107877fdebSMatt Macy vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr) 1211eda14cbcSMatt Macy { 1212*e716630dSMartin Matuska if (rr->rr_cols == 0) { 1213*e716630dSMartin Matuska /* 1214*e716630dSMartin Matuska * We are handling this block one row at a time (because 1215*e716630dSMartin Matuska * this block has a different logical vs physical width, 1216*e716630dSMartin Matuska * due to RAIDZ expansion), and this is a pad-only row, 1217*e716630dSMartin Matuska * which has no parity. 
1218*e716630dSMartin Matuska */ 1219*e716630dSMartin Matuska return; 1220*e716630dSMartin Matuska } 12217877fdebSMatt Macy 1222eda14cbcSMatt Macy /* Generate using the new math implementation */ 12237877fdebSMatt Macy if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL) 1224eda14cbcSMatt Macy return; 1225eda14cbcSMatt Macy 12267877fdebSMatt Macy switch (rr->rr_firstdatacol) { 1227eda14cbcSMatt Macy case 1: 12287877fdebSMatt Macy vdev_raidz_generate_parity_p(rr); 1229eda14cbcSMatt Macy break; 1230eda14cbcSMatt Macy case 2: 12317877fdebSMatt Macy vdev_raidz_generate_parity_pq(rr); 1232eda14cbcSMatt Macy break; 1233eda14cbcSMatt Macy case 3: 12347877fdebSMatt Macy vdev_raidz_generate_parity_pqr(rr); 1235eda14cbcSMatt Macy break; 1236eda14cbcSMatt Macy default: 1237eda14cbcSMatt Macy cmn_err(CE_PANIC, "invalid RAID-Z configuration"); 1238eda14cbcSMatt Macy } 1239eda14cbcSMatt Macy } 1240eda14cbcSMatt Macy 12417877fdebSMatt Macy void 12427877fdebSMatt Macy vdev_raidz_generate_parity(raidz_map_t *rm) 12437877fdebSMatt Macy { 12447877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 12457877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i]; 12467877fdebSMatt Macy vdev_raidz_generate_parity_row(rm, rr); 12477877fdebSMatt Macy } 12487877fdebSMatt Macy } 12497877fdebSMatt Macy 1250eda14cbcSMatt Macy static int 1251eda14cbcSMatt Macy vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) 1252eda14cbcSMatt Macy { 1253e92ffd9bSMartin Matuska (void) private; 1254eda14cbcSMatt Macy uint64_t *dst = dbuf; 1255eda14cbcSMatt Macy uint64_t *src = sbuf; 1256eda14cbcSMatt Macy int cnt = size / sizeof (src[0]); 1257eda14cbcSMatt Macy 1258eda14cbcSMatt Macy for (int i = 0; i < cnt; i++) { 1259eda14cbcSMatt Macy dst[i] ^= src[i]; 1260eda14cbcSMatt Macy } 1261eda14cbcSMatt Macy 1262eda14cbcSMatt Macy return (0); 1263eda14cbcSMatt Macy } 1264eda14cbcSMatt Macy 1265eda14cbcSMatt Macy static int 1266eda14cbcSMatt Macy vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size, 1267eda14cbcSMatt Macy void *private) 1268eda14cbcSMatt Macy { 1269e92ffd9bSMartin Matuska (void) private; 1270eda14cbcSMatt Macy uint64_t *dst = dbuf; 1271eda14cbcSMatt Macy uint64_t *src = sbuf; 1272eda14cbcSMatt Macy uint64_t mask; 1273eda14cbcSMatt Macy int cnt = size / sizeof (dst[0]); 1274eda14cbcSMatt Macy 1275eda14cbcSMatt Macy for (int i = 0; i < cnt; i++, dst++, src++) { 1276eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(*dst, mask); 1277eda14cbcSMatt Macy *dst ^= *src; 1278eda14cbcSMatt Macy } 1279eda14cbcSMatt Macy 1280eda14cbcSMatt Macy return (0); 1281eda14cbcSMatt Macy } 1282eda14cbcSMatt Macy 1283eda14cbcSMatt Macy static int 1284eda14cbcSMatt Macy vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private) 1285eda14cbcSMatt Macy { 1286e92ffd9bSMartin Matuska (void) private; 1287eda14cbcSMatt Macy uint64_t *dst = buf; 1288eda14cbcSMatt Macy uint64_t mask; 1289eda14cbcSMatt Macy int cnt = size / sizeof (dst[0]); 1290eda14cbcSMatt Macy 1291eda14cbcSMatt Macy for (int i = 0; i < cnt; i++, dst++) { 1292eda14cbcSMatt Macy /* same operation as vdev_raidz_reconst_q_pre_func() on dst */ 1293eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(*dst, mask); 1294eda14cbcSMatt Macy } 1295eda14cbcSMatt Macy 1296eda14cbcSMatt Macy return (0); 1297eda14cbcSMatt Macy } 1298eda14cbcSMatt Macy 1299eda14cbcSMatt Macy struct reconst_q_struct { 1300eda14cbcSMatt Macy uint64_t *q; 1301eda14cbcSMatt Macy int exp; 1302eda14cbcSMatt Macy }; 1303eda14cbcSMatt Macy 1304eda14cbcSMatt Macy static int 1305eda14cbcSMatt Macy 
vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private) 1306eda14cbcSMatt Macy { 1307eda14cbcSMatt Macy struct reconst_q_struct *rq = private; 1308eda14cbcSMatt Macy uint64_t *dst = buf; 1309eda14cbcSMatt Macy int cnt = size / sizeof (dst[0]); 1310eda14cbcSMatt Macy 1311eda14cbcSMatt Macy for (int i = 0; i < cnt; i++, dst++, rq->q++) { 1312eda14cbcSMatt Macy int j; 1313eda14cbcSMatt Macy uint8_t *b; 1314eda14cbcSMatt Macy 1315eda14cbcSMatt Macy *dst ^= *rq->q; 1316eda14cbcSMatt Macy for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 1317eda14cbcSMatt Macy *b = vdev_raidz_exp2(*b, rq->exp); 1318eda14cbcSMatt Macy } 1319eda14cbcSMatt Macy } 1320eda14cbcSMatt Macy 1321eda14cbcSMatt Macy return (0); 1322eda14cbcSMatt Macy } 1323eda14cbcSMatt Macy 1324eda14cbcSMatt Macy struct reconst_pq_struct { 1325eda14cbcSMatt Macy uint8_t *p; 1326eda14cbcSMatt Macy uint8_t *q; 1327eda14cbcSMatt Macy uint8_t *pxy; 1328eda14cbcSMatt Macy uint8_t *qxy; 1329eda14cbcSMatt Macy int aexp; 1330eda14cbcSMatt Macy int bexp; 1331eda14cbcSMatt Macy }; 1332eda14cbcSMatt Macy 1333eda14cbcSMatt Macy static int 1334eda14cbcSMatt Macy vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private) 1335eda14cbcSMatt Macy { 1336eda14cbcSMatt Macy struct reconst_pq_struct *rpq = private; 1337eda14cbcSMatt Macy uint8_t *xd = xbuf; 1338eda14cbcSMatt Macy uint8_t *yd = ybuf; 1339eda14cbcSMatt Macy 1340eda14cbcSMatt Macy for (int i = 0; i < size; 1341eda14cbcSMatt Macy i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) { 1342eda14cbcSMatt Macy *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 1343eda14cbcSMatt Macy vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 1344eda14cbcSMatt Macy *yd = *rpq->p ^ *rpq->pxy ^ *xd; 1345eda14cbcSMatt Macy } 1346eda14cbcSMatt Macy 1347eda14cbcSMatt Macy return (0); 1348eda14cbcSMatt Macy } 1349eda14cbcSMatt Macy 1350eda14cbcSMatt Macy static int 1351eda14cbcSMatt Macy vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) 1352eda14cbcSMatt Macy { 1353eda14cbcSMatt Macy struct reconst_pq_struct *rpq = private; 1354eda14cbcSMatt Macy uint8_t *xd = xbuf; 1355eda14cbcSMatt Macy 1356eda14cbcSMatt Macy for (int i = 0; i < size; 1357eda14cbcSMatt Macy i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) { 1358eda14cbcSMatt Macy /* same operation as vdev_raidz_reconst_pq_func() on xd */ 1359eda14cbcSMatt Macy *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 1360eda14cbcSMatt Macy vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 1361eda14cbcSMatt Macy } 1362eda14cbcSMatt Macy 1363eda14cbcSMatt Macy return (0); 1364eda14cbcSMatt Macy } 1365eda14cbcSMatt Macy 1366f9693befSMartin Matuska static void 13677877fdebSMatt Macy vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts) 1368eda14cbcSMatt Macy { 1369eda14cbcSMatt Macy int x = tgts[0]; 1370eda14cbcSMatt Macy abd_t *dst, *src; 1371eda14cbcSMatt Macy 1372*e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1373*e716630dSMartin Matuska zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x); 1374*e716630dSMartin Matuska 13757877fdebSMatt Macy ASSERT3U(ntgts, ==, 1); 13767877fdebSMatt Macy ASSERT3U(x, >=, rr->rr_firstdatacol); 13777877fdebSMatt Macy ASSERT3U(x, <, rr->rr_cols); 1378eda14cbcSMatt Macy 13797877fdebSMatt Macy ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size); 1380eda14cbcSMatt Macy 13817877fdebSMatt Macy src = rr->rr_col[VDEV_RAIDZ_P].rc_abd; 13827877fdebSMatt Macy dst = rr->rr_col[x].rc_abd; 1383eda14cbcSMatt Macy 13847877fdebSMatt 
Macy abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size); 1385eda14cbcSMatt Macy 13867877fdebSMatt Macy for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 13877877fdebSMatt Macy uint64_t size = MIN(rr->rr_col[x].rc_size, 13887877fdebSMatt Macy rr->rr_col[c].rc_size); 1389eda14cbcSMatt Macy 13907877fdebSMatt Macy src = rr->rr_col[c].rc_abd; 1391eda14cbcSMatt Macy 1392eda14cbcSMatt Macy if (c == x) 1393eda14cbcSMatt Macy continue; 1394eda14cbcSMatt Macy 1395eda14cbcSMatt Macy (void) abd_iterate_func2(dst, src, 0, 0, size, 1396eda14cbcSMatt Macy vdev_raidz_reconst_p_func, NULL); 1397eda14cbcSMatt Macy } 1398eda14cbcSMatt Macy } 1399eda14cbcSMatt Macy 1400f9693befSMartin Matuska static void 14017877fdebSMatt Macy vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts) 1402eda14cbcSMatt Macy { 1403eda14cbcSMatt Macy int x = tgts[0]; 1404eda14cbcSMatt Macy int c, exp; 1405eda14cbcSMatt Macy abd_t *dst, *src; 1406eda14cbcSMatt Macy 1407*e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1408*e716630dSMartin Matuska zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x); 1409*e716630dSMartin Matuska 1410eda14cbcSMatt Macy ASSERT(ntgts == 1); 1411eda14cbcSMatt Macy 14127877fdebSMatt Macy ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size); 1413eda14cbcSMatt Macy 14147877fdebSMatt Macy for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 14157877fdebSMatt Macy uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size, 14167877fdebSMatt Macy rr->rr_col[c].rc_size); 1417eda14cbcSMatt Macy 14187877fdebSMatt Macy src = rr->rr_col[c].rc_abd; 14197877fdebSMatt Macy dst = rr->rr_col[x].rc_abd; 1420eda14cbcSMatt Macy 14217877fdebSMatt Macy if (c == rr->rr_firstdatacol) { 1422eda14cbcSMatt Macy abd_copy(dst, src, size); 14237877fdebSMatt Macy if (rr->rr_col[x].rc_size > size) { 1424eda14cbcSMatt Macy abd_zero_off(dst, size, 14257877fdebSMatt Macy rr->rr_col[x].rc_size - size); 14267877fdebSMatt Macy } 1427eda14cbcSMatt Macy } else { 14287877fdebSMatt Macy ASSERT3U(size, <=, rr->rr_col[x].rc_size); 1429eda14cbcSMatt Macy (void) abd_iterate_func2(dst, src, 0, 0, size, 1430eda14cbcSMatt Macy vdev_raidz_reconst_q_pre_func, NULL); 1431eda14cbcSMatt Macy (void) abd_iterate_func(dst, 14327877fdebSMatt Macy size, rr->rr_col[x].rc_size - size, 1433eda14cbcSMatt Macy vdev_raidz_reconst_q_pre_tail_func, NULL); 1434eda14cbcSMatt Macy } 1435eda14cbcSMatt Macy } 1436eda14cbcSMatt Macy 14377877fdebSMatt Macy src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; 14387877fdebSMatt Macy dst = rr->rr_col[x].rc_abd; 14397877fdebSMatt Macy exp = 255 - (rr->rr_cols - 1 - x); 1440eda14cbcSMatt Macy 1441eda14cbcSMatt Macy struct reconst_q_struct rq = { abd_to_buf(src), exp }; 14427877fdebSMatt Macy (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size, 1443eda14cbcSMatt Macy vdev_raidz_reconst_q_post_func, &rq); 1444eda14cbcSMatt Macy } 1445eda14cbcSMatt Macy 1446f9693befSMartin Matuska static void 14477877fdebSMatt Macy vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) 1448eda14cbcSMatt Macy { 1449eda14cbcSMatt Macy uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; 1450eda14cbcSMatt Macy abd_t *pdata, *qdata; 1451eda14cbcSMatt Macy uint64_t xsize, ysize; 1452eda14cbcSMatt Macy int x = tgts[0]; 1453eda14cbcSMatt Macy int y = tgts[1]; 1454eda14cbcSMatt Macy abd_t *xd, *yd; 1455eda14cbcSMatt Macy 1456*e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1457*e716630dSMartin Matuska zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y); 1458*e716630dSMartin Matuska 
1459eda14cbcSMatt Macy ASSERT(ntgts == 2); 1460eda14cbcSMatt Macy ASSERT(x < y); 14617877fdebSMatt Macy ASSERT(x >= rr->rr_firstdatacol); 14627877fdebSMatt Macy ASSERT(y < rr->rr_cols); 1463eda14cbcSMatt Macy 14647877fdebSMatt Macy ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size); 1465eda14cbcSMatt Macy 1466eda14cbcSMatt Macy /* 1467eda14cbcSMatt Macy * Move the parity data aside -- we're going to compute parity as 1468eda14cbcSMatt Macy * though columns x and y were full of zeros -- Pxy and Qxy. We want to 1469eda14cbcSMatt Macy * reuse the parity generation mechanism without trashing the actual 1470eda14cbcSMatt Macy * parity so we make those columns appear to be full of zeros by 1471eda14cbcSMatt Macy * setting their lengths to zero. 1472eda14cbcSMatt Macy */ 14737877fdebSMatt Macy pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd; 14747877fdebSMatt Macy qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; 14757877fdebSMatt Macy xsize = rr->rr_col[x].rc_size; 14767877fdebSMatt Macy ysize = rr->rr_col[y].rc_size; 1477eda14cbcSMatt Macy 14787877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_P].rc_abd = 14797877fdebSMatt Macy abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE); 14807877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_Q].rc_abd = 14817877fdebSMatt Macy abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); 14827877fdebSMatt Macy rr->rr_col[x].rc_size = 0; 14837877fdebSMatt Macy rr->rr_col[y].rc_size = 0; 1484eda14cbcSMatt Macy 14857877fdebSMatt Macy vdev_raidz_generate_parity_pq(rr); 1486eda14cbcSMatt Macy 14877877fdebSMatt Macy rr->rr_col[x].rc_size = xsize; 14887877fdebSMatt Macy rr->rr_col[y].rc_size = ysize; 1489eda14cbcSMatt Macy 1490eda14cbcSMatt Macy p = abd_to_buf(pdata); 1491eda14cbcSMatt Macy q = abd_to_buf(qdata); 14927877fdebSMatt Macy pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 14937877fdebSMatt Macy qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 14947877fdebSMatt Macy xd = rr->rr_col[x].rc_abd; 14957877fdebSMatt Macy yd = rr->rr_col[y].rc_abd; 1496eda14cbcSMatt Macy 1497eda14cbcSMatt Macy /* 1498eda14cbcSMatt Macy * We now have: 1499eda14cbcSMatt Macy * Pxy = P + D_x + D_y 1500eda14cbcSMatt Macy * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 1501eda14cbcSMatt Macy * 1502eda14cbcSMatt Macy * We can then solve for D_x: 1503eda14cbcSMatt Macy * D_x = A * (P + Pxy) + B * (Q + Qxy) 1504eda14cbcSMatt Macy * where 1505eda14cbcSMatt Macy * A = 2^(x - y) * (2^(x - y) + 1)^-1 1506eda14cbcSMatt Macy * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 1507eda14cbcSMatt Macy * 1508eda14cbcSMatt Macy * With D_x in hand, we can easily solve for D_y: 1509eda14cbcSMatt Macy * D_y = P + Pxy + D_x 1510eda14cbcSMatt Macy */ 1511eda14cbcSMatt Macy 1512eda14cbcSMatt Macy a = vdev_raidz_pow2[255 + x - y]; 15137877fdebSMatt Macy b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)]; 1514eda14cbcSMatt Macy tmp = 255 - vdev_raidz_log2[a ^ 1]; 1515eda14cbcSMatt Macy 1516eda14cbcSMatt Macy aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 1517eda14cbcSMatt Macy bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 1518eda14cbcSMatt Macy 1519eda14cbcSMatt Macy ASSERT3U(xsize, >=, ysize); 1520eda14cbcSMatt Macy struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp }; 1521eda14cbcSMatt Macy 1522eda14cbcSMatt Macy (void) abd_iterate_func2(xd, yd, 0, 0, ysize, 1523eda14cbcSMatt Macy vdev_raidz_reconst_pq_func, &rpq); 1524eda14cbcSMatt Macy (void) abd_iterate_func(xd, ysize, xsize - ysize, 1525eda14cbcSMatt Macy vdev_raidz_reconst_pq_tail_func, &rpq); 1526eda14cbcSMatt Macy 15277877fdebSMatt Macy 
abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 15287877fdebSMatt Macy abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 1529eda14cbcSMatt Macy 1530eda14cbcSMatt Macy /* 1531eda14cbcSMatt Macy * Restore the saved parity data. 1532eda14cbcSMatt Macy */ 15337877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata; 15347877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata; 1535eda14cbcSMatt Macy } 1536eda14cbcSMatt Macy 1537eda14cbcSMatt Macy /* 1538eda14cbcSMatt Macy * In the general case of reconstruction, we must solve the system of linear 1539eda14cbcSMatt Macy * equations defined by the coefficients used to generate parity as well as 1540eda14cbcSMatt Macy * the contents of the data and parity disks. This can be expressed with 1541eda14cbcSMatt Macy * vectors for the original data (D) and the actual data (d) and parity (p) 1542eda14cbcSMatt Macy * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 1543eda14cbcSMatt Macy * 1544eda14cbcSMatt Macy * __ __ __ __ 1545eda14cbcSMatt Macy * | | __ __ | p_0 | 1546eda14cbcSMatt Macy * | V | | D_0 | | p_m-1 | 1547eda14cbcSMatt Macy * | | x | : | = | d_0 | 1548eda14cbcSMatt Macy * | I | | D_n-1 | | : | 1549eda14cbcSMatt Macy * | | ~~ ~~ | d_n-1 | 1550eda14cbcSMatt Macy * ~~ ~~ ~~ ~~ 1551eda14cbcSMatt Macy * 1552eda14cbcSMatt Macy * I is simply a square identity matrix of size n, and V is a vandermonde 1553eda14cbcSMatt Macy * matrix defined by the coefficients we chose for the various parity columns 1554eda14cbcSMatt Macy * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 1555eda14cbcSMatt Macy * computation as well as linear separability. 1556eda14cbcSMatt Macy * 1557eda14cbcSMatt Macy * __ __ __ __ 1558eda14cbcSMatt Macy * | 1 .. 1 1 1 | | p_0 | 1559eda14cbcSMatt Macy * | 2^n-1 .. 4 2 1 | __ __ | : | 1560eda14cbcSMatt Macy * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 1561eda14cbcSMatt Macy * | 1 .. 0 0 0 | | D_1 | | d_0 | 1562eda14cbcSMatt Macy * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 1563eda14cbcSMatt Macy * | : : : : | | : | | d_2 | 1564eda14cbcSMatt Macy * | 0 .. 1 0 0 | | D_n-1 | | : | 1565eda14cbcSMatt Macy * | 0 .. 0 1 0 | ~~ ~~ | : | 1566eda14cbcSMatt Macy * | 0 .. 0 0 1 | | d_n-1 | 1567eda14cbcSMatt Macy * ~~ ~~ ~~ ~~ 1568eda14cbcSMatt Macy * 1569eda14cbcSMatt Macy * Note that I, V, d, and p are known. To compute D, we must invert the 1570eda14cbcSMatt Macy * matrix and use the known data and parity values to reconstruct the unknown 1571eda14cbcSMatt Macy * data values. We begin by removing the rows in V|I and d|p that correspond 1572eda14cbcSMatt Macy * to failed or missing columns; we then make V|I square (n x n) and d|p 1573eda14cbcSMatt Macy * sized n by removing rows corresponding to unused parity from the bottom up 1574eda14cbcSMatt Macy * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 1575eda14cbcSMatt Macy * using Gauss-Jordan elimination. 
In the example below we use m=3 parity 1576eda14cbcSMatt Macy * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 1577eda14cbcSMatt Macy * __ __ 1578eda14cbcSMatt Macy * | 1 1 1 1 1 1 1 1 | 1579eda14cbcSMatt Macy * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 1580eda14cbcSMatt Macy * | 19 205 116 29 64 16 4 1 | / / 1581eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 | / / 1582eda14cbcSMatt Macy * | 0 1 0 0 0 0 0 0 | <--' / 1583eda14cbcSMatt Macy * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 1584eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 | 1585eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 | 1586eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 | 1587eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 | 1588eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 | 1589eda14cbcSMatt Macy * ~~ ~~ 1590eda14cbcSMatt Macy * __ __ 1591eda14cbcSMatt Macy * | 1 1 1 1 1 1 1 1 | 1592eda14cbcSMatt Macy * | 128 64 32 16 8 4 2 1 | 1593eda14cbcSMatt Macy * | 19 205 116 29 64 16 4 1 | 1594eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 | 1595eda14cbcSMatt Macy * | 0 1 0 0 0 0 0 0 | 1596eda14cbcSMatt Macy * (V|I)' = | 0 0 1 0 0 0 0 0 | 1597eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 | 1598eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 | 1599eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 | 1600eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 | 1601eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 | 1602eda14cbcSMatt Macy * ~~ ~~ 1603eda14cbcSMatt Macy * 1604eda14cbcSMatt Macy * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 1605eda14cbcSMatt Macy * have carefully chosen the seed values 1, 2, and 4 to ensure that this 1606eda14cbcSMatt Macy * matrix is not singular. 1607eda14cbcSMatt Macy * __ __ 1608eda14cbcSMatt Macy * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1609eda14cbcSMatt Macy * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1610eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1611eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1612eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1613eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1614eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1615eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1616eda14cbcSMatt Macy * ~~ ~~ 1617eda14cbcSMatt Macy * __ __ 1618eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1619eda14cbcSMatt Macy * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1620eda14cbcSMatt Macy * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1621eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1622eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1623eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1624eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1625eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1626eda14cbcSMatt Macy * ~~ ~~ 1627eda14cbcSMatt Macy * __ __ 1628eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1629eda14cbcSMatt Macy * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1630eda14cbcSMatt Macy * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 1631eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1632eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1633eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1634eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1635eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1636eda14cbcSMatt Macy * ~~ ~~ 1637eda14cbcSMatt Macy * __ __ 1638eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1639eda14cbcSMatt Macy * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1640eda14cbcSMatt Macy * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 1641eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 0 0 0 1 0 
0 0 0 | 1642eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1643eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1644eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1645eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1646eda14cbcSMatt Macy * ~~ ~~ 1647eda14cbcSMatt Macy * __ __ 1648eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1649eda14cbcSMatt Macy * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1650eda14cbcSMatt Macy * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1651eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1652eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1653eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1654eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1655eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1656eda14cbcSMatt Macy * ~~ ~~ 1657eda14cbcSMatt Macy * __ __ 1658eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1659eda14cbcSMatt Macy * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 1660eda14cbcSMatt Macy * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1661eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1662eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1663eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1664eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1665eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1666eda14cbcSMatt Macy * ~~ ~~ 1667eda14cbcSMatt Macy * __ __ 1668eda14cbcSMatt Macy * | 0 0 1 0 0 0 0 0 | 1669eda14cbcSMatt Macy * | 167 100 5 41 159 169 217 208 | 1670eda14cbcSMatt Macy * | 166 100 4 40 158 168 216 209 | 1671eda14cbcSMatt Macy * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 1672eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 | 1673eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 | 1674eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 | 1675eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 | 1676eda14cbcSMatt Macy * ~~ ~~ 1677eda14cbcSMatt Macy * 1678eda14cbcSMatt Macy * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 1679eda14cbcSMatt Macy * of the missing data. 1680eda14cbcSMatt Macy * 1681eda14cbcSMatt Macy * As is apparent from the example above, the only non-trivial rows in the 1682eda14cbcSMatt Macy * inverse matrix correspond to the data disks that we're trying to 1683eda14cbcSMatt Macy * reconstruct. Indeed, those are the only rows we need as the others would 1684eda14cbcSMatt Macy * only be useful for reconstructing data known or assumed to be valid. For 1685eda14cbcSMatt Macy * that reason, we only build the coefficients in the rows that correspond to 1686eda14cbcSMatt Macy * targeted columns. 1687eda14cbcSMatt Macy */ 1688eda14cbcSMatt Macy 1689eda14cbcSMatt Macy static void 16907877fdebSMatt Macy vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map, 1691eda14cbcSMatt Macy uint8_t **rows) 1692eda14cbcSMatt Macy { 1693eda14cbcSMatt Macy int i, j; 1694eda14cbcSMatt Macy int pow; 1695eda14cbcSMatt Macy 16967877fdebSMatt Macy ASSERT(n == rr->rr_cols - rr->rr_firstdatacol); 1697eda14cbcSMatt Macy 1698eda14cbcSMatt Macy /* 1699eda14cbcSMatt Macy * Fill in the missing rows of interest. 
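 *
 * For example, with map[i] == 1 (the Q row) and n data columns the
 * loop below produces the coefficients 2^(n-1), ..., 4, 2, 1; with
 * map[i] == 2 (the R row) it produces 4^(n-1), ..., 16, 4, 1; and
 * map[i] == 0 (the P row) yields all 1s.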
1700eda14cbcSMatt Macy */ 1701eda14cbcSMatt Macy for (i = 0; i < nmap; i++) { 1702eda14cbcSMatt Macy ASSERT3S(0, <=, map[i]); 1703eda14cbcSMatt Macy ASSERT3S(map[i], <=, 2); 1704eda14cbcSMatt Macy 1705eda14cbcSMatt Macy pow = map[i] * n; 1706eda14cbcSMatt Macy if (pow > 255) 1707eda14cbcSMatt Macy pow -= 255; 1708eda14cbcSMatt Macy ASSERT(pow <= 255); 1709eda14cbcSMatt Macy 1710eda14cbcSMatt Macy for (j = 0; j < n; j++) { 1711eda14cbcSMatt Macy pow -= map[i]; 1712eda14cbcSMatt Macy if (pow < 0) 1713eda14cbcSMatt Macy pow += 255; 1714eda14cbcSMatt Macy rows[i][j] = vdev_raidz_pow2[pow]; 1715eda14cbcSMatt Macy } 1716eda14cbcSMatt Macy } 1717eda14cbcSMatt Macy } 1718eda14cbcSMatt Macy 1719eda14cbcSMatt Macy static void 17207877fdebSMatt Macy vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing, 1721eda14cbcSMatt Macy uint8_t **rows, uint8_t **invrows, const uint8_t *used) 1722eda14cbcSMatt Macy { 1723eda14cbcSMatt Macy int i, j, ii, jj; 1724eda14cbcSMatt Macy uint8_t log; 1725eda14cbcSMatt Macy 1726eda14cbcSMatt Macy /* 1727eda14cbcSMatt Macy * Assert that the first nmissing entries from the array of used 1728eda14cbcSMatt Macy * columns correspond to parity columns and that subsequent entries 1729eda14cbcSMatt Macy * correspond to data columns. 1730eda14cbcSMatt Macy */ 1731eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) { 17327877fdebSMatt Macy ASSERT3S(used[i], <, rr->rr_firstdatacol); 1733eda14cbcSMatt Macy } 1734eda14cbcSMatt Macy for (; i < n; i++) { 17357877fdebSMatt Macy ASSERT3S(used[i], >=, rr->rr_firstdatacol); 1736eda14cbcSMatt Macy } 1737eda14cbcSMatt Macy 1738eda14cbcSMatt Macy /* 1739eda14cbcSMatt Macy * First initialize the storage where we'll compute the inverse rows. 1740eda14cbcSMatt Macy */ 1741eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) { 1742eda14cbcSMatt Macy for (j = 0; j < n; j++) { 1743eda14cbcSMatt Macy invrows[i][j] = (i == j) ? 1 : 0; 1744eda14cbcSMatt Macy } 1745eda14cbcSMatt Macy } 1746eda14cbcSMatt Macy 1747eda14cbcSMatt Macy /* 1748eda14cbcSMatt Macy * Subtract all trivial rows from the rows of consequence. 1749eda14cbcSMatt Macy */ 1750eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) { 1751eda14cbcSMatt Macy for (j = nmissing; j < n; j++) { 17527877fdebSMatt Macy ASSERT3U(used[j], >=, rr->rr_firstdatacol); 17537877fdebSMatt Macy jj = used[j] - rr->rr_firstdatacol; 1754eda14cbcSMatt Macy ASSERT3S(jj, <, n); 1755eda14cbcSMatt Macy invrows[i][j] = rows[i][jj]; 1756eda14cbcSMatt Macy rows[i][jj] = 0; 1757eda14cbcSMatt Macy } 1758eda14cbcSMatt Macy } 1759eda14cbcSMatt Macy 1760eda14cbcSMatt Macy /* 1761eda14cbcSMatt Macy * For each of the rows of interest, we must normalize it and subtract 1762eda14cbcSMatt Macy * a multiple of it from the other rows. 1763eda14cbcSMatt Macy */ 1764eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) { 1765eda14cbcSMatt Macy for (j = 0; j < missing[i]; j++) { 1766eda14cbcSMatt Macy ASSERT0(rows[i][j]); 1767eda14cbcSMatt Macy } 1768eda14cbcSMatt Macy ASSERT3U(rows[i][missing[i]], !=, 0); 1769eda14cbcSMatt Macy 1770eda14cbcSMatt Macy /* 1771eda14cbcSMatt Macy * Compute the inverse of the first element and multiply each 1772eda14cbcSMatt Macy * element in the row by that value. 
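 *
 * This is done with the log/exp tables: log = 255 - log2(x) is the
 * discrete log of x^-1 (exponents add modulo 255), so applying
 * vdev_raidz_exp2(e, log) to each element e multiplies it by the
 * inverse of the leading element x.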
1773eda14cbcSMatt Macy */ 1774eda14cbcSMatt Macy log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 1775eda14cbcSMatt Macy 1776eda14cbcSMatt Macy for (j = 0; j < n; j++) { 1777eda14cbcSMatt Macy rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 1778eda14cbcSMatt Macy invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 1779eda14cbcSMatt Macy } 1780eda14cbcSMatt Macy 1781eda14cbcSMatt Macy for (ii = 0; ii < nmissing; ii++) { 1782eda14cbcSMatt Macy if (i == ii) 1783eda14cbcSMatt Macy continue; 1784eda14cbcSMatt Macy 1785eda14cbcSMatt Macy ASSERT3U(rows[ii][missing[i]], !=, 0); 1786eda14cbcSMatt Macy 1787eda14cbcSMatt Macy log = vdev_raidz_log2[rows[ii][missing[i]]]; 1788eda14cbcSMatt Macy 1789eda14cbcSMatt Macy for (j = 0; j < n; j++) { 1790eda14cbcSMatt Macy rows[ii][j] ^= 1791eda14cbcSMatt Macy vdev_raidz_exp2(rows[i][j], log); 1792eda14cbcSMatt Macy invrows[ii][j] ^= 1793eda14cbcSMatt Macy vdev_raidz_exp2(invrows[i][j], log); 1794eda14cbcSMatt Macy } 1795eda14cbcSMatt Macy } 1796eda14cbcSMatt Macy } 1797eda14cbcSMatt Macy 1798eda14cbcSMatt Macy /* 1799eda14cbcSMatt Macy * Verify that the data that is left in the rows are properly part of 1800eda14cbcSMatt Macy * an identity matrix. 1801eda14cbcSMatt Macy */ 1802eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) { 1803eda14cbcSMatt Macy for (j = 0; j < n; j++) { 1804eda14cbcSMatt Macy if (j == missing[i]) { 1805eda14cbcSMatt Macy ASSERT3U(rows[i][j], ==, 1); 1806eda14cbcSMatt Macy } else { 1807eda14cbcSMatt Macy ASSERT0(rows[i][j]); 1808eda14cbcSMatt Macy } 1809eda14cbcSMatt Macy } 1810eda14cbcSMatt Macy } 1811eda14cbcSMatt Macy } 1812eda14cbcSMatt Macy 1813eda14cbcSMatt Macy static void 18147877fdebSMatt Macy vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing, 1815eda14cbcSMatt Macy int *missing, uint8_t **invrows, const uint8_t *used) 1816eda14cbcSMatt Macy { 1817eda14cbcSMatt Macy int i, j, x, cc, c; 1818eda14cbcSMatt Macy uint8_t *src; 1819eda14cbcSMatt Macy uint64_t ccount; 1820eda14cbcSMatt Macy uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL }; 1821eda14cbcSMatt Macy uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 }; 1822eda14cbcSMatt Macy uint8_t log = 0; 1823eda14cbcSMatt Macy uint8_t val; 1824eda14cbcSMatt Macy int ll; 1825eda14cbcSMatt Macy uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 1826eda14cbcSMatt Macy uint8_t *p, *pp; 1827eda14cbcSMatt Macy size_t psize; 1828eda14cbcSMatt Macy 1829eda14cbcSMatt Macy psize = sizeof (invlog[0][0]) * n * nmissing; 1830eda14cbcSMatt Macy p = kmem_alloc(psize, KM_SLEEP); 1831eda14cbcSMatt Macy 1832eda14cbcSMatt Macy for (pp = p, i = 0; i < nmissing; i++) { 1833eda14cbcSMatt Macy invlog[i] = pp; 1834eda14cbcSMatt Macy pp += n; 1835eda14cbcSMatt Macy } 1836eda14cbcSMatt Macy 1837eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) { 1838eda14cbcSMatt Macy for (j = 0; j < n; j++) { 1839eda14cbcSMatt Macy ASSERT3U(invrows[i][j], !=, 0); 1840eda14cbcSMatt Macy invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 1841eda14cbcSMatt Macy } 1842eda14cbcSMatt Macy } 1843eda14cbcSMatt Macy 1844eda14cbcSMatt Macy for (i = 0; i < n; i++) { 1845eda14cbcSMatt Macy c = used[i]; 18467877fdebSMatt Macy ASSERT3U(c, <, rr->rr_cols); 1847eda14cbcSMatt Macy 18487877fdebSMatt Macy ccount = rr->rr_col[c].rc_size; 18497877fdebSMatt Macy ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0); 18507877fdebSMatt Macy if (ccount == 0) 18517877fdebSMatt Macy continue; 18527877fdebSMatt Macy src = abd_to_buf(rr->rr_col[c].rc_abd); 1853eda14cbcSMatt Macy for (j = 0; j < nmissing; j++) { 18547877fdebSMatt Macy cc = missing[j] + 
rr->rr_firstdatacol; 18557877fdebSMatt Macy ASSERT3U(cc, >=, rr->rr_firstdatacol); 18567877fdebSMatt Macy ASSERT3U(cc, <, rr->rr_cols); 1857eda14cbcSMatt Macy ASSERT3U(cc, !=, c); 1858eda14cbcSMatt Macy 18597877fdebSMatt Macy dcount[j] = rr->rr_col[cc].rc_size; 18607877fdebSMatt Macy if (dcount[j] != 0) 18617877fdebSMatt Macy dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd); 1862eda14cbcSMatt Macy } 1863eda14cbcSMatt Macy 1864eda14cbcSMatt Macy for (x = 0; x < ccount; x++, src++) { 1865eda14cbcSMatt Macy if (*src != 0) 1866eda14cbcSMatt Macy log = vdev_raidz_log2[*src]; 1867eda14cbcSMatt Macy 1868eda14cbcSMatt Macy for (cc = 0; cc < nmissing; cc++) { 1869eda14cbcSMatt Macy if (x >= dcount[cc]) 1870eda14cbcSMatt Macy continue; 1871eda14cbcSMatt Macy 1872eda14cbcSMatt Macy if (*src == 0) { 1873eda14cbcSMatt Macy val = 0; 1874eda14cbcSMatt Macy } else { 1875eda14cbcSMatt Macy if ((ll = log + invlog[cc][i]) >= 255) 1876eda14cbcSMatt Macy ll -= 255; 1877eda14cbcSMatt Macy val = vdev_raidz_pow2[ll]; 1878eda14cbcSMatt Macy } 1879eda14cbcSMatt Macy 1880eda14cbcSMatt Macy if (i == 0) 1881eda14cbcSMatt Macy dst[cc][x] = val; 1882eda14cbcSMatt Macy else 1883eda14cbcSMatt Macy dst[cc][x] ^= val; 1884eda14cbcSMatt Macy } 1885eda14cbcSMatt Macy } 1886eda14cbcSMatt Macy } 1887eda14cbcSMatt Macy 1888eda14cbcSMatt Macy kmem_free(p, psize); 1889eda14cbcSMatt Macy } 1890eda14cbcSMatt Macy 1891f9693befSMartin Matuska static void 18927877fdebSMatt Macy vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) 1893eda14cbcSMatt Macy { 1894eda14cbcSMatt Macy int n, i, c, t, tt; 1895eda14cbcSMatt Macy int nmissing_rows; 1896eda14cbcSMatt Macy int missing_rows[VDEV_RAIDZ_MAXPARITY]; 1897eda14cbcSMatt Macy int parity_map[VDEV_RAIDZ_MAXPARITY]; 1898eda14cbcSMatt Macy uint8_t *p, *pp; 1899eda14cbcSMatt Macy size_t psize; 1900eda14cbcSMatt Macy uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 1901eda14cbcSMatt Macy uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 1902eda14cbcSMatt Macy uint8_t *used; 1903eda14cbcSMatt Macy 1904eda14cbcSMatt Macy abd_t **bufs = NULL; 1905eda14cbcSMatt Macy 1906*e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1907*e716630dSMartin Matuska zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts); 1908eda14cbcSMatt Macy /* 1909eda14cbcSMatt Macy * Matrix reconstruction can't use scatter ABDs yet, so we allocate 19107877fdebSMatt Macy * temporary linear ABDs if any non-linear ABDs are found. 
1911eda14cbcSMatt Macy */ 19127877fdebSMatt Macy for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) { 1913*e716630dSMartin Matuska ASSERT(rr->rr_col[i].rc_abd != NULL); 19147877fdebSMatt Macy if (!abd_is_linear(rr->rr_col[i].rc_abd)) { 19157877fdebSMatt Macy bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *), 19167877fdebSMatt Macy KM_PUSHPAGE); 1917eda14cbcSMatt Macy 19187877fdebSMatt Macy for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 19197877fdebSMatt Macy raidz_col_t *col = &rr->rr_col[c]; 1920eda14cbcSMatt Macy 1921eda14cbcSMatt Macy bufs[c] = col->rc_abd; 19227877fdebSMatt Macy if (bufs[c] != NULL) { 19237877fdebSMatt Macy col->rc_abd = abd_alloc_linear( 19247877fdebSMatt Macy col->rc_size, B_TRUE); 19257877fdebSMatt Macy abd_copy(col->rc_abd, bufs[c], 19267877fdebSMatt Macy col->rc_size); 1927eda14cbcSMatt Macy } 1928eda14cbcSMatt Macy } 1929eda14cbcSMatt Macy 19307877fdebSMatt Macy break; 19317877fdebSMatt Macy } 19327877fdebSMatt Macy } 19337877fdebSMatt Macy 19347877fdebSMatt Macy n = rr->rr_cols - rr->rr_firstdatacol; 1935eda14cbcSMatt Macy 1936eda14cbcSMatt Macy /* 1937eda14cbcSMatt Macy * Figure out which data columns are missing. 1938eda14cbcSMatt Macy */ 1939eda14cbcSMatt Macy nmissing_rows = 0; 1940eda14cbcSMatt Macy for (t = 0; t < ntgts; t++) { 19417877fdebSMatt Macy if (tgts[t] >= rr->rr_firstdatacol) { 1942eda14cbcSMatt Macy missing_rows[nmissing_rows++] = 19437877fdebSMatt Macy tgts[t] - rr->rr_firstdatacol; 1944eda14cbcSMatt Macy } 1945eda14cbcSMatt Macy } 1946eda14cbcSMatt Macy 1947eda14cbcSMatt Macy /* 1948eda14cbcSMatt Macy * Figure out which parity columns to use to help generate the missing 1949eda14cbcSMatt Macy * data columns. 1950eda14cbcSMatt Macy */ 1951eda14cbcSMatt Macy for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 1952eda14cbcSMatt Macy ASSERT(tt < ntgts); 19537877fdebSMatt Macy ASSERT(c < rr->rr_firstdatacol); 1954eda14cbcSMatt Macy 1955eda14cbcSMatt Macy /* 1956eda14cbcSMatt Macy * Skip any targeted parity columns. 1957eda14cbcSMatt Macy */ 1958eda14cbcSMatt Macy if (c == tgts[tt]) { 1959eda14cbcSMatt Macy tt++; 1960eda14cbcSMatt Macy continue; 1961eda14cbcSMatt Macy } 1962eda14cbcSMatt Macy 1963eda14cbcSMatt Macy parity_map[i] = c; 1964eda14cbcSMatt Macy i++; 1965eda14cbcSMatt Macy } 1966eda14cbcSMatt Macy 1967eda14cbcSMatt Macy psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1968eda14cbcSMatt Macy nmissing_rows * n + sizeof (used[0]) * n; 1969eda14cbcSMatt Macy p = kmem_alloc(psize, KM_SLEEP); 1970eda14cbcSMatt Macy 1971eda14cbcSMatt Macy for (pp = p, i = 0; i < nmissing_rows; i++) { 1972eda14cbcSMatt Macy rows[i] = pp; 1973eda14cbcSMatt Macy pp += n; 1974eda14cbcSMatt Macy invrows[i] = pp; 1975eda14cbcSMatt Macy pp += n; 1976eda14cbcSMatt Macy } 1977eda14cbcSMatt Macy used = pp; 1978eda14cbcSMatt Macy 1979eda14cbcSMatt Macy for (i = 0; i < nmissing_rows; i++) { 1980eda14cbcSMatt Macy used[i] = parity_map[i]; 1981eda14cbcSMatt Macy } 1982eda14cbcSMatt Macy 19837877fdebSMatt Macy for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 1984eda14cbcSMatt Macy if (tt < nmissing_rows && 19857877fdebSMatt Macy c == missing_rows[tt] + rr->rr_firstdatacol) { 1986eda14cbcSMatt Macy tt++; 1987eda14cbcSMatt Macy continue; 1988eda14cbcSMatt Macy } 1989eda14cbcSMatt Macy 1990eda14cbcSMatt Macy ASSERT3S(i, <, n); 1991eda14cbcSMatt Macy used[i] = c; 1992eda14cbcSMatt Macy i++; 1993eda14cbcSMatt Macy } 1994eda14cbcSMatt Macy 1995eda14cbcSMatt Macy /* 1996eda14cbcSMatt Macy * Initialize the interesting rows of the matrix. 
1997eda14cbcSMatt Macy */ 19987877fdebSMatt Macy vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows); 1999eda14cbcSMatt Macy 2000eda14cbcSMatt Macy /* 2001eda14cbcSMatt Macy * Invert the matrix. 2002eda14cbcSMatt Macy */ 20037877fdebSMatt Macy vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows, 2004eda14cbcSMatt Macy invrows, used); 2005eda14cbcSMatt Macy 2006eda14cbcSMatt Macy /* 2007eda14cbcSMatt Macy * Reconstruct the missing data using the generated matrix. 2008eda14cbcSMatt Macy */ 20097877fdebSMatt Macy vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows, 2010eda14cbcSMatt Macy invrows, used); 2011eda14cbcSMatt Macy 2012eda14cbcSMatt Macy kmem_free(p, psize); 2013eda14cbcSMatt Macy 2014eda14cbcSMatt Macy /* 2015eda14cbcSMatt Macy * copy back from temporary linear abds and free them 2016eda14cbcSMatt Macy */ 2017eda14cbcSMatt Macy if (bufs) { 20187877fdebSMatt Macy for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 20197877fdebSMatt Macy raidz_col_t *col = &rr->rr_col[c]; 2020eda14cbcSMatt Macy 20217877fdebSMatt Macy if (bufs[c] != NULL) { 2022eda14cbcSMatt Macy abd_copy(bufs[c], col->rc_abd, col->rc_size); 2023eda14cbcSMatt Macy abd_free(col->rc_abd); 20247877fdebSMatt Macy } 2025eda14cbcSMatt Macy col->rc_abd = bufs[c]; 2026eda14cbcSMatt Macy } 20277877fdebSMatt Macy kmem_free(bufs, rr->rr_cols * sizeof (abd_t *)); 2028eda14cbcSMatt Macy } 2029eda14cbcSMatt Macy } 2030eda14cbcSMatt Macy 2031f9693befSMartin Matuska static void 20327877fdebSMatt Macy vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr, 20337877fdebSMatt Macy const int *t, int nt) 2034eda14cbcSMatt Macy { 2035eda14cbcSMatt Macy int tgts[VDEV_RAIDZ_MAXPARITY], *dt; 2036eda14cbcSMatt Macy int ntgts; 2037eda14cbcSMatt Macy int i, c, ret; 2038eda14cbcSMatt Macy int nbadparity, nbaddata; 2039eda14cbcSMatt Macy int parity_valid[VDEV_RAIDZ_MAXPARITY]; 2040eda14cbcSMatt Macy 2041*e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { 2042*e716630dSMartin Matuska zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)", 2043*e716630dSMartin Matuska rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata, 2044*e716630dSMartin Matuska (int)rr->rr_missingparity); 2045*e716630dSMartin Matuska } 2046*e716630dSMartin Matuska 20477877fdebSMatt Macy nbadparity = rr->rr_firstdatacol; 20487877fdebSMatt Macy nbaddata = rr->rr_cols - nbadparity; 2049eda14cbcSMatt Macy ntgts = 0; 20507877fdebSMatt Macy for (i = 0, c = 0; c < rr->rr_cols; c++) { 2051*e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { 2052*e716630dSMartin Matuska zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u " 2053*e716630dSMartin Matuska "offset=%llx error=%u)", 2054*e716630dSMartin Matuska rr, c, (int)rr->rr_col[c].rc_devidx, 2055*e716630dSMartin Matuska (long long)rr->rr_col[c].rc_offset, 2056*e716630dSMartin Matuska (int)rr->rr_col[c].rc_error); 2057*e716630dSMartin Matuska } 20587877fdebSMatt Macy if (c < rr->rr_firstdatacol) 2059eda14cbcSMatt Macy parity_valid[c] = B_FALSE; 2060eda14cbcSMatt Macy 2061eda14cbcSMatt Macy if (i < nt && c == t[i]) { 2062eda14cbcSMatt Macy tgts[ntgts++] = c; 2063eda14cbcSMatt Macy i++; 20647877fdebSMatt Macy } else if (rr->rr_col[c].rc_error != 0) { 2065eda14cbcSMatt Macy tgts[ntgts++] = c; 20667877fdebSMatt Macy } else if (c >= rr->rr_firstdatacol) { 2067eda14cbcSMatt Macy nbaddata--; 2068eda14cbcSMatt Macy } else { 2069eda14cbcSMatt Macy parity_valid[c] = B_TRUE; 2070eda14cbcSMatt Macy nbadparity--; 2071eda14cbcSMatt Macy } 2072eda14cbcSMatt Macy } 
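	/*
	 * Because the columns are scanned in order and parity columns come
	 * before data columns, any parity targets occupy
	 * tgts[0 .. nbadparity - 1]; dt below therefore points at the first
	 * data target.
	 */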
2073eda14cbcSMatt Macy 2074eda14cbcSMatt Macy ASSERT(ntgts >= nt); 2075eda14cbcSMatt Macy ASSERT(nbaddata >= 0); 2076eda14cbcSMatt Macy ASSERT(nbaddata + nbadparity == ntgts); 2077eda14cbcSMatt Macy 2078eda14cbcSMatt Macy dt = &tgts[nbadparity]; 2079eda14cbcSMatt Macy 2080eda14cbcSMatt Macy /* Reconstruct using the new math implementation */ 20817877fdebSMatt Macy ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata); 2082eda14cbcSMatt Macy if (ret != RAIDZ_ORIGINAL_IMPL) 2083f9693befSMartin Matuska return; 2084eda14cbcSMatt Macy 2085eda14cbcSMatt Macy /* 2086eda14cbcSMatt Macy * See if we can use any of our optimized reconstruction routines. 2087eda14cbcSMatt Macy */ 2088eda14cbcSMatt Macy switch (nbaddata) { 2089eda14cbcSMatt Macy case 1: 2090f9693befSMartin Matuska if (parity_valid[VDEV_RAIDZ_P]) { 2091f9693befSMartin Matuska vdev_raidz_reconstruct_p(rr, dt, 1); 2092f9693befSMartin Matuska return; 2093f9693befSMartin Matuska } 2094eda14cbcSMatt Macy 20957877fdebSMatt Macy ASSERT(rr->rr_firstdatacol > 1); 2096eda14cbcSMatt Macy 2097f9693befSMartin Matuska if (parity_valid[VDEV_RAIDZ_Q]) { 2098f9693befSMartin Matuska vdev_raidz_reconstruct_q(rr, dt, 1); 2099f9693befSMartin Matuska return; 2100f9693befSMartin Matuska } 2101eda14cbcSMatt Macy 21027877fdebSMatt Macy ASSERT(rr->rr_firstdatacol > 2); 2103eda14cbcSMatt Macy break; 2104eda14cbcSMatt Macy 2105eda14cbcSMatt Macy case 2: 21067877fdebSMatt Macy ASSERT(rr->rr_firstdatacol > 1); 2107eda14cbcSMatt Macy 2108eda14cbcSMatt Macy if (parity_valid[VDEV_RAIDZ_P] && 2109f9693befSMartin Matuska parity_valid[VDEV_RAIDZ_Q]) { 2110f9693befSMartin Matuska vdev_raidz_reconstruct_pq(rr, dt, 2); 2111f9693befSMartin Matuska return; 2112f9693befSMartin Matuska } 2113eda14cbcSMatt Macy 21147877fdebSMatt Macy ASSERT(rr->rr_firstdatacol > 2); 2115eda14cbcSMatt Macy 2116eda14cbcSMatt Macy break; 2117eda14cbcSMatt Macy } 2118eda14cbcSMatt Macy 2119f9693befSMartin Matuska vdev_raidz_reconstruct_general(rr, tgts, ntgts); 2120eda14cbcSMatt Macy } 2121eda14cbcSMatt Macy 2122eda14cbcSMatt Macy static int 2123eda14cbcSMatt Macy vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, 2124eda14cbcSMatt Macy uint64_t *logical_ashift, uint64_t *physical_ashift) 2125eda14cbcSMatt Macy { 21267877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd; 21277877fdebSMatt Macy uint64_t nparity = vdrz->vd_nparity; 2128eda14cbcSMatt Macy int c; 2129eda14cbcSMatt Macy int lasterror = 0; 2130eda14cbcSMatt Macy int numerrors = 0; 2131eda14cbcSMatt Macy 2132eda14cbcSMatt Macy ASSERT(nparity > 0); 2133eda14cbcSMatt Macy 2134eda14cbcSMatt Macy if (nparity > VDEV_RAIDZ_MAXPARITY || 2135eda14cbcSMatt Macy vd->vdev_children < nparity + 1) { 2136eda14cbcSMatt Macy vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 2137eda14cbcSMatt Macy return (SET_ERROR(EINVAL)); 2138eda14cbcSMatt Macy } 2139eda14cbcSMatt Macy 2140eda14cbcSMatt Macy vdev_open_children(vd); 2141eda14cbcSMatt Macy 2142eda14cbcSMatt Macy for (c = 0; c < vd->vdev_children; c++) { 21437877fdebSMatt Macy vdev_t *cvd = vd->vdev_child[c]; 2144eda14cbcSMatt Macy 2145eda14cbcSMatt Macy if (cvd->vdev_open_error != 0) { 2146eda14cbcSMatt Macy lasterror = cvd->vdev_open_error; 2147eda14cbcSMatt Macy numerrors++; 2148eda14cbcSMatt Macy continue; 2149eda14cbcSMatt Macy } 2150eda14cbcSMatt Macy 2151eda14cbcSMatt Macy *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 2152eda14cbcSMatt Macy *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; 2153eda14cbcSMatt Macy *logical_ashift = MAX(*logical_ashift, 
cvd->vdev_ashift); 2154c7046f76SMartin Matuska } 2155c7046f76SMartin Matuska for (c = 0; c < vd->vdev_children; c++) { 2156c7046f76SMartin Matuska vdev_t *cvd = vd->vdev_child[c]; 2157c7046f76SMartin Matuska 2158c7046f76SMartin Matuska if (cvd->vdev_open_error != 0) 2159c7046f76SMartin Matuska continue; 2160c7046f76SMartin Matuska *physical_ashift = vdev_best_ashift(*logical_ashift, 2161c7046f76SMartin Matuska *physical_ashift, cvd->vdev_physical_ashift); 2162eda14cbcSMatt Macy } 2163eda14cbcSMatt Macy 2164*e716630dSMartin Matuska if (vd->vdev_rz_expanding) { 2165*e716630dSMartin Matuska *asize *= vd->vdev_children - 1; 2166*e716630dSMartin Matuska *max_asize *= vd->vdev_children - 1; 2167*e716630dSMartin Matuska 2168*e716630dSMartin Matuska vd->vdev_min_asize = *asize; 2169*e716630dSMartin Matuska } else { 2170eda14cbcSMatt Macy *asize *= vd->vdev_children; 2171eda14cbcSMatt Macy *max_asize *= vd->vdev_children; 2172*e716630dSMartin Matuska } 2173eda14cbcSMatt Macy 2174eda14cbcSMatt Macy if (numerrors > nparity) { 2175eda14cbcSMatt Macy vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 2176eda14cbcSMatt Macy return (lasterror); 2177eda14cbcSMatt Macy } 2178eda14cbcSMatt Macy 2179eda14cbcSMatt Macy return (0); 2180eda14cbcSMatt Macy } 2181eda14cbcSMatt Macy 2182eda14cbcSMatt Macy static void 2183eda14cbcSMatt Macy vdev_raidz_close(vdev_t *vd) 2184eda14cbcSMatt Macy { 21857877fdebSMatt Macy for (int c = 0; c < vd->vdev_children; c++) { 21867877fdebSMatt Macy if (vd->vdev_child[c] != NULL) 2187eda14cbcSMatt Macy vdev_close(vd->vdev_child[c]); 2188eda14cbcSMatt Macy } 21897877fdebSMatt Macy } 2190eda14cbcSMatt Macy 2191*e716630dSMartin Matuska /* 2192*e716630dSMartin Matuska * Return the logical width to use, given the txg in which the allocation 2193*e716630dSMartin Matuska * happened. Note that BP_PHYSICAL_BIRTH() is usually the txg in which the 2194*e716630dSMartin Matuska * BP was allocated. Remapped BP's (that were relocated due to device 2195*e716630dSMartin Matuska * removal, see remap_blkptr_cb()), will have a more recent 2196*e716630dSMartin Matuska * BP_PHYSICAL_BIRTH() which reflects when the BP was relocated, but we can 2197*e716630dSMartin Matuska * ignore these because they can't be on RAIDZ (device removal doesn't 2198*e716630dSMartin Matuska * support RAIDZ). 
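 *
 * Illustrative example (hypothetical values): if vd_original_width
 * is 4 and vd_expand_txgs holds a single entry { re_txg = 1000,
 * re_logical_width = 5 }, a BP born in txg 900 resolves to a
 * logical width of 4 and one born in txg 1500 resolves to 5.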
2199*e716630dSMartin Matuska */ 2200eda14cbcSMatt Macy static uint64_t 2201*e716630dSMartin Matuska vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg) 2202*e716630dSMartin Matuska { 2203*e716630dSMartin Matuska reflow_node_t lookup = { 2204*e716630dSMartin Matuska .re_txg = txg, 2205*e716630dSMartin Matuska }; 2206*e716630dSMartin Matuska avl_index_t where; 2207*e716630dSMartin Matuska 2208*e716630dSMartin Matuska uint64_t width; 2209*e716630dSMartin Matuska mutex_enter(&vdrz->vd_expand_lock); 2210*e716630dSMartin Matuska reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where); 2211*e716630dSMartin Matuska if (re != NULL) { 2212*e716630dSMartin Matuska width = re->re_logical_width; 2213*e716630dSMartin Matuska } else { 2214*e716630dSMartin Matuska re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE); 2215*e716630dSMartin Matuska if (re != NULL) 2216*e716630dSMartin Matuska width = re->re_logical_width; 2217*e716630dSMartin Matuska else 2218*e716630dSMartin Matuska width = vdrz->vd_original_width; 2219*e716630dSMartin Matuska } 2220*e716630dSMartin Matuska mutex_exit(&vdrz->vd_expand_lock); 2221*e716630dSMartin Matuska return (width); 2222*e716630dSMartin Matuska } 2223*e716630dSMartin Matuska 2224*e716630dSMartin Matuska /* 2225*e716630dSMartin Matuska * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated 2226*e716630dSMartin Matuska * more space due to the lower data-to-parity ratio. In this case it's 2227*e716630dSMartin Matuska * important to pass in the correct txg. Note that vdev_gang_header_asize() 2228*e716630dSMartin Matuska * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE, 2229*e716630dSMartin Matuska * regardless of txg. This is assured because for a single data sector, we 2230*e716630dSMartin Matuska * allocate P+1 sectors regardless of width ("cols", which is at least P+1). 
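 *
 * Illustrative arithmetic (hypothetical geometry, not taken from any
 * particular pool): with ashift=12 (4K sectors), nparity=1, and
 * psize=128K, the data needs 32 sectors. At a logical width of 6,
 * parity adds ceil(32 / 5) = 7 sectors, and rounding 39 up to a multiple
 * of nparity+1 gives 40 sectors, i.e. asize=160K. Had the same block
 * been written before an expansion, at a width of 4, parity would add
 * ceil(32 / 3) = 11 sectors and 43 rounds up to 44 sectors, i.e.
 * asize=176K -- which is why the allocation txg (and hence the old
 * width) must be used for old blocks.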
2231*e716630dSMartin Matuska */ 2232*e716630dSMartin Matuska static uint64_t 2233*e716630dSMartin Matuska vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg) 2234eda14cbcSMatt Macy { 22357877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd; 2236eda14cbcSMatt Macy uint64_t asize; 2237eda14cbcSMatt Macy uint64_t ashift = vd->vdev_top->vdev_ashift; 2238*e716630dSMartin Matuska uint64_t cols = vdrz->vd_original_width; 22397877fdebSMatt Macy uint64_t nparity = vdrz->vd_nparity; 2240eda14cbcSMatt Macy 2241*e716630dSMartin Matuska cols = vdev_raidz_get_logical_width(vdrz, txg); 2242*e716630dSMartin Matuska 2243eda14cbcSMatt Macy asize = ((psize - 1) >> ashift) + 1; 2244eda14cbcSMatt Macy asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 2245eda14cbcSMatt Macy asize = roundup(asize, nparity + 1) << ashift; 2246eda14cbcSMatt Macy 2247*e716630dSMartin Matuska #ifdef ZFS_DEBUG 2248*e716630dSMartin Matuska uint64_t asize_new = ((psize - 1) >> ashift) + 1; 2249*e716630dSMartin Matuska uint64_t ncols_new = vdrz->vd_physical_width; 2250*e716630dSMartin Matuska asize_new += nparity * ((asize_new + ncols_new - nparity - 1) / 2251*e716630dSMartin Matuska (ncols_new - nparity)); 2252*e716630dSMartin Matuska asize_new = roundup(asize_new, nparity + 1) << ashift; 2253*e716630dSMartin Matuska VERIFY3U(asize_new, <=, asize); 2254*e716630dSMartin Matuska #endif 2255*e716630dSMartin Matuska 2256eda14cbcSMatt Macy return (asize); 2257eda14cbcSMatt Macy } 2258eda14cbcSMatt Macy 22597877fdebSMatt Macy /* 22607877fdebSMatt Macy * The allocatable space for a raidz vdev is N * sizeof(smallest child) 22617877fdebSMatt Macy * so each child must provide at least 1/Nth of its asize. 22627877fdebSMatt Macy */ 22637877fdebSMatt Macy static uint64_t 22647877fdebSMatt Macy vdev_raidz_min_asize(vdev_t *vd) 22657877fdebSMatt Macy { 22667877fdebSMatt Macy return ((vd->vdev_min_asize + vd->vdev_children - 1) / 22677877fdebSMatt Macy vd->vdev_children); 22687877fdebSMatt Macy } 22697877fdebSMatt Macy 22707877fdebSMatt Macy void 2271eda14cbcSMatt Macy vdev_raidz_child_done(zio_t *zio) 2272eda14cbcSMatt Macy { 2273eda14cbcSMatt Macy raidz_col_t *rc = zio->io_private; 2274eda14cbcSMatt Macy 227581b22a98SMartin Matuska ASSERT3P(rc->rc_abd, !=, NULL); 2276eda14cbcSMatt Macy rc->rc_error = zio->io_error; 2277eda14cbcSMatt Macy rc->rc_tried = 1; 2278eda14cbcSMatt Macy rc->rc_skipped = 0; 2279eda14cbcSMatt Macy } 2280eda14cbcSMatt Macy 2281eda14cbcSMatt Macy static void 2282*e716630dSMartin Matuska vdev_raidz_shadow_child_done(zio_t *zio) 2283eda14cbcSMatt Macy { 2284*e716630dSMartin Matuska raidz_col_t *rc = zio->io_private; 2285eda14cbcSMatt Macy 2286*e716630dSMartin Matuska rc->rc_shadow_error = zio->io_error; 2287*e716630dSMartin Matuska } 2288*e716630dSMartin Matuska 2289*e716630dSMartin Matuska static void 2290*e716630dSMartin Matuska vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col) 2291*e716630dSMartin Matuska { 2292*e716630dSMartin Matuska (void) rm; 2293*e716630dSMartin Matuska #ifdef ZFS_DEBUG 22947877fdebSMatt Macy range_seg64_t logical_rs, physical_rs, remain_rs; 22957877fdebSMatt Macy logical_rs.rs_start = rr->rr_offset; 2296eda14cbcSMatt Macy logical_rs.rs_end = logical_rs.rs_start + 2297*e716630dSMartin Matuska vdev_raidz_asize(zio->io_vd, rr->rr_size, 2298*e716630dSMartin Matuska BP_PHYSICAL_BIRTH(zio->io_bp)); 2299eda14cbcSMatt Macy 23007877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[col]; 2301*e716630dSMartin Matuska vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; 
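	/*
	 * Debug-only sanity check: translate the row's logical extent onto
	 * this child and confirm that the column's recorded offset matches
	 * the start of the translated physical range.
	 */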
2302eda14cbcSMatt Macy 23037877fdebSMatt Macy vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs); 23047877fdebSMatt Macy ASSERT(vdev_xlate_is_empty(&remain_rs)); 2305*e716630dSMartin Matuska if (vdev_xlate_is_empty(&physical_rs)) { 2306*e716630dSMartin Matuska /* 2307*e716630dSMartin Matuska * If we are in the middle of expansion, the 2308*e716630dSMartin Matuska * physical->logical mapping is changing so vdev_xlate() 2309*e716630dSMartin Matuska * can't give us a reliable answer. 2310*e716630dSMartin Matuska */ 2311*e716630dSMartin Matuska return; 2312*e716630dSMartin Matuska } 2313eda14cbcSMatt Macy ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); 2314eda14cbcSMatt Macy ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); 2315eda14cbcSMatt Macy /* 2316eda14cbcSMatt Macy * It would be nice to assert that rs_end is equal 2317eda14cbcSMatt Macy * to rc_offset + rc_size but there might be an 2318eda14cbcSMatt Macy * optional I/O at the end that is not accounted in 2319eda14cbcSMatt Macy * rc_size. 2320eda14cbcSMatt Macy */ 2321eda14cbcSMatt Macy if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { 2322eda14cbcSMatt Macy ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + 2323*e716630dSMartin Matuska rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift)); 2324eda14cbcSMatt Macy } else { 2325eda14cbcSMatt Macy ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); 2326eda14cbcSMatt Macy } 2327eda14cbcSMatt Macy #endif 2328eda14cbcSMatt Macy } 2329eda14cbcSMatt Macy 23307877fdebSMatt Macy static void 2331*e716630dSMartin Matuska vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr) 23327877fdebSMatt Macy { 23337877fdebSMatt Macy vdev_t *vd = zio->io_vd; 23347877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd; 23357877fdebSMatt Macy 23367877fdebSMatt Macy vdev_raidz_generate_parity_row(rm, rr); 23377877fdebSMatt Macy 233881b22a98SMartin Matuska for (int c = 0; c < rr->rr_scols; c++) { 23397877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 234081b22a98SMartin Matuska vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 23417877fdebSMatt Macy 23427877fdebSMatt Macy /* Verify physical to logical translation */ 2343*e716630dSMartin Matuska vdev_raidz_io_verify(zio, rm, rr, c); 23447877fdebSMatt Macy 2345*e716630dSMartin Matuska if (rc->rc_size == 0) 2346*e716630dSMartin Matuska continue; 2347*e716630dSMartin Matuska 2348*e716630dSMartin Matuska ASSERT3U(rc->rc_offset + rc->rc_size, <, 2349*e716630dSMartin Matuska cvd->vdev_psize - VDEV_LABEL_END_SIZE); 2350*e716630dSMartin Matuska 235181b22a98SMartin Matuska ASSERT3P(rc->rc_abd, !=, NULL); 23527877fdebSMatt Macy zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 235381b22a98SMartin Matuska rc->rc_offset, rc->rc_abd, 235481b22a98SMartin Matuska abd_get_size(rc->rc_abd), zio->io_type, 235581b22a98SMartin Matuska zio->io_priority, 0, vdev_raidz_child_done, rc)); 2356*e716630dSMartin Matuska 2357*e716630dSMartin Matuska if (rc->rc_shadow_devidx != INT_MAX) { 2358*e716630dSMartin Matuska vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx]; 2359*e716630dSMartin Matuska 2360*e716630dSMartin Matuska ASSERT3U( 2361*e716630dSMartin Matuska rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <, 2362*e716630dSMartin Matuska cvd2->vdev_psize - VDEV_LABEL_END_SIZE); 2363*e716630dSMartin Matuska 2364*e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(zio, NULL, cvd2, 2365*e716630dSMartin Matuska rc->rc_shadow_offset, rc->rc_abd, 2366*e716630dSMartin Matuska abd_get_size(rc->rc_abd), 2367*e716630dSMartin Matuska zio->io_type, zio->io_priority, 0, 2368*e716630dSMartin 
Matuska vdev_raidz_shadow_child_done, rc)); 236981b22a98SMartin Matuska } 23707877fdebSMatt Macy } 23717877fdebSMatt Macy } 23727877fdebSMatt Macy 2373*e716630dSMartin Matuska /* 2374*e716630dSMartin Matuska * Generate optional I/Os for skip sectors to improve aggregation contiguity. 2375*e716630dSMartin Matuska * This only works for vdev_raidz_map_alloc() (not _expanded()). 2376*e716630dSMartin Matuska */ 23777877fdebSMatt Macy static void 2378*e716630dSMartin Matuska raidz_start_skip_writes(zio_t *zio) 2379*e716630dSMartin Matuska { 2380*e716630dSMartin Matuska vdev_t *vd = zio->io_vd; 2381*e716630dSMartin Matuska uint64_t ashift = vd->vdev_top->vdev_ashift; 2382*e716630dSMartin Matuska raidz_map_t *rm = zio->io_vsd; 2383*e716630dSMartin Matuska ASSERT3U(rm->rm_nrows, ==, 1); 2384*e716630dSMartin Matuska raidz_row_t *rr = rm->rm_row[0]; 2385*e716630dSMartin Matuska for (int c = 0; c < rr->rr_scols; c++) { 2386*e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c]; 2387*e716630dSMartin Matuska vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 2388*e716630dSMartin Matuska if (rc->rc_size != 0) 2389*e716630dSMartin Matuska continue; 2390*e716630dSMartin Matuska ASSERT3P(rc->rc_abd, ==, NULL); 2391*e716630dSMartin Matuska 2392*e716630dSMartin Matuska ASSERT3U(rc->rc_offset, <, 2393*e716630dSMartin Matuska cvd->vdev_psize - VDEV_LABEL_END_SIZE); 2394*e716630dSMartin Matuska 2395*e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, 2396*e716630dSMartin Matuska NULL, 1ULL << ashift, zio->io_type, zio->io_priority, 2397*e716630dSMartin Matuska ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); 2398*e716630dSMartin Matuska } 2399*e716630dSMartin Matuska } 2400*e716630dSMartin Matuska 2401*e716630dSMartin Matuska static void 2402*e716630dSMartin Matuska vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) 24037877fdebSMatt Macy { 24047877fdebSMatt Macy vdev_t *vd = zio->io_vd; 24057877fdebSMatt Macy 24067877fdebSMatt Macy /* 24077877fdebSMatt Macy * Iterate over the columns in reverse order so that we hit the parity 24087877fdebSMatt Macy * last -- any errors along the way will force us to read the parity. 
24097877fdebSMatt Macy */ 24107877fdebSMatt Macy for (int c = rr->rr_cols - 1; c >= 0; c--) { 24117877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 24127877fdebSMatt Macy if (rc->rc_size == 0) 24137877fdebSMatt Macy continue; 24147877fdebSMatt Macy vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 24157877fdebSMatt Macy if (!vdev_readable(cvd)) { 24167877fdebSMatt Macy if (c >= rr->rr_firstdatacol) 24177877fdebSMatt Macy rr->rr_missingdata++; 24187877fdebSMatt Macy else 24197877fdebSMatt Macy rr->rr_missingparity++; 24207877fdebSMatt Macy rc->rc_error = SET_ERROR(ENXIO); 24217877fdebSMatt Macy rc->rc_tried = 1; /* don't even try */ 24227877fdebSMatt Macy rc->rc_skipped = 1; 24237877fdebSMatt Macy continue; 24247877fdebSMatt Macy } 24257877fdebSMatt Macy if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 24267877fdebSMatt Macy if (c >= rr->rr_firstdatacol) 24277877fdebSMatt Macy rr->rr_missingdata++; 24287877fdebSMatt Macy else 24297877fdebSMatt Macy rr->rr_missingparity++; 24307877fdebSMatt Macy rc->rc_error = SET_ERROR(ESTALE); 24317877fdebSMatt Macy rc->rc_skipped = 1; 24327877fdebSMatt Macy continue; 24337877fdebSMatt Macy } 2434*e716630dSMartin Matuska if (forceparity || 2435*e716630dSMartin Matuska c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || 24367877fdebSMatt Macy (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { 24377877fdebSMatt Macy zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 24387877fdebSMatt Macy rc->rc_offset, rc->rc_abd, rc->rc_size, 24397877fdebSMatt Macy zio->io_type, zio->io_priority, 0, 24407877fdebSMatt Macy vdev_raidz_child_done, rc)); 24417877fdebSMatt Macy } 24427877fdebSMatt Macy } 24437877fdebSMatt Macy } 24447877fdebSMatt Macy 2445*e716630dSMartin Matuska static void 2446*e716630dSMartin Matuska vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm) 2447*e716630dSMartin Matuska { 2448*e716630dSMartin Matuska vdev_t *vd = zio->io_vd; 2449*e716630dSMartin Matuska 2450*e716630dSMartin Matuska for (int i = 0; i < rm->rm_nphys_cols; i++) { 2451*e716630dSMartin Matuska raidz_col_t *prc = &rm->rm_phys_col[i]; 2452*e716630dSMartin Matuska if (prc->rc_size == 0) 2453*e716630dSMartin Matuska continue; 2454*e716630dSMartin Matuska 2455*e716630dSMartin Matuska ASSERT3U(prc->rc_devidx, ==, i); 2456*e716630dSMartin Matuska vdev_t *cvd = vd->vdev_child[i]; 2457*e716630dSMartin Matuska if (!vdev_readable(cvd)) { 2458*e716630dSMartin Matuska prc->rc_error = SET_ERROR(ENXIO); 2459*e716630dSMartin Matuska prc->rc_tried = 1; /* don't even try */ 2460*e716630dSMartin Matuska prc->rc_skipped = 1; 2461*e716630dSMartin Matuska continue; 2462*e716630dSMartin Matuska } 2463*e716630dSMartin Matuska if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 2464*e716630dSMartin Matuska prc->rc_error = SET_ERROR(ESTALE); 2465*e716630dSMartin Matuska prc->rc_skipped = 1; 2466*e716630dSMartin Matuska continue; 2467*e716630dSMartin Matuska } 2468*e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2469*e716630dSMartin Matuska prc->rc_offset, prc->rc_abd, prc->rc_size, 2470*e716630dSMartin Matuska zio->io_type, zio->io_priority, 0, 2471*e716630dSMartin Matuska vdev_raidz_child_done, prc)); 2472*e716630dSMartin Matuska } 2473*e716630dSMartin Matuska } 2474*e716630dSMartin Matuska 2475*e716630dSMartin Matuska static void 2476*e716630dSMartin Matuska vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm) 2477*e716630dSMartin Matuska { 2478*e716630dSMartin Matuska /* 2479*e716630dSMartin Matuska * If there are multiple rows, we will be hitting 
2480*e716630dSMartin Matuska * all disks, so go ahead and read the parity so 2481*e716630dSMartin Matuska * that we are reading in decent size chunks. 2482*e716630dSMartin Matuska */ 2483*e716630dSMartin Matuska boolean_t forceparity = rm->rm_nrows > 1; 2484*e716630dSMartin Matuska 2485*e716630dSMartin Matuska if (rm->rm_phys_col) { 2486*e716630dSMartin Matuska vdev_raidz_io_start_read_phys_cols(zio, rm); 2487*e716630dSMartin Matuska } else { 2488*e716630dSMartin Matuska for (int i = 0; i < rm->rm_nrows; i++) { 2489*e716630dSMartin Matuska raidz_row_t *rr = rm->rm_row[i]; 2490*e716630dSMartin Matuska vdev_raidz_io_start_read_row(zio, rr, forceparity); 2491*e716630dSMartin Matuska } 2492*e716630dSMartin Matuska } 2493*e716630dSMartin Matuska } 2494*e716630dSMartin Matuska 2495eda14cbcSMatt Macy /* 2496eda14cbcSMatt Macy * Start an IO operation on a RAIDZ VDev 2497eda14cbcSMatt Macy * 2498eda14cbcSMatt Macy * Outline: 2499eda14cbcSMatt Macy * - For write operations: 2500eda14cbcSMatt Macy * 1. Generate the parity data 2501eda14cbcSMatt Macy * 2. Create child zio write operations to each column's vdev, for both 2502eda14cbcSMatt Macy * data and parity. 2503eda14cbcSMatt Macy * 3. If the column skips any sectors for padding, create optional dummy 2504eda14cbcSMatt Macy * write zio children for those areas to improve aggregation continuity. 2505eda14cbcSMatt Macy * - For read operations: 2506eda14cbcSMatt Macy * 1. Create child zio read operations to each data column's vdev to read 2507eda14cbcSMatt Macy * the range of data required for zio. 2508eda14cbcSMatt Macy * 2. If this is a scrub or resilver operation, or if any of the data 2509eda14cbcSMatt Macy * vdevs have had errors, then create zio read operations to the parity 2510eda14cbcSMatt Macy * columns' VDevs as well. 2511eda14cbcSMatt Macy */ 2512eda14cbcSMatt Macy static void 2513eda14cbcSMatt Macy vdev_raidz_io_start(zio_t *zio) 2514eda14cbcSMatt Macy { 2515eda14cbcSMatt Macy vdev_t *vd = zio->io_vd; 2516eda14cbcSMatt Macy vdev_t *tvd = vd->vdev_top; 25177877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd; 2518*e716630dSMartin Matuska raidz_map_t *rm; 2519eda14cbcSMatt Macy 2520*e716630dSMartin Matuska uint64_t logical_width = vdev_raidz_get_logical_width(vdrz, 2521*e716630dSMartin Matuska BP_PHYSICAL_BIRTH(zio->io_bp)); 2522*e716630dSMartin Matuska if (logical_width != vdrz->vd_physical_width) { 2523*e716630dSMartin Matuska zfs_locked_range_t *lr = NULL; 2524*e716630dSMartin Matuska uint64_t synced_offset = UINT64_MAX; 2525*e716630dSMartin Matuska uint64_t next_offset = UINT64_MAX; 2526*e716630dSMartin Matuska boolean_t use_scratch = B_FALSE; 2527*e716630dSMartin Matuska /* 2528*e716630dSMartin Matuska * Note: when the expansion is completing, we set 2529*e716630dSMartin Matuska * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync()) 2530*e716630dSMartin Matuska * in a later txg than when we last update spa_ubsync's state 2531*e716630dSMartin Matuska * (see the end of spa_raidz_expand_thread()). Therefore we 2532*e716630dSMartin Matuska * may see vre_state!=SCANNING before 2533*e716630dSMartin Matuska * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected 2534*e716630dSMartin Matuska * on disk, but the copying progress has been synced to disk 2535*e716630dSMartin Matuska * (and reflected in spa_ubsync). In this case it's fine to 2536*e716630dSMartin Matuska * treat the expansion as completed, since if we crash there's 2537*e716630dSMartin Matuska * no additional copying to do. 
2538*e716630dSMartin Matuska */ 2539*e716630dSMartin Matuska if (vdrz->vn_vre.vre_state == DSS_SCANNING) { 2540*e716630dSMartin Matuska ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==, 2541*e716630dSMartin Matuska &vdrz->vn_vre); 2542*e716630dSMartin Matuska lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock, 2543*e716630dSMartin Matuska zio->io_offset, zio->io_size, RL_READER); 2544*e716630dSMartin Matuska use_scratch = 2545*e716630dSMartin Matuska (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) == 2546*e716630dSMartin Matuska RRSS_SCRATCH_VALID); 2547*e716630dSMartin Matuska synced_offset = 2548*e716630dSMartin Matuska RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync); 2549*e716630dSMartin Matuska next_offset = vdrz->vn_vre.vre_offset; 2550*e716630dSMartin Matuska /* 2551*e716630dSMartin Matuska * If we haven't resumed expanding since importing the 2552*e716630dSMartin Matuska * pool, vre_offset won't have been set yet. In 2553*e716630dSMartin Matuska * this case the next offset to be copied is the same 2554*e716630dSMartin Matuska * as what was synced. 2555*e716630dSMartin Matuska */ 2556*e716630dSMartin Matuska if (next_offset == UINT64_MAX) { 2557*e716630dSMartin Matuska next_offset = synced_offset; 2558*e716630dSMartin Matuska } 2559*e716630dSMartin Matuska } 2560*e716630dSMartin Matuska if (use_scratch) { 2561*e716630dSMartin Matuska zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced=" 2562*e716630dSMartin Matuska "%lld next_offset=%lld use_scratch=%u", 2563*e716630dSMartin Matuska zio, 2564*e716630dSMartin Matuska zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ", 2565*e716630dSMartin Matuska (long long)zio->io_offset, 2566*e716630dSMartin Matuska (long long)synced_offset, 2567*e716630dSMartin Matuska (long long)next_offset, 2568*e716630dSMartin Matuska use_scratch); 2569*e716630dSMartin Matuska } 2570*e716630dSMartin Matuska 2571*e716630dSMartin Matuska rm = vdev_raidz_map_alloc_expanded(zio, 2572*e716630dSMartin Matuska tvd->vdev_ashift, vdrz->vd_physical_width, 2573*e716630dSMartin Matuska logical_width, vdrz->vd_nparity, 2574*e716630dSMartin Matuska synced_offset, next_offset, use_scratch); 2575*e716630dSMartin Matuska rm->rm_lr = lr; 2576*e716630dSMartin Matuska } else { 2577*e716630dSMartin Matuska rm = vdev_raidz_map_alloc(zio, 2578*e716630dSMartin Matuska tvd->vdev_ashift, logical_width, vdrz->vd_nparity); 2579*e716630dSMartin Matuska } 2580*e716630dSMartin Matuska rm->rm_original_width = vdrz->vd_original_width; 2581*e716630dSMartin Matuska 2582f9693befSMartin Matuska zio->io_vsd = rm; 2583f9693befSMartin Matuska zio->io_vsd_ops = &vdev_raidz_vsd_ops; 2584eda14cbcSMatt Macy if (zio->io_type == ZIO_TYPE_WRITE) { 2585*e716630dSMartin Matuska for (int i = 0; i < rm->rm_nrows; i++) { 2586*e716630dSMartin Matuska vdev_raidz_io_start_write(zio, rm->rm_row[i]); 2587*e716630dSMartin Matuska } 2588*e716630dSMartin Matuska 2589*e716630dSMartin Matuska if (logical_width == vdrz->vd_physical_width) { 2590*e716630dSMartin Matuska raidz_start_skip_writes(zio); 2591*e716630dSMartin Matuska } 25927877fdebSMatt Macy } else { 2593eda14cbcSMatt Macy ASSERT(zio->io_type == ZIO_TYPE_READ); 2594*e716630dSMartin Matuska vdev_raidz_io_start_read(zio, rm); 2595eda14cbcSMatt Macy } 2596eda14cbcSMatt Macy 2597eda14cbcSMatt Macy zio_execute(zio); 2598eda14cbcSMatt Macy } 2599eda14cbcSMatt Macy 2600eda14cbcSMatt Macy /* 2601eda14cbcSMatt Macy * Report a checksum error for a child of a RAID-Z device. 
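 * The per-vdev checksum error count and the ereport are only posted for
 * i/os that are neither speculative nor issued at rebuild priority.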
2602eda14cbcSMatt Macy */ 2603e92ffd9bSMartin Matuska void 2604e92ffd9bSMartin Matuska vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) 2605eda14cbcSMatt Macy { 2606eda14cbcSMatt Macy vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 2607eda14cbcSMatt Macy 26087877fdebSMatt Macy if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) && 26097877fdebSMatt Macy zio->io_priority != ZIO_PRIORITY_REBUILD) { 2610eda14cbcSMatt Macy zio_bad_cksum_t zbc; 2611eda14cbcSMatt Macy raidz_map_t *rm = zio->io_vsd; 2612eda14cbcSMatt Macy 2613eda14cbcSMatt Macy zbc.zbc_has_cksum = 0; 2614eda14cbcSMatt Macy zbc.zbc_injected = rm->rm_ecksuminjected; 2615eda14cbcSMatt Macy 26162c48331dSMatt Macy mutex_enter(&vd->vdev_stat_lock); 26172c48331dSMatt Macy vd->vdev_stat.vs_checksum_errors++; 26182c48331dSMatt Macy mutex_exit(&vd->vdev_stat_lock); 2619bb2d13b6SMartin Matuska (void) zfs_ereport_post_checksum(zio->io_spa, vd, 2620bb2d13b6SMartin Matuska &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, 2621bb2d13b6SMartin Matuska rc->rc_abd, bad_data, &zbc); 26222c48331dSMatt Macy } 2623eda14cbcSMatt Macy } 2624eda14cbcSMatt Macy 2625eda14cbcSMatt Macy /* 2626eda14cbcSMatt Macy * We keep track of whether or not there were any injected errors, so that 2627eda14cbcSMatt Macy * any ereports we generate can note it. 2628eda14cbcSMatt Macy */ 2629eda14cbcSMatt Macy static int 2630eda14cbcSMatt Macy raidz_checksum_verify(zio_t *zio) 2631eda14cbcSMatt Macy { 2632315ee00fSMartin Matuska zio_bad_cksum_t zbc = {0}; 2633eda14cbcSMatt Macy raidz_map_t *rm = zio->io_vsd; 2634eda14cbcSMatt Macy 2635eda14cbcSMatt Macy int ret = zio_checksum_error(zio, &zbc); 2636eda14cbcSMatt Macy if (ret != 0 && zbc.zbc_injected != 0) 2637eda14cbcSMatt Macy rm->rm_ecksuminjected = 1; 2638eda14cbcSMatt Macy 2639eda14cbcSMatt Macy return (ret); 2640eda14cbcSMatt Macy } 2641eda14cbcSMatt Macy 2642eda14cbcSMatt Macy /* 2643eda14cbcSMatt Macy * Generate the parity from the data columns. If we tried and were able to 2644eda14cbcSMatt Macy * read the parity without error, verify that the generated parity matches the 2645eda14cbcSMatt Macy * data we read. If it doesn't, we fire off a checksum error. Return the 26467877fdebSMatt Macy * number of such failures. 2647eda14cbcSMatt Macy */ 2648eda14cbcSMatt Macy static int 26497877fdebSMatt Macy raidz_parity_verify(zio_t *zio, raidz_row_t *rr) 2650eda14cbcSMatt Macy { 2651eda14cbcSMatt Macy abd_t *orig[VDEV_RAIDZ_MAXPARITY]; 2652eda14cbcSMatt Macy int c, ret = 0; 26537877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd; 2654eda14cbcSMatt Macy raidz_col_t *rc; 2655eda14cbcSMatt Macy 2656eda14cbcSMatt Macy blkptr_t *bp = zio->io_bp; 2657eda14cbcSMatt Macy enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : 2658eda14cbcSMatt Macy (BP_IS_GANG(bp) ? 
ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); 2659eda14cbcSMatt Macy 2660eda14cbcSMatt Macy if (checksum == ZIO_CHECKSUM_NOPARITY) 2661eda14cbcSMatt Macy return (ret); 2662eda14cbcSMatt Macy 26637877fdebSMatt Macy for (c = 0; c < rr->rr_firstdatacol; c++) { 26647877fdebSMatt Macy rc = &rr->rr_col[c]; 2665eda14cbcSMatt Macy if (!rc->rc_tried || rc->rc_error != 0) 2666eda14cbcSMatt Macy continue; 2667eda14cbcSMatt Macy 2668a0b956f5SMartin Matuska orig[c] = rc->rc_abd; 2669a0b956f5SMartin Matuska ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size); 2670a0b956f5SMartin Matuska rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); 2671eda14cbcSMatt Macy } 2672eda14cbcSMatt Macy 26737877fdebSMatt Macy /* 2674e92ffd9bSMartin Matuska * Verify any empty sectors are zero filled to ensure the parity 2675e92ffd9bSMartin Matuska * is calculated correctly even if these non-data sectors are damaged. 2676e92ffd9bSMartin Matuska */ 2677e92ffd9bSMartin Matuska if (rr->rr_nempty && rr->rr_abd_empty != NULL) 2678e92ffd9bSMartin Matuska ret += vdev_draid_map_verify_empty(zio, rr); 2679e92ffd9bSMartin Matuska 2680e92ffd9bSMartin Matuska /* 26817877fdebSMatt Macy * Regenerates parity even for !tried||rc_error!=0 columns. This 26827877fdebSMatt Macy * isn't harmful but it does have the side effect of fixing stuff 26837877fdebSMatt Macy * we didn't realize was necessary (i.e. even if we return 0). 26847877fdebSMatt Macy */ 26857877fdebSMatt Macy vdev_raidz_generate_parity_row(rm, rr); 2686eda14cbcSMatt Macy 26877877fdebSMatt Macy for (c = 0; c < rr->rr_firstdatacol; c++) { 26887877fdebSMatt Macy rc = &rr->rr_col[c]; 26897877fdebSMatt Macy 2690eda14cbcSMatt Macy if (!rc->rc_tried || rc->rc_error != 0) 2691eda14cbcSMatt Macy continue; 26927877fdebSMatt Macy 2693eda14cbcSMatt Macy if (abd_cmp(orig[c], rc->rc_abd) != 0) { 2694*e716630dSMartin Matuska zfs_dbgmsg("found error on col=%u devidx=%u off %llx", 2695*e716630dSMartin Matuska c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset); 2696e92ffd9bSMartin Matuska vdev_raidz_checksum_error(zio, rc, orig[c]); 2697eda14cbcSMatt Macy rc->rc_error = SET_ERROR(ECKSUM); 2698eda14cbcSMatt Macy ret++; 2699eda14cbcSMatt Macy } 2700eda14cbcSMatt Macy abd_free(orig[c]); 2701eda14cbcSMatt Macy } 2702eda14cbcSMatt Macy 2703eda14cbcSMatt Macy return (ret); 2704eda14cbcSMatt Macy } 2705eda14cbcSMatt Macy 2706eda14cbcSMatt Macy static int 27077877fdebSMatt Macy vdev_raidz_worst_error(raidz_row_t *rr) 2708eda14cbcSMatt Macy { 2709eda14cbcSMatt Macy int error = 0; 2710eda14cbcSMatt Macy 2711*e716630dSMartin Matuska for (int c = 0; c < rr->rr_cols; c++) { 27127877fdebSMatt Macy error = zio_worst_error(error, rr->rr_col[c].rc_error); 2713*e716630dSMartin Matuska error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error); 2714*e716630dSMartin Matuska } 2715eda14cbcSMatt Macy 2716eda14cbcSMatt Macy return (error); 2717eda14cbcSMatt Macy } 2718eda14cbcSMatt Macy 2719eda14cbcSMatt Macy static void 27207877fdebSMatt Macy vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) 2721eda14cbcSMatt Macy { 2722eda14cbcSMatt Macy int unexpected_errors = 0; 2723eda14cbcSMatt Macy int parity_errors = 0; 2724eda14cbcSMatt Macy int parity_untried = 0; 2725eda14cbcSMatt Macy int data_errors = 0; 2726eda14cbcSMatt Macy 27277877fdebSMatt Macy ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); 2728eda14cbcSMatt Macy 27297877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 27307877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 2731eda14cbcSMatt Macy 2732eda14cbcSMatt Macy if (rc->rc_error) { 
27337877fdebSMatt Macy if (c < rr->rr_firstdatacol) 2734eda14cbcSMatt Macy parity_errors++; 2735eda14cbcSMatt Macy else 2736eda14cbcSMatt Macy data_errors++; 2737eda14cbcSMatt Macy 2738eda14cbcSMatt Macy if (!rc->rc_skipped) 2739eda14cbcSMatt Macy unexpected_errors++; 27407877fdebSMatt Macy } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { 2741eda14cbcSMatt Macy parity_untried++; 2742eda14cbcSMatt Macy } 2743a0b956f5SMartin Matuska 2744a0b956f5SMartin Matuska if (rc->rc_force_repair) 2745a0b956f5SMartin Matuska unexpected_errors++; 2746eda14cbcSMatt Macy } 2747eda14cbcSMatt Macy 2748eda14cbcSMatt Macy /* 27497877fdebSMatt Macy * If we read more parity disks than were used for 27507877fdebSMatt Macy * reconstruction, confirm that the other parity disks produced 27517877fdebSMatt Macy * correct data. 27527877fdebSMatt Macy * 27537877fdebSMatt Macy * Note that we also regenerate parity when resilvering so we 27547877fdebSMatt Macy * can write it out to failed devices later. 27557877fdebSMatt Macy */ 27567877fdebSMatt Macy if (parity_errors + parity_untried < 27577877fdebSMatt Macy rr->rr_firstdatacol - data_errors || 27587877fdebSMatt Macy (zio->io_flags & ZIO_FLAG_RESILVER)) { 27597877fdebSMatt Macy int n = raidz_parity_verify(zio, rr); 27607877fdebSMatt Macy unexpected_errors += n; 27617877fdebSMatt Macy } 27627877fdebSMatt Macy 27637877fdebSMatt Macy if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 27647877fdebSMatt Macy (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) { 27657877fdebSMatt Macy /* 27667877fdebSMatt Macy * Use the good data we have in hand to repair damaged children. 27677877fdebSMatt Macy */ 27687877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 27697877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 27707877fdebSMatt Macy vdev_t *vd = zio->io_vd; 27717877fdebSMatt Macy vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 27727877fdebSMatt Macy 277316038816SMartin Matuska if (!rc->rc_allow_repair) { 277416038816SMartin Matuska continue; 277516038816SMartin Matuska } else if (!rc->rc_force_repair && 277616038816SMartin Matuska (rc->rc_error == 0 || rc->rc_size == 0)) { 27777877fdebSMatt Macy continue; 27787877fdebSMatt Macy } 27797877fdebSMatt Macy 2780*e716630dSMartin Matuska zfs_dbgmsg("zio=%px repairing c=%u devidx=%u " 2781*e716630dSMartin Matuska "offset=%llx", 2782*e716630dSMartin Matuska zio, c, rc->rc_devidx, (long long)rc->rc_offset); 2783*e716630dSMartin Matuska 27847877fdebSMatt Macy zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 27857877fdebSMatt Macy rc->rc_offset, rc->rc_abd, rc->rc_size, 27867877fdebSMatt Macy ZIO_TYPE_WRITE, 27877877fdebSMatt Macy zio->io_priority == ZIO_PRIORITY_REBUILD ? 27887877fdebSMatt Macy ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE, 27897877fdebSMatt Macy ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 27907877fdebSMatt Macy ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); 27917877fdebSMatt Macy } 27927877fdebSMatt Macy } 2793*e716630dSMartin Matuska 2794*e716630dSMartin Matuska /* 2795*e716630dSMartin Matuska * Scrub or resilver i/o's: overwrite any shadow locations with the 2796*e716630dSMartin Matuska * good data. This ensures that if we've already copied this sector, 2797*e716630dSMartin Matuska * it will be corrected if it was damaged. This writes more than is 2798*e716630dSMartin Matuska * necessary, but since expansion is paused during scrub/resilver, at 2799*e716630dSMartin Matuska * most a single row will have a shadow location. 
2800*e716630dSMartin Matuska */ 2801*e716630dSMartin Matuska if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 2802*e716630dSMartin Matuska (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) { 2803*e716630dSMartin Matuska for (int c = 0; c < rr->rr_cols; c++) { 2804*e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c]; 2805*e716630dSMartin Matuska vdev_t *vd = zio->io_vd; 2806*e716630dSMartin Matuska 2807*e716630dSMartin Matuska if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0) 2808*e716630dSMartin Matuska continue; 2809*e716630dSMartin Matuska vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx]; 2810*e716630dSMartin Matuska 2811*e716630dSMartin Matuska /* 2812*e716630dSMartin Matuska * Note: We don't want to update the repair stats 2813*e716630dSMartin Matuska * because that would incorrectly indicate that there 2814*e716630dSMartin Matuska * was bad data to repair, which we aren't sure about. 2815*e716630dSMartin Matuska * By clearing the SCAN_THREAD flag, we prevent this 2816*e716630dSMartin Matuska * from happening, despite having the REPAIR flag set. 2817*e716630dSMartin Matuska * We need to set SELF_HEAL so that this i/o can't be 2818*e716630dSMartin Matuska * bypassed by zio_vdev_io_start(). 2819*e716630dSMartin Matuska */ 2820*e716630dSMartin Matuska zio_t *cio = zio_vdev_child_io(zio, NULL, cvd, 2821*e716630dSMartin Matuska rc->rc_shadow_offset, rc->rc_abd, rc->rc_size, 2822*e716630dSMartin Matuska ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, 2823*e716630dSMartin Matuska ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, 2824*e716630dSMartin Matuska NULL, NULL); 2825*e716630dSMartin Matuska cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD; 2826*e716630dSMartin Matuska zio_nowait(cio); 2827*e716630dSMartin Matuska } 2828*e716630dSMartin Matuska } 28297877fdebSMatt Macy } 28307877fdebSMatt Macy 28317877fdebSMatt Macy static void 28327877fdebSMatt Macy raidz_restore_orig_data(raidz_map_t *rm) 28337877fdebSMatt Macy { 28347877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 28357877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i]; 28367877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 28377877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 28387877fdebSMatt Macy if (rc->rc_need_orig_restore) { 2839f9693befSMartin Matuska abd_copy(rc->rc_abd, 28407877fdebSMatt Macy rc->rc_orig_data, rc->rc_size); 28417877fdebSMatt Macy rc->rc_need_orig_restore = B_FALSE; 28427877fdebSMatt Macy } 28437877fdebSMatt Macy } 28447877fdebSMatt Macy } 28457877fdebSMatt Macy } 28467877fdebSMatt Macy 28477877fdebSMatt Macy /* 2848*e716630dSMartin Matuska * During raidz_reconstruct() for expanded VDEV, we need special consideration 2849*e716630dSMartin Matuska * failure simulations. See note in raidz_reconstruct() on simulating failure 2850*e716630dSMartin Matuska * of a pre-expansion device. 2851*e716630dSMartin Matuska * 2852*e716630dSMartin Matuska * Treating logical child i as failed, return TRUE if the given column should 2853*e716630dSMartin Matuska * be treated as failed. The idea of logical children allows us to imagine 2854*e716630dSMartin Matuska * that a disk silently failed before a RAIDZ expansion (reads from this disk 2855*e716630dSMartin Matuska * succeed but return the wrong data). Since the expansion doesn't verify 2856*e716630dSMartin Matuska * checksums, the incorrect data will be moved to new locations spread among 2857*e716630dSMartin Matuska * the children (going diagonally across them). 
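 *
 * As a concrete sketch of the numbering described below (hypothetical
 * widths): with physical_width = 5 and original_width = 4 there are
 * 5 + 4 = 9 logical children. Ids 0-4 simulate failure of a current
 * child (a column fails iff its devidx equals the id), while ids 5-8
 * simulate a child that failed before the expansion: a column whose
 * sector_id = 5 * (offset >> ashift) + devidx is treated as failed
 * when sector_id % 4 == (id - 5).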
2858*e716630dSMartin Matuska * 2859*e716630dSMartin Matuska * Higher "logical child failures" (values of `i`) indicate these 2860*e716630dSMartin Matuska * "pre-expansion failures". The first physical_width values imagine that a 2861*e716630dSMartin Matuska * current child failed; the next physical_width-1 values imagine that a 2862*e716630dSMartin Matuska * child failed before the most recent expansion; the next physical_width-2 2863*e716630dSMartin Matuska * values imagine a child failed in the expansion before that, etc. 2864*e716630dSMartin Matuska */ 2865*e716630dSMartin Matuska static boolean_t 2866*e716630dSMartin Matuska raidz_simulate_failure(int physical_width, int original_width, int ashift, 2867*e716630dSMartin Matuska int i, raidz_col_t *rc) 2868*e716630dSMartin Matuska { 2869*e716630dSMartin Matuska uint64_t sector_id = 2870*e716630dSMartin Matuska physical_width * (rc->rc_offset >> ashift) + 2871*e716630dSMartin Matuska rc->rc_devidx; 2872*e716630dSMartin Matuska 2873*e716630dSMartin Matuska for (int w = physical_width; w >= original_width; w--) { 2874*e716630dSMartin Matuska if (i < w) { 2875*e716630dSMartin Matuska return (sector_id % w == i); 2876*e716630dSMartin Matuska } else { 2877*e716630dSMartin Matuska i -= w; 2878*e716630dSMartin Matuska } 2879*e716630dSMartin Matuska } 2880*e716630dSMartin Matuska ASSERT(!"invalid logical child id"); 2881*e716630dSMartin Matuska return (B_FALSE); 2882*e716630dSMartin Matuska } 2883*e716630dSMartin Matuska 2884*e716630dSMartin Matuska /* 28857877fdebSMatt Macy * returns EINVAL if reconstruction of the block will not be possible 28867877fdebSMatt Macy * returns ECKSUM if this specific reconstruction failed 28877877fdebSMatt Macy * returns 0 on successful reconstruction 28887877fdebSMatt Macy */ 28897877fdebSMatt Macy static int 28907877fdebSMatt Macy raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) 28917877fdebSMatt Macy { 28927877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd; 2893*e716630dSMartin Matuska int physical_width = zio->io_vd->vdev_children; 2894*e716630dSMartin Matuska int original_width = (rm->rm_original_width != 0) ? 
2895*e716630dSMartin Matuska rm->rm_original_width : physical_width; 2896*e716630dSMartin Matuska int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT; 2897*e716630dSMartin Matuska 2898*e716630dSMartin Matuska if (dbgmsg) { 2899*e716630dSMartin Matuska zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u " 2900*e716630dSMartin Matuska "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts); 2901*e716630dSMartin Matuska } 29027877fdebSMatt Macy 29037877fdebSMatt Macy /* Reconstruct each row */ 29047877fdebSMatt Macy for (int r = 0; r < rm->rm_nrows; r++) { 29057877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[r]; 29067877fdebSMatt Macy int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */ 29077877fdebSMatt Macy int t = 0; 29087877fdebSMatt Macy int dead = 0; 29097877fdebSMatt Macy int dead_data = 0; 29107877fdebSMatt Macy 2911*e716630dSMartin Matuska if (dbgmsg) 2912*e716630dSMartin Matuska zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r); 2913*e716630dSMartin Matuska 29147877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 29157877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 29167877fdebSMatt Macy ASSERT0(rc->rc_need_orig_restore); 29177877fdebSMatt Macy if (rc->rc_error != 0) { 29187877fdebSMatt Macy dead++; 29197877fdebSMatt Macy if (c >= nparity) 29207877fdebSMatt Macy dead_data++; 29217877fdebSMatt Macy continue; 29227877fdebSMatt Macy } 29237877fdebSMatt Macy if (rc->rc_size == 0) 29247877fdebSMatt Macy continue; 29257877fdebSMatt Macy for (int lt = 0; lt < ntgts; lt++) { 2926*e716630dSMartin Matuska if (raidz_simulate_failure(physical_width, 2927*e716630dSMartin Matuska original_width, 2928*e716630dSMartin Matuska zio->io_vd->vdev_top->vdev_ashift, 2929*e716630dSMartin Matuska ltgts[lt], rc)) { 29307877fdebSMatt Macy if (rc->rc_orig_data == NULL) { 29317877fdebSMatt Macy rc->rc_orig_data = 2932f9693befSMartin Matuska abd_alloc_linear( 2933f9693befSMartin Matuska rc->rc_size, B_TRUE); 2934f9693befSMartin Matuska abd_copy(rc->rc_orig_data, 29357877fdebSMatt Macy rc->rc_abd, rc->rc_size); 29367877fdebSMatt Macy } 29377877fdebSMatt Macy rc->rc_need_orig_restore = B_TRUE; 29387877fdebSMatt Macy 29397877fdebSMatt Macy dead++; 29407877fdebSMatt Macy if (c >= nparity) 29417877fdebSMatt Macy dead_data++; 2942*e716630dSMartin Matuska /* 2943*e716630dSMartin Matuska * Note: simulating failure of a 2944*e716630dSMartin Matuska * pre-expansion device can hit more 2945*e716630dSMartin Matuska * than one column, in which case we 2946*e716630dSMartin Matuska * might try to simulate more failures 2947*e716630dSMartin Matuska * than can be reconstructed, which is 2948*e716630dSMartin Matuska * also more than the size of my_tgts. 2949*e716630dSMartin Matuska * This check prevents accessing past 2950*e716630dSMartin Matuska * the end of my_tgts. The "dead > 2951*e716630dSMartin Matuska * nparity" check below will fail this 2952*e716630dSMartin Matuska * reconstruction attempt. 
2953*e716630dSMartin Matuska */ 2954*e716630dSMartin Matuska if (t < VDEV_RAIDZ_MAXPARITY) { 29557877fdebSMatt Macy my_tgts[t++] = c; 2956*e716630dSMartin Matuska if (dbgmsg) { 2957*e716630dSMartin Matuska zfs_dbgmsg("simulating " 2958*e716630dSMartin Matuska "failure of col %u " 2959*e716630dSMartin Matuska "devidx %u", c, 2960*e716630dSMartin Matuska (int)rc->rc_devidx); 2961*e716630dSMartin Matuska } 2962*e716630dSMartin Matuska } 29637877fdebSMatt Macy break; 29647877fdebSMatt Macy } 29657877fdebSMatt Macy } 29667877fdebSMatt Macy } 29677877fdebSMatt Macy if (dead > nparity) { 29687877fdebSMatt Macy /* reconstruction not possible */ 2969*e716630dSMartin Matuska if (dbgmsg) { 2970*e716630dSMartin Matuska zfs_dbgmsg("reconstruction not possible; " 2971*e716630dSMartin Matuska "too many failures"); 2972*e716630dSMartin Matuska } 29737877fdebSMatt Macy raidz_restore_orig_data(rm); 29747877fdebSMatt Macy return (EINVAL); 29757877fdebSMatt Macy } 29767877fdebSMatt Macy if (dead_data > 0) 2977f9693befSMartin Matuska vdev_raidz_reconstruct_row(rm, rr, my_tgts, t); 29787877fdebSMatt Macy } 29797877fdebSMatt Macy 29807877fdebSMatt Macy /* Check for success */ 29817877fdebSMatt Macy if (raidz_checksum_verify(zio) == 0) { 29827877fdebSMatt Macy 29837877fdebSMatt Macy /* Reconstruction succeeded - report errors */ 29847877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 29857877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i]; 29867877fdebSMatt Macy 29877877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 29887877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 29897877fdebSMatt Macy if (rc->rc_need_orig_restore) { 29907877fdebSMatt Macy /* 29917877fdebSMatt Macy * Note: if this is a parity column, 29927877fdebSMatt Macy * we don't really know if it's wrong. 29937877fdebSMatt Macy * We need to let 29947877fdebSMatt Macy * vdev_raidz_io_done_verified() check 29957877fdebSMatt Macy * it, and if we set rc_error, it will 29967877fdebSMatt Macy * think that it is a "known" error 29977877fdebSMatt Macy * that doesn't need to be checked 29987877fdebSMatt Macy * or corrected. 
29997877fdebSMatt Macy */ 30007877fdebSMatt Macy if (rc->rc_error == 0 && 30017877fdebSMatt Macy c >= rr->rr_firstdatacol) { 3002e92ffd9bSMartin Matuska vdev_raidz_checksum_error(zio, 3003f9693befSMartin Matuska rc, rc->rc_orig_data); 30047877fdebSMatt Macy rc->rc_error = 30057877fdebSMatt Macy SET_ERROR(ECKSUM); 30067877fdebSMatt Macy } 30077877fdebSMatt Macy rc->rc_need_orig_restore = B_FALSE; 30087877fdebSMatt Macy } 30097877fdebSMatt Macy } 30107877fdebSMatt Macy 30117877fdebSMatt Macy vdev_raidz_io_done_verified(zio, rr); 30127877fdebSMatt Macy } 30137877fdebSMatt Macy 30147877fdebSMatt Macy zio_checksum_verified(zio); 30157877fdebSMatt Macy 3016*e716630dSMartin Matuska if (dbgmsg) { 3017*e716630dSMartin Matuska zfs_dbgmsg("reconstruction successful " 3018*e716630dSMartin Matuska "(checksum verified)"); 3019*e716630dSMartin Matuska } 30207877fdebSMatt Macy return (0); 30217877fdebSMatt Macy } 30227877fdebSMatt Macy 30237877fdebSMatt Macy /* Reconstruction failed - restore original data */ 30247877fdebSMatt Macy raidz_restore_orig_data(rm); 3025*e716630dSMartin Matuska if (dbgmsg) { 3026*e716630dSMartin Matuska zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum " 3027*e716630dSMartin Matuska "failed", zio); 3028*e716630dSMartin Matuska } 30297877fdebSMatt Macy return (ECKSUM); 30307877fdebSMatt Macy } 30317877fdebSMatt Macy 30327877fdebSMatt Macy /* 30337877fdebSMatt Macy * Iterate over all combinations of N bad vdevs and attempt a reconstruction. 30347877fdebSMatt Macy * Note that the algorithm below is non-optimal because it doesn't take into 30357877fdebSMatt Macy * account how reconstruction is actually performed. For example, with 30367877fdebSMatt Macy * triple-parity RAID-Z the reconstruction procedure is the same if column 4 30377877fdebSMatt Macy * is targeted as invalid as if columns 1 and 4 are targeted since in both 30387877fdebSMatt Macy * cases we'd only use parity information in column 0. 
30397877fdebSMatt Macy * 30407877fdebSMatt Macy * The order that we find the various possible combinations of failed 30417877fdebSMatt Macy * disks is dictated by these rules: 30427877fdebSMatt Macy * - Examine each "slot" (the "i" in tgts[i]) 3043*e716630dSMartin Matuska * - Try to increment this slot (tgts[i] += 1) 30447877fdebSMatt Macy * - if we can't increment because it runs into the next slot, 30457877fdebSMatt Macy * reset our slot to the minimum, and examine the next slot 30467877fdebSMatt Macy * 30477877fdebSMatt Macy * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose 30487877fdebSMatt Macy * 3 columns to reconstruct), we will generate the following sequence: 30497877fdebSMatt Macy * 30507877fdebSMatt Macy * STATE ACTION 30517877fdebSMatt Macy * 0 1 2 special case: skip since these are all parity 30527877fdebSMatt Macy * 0 1 3 first slot: reset to 0; middle slot: increment to 2 30537877fdebSMatt Macy * 0 2 3 first slot: increment to 1 30547877fdebSMatt Macy * 1 2 3 first: reset to 0; middle: reset to 1; last: increment to 4 30557877fdebSMatt Macy * 0 1 4 first: reset to 0; middle: increment to 2 30567877fdebSMatt Macy * 0 2 4 first: increment to 1 30577877fdebSMatt Macy * 1 2 4 first: reset to 0; middle: increment to 3 30587877fdebSMatt Macy * 0 3 4 first: increment to 1 30597877fdebSMatt Macy * 1 3 4 first: increment to 2 30607877fdebSMatt Macy * 2 3 4 first: reset to 0; middle: reset to 1; last: increment to 5 30617877fdebSMatt Macy * 0 1 5 first: reset to 0; middle: increment to 2 30627877fdebSMatt Macy * 0 2 5 first: increment to 1 30637877fdebSMatt Macy * 1 2 5 first: reset to 0; middle: increment to 3 30647877fdebSMatt Macy * 0 3 5 first: increment to 1 30657877fdebSMatt Macy * 1 3 5 first: increment to 2 30667877fdebSMatt Macy * 2 3 5 first: reset to 0; middle: increment to 4 30677877fdebSMatt Macy * 0 4 5 first: increment to 1 30687877fdebSMatt Macy * 1 4 5 first: increment to 2 30697877fdebSMatt Macy * 2 4 5 first: increment to 3 30707877fdebSMatt Macy * 3 4 5 done 30717877fdebSMatt Macy * 307216038816SMartin Matuska * This strategy works for dRAID but is less efficient when there are a large 30737877fdebSMatt Macy * number of child vdevs and therefore permutations to check. Furthermore, 3074*e716630dSMartin Matuska * since the raidz_map_t rows likely do not overlap, reconstruction would be 30757877fdebSMatt Macy * possible as long as there are no more than nparity data errors per row. 30767877fdebSMatt Macy * These additional permutations are not currently checked but could be as 30777877fdebSMatt Macy * a future improvement. 3078*e716630dSMartin Matuska * 3079*e716630dSMartin Matuska * Returns 0 on success, ECKSUM on failure. 30807877fdebSMatt Macy */ 30817877fdebSMatt Macy static int 30827877fdebSMatt Macy vdev_raidz_combrec(zio_t *zio) 30837877fdebSMatt Macy { 30847877fdebSMatt Macy int nparity = vdev_get_nparity(zio->io_vd); 30857877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd; 3086*e716630dSMartin Matuska int physical_width = zio->io_vd->vdev_children; 3087*e716630dSMartin Matuska int original_width = (rm->rm_original_width != 0) ? 
3088*e716630dSMartin Matuska rm->rm_original_width : physical_width; 30897877fdebSMatt Macy 30907877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 30917877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i]; 30927877fdebSMatt Macy int total_errors = 0; 30937877fdebSMatt Macy 30947877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 30957877fdebSMatt Macy if (rr->rr_col[c].rc_error) 30967877fdebSMatt Macy total_errors++; 30977877fdebSMatt Macy } 30987877fdebSMatt Macy 30997877fdebSMatt Macy if (total_errors > nparity) 31007877fdebSMatt Macy return (vdev_raidz_worst_error(rr)); 31017877fdebSMatt Macy } 31027877fdebSMatt Macy 31037877fdebSMatt Macy for (int num_failures = 1; num_failures <= nparity; num_failures++) { 31047877fdebSMatt Macy int tstore[VDEV_RAIDZ_MAXPARITY + 2]; 31057877fdebSMatt Macy int *ltgts = &tstore[1]; /* value is logical child ID */ 31067877fdebSMatt Macy 3107*e716630dSMartin Matuska 3108*e716630dSMartin Matuska /* 3109*e716630dSMartin Matuska * Determine number of logical children, n. See comment 3110*e716630dSMartin Matuska * above raidz_simulate_failure(). 3111*e716630dSMartin Matuska */ 3112*e716630dSMartin Matuska int n = 0; 3113*e716630dSMartin Matuska for (int w = physical_width; 3114*e716630dSMartin Matuska w >= original_width; w--) { 3115*e716630dSMartin Matuska n += w; 3116*e716630dSMartin Matuska } 31177877fdebSMatt Macy 31187877fdebSMatt Macy ASSERT3U(num_failures, <=, nparity); 31197877fdebSMatt Macy ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY); 31207877fdebSMatt Macy 31217877fdebSMatt Macy /* Handle corner cases in combrec logic */ 31227877fdebSMatt Macy ltgts[-1] = -1; 31237877fdebSMatt Macy for (int i = 0; i < num_failures; i++) { 31247877fdebSMatt Macy ltgts[i] = i; 31257877fdebSMatt Macy } 31267877fdebSMatt Macy ltgts[num_failures] = n; 31277877fdebSMatt Macy 31287877fdebSMatt Macy for (;;) { 31297877fdebSMatt Macy int err = raidz_reconstruct(zio, ltgts, num_failures, 31307877fdebSMatt Macy nparity); 31317877fdebSMatt Macy if (err == EINVAL) { 31327877fdebSMatt Macy /* 31337877fdebSMatt Macy * Reconstruction not possible with this # 31347877fdebSMatt Macy * failures; try more failures. 31357877fdebSMatt Macy */ 31367877fdebSMatt Macy break; 31377877fdebSMatt Macy } else if (err == 0) 31387877fdebSMatt Macy return (0); 31397877fdebSMatt Macy 31407877fdebSMatt Macy /* Compute next targets to try */ 31417877fdebSMatt Macy for (int t = 0; ; t++) { 31427877fdebSMatt Macy ASSERT3U(t, <, num_failures); 31437877fdebSMatt Macy ltgts[t]++; 31447877fdebSMatt Macy if (ltgts[t] == n) { 31457877fdebSMatt Macy /* try more failures */ 31467877fdebSMatt Macy ASSERT3U(t, ==, num_failures - 1); 3147*e716630dSMartin Matuska if (zfs_flags & 3148*e716630dSMartin Matuska ZFS_DEBUG_RAIDZ_RECONSTRUCT) { 3149*e716630dSMartin Matuska zfs_dbgmsg("reconstruction " 3150*e716630dSMartin Matuska "failed for num_failures=" 3151*e716630dSMartin Matuska "%u; tried all " 3152*e716630dSMartin Matuska "combinations", 3153*e716630dSMartin Matuska num_failures); 3154*e716630dSMartin Matuska } 31557877fdebSMatt Macy break; 31567877fdebSMatt Macy } 31577877fdebSMatt Macy 31587877fdebSMatt Macy ASSERT3U(ltgts[t], <, n); 31597877fdebSMatt Macy ASSERT3U(ltgts[t], <=, ltgts[t + 1]); 31607877fdebSMatt Macy 31617877fdebSMatt Macy /* 31627877fdebSMatt Macy * If that spot is available, we're done here. 31637877fdebSMatt Macy * Try the next combination. 
31647877fdebSMatt Macy */ 31657877fdebSMatt Macy if (ltgts[t] != ltgts[t + 1]) 3166*e716630dSMartin Matuska break; // found next combination 31677877fdebSMatt Macy 31687877fdebSMatt Macy /* 31697877fdebSMatt Macy * Otherwise, reset this tgt to the minimum, 31707877fdebSMatt Macy * and move on to the next tgt. 31717877fdebSMatt Macy */ 31727877fdebSMatt Macy ltgts[t] = ltgts[t - 1] + 1; 31737877fdebSMatt Macy ASSERT3U(ltgts[t], ==, t); 31747877fdebSMatt Macy } 31757877fdebSMatt Macy 31767877fdebSMatt Macy /* Increase the number of failures and keep trying. */ 31777877fdebSMatt Macy if (ltgts[num_failures - 1] == n) 31787877fdebSMatt Macy break; 31797877fdebSMatt Macy } 31807877fdebSMatt Macy } 3181*e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 3182*e716630dSMartin Matuska zfs_dbgmsg("reconstruction failed for all num_failures"); 31837877fdebSMatt Macy return (ECKSUM); 31847877fdebSMatt Macy } 31857877fdebSMatt Macy 31867877fdebSMatt Macy void 31877877fdebSMatt Macy vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) 31887877fdebSMatt Macy { 31897877fdebSMatt Macy for (uint64_t row = 0; row < rm->rm_nrows; row++) { 31907877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[row]; 31917877fdebSMatt Macy vdev_raidz_reconstruct_row(rm, rr, t, nt); 31927877fdebSMatt Macy } 31937877fdebSMatt Macy } 31947877fdebSMatt Macy 31957877fdebSMatt Macy /* 31967877fdebSMatt Macy * Complete a write IO operation on a RAIDZ VDev 31977877fdebSMatt Macy * 31987877fdebSMatt Macy * Outline: 31997877fdebSMatt Macy * 1. Check for errors on the child IOs. 32007877fdebSMatt Macy * 2. Return, setting an error code if too few child VDevs were written 32017877fdebSMatt Macy * to reconstruct the data later. Note that partial writes are 32027877fdebSMatt Macy * considered successful if they can be reconstructed at all. 32037877fdebSMatt Macy */ 32047877fdebSMatt Macy static void 32057877fdebSMatt Macy vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) 32067877fdebSMatt Macy { 3207*e716630dSMartin Matuska int normal_errors = 0; 3208*e716630dSMartin Matuska int shadow_errors = 0; 32097877fdebSMatt Macy 32107877fdebSMatt Macy ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); 32117877fdebSMatt Macy ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); 32127877fdebSMatt Macy ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); 32137877fdebSMatt Macy 32147877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 32157877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 32167877fdebSMatt Macy 3217*e716630dSMartin Matuska if (rc->rc_error != 0) { 32187877fdebSMatt Macy ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 3219*e716630dSMartin Matuska normal_errors++; 3220*e716630dSMartin Matuska } 3221*e716630dSMartin Matuska if (rc->rc_shadow_error != 0) { 3222*e716630dSMartin Matuska ASSERT(rc->rc_shadow_error != ECKSUM); 3223*e716630dSMartin Matuska shadow_errors++; 32247877fdebSMatt Macy } 32257877fdebSMatt Macy } 32267877fdebSMatt Macy 32277877fdebSMatt Macy /* 32287877fdebSMatt Macy * Treat partial writes as a success. If we couldn't write enough 3229*e716630dSMartin Matuska * columns to reconstruct the data, the I/O failed. Otherwise, good 3230*e716630dSMartin Matuska * enough. 
Note that in the case of a shadow write (during raidz 3231*e716630dSMartin Matuska * expansion), depending on if we crash, either the normal (old) or 3232*e716630dSMartin Matuska * shadow (new) location may become the "real" version of the block, 3233*e716630dSMartin Matuska * so both locations must have sufficient redundancy. 3234eda14cbcSMatt Macy * 3235eda14cbcSMatt Macy * Now that we support write reallocation, it would be better 3236eda14cbcSMatt Macy * to treat partial failure as real failure unless there are 3237eda14cbcSMatt Macy * no non-degraded top-level vdevs left, and not update DTLs 3238eda14cbcSMatt Macy * if we intend to reallocate. 3239eda14cbcSMatt Macy */ 3240*e716630dSMartin Matuska if (normal_errors > rr->rr_firstdatacol || 3241*e716630dSMartin Matuska shadow_errors > rr->rr_firstdatacol) { 32427877fdebSMatt Macy zio->io_error = zio_worst_error(zio->io_error, 32437877fdebSMatt Macy vdev_raidz_worst_error(rr)); 32447877fdebSMatt Macy } 3245eda14cbcSMatt Macy } 3246eda14cbcSMatt Macy 3247f9693befSMartin Matuska static void 32487877fdebSMatt Macy vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm, 32497877fdebSMatt Macy raidz_row_t *rr) 32507877fdebSMatt Macy { 32517877fdebSMatt Macy int parity_errors = 0; 32527877fdebSMatt Macy int parity_untried = 0; 32537877fdebSMatt Macy int data_errors = 0; 32547877fdebSMatt Macy int total_errors = 0; 32557877fdebSMatt Macy 32567877fdebSMatt Macy ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); 32577877fdebSMatt Macy ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); 32587877fdebSMatt Macy 32597877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 32607877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 32617877fdebSMatt Macy 3262a0b956f5SMartin Matuska /* 3263a0b956f5SMartin Matuska * If scrubbing and a replacing/sparing child vdev determined 3264a0b956f5SMartin Matuska * that not all of its children have an identical copy of the 3265a0b956f5SMartin Matuska * data, then clear the error so the column is treated like 3266a0b956f5SMartin Matuska * any other read and force a repair to correct the damage. 3267a0b956f5SMartin Matuska */ 3268a0b956f5SMartin Matuska if (rc->rc_error == ECKSUM) { 3269a0b956f5SMartin Matuska ASSERT(zio->io_flags & ZIO_FLAG_SCRUB); 3270a0b956f5SMartin Matuska vdev_raidz_checksum_error(zio, rc, rc->rc_abd); 3271a0b956f5SMartin Matuska rc->rc_force_repair = 1; 3272a0b956f5SMartin Matuska rc->rc_error = 0; 3273a0b956f5SMartin Matuska } 32747877fdebSMatt Macy 3275a0b956f5SMartin Matuska if (rc->rc_error) { 32767877fdebSMatt Macy if (c < rr->rr_firstdatacol) 32777877fdebSMatt Macy parity_errors++; 32787877fdebSMatt Macy else 32797877fdebSMatt Macy data_errors++; 32807877fdebSMatt Macy 32817877fdebSMatt Macy total_errors++; 32827877fdebSMatt Macy } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { 32837877fdebSMatt Macy parity_untried++; 32847877fdebSMatt Macy } 32857877fdebSMatt Macy } 3286eda14cbcSMatt Macy 3287eda14cbcSMatt Macy /* 32887877fdebSMatt Macy * If there were data errors and the number of errors we saw was 32897877fdebSMatt Macy * correctable -- less than or equal to the number of parity disks read 32907877fdebSMatt Macy * -- reconstruct based on the missing data. 
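 *
 * For example (hypothetical): in a RAIDZ2 row where both parity columns
 * were read without error (parity_untried == 0, parity_errors == 0) and
 * two data columns failed, total_errors (2) <= rr_firstdatacol (2), so
 * both missing data columns can be rebuilt below.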
3291eda14cbcSMatt Macy */ 32927877fdebSMatt Macy if (data_errors != 0 && 32937877fdebSMatt Macy total_errors <= rr->rr_firstdatacol - parity_untried) { 3294eda14cbcSMatt Macy /* 3295eda14cbcSMatt Macy * We either attempt to read all the parity columns or 3296eda14cbcSMatt Macy * none of them. If we didn't try to read parity, we 3297eda14cbcSMatt Macy * wouldn't be here in the correctable case. There must 3298eda14cbcSMatt Macy * also have been fewer parity errors than parity 3299eda14cbcSMatt Macy * columns or, again, we wouldn't be in this code path. 3300eda14cbcSMatt Macy */ 3301eda14cbcSMatt Macy ASSERT(parity_untried == 0); 33027877fdebSMatt Macy ASSERT(parity_errors < rr->rr_firstdatacol); 3303eda14cbcSMatt Macy 3304eda14cbcSMatt Macy /* 3305eda14cbcSMatt Macy * Identify the data columns that reported an error. 3306eda14cbcSMatt Macy */ 33077877fdebSMatt Macy int n = 0; 33087877fdebSMatt Macy int tgts[VDEV_RAIDZ_MAXPARITY]; 33097877fdebSMatt Macy for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 33107877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 3311eda14cbcSMatt Macy if (rc->rc_error != 0) { 3312eda14cbcSMatt Macy ASSERT(n < VDEV_RAIDZ_MAXPARITY); 3313eda14cbcSMatt Macy tgts[n++] = c; 3314eda14cbcSMatt Macy } 3315eda14cbcSMatt Macy } 3316eda14cbcSMatt Macy 33177877fdebSMatt Macy ASSERT(rr->rr_firstdatacol >= n); 3318eda14cbcSMatt Macy 3319f9693befSMartin Matuska vdev_raidz_reconstruct_row(rm, rr, tgts, n); 3320eda14cbcSMatt Macy } 3321eda14cbcSMatt Macy } 3322eda14cbcSMatt Macy 3323eda14cbcSMatt Macy /* 33247877fdebSMatt Macy * Return the number of reads issued. 3325eda14cbcSMatt Macy */ 33267877fdebSMatt Macy static int 33277877fdebSMatt Macy vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr) 33287877fdebSMatt Macy { 33297877fdebSMatt Macy vdev_t *vd = zio->io_vd; 33307877fdebSMatt Macy int nread = 0; 3331eda14cbcSMatt Macy 33327877fdebSMatt Macy rr->rr_missingdata = 0; 33337877fdebSMatt Macy rr->rr_missingparity = 0; 33347877fdebSMatt Macy 33357877fdebSMatt Macy /* 33367877fdebSMatt Macy * If this rows contains empty sectors which are not required 33377877fdebSMatt Macy * for a normal read then allocate an ABD for them now so they 33387877fdebSMatt Macy * may be read, verified, and any needed repairs performed. 33397877fdebSMatt Macy */ 3340*e716630dSMartin Matuska if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL) 33417877fdebSMatt Macy vdev_draid_map_alloc_empty(zio, rr); 33427877fdebSMatt Macy 33437877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 33447877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 33457877fdebSMatt Macy if (rc->rc_tried || rc->rc_size == 0) 3346eda14cbcSMatt Macy continue; 3347eda14cbcSMatt Macy 3348eda14cbcSMatt Macy zio_nowait(zio_vdev_child_io(zio, NULL, 3349eda14cbcSMatt Macy vd->vdev_child[rc->rc_devidx], 3350eda14cbcSMatt Macy rc->rc_offset, rc->rc_abd, rc->rc_size, 3351eda14cbcSMatt Macy zio->io_type, zio->io_priority, 0, 3352eda14cbcSMatt Macy vdev_raidz_child_done, rc)); 33537877fdebSMatt Macy nread++; 33547877fdebSMatt Macy } 33557877fdebSMatt Macy return (nread); 3356eda14cbcSMatt Macy } 3357eda14cbcSMatt Macy 3358eda14cbcSMatt Macy /* 33597877fdebSMatt Macy * We're here because either there were too many errors to even attempt 33607877fdebSMatt Macy * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec() 33617877fdebSMatt Macy * failed. In either case, there is enough bad data to prevent reconstruction. 33627877fdebSMatt Macy * Start checksum ereports for all children which haven't failed. 
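 * Since reconstruction failed we cannot tell which of those children
 * actually returned bad data, so each child that did not report an
 * error gets a checksum ereport and has its vs_checksum_errors count
 * bumped.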
3363eda14cbcSMatt Macy */ 33647877fdebSMatt Macy static void 33657877fdebSMatt Macy vdev_raidz_io_done_unrecoverable(zio_t *zio) 33667877fdebSMatt Macy { 33677877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd; 3368eda14cbcSMatt Macy 33697877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 33707877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i]; 3371eda14cbcSMatt Macy 33727877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 33737877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 33747877fdebSMatt Macy vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; 33757877fdebSMatt Macy 33762c48331dSMatt Macy if (rc->rc_error != 0) 33772c48331dSMatt Macy continue; 33782c48331dSMatt Macy 3379eda14cbcSMatt Macy zio_bad_cksum_t zbc; 3380eda14cbcSMatt Macy zbc.zbc_has_cksum = 0; 33812c48331dSMatt Macy zbc.zbc_injected = rm->rm_ecksuminjected; 3382eda14cbcSMatt Macy 3383eda14cbcSMatt Macy mutex_enter(&cvd->vdev_stat_lock); 3384eda14cbcSMatt Macy cvd->vdev_stat.vs_checksum_errors++; 3385eda14cbcSMatt Macy mutex_exit(&cvd->vdev_stat_lock); 3386bb2d13b6SMartin Matuska (void) zfs_ereport_start_checksum(zio->io_spa, 3387bb2d13b6SMartin Matuska cvd, &zio->io_bookmark, zio, rc->rc_offset, 3388bb2d13b6SMartin Matuska rc->rc_size, &zbc); 3389eda14cbcSMatt Macy } 3390eda14cbcSMatt Macy } 3391eda14cbcSMatt Macy } 3392eda14cbcSMatt Macy 33937877fdebSMatt Macy void 33947877fdebSMatt Macy vdev_raidz_io_done(zio_t *zio) 33957877fdebSMatt Macy { 33967877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd; 33977877fdebSMatt Macy 3398*e716630dSMartin Matuska ASSERT(zio->io_bp != NULL); 33997877fdebSMatt Macy if (zio->io_type == ZIO_TYPE_WRITE) { 34007877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 34017877fdebSMatt Macy vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]); 34027877fdebSMatt Macy } 34037877fdebSMatt Macy } else { 3404*e716630dSMartin Matuska if (rm->rm_phys_col) { 3405*e716630dSMartin Matuska /* 3406*e716630dSMartin Matuska * This is an aggregated read. Copy the data and status 3407*e716630dSMartin Matuska * from the aggregate abd's to the individual rows. 3408*e716630dSMartin Matuska */ 3409*e716630dSMartin Matuska for (int i = 0; i < rm->rm_nrows; i++) { 3410*e716630dSMartin Matuska raidz_row_t *rr = rm->rm_row[i]; 3411*e716630dSMartin Matuska 3412*e716630dSMartin Matuska for (int c = 0; c < rr->rr_cols; c++) { 3413*e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c]; 3414*e716630dSMartin Matuska if (rc->rc_tried || rc->rc_size == 0) 3415*e716630dSMartin Matuska continue; 3416*e716630dSMartin Matuska 3417*e716630dSMartin Matuska raidz_col_t *prc = 3418*e716630dSMartin Matuska &rm->rm_phys_col[rc->rc_devidx]; 3419*e716630dSMartin Matuska rc->rc_error = prc->rc_error; 3420*e716630dSMartin Matuska rc->rc_tried = prc->rc_tried; 3421*e716630dSMartin Matuska rc->rc_skipped = prc->rc_skipped; 3422*e716630dSMartin Matuska if (c >= rr->rr_firstdatacol) { 3423*e716630dSMartin Matuska /* 3424*e716630dSMartin Matuska * Note: this is slightly faster 3425*e716630dSMartin Matuska * than using abd_copy_off(). 
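 * (rc_offset - prc_offset below is the column's byte offset within the
 * aggregated read that was issued to this child device.)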
3426*e716630dSMartin Matuska */ 3427*e716630dSMartin Matuska char *physbuf = abd_to_buf( 3428*e716630dSMartin Matuska prc->rc_abd); 3429*e716630dSMartin Matuska void *physloc = physbuf + 3430*e716630dSMartin Matuska rc->rc_offset - 3431*e716630dSMartin Matuska prc->rc_offset; 3432*e716630dSMartin Matuska 3433*e716630dSMartin Matuska abd_copy_from_buf(rc->rc_abd, 3434*e716630dSMartin Matuska physloc, rc->rc_size); 3435*e716630dSMartin Matuska } 3436*e716630dSMartin Matuska } 3437*e716630dSMartin Matuska } 3438*e716630dSMartin Matuska } 3439*e716630dSMartin Matuska 34407877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 34417877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i]; 34427877fdebSMatt Macy vdev_raidz_io_done_reconstruct_known_missing(zio, 34437877fdebSMatt Macy rm, rr); 34447877fdebSMatt Macy } 34457877fdebSMatt Macy 34467877fdebSMatt Macy if (raidz_checksum_verify(zio) == 0) { 34477877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 34487877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i]; 34497877fdebSMatt Macy vdev_raidz_io_done_verified(zio, rr); 34507877fdebSMatt Macy } 3451eda14cbcSMatt Macy zio_checksum_verified(zio); 34527877fdebSMatt Macy } else { 3453eda14cbcSMatt Macy /* 34547877fdebSMatt Macy * A sequential resilver has no checksum which makes 34557877fdebSMatt Macy * combinatoral reconstruction impossible. This code 34567877fdebSMatt Macy * path is unreachable since raidz_checksum_verify() 34577877fdebSMatt Macy * has no checksum to verify and must succeed. 3458eda14cbcSMatt Macy */ 34597877fdebSMatt Macy ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD); 3460eda14cbcSMatt Macy 34617877fdebSMatt Macy /* 34627877fdebSMatt Macy * This isn't a typical situation -- either we got a 34637877fdebSMatt Macy * read error or a child silently returned bad data. 34647877fdebSMatt Macy * Read every block so we can try again with as much 34657877fdebSMatt Macy * data and parity as we can track down. If we've 34667877fdebSMatt Macy * already been through once before, all children will 34677877fdebSMatt Macy * be marked as tried so we'll proceed to combinatorial 34687877fdebSMatt Macy * reconstruction. 34697877fdebSMatt Macy */ 34707877fdebSMatt Macy int nread = 0; 34717877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 34727877fdebSMatt Macy nread += vdev_raidz_read_all(zio, 34737877fdebSMatt Macy rm->rm_row[i]); 34747877fdebSMatt Macy } 34757877fdebSMatt Macy if (nread != 0) { 34767877fdebSMatt Macy /* 34777877fdebSMatt Macy * Normally our stage is VDEV_IO_DONE, but if 34787877fdebSMatt Macy * we've already called redone(), it will have 34797877fdebSMatt Macy * changed to VDEV_IO_START, in which case we 34807877fdebSMatt Macy * don't want to call redone() again. 34817877fdebSMatt Macy */ 34827877fdebSMatt Macy if (zio->io_stage != ZIO_STAGE_VDEV_IO_START) 34837877fdebSMatt Macy zio_vdev_io_redone(zio); 34847877fdebSMatt Macy return; 34857877fdebSMatt Macy } 3486*e716630dSMartin Matuska /* 3487*e716630dSMartin Matuska * It would be too expensive to try every possible 3488*e716630dSMartin Matuska * combination of failed sectors in every row, so 3489*e716630dSMartin Matuska * instead we try every combination of failed current or 3490*e716630dSMartin Matuska * past physical disk. This means that if the incorrect 3491*e716630dSMartin Matuska * sectors were all on Nparity disks at any point in the 3492*e716630dSMartin Matuska * past, we will find the correct data. 
The only known 3493*e716630dSMartin Matuska * case where this is less durable than a non-expanded 3494*e716630dSMartin Matuska * RAIDZ, is if we have a silent failure during 3495*e716630dSMartin Matuska * expansion. In that case, one block could be 3496*e716630dSMartin Matuska * partially in the old format and partially in the 3497*e716630dSMartin Matuska * new format, so we'd lost some sectors from the old 3498*e716630dSMartin Matuska * format and some from the new format. 3499*e716630dSMartin Matuska * 3500*e716630dSMartin Matuska * e.g. logical_width=4 physical_width=6 3501*e716630dSMartin Matuska * the 15 (6+5+4) possible failed disks are: 3502*e716630dSMartin Matuska * width=6 child=0 3503*e716630dSMartin Matuska * width=6 child=1 3504*e716630dSMartin Matuska * width=6 child=2 3505*e716630dSMartin Matuska * width=6 child=3 3506*e716630dSMartin Matuska * width=6 child=4 3507*e716630dSMartin Matuska * width=6 child=5 3508*e716630dSMartin Matuska * width=5 child=0 3509*e716630dSMartin Matuska * width=5 child=1 3510*e716630dSMartin Matuska * width=5 child=2 3511*e716630dSMartin Matuska * width=5 child=3 3512*e716630dSMartin Matuska * width=5 child=4 3513*e716630dSMartin Matuska * width=4 child=0 3514*e716630dSMartin Matuska * width=4 child=1 3515*e716630dSMartin Matuska * width=4 child=2 3516*e716630dSMartin Matuska * width=4 child=3 3517*e716630dSMartin Matuska * And we will try every combination of Nparity of these 3518*e716630dSMartin Matuska * failing. 3519*e716630dSMartin Matuska * 3520*e716630dSMartin Matuska * As a first pass, we can generate every combo, 3521*e716630dSMartin Matuska * and try reconstructing, ignoring any known 3522*e716630dSMartin Matuska * failures. If any row has too many known + simulated 3523*e716630dSMartin Matuska * failures, then we bail on reconstructing with this 3524*e716630dSMartin Matuska * number of simulated failures. As an improvement, 3525*e716630dSMartin Matuska * we could detect the number of whole known failures 3526*e716630dSMartin Matuska * (i.e. we have known failures on these disks for 3527*e716630dSMartin Matuska * every row; the disks never succeeded), and 3528*e716630dSMartin Matuska * subtract that from the max # failures to simulate. 3529*e716630dSMartin Matuska * We could go even further like the current 3530*e716630dSMartin Matuska * combrec code, but that doesn't seem like it 3531*e716630dSMartin Matuska * gains us very much. If we simulate a failure 3532*e716630dSMartin Matuska * that is also a known failure, that's fine. 
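 * For the example above with double parity, this is on the order of
 * C(15,2) = 105 two-disk combinations (plus the smaller single-failure
 * sets), far fewer than trying every per-row combination of failed
 * sectors.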
3533*e716630dSMartin Matuska */ 35347877fdebSMatt Macy zio->io_error = vdev_raidz_combrec(zio); 35357877fdebSMatt Macy if (zio->io_error == ECKSUM && 35367877fdebSMatt Macy !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 35377877fdebSMatt Macy vdev_raidz_io_done_unrecoverable(zio); 35387877fdebSMatt Macy } 3539eda14cbcSMatt Macy } 3540eda14cbcSMatt Macy } 3541*e716630dSMartin Matuska if (rm->rm_lr != NULL) { 3542*e716630dSMartin Matuska zfs_rangelock_exit(rm->rm_lr); 3543*e716630dSMartin Matuska rm->rm_lr = NULL; 3544*e716630dSMartin Matuska } 3545eda14cbcSMatt Macy } 3546eda14cbcSMatt Macy 3547eda14cbcSMatt Macy static void 3548eda14cbcSMatt Macy vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 3549eda14cbcSMatt Macy { 35507877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd; 35517877fdebSMatt Macy if (faulted > vdrz->vd_nparity) 3552eda14cbcSMatt Macy vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 3553eda14cbcSMatt Macy VDEV_AUX_NO_REPLICAS); 3554eda14cbcSMatt Macy else if (degraded + faulted != 0) 3555eda14cbcSMatt Macy vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 3556eda14cbcSMatt Macy else 3557eda14cbcSMatt Macy vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 3558eda14cbcSMatt Macy } 3559eda14cbcSMatt Macy 3560eda14cbcSMatt Macy /* 3561eda14cbcSMatt Macy * Determine if any portion of the provided block resides on a child vdev 3562eda14cbcSMatt Macy * with a dirty DTL and therefore needs to be resilvered. The function 3563eda14cbcSMatt Macy * assumes that at least one DTL is dirty which implies that full stripe 3564eda14cbcSMatt Macy * width blocks must be resilvered. 3565eda14cbcSMatt Macy */ 3566eda14cbcSMatt Macy static boolean_t 35677877fdebSMatt Macy vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, 35687877fdebSMatt Macy uint64_t phys_birth) 3569eda14cbcSMatt Macy { 35707877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd; 3571*e716630dSMartin Matuska 3572*e716630dSMartin Matuska /* 3573*e716630dSMartin Matuska * If we're in the middle of a RAIDZ expansion, this block may be in 3574*e716630dSMartin Matuska * the old and/or new location. For simplicity, always resilver it. 3575*e716630dSMartin Matuska */ 3576*e716630dSMartin Matuska if (vdrz->vn_vre.vre_state == DSS_SCANNING) 3577*e716630dSMartin Matuska return (B_TRUE); 3578*e716630dSMartin Matuska 3579eda14cbcSMatt Macy uint64_t dcols = vd->vdev_children; 35807877fdebSMatt Macy uint64_t nparity = vdrz->vd_nparity; 3581eda14cbcSMatt Macy uint64_t ashift = vd->vdev_top->vdev_ashift; 3582eda14cbcSMatt Macy /* The starting RAIDZ (parent) vdev sector of the block. */ 35837877fdebSMatt Macy uint64_t b = DVA_GET_OFFSET(dva) >> ashift; 3584eda14cbcSMatt Macy /* The zio's size in units of the vdev's minimum sector size. */ 3585eda14cbcSMatt Macy uint64_t s = ((psize - 1) >> ashift) + 1; 3586eda14cbcSMatt Macy /* The first column for this stripe. */ 3587eda14cbcSMatt Macy uint64_t f = b % dcols; 3588eda14cbcSMatt Macy 35897877fdebSMatt Macy /* Unreachable by sequential resilver. 
*/ 35907877fdebSMatt Macy ASSERT3U(phys_birth, !=, TXG_UNKNOWN); 35917877fdebSMatt Macy 35927877fdebSMatt Macy if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) 35937877fdebSMatt Macy return (B_FALSE); 35947877fdebSMatt Macy 3595eda14cbcSMatt Macy if (s + nparity >= dcols) 3596eda14cbcSMatt Macy return (B_TRUE); 3597eda14cbcSMatt Macy 3598eda14cbcSMatt Macy for (uint64_t c = 0; c < s + nparity; c++) { 3599eda14cbcSMatt Macy uint64_t devidx = (f + c) % dcols; 3600eda14cbcSMatt Macy vdev_t *cvd = vd->vdev_child[devidx]; 3601eda14cbcSMatt Macy 3602eda14cbcSMatt Macy /* 3603eda14cbcSMatt Macy * dsl_scan_need_resilver() already checked vd with 3604eda14cbcSMatt Macy * vdev_dtl_contains(). So here just check cvd with 3605eda14cbcSMatt Macy * vdev_dtl_empty(), cheaper and a good approximation. 3606eda14cbcSMatt Macy */ 3607eda14cbcSMatt Macy if (!vdev_dtl_empty(cvd, DTL_PARTIAL)) 3608eda14cbcSMatt Macy return (B_TRUE); 3609eda14cbcSMatt Macy } 3610eda14cbcSMatt Macy 3611eda14cbcSMatt Macy return (B_FALSE); 3612eda14cbcSMatt Macy } 3613eda14cbcSMatt Macy 3614eda14cbcSMatt Macy static void 36157877fdebSMatt Macy vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs, 36167877fdebSMatt Macy range_seg64_t *physical_rs, range_seg64_t *remain_rs) 3617eda14cbcSMatt Macy { 3618e92ffd9bSMartin Matuska (void) remain_rs; 3619e92ffd9bSMartin Matuska 3620eda14cbcSMatt Macy vdev_t *raidvd = cvd->vdev_parent; 3621eda14cbcSMatt Macy ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); 3622eda14cbcSMatt Macy 3623*e716630dSMartin Matuska vdev_raidz_t *vdrz = raidvd->vdev_tsd; 3624*e716630dSMartin Matuska 3625*e716630dSMartin Matuska if (vdrz->vn_vre.vre_state == DSS_SCANNING) { 3626*e716630dSMartin Matuska /* 3627*e716630dSMartin Matuska * We're in the middle of expansion, in which case the 3628*e716630dSMartin Matuska * translation is in flux. Any answer we give may be wrong 3629*e716630dSMartin Matuska * by the time we return, so it isn't safe for the caller to 3630*e716630dSMartin Matuska * act on it. Therefore we say that this range isn't present 3631*e716630dSMartin Matuska * on any children. The only consumers of this are "zpool 3632*e716630dSMartin Matuska * initialize" and trimming, both of which are "best effort" 3633*e716630dSMartin Matuska * anyway. 
3634*e716630dSMartin Matuska */ 3635*e716630dSMartin Matuska physical_rs->rs_start = physical_rs->rs_end = 0; 3636*e716630dSMartin Matuska remain_rs->rs_start = remain_rs->rs_end = 0; 3637*e716630dSMartin Matuska return; 3638*e716630dSMartin Matuska } 3639*e716630dSMartin Matuska 3640*e716630dSMartin Matuska uint64_t width = vdrz->vd_physical_width; 3641eda14cbcSMatt Macy uint64_t tgt_col = cvd->vdev_id; 3642eda14cbcSMatt Macy uint64_t ashift = raidvd->vdev_top->vdev_ashift; 3643eda14cbcSMatt Macy 3644eda14cbcSMatt Macy /* make sure the offsets are block-aligned */ 36457877fdebSMatt Macy ASSERT0(logical_rs->rs_start % (1 << ashift)); 36467877fdebSMatt Macy ASSERT0(logical_rs->rs_end % (1 << ashift)); 36477877fdebSMatt Macy uint64_t b_start = logical_rs->rs_start >> ashift; 36487877fdebSMatt Macy uint64_t b_end = logical_rs->rs_end >> ashift; 3649eda14cbcSMatt Macy 3650eda14cbcSMatt Macy uint64_t start_row = 0; 3651eda14cbcSMatt Macy if (b_start > tgt_col) /* avoid underflow */ 3652eda14cbcSMatt Macy start_row = ((b_start - tgt_col - 1) / width) + 1; 3653eda14cbcSMatt Macy 3654eda14cbcSMatt Macy uint64_t end_row = 0; 3655eda14cbcSMatt Macy if (b_end > tgt_col) 3656eda14cbcSMatt Macy end_row = ((b_end - tgt_col - 1) / width) + 1; 3657eda14cbcSMatt Macy 36587877fdebSMatt Macy physical_rs->rs_start = start_row << ashift; 36597877fdebSMatt Macy physical_rs->rs_end = end_row << ashift; 3660eda14cbcSMatt Macy 36617877fdebSMatt Macy ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start); 36627877fdebSMatt Macy ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=, 36637877fdebSMatt Macy logical_rs->rs_end - logical_rs->rs_start); 36647877fdebSMatt Macy } 36657877fdebSMatt Macy 3666*e716630dSMartin Matuska static void 3667*e716630dSMartin Matuska raidz_reflow_sync(void *arg, dmu_tx_t *tx) 3668*e716630dSMartin Matuska { 3669*e716630dSMartin Matuska spa_t *spa = arg; 3670*e716630dSMartin Matuska int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 3671*e716630dSMartin Matuska vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 3672*e716630dSMartin Matuska 3673*e716630dSMartin Matuska /* 3674*e716630dSMartin Matuska * Ensure there are no i/os to the range that is being committed. 3675*e716630dSMartin Matuska */ 3676*e716630dSMartin Matuska uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock); 3677*e716630dSMartin Matuska ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset); 3678*e716630dSMartin Matuska 3679*e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 3680*e716630dSMartin Matuska uint64_t new_offset = 3681*e716630dSMartin Matuska MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset); 3682*e716630dSMartin Matuska /* 3683*e716630dSMartin Matuska * We should not have committed anything that failed. 3684*e716630dSMartin Matuska */ 3685*e716630dSMartin Matuska VERIFY3U(vre->vre_failed_offset, >=, old_offset); 3686*e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 3687*e716630dSMartin Matuska 3688*e716630dSMartin Matuska zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, 3689*e716630dSMartin Matuska old_offset, new_offset - old_offset, 3690*e716630dSMartin Matuska RL_WRITER); 3691*e716630dSMartin Matuska 3692*e716630dSMartin Matuska /* 3693*e716630dSMartin Matuska * Update the uberblock that will be written when this txg completes. 
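 * (RAIDZ_REFLOW_SET() packs the scratch-area state and the reflow
 * offset into ub_raidz_reflow_info; RRSS_GET_STATE() and
 * RRSS_GET_OFFSET() unpack them again on the read side.)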
3694*e716630dSMartin Matuska */ 3695*e716630dSMartin Matuska RAIDZ_REFLOW_SET(&spa->spa_uberblock, 3696*e716630dSMartin Matuska RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset); 3697*e716630dSMartin Matuska vre->vre_offset_pertxg[txgoff] = 0; 3698*e716630dSMartin Matuska zfs_rangelock_exit(lr); 3699*e716630dSMartin Matuska 3700*e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 3701*e716630dSMartin Matuska vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff]; 3702*e716630dSMartin Matuska vre->vre_bytes_copied_pertxg[txgoff] = 0; 3703*e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 3704*e716630dSMartin Matuska 3705*e716630dSMartin Matuska vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); 3706*e716630dSMartin Matuska VERIFY0(zap_update(spa->spa_meta_objset, 3707*e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, 3708*e716630dSMartin Matuska sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx)); 3709*e716630dSMartin Matuska } 3710*e716630dSMartin Matuska 3711*e716630dSMartin Matuska static void 3712*e716630dSMartin Matuska raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx) 3713*e716630dSMartin Matuska { 3714*e716630dSMartin Matuska spa_t *spa = arg; 3715*e716630dSMartin Matuska vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 3716*e716630dSMartin Matuska vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 3717*e716630dSMartin Matuska vdev_raidz_t *vdrz = raidvd->vdev_tsd; 3718*e716630dSMartin Matuska 3719*e716630dSMartin Matuska for (int i = 0; i < TXG_SIZE; i++) 3720*e716630dSMartin Matuska VERIFY0(vre->vre_offset_pertxg[i]); 3721*e716630dSMartin Matuska 3722*e716630dSMartin Matuska reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); 3723*e716630dSMartin Matuska re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES; 3724*e716630dSMartin Matuska re->re_logical_width = vdrz->vd_physical_width; 3725*e716630dSMartin Matuska mutex_enter(&vdrz->vd_expand_lock); 3726*e716630dSMartin Matuska avl_add(&vdrz->vd_expand_txgs, re); 3727*e716630dSMartin Matuska mutex_exit(&vdrz->vd_expand_lock); 3728*e716630dSMartin Matuska 3729*e716630dSMartin Matuska vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); 3730*e716630dSMartin Matuska 3731*e716630dSMartin Matuska /* 3732*e716630dSMartin Matuska * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS 3733*e716630dSMartin Matuska * will get written (based on vd_expand_txgs). 3734*e716630dSMartin Matuska */ 3735*e716630dSMartin Matuska vdev_config_dirty(vd); 3736*e716630dSMartin Matuska 3737*e716630dSMartin Matuska /* 3738*e716630dSMartin Matuska * Before we change vre_state, the on-disk state must reflect that we 3739*e716630dSMartin Matuska * have completed all copying, so that vdev_raidz_io_start() can use 3740*e716630dSMartin Matuska * vre_state to determine if the reflow is in progress. See also the 3741*e716630dSMartin Matuska * end of spa_raidz_expand_thread(). 
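 * The VERIFY below checks exactly that: the reflow offset recorded in
 * the synced uberblock must already cover the vdev's entire
 * allocatable size (vdev_ms_count << vdev_ms_shift).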
3742*e716630dSMartin Matuska */ 3743*e716630dSMartin Matuska VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, 3744*e716630dSMartin Matuska raidvd->vdev_ms_count << raidvd->vdev_ms_shift); 3745*e716630dSMartin Matuska 3746*e716630dSMartin Matuska vre->vre_end_time = gethrestime_sec(); 3747*e716630dSMartin Matuska vre->vre_state = DSS_FINISHED; 3748*e716630dSMartin Matuska 3749*e716630dSMartin Matuska uint64_t state = vre->vre_state; 3750*e716630dSMartin Matuska VERIFY0(zap_update(spa->spa_meta_objset, 3751*e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, 3752*e716630dSMartin Matuska sizeof (state), 1, &state, tx)); 3753*e716630dSMartin Matuska 3754*e716630dSMartin Matuska uint64_t end_time = vre->vre_end_time; 3755*e716630dSMartin Matuska VERIFY0(zap_update(spa->spa_meta_objset, 3756*e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, 3757*e716630dSMartin Matuska sizeof (end_time), 1, &end_time, tx)); 3758*e716630dSMartin Matuska 3759*e716630dSMartin Matuska spa->spa_uberblock.ub_raidz_reflow_info = 0; 3760*e716630dSMartin Matuska 3761*e716630dSMartin Matuska spa_history_log_internal(spa, "raidz vdev expansion completed", tx, 3762*e716630dSMartin Matuska "%s vdev %llu new width %llu", spa_name(spa), 3763*e716630dSMartin Matuska (unsigned long long)vd->vdev_id, 3764*e716630dSMartin Matuska (unsigned long long)vd->vdev_children); 3765*e716630dSMartin Matuska 3766*e716630dSMartin Matuska spa->spa_raidz_expand = NULL; 3767*e716630dSMartin Matuska raidvd->vdev_rz_expanding = B_FALSE; 3768*e716630dSMartin Matuska 3769*e716630dSMartin Matuska spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); 3770*e716630dSMartin Matuska spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); 3771*e716630dSMartin Matuska spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); 3772*e716630dSMartin Matuska 3773*e716630dSMartin Matuska spa_notify_waiters(spa); 3774*e716630dSMartin Matuska 3775*e716630dSMartin Matuska /* 3776*e716630dSMartin Matuska * While we're in syncing context take the opportunity to 3777*e716630dSMartin Matuska * setup a scrub. All the data has been sucessfully copied 3778*e716630dSMartin Matuska * but we have not validated any checksums. 3779*e716630dSMartin Matuska */ 3780*e716630dSMartin Matuska pool_scan_func_t func = POOL_SCAN_SCRUB; 3781*e716630dSMartin Matuska if (zfs_scrub_after_expand && dsl_scan_setup_check(&func, tx) == 0) 3782*e716630dSMartin Matuska dsl_scan_setup_sync(&func, tx); 3783*e716630dSMartin Matuska } 3784*e716630dSMartin Matuska 3785*e716630dSMartin Matuska /* 3786*e716630dSMartin Matuska * Struct for one copy zio. 3787*e716630dSMartin Matuska */ 3788*e716630dSMartin Matuska typedef struct raidz_reflow_arg { 3789*e716630dSMartin Matuska vdev_raidz_expand_t *rra_vre; 3790*e716630dSMartin Matuska zfs_locked_range_t *rra_lr; 3791*e716630dSMartin Matuska uint64_t rra_txg; 3792*e716630dSMartin Matuska } raidz_reflow_arg_t; 3793*e716630dSMartin Matuska 3794*e716630dSMartin Matuska /* 3795*e716630dSMartin Matuska * The write of the new location is done. 
3796*e716630dSMartin Matuska */ 3797*e716630dSMartin Matuska static void 3798*e716630dSMartin Matuska raidz_reflow_write_done(zio_t *zio) 3799*e716630dSMartin Matuska { 3800*e716630dSMartin Matuska raidz_reflow_arg_t *rra = zio->io_private; 3801*e716630dSMartin Matuska vdev_raidz_expand_t *vre = rra->rra_vre; 3802*e716630dSMartin Matuska 3803*e716630dSMartin Matuska abd_free(zio->io_abd); 3804*e716630dSMartin Matuska 3805*e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 3806*e716630dSMartin Matuska if (zio->io_error != 0) { 3807*e716630dSMartin Matuska /* Force a reflow pause on errors */ 3808*e716630dSMartin Matuska vre->vre_failed_offset = 3809*e716630dSMartin Matuska MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); 3810*e716630dSMartin Matuska } 3811*e716630dSMartin Matuska ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size); 3812*e716630dSMartin Matuska vre->vre_outstanding_bytes -= zio->io_size; 3813*e716630dSMartin Matuska if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length < 3814*e716630dSMartin Matuska vre->vre_failed_offset) { 3815*e716630dSMartin Matuska vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] += 3816*e716630dSMartin Matuska zio->io_size; 3817*e716630dSMartin Matuska } 3818*e716630dSMartin Matuska cv_signal(&vre->vre_cv); 3819*e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 3820*e716630dSMartin Matuska 3821*e716630dSMartin Matuska zfs_rangelock_exit(rra->rra_lr); 3822*e716630dSMartin Matuska 3823*e716630dSMartin Matuska kmem_free(rra, sizeof (*rra)); 3824*e716630dSMartin Matuska spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa); 3825*e716630dSMartin Matuska } 3826*e716630dSMartin Matuska 3827*e716630dSMartin Matuska /* 3828*e716630dSMartin Matuska * The read of the old location is done. The parent zio is the write to 3829*e716630dSMartin Matuska * the new location. Allow it to start. 3830*e716630dSMartin Matuska */ 3831*e716630dSMartin Matuska static void 3832*e716630dSMartin Matuska raidz_reflow_read_done(zio_t *zio) 3833*e716630dSMartin Matuska { 3834*e716630dSMartin Matuska raidz_reflow_arg_t *rra = zio->io_private; 3835*e716630dSMartin Matuska vdev_raidz_expand_t *vre = rra->rra_vre; 3836*e716630dSMartin Matuska 3837*e716630dSMartin Matuska /* 3838*e716630dSMartin Matuska * If the read failed, or if it was done on a vdev that is not fully 3839*e716630dSMartin Matuska * healthy (e.g. a child that has a resilver in progress), we may not 3840*e716630dSMartin Matuska * have the correct data. Note that it's OK if the write proceeds. 3841*e716630dSMartin Matuska * It may write garbage but the location is otherwise unused and we 3842*e716630dSMartin Matuska * will retry later due to vre_failed_offset. 
3843*e716630dSMartin Matuska */ 3844*e716630dSMartin Matuska if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) { 3845*e716630dSMartin Matuska zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu " 3846*e716630dSMartin Matuska "err=%u partial_dtl_empty=%u missing_dtl_empty=%u", 3847*e716630dSMartin Matuska (long long)rra->rra_lr->lr_offset, 3848*e716630dSMartin Matuska (long long)rra->rra_lr->lr_length, 3849*e716630dSMartin Matuska (long long)rra->rra_txg, 3850*e716630dSMartin Matuska zio->io_error, 3851*e716630dSMartin Matuska vdev_dtl_empty(zio->io_vd, DTL_PARTIAL), 3852*e716630dSMartin Matuska vdev_dtl_empty(zio->io_vd, DTL_MISSING)); 3853*e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 3854*e716630dSMartin Matuska /* Force a reflow pause on errors */ 3855*e716630dSMartin Matuska vre->vre_failed_offset = 3856*e716630dSMartin Matuska MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); 3857*e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 3858*e716630dSMartin Matuska } 3859*e716630dSMartin Matuska 3860*e716630dSMartin Matuska zio_nowait(zio_unique_parent(zio)); 3861*e716630dSMartin Matuska } 3862*e716630dSMartin Matuska 3863*e716630dSMartin Matuska static void 3864*e716630dSMartin Matuska raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset, 3865*e716630dSMartin Matuska dmu_tx_t *tx) 3866*e716630dSMartin Matuska { 3867*e716630dSMartin Matuska int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 3868*e716630dSMartin Matuska spa_t *spa = dmu_tx_pool(tx)->dp_spa; 3869*e716630dSMartin Matuska 3870*e716630dSMartin Matuska if (offset == 0) 3871*e716630dSMartin Matuska return; 3872*e716630dSMartin Matuska 3873*e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 3874*e716630dSMartin Matuska ASSERT3U(vre->vre_offset, <=, offset); 3875*e716630dSMartin Matuska vre->vre_offset = offset; 3876*e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 3877*e716630dSMartin Matuska 3878*e716630dSMartin Matuska if (vre->vre_offset_pertxg[txgoff] == 0) { 3879*e716630dSMartin Matuska dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync, 3880*e716630dSMartin Matuska spa, tx); 3881*e716630dSMartin Matuska } 3882*e716630dSMartin Matuska vre->vre_offset_pertxg[txgoff] = offset; 3883*e716630dSMartin Matuska } 3884*e716630dSMartin Matuska 3885*e716630dSMartin Matuska static boolean_t 3886*e716630dSMartin Matuska vdev_raidz_expand_child_replacing(vdev_t *raidz_vd) 3887*e716630dSMartin Matuska { 3888*e716630dSMartin Matuska for (int i = 0; i < raidz_vd->vdev_children; i++) { 3889*e716630dSMartin Matuska /* Quick check if a child is being replaced */ 3890*e716630dSMartin Matuska if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf) 3891*e716630dSMartin Matuska return (B_TRUE); 3892*e716630dSMartin Matuska } 3893*e716630dSMartin Matuska return (B_FALSE); 3894*e716630dSMartin Matuska } 3895*e716630dSMartin Matuska 3896*e716630dSMartin Matuska static boolean_t 3897*e716630dSMartin Matuska raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt, 3898*e716630dSMartin Matuska dmu_tx_t *tx) 3899*e716630dSMartin Matuska { 3900*e716630dSMartin Matuska spa_t *spa = vd->vdev_spa; 3901*e716630dSMartin Matuska int ashift = vd->vdev_top->vdev_ashift; 3902*e716630dSMartin Matuska uint64_t offset, size; 3903*e716630dSMartin Matuska 3904*e716630dSMartin Matuska if (!range_tree_find_in(rt, 0, vd->vdev_top->vdev_asize, 3905*e716630dSMartin Matuska &offset, &size)) { 3906*e716630dSMartin Matuska return (B_FALSE); 3907*e716630dSMartin Matuska } 3908*e716630dSMartin Matuska 
ASSERT(IS_P2ALIGNED(offset, 1 << ashift)); 3909*e716630dSMartin Matuska ASSERT3U(size, >=, 1 << ashift); 3910*e716630dSMartin Matuska uint64_t length = 1 << ashift; 3911*e716630dSMartin Matuska int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 3912*e716630dSMartin Matuska 3913*e716630dSMartin Matuska uint64_t blkid = offset >> ashift; 3914*e716630dSMartin Matuska 3915*e716630dSMartin Matuska int old_children = vd->vdev_children - 1; 3916*e716630dSMartin Matuska 3917*e716630dSMartin Matuska /* 3918*e716630dSMartin Matuska * We can only progress to the point that writes will not overlap 3919*e716630dSMartin Matuska * with blocks whose progress has not yet been recorded on disk. 3920*e716630dSMartin Matuska * Since partially-copied rows are still read from the old location, 3921*e716630dSMartin Matuska * we need to stop one row before the sector-wise overlap, to prevent 3922*e716630dSMartin Matuska * row-wise overlap. 3923*e716630dSMartin Matuska * 3924*e716630dSMartin Matuska * Note that even if we are skipping over a large unallocated region, 3925*e716630dSMartin Matuska * we can't move the on-disk progress to `offset`, because concurrent 3926*e716630dSMartin Matuska * writes/allocations could still use the currently-unallocated 3927*e716630dSMartin Matuska * region. 3928*e716630dSMartin Matuska */ 3929*e716630dSMartin Matuska uint64_t ubsync_blkid = 3930*e716630dSMartin Matuska RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift; 3931*e716630dSMartin Matuska uint64_t next_overwrite_blkid = ubsync_blkid + 3932*e716630dSMartin Matuska ubsync_blkid / old_children - old_children; 3933*e716630dSMartin Matuska VERIFY3U(next_overwrite_blkid, >, ubsync_blkid); 3934*e716630dSMartin Matuska 3935*e716630dSMartin Matuska if (blkid >= next_overwrite_blkid) { 3936*e716630dSMartin Matuska raidz_reflow_record_progress(vre, 3937*e716630dSMartin Matuska next_overwrite_blkid << ashift, tx); 3938*e716630dSMartin Matuska return (B_TRUE); 3939*e716630dSMartin Matuska } 3940*e716630dSMartin Matuska 3941*e716630dSMartin Matuska range_tree_remove(rt, offset, length); 3942*e716630dSMartin Matuska 3943*e716630dSMartin Matuska raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra), KM_SLEEP); 3944*e716630dSMartin Matuska rra->rra_vre = vre; 3945*e716630dSMartin Matuska rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock, 3946*e716630dSMartin Matuska offset, length, RL_WRITER); 3947*e716630dSMartin Matuska rra->rra_txg = dmu_tx_get_txg(tx); 3948*e716630dSMartin Matuska 3949*e716630dSMartin Matuska raidz_reflow_record_progress(vre, offset + length, tx); 3950*e716630dSMartin Matuska 3951*e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 3952*e716630dSMartin Matuska vre->vre_outstanding_bytes += length; 3953*e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 3954*e716630dSMartin Matuska 3955*e716630dSMartin Matuska /* 3956*e716630dSMartin Matuska * SCL_STATE will be released when the read and write are done, 3957*e716630dSMartin Matuska * by raidz_reflow_write_done(). 
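 * The read from the old location is issued below as a child of the
 * write to the new location; raidz_reflow_read_done() then starts the
 * write by calling zio_nowait() on its unique parent.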
3958*e716630dSMartin Matuska */ 3959*e716630dSMartin Matuska spa_config_enter(spa, SCL_STATE, spa, RW_READER); 3960*e716630dSMartin Matuska 3961*e716630dSMartin Matuska /* check if a replacing vdev was added, if so treat it as an error */ 3962*e716630dSMartin Matuska if (vdev_raidz_expand_child_replacing(vd)) { 3963*e716630dSMartin Matuska zfs_dbgmsg("replacing vdev encountered, reflow paused at " 3964*e716630dSMartin Matuska "offset=%llu txg=%llu", 3965*e716630dSMartin Matuska (long long)rra->rra_lr->lr_offset, 3966*e716630dSMartin Matuska (long long)rra->rra_txg); 3967*e716630dSMartin Matuska 3968*e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 3969*e716630dSMartin Matuska vre->vre_failed_offset = 3970*e716630dSMartin Matuska MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); 3971*e716630dSMartin Matuska cv_signal(&vre->vre_cv); 3972*e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 3973*e716630dSMartin Matuska 3974*e716630dSMartin Matuska /* drop everything we acquired */ 3975*e716630dSMartin Matuska zfs_rangelock_exit(rra->rra_lr); 3976*e716630dSMartin Matuska kmem_free(rra, sizeof (*rra)); 3977*e716630dSMartin Matuska spa_config_exit(spa, SCL_STATE, spa); 3978*e716630dSMartin Matuska return (B_TRUE); 3979*e716630dSMartin Matuska } 3980*e716630dSMartin Matuska 3981*e716630dSMartin Matuska zio_t *pio = spa->spa_txg_zio[txgoff]; 3982*e716630dSMartin Matuska abd_t *abd = abd_alloc_for_io(length, B_FALSE); 3983*e716630dSMartin Matuska zio_t *write_zio = zio_vdev_child_io(pio, NULL, 3984*e716630dSMartin Matuska vd->vdev_child[blkid % vd->vdev_children], 3985*e716630dSMartin Matuska (blkid / vd->vdev_children) << ashift, 3986*e716630dSMartin Matuska abd, length, 3987*e716630dSMartin Matuska ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, 3988*e716630dSMartin Matuska ZIO_FLAG_CANFAIL, 3989*e716630dSMartin Matuska raidz_reflow_write_done, rra); 3990*e716630dSMartin Matuska 3991*e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(write_zio, NULL, 3992*e716630dSMartin Matuska vd->vdev_child[blkid % old_children], 3993*e716630dSMartin Matuska (blkid / old_children) << ashift, 3994*e716630dSMartin Matuska abd, length, 3995*e716630dSMartin Matuska ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 3996*e716630dSMartin Matuska ZIO_FLAG_CANFAIL, 3997*e716630dSMartin Matuska raidz_reflow_read_done, rra)); 3998*e716630dSMartin Matuska 3999*e716630dSMartin Matuska return (B_FALSE); 4000*e716630dSMartin Matuska } 4001*e716630dSMartin Matuska 4002*e716630dSMartin Matuska /* 4003*e716630dSMartin Matuska * For testing (ztest specific) 4004*e716630dSMartin Matuska */ 4005*e716630dSMartin Matuska static void 4006*e716630dSMartin Matuska raidz_expand_pause(uint_t pause_point) 4007*e716630dSMartin Matuska { 4008*e716630dSMartin Matuska while (raidz_expand_pause_point != 0 && 4009*e716630dSMartin Matuska raidz_expand_pause_point <= pause_point) 4010*e716630dSMartin Matuska delay(hz); 4011*e716630dSMartin Matuska } 4012*e716630dSMartin Matuska 4013*e716630dSMartin Matuska static void 4014*e716630dSMartin Matuska raidz_scratch_child_done(zio_t *zio) 4015*e716630dSMartin Matuska { 4016*e716630dSMartin Matuska zio_t *pio = zio->io_private; 4017*e716630dSMartin Matuska 4018*e716630dSMartin Matuska mutex_enter(&pio->io_lock); 4019*e716630dSMartin Matuska pio->io_error = zio_worst_error(pio->io_error, zio->io_error); 4020*e716630dSMartin Matuska mutex_exit(&pio->io_lock); 4021*e716630dSMartin Matuska } 4022*e716630dSMartin Matuska 4023*e716630dSMartin Matuska /* 4024*e716630dSMartin Matuska * Reflow the beginning portion of the vdev 
into an intermediate scratch area 4025*e716630dSMartin Matuska * in memory and on disk. This operation must be persisted on disk before we 4026*e716630dSMartin Matuska * proceed to overwrite the beginning portion with the reflowed data. 4027*e716630dSMartin Matuska * 4028*e716630dSMartin Matuska * This multi-step task can fail to complete if disk errors are encountered 4029*e716630dSMartin Matuska * and we can return here after a pause (waiting for disk to become healthy). 4030*e716630dSMartin Matuska */ 4031*e716630dSMartin Matuska static void 4032*e716630dSMartin Matuska raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) 4033*e716630dSMartin Matuska { 4034*e716630dSMartin Matuska vdev_raidz_expand_t *vre = arg; 4035*e716630dSMartin Matuska spa_t *spa = dmu_tx_pool(tx)->dp_spa; 4036*e716630dSMartin Matuska zio_t *pio; 4037*e716630dSMartin Matuska int error; 4038*e716630dSMartin Matuska 4039*e716630dSMartin Matuska spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4040*e716630dSMartin Matuska vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4041*e716630dSMartin Matuska int ashift = raidvd->vdev_ashift; 4042*e716630dSMartin Matuska uint64_t write_size = P2ALIGN(VDEV_BOOT_SIZE, 1 << ashift); 4043*e716630dSMartin Matuska uint64_t logical_size = write_size * raidvd->vdev_children; 4044*e716630dSMartin Matuska uint64_t read_size = 4045*e716630dSMartin Matuska P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)), 4046*e716630dSMartin Matuska 1 << ashift); 4047*e716630dSMartin Matuska 4048*e716630dSMartin Matuska /* 4049*e716630dSMartin Matuska * The scratch space must be large enough to get us to the point 4050*e716630dSMartin Matuska * that one row does not overlap itself when moved. This is checked 4051*e716630dSMartin Matuska * by vdev_raidz_attach_check(). 4052*e716630dSMartin Matuska */ 4053*e716630dSMartin Matuska VERIFY3U(write_size, >=, raidvd->vdev_children << ashift); 4054*e716630dSMartin Matuska VERIFY3U(write_size, <=, VDEV_BOOT_SIZE); 4055*e716630dSMartin Matuska VERIFY3U(write_size, <=, read_size); 4056*e716630dSMartin Matuska 4057*e716630dSMartin Matuska zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, 4058*e716630dSMartin Matuska 0, logical_size, RL_WRITER); 4059*e716630dSMartin Matuska 4060*e716630dSMartin Matuska abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), 4061*e716630dSMartin Matuska KM_SLEEP); 4062*e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) { 4063*e716630dSMartin Matuska abds[i] = abd_alloc_linear(read_size, B_FALSE); 4064*e716630dSMartin Matuska } 4065*e716630dSMartin Matuska 4066*e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1); 4067*e716630dSMartin Matuska 4068*e716630dSMartin Matuska /* 4069*e716630dSMartin Matuska * If we have already written the scratch area then we must read from 4070*e716630dSMartin Matuska * there, since new writes were redirected there while we were paused 4071*e716630dSMartin Matuska * or the original location may have been partially overwritten with 4072*e716630dSMartin Matuska * reflowed data. 4073*e716630dSMartin Matuska */ 4074*e716630dSMartin Matuska if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) { 4075*e716630dSMartin Matuska VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size); 4076*e716630dSMartin Matuska /* 4077*e716630dSMartin Matuska * Read from scratch space. 
4078*e716630dSMartin Matuska */ 4079*e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4080*e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) { 4081*e716630dSMartin Matuska /* 4082*e716630dSMartin Matuska * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE 4083*e716630dSMartin Matuska * to the offset to calculate the physical offset to 4084*e716630dSMartin Matuska * write to. Passing in a negative offset makes us 4085*e716630dSMartin Matuska * access the scratch area. 4086*e716630dSMartin Matuska */ 4087*e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, 4088*e716630dSMartin Matuska raidvd->vdev_child[i], 4089*e716630dSMartin Matuska VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], 4090*e716630dSMartin Matuska write_size, ZIO_TYPE_READ, ZIO_PRIORITY_ASYNC_READ, 4091*e716630dSMartin Matuska ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); 4092*e716630dSMartin Matuska } 4093*e716630dSMartin Matuska error = zio_wait(pio); 4094*e716630dSMartin Matuska if (error != 0) { 4095*e716630dSMartin Matuska zfs_dbgmsg("reflow: error %d reading scratch location", 4096*e716630dSMartin Matuska error); 4097*e716630dSMartin Matuska goto io_error_exit; 4098*e716630dSMartin Matuska } 4099*e716630dSMartin Matuska goto overwrite; 4100*e716630dSMartin Matuska } 4101*e716630dSMartin Matuska 4102*e716630dSMartin Matuska /* 4103*e716630dSMartin Matuska * Read from original location. 4104*e716630dSMartin Matuska */ 4105*e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4106*e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children - 1; i++) { 4107*e716630dSMartin Matuska ASSERT0(vdev_is_dead(raidvd->vdev_child[i])); 4108*e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4109*e716630dSMartin Matuska 0, abds[i], read_size, ZIO_TYPE_READ, 4110*e716630dSMartin Matuska ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, 4111*e716630dSMartin Matuska raidz_scratch_child_done, pio)); 4112*e716630dSMartin Matuska } 4113*e716630dSMartin Matuska error = zio_wait(pio); 4114*e716630dSMartin Matuska if (error != 0) { 4115*e716630dSMartin Matuska zfs_dbgmsg("reflow: error %d reading original location", error); 4116*e716630dSMartin Matuska io_error_exit: 4117*e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) 4118*e716630dSMartin Matuska abd_free(abds[i]); 4119*e716630dSMartin Matuska kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); 4120*e716630dSMartin Matuska zfs_rangelock_exit(lr); 4121*e716630dSMartin Matuska spa_config_exit(spa, SCL_STATE, FTAG); 4122*e716630dSMartin Matuska return; 4123*e716630dSMartin Matuska } 4124*e716630dSMartin Matuska 4125*e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2); 4126*e716630dSMartin Matuska 4127*e716630dSMartin Matuska /* 4128*e716630dSMartin Matuska * Reflow in memory. 
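 * For example, with vdev_children == 5 (old width 4): sector 4 moves
 * from old child 0, row 1 to new child 4, row 0, and sector 7 moves
 * from old child 3, row 1 to new child 2, row 1. Sectors 0 through 3
 * already sit at the same (child, offset) in both layouts, which is
 * why the loop below starts at vdev_children - 1.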
4129*e716630dSMartin Matuska */ 4130*e716630dSMartin Matuska uint64_t logical_sectors = logical_size >> ashift; 4131*e716630dSMartin Matuska for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) { 4132*e716630dSMartin Matuska int oldchild = i % (raidvd->vdev_children - 1); 4133*e716630dSMartin Matuska uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift; 4134*e716630dSMartin Matuska 4135*e716630dSMartin Matuska int newchild = i % raidvd->vdev_children; 4136*e716630dSMartin Matuska uint64_t newoff = (i / raidvd->vdev_children) << ashift; 4137*e716630dSMartin Matuska 4138*e716630dSMartin Matuska /* a single sector should not be copying over itself */ 4139*e716630dSMartin Matuska ASSERT(!(newchild == oldchild && newoff == oldoff)); 4140*e716630dSMartin Matuska 4141*e716630dSMartin Matuska abd_copy_off(abds[newchild], abds[oldchild], 4142*e716630dSMartin Matuska newoff, oldoff, 1 << ashift); 4143*e716630dSMartin Matuska } 4144*e716630dSMartin Matuska 4145*e716630dSMartin Matuska /* 4146*e716630dSMartin Matuska * Verify that we filled in everything we intended to (write_size on 4147*e716630dSMartin Matuska * each child). 4148*e716630dSMartin Matuska */ 4149*e716630dSMartin Matuska VERIFY0(logical_sectors % raidvd->vdev_children); 4150*e716630dSMartin Matuska VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==, 4151*e716630dSMartin Matuska write_size); 4152*e716630dSMartin Matuska 4153*e716630dSMartin Matuska /* 4154*e716630dSMartin Matuska * Write to scratch location (boot area). 4155*e716630dSMartin Matuska */ 4156*e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4157*e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) { 4158*e716630dSMartin Matuska /* 4159*e716630dSMartin Matuska * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to 4160*e716630dSMartin Matuska * the offset to calculate the physical offset to write to. 4161*e716630dSMartin Matuska * Passing in a negative offset lets us access the boot area. 4162*e716630dSMartin Matuska */ 4163*e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4164*e716630dSMartin Matuska VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], 4165*e716630dSMartin Matuska write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, 4166*e716630dSMartin Matuska ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); 4167*e716630dSMartin Matuska } 4168*e716630dSMartin Matuska error = zio_wait(pio); 4169*e716630dSMartin Matuska if (error != 0) { 4170*e716630dSMartin Matuska zfs_dbgmsg("reflow: error %d writing scratch location", error); 4171*e716630dSMartin Matuska goto io_error_exit; 4172*e716630dSMartin Matuska } 4173*e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, 0); 4174*e716630dSMartin Matuska zio_flush(pio, raidvd); 4175*e716630dSMartin Matuska zio_wait(pio); 4176*e716630dSMartin Matuska 4177*e716630dSMartin Matuska zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area", 4178*e716630dSMartin Matuska (long long)logical_size); 4179*e716630dSMartin Matuska 4180*e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3); 4181*e716630dSMartin Matuska 4182*e716630dSMartin Matuska /* 4183*e716630dSMartin Matuska * Update uberblock to indicate that scratch space is valid. This is 4184*e716630dSMartin Matuska * needed because after this point, the real location may be 4185*e716630dSMartin Matuska * overwritten. 
If we crash, we need to get the data from the 4186*e716630dSMartin Matuska * scratch space, rather than the real location. 4187*e716630dSMartin Matuska * 4188*e716630dSMartin Matuska * Note: ub_timestamp is bumped so that vdev_uberblock_compare() 4189*e716630dSMartin Matuska * will prefer this uberblock. 4190*e716630dSMartin Matuska */ 4191*e716630dSMartin Matuska RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size); 4192*e716630dSMartin Matuska spa->spa_ubsync.ub_timestamp++; 4193*e716630dSMartin Matuska ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, 4194*e716630dSMartin Matuska &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); 4195*e716630dSMartin Matuska if (spa_multihost(spa)) 4196*e716630dSMartin Matuska mmp_update_uberblock(spa, &spa->spa_ubsync); 4197*e716630dSMartin Matuska 4198*e716630dSMartin Matuska zfs_dbgmsg("reflow: uberblock updated " 4199*e716630dSMartin Matuska "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)", 4200*e716630dSMartin Matuska (long long)spa->spa_ubsync.ub_txg, 4201*e716630dSMartin Matuska (long long)logical_size, 4202*e716630dSMartin Matuska (long long)spa->spa_ubsync.ub_timestamp); 4203*e716630dSMartin Matuska 4204*e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID); 4205*e716630dSMartin Matuska 4206*e716630dSMartin Matuska /* 4207*e716630dSMartin Matuska * Overwrite with reflow'ed data. 4208*e716630dSMartin Matuska */ 4209*e716630dSMartin Matuska overwrite: 4210*e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4211*e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) { 4212*e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4213*e716630dSMartin Matuska 0, abds[i], write_size, ZIO_TYPE_WRITE, 4214*e716630dSMartin Matuska ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, 4215*e716630dSMartin Matuska raidz_scratch_child_done, pio)); 4216*e716630dSMartin Matuska } 4217*e716630dSMartin Matuska error = zio_wait(pio); 4218*e716630dSMartin Matuska if (error != 0) { 4219*e716630dSMartin Matuska /* 4220*e716630dSMartin Matuska * When we exit early here and drop the range lock, new 4221*e716630dSMartin Matuska * writes will go into the scratch area so we'll need to 4222*e716630dSMartin Matuska * read from there when we return after pausing. 4223*e716630dSMartin Matuska */ 4224*e716630dSMartin Matuska zfs_dbgmsg("reflow: error %d writing real location", error); 4225*e716630dSMartin Matuska /* 4226*e716630dSMartin Matuska * Update the uberblock that is written when this txg completes. 
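 * (Unlike the vdev_uberblock_sync_list() calls elsewhere in this
 * function, this only updates the in-memory spa_uberblock, so the
 * SCRATCH_VALID state is carried forward when the current txg syncs.)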
4227*e716630dSMartin Matuska */ 4228*e716630dSMartin Matuska RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID, 4229*e716630dSMartin Matuska logical_size); 4230*e716630dSMartin Matuska goto io_error_exit; 4231*e716630dSMartin Matuska } 4232*e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, 0); 4233*e716630dSMartin Matuska zio_flush(pio, raidvd); 4234*e716630dSMartin Matuska zio_wait(pio); 4235*e716630dSMartin Matuska 4236*e716630dSMartin Matuska zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location", 4237*e716630dSMartin Matuska (long long)logical_size); 4238*e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) 4239*e716630dSMartin Matuska abd_free(abds[i]); 4240*e716630dSMartin Matuska kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); 4241*e716630dSMartin Matuska 4242*e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED); 4243*e716630dSMartin Matuska 4244*e716630dSMartin Matuska /* 4245*e716630dSMartin Matuska * Update uberblock to indicate that the initial part has been 4246*e716630dSMartin Matuska * reflow'ed. This is needed because after this point (when we exit 4247*e716630dSMartin Matuska * the rangelock), we allow regular writes to this region, which will 4248*e716630dSMartin Matuska * be written to the new location only (because reflow_offset_next == 4249*e716630dSMartin Matuska * reflow_offset_synced). If we crashed and re-copied from the 4250*e716630dSMartin Matuska * scratch space, we would lose the regular writes. 4251*e716630dSMartin Matuska */ 4252*e716630dSMartin Matuska RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED, 4253*e716630dSMartin Matuska logical_size); 4254*e716630dSMartin Matuska spa->spa_ubsync.ub_timestamp++; 4255*e716630dSMartin Matuska ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, 4256*e716630dSMartin Matuska &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); 4257*e716630dSMartin Matuska if (spa_multihost(spa)) 4258*e716630dSMartin Matuska mmp_update_uberblock(spa, &spa->spa_ubsync); 4259*e716630dSMartin Matuska 4260*e716630dSMartin Matuska zfs_dbgmsg("reflow: uberblock updated " 4261*e716630dSMartin Matuska "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", 4262*e716630dSMartin Matuska (long long)spa->spa_ubsync.ub_txg, 4263*e716630dSMartin Matuska (long long)logical_size, 4264*e716630dSMartin Matuska (long long)spa->spa_ubsync.ub_timestamp); 4265*e716630dSMartin Matuska 4266*e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1); 4267*e716630dSMartin Matuska 4268*e716630dSMartin Matuska /* 4269*e716630dSMartin Matuska * Update progress. 
4270*e716630dSMartin Matuska */ 4271*e716630dSMartin Matuska vre->vre_offset = logical_size; 4272*e716630dSMartin Matuska zfs_rangelock_exit(lr); 4273*e716630dSMartin Matuska spa_config_exit(spa, SCL_STATE, FTAG); 4274*e716630dSMartin Matuska 4275*e716630dSMartin Matuska int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 4276*e716630dSMartin Matuska vre->vre_offset_pertxg[txgoff] = vre->vre_offset; 4277*e716630dSMartin Matuska vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; 4278*e716630dSMartin Matuska /* 4279*e716630dSMartin Matuska * Note - raidz_reflow_sync() will update the uberblock state to 4280*e716630dSMartin Matuska * RRSS_SCRATCH_INVALID_SYNCED_REFLOW 4281*e716630dSMartin Matuska */ 4282*e716630dSMartin Matuska raidz_reflow_sync(spa, tx); 4283*e716630dSMartin Matuska 4284*e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2); 4285*e716630dSMartin Matuska } 4286*e716630dSMartin Matuska 4287*e716630dSMartin Matuska /* 4288*e716630dSMartin Matuska * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work 4289*e716630dSMartin Matuska * here. No other i/o can be in progress, so we don't need the vre_rangelock. 4290*e716630dSMartin Matuska */ 4291*e716630dSMartin Matuska void 4292*e716630dSMartin Matuska vdev_raidz_reflow_copy_scratch(spa_t *spa) 4293*e716630dSMartin Matuska { 4294*e716630dSMartin Matuska vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4295*e716630dSMartin Matuska uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock); 4296*e716630dSMartin Matuska ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID); 4297*e716630dSMartin Matuska 4298*e716630dSMartin Matuska spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4299*e716630dSMartin Matuska vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4300*e716630dSMartin Matuska ASSERT0(logical_size % raidvd->vdev_children); 4301*e716630dSMartin Matuska uint64_t write_size = logical_size / raidvd->vdev_children; 4302*e716630dSMartin Matuska 4303*e716630dSMartin Matuska zio_t *pio; 4304*e716630dSMartin Matuska 4305*e716630dSMartin Matuska /* 4306*e716630dSMartin Matuska * Read from scratch space. 4307*e716630dSMartin Matuska */ 4308*e716630dSMartin Matuska abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), 4309*e716630dSMartin Matuska KM_SLEEP); 4310*e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) { 4311*e716630dSMartin Matuska abds[i] = abd_alloc_linear(write_size, B_FALSE); 4312*e716630dSMartin Matuska } 4313*e716630dSMartin Matuska 4314*e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, 0); 4315*e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) { 4316*e716630dSMartin Matuska /* 4317*e716630dSMartin Matuska * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to 4318*e716630dSMartin Matuska * the offset to calculate the physical offset to write to. 4319*e716630dSMartin Matuska * Passing in a negative offset lets us access the boot area. 
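 * (The VDEV_LABEL_START_SIZE that zio_vdev_child_io() adds and the
 * negative offset cancel out, so the child I/O lands at physical
 * offset VDEV_BOOT_OFFSET, i.e. in the reserved boot area.)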
4320*e716630dSMartin Matuska */ 4321*e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4322*e716630dSMartin Matuska VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], 4323*e716630dSMartin Matuska write_size, ZIO_TYPE_READ, 4324*e716630dSMartin Matuska ZIO_PRIORITY_ASYNC_READ, 0, 4325*e716630dSMartin Matuska raidz_scratch_child_done, pio)); 4326*e716630dSMartin Matuska } 4327*e716630dSMartin Matuska zio_wait(pio); 4328*e716630dSMartin Matuska 4329*e716630dSMartin Matuska /* 4330*e716630dSMartin Matuska * Overwrite real location with reflow'ed data. 4331*e716630dSMartin Matuska */ 4332*e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, 0); 4333*e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) { 4334*e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4335*e716630dSMartin Matuska 0, abds[i], write_size, ZIO_TYPE_WRITE, 4336*e716630dSMartin Matuska ZIO_PRIORITY_ASYNC_WRITE, 0, 4337*e716630dSMartin Matuska raidz_scratch_child_done, pio)); 4338*e716630dSMartin Matuska } 4339*e716630dSMartin Matuska zio_wait(pio); 4340*e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, 0); 4341*e716630dSMartin Matuska zio_flush(pio, raidvd); 4342*e716630dSMartin Matuska zio_wait(pio); 4343*e716630dSMartin Matuska 4344*e716630dSMartin Matuska zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) " 4345*e716630dSMartin Matuska "to real location", (long long)logical_size); 4346*e716630dSMartin Matuska 4347*e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) 4348*e716630dSMartin Matuska abd_free(abds[i]); 4349*e716630dSMartin Matuska kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); 4350*e716630dSMartin Matuska 4351*e716630dSMartin Matuska /* 4352*e716630dSMartin Matuska * Update uberblock. 
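 * As above, write it out directly and bump ub_timestamp so that the new
 * uberblock (same txg, larger timestamp) is preferred on the next
 * import and the scratch copy is not repeated.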
4353*e716630dSMartin Matuska */ 4354*e716630dSMartin Matuska RAIDZ_REFLOW_SET(&spa->spa_ubsync, 4355*e716630dSMartin Matuska RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size); 4356*e716630dSMartin Matuska spa->spa_ubsync.ub_timestamp++; 4357*e716630dSMartin Matuska VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, 4358*e716630dSMartin Matuska &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); 4359*e716630dSMartin Matuska if (spa_multihost(spa)) 4360*e716630dSMartin Matuska mmp_update_uberblock(spa, &spa->spa_ubsync); 4361*e716630dSMartin Matuska 4362*e716630dSMartin Matuska zfs_dbgmsg("reflow recovery: uberblock updated " 4363*e716630dSMartin Matuska "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", 4364*e716630dSMartin Matuska (long long)spa->spa_ubsync.ub_txg, 4365*e716630dSMartin Matuska (long long)logical_size, 4366*e716630dSMartin Matuska (long long)spa->spa_ubsync.ub_timestamp); 4367*e716630dSMartin Matuska 4368*e716630dSMartin Matuska dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, 4369*e716630dSMartin Matuska spa_first_txg(spa)); 4370*e716630dSMartin Matuska int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 4371*e716630dSMartin Matuska vre->vre_offset = logical_size; 4372*e716630dSMartin Matuska vre->vre_offset_pertxg[txgoff] = vre->vre_offset; 4373*e716630dSMartin Matuska vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; 4374*e716630dSMartin Matuska /* 4375*e716630dSMartin Matuska * Note that raidz_reflow_sync() will update the uberblock once more 4376*e716630dSMartin Matuska */ 4377*e716630dSMartin Matuska raidz_reflow_sync(spa, tx); 4378*e716630dSMartin Matuska 4379*e716630dSMartin Matuska dmu_tx_commit(tx); 4380*e716630dSMartin Matuska 4381*e716630dSMartin Matuska spa_config_exit(spa, SCL_STATE, FTAG); 4382*e716630dSMartin Matuska } 4383*e716630dSMartin Matuska 4384*e716630dSMartin Matuska static boolean_t 4385*e716630dSMartin Matuska spa_raidz_expand_thread_check(void *arg, zthr_t *zthr) 4386*e716630dSMartin Matuska { 4387*e716630dSMartin Matuska (void) zthr; 4388*e716630dSMartin Matuska spa_t *spa = arg; 4389*e716630dSMartin Matuska 4390*e716630dSMartin Matuska return (spa->spa_raidz_expand != NULL && 4391*e716630dSMartin Matuska !spa->spa_raidz_expand->vre_waiting_for_resilver); 4392*e716630dSMartin Matuska } 4393*e716630dSMartin Matuska 4394*e716630dSMartin Matuska /* 4395*e716630dSMartin Matuska * RAIDZ expansion background thread 4396*e716630dSMartin Matuska * 4397*e716630dSMartin Matuska * Can be called multiple times if the reflow is paused 4398*e716630dSMartin Matuska */ 4399*e716630dSMartin Matuska static void 4400*e716630dSMartin Matuska spa_raidz_expand_thread(void *arg, zthr_t *zthr) 4401*e716630dSMartin Matuska { 4402*e716630dSMartin Matuska spa_t *spa = arg; 4403*e716630dSMartin Matuska vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4404*e716630dSMartin Matuska 4405*e716630dSMartin Matuska if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) 4406*e716630dSMartin Matuska vre->vre_offset = 0; 4407*e716630dSMartin Matuska else 4408*e716630dSMartin Matuska vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync); 4409*e716630dSMartin Matuska 4410*e716630dSMartin Matuska /* Reflow the beginning portion using the scratch area */ 4411*e716630dSMartin Matuska if (vre->vre_offset == 0) { 4412*e716630dSMartin Matuska VERIFY0(dsl_sync_task(spa_name(spa), 4413*e716630dSMartin Matuska NULL, raidz_reflow_scratch_sync, 4414*e716630dSMartin Matuska vre, 0, ZFS_SPACE_CHECK_NONE)); 4415*e716630dSMartin Matuska 4416*e716630dSMartin Matuska /* if we
encountered errors then pause */ 4417*e716630dSMartin Matuska if (vre->vre_offset == 0) { 4418*e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 4419*e716630dSMartin Matuska vre->vre_waiting_for_resilver = B_TRUE; 4420*e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 4421*e716630dSMartin Matuska return; 4422*e716630dSMartin Matuska } 4423*e716630dSMartin Matuska } 4424*e716630dSMartin Matuska 4425*e716630dSMartin Matuska spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4426*e716630dSMartin Matuska vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4427*e716630dSMartin Matuska 4428*e716630dSMartin Matuska uint64_t guid = raidvd->vdev_guid; 4429*e716630dSMartin Matuska 4430*e716630dSMartin Matuska /* Iterate over all the remaining metaslabs */ 4431*e716630dSMartin Matuska for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift; 4432*e716630dSMartin Matuska i < raidvd->vdev_ms_count && 4433*e716630dSMartin Matuska !zthr_iscancelled(zthr) && 4434*e716630dSMartin Matuska vre->vre_failed_offset == UINT64_MAX; i++) { 4435*e716630dSMartin Matuska metaslab_t *msp = raidvd->vdev_ms[i]; 4436*e716630dSMartin Matuska 4437*e716630dSMartin Matuska metaslab_disable(msp); 4438*e716630dSMartin Matuska mutex_enter(&msp->ms_lock); 4439*e716630dSMartin Matuska 4440*e716630dSMartin Matuska /* 4441*e716630dSMartin Matuska * The metaslab may be newly created (for the expanded 4442*e716630dSMartin Matuska * space), in which case its trees won't exist yet, 4443*e716630dSMartin Matuska * so we need to bail out early. 4444*e716630dSMartin Matuska */ 4445*e716630dSMartin Matuska if (msp->ms_new) { 4446*e716630dSMartin Matuska mutex_exit(&msp->ms_lock); 4447*e716630dSMartin Matuska metaslab_enable(msp, B_FALSE, B_FALSE); 4448*e716630dSMartin Matuska continue; 4449*e716630dSMartin Matuska } 4450*e716630dSMartin Matuska 4451*e716630dSMartin Matuska VERIFY0(metaslab_load(msp)); 4452*e716630dSMartin Matuska 4453*e716630dSMartin Matuska /* 4454*e716630dSMartin Matuska * We want to copy everything except the free (allocatable) 4455*e716630dSMartin Matuska * space. Note that there may be a little bit more free 4456*e716630dSMartin Matuska * space (e.g. in ms_defer), and it's fine to copy that too. 4457*e716630dSMartin Matuska */ 4458*e716630dSMartin Matuska range_tree_t *rt = range_tree_create(NULL, RANGE_SEG64, 4459*e716630dSMartin Matuska NULL, 0, 0); 4460*e716630dSMartin Matuska range_tree_add(rt, msp->ms_start, msp->ms_size); 4461*e716630dSMartin Matuska range_tree_walk(msp->ms_allocatable, range_tree_remove, rt); 4462*e716630dSMartin Matuska mutex_exit(&msp->ms_lock); 4463*e716630dSMartin Matuska 4464*e716630dSMartin Matuska /* 4465*e716630dSMartin Matuska * Force the last sector of each metaslab to be copied. This 4466*e716630dSMartin Matuska * ensures that we advance the on-disk progress to the end of 4467*e716630dSMartin Matuska * this metaslab while the metaslab is disabled. Otherwise, we 4468*e716630dSMartin Matuska * could move past this metaslab without advancing the on-disk 4469*e716630dSMartin Matuska * progress, and then an allocation to this metaslab would not 4470*e716630dSMartin Matuska * be copied. 
4471*e716630dSMartin Matuska */ 4472*e716630dSMartin Matuska int sectorsz = 1 << raidvd->vdev_ashift; 4473*e716630dSMartin Matuska uint64_t ms_last_offset = msp->ms_start + 4474*e716630dSMartin Matuska msp->ms_size - sectorsz; 4475*e716630dSMartin Matuska if (!range_tree_contains(rt, ms_last_offset, sectorsz)) { 4476*e716630dSMartin Matuska range_tree_add(rt, ms_last_offset, sectorsz); 4477*e716630dSMartin Matuska } 4478*e716630dSMartin Matuska 4479*e716630dSMartin Matuska /* 4480*e716630dSMartin Matuska * When we are resuming from a paused expansion (i.e. 4481*e716630dSMartin Matuska * when importing a pool with an expansion in progress), 4482*e716630dSMartin Matuska * discard any state that we have already processed. 4483*e716630dSMartin Matuska */ 4484*e716630dSMartin Matuska range_tree_clear(rt, 0, vre->vre_offset); 4485*e716630dSMartin Matuska 4486*e716630dSMartin Matuska while (!zthr_iscancelled(zthr) && 4487*e716630dSMartin Matuska !range_tree_is_empty(rt) && 4488*e716630dSMartin Matuska vre->vre_failed_offset == UINT64_MAX) { 4489*e716630dSMartin Matuska 4490*e716630dSMartin Matuska /* 4491*e716630dSMartin Matuska * We need to periodically drop the config lock so that 4492*e716630dSMartin Matuska * writers can get in. Additionally, we can't wait 4493*e716630dSMartin Matuska * for a txg to sync while holding a config lock 4494*e716630dSMartin Matuska * (since a waiting writer could cause a 3-way deadlock 4495*e716630dSMartin Matuska * with the sync thread, which also gets a config 4496*e716630dSMartin Matuska * lock for reader). So we can't hold the config lock 4497*e716630dSMartin Matuska * while calling dmu_tx_assign(). 4498*e716630dSMartin Matuska */ 4499*e716630dSMartin Matuska spa_config_exit(spa, SCL_CONFIG, FTAG); 4500*e716630dSMartin Matuska 4501*e716630dSMartin Matuska /* 4502*e716630dSMartin Matuska * If requested, pause the reflow when the amount 4503*e716630dSMartin Matuska * specified by raidz_expand_max_reflow_bytes is reached 4504*e716630dSMartin Matuska * 4505*e716630dSMartin Matuska * This pause is only used during testing or debugging. 4506*e716630dSMartin Matuska */ 4507*e716630dSMartin Matuska while (raidz_expand_max_reflow_bytes != 0 && 4508*e716630dSMartin Matuska raidz_expand_max_reflow_bytes <= 4509*e716630dSMartin Matuska vre->vre_bytes_copied && !zthr_iscancelled(zthr)) { 4510*e716630dSMartin Matuska delay(hz); 4511*e716630dSMartin Matuska } 4512*e716630dSMartin Matuska 4513*e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 4514*e716630dSMartin Matuska while (vre->vre_outstanding_bytes > 4515*e716630dSMartin Matuska raidz_expand_max_copy_bytes) { 4516*e716630dSMartin Matuska cv_wait(&vre->vre_cv, &vre->vre_lock); 4517*e716630dSMartin Matuska } 4518*e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 4519*e716630dSMartin Matuska 4520*e716630dSMartin Matuska dmu_tx_t *tx = 4521*e716630dSMartin Matuska dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 4522*e716630dSMartin Matuska 4523*e716630dSMartin Matuska VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); 4524*e716630dSMartin Matuska uint64_t txg = dmu_tx_get_txg(tx); 4525*e716630dSMartin Matuska 4526*e716630dSMartin Matuska /* 4527*e716630dSMartin Matuska * Reacquire the vdev_config lock. Theoretically, the 4528*e716630dSMartin Matuska * vdev_t that we're expanding may have changed.
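 * That is why we look raidvd up again by vre_vdev_id below rather
 * than reusing the pointer cached before the lock was dropped.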
4529*e716630dSMartin Matuska */ 4530*e716630dSMartin Matuska spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4531*e716630dSMartin Matuska raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4532*e716630dSMartin Matuska 4533*e716630dSMartin Matuska boolean_t needsync = 4534*e716630dSMartin Matuska raidz_reflow_impl(raidvd, vre, rt, tx); 4535*e716630dSMartin Matuska 4536*e716630dSMartin Matuska dmu_tx_commit(tx); 4537*e716630dSMartin Matuska 4538*e716630dSMartin Matuska if (needsync) { 4539*e716630dSMartin Matuska spa_config_exit(spa, SCL_CONFIG, FTAG); 4540*e716630dSMartin Matuska txg_wait_synced(spa->spa_dsl_pool, txg); 4541*e716630dSMartin Matuska spa_config_enter(spa, SCL_CONFIG, FTAG, 4542*e716630dSMartin Matuska RW_READER); 4543*e716630dSMartin Matuska } 4544*e716630dSMartin Matuska } 4545*e716630dSMartin Matuska 4546*e716630dSMartin Matuska spa_config_exit(spa, SCL_CONFIG, FTAG); 4547*e716630dSMartin Matuska 4548*e716630dSMartin Matuska metaslab_enable(msp, B_FALSE, B_FALSE); 4549*e716630dSMartin Matuska range_tree_vacate(rt, NULL, NULL); 4550*e716630dSMartin Matuska range_tree_destroy(rt); 4551*e716630dSMartin Matuska 4552*e716630dSMartin Matuska spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4553*e716630dSMartin Matuska raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4554*e716630dSMartin Matuska } 4555*e716630dSMartin Matuska 4556*e716630dSMartin Matuska spa_config_exit(spa, SCL_CONFIG, FTAG); 4557*e716630dSMartin Matuska 4558*e716630dSMartin Matuska /* 4559*e716630dSMartin Matuska * The txg_wait_synced() here ensures that all reflow zio's have 4560*e716630dSMartin Matuska * completed, and vre_failed_offset has been set if necessary. It 4561*e716630dSMartin Matuska * also ensures that the progress of the last raidz_reflow_sync() is 4562*e716630dSMartin Matuska * written to disk before raidz_reflow_complete_sync() changes the 4563*e716630dSMartin Matuska * in-memory vre_state. vdev_raidz_io_start() uses vre_state to 4564*e716630dSMartin Matuska * determine if a reflow is in progress, in which case we may need to 4565*e716630dSMartin Matuska * write to both old and new locations. Therefore we can only change 4566*e716630dSMartin Matuska * vre_state once this is not necessary, which is once the on-disk 4567*e716630dSMartin Matuska * progress (in spa_ubsync) has been set past any possible writes (to 4568*e716630dSMartin Matuska * the end of the last metaslab). 4569*e716630dSMartin Matuska */ 4570*e716630dSMartin Matuska txg_wait_synced(spa->spa_dsl_pool, 0); 4571*e716630dSMartin Matuska 4572*e716630dSMartin Matuska if (!zthr_iscancelled(zthr) && 4573*e716630dSMartin Matuska vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) { 4574*e716630dSMartin Matuska /* 4575*e716630dSMartin Matuska * We are not being canceled or paused, so the reflow must be 4576*e716630dSMartin Matuska * complete. In that case also mark it as completed on disk. 4577*e716630dSMartin Matuska */ 4578*e716630dSMartin Matuska ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX); 4579*e716630dSMartin Matuska VERIFY0(dsl_sync_task(spa_name(spa), NULL, 4580*e716630dSMartin Matuska raidz_reflow_complete_sync, spa, 4581*e716630dSMartin Matuska 0, ZFS_SPACE_CHECK_NONE)); 4582*e716630dSMartin Matuska (void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL); 4583*e716630dSMartin Matuska } else { 4584*e716630dSMartin Matuska /* 4585*e716630dSMartin Matuska * Wait for all copy zio's to complete and for all the 4586*e716630dSMartin Matuska * raidz_reflow_sync() synctasks to be run. 
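 * If a copy failed, progress is rewound to vre_failed_offset below and
 * vre_waiting_for_resilver is set; raidz_dtl_reassessed() will wake
 * this zthr again once the resilver has completed.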
4587*e716630dSMartin Matuska */ 4588*e716630dSMartin Matuska spa_history_log_internal(spa, "reflow pause", 4589*e716630dSMartin Matuska NULL, "offset=%llu failed_offset=%lld", 4590*e716630dSMartin Matuska (long long)vre->vre_offset, 4591*e716630dSMartin Matuska (long long)vre->vre_failed_offset); 4592*e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 4593*e716630dSMartin Matuska if (vre->vre_failed_offset != UINT64_MAX) { 4594*e716630dSMartin Matuska /* 4595*e716630dSMartin Matuska * Reset progress so that we will retry everything 4596*e716630dSMartin Matuska * after the point that something failed. 4597*e716630dSMartin Matuska */ 4598*e716630dSMartin Matuska vre->vre_offset = vre->vre_failed_offset; 4599*e716630dSMartin Matuska vre->vre_failed_offset = UINT64_MAX; 4600*e716630dSMartin Matuska vre->vre_waiting_for_resilver = B_TRUE; 4601*e716630dSMartin Matuska } 4602*e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 4603*e716630dSMartin Matuska } 4604*e716630dSMartin Matuska } 4605*e716630dSMartin Matuska 4606*e716630dSMartin Matuska void 4607*e716630dSMartin Matuska spa_start_raidz_expansion_thread(spa_t *spa) 4608*e716630dSMartin Matuska { 4609*e716630dSMartin Matuska ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL); 4610*e716630dSMartin Matuska spa->spa_raidz_expand_zthr = zthr_create("raidz_expand", 4611*e716630dSMartin Matuska spa_raidz_expand_thread_check, spa_raidz_expand_thread, 4612*e716630dSMartin Matuska spa, defclsyspri); 4613*e716630dSMartin Matuska } 4614*e716630dSMartin Matuska 4615*e716630dSMartin Matuska void 4616*e716630dSMartin Matuska raidz_dtl_reassessed(vdev_t *vd) 4617*e716630dSMartin Matuska { 4618*e716630dSMartin Matuska spa_t *spa = vd->vdev_spa; 4619*e716630dSMartin Matuska if (spa->spa_raidz_expand != NULL) { 4620*e716630dSMartin Matuska vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4621*e716630dSMartin Matuska /* 4622*e716630dSMartin Matuska * we get called often from vdev_dtl_reassess() so make 4623*e716630dSMartin Matuska * sure it's our vdev and any replacing is complete 4624*e716630dSMartin Matuska */ 4625*e716630dSMartin Matuska if (vd->vdev_top->vdev_id == vre->vre_vdev_id && 4626*e716630dSMartin Matuska !vdev_raidz_expand_child_replacing(vd->vdev_top)) { 4627*e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 4628*e716630dSMartin Matuska if (vre->vre_waiting_for_resilver) { 4629*e716630dSMartin Matuska vdev_dbgmsg(vd, "DTL reassessed, " 4630*e716630dSMartin Matuska "continuing raidz expansion"); 4631*e716630dSMartin Matuska vre->vre_waiting_for_resilver = B_FALSE; 4632*e716630dSMartin Matuska zthr_wakeup(spa->spa_raidz_expand_zthr); 4633*e716630dSMartin Matuska } 4634*e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 4635*e716630dSMartin Matuska } 4636*e716630dSMartin Matuska } 4637*e716630dSMartin Matuska } 4638*e716630dSMartin Matuska 4639*e716630dSMartin Matuska int 4640*e716630dSMartin Matuska vdev_raidz_attach_check(vdev_t *new_child) 4641*e716630dSMartin Matuska { 4642*e716630dSMartin Matuska vdev_t *raidvd = new_child->vdev_parent; 4643*e716630dSMartin Matuska uint64_t new_children = raidvd->vdev_children; 4644*e716630dSMartin Matuska 4645*e716630dSMartin Matuska /* 4646*e716630dSMartin Matuska * We use the "boot" space as scratch space to handle overwriting the 4647*e716630dSMartin Matuska * initial part of the vdev. If it is too small, then this expansion 4648*e716630dSMartin Matuska * is not allowed. This would be very unusual (e.g. ashift > 13 and 4649*e716630dSMartin Matuska * >200 children). 
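 * For example, assuming VDEV_BOOT_SIZE is the usual 3.5 MiB, ashift=12
 * (4 KiB sectors) allows up to 896 children and ashift=14 allows up
 * to 224.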
4650*e716630dSMartin Matuska */ 4651*e716630dSMartin Matuska if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) { 4652*e716630dSMartin Matuska return (EINVAL); 4653*e716630dSMartin Matuska } 4654*e716630dSMartin Matuska return (0); 4655*e716630dSMartin Matuska } 4656*e716630dSMartin Matuska 4657*e716630dSMartin Matuska void 4658*e716630dSMartin Matuska vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx) 4659*e716630dSMartin Matuska { 4660*e716630dSMartin Matuska vdev_t *new_child = arg; 4661*e716630dSMartin Matuska spa_t *spa = new_child->vdev_spa; 4662*e716630dSMartin Matuska vdev_t *raidvd = new_child->vdev_parent; 4663*e716630dSMartin Matuska vdev_raidz_t *vdrz = raidvd->vdev_tsd; 4664*e716630dSMartin Matuska ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops); 4665*e716630dSMartin Matuska ASSERT3P(raidvd->vdev_top, ==, raidvd); 4666*e716630dSMartin Matuska ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width); 4667*e716630dSMartin Matuska ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1); 4668*e716630dSMartin Matuska ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==, 4669*e716630dSMartin Matuska new_child); 4670*e716630dSMartin Matuska 4671*e716630dSMartin Matuska spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx); 4672*e716630dSMartin Matuska 4673*e716630dSMartin Matuska vdrz->vd_physical_width++; 4674*e716630dSMartin Matuska 4675*e716630dSMartin Matuska VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info); 4676*e716630dSMartin Matuska vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id; 4677*e716630dSMartin Matuska vdrz->vn_vre.vre_offset = 0; 4678*e716630dSMartin Matuska vdrz->vn_vre.vre_failed_offset = UINT64_MAX; 4679*e716630dSMartin Matuska spa->spa_raidz_expand = &vdrz->vn_vre; 4680*e716630dSMartin Matuska zthr_wakeup(spa->spa_raidz_expand_zthr); 4681*e716630dSMartin Matuska 4682*e716630dSMartin Matuska /* 4683*e716630dSMartin Matuska * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get 4684*e716630dSMartin Matuska * written to the config. 
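 * That flag is what vdev_raidz_init() looks for on import in order to
 * restore spa_raidz_expand and mark the reflow as still in progress.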
4685*e716630dSMartin Matuska */ 4686*e716630dSMartin Matuska vdev_config_dirty(raidvd); 4687*e716630dSMartin Matuska 4688*e716630dSMartin Matuska vdrz->vn_vre.vre_start_time = gethrestime_sec(); 4689*e716630dSMartin Matuska vdrz->vn_vre.vre_end_time = 0; 4690*e716630dSMartin Matuska vdrz->vn_vre.vre_state = DSS_SCANNING; 4691*e716630dSMartin Matuska vdrz->vn_vre.vre_bytes_copied = 0; 4692*e716630dSMartin Matuska 4693*e716630dSMartin Matuska uint64_t state = vdrz->vn_vre.vre_state; 4694*e716630dSMartin Matuska VERIFY0(zap_update(spa->spa_meta_objset, 4695*e716630dSMartin Matuska raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, 4696*e716630dSMartin Matuska sizeof (state), 1, &state, tx)); 4697*e716630dSMartin Matuska 4698*e716630dSMartin Matuska uint64_t start_time = vdrz->vn_vre.vre_start_time; 4699*e716630dSMartin Matuska VERIFY0(zap_update(spa->spa_meta_objset, 4700*e716630dSMartin Matuska raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, 4701*e716630dSMartin Matuska sizeof (start_time), 1, &start_time, tx)); 4702*e716630dSMartin Matuska 4703*e716630dSMartin Matuska (void) zap_remove(spa->spa_meta_objset, 4704*e716630dSMartin Matuska raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx); 4705*e716630dSMartin Matuska (void) zap_remove(spa->spa_meta_objset, 4706*e716630dSMartin Matuska raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx); 4707*e716630dSMartin Matuska 4708*e716630dSMartin Matuska spa_history_log_internal(spa, "raidz vdev expansion started", tx, 4709*e716630dSMartin Matuska "%s vdev %llu new width %llu", spa_name(spa), 4710*e716630dSMartin Matuska (unsigned long long)raidvd->vdev_id, 4711*e716630dSMartin Matuska (unsigned long long)raidvd->vdev_children); 4712*e716630dSMartin Matuska } 4713*e716630dSMartin Matuska 4714*e716630dSMartin Matuska int 4715*e716630dSMartin Matuska vdev_raidz_load(vdev_t *vd) 4716*e716630dSMartin Matuska { 4717*e716630dSMartin Matuska vdev_raidz_t *vdrz = vd->vdev_tsd; 4718*e716630dSMartin Matuska int err; 4719*e716630dSMartin Matuska 4720*e716630dSMartin Matuska uint64_t state = DSS_NONE; 4721*e716630dSMartin Matuska uint64_t start_time = 0; 4722*e716630dSMartin Matuska uint64_t end_time = 0; 4723*e716630dSMartin Matuska uint64_t bytes_copied = 0; 4724*e716630dSMartin Matuska 4725*e716630dSMartin Matuska if (vd->vdev_top_zap != 0) { 4726*e716630dSMartin Matuska err = zap_lookup(vd->vdev_spa->spa_meta_objset, 4727*e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, 4728*e716630dSMartin Matuska sizeof (state), 1, &state); 4729*e716630dSMartin Matuska if (err != 0 && err != ENOENT) 4730*e716630dSMartin Matuska return (err); 4731*e716630dSMartin Matuska 4732*e716630dSMartin Matuska err = zap_lookup(vd->vdev_spa->spa_meta_objset, 4733*e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, 4734*e716630dSMartin Matuska sizeof (start_time), 1, &start_time); 4735*e716630dSMartin Matuska if (err != 0 && err != ENOENT) 4736*e716630dSMartin Matuska return (err); 4737*e716630dSMartin Matuska 4738*e716630dSMartin Matuska err = zap_lookup(vd->vdev_spa->spa_meta_objset, 4739*e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, 4740*e716630dSMartin Matuska sizeof (end_time), 1, &end_time); 4741*e716630dSMartin Matuska if (err != 0 && err != ENOENT) 4742*e716630dSMartin Matuska return (err); 4743*e716630dSMartin Matuska 4744*e716630dSMartin Matuska err = zap_lookup(vd->vdev_spa->spa_meta_objset, 4745*e716630dSMartin Matuska vd->vdev_top_zap, 
VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, 4746*e716630dSMartin Matuska sizeof (bytes_copied), 1, &bytes_copied); 4747*e716630dSMartin Matuska if (err != 0 && err != ENOENT) 4748*e716630dSMartin Matuska return (err); 4749*e716630dSMartin Matuska } 4750*e716630dSMartin Matuska 4751*e716630dSMartin Matuska /* 4752*e716630dSMartin Matuska * If we are in the middle of expansion, vre_state should have 4753*e716630dSMartin Matuska * already been set by vdev_raidz_init(). 4754*e716630dSMartin Matuska */ 4755*e716630dSMartin Matuska EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING); 4756*e716630dSMartin Matuska vdrz->vn_vre.vre_state = (dsl_scan_state_t)state; 4757*e716630dSMartin Matuska vdrz->vn_vre.vre_start_time = start_time; 4758*e716630dSMartin Matuska vdrz->vn_vre.vre_end_time = end_time; 4759*e716630dSMartin Matuska vdrz->vn_vre.vre_bytes_copied = bytes_copied; 4760*e716630dSMartin Matuska 4761*e716630dSMartin Matuska return (0); 4762*e716630dSMartin Matuska } 4763*e716630dSMartin Matuska 4764*e716630dSMartin Matuska int 4765*e716630dSMartin Matuska spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres) 4766*e716630dSMartin Matuska { 4767*e716630dSMartin Matuska vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4768*e716630dSMartin Matuska 4769*e716630dSMartin Matuska if (vre == NULL) { 4770*e716630dSMartin Matuska /* no expansion in progress; find most recent completed */ 4771*e716630dSMartin Matuska for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { 4772*e716630dSMartin Matuska vdev_t *vd = spa->spa_root_vdev->vdev_child[c]; 4773*e716630dSMartin Matuska if (vd->vdev_ops == &vdev_raidz_ops) { 4774*e716630dSMartin Matuska vdev_raidz_t *vdrz = vd->vdev_tsd; 4775*e716630dSMartin Matuska 4776*e716630dSMartin Matuska if (vdrz->vn_vre.vre_end_time != 0 && 4777*e716630dSMartin Matuska (vre == NULL || 4778*e716630dSMartin Matuska vdrz->vn_vre.vre_end_time > 4779*e716630dSMartin Matuska vre->vre_end_time)) { 4780*e716630dSMartin Matuska vre = &vdrz->vn_vre; 4781*e716630dSMartin Matuska } 4782*e716630dSMartin Matuska } 4783*e716630dSMartin Matuska } 4784*e716630dSMartin Matuska } 4785*e716630dSMartin Matuska 4786*e716630dSMartin Matuska if (vre == NULL) { 4787*e716630dSMartin Matuska return (SET_ERROR(ENOENT)); 4788*e716630dSMartin Matuska } 4789*e716630dSMartin Matuska 4790*e716630dSMartin Matuska pres->pres_state = vre->vre_state; 4791*e716630dSMartin Matuska pres->pres_expanding_vdev = vre->vre_vdev_id; 4792*e716630dSMartin Matuska 4793*e716630dSMartin Matuska vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); 4794*e716630dSMartin Matuska pres->pres_to_reflow = vd->vdev_stat.vs_alloc; 4795*e716630dSMartin Matuska 4796*e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 4797*e716630dSMartin Matuska pres->pres_reflowed = vre->vre_bytes_copied; 4798*e716630dSMartin Matuska for (int i = 0; i < TXG_SIZE; i++) 4799*e716630dSMartin Matuska pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i]; 4800*e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 4801*e716630dSMartin Matuska 4802*e716630dSMartin Matuska pres->pres_start_time = vre->vre_start_time; 4803*e716630dSMartin Matuska pres->pres_end_time = vre->vre_end_time; 4804*e716630dSMartin Matuska pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver; 4805*e716630dSMartin Matuska 4806*e716630dSMartin Matuska return (0); 4807*e716630dSMartin Matuska } 4808*e716630dSMartin Matuska 48097877fdebSMatt Macy /* 48107877fdebSMatt Macy * Initialize private RAIDZ specific fields from the nvlist.
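 *
 * This includes the parity level (ZPOOL_CONFIG_NPARITY), the physical
 * width (the number of children), whether an expansion is in progress
 * (ZPOOL_CONFIG_RAIDZ_EXPANDING), and the completion txgs of prior
 * expansions (ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS), from which the original
 * and per-txg logical widths are reconstructed. For example, a vdev
 * that is now 7 children wide with two entries in that array was
 * originally 5 wide; the newest txg pairs with logical width 7 and the
 * older one with width 6.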
48117877fdebSMatt Macy */ 48127877fdebSMatt Macy static int 48137877fdebSMatt Macy vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) 48147877fdebSMatt Macy { 48157877fdebSMatt Macy uint_t children; 48167877fdebSMatt Macy nvlist_t **child; 48177877fdebSMatt Macy int error = nvlist_lookup_nvlist_array(nv, 48187877fdebSMatt Macy ZPOOL_CONFIG_CHILDREN, &child, &children); 48197877fdebSMatt Macy if (error != 0) 48207877fdebSMatt Macy return (SET_ERROR(EINVAL)); 48217877fdebSMatt Macy 4822*e716630dSMartin Matuska uint64_t nparity; 48237877fdebSMatt Macy if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { 48247877fdebSMatt Macy if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) 48257877fdebSMatt Macy return (SET_ERROR(EINVAL)); 48267877fdebSMatt Macy 48277877fdebSMatt Macy /* 48287877fdebSMatt Macy * Previous versions could only support 1 or 2 parity 48297877fdebSMatt Macy * devices. 48307877fdebSMatt Macy */ 48317877fdebSMatt Macy if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2) 48327877fdebSMatt Macy return (SET_ERROR(EINVAL)); 48337877fdebSMatt Macy else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3) 48347877fdebSMatt Macy return (SET_ERROR(EINVAL)); 48357877fdebSMatt Macy } else { 48367877fdebSMatt Macy /* 48377877fdebSMatt Macy * We require the parity to be specified for SPAs that 48387877fdebSMatt Macy * support multiple parity levels. 48397877fdebSMatt Macy */ 48407877fdebSMatt Macy if (spa_version(spa) >= SPA_VERSION_RAIDZ2) 48417877fdebSMatt Macy return (SET_ERROR(EINVAL)); 48427877fdebSMatt Macy 48437877fdebSMatt Macy /* 48447877fdebSMatt Macy * Otherwise, we default to 1 parity device for RAID-Z. 48457877fdebSMatt Macy */ 48467877fdebSMatt Macy nparity = 1; 48477877fdebSMatt Macy } 48487877fdebSMatt Macy 4849*e716630dSMartin Matuska vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP); 4850*e716630dSMartin Matuska vdrz->vn_vre.vre_vdev_id = -1; 4851*e716630dSMartin Matuska vdrz->vn_vre.vre_offset = UINT64_MAX; 4852*e716630dSMartin Matuska vdrz->vn_vre.vre_failed_offset = UINT64_MAX; 4853*e716630dSMartin Matuska mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL); 4854*e716630dSMartin Matuska cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL); 4855*e716630dSMartin Matuska zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL); 4856*e716630dSMartin Matuska mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL); 4857*e716630dSMartin Matuska avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare, 4858*e716630dSMartin Matuska sizeof (reflow_node_t), offsetof(reflow_node_t, re_link)); 4859*e716630dSMartin Matuska 4860*e716630dSMartin Matuska vdrz->vd_physical_width = children; 48617877fdebSMatt Macy vdrz->vd_nparity = nparity; 48627877fdebSMatt Macy 4863*e716630dSMartin Matuska /* note, the ID does not exist when creating a pool */ 4864*e716630dSMartin Matuska (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, 4865*e716630dSMartin Matuska &vdrz->vn_vre.vre_vdev_id); 4866*e716630dSMartin Matuska 4867*e716630dSMartin Matuska boolean_t reflow_in_progress = 4868*e716630dSMartin Matuska nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); 4869*e716630dSMartin Matuska if (reflow_in_progress) { 4870*e716630dSMartin Matuska spa->spa_raidz_expand = &vdrz->vn_vre; 4871*e716630dSMartin Matuska vdrz->vn_vre.vre_state = DSS_SCANNING; 4872*e716630dSMartin Matuska } 4873*e716630dSMartin Matuska 4874*e716630dSMartin Matuska vdrz->vd_original_width = children; 4875*e716630dSMartin Matuska uint64_t *txgs; 4876*e716630dSMartin Matuska
unsigned int txgs_size = 0; 4877*e716630dSMartin Matuska error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, 4878*e716630dSMartin Matuska &txgs, &txgs_size); 4879*e716630dSMartin Matuska if (error == 0) { 4880*e716630dSMartin Matuska for (int i = 0; i < txgs_size; i++) { 4881*e716630dSMartin Matuska reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); 4882*e716630dSMartin Matuska re->re_txg = txgs[txgs_size - i - 1]; 4883*e716630dSMartin Matuska re->re_logical_width = vdrz->vd_physical_width - i; 4884*e716630dSMartin Matuska 4885*e716630dSMartin Matuska if (reflow_in_progress) 4886*e716630dSMartin Matuska re->re_logical_width--; 4887*e716630dSMartin Matuska 4888*e716630dSMartin Matuska avl_add(&vdrz->vd_expand_txgs, re); 4889*e716630dSMartin Matuska } 4890*e716630dSMartin Matuska 4891*e716630dSMartin Matuska vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size; 4892*e716630dSMartin Matuska } 4893*e716630dSMartin Matuska if (reflow_in_progress) { 4894*e716630dSMartin Matuska vdrz->vd_original_width--; 4895*e716630dSMartin Matuska zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions", 4896*e716630dSMartin Matuska children, txgs_size); 4897*e716630dSMartin Matuska } 4898*e716630dSMartin Matuska 48997877fdebSMatt Macy *tsd = vdrz; 49007877fdebSMatt Macy 49017877fdebSMatt Macy return (0); 49027877fdebSMatt Macy } 49037877fdebSMatt Macy 49047877fdebSMatt Macy static void 49057877fdebSMatt Macy vdev_raidz_fini(vdev_t *vd) 49067877fdebSMatt Macy { 4907*e716630dSMartin Matuska vdev_raidz_t *vdrz = vd->vdev_tsd; 4908*e716630dSMartin Matuska if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre) 4909*e716630dSMartin Matuska vd->vdev_spa->spa_raidz_expand = NULL; 4910*e716630dSMartin Matuska reflow_node_t *re; 4911*e716630dSMartin Matuska void *cookie = NULL; 4912*e716630dSMartin Matuska avl_tree_t *tree = &vdrz->vd_expand_txgs; 4913*e716630dSMartin Matuska while ((re = avl_destroy_nodes(tree, &cookie)) != NULL) 4914*e716630dSMartin Matuska kmem_free(re, sizeof (*re)); 4915*e716630dSMartin Matuska avl_destroy(&vdrz->vd_expand_txgs); 4916*e716630dSMartin Matuska mutex_destroy(&vdrz->vd_expand_lock); 4917*e716630dSMartin Matuska mutex_destroy(&vdrz->vn_vre.vre_lock); 4918*e716630dSMartin Matuska cv_destroy(&vdrz->vn_vre.vre_cv); 4919*e716630dSMartin Matuska zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock); 4920*e716630dSMartin Matuska kmem_free(vdrz, sizeof (*vdrz)); 49217877fdebSMatt Macy } 49227877fdebSMatt Macy 49237877fdebSMatt Macy /* 49247877fdebSMatt Macy * Add RAIDZ specific fields to the config nvlist. 49257877fdebSMatt Macy */ 49267877fdebSMatt Macy static void 49277877fdebSMatt Macy vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) 49287877fdebSMatt Macy { 49297877fdebSMatt Macy ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops); 49307877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd; 49317877fdebSMatt Macy 49327877fdebSMatt Macy /* 49337877fdebSMatt Macy * Make sure someone hasn't managed to sneak a fancy new vdev 49347877fdebSMatt Macy * into a crufty old storage pool. 
49357877fdebSMatt Macy */ 49367877fdebSMatt Macy ASSERT(vdrz->vd_nparity == 1 || 49377877fdebSMatt Macy (vdrz->vd_nparity <= 2 && 49387877fdebSMatt Macy spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) || 49397877fdebSMatt Macy (vdrz->vd_nparity <= 3 && 49407877fdebSMatt Macy spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3)); 49417877fdebSMatt Macy 49427877fdebSMatt Macy /* 49437877fdebSMatt Macy * Note that we'll add these even on storage pools where they 49447877fdebSMatt Macy * aren't strictly required -- older software will just ignore 49457877fdebSMatt Macy * it. 49467877fdebSMatt Macy */ 49477877fdebSMatt Macy fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity); 4948*e716630dSMartin Matuska 4949*e716630dSMartin Matuska if (vdrz->vn_vre.vre_state == DSS_SCANNING) { 4950*e716630dSMartin Matuska fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); 4951*e716630dSMartin Matuska } 4952*e716630dSMartin Matuska 4953*e716630dSMartin Matuska mutex_enter(&vdrz->vd_expand_lock); 4954*e716630dSMartin Matuska if (!avl_is_empty(&vdrz->vd_expand_txgs)) { 4955*e716630dSMartin Matuska uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs); 4956*e716630dSMartin Matuska uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count, 4957*e716630dSMartin Matuska KM_SLEEP); 4958*e716630dSMartin Matuska uint64_t i = 0; 4959*e716630dSMartin Matuska 4960*e716630dSMartin Matuska for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs); 4961*e716630dSMartin Matuska re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) { 4962*e716630dSMartin Matuska txgs[i++] = re->re_txg; 4963*e716630dSMartin Matuska } 4964*e716630dSMartin Matuska 4965*e716630dSMartin Matuska fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, 4966*e716630dSMartin Matuska txgs, count); 4967*e716630dSMartin Matuska 4968*e716630dSMartin Matuska kmem_free(txgs, sizeof (uint64_t) * count); 4969*e716630dSMartin Matuska } 4970*e716630dSMartin Matuska mutex_exit(&vdrz->vd_expand_lock); 49717877fdebSMatt Macy } 49727877fdebSMatt Macy 49737877fdebSMatt Macy static uint64_t 49747877fdebSMatt Macy vdev_raidz_nparity(vdev_t *vd) 49757877fdebSMatt Macy { 49767877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd; 49777877fdebSMatt Macy return (vdrz->vd_nparity); 49787877fdebSMatt Macy } 49797877fdebSMatt Macy 49807877fdebSMatt Macy static uint64_t 49817877fdebSMatt Macy vdev_raidz_ndisks(vdev_t *vd) 49827877fdebSMatt Macy { 49837877fdebSMatt Macy return (vd->vdev_children); 4984eda14cbcSMatt Macy } 4985eda14cbcSMatt Macy 4986eda14cbcSMatt Macy vdev_ops_t vdev_raidz_ops = { 49877877fdebSMatt Macy .vdev_op_init = vdev_raidz_init, 49887877fdebSMatt Macy .vdev_op_fini = vdev_raidz_fini, 4989eda14cbcSMatt Macy .vdev_op_open = vdev_raidz_open, 4990eda14cbcSMatt Macy .vdev_op_close = vdev_raidz_close, 4991eda14cbcSMatt Macy .vdev_op_asize = vdev_raidz_asize, 49927877fdebSMatt Macy .vdev_op_min_asize = vdev_raidz_min_asize, 49937877fdebSMatt Macy .vdev_op_min_alloc = NULL, 4994eda14cbcSMatt Macy .vdev_op_io_start = vdev_raidz_io_start, 4995eda14cbcSMatt Macy .vdev_op_io_done = vdev_raidz_io_done, 4996eda14cbcSMatt Macy .vdev_op_state_change = vdev_raidz_state_change, 4997eda14cbcSMatt Macy .vdev_op_need_resilver = vdev_raidz_need_resilver, 4998eda14cbcSMatt Macy .vdev_op_hold = NULL, 4999eda14cbcSMatt Macy .vdev_op_rele = NULL, 5000eda14cbcSMatt Macy .vdev_op_remap = NULL, 5001eda14cbcSMatt Macy .vdev_op_xlate = vdev_raidz_xlate, 50027877fdebSMatt Macy .vdev_op_rebuild_asize = NULL, 50037877fdebSMatt Macy .vdev_op_metaslab_init = NULL, 50047877fdebSMatt 
Macy .vdev_op_config_generate = vdev_raidz_config_generate, 50057877fdebSMatt Macy .vdev_op_nparity = vdev_raidz_nparity, 50067877fdebSMatt Macy .vdev_op_ndisks = vdev_raidz_ndisks, 5007eda14cbcSMatt Macy .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */ 5008eda14cbcSMatt Macy .vdev_op_leaf = B_FALSE /* not a leaf vdev */ 5009eda14cbcSMatt Macy }; 5010*e716630dSMartin Matuska 5011*e716630dSMartin Matuska /* BEGIN CSTYLED */ 5012*e716630dSMartin Matuska ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW, 5013*e716630dSMartin Matuska "For testing, pause RAIDZ expansion after reflowing this many bytes"); 5014*e716630dSMartin Matuska ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW, 5015*e716630dSMartin Matuska "Max amount of concurrent i/o for RAIDZ expansion"); 5016*e716630dSMartin Matuska ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW, 5017*e716630dSMartin Matuska "For expanded RAIDZ, aggregate reads that have more rows than this"); 5018*e716630dSMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW, 5019*e716630dSMartin Matuska "For expanded RAIDZ, automatically start a pool scrub when expansion " 5020*e716630dSMartin Matuska "completes"); 5021*e716630dSMartin Matuska /* END CSTYLED */ 5022
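/*
 * Illustrative usage (assuming the standard ZFS_MODULE_PARAM name
 * concatenation and the Linux module parameter location): to pause a
 * test expansion after 1 GiB has been reflowed,
 *
 *	echo 1073741824 > /sys/module/zfs/parameters/raidz_expand_max_reflow_bytes
 *
 * and write 0 to resume.
 */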