// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
 * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zap.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu_tx.h>
#include <sys/abd.h>
#include <sys/zfs_rlock.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
#include <sys/vdev_raidz.h>
#include <sys/vdev_raidz_impl.h>
#include <sys/vdev_draid.h>
#include <sys/uberblock_impl.h>
#include <sys/dsl_scan.h>

#ifdef ZFS_DEBUG
#include <sys/vdev.h>	/* For vdev_xlate() in vdev_raidz_io_verify() */
#endif

/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all
 * elements can be expressed with a single byte. Briefly, the operations on
 * the field are defined as follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). The inverse of a field element A (A^-1) is therefore
 * A ^ (255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q, R over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property which is why the uncorrected Plank method breaks down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}

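/*
 * Illustrative sketch of the field arithmetic described above (a standalone
 * userspace example, compiled out and not used by the driver). gf_mul2() is
 * the C expression for multiplication by 2, and the loop accumulates P and Q
 * over a hypothetical 3-column data stripe using the same recurrences given
 * for P and Q above. The byte values are arbitrary.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

static uint8_t
gf_mul2(uint8_t a)
{
	/* Multiply by 2 in GF(2^8): shift, then conditionally XOR 0x1d. */
	return ((uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1d : 0)));
}

int
main(void)
{
	uint8_t d[3] = { 0x11, 0xa5, 0x3c };	/* D_0, D_1, D_2 */
	uint8_t p = 0, q = 0;

	for (int i = 0; i < 3; i++) {
		p ^= d[i];		/* P = D_0 + D_1 + D_2 */
		q = gf_mul2(q) ^ d[i];	/* Q = (D_0 * 2 + D_1) * 2 + D_2 */
	}
	printf("P=%02x Q=%02x\n", p, q);
	return (0);
}
#endif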

/*
 * Big Theory Statement for how a RAIDZ VDEV is expanded
 *
 * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion
 * works with all three RAIDZ parity choices (RAIDZ1, 2, or 3). VDEVs that
 * have been previously expanded can be expanded again.
 *
 * The RAIDZ VDEV must be healthy (must be able to write to all the drives in
 * the VDEV) when an expansion starts. The expansion will pause if any disk
 * in the VDEV fails, and resume once the VDEV is healthy again. All other
 * operations on the pool can continue while an expansion is in progress
 * (e.g. read/write, snapshot, zpool add, etc.), except zpool checkpoint,
 * zpool trim, and zpool initialize, which can't be run during an expansion.
 * Following a reboot or export/import, the expansion resumes where it left
 * off.
 *
 * == Reflowing the Data ==
 *
 * The expansion involves reflowing (copying) the data from the current set
 * of disks to spread it across the new set which now has one more disk. This
 * reflow operation is similar to reflowing text when the column width of a
 * text editor window is expanded. The text doesn’t change but the location of
 * the text changes to accommodate the new width. An example reflow result for
 * a 4-wide RAIDZ1 to a 5-wide is shown below.
 *
 * Reflow End State
 * Each letter indicates a parity group (logical stripe)
 *
 *	Before expansion                          After Expansion
 *	D1     D2     D3     D4                   D1     D2     D3     D4     D5
 *	+------+------+------+------+             +------+------+------+------+------+
 *	|      |      |      |      |             |      |      |      |      |      |
 *	|  A   |  A   |  A   |  A   |             |  A   |  A   |  A   |  A   |  B   |
 *	|     1|     2|     3|     4|             |     1|     2|     3|     4|     5|
 *	+------+------+------+------+             +------+------+------+------+------+
 *	|      |      |      |      |             |      |      |      |      |      |
 *	|  B   |  B   |  C   |  C   |             |  B   |  C   |  C   |  C   |  C   |
 *	|     5|     6|     7|     8|             |     6|     7|     8|     9|    10|
 *	+------+------+------+------+             +------+------+------+------+------+
 *	|      |      |      |      |             |      |      |      |      |      |
 *	|  C   |  C   |  D   |  D   |             |  D   |  D   |  E   |  E   |  E   |
 *	|     9|    10|    11|    12|             |    11|    12|    13|    14|    15|
 *	+------+------+------+------+             +------+------+------+------+------+
 *	|      |      |      |      |             |      |      |      |      |      |
 *	|  E   |  E   |  E   |  E   |     -->     |  E   |  F   |  F   |  G   |  G   |
 *	|    13|    14|    15|    16|             |    16|    17|    18|    19|    20|
 *	+------+------+------+------+             +------+------+------+------+------+
 *	|      |      |      |      |             |      |      |      |      |      |
 *	|  F   |  F   |  G   |  G   |             |  G   |  G   |  H   |  H   |  H   |
 *	|    17|    18|    19|    20|             |    21|    22|    23|    24|    25|
 *	+------+------+------+------+             +------+------+------+------+------+
 *	|      |      |      |      |             |      |      |      |      |      |
 *	|  G   |  G   |  H   |  H   |             |  H   |  I   |  I   |  J   |  J   |
 *	|    21|    22|    23|    24|             |    26|    27|    28|    29|    30|
 *	+------+------+------+------+             +------+------+------+------+------+
 *	|      |      |      |      |             |      |      |      |      |      |
 *	|  H   |  H   |  I   |  I   |             |  J   |  J   |      |      |  K   |
 *	|    25|    26|    27|    28|             |    31|    32|    33|    34|    35|
 *	+------+------+------+------+             +------+------+------+------+------+
 *
 * This reflow approach has several advantages. There is no need to read or
 * modify the block pointers or recompute any block checksums. The reflow
 * doesn’t need to know where the parity sectors reside. We can read and write
 * data sequentially and the copy can occur in a background thread in open
 * context. The design also allows for fast discovery of what data to copy.
 *
 * The VDEV metaslabs are processed, one at a time, to copy the block data to
 * have it flow across all the disks. The metaslab is disabled for allocations
 * during the copy. As an optimization, we only copy the allocated data which
 * can be determined by looking at the metaslab range tree. During the copy we
 * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still
 * need to be able to survive losing parity count disks). This means we
 * cannot overwrite data during the reflow that would be needed if a disk is
 * lost.
 *
 * After the reflow completes, all newly-written blocks will have the new
 * layout, i.e., they will have the parity to data ratio implied by the new
 * number of disks in the RAIDZ group. Even though the reflow copies all of
 * the allocated space (data and parity), it is only rearranged, not changed.
 *
 * This act of reflowing the data has a few implications about blocks
 * that were written before the reflow completes:
 *
 * - Old blocks will still use the same amount of space (i.e., they will have
 *   the parity to data ratio implied by the old number of disks in the RAIDZ
 *   group).
 * - Reading old blocks will be slightly slower than before the reflow, for
 *   two reasons. First, we will have to read from all disks in the RAIDZ
 *   VDEV, rather than being able to skip the children that contain only
 *   parity of this block (because the data of a single block is now spread
 *   out across all the disks). Second, in most cases there will be an extra
 *   bcopy, needed to rearrange the data back to its original layout in memory.
 *
 * == Scratch Area ==
 *
 * As we copy the block data, we can only progress to the point that writes
 * will not overlap with blocks whose progress has not yet been recorded on
 * disk. Since partially-copied rows are always read from the old location,
 * we need to stop one row before the sector-wise overlap, to prevent any
 * row-wise overlap. For example, in the diagram above, when we reflow sector
 * B6 it will overwrite the original location for B5.
 *
 * To get around this, a scratch space is used so that we can start copying
 * without risking data loss by overlapping the row. As an added benefit, it
 * improves performance at the beginning of the reflow, but that small perf
 * boost wouldn't be worth the complexity on its own.
 *
 * Ideally we want to copy at least 2 * (new_width)^2 so that we have a
 * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max
 * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice
 * the widths will likely be single digits so we can get a substantial chunk
 * size using only a few MB of scratch per disk.
 *
 * The scratch area is persisted to disk, and it holds a large amount of
 * reflowed state. We can always read the partially written stripes when a
 * disk fails or the copy is interrupted (crash) during the initial copying
 * phase, and we can also get past a small chunk size restriction. At a
 * minimum, the scratch space must be large enough to get us to the point
 * that one row does not overlap itself when moved (i.e., new_width^2). But
 * going larger is even better. We use the 3.5 MiB reserved "boot" space that
 * resides after the ZFS disk labels as our scratch space to handle
 * overwriting the initial part of the VDEV.
 *
 *	0     256K   512K                    4M
 *	+------+------+-----------------------+-------------------------------
 *	| VDEV | VDEV | Boot Block (3.5M)     |  Allocatable space ...
 *	|  L0  |  L1  |       Reserved        |     (Metaslabs)
 *	+------+------+-----------------------+-------------------------------
 *	                 Scratch Area
 *
 * == Reflow Progress Updates ==
 * After the initial scratch-based reflow, the expansion process works
 * similarly to device removal. We create a new open context thread which
 * reflows the data, and periodically kicks off sync tasks to update logical
 * state. In this case, state is the committed progress (offset of next data
 * to copy). We need to persist the completed offset on disk, so that if we
 * crash we know which format each VDEV offset is in.
 *
 * == Time Dependent Geometry ==
 *
 * In non-expanded RAIDZ, blocks are read from disk in a column by column
 * fashion. For a multi-row block, the second sector is in the first column
 * not in the second column. This allows us to issue full reads for each
 * column directly into the request buffer. The block data is thus laid out
 * sequentially in a column-by-column fashion.
 *
 * For example, in the before expansion diagram above, one logical block might
 * be sectors G19-H26. The parity is in G19,H23; and the data is in
 * G20,H24,G21,H25,G22,H26.
 *
 * After a block is reflowed, the sectors that were all in the original column
 * data can now reside in different columns. When reading from an expanded
 * VDEV, we need to know the logical stripe width for each block so we can
 * reconstitute the block’s data after the reads are completed. Likewise,
 * when we perform the combinatorial reconstruction we need to know the
 * original width so we can retry combinations from the past layouts.
 *
 * Time dependent geometry is what we call having blocks with different
 * layouts (stripe widths) in the same VDEV. This time-dependent geometry
 * uses the block’s birth time (+ the time expansion ended) to establish the
 * correct width for a given block. After an expansion completes, we record
 * the time for blocks written with a particular width (geometry).
 *
 * == On Disk Format Changes ==
 *
 * A new pool feature flag, 'raidz_expansion', is added; its reference count
 * is the number of RAIDZ VDEVs that have been expanded.
 *
 * The blocks on an expanded RAIDZ VDEV can have different logical stripe
 * widths.
 *
 * Since the uberblock can point to arbitrary blocks, which might be on the
 * expanding RAIDZ and might or might not have been expanded, we need to know
 * which way a block is laid out before reading it. This info is the next
 * offset that needs to be reflowed and we persist that in the uberblock, in
 * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev label.
 * After the expansion is complete, we then use the raidz_expand_txgs array
 * (see below) to determine how to read a block and the ub_raidz_reflow_info
 * field is no longer required.
 *
 * The uberblock's ub_raidz_reflow_info field also holds the scratch space
 * state (i.e., active or not) which is also required before reading a block
 * during the initial phase of reflowing the data.
 *
 * The top-level RAIDZ VDEV has two new entries in the nvlist:
 *
 * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here
 *                            and used after the expansion is complete to
 *                            determine how to read a raidz block
 * 'raidz_expanding' boolean: present during reflow and removed after completion
 *                            used during a spa import to resume an unfinished
 *                            expansion
 *
 * And finally the VDEV's top zap adds the following informational entries:
 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE
 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME
 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME
 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED
 */

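/*
 * Illustrative sketch of the reflow remapping described above (a standalone
 * userspace example, compiled out and not used by the driver). Data is laid
 * out sequentially across the children, so a sector's location is simply its
 * index modulo/divided by the number of children; the reflow copies each
 * allocated sector from its old (disk, row) to its new (disk, row). The
 * 4-wide to 5-wide geometry matches the diagram above.
 */
#if 0
#include <stdio.h>

int
main(void)
{
	const unsigned old_children = 4, new_children = 5;

	for (unsigned b = 0; b < 28; b++) {	/* sectors 1..28 above */
		printf("sector %2u: old (disk %u, row %u) -> "
		    "new (disk %u, row %u)\n", b + 1,
		    b % old_children, b / old_children,
		    b % new_children, b / new_children);
	}
	return (0);
}
#endif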

/*
 * For testing only: pause the raidz expansion after reflowing this amount.
 * (accessed by ZTS and ztest)
 */
#ifdef _KERNEL
static
#endif /* _KERNEL */
unsigned long raidz_expand_max_reflow_bytes = 0;

/*
 * For testing only: pause the raidz expansion at a certain point.
 */
uint_t raidz_expand_pause_point = 0;

/*
 * Maximum amount of copy I/O outstanding at once.
 */
#ifdef _ILP32
static unsigned long raidz_expand_max_copy_bytes = SPA_MAXBLOCKSIZE;
#else
static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE;
#endif

/*
 * Apply raidz map abds aggregation if the number of rows in the map is equal
 * to or greater than the value below.
 */
static unsigned long raidz_io_aggregate_rows = 4;

/*
 * Automatically start a pool scrub when a RAIDZ expansion completes in
 * order to verify the checksums of all blocks which have been copied
 * during the expansion. Automatic scrubbing is enabled by default and
 * is strongly recommended.
 */
static int zfs_scrub_after_expand = 1;

static void
vdev_raidz_row_free(raidz_row_t *rr)
{
	for (int c = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		if (rc->rc_size != 0)
			abd_free(rc->rc_abd);
		if (rc->rc_orig_data != NULL)
			abd_free(rc->rc_orig_data);
	}

	if (rr->rr_abd_empty != NULL)
		abd_free(rr->rr_abd_empty);

	kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols]));
}

void
vdev_raidz_map_free(raidz_map_t *rm)
{
	for (int i = 0; i < rm->rm_nrows; i++)
		vdev_raidz_row_free(rm->rm_row[i]);

	if (rm->rm_nphys_cols) {
		for (int i = 0; i < rm->rm_nphys_cols; i++) {
			if (rm->rm_phys_col[i].rc_abd != NULL)
				abd_free(rm->rm_phys_col[i].rc_abd);
		}

		kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) *
		    rm->rm_nphys_cols);
	}

	ASSERT3P(rm->rm_lr, ==, NULL);
	kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
}

static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	vdev_raidz_map_free(rm);
}

static int
vdev_raidz_reflow_compare(const void *x1, const void *x2)
{
	const reflow_node_t *l = x1;
	const reflow_node_t *r = x2;

	return (TREE_CMP(l->re_txg, r->re_txg));
}

const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	.vsd_free = vdev_raidz_map_free_vsd,
};

raidz_row_t *
vdev_raidz_row_alloc(int cols, zio_t *zio)
{
	raidz_row_t *rr =
	    kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP);

	rr->rr_cols = cols;
	rr->rr_scols = cols;

	for (int c = 0; c < cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		rc->rc_shadow_devidx = INT_MAX;
		rc->rc_shadow_offset = UINT64_MAX;
		/*
		 * We can not allow self healing to take place for Direct I/O
		 * reads. There is nothing that stops the buffer contents from
		 * being manipulated while the I/O is in flight. It is possible
		 * that the checksum could be verified on the buffer and then
		 * the contents of that buffer are manipulated afterwards. This
		 * could lead to bad data being written out during self
		 * healing.
		 */
		if (!(zio->io_flags & ZIO_FLAG_DIO_READ))
			rc->rc_allow_repair = 1;
	}
	return (rr);
}

static void
vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift)
{
	int c;
	int nwrapped = 0;
	uint64_t off = 0;
	raidz_row_t *rr = rm->rm_row[0];

	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
	ASSERT3U(rm->rm_nrows, ==, 1);

	/*
	 * Pad any parity columns with additional space to account for skip
	 * sectors.
	 */
	if (rm->rm_skipstart < rr->rr_firstdatacol) {
		ASSERT0(rm->rm_skipstart);
		nwrapped = rm->rm_nskip;
	} else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) {
		nwrapped =
		    (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols;
	}

	/*
	 * Optional single skip sectors (rc_size == 0) will be handled in
	 * vdev_raidz_io_start_write().
	 */
	int skipped = rr->rr_scols - rr->rr_cols;

	/* Allocate buffers for the parity columns */
	for (c = 0; c < rr->rr_firstdatacol; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		/*
		 * Parity columns will pad out a linear ABD to account for
		 * the skip sector. A linear ABD is used here because
		 * parity calculations use the ABD buffer directly to calculate
		 * parity. This avoids doing a memcpy back to the ABD after the
		 * parity has been calculated. By issuing the parity column
		 * with the skip sector we can reduce contention on the child
		 * VDEV queue locks (vq_lock).
		 */
		if (c < nwrapped) {
			rc->rc_abd = abd_alloc_linear(
			    rc->rc_size + (1ULL << ashift), B_FALSE);
			abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift);
			skipped++;
		} else {
			rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
		}
	}

	for (off = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct,
		    zio->io_abd, off, rc->rc_size);

		/*
		 * Generate I/O for skip sectors to improve aggregation
		 * continuity. We will use gang ABD's to reduce contention
		 * on the child VDEV queue locks (vq_lock) by issuing
		 * a single I/O that contains the data and skip sector.
		 *
		 * It is important to make sure that rc_size is not updated
		 * even though we are adding a skip sector to the ABD. When
		 * calculating the parity in vdev_raidz_generate_parity_row()
		 * the rc_size is used to iterate through the ABD's. We can
		 * not have zero'd out skip sectors used for calculating
		 * parity for raidz, because those same sectors are not used
		 * during reconstruction.
		 */
		if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) {
			rc->rc_abd = abd_alloc_gang();
			abd_gang_add(rc->rc_abd, abd, B_TRUE);
			abd_gang_add(rc->rc_abd,
			    abd_get_zeros(1ULL << ashift), B_TRUE);
			skipped++;
		} else {
			rc->rc_abd = abd;
		}
		off += rc->rc_size;
	}

	ASSERT3U(off, ==, zio->io_size);
	ASSERT3S(skipped, ==, rm->rm_nskip);
}

static void
vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm)
{
	int c;
	raidz_row_t *rr = rm->rm_row[0];

	ASSERT3U(rm->rm_nrows, ==, 1);

	/* Allocate buffers for the parity columns */
	for (c = 0; c < rr->rr_firstdatacol; c++)
		rr->rr_col[c].rc_abd =
		    abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);

	for (uint64_t off = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
		    zio->io_abd, off, rc->rc_size);
		off += rc->rc_size;
	}
}

/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 *
 * Avoid inlining the function to keep vdev_raidz_io_start(), which
 * is this function's only caller, as small as possible on the stack.
 */
noinline raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
    uint64_t nparity)
{
	raidz_row_t *rr;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = zio->io_offset >> ashift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = zio->io_size >> ashift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << ashift;
	uint64_t acols, scols;

	raidz_map_t *rm =
	    kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
	rm->rm_nrows = 1;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	uint64_t q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	uint64_t r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	uint64_t bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/*
	 * acols: The columns that will be accessed.
	 * scols: The columns that will be accessed or skipped.
	 */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);
	rr = vdev_raidz_row_alloc(scols, zio);
	rm->rm_row[0] = rr;
	rr->rr_cols = acols;
	rr->rr_bigcols = bc;
	rr->rr_firstdatacol = nparity;
#ifdef ZFS_DEBUG
	rr->rr_offset = zio->io_offset;
	rr->rr_size = zio->io_size;
#endif

	uint64_t asize = 0;

	for (uint64_t c = 0; c < scols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		uint64_t col = f + c;
		uint64_t coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << ashift;
		}
		rc->rc_devidx = col;
		rc->rc_offset = coff;

		if (c >= acols)
			rc->rc_size = 0;
		else if (c < bc)
			rc->rc_size = (q + 1) << ashift;
		else
			rc->rc_size = q << ashift;

		asize += rc->rc_size;
	}

	ASSERT3U(asize, ==, tot << ashift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	rm->rm_skipstart = bc;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rr->rr_cols >= 2);
	ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);

	if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
		uint64_t devidx = rr->rr_col[0].rc_devidx;
		o = rr->rr_col[0].rc_offset;
		rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
		rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
		rr->rr_col[1].rc_devidx = devidx;
		rr->rr_col[1].rc_offset = o;
		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_map_alloc_write(zio, rm, ashift);
	} else {
		vdev_raidz_map_alloc_read(zio, rm);
	}
	/* init RAIDZ parity ops */
	rm->rm_ops = vdev_raidz_math_get_ops();

	return (rm);
}

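/*
 * Illustrative sketch of the stripe geometry arithmetic used by
 * vdev_raidz_map_alloc() above (a standalone userspace example, compiled out
 * and not used by the driver). The geometry chosen (5 children, RAIDZ1,
 * ashift=9, a 3-sector write) is arbitrary and only shows how q, r, bc, tot,
 * nskip and the per-column sizes relate.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	const uint64_t dcols = 5, nparity = 1, ashift = 9;
	const uint64_t s = 3;		/* data sectors in the zio */

	uint64_t q = s / (dcols - nparity);		/* full rows: 0 */
	uint64_t r = s - q * (dcols - nparity);		/* remainder: 3 */
	uint64_t bc = (r == 0 ? 0 : r + nparity);	/* big columns: 4 */
	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));	/* 4 */
	/* roundup(tot, nparity + 1) - tot, written out explicitly: 0 */
	uint64_t nskip =
	    ((tot + nparity) / (nparity + 1)) * (nparity + 1) - tot;
	uint64_t acols = (q == 0 ? bc : dcols);	/* columns accessed: 4 */

	printf("q=%ju r=%ju bc=%ju tot=%ju nskip=%ju acols=%ju\n",
	    (uintmax_t)q, (uintmax_t)r, (uintmax_t)bc, (uintmax_t)tot,
	    (uintmax_t)nskip, (uintmax_t)acols);

	for (uint64_t c = 0; c < acols; c++) {
		uint64_t size = (c < bc ? q + 1 : q) << ashift;
		printf("col %ju: %ju bytes%s\n", (uintmax_t)c,
		    (uintmax_t)size, c < nparity ? " (parity)" : "");
	}
	return (0);
}
#endif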
733e716630dSMartin Matuska */ 734e716630dSMartin Matuska noinline raidz_map_t * 735e716630dSMartin Matuska vdev_raidz_map_alloc_expanded(zio_t *zio, 736e716630dSMartin Matuska uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols, 737e716630dSMartin Matuska uint64_t nparity, uint64_t reflow_offset_synced, 738e716630dSMartin Matuska uint64_t reflow_offset_next, boolean_t use_scratch) 739e716630dSMartin Matuska { 740e716630dSMartin Matuska abd_t *abd = zio->io_abd; 741e716630dSMartin Matuska uint64_t offset = zio->io_offset; 742e716630dSMartin Matuska uint64_t size = zio->io_size; 743e716630dSMartin Matuska 744e716630dSMartin Matuska /* The zio's size in units of the vdev's minimum sector size. */ 745e716630dSMartin Matuska uint64_t s = size >> ashift; 746e716630dSMartin Matuska 747e716630dSMartin Matuska /* 748e716630dSMartin Matuska * "Quotient": The number of data sectors for this stripe on all but 749e716630dSMartin Matuska * the "big column" child vdevs that also contain "remainder" data. 750e716630dSMartin Matuska * AKA "full rows" 751e716630dSMartin Matuska */ 752e716630dSMartin Matuska uint64_t q = s / (logical_cols - nparity); 753e716630dSMartin Matuska 754e716630dSMartin Matuska /* 755e716630dSMartin Matuska * "Remainder": The number of partial stripe data sectors in this I/O. 756e716630dSMartin Matuska * This will add a sector to some, but not all, child vdevs. 757e716630dSMartin Matuska */ 758e716630dSMartin Matuska uint64_t r = s - q * (logical_cols - nparity); 759e716630dSMartin Matuska 760e716630dSMartin Matuska /* The number of "big columns" - those which contain remainder data. */ 761e716630dSMartin Matuska uint64_t bc = (r == 0 ? 0 : r + nparity); 762e716630dSMartin Matuska 763e716630dSMartin Matuska /* 764e716630dSMartin Matuska * The total number of data and parity sectors associated with 765e716630dSMartin Matuska * this I/O. 766e716630dSMartin Matuska */ 767e716630dSMartin Matuska uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1)); 768e716630dSMartin Matuska 769e716630dSMartin Matuska /* How many rows contain data (not skip) */ 770e716630dSMartin Matuska uint64_t rows = howmany(tot, logical_cols); 771e716630dSMartin Matuska int cols = MIN(tot, logical_cols); 772e716630dSMartin Matuska 773e716630dSMartin Matuska raidz_map_t *rm = 774e716630dSMartin Matuska kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]), 775e716630dSMartin Matuska KM_SLEEP); 776e716630dSMartin Matuska rm->rm_nrows = rows; 777e716630dSMartin Matuska rm->rm_nskip = roundup(tot, nparity + 1) - tot; 778e716630dSMartin Matuska rm->rm_skipstart = bc; 779e716630dSMartin Matuska uint64_t asize = 0; 780e716630dSMartin Matuska 781e716630dSMartin Matuska for (uint64_t row = 0; row < rows; row++) { 782e716630dSMartin Matuska boolean_t row_use_scratch = B_FALSE; 78387bf66d4SMartin Matuska raidz_row_t *rr = vdev_raidz_row_alloc(cols, zio); 784e716630dSMartin Matuska rm->rm_row[row] = rr; 785e716630dSMartin Matuska 786e716630dSMartin Matuska /* The starting RAIDZ (parent) vdev sector of the row. */ 787e716630dSMartin Matuska uint64_t b = (offset >> ashift) + row * logical_cols; 788e716630dSMartin Matuska 789e716630dSMartin Matuska /* 790e716630dSMartin Matuska * If we are in the middle of a reflow, and the copying has 791e716630dSMartin Matuska * not yet completed for any part of this row, then use the 792e716630dSMartin Matuska * old location of this row. 
Note that reflow_offset_synced 793e716630dSMartin Matuska * reflects the i/o that's been completed, because it's 794e716630dSMartin Matuska * updated by a synctask, after zio_wait(spa_txg_zio[]). 795e716630dSMartin Matuska * This is sufficient for our check, even if that progress 796e716630dSMartin Matuska * has not yet been recorded to disk (reflected in 797e716630dSMartin Matuska * spa_ubsync). Also note that we consider the last row to 798e716630dSMartin Matuska * be "full width" (`cols`-wide rather than `bc`-wide) for 799e716630dSMartin Matuska * this calculation. This causes a tiny bit of unnecessary 800e716630dSMartin Matuska * double-writes but is safe and simpler to calculate. 801e716630dSMartin Matuska */ 802e716630dSMartin Matuska int row_phys_cols = physical_cols; 803e716630dSMartin Matuska if (b + cols > reflow_offset_synced >> ashift) 804e716630dSMartin Matuska row_phys_cols--; 805e716630dSMartin Matuska else if (use_scratch) 806e716630dSMartin Matuska row_use_scratch = B_TRUE; 807e716630dSMartin Matuska 808e716630dSMartin Matuska /* starting child of this row */ 809e716630dSMartin Matuska uint64_t child_id = b % row_phys_cols; 810e716630dSMartin Matuska /* The starting byte offset on each child vdev. */ 811e716630dSMartin Matuska uint64_t child_offset = (b / row_phys_cols) << ashift; 812e716630dSMartin Matuska 813e716630dSMartin Matuska /* 814e716630dSMartin Matuska * Note, rr_cols is the entire width of the block, even 815e716630dSMartin Matuska * if this row is shorter. This is needed because parity 816e716630dSMartin Matuska * generation (for Q and R) needs to know the entire width, 817e716630dSMartin Matuska * because it treats the short row as though it was 818e716630dSMartin Matuska * full-width (and the "phantom" sectors were zero-filled). 819e716630dSMartin Matuska * 820e716630dSMartin Matuska * Another approach to this would be to set cols shorter 821e716630dSMartin Matuska * (to just the number of columns that we might do i/o to) 822e716630dSMartin Matuska * and have another mechanism to tell the parity generation 823e716630dSMartin Matuska * about the "entire width". Reconstruction (at least 824e716630dSMartin Matuska * vdev_raidz_reconstruct_general()) would also need to 825e716630dSMartin Matuska * know about the "entire width". 826e716630dSMartin Matuska */ 827e716630dSMartin Matuska rr->rr_firstdatacol = nparity; 828e716630dSMartin Matuska #ifdef ZFS_DEBUG 829e716630dSMartin Matuska /* 830e716630dSMartin Matuska * note: rr_size is PSIZE, not ASIZE 831e716630dSMartin Matuska */ 832e716630dSMartin Matuska rr->rr_offset = b << ashift; 833e716630dSMartin Matuska rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift; 834e716630dSMartin Matuska #endif 835e716630dSMartin Matuska 836e716630dSMartin Matuska for (int c = 0; c < rr->rr_cols; c++, child_id++) { 837e716630dSMartin Matuska if (child_id >= row_phys_cols) { 838e716630dSMartin Matuska child_id -= row_phys_cols; 839e716630dSMartin Matuska child_offset += 1ULL << ashift; 840e716630dSMartin Matuska } 841e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c]; 842e716630dSMartin Matuska rc->rc_devidx = child_id; 843e716630dSMartin Matuska rc->rc_offset = child_offset; 844e716630dSMartin Matuska 845e716630dSMartin Matuska /* 846e716630dSMartin Matuska * Get this from the scratch space if appropriate. 
847e716630dSMartin Matuska * This only happens if we crashed in the middle of 848e716630dSMartin Matuska * raidz_reflow_scratch_sync() (while it's running, 849e716630dSMartin Matuska * the rangelock prevents us from doing concurrent 850e716630dSMartin Matuska * io), and even then only during zpool import or 851e716630dSMartin Matuska * when the pool is imported readonly. 852e716630dSMartin Matuska */ 853e716630dSMartin Matuska if (row_use_scratch) 854e716630dSMartin Matuska rc->rc_offset -= VDEV_BOOT_SIZE; 855e716630dSMartin Matuska 856e716630dSMartin Matuska uint64_t dc = c - rr->rr_firstdatacol; 857e716630dSMartin Matuska if (c < rr->rr_firstdatacol) { 858e716630dSMartin Matuska rc->rc_size = 1ULL << ashift; 859e716630dSMartin Matuska 860e716630dSMartin Matuska /* 861e716630dSMartin Matuska * Parity sectors' rc_abd's are set below 862e716630dSMartin Matuska * after determining if this is an aggregation. 863e716630dSMartin Matuska */ 864e716630dSMartin Matuska } else if (row == rows - 1 && bc != 0 && c >= bc) { 865e716630dSMartin Matuska /* 866e716630dSMartin Matuska * Past the end of the block (even including 867e716630dSMartin Matuska * skip sectors). This sector is part of the 868e716630dSMartin Matuska * map so that we have full rows for p/q parity 869e716630dSMartin Matuska * generation. 870e716630dSMartin Matuska */ 871e716630dSMartin Matuska rc->rc_size = 0; 872e716630dSMartin Matuska rc->rc_abd = NULL; 873e716630dSMartin Matuska } else { 874e716630dSMartin Matuska /* "data column" (col excluding parity) */ 875e716630dSMartin Matuska uint64_t off; 876e716630dSMartin Matuska 877e716630dSMartin Matuska if (c < bc || r == 0) { 878e716630dSMartin Matuska off = dc * rows + row; 879e716630dSMartin Matuska } else { 880e716630dSMartin Matuska off = r * rows + 881e716630dSMartin Matuska (dc - r) * (rows - 1) + row; 882e716630dSMartin Matuska } 883e716630dSMartin Matuska rc->rc_size = 1ULL << ashift; 884e716630dSMartin Matuska rc->rc_abd = abd_get_offset_struct( 885e716630dSMartin Matuska &rc->rc_abdstruct, abd, off << ashift, 886e716630dSMartin Matuska rc->rc_size); 887e716630dSMartin Matuska } 888e716630dSMartin Matuska 889e716630dSMartin Matuska if (rc->rc_size == 0) 890e716630dSMartin Matuska continue; 891e716630dSMartin Matuska 892e716630dSMartin Matuska /* 893e716630dSMartin Matuska * If any part of this row is in both old and new 894e716630dSMartin Matuska * locations, the primary location is the old 895e716630dSMartin Matuska * location. If this sector was already copied to the 896e716630dSMartin Matuska * new location, we need to also write to the new, 897e716630dSMartin Matuska * "shadow" location. 898e716630dSMartin Matuska * 899e716630dSMartin Matuska * Note, `row_phys_cols != physical_cols` indicates 900e716630dSMartin Matuska * that the primary location is the old location. 901e716630dSMartin Matuska * `b+c < reflow_offset_next` indicates that the copy 902e716630dSMartin Matuska * to the new location has been initiated. We know 903e716630dSMartin Matuska * that the copy has completed because we have the 904e716630dSMartin Matuska * rangelock, which is held exclusively while the 905e716630dSMartin Matuska * copy is in progress. 
906e716630dSMartin Matuska */ 907e716630dSMartin Matuska if (row_use_scratch || 908e716630dSMartin Matuska (row_phys_cols != physical_cols && 909e716630dSMartin Matuska b + c < reflow_offset_next >> ashift)) { 910e716630dSMartin Matuska rc->rc_shadow_devidx = (b + c) % physical_cols; 911e716630dSMartin Matuska rc->rc_shadow_offset = 912e716630dSMartin Matuska ((b + c) / physical_cols) << ashift; 913e716630dSMartin Matuska if (row_use_scratch) 914e716630dSMartin Matuska rc->rc_shadow_offset -= VDEV_BOOT_SIZE; 915e716630dSMartin Matuska } 916e716630dSMartin Matuska 917e716630dSMartin Matuska asize += rc->rc_size; 918e716630dSMartin Matuska } 919e716630dSMartin Matuska 920e716630dSMartin Matuska /* 921e716630dSMartin Matuska * See comment in vdev_raidz_map_alloc() 922e716630dSMartin Matuska */ 923e716630dSMartin Matuska if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 && 924e716630dSMartin Matuska (offset & (1ULL << 20))) { 925e716630dSMartin Matuska ASSERT(rr->rr_cols >= 2); 926e716630dSMartin Matuska ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); 927e716630dSMartin Matuska 928e716630dSMartin Matuska int devidx0 = rr->rr_col[0].rc_devidx; 929e716630dSMartin Matuska uint64_t offset0 = rr->rr_col[0].rc_offset; 930e716630dSMartin Matuska int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx; 931e716630dSMartin Matuska uint64_t shadow_offset0 = 932e716630dSMartin Matuska rr->rr_col[0].rc_shadow_offset; 933e716630dSMartin Matuska 934e716630dSMartin Matuska rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; 935e716630dSMartin Matuska rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; 936e716630dSMartin Matuska rr->rr_col[0].rc_shadow_devidx = 937e716630dSMartin Matuska rr->rr_col[1].rc_shadow_devidx; 938e716630dSMartin Matuska rr->rr_col[0].rc_shadow_offset = 939e716630dSMartin Matuska rr->rr_col[1].rc_shadow_offset; 940e716630dSMartin Matuska 941e716630dSMartin Matuska rr->rr_col[1].rc_devidx = devidx0; 942e716630dSMartin Matuska rr->rr_col[1].rc_offset = offset0; 943e716630dSMartin Matuska rr->rr_col[1].rc_shadow_devidx = shadow_devidx0; 944e716630dSMartin Matuska rr->rr_col[1].rc_shadow_offset = shadow_offset0; 945e716630dSMartin Matuska } 946e716630dSMartin Matuska } 947e716630dSMartin Matuska ASSERT3U(asize, ==, tot << ashift); 948e716630dSMartin Matuska 949e716630dSMartin Matuska /* 950e716630dSMartin Matuska * Determine if the block is contiguous, in which case we can use 951e716630dSMartin Matuska * an aggregation. 952e716630dSMartin Matuska */ 953e716630dSMartin Matuska if (rows >= raidz_io_aggregate_rows) { 954e716630dSMartin Matuska rm->rm_nphys_cols = physical_cols; 955e716630dSMartin Matuska rm->rm_phys_col = 956e716630dSMartin Matuska kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols, 957e716630dSMartin Matuska KM_SLEEP); 958e716630dSMartin Matuska 959e716630dSMartin Matuska /* 960e716630dSMartin Matuska * Determine the aggregate io's offset and size, and check 961e716630dSMartin Matuska * that the io is contiguous. 
962e716630dSMartin Matuska */ 963e716630dSMartin Matuska for (int i = 0; 964e716630dSMartin Matuska i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) { 965e716630dSMartin Matuska raidz_row_t *rr = rm->rm_row[i]; 966e716630dSMartin Matuska for (int c = 0; c < rr->rr_cols; c++) { 967e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c]; 968e716630dSMartin Matuska raidz_col_t *prc = 969e716630dSMartin Matuska &rm->rm_phys_col[rc->rc_devidx]; 970e716630dSMartin Matuska 971e716630dSMartin Matuska if (rc->rc_size == 0) 972e716630dSMartin Matuska continue; 973e716630dSMartin Matuska 974e716630dSMartin Matuska if (prc->rc_size == 0) { 975e716630dSMartin Matuska ASSERT0(prc->rc_offset); 976e716630dSMartin Matuska prc->rc_offset = rc->rc_offset; 977e716630dSMartin Matuska } else if (prc->rc_offset + prc->rc_size != 978e716630dSMartin Matuska rc->rc_offset) { 979e716630dSMartin Matuska /* 980e716630dSMartin Matuska * This block is not contiguous and 981e716630dSMartin Matuska * therefore can't be aggregated. 982e716630dSMartin Matuska * This is expected to be rare, so 983e716630dSMartin Matuska * the cost of allocating and then 984e716630dSMartin Matuska * freeing rm_phys_col is not 985e716630dSMartin Matuska * significant. 986e716630dSMartin Matuska */ 987e716630dSMartin Matuska kmem_free(rm->rm_phys_col, 988e716630dSMartin Matuska sizeof (raidz_col_t) * 989e716630dSMartin Matuska rm->rm_nphys_cols); 990e716630dSMartin Matuska rm->rm_phys_col = NULL; 991e716630dSMartin Matuska rm->rm_nphys_cols = 0; 992e716630dSMartin Matuska break; 993e716630dSMartin Matuska } 994e716630dSMartin Matuska prc->rc_size += rc->rc_size; 995e716630dSMartin Matuska } 996e716630dSMartin Matuska } 997e716630dSMartin Matuska } 998e716630dSMartin Matuska if (rm->rm_phys_col != NULL) { 999e716630dSMartin Matuska /* 1000e716630dSMartin Matuska * Allocate aggregate ABD's. 1001e716630dSMartin Matuska */ 1002e716630dSMartin Matuska for (int i = 0; i < rm->rm_nphys_cols; i++) { 1003e716630dSMartin Matuska raidz_col_t *prc = &rm->rm_phys_col[i]; 1004e716630dSMartin Matuska 1005e716630dSMartin Matuska prc->rc_devidx = i; 1006e716630dSMartin Matuska 1007e716630dSMartin Matuska if (prc->rc_size == 0) 1008e716630dSMartin Matuska continue; 1009e716630dSMartin Matuska 1010e716630dSMartin Matuska prc->rc_abd = 1011e716630dSMartin Matuska abd_alloc_linear(rm->rm_phys_col[i].rc_size, 1012e716630dSMartin Matuska B_FALSE); 1013e716630dSMartin Matuska } 1014e716630dSMartin Matuska 1015e716630dSMartin Matuska /* 1016e716630dSMartin Matuska * Point the parity abd's into the aggregate abd's. 1017e716630dSMartin Matuska */ 1018e716630dSMartin Matuska for (int i = 0; i < rm->rm_nrows; i++) { 1019e716630dSMartin Matuska raidz_row_t *rr = rm->rm_row[i]; 1020e716630dSMartin Matuska for (int c = 0; c < rr->rr_firstdatacol; c++) { 1021e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c]; 1022e716630dSMartin Matuska raidz_col_t *prc = 1023e716630dSMartin Matuska &rm->rm_phys_col[rc->rc_devidx]; 1024e716630dSMartin Matuska rc->rc_abd = 1025e716630dSMartin Matuska abd_get_offset_struct(&rc->rc_abdstruct, 1026e716630dSMartin Matuska prc->rc_abd, 1027e716630dSMartin Matuska rc->rc_offset - prc->rc_offset, 1028e716630dSMartin Matuska rc->rc_size); 1029e716630dSMartin Matuska } 1030e716630dSMartin Matuska } 1031e716630dSMartin Matuska } else { 1032e716630dSMartin Matuska /* 1033e716630dSMartin Matuska * Allocate new abd's for the parity sectors. 
1034e716630dSMartin Matuska */ 1035e716630dSMartin Matuska for (int i = 0; i < rm->rm_nrows; i++) { 1036e716630dSMartin Matuska raidz_row_t *rr = rm->rm_row[i]; 1037e716630dSMartin Matuska for (int c = 0; c < rr->rr_firstdatacol; c++) { 1038e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c]; 1039e716630dSMartin Matuska rc->rc_abd = 1040e716630dSMartin Matuska abd_alloc_linear(rc->rc_size, 1041e716630dSMartin Matuska B_TRUE); 1042e716630dSMartin Matuska } 1043e716630dSMartin Matuska } 1044e716630dSMartin Matuska } 1045eda14cbcSMatt Macy /* init RAIDZ parity ops */ 1046eda14cbcSMatt Macy rm->rm_ops = vdev_raidz_math_get_ops(); 1047eda14cbcSMatt Macy 1048eda14cbcSMatt Macy return (rm); 1049eda14cbcSMatt Macy } 1050eda14cbcSMatt Macy 1051eda14cbcSMatt Macy struct pqr_struct { 1052eda14cbcSMatt Macy uint64_t *p; 1053eda14cbcSMatt Macy uint64_t *q; 1054eda14cbcSMatt Macy uint64_t *r; 1055eda14cbcSMatt Macy }; 1056eda14cbcSMatt Macy 1057eda14cbcSMatt Macy static int 1058eda14cbcSMatt Macy vdev_raidz_p_func(void *buf, size_t size, void *private) 1059eda14cbcSMatt Macy { 1060eda14cbcSMatt Macy struct pqr_struct *pqr = private; 1061eda14cbcSMatt Macy const uint64_t *src = buf; 1062e716630dSMartin Matuska int cnt = size / sizeof (src[0]); 1063eda14cbcSMatt Macy 1064eda14cbcSMatt Macy ASSERT(pqr->p && !pqr->q && !pqr->r); 1065eda14cbcSMatt Macy 1066e716630dSMartin Matuska for (int i = 0; i < cnt; i++, src++, pqr->p++) 1067eda14cbcSMatt Macy *pqr->p ^= *src; 1068eda14cbcSMatt Macy 1069eda14cbcSMatt Macy return (0); 1070eda14cbcSMatt Macy } 1071eda14cbcSMatt Macy 1072eda14cbcSMatt Macy static int 1073eda14cbcSMatt Macy vdev_raidz_pq_func(void *buf, size_t size, void *private) 1074eda14cbcSMatt Macy { 1075eda14cbcSMatt Macy struct pqr_struct *pqr = private; 1076eda14cbcSMatt Macy const uint64_t *src = buf; 1077eda14cbcSMatt Macy uint64_t mask; 1078e716630dSMartin Matuska int cnt = size / sizeof (src[0]); 1079eda14cbcSMatt Macy 1080eda14cbcSMatt Macy ASSERT(pqr->p && pqr->q && !pqr->r); 1081eda14cbcSMatt Macy 1082e716630dSMartin Matuska for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { 1083eda14cbcSMatt Macy *pqr->p ^= *src; 1084eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 1085eda14cbcSMatt Macy *pqr->q ^= *src; 1086eda14cbcSMatt Macy } 1087eda14cbcSMatt Macy 1088eda14cbcSMatt Macy return (0); 1089eda14cbcSMatt Macy } 1090eda14cbcSMatt Macy 1091eda14cbcSMatt Macy static int 1092eda14cbcSMatt Macy vdev_raidz_pqr_func(void *buf, size_t size, void *private) 1093eda14cbcSMatt Macy { 1094eda14cbcSMatt Macy struct pqr_struct *pqr = private; 1095eda14cbcSMatt Macy const uint64_t *src = buf; 1096eda14cbcSMatt Macy uint64_t mask; 1097e716630dSMartin Matuska int cnt = size / sizeof (src[0]); 1098eda14cbcSMatt Macy 1099eda14cbcSMatt Macy ASSERT(pqr->p && pqr->q && pqr->r); 1100eda14cbcSMatt Macy 1101e716630dSMartin Matuska for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { 1102eda14cbcSMatt Macy *pqr->p ^= *src; 1103eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 1104eda14cbcSMatt Macy *pqr->q ^= *src; 1105eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_4(*pqr->r, mask); 1106eda14cbcSMatt Macy *pqr->r ^= *src; 1107eda14cbcSMatt Macy } 1108eda14cbcSMatt Macy 1109eda14cbcSMatt Macy return (0); 1110eda14cbcSMatt Macy } 1111eda14cbcSMatt Macy 1112eda14cbcSMatt Macy static void 11137877fdebSMatt Macy vdev_raidz_generate_parity_p(raidz_row_t *rr) 1114eda14cbcSMatt Macy { 11157877fdebSMatt Macy uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 1116eda14cbcSMatt Macy 
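	/*
	 * Copy the first data column directly into P, then fold each
	 * remaining data column into it with vdev_raidz_p_func(), which
	 * XORs the column's contents into the P buffer as
	 * abd_iterate_func() walks the ABD.
	 */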
11177877fdebSMatt Macy for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 11187877fdebSMatt Macy abd_t *src = rr->rr_col[c].rc_abd; 1119eda14cbcSMatt Macy 11207877fdebSMatt Macy if (c == rr->rr_firstdatacol) { 11217877fdebSMatt Macy abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 1122eda14cbcSMatt Macy } else { 1123eda14cbcSMatt Macy struct pqr_struct pqr = { p, NULL, NULL }; 11247877fdebSMatt Macy (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 1125eda14cbcSMatt Macy vdev_raidz_p_func, &pqr); 1126eda14cbcSMatt Macy } 1127eda14cbcSMatt Macy } 1128eda14cbcSMatt Macy } 1129eda14cbcSMatt Macy 1130eda14cbcSMatt Macy static void 11317877fdebSMatt Macy vdev_raidz_generate_parity_pq(raidz_row_t *rr) 1132eda14cbcSMatt Macy { 11337877fdebSMatt Macy uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 11347877fdebSMatt Macy uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 11357877fdebSMatt Macy uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 11367877fdebSMatt Macy ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 11377877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_Q].rc_size); 1138eda14cbcSMatt Macy 11397877fdebSMatt Macy for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 11407877fdebSMatt Macy abd_t *src = rr->rr_col[c].rc_abd; 1141eda14cbcSMatt Macy 11427877fdebSMatt Macy uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); 1143eda14cbcSMatt Macy 11447877fdebSMatt Macy if (c == rr->rr_firstdatacol) { 1145eda14cbcSMatt Macy ASSERT(ccnt == pcnt || ccnt == 0); 11467877fdebSMatt Macy abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 11477877fdebSMatt Macy (void) memcpy(q, p, rr->rr_col[c].rc_size); 1148eda14cbcSMatt Macy 11497877fdebSMatt Macy for (uint64_t i = ccnt; i < pcnt; i++) { 1150eda14cbcSMatt Macy p[i] = 0; 1151eda14cbcSMatt Macy q[i] = 0; 1152eda14cbcSMatt Macy } 1153eda14cbcSMatt Macy } else { 1154eda14cbcSMatt Macy struct pqr_struct pqr = { p, q, NULL }; 1155eda14cbcSMatt Macy 1156eda14cbcSMatt Macy ASSERT(ccnt <= pcnt); 11577877fdebSMatt Macy (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 1158eda14cbcSMatt Macy vdev_raidz_pq_func, &pqr); 1159eda14cbcSMatt Macy 1160eda14cbcSMatt Macy /* 1161eda14cbcSMatt Macy * Treat short columns as though they are full of 0s. 1162eda14cbcSMatt Macy * Note that there's therefore nothing needed for P. 
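 * For Q a short column still behaves like a column of zeros: the
 * running parity must still be multiplied by 2 once per column, which
 * is what the VDEV_RAIDZ_64MUL_2 loop below does for the remaining
 * (pcnt - ccnt) words.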
1163eda14cbcSMatt Macy */ 11647877fdebSMatt Macy uint64_t mask; 11657877fdebSMatt Macy for (uint64_t i = ccnt; i < pcnt; i++) { 1166eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(q[i], mask); 1167eda14cbcSMatt Macy } 1168eda14cbcSMatt Macy } 1169eda14cbcSMatt Macy } 1170eda14cbcSMatt Macy } 1171eda14cbcSMatt Macy 1172eda14cbcSMatt Macy static void 11737877fdebSMatt Macy vdev_raidz_generate_parity_pqr(raidz_row_t *rr) 1174eda14cbcSMatt Macy { 11757877fdebSMatt Macy uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 11767877fdebSMatt Macy uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 11777877fdebSMatt Macy uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd); 11787877fdebSMatt Macy uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 11797877fdebSMatt Macy ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 11807877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_Q].rc_size); 11817877fdebSMatt Macy ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 11827877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_R].rc_size); 1183eda14cbcSMatt Macy 11847877fdebSMatt Macy for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 11857877fdebSMatt Macy abd_t *src = rr->rr_col[c].rc_abd; 1186eda14cbcSMatt Macy 11877877fdebSMatt Macy uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); 1188eda14cbcSMatt Macy 11897877fdebSMatt Macy if (c == rr->rr_firstdatacol) { 1190eda14cbcSMatt Macy ASSERT(ccnt == pcnt || ccnt == 0); 11917877fdebSMatt Macy abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 11927877fdebSMatt Macy (void) memcpy(q, p, rr->rr_col[c].rc_size); 11937877fdebSMatt Macy (void) memcpy(r, p, rr->rr_col[c].rc_size); 1194eda14cbcSMatt Macy 11957877fdebSMatt Macy for (uint64_t i = ccnt; i < pcnt; i++) { 1196eda14cbcSMatt Macy p[i] = 0; 1197eda14cbcSMatt Macy q[i] = 0; 1198eda14cbcSMatt Macy r[i] = 0; 1199eda14cbcSMatt Macy } 1200eda14cbcSMatt Macy } else { 1201eda14cbcSMatt Macy struct pqr_struct pqr = { p, q, r }; 1202eda14cbcSMatt Macy 1203eda14cbcSMatt Macy ASSERT(ccnt <= pcnt); 12047877fdebSMatt Macy (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 1205eda14cbcSMatt Macy vdev_raidz_pqr_func, &pqr); 1206eda14cbcSMatt Macy 1207eda14cbcSMatt Macy /* 1208eda14cbcSMatt Macy * Treat short columns as though they are full of 0s. 1209eda14cbcSMatt Macy * Note that there's therefore nothing needed for P. 1210eda14cbcSMatt Macy */ 12117877fdebSMatt Macy uint64_t mask; 12127877fdebSMatt Macy for (uint64_t i = ccnt; i < pcnt; i++) { 1213eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(q[i], mask); 1214eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_4(r[i], mask); 1215eda14cbcSMatt Macy } 1216eda14cbcSMatt Macy } 1217eda14cbcSMatt Macy } 1218eda14cbcSMatt Macy } 1219eda14cbcSMatt Macy 1220eda14cbcSMatt Macy /* 1221eda14cbcSMatt Macy * Generate RAID parity in the first virtual columns according to the number of 1222eda14cbcSMatt Macy * parity columns available. 1223eda14cbcSMatt Macy */ 1224eda14cbcSMatt Macy void 12257877fdebSMatt Macy vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr) 1226eda14cbcSMatt Macy { 1227e716630dSMartin Matuska if (rr->rr_cols == 0) { 1228e716630dSMartin Matuska /* 1229e716630dSMartin Matuska * We are handling this block one row at a time (because 1230e716630dSMartin Matuska * this block has a different logical vs physical width, 1231e716630dSMartin Matuska * due to RAIDZ expansion), and this is a pad-only row, 1232e716630dSMartin Matuska * which has no parity. 
1233e716630dSMartin Matuska */ 1234e716630dSMartin Matuska return; 1235e716630dSMartin Matuska } 12367877fdebSMatt Macy 1237eda14cbcSMatt Macy /* Generate using the new math implementation */ 12387877fdebSMatt Macy if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL) 1239eda14cbcSMatt Macy return; 1240eda14cbcSMatt Macy 12417877fdebSMatt Macy switch (rr->rr_firstdatacol) { 1242eda14cbcSMatt Macy case 1: 12437877fdebSMatt Macy vdev_raidz_generate_parity_p(rr); 1244eda14cbcSMatt Macy break; 1245eda14cbcSMatt Macy case 2: 12467877fdebSMatt Macy vdev_raidz_generate_parity_pq(rr); 1247eda14cbcSMatt Macy break; 1248eda14cbcSMatt Macy case 3: 12497877fdebSMatt Macy vdev_raidz_generate_parity_pqr(rr); 1250eda14cbcSMatt Macy break; 1251eda14cbcSMatt Macy default: 1252eda14cbcSMatt Macy cmn_err(CE_PANIC, "invalid RAID-Z configuration"); 1253eda14cbcSMatt Macy } 1254eda14cbcSMatt Macy } 1255eda14cbcSMatt Macy 12567877fdebSMatt Macy void 12577877fdebSMatt Macy vdev_raidz_generate_parity(raidz_map_t *rm) 12587877fdebSMatt Macy { 12597877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 12607877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i]; 12617877fdebSMatt Macy vdev_raidz_generate_parity_row(rm, rr); 12627877fdebSMatt Macy } 12637877fdebSMatt Macy } 12647877fdebSMatt Macy 1265eda14cbcSMatt Macy static int 1266eda14cbcSMatt Macy vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) 1267eda14cbcSMatt Macy { 1268e92ffd9bSMartin Matuska (void) private; 1269eda14cbcSMatt Macy uint64_t *dst = dbuf; 1270eda14cbcSMatt Macy uint64_t *src = sbuf; 1271eda14cbcSMatt Macy int cnt = size / sizeof (src[0]); 1272eda14cbcSMatt Macy 1273eda14cbcSMatt Macy for (int i = 0; i < cnt; i++) { 1274eda14cbcSMatt Macy dst[i] ^= src[i]; 1275eda14cbcSMatt Macy } 1276eda14cbcSMatt Macy 1277eda14cbcSMatt Macy return (0); 1278eda14cbcSMatt Macy } 1279eda14cbcSMatt Macy 1280eda14cbcSMatt Macy static int 1281eda14cbcSMatt Macy vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size, 1282eda14cbcSMatt Macy void *private) 1283eda14cbcSMatt Macy { 1284e92ffd9bSMartin Matuska (void) private; 1285eda14cbcSMatt Macy uint64_t *dst = dbuf; 1286eda14cbcSMatt Macy uint64_t *src = sbuf; 1287eda14cbcSMatt Macy uint64_t mask; 1288eda14cbcSMatt Macy int cnt = size / sizeof (dst[0]); 1289eda14cbcSMatt Macy 1290eda14cbcSMatt Macy for (int i = 0; i < cnt; i++, dst++, src++) { 1291eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(*dst, mask); 1292eda14cbcSMatt Macy *dst ^= *src; 1293eda14cbcSMatt Macy } 1294eda14cbcSMatt Macy 1295eda14cbcSMatt Macy return (0); 1296eda14cbcSMatt Macy } 1297eda14cbcSMatt Macy 1298eda14cbcSMatt Macy static int 1299eda14cbcSMatt Macy vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private) 1300eda14cbcSMatt Macy { 1301e92ffd9bSMartin Matuska (void) private; 1302eda14cbcSMatt Macy uint64_t *dst = buf; 1303eda14cbcSMatt Macy uint64_t mask; 1304eda14cbcSMatt Macy int cnt = size / sizeof (dst[0]); 1305eda14cbcSMatt Macy 1306eda14cbcSMatt Macy for (int i = 0; i < cnt; i++, dst++) { 1307eda14cbcSMatt Macy /* same operation as vdev_raidz_reconst_q_pre_func() on dst */ 1308eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(*dst, mask); 1309eda14cbcSMatt Macy } 1310eda14cbcSMatt Macy 1311eda14cbcSMatt Macy return (0); 1312eda14cbcSMatt Macy } 1313eda14cbcSMatt Macy 1314eda14cbcSMatt Macy struct reconst_q_struct { 1315eda14cbcSMatt Macy uint64_t *q; 1316eda14cbcSMatt Macy int exp; 1317eda14cbcSMatt Macy }; 1318eda14cbcSMatt Macy 1319eda14cbcSMatt Macy static int 1320eda14cbcSMatt Macy 
vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private) 1321eda14cbcSMatt Macy { 1322eda14cbcSMatt Macy struct reconst_q_struct *rq = private; 1323eda14cbcSMatt Macy uint64_t *dst = buf; 1324eda14cbcSMatt Macy int cnt = size / sizeof (dst[0]); 1325eda14cbcSMatt Macy 1326eda14cbcSMatt Macy for (int i = 0; i < cnt; i++, dst++, rq->q++) { 1327eda14cbcSMatt Macy int j; 1328eda14cbcSMatt Macy uint8_t *b; 1329eda14cbcSMatt Macy 1330eda14cbcSMatt Macy *dst ^= *rq->q; 1331eda14cbcSMatt Macy for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 1332eda14cbcSMatt Macy *b = vdev_raidz_exp2(*b, rq->exp); 1333eda14cbcSMatt Macy } 1334eda14cbcSMatt Macy } 1335eda14cbcSMatt Macy 1336eda14cbcSMatt Macy return (0); 1337eda14cbcSMatt Macy } 1338eda14cbcSMatt Macy 1339eda14cbcSMatt Macy struct reconst_pq_struct { 1340eda14cbcSMatt Macy uint8_t *p; 1341eda14cbcSMatt Macy uint8_t *q; 1342eda14cbcSMatt Macy uint8_t *pxy; 1343eda14cbcSMatt Macy uint8_t *qxy; 1344eda14cbcSMatt Macy int aexp; 1345eda14cbcSMatt Macy int bexp; 1346eda14cbcSMatt Macy }; 1347eda14cbcSMatt Macy 1348eda14cbcSMatt Macy static int 1349eda14cbcSMatt Macy vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private) 1350eda14cbcSMatt Macy { 1351eda14cbcSMatt Macy struct reconst_pq_struct *rpq = private; 1352eda14cbcSMatt Macy uint8_t *xd = xbuf; 1353eda14cbcSMatt Macy uint8_t *yd = ybuf; 1354eda14cbcSMatt Macy 1355eda14cbcSMatt Macy for (int i = 0; i < size; 1356eda14cbcSMatt Macy i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) { 1357eda14cbcSMatt Macy *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 1358eda14cbcSMatt Macy vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 1359eda14cbcSMatt Macy *yd = *rpq->p ^ *rpq->pxy ^ *xd; 1360eda14cbcSMatt Macy } 1361eda14cbcSMatt Macy 1362eda14cbcSMatt Macy return (0); 1363eda14cbcSMatt Macy } 1364eda14cbcSMatt Macy 1365eda14cbcSMatt Macy static int 1366eda14cbcSMatt Macy vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) 1367eda14cbcSMatt Macy { 1368eda14cbcSMatt Macy struct reconst_pq_struct *rpq = private; 1369eda14cbcSMatt Macy uint8_t *xd = xbuf; 1370eda14cbcSMatt Macy 1371eda14cbcSMatt Macy for (int i = 0; i < size; 1372eda14cbcSMatt Macy i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) { 1373eda14cbcSMatt Macy /* same operation as vdev_raidz_reconst_pq_func() on xd */ 1374eda14cbcSMatt Macy *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 1375eda14cbcSMatt Macy vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 1376eda14cbcSMatt Macy } 1377eda14cbcSMatt Macy 1378eda14cbcSMatt Macy return (0); 1379eda14cbcSMatt Macy } 1380eda14cbcSMatt Macy 1381f9693befSMartin Matuska static void 13827877fdebSMatt Macy vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts) 1383eda14cbcSMatt Macy { 1384eda14cbcSMatt Macy int x = tgts[0]; 1385eda14cbcSMatt Macy abd_t *dst, *src; 1386eda14cbcSMatt Macy 1387e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1388e716630dSMartin Matuska zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x); 1389e716630dSMartin Matuska 13907877fdebSMatt Macy ASSERT3U(ntgts, ==, 1); 13917877fdebSMatt Macy ASSERT3U(x, >=, rr->rr_firstdatacol); 13927877fdebSMatt Macy ASSERT3U(x, <, rr->rr_cols); 1393eda14cbcSMatt Macy 13947877fdebSMatt Macy ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size); 1395eda14cbcSMatt Macy 13967877fdebSMatt Macy src = rr->rr_col[VDEV_RAIDZ_P].rc_abd; 13977877fdebSMatt Macy dst = rr->rr_col[x].rc_abd; 1398eda14cbcSMatt Macy 13997877fdebSMatt Macy 
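	/*
	 * Start from a copy of the P column, then XOR each surviving data
	 * column back out of it in the loop below
	 * (vdev_raidz_reconst_p_func); what is left in column x is the
	 * missing data.
	 */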
abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size); 1400eda14cbcSMatt Macy 14017877fdebSMatt Macy for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 14027877fdebSMatt Macy uint64_t size = MIN(rr->rr_col[x].rc_size, 14037877fdebSMatt Macy rr->rr_col[c].rc_size); 1404eda14cbcSMatt Macy 14057877fdebSMatt Macy src = rr->rr_col[c].rc_abd; 1406eda14cbcSMatt Macy 1407eda14cbcSMatt Macy if (c == x) 1408eda14cbcSMatt Macy continue; 1409eda14cbcSMatt Macy 1410eda14cbcSMatt Macy (void) abd_iterate_func2(dst, src, 0, 0, size, 1411eda14cbcSMatt Macy vdev_raidz_reconst_p_func, NULL); 1412eda14cbcSMatt Macy } 1413eda14cbcSMatt Macy } 1414eda14cbcSMatt Macy 1415f9693befSMartin Matuska static void 14167877fdebSMatt Macy vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts) 1417eda14cbcSMatt Macy { 1418eda14cbcSMatt Macy int x = tgts[0]; 1419eda14cbcSMatt Macy int c, exp; 1420eda14cbcSMatt Macy abd_t *dst, *src; 1421eda14cbcSMatt Macy 1422e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1423e716630dSMartin Matuska zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x); 1424e716630dSMartin Matuska 1425eda14cbcSMatt Macy ASSERT(ntgts == 1); 1426eda14cbcSMatt Macy 14277877fdebSMatt Macy ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size); 1428eda14cbcSMatt Macy 14297877fdebSMatt Macy for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 14307877fdebSMatt Macy uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size, 14317877fdebSMatt Macy rr->rr_col[c].rc_size); 1432eda14cbcSMatt Macy 14337877fdebSMatt Macy src = rr->rr_col[c].rc_abd; 14347877fdebSMatt Macy dst = rr->rr_col[x].rc_abd; 1435eda14cbcSMatt Macy 14367877fdebSMatt Macy if (c == rr->rr_firstdatacol) { 1437eda14cbcSMatt Macy abd_copy(dst, src, size); 14387877fdebSMatt Macy if (rr->rr_col[x].rc_size > size) { 1439eda14cbcSMatt Macy abd_zero_off(dst, size, 14407877fdebSMatt Macy rr->rr_col[x].rc_size - size); 14417877fdebSMatt Macy } 1442eda14cbcSMatt Macy } else { 14437877fdebSMatt Macy ASSERT3U(size, <=, rr->rr_col[x].rc_size); 1444eda14cbcSMatt Macy (void) abd_iterate_func2(dst, src, 0, 0, size, 1445eda14cbcSMatt Macy vdev_raidz_reconst_q_pre_func, NULL); 1446eda14cbcSMatt Macy (void) abd_iterate_func(dst, 14477877fdebSMatt Macy size, rr->rr_col[x].rc_size - size, 1448eda14cbcSMatt Macy vdev_raidz_reconst_q_pre_tail_func, NULL); 1449eda14cbcSMatt Macy } 1450eda14cbcSMatt Macy } 1451eda14cbcSMatt Macy 14527877fdebSMatt Macy src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; 14537877fdebSMatt Macy dst = rr->rr_col[x].rc_abd; 14547877fdebSMatt Macy exp = 255 - (rr->rr_cols - 1 - x); 1455eda14cbcSMatt Macy 1456eda14cbcSMatt Macy struct reconst_q_struct rq = { abd_to_buf(src), exp }; 14577877fdebSMatt Macy (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size, 1458eda14cbcSMatt Macy vdev_raidz_reconst_q_post_func, &rq); 1459eda14cbcSMatt Macy } 1460eda14cbcSMatt Macy 1461f9693befSMartin Matuska static void 14627877fdebSMatt Macy vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) 1463eda14cbcSMatt Macy { 1464eda14cbcSMatt Macy uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; 1465eda14cbcSMatt Macy abd_t *pdata, *qdata; 1466eda14cbcSMatt Macy uint64_t xsize, ysize; 1467eda14cbcSMatt Macy int x = tgts[0]; 1468eda14cbcSMatt Macy int y = tgts[1]; 1469eda14cbcSMatt Macy abd_t *xd, *yd; 1470eda14cbcSMatt Macy 1471e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1472e716630dSMartin Matuska zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y); 1473e716630dSMartin Matuska 
1474eda14cbcSMatt Macy ASSERT(ntgts == 2); 1475eda14cbcSMatt Macy ASSERT(x < y); 14767877fdebSMatt Macy ASSERT(x >= rr->rr_firstdatacol); 14777877fdebSMatt Macy ASSERT(y < rr->rr_cols); 1478eda14cbcSMatt Macy 14797877fdebSMatt Macy ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size); 1480eda14cbcSMatt Macy 1481eda14cbcSMatt Macy /* 1482eda14cbcSMatt Macy * Move the parity data aside -- we're going to compute parity as 1483eda14cbcSMatt Macy * though columns x and y were full of zeros -- Pxy and Qxy. We want to 1484eda14cbcSMatt Macy * reuse the parity generation mechanism without trashing the actual 1485eda14cbcSMatt Macy * parity so we make those columns appear to be full of zeros by 1486eda14cbcSMatt Macy * setting their lengths to zero. 1487eda14cbcSMatt Macy */ 14887877fdebSMatt Macy pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd; 14897877fdebSMatt Macy qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; 14907877fdebSMatt Macy xsize = rr->rr_col[x].rc_size; 14917877fdebSMatt Macy ysize = rr->rr_col[y].rc_size; 1492eda14cbcSMatt Macy 14937877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_P].rc_abd = 14947877fdebSMatt Macy abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE); 14957877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_Q].rc_abd = 14967877fdebSMatt Macy abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); 14977877fdebSMatt Macy rr->rr_col[x].rc_size = 0; 14987877fdebSMatt Macy rr->rr_col[y].rc_size = 0; 1499eda14cbcSMatt Macy 15007877fdebSMatt Macy vdev_raidz_generate_parity_pq(rr); 1501eda14cbcSMatt Macy 15027877fdebSMatt Macy rr->rr_col[x].rc_size = xsize; 15037877fdebSMatt Macy rr->rr_col[y].rc_size = ysize; 1504eda14cbcSMatt Macy 1505eda14cbcSMatt Macy p = abd_to_buf(pdata); 1506eda14cbcSMatt Macy q = abd_to_buf(qdata); 15077877fdebSMatt Macy pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 15087877fdebSMatt Macy qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 15097877fdebSMatt Macy xd = rr->rr_col[x].rc_abd; 15107877fdebSMatt Macy yd = rr->rr_col[y].rc_abd; 1511eda14cbcSMatt Macy 1512eda14cbcSMatt Macy /* 1513eda14cbcSMatt Macy * We now have: 1514eda14cbcSMatt Macy * Pxy = P + D_x + D_y 1515eda14cbcSMatt Macy * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 1516eda14cbcSMatt Macy * 1517eda14cbcSMatt Macy * We can then solve for D_x: 1518eda14cbcSMatt Macy * D_x = A * (P + Pxy) + B * (Q + Qxy) 1519eda14cbcSMatt Macy * where 1520eda14cbcSMatt Macy * A = 2^(x - y) * (2^(x - y) + 1)^-1 1521eda14cbcSMatt Macy * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 1522eda14cbcSMatt Macy * 1523eda14cbcSMatt Macy * With D_x in hand, we can easily solve for D_y: 1524eda14cbcSMatt Macy * D_y = P + Pxy + D_x 1525eda14cbcSMatt Macy */ 1526eda14cbcSMatt Macy 1527eda14cbcSMatt Macy a = vdev_raidz_pow2[255 + x - y]; 15287877fdebSMatt Macy b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)]; 1529eda14cbcSMatt Macy tmp = 255 - vdev_raidz_log2[a ^ 1]; 1530eda14cbcSMatt Macy 1531eda14cbcSMatt Macy aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 1532eda14cbcSMatt Macy bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 1533eda14cbcSMatt Macy 1534eda14cbcSMatt Macy ASSERT3U(xsize, >=, ysize); 1535eda14cbcSMatt Macy struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp }; 1536eda14cbcSMatt Macy 1537eda14cbcSMatt Macy (void) abd_iterate_func2(xd, yd, 0, 0, ysize, 1538eda14cbcSMatt Macy vdev_raidz_reconst_pq_func, &rpq); 1539eda14cbcSMatt Macy (void) abd_iterate_func(xd, ysize, xsize - ysize, 1540eda14cbcSMatt Macy vdev_raidz_reconst_pq_tail_func, &rpq); 1541eda14cbcSMatt Macy 15427877fdebSMatt Macy 
abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 15437877fdebSMatt Macy abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 1544eda14cbcSMatt Macy 1545eda14cbcSMatt Macy /* 1546eda14cbcSMatt Macy * Restore the saved parity data. 1547eda14cbcSMatt Macy */ 15487877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata; 15497877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata; 1550eda14cbcSMatt Macy } 1551eda14cbcSMatt Macy 1552eda14cbcSMatt Macy /* 1553eda14cbcSMatt Macy * In the general case of reconstruction, we must solve the system of linear 1554eda14cbcSMatt Macy * equations defined by the coefficients used to generate parity as well as 1555eda14cbcSMatt Macy * the contents of the data and parity disks. This can be expressed with 1556eda14cbcSMatt Macy * vectors for the original data (D) and the actual data (d) and parity (p) 1557eda14cbcSMatt Macy * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 1558eda14cbcSMatt Macy * 1559eda14cbcSMatt Macy * __ __ __ __ 1560eda14cbcSMatt Macy * | | __ __ | p_0 | 1561eda14cbcSMatt Macy * | V | | D_0 | | p_m-1 | 1562eda14cbcSMatt Macy * | | x | : | = | d_0 | 1563eda14cbcSMatt Macy * | I | | D_n-1 | | : | 1564eda14cbcSMatt Macy * | | ~~ ~~ | d_n-1 | 1565eda14cbcSMatt Macy * ~~ ~~ ~~ ~~ 1566eda14cbcSMatt Macy * 1567eda14cbcSMatt Macy * I is simply a square identity matrix of size n, and V is a vandermonde 1568eda14cbcSMatt Macy * matrix defined by the coefficients we chose for the various parity columns 1569eda14cbcSMatt Macy * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 1570eda14cbcSMatt Macy * computation as well as linear separability. 1571eda14cbcSMatt Macy * 1572eda14cbcSMatt Macy * __ __ __ __ 1573eda14cbcSMatt Macy * | 1 .. 1 1 1 | | p_0 | 1574eda14cbcSMatt Macy * | 2^n-1 .. 4 2 1 | __ __ | : | 1575eda14cbcSMatt Macy * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 1576eda14cbcSMatt Macy * | 1 .. 0 0 0 | | D_1 | | d_0 | 1577eda14cbcSMatt Macy * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 1578eda14cbcSMatt Macy * | : : : : | | : | | d_2 | 1579eda14cbcSMatt Macy * | 0 .. 1 0 0 | | D_n-1 | | : | 1580eda14cbcSMatt Macy * | 0 .. 0 1 0 | ~~ ~~ | : | 1581eda14cbcSMatt Macy * | 0 .. 0 0 1 | | d_n-1 | 1582eda14cbcSMatt Macy * ~~ ~~ ~~ ~~ 1583eda14cbcSMatt Macy * 1584eda14cbcSMatt Macy * Note that I, V, d, and p are known. To compute D, we must invert the 1585eda14cbcSMatt Macy * matrix and use the known data and parity values to reconstruct the unknown 1586eda14cbcSMatt Macy * data values. We begin by removing the rows in V|I and d|p that correspond 1587eda14cbcSMatt Macy * to failed or missing columns; we then make V|I square (n x n) and d|p 1588eda14cbcSMatt Macy * sized n by removing rows corresponding to unused parity from the bottom up 1589eda14cbcSMatt Macy * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 1590eda14cbcSMatt Macy * using Gauss-Jordan elimination. 
In the example below we use m=3 parity
1591eda14cbcSMatt Macy * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
1592eda14cbcSMatt Macy * __ __
1593eda14cbcSMatt Macy * | 1 1 1 1 1 1 1 1 |
1594eda14cbcSMatt Macy * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
1595eda14cbcSMatt Macy * | 19 205 116 29 64 16 4 1 | / /
1596eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 | / /
1597eda14cbcSMatt Macy * | 0 1 0 0 0 0 0 0 | <--' /
1598eda14cbcSMatt Macy * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
1599eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 |
1600eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 |
1601eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 |
1602eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 |
1603eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 |
1604eda14cbcSMatt Macy * ~~ ~~
1605eda14cbcSMatt Macy * __ __
1606eda14cbcSMatt Macy * | 1 1 1 1 1 1 1 1 |
1607eda14cbcSMatt Macy * | 19 205 116 29 64 16 4 1 |
1608eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 |
1609eda14cbcSMatt Macy * (V|I)' = | 0 0 0 1 0 0 0 0 |
1610eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 |
1611eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 |
1612eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 |
1613eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 |
1614eda14cbcSMatt Macy * ~~ ~~
1618eda14cbcSMatt Macy *
1619eda14cbcSMatt Macy * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1620eda14cbcSMatt Macy * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1621eda14cbcSMatt Macy * matrix is not singular.
1622eda14cbcSMatt Macy * __ __
1623eda14cbcSMatt Macy * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1624eda14cbcSMatt Macy * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1625eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1626eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1627eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1628eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1629eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1630eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1631eda14cbcSMatt Macy * ~~ ~~
1632eda14cbcSMatt Macy * __ __
1633eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1634eda14cbcSMatt Macy * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1635eda14cbcSMatt Macy * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1636eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1637eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1638eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1639eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1640eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1641eda14cbcSMatt Macy * ~~ ~~
1642eda14cbcSMatt Macy * __ __
1643eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1644eda14cbcSMatt Macy * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1645eda14cbcSMatt Macy * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
1646eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1647eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1648eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1649eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1650eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1651eda14cbcSMatt Macy * ~~ ~~
1652eda14cbcSMatt Macy * __ __
1653eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1654eda14cbcSMatt Macy * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1655eda14cbcSMatt Macy * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
1656eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 0 0 0 1 0
0 0 0 | 1657eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1658eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1659eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1660eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1661eda14cbcSMatt Macy * ~~ ~~ 1662eda14cbcSMatt Macy * __ __ 1663eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1664eda14cbcSMatt Macy * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1665eda14cbcSMatt Macy * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1666eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1667eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1668eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1669eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1670eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1671eda14cbcSMatt Macy * ~~ ~~ 1672eda14cbcSMatt Macy * __ __ 1673eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1674eda14cbcSMatt Macy * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 1675eda14cbcSMatt Macy * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1676eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1677eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1678eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1679eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1680eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1681eda14cbcSMatt Macy * ~~ ~~ 1682eda14cbcSMatt Macy * __ __ 1683eda14cbcSMatt Macy * | 0 0 1 0 0 0 0 0 | 1684eda14cbcSMatt Macy * | 167 100 5 41 159 169 217 208 | 1685eda14cbcSMatt Macy * | 166 100 4 40 158 168 216 209 | 1686eda14cbcSMatt Macy * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 1687eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 | 1688eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 | 1689eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 | 1690eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 | 1691eda14cbcSMatt Macy * ~~ ~~ 1692eda14cbcSMatt Macy * 1693eda14cbcSMatt Macy * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 1694eda14cbcSMatt Macy * of the missing data. 1695eda14cbcSMatt Macy * 1696eda14cbcSMatt Macy * As is apparent from the example above, the only non-trivial rows in the 1697eda14cbcSMatt Macy * inverse matrix correspond to the data disks that we're trying to 1698eda14cbcSMatt Macy * reconstruct. Indeed, those are the only rows we need as the others would 1699eda14cbcSMatt Macy * only be useful for reconstructing data known or assumed to be valid. For 1700eda14cbcSMatt Macy * that reason, we only build the coefficients in the rows that correspond to 1701eda14cbcSMatt Macy * targeted columns. 1702eda14cbcSMatt Macy */ 1703eda14cbcSMatt Macy 1704eda14cbcSMatt Macy static void 17057877fdebSMatt Macy vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map, 1706eda14cbcSMatt Macy uint8_t **rows) 1707eda14cbcSMatt Macy { 1708eda14cbcSMatt Macy int i, j; 1709eda14cbcSMatt Macy int pow; 1710eda14cbcSMatt Macy 17117877fdebSMatt Macy ASSERT(n == rr->rr_cols - rr->rr_firstdatacol); 1712eda14cbcSMatt Macy 1713eda14cbcSMatt Macy /* 1714eda14cbcSMatt Macy * Fill in the missing rows of interest. 
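 * Row i rebuilds the vandermonde row for parity column map[i]: working
 * in the log domain, rows[i][j] comes out as (2^map[i])^(n-1-j), i.e.
 * the column's coefficient (1, 2, or 4) raised to decreasing powers
 * across the n data columns.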
1715eda14cbcSMatt Macy */ 1716eda14cbcSMatt Macy for (i = 0; i < nmap; i++) { 1717eda14cbcSMatt Macy ASSERT3S(0, <=, map[i]); 1718eda14cbcSMatt Macy ASSERT3S(map[i], <=, 2); 1719eda14cbcSMatt Macy 1720eda14cbcSMatt Macy pow = map[i] * n; 1721eda14cbcSMatt Macy if (pow > 255) 1722eda14cbcSMatt Macy pow -= 255; 1723eda14cbcSMatt Macy ASSERT(pow <= 255); 1724eda14cbcSMatt Macy 1725eda14cbcSMatt Macy for (j = 0; j < n; j++) { 1726eda14cbcSMatt Macy pow -= map[i]; 1727eda14cbcSMatt Macy if (pow < 0) 1728eda14cbcSMatt Macy pow += 255; 1729eda14cbcSMatt Macy rows[i][j] = vdev_raidz_pow2[pow]; 1730eda14cbcSMatt Macy } 1731eda14cbcSMatt Macy } 1732eda14cbcSMatt Macy } 1733eda14cbcSMatt Macy 1734eda14cbcSMatt Macy static void 17357877fdebSMatt Macy vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing, 1736eda14cbcSMatt Macy uint8_t **rows, uint8_t **invrows, const uint8_t *used) 1737eda14cbcSMatt Macy { 1738eda14cbcSMatt Macy int i, j, ii, jj; 1739eda14cbcSMatt Macy uint8_t log; 1740eda14cbcSMatt Macy 1741eda14cbcSMatt Macy /* 1742eda14cbcSMatt Macy * Assert that the first nmissing entries from the array of used 1743eda14cbcSMatt Macy * columns correspond to parity columns and that subsequent entries 1744eda14cbcSMatt Macy * correspond to data columns. 1745eda14cbcSMatt Macy */ 1746eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) { 17477877fdebSMatt Macy ASSERT3S(used[i], <, rr->rr_firstdatacol); 1748eda14cbcSMatt Macy } 1749eda14cbcSMatt Macy for (; i < n; i++) { 17507877fdebSMatt Macy ASSERT3S(used[i], >=, rr->rr_firstdatacol); 1751eda14cbcSMatt Macy } 1752eda14cbcSMatt Macy 1753eda14cbcSMatt Macy /* 1754eda14cbcSMatt Macy * First initialize the storage where we'll compute the inverse rows. 1755eda14cbcSMatt Macy */ 1756eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) { 1757eda14cbcSMatt Macy for (j = 0; j < n; j++) { 1758eda14cbcSMatt Macy invrows[i][j] = (i == j) ? 1 : 0; 1759eda14cbcSMatt Macy } 1760eda14cbcSMatt Macy } 1761eda14cbcSMatt Macy 1762eda14cbcSMatt Macy /* 1763eda14cbcSMatt Macy * Subtract all trivial rows from the rows of consequence. 1764eda14cbcSMatt Macy */ 1765eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) { 1766eda14cbcSMatt Macy for (j = nmissing; j < n; j++) { 17677877fdebSMatt Macy ASSERT3U(used[j], >=, rr->rr_firstdatacol); 17687877fdebSMatt Macy jj = used[j] - rr->rr_firstdatacol; 1769eda14cbcSMatt Macy ASSERT3S(jj, <, n); 1770eda14cbcSMatt Macy invrows[i][j] = rows[i][jj]; 1771eda14cbcSMatt Macy rows[i][jj] = 0; 1772eda14cbcSMatt Macy } 1773eda14cbcSMatt Macy } 1774eda14cbcSMatt Macy 1775eda14cbcSMatt Macy /* 1776eda14cbcSMatt Macy * For each of the rows of interest, we must normalize it and subtract 1777eda14cbcSMatt Macy * a multiple of it from the other rows. 1778eda14cbcSMatt Macy */ 1779eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) { 1780eda14cbcSMatt Macy for (j = 0; j < missing[i]; j++) { 1781eda14cbcSMatt Macy ASSERT0(rows[i][j]); 1782eda14cbcSMatt Macy } 1783eda14cbcSMatt Macy ASSERT3U(rows[i][missing[i]], !=, 0); 1784eda14cbcSMatt Macy 1785eda14cbcSMatt Macy /* 1786eda14cbcSMatt Macy * Compute the inverse of the first element and multiply each 1787eda14cbcSMatt Macy * element in the row by that value. 
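 * In the log domain this is a table lookup: log below is
 * 255 - log2(pivot), and multiplying an element by exp2(element, log)
 * divides it by the pivot, so the pivot itself becomes 1 in both the
 * row and its accompanying inverse row.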
1788eda14cbcSMatt Macy */ 1789eda14cbcSMatt Macy log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 1790eda14cbcSMatt Macy 1791eda14cbcSMatt Macy for (j = 0; j < n; j++) { 1792eda14cbcSMatt Macy rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 1793eda14cbcSMatt Macy invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 1794eda14cbcSMatt Macy } 1795eda14cbcSMatt Macy 1796eda14cbcSMatt Macy for (ii = 0; ii < nmissing; ii++) { 1797eda14cbcSMatt Macy if (i == ii) 1798eda14cbcSMatt Macy continue; 1799eda14cbcSMatt Macy 1800eda14cbcSMatt Macy ASSERT3U(rows[ii][missing[i]], !=, 0); 1801eda14cbcSMatt Macy 1802eda14cbcSMatt Macy log = vdev_raidz_log2[rows[ii][missing[i]]]; 1803eda14cbcSMatt Macy 1804eda14cbcSMatt Macy for (j = 0; j < n; j++) { 1805eda14cbcSMatt Macy rows[ii][j] ^= 1806eda14cbcSMatt Macy vdev_raidz_exp2(rows[i][j], log); 1807eda14cbcSMatt Macy invrows[ii][j] ^= 1808eda14cbcSMatt Macy vdev_raidz_exp2(invrows[i][j], log); 1809eda14cbcSMatt Macy } 1810eda14cbcSMatt Macy } 1811eda14cbcSMatt Macy } 1812eda14cbcSMatt Macy 1813eda14cbcSMatt Macy /* 1814eda14cbcSMatt Macy * Verify that the data that is left in the rows are properly part of 1815eda14cbcSMatt Macy * an identity matrix. 1816eda14cbcSMatt Macy */ 1817eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) { 1818eda14cbcSMatt Macy for (j = 0; j < n; j++) { 1819eda14cbcSMatt Macy if (j == missing[i]) { 1820eda14cbcSMatt Macy ASSERT3U(rows[i][j], ==, 1); 1821eda14cbcSMatt Macy } else { 1822eda14cbcSMatt Macy ASSERT0(rows[i][j]); 1823eda14cbcSMatt Macy } 1824eda14cbcSMatt Macy } 1825eda14cbcSMatt Macy } 1826eda14cbcSMatt Macy } 1827eda14cbcSMatt Macy 1828eda14cbcSMatt Macy static void 18297877fdebSMatt Macy vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing, 1830eda14cbcSMatt Macy int *missing, uint8_t **invrows, const uint8_t *used) 1831eda14cbcSMatt Macy { 1832eda14cbcSMatt Macy int i, j, x, cc, c; 1833eda14cbcSMatt Macy uint8_t *src; 1834eda14cbcSMatt Macy uint64_t ccount; 1835eda14cbcSMatt Macy uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL }; 1836eda14cbcSMatt Macy uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 }; 1837eda14cbcSMatt Macy uint8_t log = 0; 1838eda14cbcSMatt Macy uint8_t val; 1839eda14cbcSMatt Macy int ll; 1840eda14cbcSMatt Macy uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 1841eda14cbcSMatt Macy uint8_t *p, *pp; 1842eda14cbcSMatt Macy size_t psize; 1843eda14cbcSMatt Macy 1844eda14cbcSMatt Macy psize = sizeof (invlog[0][0]) * n * nmissing; 1845eda14cbcSMatt Macy p = kmem_alloc(psize, KM_SLEEP); 1846eda14cbcSMatt Macy 1847eda14cbcSMatt Macy for (pp = p, i = 0; i < nmissing; i++) { 1848eda14cbcSMatt Macy invlog[i] = pp; 1849eda14cbcSMatt Macy pp += n; 1850eda14cbcSMatt Macy } 1851eda14cbcSMatt Macy 1852eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) { 1853eda14cbcSMatt Macy for (j = 0; j < n; j++) { 1854eda14cbcSMatt Macy ASSERT3U(invrows[i][j], !=, 0); 1855eda14cbcSMatt Macy invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 1856eda14cbcSMatt Macy } 1857eda14cbcSMatt Macy } 1858eda14cbcSMatt Macy 1859eda14cbcSMatt Macy for (i = 0; i < n; i++) { 1860eda14cbcSMatt Macy c = used[i]; 18617877fdebSMatt Macy ASSERT3U(c, <, rr->rr_cols); 1862eda14cbcSMatt Macy 18637877fdebSMatt Macy ccount = rr->rr_col[c].rc_size; 18647877fdebSMatt Macy ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0); 18657877fdebSMatt Macy if (ccount == 0) 18667877fdebSMatt Macy continue; 18677877fdebSMatt Macy src = abd_to_buf(rr->rr_col[c].rc_abd); 1868eda14cbcSMatt Macy for (j = 0; j < nmissing; j++) { 18697877fdebSMatt Macy cc = missing[j] + 
rr->rr_firstdatacol; 18707877fdebSMatt Macy ASSERT3U(cc, >=, rr->rr_firstdatacol); 18717877fdebSMatt Macy ASSERT3U(cc, <, rr->rr_cols); 1872eda14cbcSMatt Macy ASSERT3U(cc, !=, c); 1873eda14cbcSMatt Macy 18747877fdebSMatt Macy dcount[j] = rr->rr_col[cc].rc_size; 18757877fdebSMatt Macy if (dcount[j] != 0) 18767877fdebSMatt Macy dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd); 1877eda14cbcSMatt Macy } 1878eda14cbcSMatt Macy 1879eda14cbcSMatt Macy for (x = 0; x < ccount; x++, src++) { 1880eda14cbcSMatt Macy if (*src != 0) 1881eda14cbcSMatt Macy log = vdev_raidz_log2[*src]; 1882eda14cbcSMatt Macy 1883eda14cbcSMatt Macy for (cc = 0; cc < nmissing; cc++) { 1884eda14cbcSMatt Macy if (x >= dcount[cc]) 1885eda14cbcSMatt Macy continue; 1886eda14cbcSMatt Macy 1887eda14cbcSMatt Macy if (*src == 0) { 1888eda14cbcSMatt Macy val = 0; 1889eda14cbcSMatt Macy } else { 1890eda14cbcSMatt Macy if ((ll = log + invlog[cc][i]) >= 255) 1891eda14cbcSMatt Macy ll -= 255; 1892eda14cbcSMatt Macy val = vdev_raidz_pow2[ll]; 1893eda14cbcSMatt Macy } 1894eda14cbcSMatt Macy 1895eda14cbcSMatt Macy if (i == 0) 1896eda14cbcSMatt Macy dst[cc][x] = val; 1897eda14cbcSMatt Macy else 1898eda14cbcSMatt Macy dst[cc][x] ^= val; 1899eda14cbcSMatt Macy } 1900eda14cbcSMatt Macy } 1901eda14cbcSMatt Macy } 1902eda14cbcSMatt Macy 1903eda14cbcSMatt Macy kmem_free(p, psize); 1904eda14cbcSMatt Macy } 1905eda14cbcSMatt Macy 1906f9693befSMartin Matuska static void 19077877fdebSMatt Macy vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) 1908eda14cbcSMatt Macy { 1909b985c9caSMartin Matuska int i, c, t, tt; 1910b985c9caSMartin Matuska unsigned int n; 1911b985c9caSMartin Matuska unsigned int nmissing_rows; 1912eda14cbcSMatt Macy int missing_rows[VDEV_RAIDZ_MAXPARITY]; 1913eda14cbcSMatt Macy int parity_map[VDEV_RAIDZ_MAXPARITY]; 1914eda14cbcSMatt Macy uint8_t *p, *pp; 1915eda14cbcSMatt Macy size_t psize; 1916eda14cbcSMatt Macy uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 1917eda14cbcSMatt Macy uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 1918eda14cbcSMatt Macy uint8_t *used; 1919eda14cbcSMatt Macy 1920eda14cbcSMatt Macy abd_t **bufs = NULL; 1921eda14cbcSMatt Macy 1922e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1923e716630dSMartin Matuska zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts); 1924eda14cbcSMatt Macy /* 1925eda14cbcSMatt Macy * Matrix reconstruction can't use scatter ABDs yet, so we allocate 19267877fdebSMatt Macy * temporary linear ABDs if any non-linear ABDs are found. 
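 * Only the data columns are swapped for temporaries here; they are
 * copied back into the original ABDs, and the temporaries freed, at
 * the bottom of this function.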
1927eda14cbcSMatt Macy */ 19287877fdebSMatt Macy for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) { 1929e716630dSMartin Matuska ASSERT(rr->rr_col[i].rc_abd != NULL); 19307877fdebSMatt Macy if (!abd_is_linear(rr->rr_col[i].rc_abd)) { 19317877fdebSMatt Macy bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *), 19327877fdebSMatt Macy KM_PUSHPAGE); 1933eda14cbcSMatt Macy 19347877fdebSMatt Macy for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 19357877fdebSMatt Macy raidz_col_t *col = &rr->rr_col[c]; 1936eda14cbcSMatt Macy 1937eda14cbcSMatt Macy bufs[c] = col->rc_abd; 19387877fdebSMatt Macy if (bufs[c] != NULL) { 19397877fdebSMatt Macy col->rc_abd = abd_alloc_linear( 19407877fdebSMatt Macy col->rc_size, B_TRUE); 19417877fdebSMatt Macy abd_copy(col->rc_abd, bufs[c], 19427877fdebSMatt Macy col->rc_size); 1943eda14cbcSMatt Macy } 1944eda14cbcSMatt Macy } 1945eda14cbcSMatt Macy 19467877fdebSMatt Macy break; 19477877fdebSMatt Macy } 19487877fdebSMatt Macy } 19497877fdebSMatt Macy 19507877fdebSMatt Macy n = rr->rr_cols - rr->rr_firstdatacol; 1951eda14cbcSMatt Macy 1952eda14cbcSMatt Macy /* 1953eda14cbcSMatt Macy * Figure out which data columns are missing. 1954eda14cbcSMatt Macy */ 1955eda14cbcSMatt Macy nmissing_rows = 0; 1956eda14cbcSMatt Macy for (t = 0; t < ntgts; t++) { 19577877fdebSMatt Macy if (tgts[t] >= rr->rr_firstdatacol) { 1958eda14cbcSMatt Macy missing_rows[nmissing_rows++] = 19597877fdebSMatt Macy tgts[t] - rr->rr_firstdatacol; 1960eda14cbcSMatt Macy } 1961eda14cbcSMatt Macy } 1962eda14cbcSMatt Macy 1963eda14cbcSMatt Macy /* 1964eda14cbcSMatt Macy * Figure out which parity columns to use to help generate the missing 1965eda14cbcSMatt Macy * data columns. 1966eda14cbcSMatt Macy */ 1967eda14cbcSMatt Macy for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 1968eda14cbcSMatt Macy ASSERT(tt < ntgts); 19697877fdebSMatt Macy ASSERT(c < rr->rr_firstdatacol); 1970eda14cbcSMatt Macy 1971eda14cbcSMatt Macy /* 1972eda14cbcSMatt Macy * Skip any targeted parity columns. 1973eda14cbcSMatt Macy */ 1974eda14cbcSMatt Macy if (c == tgts[tt]) { 1975eda14cbcSMatt Macy tt++; 1976eda14cbcSMatt Macy continue; 1977eda14cbcSMatt Macy } 1978eda14cbcSMatt Macy 1979eda14cbcSMatt Macy parity_map[i] = c; 1980eda14cbcSMatt Macy i++; 1981eda14cbcSMatt Macy } 1982eda14cbcSMatt Macy 1983eda14cbcSMatt Macy psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1984eda14cbcSMatt Macy nmissing_rows * n + sizeof (used[0]) * n; 1985eda14cbcSMatt Macy p = kmem_alloc(psize, KM_SLEEP); 1986eda14cbcSMatt Macy 1987eda14cbcSMatt Macy for (pp = p, i = 0; i < nmissing_rows; i++) { 1988eda14cbcSMatt Macy rows[i] = pp; 1989eda14cbcSMatt Macy pp += n; 1990eda14cbcSMatt Macy invrows[i] = pp; 1991eda14cbcSMatt Macy pp += n; 1992eda14cbcSMatt Macy } 1993eda14cbcSMatt Macy used = pp; 1994eda14cbcSMatt Macy 1995eda14cbcSMatt Macy for (i = 0; i < nmissing_rows; i++) { 1996eda14cbcSMatt Macy used[i] = parity_map[i]; 1997eda14cbcSMatt Macy } 1998eda14cbcSMatt Macy 19997877fdebSMatt Macy for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 2000eda14cbcSMatt Macy if (tt < nmissing_rows && 20017877fdebSMatt Macy c == missing_rows[tt] + rr->rr_firstdatacol) { 2002eda14cbcSMatt Macy tt++; 2003eda14cbcSMatt Macy continue; 2004eda14cbcSMatt Macy } 2005eda14cbcSMatt Macy 2006eda14cbcSMatt Macy ASSERT3S(i, <, n); 2007eda14cbcSMatt Macy used[i] = c; 2008eda14cbcSMatt Macy i++; 2009eda14cbcSMatt Macy } 2010eda14cbcSMatt Macy 2011eda14cbcSMatt Macy /* 2012eda14cbcSMatt Macy * Initialize the interesting rows of the matrix. 
2013eda14cbcSMatt Macy */ 20147877fdebSMatt Macy vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows); 2015eda14cbcSMatt Macy 2016eda14cbcSMatt Macy /* 2017eda14cbcSMatt Macy * Invert the matrix. 2018eda14cbcSMatt Macy */ 20197877fdebSMatt Macy vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows, 2020eda14cbcSMatt Macy invrows, used); 2021eda14cbcSMatt Macy 2022eda14cbcSMatt Macy /* 2023eda14cbcSMatt Macy * Reconstruct the missing data using the generated matrix. 2024eda14cbcSMatt Macy */ 20257877fdebSMatt Macy vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows, 2026eda14cbcSMatt Macy invrows, used); 2027eda14cbcSMatt Macy 2028eda14cbcSMatt Macy kmem_free(p, psize); 2029eda14cbcSMatt Macy 2030eda14cbcSMatt Macy /* 2031eda14cbcSMatt Macy * copy back from temporary linear abds and free them 2032eda14cbcSMatt Macy */ 2033eda14cbcSMatt Macy if (bufs) { 20347877fdebSMatt Macy for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 20357877fdebSMatt Macy raidz_col_t *col = &rr->rr_col[c]; 2036eda14cbcSMatt Macy 20377877fdebSMatt Macy if (bufs[c] != NULL) { 2038eda14cbcSMatt Macy abd_copy(bufs[c], col->rc_abd, col->rc_size); 2039eda14cbcSMatt Macy abd_free(col->rc_abd); 20407877fdebSMatt Macy } 2041eda14cbcSMatt Macy col->rc_abd = bufs[c]; 2042eda14cbcSMatt Macy } 20437877fdebSMatt Macy kmem_free(bufs, rr->rr_cols * sizeof (abd_t *)); 2044eda14cbcSMatt Macy } 2045eda14cbcSMatt Macy } 2046eda14cbcSMatt Macy 2047f9693befSMartin Matuska static void 20487877fdebSMatt Macy vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr, 20497877fdebSMatt Macy const int *t, int nt) 2050eda14cbcSMatt Macy { 2051eda14cbcSMatt Macy int tgts[VDEV_RAIDZ_MAXPARITY], *dt; 2052eda14cbcSMatt Macy int ntgts; 2053eda14cbcSMatt Macy int i, c, ret; 2054eda14cbcSMatt Macy int nbadparity, nbaddata; 2055eda14cbcSMatt Macy int parity_valid[VDEV_RAIDZ_MAXPARITY]; 2056eda14cbcSMatt Macy 2057e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { 2058e716630dSMartin Matuska zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)", 2059e716630dSMartin Matuska rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata, 2060e716630dSMartin Matuska (int)rr->rr_missingparity); 2061e716630dSMartin Matuska } 2062e716630dSMartin Matuska 20637877fdebSMatt Macy nbadparity = rr->rr_firstdatacol; 20647877fdebSMatt Macy nbaddata = rr->rr_cols - nbadparity; 2065eda14cbcSMatt Macy ntgts = 0; 20667877fdebSMatt Macy for (i = 0, c = 0; c < rr->rr_cols; c++) { 2067e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { 2068e716630dSMartin Matuska zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u " 2069e716630dSMartin Matuska "offset=%llx error=%u)", 2070e716630dSMartin Matuska rr, c, (int)rr->rr_col[c].rc_devidx, 2071e716630dSMartin Matuska (long long)rr->rr_col[c].rc_offset, 2072e716630dSMartin Matuska (int)rr->rr_col[c].rc_error); 2073e716630dSMartin Matuska } 20747877fdebSMatt Macy if (c < rr->rr_firstdatacol) 2075eda14cbcSMatt Macy parity_valid[c] = B_FALSE; 2076eda14cbcSMatt Macy 2077eda14cbcSMatt Macy if (i < nt && c == t[i]) { 2078eda14cbcSMatt Macy tgts[ntgts++] = c; 2079eda14cbcSMatt Macy i++; 20807877fdebSMatt Macy } else if (rr->rr_col[c].rc_error != 0) { 2081eda14cbcSMatt Macy tgts[ntgts++] = c; 20827877fdebSMatt Macy } else if (c >= rr->rr_firstdatacol) { 2083eda14cbcSMatt Macy nbaddata--; 2084eda14cbcSMatt Macy } else { 2085eda14cbcSMatt Macy parity_valid[c] = B_TRUE; 2086eda14cbcSMatt Macy nbadparity--; 2087eda14cbcSMatt Macy } 2088eda14cbcSMatt Macy } 2089eda14cbcSMatt Macy 
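	/*
	 * tgts[] now holds the requested targets plus any other columns
	 * that reported errors, with bad parity columns first; dt skips
	 * past those parity entries so the reconstruction routines below
	 * see only the data targets.
	 */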
2090eda14cbcSMatt Macy ASSERT(ntgts >= nt); 2091eda14cbcSMatt Macy ASSERT(nbaddata >= 0); 2092eda14cbcSMatt Macy ASSERT(nbaddata + nbadparity == ntgts); 2093eda14cbcSMatt Macy 2094eda14cbcSMatt Macy dt = &tgts[nbadparity]; 2095eda14cbcSMatt Macy 2096eda14cbcSMatt Macy /* Reconstruct using the new math implementation */ 20977877fdebSMatt Macy ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata); 2098eda14cbcSMatt Macy if (ret != RAIDZ_ORIGINAL_IMPL) 2099f9693befSMartin Matuska return; 2100eda14cbcSMatt Macy 2101eda14cbcSMatt Macy /* 2102eda14cbcSMatt Macy * See if we can use any of our optimized reconstruction routines. 2103eda14cbcSMatt Macy */ 2104eda14cbcSMatt Macy switch (nbaddata) { 2105eda14cbcSMatt Macy case 1: 2106f9693befSMartin Matuska if (parity_valid[VDEV_RAIDZ_P]) { 2107f9693befSMartin Matuska vdev_raidz_reconstruct_p(rr, dt, 1); 2108f9693befSMartin Matuska return; 2109f9693befSMartin Matuska } 2110eda14cbcSMatt Macy 21117877fdebSMatt Macy ASSERT(rr->rr_firstdatacol > 1); 2112eda14cbcSMatt Macy 2113f9693befSMartin Matuska if (parity_valid[VDEV_RAIDZ_Q]) { 2114f9693befSMartin Matuska vdev_raidz_reconstruct_q(rr, dt, 1); 2115f9693befSMartin Matuska return; 2116f9693befSMartin Matuska } 2117eda14cbcSMatt Macy 21187877fdebSMatt Macy ASSERT(rr->rr_firstdatacol > 2); 2119eda14cbcSMatt Macy break; 2120eda14cbcSMatt Macy 2121eda14cbcSMatt Macy case 2: 21227877fdebSMatt Macy ASSERT(rr->rr_firstdatacol > 1); 2123eda14cbcSMatt Macy 2124eda14cbcSMatt Macy if (parity_valid[VDEV_RAIDZ_P] && 2125f9693befSMartin Matuska parity_valid[VDEV_RAIDZ_Q]) { 2126f9693befSMartin Matuska vdev_raidz_reconstruct_pq(rr, dt, 2); 2127f9693befSMartin Matuska return; 2128f9693befSMartin Matuska } 2129eda14cbcSMatt Macy 21307877fdebSMatt Macy ASSERT(rr->rr_firstdatacol > 2); 2131eda14cbcSMatt Macy 2132eda14cbcSMatt Macy break; 2133eda14cbcSMatt Macy } 2134eda14cbcSMatt Macy 2135f9693befSMartin Matuska vdev_raidz_reconstruct_general(rr, tgts, ntgts); 2136eda14cbcSMatt Macy } 2137eda14cbcSMatt Macy 2138eda14cbcSMatt Macy static int 2139eda14cbcSMatt Macy vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, 2140eda14cbcSMatt Macy uint64_t *logical_ashift, uint64_t *physical_ashift) 2141eda14cbcSMatt Macy { 21427877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd; 21437877fdebSMatt Macy uint64_t nparity = vdrz->vd_nparity; 2144eda14cbcSMatt Macy int c; 2145eda14cbcSMatt Macy int lasterror = 0; 2146eda14cbcSMatt Macy int numerrors = 0; 2147eda14cbcSMatt Macy 2148eda14cbcSMatt Macy ASSERT(nparity > 0); 2149eda14cbcSMatt Macy 2150eda14cbcSMatt Macy if (nparity > VDEV_RAIDZ_MAXPARITY || 2151eda14cbcSMatt Macy vd->vdev_children < nparity + 1) { 2152eda14cbcSMatt Macy vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 2153eda14cbcSMatt Macy return (SET_ERROR(EINVAL)); 2154eda14cbcSMatt Macy } 2155eda14cbcSMatt Macy 2156eda14cbcSMatt Macy vdev_open_children(vd); 2157eda14cbcSMatt Macy 2158eda14cbcSMatt Macy for (c = 0; c < vd->vdev_children; c++) { 21597877fdebSMatt Macy vdev_t *cvd = vd->vdev_child[c]; 2160eda14cbcSMatt Macy 2161eda14cbcSMatt Macy if (cvd->vdev_open_error != 0) { 2162eda14cbcSMatt Macy lasterror = cvd->vdev_open_error; 2163eda14cbcSMatt Macy numerrors++; 2164eda14cbcSMatt Macy continue; 2165eda14cbcSMatt Macy } 2166eda14cbcSMatt Macy 2167eda14cbcSMatt Macy *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 2168eda14cbcSMatt Macy *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; 2169eda14cbcSMatt Macy *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); 
2170c7046f76SMartin Matuska } 2171c7046f76SMartin Matuska for (c = 0; c < vd->vdev_children; c++) { 2172c7046f76SMartin Matuska vdev_t *cvd = vd->vdev_child[c]; 2173c7046f76SMartin Matuska 2174c7046f76SMartin Matuska if (cvd->vdev_open_error != 0) 2175c7046f76SMartin Matuska continue; 2176c7046f76SMartin Matuska *physical_ashift = vdev_best_ashift(*logical_ashift, 2177c7046f76SMartin Matuska *physical_ashift, cvd->vdev_physical_ashift); 2178eda14cbcSMatt Macy } 2179eda14cbcSMatt Macy 2180e716630dSMartin Matuska if (vd->vdev_rz_expanding) { 2181e716630dSMartin Matuska *asize *= vd->vdev_children - 1; 2182e716630dSMartin Matuska *max_asize *= vd->vdev_children - 1; 2183e716630dSMartin Matuska 2184e716630dSMartin Matuska vd->vdev_min_asize = *asize; 2185e716630dSMartin Matuska } else { 2186eda14cbcSMatt Macy *asize *= vd->vdev_children; 2187eda14cbcSMatt Macy *max_asize *= vd->vdev_children; 2188e716630dSMartin Matuska } 2189eda14cbcSMatt Macy 2190eda14cbcSMatt Macy if (numerrors > nparity) { 2191eda14cbcSMatt Macy vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 2192eda14cbcSMatt Macy return (lasterror); 2193eda14cbcSMatt Macy } 2194eda14cbcSMatt Macy 2195eda14cbcSMatt Macy return (0); 2196eda14cbcSMatt Macy } 2197eda14cbcSMatt Macy 2198eda14cbcSMatt Macy static void 2199eda14cbcSMatt Macy vdev_raidz_close(vdev_t *vd) 2200eda14cbcSMatt Macy { 22017877fdebSMatt Macy for (int c = 0; c < vd->vdev_children; c++) { 22027877fdebSMatt Macy if (vd->vdev_child[c] != NULL) 2203eda14cbcSMatt Macy vdev_close(vd->vdev_child[c]); 2204eda14cbcSMatt Macy } 22057877fdebSMatt Macy } 2206eda14cbcSMatt Macy 2207e716630dSMartin Matuska /* 2208e716630dSMartin Matuska * Return the logical width to use, given the txg in which the allocation 2209783d3ff6SMartin Matuska * happened. Note that BP_GET_BIRTH() is usually the txg in which the 2210e716630dSMartin Matuska * BP was allocated. Remapped BP's (that were relocated due to device 2211783d3ff6SMartin Matuska * removal, see remap_blkptr_cb()), will have a more recent physical birth 2212783d3ff6SMartin Matuska * which reflects when the BP was relocated, but we can ignore these because 2213783d3ff6SMartin Matuska * they can't be on RAIDZ (device removal doesn't support RAIDZ). 
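 * As a purely illustrative example (txgs and widths invented): if an
 * expansion completed in txg 100 left an entry with a logical width of
 * 5, a block born in txg 150 maps to width 5 via the AVL lookup below,
 * while a block born in txg 90, before any recorded expansion, falls
 * back to the original width.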
2214e716630dSMartin Matuska  */
2215eda14cbcSMatt Macy static uint64_t
2216e716630dSMartin Matuska vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg)
2217e716630dSMartin Matuska {
2218e716630dSMartin Matuska 	reflow_node_t lookup = {
2219e716630dSMartin Matuska 		.re_txg = txg,
2220e716630dSMartin Matuska 	};
2221e716630dSMartin Matuska 	avl_index_t where;
2222e716630dSMartin Matuska 
2223e716630dSMartin Matuska 	uint64_t width;
2224e716630dSMartin Matuska 	mutex_enter(&vdrz->vd_expand_lock);
2225e716630dSMartin Matuska 	reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where);
2226e716630dSMartin Matuska 	if (re != NULL) {
2227e716630dSMartin Matuska 		width = re->re_logical_width;
2228e716630dSMartin Matuska 	} else {
2229e716630dSMartin Matuska 		re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE);
2230e716630dSMartin Matuska 		if (re != NULL)
2231e716630dSMartin Matuska 			width = re->re_logical_width;
2232e716630dSMartin Matuska 		else
2233e716630dSMartin Matuska 			width = vdrz->vd_original_width;
2234e716630dSMartin Matuska 	}
2235e716630dSMartin Matuska 	mutex_exit(&vdrz->vd_expand_lock);
2236e716630dSMartin Matuska 	return (width);
2237e716630dSMartin Matuska }
2238071ab5a1SMartin Matuska /*
2239071ab5a1SMartin Matuska  * This code converts an asize into the largest psize that can safely be written
2240071ab5a1SMartin Matuska  * to an allocation of that size for this vdev.
2241071ab5a1SMartin Matuska  *
2242071ab5a1SMartin Matuska  * Note that this function will not take into account the effect of gang
2243071ab5a1SMartin Matuska  * headers, which also modify the ASIZE of the DVAs. It is purely a reverse of
2244071ab5a1SMartin Matuska  * the psize_to_asize function.
2245071ab5a1SMartin Matuska  */
2246071ab5a1SMartin Matuska static uint64_t
2247071ab5a1SMartin Matuska vdev_raidz_asize_to_psize(vdev_t *vd, uint64_t asize, uint64_t txg)
2248071ab5a1SMartin Matuska {
2249071ab5a1SMartin Matuska 	vdev_raidz_t *vdrz = vd->vdev_tsd;
2250071ab5a1SMartin Matuska 	uint64_t psize;
2251071ab5a1SMartin Matuska 	uint64_t ashift = vd->vdev_top->vdev_ashift;
2252071ab5a1SMartin Matuska 	uint64_t cols = vdrz->vd_original_width;
2253071ab5a1SMartin Matuska 	uint64_t nparity = vdrz->vd_nparity;
2254071ab5a1SMartin Matuska 
2255071ab5a1SMartin Matuska 	cols = vdev_raidz_get_logical_width(vdrz, txg);
2256071ab5a1SMartin Matuska 
2257071ab5a1SMartin Matuska 	ASSERT0(asize % (1 << ashift));
2258071ab5a1SMartin Matuska 
2259071ab5a1SMartin Matuska 	psize = (asize >> ashift);
2260071ab5a1SMartin Matuska 	psize -= nparity * DIV_ROUND_UP(psize, cols);
2261071ab5a1SMartin Matuska 	psize <<= ashift;
2262071ab5a1SMartin Matuska 
2263071ab5a1SMartin Matuska 	return (psize);
2264071ab5a1SMartin Matuska }
2265e716630dSMartin Matuska 
2266e716630dSMartin Matuska /*
2267e716630dSMartin Matuska  * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated
2268e716630dSMartin Matuska  * more space due to the lower data-to-parity ratio. In this case it's
2269e716630dSMartin Matuska  * important to pass in the correct txg. Note that vdev_gang_header_asize()
2270e716630dSMartin Matuska  * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE,
2271e716630dSMartin Matuska  * regardless of txg. This is assured because for a single data sector, we
2272e716630dSMartin Matuska  * allocate P+1 sectors regardless of width ("cols", which is at least P+1).
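 *
 * As a worked example of the formula below (illustrative numbers): for a
 * 5-wide RAIDZ2 (cols = 5, nparity = 2), a psize of 6 sectors needs
 * 6 + 2 * ceil(6 / (5 - 2)) = 10 sectors, which is then rounded up to a
 * multiple of nparity + 1 = 3, giving an asize of 12 sectors.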
2273e716630dSMartin Matuska */ 2274e716630dSMartin Matuska static uint64_t 2275071ab5a1SMartin Matuska vdev_raidz_psize_to_asize(vdev_t *vd, uint64_t psize, uint64_t txg) 2276eda14cbcSMatt Macy { 22777877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd; 2278eda14cbcSMatt Macy uint64_t asize; 2279eda14cbcSMatt Macy uint64_t ashift = vd->vdev_top->vdev_ashift; 2280e716630dSMartin Matuska uint64_t cols = vdrz->vd_original_width; 22817877fdebSMatt Macy uint64_t nparity = vdrz->vd_nparity; 2282eda14cbcSMatt Macy 2283e716630dSMartin Matuska cols = vdev_raidz_get_logical_width(vdrz, txg); 2284e716630dSMartin Matuska 2285eda14cbcSMatt Macy asize = ((psize - 1) >> ashift) + 1; 2286eda14cbcSMatt Macy asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 2287eda14cbcSMatt Macy asize = roundup(asize, nparity + 1) << ashift; 2288eda14cbcSMatt Macy 2289e716630dSMartin Matuska #ifdef ZFS_DEBUG 2290e716630dSMartin Matuska uint64_t asize_new = ((psize - 1) >> ashift) + 1; 2291e716630dSMartin Matuska uint64_t ncols_new = vdrz->vd_physical_width; 2292e716630dSMartin Matuska asize_new += nparity * ((asize_new + ncols_new - nparity - 1) / 2293e716630dSMartin Matuska (ncols_new - nparity)); 2294e716630dSMartin Matuska asize_new = roundup(asize_new, nparity + 1) << ashift; 2295e716630dSMartin Matuska VERIFY3U(asize_new, <=, asize); 2296e716630dSMartin Matuska #endif 2297e716630dSMartin Matuska 2298eda14cbcSMatt Macy return (asize); 2299eda14cbcSMatt Macy } 2300eda14cbcSMatt Macy 23017877fdebSMatt Macy /* 23027877fdebSMatt Macy * The allocatable space for a raidz vdev is N * sizeof(smallest child) 23037877fdebSMatt Macy * so each child must provide at least 1/Nth of its asize. 23047877fdebSMatt Macy */ 23057877fdebSMatt Macy static uint64_t 23067877fdebSMatt Macy vdev_raidz_min_asize(vdev_t *vd) 23077877fdebSMatt Macy { 23087877fdebSMatt Macy return ((vd->vdev_min_asize + vd->vdev_children - 1) / 23097877fdebSMatt Macy vd->vdev_children); 23107877fdebSMatt Macy } 23117877fdebSMatt Macy 23127877fdebSMatt Macy void 2313eda14cbcSMatt Macy vdev_raidz_child_done(zio_t *zio) 2314eda14cbcSMatt Macy { 2315eda14cbcSMatt Macy raidz_col_t *rc = zio->io_private; 2316eda14cbcSMatt Macy 231781b22a98SMartin Matuska ASSERT3P(rc->rc_abd, !=, NULL); 2318eda14cbcSMatt Macy rc->rc_error = zio->io_error; 2319eda14cbcSMatt Macy rc->rc_tried = 1; 2320eda14cbcSMatt Macy rc->rc_skipped = 0; 2321eda14cbcSMatt Macy } 2322eda14cbcSMatt Macy 2323eda14cbcSMatt Macy static void 2324e716630dSMartin Matuska vdev_raidz_shadow_child_done(zio_t *zio) 2325eda14cbcSMatt Macy { 2326e716630dSMartin Matuska raidz_col_t *rc = zio->io_private; 2327eda14cbcSMatt Macy 2328e716630dSMartin Matuska rc->rc_shadow_error = zio->io_error; 2329e716630dSMartin Matuska } 2330e716630dSMartin Matuska 2331e716630dSMartin Matuska static void 2332e716630dSMartin Matuska vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col) 2333e716630dSMartin Matuska { 2334e716630dSMartin Matuska (void) rm; 2335e716630dSMartin Matuska #ifdef ZFS_DEBUG 2336b59a0cdeSMartin Matuska zfs_range_seg64_t logical_rs, physical_rs, remain_rs; 23377877fdebSMatt Macy logical_rs.rs_start = rr->rr_offset; 2338eda14cbcSMatt Macy logical_rs.rs_end = logical_rs.rs_start + 2339071ab5a1SMartin Matuska vdev_raidz_psize_to_asize(zio->io_vd, rr->rr_size, 2340783d3ff6SMartin Matuska BP_GET_BIRTH(zio->io_bp)); 2341eda14cbcSMatt Macy 23427877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[col]; 2343e716630dSMartin Matuska vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; 
2344eda14cbcSMatt Macy 23457877fdebSMatt Macy vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs); 23467877fdebSMatt Macy ASSERT(vdev_xlate_is_empty(&remain_rs)); 2347e716630dSMartin Matuska if (vdev_xlate_is_empty(&physical_rs)) { 2348e716630dSMartin Matuska /* 2349e716630dSMartin Matuska * If we are in the middle of expansion, the 2350e716630dSMartin Matuska * physical->logical mapping is changing so vdev_xlate() 2351e716630dSMartin Matuska * can't give us a reliable answer. 2352e716630dSMartin Matuska */ 2353e716630dSMartin Matuska return; 2354e716630dSMartin Matuska } 2355eda14cbcSMatt Macy ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); 2356eda14cbcSMatt Macy ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); 2357eda14cbcSMatt Macy /* 2358eda14cbcSMatt Macy * It would be nice to assert that rs_end is equal 2359eda14cbcSMatt Macy * to rc_offset + rc_size but there might be an 2360eda14cbcSMatt Macy * optional I/O at the end that is not accounted in 2361eda14cbcSMatt Macy * rc_size. 2362eda14cbcSMatt Macy */ 2363eda14cbcSMatt Macy if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { 2364eda14cbcSMatt Macy ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + 2365e716630dSMartin Matuska rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift)); 2366eda14cbcSMatt Macy } else { 2367eda14cbcSMatt Macy ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); 2368eda14cbcSMatt Macy } 2369eda14cbcSMatt Macy #endif 2370eda14cbcSMatt Macy } 2371eda14cbcSMatt Macy 23727877fdebSMatt Macy static void 2373e716630dSMartin Matuska vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr) 23747877fdebSMatt Macy { 23757877fdebSMatt Macy vdev_t *vd = zio->io_vd; 23767877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd; 23777877fdebSMatt Macy 23787877fdebSMatt Macy vdev_raidz_generate_parity_row(rm, rr); 23797877fdebSMatt Macy 238081b22a98SMartin Matuska for (int c = 0; c < rr->rr_scols; c++) { 23817877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 238281b22a98SMartin Matuska vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 23837877fdebSMatt Macy 23847877fdebSMatt Macy /* Verify physical to logical translation */ 2385e716630dSMartin Matuska vdev_raidz_io_verify(zio, rm, rr, c); 23867877fdebSMatt Macy 2387e716630dSMartin Matuska if (rc->rc_size == 0) 2388e716630dSMartin Matuska continue; 2389e716630dSMartin Matuska 2390e716630dSMartin Matuska ASSERT3U(rc->rc_offset + rc->rc_size, <, 2391e716630dSMartin Matuska cvd->vdev_psize - VDEV_LABEL_END_SIZE); 2392e716630dSMartin Matuska 239381b22a98SMartin Matuska ASSERT3P(rc->rc_abd, !=, NULL); 23947877fdebSMatt Macy zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 239581b22a98SMartin Matuska rc->rc_offset, rc->rc_abd, 239681b22a98SMartin Matuska abd_get_size(rc->rc_abd), zio->io_type, 239781b22a98SMartin Matuska zio->io_priority, 0, vdev_raidz_child_done, rc)); 2398e716630dSMartin Matuska 2399e716630dSMartin Matuska if (rc->rc_shadow_devidx != INT_MAX) { 2400e716630dSMartin Matuska vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx]; 2401e716630dSMartin Matuska 2402e716630dSMartin Matuska ASSERT3U( 2403e716630dSMartin Matuska rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <, 2404e716630dSMartin Matuska cvd2->vdev_psize - VDEV_LABEL_END_SIZE); 2405e716630dSMartin Matuska 2406e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(zio, NULL, cvd2, 2407e716630dSMartin Matuska rc->rc_shadow_offset, rc->rc_abd, 2408e716630dSMartin Matuska abd_get_size(rc->rc_abd), 2409e716630dSMartin Matuska zio->io_type, zio->io_priority, 0, 2410e716630dSMartin Matuska 
vdev_raidz_shadow_child_done, rc)); 241181b22a98SMartin Matuska } 24127877fdebSMatt Macy } 24137877fdebSMatt Macy } 24147877fdebSMatt Macy 2415e716630dSMartin Matuska /* 2416e716630dSMartin Matuska * Generate optional I/Os for skip sectors to improve aggregation contiguity. 2417e716630dSMartin Matuska * This only works for vdev_raidz_map_alloc() (not _expanded()). 2418e716630dSMartin Matuska */ 24197877fdebSMatt Macy static void 2420e716630dSMartin Matuska raidz_start_skip_writes(zio_t *zio) 2421e716630dSMartin Matuska { 2422e716630dSMartin Matuska vdev_t *vd = zio->io_vd; 2423e716630dSMartin Matuska uint64_t ashift = vd->vdev_top->vdev_ashift; 2424e716630dSMartin Matuska raidz_map_t *rm = zio->io_vsd; 2425e716630dSMartin Matuska ASSERT3U(rm->rm_nrows, ==, 1); 2426e716630dSMartin Matuska raidz_row_t *rr = rm->rm_row[0]; 2427e716630dSMartin Matuska for (int c = 0; c < rr->rr_scols; c++) { 2428e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c]; 2429e716630dSMartin Matuska vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 2430e716630dSMartin Matuska if (rc->rc_size != 0) 2431e716630dSMartin Matuska continue; 2432e716630dSMartin Matuska ASSERT3P(rc->rc_abd, ==, NULL); 2433e716630dSMartin Matuska 2434e716630dSMartin Matuska ASSERT3U(rc->rc_offset, <, 2435e716630dSMartin Matuska cvd->vdev_psize - VDEV_LABEL_END_SIZE); 2436e716630dSMartin Matuska 2437e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, 2438e716630dSMartin Matuska NULL, 1ULL << ashift, zio->io_type, zio->io_priority, 2439e716630dSMartin Matuska ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); 2440e716630dSMartin Matuska } 2441e716630dSMartin Matuska } 2442e716630dSMartin Matuska 2443e716630dSMartin Matuska static void 2444e716630dSMartin Matuska vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) 24457877fdebSMatt Macy { 24467877fdebSMatt Macy vdev_t *vd = zio->io_vd; 24477877fdebSMatt Macy 24487877fdebSMatt Macy /* 24497877fdebSMatt Macy * Iterate over the columns in reverse order so that we hit the parity 24507877fdebSMatt Macy * last -- any errors along the way will force us to read the parity. 
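 *
 * For example (hypothetical RAIDZ2 row with columns P, Q, D0..D2): the
 * loop below issues reads for D2, D1 and D0 first; if any of them is
 * known to be unreadable or missing, rr_missingdata is bumped and the
 * subsequent parity columns Q and P are read as well.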
24517877fdebSMatt Macy */ 24527877fdebSMatt Macy for (int c = rr->rr_cols - 1; c >= 0; c--) { 24537877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 24547877fdebSMatt Macy if (rc->rc_size == 0) 24557877fdebSMatt Macy continue; 24567877fdebSMatt Macy vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 24577877fdebSMatt Macy if (!vdev_readable(cvd)) { 24587877fdebSMatt Macy if (c >= rr->rr_firstdatacol) 24597877fdebSMatt Macy rr->rr_missingdata++; 24607877fdebSMatt Macy else 24617877fdebSMatt Macy rr->rr_missingparity++; 24627877fdebSMatt Macy rc->rc_error = SET_ERROR(ENXIO); 24637877fdebSMatt Macy rc->rc_tried = 1; /* don't even try */ 24647877fdebSMatt Macy rc->rc_skipped = 1; 24657877fdebSMatt Macy continue; 24667877fdebSMatt Macy } 24677877fdebSMatt Macy if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 24687877fdebSMatt Macy if (c >= rr->rr_firstdatacol) 24697877fdebSMatt Macy rr->rr_missingdata++; 24707877fdebSMatt Macy else 24717877fdebSMatt Macy rr->rr_missingparity++; 24727877fdebSMatt Macy rc->rc_error = SET_ERROR(ESTALE); 24737877fdebSMatt Macy rc->rc_skipped = 1; 24747877fdebSMatt Macy continue; 24757877fdebSMatt Macy } 2476e716630dSMartin Matuska if (forceparity || 2477e716630dSMartin Matuska c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || 24787877fdebSMatt Macy (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { 24797877fdebSMatt Macy zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 24807877fdebSMatt Macy rc->rc_offset, rc->rc_abd, rc->rc_size, 24817877fdebSMatt Macy zio->io_type, zio->io_priority, 0, 24827877fdebSMatt Macy vdev_raidz_child_done, rc)); 24837877fdebSMatt Macy } 24847877fdebSMatt Macy } 24857877fdebSMatt Macy } 24867877fdebSMatt Macy 2487e716630dSMartin Matuska static void 2488e716630dSMartin Matuska vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm) 2489e716630dSMartin Matuska { 2490e716630dSMartin Matuska vdev_t *vd = zio->io_vd; 2491e716630dSMartin Matuska 2492e716630dSMartin Matuska for (int i = 0; i < rm->rm_nphys_cols; i++) { 2493e716630dSMartin Matuska raidz_col_t *prc = &rm->rm_phys_col[i]; 2494e716630dSMartin Matuska if (prc->rc_size == 0) 2495e716630dSMartin Matuska continue; 2496e716630dSMartin Matuska 2497e716630dSMartin Matuska ASSERT3U(prc->rc_devidx, ==, i); 2498e716630dSMartin Matuska vdev_t *cvd = vd->vdev_child[i]; 2499e716630dSMartin Matuska if (!vdev_readable(cvd)) { 2500e716630dSMartin Matuska prc->rc_error = SET_ERROR(ENXIO); 2501e716630dSMartin Matuska prc->rc_tried = 1; /* don't even try */ 2502e716630dSMartin Matuska prc->rc_skipped = 1; 2503e716630dSMartin Matuska continue; 2504e716630dSMartin Matuska } 2505e716630dSMartin Matuska if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 2506e716630dSMartin Matuska prc->rc_error = SET_ERROR(ESTALE); 2507e716630dSMartin Matuska prc->rc_skipped = 1; 2508e716630dSMartin Matuska continue; 2509e716630dSMartin Matuska } 2510e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2511e716630dSMartin Matuska prc->rc_offset, prc->rc_abd, prc->rc_size, 2512e716630dSMartin Matuska zio->io_type, zio->io_priority, 0, 2513e716630dSMartin Matuska vdev_raidz_child_done, prc)); 2514e716630dSMartin Matuska } 2515e716630dSMartin Matuska } 2516e716630dSMartin Matuska 2517e716630dSMartin Matuska static void 2518e716630dSMartin Matuska vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm) 2519e716630dSMartin Matuska { 2520e716630dSMartin Matuska /* 2521e716630dSMartin Matuska * If there are multiple rows, we will be hitting 2522e716630dSMartin Matuska * all disks, so go 
ahead and read the parity so 2523e716630dSMartin Matuska * that we are reading in decent size chunks. 2524e716630dSMartin Matuska */ 2525e716630dSMartin Matuska boolean_t forceparity = rm->rm_nrows > 1; 2526e716630dSMartin Matuska 2527e716630dSMartin Matuska if (rm->rm_phys_col) { 2528e716630dSMartin Matuska vdev_raidz_io_start_read_phys_cols(zio, rm); 2529e716630dSMartin Matuska } else { 2530e716630dSMartin Matuska for (int i = 0; i < rm->rm_nrows; i++) { 2531e716630dSMartin Matuska raidz_row_t *rr = rm->rm_row[i]; 2532e716630dSMartin Matuska vdev_raidz_io_start_read_row(zio, rr, forceparity); 2533e716630dSMartin Matuska } 2534e716630dSMartin Matuska } 2535e716630dSMartin Matuska } 2536e716630dSMartin Matuska 2537eda14cbcSMatt Macy /* 2538eda14cbcSMatt Macy * Start an IO operation on a RAIDZ VDev 2539eda14cbcSMatt Macy * 2540eda14cbcSMatt Macy * Outline: 2541eda14cbcSMatt Macy * - For write operations: 2542eda14cbcSMatt Macy * 1. Generate the parity data 2543eda14cbcSMatt Macy * 2. Create child zio write operations to each column's vdev, for both 2544eda14cbcSMatt Macy * data and parity. 2545eda14cbcSMatt Macy * 3. If the column skips any sectors for padding, create optional dummy 2546eda14cbcSMatt Macy * write zio children for those areas to improve aggregation continuity. 2547eda14cbcSMatt Macy * - For read operations: 2548eda14cbcSMatt Macy * 1. Create child zio read operations to each data column's vdev to read 2549eda14cbcSMatt Macy * the range of data required for zio. 2550eda14cbcSMatt Macy * 2. If this is a scrub or resilver operation, or if any of the data 2551eda14cbcSMatt Macy * vdevs have had errors, then create zio read operations to the parity 2552eda14cbcSMatt Macy * columns' VDevs as well. 2553eda14cbcSMatt Macy */ 2554eda14cbcSMatt Macy static void 2555eda14cbcSMatt Macy vdev_raidz_io_start(zio_t *zio) 2556eda14cbcSMatt Macy { 2557eda14cbcSMatt Macy vdev_t *vd = zio->io_vd; 2558eda14cbcSMatt Macy vdev_t *tvd = vd->vdev_top; 25597877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd; 2560e716630dSMartin Matuska raidz_map_t *rm; 2561eda14cbcSMatt Macy 2562e716630dSMartin Matuska uint64_t logical_width = vdev_raidz_get_logical_width(vdrz, 2563783d3ff6SMartin Matuska BP_GET_BIRTH(zio->io_bp)); 2564e716630dSMartin Matuska if (logical_width != vdrz->vd_physical_width) { 2565e716630dSMartin Matuska zfs_locked_range_t *lr = NULL; 2566e716630dSMartin Matuska uint64_t synced_offset = UINT64_MAX; 2567e716630dSMartin Matuska uint64_t next_offset = UINT64_MAX; 2568e716630dSMartin Matuska boolean_t use_scratch = B_FALSE; 2569e716630dSMartin Matuska /* 2570e716630dSMartin Matuska * Note: when the expansion is completing, we set 2571e716630dSMartin Matuska * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync()) 2572e716630dSMartin Matuska * in a later txg than when we last update spa_ubsync's state 2573e716630dSMartin Matuska * (see the end of spa_raidz_expand_thread()). Therefore we 2574e716630dSMartin Matuska * may see vre_state!=SCANNING before 2575e716630dSMartin Matuska * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected 2576e716630dSMartin Matuska * on disk, but the copying progress has been synced to disk 2577e716630dSMartin Matuska * (and reflected in spa_ubsync). In this case it's fine to 2578e716630dSMartin Matuska * treat the expansion as completed, since if we crash there's 2579e716630dSMartin Matuska * no additional copying to do. 
2580e716630dSMartin Matuska */ 2581e716630dSMartin Matuska if (vdrz->vn_vre.vre_state == DSS_SCANNING) { 2582e716630dSMartin Matuska ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==, 2583e716630dSMartin Matuska &vdrz->vn_vre); 2584e716630dSMartin Matuska lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock, 2585e716630dSMartin Matuska zio->io_offset, zio->io_size, RL_READER); 2586e716630dSMartin Matuska use_scratch = 2587e716630dSMartin Matuska (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) == 2588e716630dSMartin Matuska RRSS_SCRATCH_VALID); 2589e716630dSMartin Matuska synced_offset = 2590e716630dSMartin Matuska RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync); 2591e716630dSMartin Matuska next_offset = vdrz->vn_vre.vre_offset; 2592e716630dSMartin Matuska /* 2593e716630dSMartin Matuska * If we haven't resumed expanding since importing the 2594e716630dSMartin Matuska * pool, vre_offset won't have been set yet. In 2595e716630dSMartin Matuska * this case the next offset to be copied is the same 2596e716630dSMartin Matuska * as what was synced. 2597e716630dSMartin Matuska */ 2598e716630dSMartin Matuska if (next_offset == UINT64_MAX) { 2599e716630dSMartin Matuska next_offset = synced_offset; 2600e716630dSMartin Matuska } 2601e716630dSMartin Matuska } 2602e716630dSMartin Matuska if (use_scratch) { 2603e716630dSMartin Matuska zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced=" 2604e716630dSMartin Matuska "%lld next_offset=%lld use_scratch=%u", 2605e716630dSMartin Matuska zio, 2606e716630dSMartin Matuska zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ", 2607e716630dSMartin Matuska (long long)zio->io_offset, 2608e716630dSMartin Matuska (long long)synced_offset, 2609e716630dSMartin Matuska (long long)next_offset, 2610e716630dSMartin Matuska use_scratch); 2611e716630dSMartin Matuska } 2612e716630dSMartin Matuska 2613e716630dSMartin Matuska rm = vdev_raidz_map_alloc_expanded(zio, 2614e716630dSMartin Matuska tvd->vdev_ashift, vdrz->vd_physical_width, 2615e716630dSMartin Matuska logical_width, vdrz->vd_nparity, 2616e716630dSMartin Matuska synced_offset, next_offset, use_scratch); 2617e716630dSMartin Matuska rm->rm_lr = lr; 2618e716630dSMartin Matuska } else { 2619e716630dSMartin Matuska rm = vdev_raidz_map_alloc(zio, 2620e716630dSMartin Matuska tvd->vdev_ashift, logical_width, vdrz->vd_nparity); 2621e716630dSMartin Matuska } 2622e716630dSMartin Matuska rm->rm_original_width = vdrz->vd_original_width; 2623e716630dSMartin Matuska 2624f9693befSMartin Matuska zio->io_vsd = rm; 2625f9693befSMartin Matuska zio->io_vsd_ops = &vdev_raidz_vsd_ops; 2626eda14cbcSMatt Macy if (zio->io_type == ZIO_TYPE_WRITE) { 2627e716630dSMartin Matuska for (int i = 0; i < rm->rm_nrows; i++) { 2628e716630dSMartin Matuska vdev_raidz_io_start_write(zio, rm->rm_row[i]); 2629e716630dSMartin Matuska } 2630e716630dSMartin Matuska 2631e716630dSMartin Matuska if (logical_width == vdrz->vd_physical_width) { 2632e716630dSMartin Matuska raidz_start_skip_writes(zio); 2633e716630dSMartin Matuska } 26347877fdebSMatt Macy } else { 2635eda14cbcSMatt Macy ASSERT(zio->io_type == ZIO_TYPE_READ); 2636e716630dSMartin Matuska vdev_raidz_io_start_read(zio, rm); 2637eda14cbcSMatt Macy } 2638eda14cbcSMatt Macy 2639eda14cbcSMatt Macy zio_execute(zio); 2640eda14cbcSMatt Macy } 2641eda14cbcSMatt Macy 2642eda14cbcSMatt Macy /* 2643eda14cbcSMatt Macy * Report a checksum error for a child of a RAID-Z device. 
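 * The error is charged to the child vdev that supplied the bad column
 * (rc->rc_devidx). As the code below shows, the child's checksum-error
 * count is bumped and an ereport posted only when the zio is neither
 * speculative nor running at ZIO_PRIORITY_REBUILD.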
2644eda14cbcSMatt Macy */ 2645e92ffd9bSMartin Matuska void 2646e92ffd9bSMartin Matuska vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) 2647eda14cbcSMatt Macy { 2648eda14cbcSMatt Macy vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 2649eda14cbcSMatt Macy 26507877fdebSMatt Macy if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) && 26517877fdebSMatt Macy zio->io_priority != ZIO_PRIORITY_REBUILD) { 2652eda14cbcSMatt Macy zio_bad_cksum_t zbc; 2653eda14cbcSMatt Macy raidz_map_t *rm = zio->io_vsd; 2654eda14cbcSMatt Macy 2655eda14cbcSMatt Macy zbc.zbc_has_cksum = 0; 2656eda14cbcSMatt Macy zbc.zbc_injected = rm->rm_ecksuminjected; 2657eda14cbcSMatt Macy 26582c48331dSMatt Macy mutex_enter(&vd->vdev_stat_lock); 26592c48331dSMatt Macy vd->vdev_stat.vs_checksum_errors++; 26602c48331dSMatt Macy mutex_exit(&vd->vdev_stat_lock); 2661bb2d13b6SMartin Matuska (void) zfs_ereport_post_checksum(zio->io_spa, vd, 2662bb2d13b6SMartin Matuska &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, 2663bb2d13b6SMartin Matuska rc->rc_abd, bad_data, &zbc); 26642c48331dSMatt Macy } 2665eda14cbcSMatt Macy } 2666eda14cbcSMatt Macy 2667eda14cbcSMatt Macy /* 2668eda14cbcSMatt Macy * We keep track of whether or not there were any injected errors, so that 2669eda14cbcSMatt Macy * any ereports we generate can note it. 2670eda14cbcSMatt Macy */ 2671eda14cbcSMatt Macy static int 2672eda14cbcSMatt Macy raidz_checksum_verify(zio_t *zio) 2673eda14cbcSMatt Macy { 2674315ee00fSMartin Matuska zio_bad_cksum_t zbc = {0}; 2675eda14cbcSMatt Macy raidz_map_t *rm = zio->io_vsd; 2676eda14cbcSMatt Macy 2677eda14cbcSMatt Macy int ret = zio_checksum_error(zio, &zbc); 267887bf66d4SMartin Matuska /* 267987bf66d4SMartin Matuska * Any Direct I/O read that has a checksum error must be treated as 268087bf66d4SMartin Matuska * suspicious as the contents of the buffer could be getting 268187bf66d4SMartin Matuska * manipulated while the I/O is taking place. The checksum verify error 268287bf66d4SMartin Matuska * will be reported to the top-level RAIDZ VDEV. 268387bf66d4SMartin Matuska */ 268487bf66d4SMartin Matuska if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) { 268587bf66d4SMartin Matuska zio->io_error = ret; 268687bf66d4SMartin Matuska zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; 268787bf66d4SMartin Matuska zio_dio_chksum_verify_error_report(zio); 268887bf66d4SMartin Matuska zio_checksum_verified(zio); 268987bf66d4SMartin Matuska return (0); 269087bf66d4SMartin Matuska } 269187bf66d4SMartin Matuska 2692eda14cbcSMatt Macy if (ret != 0 && zbc.zbc_injected != 0) 2693eda14cbcSMatt Macy rm->rm_ecksuminjected = 1; 2694eda14cbcSMatt Macy 2695eda14cbcSMatt Macy return (ret); 2696eda14cbcSMatt Macy } 2697eda14cbcSMatt Macy 2698eda14cbcSMatt Macy /* 2699eda14cbcSMatt Macy * Generate the parity from the data columns. If we tried and were able to 2700eda14cbcSMatt Macy * read the parity without error, verify that the generated parity matches the 2701eda14cbcSMatt Macy * data we read. If it doesn't, we fire off a checksum error. Return the 27027877fdebSMatt Macy * number of such failures. 
2703eda14cbcSMatt Macy */ 2704eda14cbcSMatt Macy static int 27057877fdebSMatt Macy raidz_parity_verify(zio_t *zio, raidz_row_t *rr) 2706eda14cbcSMatt Macy { 2707eda14cbcSMatt Macy abd_t *orig[VDEV_RAIDZ_MAXPARITY]; 2708eda14cbcSMatt Macy int c, ret = 0; 27097877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd; 2710eda14cbcSMatt Macy raidz_col_t *rc; 2711eda14cbcSMatt Macy 2712eda14cbcSMatt Macy blkptr_t *bp = zio->io_bp; 2713eda14cbcSMatt Macy enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : 2714eda14cbcSMatt Macy (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); 2715eda14cbcSMatt Macy 2716eda14cbcSMatt Macy if (checksum == ZIO_CHECKSUM_NOPARITY) 2717eda14cbcSMatt Macy return (ret); 2718eda14cbcSMatt Macy 27197877fdebSMatt Macy for (c = 0; c < rr->rr_firstdatacol; c++) { 27207877fdebSMatt Macy rc = &rr->rr_col[c]; 2721eda14cbcSMatt Macy if (!rc->rc_tried || rc->rc_error != 0) 2722eda14cbcSMatt Macy continue; 2723eda14cbcSMatt Macy 2724a0b956f5SMartin Matuska orig[c] = rc->rc_abd; 2725a0b956f5SMartin Matuska ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size); 2726a0b956f5SMartin Matuska rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); 2727eda14cbcSMatt Macy } 2728eda14cbcSMatt Macy 27297877fdebSMatt Macy /* 2730e92ffd9bSMartin Matuska * Verify any empty sectors are zero filled to ensure the parity 2731e92ffd9bSMartin Matuska * is calculated correctly even if these non-data sectors are damaged. 2732e92ffd9bSMartin Matuska */ 2733e92ffd9bSMartin Matuska if (rr->rr_nempty && rr->rr_abd_empty != NULL) 2734e92ffd9bSMartin Matuska ret += vdev_draid_map_verify_empty(zio, rr); 2735e92ffd9bSMartin Matuska 2736e92ffd9bSMartin Matuska /* 27377877fdebSMatt Macy * Regenerates parity even for !tried||rc_error!=0 columns. This 27387877fdebSMatt Macy * isn't harmful but it does have the side effect of fixing stuff 27397877fdebSMatt Macy * we didn't realize was necessary (i.e. even if we return 0). 
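 * (Columns skipped by the loop above keep their original rc_abd, so
 * regenerating parity below simply overwrites those buffers in place
 * with freshly computed values.)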
27407877fdebSMatt Macy */ 27417877fdebSMatt Macy vdev_raidz_generate_parity_row(rm, rr); 2742eda14cbcSMatt Macy 27437877fdebSMatt Macy for (c = 0; c < rr->rr_firstdatacol; c++) { 27447877fdebSMatt Macy rc = &rr->rr_col[c]; 27457877fdebSMatt Macy 2746eda14cbcSMatt Macy if (!rc->rc_tried || rc->rc_error != 0) 2747eda14cbcSMatt Macy continue; 27487877fdebSMatt Macy 2749eda14cbcSMatt Macy if (abd_cmp(orig[c], rc->rc_abd) != 0) { 2750e716630dSMartin Matuska zfs_dbgmsg("found error on col=%u devidx=%u off %llx", 2751e716630dSMartin Matuska c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset); 2752e92ffd9bSMartin Matuska vdev_raidz_checksum_error(zio, rc, orig[c]); 2753eda14cbcSMatt Macy rc->rc_error = SET_ERROR(ECKSUM); 2754eda14cbcSMatt Macy ret++; 2755eda14cbcSMatt Macy } 2756eda14cbcSMatt Macy abd_free(orig[c]); 2757eda14cbcSMatt Macy } 2758eda14cbcSMatt Macy 2759eda14cbcSMatt Macy return (ret); 2760eda14cbcSMatt Macy } 2761eda14cbcSMatt Macy 2762eda14cbcSMatt Macy static int 27637877fdebSMatt Macy vdev_raidz_worst_error(raidz_row_t *rr) 2764eda14cbcSMatt Macy { 2765eda14cbcSMatt Macy int error = 0; 2766eda14cbcSMatt Macy 2767e716630dSMartin Matuska for (int c = 0; c < rr->rr_cols; c++) { 27687877fdebSMatt Macy error = zio_worst_error(error, rr->rr_col[c].rc_error); 2769e716630dSMartin Matuska error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error); 2770e716630dSMartin Matuska } 2771eda14cbcSMatt Macy 2772eda14cbcSMatt Macy return (error); 2773eda14cbcSMatt Macy } 2774eda14cbcSMatt Macy 2775eda14cbcSMatt Macy static void 27767877fdebSMatt Macy vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) 2777eda14cbcSMatt Macy { 2778eda14cbcSMatt Macy int unexpected_errors = 0; 2779eda14cbcSMatt Macy int parity_errors = 0; 2780eda14cbcSMatt Macy int parity_untried = 0; 2781eda14cbcSMatt Macy int data_errors = 0; 2782eda14cbcSMatt Macy 27837877fdebSMatt Macy ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); 2784eda14cbcSMatt Macy 27857877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 27867877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 2787eda14cbcSMatt Macy 2788eda14cbcSMatt Macy if (rc->rc_error) { 27897877fdebSMatt Macy if (c < rr->rr_firstdatacol) 2790eda14cbcSMatt Macy parity_errors++; 2791eda14cbcSMatt Macy else 2792eda14cbcSMatt Macy data_errors++; 2793eda14cbcSMatt Macy 2794eda14cbcSMatt Macy if (!rc->rc_skipped) 2795eda14cbcSMatt Macy unexpected_errors++; 27967877fdebSMatt Macy } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { 2797eda14cbcSMatt Macy parity_untried++; 2798eda14cbcSMatt Macy } 2799a0b956f5SMartin Matuska 2800a0b956f5SMartin Matuska if (rc->rc_force_repair) 2801a0b956f5SMartin Matuska unexpected_errors++; 2802eda14cbcSMatt Macy } 2803eda14cbcSMatt Macy 2804eda14cbcSMatt Macy /* 28057877fdebSMatt Macy * If we read more parity disks than were used for 28067877fdebSMatt Macy * reconstruction, confirm that the other parity disks produced 28077877fdebSMatt Macy * correct data. 28087877fdebSMatt Macy * 28097877fdebSMatt Macy * Note that we also regenerate parity when resilvering so we 28107877fdebSMatt Macy * can write it out to failed devices later. 
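 *
 * For example (hypothetical): if a single data column of a RAIDZ2 row
 * was reconstructed using P but Q was also read successfully, the Q
 * regenerated by raidz_parity_verify() is compared against the Q that
 * was read from disk.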
28117877fdebSMatt Macy */ 28127877fdebSMatt Macy if (parity_errors + parity_untried < 28137877fdebSMatt Macy rr->rr_firstdatacol - data_errors || 28147877fdebSMatt Macy (zio->io_flags & ZIO_FLAG_RESILVER)) { 28157877fdebSMatt Macy int n = raidz_parity_verify(zio, rr); 28167877fdebSMatt Macy unexpected_errors += n; 28177877fdebSMatt Macy } 28187877fdebSMatt Macy 28197877fdebSMatt Macy if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 28207877fdebSMatt Macy (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) { 28217877fdebSMatt Macy /* 28227877fdebSMatt Macy * Use the good data we have in hand to repair damaged children. 28237877fdebSMatt Macy */ 28247877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 28257877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 28267877fdebSMatt Macy vdev_t *vd = zio->io_vd; 28277877fdebSMatt Macy vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 28287877fdebSMatt Macy 282916038816SMartin Matuska if (!rc->rc_allow_repair) { 283016038816SMartin Matuska continue; 283116038816SMartin Matuska } else if (!rc->rc_force_repair && 283216038816SMartin Matuska (rc->rc_error == 0 || rc->rc_size == 0)) { 28337877fdebSMatt Macy continue; 28347877fdebSMatt Macy } 283587bf66d4SMartin Matuska /* 283687bf66d4SMartin Matuska * We do not allow self healing for Direct I/O reads. 283787bf66d4SMartin Matuska * See comment in vdev_raid_row_alloc(). 283887bf66d4SMartin Matuska */ 283987bf66d4SMartin Matuska ASSERT0(zio->io_flags & ZIO_FLAG_DIO_READ); 28407877fdebSMatt Macy 2841e716630dSMartin Matuska zfs_dbgmsg("zio=%px repairing c=%u devidx=%u " 2842e716630dSMartin Matuska "offset=%llx", 2843e716630dSMartin Matuska zio, c, rc->rc_devidx, (long long)rc->rc_offset); 2844e716630dSMartin Matuska 28457877fdebSMatt Macy zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 28467877fdebSMatt Macy rc->rc_offset, rc->rc_abd, rc->rc_size, 28477877fdebSMatt Macy ZIO_TYPE_WRITE, 28487877fdebSMatt Macy zio->io_priority == ZIO_PRIORITY_REBUILD ? 28497877fdebSMatt Macy ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE, 28507877fdebSMatt Macy ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 28517877fdebSMatt Macy ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); 28527877fdebSMatt Macy } 28537877fdebSMatt Macy } 2854e716630dSMartin Matuska 2855e716630dSMartin Matuska /* 2856e716630dSMartin Matuska * Scrub or resilver i/o's: overwrite any shadow locations with the 2857e716630dSMartin Matuska * good data. This ensures that if we've already copied this sector, 2858e716630dSMartin Matuska * it will be corrected if it was damaged. This writes more than is 2859e716630dSMartin Matuska * necessary, but since expansion is paused during scrub/resilver, at 2860e716630dSMartin Matuska * most a single row will have a shadow location. 
2861e716630dSMartin Matuska */ 2862e716630dSMartin Matuska if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 2863e716630dSMartin Matuska (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) { 2864e716630dSMartin Matuska for (int c = 0; c < rr->rr_cols; c++) { 2865e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c]; 2866e716630dSMartin Matuska vdev_t *vd = zio->io_vd; 2867e716630dSMartin Matuska 2868e716630dSMartin Matuska if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0) 2869e716630dSMartin Matuska continue; 2870e716630dSMartin Matuska vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx]; 2871e716630dSMartin Matuska 2872e716630dSMartin Matuska /* 2873e716630dSMartin Matuska * Note: We don't want to update the repair stats 2874e716630dSMartin Matuska * because that would incorrectly indicate that there 2875e716630dSMartin Matuska * was bad data to repair, which we aren't sure about. 2876e716630dSMartin Matuska * By clearing the SCAN_THREAD flag, we prevent this 2877e716630dSMartin Matuska * from happening, despite having the REPAIR flag set. 2878e716630dSMartin Matuska * We need to set SELF_HEAL so that this i/o can't be 2879e716630dSMartin Matuska * bypassed by zio_vdev_io_start(). 2880e716630dSMartin Matuska */ 2881e716630dSMartin Matuska zio_t *cio = zio_vdev_child_io(zio, NULL, cvd, 2882e716630dSMartin Matuska rc->rc_shadow_offset, rc->rc_abd, rc->rc_size, 2883e716630dSMartin Matuska ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, 2884e716630dSMartin Matuska ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, 2885e716630dSMartin Matuska NULL, NULL); 2886e716630dSMartin Matuska cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD; 2887e716630dSMartin Matuska zio_nowait(cio); 2888e716630dSMartin Matuska } 2889e716630dSMartin Matuska } 28907877fdebSMatt Macy } 28917877fdebSMatt Macy 28927877fdebSMatt Macy static void 28937877fdebSMatt Macy raidz_restore_orig_data(raidz_map_t *rm) 28947877fdebSMatt Macy { 28957877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 28967877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i]; 28977877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 28987877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 28997877fdebSMatt Macy if (rc->rc_need_orig_restore) { 2900f9693befSMartin Matuska abd_copy(rc->rc_abd, 29017877fdebSMatt Macy rc->rc_orig_data, rc->rc_size); 29027877fdebSMatt Macy rc->rc_need_orig_restore = B_FALSE; 29037877fdebSMatt Macy } 29047877fdebSMatt Macy } 29057877fdebSMatt Macy } 29067877fdebSMatt Macy } 29077877fdebSMatt Macy 29087877fdebSMatt Macy /* 2909e716630dSMartin Matuska * During raidz_reconstruct() for expanded VDEV, we need special consideration 2910e716630dSMartin Matuska * failure simulations. See note in raidz_reconstruct() on simulating failure 2911e716630dSMartin Matuska * of a pre-expansion device. 2912e716630dSMartin Matuska * 2913e716630dSMartin Matuska * Treating logical child i as failed, return TRUE if the given column should 2914e716630dSMartin Matuska * be treated as failed. The idea of logical children allows us to imagine 2915e716630dSMartin Matuska * that a disk silently failed before a RAIDZ expansion (reads from this disk 2916e716630dSMartin Matuska * succeed but return the wrong data). Since the expansion doesn't verify 2917e716630dSMartin Matuska * checksums, the incorrect data will be moved to new locations spread among 2918e716630dSMartin Matuska * the children (going diagonally across them). 
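 *
 * For example (illustrative): a single expansion from 4 to 5 children
 * gives 5 + 4 = 9 logical children to consider; this is the same count
 * computed as 'n' in vdev_raidz_combrec() below.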
2919e716630dSMartin Matuska * 2920e716630dSMartin Matuska * Higher "logical child failures" (values of `i`) indicate these 2921e716630dSMartin Matuska * "pre-expansion failures". The first physical_width values imagine that a 2922e716630dSMartin Matuska * current child failed; the next physical_width-1 values imagine that a 2923e716630dSMartin Matuska * child failed before the most recent expansion; the next physical_width-2 2924e716630dSMartin Matuska * values imagine a child failed in the expansion before that, etc. 2925e716630dSMartin Matuska */ 2926e716630dSMartin Matuska static boolean_t 2927e716630dSMartin Matuska raidz_simulate_failure(int physical_width, int original_width, int ashift, 2928e716630dSMartin Matuska int i, raidz_col_t *rc) 2929e716630dSMartin Matuska { 2930e716630dSMartin Matuska uint64_t sector_id = 2931e716630dSMartin Matuska physical_width * (rc->rc_offset >> ashift) + 2932e716630dSMartin Matuska rc->rc_devidx; 2933e716630dSMartin Matuska 2934e716630dSMartin Matuska for (int w = physical_width; w >= original_width; w--) { 2935e716630dSMartin Matuska if (i < w) { 2936e716630dSMartin Matuska return (sector_id % w == i); 2937e716630dSMartin Matuska } else { 2938e716630dSMartin Matuska i -= w; 2939e716630dSMartin Matuska } 2940e716630dSMartin Matuska } 2941e716630dSMartin Matuska ASSERT(!"invalid logical child id"); 2942e716630dSMartin Matuska return (B_FALSE); 2943e716630dSMartin Matuska } 2944e716630dSMartin Matuska 2945e716630dSMartin Matuska /* 29467877fdebSMatt Macy * returns EINVAL if reconstruction of the block will not be possible 29477877fdebSMatt Macy * returns ECKSUM if this specific reconstruction failed 29487877fdebSMatt Macy * returns 0 on successful reconstruction 29497877fdebSMatt Macy */ 29507877fdebSMatt Macy static int 29517877fdebSMatt Macy raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) 29527877fdebSMatt Macy { 29537877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd; 2954e716630dSMartin Matuska int physical_width = zio->io_vd->vdev_children; 2955e716630dSMartin Matuska int original_width = (rm->rm_original_width != 0) ? 
2956e716630dSMartin Matuska rm->rm_original_width : physical_width; 2957e716630dSMartin Matuska int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT; 2958e716630dSMartin Matuska 2959e716630dSMartin Matuska if (dbgmsg) { 2960e716630dSMartin Matuska zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u " 2961e716630dSMartin Matuska "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts); 2962e716630dSMartin Matuska } 29637877fdebSMatt Macy 29647877fdebSMatt Macy /* Reconstruct each row */ 29657877fdebSMatt Macy for (int r = 0; r < rm->rm_nrows; r++) { 29667877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[r]; 29677877fdebSMatt Macy int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */ 29687877fdebSMatt Macy int t = 0; 29697877fdebSMatt Macy int dead = 0; 29707877fdebSMatt Macy int dead_data = 0; 29717877fdebSMatt Macy 2972e716630dSMartin Matuska if (dbgmsg) 2973e716630dSMartin Matuska zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r); 2974e716630dSMartin Matuska 29757877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 29767877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 29777877fdebSMatt Macy ASSERT0(rc->rc_need_orig_restore); 29787877fdebSMatt Macy if (rc->rc_error != 0) { 29797877fdebSMatt Macy dead++; 29807877fdebSMatt Macy if (c >= nparity) 29817877fdebSMatt Macy dead_data++; 29827877fdebSMatt Macy continue; 29837877fdebSMatt Macy } 29847877fdebSMatt Macy if (rc->rc_size == 0) 29857877fdebSMatt Macy continue; 29867877fdebSMatt Macy for (int lt = 0; lt < ntgts; lt++) { 2987e716630dSMartin Matuska if (raidz_simulate_failure(physical_width, 2988e716630dSMartin Matuska original_width, 2989e716630dSMartin Matuska zio->io_vd->vdev_top->vdev_ashift, 2990e716630dSMartin Matuska ltgts[lt], rc)) { 29917877fdebSMatt Macy if (rc->rc_orig_data == NULL) { 29927877fdebSMatt Macy rc->rc_orig_data = 2993f9693befSMartin Matuska abd_alloc_linear( 2994f9693befSMartin Matuska rc->rc_size, B_TRUE); 2995f9693befSMartin Matuska abd_copy(rc->rc_orig_data, 29967877fdebSMatt Macy rc->rc_abd, rc->rc_size); 29977877fdebSMatt Macy } 29987877fdebSMatt Macy rc->rc_need_orig_restore = B_TRUE; 29997877fdebSMatt Macy 30007877fdebSMatt Macy dead++; 30017877fdebSMatt Macy if (c >= nparity) 30027877fdebSMatt Macy dead_data++; 3003e716630dSMartin Matuska /* 3004e716630dSMartin Matuska * Note: simulating failure of a 3005e716630dSMartin Matuska * pre-expansion device can hit more 3006e716630dSMartin Matuska * than one column, in which case we 3007e716630dSMartin Matuska * might try to simulate more failures 3008e716630dSMartin Matuska * than can be reconstructed, which is 3009e716630dSMartin Matuska * also more than the size of my_tgts. 3010e716630dSMartin Matuska * This check prevents accessing past 3011e716630dSMartin Matuska * the end of my_tgts. The "dead > 3012e716630dSMartin Matuska * nparity" check below will fail this 3013e716630dSMartin Matuska * reconstruction attempt. 
3014e716630dSMartin Matuska */ 3015e716630dSMartin Matuska if (t < VDEV_RAIDZ_MAXPARITY) { 30167877fdebSMatt Macy my_tgts[t++] = c; 3017e716630dSMartin Matuska if (dbgmsg) { 3018e716630dSMartin Matuska zfs_dbgmsg("simulating " 3019e716630dSMartin Matuska "failure of col %u " 3020e716630dSMartin Matuska "devidx %u", c, 3021e716630dSMartin Matuska (int)rc->rc_devidx); 3022e716630dSMartin Matuska } 3023e716630dSMartin Matuska } 30247877fdebSMatt Macy break; 30257877fdebSMatt Macy } 30267877fdebSMatt Macy } 30277877fdebSMatt Macy } 30287877fdebSMatt Macy if (dead > nparity) { 30297877fdebSMatt Macy /* reconstruction not possible */ 3030e716630dSMartin Matuska if (dbgmsg) { 3031e716630dSMartin Matuska zfs_dbgmsg("reconstruction not possible; " 3032e716630dSMartin Matuska "too many failures"); 3033e716630dSMartin Matuska } 30347877fdebSMatt Macy raidz_restore_orig_data(rm); 30357877fdebSMatt Macy return (EINVAL); 30367877fdebSMatt Macy } 30377877fdebSMatt Macy if (dead_data > 0) 3038f9693befSMartin Matuska vdev_raidz_reconstruct_row(rm, rr, my_tgts, t); 30397877fdebSMatt Macy } 30407877fdebSMatt Macy 30417877fdebSMatt Macy /* Check for success */ 30427877fdebSMatt Macy if (raidz_checksum_verify(zio) == 0) { 304387bf66d4SMartin Matuska if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) 304487bf66d4SMartin Matuska return (0); 30457877fdebSMatt Macy 30467877fdebSMatt Macy /* Reconstruction succeeded - report errors */ 30477877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 30487877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i]; 30497877fdebSMatt Macy 30507877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 30517877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 30527877fdebSMatt Macy if (rc->rc_need_orig_restore) { 30537877fdebSMatt Macy /* 30547877fdebSMatt Macy * Note: if this is a parity column, 30557877fdebSMatt Macy * we don't really know if it's wrong. 30567877fdebSMatt Macy * We need to let 30577877fdebSMatt Macy * vdev_raidz_io_done_verified() check 30587877fdebSMatt Macy * it, and if we set rc_error, it will 30597877fdebSMatt Macy * think that it is a "known" error 30607877fdebSMatt Macy * that doesn't need to be checked 30617877fdebSMatt Macy * or corrected. 
30627877fdebSMatt Macy */ 30637877fdebSMatt Macy if (rc->rc_error == 0 && 30647877fdebSMatt Macy c >= rr->rr_firstdatacol) { 3065e92ffd9bSMartin Matuska vdev_raidz_checksum_error(zio, 3066f9693befSMartin Matuska rc, rc->rc_orig_data); 30677877fdebSMatt Macy rc->rc_error = 30687877fdebSMatt Macy SET_ERROR(ECKSUM); 30697877fdebSMatt Macy } 30707877fdebSMatt Macy rc->rc_need_orig_restore = B_FALSE; 30717877fdebSMatt Macy } 30727877fdebSMatt Macy } 30737877fdebSMatt Macy 30747877fdebSMatt Macy vdev_raidz_io_done_verified(zio, rr); 30757877fdebSMatt Macy } 30767877fdebSMatt Macy 30777877fdebSMatt Macy zio_checksum_verified(zio); 30787877fdebSMatt Macy 3079e716630dSMartin Matuska if (dbgmsg) { 3080e716630dSMartin Matuska zfs_dbgmsg("reconstruction successful " 3081e716630dSMartin Matuska "(checksum verified)"); 3082e716630dSMartin Matuska } 30837877fdebSMatt Macy return (0); 30847877fdebSMatt Macy } 30857877fdebSMatt Macy 30867877fdebSMatt Macy /* Reconstruction failed - restore original data */ 30877877fdebSMatt Macy raidz_restore_orig_data(rm); 3088e716630dSMartin Matuska if (dbgmsg) { 3089e716630dSMartin Matuska zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum " 3090e716630dSMartin Matuska "failed", zio); 3091e716630dSMartin Matuska } 30927877fdebSMatt Macy return (ECKSUM); 30937877fdebSMatt Macy } 30947877fdebSMatt Macy 30957877fdebSMatt Macy /* 30967877fdebSMatt Macy * Iterate over all combinations of N bad vdevs and attempt a reconstruction. 30977877fdebSMatt Macy * Note that the algorithm below is non-optimal because it doesn't take into 30987877fdebSMatt Macy * account how reconstruction is actually performed. For example, with 30997877fdebSMatt Macy * triple-parity RAID-Z the reconstruction procedure is the same if column 4 31007877fdebSMatt Macy * is targeted as invalid as if columns 1 and 4 are targeted since in both 31017877fdebSMatt Macy * cases we'd only use parity information in column 0. 
31027877fdebSMatt Macy * 31037877fdebSMatt Macy * The order that we find the various possible combinations of failed 31047877fdebSMatt Macy * disks is dictated by these rules: 31057877fdebSMatt Macy * - Examine each "slot" (the "i" in tgts[i]) 3106e716630dSMartin Matuska * - Try to increment this slot (tgts[i] += 1) 31077877fdebSMatt Macy * - if we can't increment because it runs into the next slot, 31087877fdebSMatt Macy * reset our slot to the minimum, and examine the next slot 31097877fdebSMatt Macy * 31107877fdebSMatt Macy * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose 31117877fdebSMatt Macy * 3 columns to reconstruct), we will generate the following sequence: 31127877fdebSMatt Macy * 31137877fdebSMatt Macy * STATE ACTION 31147877fdebSMatt Macy * 0 1 2 special case: skip since these are all parity 31157877fdebSMatt Macy * 0 1 3 first slot: reset to 0; middle slot: increment to 2 31167877fdebSMatt Macy * 0 2 3 first slot: increment to 1 31177877fdebSMatt Macy * 1 2 3 first: reset to 0; middle: reset to 1; last: increment to 4 31187877fdebSMatt Macy * 0 1 4 first: reset to 0; middle: increment to 2 31197877fdebSMatt Macy * 0 2 4 first: increment to 1 31207877fdebSMatt Macy * 1 2 4 first: reset to 0; middle: increment to 3 31217877fdebSMatt Macy * 0 3 4 first: increment to 1 31227877fdebSMatt Macy * 1 3 4 first: increment to 2 31237877fdebSMatt Macy * 2 3 4 first: reset to 0; middle: reset to 1; last: increment to 5 31247877fdebSMatt Macy * 0 1 5 first: reset to 0; middle: increment to 2 31257877fdebSMatt Macy * 0 2 5 first: increment to 1 31267877fdebSMatt Macy * 1 2 5 first: reset to 0; middle: increment to 3 31277877fdebSMatt Macy * 0 3 5 first: increment to 1 31287877fdebSMatt Macy * 1 3 5 first: increment to 2 31297877fdebSMatt Macy * 2 3 5 first: reset to 0; middle: increment to 4 31307877fdebSMatt Macy * 0 4 5 first: increment to 1 31317877fdebSMatt Macy * 1 4 5 first: increment to 2 31327877fdebSMatt Macy * 2 4 5 first: increment to 3 31337877fdebSMatt Macy * 3 4 5 done 31347877fdebSMatt Macy * 313516038816SMartin Matuska * This strategy works for dRAID but is less efficient when there are a large 31367877fdebSMatt Macy * number of child vdevs and therefore permutations to check. Furthermore, 3137e716630dSMartin Matuska * since the raidz_map_t rows likely do not overlap, reconstruction would be 31387877fdebSMatt Macy * possible as long as there are no more than nparity data errors per row. 31397877fdebSMatt Macy * These additional permutations are not currently checked but could be as 31407877fdebSMatt Macy * a future improvement. 3141e716630dSMartin Matuska * 3142e716630dSMartin Matuska * Returns 0 on success, ECKSUM on failure. 31437877fdebSMatt Macy */ 31447877fdebSMatt Macy static int 31457877fdebSMatt Macy vdev_raidz_combrec(zio_t *zio) 31467877fdebSMatt Macy { 31477877fdebSMatt Macy int nparity = vdev_get_nparity(zio->io_vd); 31487877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd; 3149e716630dSMartin Matuska int physical_width = zio->io_vd->vdev_children; 3150e716630dSMartin Matuska int original_width = (rm->rm_original_width != 0) ? 
3151e716630dSMartin Matuska rm->rm_original_width : physical_width; 31527877fdebSMatt Macy 31537877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 31547877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i]; 31557877fdebSMatt Macy int total_errors = 0; 31567877fdebSMatt Macy 31577877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 31587877fdebSMatt Macy if (rr->rr_col[c].rc_error) 31597877fdebSMatt Macy total_errors++; 31607877fdebSMatt Macy } 31617877fdebSMatt Macy 31627877fdebSMatt Macy if (total_errors > nparity) 31637877fdebSMatt Macy return (vdev_raidz_worst_error(rr)); 31647877fdebSMatt Macy } 31657877fdebSMatt Macy 31667877fdebSMatt Macy for (int num_failures = 1; num_failures <= nparity; num_failures++) { 31677877fdebSMatt Macy int tstore[VDEV_RAIDZ_MAXPARITY + 2]; 31687877fdebSMatt Macy int *ltgts = &tstore[1]; /* value is logical child ID */ 31697877fdebSMatt Macy 3170e716630dSMartin Matuska 3171e716630dSMartin Matuska /* 3172e716630dSMartin Matuska * Determine number of logical children, n. See comment 3173e716630dSMartin Matuska * above raidz_simulate_failure(). 3174e716630dSMartin Matuska */ 3175e716630dSMartin Matuska int n = 0; 3176e716630dSMartin Matuska for (int w = physical_width; 3177e716630dSMartin Matuska w >= original_width; w--) { 3178e716630dSMartin Matuska n += w; 3179e716630dSMartin Matuska } 31807877fdebSMatt Macy 31817877fdebSMatt Macy ASSERT3U(num_failures, <=, nparity); 31827877fdebSMatt Macy ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY); 31837877fdebSMatt Macy 31847877fdebSMatt Macy /* Handle corner cases in combrec logic */ 31857877fdebSMatt Macy ltgts[-1] = -1; 31867877fdebSMatt Macy for (int i = 0; i < num_failures; i++) { 31877877fdebSMatt Macy ltgts[i] = i; 31887877fdebSMatt Macy } 31897877fdebSMatt Macy ltgts[num_failures] = n; 31907877fdebSMatt Macy 31917877fdebSMatt Macy for (;;) { 31927877fdebSMatt Macy int err = raidz_reconstruct(zio, ltgts, num_failures, 31937877fdebSMatt Macy nparity); 31947877fdebSMatt Macy if (err == EINVAL) { 31957877fdebSMatt Macy /* 31967877fdebSMatt Macy * Reconstruction not possible with this # 31977877fdebSMatt Macy * failures; try more failures. 31987877fdebSMatt Macy */ 31997877fdebSMatt Macy break; 32007877fdebSMatt Macy } else if (err == 0) 32017877fdebSMatt Macy return (0); 32027877fdebSMatt Macy 32037877fdebSMatt Macy /* Compute next targets to try */ 32047877fdebSMatt Macy for (int t = 0; ; t++) { 32057877fdebSMatt Macy ASSERT3U(t, <, num_failures); 32067877fdebSMatt Macy ltgts[t]++; 32077877fdebSMatt Macy if (ltgts[t] == n) { 32087877fdebSMatt Macy /* try more failures */ 32097877fdebSMatt Macy ASSERT3U(t, ==, num_failures - 1); 3210e716630dSMartin Matuska if (zfs_flags & 3211e716630dSMartin Matuska ZFS_DEBUG_RAIDZ_RECONSTRUCT) { 3212e716630dSMartin Matuska zfs_dbgmsg("reconstruction " 3213e716630dSMartin Matuska "failed for num_failures=" 3214e716630dSMartin Matuska "%u; tried all " 3215e716630dSMartin Matuska "combinations", 3216e716630dSMartin Matuska num_failures); 3217e716630dSMartin Matuska } 32187877fdebSMatt Macy break; 32197877fdebSMatt Macy } 32207877fdebSMatt Macy 32217877fdebSMatt Macy ASSERT3U(ltgts[t], <, n); 32227877fdebSMatt Macy ASSERT3U(ltgts[t], <=, ltgts[t + 1]); 32237877fdebSMatt Macy 32247877fdebSMatt Macy /* 32257877fdebSMatt Macy * If that spot is available, we're done here. 32267877fdebSMatt Macy * Try the next combination. 
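 * For example (hypothetical state): with num_failures = 2 and
 * ltgts = {0, 3}, bumping the first slot yields {1, 3}, which is
 * accepted here because 1 != 3.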
32277877fdebSMatt Macy */ 32287877fdebSMatt Macy if (ltgts[t] != ltgts[t + 1]) 3229e716630dSMartin Matuska break; // found next combination 32307877fdebSMatt Macy 32317877fdebSMatt Macy /* 32327877fdebSMatt Macy * Otherwise, reset this tgt to the minimum, 32337877fdebSMatt Macy * and move on to the next tgt. 32347877fdebSMatt Macy */ 32357877fdebSMatt Macy ltgts[t] = ltgts[t - 1] + 1; 32367877fdebSMatt Macy ASSERT3U(ltgts[t], ==, t); 32377877fdebSMatt Macy } 32387877fdebSMatt Macy 32397877fdebSMatt Macy /* Increase the number of failures and keep trying. */ 32407877fdebSMatt Macy if (ltgts[num_failures - 1] == n) 32417877fdebSMatt Macy break; 32427877fdebSMatt Macy } 32437877fdebSMatt Macy } 3244e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 3245e716630dSMartin Matuska zfs_dbgmsg("reconstruction failed for all num_failures"); 32467877fdebSMatt Macy return (ECKSUM); 32477877fdebSMatt Macy } 32487877fdebSMatt Macy 32497877fdebSMatt Macy void 32507877fdebSMatt Macy vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) 32517877fdebSMatt Macy { 32527877fdebSMatt Macy for (uint64_t row = 0; row < rm->rm_nrows; row++) { 32537877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[row]; 32547877fdebSMatt Macy vdev_raidz_reconstruct_row(rm, rr, t, nt); 32557877fdebSMatt Macy } 32567877fdebSMatt Macy } 32577877fdebSMatt Macy 32587877fdebSMatt Macy /* 32597877fdebSMatt Macy * Complete a write IO operation on a RAIDZ VDev 32607877fdebSMatt Macy * 32617877fdebSMatt Macy * Outline: 32627877fdebSMatt Macy * 1. Check for errors on the child IOs. 32637877fdebSMatt Macy * 2. Return, setting an error code if too few child VDevs were written 32647877fdebSMatt Macy * to reconstruct the data later. Note that partial writes are 32657877fdebSMatt Macy * considered successful if they can be reconstructed at all. 32667877fdebSMatt Macy */ 32677877fdebSMatt Macy static void 32687877fdebSMatt Macy vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) 32697877fdebSMatt Macy { 3270e716630dSMartin Matuska int normal_errors = 0; 3271e716630dSMartin Matuska int shadow_errors = 0; 32727877fdebSMatt Macy 32737877fdebSMatt Macy ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); 32747877fdebSMatt Macy ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); 32757877fdebSMatt Macy ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); 32767877fdebSMatt Macy 32777877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 32787877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 32797877fdebSMatt Macy 3280e716630dSMartin Matuska if (rc->rc_error != 0) { 32817877fdebSMatt Macy ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 3282e716630dSMartin Matuska normal_errors++; 3283e716630dSMartin Matuska } 3284e716630dSMartin Matuska if (rc->rc_shadow_error != 0) { 3285e716630dSMartin Matuska ASSERT(rc->rc_shadow_error != ECKSUM); 3286e716630dSMartin Matuska shadow_errors++; 32877877fdebSMatt Macy } 32887877fdebSMatt Macy } 32897877fdebSMatt Macy 32907877fdebSMatt Macy /* 32917877fdebSMatt Macy * Treat partial writes as a success. If we couldn't write enough 3292e716630dSMartin Matuska * columns to reconstruct the data, the I/O failed. Otherwise, good 3293e716630dSMartin Matuska * enough. 
Note that in the case of a shadow write (during raidz 3294e716630dSMartin Matuska * expansion), depending on if we crash, either the normal (old) or 3295e716630dSMartin Matuska * shadow (new) location may become the "real" version of the block, 3296e716630dSMartin Matuska * so both locations must have sufficient redundancy. 3297eda14cbcSMatt Macy * 3298eda14cbcSMatt Macy * Now that we support write reallocation, it would be better 3299eda14cbcSMatt Macy * to treat partial failure as real failure unless there are 3300eda14cbcSMatt Macy * no non-degraded top-level vdevs left, and not update DTLs 3301eda14cbcSMatt Macy * if we intend to reallocate. 3302eda14cbcSMatt Macy */ 3303e716630dSMartin Matuska if (normal_errors > rr->rr_firstdatacol || 3304e716630dSMartin Matuska shadow_errors > rr->rr_firstdatacol) { 33057877fdebSMatt Macy zio->io_error = zio_worst_error(zio->io_error, 33067877fdebSMatt Macy vdev_raidz_worst_error(rr)); 33077877fdebSMatt Macy } 3308eda14cbcSMatt Macy } 3309eda14cbcSMatt Macy 3310f9693befSMartin Matuska static void 33117877fdebSMatt Macy vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm, 33127877fdebSMatt Macy raidz_row_t *rr) 33137877fdebSMatt Macy { 33147877fdebSMatt Macy int parity_errors = 0; 33157877fdebSMatt Macy int parity_untried = 0; 33167877fdebSMatt Macy int data_errors = 0; 33177877fdebSMatt Macy int total_errors = 0; 33187877fdebSMatt Macy 33197877fdebSMatt Macy ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); 33207877fdebSMatt Macy ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); 33217877fdebSMatt Macy 33227877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 33237877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 33247877fdebSMatt Macy 3325a0b956f5SMartin Matuska /* 3326a0b956f5SMartin Matuska * If scrubbing and a replacing/sparing child vdev determined 3327a0b956f5SMartin Matuska * that not all of its children have an identical copy of the 3328a0b956f5SMartin Matuska * data, then clear the error so the column is treated like 3329a0b956f5SMartin Matuska * any other read and force a repair to correct the damage. 3330a0b956f5SMartin Matuska */ 3331a0b956f5SMartin Matuska if (rc->rc_error == ECKSUM) { 3332a0b956f5SMartin Matuska ASSERT(zio->io_flags & ZIO_FLAG_SCRUB); 3333a0b956f5SMartin Matuska vdev_raidz_checksum_error(zio, rc, rc->rc_abd); 3334a0b956f5SMartin Matuska rc->rc_force_repair = 1; 3335a0b956f5SMartin Matuska rc->rc_error = 0; 3336a0b956f5SMartin Matuska } 33377877fdebSMatt Macy 3338a0b956f5SMartin Matuska if (rc->rc_error) { 33397877fdebSMatt Macy if (c < rr->rr_firstdatacol) 33407877fdebSMatt Macy parity_errors++; 33417877fdebSMatt Macy else 33427877fdebSMatt Macy data_errors++; 33437877fdebSMatt Macy 33447877fdebSMatt Macy total_errors++; 33457877fdebSMatt Macy } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { 33467877fdebSMatt Macy parity_untried++; 33477877fdebSMatt Macy } 33487877fdebSMatt Macy } 3349eda14cbcSMatt Macy 3350eda14cbcSMatt Macy /* 33517877fdebSMatt Macy * If there were data errors and the number of errors we saw was 33527877fdebSMatt Macy * correctable -- less than or equal to the number of parity disks read 33537877fdebSMatt Macy * -- reconstruct based on the missing data. 3354eda14cbcSMatt Macy */ 33557877fdebSMatt Macy if (data_errors != 0 && 33567877fdebSMatt Macy total_errors <= rr->rr_firstdatacol - parity_untried) { 3357eda14cbcSMatt Macy /* 3358eda14cbcSMatt Macy * We either attempt to read all the parity columns or 3359eda14cbcSMatt Macy * none of them. 
If we didn't try to read parity, we 3360eda14cbcSMatt Macy * wouldn't be here in the correctable case. There must 3361eda14cbcSMatt Macy * also have been fewer parity errors than parity 3362eda14cbcSMatt Macy * columns or, again, we wouldn't be in this code path. 3363eda14cbcSMatt Macy */ 3364eda14cbcSMatt Macy ASSERT(parity_untried == 0); 33657877fdebSMatt Macy ASSERT(parity_errors < rr->rr_firstdatacol); 3366eda14cbcSMatt Macy 3367eda14cbcSMatt Macy /* 3368eda14cbcSMatt Macy * Identify the data columns that reported an error. 3369eda14cbcSMatt Macy */ 33707877fdebSMatt Macy int n = 0; 33717877fdebSMatt Macy int tgts[VDEV_RAIDZ_MAXPARITY]; 33727877fdebSMatt Macy for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 33737877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 3374eda14cbcSMatt Macy if (rc->rc_error != 0) { 3375eda14cbcSMatt Macy ASSERT(n < VDEV_RAIDZ_MAXPARITY); 3376eda14cbcSMatt Macy tgts[n++] = c; 3377eda14cbcSMatt Macy } 3378eda14cbcSMatt Macy } 3379eda14cbcSMatt Macy 33807877fdebSMatt Macy ASSERT(rr->rr_firstdatacol >= n); 3381eda14cbcSMatt Macy 3382f9693befSMartin Matuska vdev_raidz_reconstruct_row(rm, rr, tgts, n); 3383eda14cbcSMatt Macy } 3384eda14cbcSMatt Macy } 3385eda14cbcSMatt Macy 3386eda14cbcSMatt Macy /* 33877877fdebSMatt Macy * Return the number of reads issued. 3388eda14cbcSMatt Macy */ 33897877fdebSMatt Macy static int 33907877fdebSMatt Macy vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr) 33917877fdebSMatt Macy { 33927877fdebSMatt Macy vdev_t *vd = zio->io_vd; 33937877fdebSMatt Macy int nread = 0; 3394eda14cbcSMatt Macy 33957877fdebSMatt Macy rr->rr_missingdata = 0; 33967877fdebSMatt Macy rr->rr_missingparity = 0; 33977877fdebSMatt Macy 33987877fdebSMatt Macy /* 33997877fdebSMatt Macy * If this rows contains empty sectors which are not required 34007877fdebSMatt Macy * for a normal read then allocate an ABD for them now so they 34017877fdebSMatt Macy * may be read, verified, and any needed repairs performed. 34027877fdebSMatt Macy */ 3403e716630dSMartin Matuska if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL) 34047877fdebSMatt Macy vdev_draid_map_alloc_empty(zio, rr); 34057877fdebSMatt Macy 34067877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 34077877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 34087877fdebSMatt Macy if (rc->rc_tried || rc->rc_size == 0) 3409eda14cbcSMatt Macy continue; 3410eda14cbcSMatt Macy 3411eda14cbcSMatt Macy zio_nowait(zio_vdev_child_io(zio, NULL, 3412eda14cbcSMatt Macy vd->vdev_child[rc->rc_devidx], 3413eda14cbcSMatt Macy rc->rc_offset, rc->rc_abd, rc->rc_size, 3414eda14cbcSMatt Macy zio->io_type, zio->io_priority, 0, 3415eda14cbcSMatt Macy vdev_raidz_child_done, rc)); 34167877fdebSMatt Macy nread++; 34177877fdebSMatt Macy } 34187877fdebSMatt Macy return (nread); 3419eda14cbcSMatt Macy } 3420eda14cbcSMatt Macy 3421eda14cbcSMatt Macy /* 34227877fdebSMatt Macy * We're here because either there were too many errors to even attempt 34237877fdebSMatt Macy * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec() 34247877fdebSMatt Macy * failed. In either case, there is enough bad data to prevent reconstruction. 34257877fdebSMatt Macy * Start checksum ereports for all children which haven't failed. 
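 * (The children that did not report an error returned data or parity that
 * could not be combined into a checksum-valid block, and we cannot tell
 * which of them is actually at fault, so a checksum error is reported
 * against each of them.)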
3426eda14cbcSMatt Macy */ 34277877fdebSMatt Macy static void 34287877fdebSMatt Macy vdev_raidz_io_done_unrecoverable(zio_t *zio) 34297877fdebSMatt Macy { 34307877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd; 3431eda14cbcSMatt Macy 34327877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 34337877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i]; 3434eda14cbcSMatt Macy 34357877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 34367877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 34377877fdebSMatt Macy vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; 34387877fdebSMatt Macy 34392c48331dSMatt Macy if (rc->rc_error != 0) 34402c48331dSMatt Macy continue; 34412c48331dSMatt Macy 3442eda14cbcSMatt Macy zio_bad_cksum_t zbc; 3443eda14cbcSMatt Macy zbc.zbc_has_cksum = 0; 34442c48331dSMatt Macy zbc.zbc_injected = rm->rm_ecksuminjected; 3445eda14cbcSMatt Macy mutex_enter(&cvd->vdev_stat_lock); 3446eda14cbcSMatt Macy cvd->vdev_stat.vs_checksum_errors++; 3447eda14cbcSMatt Macy mutex_exit(&cvd->vdev_stat_lock); 3448bb2d13b6SMartin Matuska (void) zfs_ereport_start_checksum(zio->io_spa, 3449bb2d13b6SMartin Matuska cvd, &zio->io_bookmark, zio, rc->rc_offset, 3450bb2d13b6SMartin Matuska rc->rc_size, &zbc); 3451eda14cbcSMatt Macy } 3452eda14cbcSMatt Macy } 3453eda14cbcSMatt Macy } 3454eda14cbcSMatt Macy 34557877fdebSMatt Macy void 34567877fdebSMatt Macy vdev_raidz_io_done(zio_t *zio) 34577877fdebSMatt Macy { 34587877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd; 34597877fdebSMatt Macy 3460e716630dSMartin Matuska ASSERT(zio->io_bp != NULL); 34617877fdebSMatt Macy if (zio->io_type == ZIO_TYPE_WRITE) { 34627877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 34637877fdebSMatt Macy vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]); 34647877fdebSMatt Macy } 34657877fdebSMatt Macy } else { 3466e716630dSMartin Matuska if (rm->rm_phys_col) { 3467e716630dSMartin Matuska /* 3468e716630dSMartin Matuska * This is an aggregated read. Copy the data and status 3469e716630dSMartin Matuska * from the aggregate abd's to the individual rows. 3470e716630dSMartin Matuska */ 3471e716630dSMartin Matuska for (int i = 0; i < rm->rm_nrows; i++) { 3472e716630dSMartin Matuska raidz_row_t *rr = rm->rm_row[i]; 3473e716630dSMartin Matuska 3474e716630dSMartin Matuska for (int c = 0; c < rr->rr_cols; c++) { 3475e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c]; 3476e716630dSMartin Matuska if (rc->rc_tried || rc->rc_size == 0) 3477e716630dSMartin Matuska continue; 3478e716630dSMartin Matuska 3479e716630dSMartin Matuska raidz_col_t *prc = 3480e716630dSMartin Matuska &rm->rm_phys_col[rc->rc_devidx]; 3481e716630dSMartin Matuska rc->rc_error = prc->rc_error; 3482e716630dSMartin Matuska rc->rc_tried = prc->rc_tried; 3483e716630dSMartin Matuska rc->rc_skipped = prc->rc_skipped; 3484e716630dSMartin Matuska if (c >= rr->rr_firstdatacol) { 3485e716630dSMartin Matuska /* 3486e716630dSMartin Matuska * Note: this is slightly faster 3487e716630dSMartin Matuska * than using abd_copy_off(). 
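 * (A roughly equivalent call, for reference, would be
 * abd_copy_off(rc->rc_abd, prc->rc_abd, 0,
 *     rc->rc_offset - prc->rc_offset, rc->rc_size);
 * indexing the linear aggregate buffer directly just skips the
 * per-chunk ABD iteration.)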
3488e716630dSMartin Matuska */ 3489e716630dSMartin Matuska char *physbuf = abd_to_buf( 3490e716630dSMartin Matuska prc->rc_abd); 3491e716630dSMartin Matuska void *physloc = physbuf + 3492e716630dSMartin Matuska rc->rc_offset - 3493e716630dSMartin Matuska prc->rc_offset; 3494e716630dSMartin Matuska 3495e716630dSMartin Matuska abd_copy_from_buf(rc->rc_abd, 3496e716630dSMartin Matuska physloc, rc->rc_size); 3497e716630dSMartin Matuska } 3498e716630dSMartin Matuska } 3499e716630dSMartin Matuska } 3500e716630dSMartin Matuska } 3501e716630dSMartin Matuska 35027877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 35037877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i]; 35047877fdebSMatt Macy vdev_raidz_io_done_reconstruct_known_missing(zio, 35057877fdebSMatt Macy rm, rr); 35067877fdebSMatt Macy } 35077877fdebSMatt Macy 35087877fdebSMatt Macy if (raidz_checksum_verify(zio) == 0) { 350987bf66d4SMartin Matuska if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) 351087bf66d4SMartin Matuska goto done; 351187bf66d4SMartin Matuska 35127877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 35137877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i]; 35147877fdebSMatt Macy vdev_raidz_io_done_verified(zio, rr); 35157877fdebSMatt Macy } 3516eda14cbcSMatt Macy zio_checksum_verified(zio); 35177877fdebSMatt Macy } else { 3518eda14cbcSMatt Macy /* 35197877fdebSMatt Macy * A sequential resilver has no checksum which makes 35207877fdebSMatt Macy * combinatoral reconstruction impossible. This code 35217877fdebSMatt Macy * path is unreachable since raidz_checksum_verify() 35227877fdebSMatt Macy * has no checksum to verify and must succeed. 3523eda14cbcSMatt Macy */ 35247877fdebSMatt Macy ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD); 3525eda14cbcSMatt Macy 35267877fdebSMatt Macy /* 35277877fdebSMatt Macy * This isn't a typical situation -- either we got a 35287877fdebSMatt Macy * read error or a child silently returned bad data. 35297877fdebSMatt Macy * Read every block so we can try again with as much 35307877fdebSMatt Macy * data and parity as we can track down. If we've 35317877fdebSMatt Macy * already been through once before, all children will 35327877fdebSMatt Macy * be marked as tried so we'll proceed to combinatorial 35337877fdebSMatt Macy * reconstruction. 35347877fdebSMatt Macy */ 35357877fdebSMatt Macy int nread = 0; 35367877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 35377877fdebSMatt Macy nread += vdev_raidz_read_all(zio, 35387877fdebSMatt Macy rm->rm_row[i]); 35397877fdebSMatt Macy } 35407877fdebSMatt Macy if (nread != 0) { 35417877fdebSMatt Macy /* 35427877fdebSMatt Macy * Normally our stage is VDEV_IO_DONE, but if 35437877fdebSMatt Macy * we've already called redone(), it will have 35447877fdebSMatt Macy * changed to VDEV_IO_START, in which case we 35457877fdebSMatt Macy * don't want to call redone() again. 35467877fdebSMatt Macy */ 35477877fdebSMatt Macy if (zio->io_stage != ZIO_STAGE_VDEV_IO_START) 35487877fdebSMatt Macy zio_vdev_io_redone(zio); 35497877fdebSMatt Macy return; 35507877fdebSMatt Macy } 3551e716630dSMartin Matuska /* 3552e716630dSMartin Matuska * It would be too expensive to try every possible 3553e716630dSMartin Matuska * combination of failed sectors in every row, so 3554e716630dSMartin Matuska * instead we try every combination of failed current or 3555e716630dSMartin Matuska * past physical disk. 
This means that if the incorrect
3556e716630dSMartin Matuska * sectors were all on Nparity disks at any point in the
3557e716630dSMartin Matuska * past, we will find the correct data. The only known
3558e716630dSMartin Matuska * case where this is less durable than a non-expanded
3559e716630dSMartin Matuska * RAIDZ is if we have a silent failure during
3560e716630dSMartin Matuska * expansion. In that case, one block could be
3561e716630dSMartin Matuska * partially in the old format and partially in the
3562e716630dSMartin Matuska * new format, so we'd lose some sectors from the old
3563e716630dSMartin Matuska * format and some from the new format.
3564e716630dSMartin Matuska *
3565e716630dSMartin Matuska * e.g. logical_width=4 physical_width=6
3566e716630dSMartin Matuska * the 15 (6+5+4) possible failed disks are:
3567e716630dSMartin Matuska * width=6 child=0
3568e716630dSMartin Matuska * width=6 child=1
3569e716630dSMartin Matuska * width=6 child=2
3570e716630dSMartin Matuska * width=6 child=3
3571e716630dSMartin Matuska * width=6 child=4
3572e716630dSMartin Matuska * width=6 child=5
3573e716630dSMartin Matuska * width=5 child=0
3574e716630dSMartin Matuska * width=5 child=1
3575e716630dSMartin Matuska * width=5 child=2
3576e716630dSMartin Matuska * width=5 child=3
3577e716630dSMartin Matuska * width=5 child=4
3578e716630dSMartin Matuska * width=4 child=0
3579e716630dSMartin Matuska * width=4 child=1
3580e716630dSMartin Matuska * width=4 child=2
3581e716630dSMartin Matuska * width=4 child=3
3582e716630dSMartin Matuska * And we will try every combination of Nparity of these
3583e716630dSMartin Matuska * failing.
3584e716630dSMartin Matuska *
3585e716630dSMartin Matuska * As a first pass, we can generate every combo,
3586e716630dSMartin Matuska * and try reconstructing, ignoring any known
3587e716630dSMartin Matuska * failures. If any row has too many known + simulated
3588e716630dSMartin Matuska * failures, then we bail on reconstructing with this
3589e716630dSMartin Matuska * number of simulated failures. As an improvement,
3590e716630dSMartin Matuska * we could detect the number of whole known failures
3591e716630dSMartin Matuska * (i.e. we have known failures on these disks for
3592e716630dSMartin Matuska * every row; the disks never succeeded), and
3593e716630dSMartin Matuska * subtract that from the max # failures to simulate.
3594e716630dSMartin Matuska * We could go even further like the current
3595e716630dSMartin Matuska * combrec code, but that doesn't seem like it
3596e716630dSMartin Matuska * gains us very much. If we simulate a failure
3597e716630dSMartin Matuska * that is also a known failure, that's fine.
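 * (For example, with double parity the 15 candidates above yield at
 * most C(15,2) = 105 simulated-failure combinations to try on this
 * first pass; triple parity would yield C(15,3) = 455.)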
3598e716630dSMartin Matuska */ 35997877fdebSMatt Macy zio->io_error = vdev_raidz_combrec(zio); 36007877fdebSMatt Macy if (zio->io_error == ECKSUM && 36017877fdebSMatt Macy !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 36027877fdebSMatt Macy vdev_raidz_io_done_unrecoverable(zio); 36037877fdebSMatt Macy } 3604eda14cbcSMatt Macy } 3605eda14cbcSMatt Macy } 360687bf66d4SMartin Matuska done: 3607e716630dSMartin Matuska if (rm->rm_lr != NULL) { 3608e716630dSMartin Matuska zfs_rangelock_exit(rm->rm_lr); 3609e716630dSMartin Matuska rm->rm_lr = NULL; 3610e716630dSMartin Matuska } 3611eda14cbcSMatt Macy } 3612eda14cbcSMatt Macy 3613eda14cbcSMatt Macy static void 3614eda14cbcSMatt Macy vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 3615eda14cbcSMatt Macy { 36167877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd; 36177877fdebSMatt Macy if (faulted > vdrz->vd_nparity) 3618eda14cbcSMatt Macy vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 3619eda14cbcSMatt Macy VDEV_AUX_NO_REPLICAS); 3620eda14cbcSMatt Macy else if (degraded + faulted != 0) 3621eda14cbcSMatt Macy vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 3622eda14cbcSMatt Macy else 3623eda14cbcSMatt Macy vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 3624eda14cbcSMatt Macy } 3625eda14cbcSMatt Macy 3626eda14cbcSMatt Macy /* 3627eda14cbcSMatt Macy * Determine if any portion of the provided block resides on a child vdev 3628eda14cbcSMatt Macy * with a dirty DTL and therefore needs to be resilvered. The function 3629eda14cbcSMatt Macy * assumes that at least one DTL is dirty which implies that full stripe 3630eda14cbcSMatt Macy * width blocks must be resilvered. 3631eda14cbcSMatt Macy */ 3632eda14cbcSMatt Macy static boolean_t 36337877fdebSMatt Macy vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, 36347877fdebSMatt Macy uint64_t phys_birth) 3635eda14cbcSMatt Macy { 36367877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd; 3637e716630dSMartin Matuska 3638e716630dSMartin Matuska /* 3639e716630dSMartin Matuska * If we're in the middle of a RAIDZ expansion, this block may be in 3640e716630dSMartin Matuska * the old and/or new location. For simplicity, always resilver it. 3641e716630dSMartin Matuska */ 3642e716630dSMartin Matuska if (vdrz->vn_vre.vre_state == DSS_SCANNING) 3643e716630dSMartin Matuska return (B_TRUE); 3644e716630dSMartin Matuska 3645eda14cbcSMatt Macy uint64_t dcols = vd->vdev_children; 36467877fdebSMatt Macy uint64_t nparity = vdrz->vd_nparity; 3647eda14cbcSMatt Macy uint64_t ashift = vd->vdev_top->vdev_ashift; 3648eda14cbcSMatt Macy /* The starting RAIDZ (parent) vdev sector of the block. */ 36497877fdebSMatt Macy uint64_t b = DVA_GET_OFFSET(dva) >> ashift; 3650eda14cbcSMatt Macy /* The zio's size in units of the vdev's minimum sector size. */ 3651eda14cbcSMatt Macy uint64_t s = ((psize - 1) >> ashift) + 1; 3652eda14cbcSMatt Macy /* The first column for this stripe. */ 3653eda14cbcSMatt Macy uint64_t f = b % dcols; 3654eda14cbcSMatt Macy 36557877fdebSMatt Macy /* Unreachable by sequential resilver. 
*/ 36567877fdebSMatt Macy ASSERT3U(phys_birth, !=, TXG_UNKNOWN); 36577877fdebSMatt Macy 36587877fdebSMatt Macy if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) 36597877fdebSMatt Macy return (B_FALSE); 36607877fdebSMatt Macy 3661eda14cbcSMatt Macy if (s + nparity >= dcols) 3662eda14cbcSMatt Macy return (B_TRUE); 3663eda14cbcSMatt Macy 3664eda14cbcSMatt Macy for (uint64_t c = 0; c < s + nparity; c++) { 3665eda14cbcSMatt Macy uint64_t devidx = (f + c) % dcols; 3666eda14cbcSMatt Macy vdev_t *cvd = vd->vdev_child[devidx]; 3667eda14cbcSMatt Macy 3668eda14cbcSMatt Macy /* 3669eda14cbcSMatt Macy * dsl_scan_need_resilver() already checked vd with 3670eda14cbcSMatt Macy * vdev_dtl_contains(). So here just check cvd with 3671eda14cbcSMatt Macy * vdev_dtl_empty(), cheaper and a good approximation. 3672eda14cbcSMatt Macy */ 3673eda14cbcSMatt Macy if (!vdev_dtl_empty(cvd, DTL_PARTIAL)) 3674eda14cbcSMatt Macy return (B_TRUE); 3675eda14cbcSMatt Macy } 3676eda14cbcSMatt Macy 3677eda14cbcSMatt Macy return (B_FALSE); 3678eda14cbcSMatt Macy } 3679eda14cbcSMatt Macy 3680eda14cbcSMatt Macy static void 3681b59a0cdeSMartin Matuska vdev_raidz_xlate(vdev_t *cvd, const zfs_range_seg64_t *logical_rs, 3682b59a0cdeSMartin Matuska zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs) 3683eda14cbcSMatt Macy { 3684e92ffd9bSMartin Matuska (void) remain_rs; 3685e92ffd9bSMartin Matuska 3686eda14cbcSMatt Macy vdev_t *raidvd = cvd->vdev_parent; 3687eda14cbcSMatt Macy ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); 3688eda14cbcSMatt Macy 3689e716630dSMartin Matuska vdev_raidz_t *vdrz = raidvd->vdev_tsd; 3690e716630dSMartin Matuska 3691e716630dSMartin Matuska if (vdrz->vn_vre.vre_state == DSS_SCANNING) { 3692e716630dSMartin Matuska /* 3693e716630dSMartin Matuska * We're in the middle of expansion, in which case the 3694e716630dSMartin Matuska * translation is in flux. Any answer we give may be wrong 3695e716630dSMartin Matuska * by the time we return, so it isn't safe for the caller to 3696e716630dSMartin Matuska * act on it. Therefore we say that this range isn't present 3697e716630dSMartin Matuska * on any children. The only consumers of this are "zpool 3698e716630dSMartin Matuska * initialize" and trimming, both of which are "best effort" 3699e716630dSMartin Matuska * anyway. 
3700e716630dSMartin Matuska */ 3701e716630dSMartin Matuska physical_rs->rs_start = physical_rs->rs_end = 0; 3702e716630dSMartin Matuska remain_rs->rs_start = remain_rs->rs_end = 0; 3703e716630dSMartin Matuska return; 3704e716630dSMartin Matuska } 3705e716630dSMartin Matuska 3706e716630dSMartin Matuska uint64_t width = vdrz->vd_physical_width; 3707eda14cbcSMatt Macy uint64_t tgt_col = cvd->vdev_id; 3708eda14cbcSMatt Macy uint64_t ashift = raidvd->vdev_top->vdev_ashift; 3709eda14cbcSMatt Macy 3710eda14cbcSMatt Macy /* make sure the offsets are block-aligned */ 37117877fdebSMatt Macy ASSERT0(logical_rs->rs_start % (1 << ashift)); 37127877fdebSMatt Macy ASSERT0(logical_rs->rs_end % (1 << ashift)); 37137877fdebSMatt Macy uint64_t b_start = logical_rs->rs_start >> ashift; 37147877fdebSMatt Macy uint64_t b_end = logical_rs->rs_end >> ashift; 3715eda14cbcSMatt Macy 3716eda14cbcSMatt Macy uint64_t start_row = 0; 3717eda14cbcSMatt Macy if (b_start > tgt_col) /* avoid underflow */ 3718eda14cbcSMatt Macy start_row = ((b_start - tgt_col - 1) / width) + 1; 3719eda14cbcSMatt Macy 3720eda14cbcSMatt Macy uint64_t end_row = 0; 3721eda14cbcSMatt Macy if (b_end > tgt_col) 3722eda14cbcSMatt Macy end_row = ((b_end - tgt_col - 1) / width) + 1; 3723eda14cbcSMatt Macy 37247877fdebSMatt Macy physical_rs->rs_start = start_row << ashift; 37257877fdebSMatt Macy physical_rs->rs_end = end_row << ashift; 3726eda14cbcSMatt Macy 37277877fdebSMatt Macy ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start); 37287877fdebSMatt Macy ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=, 37297877fdebSMatt Macy logical_rs->rs_end - logical_rs->rs_start); 37307877fdebSMatt Macy } 37317877fdebSMatt Macy 3732e716630dSMartin Matuska static void 3733e716630dSMartin Matuska raidz_reflow_sync(void *arg, dmu_tx_t *tx) 3734e716630dSMartin Matuska { 3735e716630dSMartin Matuska spa_t *spa = arg; 3736e716630dSMartin Matuska int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 3737e716630dSMartin Matuska vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 3738e716630dSMartin Matuska 3739e716630dSMartin Matuska /* 3740e716630dSMartin Matuska * Ensure there are no i/os to the range that is being committed. 3741e716630dSMartin Matuska */ 3742e716630dSMartin Matuska uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock); 3743e716630dSMartin Matuska ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset); 3744e716630dSMartin Matuska 3745e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 3746e716630dSMartin Matuska uint64_t new_offset = 3747e716630dSMartin Matuska MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset); 3748e716630dSMartin Matuska /* 3749e716630dSMartin Matuska * We should not have committed anything that failed. 3750e716630dSMartin Matuska */ 3751e716630dSMartin Matuska VERIFY3U(vre->vre_failed_offset, >=, old_offset); 3752e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 3753e716630dSMartin Matuska 3754e716630dSMartin Matuska zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, 3755e716630dSMartin Matuska old_offset, new_offset - old_offset, 3756e716630dSMartin Matuska RL_WRITER); 3757e716630dSMartin Matuska 3758e716630dSMartin Matuska /* 3759e716630dSMartin Matuska * Update the uberblock that will be written when this txg completes. 
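 * (RAIDZ_REFLOW_SET() packs the scratch-area state together with the
 * offset into ub_raidz_reflow_info; see the RRSS_* accessors declared
 * alongside the uberblock.)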
3760e716630dSMartin Matuska */ 3761e716630dSMartin Matuska RAIDZ_REFLOW_SET(&spa->spa_uberblock, 3762e716630dSMartin Matuska RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset); 3763e716630dSMartin Matuska vre->vre_offset_pertxg[txgoff] = 0; 3764e716630dSMartin Matuska zfs_rangelock_exit(lr); 3765e716630dSMartin Matuska 3766e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 3767e716630dSMartin Matuska vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff]; 3768e716630dSMartin Matuska vre->vre_bytes_copied_pertxg[txgoff] = 0; 3769e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 3770e716630dSMartin Matuska 3771e716630dSMartin Matuska vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); 3772e716630dSMartin Matuska VERIFY0(zap_update(spa->spa_meta_objset, 3773e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, 3774e716630dSMartin Matuska sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx)); 3775e716630dSMartin Matuska } 3776e716630dSMartin Matuska 3777e716630dSMartin Matuska static void 3778e716630dSMartin Matuska raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx) 3779e716630dSMartin Matuska { 3780e716630dSMartin Matuska spa_t *spa = arg; 3781e716630dSMartin Matuska vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 3782e716630dSMartin Matuska vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 3783e716630dSMartin Matuska vdev_raidz_t *vdrz = raidvd->vdev_tsd; 3784e716630dSMartin Matuska 3785e716630dSMartin Matuska for (int i = 0; i < TXG_SIZE; i++) 3786e716630dSMartin Matuska VERIFY0(vre->vre_offset_pertxg[i]); 3787e716630dSMartin Matuska 3788e716630dSMartin Matuska reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); 3789e716630dSMartin Matuska re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES; 3790e716630dSMartin Matuska re->re_logical_width = vdrz->vd_physical_width; 3791e716630dSMartin Matuska mutex_enter(&vdrz->vd_expand_lock); 3792e716630dSMartin Matuska avl_add(&vdrz->vd_expand_txgs, re); 3793e716630dSMartin Matuska mutex_exit(&vdrz->vd_expand_lock); 3794e716630dSMartin Matuska 3795e716630dSMartin Matuska vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); 3796e716630dSMartin Matuska 3797e716630dSMartin Matuska /* 3798e716630dSMartin Matuska * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS 3799e716630dSMartin Matuska * will get written (based on vd_expand_txgs). 3800e716630dSMartin Matuska */ 3801e716630dSMartin Matuska vdev_config_dirty(vd); 3802e716630dSMartin Matuska 3803e716630dSMartin Matuska /* 3804e716630dSMartin Matuska * Before we change vre_state, the on-disk state must reflect that we 3805e716630dSMartin Matuska * have completed all copying, so that vdev_raidz_io_start() can use 3806e716630dSMartin Matuska * vre_state to determine if the reflow is in progress. See also the 3807e716630dSMartin Matuska * end of spa_raidz_expand_thread(). 
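 * (The VERIFY below checks that the offset already synced to disk covers
 * vdev_ms_count << vdev_ms_shift bytes, i.e. the entire metaslab-covered
 * region of the vdev, so every row has been copied.)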
3808e716630dSMartin Matuska */
3809e716630dSMartin Matuska VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==,
3810e716630dSMartin Matuska raidvd->vdev_ms_count << raidvd->vdev_ms_shift);
3811e716630dSMartin Matuska
3812e716630dSMartin Matuska vre->vre_end_time = gethrestime_sec();
3813e716630dSMartin Matuska vre->vre_state = DSS_FINISHED;
3814e716630dSMartin Matuska
3815e716630dSMartin Matuska uint64_t state = vre->vre_state;
3816e716630dSMartin Matuska VERIFY0(zap_update(spa->spa_meta_objset,
3817e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
3818e716630dSMartin Matuska sizeof (state), 1, &state, tx));
3819e716630dSMartin Matuska
3820e716630dSMartin Matuska uint64_t end_time = vre->vre_end_time;
3821e716630dSMartin Matuska VERIFY0(zap_update(spa->spa_meta_objset,
3822e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
3823e716630dSMartin Matuska sizeof (end_time), 1, &end_time, tx));
3824e716630dSMartin Matuska
3825e716630dSMartin Matuska spa->spa_uberblock.ub_raidz_reflow_info = 0;
3826e716630dSMartin Matuska
3827e716630dSMartin Matuska spa_history_log_internal(spa, "raidz vdev expansion completed", tx,
3828e716630dSMartin Matuska "%s vdev %llu new width %llu", spa_name(spa),
3829e716630dSMartin Matuska (unsigned long long)vd->vdev_id,
3830e716630dSMartin Matuska (unsigned long long)vd->vdev_children);
3831e716630dSMartin Matuska
3832e716630dSMartin Matuska spa->spa_raidz_expand = NULL;
3833e716630dSMartin Matuska raidvd->vdev_rz_expanding = B_FALSE;
3834e716630dSMartin Matuska
3835e716630dSMartin Matuska spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
3836e716630dSMartin Matuska spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
3837e716630dSMartin Matuska spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
3838e716630dSMartin Matuska
3839e716630dSMartin Matuska spa_notify_waiters(spa);
3840e716630dSMartin Matuska
3841e716630dSMartin Matuska /*
3842e716630dSMartin Matuska * While we're in syncing context take the opportunity to
3843e716630dSMartin Matuska * set up a scrub. All the data has been successfully copied
3844e716630dSMartin Matuska * but we have not validated any checksums.
3845e716630dSMartin Matuska */
384617aab35aSMartin Matuska setup_sync_arg_t setup_sync_arg = {
384717aab35aSMartin Matuska .func = POOL_SCAN_SCRUB,
384817aab35aSMartin Matuska .txgstart = 0,
384917aab35aSMartin Matuska .txgend = 0,
385017aab35aSMartin Matuska };
385117aab35aSMartin Matuska if (zfs_scrub_after_expand &&
385217aab35aSMartin Matuska dsl_scan_setup_check(&setup_sync_arg.func, tx) == 0) {
385317aab35aSMartin Matuska dsl_scan_setup_sync(&setup_sync_arg, tx);
385417aab35aSMartin Matuska }
3855e716630dSMartin Matuska }
3856e716630dSMartin Matuska
3857e716630dSMartin Matuska /*
385817aab35aSMartin Matuska * State of one copy batch.
3859e716630dSMartin Matuska */
3860e716630dSMartin Matuska typedef struct raidz_reflow_arg {
386117aab35aSMartin Matuska vdev_raidz_expand_t *rra_vre; /* Global expansion state. */
386217aab35aSMartin Matuska zfs_locked_range_t *rra_lr; /* Range lock of this batch. */
386317aab35aSMartin Matuska uint64_t rra_txg; /* TXG of this batch. */
386417aab35aSMartin Matuska uint_t rra_ashift; /* Ashift of the vdev. */
386517aab35aSMartin Matuska uint32_t rra_tbd; /* Number of in-flight ZIOs. */
386617aab35aSMartin Matuska uint32_t rra_writes; /* Number of write ZIOs. */
386717aab35aSMartin Matuska zio_t *rra_zio[]; /* Write ZIO pointers.
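 * (Flexible array; the struct is allocated with
 * sizeof (*rra) + sizeof (zio_t *) * rra_writes.)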
*/ 3868e716630dSMartin Matuska } raidz_reflow_arg_t; 3869e716630dSMartin Matuska 3870e716630dSMartin Matuska /* 387117aab35aSMartin Matuska * Write of the new location on one child is done. Once all of them are done 387217aab35aSMartin Matuska * we can unlock and free everything. 3873e716630dSMartin Matuska */ 3874e716630dSMartin Matuska static void 3875e716630dSMartin Matuska raidz_reflow_write_done(zio_t *zio) 3876e716630dSMartin Matuska { 3877e716630dSMartin Matuska raidz_reflow_arg_t *rra = zio->io_private; 3878e716630dSMartin Matuska vdev_raidz_expand_t *vre = rra->rra_vre; 3879e716630dSMartin Matuska 3880e716630dSMartin Matuska abd_free(zio->io_abd); 3881e716630dSMartin Matuska 3882e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 3883e716630dSMartin Matuska if (zio->io_error != 0) { 3884e716630dSMartin Matuska /* Force a reflow pause on errors */ 3885e716630dSMartin Matuska vre->vre_failed_offset = 3886e716630dSMartin Matuska MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); 3887e716630dSMartin Matuska } 3888e716630dSMartin Matuska ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size); 3889e716630dSMartin Matuska vre->vre_outstanding_bytes -= zio->io_size; 3890e716630dSMartin Matuska if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length < 3891e716630dSMartin Matuska vre->vre_failed_offset) { 3892e716630dSMartin Matuska vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] += 3893e716630dSMartin Matuska zio->io_size; 3894e716630dSMartin Matuska } 3895e716630dSMartin Matuska cv_signal(&vre->vre_cv); 389617aab35aSMartin Matuska boolean_t done = (--rra->rra_tbd == 0); 3897e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 3898e716630dSMartin Matuska 389917aab35aSMartin Matuska if (!done) 390017aab35aSMartin Matuska return; 3901e716630dSMartin Matuska spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa); 390217aab35aSMartin Matuska zfs_rangelock_exit(rra->rra_lr); 390317aab35aSMartin Matuska kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * rra->rra_writes); 3904e716630dSMartin Matuska } 3905e716630dSMartin Matuska 3906e716630dSMartin Matuska /* 390717aab35aSMartin Matuska * Read of the old location on one child is done. Once all of them are done 390817aab35aSMartin Matuska * writes should have all the data and we can issue them. 3909e716630dSMartin Matuska */ 3910e716630dSMartin Matuska static void 3911e716630dSMartin Matuska raidz_reflow_read_done(zio_t *zio) 3912e716630dSMartin Matuska { 3913e716630dSMartin Matuska raidz_reflow_arg_t *rra = zio->io_private; 3914e716630dSMartin Matuska vdev_raidz_expand_t *vre = rra->rra_vre; 3915e716630dSMartin Matuska 391617aab35aSMartin Matuska /* Reads of only one block use write ABDs. For bigger free gangs. */ 391717aab35aSMartin Matuska if (zio->io_size > (1 << rra->rra_ashift)) 391817aab35aSMartin Matuska abd_free(zio->io_abd); 391917aab35aSMartin Matuska 3920e716630dSMartin Matuska /* 3921e716630dSMartin Matuska * If the read failed, or if it was done on a vdev that is not fully 3922e716630dSMartin Matuska * healthy (e.g. a child that has a resilver in progress), we may not 3923e716630dSMartin Matuska * have the correct data. Note that it's OK if the write proceeds. 3924e716630dSMartin Matuska * It may write garbage but the location is otherwise unused and we 3925e716630dSMartin Matuska * will retry later due to vre_failed_offset. 
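 * (raidz_reflow_sync() never records progress past vre_failed_offset,
 * so after the pause the reflow resumes from before this region and
 * copies it again.)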
3926e716630dSMartin Matuska */
3927e716630dSMartin Matuska if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) {
3928e716630dSMartin Matuska zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu "
3929e716630dSMartin Matuska "err=%u partial_dtl_empty=%u missing_dtl_empty=%u",
3930e716630dSMartin Matuska (long long)rra->rra_lr->lr_offset,
3931e716630dSMartin Matuska (long long)rra->rra_lr->lr_length,
3932e716630dSMartin Matuska (long long)rra->rra_txg,
3933e716630dSMartin Matuska zio->io_error,
3934e716630dSMartin Matuska vdev_dtl_empty(zio->io_vd, DTL_PARTIAL),
3935e716630dSMartin Matuska vdev_dtl_empty(zio->io_vd, DTL_MISSING));
3936e716630dSMartin Matuska mutex_enter(&vre->vre_lock);
3937e716630dSMartin Matuska /* Force a reflow pause on errors */
3938e716630dSMartin Matuska vre->vre_failed_offset =
3939e716630dSMartin Matuska MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
3940e716630dSMartin Matuska mutex_exit(&vre->vre_lock);
3941e716630dSMartin Matuska }
3942e716630dSMartin Matuska
394317aab35aSMartin Matuska if (atomic_dec_32_nv(&rra->rra_tbd) > 0)
394417aab35aSMartin Matuska return;
3945dd215568SMartin Matuska uint32_t writes = rra->rra_tbd = rra->rra_writes;
3946dd215568SMartin Matuska for (uint64_t i = 0; i < writes; i++)
394717aab35aSMartin Matuska zio_nowait(rra->rra_zio[i]);
3948e716630dSMartin Matuska }
3949e716630dSMartin Matuska
3950e716630dSMartin Matuska static void
3951e716630dSMartin Matuska raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset,
3952e716630dSMartin Matuska dmu_tx_t *tx)
3953e716630dSMartin Matuska {
3954e716630dSMartin Matuska int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
3955e716630dSMartin Matuska spa_t *spa = dmu_tx_pool(tx)->dp_spa;
3956e716630dSMartin Matuska
3957e716630dSMartin Matuska if (offset == 0)
3958e716630dSMartin Matuska return;
3959e716630dSMartin Matuska
3960e716630dSMartin Matuska mutex_enter(&vre->vre_lock);
3961e716630dSMartin Matuska ASSERT3U(vre->vre_offset, <=, offset);
3962e716630dSMartin Matuska vre->vre_offset = offset;
3963e716630dSMartin Matuska mutex_exit(&vre->vre_lock);
3964e716630dSMartin Matuska
3965e716630dSMartin Matuska if (vre->vre_offset_pertxg[txgoff] == 0) {
3966e716630dSMartin Matuska dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync,
3967e716630dSMartin Matuska spa, tx);
3968e716630dSMartin Matuska }
3969e716630dSMartin Matuska vre->vre_offset_pertxg[txgoff] = offset;
3970e716630dSMartin Matuska }
3971e716630dSMartin Matuska
3972e716630dSMartin Matuska static boolean_t
3973e716630dSMartin Matuska vdev_raidz_expand_child_replacing(vdev_t *raidz_vd)
3974e716630dSMartin Matuska {
3975e716630dSMartin Matuska for (int i = 0; i < raidz_vd->vdev_children; i++) {
3976e716630dSMartin Matuska /* Quick check if a child is being replaced */
3977e716630dSMartin Matuska if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf)
3978e716630dSMartin Matuska return (B_TRUE);
3979e716630dSMartin Matuska }
3980e716630dSMartin Matuska return (B_FALSE);
3981e716630dSMartin Matuska }
3982e716630dSMartin Matuska
3983e716630dSMartin Matuska static boolean_t
3984b59a0cdeSMartin Matuska raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, zfs_range_tree_t *rt,
3985e716630dSMartin Matuska dmu_tx_t *tx)
3986e716630dSMartin Matuska {
3987e716630dSMartin Matuska spa_t *spa = vd->vdev_spa;
398817aab35aSMartin Matuska uint_t ashift = vd->vdev_top->vdev_ashift;
3989e716630dSMartin Matuska
3990b59a0cdeSMartin Matuska zfs_range_seg_t *rs = zfs_range_tree_first(rt);
399117aab35aSMartin Matuska if (rs ==
NULL) 3992e716630dSMartin Matuska return (B_FALSE); 3993b59a0cdeSMartin Matuska uint64_t offset = zfs_rs_get_start(rs, rt); 3994e716630dSMartin Matuska ASSERT(IS_P2ALIGNED(offset, 1 << ashift)); 3995b59a0cdeSMartin Matuska uint64_t size = zfs_rs_get_end(rs, rt) - offset; 3996e716630dSMartin Matuska ASSERT3U(size, >=, 1 << ashift); 399717aab35aSMartin Matuska ASSERT(IS_P2ALIGNED(size, 1 << ashift)); 3998e716630dSMartin Matuska 3999e716630dSMartin Matuska uint64_t blkid = offset >> ashift; 400017aab35aSMartin Matuska uint_t old_children = vd->vdev_children - 1; 4001e716630dSMartin Matuska 4002e716630dSMartin Matuska /* 4003e716630dSMartin Matuska * We can only progress to the point that writes will not overlap 4004e716630dSMartin Matuska * with blocks whose progress has not yet been recorded on disk. 4005e716630dSMartin Matuska * Since partially-copied rows are still read from the old location, 4006e716630dSMartin Matuska * we need to stop one row before the sector-wise overlap, to prevent 4007e716630dSMartin Matuska * row-wise overlap. 4008e716630dSMartin Matuska * 4009e716630dSMartin Matuska * Note that even if we are skipping over a large unallocated region, 4010e716630dSMartin Matuska * we can't move the on-disk progress to `offset`, because concurrent 4011e716630dSMartin Matuska * writes/allocations could still use the currently-unallocated 4012e716630dSMartin Matuska * region. 4013e716630dSMartin Matuska */ 4014e716630dSMartin Matuska uint64_t ubsync_blkid = 4015e716630dSMartin Matuska RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift; 4016e716630dSMartin Matuska uint64_t next_overwrite_blkid = ubsync_blkid + 4017e716630dSMartin Matuska ubsync_blkid / old_children - old_children; 4018e716630dSMartin Matuska VERIFY3U(next_overwrite_blkid, >, ubsync_blkid); 4019e716630dSMartin Matuska if (blkid >= next_overwrite_blkid) { 4020e716630dSMartin Matuska raidz_reflow_record_progress(vre, 4021e716630dSMartin Matuska next_overwrite_blkid << ashift, tx); 4022e716630dSMartin Matuska return (B_TRUE); 4023e716630dSMartin Matuska } 4024e716630dSMartin Matuska 402517aab35aSMartin Matuska size = MIN(size, raidz_expand_max_copy_bytes); 402617aab35aSMartin Matuska size = MIN(size, (uint64_t)old_children * 402717aab35aSMartin Matuska MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE)); 402817aab35aSMartin Matuska size = MAX(size, 1 << ashift); 402917aab35aSMartin Matuska uint_t blocks = MIN(size >> ashift, next_overwrite_blkid - blkid); 403017aab35aSMartin Matuska size = (uint64_t)blocks << ashift; 4031e716630dSMartin Matuska 4032b59a0cdeSMartin Matuska zfs_range_tree_remove(rt, offset, size); 403317aab35aSMartin Matuska 403417aab35aSMartin Matuska uint_t reads = MIN(blocks, old_children); 403517aab35aSMartin Matuska uint_t writes = MIN(blocks, vd->vdev_children); 403617aab35aSMartin Matuska raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra) + 403717aab35aSMartin Matuska sizeof (zio_t *) * writes, KM_SLEEP); 4038e716630dSMartin Matuska rra->rra_vre = vre; 4039e716630dSMartin Matuska rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock, 404017aab35aSMartin Matuska offset, size, RL_WRITER); 4041e716630dSMartin Matuska rra->rra_txg = dmu_tx_get_txg(tx); 404217aab35aSMartin Matuska rra->rra_ashift = ashift; 404317aab35aSMartin Matuska rra->rra_tbd = reads; 404417aab35aSMartin Matuska rra->rra_writes = writes; 4045e716630dSMartin Matuska 404617aab35aSMartin Matuska raidz_reflow_record_progress(vre, offset + size, tx); 4047e716630dSMartin Matuska 4048e716630dSMartin Matuska /* 4049e716630dSMartin Matuska * SCL_STATE will be 
released when the read and write are done, 4050e716630dSMartin Matuska * by raidz_reflow_write_done(). 4051e716630dSMartin Matuska */ 4052e716630dSMartin Matuska spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4053e716630dSMartin Matuska 4054e716630dSMartin Matuska /* check if a replacing vdev was added, if so treat it as an error */ 4055e716630dSMartin Matuska if (vdev_raidz_expand_child_replacing(vd)) { 4056e716630dSMartin Matuska zfs_dbgmsg("replacing vdev encountered, reflow paused at " 4057e716630dSMartin Matuska "offset=%llu txg=%llu", 4058e716630dSMartin Matuska (long long)rra->rra_lr->lr_offset, 4059e716630dSMartin Matuska (long long)rra->rra_txg); 4060e716630dSMartin Matuska 4061e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 4062e716630dSMartin Matuska vre->vre_failed_offset = 4063e716630dSMartin Matuska MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); 4064e716630dSMartin Matuska cv_signal(&vre->vre_cv); 4065e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 4066e716630dSMartin Matuska 4067e716630dSMartin Matuska /* drop everything we acquired */ 4068e716630dSMartin Matuska spa_config_exit(spa, SCL_STATE, spa); 406917aab35aSMartin Matuska zfs_rangelock_exit(rra->rra_lr); 407017aab35aSMartin Matuska kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * writes); 4071e716630dSMartin Matuska return (B_TRUE); 4072e716630dSMartin Matuska } 4073e716630dSMartin Matuska 407417aab35aSMartin Matuska mutex_enter(&vre->vre_lock); 407517aab35aSMartin Matuska vre->vre_outstanding_bytes += size; 407617aab35aSMartin Matuska mutex_exit(&vre->vre_lock); 4077e716630dSMartin Matuska 407817aab35aSMartin Matuska /* Allocate ABD and ZIO for each child we write. */ 407917aab35aSMartin Matuska int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 408017aab35aSMartin Matuska zio_t *pio = spa->spa_txg_zio[txgoff]; 408117aab35aSMartin Matuska uint_t b = blocks / vd->vdev_children; 408217aab35aSMartin Matuska uint_t bb = blocks % vd->vdev_children; 408317aab35aSMartin Matuska for (uint_t i = 0; i < writes; i++) { 408417aab35aSMartin Matuska uint_t n = b + (i < bb); 408517aab35aSMartin Matuska abd_t *abd = abd_alloc_for_io(n << ashift, B_FALSE); 408617aab35aSMartin Matuska rra->rra_zio[i] = zio_vdev_child_io(pio, NULL, 408717aab35aSMartin Matuska vd->vdev_child[(blkid + i) % vd->vdev_children], 408817aab35aSMartin Matuska ((blkid + i) / vd->vdev_children) << ashift, 408917aab35aSMartin Matuska abd, n << ashift, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, 409017aab35aSMartin Matuska ZIO_FLAG_CANFAIL, raidz_reflow_write_done, rra); 409117aab35aSMartin Matuska } 409217aab35aSMartin Matuska 409317aab35aSMartin Matuska /* 409417aab35aSMartin Matuska * Allocate and issue ZIO for each child we read. For reads of only 409517aab35aSMartin Matuska * one block we can use respective writer ABDs, since they will also 409617aab35aSMartin Matuska * have only one block. For bigger reads create gang ABDs and fill 409717aab35aSMartin Matuska * them with respective blocks from writer ABDs. 
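 * (For illustration, not taken from the code: with vdev_children = 4,
 * old_children = 3 and blocks = 8, read i = 0 covers batch blocks 0, 3
 * and 6; block 3 is found in write ABD 3 % 4 = 3 at offset 0, and block
 * 6 in write ABD 6 % 4 = 2 at offset 1 << ashift, matching the
 * j * old_children + i indexing below.)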
409817aab35aSMartin Matuska */ 409917aab35aSMartin Matuska b = blocks / old_children; 410017aab35aSMartin Matuska bb = blocks % old_children; 410117aab35aSMartin Matuska for (uint_t i = 0; i < reads; i++) { 410217aab35aSMartin Matuska uint_t n = b + (i < bb); 410317aab35aSMartin Matuska abd_t *abd; 410417aab35aSMartin Matuska if (n > 1) { 410517aab35aSMartin Matuska abd = abd_alloc_gang(); 410617aab35aSMartin Matuska for (uint_t j = 0; j < n; j++) { 410717aab35aSMartin Matuska uint_t b = j * old_children + i; 410817aab35aSMartin Matuska abd_t *cabd = abd_get_offset_size( 410917aab35aSMartin Matuska rra->rra_zio[b % vd->vdev_children]->io_abd, 411017aab35aSMartin Matuska (b / vd->vdev_children) << ashift, 411117aab35aSMartin Matuska 1 << ashift); 411217aab35aSMartin Matuska abd_gang_add(abd, cabd, B_TRUE); 411317aab35aSMartin Matuska } 411417aab35aSMartin Matuska } else { 411517aab35aSMartin Matuska abd = rra->rra_zio[i]->io_abd; 411617aab35aSMartin Matuska } 411717aab35aSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, 411817aab35aSMartin Matuska vd->vdev_child[(blkid + i) % old_children], 411917aab35aSMartin Matuska ((blkid + i) / old_children) << ashift, abd, 412017aab35aSMartin Matuska n << ashift, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 412117aab35aSMartin Matuska ZIO_FLAG_CANFAIL, raidz_reflow_read_done, rra)); 412217aab35aSMartin Matuska } 4123e716630dSMartin Matuska 4124e716630dSMartin Matuska return (B_FALSE); 4125e716630dSMartin Matuska } 4126e716630dSMartin Matuska 4127e716630dSMartin Matuska /* 4128e716630dSMartin Matuska * For testing (ztest specific) 4129e716630dSMartin Matuska */ 4130e716630dSMartin Matuska static void 4131e716630dSMartin Matuska raidz_expand_pause(uint_t pause_point) 4132e716630dSMartin Matuska { 4133e716630dSMartin Matuska while (raidz_expand_pause_point != 0 && 4134e716630dSMartin Matuska raidz_expand_pause_point <= pause_point) 4135e716630dSMartin Matuska delay(hz); 4136e716630dSMartin Matuska } 4137e716630dSMartin Matuska 4138e716630dSMartin Matuska static void 4139e716630dSMartin Matuska raidz_scratch_child_done(zio_t *zio) 4140e716630dSMartin Matuska { 4141e716630dSMartin Matuska zio_t *pio = zio->io_private; 4142e716630dSMartin Matuska 4143e716630dSMartin Matuska mutex_enter(&pio->io_lock); 4144e716630dSMartin Matuska pio->io_error = zio_worst_error(pio->io_error, zio->io_error); 4145e716630dSMartin Matuska mutex_exit(&pio->io_lock); 4146e716630dSMartin Matuska } 4147e716630dSMartin Matuska 4148e716630dSMartin Matuska /* 4149e716630dSMartin Matuska * Reflow the beginning portion of the vdev into an intermediate scratch area 4150e716630dSMartin Matuska * in memory and on disk. This operation must be persisted on disk before we 4151e716630dSMartin Matuska * proceed to overwrite the beginning portion with the reflowed data. 4152e716630dSMartin Matuska * 4153e716630dSMartin Matuska * This multi-step task can fail to complete if disk errors are encountered 4154e716630dSMartin Matuska * and we can return here after a pause (waiting for disk to become healthy). 
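 * (Roughly: read the first rows from their original location, reflow
 * them in memory, write the result to the boot-area scratch space on
 * every child, sync an uberblock marking the scratch copy valid, then
 * overwrite the original location and sync an uberblock marking the
 * scratch copy no longer needed.)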
4155e716630dSMartin Matuska */ 4156e716630dSMartin Matuska static void 4157e716630dSMartin Matuska raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) 4158e716630dSMartin Matuska { 4159e716630dSMartin Matuska vdev_raidz_expand_t *vre = arg; 4160e716630dSMartin Matuska spa_t *spa = dmu_tx_pool(tx)->dp_spa; 4161e716630dSMartin Matuska zio_t *pio; 4162e716630dSMartin Matuska int error; 4163e716630dSMartin Matuska 4164e716630dSMartin Matuska spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4165e716630dSMartin Matuska vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4166e716630dSMartin Matuska int ashift = raidvd->vdev_ashift; 4167aca928a5SMartin Matuska uint64_t write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << ashift, 4168aca928a5SMartin Matuska uint64_t); 4169e716630dSMartin Matuska uint64_t logical_size = write_size * raidvd->vdev_children; 4170e716630dSMartin Matuska uint64_t read_size = 4171e716630dSMartin Matuska P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)), 4172e716630dSMartin Matuska 1 << ashift); 4173e716630dSMartin Matuska 4174e716630dSMartin Matuska /* 4175e716630dSMartin Matuska * The scratch space must be large enough to get us to the point 4176e716630dSMartin Matuska * that one row does not overlap itself when moved. This is checked 4177e716630dSMartin Matuska * by vdev_raidz_attach_check(). 4178e716630dSMartin Matuska */ 4179e716630dSMartin Matuska VERIFY3U(write_size, >=, raidvd->vdev_children << ashift); 4180e716630dSMartin Matuska VERIFY3U(write_size, <=, VDEV_BOOT_SIZE); 4181e716630dSMartin Matuska VERIFY3U(write_size, <=, read_size); 4182e716630dSMartin Matuska 4183e716630dSMartin Matuska zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, 4184e716630dSMartin Matuska 0, logical_size, RL_WRITER); 4185e716630dSMartin Matuska 4186e716630dSMartin Matuska abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), 4187e716630dSMartin Matuska KM_SLEEP); 4188e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) { 4189e716630dSMartin Matuska abds[i] = abd_alloc_linear(read_size, B_FALSE); 4190e716630dSMartin Matuska } 4191e716630dSMartin Matuska 4192e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1); 4193e716630dSMartin Matuska 4194e716630dSMartin Matuska /* 4195e716630dSMartin Matuska * If we have already written the scratch area then we must read from 4196e716630dSMartin Matuska * there, since new writes were redirected there while we were paused 4197e716630dSMartin Matuska * or the original location may have been partially overwritten with 4198e716630dSMartin Matuska * reflowed data. 4199e716630dSMartin Matuska */ 4200e716630dSMartin Matuska if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) { 4201e716630dSMartin Matuska VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size); 4202e716630dSMartin Matuska /* 4203e716630dSMartin Matuska * Read from scratch space. 4204e716630dSMartin Matuska */ 4205e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4206e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) { 4207e716630dSMartin Matuska /* 4208e716630dSMartin Matuska * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE 4209e716630dSMartin Matuska * to the offset to calculate the physical offset to 4210e716630dSMartin Matuska * write to. Passing in a negative offset makes us 4211e716630dSMartin Matuska * access the scratch area. 
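 * (Assuming the usual label layout, the offset passed below is
 * VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE = -VDEV_BOOT_SIZE, so after
 * zio_vdev_child_io() adds VDEV_LABEL_START_SIZE back, the physical
 * offset is VDEV_BOOT_OFFSET, i.e. the boot area behind the two front
 * labels.)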
4212e716630dSMartin Matuska */ 4213e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, 4214e716630dSMartin Matuska raidvd->vdev_child[i], 4215e716630dSMartin Matuska VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], 421617aab35aSMartin Matuska write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 4217e716630dSMartin Matuska ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); 4218e716630dSMartin Matuska } 4219e716630dSMartin Matuska error = zio_wait(pio); 4220e716630dSMartin Matuska if (error != 0) { 4221e716630dSMartin Matuska zfs_dbgmsg("reflow: error %d reading scratch location", 4222e716630dSMartin Matuska error); 4223e716630dSMartin Matuska goto io_error_exit; 4224e716630dSMartin Matuska } 4225e716630dSMartin Matuska goto overwrite; 4226e716630dSMartin Matuska } 4227e716630dSMartin Matuska 4228e716630dSMartin Matuska /* 4229e716630dSMartin Matuska * Read from original location. 4230e716630dSMartin Matuska */ 4231e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4232e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children - 1; i++) { 4233e716630dSMartin Matuska ASSERT0(vdev_is_dead(raidvd->vdev_child[i])); 4234e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4235e716630dSMartin Matuska 0, abds[i], read_size, ZIO_TYPE_READ, 423617aab35aSMartin Matuska ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, 4237e716630dSMartin Matuska raidz_scratch_child_done, pio)); 4238e716630dSMartin Matuska } 4239e716630dSMartin Matuska error = zio_wait(pio); 4240e716630dSMartin Matuska if (error != 0) { 4241e716630dSMartin Matuska zfs_dbgmsg("reflow: error %d reading original location", error); 4242e716630dSMartin Matuska io_error_exit: 4243e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) 4244e716630dSMartin Matuska abd_free(abds[i]); 4245e716630dSMartin Matuska kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); 4246e716630dSMartin Matuska zfs_rangelock_exit(lr); 4247e716630dSMartin Matuska spa_config_exit(spa, SCL_STATE, FTAG); 4248e716630dSMartin Matuska return; 4249e716630dSMartin Matuska } 4250e716630dSMartin Matuska 4251e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2); 4252e716630dSMartin Matuska 4253e716630dSMartin Matuska /* 4254e716630dSMartin Matuska * Reflow in memory. 4255e716630dSMartin Matuska */ 4256e716630dSMartin Matuska uint64_t logical_sectors = logical_size >> ashift; 4257e716630dSMartin Matuska for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) { 4258e716630dSMartin Matuska int oldchild = i % (raidvd->vdev_children - 1); 4259e716630dSMartin Matuska uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift; 4260e716630dSMartin Matuska 4261e716630dSMartin Matuska int newchild = i % raidvd->vdev_children; 4262e716630dSMartin Matuska uint64_t newoff = (i / raidvd->vdev_children) << ashift; 4263e716630dSMartin Matuska 4264e716630dSMartin Matuska /* a single sector should not be copying over itself */ 4265e716630dSMartin Matuska ASSERT(!(newchild == oldchild && newoff == oldoff)); 4266e716630dSMartin Matuska 4267e716630dSMartin Matuska abd_copy_off(abds[newchild], abds[oldchild], 4268e716630dSMartin Matuska newoff, oldoff, 1 << ashift); 4269e716630dSMartin Matuska } 4270e716630dSMartin Matuska 4271e716630dSMartin Matuska /* 4272e716630dSMartin Matuska * Verify that we filled in everything we intended to (write_size on 4273e716630dSMartin Matuska * each child). 
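 * (As an illustration of the in-memory reflow loop above: with 4
 * children the loop starts at sector i = 3, which moves from old child
 * 3 % 3 = 0 at offset 1 << ashift to new child 3 % 4 = 3 at offset 0,
 * and so on.)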
4274e716630dSMartin Matuska */ 4275e716630dSMartin Matuska VERIFY0(logical_sectors % raidvd->vdev_children); 4276e716630dSMartin Matuska VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==, 4277e716630dSMartin Matuska write_size); 4278e716630dSMartin Matuska 4279e716630dSMartin Matuska /* 4280e716630dSMartin Matuska * Write to scratch location (boot area). 4281e716630dSMartin Matuska */ 4282e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4283e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) { 4284e716630dSMartin Matuska /* 4285e716630dSMartin Matuska * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to 4286e716630dSMartin Matuska * the offset to calculate the physical offset to write to. 4287e716630dSMartin Matuska * Passing in a negative offset lets us access the boot area. 4288e716630dSMartin Matuska */ 4289e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4290e716630dSMartin Matuska VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], 429117aab35aSMartin Matuska write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, 4292e716630dSMartin Matuska ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); 4293e716630dSMartin Matuska } 4294e716630dSMartin Matuska error = zio_wait(pio); 4295e716630dSMartin Matuska if (error != 0) { 4296e716630dSMartin Matuska zfs_dbgmsg("reflow: error %d writing scratch location", error); 4297e716630dSMartin Matuska goto io_error_exit; 4298e716630dSMartin Matuska } 4299e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, 0); 4300e716630dSMartin Matuska zio_flush(pio, raidvd); 4301e716630dSMartin Matuska zio_wait(pio); 4302e716630dSMartin Matuska 4303e716630dSMartin Matuska zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area", 4304e716630dSMartin Matuska (long long)logical_size); 4305e716630dSMartin Matuska 4306e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3); 4307e716630dSMartin Matuska 4308e716630dSMartin Matuska /* 4309e716630dSMartin Matuska * Update uberblock to indicate that scratch space is valid. This is 4310e716630dSMartin Matuska * needed because after this point, the real location may be 4311e716630dSMartin Matuska * overwritten. If we crash, we need to get the data from the 4312e716630dSMartin Matuska * scratch space, rather than the real location. 4313e716630dSMartin Matuska * 4314e716630dSMartin Matuska * Note: ub_timestamp is bumped so that vdev_uberblock_compare() 4315e716630dSMartin Matuska * will prefer this uberblock. 
4316e716630dSMartin Matuska */ 4317e716630dSMartin Matuska RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size); 4318e716630dSMartin Matuska spa->spa_ubsync.ub_timestamp++; 4319e716630dSMartin Matuska ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, 4320e716630dSMartin Matuska &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); 4321e716630dSMartin Matuska if (spa_multihost(spa)) 4322e716630dSMartin Matuska mmp_update_uberblock(spa, &spa->spa_ubsync); 4323e716630dSMartin Matuska 4324e716630dSMartin Matuska zfs_dbgmsg("reflow: uberblock updated " 4325e716630dSMartin Matuska "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)", 4326e716630dSMartin Matuska (long long)spa->spa_ubsync.ub_txg, 4327e716630dSMartin Matuska (long long)logical_size, 4328e716630dSMartin Matuska (long long)spa->spa_ubsync.ub_timestamp); 4329e716630dSMartin Matuska 4330e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID); 4331e716630dSMartin Matuska 4332e716630dSMartin Matuska /* 4333e716630dSMartin Matuska * Overwrite with reflow'ed data. 4334e716630dSMartin Matuska */ 4335e716630dSMartin Matuska overwrite: 4336e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4337e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) { 4338e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4339e716630dSMartin Matuska 0, abds[i], write_size, ZIO_TYPE_WRITE, 434017aab35aSMartin Matuska ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, 4341e716630dSMartin Matuska raidz_scratch_child_done, pio)); 4342e716630dSMartin Matuska } 4343e716630dSMartin Matuska error = zio_wait(pio); 4344e716630dSMartin Matuska if (error != 0) { 4345e716630dSMartin Matuska /* 4346e716630dSMartin Matuska * When we exit early here and drop the range lock, new 4347e716630dSMartin Matuska * writes will go into the scratch area so we'll need to 4348e716630dSMartin Matuska * read from there when we return after pausing. 4349e716630dSMartin Matuska */ 4350e716630dSMartin Matuska zfs_dbgmsg("reflow: error %d writing real location", error); 4351e716630dSMartin Matuska /* 4352e716630dSMartin Matuska * Update the uberblock that is written when this txg completes. 4353e716630dSMartin Matuska */ 4354e716630dSMartin Matuska RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID, 4355e716630dSMartin Matuska logical_size); 4356e716630dSMartin Matuska goto io_error_exit; 4357e716630dSMartin Matuska } 4358e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, 0); 4359e716630dSMartin Matuska zio_flush(pio, raidvd); 4360e716630dSMartin Matuska zio_wait(pio); 4361e716630dSMartin Matuska 4362e716630dSMartin Matuska zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location", 4363e716630dSMartin Matuska (long long)logical_size); 4364e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) 4365e716630dSMartin Matuska abd_free(abds[i]); 4366e716630dSMartin Matuska kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); 4367e716630dSMartin Matuska 4368e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED); 4369e716630dSMartin Matuska 4370e716630dSMartin Matuska /* 4371e716630dSMartin Matuska * Update uberblock to indicate that the initial part has been 4372e716630dSMartin Matuska * reflow'ed. 
This is needed because after this point (when we exit 4373e716630dSMartin Matuska * the rangelock), we allow regular writes to this region, which will 4374e716630dSMartin Matuska * be written to the new location only (because reflow_offset_next == 4375e716630dSMartin Matuska * reflow_offset_synced). If we crashed and re-copied from the 4376e716630dSMartin Matuska * scratch space, we would lose the regular writes. 4377e716630dSMartin Matuska */ 4378e716630dSMartin Matuska RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED, 4379e716630dSMartin Matuska logical_size); 4380e716630dSMartin Matuska spa->spa_ubsync.ub_timestamp++; 4381e716630dSMartin Matuska ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, 4382e716630dSMartin Matuska &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); 4383e716630dSMartin Matuska if (spa_multihost(spa)) 4384e716630dSMartin Matuska mmp_update_uberblock(spa, &spa->spa_ubsync); 4385e716630dSMartin Matuska 4386e716630dSMartin Matuska zfs_dbgmsg("reflow: uberblock updated " 4387e716630dSMartin Matuska "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", 4388e716630dSMartin Matuska (long long)spa->spa_ubsync.ub_txg, 4389e716630dSMartin Matuska (long long)logical_size, 4390e716630dSMartin Matuska (long long)spa->spa_ubsync.ub_timestamp); 4391e716630dSMartin Matuska 4392e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1); 4393e716630dSMartin Matuska 4394e716630dSMartin Matuska /* 4395e716630dSMartin Matuska * Update progress. 4396e716630dSMartin Matuska */ 4397e716630dSMartin Matuska vre->vre_offset = logical_size; 4398e716630dSMartin Matuska zfs_rangelock_exit(lr); 4399e716630dSMartin Matuska spa_config_exit(spa, SCL_STATE, FTAG); 4400e716630dSMartin Matuska 4401e716630dSMartin Matuska int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 4402e716630dSMartin Matuska vre->vre_offset_pertxg[txgoff] = vre->vre_offset; 4403e716630dSMartin Matuska vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; 4404e716630dSMartin Matuska /* 4405e716630dSMartin Matuska * Note - raidz_reflow_sync() will update the uberblock state to 4406e716630dSMartin Matuska * RRSS_SCRATCH_INVALID_SYNCED_REFLOW 4407e716630dSMartin Matuska */ 4408e716630dSMartin Matuska raidz_reflow_sync(spa, tx); 4409e716630dSMartin Matuska 4410e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2); 4411e716630dSMartin Matuska } 4412e716630dSMartin Matuska 4413e716630dSMartin Matuska /* 4414e716630dSMartin Matuska * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work 4415e716630dSMartin Matuska * here. No other i/o can be in progress, so we don't need the vre_rangelock. 
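 * (This recovery path is reached during pool import, when the synced
 * uberblock still reads RRSS_SCRATCH_VALID, as asserted below.)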
4416e716630dSMartin Matuska */ 4417e716630dSMartin Matuska void 4418e716630dSMartin Matuska vdev_raidz_reflow_copy_scratch(spa_t *spa) 4419e716630dSMartin Matuska { 4420e716630dSMartin Matuska vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4421e716630dSMartin Matuska uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock); 4422e716630dSMartin Matuska ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID); 4423e716630dSMartin Matuska 4424e716630dSMartin Matuska spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4425e716630dSMartin Matuska vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4426e716630dSMartin Matuska ASSERT0(logical_size % raidvd->vdev_children); 4427e716630dSMartin Matuska uint64_t write_size = logical_size / raidvd->vdev_children; 4428e716630dSMartin Matuska 4429e716630dSMartin Matuska zio_t *pio; 4430e716630dSMartin Matuska 4431e716630dSMartin Matuska /* 4432e716630dSMartin Matuska * Read from scratch space. 4433e716630dSMartin Matuska */ 4434e716630dSMartin Matuska abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), 4435e716630dSMartin Matuska KM_SLEEP); 4436e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) { 4437e716630dSMartin Matuska abds[i] = abd_alloc_linear(write_size, B_FALSE); 4438e716630dSMartin Matuska } 4439e716630dSMartin Matuska 4440e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, 0); 4441e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) { 4442e716630dSMartin Matuska /* 4443e716630dSMartin Matuska * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to 4444e716630dSMartin Matuska * the offset to calculate the physical offset to write to. 4445e716630dSMartin Matuska * Passing in a negative offset lets us access the boot area. 4446e716630dSMartin Matuska */ 4447e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4448e716630dSMartin Matuska VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], 444917aab35aSMartin Matuska write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 0, 4450e716630dSMartin Matuska raidz_scratch_child_done, pio)); 4451e716630dSMartin Matuska } 4452e716630dSMartin Matuska zio_wait(pio); 4453e716630dSMartin Matuska 4454e716630dSMartin Matuska /* 4455e716630dSMartin Matuska * Overwrite real location with reflow'ed data. 4456e716630dSMartin Matuska */ 4457e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, 0); 4458e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) { 4459e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4460e716630dSMartin Matuska 0, abds[i], write_size, ZIO_TYPE_WRITE, 446117aab35aSMartin Matuska ZIO_PRIORITY_REMOVAL, 0, 4462e716630dSMartin Matuska raidz_scratch_child_done, pio)); 4463e716630dSMartin Matuska } 4464e716630dSMartin Matuska zio_wait(pio); 4465e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, 0); 4466e716630dSMartin Matuska zio_flush(pio, raidvd); 4467e716630dSMartin Matuska zio_wait(pio); 4468e716630dSMartin Matuska 4469e716630dSMartin Matuska zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) " 4470e716630dSMartin Matuska "to real location", (long long)logical_size); 4471e716630dSMartin Matuska 4472e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) 4473e716630dSMartin Matuska abd_free(abds[i]); 4474e716630dSMartin Matuska kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); 4475e716630dSMartin Matuska 4476e716630dSMartin Matuska /* 4477e716630dSMartin Matuska * Update uberblock. 
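 * Recovery is complete: record RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT and
 * write the uberblock out directly (as raidz_reflow_scratch_sync() does for
 * the SCRATCH_VALID transition), rather than waiting for a normal txg sync.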
4478e716630dSMartin Matuska */ 4479e716630dSMartin Matuska RAIDZ_REFLOW_SET(&spa->spa_ubsync, 4480e716630dSMartin Matuska RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size); 4481e716630dSMartin Matuska spa->spa_ubsync.ub_timestamp++; 4482e716630dSMartin Matuska VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, 4483e716630dSMartin Matuska &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); 4484e716630dSMartin Matuska if (spa_multihost(spa)) 4485e716630dSMartin Matuska mmp_update_uberblock(spa, &spa->spa_ubsync); 4486e716630dSMartin Matuska 4487e716630dSMartin Matuska zfs_dbgmsg("reflow recovery: uberblock updated " 4488e716630dSMartin Matuska "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", 4489e716630dSMartin Matuska (long long)spa->spa_ubsync.ub_txg, 4490e716630dSMartin Matuska (long long)logical_size, 4491e716630dSMartin Matuska (long long)spa->spa_ubsync.ub_timestamp); 4492e716630dSMartin Matuska 4493e716630dSMartin Matuska dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, 4494e716630dSMartin Matuska spa_first_txg(spa)); 4495e716630dSMartin Matuska int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 4496e716630dSMartin Matuska vre->vre_offset = logical_size; 4497e716630dSMartin Matuska vre->vre_offset_pertxg[txgoff] = vre->vre_offset; 4498e716630dSMartin Matuska vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; 4499e716630dSMartin Matuska /* 4500e716630dSMartin Matuska * Note that raidz_reflow_sync() will update the uberblock once more 4501e716630dSMartin Matuska */ 4502e716630dSMartin Matuska raidz_reflow_sync(spa, tx); 4503e716630dSMartin Matuska 4504e716630dSMartin Matuska dmu_tx_commit(tx); 4505e716630dSMartin Matuska 4506e716630dSMartin Matuska spa_config_exit(spa, SCL_STATE, FTAG); 4507e716630dSMartin Matuska } 4508e716630dSMartin Matuska 4509e716630dSMartin Matuska static boolean_t 4510e716630dSMartin Matuska spa_raidz_expand_thread_check(void *arg, zthr_t *zthr) 4511e716630dSMartin Matuska { 4512e716630dSMartin Matuska (void) zthr; 4513e716630dSMartin Matuska spa_t *spa = arg; 4514e716630dSMartin Matuska 4515e716630dSMartin Matuska return (spa->spa_raidz_expand != NULL && 4516e716630dSMartin Matuska !spa->spa_raidz_expand->vre_waiting_for_resilver); 4517e716630dSMartin Matuska } 4518e716630dSMartin Matuska 4519e716630dSMartin Matuska /* 4520e716630dSMartin Matuska * RAIDZ expansion background thread 4521e716630dSMartin Matuska * 4522e716630dSMartin Matuska * Can be called multiple times if the reflow is paused 4523e716630dSMartin Matuska */ 4524e716630dSMartin Matuska static void 4525e716630dSMartin Matuska spa_raidz_expand_thread(void *arg, zthr_t *zthr) 4526e716630dSMartin Matuska { 4527e716630dSMartin Matuska spa_t *spa = arg; 4528e716630dSMartin Matuska vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4529e716630dSMartin Matuska 4530e716630dSMartin Matuska if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) 4531e716630dSMartin Matuska vre->vre_offset = 0; 4532e716630dSMartin Matuska else 4533e716630dSMartin Matuska vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync); 4534e716630dSMartin Matuska 4535e716630dSMartin Matuska /* Reflow the beginning portion using the scratch area */ 4536e716630dSMartin Matuska if (vre->vre_offset == 0) { 4537e716630dSMartin Matuska VERIFY0(dsl_sync_task(spa_name(spa), 4538e716630dSMartin Matuska NULL, raidz_reflow_scratch_sync, 4539e716630dSMartin Matuska vre, 0, ZFS_SPACE_CHECK_NONE)); 4540e716630dSMartin Matuska 4541e716630dSMartin Matuska /* if we encountered errors then pause */ 4542e716630dSMartin Matuska if
(vre->vre_offset == 0) { 4543e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 4544e716630dSMartin Matuska vre->vre_waiting_for_resilver = B_TRUE; 4545e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 4546e716630dSMartin Matuska return; 4547e716630dSMartin Matuska } 4548e716630dSMartin Matuska } 4549e716630dSMartin Matuska 4550e716630dSMartin Matuska spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4551e716630dSMartin Matuska vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4552e716630dSMartin Matuska 4553e716630dSMartin Matuska uint64_t guid = raidvd->vdev_guid; 4554e716630dSMartin Matuska 4555e716630dSMartin Matuska /* Iterate over all the remaining metaslabs */ 4556e716630dSMartin Matuska for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift; 4557e716630dSMartin Matuska i < raidvd->vdev_ms_count && 4558e716630dSMartin Matuska !zthr_iscancelled(zthr) && 4559e716630dSMartin Matuska vre->vre_failed_offset == UINT64_MAX; i++) { 4560e716630dSMartin Matuska metaslab_t *msp = raidvd->vdev_ms[i]; 4561e716630dSMartin Matuska 4562e716630dSMartin Matuska metaslab_disable(msp); 4563e716630dSMartin Matuska mutex_enter(&msp->ms_lock); 4564e716630dSMartin Matuska 4565e716630dSMartin Matuska /* 4566e716630dSMartin Matuska * The metaslab may be newly created (for the expanded 4567e716630dSMartin Matuska * space), in which case its trees won't exist yet, 4568e716630dSMartin Matuska * so we need to bail out early. 4569e716630dSMartin Matuska */ 4570e716630dSMartin Matuska if (msp->ms_new) { 4571e716630dSMartin Matuska mutex_exit(&msp->ms_lock); 4572e716630dSMartin Matuska metaslab_enable(msp, B_FALSE, B_FALSE); 4573e716630dSMartin Matuska continue; 4574e716630dSMartin Matuska } 4575e716630dSMartin Matuska 4576e716630dSMartin Matuska VERIFY0(metaslab_load(msp)); 4577e716630dSMartin Matuska 4578e716630dSMartin Matuska /* 4579e716630dSMartin Matuska * We want to copy everything except the free (allocatable) 4580e716630dSMartin Matuska * space. Note that there may be a little bit more free 4581e716630dSMartin Matuska * space (e.g. in ms_defer), and it's fine to copy that too. 4582e716630dSMartin Matuska */ 458317aab35aSMartin Matuska uint64_t shift, start; 4584b59a0cdeSMartin Matuska zfs_range_seg_type_t type = metaslab_calculate_range_tree_type( 458517aab35aSMartin Matuska raidvd, msp, &start, &shift); 4586b59a0cdeSMartin Matuska zfs_range_tree_t *rt = zfs_range_tree_create(NULL, type, NULL, 458717aab35aSMartin Matuska start, shift); 4588b59a0cdeSMartin Matuska zfs_range_tree_add(rt, msp->ms_start, msp->ms_size); 4589b59a0cdeSMartin Matuska zfs_range_tree_walk(msp->ms_allocatable, zfs_range_tree_remove, 4590b59a0cdeSMartin Matuska rt); 4591e716630dSMartin Matuska mutex_exit(&msp->ms_lock); 4592e716630dSMartin Matuska 4593e716630dSMartin Matuska /* 4594e716630dSMartin Matuska * Force the last sector of each metaslab to be copied. This 4595e716630dSMartin Matuska * ensures that we advance the on-disk progress to the end of 4596e716630dSMartin Matuska * this metaslab while the metaslab is disabled. Otherwise, we 4597e716630dSMartin Matuska * could move past this metaslab without advancing the on-disk 4598e716630dSMartin Matuska * progress, and then an allocation to this metaslab would not 4599e716630dSMartin Matuska * be copied. 
4600e716630dSMartin Matuska */ 4601e716630dSMartin Matuska int sectorsz = 1 << raidvd->vdev_ashift; 4602e716630dSMartin Matuska uint64_t ms_last_offset = msp->ms_start + 4603e716630dSMartin Matuska msp->ms_size - sectorsz; 4604b59a0cdeSMartin Matuska if (!zfs_range_tree_contains(rt, ms_last_offset, sectorsz)) { 4605b59a0cdeSMartin Matuska zfs_range_tree_add(rt, ms_last_offset, sectorsz); 4606e716630dSMartin Matuska } 4607e716630dSMartin Matuska 4608e716630dSMartin Matuska /* 4609e716630dSMartin Matuska * When we are resuming from a paused expansion (i.e. 4610e716630dSMartin Matuska * when importing a pool with an expansion in progress), 4611e716630dSMartin Matuska * discard any state that we have already processed. 4612e716630dSMartin Matuska */ 461317aab35aSMartin Matuska if (vre->vre_offset > msp->ms_start) { 4614b59a0cdeSMartin Matuska zfs_range_tree_clear(rt, msp->ms_start, 461517aab35aSMartin Matuska vre->vre_offset - msp->ms_start); 461617aab35aSMartin Matuska } 4617e716630dSMartin Matuska 4618e716630dSMartin Matuska while (!zthr_iscancelled(zthr) && 4619b59a0cdeSMartin Matuska !zfs_range_tree_is_empty(rt) && 4620e716630dSMartin Matuska vre->vre_failed_offset == UINT64_MAX) { 4621e716630dSMartin Matuska 4622e716630dSMartin Matuska /* 4623e716630dSMartin Matuska * We need to periodically drop the config lock so that 4624e716630dSMartin Matuska * writers can get in. Additionally, we can't wait 4625e716630dSMartin Matuska * for a txg to sync while holding a config lock 4626e716630dSMartin Matuska * (since a waiting writer could cause a 3-way deadlock 4627e716630dSMartin Matuska * with the sync thread, which also gets a config 4628e716630dSMartin Matuska * lock for reader). So we can't hold the config lock 4629e716630dSMartin Matuska * while calling dmu_tx_assign(). 4630e716630dSMartin Matuska */ 4631e716630dSMartin Matuska spa_config_exit(spa, SCL_CONFIG, FTAG); 4632e716630dSMartin Matuska 4633e716630dSMartin Matuska /* 4634e716630dSMartin Matuska * If requested, pause the reflow when the amount 4635e716630dSMartin Matuska * specified by raidz_expand_max_reflow_bytes is reached 4636e716630dSMartin Matuska * 4637e716630dSMartin Matuska * This pause is only used during testing or debugging. 4638e716630dSMartin Matuska */ 4639e716630dSMartin Matuska while (raidz_expand_max_reflow_bytes != 0 && 4640e716630dSMartin Matuska raidz_expand_max_reflow_bytes <= 4641e716630dSMartin Matuska vre->vre_bytes_copied && !zthr_iscancelled(zthr)) { 4642e716630dSMartin Matuska delay(hz); 4643e716630dSMartin Matuska } 4644e716630dSMartin Matuska 4645e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 4646e716630dSMartin Matuska while (vre->vre_outstanding_bytes > 4647e716630dSMartin Matuska raidz_expand_max_copy_bytes) { 4648e716630dSMartin Matuska cv_wait(&vre->vre_cv, &vre->vre_lock); 4649e716630dSMartin Matuska } 4650e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 4651e716630dSMartin Matuska 4652e716630dSMartin Matuska dmu_tx_t *tx = 4653e716630dSMartin Matuska dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 4654e716630dSMartin Matuska 4655*b1c1ee44SMartin Matuska VERIFY0(dmu_tx_assign(tx, 4656*b1c1ee44SMartin Matuska DMU_TX_WAIT | DMU_TX_SUSPEND)); 4657e716630dSMartin Matuska uint64_t txg = dmu_tx_get_txg(tx); 4658e716630dSMartin Matuska 4659e716630dSMartin Matuska /* 4660e716630dSMartin Matuska * Reacquire the vdev_config lock. Theoretically, the 4661e716630dSMartin Matuska * vdev_t that we're expanding may have changed.
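 * Look it up again by vre_vdev_id rather than reusing the cached pointer.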
4662e716630dSMartin Matuska */ 4663e716630dSMartin Matuska spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4664e716630dSMartin Matuska raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4665e716630dSMartin Matuska 4666e716630dSMartin Matuska boolean_t needsync = 4667e716630dSMartin Matuska raidz_reflow_impl(raidvd, vre, rt, tx); 4668e716630dSMartin Matuska 4669e716630dSMartin Matuska dmu_tx_commit(tx); 4670e716630dSMartin Matuska 4671e716630dSMartin Matuska if (needsync) { 4672e716630dSMartin Matuska spa_config_exit(spa, SCL_CONFIG, FTAG); 4673e716630dSMartin Matuska txg_wait_synced(spa->spa_dsl_pool, txg); 4674e716630dSMartin Matuska spa_config_enter(spa, SCL_CONFIG, FTAG, 4675e716630dSMartin Matuska RW_READER); 4676e716630dSMartin Matuska } 4677e716630dSMartin Matuska } 4678e716630dSMartin Matuska 4679e716630dSMartin Matuska spa_config_exit(spa, SCL_CONFIG, FTAG); 4680e716630dSMartin Matuska 4681e716630dSMartin Matuska metaslab_enable(msp, B_FALSE, B_FALSE); 4682b59a0cdeSMartin Matuska zfs_range_tree_vacate(rt, NULL, NULL); 4683b59a0cdeSMartin Matuska zfs_range_tree_destroy(rt); 4684e716630dSMartin Matuska 4685e716630dSMartin Matuska spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4686e716630dSMartin Matuska raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4687e716630dSMartin Matuska } 4688e716630dSMartin Matuska 4689e716630dSMartin Matuska spa_config_exit(spa, SCL_CONFIG, FTAG); 4690e716630dSMartin Matuska 4691e716630dSMartin Matuska /* 4692e716630dSMartin Matuska * The txg_wait_synced() here ensures that all reflow zio's have 4693e716630dSMartin Matuska * completed, and vre_failed_offset has been set if necessary. It 4694e716630dSMartin Matuska * also ensures that the progress of the last raidz_reflow_sync() is 4695e716630dSMartin Matuska * written to disk before raidz_reflow_complete_sync() changes the 4696e716630dSMartin Matuska * in-memory vre_state. vdev_raidz_io_start() uses vre_state to 4697e716630dSMartin Matuska * determine if a reflow is in progress, in which case we may need to 4698e716630dSMartin Matuska * write to both old and new locations. Therefore we can only change 4699e716630dSMartin Matuska * vre_state once this is not necessary, which is once the on-disk 4700e716630dSMartin Matuska * progress (in spa_ubsync) has been set past any possible writes (to 4701e716630dSMartin Matuska * the end of the last metaslab). 4702e716630dSMartin Matuska */ 4703e716630dSMartin Matuska txg_wait_synced(spa->spa_dsl_pool, 0); 4704e716630dSMartin Matuska 4705e716630dSMartin Matuska if (!zthr_iscancelled(zthr) && 4706e716630dSMartin Matuska vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) { 4707e716630dSMartin Matuska /* 4708e716630dSMartin Matuska * We are not being canceled or paused, so the reflow must be 4709e716630dSMartin Matuska * complete. In that case also mark it as completed on disk. 4710e716630dSMartin Matuska */ 4711e716630dSMartin Matuska ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX); 4712e716630dSMartin Matuska VERIFY0(dsl_sync_task(spa_name(spa), NULL, 4713e716630dSMartin Matuska raidz_reflow_complete_sync, spa, 4714e716630dSMartin Matuska 0, ZFS_SPACE_CHECK_NONE)); 4715e716630dSMartin Matuska (void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL); 4716e716630dSMartin Matuska } else { 4717e716630dSMartin Matuska /* 4718e716630dSMartin Matuska * Wait for all copy zio's to complete and for all the 4719e716630dSMartin Matuska * raidz_reflow_sync() synctasks to be run. 
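 * (Both were already guaranteed by the txg_wait_synced() call above, so it
 * is safe to log the pause and reset the progress below.)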
4720e716630dSMartin Matuska */ 4721e716630dSMartin Matuska spa_history_log_internal(spa, "reflow pause", 4722e716630dSMartin Matuska NULL, "offset=%llu failed_offset=%lld", 4723e716630dSMartin Matuska (long long)vre->vre_offset, 4724e716630dSMartin Matuska (long long)vre->vre_failed_offset); 4725e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 4726e716630dSMartin Matuska if (vre->vre_failed_offset != UINT64_MAX) { 4727e716630dSMartin Matuska /* 4728e716630dSMartin Matuska * Reset progress so that we will retry everything 4729e716630dSMartin Matuska * after the point that something failed. 4730e716630dSMartin Matuska */ 4731e716630dSMartin Matuska vre->vre_offset = vre->vre_failed_offset; 4732e716630dSMartin Matuska vre->vre_failed_offset = UINT64_MAX; 4733e716630dSMartin Matuska vre->vre_waiting_for_resilver = B_TRUE; 4734e716630dSMartin Matuska } 4735e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 4736e716630dSMartin Matuska } 4737e716630dSMartin Matuska } 4738e716630dSMartin Matuska 4739e716630dSMartin Matuska void 4740e716630dSMartin Matuska spa_start_raidz_expansion_thread(spa_t *spa) 4741e716630dSMartin Matuska { 4742e716630dSMartin Matuska ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL); 4743e716630dSMartin Matuska spa->spa_raidz_expand_zthr = zthr_create("raidz_expand", 4744e716630dSMartin Matuska spa_raidz_expand_thread_check, spa_raidz_expand_thread, 4745e716630dSMartin Matuska spa, defclsyspri); 4746e716630dSMartin Matuska } 4747e716630dSMartin Matuska 4748e716630dSMartin Matuska void 4749e716630dSMartin Matuska raidz_dtl_reassessed(vdev_t *vd) 4750e716630dSMartin Matuska { 4751e716630dSMartin Matuska spa_t *spa = vd->vdev_spa; 4752e716630dSMartin Matuska if (spa->spa_raidz_expand != NULL) { 4753e716630dSMartin Matuska vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4754e716630dSMartin Matuska /* 4755e716630dSMartin Matuska * we get called often from vdev_dtl_reassess() so make 4756e716630dSMartin Matuska * sure it's our vdev and any replacing is complete 4757e716630dSMartin Matuska */ 4758e716630dSMartin Matuska if (vd->vdev_top->vdev_id == vre->vre_vdev_id && 4759e716630dSMartin Matuska !vdev_raidz_expand_child_replacing(vd->vdev_top)) { 4760e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 4761e716630dSMartin Matuska if (vre->vre_waiting_for_resilver) { 4762e716630dSMartin Matuska vdev_dbgmsg(vd, "DTL reassessed, " 4763e716630dSMartin Matuska "continuing raidz expansion"); 4764e716630dSMartin Matuska vre->vre_waiting_for_resilver = B_FALSE; 4765e716630dSMartin Matuska zthr_wakeup(spa->spa_raidz_expand_zthr); 4766e716630dSMartin Matuska } 4767e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 4768e716630dSMartin Matuska } 4769e716630dSMartin Matuska } 4770e716630dSMartin Matuska } 4771e716630dSMartin Matuska 4772e716630dSMartin Matuska int 4773e716630dSMartin Matuska vdev_raidz_attach_check(vdev_t *new_child) 4774e716630dSMartin Matuska { 4775e716630dSMartin Matuska vdev_t *raidvd = new_child->vdev_parent; 4776e716630dSMartin Matuska uint64_t new_children = raidvd->vdev_children; 4777e716630dSMartin Matuska 4778e716630dSMartin Matuska /* 4779e716630dSMartin Matuska * We use the "boot" space as scratch space to handle overwriting the 4780e716630dSMartin Matuska * initial part of the vdev. If it is too small, then this expansion 4781e716630dSMartin Matuska * is not allowed. This would be very unusual (e.g. ashift > 13 and 4782e716630dSMartin Matuska * >200 children). 
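 * Concretely, the check below rejects the expansion when
 * new_children << ashift exceeds VDEV_BOOT_SIZE; assuming the usual 3.5 MiB
 * boot area, that allows up to 448 children at ashift 13 (8 KiB sectors) or
 * 224 children at ashift 14 (16 KiB sectors).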
4783e716630dSMartin Matuska */ 4784e716630dSMartin Matuska if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) { 4785e716630dSMartin Matuska return (EINVAL); 4786e716630dSMartin Matuska } 4787e716630dSMartin Matuska return (0); 4788e716630dSMartin Matuska } 4789e716630dSMartin Matuska 4790e716630dSMartin Matuska void 4791e716630dSMartin Matuska vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx) 4792e716630dSMartin Matuska { 4793e716630dSMartin Matuska vdev_t *new_child = arg; 4794e716630dSMartin Matuska spa_t *spa = new_child->vdev_spa; 4795e716630dSMartin Matuska vdev_t *raidvd = new_child->vdev_parent; 4796e716630dSMartin Matuska vdev_raidz_t *vdrz = raidvd->vdev_tsd; 4797e716630dSMartin Matuska ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops); 4798e716630dSMartin Matuska ASSERT3P(raidvd->vdev_top, ==, raidvd); 4799e716630dSMartin Matuska ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width); 4800e716630dSMartin Matuska ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1); 4801e716630dSMartin Matuska ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==, 4802e716630dSMartin Matuska new_child); 4803e716630dSMartin Matuska 4804e716630dSMartin Matuska spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx); 4805e716630dSMartin Matuska 4806e716630dSMartin Matuska vdrz->vd_physical_width++; 4807e716630dSMartin Matuska 4808e716630dSMartin Matuska VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info); 4809e716630dSMartin Matuska vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id; 4810e716630dSMartin Matuska vdrz->vn_vre.vre_offset = 0; 4811e716630dSMartin Matuska vdrz->vn_vre.vre_failed_offset = UINT64_MAX; 4812e716630dSMartin Matuska spa->spa_raidz_expand = &vdrz->vn_vre; 4813e716630dSMartin Matuska zthr_wakeup(spa->spa_raidz_expand_zthr); 4814e716630dSMartin Matuska 4815e716630dSMartin Matuska /* 4816e716630dSMartin Matuska * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get 4817e716630dSMartin Matuska * written to the config. 
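 * (vdev_raidz_config_generate() emits that flag for as long as
 * vre_state == DSS_SCANNING.)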
4818e716630dSMartin Matuska */ 4819e716630dSMartin Matuska vdev_config_dirty(raidvd); 4820e716630dSMartin Matuska 4821e716630dSMartin Matuska vdrz->vn_vre.vre_start_time = gethrestime_sec(); 4822e716630dSMartin Matuska vdrz->vn_vre.vre_end_time = 0; 4823e716630dSMartin Matuska vdrz->vn_vre.vre_state = DSS_SCANNING; 4824e716630dSMartin Matuska vdrz->vn_vre.vre_bytes_copied = 0; 4825e716630dSMartin Matuska 4826e716630dSMartin Matuska uint64_t state = vdrz->vn_vre.vre_state; 4827e716630dSMartin Matuska VERIFY0(zap_update(spa->spa_meta_objset, 4828e716630dSMartin Matuska raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, 4829e716630dSMartin Matuska sizeof (state), 1, &state, tx)); 4830e716630dSMartin Matuska 4831e716630dSMartin Matuska uint64_t start_time = vdrz->vn_vre.vre_start_time; 4832e716630dSMartin Matuska VERIFY0(zap_update(spa->spa_meta_objset, 4833e716630dSMartin Matuska raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, 4834e716630dSMartin Matuska sizeof (start_time), 1, &start_time, tx)); 4835e716630dSMartin Matuska 4836e716630dSMartin Matuska (void) zap_remove(spa->spa_meta_objset, 4837e716630dSMartin Matuska raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx); 4838e716630dSMartin Matuska (void) zap_remove(spa->spa_meta_objset, 4839e716630dSMartin Matuska raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx); 4840e716630dSMartin Matuska 4841e716630dSMartin Matuska spa_history_log_internal(spa, "raidz vdev expansion started", tx, 4842e716630dSMartin Matuska "%s vdev %llu new width %llu", spa_name(spa), 4843e716630dSMartin Matuska (unsigned long long)raidvd->vdev_id, 4844e716630dSMartin Matuska (unsigned long long)raidvd->vdev_children); 4845e716630dSMartin Matuska } 4846e716630dSMartin Matuska 4847e716630dSMartin Matuska int 4848e716630dSMartin Matuska vdev_raidz_load(vdev_t *vd) 4849e716630dSMartin Matuska { 4850e716630dSMartin Matuska vdev_raidz_t *vdrz = vd->vdev_tsd; 4851e716630dSMartin Matuska int err; 4852e716630dSMartin Matuska 4853e716630dSMartin Matuska uint64_t state = DSS_NONE; 4854e716630dSMartin Matuska uint64_t start_time = 0; 4855e716630dSMartin Matuska uint64_t end_time = 0; 4856e716630dSMartin Matuska uint64_t bytes_copied = 0; 4857e716630dSMartin Matuska 4858e716630dSMartin Matuska if (vd->vdev_top_zap != 0) { 4859e716630dSMartin Matuska err = zap_lookup(vd->vdev_spa->spa_meta_objset, 4860e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, 4861e716630dSMartin Matuska sizeof (state), 1, &state); 4862e716630dSMartin Matuska if (err != 0 && err != ENOENT) 4863e716630dSMartin Matuska return (err); 4864e716630dSMartin Matuska 4865e716630dSMartin Matuska err = zap_lookup(vd->vdev_spa->spa_meta_objset, 4866e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, 4867e716630dSMartin Matuska sizeof (start_time), 1, &start_time); 4868e716630dSMartin Matuska if (err != 0 && err != ENOENT) 4869e716630dSMartin Matuska return (err); 4870e716630dSMartin Matuska 4871e716630dSMartin Matuska err = zap_lookup(vd->vdev_spa->spa_meta_objset, 4872e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, 4873e716630dSMartin Matuska sizeof (end_time), 1, &end_time); 4874e716630dSMartin Matuska if (err != 0 && err != ENOENT) 4875e716630dSMartin Matuska return (err); 4876e716630dSMartin Matuska 4877e716630dSMartin Matuska err = zap_lookup(vd->vdev_spa->spa_meta_objset, 4878e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, 4879e716630dSMartin Matuska sizeof 
(bytes_copied), 1, &bytes_copied); 4880e716630dSMartin Matuska if (err != 0 && err != ENOENT) 4881e716630dSMartin Matuska return (err); 4882e716630dSMartin Matuska } 4883e716630dSMartin Matuska 4884e716630dSMartin Matuska /* 4885e716630dSMartin Matuska * If we are in the middle of expansion, vre_state should have 4886e716630dSMartin Matuska * already been set by vdev_raidz_init(). 4887e716630dSMartin Matuska */ 4888e716630dSMartin Matuska EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING); 4889e716630dSMartin Matuska vdrz->vn_vre.vre_state = (dsl_scan_state_t)state; 4890e716630dSMartin Matuska vdrz->vn_vre.vre_start_time = start_time; 4891e716630dSMartin Matuska vdrz->vn_vre.vre_end_time = end_time; 4892e716630dSMartin Matuska vdrz->vn_vre.vre_bytes_copied = bytes_copied; 4893e716630dSMartin Matuska 4894e716630dSMartin Matuska return (0); 4895e716630dSMartin Matuska } 4896e716630dSMartin Matuska 4897e716630dSMartin Matuska int 4898e716630dSMartin Matuska spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres) 4899e716630dSMartin Matuska { 4900e716630dSMartin Matuska vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4901e716630dSMartin Matuska 4902e716630dSMartin Matuska if (vre == NULL) { 4903e716630dSMartin Matuska /* no expansion in progress; find most recent completed */ 4904e716630dSMartin Matuska for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { 4905e716630dSMartin Matuska vdev_t *vd = spa->spa_root_vdev->vdev_child[c]; 4906e716630dSMartin Matuska if (vd->vdev_ops == &vdev_raidz_ops) { 4907e716630dSMartin Matuska vdev_raidz_t *vdrz = vd->vdev_tsd; 4908e716630dSMartin Matuska 4909e716630dSMartin Matuska if (vdrz->vn_vre.vre_end_time != 0 && 4910e716630dSMartin Matuska (vre == NULL || 4911e716630dSMartin Matuska vdrz->vn_vre.vre_end_time > 4912e716630dSMartin Matuska vre->vre_end_time)) { 4913e716630dSMartin Matuska vre = &vdrz->vn_vre; 4914e716630dSMartin Matuska } 4915e716630dSMartin Matuska } 4916e716630dSMartin Matuska } 4917e716630dSMartin Matuska } 4918e716630dSMartin Matuska 4919e716630dSMartin Matuska if (vre == NULL) { 4920e716630dSMartin Matuska return (SET_ERROR(ENOENT)); 4921e716630dSMartin Matuska } 4922e716630dSMartin Matuska 4923e716630dSMartin Matuska pres->pres_state = vre->vre_state; 4924e716630dSMartin Matuska pres->pres_expanding_vdev = vre->vre_vdev_id; 4925e716630dSMartin Matuska 4926e716630dSMartin Matuska vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); 4927e716630dSMartin Matuska pres->pres_to_reflow = vd->vdev_stat.vs_alloc; 4928e716630dSMartin Matuska 4929e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 4930e716630dSMartin Matuska pres->pres_reflowed = vre->vre_bytes_copied; 4931e716630dSMartin Matuska for (int i = 0; i < TXG_SIZE; i++) 4932e716630dSMartin Matuska pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i]; 4933e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 4934e716630dSMartin Matuska 4935e716630dSMartin Matuska pres->pres_start_time = vre->vre_start_time; 4936e716630dSMartin Matuska pres->pres_end_time = vre->vre_end_time; 4937e716630dSMartin Matuska pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver; 4938e716630dSMartin Matuska 4939e716630dSMartin Matuska return (0); 4940e716630dSMartin Matuska } 4941e716630dSMartin Matuska 49427877fdebSMatt Macy /* 49437877fdebSMatt Macy * Initialize private RAIDZ specific fields from the nvlist.
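 * This recovers the parity level, the physical and pre-expansion (original)
 * widths, and the list of completed expansion txgs, and notes whether an
 * expansion is still in progress (ZPOOL_CONFIG_RAIDZ_EXPANDING).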
49447877fdebSMatt Macy */ 49457877fdebSMatt Macy static int 49467877fdebSMatt Macy vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) 49477877fdebSMatt Macy { 49487877fdebSMatt Macy uint_t children; 49497877fdebSMatt Macy nvlist_t **child; 49507877fdebSMatt Macy int error = nvlist_lookup_nvlist_array(nv, 49517877fdebSMatt Macy ZPOOL_CONFIG_CHILDREN, &child, &children); 49527877fdebSMatt Macy if (error != 0) 49537877fdebSMatt Macy return (SET_ERROR(EINVAL)); 49547877fdebSMatt Macy 4955e716630dSMartin Matuska uint64_t nparity; 49567877fdebSMatt Macy if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { 49577877fdebSMatt Macy if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) 49587877fdebSMatt Macy return (SET_ERROR(EINVAL)); 49597877fdebSMatt Macy 49607877fdebSMatt Macy /* 49617877fdebSMatt Macy * Previous versions could only support 1 or 2 parity 49627877fdebSMatt Macy * devices. 49637877fdebSMatt Macy */ 49647877fdebSMatt Macy if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2) 49657877fdebSMatt Macy return (SET_ERROR(EINVAL)); 49667877fdebSMatt Macy else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3) 49677877fdebSMatt Macy return (SET_ERROR(EINVAL)); 49687877fdebSMatt Macy } else { 49697877fdebSMatt Macy /* 49707877fdebSMatt Macy * We require the parity to be specified for SPAs that 49717877fdebSMatt Macy * support multiple parity levels. 49727877fdebSMatt Macy */ 49737877fdebSMatt Macy if (spa_version(spa) >= SPA_VERSION_RAIDZ2) 49747877fdebSMatt Macy return (SET_ERROR(EINVAL)); 49757877fdebSMatt Macy 49767877fdebSMatt Macy /* 49777877fdebSMatt Macy * Otherwise, we default to 1 parity device for RAID-Z. 49787877fdebSMatt Macy */ 49797877fdebSMatt Macy nparity = 1; 49807877fdebSMatt Macy } 49817877fdebSMatt Macy 4982e716630dSMartin Matuska vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP); 4983e716630dSMartin Matuska vdrz->vn_vre.vre_vdev_id = -1; 4984e716630dSMartin Matuska vdrz->vn_vre.vre_offset = UINT64_MAX; 4985e716630dSMartin Matuska vdrz->vn_vre.vre_failed_offset = UINT64_MAX; 4986e716630dSMartin Matuska mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL); 4987e716630dSMartin Matuska cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL); 4988e716630dSMartin Matuska zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL); 4989e716630dSMartin Matuska mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL); 4990e716630dSMartin Matuska avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare, 4991e716630dSMartin Matuska sizeof (reflow_node_t), offsetof(reflow_node_t, re_link)); 4992e716630dSMartin Matuska 4993e716630dSMartin Matuska vdrz->vd_physical_width = children; 49947877fdebSMatt Macy vdrz->vd_nparity = nparity; 49957877fdebSMatt Macy 4996e716630dSMartin Matuska /* note, the ID does not exist when creating a pool */ 4997e716630dSMartin Matuska (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, 4998e716630dSMartin Matuska &vdrz->vn_vre.vre_vdev_id); 4999e716630dSMartin Matuska 5000e716630dSMartin Matuska boolean_t reflow_in_progress = 5001e716630dSMartin Matuska nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); 5002e716630dSMartin Matuska if (reflow_in_progress) { 5003e716630dSMartin Matuska spa->spa_raidz_expand = &vdrz->vn_vre; 5004e716630dSMartin Matuska vdrz->vn_vre.vre_state = DSS_SCANNING; 5005e716630dSMartin Matuska } 5006e716630dSMartin Matuska 5007e716630dSMartin Matuska vdrz->vd_original_width = children; 5008e716630dSMartin Matuska uint64_t *txgs; 5009e716630dSMartin Matuska unsigned int txgs_size = 0;
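/*
 * ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS holds the completion txg of each prior
 * expansion, oldest first; each completed expansion grew the logical width
 * by one.  For example (hypothetical numbers), a 6-wide raidz with
 * txgs = { 300, 500 } was originally 4 wide, became 5 wide at txg 300 and
 * 6 wide at txg 500.  The decode loop below walks the array newest-first to
 * rebuild that mapping, one width narrower throughout if an expansion is
 * still in flight.
 */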
5010e716630dSMartin Matuska error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, 5011e716630dSMartin Matuska &txgs, &txgs_size); 5012e716630dSMartin Matuska if (error == 0) { 5013e716630dSMartin Matuska for (int i = 0; i < txgs_size; i++) { 5014e716630dSMartin Matuska reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); 5015e716630dSMartin Matuska re->re_txg = txgs[txgs_size - i - 1]; 5016e716630dSMartin Matuska re->re_logical_width = vdrz->vd_physical_width - i; 5017e716630dSMartin Matuska 5018e716630dSMartin Matuska if (reflow_in_progress) 5019e716630dSMartin Matuska re->re_logical_width--; 5020e716630dSMartin Matuska 5021e716630dSMartin Matuska avl_add(&vdrz->vd_expand_txgs, re); 5022e716630dSMartin Matuska } 5023e716630dSMartin Matuska 5024e716630dSMartin Matuska vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size; 5025e716630dSMartin Matuska } 5026e716630dSMartin Matuska if (reflow_in_progress) { 5027e716630dSMartin Matuska vdrz->vd_original_width--; 5028e716630dSMartin Matuska zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions", 5029e716630dSMartin Matuska children, txgs_size); 5030e716630dSMartin Matuska } 5031e716630dSMartin Matuska 50327877fdebSMatt Macy *tsd = vdrz; 50337877fdebSMatt Macy 50347877fdebSMatt Macy return (0); 50357877fdebSMatt Macy } 50367877fdebSMatt Macy 50377877fdebSMatt Macy static void 50387877fdebSMatt Macy vdev_raidz_fini(vdev_t *vd) 50397877fdebSMatt Macy { 5040e716630dSMartin Matuska vdev_raidz_t *vdrz = vd->vdev_tsd; 5041e716630dSMartin Matuska if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre) 5042e716630dSMartin Matuska vd->vdev_spa->spa_raidz_expand = NULL; 5043e716630dSMartin Matuska reflow_node_t *re; 5044e716630dSMartin Matuska void *cookie = NULL; 5045e716630dSMartin Matuska avl_tree_t *tree = &vdrz->vd_expand_txgs; 5046e716630dSMartin Matuska while ((re = avl_destroy_nodes(tree, &cookie)) != NULL) 5047e716630dSMartin Matuska kmem_free(re, sizeof (*re)); 5048e716630dSMartin Matuska avl_destroy(&vdrz->vd_expand_txgs); 5049e716630dSMartin Matuska mutex_destroy(&vdrz->vd_expand_lock); 5050e716630dSMartin Matuska mutex_destroy(&vdrz->vn_vre.vre_lock); 5051e716630dSMartin Matuska cv_destroy(&vdrz->vn_vre.vre_cv); 5052e716630dSMartin Matuska zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock); 5053e716630dSMartin Matuska kmem_free(vdrz, sizeof (*vdrz)); 50547877fdebSMatt Macy } 50557877fdebSMatt Macy 50567877fdebSMatt Macy /* 50577877fdebSMatt Macy * Add RAIDZ specific fields to the config nvlist. 50587877fdebSMatt Macy */ 50597877fdebSMatt Macy static void 50607877fdebSMatt Macy vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) 50617877fdebSMatt Macy { 50627877fdebSMatt Macy ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops); 50637877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd; 50647877fdebSMatt Macy 50657877fdebSMatt Macy /* 50667877fdebSMatt Macy * Make sure someone hasn't managed to sneak a fancy new vdev 50677877fdebSMatt Macy * into a crufty old storage pool. 
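 * (i.e. the parity level must be one that the pool's SPA_VERSION actually
 * supports, mirroring the load-time checks in vdev_raidz_init()).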
50687877fdebSMatt Macy */ 50697877fdebSMatt Macy ASSERT(vdrz->vd_nparity == 1 || 50707877fdebSMatt Macy (vdrz->vd_nparity <= 2 && 50717877fdebSMatt Macy spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) || 50727877fdebSMatt Macy (vdrz->vd_nparity <= 3 && 50737877fdebSMatt Macy spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3)); 50747877fdebSMatt Macy 50757877fdebSMatt Macy /* 50767877fdebSMatt Macy * Note that we'll add these even on storage pools where they 50777877fdebSMatt Macy * aren't strictly required -- older software will just ignore 50787877fdebSMatt Macy * it. 50797877fdebSMatt Macy */ 50807877fdebSMatt Macy fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity); 5081e716630dSMartin Matuska 5082e716630dSMartin Matuska if (vdrz->vn_vre.vre_state == DSS_SCANNING) { 5083e716630dSMartin Matuska fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); 5084e716630dSMartin Matuska } 5085e716630dSMartin Matuska 5086e716630dSMartin Matuska mutex_enter(&vdrz->vd_expand_lock); 5087e716630dSMartin Matuska if (!avl_is_empty(&vdrz->vd_expand_txgs)) { 5088e716630dSMartin Matuska uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs); 5089e716630dSMartin Matuska uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count, 5090e716630dSMartin Matuska KM_SLEEP); 5091e716630dSMartin Matuska uint64_t i = 0; 5092e716630dSMartin Matuska 5093e716630dSMartin Matuska for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs); 5094e716630dSMartin Matuska re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) { 5095e716630dSMartin Matuska txgs[i++] = re->re_txg; 5096e716630dSMartin Matuska } 5097e716630dSMartin Matuska 5098e716630dSMartin Matuska fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, 5099e716630dSMartin Matuska txgs, count); 5100e716630dSMartin Matuska 5101e716630dSMartin Matuska kmem_free(txgs, sizeof (uint64_t) * count); 5102e716630dSMartin Matuska } 5103e716630dSMartin Matuska mutex_exit(&vdrz->vd_expand_lock); 51047877fdebSMatt Macy } 51057877fdebSMatt Macy 51067877fdebSMatt Macy static uint64_t 51077877fdebSMatt Macy vdev_raidz_nparity(vdev_t *vd) 51087877fdebSMatt Macy { 51097877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd; 51107877fdebSMatt Macy return (vdrz->vd_nparity); 51117877fdebSMatt Macy } 51127877fdebSMatt Macy 51137877fdebSMatt Macy static uint64_t 51147877fdebSMatt Macy vdev_raidz_ndisks(vdev_t *vd) 51157877fdebSMatt Macy { 51167877fdebSMatt Macy return (vd->vdev_children); 5117eda14cbcSMatt Macy } 5118eda14cbcSMatt Macy 5119eda14cbcSMatt Macy vdev_ops_t vdev_raidz_ops = { 51207877fdebSMatt Macy .vdev_op_init = vdev_raidz_init, 51217877fdebSMatt Macy .vdev_op_fini = vdev_raidz_fini, 5122eda14cbcSMatt Macy .vdev_op_open = vdev_raidz_open, 5123eda14cbcSMatt Macy .vdev_op_close = vdev_raidz_close, 5124071ab5a1SMartin Matuska .vdev_op_psize_to_asize = vdev_raidz_psize_to_asize, 5125071ab5a1SMartin Matuska .vdev_op_asize_to_psize = vdev_raidz_asize_to_psize, 51267877fdebSMatt Macy .vdev_op_min_asize = vdev_raidz_min_asize, 51277877fdebSMatt Macy .vdev_op_min_alloc = NULL, 5128eda14cbcSMatt Macy .vdev_op_io_start = vdev_raidz_io_start, 5129eda14cbcSMatt Macy .vdev_op_io_done = vdev_raidz_io_done, 5130eda14cbcSMatt Macy .vdev_op_state_change = vdev_raidz_state_change, 5131eda14cbcSMatt Macy .vdev_op_need_resilver = vdev_raidz_need_resilver, 5132eda14cbcSMatt Macy .vdev_op_hold = NULL, 5133eda14cbcSMatt Macy .vdev_op_rele = NULL, 5134eda14cbcSMatt Macy .vdev_op_remap = NULL, 5135eda14cbcSMatt Macy .vdev_op_xlate = vdev_raidz_xlate, 51367877fdebSMatt Macy .vdev_op_rebuild_asize 
= NULL, 51377877fdebSMatt Macy .vdev_op_metaslab_init = NULL, 51387877fdebSMatt Macy .vdev_op_config_generate = vdev_raidz_config_generate, 51397877fdebSMatt Macy .vdev_op_nparity = vdev_raidz_nparity, 51407877fdebSMatt Macy .vdev_op_ndisks = vdev_raidz_ndisks, 5141eda14cbcSMatt Macy .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */ 5142eda14cbcSMatt Macy .vdev_op_leaf = B_FALSE /* not a leaf vdev */ 5143eda14cbcSMatt Macy }; 5144e716630dSMartin Matuska 5145e716630dSMartin Matuska ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW, 5146e716630dSMartin Matuska "For testing, pause RAIDZ expansion after reflowing this many bytes"); 5147e716630dSMartin Matuska ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW, 5148e716630dSMartin Matuska "Max amount of concurrent i/o for RAIDZ expansion"); 5149e716630dSMartin Matuska ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW, 5150e716630dSMartin Matuska "For expanded RAIDZ, aggregate reads that have more rows than this"); 5151e716630dSMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW, 5152e716630dSMartin Matuska "For expanded RAIDZ, automatically start a pool scrub when expansion " 5153e716630dSMartin Matuska "completes"); 5154
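/*
 * Usage sketch (assuming the standard Linux module-parameter path for
 * OpenZFS; the exact name/path is a convention of the build, not verified
 * here): the testing knob above is expected to surface as
 * raidz_expand_max_reflow_bytes, e.g.
 *
 *	echo 1073741824 > /sys/module/zfs/parameters/raidz_expand_max_reflow_bytes
 *
 * pauses a reflow after roughly 1 GiB has been copied, and writing 0 lets it
 * continue (see the raidz_expand_max_reflow_bytes loop in
 * spa_raidz_expand_thread() above).
 */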