// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
 * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zap.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu_tx.h>
#include <sys/abd.h>
#include <sys/zfs_rlock.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
#include <sys/vdev_raidz.h>
#include <sys/vdev_raidz_impl.h>
#include <sys/vdev_draid.h>
#include <sys/uberblock_impl.h>
#include <sys/dsl_scan.h>

#ifdef ZFS_DEBUG
#include <sys/vdev.h>	/* For vdev_xlate() in vdev_raidz_io_verify() */
#endif

/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all
 * elements can be expressed with a single byte. Briefly, the operations on
 * the field are defined as follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). The inverse of a field element A (A^-1) is therefore
 * A ^ (255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q, R over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property which is why the uncorrected Plank method breaks down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */
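/*
 * Illustrative sketch (not compiled): a user-space demonstration of the
 * GF(2^8) arithmetic described above. It builds the powers-of-2 and log
 * tables from the multiply-by-2 rule and checks that A * A^-1 == 1 for every
 * nonzero element. The helper names (gf_exp, gf_log, gf_mul_example,
 * gf_example) are hypothetical and exist only for this example; they are not
 * part of the RAID-Z implementation.
 */
#if 0
#include <stdint.h>
#include <assert.h>

static uint8_t gf_exp[255];	/* gf_exp[i] = 2^i in GF(2^8) */
static uint8_t gf_log[256];	/* gf_log[a] = log base 2 of a, for a != 0 */

static void
gf_tables_init(void)
{
        uint8_t a = 1;

        for (int i = 0; i < 255; i++) {
                gf_exp[i] = a;
                gf_log[a] = i;
                /* multiply by 2: shift, then XOR 0x1d if the high bit was set */
                a = (a << 1) ^ ((a & 0x80) ? 0x1d : 0);
        }
}

/* A * B = 2^(log_2(A) + log_2(B)), with the exponent taken mod 255. */
static uint8_t
gf_mul_example(uint8_t a, uint8_t b)
{
        if (a == 0 || b == 0)
                return (0);
        return (gf_exp[(gf_log[a] + gf_log[b]) % 255]);
}

static void
gf_example(void)
{
        gf_tables_init();
        /* The inverse of A is 2^(255 - log_2(A)), so A * A^-1 must be 1. */
        for (int a = 1; a < 256; a++) {
                uint8_t inv = gf_exp[(255 - gf_log[a]) % 255];
                assert(gf_mul_example((uint8_t)a, inv) == 1);
        }
}
#endif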

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
        (mask) = (x) & 0x8080808080808080ULL; \
        (mask) = ((mask) << 1) - ((mask) >> 7); \
        (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
            ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
        VDEV_RAIDZ_64MUL_2((x), mask); \
        VDEV_RAIDZ_64MUL_2((x), mask); \
}

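/*
 * Illustrative sketch (not compiled): computing P and Q for a tiny stripe of
 * three data bytes using the byte-wide macros above, then rebuilding a lost
 * data byte from P alone and verifying Q against the rebuilt stripe. The
 * names (d, p, q, raidz_pq_example) are hypothetical and serve only this
 * example.
 */
#if 0
#include <stdint.h>
#include <assert.h>

static void
raidz_pq_example(void)
{
        uint8_t d[3] = { 0xde, 0xad, 0xbf };
        uint8_t p, q;

        /* P is the plain XOR of the data columns. */
        p = d[0] ^ d[1] ^ d[2];

        /* Q = ((D_0 * 2 + D_1) * 2 + D_2), evaluated Horner-style. */
        q = d[0];
        q = (uint8_t)VDEV_RAIDZ_MUL_2(q) ^ d[1];
        q = (uint8_t)VDEV_RAIDZ_MUL_2(q) ^ d[2];

        /* Losing a single data column: rebuild it from P and the others. */
        uint8_t lost = d[1];
        uint8_t rebuilt = p ^ d[0] ^ d[2];
        assert(rebuilt == lost);

        /* Recomputing Q from the rebuilt stripe reproduces the original Q. */
        uint8_t q2 = d[0];
        q2 = (uint8_t)VDEV_RAIDZ_MUL_2(q2) ^ rebuilt;
        q2 = (uint8_t)VDEV_RAIDZ_MUL_2(q2) ^ d[2];
        assert(q2 == q);
}
#endif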

/*
 * Big Theory Statement for how a RAIDZ VDEV is expanded
 *
 * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion
 * works with all three RAIDZ parity choices: RAIDZ1, 2, or 3. VDEVs that
 * have been previously expanded can be expanded again.
 *
 * The RAIDZ VDEV must be healthy (must be able to write to all the drives in
 * the VDEV) when an expansion starts. The expansion will pause if any disk
 * in the VDEV fails, and will resume once the VDEV is healthy again. All
 * other operations on the pool can continue while an expansion is in progress
 * (e.g. read/write, snapshot, zpool add, etc.), except for zpool checkpoint,
 * zpool trim, and zpool initialize, which can't be run during an expansion.
 * Following a reboot or export/import, the expansion resumes where it left
 * off.
 *
 * == Reflowing the Data ==
 *
 * The expansion involves reflowing (copying) the data from the current set
 * of disks to spread it across the new set, which now has one more disk. This
 * reflow operation is similar to reflowing text when the column width of a
 * text editor window is expanded. The text doesn't change but the location of
 * the text changes to accommodate the new width. An example reflow result for
 * a 4-wide RAIDZ1 to a 5-wide is shown below.
 *
 *                            Reflow End State
 *          Each letter indicates a parity group (logical stripe)
 *
 *        Before expansion                        After Expansion
 *   D1     D2     D3     D4              D1     D2     D3     D4     D5
 * +------+------+------+------+     +------+------+------+------+------+
 * |      |      |      |      |     |      |      |      |      |      |
 * |  A   |  A   |  A   |  A   |     |  A   |  A   |  A   |  A   |  B   |
 * |     1|     2|     3|     4|     |     1|     2|     3|     4|     5|
 * +------+------+------+------+     +------+------+------+------+------+
 * |      |      |      |      |     |      |      |      |      |      |
 * |  B   |  B   |  C   |  C   |     |  B   |  C   |  C   |  C   |  C   |
 * |     5|     6|     7|     8|     |     6|     7|     8|     9|    10|
 * +------+------+------+------+     +------+------+------+------+------+
 * |      |      |      |      |     |      |      |      |      |      |
 * |  C   |  C   |  D   |  D   |     |  D   |  D   |  E   |  E   |  E   |
 * |     9|    10|    11|    12|     |    11|    12|    13|    14|    15|
 * +------+------+------+------+     +------+------+------+------+------+
 * |      |      |      |      |     |      |      |      |      |      |
 * |  E   |  E   |  E   |  E   | --> |  E   |  F   |  F   |  G   |  G   |
 * |    13|    14|    15|    16|     |    16|    17|    18|    19|    20|
 * +------+------+------+------+     +------+------+------+------+------+
 * |      |      |      |      |     |      |      |      |      |      |
 * |  F   |  F   |  G   |  G   |     |  G   |  G   |  H   |  H   |  H   |
 * |    17|    18|    19|    20|     |    21|    22|    23|    24|    25|
 * +------+------+------+------+     +------+------+------+------+------+
 * |      |      |      |      |     |      |      |      |      |      |
 * |  G   |  G   |  H   |  H   |     |  H   |  I   |  I   |  J   |  J   |
 * |    21|    22|    23|    24|     |    26|    27|    28|    29|    30|
 * +------+------+------+------+     +------+------+------+------+------+
 * |      |      |      |      |     |      |      |      |      |      |
 * |  H   |  H   |  I   |  I   |     |  J   |  J   |      |      |  K   |
 * |    25|    26|    27|    28|     |    31|    32|    33|    34|    35|
 * +------+------+------+------+     +------+------+------+------+------+
 *
 * This reflow approach has several advantages. There is no need to read or
 * modify the block pointers or recompute any block checksums. The reflow
 * doesn't need to know where the parity sectors reside. We can read and write
 * data sequentially and the copy can occur in a background thread in open
 * context. The design also allows for fast discovery of what data to copy.
 *
 * The VDEV metaslabs are processed, one at a time, to copy the block data to
 * have it flow across all the disks. The metaslab is disabled for allocations
 * during the copy. As an optimization, we only copy the allocated data, which
 * can be determined by looking at the metaslab range tree. During the copy we
 * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still
 * need to be able to survive losing up to parity-count disks).
 * This means we cannot overwrite data during the reflow that would be needed
 * if a disk is lost.
 *
 * After the reflow completes, all newly-written blocks will have the new
 * layout, i.e., they will have the parity to data ratio implied by the new
 * number of disks in the RAIDZ group. Even though the reflow copies all of
 * the allocated space (data and parity), it is only rearranged, not changed.
 *
 * This act of reflowing the data has a few implications about blocks
 * that were written before the reflow completes:
 *
 *  - Old blocks will still use the same amount of space (i.e., they will have
 *    the parity to data ratio implied by the old number of disks in the RAIDZ
 *    group).
 *  - Reading old blocks will be slightly slower than before the reflow, for
 *    two reasons. First, we will have to read from all disks in the RAIDZ
 *    VDEV, rather than being able to skip the children that contain only
 *    parity of this block (because the data of a single block is now spread
 *    out across all the disks). Second, in most cases there will be an extra
 *    bcopy, needed to rearrange the data back to its original layout in
 *    memory.
 *
 * == Scratch Area ==
 *
 * As we copy the block data, we can only progress to the point that writes
 * will not overlap with blocks whose progress has not yet been recorded on
 * disk. Since partially-copied rows are always read from the old location,
 * we need to stop one row before the sector-wise overlap, to prevent any
 * row-wise overlap. For example, in the diagram above, when we reflow sector
 * B6 it will overwrite the original location for B5.
 *
 * To get around this, a scratch space is used so that we can start copying
 * without risking data loss by overlapping the row. As an added benefit, it
 * improves performance at the beginning of the reflow, but that small perf
 * boost wouldn't be worth the complexity on its own.
 *
 * Ideally we want to copy at least 2 * (new_width)^2 so that we have a
 * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max
 * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice
 * the widths will likely be single digits so we can get a substantial chunk
 * size using only a few MB of scratch per disk.
 *
 * The scratch area is persisted to disk and holds a large amount of reflowed
 * state.
 * We can always read the partially written stripes when a disk fails or
 * the copy is interrupted (crash) during the initial copying phase, and we
 * can also get past a small chunk size restriction. At a minimum, the scratch
 * space must be large enough to get us to the point that one row does not
 * overlap itself when moved (i.e., new_width^2). But going larger is even
 * better. We use the 3.5 MiB reserved "boot" space that resides after the
 * ZFS disk labels as our scratch space to handle overwriting the initial
 * part of the VDEV.
 *
 *	0     256K   512K                    4M
 *	+------+------+-----------------------+-------------------------------
 *	| VDEV | VDEV |   Boot Block (3.5M)   |  Allocatable space ...
 *	|  L0  |  L1  |       Reserved        |     (Metaslabs)
 *	+------+------+-----------------------+-------------------------------
 *	                  Scratch Area
 *
 * == Reflow Progress Updates ==
 *
 * After the initial scratch-based reflow, the expansion process works
 * similarly to device removal. We create a new open context thread which
 * reflows the data, and periodically kicks off sync tasks to update logical
 * state. In this case, state is the committed progress (offset of next data
 * to copy). We need to persist the completed offset on disk, so that if we
 * crash we know which format each VDEV offset is in.
 *
 * == Time Dependent Geometry ==
 *
 * In non-expanded RAIDZ, blocks are read from disk in a column by column
 * fashion. For a multi-row block, the second sector is in the first column,
 * not in the second column. This allows us to issue full reads for each
 * column directly into the request buffer. The block data is thus laid out
 * sequentially in a column-by-column fashion.
 *
 * For example, in the before expansion diagram above, one logical block might
 * be sectors G19-H26. The parity is in G19,H23; and the data is in
 * G20,H24,G21,H25,G22,H26.
 *
 * After a block is reflowed, the sectors that were all in one original data
 * column can now reside in different columns. When reading from an expanded
 * VDEV, we need to know the logical stripe width for each block so we can
 * reconstitute the block's data after the reads are completed. Likewise,
 * when we perform the combinatorial reconstruction we need to know the
 * original width so we can retry combinations from the past layouts.
 *
 * Time dependent geometry is what we call having blocks with different
 * layouts (stripe widths) in the same VDEV.
 * This time-dependent geometry uses the block's birth time (plus the time
 * the expansion ended) to establish the correct width for a given block.
 * After an expansion completes, we record the time for blocks written with
 * a particular width (geometry).
 *
 * == On Disk Format Changes ==
 *
 * A new pool feature flag, 'raidz_expansion', is added; its reference count
 * is the number of RAIDZ VDEVs that have been expanded.
 *
 * The blocks on an expanded RAIDZ VDEV can have different logical stripe
 * widths.
 *
 * Since the uberblock can point to arbitrary blocks, which might be on the
 * expanding RAIDZ and might or might not have been expanded, we need to know
 * which way a block is laid out before reading it. This info is the next
 * offset that needs to be reflowed and we persist that in the uberblock, in
 * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev
 * label. After the expansion is complete, we then use the raidz_expand_txgs
 * array (see below) to determine how to read a block and the
 * ub_raidz_reflow_info field is no longer required.
 *
 * The uberblock's ub_raidz_reflow_info field also holds the scratch space
 * state (i.e., active or not) which is also required before reading a block
 * during the initial phase of reflowing the data.
 *
 * The top-level RAIDZ VDEV has two new entries in the nvlist:
 *
 * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here
 *                            and used after the expansion is complete to
 *                            determine how to read a raidz block
 * 'raidz_expanding' boolean: present during reflow and removed after
 *                            completion; used during a spa import to resume
 *                            an unfinished expansion
 *
 * And finally the VDEV's top zap adds the following informational entries:
 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE
 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME
 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME
 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED
 */

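/*
 * Illustrative sketch (not compiled): one way the per-txg width records
 * described above could be consulted to pick a block's logical stripe width
 * from its birth txg. The structure and function names (width_record_t,
 * example_width_for_txg) are hypothetical and are not the driver's actual
 * representation of the 'raidz_expand_txgs' array.
 */
#if 0
#include <stdint.h>

typedef struct width_record {
        uint64_t wr_txg;	/* first txg written at this width */
        uint64_t wr_width;	/* logical stripe width from wr_txg onward */
} width_record_t;

/*
 * Records are kept sorted by wr_txg. A block's width is the width of the
 * last record whose txg is <= the block's birth txg; blocks older than the
 * first record use the original (pre-expansion) width.
 */
static uint64_t
example_width_for_txg(const width_record_t *wr, int nrecords,
    uint64_t original_width, uint64_t birth_txg)
{
        uint64_t width = original_width;

        for (int i = 0; i < nrecords; i++) {
                if (wr[i].wr_txg <= birth_txg)
                        width = wr[i].wr_width;
                else
                        break;
        }
        return (width);
}
#endif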
/*
 * For testing only: pause the raidz expansion after reflowing this amount.
 * (accessed by ZTS and ztest)
 */
#ifdef _KERNEL
static
#endif	/* _KERNEL */
unsigned long raidz_expand_max_reflow_bytes = 0;

/*
 * For testing only: pause the raidz expansion at a certain point.
 */
uint_t raidz_expand_pause_point = 0;

/*
 * Maximum amount of copy I/Os outstanding at once.
 */
#ifdef _ILP32
static unsigned long raidz_expand_max_copy_bytes = SPA_MAXBLOCKSIZE;
#else
static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE;
#endif

/*
 * Apply raidz map abds aggregation if the number of rows in the map is equal
 * to or greater than the value below.
 */
static unsigned long raidz_io_aggregate_rows = 4;

/*
 * Automatically start a pool scrub when a RAIDZ expansion completes in
 * order to verify the checksums of all blocks which have been copied
 * during the expansion. Automatic scrubbing is enabled by default and
 * is strongly recommended.
 */
static int zfs_scrub_after_expand = 1;

static void
vdev_raidz_row_free(raidz_row_t *rr)
{
        for (int c = 0; c < rr->rr_cols; c++) {
                raidz_col_t *rc = &rr->rr_col[c];

                if (rc->rc_size != 0)
                        abd_free(rc->rc_abd);
                if (rc->rc_orig_data != NULL)
                        abd_free(rc->rc_orig_data);
        }

        if (rr->rr_abd_empty != NULL)
                abd_free(rr->rr_abd_empty);

        kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols]));
}

void
vdev_raidz_map_free(raidz_map_t *rm)
{
        for (int i = 0; i < rm->rm_nrows; i++)
                vdev_raidz_row_free(rm->rm_row[i]);

        if (rm->rm_nphys_cols) {
                for (int i = 0; i < rm->rm_nphys_cols; i++) {
                        if (rm->rm_phys_col[i].rc_abd != NULL)
                                abd_free(rm->rm_phys_col[i].rc_abd);
                }

                kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) *
                    rm->rm_nphys_cols);
        }

        ASSERT3P(rm->rm_lr, ==, NULL);
        kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
}

static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
        raidz_map_t *rm = zio->io_vsd;

        vdev_raidz_map_free(rm);
}

static int
vdev_raidz_reflow_compare(const void *x1, const void *x2)
{
        const reflow_node_t *l = x1;
        const reflow_node_t *r = x2;

        return (TREE_CMP(l->re_txg, r->re_txg));
}

const zio_vsd_ops_t vdev_raidz_vsd_ops = {
        .vsd_free = vdev_raidz_map_free_vsd,
};

raidz_row_t *
vdev_raidz_row_alloc(int cols, zio_t *zio)
{
        raidz_row_t *rr =
            kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP);

        rr->rr_cols = cols;
        rr->rr_scols = cols;

        for (int c = 0; c < cols; c++) {
                raidz_col_t *rc = &rr->rr_col[c];
                rc->rc_shadow_devidx = INT_MAX;
                rc->rc_shadow_offset = UINT64_MAX;
                /*
                 * We can not allow self healing to take place for Direct I/O
                 * reads. There is nothing that stops the buffer contents from
                 * being manipulated while the I/O is in flight. It is possible
                 * that the checksum could be verified on the buffer and then
                 * the contents of that buffer are manipulated afterwards. This
                 * could lead to bad data being written out during self
                 * healing.
                 */
                if (!(zio->io_flags & ZIO_FLAG_DIO_READ))
                        rc->rc_allow_repair = 1;
        }
        return (rr);
}

static void
vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift)
{
        int c;
        int nwrapped = 0;
        uint64_t off = 0;
        raidz_row_t *rr = rm->rm_row[0];

        ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
        ASSERT3U(rm->rm_nrows, ==, 1);

        /*
         * Pad any parity columns with additional space to account for skip
         * sectors.
         */
        if (rm->rm_skipstart < rr->rr_firstdatacol) {
                ASSERT0(rm->rm_skipstart);
                nwrapped = rm->rm_nskip;
        } else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) {
                nwrapped =
                    (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols;
        }

        /*
         * Optional single skip sectors (rc_size == 0) will be handled in
         * vdev_raidz_io_start_write().
         */
        int skipped = rr->rr_scols - rr->rr_cols;

        /* Allocate buffers for the parity columns */
        for (c = 0; c < rr->rr_firstdatacol; c++) {
                raidz_col_t *rc = &rr->rr_col[c];

                /*
                 * Parity columns will pad out a linear ABD to account for
                 * the skip sector. A linear ABD is used here because
                 * parity calculations use the ABD buffer directly to calculate
                 * parity. This avoids doing a memcpy back to the ABD after the
                 * parity has been calculated. By issuing the parity column
                 * with the skip sector we can reduce contention on the child
                 * VDEV queue locks (vq_lock).
                 */
                if (c < nwrapped) {
                        rc->rc_abd = abd_alloc_linear(
                            rc->rc_size + (1ULL << ashift), B_FALSE);
                        abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift);
                        skipped++;
                } else {
                        rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
                }
        }

        for (off = 0; c < rr->rr_cols; c++) {
                raidz_col_t *rc = &rr->rr_col[c];
                abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct,
                    zio->io_abd, off, rc->rc_size);

                /*
                 * Generate I/O for skip sectors to improve aggregation
                 * continuity. We will use gang ABD's to reduce contention
                 * on the child VDEV queue locks (vq_lock) by issuing
                 * a single I/O that contains the data and skip sector.
                 *
                 * It is important to make sure that rc_size is not updated
                 * even though we are adding a skip sector to the ABD. When
                 * calculating the parity in vdev_raidz_generate_parity_row()
                 * the rc_size is used to iterate through the ABD's. We can
                 * not have zero'd out skip sectors used for calculating
                 * parity for raidz, because those same sectors are not used
                 * during reconstruction.
                 */
                if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) {
                        rc->rc_abd = abd_alloc_gang();
                        abd_gang_add(rc->rc_abd, abd, B_TRUE);
                        abd_gang_add(rc->rc_abd,
                            abd_get_zeros(1ULL << ashift), B_TRUE);
                        skipped++;
                } else {
                        rc->rc_abd = abd;
                }
                off += rc->rc_size;
        }

        ASSERT3U(off, ==, zio->io_size);
        ASSERT3S(skipped, ==, rm->rm_nskip);
}

static void
vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm)
{
        int c;
        raidz_row_t *rr = rm->rm_row[0];

        ASSERT3U(rm->rm_nrows, ==, 1);

        /* Allocate buffers for the parity columns */
        for (c = 0; c < rr->rr_firstdatacol; c++)
                rr->rr_col[c].rc_abd =
                    abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);

        for (uint64_t off = 0; c < rr->rr_cols; c++) {
                raidz_col_t *rc = &rr->rr_col[c];
                rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
                    zio->io_abd, off, rc->rc_size);
                off += rc->rc_size;
        }
}

/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 *
 * Avoid inlining the function to keep vdev_raidz_io_start(), which
 * is this function's only caller, as small as possible on the stack.
 */
noinline raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
    uint64_t nparity)
{
        raidz_row_t *rr;
        /* The starting RAIDZ (parent) vdev sector of the block. */
        uint64_t b = zio->io_offset >> ashift;
        /* The zio's size in units of the vdev's minimum sector size. */
        uint64_t s = zio->io_size >> ashift;
        /* The first column for this stripe. */
        uint64_t f = b % dcols;
        /* The starting byte offset on each child vdev. */
        uint64_t o = (b / dcols) << ashift;
        uint64_t acols, scols;

        raidz_map_t *rm =
            kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
        rm->rm_nrows = 1;

        /*
         * "Quotient": The number of data sectors for this stripe on all but
         * the "big column" child vdevs that also contain "remainder" data.
         */
        uint64_t q = s / (dcols - nparity);

        /*
         * "Remainder": The number of partial stripe data sectors in this I/O.
         * This will add a sector to some, but not all, child vdevs.
         */
        uint64_t r = s - q * (dcols - nparity);

        /* The number of "big columns" - those which contain remainder data. */
        uint64_t bc = (r == 0 ? 0 : r + nparity);

        /*
         * The total number of data and parity sectors associated with
         * this I/O.
         */
        uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));

        /*
         * acols: The columns that will be accessed.
         * scols: The columns that will be accessed or skipped.
         */
        if (q == 0) {
                /* Our I/O request doesn't span all child vdevs. */
                acols = bc;
                scols = MIN(dcols, roundup(bc, nparity + 1));
        } else {
                acols = dcols;
                scols = dcols;
        }

        ASSERT3U(acols, <=, scols);
        rr = vdev_raidz_row_alloc(scols, zio);
        rm->rm_row[0] = rr;
        rr->rr_cols = acols;
        rr->rr_bigcols = bc;
        rr->rr_firstdatacol = nparity;
#ifdef ZFS_DEBUG
        rr->rr_offset = zio->io_offset;
        rr->rr_size = zio->io_size;
#endif

        uint64_t asize = 0;

        for (uint64_t c = 0; c < scols; c++) {
                raidz_col_t *rc = &rr->rr_col[c];
                uint64_t col = f + c;
                uint64_t coff = o;
                if (col >= dcols) {
                        col -= dcols;
                        coff += 1ULL << ashift;
                }
                rc->rc_devidx = col;
                rc->rc_offset = coff;

                if (c >= acols)
                        rc->rc_size = 0;
                else if (c < bc)
                        rc->rc_size = (q + 1) << ashift;
                else
                        rc->rc_size = q << ashift;

                asize += rc->rc_size;
        }

        ASSERT3U(asize, ==, tot << ashift);
        rm->rm_nskip = roundup(tot, nparity + 1) - tot;
        rm->rm_skipstart = bc;

        /*
         * If all data stored spans all columns, there's a danger that parity
         * will always be on the same device and, since parity isn't read
         * during normal operation, that device's I/O bandwidth won't be
         * used effectively. We therefore switch the parity every 1MB.
         *
         * ... at least that was, ostensibly, the theory.
         * As a practical matter, unless we juggle the parity between all
         * devices evenly, we won't see any benefit. Further, occasional
         * writes that aren't a multiple of the LCM of the number of children
         * and the minimum stripe width are sufficient to avoid pessimal
         * behavior. Unfortunately, this decision created an implicit on-disk
         * format requirement that we need to support for all eternity, but
         * only for single-parity RAID-Z.
         *
         * If we intend to skip a sector in the zeroth column for padding
         * we must make sure to note this swap. We will never intend to
         * skip the first column since at least one data and one parity
         * column must appear in each row.
         */
        ASSERT(rr->rr_cols >= 2);
        ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);

        if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
                uint64_t devidx = rr->rr_col[0].rc_devidx;
                o = rr->rr_col[0].rc_offset;
                rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
                rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
                rr->rr_col[1].rc_devidx = devidx;
                rr->rr_col[1].rc_offset = o;
                if (rm->rm_skipstart == 0)
                        rm->rm_skipstart = 1;
        }

        if (zio->io_type == ZIO_TYPE_WRITE) {
                vdev_raidz_map_alloc_write(zio, rm, ashift);
        } else {
                vdev_raidz_map_alloc_read(zio, rm);
        }
        /* init RAIDZ parity ops */
        rm->rm_ops = vdev_raidz_math_get_ops();

        return (rm);
}

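/*
 * Illustrative sketch (not compiled): the stripe geometry arithmetic used by
 * vdev_raidz_map_alloc() above, worked for one concrete case: a 5-sector
 * write to a 5-wide single-parity group. The helper name
 * (raidz_geometry_example) and the use of plain asserts are for illustration
 * only.
 */
#if 0
#include <stdint.h>
#include <assert.h>

static void
raidz_geometry_example(void)
{
        uint64_t dcols = 5;	/* children in the RAID-Z vdev */
        uint64_t nparity = 1;	/* RAIDZ1 */
        uint64_t s = 5;		/* I/O size in sectors */

        /* "Quotient": full data sectors per non-big column. */
        uint64_t q = s / (dcols - nparity);			/* 1 */
        /* "Remainder": data sectors left over for the big columns. */
        uint64_t r = s - q * (dcols - nparity);			/* 1 */
        /* Big columns hold one extra sector each. */
        uint64_t bc = (r == 0 ? 0 : r + nparity);		/* 2 */
        /* Total data + parity sectors for this I/O. */
        uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));	/* 7 */
        /* Skip sectors pad the total to a multiple of nparity + 1. */
        uint64_t nskip = ((tot + nparity) / (nparity + 1)) *
            (nparity + 1) - tot;				/* 1 */

        assert(q == 1 && r == 1 && bc == 2);
        assert(tot == 7 && nskip == 1);

        /* Column sizes: big columns get q + 1 sectors, the rest get q. */
        uint64_t asize = 0;
        for (uint64_t c = 0; c < dcols; c++)
                asize += (c < bc) ? (q + 1) : q;
        assert(asize == tot);
}
#endif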
/*
 * Everything before reflow_offset_synced should have been moved to the new
 * location (read and write completed). However, this may not yet be reflected
 * in the on-disk format (e.g. raidz_reflow_sync() has been called but the
 * uberblock has not yet been written). If reflow is not in progress,
 * reflow_offset_synced should be UINT64_MAX. For each row, if the row is
 * entirely before reflow_offset_synced, it will come from the new location.
 * Otherwise this row will come from the old location. Therefore, rows that
 * straddle the reflow_offset_synced will come from the old location.
 *
 * For writes, reflow_offset_next is the next offset to copy. If a sector has
 * been copied, but not yet reflected in the on-disk progress
 * (reflow_offset_synced), it will also be written to the new (already copied)
 * offset.
 */
noinline raidz_map_t *
vdev_raidz_map_alloc_expanded(zio_t *zio,
    uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
    uint64_t nparity, uint64_t reflow_offset_synced,
    uint64_t reflow_offset_next, boolean_t use_scratch)
{
        abd_t *abd = zio->io_abd;
        uint64_t offset = zio->io_offset;
        uint64_t size = zio->io_size;

        /* The zio's size in units of the vdev's minimum sector size. */
        uint64_t s = size >> ashift;

        /*
         * "Quotient": The number of data sectors for this stripe on all but
         * the "big column" child vdevs that also contain "remainder" data.
         * AKA "full rows"
         */
        uint64_t q = s / (logical_cols - nparity);

        /*
         * "Remainder": The number of partial stripe data sectors in this I/O.
         * This will add a sector to some, but not all, child vdevs.
         */
        uint64_t r = s - q * (logical_cols - nparity);

        /* The number of "big columns" - those which contain remainder data. */
        uint64_t bc = (r == 0 ? 0 : r + nparity);

        /*
         * The total number of data and parity sectors associated with
         * this I/O.
         */
        uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));

        /* How many rows contain data (not skip) */
        uint64_t rows = howmany(tot, logical_cols);
        int cols = MIN(tot, logical_cols);

        raidz_map_t *rm =
            kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
            KM_SLEEP);
        rm->rm_nrows = rows;
        rm->rm_nskip = roundup(tot, nparity + 1) - tot;
        rm->rm_skipstart = bc;
        uint64_t asize = 0;

        for (uint64_t row = 0; row < rows; row++) {
                boolean_t row_use_scratch = B_FALSE;
                raidz_row_t *rr = vdev_raidz_row_alloc(cols, zio);
                rm->rm_row[row] = rr;

                /* The starting RAIDZ (parent) vdev sector of the row. */
                uint64_t b = (offset >> ashift) + row * logical_cols;

                /*
                 * If we are in the middle of a reflow, and the copying has
                 * not yet completed for any part of this row, then use the
                 * old location of this row.
                 * Note that reflow_offset_synced reflects the i/o that's
                 * been completed, because it's updated by a synctask, after
                 * zio_wait(spa_txg_zio[]). This is sufficient for our check,
                 * even if that progress has not yet been recorded to disk
                 * (reflected in spa_ubsync). Also note that we consider the
                 * last row to be "full width" (`cols`-wide rather than
                 * `bc`-wide) for this calculation. This causes a tiny bit of
                 * unnecessary double-writes but is safe and simpler to
                 * calculate.
                 */
                int row_phys_cols = physical_cols;
                if (b + cols > reflow_offset_synced >> ashift)
                        row_phys_cols--;
                else if (use_scratch)
                        row_use_scratch = B_TRUE;

                /* starting child of this row */
                uint64_t child_id = b % row_phys_cols;
                /* The starting byte offset on each child vdev. */
                uint64_t child_offset = (b / row_phys_cols) << ashift;

                /*
                 * Note, rr_cols is the entire width of the block, even
                 * if this row is shorter. This is needed because parity
                 * generation (for Q and R) needs to know the entire width,
                 * because it treats the short row as though it was
                 * full-width (and the "phantom" sectors were zero-filled).
                 *
                 * Another approach to this would be to set cols shorter
                 * (to just the number of columns that we might do i/o to)
                 * and have another mechanism to tell the parity generation
                 * about the "entire width". Reconstruction (at least
                 * vdev_raidz_reconstruct_general()) would also need to
                 * know about the "entire width".
                 */
                rr->rr_firstdatacol = nparity;
#ifdef ZFS_DEBUG
                /*
                 * note: rr_size is PSIZE, not ASIZE
                 */
                rr->rr_offset = b << ashift;
                rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift;
#endif

                for (int c = 0; c < rr->rr_cols; c++, child_id++) {
                        if (child_id >= row_phys_cols) {
                                child_id -= row_phys_cols;
                                child_offset += 1ULL << ashift;
                        }
                        raidz_col_t *rc = &rr->rr_col[c];
                        rc->rc_devidx = child_id;
                        rc->rc_offset = child_offset;

                        /*
                         * Get this from the scratch space if appropriate.
                         * This only happens if we crashed in the middle of
                         * raidz_reflow_scratch_sync() (while it's running,
                         * the rangelock prevents us from doing concurrent
                         * io), and even then only during zpool import or
                         * when the pool is imported readonly.
                         */
                        if (row_use_scratch)
                                rc->rc_offset -= VDEV_BOOT_SIZE;

                        uint64_t dc = c - rr->rr_firstdatacol;
                        if (c < rr->rr_firstdatacol) {
                                rc->rc_size = 1ULL << ashift;

                                /*
                                 * Parity sectors' rc_abd's are set below
                                 * after determining if this is an aggregation.
                                 */
                        } else if (row == rows - 1 && bc != 0 && c >= bc) {
                                /*
                                 * Past the end of the block (even including
                                 * skip sectors). This sector is part of the
                                 * map so that we have full rows for p/q parity
                                 * generation.
                                 */
                                rc->rc_size = 0;
                                rc->rc_abd = NULL;
                        } else {
                                /* "data column" (col excluding parity) */
                                uint64_t off;

                                if (c < bc || r == 0) {
                                        off = dc * rows + row;
                                } else {
                                        off = r * rows +
                                            (dc - r) * (rows - 1) + row;
                                }
                                rc->rc_size = 1ULL << ashift;
                                rc->rc_abd = abd_get_offset_struct(
                                    &rc->rc_abdstruct, abd, off << ashift,
                                    rc->rc_size);
                        }

                        if (rc->rc_size == 0)
                                continue;

                        /*
                         * If any part of this row is in both old and new
                         * locations, the primary location is the old
                         * location. If this sector was already copied to the
                         * new location, we need to also write to the new,
                         * "shadow" location.
                         *
                         * Note, `row_phys_cols != physical_cols` indicates
                         * that the primary location is the old location.
                         * `b+c < reflow_offset_next` indicates that the copy
                         * to the new location has been initiated. We know
                         * that the copy has completed because we have the
                         * rangelock, which is held exclusively while the
                         * copy is in progress.
                         */
                        if (row_use_scratch ||
                            (row_phys_cols != physical_cols &&
                            b + c < reflow_offset_next >> ashift)) {
                                rc->rc_shadow_devidx = (b + c) % physical_cols;
                                rc->rc_shadow_offset =
                                    ((b + c) / physical_cols) << ashift;
                                if (row_use_scratch)
                                        rc->rc_shadow_offset -= VDEV_BOOT_SIZE;
                        }

                        asize += rc->rc_size;
                }

                /*
                 * See comment in vdev_raidz_map_alloc()
                 */
                if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
                    (offset & (1ULL << 20))) {
                        ASSERT(rr->rr_cols >= 2);
                        ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);

                        int devidx0 = rr->rr_col[0].rc_devidx;
                        uint64_t offset0 = rr->rr_col[0].rc_offset;
                        int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx;
                        uint64_t shadow_offset0 =
                            rr->rr_col[0].rc_shadow_offset;

                        rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
                        rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
                        rr->rr_col[0].rc_shadow_devidx =
                            rr->rr_col[1].rc_shadow_devidx;
                        rr->rr_col[0].rc_shadow_offset =
                            rr->rr_col[1].rc_shadow_offset;

                        rr->rr_col[1].rc_devidx = devidx0;
                        rr->rr_col[1].rc_offset = offset0;
                        rr->rr_col[1].rc_shadow_devidx = shadow_devidx0;
                        rr->rr_col[1].rc_shadow_offset = shadow_offset0;
                }
        }
        ASSERT3U(asize, ==, tot << ashift);

        /*
         * Determine if the block is contiguous, in which case we can use
         * an aggregation.
         */
        if (rows >= raidz_io_aggregate_rows) {
                rm->rm_nphys_cols = physical_cols;
                rm->rm_phys_col =
                    kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols,
                    KM_SLEEP);

                /*
                 * Determine the aggregate io's offset and size, and check
                 * that the io is contiguous.
962e716630dSMartin Matuska */ 963e716630dSMartin Matuska for (int i = 0; 964e716630dSMartin Matuska i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) { 965e716630dSMartin Matuska raidz_row_t *rr = rm->rm_row[i]; 966e716630dSMartin Matuska for (int c = 0; c < rr->rr_cols; c++) { 967e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c]; 968e716630dSMartin Matuska raidz_col_t *prc = 969e716630dSMartin Matuska &rm->rm_phys_col[rc->rc_devidx]; 970e716630dSMartin Matuska 971e716630dSMartin Matuska if (rc->rc_size == 0) 972e716630dSMartin Matuska continue; 973e716630dSMartin Matuska 974e716630dSMartin Matuska if (prc->rc_size == 0) { 975e716630dSMartin Matuska ASSERT0(prc->rc_offset); 976e716630dSMartin Matuska prc->rc_offset = rc->rc_offset; 977e716630dSMartin Matuska } else if (prc->rc_offset + prc->rc_size != 978e716630dSMartin Matuska rc->rc_offset) { 979e716630dSMartin Matuska /* 980e716630dSMartin Matuska * This block is not contiguous and 981e716630dSMartin Matuska * therefore can't be aggregated. 982e716630dSMartin Matuska * This is expected to be rare, so 983e716630dSMartin Matuska * the cost of allocating and then 984e716630dSMartin Matuska * freeing rm_phys_col is not 985e716630dSMartin Matuska * significant. 986e716630dSMartin Matuska */ 987e716630dSMartin Matuska kmem_free(rm->rm_phys_col, 988e716630dSMartin Matuska sizeof (raidz_col_t) * 989e716630dSMartin Matuska rm->rm_nphys_cols); 990e716630dSMartin Matuska rm->rm_phys_col = NULL; 991e716630dSMartin Matuska rm->rm_nphys_cols = 0; 992e716630dSMartin Matuska break; 993e716630dSMartin Matuska } 994e716630dSMartin Matuska prc->rc_size += rc->rc_size; 995e716630dSMartin Matuska } 996e716630dSMartin Matuska } 997e716630dSMartin Matuska } 998e716630dSMartin Matuska if (rm->rm_phys_col != NULL) { 999e716630dSMartin Matuska /* 1000e716630dSMartin Matuska * Allocate aggregate ABD's. 1001e716630dSMartin Matuska */ 1002e716630dSMartin Matuska for (int i = 0; i < rm->rm_nphys_cols; i++) { 1003e716630dSMartin Matuska raidz_col_t *prc = &rm->rm_phys_col[i]; 1004e716630dSMartin Matuska 1005e716630dSMartin Matuska prc->rc_devidx = i; 1006e716630dSMartin Matuska 1007e716630dSMartin Matuska if (prc->rc_size == 0) 1008e716630dSMartin Matuska continue; 1009e716630dSMartin Matuska 1010e716630dSMartin Matuska prc->rc_abd = 1011e716630dSMartin Matuska abd_alloc_linear(rm->rm_phys_col[i].rc_size, 1012e716630dSMartin Matuska B_FALSE); 1013e716630dSMartin Matuska } 1014e716630dSMartin Matuska 1015e716630dSMartin Matuska /* 1016e716630dSMartin Matuska * Point the parity abd's into the aggregate abd's. 1017e716630dSMartin Matuska */ 1018e716630dSMartin Matuska for (int i = 0; i < rm->rm_nrows; i++) { 1019e716630dSMartin Matuska raidz_row_t *rr = rm->rm_row[i]; 1020e716630dSMartin Matuska for (int c = 0; c < rr->rr_firstdatacol; c++) { 1021e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c]; 1022e716630dSMartin Matuska raidz_col_t *prc = 1023e716630dSMartin Matuska &rm->rm_phys_col[rc->rc_devidx]; 1024e716630dSMartin Matuska rc->rc_abd = 1025e716630dSMartin Matuska abd_get_offset_struct(&rc->rc_abdstruct, 1026e716630dSMartin Matuska prc->rc_abd, 1027e716630dSMartin Matuska rc->rc_offset - prc->rc_offset, 1028e716630dSMartin Matuska rc->rc_size); 1029e716630dSMartin Matuska } 1030e716630dSMartin Matuska } 1031e716630dSMartin Matuska } else { 1032e716630dSMartin Matuska /* 1033e716630dSMartin Matuska * Allocate new abd's for the parity sectors. 
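 * This is the non-aggregated case: unlike the branch above, where
 * each parity abd is a view (abd_get_offset_struct()) into its
 * device's shared aggregate buffer, every parity column here gets
 * its own linear buffer.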
1034e716630dSMartin Matuska */ 1035e716630dSMartin Matuska for (int i = 0; i < rm->rm_nrows; i++) { 1036e716630dSMartin Matuska raidz_row_t *rr = rm->rm_row[i]; 1037e716630dSMartin Matuska for (int c = 0; c < rr->rr_firstdatacol; c++) { 1038e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c]; 1039e716630dSMartin Matuska rc->rc_abd = 1040e716630dSMartin Matuska abd_alloc_linear(rc->rc_size, 1041e716630dSMartin Matuska B_TRUE); 1042e716630dSMartin Matuska } 1043e716630dSMartin Matuska } 1044e716630dSMartin Matuska } 1045eda14cbcSMatt Macy /* init RAIDZ parity ops */ 1046eda14cbcSMatt Macy rm->rm_ops = vdev_raidz_math_get_ops(); 1047eda14cbcSMatt Macy 1048eda14cbcSMatt Macy return (rm); 1049eda14cbcSMatt Macy } 1050eda14cbcSMatt Macy 1051eda14cbcSMatt Macy struct pqr_struct { 1052eda14cbcSMatt Macy uint64_t *p; 1053eda14cbcSMatt Macy uint64_t *q; 1054eda14cbcSMatt Macy uint64_t *r; 1055eda14cbcSMatt Macy }; 1056eda14cbcSMatt Macy 1057eda14cbcSMatt Macy static int 1058eda14cbcSMatt Macy vdev_raidz_p_func(void *buf, size_t size, void *private) 1059eda14cbcSMatt Macy { 1060eda14cbcSMatt Macy struct pqr_struct *pqr = private; 1061eda14cbcSMatt Macy const uint64_t *src = buf; 1062e716630dSMartin Matuska int cnt = size / sizeof (src[0]); 1063eda14cbcSMatt Macy 1064eda14cbcSMatt Macy ASSERT(pqr->p && !pqr->q && !pqr->r); 1065eda14cbcSMatt Macy 1066e716630dSMartin Matuska for (int i = 0; i < cnt; i++, src++, pqr->p++) 1067eda14cbcSMatt Macy *pqr->p ^= *src; 1068eda14cbcSMatt Macy 1069eda14cbcSMatt Macy return (0); 1070eda14cbcSMatt Macy } 1071eda14cbcSMatt Macy 1072eda14cbcSMatt Macy static int 1073eda14cbcSMatt Macy vdev_raidz_pq_func(void *buf, size_t size, void *private) 1074eda14cbcSMatt Macy { 1075eda14cbcSMatt Macy struct pqr_struct *pqr = private; 1076eda14cbcSMatt Macy const uint64_t *src = buf; 1077eda14cbcSMatt Macy uint64_t mask; 1078e716630dSMartin Matuska int cnt = size / sizeof (src[0]); 1079eda14cbcSMatt Macy 1080eda14cbcSMatt Macy ASSERT(pqr->p && pqr->q && !pqr->r); 1081eda14cbcSMatt Macy 1082e716630dSMartin Matuska for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { 1083eda14cbcSMatt Macy *pqr->p ^= *src; 1084eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 1085eda14cbcSMatt Macy *pqr->q ^= *src; 1086eda14cbcSMatt Macy } 1087eda14cbcSMatt Macy 1088eda14cbcSMatt Macy return (0); 1089eda14cbcSMatt Macy } 1090eda14cbcSMatt Macy 1091eda14cbcSMatt Macy static int 1092eda14cbcSMatt Macy vdev_raidz_pqr_func(void *buf, size_t size, void *private) 1093eda14cbcSMatt Macy { 1094eda14cbcSMatt Macy struct pqr_struct *pqr = private; 1095eda14cbcSMatt Macy const uint64_t *src = buf; 1096eda14cbcSMatt Macy uint64_t mask; 1097e716630dSMartin Matuska int cnt = size / sizeof (src[0]); 1098eda14cbcSMatt Macy 1099eda14cbcSMatt Macy ASSERT(pqr->p && pqr->q && pqr->r); 1100eda14cbcSMatt Macy 1101e716630dSMartin Matuska for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { 1102eda14cbcSMatt Macy *pqr->p ^= *src; 1103eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 1104eda14cbcSMatt Macy *pqr->q ^= *src; 1105eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_4(*pqr->r, mask); 1106eda14cbcSMatt Macy *pqr->r ^= *src; 1107eda14cbcSMatt Macy } 1108eda14cbcSMatt Macy 1109eda14cbcSMatt Macy return (0); 1110eda14cbcSMatt Macy } 1111eda14cbcSMatt Macy 1112eda14cbcSMatt Macy static void 11137877fdebSMatt Macy vdev_raidz_generate_parity_p(raidz_row_t *rr) 1114eda14cbcSMatt Macy { 11157877fdebSMatt Macy uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 1116eda14cbcSMatt Macy 
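	/*
	 * P is the running XOR of the data columns: the first data column
	 * is copied into the parity buffer, and every subsequent column is
	 * folded in by vdev_raidz_p_func().
	 */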
11177877fdebSMatt Macy for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 11187877fdebSMatt Macy abd_t *src = rr->rr_col[c].rc_abd; 1119eda14cbcSMatt Macy 11207877fdebSMatt Macy if (c == rr->rr_firstdatacol) { 11217877fdebSMatt Macy abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 1122eda14cbcSMatt Macy } else { 1123eda14cbcSMatt Macy struct pqr_struct pqr = { p, NULL, NULL }; 11247877fdebSMatt Macy (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 1125eda14cbcSMatt Macy vdev_raidz_p_func, &pqr); 1126eda14cbcSMatt Macy } 1127eda14cbcSMatt Macy } 1128eda14cbcSMatt Macy } 1129eda14cbcSMatt Macy 1130eda14cbcSMatt Macy static void 11317877fdebSMatt Macy vdev_raidz_generate_parity_pq(raidz_row_t *rr) 1132eda14cbcSMatt Macy { 11337877fdebSMatt Macy uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 11347877fdebSMatt Macy uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 11357877fdebSMatt Macy uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 11367877fdebSMatt Macy ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 11377877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_Q].rc_size); 1138eda14cbcSMatt Macy 11397877fdebSMatt Macy for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 11407877fdebSMatt Macy abd_t *src = rr->rr_col[c].rc_abd; 1141eda14cbcSMatt Macy 11427877fdebSMatt Macy uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); 1143eda14cbcSMatt Macy 11447877fdebSMatt Macy if (c == rr->rr_firstdatacol) { 1145eda14cbcSMatt Macy ASSERT(ccnt == pcnt || ccnt == 0); 11467877fdebSMatt Macy abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 11477877fdebSMatt Macy (void) memcpy(q, p, rr->rr_col[c].rc_size); 1148eda14cbcSMatt Macy 11497877fdebSMatt Macy for (uint64_t i = ccnt; i < pcnt; i++) { 1150eda14cbcSMatt Macy p[i] = 0; 1151eda14cbcSMatt Macy q[i] = 0; 1152eda14cbcSMatt Macy } 1153eda14cbcSMatt Macy } else { 1154eda14cbcSMatt Macy struct pqr_struct pqr = { p, q, NULL }; 1155eda14cbcSMatt Macy 1156eda14cbcSMatt Macy ASSERT(ccnt <= pcnt); 11577877fdebSMatt Macy (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 1158eda14cbcSMatt Macy vdev_raidz_pq_func, &pqr); 1159eda14cbcSMatt Macy 1160eda14cbcSMatt Macy /* 1161eda14cbcSMatt Macy * Treat short columns as though they are full of 0s. 1162eda14cbcSMatt Macy * Note that there's therefore nothing needed for P. 
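 * For Q, however, the words beyond the short column's length must
 * still be advanced by the multiply-by-2 step (VDEV_RAIDZ_64MUL_2,
 * the same step vdev_raidz_pq_func() applies before folding in a
 * real data word); only the XOR of the (zero) data word is omitted.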
1163eda14cbcSMatt Macy */ 11647877fdebSMatt Macy uint64_t mask; 11657877fdebSMatt Macy for (uint64_t i = ccnt; i < pcnt; i++) { 1166eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(q[i], mask); 1167eda14cbcSMatt Macy } 1168eda14cbcSMatt Macy } 1169eda14cbcSMatt Macy } 1170eda14cbcSMatt Macy } 1171eda14cbcSMatt Macy 1172eda14cbcSMatt Macy static void 11737877fdebSMatt Macy vdev_raidz_generate_parity_pqr(raidz_row_t *rr) 1174eda14cbcSMatt Macy { 11757877fdebSMatt Macy uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 11767877fdebSMatt Macy uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 11777877fdebSMatt Macy uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd); 11787877fdebSMatt Macy uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 11797877fdebSMatt Macy ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 11807877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_Q].rc_size); 11817877fdebSMatt Macy ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == 11827877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_R].rc_size); 1183eda14cbcSMatt Macy 11847877fdebSMatt Macy for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 11857877fdebSMatt Macy abd_t *src = rr->rr_col[c].rc_abd; 1186eda14cbcSMatt Macy 11877877fdebSMatt Macy uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); 1188eda14cbcSMatt Macy 11897877fdebSMatt Macy if (c == rr->rr_firstdatacol) { 1190eda14cbcSMatt Macy ASSERT(ccnt == pcnt || ccnt == 0); 11917877fdebSMatt Macy abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); 11927877fdebSMatt Macy (void) memcpy(q, p, rr->rr_col[c].rc_size); 11937877fdebSMatt Macy (void) memcpy(r, p, rr->rr_col[c].rc_size); 1194eda14cbcSMatt Macy 11957877fdebSMatt Macy for (uint64_t i = ccnt; i < pcnt; i++) { 1196eda14cbcSMatt Macy p[i] = 0; 1197eda14cbcSMatt Macy q[i] = 0; 1198eda14cbcSMatt Macy r[i] = 0; 1199eda14cbcSMatt Macy } 1200eda14cbcSMatt Macy } else { 1201eda14cbcSMatt Macy struct pqr_struct pqr = { p, q, r }; 1202eda14cbcSMatt Macy 1203eda14cbcSMatt Macy ASSERT(ccnt <= pcnt); 12047877fdebSMatt Macy (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, 1205eda14cbcSMatt Macy vdev_raidz_pqr_func, &pqr); 1206eda14cbcSMatt Macy 1207eda14cbcSMatt Macy /* 1208eda14cbcSMatt Macy * Treat short columns as though they are full of 0s. 1209eda14cbcSMatt Macy * Note that there's therefore nothing needed for P. 1210eda14cbcSMatt Macy */ 12117877fdebSMatt Macy uint64_t mask; 12127877fdebSMatt Macy for (uint64_t i = ccnt; i < pcnt; i++) { 1213eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(q[i], mask); 1214eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_4(r[i], mask); 1215eda14cbcSMatt Macy } 1216eda14cbcSMatt Macy } 1217eda14cbcSMatt Macy } 1218eda14cbcSMatt Macy } 1219eda14cbcSMatt Macy 1220eda14cbcSMatt Macy /* 1221eda14cbcSMatt Macy * Generate RAID parity in the first virtual columns according to the number of 1222eda14cbcSMatt Macy * parity columns available. 1223eda14cbcSMatt Macy */ 1224eda14cbcSMatt Macy void 12257877fdebSMatt Macy vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr) 1226eda14cbcSMatt Macy { 1227e716630dSMartin Matuska if (rr->rr_cols == 0) { 1228e716630dSMartin Matuska /* 1229e716630dSMartin Matuska * We are handling this block one row at a time (because 1230e716630dSMartin Matuska * this block has a different logical vs physical width, 1231e716630dSMartin Matuska * due to RAIDZ expansion), and this is a pad-only row, 1232e716630dSMartin Matuska * which has no parity. 
1233e716630dSMartin Matuska */ 1234e716630dSMartin Matuska return; 1235e716630dSMartin Matuska } 12367877fdebSMatt Macy 1237eda14cbcSMatt Macy /* Generate using the new math implementation */ 12387877fdebSMatt Macy if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL) 1239eda14cbcSMatt Macy return; 1240eda14cbcSMatt Macy 12417877fdebSMatt Macy switch (rr->rr_firstdatacol) { 1242eda14cbcSMatt Macy case 1: 12437877fdebSMatt Macy vdev_raidz_generate_parity_p(rr); 1244eda14cbcSMatt Macy break; 1245eda14cbcSMatt Macy case 2: 12467877fdebSMatt Macy vdev_raidz_generate_parity_pq(rr); 1247eda14cbcSMatt Macy break; 1248eda14cbcSMatt Macy case 3: 12497877fdebSMatt Macy vdev_raidz_generate_parity_pqr(rr); 1250eda14cbcSMatt Macy break; 1251eda14cbcSMatt Macy default: 1252eda14cbcSMatt Macy cmn_err(CE_PANIC, "invalid RAID-Z configuration"); 1253eda14cbcSMatt Macy } 1254eda14cbcSMatt Macy } 1255eda14cbcSMatt Macy 12567877fdebSMatt Macy void 12577877fdebSMatt Macy vdev_raidz_generate_parity(raidz_map_t *rm) 12587877fdebSMatt Macy { 12597877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 12607877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i]; 12617877fdebSMatt Macy vdev_raidz_generate_parity_row(rm, rr); 12627877fdebSMatt Macy } 12637877fdebSMatt Macy } 12647877fdebSMatt Macy 1265eda14cbcSMatt Macy static int 1266eda14cbcSMatt Macy vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) 1267eda14cbcSMatt Macy { 1268e92ffd9bSMartin Matuska (void) private; 1269eda14cbcSMatt Macy uint64_t *dst = dbuf; 1270eda14cbcSMatt Macy uint64_t *src = sbuf; 1271eda14cbcSMatt Macy int cnt = size / sizeof (src[0]); 1272eda14cbcSMatt Macy 1273eda14cbcSMatt Macy for (int i = 0; i < cnt; i++) { 1274eda14cbcSMatt Macy dst[i] ^= src[i]; 1275eda14cbcSMatt Macy } 1276eda14cbcSMatt Macy 1277eda14cbcSMatt Macy return (0); 1278eda14cbcSMatt Macy } 1279eda14cbcSMatt Macy 1280eda14cbcSMatt Macy static int 1281eda14cbcSMatt Macy vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size, 1282eda14cbcSMatt Macy void *private) 1283eda14cbcSMatt Macy { 1284e92ffd9bSMartin Matuska (void) private; 1285eda14cbcSMatt Macy uint64_t *dst = dbuf; 1286eda14cbcSMatt Macy uint64_t *src = sbuf; 1287eda14cbcSMatt Macy uint64_t mask; 1288eda14cbcSMatt Macy int cnt = size / sizeof (dst[0]); 1289eda14cbcSMatt Macy 1290eda14cbcSMatt Macy for (int i = 0; i < cnt; i++, dst++, src++) { 1291eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(*dst, mask); 1292eda14cbcSMatt Macy *dst ^= *src; 1293eda14cbcSMatt Macy } 1294eda14cbcSMatt Macy 1295eda14cbcSMatt Macy return (0); 1296eda14cbcSMatt Macy } 1297eda14cbcSMatt Macy 1298eda14cbcSMatt Macy static int 1299eda14cbcSMatt Macy vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private) 1300eda14cbcSMatt Macy { 1301e92ffd9bSMartin Matuska (void) private; 1302eda14cbcSMatt Macy uint64_t *dst = buf; 1303eda14cbcSMatt Macy uint64_t mask; 1304eda14cbcSMatt Macy int cnt = size / sizeof (dst[0]); 1305eda14cbcSMatt Macy 1306eda14cbcSMatt Macy for (int i = 0; i < cnt; i++, dst++) { 1307eda14cbcSMatt Macy /* same operation as vdev_raidz_reconst_q_pre_func() on dst */ 1308eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(*dst, mask); 1309eda14cbcSMatt Macy } 1310eda14cbcSMatt Macy 1311eda14cbcSMatt Macy return (0); 1312eda14cbcSMatt Macy } 1313eda14cbcSMatt Macy 1314eda14cbcSMatt Macy struct reconst_q_struct { 1315eda14cbcSMatt Macy uint64_t *q; 1316eda14cbcSMatt Macy int exp; 1317eda14cbcSMatt Macy }; 1318eda14cbcSMatt Macy 1319eda14cbcSMatt Macy static int 1320eda14cbcSMatt Macy 
vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private) 1321eda14cbcSMatt Macy { 1322eda14cbcSMatt Macy struct reconst_q_struct *rq = private; 1323eda14cbcSMatt Macy uint64_t *dst = buf; 1324eda14cbcSMatt Macy int cnt = size / sizeof (dst[0]); 1325eda14cbcSMatt Macy 1326eda14cbcSMatt Macy for (int i = 0; i < cnt; i++, dst++, rq->q++) { 1327eda14cbcSMatt Macy int j; 1328eda14cbcSMatt Macy uint8_t *b; 1329eda14cbcSMatt Macy 1330eda14cbcSMatt Macy *dst ^= *rq->q; 1331eda14cbcSMatt Macy for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 1332eda14cbcSMatt Macy *b = vdev_raidz_exp2(*b, rq->exp); 1333eda14cbcSMatt Macy } 1334eda14cbcSMatt Macy } 1335eda14cbcSMatt Macy 1336eda14cbcSMatt Macy return (0); 1337eda14cbcSMatt Macy } 1338eda14cbcSMatt Macy 1339eda14cbcSMatt Macy struct reconst_pq_struct { 1340eda14cbcSMatt Macy uint8_t *p; 1341eda14cbcSMatt Macy uint8_t *q; 1342eda14cbcSMatt Macy uint8_t *pxy; 1343eda14cbcSMatt Macy uint8_t *qxy; 1344eda14cbcSMatt Macy int aexp; 1345eda14cbcSMatt Macy int bexp; 1346eda14cbcSMatt Macy }; 1347eda14cbcSMatt Macy 1348eda14cbcSMatt Macy static int 1349eda14cbcSMatt Macy vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private) 1350eda14cbcSMatt Macy { 1351eda14cbcSMatt Macy struct reconst_pq_struct *rpq = private; 1352eda14cbcSMatt Macy uint8_t *xd = xbuf; 1353eda14cbcSMatt Macy uint8_t *yd = ybuf; 1354eda14cbcSMatt Macy 1355eda14cbcSMatt Macy for (int i = 0; i < size; 1356eda14cbcSMatt Macy i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) { 1357eda14cbcSMatt Macy *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 1358eda14cbcSMatt Macy vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 1359eda14cbcSMatt Macy *yd = *rpq->p ^ *rpq->pxy ^ *xd; 1360eda14cbcSMatt Macy } 1361eda14cbcSMatt Macy 1362eda14cbcSMatt Macy return (0); 1363eda14cbcSMatt Macy } 1364eda14cbcSMatt Macy 1365eda14cbcSMatt Macy static int 1366eda14cbcSMatt Macy vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) 1367eda14cbcSMatt Macy { 1368eda14cbcSMatt Macy struct reconst_pq_struct *rpq = private; 1369eda14cbcSMatt Macy uint8_t *xd = xbuf; 1370eda14cbcSMatt Macy 1371eda14cbcSMatt Macy for (int i = 0; i < size; 1372eda14cbcSMatt Macy i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) { 1373eda14cbcSMatt Macy /* same operation as vdev_raidz_reconst_pq_func() on xd */ 1374eda14cbcSMatt Macy *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 1375eda14cbcSMatt Macy vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 1376eda14cbcSMatt Macy } 1377eda14cbcSMatt Macy 1378eda14cbcSMatt Macy return (0); 1379eda14cbcSMatt Macy } 1380eda14cbcSMatt Macy 1381f9693befSMartin Matuska static void 13827877fdebSMatt Macy vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts) 1383eda14cbcSMatt Macy { 1384eda14cbcSMatt Macy int x = tgts[0]; 1385eda14cbcSMatt Macy abd_t *dst, *src; 1386eda14cbcSMatt Macy 1387e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1388e716630dSMartin Matuska zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x); 1389e716630dSMartin Matuska 13907877fdebSMatt Macy ASSERT3U(ntgts, ==, 1); 13917877fdebSMatt Macy ASSERT3U(x, >=, rr->rr_firstdatacol); 13927877fdebSMatt Macy ASSERT3U(x, <, rr->rr_cols); 1393eda14cbcSMatt Macy 13947877fdebSMatt Macy ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size); 1395eda14cbcSMatt Macy 13967877fdebSMatt Macy src = rr->rr_col[VDEV_RAIDZ_P].rc_abd; 13977877fdebSMatt Macy dst = rr->rr_col[x].rc_abd; 1398eda14cbcSMatt Macy 13997877fdebSMatt Macy 
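	/*
	 * Since P is the XOR of all data columns, the missing column is
	 * recovered by copying P into the target and XOR-ing in every
	 * surviving data column: each survivor cancels its own
	 * contribution to P, leaving only the missing data.
	 */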
abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size); 1400eda14cbcSMatt Macy 14017877fdebSMatt Macy for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 14027877fdebSMatt Macy uint64_t size = MIN(rr->rr_col[x].rc_size, 14037877fdebSMatt Macy rr->rr_col[c].rc_size); 1404eda14cbcSMatt Macy 14057877fdebSMatt Macy src = rr->rr_col[c].rc_abd; 1406eda14cbcSMatt Macy 1407eda14cbcSMatt Macy if (c == x) 1408eda14cbcSMatt Macy continue; 1409eda14cbcSMatt Macy 1410eda14cbcSMatt Macy (void) abd_iterate_func2(dst, src, 0, 0, size, 1411eda14cbcSMatt Macy vdev_raidz_reconst_p_func, NULL); 1412eda14cbcSMatt Macy } 1413eda14cbcSMatt Macy } 1414eda14cbcSMatt Macy 1415f9693befSMartin Matuska static void 14167877fdebSMatt Macy vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts) 1417eda14cbcSMatt Macy { 1418eda14cbcSMatt Macy int x = tgts[0]; 1419eda14cbcSMatt Macy int c, exp; 1420eda14cbcSMatt Macy abd_t *dst, *src; 1421eda14cbcSMatt Macy 1422e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1423e716630dSMartin Matuska zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x); 1424e716630dSMartin Matuska 1425eda14cbcSMatt Macy ASSERT(ntgts == 1); 1426eda14cbcSMatt Macy 14277877fdebSMatt Macy ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size); 1428eda14cbcSMatt Macy 14297877fdebSMatt Macy for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 14307877fdebSMatt Macy uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size, 14317877fdebSMatt Macy rr->rr_col[c].rc_size); 1432eda14cbcSMatt Macy 14337877fdebSMatt Macy src = rr->rr_col[c].rc_abd; 14347877fdebSMatt Macy dst = rr->rr_col[x].rc_abd; 1435eda14cbcSMatt Macy 14367877fdebSMatt Macy if (c == rr->rr_firstdatacol) { 1437eda14cbcSMatt Macy abd_copy(dst, src, size); 14387877fdebSMatt Macy if (rr->rr_col[x].rc_size > size) { 1439eda14cbcSMatt Macy abd_zero_off(dst, size, 14407877fdebSMatt Macy rr->rr_col[x].rc_size - size); 14417877fdebSMatt Macy } 1442eda14cbcSMatt Macy } else { 14437877fdebSMatt Macy ASSERT3U(size, <=, rr->rr_col[x].rc_size); 1444eda14cbcSMatt Macy (void) abd_iterate_func2(dst, src, 0, 0, size, 1445eda14cbcSMatt Macy vdev_raidz_reconst_q_pre_func, NULL); 1446eda14cbcSMatt Macy (void) abd_iterate_func(dst, 14477877fdebSMatt Macy size, rr->rr_col[x].rc_size - size, 1448eda14cbcSMatt Macy vdev_raidz_reconst_q_pre_tail_func, NULL); 1449eda14cbcSMatt Macy } 1450eda14cbcSMatt Macy } 1451eda14cbcSMatt Macy 14527877fdebSMatt Macy src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; 14537877fdebSMatt Macy dst = rr->rr_col[x].rc_abd; 14547877fdebSMatt Macy exp = 255 - (rr->rr_cols - 1 - x); 1455eda14cbcSMatt Macy 1456eda14cbcSMatt Macy struct reconst_q_struct rq = { abd_to_buf(src), exp }; 14577877fdebSMatt Macy (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size, 1458eda14cbcSMatt Macy vdev_raidz_reconst_q_post_func, &rq); 1459eda14cbcSMatt Macy } 1460eda14cbcSMatt Macy 1461f9693befSMartin Matuska static void 14627877fdebSMatt Macy vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) 1463eda14cbcSMatt Macy { 1464eda14cbcSMatt Macy uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; 1465eda14cbcSMatt Macy abd_t *pdata, *qdata; 1466eda14cbcSMatt Macy uint64_t xsize, ysize; 1467eda14cbcSMatt Macy int x = tgts[0]; 1468eda14cbcSMatt Macy int y = tgts[1]; 1469eda14cbcSMatt Macy abd_t *xd, *yd; 1470eda14cbcSMatt Macy 1471e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1472e716630dSMartin Matuska zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y); 1473e716630dSMartin Matuska 
1474eda14cbcSMatt Macy ASSERT(ntgts == 2); 1475eda14cbcSMatt Macy ASSERT(x < y); 14767877fdebSMatt Macy ASSERT(x >= rr->rr_firstdatacol); 14777877fdebSMatt Macy ASSERT(y < rr->rr_cols); 1478eda14cbcSMatt Macy 14797877fdebSMatt Macy ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size); 1480eda14cbcSMatt Macy 1481eda14cbcSMatt Macy /* 1482eda14cbcSMatt Macy * Move the parity data aside -- we're going to compute parity as 1483eda14cbcSMatt Macy * though columns x and y were full of zeros -- Pxy and Qxy. We want to 1484eda14cbcSMatt Macy * reuse the parity generation mechanism without trashing the actual 1485eda14cbcSMatt Macy * parity so we make those columns appear to be full of zeros by 1486eda14cbcSMatt Macy * setting their lengths to zero. 1487eda14cbcSMatt Macy */ 14887877fdebSMatt Macy pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd; 14897877fdebSMatt Macy qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; 14907877fdebSMatt Macy xsize = rr->rr_col[x].rc_size; 14917877fdebSMatt Macy ysize = rr->rr_col[y].rc_size; 1492eda14cbcSMatt Macy 14937877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_P].rc_abd = 14947877fdebSMatt Macy abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE); 14957877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_Q].rc_abd = 14967877fdebSMatt Macy abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); 14977877fdebSMatt Macy rr->rr_col[x].rc_size = 0; 14987877fdebSMatt Macy rr->rr_col[y].rc_size = 0; 1499eda14cbcSMatt Macy 15007877fdebSMatt Macy vdev_raidz_generate_parity_pq(rr); 1501eda14cbcSMatt Macy 15027877fdebSMatt Macy rr->rr_col[x].rc_size = xsize; 15037877fdebSMatt Macy rr->rr_col[y].rc_size = ysize; 1504eda14cbcSMatt Macy 1505eda14cbcSMatt Macy p = abd_to_buf(pdata); 1506eda14cbcSMatt Macy q = abd_to_buf(qdata); 15077877fdebSMatt Macy pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 15087877fdebSMatt Macy qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 15097877fdebSMatt Macy xd = rr->rr_col[x].rc_abd; 15107877fdebSMatt Macy yd = rr->rr_col[y].rc_abd; 1511eda14cbcSMatt Macy 1512eda14cbcSMatt Macy /* 1513eda14cbcSMatt Macy * We now have: 1514eda14cbcSMatt Macy * Pxy = P + D_x + D_y 1515eda14cbcSMatt Macy * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 1516eda14cbcSMatt Macy * 1517eda14cbcSMatt Macy * We can then solve for D_x: 1518eda14cbcSMatt Macy * D_x = A * (P + Pxy) + B * (Q + Qxy) 1519eda14cbcSMatt Macy * where 1520eda14cbcSMatt Macy * A = 2^(x - y) * (2^(x - y) + 1)^-1 1521eda14cbcSMatt Macy * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 1522eda14cbcSMatt Macy * 1523eda14cbcSMatt Macy * With D_x in hand, we can easily solve for D_y: 1524eda14cbcSMatt Macy * D_y = P + Pxy + D_x 1525eda14cbcSMatt Macy */ 1526eda14cbcSMatt Macy 1527eda14cbcSMatt Macy a = vdev_raidz_pow2[255 + x - y]; 15287877fdebSMatt Macy b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)]; 1529eda14cbcSMatt Macy tmp = 255 - vdev_raidz_log2[a ^ 1]; 1530eda14cbcSMatt Macy 1531eda14cbcSMatt Macy aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 1532eda14cbcSMatt Macy bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 1533eda14cbcSMatt Macy 1534eda14cbcSMatt Macy ASSERT3U(xsize, >=, ysize); 1535eda14cbcSMatt Macy struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp }; 1536eda14cbcSMatt Macy 1537eda14cbcSMatt Macy (void) abd_iterate_func2(xd, yd, 0, 0, ysize, 1538eda14cbcSMatt Macy vdev_raidz_reconst_pq_func, &rpq); 1539eda14cbcSMatt Macy (void) abd_iterate_func(xd, ysize, xsize - ysize, 1540eda14cbcSMatt Macy vdev_raidz_reconst_pq_tail_func, &rpq); 1541eda14cbcSMatt Macy 15427877fdebSMatt Macy 
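	/*
	 * Note that bytes at offsets >= ysize exist only in column x
	 * (column y is the shorter of the two), so the tail pass above
	 * recomputes D_x alone over that range.
	 */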
abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd); 15437877fdebSMatt Macy abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); 1544eda14cbcSMatt Macy 1545eda14cbcSMatt Macy /* 1546eda14cbcSMatt Macy * Restore the saved parity data. 1547eda14cbcSMatt Macy */ 15487877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata; 15497877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata; 1550eda14cbcSMatt Macy } 1551eda14cbcSMatt Macy 1552eda14cbcSMatt Macy /* 1553eda14cbcSMatt Macy * In the general case of reconstruction, we must solve the system of linear 1554eda14cbcSMatt Macy * equations defined by the coefficients used to generate parity as well as 1555eda14cbcSMatt Macy * the contents of the data and parity disks. This can be expressed with 1556eda14cbcSMatt Macy * vectors for the original data (D) and the actual data (d) and parity (p) 1557eda14cbcSMatt Macy * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 1558eda14cbcSMatt Macy * 1559eda14cbcSMatt Macy * __ __ __ __ 1560eda14cbcSMatt Macy * | | __ __ | p_0 | 1561eda14cbcSMatt Macy * | V | | D_0 | | p_m-1 | 1562eda14cbcSMatt Macy * | | x | : | = | d_0 | 1563eda14cbcSMatt Macy * | I | | D_n-1 | | : | 1564eda14cbcSMatt Macy * | | ~~ ~~ | d_n-1 | 1565eda14cbcSMatt Macy * ~~ ~~ ~~ ~~ 1566eda14cbcSMatt Macy * 1567eda14cbcSMatt Macy * I is simply a square identity matrix of size n, and V is a vandermonde 1568eda14cbcSMatt Macy * matrix defined by the coefficients we chose for the various parity columns 1569eda14cbcSMatt Macy * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 1570eda14cbcSMatt Macy * computation as well as linear separability. 1571eda14cbcSMatt Macy * 1572eda14cbcSMatt Macy * __ __ __ __ 1573eda14cbcSMatt Macy * | 1 .. 1 1 1 | | p_0 | 1574eda14cbcSMatt Macy * | 2^n-1 .. 4 2 1 | __ __ | : | 1575eda14cbcSMatt Macy * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 1576eda14cbcSMatt Macy * | 1 .. 0 0 0 | | D_1 | | d_0 | 1577eda14cbcSMatt Macy * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 1578eda14cbcSMatt Macy * | : : : : | | : | | d_2 | 1579eda14cbcSMatt Macy * | 0 .. 1 0 0 | | D_n-1 | | : | 1580eda14cbcSMatt Macy * | 0 .. 0 1 0 | ~~ ~~ | : | 1581eda14cbcSMatt Macy * | 0 .. 0 0 1 | | d_n-1 | 1582eda14cbcSMatt Macy * ~~ ~~ ~~ ~~ 1583eda14cbcSMatt Macy * 1584eda14cbcSMatt Macy * Note that I, V, d, and p are known. To compute D, we must invert the 1585eda14cbcSMatt Macy * matrix and use the known data and parity values to reconstruct the unknown 1586eda14cbcSMatt Macy * data values. We begin by removing the rows in V|I and d|p that correspond 1587eda14cbcSMatt Macy * to failed or missing columns; we then make V|I square (n x n) and d|p 1588eda14cbcSMatt Macy * sized n by removing rows corresponding to unused parity from the bottom up 1589eda14cbcSMatt Macy * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 1590eda14cbcSMatt Macy * using Gauss-Jordan elimination. 
In the example below we use m=3 parity 1591eda14cbcSMatt Macy * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 1592eda14cbcSMatt Macy * __ __ 1593eda14cbcSMatt Macy * | 1 1 1 1 1 1 1 1 | 1594eda14cbcSMatt Macy * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 1595eda14cbcSMatt Macy * | 19 205 116 29 64 16 4 1 | / / 1596eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 | / / 1597eda14cbcSMatt Macy * | 0 1 0 0 0 0 0 0 | <--' / 1598eda14cbcSMatt Macy * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 1599eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 | 1600eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 | 1601eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 | 1602eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 | 1603eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 | 1604eda14cbcSMatt Macy * ~~ ~~ 1605eda14cbcSMatt Macy * __ __ 1606eda14cbcSMatt Macy * | 1 1 1 1 1 1 1 1 | 1607eda14cbcSMatt Macy * | 128 64 32 16 8 4 2 1 | 1608eda14cbcSMatt Macy * | 19 205 116 29 64 16 4 1 | 1609eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 | 1610eda14cbcSMatt Macy * | 0 1 0 0 0 0 0 0 | 1611eda14cbcSMatt Macy * (V|I)' = | 0 0 1 0 0 0 0 0 | 1612eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 | 1613eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 | 1614eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 | 1615eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 | 1616eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 | 1617eda14cbcSMatt Macy * ~~ ~~ 1618eda14cbcSMatt Macy * 1619eda14cbcSMatt Macy * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 1620eda14cbcSMatt Macy * have carefully chosen the seed values 1, 2, and 4 to ensure that this 1621eda14cbcSMatt Macy * matrix is not singular. 1622eda14cbcSMatt Macy * __ __ 1623eda14cbcSMatt Macy * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1624eda14cbcSMatt Macy * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1625eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1626eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1627eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1628eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1629eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1630eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1631eda14cbcSMatt Macy * ~~ ~~ 1632eda14cbcSMatt Macy * __ __ 1633eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1634eda14cbcSMatt Macy * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1635eda14cbcSMatt Macy * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1636eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1637eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1638eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1639eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1640eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1641eda14cbcSMatt Macy * ~~ ~~ 1642eda14cbcSMatt Macy * __ __ 1643eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1644eda14cbcSMatt Macy * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1645eda14cbcSMatt Macy * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 1646eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1647eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1648eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1649eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1650eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1651eda14cbcSMatt Macy * ~~ ~~ 1652eda14cbcSMatt Macy * __ __ 1653eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1654eda14cbcSMatt Macy * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1655eda14cbcSMatt Macy * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 1656eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 0 0 0 1 0 
0 0 0 | 1657eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1658eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1659eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1660eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1661eda14cbcSMatt Macy * ~~ ~~ 1662eda14cbcSMatt Macy * __ __ 1663eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1664eda14cbcSMatt Macy * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1665eda14cbcSMatt Macy * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1666eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1667eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1668eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1669eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1670eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1671eda14cbcSMatt Macy * ~~ ~~ 1672eda14cbcSMatt Macy * __ __ 1673eda14cbcSMatt Macy * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1674eda14cbcSMatt Macy * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 1675eda14cbcSMatt Macy * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1676eda14cbcSMatt Macy * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1677eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1678eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1679eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1680eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1681eda14cbcSMatt Macy * ~~ ~~ 1682eda14cbcSMatt Macy * __ __ 1683eda14cbcSMatt Macy * | 0 0 1 0 0 0 0 0 | 1684eda14cbcSMatt Macy * | 167 100 5 41 159 169 217 208 | 1685eda14cbcSMatt Macy * | 166 100 4 40 158 168 216 209 | 1686eda14cbcSMatt Macy * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 1687eda14cbcSMatt Macy * | 0 0 0 0 1 0 0 0 | 1688eda14cbcSMatt Macy * | 0 0 0 0 0 1 0 0 | 1689eda14cbcSMatt Macy * | 0 0 0 0 0 0 1 0 | 1690eda14cbcSMatt Macy * | 0 0 0 0 0 0 0 1 | 1691eda14cbcSMatt Macy * ~~ ~~ 1692eda14cbcSMatt Macy * 1693eda14cbcSMatt Macy * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 1694eda14cbcSMatt Macy * of the missing data. 1695eda14cbcSMatt Macy * 1696eda14cbcSMatt Macy * As is apparent from the example above, the only non-trivial rows in the 1697eda14cbcSMatt Macy * inverse matrix correspond to the data disks that we're trying to 1698eda14cbcSMatt Macy * reconstruct. Indeed, those are the only rows we need as the others would 1699eda14cbcSMatt Macy * only be useful for reconstructing data known or assumed to be valid. For 1700eda14cbcSMatt Macy * that reason, we only build the coefficients in the rows that correspond to 1701eda14cbcSMatt Macy * targeted columns. 1702eda14cbcSMatt Macy */ 1703eda14cbcSMatt Macy 1704eda14cbcSMatt Macy static void 17057877fdebSMatt Macy vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map, 1706eda14cbcSMatt Macy uint8_t **rows) 1707eda14cbcSMatt Macy { 1708eda14cbcSMatt Macy int i, j; 1709eda14cbcSMatt Macy int pow; 1710eda14cbcSMatt Macy 17117877fdebSMatt Macy ASSERT(n == rr->rr_cols - rr->rr_firstdatacol); 1712eda14cbcSMatt Macy 1713eda14cbcSMatt Macy /* 1714eda14cbcSMatt Macy * Fill in the missing rows of interest. 
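 * Each requested row regenerates the row of V for the parity
 * indicated by map[i] (0, 1, or 2 for the P, Q, and R
 * coefficients). The loop below computes
 *
 *   rows[i][j] = 2^(map[i] * (n - 1 - j))
 *
 * by stepping the exponent down by map[i] for each column and
 * reducing it mod 255 as it goes.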
1715eda14cbcSMatt Macy */ 1716eda14cbcSMatt Macy for (i = 0; i < nmap; i++) { 1717eda14cbcSMatt Macy ASSERT3S(0, <=, map[i]); 1718eda14cbcSMatt Macy ASSERT3S(map[i], <=, 2); 1719eda14cbcSMatt Macy 1720eda14cbcSMatt Macy pow = map[i] * n; 1721eda14cbcSMatt Macy if (pow > 255) 1722eda14cbcSMatt Macy pow -= 255; 1723eda14cbcSMatt Macy ASSERT(pow <= 255); 1724eda14cbcSMatt Macy 1725eda14cbcSMatt Macy for (j = 0; j < n; j++) { 1726eda14cbcSMatt Macy pow -= map[i]; 1727eda14cbcSMatt Macy if (pow < 0) 1728eda14cbcSMatt Macy pow += 255; 1729eda14cbcSMatt Macy rows[i][j] = vdev_raidz_pow2[pow]; 1730eda14cbcSMatt Macy } 1731eda14cbcSMatt Macy } 1732eda14cbcSMatt Macy } 1733eda14cbcSMatt Macy 1734eda14cbcSMatt Macy static void 17357877fdebSMatt Macy vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing, 1736eda14cbcSMatt Macy uint8_t **rows, uint8_t **invrows, const uint8_t *used) 1737eda14cbcSMatt Macy { 1738eda14cbcSMatt Macy int i, j, ii, jj; 1739eda14cbcSMatt Macy uint8_t log; 1740eda14cbcSMatt Macy 1741eda14cbcSMatt Macy /* 1742eda14cbcSMatt Macy * Assert that the first nmissing entries from the array of used 1743eda14cbcSMatt Macy * columns correspond to parity columns and that subsequent entries 1744eda14cbcSMatt Macy * correspond to data columns. 1745eda14cbcSMatt Macy */ 1746eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) { 17477877fdebSMatt Macy ASSERT3S(used[i], <, rr->rr_firstdatacol); 1748eda14cbcSMatt Macy } 1749eda14cbcSMatt Macy for (; i < n; i++) { 17507877fdebSMatt Macy ASSERT3S(used[i], >=, rr->rr_firstdatacol); 1751eda14cbcSMatt Macy } 1752eda14cbcSMatt Macy 1753eda14cbcSMatt Macy /* 1754eda14cbcSMatt Macy * First initialize the storage where we'll compute the inverse rows. 1755eda14cbcSMatt Macy */ 1756eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) { 1757eda14cbcSMatt Macy for (j = 0; j < n; j++) { 1758eda14cbcSMatt Macy invrows[i][j] = (i == j) ? 1 : 0; 1759eda14cbcSMatt Macy } 1760eda14cbcSMatt Macy } 1761eda14cbcSMatt Macy 1762eda14cbcSMatt Macy /* 1763eda14cbcSMatt Macy * Subtract all trivial rows from the rows of consequence. 1764eda14cbcSMatt Macy */ 1765eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) { 1766eda14cbcSMatt Macy for (j = nmissing; j < n; j++) { 17677877fdebSMatt Macy ASSERT3U(used[j], >=, rr->rr_firstdatacol); 17687877fdebSMatt Macy jj = used[j] - rr->rr_firstdatacol; 1769eda14cbcSMatt Macy ASSERT3S(jj, <, n); 1770eda14cbcSMatt Macy invrows[i][j] = rows[i][jj]; 1771eda14cbcSMatt Macy rows[i][jj] = 0; 1772eda14cbcSMatt Macy } 1773eda14cbcSMatt Macy } 1774eda14cbcSMatt Macy 1775eda14cbcSMatt Macy /* 1776eda14cbcSMatt Macy * For each of the rows of interest, we must normalize it and subtract 1777eda14cbcSMatt Macy * a multiple of it from the other rows. 1778eda14cbcSMatt Macy */ 1779eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) { 1780eda14cbcSMatt Macy for (j = 0; j < missing[i]; j++) { 1781eda14cbcSMatt Macy ASSERT0(rows[i][j]); 1782eda14cbcSMatt Macy } 1783eda14cbcSMatt Macy ASSERT3U(rows[i][missing[i]], !=, 0); 1784eda14cbcSMatt Macy 1785eda14cbcSMatt Macy /* 1786eda14cbcSMatt Macy * Compute the inverse of the first element and multiply each 1787eda14cbcSMatt Macy * element in the row by that value. 
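 * In this field the multiplicative inverse of a nonzero element a
 * is 2^(255 - log2(a)), so 'log' below holds the inverse of the
 * pivot in log form; vdev_raidz_exp2() then multiplies every
 * element of the row (and of the inverse row) by it, making the
 * pivot 1.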
1788eda14cbcSMatt Macy */ 1789eda14cbcSMatt Macy log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 1790eda14cbcSMatt Macy 1791eda14cbcSMatt Macy for (j = 0; j < n; j++) { 1792eda14cbcSMatt Macy rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 1793eda14cbcSMatt Macy invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 1794eda14cbcSMatt Macy } 1795eda14cbcSMatt Macy 1796eda14cbcSMatt Macy for (ii = 0; ii < nmissing; ii++) { 1797eda14cbcSMatt Macy if (i == ii) 1798eda14cbcSMatt Macy continue; 1799eda14cbcSMatt Macy 1800eda14cbcSMatt Macy ASSERT3U(rows[ii][missing[i]], !=, 0); 1801eda14cbcSMatt Macy 1802eda14cbcSMatt Macy log = vdev_raidz_log2[rows[ii][missing[i]]]; 1803eda14cbcSMatt Macy 1804eda14cbcSMatt Macy for (j = 0; j < n; j++) { 1805eda14cbcSMatt Macy rows[ii][j] ^= 1806eda14cbcSMatt Macy vdev_raidz_exp2(rows[i][j], log); 1807eda14cbcSMatt Macy invrows[ii][j] ^= 1808eda14cbcSMatt Macy vdev_raidz_exp2(invrows[i][j], log); 1809eda14cbcSMatt Macy } 1810eda14cbcSMatt Macy } 1811eda14cbcSMatt Macy } 1812eda14cbcSMatt Macy 1813eda14cbcSMatt Macy /* 1814eda14cbcSMatt Macy * Verify that the data that is left in the rows are properly part of 1815eda14cbcSMatt Macy * an identity matrix. 1816eda14cbcSMatt Macy */ 1817eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) { 1818eda14cbcSMatt Macy for (j = 0; j < n; j++) { 1819eda14cbcSMatt Macy if (j == missing[i]) { 1820eda14cbcSMatt Macy ASSERT3U(rows[i][j], ==, 1); 1821eda14cbcSMatt Macy } else { 1822eda14cbcSMatt Macy ASSERT0(rows[i][j]); 1823eda14cbcSMatt Macy } 1824eda14cbcSMatt Macy } 1825eda14cbcSMatt Macy } 1826eda14cbcSMatt Macy } 1827eda14cbcSMatt Macy 1828eda14cbcSMatt Macy static void 18297877fdebSMatt Macy vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing, 1830eda14cbcSMatt Macy int *missing, uint8_t **invrows, const uint8_t *used) 1831eda14cbcSMatt Macy { 1832eda14cbcSMatt Macy int i, j, x, cc, c; 1833eda14cbcSMatt Macy uint8_t *src; 1834eda14cbcSMatt Macy uint64_t ccount; 1835eda14cbcSMatt Macy uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL }; 1836eda14cbcSMatt Macy uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 }; 1837eda14cbcSMatt Macy uint8_t log = 0; 1838eda14cbcSMatt Macy uint8_t val; 1839eda14cbcSMatt Macy int ll; 1840eda14cbcSMatt Macy uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 1841eda14cbcSMatt Macy uint8_t *p, *pp; 1842eda14cbcSMatt Macy size_t psize; 1843eda14cbcSMatt Macy 1844eda14cbcSMatt Macy psize = sizeof (invlog[0][0]) * n * nmissing; 1845eda14cbcSMatt Macy p = kmem_alloc(psize, KM_SLEEP); 1846eda14cbcSMatt Macy 1847eda14cbcSMatt Macy for (pp = p, i = 0; i < nmissing; i++) { 1848eda14cbcSMatt Macy invlog[i] = pp; 1849eda14cbcSMatt Macy pp += n; 1850eda14cbcSMatt Macy } 1851eda14cbcSMatt Macy 1852eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) { 1853eda14cbcSMatt Macy for (j = 0; j < n; j++) { 1854eda14cbcSMatt Macy ASSERT3U(invrows[i][j], !=, 0); 1855eda14cbcSMatt Macy invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 1856eda14cbcSMatt Macy } 1857eda14cbcSMatt Macy } 1858eda14cbcSMatt Macy 1859eda14cbcSMatt Macy for (i = 0; i < n; i++) { 1860eda14cbcSMatt Macy c = used[i]; 18617877fdebSMatt Macy ASSERT3U(c, <, rr->rr_cols); 1862eda14cbcSMatt Macy 18637877fdebSMatt Macy ccount = rr->rr_col[c].rc_size; 18647877fdebSMatt Macy ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0); 18657877fdebSMatt Macy if (ccount == 0) 18667877fdebSMatt Macy continue; 18677877fdebSMatt Macy src = abd_to_buf(rr->rr_col[c].rc_abd); 1868eda14cbcSMatt Macy for (j = 0; j < nmissing; j++) { 18697877fdebSMatt Macy cc = missing[j] + 
rr->rr_firstdatacol; 18707877fdebSMatt Macy ASSERT3U(cc, >=, rr->rr_firstdatacol); 18717877fdebSMatt Macy ASSERT3U(cc, <, rr->rr_cols); 1872eda14cbcSMatt Macy ASSERT3U(cc, !=, c); 1873eda14cbcSMatt Macy 18747877fdebSMatt Macy dcount[j] = rr->rr_col[cc].rc_size; 18757877fdebSMatt Macy if (dcount[j] != 0) 18767877fdebSMatt Macy dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd); 1877eda14cbcSMatt Macy } 1878eda14cbcSMatt Macy 1879eda14cbcSMatt Macy for (x = 0; x < ccount; x++, src++) { 1880eda14cbcSMatt Macy if (*src != 0) 1881eda14cbcSMatt Macy log = vdev_raidz_log2[*src]; 1882eda14cbcSMatt Macy 1883eda14cbcSMatt Macy for (cc = 0; cc < nmissing; cc++) { 1884eda14cbcSMatt Macy if (x >= dcount[cc]) 1885eda14cbcSMatt Macy continue; 1886eda14cbcSMatt Macy 1887eda14cbcSMatt Macy if (*src == 0) { 1888eda14cbcSMatt Macy val = 0; 1889eda14cbcSMatt Macy } else { 1890eda14cbcSMatt Macy if ((ll = log + invlog[cc][i]) >= 255) 1891eda14cbcSMatt Macy ll -= 255; 1892eda14cbcSMatt Macy val = vdev_raidz_pow2[ll]; 1893eda14cbcSMatt Macy } 1894eda14cbcSMatt Macy 1895eda14cbcSMatt Macy if (i == 0) 1896eda14cbcSMatt Macy dst[cc][x] = val; 1897eda14cbcSMatt Macy else 1898eda14cbcSMatt Macy dst[cc][x] ^= val; 1899eda14cbcSMatt Macy } 1900eda14cbcSMatt Macy } 1901eda14cbcSMatt Macy } 1902eda14cbcSMatt Macy 1903eda14cbcSMatt Macy kmem_free(p, psize); 1904eda14cbcSMatt Macy } 1905eda14cbcSMatt Macy 1906f9693befSMartin Matuska static void 19077877fdebSMatt Macy vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) 1908eda14cbcSMatt Macy { 1909b985c9caSMartin Matuska int i, c, t, tt; 1910b985c9caSMartin Matuska unsigned int n; 1911b985c9caSMartin Matuska unsigned int nmissing_rows; 1912eda14cbcSMatt Macy int missing_rows[VDEV_RAIDZ_MAXPARITY]; 1913eda14cbcSMatt Macy int parity_map[VDEV_RAIDZ_MAXPARITY]; 1914eda14cbcSMatt Macy uint8_t *p, *pp; 1915eda14cbcSMatt Macy size_t psize; 1916eda14cbcSMatt Macy uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 1917eda14cbcSMatt Macy uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 1918eda14cbcSMatt Macy uint8_t *used; 1919eda14cbcSMatt Macy 1920eda14cbcSMatt Macy abd_t **bufs = NULL; 1921eda14cbcSMatt Macy 1922e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 1923e716630dSMartin Matuska zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts); 1924eda14cbcSMatt Macy /* 1925eda14cbcSMatt Macy * Matrix reconstruction can't use scatter ABDs yet, so we allocate 19267877fdebSMatt Macy * temporary linear ABDs if any non-linear ABDs are found. 
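 * Only the data columns are copied here; the temporary copies are
 * copied back into the original ABDs and freed at the bottom of
 * this function, once reconstruction is complete.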
1927eda14cbcSMatt Macy */ 19287877fdebSMatt Macy for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) { 1929e716630dSMartin Matuska ASSERT(rr->rr_col[i].rc_abd != NULL); 19307877fdebSMatt Macy if (!abd_is_linear(rr->rr_col[i].rc_abd)) { 19317877fdebSMatt Macy bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *), 19327877fdebSMatt Macy KM_PUSHPAGE); 1933eda14cbcSMatt Macy 19347877fdebSMatt Macy for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 19357877fdebSMatt Macy raidz_col_t *col = &rr->rr_col[c]; 1936eda14cbcSMatt Macy 1937eda14cbcSMatt Macy bufs[c] = col->rc_abd; 19387877fdebSMatt Macy if (bufs[c] != NULL) { 19397877fdebSMatt Macy col->rc_abd = abd_alloc_linear( 19407877fdebSMatt Macy col->rc_size, B_TRUE); 19417877fdebSMatt Macy abd_copy(col->rc_abd, bufs[c], 19427877fdebSMatt Macy col->rc_size); 1943eda14cbcSMatt Macy } 1944eda14cbcSMatt Macy } 1945eda14cbcSMatt Macy 19467877fdebSMatt Macy break; 19477877fdebSMatt Macy } 19487877fdebSMatt Macy } 19497877fdebSMatt Macy 19507877fdebSMatt Macy n = rr->rr_cols - rr->rr_firstdatacol; 1951eda14cbcSMatt Macy 1952eda14cbcSMatt Macy /* 1953eda14cbcSMatt Macy * Figure out which data columns are missing. 1954eda14cbcSMatt Macy */ 1955eda14cbcSMatt Macy nmissing_rows = 0; 1956eda14cbcSMatt Macy for (t = 0; t < ntgts; t++) { 19577877fdebSMatt Macy if (tgts[t] >= rr->rr_firstdatacol) { 1958eda14cbcSMatt Macy missing_rows[nmissing_rows++] = 19597877fdebSMatt Macy tgts[t] - rr->rr_firstdatacol; 1960eda14cbcSMatt Macy } 1961eda14cbcSMatt Macy } 1962eda14cbcSMatt Macy 1963eda14cbcSMatt Macy /* 1964eda14cbcSMatt Macy * Figure out which parity columns to use to help generate the missing 1965eda14cbcSMatt Macy * data columns. 1966eda14cbcSMatt Macy */ 1967eda14cbcSMatt Macy for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 1968eda14cbcSMatt Macy ASSERT(tt < ntgts); 19697877fdebSMatt Macy ASSERT(c < rr->rr_firstdatacol); 1970eda14cbcSMatt Macy 1971eda14cbcSMatt Macy /* 1972eda14cbcSMatt Macy * Skip any targeted parity columns. 1973eda14cbcSMatt Macy */ 1974eda14cbcSMatt Macy if (c == tgts[tt]) { 1975eda14cbcSMatt Macy tt++; 1976eda14cbcSMatt Macy continue; 1977eda14cbcSMatt Macy } 1978eda14cbcSMatt Macy 1979eda14cbcSMatt Macy parity_map[i] = c; 1980eda14cbcSMatt Macy i++; 1981eda14cbcSMatt Macy } 1982eda14cbcSMatt Macy 1983eda14cbcSMatt Macy psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1984eda14cbcSMatt Macy nmissing_rows * n + sizeof (used[0]) * n; 1985eda14cbcSMatt Macy p = kmem_alloc(psize, KM_SLEEP); 1986eda14cbcSMatt Macy 1987eda14cbcSMatt Macy for (pp = p, i = 0; i < nmissing_rows; i++) { 1988eda14cbcSMatt Macy rows[i] = pp; 1989eda14cbcSMatt Macy pp += n; 1990eda14cbcSMatt Macy invrows[i] = pp; 1991eda14cbcSMatt Macy pp += n; 1992eda14cbcSMatt Macy } 1993eda14cbcSMatt Macy used = pp; 1994eda14cbcSMatt Macy 1995eda14cbcSMatt Macy for (i = 0; i < nmissing_rows; i++) { 1996eda14cbcSMatt Macy used[i] = parity_map[i]; 1997eda14cbcSMatt Macy } 1998eda14cbcSMatt Macy 19997877fdebSMatt Macy for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 2000eda14cbcSMatt Macy if (tt < nmissing_rows && 20017877fdebSMatt Macy c == missing_rows[tt] + rr->rr_firstdatacol) { 2002eda14cbcSMatt Macy tt++; 2003eda14cbcSMatt Macy continue; 2004eda14cbcSMatt Macy } 2005eda14cbcSMatt Macy 2006eda14cbcSMatt Macy ASSERT3S(i, <, n); 2007eda14cbcSMatt Macy used[i] = c; 2008eda14cbcSMatt Macy i++; 2009eda14cbcSMatt Macy } 2010eda14cbcSMatt Macy 2011eda14cbcSMatt Macy /* 2012eda14cbcSMatt Macy * Initialize the interesting rows of the matrix. 
2013eda14cbcSMatt Macy */ 20147877fdebSMatt Macy vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows); 2015eda14cbcSMatt Macy 2016eda14cbcSMatt Macy /* 2017eda14cbcSMatt Macy * Invert the matrix. 2018eda14cbcSMatt Macy */ 20197877fdebSMatt Macy vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows, 2020eda14cbcSMatt Macy invrows, used); 2021eda14cbcSMatt Macy 2022eda14cbcSMatt Macy /* 2023eda14cbcSMatt Macy * Reconstruct the missing data using the generated matrix. 2024eda14cbcSMatt Macy */ 20257877fdebSMatt Macy vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows, 2026eda14cbcSMatt Macy invrows, used); 2027eda14cbcSMatt Macy 2028eda14cbcSMatt Macy kmem_free(p, psize); 2029eda14cbcSMatt Macy 2030eda14cbcSMatt Macy /* 2031eda14cbcSMatt Macy * copy back from temporary linear abds and free them 2032eda14cbcSMatt Macy */ 2033eda14cbcSMatt Macy if (bufs) { 20347877fdebSMatt Macy for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { 20357877fdebSMatt Macy raidz_col_t *col = &rr->rr_col[c]; 2036eda14cbcSMatt Macy 20377877fdebSMatt Macy if (bufs[c] != NULL) { 2038eda14cbcSMatt Macy abd_copy(bufs[c], col->rc_abd, col->rc_size); 2039eda14cbcSMatt Macy abd_free(col->rc_abd); 20407877fdebSMatt Macy } 2041eda14cbcSMatt Macy col->rc_abd = bufs[c]; 2042eda14cbcSMatt Macy } 20437877fdebSMatt Macy kmem_free(bufs, rr->rr_cols * sizeof (abd_t *)); 2044eda14cbcSMatt Macy } 2045eda14cbcSMatt Macy } 2046eda14cbcSMatt Macy 2047f9693befSMartin Matuska static void 20487877fdebSMatt Macy vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr, 20497877fdebSMatt Macy const int *t, int nt) 2050eda14cbcSMatt Macy { 2051eda14cbcSMatt Macy int tgts[VDEV_RAIDZ_MAXPARITY], *dt; 2052eda14cbcSMatt Macy int ntgts; 2053eda14cbcSMatt Macy int i, c, ret; 2054eda14cbcSMatt Macy int nbadparity, nbaddata; 2055eda14cbcSMatt Macy int parity_valid[VDEV_RAIDZ_MAXPARITY]; 2056eda14cbcSMatt Macy 2057e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { 2058e716630dSMartin Matuska zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)", 2059e716630dSMartin Matuska rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata, 2060e716630dSMartin Matuska (int)rr->rr_missingparity); 2061e716630dSMartin Matuska } 2062e716630dSMartin Matuska 20637877fdebSMatt Macy nbadparity = rr->rr_firstdatacol; 20647877fdebSMatt Macy nbaddata = rr->rr_cols - nbadparity; 2065eda14cbcSMatt Macy ntgts = 0; 20667877fdebSMatt Macy for (i = 0, c = 0; c < rr->rr_cols; c++) { 2067e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { 2068e716630dSMartin Matuska zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u " 2069e716630dSMartin Matuska "offset=%llx error=%u)", 2070e716630dSMartin Matuska rr, c, (int)rr->rr_col[c].rc_devidx, 2071e716630dSMartin Matuska (long long)rr->rr_col[c].rc_offset, 2072e716630dSMartin Matuska (int)rr->rr_col[c].rc_error); 2073e716630dSMartin Matuska } 20747877fdebSMatt Macy if (c < rr->rr_firstdatacol) 2075eda14cbcSMatt Macy parity_valid[c] = B_FALSE; 2076eda14cbcSMatt Macy 2077eda14cbcSMatt Macy if (i < nt && c == t[i]) { 2078eda14cbcSMatt Macy tgts[ntgts++] = c; 2079eda14cbcSMatt Macy i++; 20807877fdebSMatt Macy } else if (rr->rr_col[c].rc_error != 0) { 2081eda14cbcSMatt Macy tgts[ntgts++] = c; 20827877fdebSMatt Macy } else if (c >= rr->rr_firstdatacol) { 2083eda14cbcSMatt Macy nbaddata--; 2084eda14cbcSMatt Macy } else { 2085eda14cbcSMatt Macy parity_valid[c] = B_TRUE; 2086eda14cbcSMatt Macy nbadparity--; 2087eda14cbcSMatt Macy } 2088eda14cbcSMatt Macy } 2089eda14cbcSMatt Macy 
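	/*
	 * At this point tgts[] holds every column that needs to be
	 * reconstructed: the caller's targets plus any column that
	 * reported an error, with parity columns ahead of data columns.
	 * dt points at the data portion, which is what the optimized
	 * reconstruction routines below operate on.
	 */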
2090eda14cbcSMatt Macy ASSERT(ntgts >= nt); 2091eda14cbcSMatt Macy ASSERT(nbaddata >= 0); 2092eda14cbcSMatt Macy ASSERT(nbaddata + nbadparity == ntgts); 2093eda14cbcSMatt Macy 2094eda14cbcSMatt Macy dt = &tgts[nbadparity]; 2095eda14cbcSMatt Macy 2096eda14cbcSMatt Macy /* Reconstruct using the new math implementation */ 20977877fdebSMatt Macy ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata); 2098eda14cbcSMatt Macy if (ret != RAIDZ_ORIGINAL_IMPL) 2099f9693befSMartin Matuska return; 2100eda14cbcSMatt Macy 2101eda14cbcSMatt Macy /* 2102eda14cbcSMatt Macy * See if we can use any of our optimized reconstruction routines. 2103eda14cbcSMatt Macy */ 2104eda14cbcSMatt Macy switch (nbaddata) { 2105eda14cbcSMatt Macy case 1: 2106f9693befSMartin Matuska if (parity_valid[VDEV_RAIDZ_P]) { 2107f9693befSMartin Matuska vdev_raidz_reconstruct_p(rr, dt, 1); 2108f9693befSMartin Matuska return; 2109f9693befSMartin Matuska } 2110eda14cbcSMatt Macy 21117877fdebSMatt Macy ASSERT(rr->rr_firstdatacol > 1); 2112eda14cbcSMatt Macy 2113f9693befSMartin Matuska if (parity_valid[VDEV_RAIDZ_Q]) { 2114f9693befSMartin Matuska vdev_raidz_reconstruct_q(rr, dt, 1); 2115f9693befSMartin Matuska return; 2116f9693befSMartin Matuska } 2117eda14cbcSMatt Macy 21187877fdebSMatt Macy ASSERT(rr->rr_firstdatacol > 2); 2119eda14cbcSMatt Macy break; 2120eda14cbcSMatt Macy 2121eda14cbcSMatt Macy case 2: 21227877fdebSMatt Macy ASSERT(rr->rr_firstdatacol > 1); 2123eda14cbcSMatt Macy 2124eda14cbcSMatt Macy if (parity_valid[VDEV_RAIDZ_P] && 2125f9693befSMartin Matuska parity_valid[VDEV_RAIDZ_Q]) { 2126f9693befSMartin Matuska vdev_raidz_reconstruct_pq(rr, dt, 2); 2127f9693befSMartin Matuska return; 2128f9693befSMartin Matuska } 2129eda14cbcSMatt Macy 21307877fdebSMatt Macy ASSERT(rr->rr_firstdatacol > 2); 2131eda14cbcSMatt Macy 2132eda14cbcSMatt Macy break; 2133eda14cbcSMatt Macy } 2134eda14cbcSMatt Macy 2135f9693befSMartin Matuska vdev_raidz_reconstruct_general(rr, tgts, ntgts); 2136eda14cbcSMatt Macy } 2137eda14cbcSMatt Macy 2138eda14cbcSMatt Macy static int 2139eda14cbcSMatt Macy vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, 2140eda14cbcSMatt Macy uint64_t *logical_ashift, uint64_t *physical_ashift) 2141eda14cbcSMatt Macy { 21427877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd; 21437877fdebSMatt Macy uint64_t nparity = vdrz->vd_nparity; 2144eda14cbcSMatt Macy int c; 2145eda14cbcSMatt Macy int lasterror = 0; 2146eda14cbcSMatt Macy int numerrors = 0; 2147eda14cbcSMatt Macy 2148eda14cbcSMatt Macy ASSERT(nparity > 0); 2149eda14cbcSMatt Macy 2150eda14cbcSMatt Macy if (nparity > VDEV_RAIDZ_MAXPARITY || 2151eda14cbcSMatt Macy vd->vdev_children < nparity + 1) { 2152eda14cbcSMatt Macy vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 2153eda14cbcSMatt Macy return (SET_ERROR(EINVAL)); 2154eda14cbcSMatt Macy } 2155eda14cbcSMatt Macy 2156eda14cbcSMatt Macy vdev_open_children(vd); 2157eda14cbcSMatt Macy 2158eda14cbcSMatt Macy for (c = 0; c < vd->vdev_children; c++) { 21597877fdebSMatt Macy vdev_t *cvd = vd->vdev_child[c]; 2160eda14cbcSMatt Macy 2161eda14cbcSMatt Macy if (cvd->vdev_open_error != 0) { 2162eda14cbcSMatt Macy lasterror = cvd->vdev_open_error; 2163eda14cbcSMatt Macy numerrors++; 2164eda14cbcSMatt Macy continue; 2165eda14cbcSMatt Macy } 2166eda14cbcSMatt Macy 2167eda14cbcSMatt Macy *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 2168eda14cbcSMatt Macy *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; 2169eda14cbcSMatt Macy *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); 
2170c7046f76SMartin Matuska } 2171c7046f76SMartin Matuska for (c = 0; c < vd->vdev_children; c++) { 2172c7046f76SMartin Matuska vdev_t *cvd = vd->vdev_child[c]; 2173c7046f76SMartin Matuska 2174c7046f76SMartin Matuska if (cvd->vdev_open_error != 0) 2175c7046f76SMartin Matuska continue; 2176c7046f76SMartin Matuska *physical_ashift = vdev_best_ashift(*logical_ashift, 2177c7046f76SMartin Matuska *physical_ashift, cvd->vdev_physical_ashift); 2178eda14cbcSMatt Macy } 2179eda14cbcSMatt Macy 2180e716630dSMartin Matuska if (vd->vdev_rz_expanding) { 2181e716630dSMartin Matuska *asize *= vd->vdev_children - 1; 2182e716630dSMartin Matuska *max_asize *= vd->vdev_children - 1; 2183e716630dSMartin Matuska 2184e716630dSMartin Matuska vd->vdev_min_asize = *asize; 2185e716630dSMartin Matuska } else { 2186eda14cbcSMatt Macy *asize *= vd->vdev_children; 2187eda14cbcSMatt Macy *max_asize *= vd->vdev_children; 2188e716630dSMartin Matuska } 2189eda14cbcSMatt Macy 2190eda14cbcSMatt Macy if (numerrors > nparity) { 2191eda14cbcSMatt Macy vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 2192eda14cbcSMatt Macy return (lasterror); 2193eda14cbcSMatt Macy } 2194eda14cbcSMatt Macy 2195eda14cbcSMatt Macy return (0); 2196eda14cbcSMatt Macy } 2197eda14cbcSMatt Macy 2198eda14cbcSMatt Macy static void 2199eda14cbcSMatt Macy vdev_raidz_close(vdev_t *vd) 2200eda14cbcSMatt Macy { 22017877fdebSMatt Macy for (int c = 0; c < vd->vdev_children; c++) { 22027877fdebSMatt Macy if (vd->vdev_child[c] != NULL) 2203eda14cbcSMatt Macy vdev_close(vd->vdev_child[c]); 2204eda14cbcSMatt Macy } 22057877fdebSMatt Macy } 2206eda14cbcSMatt Macy 2207e716630dSMartin Matuska /* 2208e716630dSMartin Matuska * Return the logical width to use, given the txg in which the allocation 2209783d3ff6SMartin Matuska * happened. Note that BP_GET_BIRTH() is usually the txg in which the 2210e716630dSMartin Matuska * BP was allocated. Remapped BP's (that were relocated due to device 2211783d3ff6SMartin Matuska * removal, see remap_blkptr_cb()), will have a more recent physical birth 2212783d3ff6SMartin Matuska * which reflects when the BP was relocated, but we can ignore these because 2213783d3ff6SMartin Matuska * they can't be on RAIDZ (device removal doesn't support RAIDZ). 
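 *
 * The expansion history lives in vd_expand_txgs as (txg, logical
 * width) pairs. The lookup below returns the width recorded by the
 * most recent entry whose txg is <= the allocation txg (via
 * avl_nearest(..., AVL_BEFORE)), falling back to vd_original_width
 * for blocks that predate every recorded expansion. For example
 * (hypothetical txgs, for illustration only): with entries
 * {100 -> 5, 200 -> 6}, a block born in txg 150 is treated as
 * having been written with a logical width of 5.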
2214e716630dSMartin Matuska */ 2215eda14cbcSMatt Macy static uint64_t 2216e716630dSMartin Matuska vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg) 2217e716630dSMartin Matuska { 2218e716630dSMartin Matuska reflow_node_t lookup = { 2219e716630dSMartin Matuska .re_txg = txg, 2220e716630dSMartin Matuska }; 2221e716630dSMartin Matuska avl_index_t where; 2222e716630dSMartin Matuska 2223e716630dSMartin Matuska uint64_t width; 2224e716630dSMartin Matuska mutex_enter(&vdrz->vd_expand_lock); 2225e716630dSMartin Matuska reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where); 2226e716630dSMartin Matuska if (re != NULL) { 2227e716630dSMartin Matuska width = re->re_logical_width; 2228e716630dSMartin Matuska } else { 2229e716630dSMartin Matuska re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE); 2230e716630dSMartin Matuska if (re != NULL) 2231e716630dSMartin Matuska width = re->re_logical_width; 2232e716630dSMartin Matuska else 2233e716630dSMartin Matuska width = vdrz->vd_original_width; 2234e716630dSMartin Matuska } 2235e716630dSMartin Matuska mutex_exit(&vdrz->vd_expand_lock); 2236e716630dSMartin Matuska return (width); 2237e716630dSMartin Matuska } 2238e716630dSMartin Matuska 2239e716630dSMartin Matuska /* 2240e716630dSMartin Matuska * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated 2241e716630dSMartin Matuska * more space due to the lower data-to-parity ratio. In this case it's 2242e716630dSMartin Matuska * important to pass in the correct txg. Note that vdev_gang_header_asize() 2243e716630dSMartin Matuska * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE, 2244e716630dSMartin Matuska * regardless of txg. This is assured because for a single data sector, we 2245e716630dSMartin Matuska * allocate P+1 sectors regardless of width ("cols", which is at least P+1). 2246e716630dSMartin Matuska */ 2247e716630dSMartin Matuska static uint64_t 2248e716630dSMartin Matuska vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg) 2249eda14cbcSMatt Macy { 22507877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd; 2251eda14cbcSMatt Macy uint64_t asize; 2252eda14cbcSMatt Macy uint64_t ashift = vd->vdev_top->vdev_ashift; 2253e716630dSMartin Matuska uint64_t cols = vdrz->vd_original_width; 22547877fdebSMatt Macy uint64_t nparity = vdrz->vd_nparity; 2255eda14cbcSMatt Macy 2256e716630dSMartin Matuska cols = vdev_raidz_get_logical_width(vdrz, txg); 2257e716630dSMartin Matuska 2258eda14cbcSMatt Macy asize = ((psize - 1) >> ashift) + 1; 2259eda14cbcSMatt Macy asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 2260eda14cbcSMatt Macy asize = roundup(asize, nparity + 1) << ashift; 2261eda14cbcSMatt Macy 2262e716630dSMartin Matuska #ifdef ZFS_DEBUG 2263e716630dSMartin Matuska uint64_t asize_new = ((psize - 1) >> ashift) + 1; 2264e716630dSMartin Matuska uint64_t ncols_new = vdrz->vd_physical_width; 2265e716630dSMartin Matuska asize_new += nparity * ((asize_new + ncols_new - nparity - 1) / 2266e716630dSMartin Matuska (ncols_new - nparity)); 2267e716630dSMartin Matuska asize_new = roundup(asize_new, nparity + 1) << ashift; 2268e716630dSMartin Matuska VERIFY3U(asize_new, <=, asize); 2269e716630dSMartin Matuska #endif 2270e716630dSMartin Matuska 2271eda14cbcSMatt Macy return (asize); 2272eda14cbcSMatt Macy } 2273eda14cbcSMatt Macy 22747877fdebSMatt Macy /* 22757877fdebSMatt Macy * The allocatable space for a raidz vdev is N * sizeof(smallest child) 22767877fdebSMatt Macy * so each child must provide at least 1/Nth of its asize. 
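 * For example (sizes are illustrative): with 6 children and a
 * vdev_min_asize of 6001 units, each child must provide
 * ceil(6001 / 6) = 1001 units, which is what the rounded-up division
 * below computes.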
22777877fdebSMatt Macy */ 22787877fdebSMatt Macy static uint64_t 22797877fdebSMatt Macy vdev_raidz_min_asize(vdev_t *vd) 22807877fdebSMatt Macy { 22817877fdebSMatt Macy return ((vd->vdev_min_asize + vd->vdev_children - 1) / 22827877fdebSMatt Macy vd->vdev_children); 22837877fdebSMatt Macy } 22847877fdebSMatt Macy 22857877fdebSMatt Macy void 2286eda14cbcSMatt Macy vdev_raidz_child_done(zio_t *zio) 2287eda14cbcSMatt Macy { 2288eda14cbcSMatt Macy raidz_col_t *rc = zio->io_private; 2289eda14cbcSMatt Macy 229081b22a98SMartin Matuska ASSERT3P(rc->rc_abd, !=, NULL); 2291eda14cbcSMatt Macy rc->rc_error = zio->io_error; 2292eda14cbcSMatt Macy rc->rc_tried = 1; 2293eda14cbcSMatt Macy rc->rc_skipped = 0; 2294eda14cbcSMatt Macy } 2295eda14cbcSMatt Macy 2296eda14cbcSMatt Macy static void 2297e716630dSMartin Matuska vdev_raidz_shadow_child_done(zio_t *zio) 2298eda14cbcSMatt Macy { 2299e716630dSMartin Matuska raidz_col_t *rc = zio->io_private; 2300eda14cbcSMatt Macy 2301e716630dSMartin Matuska rc->rc_shadow_error = zio->io_error; 2302e716630dSMartin Matuska } 2303e716630dSMartin Matuska 2304e716630dSMartin Matuska static void 2305e716630dSMartin Matuska vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col) 2306e716630dSMartin Matuska { 2307e716630dSMartin Matuska (void) rm; 2308e716630dSMartin Matuska #ifdef ZFS_DEBUG 2309b59a0cdeSMartin Matuska zfs_range_seg64_t logical_rs, physical_rs, remain_rs; 23107877fdebSMatt Macy logical_rs.rs_start = rr->rr_offset; 2311eda14cbcSMatt Macy logical_rs.rs_end = logical_rs.rs_start + 2312e716630dSMartin Matuska vdev_raidz_asize(zio->io_vd, rr->rr_size, 2313783d3ff6SMartin Matuska BP_GET_BIRTH(zio->io_bp)); 2314eda14cbcSMatt Macy 23157877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[col]; 2316e716630dSMartin Matuska vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; 2317eda14cbcSMatt Macy 23187877fdebSMatt Macy vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs); 23197877fdebSMatt Macy ASSERT(vdev_xlate_is_empty(&remain_rs)); 2320e716630dSMartin Matuska if (vdev_xlate_is_empty(&physical_rs)) { 2321e716630dSMartin Matuska /* 2322e716630dSMartin Matuska * If we are in the middle of expansion, the 2323e716630dSMartin Matuska * physical->logical mapping is changing so vdev_xlate() 2324e716630dSMartin Matuska * can't give us a reliable answer. 2325e716630dSMartin Matuska */ 2326e716630dSMartin Matuska return; 2327e716630dSMartin Matuska } 2328eda14cbcSMatt Macy ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); 2329eda14cbcSMatt Macy ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); 2330eda14cbcSMatt Macy /* 2331eda14cbcSMatt Macy * It would be nice to assert that rs_end is equal 2332eda14cbcSMatt Macy * to rc_offset + rc_size but there might be an 2333eda14cbcSMatt Macy * optional I/O at the end that is not accounted in 2334eda14cbcSMatt Macy * rc_size. 
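 * (When that optional I/O is present, it is a single skip sector of
 * 1 << ashift bytes, which is what the assertion below checks.)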
2335eda14cbcSMatt Macy */ 2336eda14cbcSMatt Macy if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { 2337eda14cbcSMatt Macy ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + 2338e716630dSMartin Matuska rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift)); 2339eda14cbcSMatt Macy } else { 2340eda14cbcSMatt Macy ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); 2341eda14cbcSMatt Macy } 2342eda14cbcSMatt Macy #endif 2343eda14cbcSMatt Macy } 2344eda14cbcSMatt Macy 23457877fdebSMatt Macy static void 2346e716630dSMartin Matuska vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr) 23477877fdebSMatt Macy { 23487877fdebSMatt Macy vdev_t *vd = zio->io_vd; 23497877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd; 23507877fdebSMatt Macy 23517877fdebSMatt Macy vdev_raidz_generate_parity_row(rm, rr); 23527877fdebSMatt Macy 235381b22a98SMartin Matuska for (int c = 0; c < rr->rr_scols; c++) { 23547877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 235581b22a98SMartin Matuska vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 23567877fdebSMatt Macy 23577877fdebSMatt Macy /* Verify physical to logical translation */ 2358e716630dSMartin Matuska vdev_raidz_io_verify(zio, rm, rr, c); 23597877fdebSMatt Macy 2360e716630dSMartin Matuska if (rc->rc_size == 0) 2361e716630dSMartin Matuska continue; 2362e716630dSMartin Matuska 2363e716630dSMartin Matuska ASSERT3U(rc->rc_offset + rc->rc_size, <, 2364e716630dSMartin Matuska cvd->vdev_psize - VDEV_LABEL_END_SIZE); 2365e716630dSMartin Matuska 236681b22a98SMartin Matuska ASSERT3P(rc->rc_abd, !=, NULL); 23677877fdebSMatt Macy zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 236881b22a98SMartin Matuska rc->rc_offset, rc->rc_abd, 236981b22a98SMartin Matuska abd_get_size(rc->rc_abd), zio->io_type, 237081b22a98SMartin Matuska zio->io_priority, 0, vdev_raidz_child_done, rc)); 2371e716630dSMartin Matuska 2372e716630dSMartin Matuska if (rc->rc_shadow_devidx != INT_MAX) { 2373e716630dSMartin Matuska vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx]; 2374e716630dSMartin Matuska 2375e716630dSMartin Matuska ASSERT3U( 2376e716630dSMartin Matuska rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <, 2377e716630dSMartin Matuska cvd2->vdev_psize - VDEV_LABEL_END_SIZE); 2378e716630dSMartin Matuska 2379e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(zio, NULL, cvd2, 2380e716630dSMartin Matuska rc->rc_shadow_offset, rc->rc_abd, 2381e716630dSMartin Matuska abd_get_size(rc->rc_abd), 2382e716630dSMartin Matuska zio->io_type, zio->io_priority, 0, 2383e716630dSMartin Matuska vdev_raidz_shadow_child_done, rc)); 238481b22a98SMartin Matuska } 23857877fdebSMatt Macy } 23867877fdebSMatt Macy } 23877877fdebSMatt Macy 2388e716630dSMartin Matuska /* 2389e716630dSMartin Matuska * Generate optional I/Os for skip sectors to improve aggregation contiguity. 2390e716630dSMartin Matuska * This only works for vdev_raidz_map_alloc() (not _expanded()). 
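 * For example (layout is illustrative): a write that does not fill every
 * column of the row leaves rc_size == 0 "skip" columns behind; issuing a
 * one-sector (1 << ashift) ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL child write
 * at each such offset lets the vdev queue aggregate the real writes on
 * either side into one contiguous I/O.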
2391e716630dSMartin Matuska */ 23927877fdebSMatt Macy static void 2393e716630dSMartin Matuska raidz_start_skip_writes(zio_t *zio) 2394e716630dSMartin Matuska { 2395e716630dSMartin Matuska vdev_t *vd = zio->io_vd; 2396e716630dSMartin Matuska uint64_t ashift = vd->vdev_top->vdev_ashift; 2397e716630dSMartin Matuska raidz_map_t *rm = zio->io_vsd; 2398e716630dSMartin Matuska ASSERT3U(rm->rm_nrows, ==, 1); 2399e716630dSMartin Matuska raidz_row_t *rr = rm->rm_row[0]; 2400e716630dSMartin Matuska for (int c = 0; c < rr->rr_scols; c++) { 2401e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c]; 2402e716630dSMartin Matuska vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 2403e716630dSMartin Matuska if (rc->rc_size != 0) 2404e716630dSMartin Matuska continue; 2405e716630dSMartin Matuska ASSERT3P(rc->rc_abd, ==, NULL); 2406e716630dSMartin Matuska 2407e716630dSMartin Matuska ASSERT3U(rc->rc_offset, <, 2408e716630dSMartin Matuska cvd->vdev_psize - VDEV_LABEL_END_SIZE); 2409e716630dSMartin Matuska 2410e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, 2411e716630dSMartin Matuska NULL, 1ULL << ashift, zio->io_type, zio->io_priority, 2412e716630dSMartin Matuska ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); 2413e716630dSMartin Matuska } 2414e716630dSMartin Matuska } 2415e716630dSMartin Matuska 2416e716630dSMartin Matuska static void 2417e716630dSMartin Matuska vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) 24187877fdebSMatt Macy { 24197877fdebSMatt Macy vdev_t *vd = zio->io_vd; 24207877fdebSMatt Macy 24217877fdebSMatt Macy /* 24227877fdebSMatt Macy * Iterate over the columns in reverse order so that we hit the parity 24237877fdebSMatt Macy * last -- any errors along the way will force us to read the parity. 
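 * For example, if a data child turns out to be unreadable (ENXIO) or its
 * DTL says the data is missing (ESTALE), rr_missingdata is incremented
 * before the loop reaches the parity columns, so the parity reads below
 * are issued as well.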
24247877fdebSMatt Macy */ 24257877fdebSMatt Macy for (int c = rr->rr_cols - 1; c >= 0; c--) { 24267877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 24277877fdebSMatt Macy if (rc->rc_size == 0) 24287877fdebSMatt Macy continue; 24297877fdebSMatt Macy vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 24307877fdebSMatt Macy if (!vdev_readable(cvd)) { 24317877fdebSMatt Macy if (c >= rr->rr_firstdatacol) 24327877fdebSMatt Macy rr->rr_missingdata++; 24337877fdebSMatt Macy else 24347877fdebSMatt Macy rr->rr_missingparity++; 24357877fdebSMatt Macy rc->rc_error = SET_ERROR(ENXIO); 24367877fdebSMatt Macy rc->rc_tried = 1; /* don't even try */ 24377877fdebSMatt Macy rc->rc_skipped = 1; 24387877fdebSMatt Macy continue; 24397877fdebSMatt Macy } 24407877fdebSMatt Macy if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 24417877fdebSMatt Macy if (c >= rr->rr_firstdatacol) 24427877fdebSMatt Macy rr->rr_missingdata++; 24437877fdebSMatt Macy else 24447877fdebSMatt Macy rr->rr_missingparity++; 24457877fdebSMatt Macy rc->rc_error = SET_ERROR(ESTALE); 24467877fdebSMatt Macy rc->rc_skipped = 1; 24477877fdebSMatt Macy continue; 24487877fdebSMatt Macy } 2449e716630dSMartin Matuska if (forceparity || 2450e716630dSMartin Matuska c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || 24517877fdebSMatt Macy (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { 24527877fdebSMatt Macy zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 24537877fdebSMatt Macy rc->rc_offset, rc->rc_abd, rc->rc_size, 24547877fdebSMatt Macy zio->io_type, zio->io_priority, 0, 24557877fdebSMatt Macy vdev_raidz_child_done, rc)); 24567877fdebSMatt Macy } 24577877fdebSMatt Macy } 24587877fdebSMatt Macy } 24597877fdebSMatt Macy 2460e716630dSMartin Matuska static void 2461e716630dSMartin Matuska vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm) 2462e716630dSMartin Matuska { 2463e716630dSMartin Matuska vdev_t *vd = zio->io_vd; 2464e716630dSMartin Matuska 2465e716630dSMartin Matuska for (int i = 0; i < rm->rm_nphys_cols; i++) { 2466e716630dSMartin Matuska raidz_col_t *prc = &rm->rm_phys_col[i]; 2467e716630dSMartin Matuska if (prc->rc_size == 0) 2468e716630dSMartin Matuska continue; 2469e716630dSMartin Matuska 2470e716630dSMartin Matuska ASSERT3U(prc->rc_devidx, ==, i); 2471e716630dSMartin Matuska vdev_t *cvd = vd->vdev_child[i]; 2472e716630dSMartin Matuska if (!vdev_readable(cvd)) { 2473e716630dSMartin Matuska prc->rc_error = SET_ERROR(ENXIO); 2474e716630dSMartin Matuska prc->rc_tried = 1; /* don't even try */ 2475e716630dSMartin Matuska prc->rc_skipped = 1; 2476e716630dSMartin Matuska continue; 2477e716630dSMartin Matuska } 2478e716630dSMartin Matuska if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 2479e716630dSMartin Matuska prc->rc_error = SET_ERROR(ESTALE); 2480e716630dSMartin Matuska prc->rc_skipped = 1; 2481e716630dSMartin Matuska continue; 2482e716630dSMartin Matuska } 2483e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2484e716630dSMartin Matuska prc->rc_offset, prc->rc_abd, prc->rc_size, 2485e716630dSMartin Matuska zio->io_type, zio->io_priority, 0, 2486e716630dSMartin Matuska vdev_raidz_child_done, prc)); 2487e716630dSMartin Matuska } 2488e716630dSMartin Matuska } 2489e716630dSMartin Matuska 2490e716630dSMartin Matuska static void 2491e716630dSMartin Matuska vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm) 2492e716630dSMartin Matuska { 2493e716630dSMartin Matuska /* 2494e716630dSMartin Matuska * If there are multiple rows, we will be hitting 2495e716630dSMartin Matuska * all disks, so go 
ahead and read the parity so 2496e716630dSMartin Matuska * that we are reading in decent size chunks. 2497e716630dSMartin Matuska */ 2498e716630dSMartin Matuska boolean_t forceparity = rm->rm_nrows > 1; 2499e716630dSMartin Matuska 2500e716630dSMartin Matuska if (rm->rm_phys_col) { 2501e716630dSMartin Matuska vdev_raidz_io_start_read_phys_cols(zio, rm); 2502e716630dSMartin Matuska } else { 2503e716630dSMartin Matuska for (int i = 0; i < rm->rm_nrows; i++) { 2504e716630dSMartin Matuska raidz_row_t *rr = rm->rm_row[i]; 2505e716630dSMartin Matuska vdev_raidz_io_start_read_row(zio, rr, forceparity); 2506e716630dSMartin Matuska } 2507e716630dSMartin Matuska } 2508e716630dSMartin Matuska } 2509e716630dSMartin Matuska 2510eda14cbcSMatt Macy /* 2511eda14cbcSMatt Macy * Start an IO operation on a RAIDZ VDev 2512eda14cbcSMatt Macy * 2513eda14cbcSMatt Macy * Outline: 2514eda14cbcSMatt Macy * - For write operations: 2515eda14cbcSMatt Macy * 1. Generate the parity data 2516eda14cbcSMatt Macy * 2. Create child zio write operations to each column's vdev, for both 2517eda14cbcSMatt Macy * data and parity. 2518eda14cbcSMatt Macy * 3. If the column skips any sectors for padding, create optional dummy 2519eda14cbcSMatt Macy * write zio children for those areas to improve aggregation continuity. 2520eda14cbcSMatt Macy * - For read operations: 2521eda14cbcSMatt Macy * 1. Create child zio read operations to each data column's vdev to read 2522eda14cbcSMatt Macy * the range of data required for zio. 2523eda14cbcSMatt Macy * 2. If this is a scrub or resilver operation, or if any of the data 2524eda14cbcSMatt Macy * vdevs have had errors, then create zio read operations to the parity 2525eda14cbcSMatt Macy * columns' VDevs as well. 2526eda14cbcSMatt Macy */ 2527eda14cbcSMatt Macy static void 2528eda14cbcSMatt Macy vdev_raidz_io_start(zio_t *zio) 2529eda14cbcSMatt Macy { 2530eda14cbcSMatt Macy vdev_t *vd = zio->io_vd; 2531eda14cbcSMatt Macy vdev_t *tvd = vd->vdev_top; 25327877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd; 2533e716630dSMartin Matuska raidz_map_t *rm; 2534eda14cbcSMatt Macy 2535e716630dSMartin Matuska uint64_t logical_width = vdev_raidz_get_logical_width(vdrz, 2536783d3ff6SMartin Matuska BP_GET_BIRTH(zio->io_bp)); 2537e716630dSMartin Matuska if (logical_width != vdrz->vd_physical_width) { 2538e716630dSMartin Matuska zfs_locked_range_t *lr = NULL; 2539e716630dSMartin Matuska uint64_t synced_offset = UINT64_MAX; 2540e716630dSMartin Matuska uint64_t next_offset = UINT64_MAX; 2541e716630dSMartin Matuska boolean_t use_scratch = B_FALSE; 2542e716630dSMartin Matuska /* 2543e716630dSMartin Matuska * Note: when the expansion is completing, we set 2544e716630dSMartin Matuska * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync()) 2545e716630dSMartin Matuska * in a later txg than when we last update spa_ubsync's state 2546e716630dSMartin Matuska * (see the end of spa_raidz_expand_thread()). Therefore we 2547e716630dSMartin Matuska * may see vre_state!=SCANNING before 2548e716630dSMartin Matuska * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected 2549e716630dSMartin Matuska * on disk, but the copying progress has been synced to disk 2550e716630dSMartin Matuska * (and reflected in spa_ubsync). In this case it's fine to 2551e716630dSMartin Matuska * treat the expansion as completed, since if we crash there's 2552e716630dSMartin Matuska * no additional copying to do. 
2553e716630dSMartin Matuska */ 2554e716630dSMartin Matuska if (vdrz->vn_vre.vre_state == DSS_SCANNING) { 2555e716630dSMartin Matuska ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==, 2556e716630dSMartin Matuska &vdrz->vn_vre); 2557e716630dSMartin Matuska lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock, 2558e716630dSMartin Matuska zio->io_offset, zio->io_size, RL_READER); 2559e716630dSMartin Matuska use_scratch = 2560e716630dSMartin Matuska (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) == 2561e716630dSMartin Matuska RRSS_SCRATCH_VALID); 2562e716630dSMartin Matuska synced_offset = 2563e716630dSMartin Matuska RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync); 2564e716630dSMartin Matuska next_offset = vdrz->vn_vre.vre_offset; 2565e716630dSMartin Matuska /* 2566e716630dSMartin Matuska * If we haven't resumed expanding since importing the 2567e716630dSMartin Matuska * pool, vre_offset won't have been set yet. In 2568e716630dSMartin Matuska * this case the next offset to be copied is the same 2569e716630dSMartin Matuska * as what was synced. 2570e716630dSMartin Matuska */ 2571e716630dSMartin Matuska if (next_offset == UINT64_MAX) { 2572e716630dSMartin Matuska next_offset = synced_offset; 2573e716630dSMartin Matuska } 2574e716630dSMartin Matuska } 2575e716630dSMartin Matuska if (use_scratch) { 2576e716630dSMartin Matuska zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced=" 2577e716630dSMartin Matuska "%lld next_offset=%lld use_scratch=%u", 2578e716630dSMartin Matuska zio, 2579e716630dSMartin Matuska zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ", 2580e716630dSMartin Matuska (long long)zio->io_offset, 2581e716630dSMartin Matuska (long long)synced_offset, 2582e716630dSMartin Matuska (long long)next_offset, 2583e716630dSMartin Matuska use_scratch); 2584e716630dSMartin Matuska } 2585e716630dSMartin Matuska 2586e716630dSMartin Matuska rm = vdev_raidz_map_alloc_expanded(zio, 2587e716630dSMartin Matuska tvd->vdev_ashift, vdrz->vd_physical_width, 2588e716630dSMartin Matuska logical_width, vdrz->vd_nparity, 2589e716630dSMartin Matuska synced_offset, next_offset, use_scratch); 2590e716630dSMartin Matuska rm->rm_lr = lr; 2591e716630dSMartin Matuska } else { 2592e716630dSMartin Matuska rm = vdev_raidz_map_alloc(zio, 2593e716630dSMartin Matuska tvd->vdev_ashift, logical_width, vdrz->vd_nparity); 2594e716630dSMartin Matuska } 2595e716630dSMartin Matuska rm->rm_original_width = vdrz->vd_original_width; 2596e716630dSMartin Matuska 2597f9693befSMartin Matuska zio->io_vsd = rm; 2598f9693befSMartin Matuska zio->io_vsd_ops = &vdev_raidz_vsd_ops; 2599eda14cbcSMatt Macy if (zio->io_type == ZIO_TYPE_WRITE) { 2600e716630dSMartin Matuska for (int i = 0; i < rm->rm_nrows; i++) { 2601e716630dSMartin Matuska vdev_raidz_io_start_write(zio, rm->rm_row[i]); 2602e716630dSMartin Matuska } 2603e716630dSMartin Matuska 2604e716630dSMartin Matuska if (logical_width == vdrz->vd_physical_width) { 2605e716630dSMartin Matuska raidz_start_skip_writes(zio); 2606e716630dSMartin Matuska } 26077877fdebSMatt Macy } else { 2608eda14cbcSMatt Macy ASSERT(zio->io_type == ZIO_TYPE_READ); 2609e716630dSMartin Matuska vdev_raidz_io_start_read(zio, rm); 2610eda14cbcSMatt Macy } 2611eda14cbcSMatt Macy 2612eda14cbcSMatt Macy zio_execute(zio); 2613eda14cbcSMatt Macy } 2614eda14cbcSMatt Macy 2615eda14cbcSMatt Macy /* 2616eda14cbcSMatt Macy * Report a checksum error for a child of a RAID-Z device. 
2617eda14cbcSMatt Macy */ 2618e92ffd9bSMartin Matuska void 2619e92ffd9bSMartin Matuska vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) 2620eda14cbcSMatt Macy { 2621eda14cbcSMatt Macy vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 2622eda14cbcSMatt Macy 26237877fdebSMatt Macy if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) && 26247877fdebSMatt Macy zio->io_priority != ZIO_PRIORITY_REBUILD) { 2625eda14cbcSMatt Macy zio_bad_cksum_t zbc; 2626eda14cbcSMatt Macy raidz_map_t *rm = zio->io_vsd; 2627eda14cbcSMatt Macy 2628eda14cbcSMatt Macy zbc.zbc_has_cksum = 0; 2629eda14cbcSMatt Macy zbc.zbc_injected = rm->rm_ecksuminjected; 2630eda14cbcSMatt Macy 26312c48331dSMatt Macy mutex_enter(&vd->vdev_stat_lock); 26322c48331dSMatt Macy vd->vdev_stat.vs_checksum_errors++; 26332c48331dSMatt Macy mutex_exit(&vd->vdev_stat_lock); 2634bb2d13b6SMartin Matuska (void) zfs_ereport_post_checksum(zio->io_spa, vd, 2635bb2d13b6SMartin Matuska &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, 2636bb2d13b6SMartin Matuska rc->rc_abd, bad_data, &zbc); 26372c48331dSMatt Macy } 2638eda14cbcSMatt Macy } 2639eda14cbcSMatt Macy 2640eda14cbcSMatt Macy /* 2641eda14cbcSMatt Macy * We keep track of whether or not there were any injected errors, so that 2642eda14cbcSMatt Macy * any ereports we generate can note it. 2643eda14cbcSMatt Macy */ 2644eda14cbcSMatt Macy static int 2645eda14cbcSMatt Macy raidz_checksum_verify(zio_t *zio) 2646eda14cbcSMatt Macy { 2647315ee00fSMartin Matuska zio_bad_cksum_t zbc = {0}; 2648eda14cbcSMatt Macy raidz_map_t *rm = zio->io_vsd; 2649eda14cbcSMatt Macy 2650eda14cbcSMatt Macy int ret = zio_checksum_error(zio, &zbc); 265187bf66d4SMartin Matuska /* 265287bf66d4SMartin Matuska * Any Direct I/O read that has a checksum error must be treated as 265387bf66d4SMartin Matuska * suspicious as the contents of the buffer could be getting 265487bf66d4SMartin Matuska * manipulated while the I/O is taking place. The checksum verify error 265587bf66d4SMartin Matuska * will be reported to the top-level RAIDZ VDEV. 265687bf66d4SMartin Matuska */ 265787bf66d4SMartin Matuska if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) { 265887bf66d4SMartin Matuska zio->io_error = ret; 265987bf66d4SMartin Matuska zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; 266087bf66d4SMartin Matuska zio_dio_chksum_verify_error_report(zio); 266187bf66d4SMartin Matuska zio_checksum_verified(zio); 266287bf66d4SMartin Matuska return (0); 266387bf66d4SMartin Matuska } 266487bf66d4SMartin Matuska 2665eda14cbcSMatt Macy if (ret != 0 && zbc.zbc_injected != 0) 2666eda14cbcSMatt Macy rm->rm_ecksuminjected = 1; 2667eda14cbcSMatt Macy 2668eda14cbcSMatt Macy return (ret); 2669eda14cbcSMatt Macy } 2670eda14cbcSMatt Macy 2671eda14cbcSMatt Macy /* 2672eda14cbcSMatt Macy * Generate the parity from the data columns. If we tried and were able to 2673eda14cbcSMatt Macy * read the parity without error, verify that the generated parity matches the 2674eda14cbcSMatt Macy * data we read. If it doesn't, we fire off a checksum error. Return the 26757877fdebSMatt Macy * number of such failures. 
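 * For example, on a raidz3 row where all three parity columns were read
 * without error, the regenerated P, Q, and R are each compared with what
 * was read, and any mismatch is counted and reported as a checksum error
 * against that parity child.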
2676eda14cbcSMatt Macy */ 2677eda14cbcSMatt Macy static int 26787877fdebSMatt Macy raidz_parity_verify(zio_t *zio, raidz_row_t *rr) 2679eda14cbcSMatt Macy { 2680eda14cbcSMatt Macy abd_t *orig[VDEV_RAIDZ_MAXPARITY]; 2681eda14cbcSMatt Macy int c, ret = 0; 26827877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd; 2683eda14cbcSMatt Macy raidz_col_t *rc; 2684eda14cbcSMatt Macy 2685eda14cbcSMatt Macy blkptr_t *bp = zio->io_bp; 2686eda14cbcSMatt Macy enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : 2687eda14cbcSMatt Macy (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); 2688eda14cbcSMatt Macy 2689eda14cbcSMatt Macy if (checksum == ZIO_CHECKSUM_NOPARITY) 2690eda14cbcSMatt Macy return (ret); 2691eda14cbcSMatt Macy 26927877fdebSMatt Macy for (c = 0; c < rr->rr_firstdatacol; c++) { 26937877fdebSMatt Macy rc = &rr->rr_col[c]; 2694eda14cbcSMatt Macy if (!rc->rc_tried || rc->rc_error != 0) 2695eda14cbcSMatt Macy continue; 2696eda14cbcSMatt Macy 2697a0b956f5SMartin Matuska orig[c] = rc->rc_abd; 2698a0b956f5SMartin Matuska ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size); 2699a0b956f5SMartin Matuska rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); 2700eda14cbcSMatt Macy } 2701eda14cbcSMatt Macy 27027877fdebSMatt Macy /* 2703e92ffd9bSMartin Matuska * Verify any empty sectors are zero filled to ensure the parity 2704e92ffd9bSMartin Matuska * is calculated correctly even if these non-data sectors are damaged. 2705e92ffd9bSMartin Matuska */ 2706e92ffd9bSMartin Matuska if (rr->rr_nempty && rr->rr_abd_empty != NULL) 2707e92ffd9bSMartin Matuska ret += vdev_draid_map_verify_empty(zio, rr); 2708e92ffd9bSMartin Matuska 2709e92ffd9bSMartin Matuska /* 27107877fdebSMatt Macy * Regenerates parity even for !tried||rc_error!=0 columns. This 27117877fdebSMatt Macy * isn't harmful but it does have the side effect of fixing stuff 27127877fdebSMatt Macy * we didn't realize was necessary (i.e. even if we return 0). 
27137877fdebSMatt Macy */ 27147877fdebSMatt Macy vdev_raidz_generate_parity_row(rm, rr); 2715eda14cbcSMatt Macy 27167877fdebSMatt Macy for (c = 0; c < rr->rr_firstdatacol; c++) { 27177877fdebSMatt Macy rc = &rr->rr_col[c]; 27187877fdebSMatt Macy 2719eda14cbcSMatt Macy if (!rc->rc_tried || rc->rc_error != 0) 2720eda14cbcSMatt Macy continue; 27217877fdebSMatt Macy 2722eda14cbcSMatt Macy if (abd_cmp(orig[c], rc->rc_abd) != 0) { 2723e716630dSMartin Matuska zfs_dbgmsg("found error on col=%u devidx=%u off %llx", 2724e716630dSMartin Matuska c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset); 2725e92ffd9bSMartin Matuska vdev_raidz_checksum_error(zio, rc, orig[c]); 2726eda14cbcSMatt Macy rc->rc_error = SET_ERROR(ECKSUM); 2727eda14cbcSMatt Macy ret++; 2728eda14cbcSMatt Macy } 2729eda14cbcSMatt Macy abd_free(orig[c]); 2730eda14cbcSMatt Macy } 2731eda14cbcSMatt Macy 2732eda14cbcSMatt Macy return (ret); 2733eda14cbcSMatt Macy } 2734eda14cbcSMatt Macy 2735eda14cbcSMatt Macy static int 27367877fdebSMatt Macy vdev_raidz_worst_error(raidz_row_t *rr) 2737eda14cbcSMatt Macy { 2738eda14cbcSMatt Macy int error = 0; 2739eda14cbcSMatt Macy 2740e716630dSMartin Matuska for (int c = 0; c < rr->rr_cols; c++) { 27417877fdebSMatt Macy error = zio_worst_error(error, rr->rr_col[c].rc_error); 2742e716630dSMartin Matuska error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error); 2743e716630dSMartin Matuska } 2744eda14cbcSMatt Macy 2745eda14cbcSMatt Macy return (error); 2746eda14cbcSMatt Macy } 2747eda14cbcSMatt Macy 2748eda14cbcSMatt Macy static void 27497877fdebSMatt Macy vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) 2750eda14cbcSMatt Macy { 2751eda14cbcSMatt Macy int unexpected_errors = 0; 2752eda14cbcSMatt Macy int parity_errors = 0; 2753eda14cbcSMatt Macy int parity_untried = 0; 2754eda14cbcSMatt Macy int data_errors = 0; 2755eda14cbcSMatt Macy 27567877fdebSMatt Macy ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); 2757eda14cbcSMatt Macy 27587877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 27597877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 2760eda14cbcSMatt Macy 2761eda14cbcSMatt Macy if (rc->rc_error) { 27627877fdebSMatt Macy if (c < rr->rr_firstdatacol) 2763eda14cbcSMatt Macy parity_errors++; 2764eda14cbcSMatt Macy else 2765eda14cbcSMatt Macy data_errors++; 2766eda14cbcSMatt Macy 2767eda14cbcSMatt Macy if (!rc->rc_skipped) 2768eda14cbcSMatt Macy unexpected_errors++; 27697877fdebSMatt Macy } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { 2770eda14cbcSMatt Macy parity_untried++; 2771eda14cbcSMatt Macy } 2772a0b956f5SMartin Matuska 2773a0b956f5SMartin Matuska if (rc->rc_force_repair) 2774a0b956f5SMartin Matuska unexpected_errors++; 2775eda14cbcSMatt Macy } 2776eda14cbcSMatt Macy 2777eda14cbcSMatt Macy /* 27787877fdebSMatt Macy * If we read more parity disks than were used for 27797877fdebSMatt Macy * reconstruction, confirm that the other parity disks produced 27807877fdebSMatt Macy * correct data. 27817877fdebSMatt Macy * 27827877fdebSMatt Macy * Note that we also regenerate parity when resilvering so we 27837877fdebSMatt Macy * can write it out to failed devices later. 
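 * For example (illustrative): on a raidz2 read that reconstructed one
 * data column using P alone, a Q column that was also read without error
 * is still compared against the regenerated Q, so silent damage to the
 * unused parity can be detected and repaired below.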
27847877fdebSMatt Macy */ 27857877fdebSMatt Macy if (parity_errors + parity_untried < 27867877fdebSMatt Macy rr->rr_firstdatacol - data_errors || 27877877fdebSMatt Macy (zio->io_flags & ZIO_FLAG_RESILVER)) { 27887877fdebSMatt Macy int n = raidz_parity_verify(zio, rr); 27897877fdebSMatt Macy unexpected_errors += n; 27907877fdebSMatt Macy } 27917877fdebSMatt Macy 27927877fdebSMatt Macy if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 27937877fdebSMatt Macy (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) { 27947877fdebSMatt Macy /* 27957877fdebSMatt Macy * Use the good data we have in hand to repair damaged children. 27967877fdebSMatt Macy */ 27977877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 27987877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 27997877fdebSMatt Macy vdev_t *vd = zio->io_vd; 28007877fdebSMatt Macy vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 28017877fdebSMatt Macy 280216038816SMartin Matuska if (!rc->rc_allow_repair) { 280316038816SMartin Matuska continue; 280416038816SMartin Matuska } else if (!rc->rc_force_repair && 280516038816SMartin Matuska (rc->rc_error == 0 || rc->rc_size == 0)) { 28067877fdebSMatt Macy continue; 28077877fdebSMatt Macy } 280887bf66d4SMartin Matuska /* 280987bf66d4SMartin Matuska * We do not allow self healing for Direct I/O reads. 281087bf66d4SMartin Matuska * See comment in vdev_raid_row_alloc(). 281187bf66d4SMartin Matuska */ 281287bf66d4SMartin Matuska ASSERT0(zio->io_flags & ZIO_FLAG_DIO_READ); 28137877fdebSMatt Macy 2814e716630dSMartin Matuska zfs_dbgmsg("zio=%px repairing c=%u devidx=%u " 2815e716630dSMartin Matuska "offset=%llx", 2816e716630dSMartin Matuska zio, c, rc->rc_devidx, (long long)rc->rc_offset); 2817e716630dSMartin Matuska 28187877fdebSMatt Macy zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 28197877fdebSMatt Macy rc->rc_offset, rc->rc_abd, rc->rc_size, 28207877fdebSMatt Macy ZIO_TYPE_WRITE, 28217877fdebSMatt Macy zio->io_priority == ZIO_PRIORITY_REBUILD ? 28227877fdebSMatt Macy ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE, 28237877fdebSMatt Macy ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 28247877fdebSMatt Macy ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); 28257877fdebSMatt Macy } 28267877fdebSMatt Macy } 2827e716630dSMartin Matuska 2828e716630dSMartin Matuska /* 2829e716630dSMartin Matuska * Scrub or resilver i/o's: overwrite any shadow locations with the 2830e716630dSMartin Matuska * good data. This ensures that if we've already copied this sector, 2831e716630dSMartin Matuska * it will be corrected if it was damaged. This writes more than is 2832e716630dSMartin Matuska * necessary, but since expansion is paused during scrub/resilver, at 2833e716630dSMartin Matuska * most a single row will have a shadow location. 
2834e716630dSMartin Matuska */ 2835e716630dSMartin Matuska if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 2836e716630dSMartin Matuska (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) { 2837e716630dSMartin Matuska for (int c = 0; c < rr->rr_cols; c++) { 2838e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c]; 2839e716630dSMartin Matuska vdev_t *vd = zio->io_vd; 2840e716630dSMartin Matuska 2841e716630dSMartin Matuska if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0) 2842e716630dSMartin Matuska continue; 2843e716630dSMartin Matuska vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx]; 2844e716630dSMartin Matuska 2845e716630dSMartin Matuska /* 2846e716630dSMartin Matuska * Note: We don't want to update the repair stats 2847e716630dSMartin Matuska * because that would incorrectly indicate that there 2848e716630dSMartin Matuska * was bad data to repair, which we aren't sure about. 2849e716630dSMartin Matuska * By clearing the SCAN_THREAD flag, we prevent this 2850e716630dSMartin Matuska * from happening, despite having the REPAIR flag set. 2851e716630dSMartin Matuska * We need to set SELF_HEAL so that this i/o can't be 2852e716630dSMartin Matuska * bypassed by zio_vdev_io_start(). 2853e716630dSMartin Matuska */ 2854e716630dSMartin Matuska zio_t *cio = zio_vdev_child_io(zio, NULL, cvd, 2855e716630dSMartin Matuska rc->rc_shadow_offset, rc->rc_abd, rc->rc_size, 2856e716630dSMartin Matuska ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, 2857e716630dSMartin Matuska ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, 2858e716630dSMartin Matuska NULL, NULL); 2859e716630dSMartin Matuska cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD; 2860e716630dSMartin Matuska zio_nowait(cio); 2861e716630dSMartin Matuska } 2862e716630dSMartin Matuska } 28637877fdebSMatt Macy } 28647877fdebSMatt Macy 28657877fdebSMatt Macy static void 28667877fdebSMatt Macy raidz_restore_orig_data(raidz_map_t *rm) 28677877fdebSMatt Macy { 28687877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 28697877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i]; 28707877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 28717877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 28727877fdebSMatt Macy if (rc->rc_need_orig_restore) { 2873f9693befSMartin Matuska abd_copy(rc->rc_abd, 28747877fdebSMatt Macy rc->rc_orig_data, rc->rc_size); 28757877fdebSMatt Macy rc->rc_need_orig_restore = B_FALSE; 28767877fdebSMatt Macy } 28777877fdebSMatt Macy } 28787877fdebSMatt Macy } 28797877fdebSMatt Macy } 28807877fdebSMatt Macy 28817877fdebSMatt Macy /* 2882e716630dSMartin Matuska * During raidz_reconstruct() for expanded VDEV, we need special consideration 2883e716630dSMartin Matuska * failure simulations. See note in raidz_reconstruct() on simulating failure 2884e716630dSMartin Matuska * of a pre-expansion device. 2885e716630dSMartin Matuska * 2886e716630dSMartin Matuska * Treating logical child i as failed, return TRUE if the given column should 2887e716630dSMartin Matuska * be treated as failed. The idea of logical children allows us to imagine 2888e716630dSMartin Matuska * that a disk silently failed before a RAIDZ expansion (reads from this disk 2889e716630dSMartin Matuska * succeed but return the wrong data). Since the expansion doesn't verify 2890e716630dSMartin Matuska * checksums, the incorrect data will be moved to new locations spread among 2891e716630dSMartin Matuska * the children (going diagonally across them). 
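 * For example (widths are illustrative): with physical_width = 5 and
 * original_width = 4 there are 5 + 4 = 9 logical children; ids 0-4 model
 * the failure of one of the current five children, while ids 5-8 model a
 * child that failed before the expansion, whose stale data is now spread
 * diagonally across all five children.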
2892e716630dSMartin Matuska * 2893e716630dSMartin Matuska * Higher "logical child failures" (values of `i`) indicate these 2894e716630dSMartin Matuska * "pre-expansion failures". The first physical_width values imagine that a 2895e716630dSMartin Matuska * current child failed; the next physical_width-1 values imagine that a 2896e716630dSMartin Matuska * child failed before the most recent expansion; the next physical_width-2 2897e716630dSMartin Matuska * values imagine a child failed in the expansion before that, etc. 2898e716630dSMartin Matuska */ 2899e716630dSMartin Matuska static boolean_t 2900e716630dSMartin Matuska raidz_simulate_failure(int physical_width, int original_width, int ashift, 2901e716630dSMartin Matuska int i, raidz_col_t *rc) 2902e716630dSMartin Matuska { 2903e716630dSMartin Matuska uint64_t sector_id = 2904e716630dSMartin Matuska physical_width * (rc->rc_offset >> ashift) + 2905e716630dSMartin Matuska rc->rc_devidx; 2906e716630dSMartin Matuska 2907e716630dSMartin Matuska for (int w = physical_width; w >= original_width; w--) { 2908e716630dSMartin Matuska if (i < w) { 2909e716630dSMartin Matuska return (sector_id % w == i); 2910e716630dSMartin Matuska } else { 2911e716630dSMartin Matuska i -= w; 2912e716630dSMartin Matuska } 2913e716630dSMartin Matuska } 2914e716630dSMartin Matuska ASSERT(!"invalid logical child id"); 2915e716630dSMartin Matuska return (B_FALSE); 2916e716630dSMartin Matuska } 2917e716630dSMartin Matuska 2918e716630dSMartin Matuska /* 29197877fdebSMatt Macy * returns EINVAL if reconstruction of the block will not be possible 29207877fdebSMatt Macy * returns ECKSUM if this specific reconstruction failed 29217877fdebSMatt Macy * returns 0 on successful reconstruction 29227877fdebSMatt Macy */ 29237877fdebSMatt Macy static int 29247877fdebSMatt Macy raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) 29257877fdebSMatt Macy { 29267877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd; 2927e716630dSMartin Matuska int physical_width = zio->io_vd->vdev_children; 2928e716630dSMartin Matuska int original_width = (rm->rm_original_width != 0) ? 
2929e716630dSMartin Matuska rm->rm_original_width : physical_width; 2930e716630dSMartin Matuska int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT; 2931e716630dSMartin Matuska 2932e716630dSMartin Matuska if (dbgmsg) { 2933e716630dSMartin Matuska zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u " 2934e716630dSMartin Matuska "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts); 2935e716630dSMartin Matuska } 29367877fdebSMatt Macy 29377877fdebSMatt Macy /* Reconstruct each row */ 29387877fdebSMatt Macy for (int r = 0; r < rm->rm_nrows; r++) { 29397877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[r]; 29407877fdebSMatt Macy int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */ 29417877fdebSMatt Macy int t = 0; 29427877fdebSMatt Macy int dead = 0; 29437877fdebSMatt Macy int dead_data = 0; 29447877fdebSMatt Macy 2945e716630dSMartin Matuska if (dbgmsg) 2946e716630dSMartin Matuska zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r); 2947e716630dSMartin Matuska 29487877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 29497877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 29507877fdebSMatt Macy ASSERT0(rc->rc_need_orig_restore); 29517877fdebSMatt Macy if (rc->rc_error != 0) { 29527877fdebSMatt Macy dead++; 29537877fdebSMatt Macy if (c >= nparity) 29547877fdebSMatt Macy dead_data++; 29557877fdebSMatt Macy continue; 29567877fdebSMatt Macy } 29577877fdebSMatt Macy if (rc->rc_size == 0) 29587877fdebSMatt Macy continue; 29597877fdebSMatt Macy for (int lt = 0; lt < ntgts; lt++) { 2960e716630dSMartin Matuska if (raidz_simulate_failure(physical_width, 2961e716630dSMartin Matuska original_width, 2962e716630dSMartin Matuska zio->io_vd->vdev_top->vdev_ashift, 2963e716630dSMartin Matuska ltgts[lt], rc)) { 29647877fdebSMatt Macy if (rc->rc_orig_data == NULL) { 29657877fdebSMatt Macy rc->rc_orig_data = 2966f9693befSMartin Matuska abd_alloc_linear( 2967f9693befSMartin Matuska rc->rc_size, B_TRUE); 2968f9693befSMartin Matuska abd_copy(rc->rc_orig_data, 29697877fdebSMatt Macy rc->rc_abd, rc->rc_size); 29707877fdebSMatt Macy } 29717877fdebSMatt Macy rc->rc_need_orig_restore = B_TRUE; 29727877fdebSMatt Macy 29737877fdebSMatt Macy dead++; 29747877fdebSMatt Macy if (c >= nparity) 29757877fdebSMatt Macy dead_data++; 2976e716630dSMartin Matuska /* 2977e716630dSMartin Matuska * Note: simulating failure of a 2978e716630dSMartin Matuska * pre-expansion device can hit more 2979e716630dSMartin Matuska * than one column, in which case we 2980e716630dSMartin Matuska * might try to simulate more failures 2981e716630dSMartin Matuska * than can be reconstructed, which is 2982e716630dSMartin Matuska * also more than the size of my_tgts. 2983e716630dSMartin Matuska * This check prevents accessing past 2984e716630dSMartin Matuska * the end of my_tgts. The "dead > 2985e716630dSMartin Matuska * nparity" check below will fail this 2986e716630dSMartin Matuska * reconstruction attempt. 
2987e716630dSMartin Matuska */ 2988e716630dSMartin Matuska if (t < VDEV_RAIDZ_MAXPARITY) { 29897877fdebSMatt Macy my_tgts[t++] = c; 2990e716630dSMartin Matuska if (dbgmsg) { 2991e716630dSMartin Matuska zfs_dbgmsg("simulating " 2992e716630dSMartin Matuska "failure of col %u " 2993e716630dSMartin Matuska "devidx %u", c, 2994e716630dSMartin Matuska (int)rc->rc_devidx); 2995e716630dSMartin Matuska } 2996e716630dSMartin Matuska } 29977877fdebSMatt Macy break; 29987877fdebSMatt Macy } 29997877fdebSMatt Macy } 30007877fdebSMatt Macy } 30017877fdebSMatt Macy if (dead > nparity) { 30027877fdebSMatt Macy /* reconstruction not possible */ 3003e716630dSMartin Matuska if (dbgmsg) { 3004e716630dSMartin Matuska zfs_dbgmsg("reconstruction not possible; " 3005e716630dSMartin Matuska "too many failures"); 3006e716630dSMartin Matuska } 30077877fdebSMatt Macy raidz_restore_orig_data(rm); 30087877fdebSMatt Macy return (EINVAL); 30097877fdebSMatt Macy } 30107877fdebSMatt Macy if (dead_data > 0) 3011f9693befSMartin Matuska vdev_raidz_reconstruct_row(rm, rr, my_tgts, t); 30127877fdebSMatt Macy } 30137877fdebSMatt Macy 30147877fdebSMatt Macy /* Check for success */ 30157877fdebSMatt Macy if (raidz_checksum_verify(zio) == 0) { 301687bf66d4SMartin Matuska if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) 301787bf66d4SMartin Matuska return (0); 30187877fdebSMatt Macy 30197877fdebSMatt Macy /* Reconstruction succeeded - report errors */ 30207877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 30217877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i]; 30227877fdebSMatt Macy 30237877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 30247877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 30257877fdebSMatt Macy if (rc->rc_need_orig_restore) { 30267877fdebSMatt Macy /* 30277877fdebSMatt Macy * Note: if this is a parity column, 30287877fdebSMatt Macy * we don't really know if it's wrong. 30297877fdebSMatt Macy * We need to let 30307877fdebSMatt Macy * vdev_raidz_io_done_verified() check 30317877fdebSMatt Macy * it, and if we set rc_error, it will 30327877fdebSMatt Macy * think that it is a "known" error 30337877fdebSMatt Macy * that doesn't need to be checked 30347877fdebSMatt Macy * or corrected. 
30357877fdebSMatt Macy */ 30367877fdebSMatt Macy if (rc->rc_error == 0 && 30377877fdebSMatt Macy c >= rr->rr_firstdatacol) { 3038e92ffd9bSMartin Matuska vdev_raidz_checksum_error(zio, 3039f9693befSMartin Matuska rc, rc->rc_orig_data); 30407877fdebSMatt Macy rc->rc_error = 30417877fdebSMatt Macy SET_ERROR(ECKSUM); 30427877fdebSMatt Macy } 30437877fdebSMatt Macy rc->rc_need_orig_restore = B_FALSE; 30447877fdebSMatt Macy } 30457877fdebSMatt Macy } 30467877fdebSMatt Macy 30477877fdebSMatt Macy vdev_raidz_io_done_verified(zio, rr); 30487877fdebSMatt Macy } 30497877fdebSMatt Macy 30507877fdebSMatt Macy zio_checksum_verified(zio); 30517877fdebSMatt Macy 3052e716630dSMartin Matuska if (dbgmsg) { 3053e716630dSMartin Matuska zfs_dbgmsg("reconstruction successful " 3054e716630dSMartin Matuska "(checksum verified)"); 3055e716630dSMartin Matuska } 30567877fdebSMatt Macy return (0); 30577877fdebSMatt Macy } 30587877fdebSMatt Macy 30597877fdebSMatt Macy /* Reconstruction failed - restore original data */ 30607877fdebSMatt Macy raidz_restore_orig_data(rm); 3061e716630dSMartin Matuska if (dbgmsg) { 3062e716630dSMartin Matuska zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum " 3063e716630dSMartin Matuska "failed", zio); 3064e716630dSMartin Matuska } 30657877fdebSMatt Macy return (ECKSUM); 30667877fdebSMatt Macy } 30677877fdebSMatt Macy 30687877fdebSMatt Macy /* 30697877fdebSMatt Macy * Iterate over all combinations of N bad vdevs and attempt a reconstruction. 30707877fdebSMatt Macy * Note that the algorithm below is non-optimal because it doesn't take into 30717877fdebSMatt Macy * account how reconstruction is actually performed. For example, with 30727877fdebSMatt Macy * triple-parity RAID-Z the reconstruction procedure is the same if column 4 30737877fdebSMatt Macy * is targeted as invalid as if columns 1 and 4 are targeted since in both 30747877fdebSMatt Macy * cases we'd only use parity information in column 0. 
30757877fdebSMatt Macy * 30767877fdebSMatt Macy * The order that we find the various possible combinations of failed 30777877fdebSMatt Macy * disks is dictated by these rules: 30787877fdebSMatt Macy * - Examine each "slot" (the "i" in tgts[i]) 3079e716630dSMartin Matuska * - Try to increment this slot (tgts[i] += 1) 30807877fdebSMatt Macy * - if we can't increment because it runs into the next slot, 30817877fdebSMatt Macy * reset our slot to the minimum, and examine the next slot 30827877fdebSMatt Macy * 30837877fdebSMatt Macy * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose 30847877fdebSMatt Macy * 3 columns to reconstruct), we will generate the following sequence: 30857877fdebSMatt Macy * 30867877fdebSMatt Macy * STATE ACTION 30877877fdebSMatt Macy * 0 1 2 special case: skip since these are all parity 30887877fdebSMatt Macy * 0 1 3 first slot: reset to 0; middle slot: increment to 2 30897877fdebSMatt Macy * 0 2 3 first slot: increment to 1 30907877fdebSMatt Macy * 1 2 3 first: reset to 0; middle: reset to 1; last: increment to 4 30917877fdebSMatt Macy * 0 1 4 first: reset to 0; middle: increment to 2 30927877fdebSMatt Macy * 0 2 4 first: increment to 1 30937877fdebSMatt Macy * 1 2 4 first: reset to 0; middle: increment to 3 30947877fdebSMatt Macy * 0 3 4 first: increment to 1 30957877fdebSMatt Macy * 1 3 4 first: increment to 2 30967877fdebSMatt Macy * 2 3 4 first: reset to 0; middle: reset to 1; last: increment to 5 30977877fdebSMatt Macy * 0 1 5 first: reset to 0; middle: increment to 2 30987877fdebSMatt Macy * 0 2 5 first: increment to 1 30997877fdebSMatt Macy * 1 2 5 first: reset to 0; middle: increment to 3 31007877fdebSMatt Macy * 0 3 5 first: increment to 1 31017877fdebSMatt Macy * 1 3 5 first: increment to 2 31027877fdebSMatt Macy * 2 3 5 first: reset to 0; middle: increment to 4 31037877fdebSMatt Macy * 0 4 5 first: increment to 1 31047877fdebSMatt Macy * 1 4 5 first: increment to 2 31057877fdebSMatt Macy * 2 4 5 first: increment to 3 31067877fdebSMatt Macy * 3 4 5 done 31077877fdebSMatt Macy * 310816038816SMartin Matuska * This strategy works for dRAID but is less efficient when there are a large 31097877fdebSMatt Macy * number of child vdevs and therefore permutations to check. Furthermore, 3110e716630dSMartin Matuska * since the raidz_map_t rows likely do not overlap, reconstruction would be 31117877fdebSMatt Macy * possible as long as there are no more than nparity data errors per row. 31127877fdebSMatt Macy * These additional permutations are not currently checked but could be as 31137877fdebSMatt Macy * a future improvement. 3114e716630dSMartin Matuska * 3115e716630dSMartin Matuska * Returns 0 on success, ECKSUM on failure. 31167877fdebSMatt Macy */ 31177877fdebSMatt Macy static int 31187877fdebSMatt Macy vdev_raidz_combrec(zio_t *zio) 31197877fdebSMatt Macy { 31207877fdebSMatt Macy int nparity = vdev_get_nparity(zio->io_vd); 31217877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd; 3122e716630dSMartin Matuska int physical_width = zio->io_vd->vdev_children; 3123e716630dSMartin Matuska int original_width = (rm->rm_original_width != 0) ? 
3124e716630dSMartin Matuska rm->rm_original_width : physical_width; 31257877fdebSMatt Macy 31267877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 31277877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i]; 31287877fdebSMatt Macy int total_errors = 0; 31297877fdebSMatt Macy 31307877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 31317877fdebSMatt Macy if (rr->rr_col[c].rc_error) 31327877fdebSMatt Macy total_errors++; 31337877fdebSMatt Macy } 31347877fdebSMatt Macy 31357877fdebSMatt Macy if (total_errors > nparity) 31367877fdebSMatt Macy return (vdev_raidz_worst_error(rr)); 31377877fdebSMatt Macy } 31387877fdebSMatt Macy 31397877fdebSMatt Macy for (int num_failures = 1; num_failures <= nparity; num_failures++) { 31407877fdebSMatt Macy int tstore[VDEV_RAIDZ_MAXPARITY + 2]; 31417877fdebSMatt Macy int *ltgts = &tstore[1]; /* value is logical child ID */ 31427877fdebSMatt Macy 3143e716630dSMartin Matuska 3144e716630dSMartin Matuska /* 3145e716630dSMartin Matuska * Determine number of logical children, n. See comment 3146e716630dSMartin Matuska * above raidz_simulate_failure(). 3147e716630dSMartin Matuska */ 3148e716630dSMartin Matuska int n = 0; 3149e716630dSMartin Matuska for (int w = physical_width; 3150e716630dSMartin Matuska w >= original_width; w--) { 3151e716630dSMartin Matuska n += w; 3152e716630dSMartin Matuska } 31537877fdebSMatt Macy 31547877fdebSMatt Macy ASSERT3U(num_failures, <=, nparity); 31557877fdebSMatt Macy ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY); 31567877fdebSMatt Macy 31577877fdebSMatt Macy /* Handle corner cases in combrec logic */ 31587877fdebSMatt Macy ltgts[-1] = -1; 31597877fdebSMatt Macy for (int i = 0; i < num_failures; i++) { 31607877fdebSMatt Macy ltgts[i] = i; 31617877fdebSMatt Macy } 31627877fdebSMatt Macy ltgts[num_failures] = n; 31637877fdebSMatt Macy 31647877fdebSMatt Macy for (;;) { 31657877fdebSMatt Macy int err = raidz_reconstruct(zio, ltgts, num_failures, 31667877fdebSMatt Macy nparity); 31677877fdebSMatt Macy if (err == EINVAL) { 31687877fdebSMatt Macy /* 31697877fdebSMatt Macy * Reconstruction not possible with this # 31707877fdebSMatt Macy * failures; try more failures. 31717877fdebSMatt Macy */ 31727877fdebSMatt Macy break; 31737877fdebSMatt Macy } else if (err == 0) 31747877fdebSMatt Macy return (0); 31757877fdebSMatt Macy 31767877fdebSMatt Macy /* Compute next targets to try */ 31777877fdebSMatt Macy for (int t = 0; ; t++) { 31787877fdebSMatt Macy ASSERT3U(t, <, num_failures); 31797877fdebSMatt Macy ltgts[t]++; 31807877fdebSMatt Macy if (ltgts[t] == n) { 31817877fdebSMatt Macy /* try more failures */ 31827877fdebSMatt Macy ASSERT3U(t, ==, num_failures - 1); 3183e716630dSMartin Matuska if (zfs_flags & 3184e716630dSMartin Matuska ZFS_DEBUG_RAIDZ_RECONSTRUCT) { 3185e716630dSMartin Matuska zfs_dbgmsg("reconstruction " 3186e716630dSMartin Matuska "failed for num_failures=" 3187e716630dSMartin Matuska "%u; tried all " 3188e716630dSMartin Matuska "combinations", 3189e716630dSMartin Matuska num_failures); 3190e716630dSMartin Matuska } 31917877fdebSMatt Macy break; 31927877fdebSMatt Macy } 31937877fdebSMatt Macy 31947877fdebSMatt Macy ASSERT3U(ltgts[t], <, n); 31957877fdebSMatt Macy ASSERT3U(ltgts[t], <=, ltgts[t + 1]); 31967877fdebSMatt Macy 31977877fdebSMatt Macy /* 31987877fdebSMatt Macy * If that spot is available, we're done here. 31997877fdebSMatt Macy * Try the next combination. 
32007877fdebSMatt Macy */ 32017877fdebSMatt Macy if (ltgts[t] != ltgts[t + 1]) 3202e716630dSMartin Matuska break; // found next combination 32037877fdebSMatt Macy 32047877fdebSMatt Macy /* 32057877fdebSMatt Macy * Otherwise, reset this tgt to the minimum, 32067877fdebSMatt Macy * and move on to the next tgt. 32077877fdebSMatt Macy */ 32087877fdebSMatt Macy ltgts[t] = ltgts[t - 1] + 1; 32097877fdebSMatt Macy ASSERT3U(ltgts[t], ==, t); 32107877fdebSMatt Macy } 32117877fdebSMatt Macy 32127877fdebSMatt Macy /* Increase the number of failures and keep trying. */ 32137877fdebSMatt Macy if (ltgts[num_failures - 1] == n) 32147877fdebSMatt Macy break; 32157877fdebSMatt Macy } 32167877fdebSMatt Macy } 3217e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) 3218e716630dSMartin Matuska zfs_dbgmsg("reconstruction failed for all num_failures"); 32197877fdebSMatt Macy return (ECKSUM); 32207877fdebSMatt Macy } 32217877fdebSMatt Macy 32227877fdebSMatt Macy void 32237877fdebSMatt Macy vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) 32247877fdebSMatt Macy { 32257877fdebSMatt Macy for (uint64_t row = 0; row < rm->rm_nrows; row++) { 32267877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[row]; 32277877fdebSMatt Macy vdev_raidz_reconstruct_row(rm, rr, t, nt); 32287877fdebSMatt Macy } 32297877fdebSMatt Macy } 32307877fdebSMatt Macy 32317877fdebSMatt Macy /* 32327877fdebSMatt Macy * Complete a write IO operation on a RAIDZ VDev 32337877fdebSMatt Macy * 32347877fdebSMatt Macy * Outline: 32357877fdebSMatt Macy * 1. Check for errors on the child IOs. 32367877fdebSMatt Macy * 2. Return, setting an error code if too few child VDevs were written 32377877fdebSMatt Macy * to reconstruct the data later. Note that partial writes are 32387877fdebSMatt Macy * considered successful if they can be reconstructed at all. 32397877fdebSMatt Macy */ 32407877fdebSMatt Macy static void 32417877fdebSMatt Macy vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) 32427877fdebSMatt Macy { 3243e716630dSMartin Matuska int normal_errors = 0; 3244e716630dSMartin Matuska int shadow_errors = 0; 32457877fdebSMatt Macy 32467877fdebSMatt Macy ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); 32477877fdebSMatt Macy ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); 32487877fdebSMatt Macy ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); 32497877fdebSMatt Macy 32507877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 32517877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 32527877fdebSMatt Macy 3253e716630dSMartin Matuska if (rc->rc_error != 0) { 32547877fdebSMatt Macy ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 3255e716630dSMartin Matuska normal_errors++; 3256e716630dSMartin Matuska } 3257e716630dSMartin Matuska if (rc->rc_shadow_error != 0) { 3258e716630dSMartin Matuska ASSERT(rc->rc_shadow_error != ECKSUM); 3259e716630dSMartin Matuska shadow_errors++; 32607877fdebSMatt Macy } 32617877fdebSMatt Macy } 32627877fdebSMatt Macy 32637877fdebSMatt Macy /* 32647877fdebSMatt Macy * Treat partial writes as a success. If we couldn't write enough 3265e716630dSMartin Matuska * columns to reconstruct the data, the I/O failed. Otherwise, good 3266e716630dSMartin Matuska * enough. 
Note that in the case of a shadow write (during raidz 3267e716630dSMartin Matuska * expansion), depending on if we crash, either the normal (old) or 3268e716630dSMartin Matuska * shadow (new) location may become the "real" version of the block, 3269e716630dSMartin Matuska * so both locations must have sufficient redundancy. 3270eda14cbcSMatt Macy * 3271eda14cbcSMatt Macy * Now that we support write reallocation, it would be better 3272eda14cbcSMatt Macy * to treat partial failure as real failure unless there are 3273eda14cbcSMatt Macy * no non-degraded top-level vdevs left, and not update DTLs 3274eda14cbcSMatt Macy * if we intend to reallocate. 3275eda14cbcSMatt Macy */ 3276e716630dSMartin Matuska if (normal_errors > rr->rr_firstdatacol || 3277e716630dSMartin Matuska shadow_errors > rr->rr_firstdatacol) { 32787877fdebSMatt Macy zio->io_error = zio_worst_error(zio->io_error, 32797877fdebSMatt Macy vdev_raidz_worst_error(rr)); 32807877fdebSMatt Macy } 3281eda14cbcSMatt Macy } 3282eda14cbcSMatt Macy 3283f9693befSMartin Matuska static void 32847877fdebSMatt Macy vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm, 32857877fdebSMatt Macy raidz_row_t *rr) 32867877fdebSMatt Macy { 32877877fdebSMatt Macy int parity_errors = 0; 32887877fdebSMatt Macy int parity_untried = 0; 32897877fdebSMatt Macy int data_errors = 0; 32907877fdebSMatt Macy int total_errors = 0; 32917877fdebSMatt Macy 32927877fdebSMatt Macy ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); 32937877fdebSMatt Macy ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); 32947877fdebSMatt Macy 32957877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 32967877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 32977877fdebSMatt Macy 3298a0b956f5SMartin Matuska /* 3299a0b956f5SMartin Matuska * If scrubbing and a replacing/sparing child vdev determined 3300a0b956f5SMartin Matuska * that not all of its children have an identical copy of the 3301a0b956f5SMartin Matuska * data, then clear the error so the column is treated like 3302a0b956f5SMartin Matuska * any other read and force a repair to correct the damage. 3303a0b956f5SMartin Matuska */ 3304a0b956f5SMartin Matuska if (rc->rc_error == ECKSUM) { 3305a0b956f5SMartin Matuska ASSERT(zio->io_flags & ZIO_FLAG_SCRUB); 3306a0b956f5SMartin Matuska vdev_raidz_checksum_error(zio, rc, rc->rc_abd); 3307a0b956f5SMartin Matuska rc->rc_force_repair = 1; 3308a0b956f5SMartin Matuska rc->rc_error = 0; 3309a0b956f5SMartin Matuska } 33107877fdebSMatt Macy 3311a0b956f5SMartin Matuska if (rc->rc_error) { 33127877fdebSMatt Macy if (c < rr->rr_firstdatacol) 33137877fdebSMatt Macy parity_errors++; 33147877fdebSMatt Macy else 33157877fdebSMatt Macy data_errors++; 33167877fdebSMatt Macy 33177877fdebSMatt Macy total_errors++; 33187877fdebSMatt Macy } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { 33197877fdebSMatt Macy parity_untried++; 33207877fdebSMatt Macy } 33217877fdebSMatt Macy } 3322eda14cbcSMatt Macy 3323eda14cbcSMatt Macy /* 33247877fdebSMatt Macy * If there were data errors and the number of errors we saw was 33257877fdebSMatt Macy * correctable -- less than or equal to the number of parity disks read 33267877fdebSMatt Macy * -- reconstruct based on the missing data. 3327eda14cbcSMatt Macy */ 33287877fdebSMatt Macy if (data_errors != 0 && 33297877fdebSMatt Macy total_errors <= rr->rr_firstdatacol - parity_untried) { 3330eda14cbcSMatt Macy /* 3331eda14cbcSMatt Macy * We either attempt to read all the parity columns or 3332eda14cbcSMatt Macy * none of them. 
If we didn't try to read parity, we
3333eda14cbcSMatt Macy  * wouldn't be here in the correctable case. There must
3334eda14cbcSMatt Macy  * also have been fewer parity errors than parity
3335eda14cbcSMatt Macy  * columns or, again, we wouldn't be in this code path.
3336eda14cbcSMatt Macy  */
3337eda14cbcSMatt Macy  ASSERT(parity_untried == 0);
33387877fdebSMatt Macy  ASSERT(parity_errors < rr->rr_firstdatacol);
3339eda14cbcSMatt Macy
3340eda14cbcSMatt Macy  /*
3341eda14cbcSMatt Macy  * Identify the data columns that reported an error.
3342eda14cbcSMatt Macy  */
33437877fdebSMatt Macy  int n = 0;
33447877fdebSMatt Macy  int tgts[VDEV_RAIDZ_MAXPARITY];
33457877fdebSMatt Macy  for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
33467877fdebSMatt Macy  raidz_col_t *rc = &rr->rr_col[c];
3347eda14cbcSMatt Macy  if (rc->rc_error != 0) {
3348eda14cbcSMatt Macy  ASSERT(n < VDEV_RAIDZ_MAXPARITY);
3349eda14cbcSMatt Macy  tgts[n++] = c;
3350eda14cbcSMatt Macy  }
3351eda14cbcSMatt Macy  }
3352eda14cbcSMatt Macy
33537877fdebSMatt Macy  ASSERT(rr->rr_firstdatacol >= n);
3354eda14cbcSMatt Macy
3355f9693befSMartin Matuska  vdev_raidz_reconstruct_row(rm, rr, tgts, n);
3356eda14cbcSMatt Macy  }
3357eda14cbcSMatt Macy }
3358eda14cbcSMatt Macy
3359eda14cbcSMatt Macy /*
33607877fdebSMatt Macy  * Return the number of reads issued.
3361eda14cbcSMatt Macy  */
33627877fdebSMatt Macy static int
33637877fdebSMatt Macy vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
33647877fdebSMatt Macy {
33657877fdebSMatt Macy  vdev_t *vd = zio->io_vd;
33667877fdebSMatt Macy  int nread = 0;
3367eda14cbcSMatt Macy
33687877fdebSMatt Macy  rr->rr_missingdata = 0;
33697877fdebSMatt Macy  rr->rr_missingparity = 0;
33707877fdebSMatt Macy
33717877fdebSMatt Macy  /*
33727877fdebSMatt Macy  * If this row contains empty sectors which are not required
33737877fdebSMatt Macy  * for a normal read, then allocate an ABD for them now so they
33747877fdebSMatt Macy  * may be read, verified, and any needed repairs performed.
33757877fdebSMatt Macy  */
3376e716630dSMartin Matuska  if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL)
33777877fdebSMatt Macy  vdev_draid_map_alloc_empty(zio, rr);
33787877fdebSMatt Macy
33797877fdebSMatt Macy  for (int c = 0; c < rr->rr_cols; c++) {
33807877fdebSMatt Macy  raidz_col_t *rc = &rr->rr_col[c];
33817877fdebSMatt Macy  if (rc->rc_tried || rc->rc_size == 0)
3382eda14cbcSMatt Macy  continue;
3383eda14cbcSMatt Macy
3384eda14cbcSMatt Macy  zio_nowait(zio_vdev_child_io(zio, NULL,
3385eda14cbcSMatt Macy  vd->vdev_child[rc->rc_devidx],
3386eda14cbcSMatt Macy  rc->rc_offset, rc->rc_abd, rc->rc_size,
3387eda14cbcSMatt Macy  zio->io_type, zio->io_priority, 0,
3388eda14cbcSMatt Macy  vdev_raidz_child_done, rc));
33897877fdebSMatt Macy  nread++;
33907877fdebSMatt Macy  }
33917877fdebSMatt Macy  return (nread);
3392eda14cbcSMatt Macy }
3393eda14cbcSMatt Macy
3394eda14cbcSMatt Macy /*
33957877fdebSMatt Macy  * We're here because either there were too many errors to even attempt
33967877fdebSMatt Macy  * reconstruction (total_errors exceeded rr_firstdatacol), or vdev_*_combrec()
33977877fdebSMatt Macy  * failed. In either case, there is enough bad data to prevent reconstruction.
33987877fdebSMatt Macy  * Start checksum ereports for all children which haven't failed.
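 *
 * As a concrete illustration (not a code path, just the arithmetic):
 * a raidz2 row has rr_firstdatacol == 2, so at most two columns of
 * that row can ever be rebuilt. If three or more columns are damaged,
 * no combination tried by combinatorial reconstruction can succeed,
 * and we end up here.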
3399eda14cbcSMatt Macy */ 34007877fdebSMatt Macy static void 34017877fdebSMatt Macy vdev_raidz_io_done_unrecoverable(zio_t *zio) 34027877fdebSMatt Macy { 34037877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd; 3404eda14cbcSMatt Macy 34057877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 34067877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i]; 3407eda14cbcSMatt Macy 34087877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) { 34097877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c]; 34107877fdebSMatt Macy vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; 34117877fdebSMatt Macy 34122c48331dSMatt Macy if (rc->rc_error != 0) 34132c48331dSMatt Macy continue; 34142c48331dSMatt Macy 3415eda14cbcSMatt Macy zio_bad_cksum_t zbc; 3416eda14cbcSMatt Macy zbc.zbc_has_cksum = 0; 34172c48331dSMatt Macy zbc.zbc_injected = rm->rm_ecksuminjected; 3418eda14cbcSMatt Macy mutex_enter(&cvd->vdev_stat_lock); 3419eda14cbcSMatt Macy cvd->vdev_stat.vs_checksum_errors++; 3420eda14cbcSMatt Macy mutex_exit(&cvd->vdev_stat_lock); 3421bb2d13b6SMartin Matuska (void) zfs_ereport_start_checksum(zio->io_spa, 3422bb2d13b6SMartin Matuska cvd, &zio->io_bookmark, zio, rc->rc_offset, 3423bb2d13b6SMartin Matuska rc->rc_size, &zbc); 3424eda14cbcSMatt Macy } 3425eda14cbcSMatt Macy } 3426eda14cbcSMatt Macy } 3427eda14cbcSMatt Macy 34287877fdebSMatt Macy void 34297877fdebSMatt Macy vdev_raidz_io_done(zio_t *zio) 34307877fdebSMatt Macy { 34317877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd; 34327877fdebSMatt Macy 3433e716630dSMartin Matuska ASSERT(zio->io_bp != NULL); 34347877fdebSMatt Macy if (zio->io_type == ZIO_TYPE_WRITE) { 34357877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 34367877fdebSMatt Macy vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]); 34377877fdebSMatt Macy } 34387877fdebSMatt Macy } else { 3439e716630dSMartin Matuska if (rm->rm_phys_col) { 3440e716630dSMartin Matuska /* 3441e716630dSMartin Matuska * This is an aggregated read. Copy the data and status 3442e716630dSMartin Matuska * from the aggregate abd's to the individual rows. 3443e716630dSMartin Matuska */ 3444e716630dSMartin Matuska for (int i = 0; i < rm->rm_nrows; i++) { 3445e716630dSMartin Matuska raidz_row_t *rr = rm->rm_row[i]; 3446e716630dSMartin Matuska 3447e716630dSMartin Matuska for (int c = 0; c < rr->rr_cols; c++) { 3448e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c]; 3449e716630dSMartin Matuska if (rc->rc_tried || rc->rc_size == 0) 3450e716630dSMartin Matuska continue; 3451e716630dSMartin Matuska 3452e716630dSMartin Matuska raidz_col_t *prc = 3453e716630dSMartin Matuska &rm->rm_phys_col[rc->rc_devidx]; 3454e716630dSMartin Matuska rc->rc_error = prc->rc_error; 3455e716630dSMartin Matuska rc->rc_tried = prc->rc_tried; 3456e716630dSMartin Matuska rc->rc_skipped = prc->rc_skipped; 3457e716630dSMartin Matuska if (c >= rr->rr_firstdatacol) { 3458e716630dSMartin Matuska /* 3459e716630dSMartin Matuska * Note: this is slightly faster 3460e716630dSMartin Matuska * than using abd_copy_off(). 
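 *
 * The aggregated column abd (prc->rc_abd) holds one contiguous
 * read covering this device, so this column's data begins
 * (rc->rc_offset - prc->rc_offset) bytes into it. For example
 * (illustrative numbers only), if the aggregate read began at
 * prc->rc_offset = 0x20000 and this column sits at
 * rc->rc_offset = 0x26000, the copy below starts 0x6000 bytes
 * into the buffer.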
3461e716630dSMartin Matuska */ 3462e716630dSMartin Matuska char *physbuf = abd_to_buf( 3463e716630dSMartin Matuska prc->rc_abd); 3464e716630dSMartin Matuska void *physloc = physbuf + 3465e716630dSMartin Matuska rc->rc_offset - 3466e716630dSMartin Matuska prc->rc_offset; 3467e716630dSMartin Matuska 3468e716630dSMartin Matuska abd_copy_from_buf(rc->rc_abd, 3469e716630dSMartin Matuska physloc, rc->rc_size); 3470e716630dSMartin Matuska } 3471e716630dSMartin Matuska } 3472e716630dSMartin Matuska } 3473e716630dSMartin Matuska } 3474e716630dSMartin Matuska 34757877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 34767877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i]; 34777877fdebSMatt Macy vdev_raidz_io_done_reconstruct_known_missing(zio, 34787877fdebSMatt Macy rm, rr); 34797877fdebSMatt Macy } 34807877fdebSMatt Macy 34817877fdebSMatt Macy if (raidz_checksum_verify(zio) == 0) { 348287bf66d4SMartin Matuska if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) 348387bf66d4SMartin Matuska goto done; 348487bf66d4SMartin Matuska 34857877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 34867877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i]; 34877877fdebSMatt Macy vdev_raidz_io_done_verified(zio, rr); 34887877fdebSMatt Macy } 3489eda14cbcSMatt Macy zio_checksum_verified(zio); 34907877fdebSMatt Macy } else { 3491eda14cbcSMatt Macy /* 34927877fdebSMatt Macy * A sequential resilver has no checksum which makes 34937877fdebSMatt Macy * combinatoral reconstruction impossible. This code 34947877fdebSMatt Macy * path is unreachable since raidz_checksum_verify() 34957877fdebSMatt Macy * has no checksum to verify and must succeed. 3496eda14cbcSMatt Macy */ 34977877fdebSMatt Macy ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD); 3498eda14cbcSMatt Macy 34997877fdebSMatt Macy /* 35007877fdebSMatt Macy * This isn't a typical situation -- either we got a 35017877fdebSMatt Macy * read error or a child silently returned bad data. 35027877fdebSMatt Macy * Read every block so we can try again with as much 35037877fdebSMatt Macy * data and parity as we can track down. If we've 35047877fdebSMatt Macy * already been through once before, all children will 35057877fdebSMatt Macy * be marked as tried so we'll proceed to combinatorial 35067877fdebSMatt Macy * reconstruction. 35077877fdebSMatt Macy */ 35087877fdebSMatt Macy int nread = 0; 35097877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) { 35107877fdebSMatt Macy nread += vdev_raidz_read_all(zio, 35117877fdebSMatt Macy rm->rm_row[i]); 35127877fdebSMatt Macy } 35137877fdebSMatt Macy if (nread != 0) { 35147877fdebSMatt Macy /* 35157877fdebSMatt Macy * Normally our stage is VDEV_IO_DONE, but if 35167877fdebSMatt Macy * we've already called redone(), it will have 35177877fdebSMatt Macy * changed to VDEV_IO_START, in which case we 35187877fdebSMatt Macy * don't want to call redone() again. 35197877fdebSMatt Macy */ 35207877fdebSMatt Macy if (zio->io_stage != ZIO_STAGE_VDEV_IO_START) 35217877fdebSMatt Macy zio_vdev_io_redone(zio); 35227877fdebSMatt Macy return; 35237877fdebSMatt Macy } 3524e716630dSMartin Matuska /* 3525e716630dSMartin Matuska * It would be too expensive to try every possible 3526e716630dSMartin Matuska * combination of failed sectors in every row, so 3527e716630dSMartin Matuska * instead we try every combination of failed current or 3528e716630dSMartin Matuska * past physical disk. 
This means that if the incorrect
3529e716630dSMartin Matuska  * sectors were all on Nparity disks at any point in the
3530e716630dSMartin Matuska  * past, we will find the correct data. The only known
3531e716630dSMartin Matuska  * case where this is less durable than a non-expanded
3532e716630dSMartin Matuska  * RAIDZ is if we have a silent failure during
3533e716630dSMartin Matuska  * expansion. In that case, one block could be
3534e716630dSMartin Matuska  * partially in the old format and partially in the
3535e716630dSMartin Matuska  * new format, so we'd lose some sectors from the old
3536e716630dSMartin Matuska  * format and some from the new format.
3537e716630dSMartin Matuska  *
3538e716630dSMartin Matuska  * e.g. logical_width=4 physical_width=6
3539e716630dSMartin Matuska  * the 15 (6+5+4) possible failed disks are:
3540e716630dSMartin Matuska  * width=6 child=0
3541e716630dSMartin Matuska  * width=6 child=1
3542e716630dSMartin Matuska  * width=6 child=2
3543e716630dSMartin Matuska  * width=6 child=3
3544e716630dSMartin Matuska  * width=6 child=4
3545e716630dSMartin Matuska  * width=6 child=5
3546e716630dSMartin Matuska  * width=5 child=0
3547e716630dSMartin Matuska  * width=5 child=1
3548e716630dSMartin Matuska  * width=5 child=2
3549e716630dSMartin Matuska  * width=5 child=3
3550e716630dSMartin Matuska  * width=5 child=4
3551e716630dSMartin Matuska  * width=4 child=0
3552e716630dSMartin Matuska  * width=4 child=1
3553e716630dSMartin Matuska  * width=4 child=2
3554e716630dSMartin Matuska  * width=4 child=3
3555e716630dSMartin Matuska  * And we will try every combination of Nparity of these
3556e716630dSMartin Matuska  * failing.
3557e716630dSMartin Matuska  *
3558e716630dSMartin Matuska  * As a first pass, we can generate every combo,
3559e716630dSMartin Matuska  * and try reconstructing, ignoring any known
3560e716630dSMartin Matuska  * failures. If any row has too many known + simulated
3561e716630dSMartin Matuska  * failures, then we bail on reconstructing with this
3562e716630dSMartin Matuska  * number of simulated failures. As an improvement,
3563e716630dSMartin Matuska  * we could detect the number of whole known failures
3564e716630dSMartin Matuska  * (i.e. we have known failures on these disks for
3565e716630dSMartin Matuska  * every row; the disks never succeeded), and
3566e716630dSMartin Matuska  * subtract that from the max # failures to simulate.
3567e716630dSMartin Matuska  * We could go even further like the current
3568e716630dSMartin Matuska  * combrec code, but that doesn't seem like it
3569e716630dSMartin Matuska  * gains us very much. If we simulate a failure
3570e716630dSMartin Matuska  * that is also a known failure, that's fine.
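 *
 * A minimal sketch of that enumeration (illustrative only; the
 * variable names are hypothetical and the real logic, including the
 * handling of known failures described above, lives in
 * vdev_raidz_combrec() and its helpers):
 *
 *	for (uint_t w = physical_width; w >= logical_width; w--) {
 *		for (uint_t child = 0; child < w; child++) {
 *			(w, child) is one candidate "failed disk";
 *			try every combination of Nparity of them.
 *		}
 *	}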
3571e716630dSMartin Matuska */ 35727877fdebSMatt Macy zio->io_error = vdev_raidz_combrec(zio); 35737877fdebSMatt Macy if (zio->io_error == ECKSUM && 35747877fdebSMatt Macy !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 35757877fdebSMatt Macy vdev_raidz_io_done_unrecoverable(zio); 35767877fdebSMatt Macy } 3577eda14cbcSMatt Macy } 3578eda14cbcSMatt Macy } 357987bf66d4SMartin Matuska done: 3580e716630dSMartin Matuska if (rm->rm_lr != NULL) { 3581e716630dSMartin Matuska zfs_rangelock_exit(rm->rm_lr); 3582e716630dSMartin Matuska rm->rm_lr = NULL; 3583e716630dSMartin Matuska } 3584eda14cbcSMatt Macy } 3585eda14cbcSMatt Macy 3586eda14cbcSMatt Macy static void 3587eda14cbcSMatt Macy vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 3588eda14cbcSMatt Macy { 35897877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd; 35907877fdebSMatt Macy if (faulted > vdrz->vd_nparity) 3591eda14cbcSMatt Macy vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 3592eda14cbcSMatt Macy VDEV_AUX_NO_REPLICAS); 3593eda14cbcSMatt Macy else if (degraded + faulted != 0) 3594eda14cbcSMatt Macy vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 3595eda14cbcSMatt Macy else 3596eda14cbcSMatt Macy vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 3597eda14cbcSMatt Macy } 3598eda14cbcSMatt Macy 3599eda14cbcSMatt Macy /* 3600eda14cbcSMatt Macy * Determine if any portion of the provided block resides on a child vdev 3601eda14cbcSMatt Macy * with a dirty DTL and therefore needs to be resilvered. The function 3602eda14cbcSMatt Macy * assumes that at least one DTL is dirty which implies that full stripe 3603eda14cbcSMatt Macy * width blocks must be resilvered. 3604eda14cbcSMatt Macy */ 3605eda14cbcSMatt Macy static boolean_t 36067877fdebSMatt Macy vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, 36077877fdebSMatt Macy uint64_t phys_birth) 3608eda14cbcSMatt Macy { 36097877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd; 3610e716630dSMartin Matuska 3611e716630dSMartin Matuska /* 3612e716630dSMartin Matuska * If we're in the middle of a RAIDZ expansion, this block may be in 3613e716630dSMartin Matuska * the old and/or new location. For simplicity, always resilver it. 3614e716630dSMartin Matuska */ 3615e716630dSMartin Matuska if (vdrz->vn_vre.vre_state == DSS_SCANNING) 3616e716630dSMartin Matuska return (B_TRUE); 3617e716630dSMartin Matuska 3618eda14cbcSMatt Macy uint64_t dcols = vd->vdev_children; 36197877fdebSMatt Macy uint64_t nparity = vdrz->vd_nparity; 3620eda14cbcSMatt Macy uint64_t ashift = vd->vdev_top->vdev_ashift; 3621eda14cbcSMatt Macy /* The starting RAIDZ (parent) vdev sector of the block. */ 36227877fdebSMatt Macy uint64_t b = DVA_GET_OFFSET(dva) >> ashift; 3623eda14cbcSMatt Macy /* The zio's size in units of the vdev's minimum sector size. */ 3624eda14cbcSMatt Macy uint64_t s = ((psize - 1) >> ashift) + 1; 3625eda14cbcSMatt Macy /* The first column for this stripe. */ 3626eda14cbcSMatt Macy uint64_t f = b % dcols; 3627eda14cbcSMatt Macy 36287877fdebSMatt Macy /* Unreachable by sequential resilver. 
*/ 36297877fdebSMatt Macy ASSERT3U(phys_birth, !=, TXG_UNKNOWN); 36307877fdebSMatt Macy 36317877fdebSMatt Macy if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) 36327877fdebSMatt Macy return (B_FALSE); 36337877fdebSMatt Macy 3634eda14cbcSMatt Macy if (s + nparity >= dcols) 3635eda14cbcSMatt Macy return (B_TRUE); 3636eda14cbcSMatt Macy 3637eda14cbcSMatt Macy for (uint64_t c = 0; c < s + nparity; c++) { 3638eda14cbcSMatt Macy uint64_t devidx = (f + c) % dcols; 3639eda14cbcSMatt Macy vdev_t *cvd = vd->vdev_child[devidx]; 3640eda14cbcSMatt Macy 3641eda14cbcSMatt Macy /* 3642eda14cbcSMatt Macy * dsl_scan_need_resilver() already checked vd with 3643eda14cbcSMatt Macy * vdev_dtl_contains(). So here just check cvd with 3644eda14cbcSMatt Macy * vdev_dtl_empty(), cheaper and a good approximation. 3645eda14cbcSMatt Macy */ 3646eda14cbcSMatt Macy if (!vdev_dtl_empty(cvd, DTL_PARTIAL)) 3647eda14cbcSMatt Macy return (B_TRUE); 3648eda14cbcSMatt Macy } 3649eda14cbcSMatt Macy 3650eda14cbcSMatt Macy return (B_FALSE); 3651eda14cbcSMatt Macy } 3652eda14cbcSMatt Macy 3653eda14cbcSMatt Macy static void 3654b59a0cdeSMartin Matuska vdev_raidz_xlate(vdev_t *cvd, const zfs_range_seg64_t *logical_rs, 3655b59a0cdeSMartin Matuska zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs) 3656eda14cbcSMatt Macy { 3657e92ffd9bSMartin Matuska (void) remain_rs; 3658e92ffd9bSMartin Matuska 3659eda14cbcSMatt Macy vdev_t *raidvd = cvd->vdev_parent; 3660eda14cbcSMatt Macy ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); 3661eda14cbcSMatt Macy 3662e716630dSMartin Matuska vdev_raidz_t *vdrz = raidvd->vdev_tsd; 3663e716630dSMartin Matuska 3664e716630dSMartin Matuska if (vdrz->vn_vre.vre_state == DSS_SCANNING) { 3665e716630dSMartin Matuska /* 3666e716630dSMartin Matuska * We're in the middle of expansion, in which case the 3667e716630dSMartin Matuska * translation is in flux. Any answer we give may be wrong 3668e716630dSMartin Matuska * by the time we return, so it isn't safe for the caller to 3669e716630dSMartin Matuska * act on it. Therefore we say that this range isn't present 3670e716630dSMartin Matuska * on any children. The only consumers of this are "zpool 3671e716630dSMartin Matuska * initialize" and trimming, both of which are "best effort" 3672e716630dSMartin Matuska * anyway. 
3673e716630dSMartin Matuska */ 3674e716630dSMartin Matuska physical_rs->rs_start = physical_rs->rs_end = 0; 3675e716630dSMartin Matuska remain_rs->rs_start = remain_rs->rs_end = 0; 3676e716630dSMartin Matuska return; 3677e716630dSMartin Matuska } 3678e716630dSMartin Matuska 3679e716630dSMartin Matuska uint64_t width = vdrz->vd_physical_width; 3680eda14cbcSMatt Macy uint64_t tgt_col = cvd->vdev_id; 3681eda14cbcSMatt Macy uint64_t ashift = raidvd->vdev_top->vdev_ashift; 3682eda14cbcSMatt Macy 3683eda14cbcSMatt Macy /* make sure the offsets are block-aligned */ 36847877fdebSMatt Macy ASSERT0(logical_rs->rs_start % (1 << ashift)); 36857877fdebSMatt Macy ASSERT0(logical_rs->rs_end % (1 << ashift)); 36867877fdebSMatt Macy uint64_t b_start = logical_rs->rs_start >> ashift; 36877877fdebSMatt Macy uint64_t b_end = logical_rs->rs_end >> ashift; 3688eda14cbcSMatt Macy 3689eda14cbcSMatt Macy uint64_t start_row = 0; 3690eda14cbcSMatt Macy if (b_start > tgt_col) /* avoid underflow */ 3691eda14cbcSMatt Macy start_row = ((b_start - tgt_col - 1) / width) + 1; 3692eda14cbcSMatt Macy 3693eda14cbcSMatt Macy uint64_t end_row = 0; 3694eda14cbcSMatt Macy if (b_end > tgt_col) 3695eda14cbcSMatt Macy end_row = ((b_end - tgt_col - 1) / width) + 1; 3696eda14cbcSMatt Macy 36977877fdebSMatt Macy physical_rs->rs_start = start_row << ashift; 36987877fdebSMatt Macy physical_rs->rs_end = end_row << ashift; 3699eda14cbcSMatt Macy 37007877fdebSMatt Macy ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start); 37017877fdebSMatt Macy ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=, 37027877fdebSMatt Macy logical_rs->rs_end - logical_rs->rs_start); 37037877fdebSMatt Macy } 37047877fdebSMatt Macy 3705e716630dSMartin Matuska static void 3706e716630dSMartin Matuska raidz_reflow_sync(void *arg, dmu_tx_t *tx) 3707e716630dSMartin Matuska { 3708e716630dSMartin Matuska spa_t *spa = arg; 3709e716630dSMartin Matuska int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 3710e716630dSMartin Matuska vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 3711e716630dSMartin Matuska 3712e716630dSMartin Matuska /* 3713e716630dSMartin Matuska * Ensure there are no i/os to the range that is being committed. 3714e716630dSMartin Matuska */ 3715e716630dSMartin Matuska uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock); 3716e716630dSMartin Matuska ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset); 3717e716630dSMartin Matuska 3718e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 3719e716630dSMartin Matuska uint64_t new_offset = 3720e716630dSMartin Matuska MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset); 3721e716630dSMartin Matuska /* 3722e716630dSMartin Matuska * We should not have committed anything that failed. 3723e716630dSMartin Matuska */ 3724e716630dSMartin Matuska VERIFY3U(vre->vre_failed_offset, >=, old_offset); 3725e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 3726e716630dSMartin Matuska 3727e716630dSMartin Matuska zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, 3728e716630dSMartin Matuska old_offset, new_offset - old_offset, 3729e716630dSMartin Matuska RL_WRITER); 3730e716630dSMartin Matuska 3731e716630dSMartin Matuska /* 3732e716630dSMartin Matuska * Update the uberblock that will be written when this txg completes. 
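 *
 * (After this txg syncs, RRSS_GET_STATE(&spa->spa_ubsync) will report
 * RRSS_SCRATCH_INVALID_SYNCED_REFLOW and RRSS_GET_OFFSET() will report
 * new_offset; both are packed into ub_raidz_reflow_info by the
 * RAIDZ_REFLOW_SET() macro, and spa_raidz_expand_thread() reads the
 * offset back from there when the reflow is restarted.)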
3733e716630dSMartin Matuska */ 3734e716630dSMartin Matuska RAIDZ_REFLOW_SET(&spa->spa_uberblock, 3735e716630dSMartin Matuska RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset); 3736e716630dSMartin Matuska vre->vre_offset_pertxg[txgoff] = 0; 3737e716630dSMartin Matuska zfs_rangelock_exit(lr); 3738e716630dSMartin Matuska 3739e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 3740e716630dSMartin Matuska vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff]; 3741e716630dSMartin Matuska vre->vre_bytes_copied_pertxg[txgoff] = 0; 3742e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 3743e716630dSMartin Matuska 3744e716630dSMartin Matuska vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); 3745e716630dSMartin Matuska VERIFY0(zap_update(spa->spa_meta_objset, 3746e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, 3747e716630dSMartin Matuska sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx)); 3748e716630dSMartin Matuska } 3749e716630dSMartin Matuska 3750e716630dSMartin Matuska static void 3751e716630dSMartin Matuska raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx) 3752e716630dSMartin Matuska { 3753e716630dSMartin Matuska spa_t *spa = arg; 3754e716630dSMartin Matuska vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 3755e716630dSMartin Matuska vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 3756e716630dSMartin Matuska vdev_raidz_t *vdrz = raidvd->vdev_tsd; 3757e716630dSMartin Matuska 3758e716630dSMartin Matuska for (int i = 0; i < TXG_SIZE; i++) 3759e716630dSMartin Matuska VERIFY0(vre->vre_offset_pertxg[i]); 3760e716630dSMartin Matuska 3761e716630dSMartin Matuska reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); 3762e716630dSMartin Matuska re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES; 3763e716630dSMartin Matuska re->re_logical_width = vdrz->vd_physical_width; 3764e716630dSMartin Matuska mutex_enter(&vdrz->vd_expand_lock); 3765e716630dSMartin Matuska avl_add(&vdrz->vd_expand_txgs, re); 3766e716630dSMartin Matuska mutex_exit(&vdrz->vd_expand_lock); 3767e716630dSMartin Matuska 3768e716630dSMartin Matuska vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); 3769e716630dSMartin Matuska 3770e716630dSMartin Matuska /* 3771e716630dSMartin Matuska * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS 3772e716630dSMartin Matuska * will get written (based on vd_expand_txgs). 3773e716630dSMartin Matuska */ 3774e716630dSMartin Matuska vdev_config_dirty(vd); 3775e716630dSMartin Matuska 3776e716630dSMartin Matuska /* 3777e716630dSMartin Matuska * Before we change vre_state, the on-disk state must reflect that we 3778e716630dSMartin Matuska * have completed all copying, so that vdev_raidz_io_start() can use 3779e716630dSMartin Matuska * vre_state to determine if the reflow is in progress. See also the 3780e716630dSMartin Matuska * end of spa_raidz_expand_thread(). 
3781e716630dSMartin Matuska  */
3782e716630dSMartin Matuska  VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==,
3783e716630dSMartin Matuska  raidvd->vdev_ms_count << raidvd->vdev_ms_shift);
3784e716630dSMartin Matuska
3785e716630dSMartin Matuska  vre->vre_end_time = gethrestime_sec();
3786e716630dSMartin Matuska  vre->vre_state = DSS_FINISHED;
3787e716630dSMartin Matuska
3788e716630dSMartin Matuska  uint64_t state = vre->vre_state;
3789e716630dSMartin Matuska  VERIFY0(zap_update(spa->spa_meta_objset,
3790e716630dSMartin Matuska  vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
3791e716630dSMartin Matuska  sizeof (state), 1, &state, tx));
3792e716630dSMartin Matuska
3793e716630dSMartin Matuska  uint64_t end_time = vre->vre_end_time;
3794e716630dSMartin Matuska  VERIFY0(zap_update(spa->spa_meta_objset,
3795e716630dSMartin Matuska  vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
3796e716630dSMartin Matuska  sizeof (end_time), 1, &end_time, tx));
3797e716630dSMartin Matuska
3798e716630dSMartin Matuska  spa->spa_uberblock.ub_raidz_reflow_info = 0;
3799e716630dSMartin Matuska
3800e716630dSMartin Matuska  spa_history_log_internal(spa, "raidz vdev expansion completed", tx,
3801e716630dSMartin Matuska  "%s vdev %llu new width %llu", spa_name(spa),
3802e716630dSMartin Matuska  (unsigned long long)vd->vdev_id,
3803e716630dSMartin Matuska  (unsigned long long)vd->vdev_children);
3804e716630dSMartin Matuska
3805e716630dSMartin Matuska  spa->spa_raidz_expand = NULL;
3806e716630dSMartin Matuska  raidvd->vdev_rz_expanding = B_FALSE;
3807e716630dSMartin Matuska
3808e716630dSMartin Matuska  spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
3809e716630dSMartin Matuska  spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
3810e716630dSMartin Matuska  spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
3811e716630dSMartin Matuska
3812e716630dSMartin Matuska  spa_notify_waiters(spa);
3813e716630dSMartin Matuska
3814e716630dSMartin Matuska  /*
3815e716630dSMartin Matuska  * While we're in syncing context, take the opportunity to
3816e716630dSMartin Matuska  * set up a scrub. All the data has been successfully copied
3817e716630dSMartin Matuska  * but we have not validated any checksums.
3818e716630dSMartin Matuska  */
381917aab35aSMartin Matuska  setup_sync_arg_t setup_sync_arg = {
382017aab35aSMartin Matuska  .func = POOL_SCAN_SCRUB,
382117aab35aSMartin Matuska  .txgstart = 0,
382217aab35aSMartin Matuska  .txgend = 0,
382317aab35aSMartin Matuska  };
382417aab35aSMartin Matuska  if (zfs_scrub_after_expand &&
382517aab35aSMartin Matuska  dsl_scan_setup_check(&setup_sync_arg.func, tx) == 0) {
382617aab35aSMartin Matuska  dsl_scan_setup_sync(&setup_sync_arg, tx);
382717aab35aSMartin Matuska  }
3828e716630dSMartin Matuska }
3829e716630dSMartin Matuska
3830e716630dSMartin Matuska /*
383117aab35aSMartin Matuska  * State of one copy batch.
3832e716630dSMartin Matuska  */
3833e716630dSMartin Matuska typedef struct raidz_reflow_arg {
383417aab35aSMartin Matuska  vdev_raidz_expand_t *rra_vre; /* Global expansion state. */
383517aab35aSMartin Matuska  zfs_locked_range_t *rra_lr; /* Range lock of this batch. */
383617aab35aSMartin Matuska  uint64_t rra_txg; /* TXG of this batch. */
383717aab35aSMartin Matuska  uint_t rra_ashift; /* Ashift of the vdev. */
383817aab35aSMartin Matuska  uint32_t rra_tbd; /* Number of in-flight ZIOs. */
383917aab35aSMartin Matuska  uint32_t rra_writes; /* Number of write ZIOs. */
384017aab35aSMartin Matuska  zio_t *rra_zio[]; /* Write ZIO pointers.
*/ 3841e716630dSMartin Matuska } raidz_reflow_arg_t; 3842e716630dSMartin Matuska 3843e716630dSMartin Matuska /* 384417aab35aSMartin Matuska * Write of the new location on one child is done. Once all of them are done 384517aab35aSMartin Matuska * we can unlock and free everything. 3846e716630dSMartin Matuska */ 3847e716630dSMartin Matuska static void 3848e716630dSMartin Matuska raidz_reflow_write_done(zio_t *zio) 3849e716630dSMartin Matuska { 3850e716630dSMartin Matuska raidz_reflow_arg_t *rra = zio->io_private; 3851e716630dSMartin Matuska vdev_raidz_expand_t *vre = rra->rra_vre; 3852e716630dSMartin Matuska 3853e716630dSMartin Matuska abd_free(zio->io_abd); 3854e716630dSMartin Matuska 3855e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 3856e716630dSMartin Matuska if (zio->io_error != 0) { 3857e716630dSMartin Matuska /* Force a reflow pause on errors */ 3858e716630dSMartin Matuska vre->vre_failed_offset = 3859e716630dSMartin Matuska MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); 3860e716630dSMartin Matuska } 3861e716630dSMartin Matuska ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size); 3862e716630dSMartin Matuska vre->vre_outstanding_bytes -= zio->io_size; 3863e716630dSMartin Matuska if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length < 3864e716630dSMartin Matuska vre->vre_failed_offset) { 3865e716630dSMartin Matuska vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] += 3866e716630dSMartin Matuska zio->io_size; 3867e716630dSMartin Matuska } 3868e716630dSMartin Matuska cv_signal(&vre->vre_cv); 386917aab35aSMartin Matuska boolean_t done = (--rra->rra_tbd == 0); 3870e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 3871e716630dSMartin Matuska 387217aab35aSMartin Matuska if (!done) 387317aab35aSMartin Matuska return; 3874e716630dSMartin Matuska spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa); 387517aab35aSMartin Matuska zfs_rangelock_exit(rra->rra_lr); 387617aab35aSMartin Matuska kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * rra->rra_writes); 3877e716630dSMartin Matuska } 3878e716630dSMartin Matuska 3879e716630dSMartin Matuska /* 388017aab35aSMartin Matuska * Read of the old location on one child is done. Once all of them are done 388117aab35aSMartin Matuska * writes should have all the data and we can issue them. 3882e716630dSMartin Matuska */ 3883e716630dSMartin Matuska static void 3884e716630dSMartin Matuska raidz_reflow_read_done(zio_t *zio) 3885e716630dSMartin Matuska { 3886e716630dSMartin Matuska raidz_reflow_arg_t *rra = zio->io_private; 3887e716630dSMartin Matuska vdev_raidz_expand_t *vre = rra->rra_vre; 3888e716630dSMartin Matuska 388917aab35aSMartin Matuska /* Reads of only one block use write ABDs. For bigger free gangs. */ 389017aab35aSMartin Matuska if (zio->io_size > (1 << rra->rra_ashift)) 389117aab35aSMartin Matuska abd_free(zio->io_abd); 389217aab35aSMartin Matuska 3893e716630dSMartin Matuska /* 3894e716630dSMartin Matuska * If the read failed, or if it was done on a vdev that is not fully 3895e716630dSMartin Matuska * healthy (e.g. a child that has a resilver in progress), we may not 3896e716630dSMartin Matuska * have the correct data. Note that it's OK if the write proceeds. 3897e716630dSMartin Matuska * It may write garbage but the location is otherwise unused and we 3898e716630dSMartin Matuska * will retry later due to vre_failed_offset. 
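 *
 * For reference, the rra_tbd handshake that drives one copy batch
 * (a summary of this function, raidz_reflow_write_done() and
 * raidz_reflow_impl(), not a separate mechanism):
 *
 *	rra_tbd = reads;	issue all reads
 *	each read done:		if (atomic_dec_32_nv(&rra_tbd) == 0)
 *					rra_tbd = rra_writes; issue writes
 *	each write done:	if (--rra_tbd == 0)
 *					drop SCL_STATE, exit the range
 *					lock, and free the rra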
3899e716630dSMartin Matuska  */
3900e716630dSMartin Matuska  if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) {
3901e716630dSMartin Matuska  zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu "
3902e716630dSMartin Matuska  "err=%u partial_dtl_empty=%u missing_dtl_empty=%u",
3903e716630dSMartin Matuska  (long long)rra->rra_lr->lr_offset,
3904e716630dSMartin Matuska  (long long)rra->rra_lr->lr_length,
3905e716630dSMartin Matuska  (long long)rra->rra_txg,
3906e716630dSMartin Matuska  zio->io_error,
3907e716630dSMartin Matuska  vdev_dtl_empty(zio->io_vd, DTL_PARTIAL),
3908e716630dSMartin Matuska  vdev_dtl_empty(zio->io_vd, DTL_MISSING));
3909e716630dSMartin Matuska  mutex_enter(&vre->vre_lock);
3910e716630dSMartin Matuska  /* Force a reflow pause on errors */
3911e716630dSMartin Matuska  vre->vre_failed_offset =
3912e716630dSMartin Matuska  MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
3913e716630dSMartin Matuska  mutex_exit(&vre->vre_lock);
3914e716630dSMartin Matuska  }
3915e716630dSMartin Matuska
391617aab35aSMartin Matuska  if (atomic_dec_32_nv(&rra->rra_tbd) > 0)
391717aab35aSMartin Matuska  return;
3918dd215568SMartin Matuska  uint32_t writes = rra->rra_tbd = rra->rra_writes;
3919dd215568SMartin Matuska  for (uint64_t i = 0; i < writes; i++)
392017aab35aSMartin Matuska  zio_nowait(rra->rra_zio[i]);
3921e716630dSMartin Matuska }
3922e716630dSMartin Matuska
3923e716630dSMartin Matuska static void
3924e716630dSMartin Matuska raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset,
3925e716630dSMartin Matuska  dmu_tx_t *tx)
3926e716630dSMartin Matuska {
3927e716630dSMartin Matuska  int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
3928e716630dSMartin Matuska  spa_t *spa = dmu_tx_pool(tx)->dp_spa;
3929e716630dSMartin Matuska
3930e716630dSMartin Matuska  if (offset == 0)
3931e716630dSMartin Matuska  return;
3932e716630dSMartin Matuska
3933e716630dSMartin Matuska  mutex_enter(&vre->vre_lock);
3934e716630dSMartin Matuska  ASSERT3U(vre->vre_offset, <=, offset);
3935e716630dSMartin Matuska  vre->vre_offset = offset;
3936e716630dSMartin Matuska  mutex_exit(&vre->vre_lock);
3937e716630dSMartin Matuska
3938e716630dSMartin Matuska  if (vre->vre_offset_pertxg[txgoff] == 0) {
3939e716630dSMartin Matuska  dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync,
3940e716630dSMartin Matuska  spa, tx);
3941e716630dSMartin Matuska  }
3942e716630dSMartin Matuska  vre->vre_offset_pertxg[txgoff] = offset;
3943e716630dSMartin Matuska }
3944e716630dSMartin Matuska
3945e716630dSMartin Matuska static boolean_t
3946e716630dSMartin Matuska vdev_raidz_expand_child_replacing(vdev_t *raidz_vd)
3947e716630dSMartin Matuska {
3948e716630dSMartin Matuska  for (int i = 0; i < raidz_vd->vdev_children; i++) {
3949e716630dSMartin Matuska  /* Quick check if a child is being replaced */
3950e716630dSMartin Matuska  if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf)
3951e716630dSMartin Matuska  return (B_TRUE);
3952e716630dSMartin Matuska  }
3953e716630dSMartin Matuska  return (B_FALSE);
3954e716630dSMartin Matuska }
3955e716630dSMartin Matuska
3956e716630dSMartin Matuska static boolean_t
3957b59a0cdeSMartin Matuska raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, zfs_range_tree_t *rt,
3958e716630dSMartin Matuska  dmu_tx_t *tx)
3959e716630dSMartin Matuska {
3960e716630dSMartin Matuska  spa_t *spa = vd->vdev_spa;
396117aab35aSMartin Matuska  uint_t ashift = vd->vdev_top->vdev_ashift;
3962e716630dSMartin Matuska
3963b59a0cdeSMartin Matuska  zfs_range_seg_t *rs = zfs_range_tree_first(rt);
396417aab35aSMartin Matuska  if (rs ==
NULL) 3965e716630dSMartin Matuska return (B_FALSE); 3966b59a0cdeSMartin Matuska uint64_t offset = zfs_rs_get_start(rs, rt); 3967e716630dSMartin Matuska ASSERT(IS_P2ALIGNED(offset, 1 << ashift)); 3968b59a0cdeSMartin Matuska uint64_t size = zfs_rs_get_end(rs, rt) - offset; 3969e716630dSMartin Matuska ASSERT3U(size, >=, 1 << ashift); 397017aab35aSMartin Matuska ASSERT(IS_P2ALIGNED(size, 1 << ashift)); 3971e716630dSMartin Matuska 3972e716630dSMartin Matuska uint64_t blkid = offset >> ashift; 397317aab35aSMartin Matuska uint_t old_children = vd->vdev_children - 1; 3974e716630dSMartin Matuska 3975e716630dSMartin Matuska /* 3976e716630dSMartin Matuska * We can only progress to the point that writes will not overlap 3977e716630dSMartin Matuska * with blocks whose progress has not yet been recorded on disk. 3978e716630dSMartin Matuska * Since partially-copied rows are still read from the old location, 3979e716630dSMartin Matuska * we need to stop one row before the sector-wise overlap, to prevent 3980e716630dSMartin Matuska * row-wise overlap. 3981e716630dSMartin Matuska * 3982e716630dSMartin Matuska * Note that even if we are skipping over a large unallocated region, 3983e716630dSMartin Matuska * we can't move the on-disk progress to `offset`, because concurrent 3984e716630dSMartin Matuska * writes/allocations could still use the currently-unallocated 3985e716630dSMartin Matuska * region. 3986e716630dSMartin Matuska */ 3987e716630dSMartin Matuska uint64_t ubsync_blkid = 3988e716630dSMartin Matuska RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift; 3989e716630dSMartin Matuska uint64_t next_overwrite_blkid = ubsync_blkid + 3990e716630dSMartin Matuska ubsync_blkid / old_children - old_children; 3991e716630dSMartin Matuska VERIFY3U(next_overwrite_blkid, >, ubsync_blkid); 3992e716630dSMartin Matuska if (blkid >= next_overwrite_blkid) { 3993e716630dSMartin Matuska raidz_reflow_record_progress(vre, 3994e716630dSMartin Matuska next_overwrite_blkid << ashift, tx); 3995e716630dSMartin Matuska return (B_TRUE); 3996e716630dSMartin Matuska } 3997e716630dSMartin Matuska 399817aab35aSMartin Matuska size = MIN(size, raidz_expand_max_copy_bytes); 399917aab35aSMartin Matuska size = MIN(size, (uint64_t)old_children * 400017aab35aSMartin Matuska MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE)); 400117aab35aSMartin Matuska size = MAX(size, 1 << ashift); 400217aab35aSMartin Matuska uint_t blocks = MIN(size >> ashift, next_overwrite_blkid - blkid); 400317aab35aSMartin Matuska size = (uint64_t)blocks << ashift; 4004e716630dSMartin Matuska 4005b59a0cdeSMartin Matuska zfs_range_tree_remove(rt, offset, size); 400617aab35aSMartin Matuska 400717aab35aSMartin Matuska uint_t reads = MIN(blocks, old_children); 400817aab35aSMartin Matuska uint_t writes = MIN(blocks, vd->vdev_children); 400917aab35aSMartin Matuska raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra) + 401017aab35aSMartin Matuska sizeof (zio_t *) * writes, KM_SLEEP); 4011e716630dSMartin Matuska rra->rra_vre = vre; 4012e716630dSMartin Matuska rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock, 401317aab35aSMartin Matuska offset, size, RL_WRITER); 4014e716630dSMartin Matuska rra->rra_txg = dmu_tx_get_txg(tx); 401517aab35aSMartin Matuska rra->rra_ashift = ashift; 401617aab35aSMartin Matuska rra->rra_tbd = reads; 401717aab35aSMartin Matuska rra->rra_writes = writes; 4018e716630dSMartin Matuska 401917aab35aSMartin Matuska raidz_reflow_record_progress(vre, offset + size, tx); 4020e716630dSMartin Matuska 4021e716630dSMartin Matuska /* 4022e716630dSMartin Matuska * SCL_STATE will be 
released when the read and write are done, 4023e716630dSMartin Matuska * by raidz_reflow_write_done(). 4024e716630dSMartin Matuska */ 4025e716630dSMartin Matuska spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4026e716630dSMartin Matuska 4027e716630dSMartin Matuska /* check if a replacing vdev was added, if so treat it as an error */ 4028e716630dSMartin Matuska if (vdev_raidz_expand_child_replacing(vd)) { 4029e716630dSMartin Matuska zfs_dbgmsg("replacing vdev encountered, reflow paused at " 4030e716630dSMartin Matuska "offset=%llu txg=%llu", 4031e716630dSMartin Matuska (long long)rra->rra_lr->lr_offset, 4032e716630dSMartin Matuska (long long)rra->rra_txg); 4033e716630dSMartin Matuska 4034e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 4035e716630dSMartin Matuska vre->vre_failed_offset = 4036e716630dSMartin Matuska MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); 4037e716630dSMartin Matuska cv_signal(&vre->vre_cv); 4038e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 4039e716630dSMartin Matuska 4040e716630dSMartin Matuska /* drop everything we acquired */ 4041e716630dSMartin Matuska spa_config_exit(spa, SCL_STATE, spa); 404217aab35aSMartin Matuska zfs_rangelock_exit(rra->rra_lr); 404317aab35aSMartin Matuska kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * writes); 4044e716630dSMartin Matuska return (B_TRUE); 4045e716630dSMartin Matuska } 4046e716630dSMartin Matuska 404717aab35aSMartin Matuska mutex_enter(&vre->vre_lock); 404817aab35aSMartin Matuska vre->vre_outstanding_bytes += size; 404917aab35aSMartin Matuska mutex_exit(&vre->vre_lock); 4050e716630dSMartin Matuska 405117aab35aSMartin Matuska /* Allocate ABD and ZIO for each child we write. */ 405217aab35aSMartin Matuska int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 405317aab35aSMartin Matuska zio_t *pio = spa->spa_txg_zio[txgoff]; 405417aab35aSMartin Matuska uint_t b = blocks / vd->vdev_children; 405517aab35aSMartin Matuska uint_t bb = blocks % vd->vdev_children; 405617aab35aSMartin Matuska for (uint_t i = 0; i < writes; i++) { 405717aab35aSMartin Matuska uint_t n = b + (i < bb); 405817aab35aSMartin Matuska abd_t *abd = abd_alloc_for_io(n << ashift, B_FALSE); 405917aab35aSMartin Matuska rra->rra_zio[i] = zio_vdev_child_io(pio, NULL, 406017aab35aSMartin Matuska vd->vdev_child[(blkid + i) % vd->vdev_children], 406117aab35aSMartin Matuska ((blkid + i) / vd->vdev_children) << ashift, 406217aab35aSMartin Matuska abd, n << ashift, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, 406317aab35aSMartin Matuska ZIO_FLAG_CANFAIL, raidz_reflow_write_done, rra); 406417aab35aSMartin Matuska } 406517aab35aSMartin Matuska 406617aab35aSMartin Matuska /* 406717aab35aSMartin Matuska * Allocate and issue ZIO for each child we read. For reads of only 406817aab35aSMartin Matuska * one block we can use respective writer ABDs, since they will also 406917aab35aSMartin Matuska * have only one block. For bigger reads create gang ABDs and fill 407017aab35aSMartin Matuska * them with respective blocks from writer ABDs. 
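 *
 * For example (illustrative numbers), with old_children = 4,
 * vd->vdev_children = 5 and blkid = 20, logical block 20 + k is read
 * from old child (20 + k) % 4, row (20 + k) / 4, and written to new
 * child (20 + k) % 5, row (20 + k) / 5:
 *
 *	k:		  0    1    2    3    4    5    6    7
 *	old (child,row): 0,5  1,5  2,5  3,5  0,6  1,6  2,6  3,6
 *	new (child,row): 0,4  1,4  2,4  3,4  4,4  0,5  1,5  2,5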
407117aab35aSMartin Matuska */ 407217aab35aSMartin Matuska b = blocks / old_children; 407317aab35aSMartin Matuska bb = blocks % old_children; 407417aab35aSMartin Matuska for (uint_t i = 0; i < reads; i++) { 407517aab35aSMartin Matuska uint_t n = b + (i < bb); 407617aab35aSMartin Matuska abd_t *abd; 407717aab35aSMartin Matuska if (n > 1) { 407817aab35aSMartin Matuska abd = abd_alloc_gang(); 407917aab35aSMartin Matuska for (uint_t j = 0; j < n; j++) { 408017aab35aSMartin Matuska uint_t b = j * old_children + i; 408117aab35aSMartin Matuska abd_t *cabd = abd_get_offset_size( 408217aab35aSMartin Matuska rra->rra_zio[b % vd->vdev_children]->io_abd, 408317aab35aSMartin Matuska (b / vd->vdev_children) << ashift, 408417aab35aSMartin Matuska 1 << ashift); 408517aab35aSMartin Matuska abd_gang_add(abd, cabd, B_TRUE); 408617aab35aSMartin Matuska } 408717aab35aSMartin Matuska } else { 408817aab35aSMartin Matuska abd = rra->rra_zio[i]->io_abd; 408917aab35aSMartin Matuska } 409017aab35aSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, 409117aab35aSMartin Matuska vd->vdev_child[(blkid + i) % old_children], 409217aab35aSMartin Matuska ((blkid + i) / old_children) << ashift, abd, 409317aab35aSMartin Matuska n << ashift, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 409417aab35aSMartin Matuska ZIO_FLAG_CANFAIL, raidz_reflow_read_done, rra)); 409517aab35aSMartin Matuska } 4096e716630dSMartin Matuska 4097e716630dSMartin Matuska return (B_FALSE); 4098e716630dSMartin Matuska } 4099e716630dSMartin Matuska 4100e716630dSMartin Matuska /* 4101e716630dSMartin Matuska * For testing (ztest specific) 4102e716630dSMartin Matuska */ 4103e716630dSMartin Matuska static void 4104e716630dSMartin Matuska raidz_expand_pause(uint_t pause_point) 4105e716630dSMartin Matuska { 4106e716630dSMartin Matuska while (raidz_expand_pause_point != 0 && 4107e716630dSMartin Matuska raidz_expand_pause_point <= pause_point) 4108e716630dSMartin Matuska delay(hz); 4109e716630dSMartin Matuska } 4110e716630dSMartin Matuska 4111e716630dSMartin Matuska static void 4112e716630dSMartin Matuska raidz_scratch_child_done(zio_t *zio) 4113e716630dSMartin Matuska { 4114e716630dSMartin Matuska zio_t *pio = zio->io_private; 4115e716630dSMartin Matuska 4116e716630dSMartin Matuska mutex_enter(&pio->io_lock); 4117e716630dSMartin Matuska pio->io_error = zio_worst_error(pio->io_error, zio->io_error); 4118e716630dSMartin Matuska mutex_exit(&pio->io_lock); 4119e716630dSMartin Matuska } 4120e716630dSMartin Matuska 4121e716630dSMartin Matuska /* 4122e716630dSMartin Matuska * Reflow the beginning portion of the vdev into an intermediate scratch area 4123e716630dSMartin Matuska * in memory and on disk. This operation must be persisted on disk before we 4124e716630dSMartin Matuska * proceed to overwrite the beginning portion with the reflowed data. 4125e716630dSMartin Matuska * 4126e716630dSMartin Matuska * This multi-step task can fail to complete if disk errors are encountered 4127e716630dSMartin Matuska * and we can return here after a pause (waiting for disk to become healthy). 
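 *
 * The uberblock scratch states driven by this function and its
 * recovery path (a rough sketch; the authoritative values are the
 * RRSS_* constants used below):
 *
 *	copy the boot-area-sized prefix into the scratch space
 *	    -> RRSS_SCRATCH_VALID
 *	overwrite the real location with the reflowed data
 *	    -> RRSS_SCRATCH_INVALID_SYNCED
 *	(crash while RRSS_SCRATCH_VALID: vdev_raidz_reflow_copy_scratch()
 *	    re-copies from the scratch space on import
 *	    -> RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT)
 *	subsequent progress is recorded by raidz_reflow_sync()
 *	    -> RRSS_SCRATCH_INVALID_SYNCED_REFLOW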
4128e716630dSMartin Matuska */ 4129e716630dSMartin Matuska static void 4130e716630dSMartin Matuska raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) 4131e716630dSMartin Matuska { 4132e716630dSMartin Matuska vdev_raidz_expand_t *vre = arg; 4133e716630dSMartin Matuska spa_t *spa = dmu_tx_pool(tx)->dp_spa; 4134e716630dSMartin Matuska zio_t *pio; 4135e716630dSMartin Matuska int error; 4136e716630dSMartin Matuska 4137e716630dSMartin Matuska spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4138e716630dSMartin Matuska vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4139e716630dSMartin Matuska int ashift = raidvd->vdev_ashift; 4140aca928a5SMartin Matuska uint64_t write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << ashift, 4141aca928a5SMartin Matuska uint64_t); 4142e716630dSMartin Matuska uint64_t logical_size = write_size * raidvd->vdev_children; 4143e716630dSMartin Matuska uint64_t read_size = 4144e716630dSMartin Matuska P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)), 4145e716630dSMartin Matuska 1 << ashift); 4146e716630dSMartin Matuska 4147e716630dSMartin Matuska /* 4148e716630dSMartin Matuska * The scratch space must be large enough to get us to the point 4149e716630dSMartin Matuska * that one row does not overlap itself when moved. This is checked 4150e716630dSMartin Matuska * by vdev_raidz_attach_check(). 4151e716630dSMartin Matuska */ 4152e716630dSMartin Matuska VERIFY3U(write_size, >=, raidvd->vdev_children << ashift); 4153e716630dSMartin Matuska VERIFY3U(write_size, <=, VDEV_BOOT_SIZE); 4154e716630dSMartin Matuska VERIFY3U(write_size, <=, read_size); 4155e716630dSMartin Matuska 4156e716630dSMartin Matuska zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, 4157e716630dSMartin Matuska 0, logical_size, RL_WRITER); 4158e716630dSMartin Matuska 4159e716630dSMartin Matuska abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), 4160e716630dSMartin Matuska KM_SLEEP); 4161e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) { 4162e716630dSMartin Matuska abds[i] = abd_alloc_linear(read_size, B_FALSE); 4163e716630dSMartin Matuska } 4164e716630dSMartin Matuska 4165e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1); 4166e716630dSMartin Matuska 4167e716630dSMartin Matuska /* 4168e716630dSMartin Matuska * If we have already written the scratch area then we must read from 4169e716630dSMartin Matuska * there, since new writes were redirected there while we were paused 4170e716630dSMartin Matuska * or the original location may have been partially overwritten with 4171e716630dSMartin Matuska * reflowed data. 4172e716630dSMartin Matuska */ 4173e716630dSMartin Matuska if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) { 4174e716630dSMartin Matuska VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size); 4175e716630dSMartin Matuska /* 4176e716630dSMartin Matuska * Read from scratch space. 4177e716630dSMartin Matuska */ 4178e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4179e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) { 4180e716630dSMartin Matuska /* 4181e716630dSMartin Matuska * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE 4182e716630dSMartin Matuska * to the offset to calculate the physical offset to 4183e716630dSMartin Matuska * write to. Passing in a negative offset makes us 4184e716630dSMartin Matuska * access the scratch area. 
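 *
 * In other words, the physical offset that is actually issued works
 * out to (VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE) +
 * VDEV_LABEL_START_SIZE == VDEV_BOOT_OFFSET, i.e. the reserved boot
 * area that serves as the scratch space.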
4185e716630dSMartin Matuska */ 4186e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, 4187e716630dSMartin Matuska raidvd->vdev_child[i], 4188e716630dSMartin Matuska VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], 418917aab35aSMartin Matuska write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 4190e716630dSMartin Matuska ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); 4191e716630dSMartin Matuska } 4192e716630dSMartin Matuska error = zio_wait(pio); 4193e716630dSMartin Matuska if (error != 0) { 4194e716630dSMartin Matuska zfs_dbgmsg("reflow: error %d reading scratch location", 4195e716630dSMartin Matuska error); 4196e716630dSMartin Matuska goto io_error_exit; 4197e716630dSMartin Matuska } 4198e716630dSMartin Matuska goto overwrite; 4199e716630dSMartin Matuska } 4200e716630dSMartin Matuska 4201e716630dSMartin Matuska /* 4202e716630dSMartin Matuska * Read from original location. 4203e716630dSMartin Matuska */ 4204e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4205e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children - 1; i++) { 4206e716630dSMartin Matuska ASSERT0(vdev_is_dead(raidvd->vdev_child[i])); 4207e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4208e716630dSMartin Matuska 0, abds[i], read_size, ZIO_TYPE_READ, 420917aab35aSMartin Matuska ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, 4210e716630dSMartin Matuska raidz_scratch_child_done, pio)); 4211e716630dSMartin Matuska } 4212e716630dSMartin Matuska error = zio_wait(pio); 4213e716630dSMartin Matuska if (error != 0) { 4214e716630dSMartin Matuska zfs_dbgmsg("reflow: error %d reading original location", error); 4215e716630dSMartin Matuska io_error_exit: 4216e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) 4217e716630dSMartin Matuska abd_free(abds[i]); 4218e716630dSMartin Matuska kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); 4219e716630dSMartin Matuska zfs_rangelock_exit(lr); 4220e716630dSMartin Matuska spa_config_exit(spa, SCL_STATE, FTAG); 4221e716630dSMartin Matuska return; 4222e716630dSMartin Matuska } 4223e716630dSMartin Matuska 4224e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2); 4225e716630dSMartin Matuska 4226e716630dSMartin Matuska /* 4227e716630dSMartin Matuska * Reflow in memory. 4228e716630dSMartin Matuska */ 4229e716630dSMartin Matuska uint64_t logical_sectors = logical_size >> ashift; 4230e716630dSMartin Matuska for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) { 4231e716630dSMartin Matuska int oldchild = i % (raidvd->vdev_children - 1); 4232e716630dSMartin Matuska uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift; 4233e716630dSMartin Matuska 4234e716630dSMartin Matuska int newchild = i % raidvd->vdev_children; 4235e716630dSMartin Matuska uint64_t newoff = (i / raidvd->vdev_children) << ashift; 4236e716630dSMartin Matuska 4237e716630dSMartin Matuska /* a single sector should not be copying over itself */ 4238e716630dSMartin Matuska ASSERT(!(newchild == oldchild && newoff == oldoff)); 4239e716630dSMartin Matuska 4240e716630dSMartin Matuska abd_copy_off(abds[newchild], abds[oldchild], 4241e716630dSMartin Matuska newoff, oldoff, 1 << ashift); 4242e716630dSMartin Matuska } 4243e716630dSMartin Matuska 4244e716630dSMartin Matuska /* 4245e716630dSMartin Matuska * Verify that we filled in everything we intended to (write_size on 4246e716630dSMartin Matuska * each child). 
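 *
 * To illustrate the copy loop above (illustrative numbers): with
 * raidvd->vdev_children == 5, sectors 0-3 occupy the same (child, row)
 * in both the 4-wide and 5-wide layouts, which is why the loop starts
 * at vdev_children - 1. The next few sectors move as follows:
 *
 *	sector i:	   4      5      6      7      8      9
 *	old (child,row): (0,1)  (1,1)  (2,1)  (3,1)  (0,2)  (1,2)
 *	new (child,row): (4,0)  (0,1)  (1,1)  (2,1)  (3,1)  (4,1)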
4247e716630dSMartin Matuska */ 4248e716630dSMartin Matuska VERIFY0(logical_sectors % raidvd->vdev_children); 4249e716630dSMartin Matuska VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==, 4250e716630dSMartin Matuska write_size); 4251e716630dSMartin Matuska 4252e716630dSMartin Matuska /* 4253e716630dSMartin Matuska * Write to scratch location (boot area). 4254e716630dSMartin Matuska */ 4255e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4256e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) { 4257e716630dSMartin Matuska /* 4258e716630dSMartin Matuska * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to 4259e716630dSMartin Matuska * the offset to calculate the physical offset to write to. 4260e716630dSMartin Matuska * Passing in a negative offset lets us access the boot area. 4261e716630dSMartin Matuska */ 4262e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4263e716630dSMartin Matuska VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], 426417aab35aSMartin Matuska write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, 4265e716630dSMartin Matuska ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); 4266e716630dSMartin Matuska } 4267e716630dSMartin Matuska error = zio_wait(pio); 4268e716630dSMartin Matuska if (error != 0) { 4269e716630dSMartin Matuska zfs_dbgmsg("reflow: error %d writing scratch location", error); 4270e716630dSMartin Matuska goto io_error_exit; 4271e716630dSMartin Matuska } 4272e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, 0); 4273e716630dSMartin Matuska zio_flush(pio, raidvd); 4274e716630dSMartin Matuska zio_wait(pio); 4275e716630dSMartin Matuska 4276e716630dSMartin Matuska zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area", 4277e716630dSMartin Matuska (long long)logical_size); 4278e716630dSMartin Matuska 4279e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3); 4280e716630dSMartin Matuska 4281e716630dSMartin Matuska /* 4282e716630dSMartin Matuska * Update uberblock to indicate that scratch space is valid. This is 4283e716630dSMartin Matuska * needed because after this point, the real location may be 4284e716630dSMartin Matuska * overwritten. If we crash, we need to get the data from the 4285e716630dSMartin Matuska * scratch space, rather than the real location. 4286e716630dSMartin Matuska * 4287e716630dSMartin Matuska * Note: ub_timestamp is bumped so that vdev_uberblock_compare() 4288e716630dSMartin Matuska * will prefer this uberblock. 
4289e716630dSMartin Matuska */ 4290e716630dSMartin Matuska RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size); 4291e716630dSMartin Matuska spa->spa_ubsync.ub_timestamp++; 4292e716630dSMartin Matuska ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, 4293e716630dSMartin Matuska &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); 4294e716630dSMartin Matuska if (spa_multihost(spa)) 4295e716630dSMartin Matuska mmp_update_uberblock(spa, &spa->spa_ubsync); 4296e716630dSMartin Matuska 4297e716630dSMartin Matuska zfs_dbgmsg("reflow: uberblock updated " 4298e716630dSMartin Matuska "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)", 4299e716630dSMartin Matuska (long long)spa->spa_ubsync.ub_txg, 4300e716630dSMartin Matuska (long long)logical_size, 4301e716630dSMartin Matuska (long long)spa->spa_ubsync.ub_timestamp); 4302e716630dSMartin Matuska 4303e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID); 4304e716630dSMartin Matuska 4305e716630dSMartin Matuska /* 4306e716630dSMartin Matuska * Overwrite with reflow'ed data. 4307e716630dSMartin Matuska */ 4308e716630dSMartin Matuska overwrite: 4309e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4310e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) { 4311e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4312e716630dSMartin Matuska 0, abds[i], write_size, ZIO_TYPE_WRITE, 431317aab35aSMartin Matuska ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, 4314e716630dSMartin Matuska raidz_scratch_child_done, pio)); 4315e716630dSMartin Matuska } 4316e716630dSMartin Matuska error = zio_wait(pio); 4317e716630dSMartin Matuska if (error != 0) { 4318e716630dSMartin Matuska /* 4319e716630dSMartin Matuska * When we exit early here and drop the range lock, new 4320e716630dSMartin Matuska * writes will go into the scratch area so we'll need to 4321e716630dSMartin Matuska * read from there when we return after pausing. 4322e716630dSMartin Matuska */ 4323e716630dSMartin Matuska zfs_dbgmsg("reflow: error %d writing real location", error); 4324e716630dSMartin Matuska /* 4325e716630dSMartin Matuska * Update the uberblock that is written when this txg completes. 4326e716630dSMartin Matuska */ 4327e716630dSMartin Matuska RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID, 4328e716630dSMartin Matuska logical_size); 4329e716630dSMartin Matuska goto io_error_exit; 4330e716630dSMartin Matuska } 4331e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, 0); 4332e716630dSMartin Matuska zio_flush(pio, raidvd); 4333e716630dSMartin Matuska zio_wait(pio); 4334e716630dSMartin Matuska 4335e716630dSMartin Matuska zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location", 4336e716630dSMartin Matuska (long long)logical_size); 4337e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) 4338e716630dSMartin Matuska abd_free(abds[i]); 4339e716630dSMartin Matuska kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); 4340e716630dSMartin Matuska 4341e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED); 4342e716630dSMartin Matuska 4343e716630dSMartin Matuska /* 4344e716630dSMartin Matuska * Update uberblock to indicate that the initial part has been 4345e716630dSMartin Matuska * reflow'ed. 
This is needed because after this point (when we exit 4346e716630dSMartin Matuska * the rangelock), we allow regular writes to this region, which will 4347e716630dSMartin Matuska * be written to the new location only (because reflow_offset_next == 4348e716630dSMartin Matuska * reflow_offset_synced). If we crashed and re-copied from the 4349e716630dSMartin Matuska * scratch space, we would lose the regular writes. 4350e716630dSMartin Matuska */ 4351e716630dSMartin Matuska RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED, 4352e716630dSMartin Matuska logical_size); 4353e716630dSMartin Matuska spa->spa_ubsync.ub_timestamp++; 4354e716630dSMartin Matuska ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, 4355e716630dSMartin Matuska &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); 4356e716630dSMartin Matuska if (spa_multihost(spa)) 4357e716630dSMartin Matuska mmp_update_uberblock(spa, &spa->spa_ubsync); 4358e716630dSMartin Matuska 4359e716630dSMartin Matuska zfs_dbgmsg("reflow: uberblock updated " 4360e716630dSMartin Matuska "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", 4361e716630dSMartin Matuska (long long)spa->spa_ubsync.ub_txg, 4362e716630dSMartin Matuska (long long)logical_size, 4363e716630dSMartin Matuska (long long)spa->spa_ubsync.ub_timestamp); 4364e716630dSMartin Matuska 4365e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1); 4366e716630dSMartin Matuska 4367e716630dSMartin Matuska /* 4368e716630dSMartin Matuska * Update progress. 4369e716630dSMartin Matuska */ 4370e716630dSMartin Matuska vre->vre_offset = logical_size; 4371e716630dSMartin Matuska zfs_rangelock_exit(lr); 4372e716630dSMartin Matuska spa_config_exit(spa, SCL_STATE, FTAG); 4373e716630dSMartin Matuska 4374e716630dSMartin Matuska int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 4375e716630dSMartin Matuska vre->vre_offset_pertxg[txgoff] = vre->vre_offset; 4376e716630dSMartin Matuska vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; 4377e716630dSMartin Matuska /* 4378e716630dSMartin Matuska * Note - raidz_reflow_sync() will update the uberblock state to 4379e716630dSMartin Matuska * RRSS_SCRATCH_INVALID_SYNCED_REFLOW 4380e716630dSMartin Matuska */ 4381e716630dSMartin Matuska raidz_reflow_sync(spa, tx); 4382e716630dSMartin Matuska 4383e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2); 4384e716630dSMartin Matuska } 4385e716630dSMartin Matuska 4386e716630dSMartin Matuska /* 4387e716630dSMartin Matuska * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work 4388e716630dSMartin Matuska * here. No other i/o can be in progress, so we don't need the vre_rangelock. 
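 *
 * Roughly, the recovery below is: confirm the uberblock still says
 * RRSS_SCRATCH_VALID, read write_size = logical_size / vdev_children
 * bytes of scratch data from each child's boot area, write it back to
 * offset 0 of each child and flush, and only then mark the scratch
 * invalid (RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT) and record progress
 * via raidz_reflow_sync().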
4389e716630dSMartin Matuska */ 4390e716630dSMartin Matuska void 4391e716630dSMartin Matuska vdev_raidz_reflow_copy_scratch(spa_t *spa) 4392e716630dSMartin Matuska { 4393e716630dSMartin Matuska vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4394e716630dSMartin Matuska uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock); 4395e716630dSMartin Matuska ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID); 4396e716630dSMartin Matuska 4397e716630dSMartin Matuska spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4398e716630dSMartin Matuska vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4399e716630dSMartin Matuska ASSERT0(logical_size % raidvd->vdev_children); 4400e716630dSMartin Matuska uint64_t write_size = logical_size / raidvd->vdev_children; 4401e716630dSMartin Matuska 4402e716630dSMartin Matuska zio_t *pio; 4403e716630dSMartin Matuska 4404e716630dSMartin Matuska /* 4405e716630dSMartin Matuska * Read from scratch space. 4406e716630dSMartin Matuska */ 4407e716630dSMartin Matuska abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), 4408e716630dSMartin Matuska KM_SLEEP); 4409e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) { 4410e716630dSMartin Matuska abds[i] = abd_alloc_linear(write_size, B_FALSE); 4411e716630dSMartin Matuska } 4412e716630dSMartin Matuska 4413e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, 0); 4414e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) { 4415e716630dSMartin Matuska /* 4416e716630dSMartin Matuska * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to 4417e716630dSMartin Matuska * the offset to calculate the physical offset to write to. 4418e716630dSMartin Matuska * Passing in a negative offset lets us access the boot area. 4419e716630dSMartin Matuska */ 4420e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4421e716630dSMartin Matuska VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], 442217aab35aSMartin Matuska write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 0, 4423e716630dSMartin Matuska raidz_scratch_child_done, pio)); 4424e716630dSMartin Matuska } 4425e716630dSMartin Matuska zio_wait(pio); 4426e716630dSMartin Matuska 4427e716630dSMartin Matuska /* 4428e716630dSMartin Matuska * Overwrite real location with reflow'ed data. 4429e716630dSMartin Matuska */ 4430e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, 0); 4431e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) { 4432e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 4433e716630dSMartin Matuska 0, abds[i], write_size, ZIO_TYPE_WRITE, 443417aab35aSMartin Matuska ZIO_PRIORITY_REMOVAL, 0, 4435e716630dSMartin Matuska raidz_scratch_child_done, pio)); 4436e716630dSMartin Matuska } 4437e716630dSMartin Matuska zio_wait(pio); 4438e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, 0); 4439e716630dSMartin Matuska zio_flush(pio, raidvd); 4440e716630dSMartin Matuska zio_wait(pio); 4441e716630dSMartin Matuska 4442e716630dSMartin Matuska zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) " 4443e716630dSMartin Matuska "to real location", (long long)logical_size); 4444e716630dSMartin Matuska 4445e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) 4446e716630dSMartin Matuska abd_free(abds[i]); 4447e716630dSMartin Matuska kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); 4448e716630dSMartin Matuska 4449e716630dSMartin Matuska /* 4450e716630dSMartin Matuska * Update uberblock. 
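 *
 * Marking the scratch space RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT here
 * means a later crash or re-import will not repeat the copy above.
 * Once new writes land in the reflowed region, re-copying the stale
 * scratch contents would clobber them, the same reasoning given in
 * raidz_reflow_scratch_sync().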
4451e716630dSMartin Matuska */
4452e716630dSMartin Matuska RAIDZ_REFLOW_SET(&spa->spa_ubsync,
4453e716630dSMartin Matuska RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size);
4454e716630dSMartin Matuska spa->spa_ubsync.ub_timestamp++;
4455e716630dSMartin Matuska VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4456e716630dSMartin Matuska &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4457e716630dSMartin Matuska if (spa_multihost(spa))
4458e716630dSMartin Matuska mmp_update_uberblock(spa, &spa->spa_ubsync);
4459e716630dSMartin Matuska
4460e716630dSMartin Matuska zfs_dbgmsg("reflow recovery: uberblock updated "
4461e716630dSMartin Matuska "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
4462e716630dSMartin Matuska (long long)spa->spa_ubsync.ub_txg,
4463e716630dSMartin Matuska (long long)logical_size,
4464e716630dSMartin Matuska (long long)spa->spa_ubsync.ub_timestamp);
4465e716630dSMartin Matuska
4466e716630dSMartin Matuska dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool,
4467e716630dSMartin Matuska spa_first_txg(spa));
4468e716630dSMartin Matuska int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4469e716630dSMartin Matuska vre->vre_offset = logical_size;
4470e716630dSMartin Matuska vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
4471e716630dSMartin Matuska vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
4472e716630dSMartin Matuska /*
4473e716630dSMartin Matuska * Note that raidz_reflow_sync() will update the uberblock once more.
4474e716630dSMartin Matuska */
4475e716630dSMartin Matuska raidz_reflow_sync(spa, tx);
4476e716630dSMartin Matuska
4477e716630dSMartin Matuska dmu_tx_commit(tx);
4478e716630dSMartin Matuska
4479e716630dSMartin Matuska spa_config_exit(spa, SCL_STATE, FTAG);
4480e716630dSMartin Matuska }
4481e716630dSMartin Matuska
4482e716630dSMartin Matuska static boolean_t
4483e716630dSMartin Matuska spa_raidz_expand_thread_check(void *arg, zthr_t *zthr)
4484e716630dSMartin Matuska {
4485e716630dSMartin Matuska (void) zthr;
4486e716630dSMartin Matuska spa_t *spa = arg;
4487e716630dSMartin Matuska
4488e716630dSMartin Matuska return (spa->spa_raidz_expand != NULL &&
4489e716630dSMartin Matuska !spa->spa_raidz_expand->vre_waiting_for_resilver);
4490e716630dSMartin Matuska }
4491e716630dSMartin Matuska
4492e716630dSMartin Matuska /*
4493e716630dSMartin Matuska * RAIDZ expansion background thread
4494e716630dSMartin Matuska *
4495e716630dSMartin Matuska * Can be called multiple times if the reflow is paused.
4496e716630dSMartin Matuska */
4497e716630dSMartin Matuska static void
4498e716630dSMartin Matuska spa_raidz_expand_thread(void *arg, zthr_t *zthr)
4499e716630dSMartin Matuska {
4500e716630dSMartin Matuska spa_t *spa = arg;
4501e716630dSMartin Matuska vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4502e716630dSMartin Matuska
4503e716630dSMartin Matuska if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID)
4504e716630dSMartin Matuska vre->vre_offset = 0;
4505e716630dSMartin Matuska else
4506e716630dSMartin Matuska vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync);
4507e716630dSMartin Matuska
4508e716630dSMartin Matuska /* Reflow the beginning portion using the scratch area */
4509e716630dSMartin Matuska if (vre->vre_offset == 0) {
4510e716630dSMartin Matuska VERIFY0(dsl_sync_task(spa_name(spa),
4511e716630dSMartin Matuska NULL, raidz_reflow_scratch_sync,
4512e716630dSMartin Matuska vre, 0, ZFS_SPACE_CHECK_NONE));
4513e716630dSMartin Matuska
4514e716630dSMartin Matuska /* if we encountered errors then pause */
4515e716630dSMartin Matuska if
(vre->vre_offset == 0) { 4516e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 4517e716630dSMartin Matuska vre->vre_waiting_for_resilver = B_TRUE; 4518e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 4519e716630dSMartin Matuska return; 4520e716630dSMartin Matuska } 4521e716630dSMartin Matuska } 4522e716630dSMartin Matuska 4523e716630dSMartin Matuska spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4524e716630dSMartin Matuska vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4525e716630dSMartin Matuska 4526e716630dSMartin Matuska uint64_t guid = raidvd->vdev_guid; 4527e716630dSMartin Matuska 4528e716630dSMartin Matuska /* Iterate over all the remaining metaslabs */ 4529e716630dSMartin Matuska for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift; 4530e716630dSMartin Matuska i < raidvd->vdev_ms_count && 4531e716630dSMartin Matuska !zthr_iscancelled(zthr) && 4532e716630dSMartin Matuska vre->vre_failed_offset == UINT64_MAX; i++) { 4533e716630dSMartin Matuska metaslab_t *msp = raidvd->vdev_ms[i]; 4534e716630dSMartin Matuska 4535e716630dSMartin Matuska metaslab_disable(msp); 4536e716630dSMartin Matuska mutex_enter(&msp->ms_lock); 4537e716630dSMartin Matuska 4538e716630dSMartin Matuska /* 4539e716630dSMartin Matuska * The metaslab may be newly created (for the expanded 4540e716630dSMartin Matuska * space), in which case its trees won't exist yet, 4541e716630dSMartin Matuska * so we need to bail out early. 4542e716630dSMartin Matuska */ 4543e716630dSMartin Matuska if (msp->ms_new) { 4544e716630dSMartin Matuska mutex_exit(&msp->ms_lock); 4545e716630dSMartin Matuska metaslab_enable(msp, B_FALSE, B_FALSE); 4546e716630dSMartin Matuska continue; 4547e716630dSMartin Matuska } 4548e716630dSMartin Matuska 4549e716630dSMartin Matuska VERIFY0(metaslab_load(msp)); 4550e716630dSMartin Matuska 4551e716630dSMartin Matuska /* 4552e716630dSMartin Matuska * We want to copy everything except the free (allocatable) 4553e716630dSMartin Matuska * space. Note that there may be a little bit more free 4554e716630dSMartin Matuska * space (e.g. in ms_defer), and it's fine to copy that too. 4555e716630dSMartin Matuska */ 455617aab35aSMartin Matuska uint64_t shift, start; 4557b59a0cdeSMartin Matuska zfs_range_seg_type_t type = metaslab_calculate_range_tree_type( 455817aab35aSMartin Matuska raidvd, msp, &start, &shift); 4559b59a0cdeSMartin Matuska zfs_range_tree_t *rt = zfs_range_tree_create(NULL, type, NULL, 456017aab35aSMartin Matuska start, shift); 4561b59a0cdeSMartin Matuska zfs_range_tree_add(rt, msp->ms_start, msp->ms_size); 4562b59a0cdeSMartin Matuska zfs_range_tree_walk(msp->ms_allocatable, zfs_range_tree_remove, 4563b59a0cdeSMartin Matuska rt); 4564e716630dSMartin Matuska mutex_exit(&msp->ms_lock); 4565e716630dSMartin Matuska 4566e716630dSMartin Matuska /* 4567e716630dSMartin Matuska * Force the last sector of each metaslab to be copied. This 4568e716630dSMartin Matuska * ensures that we advance the on-disk progress to the end of 4569e716630dSMartin Matuska * this metaslab while the metaslab is disabled. Otherwise, we 4570e716630dSMartin Matuska * could move past this metaslab without advancing the on-disk 4571e716630dSMartin Matuska * progress, and then an allocation to this metaslab would not 4572e716630dSMartin Matuska * be copied. 
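 *
 * In other words, even if the tail of the metaslab is entirely free,
 * adding its (possibly already-free) last sector to the tree below
 * guarantees that the progress recorded by raidz_reflow_sync() reaches
 * the end of this metaslab while the metaslab is still disabled.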
4573e716630dSMartin Matuska */
4574e716630dSMartin Matuska int sectorsz = 1 << raidvd->vdev_ashift;
4575e716630dSMartin Matuska uint64_t ms_last_offset = msp->ms_start +
4576e716630dSMartin Matuska msp->ms_size - sectorsz;
4577b59a0cdeSMartin Matuska if (!zfs_range_tree_contains(rt, ms_last_offset, sectorsz)) {
4578b59a0cdeSMartin Matuska zfs_range_tree_add(rt, ms_last_offset, sectorsz);
4579e716630dSMartin Matuska }
4580e716630dSMartin Matuska
4581e716630dSMartin Matuska /*
4582e716630dSMartin Matuska * When we are resuming from a paused expansion (i.e.
4583e716630dSMartin Matuska * when importing a pool with an expansion in progress),
4584e716630dSMartin Matuska * discard any state that we have already processed.
4585e716630dSMartin Matuska */
458617aab35aSMartin Matuska if (vre->vre_offset > msp->ms_start) {
4587b59a0cdeSMartin Matuska zfs_range_tree_clear(rt, msp->ms_start,
458817aab35aSMartin Matuska vre->vre_offset - msp->ms_start);
458917aab35aSMartin Matuska }
4590e716630dSMartin Matuska
4591e716630dSMartin Matuska while (!zthr_iscancelled(zthr) &&
4592b59a0cdeSMartin Matuska !zfs_range_tree_is_empty(rt) &&
4593e716630dSMartin Matuska vre->vre_failed_offset == UINT64_MAX) {
4594e716630dSMartin Matuska
4595e716630dSMartin Matuska /*
4596e716630dSMartin Matuska * We need to periodically drop the config lock so that
4597e716630dSMartin Matuska * writers can get in. Additionally, we can't wait
4598e716630dSMartin Matuska * for a txg to sync while holding a config lock
4599e716630dSMartin Matuska * (since a waiting writer could cause a 3-way deadlock
4600e716630dSMartin Matuska * with the sync thread, which also gets a config
4601e716630dSMartin Matuska * lock for reader). So we can't hold the config lock
4602e716630dSMartin Matuska * while calling dmu_tx_assign().
4603e716630dSMartin Matuska */
4604e716630dSMartin Matuska spa_config_exit(spa, SCL_CONFIG, FTAG);
4605e716630dSMartin Matuska
4606e716630dSMartin Matuska /*
4607e716630dSMartin Matuska * If requested, pause the reflow when the amount
4608e716630dSMartin Matuska * specified by raidz_expand_max_reflow_bytes is reached.
4609e716630dSMartin Matuska *
4610e716630dSMartin Matuska * This pause is only used during testing or debugging.
4611e716630dSMartin Matuska */
4612e716630dSMartin Matuska while (raidz_expand_max_reflow_bytes != 0 &&
4613e716630dSMartin Matuska raidz_expand_max_reflow_bytes <=
4614e716630dSMartin Matuska vre->vre_bytes_copied && !zthr_iscancelled(zthr)) {
4615e716630dSMartin Matuska delay(hz);
4616e716630dSMartin Matuska }
4617e716630dSMartin Matuska
4618e716630dSMartin Matuska mutex_enter(&vre->vre_lock);
4619e716630dSMartin Matuska while (vre->vre_outstanding_bytes >
4620e716630dSMartin Matuska raidz_expand_max_copy_bytes) {
4621e716630dSMartin Matuska cv_wait(&vre->vre_cv, &vre->vre_lock);
4622e716630dSMartin Matuska }
4623e716630dSMartin Matuska mutex_exit(&vre->vre_lock);
4624e716630dSMartin Matuska
4625e716630dSMartin Matuska dmu_tx_t *tx =
4626e716630dSMartin Matuska dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
4627e716630dSMartin Matuska
4628*61145dc2SMartin Matuska VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT));
4629e716630dSMartin Matuska uint64_t txg = dmu_tx_get_txg(tx);
4630e716630dSMartin Matuska
4631e716630dSMartin Matuska /*
4632e716630dSMartin Matuska * Reacquire the vdev_config lock. Theoretically, the
4633e716630dSMartin Matuska * vdev_t that we're expanding may have changed.
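 *
 * That is why raidvd is looked up again by vre_vdev_id after
 * re-taking SCL_CONFIG, rather than caching the pointer across the
 * dmu_tx_assign() wait above.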
4634e716630dSMartin Matuska */ 4635e716630dSMartin Matuska spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4636e716630dSMartin Matuska raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4637e716630dSMartin Matuska 4638e716630dSMartin Matuska boolean_t needsync = 4639e716630dSMartin Matuska raidz_reflow_impl(raidvd, vre, rt, tx); 4640e716630dSMartin Matuska 4641e716630dSMartin Matuska dmu_tx_commit(tx); 4642e716630dSMartin Matuska 4643e716630dSMartin Matuska if (needsync) { 4644e716630dSMartin Matuska spa_config_exit(spa, SCL_CONFIG, FTAG); 4645e716630dSMartin Matuska txg_wait_synced(spa->spa_dsl_pool, txg); 4646e716630dSMartin Matuska spa_config_enter(spa, SCL_CONFIG, FTAG, 4647e716630dSMartin Matuska RW_READER); 4648e716630dSMartin Matuska } 4649e716630dSMartin Matuska } 4650e716630dSMartin Matuska 4651e716630dSMartin Matuska spa_config_exit(spa, SCL_CONFIG, FTAG); 4652e716630dSMartin Matuska 4653e716630dSMartin Matuska metaslab_enable(msp, B_FALSE, B_FALSE); 4654b59a0cdeSMartin Matuska zfs_range_tree_vacate(rt, NULL, NULL); 4655b59a0cdeSMartin Matuska zfs_range_tree_destroy(rt); 4656e716630dSMartin Matuska 4657e716630dSMartin Matuska spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4658e716630dSMartin Matuska raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 4659e716630dSMartin Matuska } 4660e716630dSMartin Matuska 4661e716630dSMartin Matuska spa_config_exit(spa, SCL_CONFIG, FTAG); 4662e716630dSMartin Matuska 4663e716630dSMartin Matuska /* 4664e716630dSMartin Matuska * The txg_wait_synced() here ensures that all reflow zio's have 4665e716630dSMartin Matuska * completed, and vre_failed_offset has been set if necessary. It 4666e716630dSMartin Matuska * also ensures that the progress of the last raidz_reflow_sync() is 4667e716630dSMartin Matuska * written to disk before raidz_reflow_complete_sync() changes the 4668e716630dSMartin Matuska * in-memory vre_state. vdev_raidz_io_start() uses vre_state to 4669e716630dSMartin Matuska * determine if a reflow is in progress, in which case we may need to 4670e716630dSMartin Matuska * write to both old and new locations. Therefore we can only change 4671e716630dSMartin Matuska * vre_state once this is not necessary, which is once the on-disk 4672e716630dSMartin Matuska * progress (in spa_ubsync) has been set past any possible writes (to 4673e716630dSMartin Matuska * the end of the last metaslab). 4674e716630dSMartin Matuska */ 4675e716630dSMartin Matuska txg_wait_synced(spa->spa_dsl_pool, 0); 4676e716630dSMartin Matuska 4677e716630dSMartin Matuska if (!zthr_iscancelled(zthr) && 4678e716630dSMartin Matuska vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) { 4679e716630dSMartin Matuska /* 4680e716630dSMartin Matuska * We are not being canceled or paused, so the reflow must be 4681e716630dSMartin Matuska * complete. In that case also mark it as completed on disk. 4682e716630dSMartin Matuska */ 4683e716630dSMartin Matuska ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX); 4684e716630dSMartin Matuska VERIFY0(dsl_sync_task(spa_name(spa), NULL, 4685e716630dSMartin Matuska raidz_reflow_complete_sync, spa, 4686e716630dSMartin Matuska 0, ZFS_SPACE_CHECK_NONE)); 4687e716630dSMartin Matuska (void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL); 4688e716630dSMartin Matuska } else { 4689e716630dSMartin Matuska /* 4690e716630dSMartin Matuska * Wait for all copy zio's to complete and for all the 4691e716630dSMartin Matuska * raidz_reflow_sync() synctasks to be run. 
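 *
 * If anything failed, the rewind of vre_offset to vre_failed_offset
 * below means everything from the first failure onward will be
 * re-copied once raidz_dtl_reassessed() wakes the zthr again
 * (typically after the resilver completes).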
4692e716630dSMartin Matuska */ 4693e716630dSMartin Matuska spa_history_log_internal(spa, "reflow pause", 4694e716630dSMartin Matuska NULL, "offset=%llu failed_offset=%lld", 4695e716630dSMartin Matuska (long long)vre->vre_offset, 4696e716630dSMartin Matuska (long long)vre->vre_failed_offset); 4697e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 4698e716630dSMartin Matuska if (vre->vre_failed_offset != UINT64_MAX) { 4699e716630dSMartin Matuska /* 4700e716630dSMartin Matuska * Reset progress so that we will retry everything 4701e716630dSMartin Matuska * after the point that something failed. 4702e716630dSMartin Matuska */ 4703e716630dSMartin Matuska vre->vre_offset = vre->vre_failed_offset; 4704e716630dSMartin Matuska vre->vre_failed_offset = UINT64_MAX; 4705e716630dSMartin Matuska vre->vre_waiting_for_resilver = B_TRUE; 4706e716630dSMartin Matuska } 4707e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 4708e716630dSMartin Matuska } 4709e716630dSMartin Matuska } 4710e716630dSMartin Matuska 4711e716630dSMartin Matuska void 4712e716630dSMartin Matuska spa_start_raidz_expansion_thread(spa_t *spa) 4713e716630dSMartin Matuska { 4714e716630dSMartin Matuska ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL); 4715e716630dSMartin Matuska spa->spa_raidz_expand_zthr = zthr_create("raidz_expand", 4716e716630dSMartin Matuska spa_raidz_expand_thread_check, spa_raidz_expand_thread, 4717e716630dSMartin Matuska spa, defclsyspri); 4718e716630dSMartin Matuska } 4719e716630dSMartin Matuska 4720e716630dSMartin Matuska void 4721e716630dSMartin Matuska raidz_dtl_reassessed(vdev_t *vd) 4722e716630dSMartin Matuska { 4723e716630dSMartin Matuska spa_t *spa = vd->vdev_spa; 4724e716630dSMartin Matuska if (spa->spa_raidz_expand != NULL) { 4725e716630dSMartin Matuska vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4726e716630dSMartin Matuska /* 4727e716630dSMartin Matuska * we get called often from vdev_dtl_reassess() so make 4728e716630dSMartin Matuska * sure it's our vdev and any replacing is complete 4729e716630dSMartin Matuska */ 4730e716630dSMartin Matuska if (vd->vdev_top->vdev_id == vre->vre_vdev_id && 4731e716630dSMartin Matuska !vdev_raidz_expand_child_replacing(vd->vdev_top)) { 4732e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 4733e716630dSMartin Matuska if (vre->vre_waiting_for_resilver) { 4734e716630dSMartin Matuska vdev_dbgmsg(vd, "DTL reassessed, " 4735e716630dSMartin Matuska "continuing raidz expansion"); 4736e716630dSMartin Matuska vre->vre_waiting_for_resilver = B_FALSE; 4737e716630dSMartin Matuska zthr_wakeup(spa->spa_raidz_expand_zthr); 4738e716630dSMartin Matuska } 4739e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 4740e716630dSMartin Matuska } 4741e716630dSMartin Matuska } 4742e716630dSMartin Matuska } 4743e716630dSMartin Matuska 4744e716630dSMartin Matuska int 4745e716630dSMartin Matuska vdev_raidz_attach_check(vdev_t *new_child) 4746e716630dSMartin Matuska { 4747e716630dSMartin Matuska vdev_t *raidvd = new_child->vdev_parent; 4748e716630dSMartin Matuska uint64_t new_children = raidvd->vdev_children; 4749e716630dSMartin Matuska 4750e716630dSMartin Matuska /* 4751e716630dSMartin Matuska * We use the "boot" space as scratch space to handle overwriting the 4752e716630dSMartin Matuska * initial part of the vdev. If it is too small, then this expansion 4753e716630dSMartin Matuska * is not allowed. This would be very unusual (e.g. ashift > 13 and 4754e716630dSMartin Matuska * >200 children). 
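 *
 * The check below only demands room for one sector per child in the
 * boot region: e.g. with ashift=12 (4 KiB sectors), a 20-wide raidz
 * needs 20 << 12 = 80 KiB of scratch space, so ordinary
 * configurations pass easily.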
4755e716630dSMartin Matuska */ 4756e716630dSMartin Matuska if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) { 4757e716630dSMartin Matuska return (EINVAL); 4758e716630dSMartin Matuska } 4759e716630dSMartin Matuska return (0); 4760e716630dSMartin Matuska } 4761e716630dSMartin Matuska 4762e716630dSMartin Matuska void 4763e716630dSMartin Matuska vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx) 4764e716630dSMartin Matuska { 4765e716630dSMartin Matuska vdev_t *new_child = arg; 4766e716630dSMartin Matuska spa_t *spa = new_child->vdev_spa; 4767e716630dSMartin Matuska vdev_t *raidvd = new_child->vdev_parent; 4768e716630dSMartin Matuska vdev_raidz_t *vdrz = raidvd->vdev_tsd; 4769e716630dSMartin Matuska ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops); 4770e716630dSMartin Matuska ASSERT3P(raidvd->vdev_top, ==, raidvd); 4771e716630dSMartin Matuska ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width); 4772e716630dSMartin Matuska ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1); 4773e716630dSMartin Matuska ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==, 4774e716630dSMartin Matuska new_child); 4775e716630dSMartin Matuska 4776e716630dSMartin Matuska spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx); 4777e716630dSMartin Matuska 4778e716630dSMartin Matuska vdrz->vd_physical_width++; 4779e716630dSMartin Matuska 4780e716630dSMartin Matuska VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info); 4781e716630dSMartin Matuska vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id; 4782e716630dSMartin Matuska vdrz->vn_vre.vre_offset = 0; 4783e716630dSMartin Matuska vdrz->vn_vre.vre_failed_offset = UINT64_MAX; 4784e716630dSMartin Matuska spa->spa_raidz_expand = &vdrz->vn_vre; 4785e716630dSMartin Matuska zthr_wakeup(spa->spa_raidz_expand_zthr); 4786e716630dSMartin Matuska 4787e716630dSMartin Matuska /* 4788e716630dSMartin Matuska * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get 4789e716630dSMartin Matuska * written to the config. 
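 *
 * vdev_raidz_init() looks for this flag when the vdev is instantiated
 * again (e.g. at import) and uses it to restore vre_state to
 * DSS_SCANNING and to re-register this vdev as spa->spa_raidz_expand.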
4790e716630dSMartin Matuska */ 4791e716630dSMartin Matuska vdev_config_dirty(raidvd); 4792e716630dSMartin Matuska 4793e716630dSMartin Matuska vdrz->vn_vre.vre_start_time = gethrestime_sec(); 4794e716630dSMartin Matuska vdrz->vn_vre.vre_end_time = 0; 4795e716630dSMartin Matuska vdrz->vn_vre.vre_state = DSS_SCANNING; 4796e716630dSMartin Matuska vdrz->vn_vre.vre_bytes_copied = 0; 4797e716630dSMartin Matuska 4798e716630dSMartin Matuska uint64_t state = vdrz->vn_vre.vre_state; 4799e716630dSMartin Matuska VERIFY0(zap_update(spa->spa_meta_objset, 4800e716630dSMartin Matuska raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, 4801e716630dSMartin Matuska sizeof (state), 1, &state, tx)); 4802e716630dSMartin Matuska 4803e716630dSMartin Matuska uint64_t start_time = vdrz->vn_vre.vre_start_time; 4804e716630dSMartin Matuska VERIFY0(zap_update(spa->spa_meta_objset, 4805e716630dSMartin Matuska raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, 4806e716630dSMartin Matuska sizeof (start_time), 1, &start_time, tx)); 4807e716630dSMartin Matuska 4808e716630dSMartin Matuska (void) zap_remove(spa->spa_meta_objset, 4809e716630dSMartin Matuska raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx); 4810e716630dSMartin Matuska (void) zap_remove(spa->spa_meta_objset, 4811e716630dSMartin Matuska raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx); 4812e716630dSMartin Matuska 4813e716630dSMartin Matuska spa_history_log_internal(spa, "raidz vdev expansion started", tx, 4814e716630dSMartin Matuska "%s vdev %llu new width %llu", spa_name(spa), 4815e716630dSMartin Matuska (unsigned long long)raidvd->vdev_id, 4816e716630dSMartin Matuska (unsigned long long)raidvd->vdev_children); 4817e716630dSMartin Matuska } 4818e716630dSMartin Matuska 4819e716630dSMartin Matuska int 4820e716630dSMartin Matuska vdev_raidz_load(vdev_t *vd) 4821e716630dSMartin Matuska { 4822e716630dSMartin Matuska vdev_raidz_t *vdrz = vd->vdev_tsd; 4823e716630dSMartin Matuska int err; 4824e716630dSMartin Matuska 4825e716630dSMartin Matuska uint64_t state = DSS_NONE; 4826e716630dSMartin Matuska uint64_t start_time = 0; 4827e716630dSMartin Matuska uint64_t end_time = 0; 4828e716630dSMartin Matuska uint64_t bytes_copied = 0; 4829e716630dSMartin Matuska 4830e716630dSMartin Matuska if (vd->vdev_top_zap != 0) { 4831e716630dSMartin Matuska err = zap_lookup(vd->vdev_spa->spa_meta_objset, 4832e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, 4833e716630dSMartin Matuska sizeof (state), 1, &state); 4834e716630dSMartin Matuska if (err != 0 && err != ENOENT) 4835e716630dSMartin Matuska return (err); 4836e716630dSMartin Matuska 4837e716630dSMartin Matuska err = zap_lookup(vd->vdev_spa->spa_meta_objset, 4838e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, 4839e716630dSMartin Matuska sizeof (start_time), 1, &start_time); 4840e716630dSMartin Matuska if (err != 0 && err != ENOENT) 4841e716630dSMartin Matuska return (err); 4842e716630dSMartin Matuska 4843e716630dSMartin Matuska err = zap_lookup(vd->vdev_spa->spa_meta_objset, 4844e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, 4845e716630dSMartin Matuska sizeof (end_time), 1, &end_time); 4846e716630dSMartin Matuska if (err != 0 && err != ENOENT) 4847e716630dSMartin Matuska return (err); 4848e716630dSMartin Matuska 4849e716630dSMartin Matuska err = zap_lookup(vd->vdev_spa->spa_meta_objset, 4850e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, 4851e716630dSMartin Matuska sizeof 
(bytes_copied), 1, &bytes_copied); 4852e716630dSMartin Matuska if (err != 0 && err != ENOENT) 4853e716630dSMartin Matuska return (err); 4854e716630dSMartin Matuska } 4855e716630dSMartin Matuska 4856e716630dSMartin Matuska /* 4857e716630dSMartin Matuska * If we are in the middle of expansion, vre_state should have 4858e716630dSMartin Matuska * already been set by vdev_raidz_init(). 4859e716630dSMartin Matuska */ 4860e716630dSMartin Matuska EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING); 4861e716630dSMartin Matuska vdrz->vn_vre.vre_state = (dsl_scan_state_t)state; 4862e716630dSMartin Matuska vdrz->vn_vre.vre_start_time = start_time; 4863e716630dSMartin Matuska vdrz->vn_vre.vre_end_time = end_time; 4864e716630dSMartin Matuska vdrz->vn_vre.vre_bytes_copied = bytes_copied; 4865e716630dSMartin Matuska 4866e716630dSMartin Matuska return (0); 4867e716630dSMartin Matuska } 4868e716630dSMartin Matuska 4869e716630dSMartin Matuska int 4870e716630dSMartin Matuska spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres) 4871e716630dSMartin Matuska { 4872e716630dSMartin Matuska vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 4873e716630dSMartin Matuska 4874e716630dSMartin Matuska if (vre == NULL) { 4875e716630dSMartin Matuska /* no removal in progress; find most recent completed */ 4876e716630dSMartin Matuska for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { 4877e716630dSMartin Matuska vdev_t *vd = spa->spa_root_vdev->vdev_child[c]; 4878e716630dSMartin Matuska if (vd->vdev_ops == &vdev_raidz_ops) { 4879e716630dSMartin Matuska vdev_raidz_t *vdrz = vd->vdev_tsd; 4880e716630dSMartin Matuska 4881e716630dSMartin Matuska if (vdrz->vn_vre.vre_end_time != 0 && 4882e716630dSMartin Matuska (vre == NULL || 4883e716630dSMartin Matuska vdrz->vn_vre.vre_end_time > 4884e716630dSMartin Matuska vre->vre_end_time)) { 4885e716630dSMartin Matuska vre = &vdrz->vn_vre; 4886e716630dSMartin Matuska } 4887e716630dSMartin Matuska } 4888e716630dSMartin Matuska } 4889e716630dSMartin Matuska } 4890e716630dSMartin Matuska 4891e716630dSMartin Matuska if (vre == NULL) { 4892e716630dSMartin Matuska return (SET_ERROR(ENOENT)); 4893e716630dSMartin Matuska } 4894e716630dSMartin Matuska 4895e716630dSMartin Matuska pres->pres_state = vre->vre_state; 4896e716630dSMartin Matuska pres->pres_expanding_vdev = vre->vre_vdev_id; 4897e716630dSMartin Matuska 4898e716630dSMartin Matuska vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); 4899e716630dSMartin Matuska pres->pres_to_reflow = vd->vdev_stat.vs_alloc; 4900e716630dSMartin Matuska 4901e716630dSMartin Matuska mutex_enter(&vre->vre_lock); 4902e716630dSMartin Matuska pres->pres_reflowed = vre->vre_bytes_copied; 4903e716630dSMartin Matuska for (int i = 0; i < TXG_SIZE; i++) 4904e716630dSMartin Matuska pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i]; 4905e716630dSMartin Matuska mutex_exit(&vre->vre_lock); 4906e716630dSMartin Matuska 4907e716630dSMartin Matuska pres->pres_start_time = vre->vre_start_time; 4908e716630dSMartin Matuska pres->pres_end_time = vre->vre_end_time; 4909e716630dSMartin Matuska pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver; 4910e716630dSMartin Matuska 4911e716630dSMartin Matuska return (0); 4912e716630dSMartin Matuska } 4913e716630dSMartin Matuska 49147877fdebSMatt Macy /* 49157877fdebSMatt Macy * Initialize private RAIDZ specific fields from the nvlist. 
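 *
 * This parses the parity level, the vdev ID, the in-progress expansion
 * flag (ZPOOL_CONFIG_RAIDZ_EXPANDING), and the list of completed
 * expansion txgs (ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS), from which the
 * original and physical widths are derived.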
49167877fdebSMatt Macy */
49177877fdebSMatt Macy static int
49187877fdebSMatt Macy vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
49197877fdebSMatt Macy {
49207877fdebSMatt Macy uint_t children;
49217877fdebSMatt Macy nvlist_t **child;
49227877fdebSMatt Macy int error = nvlist_lookup_nvlist_array(nv,
49237877fdebSMatt Macy ZPOOL_CONFIG_CHILDREN, &child, &children);
49247877fdebSMatt Macy if (error != 0)
49257877fdebSMatt Macy return (SET_ERROR(EINVAL));
49267877fdebSMatt Macy
4927e716630dSMartin Matuska uint64_t nparity;
49287877fdebSMatt Macy if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) {
49297877fdebSMatt Macy if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
49307877fdebSMatt Macy return (SET_ERROR(EINVAL));
49317877fdebSMatt Macy
49327877fdebSMatt Macy /*
49337877fdebSMatt Macy * Previous versions could only support 1 or 2 parity
49347877fdebSMatt Macy * devices.
49357877fdebSMatt Macy */
49367877fdebSMatt Macy if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2)
49377877fdebSMatt Macy return (SET_ERROR(EINVAL));
49387877fdebSMatt Macy else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3)
49397877fdebSMatt Macy return (SET_ERROR(EINVAL));
49407877fdebSMatt Macy } else {
49417877fdebSMatt Macy /*
49427877fdebSMatt Macy * We require the parity to be specified for SPAs that
49437877fdebSMatt Macy * support multiple parity levels.
49447877fdebSMatt Macy */
49457877fdebSMatt Macy if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
49467877fdebSMatt Macy return (SET_ERROR(EINVAL));
49477877fdebSMatt Macy
49487877fdebSMatt Macy /*
49497877fdebSMatt Macy * Otherwise, we default to 1 parity device for RAID-Z.
49507877fdebSMatt Macy */
49517877fdebSMatt Macy nparity = 1;
49527877fdebSMatt Macy }
49537877fdebSMatt Macy
4954e716630dSMartin Matuska vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
4955e716630dSMartin Matuska vdrz->vn_vre.vre_vdev_id = -1;
4956e716630dSMartin Matuska vdrz->vn_vre.vre_offset = UINT64_MAX;
4957e716630dSMartin Matuska vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
4958e716630dSMartin Matuska mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL);
4959e716630dSMartin Matuska cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL);
4960e716630dSMartin Matuska zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL);
4961e716630dSMartin Matuska mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL);
4962e716630dSMartin Matuska avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare,
4963e716630dSMartin Matuska sizeof (reflow_node_t), offsetof(reflow_node_t, re_link));
4964e716630dSMartin Matuska
4965e716630dSMartin Matuska vdrz->vd_physical_width = children;
49667877fdebSMatt Macy vdrz->vd_nparity = nparity;
49677877fdebSMatt Macy
4968e716630dSMartin Matuska /* note, the ID does not exist when creating a pool */
4969e716630dSMartin Matuska (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
4970e716630dSMartin Matuska &vdrz->vn_vre.vre_vdev_id);
4971e716630dSMartin Matuska
4972e716630dSMartin Matuska boolean_t reflow_in_progress =
4973e716630dSMartin Matuska nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
4974e716630dSMartin Matuska if (reflow_in_progress) {
4975e716630dSMartin Matuska spa->spa_raidz_expand = &vdrz->vn_vre;
4976e716630dSMartin Matuska vdrz->vn_vre.vre_state = DSS_SCANNING;
4977e716630dSMartin Matuska }
4978e716630dSMartin Matuska
4979e716630dSMartin Matuska vdrz->vd_original_width = children;
4980e716630dSMartin Matuska uint64_t *txgs;
4981e716630dSMartin Matuska unsigned int txgs_size = 0;
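	/*
	 * Decode the per-expansion history below.  The indexing assumes the
	 * txg array is ordered oldest to newest: the entry i places from the
	 * end records an expansion that left the vdev
	 * (vd_physical_width - i) children wide, minus one more if another
	 * expansion is still in progress, since the child currently being
	 * added is not yet part of any completed width.
	 */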
4982e716630dSMartin Matuska error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, 4983e716630dSMartin Matuska &txgs, &txgs_size); 4984e716630dSMartin Matuska if (error == 0) { 4985e716630dSMartin Matuska for (int i = 0; i < txgs_size; i++) { 4986e716630dSMartin Matuska reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); 4987e716630dSMartin Matuska re->re_txg = txgs[txgs_size - i - 1]; 4988e716630dSMartin Matuska re->re_logical_width = vdrz->vd_physical_width - i; 4989e716630dSMartin Matuska 4990e716630dSMartin Matuska if (reflow_in_progress) 4991e716630dSMartin Matuska re->re_logical_width--; 4992e716630dSMartin Matuska 4993e716630dSMartin Matuska avl_add(&vdrz->vd_expand_txgs, re); 4994e716630dSMartin Matuska } 4995e716630dSMartin Matuska 4996e716630dSMartin Matuska vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size; 4997e716630dSMartin Matuska } 4998e716630dSMartin Matuska if (reflow_in_progress) { 4999e716630dSMartin Matuska vdrz->vd_original_width--; 5000e716630dSMartin Matuska zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions", 5001e716630dSMartin Matuska children, txgs_size); 5002e716630dSMartin Matuska } 5003e716630dSMartin Matuska 50047877fdebSMatt Macy *tsd = vdrz; 50057877fdebSMatt Macy 50067877fdebSMatt Macy return (0); 50077877fdebSMatt Macy } 50087877fdebSMatt Macy 50097877fdebSMatt Macy static void 50107877fdebSMatt Macy vdev_raidz_fini(vdev_t *vd) 50117877fdebSMatt Macy { 5012e716630dSMartin Matuska vdev_raidz_t *vdrz = vd->vdev_tsd; 5013e716630dSMartin Matuska if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre) 5014e716630dSMartin Matuska vd->vdev_spa->spa_raidz_expand = NULL; 5015e716630dSMartin Matuska reflow_node_t *re; 5016e716630dSMartin Matuska void *cookie = NULL; 5017e716630dSMartin Matuska avl_tree_t *tree = &vdrz->vd_expand_txgs; 5018e716630dSMartin Matuska while ((re = avl_destroy_nodes(tree, &cookie)) != NULL) 5019e716630dSMartin Matuska kmem_free(re, sizeof (*re)); 5020e716630dSMartin Matuska avl_destroy(&vdrz->vd_expand_txgs); 5021e716630dSMartin Matuska mutex_destroy(&vdrz->vd_expand_lock); 5022e716630dSMartin Matuska mutex_destroy(&vdrz->vn_vre.vre_lock); 5023e716630dSMartin Matuska cv_destroy(&vdrz->vn_vre.vre_cv); 5024e716630dSMartin Matuska zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock); 5025e716630dSMartin Matuska kmem_free(vdrz, sizeof (*vdrz)); 50267877fdebSMatt Macy } 50277877fdebSMatt Macy 50287877fdebSMatt Macy /* 50297877fdebSMatt Macy * Add RAIDZ specific fields to the config nvlist. 50307877fdebSMatt Macy */ 50317877fdebSMatt Macy static void 50327877fdebSMatt Macy vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) 50337877fdebSMatt Macy { 50347877fdebSMatt Macy ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops); 50357877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd; 50367877fdebSMatt Macy 50377877fdebSMatt Macy /* 50387877fdebSMatt Macy * Make sure someone hasn't managed to sneak a fancy new vdev 50397877fdebSMatt Macy * into a crufty old storage pool. 
50407877fdebSMatt Macy */ 50417877fdebSMatt Macy ASSERT(vdrz->vd_nparity == 1 || 50427877fdebSMatt Macy (vdrz->vd_nparity <= 2 && 50437877fdebSMatt Macy spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) || 50447877fdebSMatt Macy (vdrz->vd_nparity <= 3 && 50457877fdebSMatt Macy spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3)); 50467877fdebSMatt Macy 50477877fdebSMatt Macy /* 50487877fdebSMatt Macy * Note that we'll add these even on storage pools where they 50497877fdebSMatt Macy * aren't strictly required -- older software will just ignore 50507877fdebSMatt Macy * it. 50517877fdebSMatt Macy */ 50527877fdebSMatt Macy fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity); 5053e716630dSMartin Matuska 5054e716630dSMartin Matuska if (vdrz->vn_vre.vre_state == DSS_SCANNING) { 5055e716630dSMartin Matuska fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); 5056e716630dSMartin Matuska } 5057e716630dSMartin Matuska 5058e716630dSMartin Matuska mutex_enter(&vdrz->vd_expand_lock); 5059e716630dSMartin Matuska if (!avl_is_empty(&vdrz->vd_expand_txgs)) { 5060e716630dSMartin Matuska uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs); 5061e716630dSMartin Matuska uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count, 5062e716630dSMartin Matuska KM_SLEEP); 5063e716630dSMartin Matuska uint64_t i = 0; 5064e716630dSMartin Matuska 5065e716630dSMartin Matuska for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs); 5066e716630dSMartin Matuska re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) { 5067e716630dSMartin Matuska txgs[i++] = re->re_txg; 5068e716630dSMartin Matuska } 5069e716630dSMartin Matuska 5070e716630dSMartin Matuska fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, 5071e716630dSMartin Matuska txgs, count); 5072e716630dSMartin Matuska 5073e716630dSMartin Matuska kmem_free(txgs, sizeof (uint64_t) * count); 5074e716630dSMartin Matuska } 5075e716630dSMartin Matuska mutex_exit(&vdrz->vd_expand_lock); 50767877fdebSMatt Macy } 50777877fdebSMatt Macy 50787877fdebSMatt Macy static uint64_t 50797877fdebSMatt Macy vdev_raidz_nparity(vdev_t *vd) 50807877fdebSMatt Macy { 50817877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd; 50827877fdebSMatt Macy return (vdrz->vd_nparity); 50837877fdebSMatt Macy } 50847877fdebSMatt Macy 50857877fdebSMatt Macy static uint64_t 50867877fdebSMatt Macy vdev_raidz_ndisks(vdev_t *vd) 50877877fdebSMatt Macy { 50887877fdebSMatt Macy return (vd->vdev_children); 5089eda14cbcSMatt Macy } 5090eda14cbcSMatt Macy 5091eda14cbcSMatt Macy vdev_ops_t vdev_raidz_ops = { 50927877fdebSMatt Macy .vdev_op_init = vdev_raidz_init, 50937877fdebSMatt Macy .vdev_op_fini = vdev_raidz_fini, 5094eda14cbcSMatt Macy .vdev_op_open = vdev_raidz_open, 5095eda14cbcSMatt Macy .vdev_op_close = vdev_raidz_close, 5096eda14cbcSMatt Macy .vdev_op_asize = vdev_raidz_asize, 50977877fdebSMatt Macy .vdev_op_min_asize = vdev_raidz_min_asize, 50987877fdebSMatt Macy .vdev_op_min_alloc = NULL, 5099eda14cbcSMatt Macy .vdev_op_io_start = vdev_raidz_io_start, 5100eda14cbcSMatt Macy .vdev_op_io_done = vdev_raidz_io_done, 5101eda14cbcSMatt Macy .vdev_op_state_change = vdev_raidz_state_change, 5102eda14cbcSMatt Macy .vdev_op_need_resilver = vdev_raidz_need_resilver, 5103eda14cbcSMatt Macy .vdev_op_hold = NULL, 5104eda14cbcSMatt Macy .vdev_op_rele = NULL, 5105eda14cbcSMatt Macy .vdev_op_remap = NULL, 5106eda14cbcSMatt Macy .vdev_op_xlate = vdev_raidz_xlate, 51077877fdebSMatt Macy .vdev_op_rebuild_asize = NULL, 51087877fdebSMatt Macy .vdev_op_metaslab_init = NULL, 51097877fdebSMatt Macy 
.vdev_op_config_generate = vdev_raidz_config_generate, 51107877fdebSMatt Macy .vdev_op_nparity = vdev_raidz_nparity, 51117877fdebSMatt Macy .vdev_op_ndisks = vdev_raidz_ndisks, 5112eda14cbcSMatt Macy .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */ 5113eda14cbcSMatt Macy .vdev_op_leaf = B_FALSE /* not a leaf vdev */ 5114eda14cbcSMatt Macy }; 5115e716630dSMartin Matuska 5116e716630dSMartin Matuska ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW, 5117e716630dSMartin Matuska "For testing, pause RAIDZ expansion after reflowing this many bytes"); 5118e716630dSMartin Matuska ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW, 5119e716630dSMartin Matuska "Max amount of concurrent i/o for RAIDZ expansion"); 5120e716630dSMartin Matuska ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW, 5121e716630dSMartin Matuska "For expanded RAIDZ, aggregate reads that have more rows than this"); 5122e716630dSMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW, 5123e716630dSMartin Matuska "For expanded RAIDZ, automatically start a pool scrub when expansion " 5124e716630dSMartin Matuska "completes"); 5125
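/*
 * Usage note (an assumption about the standard ZFS_MODULE_PARAM() plumbing,
 * not something stated in this file): on Linux these tunables should surface
 * as /sys/module/zfs/parameters/raidz_expand_max_reflow_bytes,
 * raidz_expand_max_copy_bytes, raidz_io_aggregate_rows, and
 * zfs_scrub_after_expand.  For example, writing 1073741824 to
 * raidz_expand_max_reflow_bytes should pause a test expansion after roughly
 * 1 GiB has been reflowed.
 */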