161145dc2SMartin Matuska // SPDX-License-Identifier: CDDL-1.0
2eda14cbcSMatt Macy /*
3eda14cbcSMatt Macy * CDDL HEADER START
4eda14cbcSMatt Macy *
5eda14cbcSMatt Macy * The contents of this file are subject to the terms of the
6eda14cbcSMatt Macy * Common Development and Distribution License (the "License").
7eda14cbcSMatt Macy * You may not use this file except in compliance with the License.
8eda14cbcSMatt Macy *
9eda14cbcSMatt Macy * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10271171e0SMartin Matuska * or https://opensource.org/licenses/CDDL-1.0.
11eda14cbcSMatt Macy * See the License for the specific language governing permissions
12eda14cbcSMatt Macy * and limitations under the License.
13eda14cbcSMatt Macy *
14eda14cbcSMatt Macy * When distributing Covered Code, include this CDDL HEADER in each
15eda14cbcSMatt Macy * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16eda14cbcSMatt Macy * If applicable, add the following below this CDDL HEADER, with the
17eda14cbcSMatt Macy * fields enclosed by brackets "[]" replaced with your own identifying
18eda14cbcSMatt Macy * information: Portions Copyright [yyyy] [name of copyright owner]
19eda14cbcSMatt Macy *
20eda14cbcSMatt Macy * CDDL HEADER END
21eda14cbcSMatt Macy */
22eda14cbcSMatt Macy
23eda14cbcSMatt Macy /*
24eda14cbcSMatt Macy * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
252c48331dSMatt Macy * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
26eda14cbcSMatt Macy * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
27eda14cbcSMatt Macy */
28eda14cbcSMatt Macy
29eda14cbcSMatt Macy #include <sys/zfs_context.h>
30eda14cbcSMatt Macy #include <sys/spa.h>
31e716630dSMartin Matuska #include <sys/spa_impl.h>
32e716630dSMartin Matuska #include <sys/zap.h>
33eda14cbcSMatt Macy #include <sys/vdev_impl.h>
34e716630dSMartin Matuska #include <sys/metaslab_impl.h>
35eda14cbcSMatt Macy #include <sys/zio.h>
36eda14cbcSMatt Macy #include <sys/zio_checksum.h>
37e716630dSMartin Matuska #include <sys/dmu_tx.h>
38eda14cbcSMatt Macy #include <sys/abd.h>
39e716630dSMartin Matuska #include <sys/zfs_rlock.h>
40eda14cbcSMatt Macy #include <sys/fs/zfs.h>
41eda14cbcSMatt Macy #include <sys/fm/fs/zfs.h>
42eda14cbcSMatt Macy #include <sys/vdev_raidz.h>
43eda14cbcSMatt Macy #include <sys/vdev_raidz_impl.h>
447877fdebSMatt Macy #include <sys/vdev_draid.h>
45e716630dSMartin Matuska #include <sys/uberblock_impl.h>
46e716630dSMartin Matuska #include <sys/dsl_scan.h>
47eda14cbcSMatt Macy
48eda14cbcSMatt Macy #ifdef ZFS_DEBUG
49eda14cbcSMatt Macy #include <sys/vdev.h> /* For vdev_xlate() in vdev_raidz_io_verify() */
50eda14cbcSMatt Macy #endif
51eda14cbcSMatt Macy
52eda14cbcSMatt Macy /*
53eda14cbcSMatt Macy * Virtual device vector for RAID-Z.
54eda14cbcSMatt Macy *
55eda14cbcSMatt Macy * This vdev supports single, double, and triple parity. For single parity,
56eda14cbcSMatt Macy * we use a simple XOR of all the data columns. For double or triple parity,
57eda14cbcSMatt Macy * we use a special case of Reed-Solomon coding. This extends the
58eda14cbcSMatt Macy * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
59eda14cbcSMatt Macy * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
60eda14cbcSMatt Macy * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
61eda14cbcSMatt Macy * former is also based. The latter is designed to provide higher performance
62eda14cbcSMatt Macy * for writes.
63eda14cbcSMatt Macy *
64eda14cbcSMatt Macy * Note that the Plank paper claimed to support arbitrary N+M, but was then
65eda14cbcSMatt Macy * amended six years later identifying a critical flaw that invalidates its
66eda14cbcSMatt Macy * claims. Nevertheless, the technique can be adapted to work for up to
67eda14cbcSMatt Macy * triple parity. For additional parity, the amendment "Note: Correction to
68eda14cbcSMatt Macy * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
69eda14cbcSMatt Macy * is viable, but the additional complexity means that write performance will
70eda14cbcSMatt Macy * suffer.
71eda14cbcSMatt Macy *
 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all
 * elements can be expressed with a single byte. Briefly, the operations on
 * the field are defined as follows:
76eda14cbcSMatt Macy *
77eda14cbcSMatt Macy * o addition (+) is represented by a bitwise XOR
78eda14cbcSMatt Macy * o subtraction (-) is therefore identical to addition: A + B = A - B
79eda14cbcSMatt Macy * o multiplication of A by 2 is defined by the following bitwise expression:
80eda14cbcSMatt Macy *
81eda14cbcSMatt Macy * (A * 2)_7 = A_6
82eda14cbcSMatt Macy * (A * 2)_6 = A_5
83eda14cbcSMatt Macy * (A * 2)_5 = A_4
84eda14cbcSMatt Macy * (A * 2)_4 = A_3 + A_7
85eda14cbcSMatt Macy * (A * 2)_3 = A_2 + A_7
86eda14cbcSMatt Macy * (A * 2)_2 = A_1 + A_7
87eda14cbcSMatt Macy * (A * 2)_1 = A_0
88eda14cbcSMatt Macy * (A * 2)_0 = A_7
89eda14cbcSMatt Macy *
90eda14cbcSMatt Macy * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
91eda14cbcSMatt Macy * As an aside, this multiplication is derived from the error correcting
92eda14cbcSMatt Macy * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
93eda14cbcSMatt Macy *
94eda14cbcSMatt Macy * Observe that any number in the field (except for 0) can be expressed as a
95eda14cbcSMatt Macy * power of 2 -- a generator for the field. We store a table of the powers of
96eda14cbcSMatt Macy * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
97eda14cbcSMatt Macy * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
98eda14cbcSMatt Macy * than field addition). The inverse of a field element A (A^-1) is therefore
99eda14cbcSMatt Macy * A ^ (255 - 1) = A^254.
100eda14cbcSMatt Macy *
101eda14cbcSMatt Macy * The up-to-three parity columns, P, Q, R over several data columns,
102eda14cbcSMatt Macy * D_0, ... D_n-1, can be expressed by field operations:
103eda14cbcSMatt Macy *
104eda14cbcSMatt Macy * P = D_0 + D_1 + ... + D_n-2 + D_n-1
105eda14cbcSMatt Macy * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
106eda14cbcSMatt Macy * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
107eda14cbcSMatt Macy * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
108eda14cbcSMatt Macy * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
109eda14cbcSMatt Macy *
110eda14cbcSMatt Macy * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
111eda14cbcSMatt Macy * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
112eda14cbcSMatt Macy * independent coefficients. (There are no additional coefficients that have
113eda14cbcSMatt Macy * this property which is why the uncorrected Plank method breaks down.)
114eda14cbcSMatt Macy *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
117eda14cbcSMatt Macy */
118eda14cbcSMatt Macy
/* Numeric identifiers for the P, Q and R parity columns. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Multiply a single field element by 2 (respectively 4).  The argument is
 * evaluated more than once, so it must be free of side effects.  The result
 * is not truncated: bit 8 may be set when bit 7 of the input was, so the
 * caller has to store the value into a byte (or mask with 0xff).
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 *
 * Both 'x' and 'mask' are assigned to, so they must be modifiable lvalues
 * (expected to be 64-bit unsigned); 'x' is updated in place and 'mask' is
 * clobbered as scratch space.
 *
 * The bodies are wrapped in do { } while (0) so that an invocation followed
 * by a semicolon forms exactly one statement and can be used safely as the
 * body of an unbraced if/else.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
do { \
	/* Isolate the top bit of every byte ... */ \
	(mask) = (x) & 0x8080808080808080ULL; \
	/* ... and smear it into 0xff for each byte that had it set. */ \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	/* Shift each byte left by one, dropping cross-byte carries. */ \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
} while (0)

/* Multiply every byte of 'x' by 4, i.e. by 2 twice. */
#define	VDEV_RAIDZ_64MUL_4(x, mask) \
do { \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
} while (0)
145eda14cbcSMatt Macy
146e716630dSMartin Matuska
147e716630dSMartin Matuska /*
148e716630dSMartin Matuska * Big Theory Statement for how a RAIDZ VDEV is expanded
149e716630dSMartin Matuska *
150e716630dSMartin Matuska * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion
151e716630dSMartin Matuska * works with all three RAIDZ parity choices, including RAIDZ1, 2, or 3. VDEVs
152e716630dSMartin Matuska * that have been previously expanded can be expanded again.
153e716630dSMartin Matuska *
154e716630dSMartin Matuska * The RAIDZ VDEV must be healthy (must be able to write to all the drives in
155e716630dSMartin Matuska * the VDEV) when an expansion starts. And the expansion will pause if any
156e716630dSMartin Matuska * disk in the VDEV fails, and resume once the VDEV is healthy again. All other
157e716630dSMartin Matuska * operations on the pool can continue while an expansion is in progress (e.g.
158e716630dSMartin Matuska * read/write, snapshot, zpool add, etc). Except zpool checkpoint, zpool trim,
159e716630dSMartin Matuska * and zpool initialize which can't be run during an expansion. Following a
160e716630dSMartin Matuska * reboot or export/import, the expansion resumes where it left off.
161e716630dSMartin Matuska *
162e716630dSMartin Matuska * == Reflowing the Data ==
163e716630dSMartin Matuska *
164e716630dSMartin Matuska * The expansion involves reflowing (copying) the data from the current set
165e716630dSMartin Matuska * of disks to spread it across the new set which now has one more disk. This
166e716630dSMartin Matuska * reflow operation is similar to reflowing text when the column width of a
167e716630dSMartin Matuska * text editor window is expanded. The text doesn’t change but the location of
168e716630dSMartin Matuska * the text changes to accommodate the new width. An example reflow result for
169e716630dSMartin Matuska * a 4-wide RAIDZ1 to a 5-wide is shown below.
170e716630dSMartin Matuska *
171e716630dSMartin Matuska * Reflow End State
172e716630dSMartin Matuska * Each letter indicates a parity group (logical stripe)
173e716630dSMartin Matuska *
174e716630dSMartin Matuska * Before expansion After Expansion
175e716630dSMartin Matuska * D1 D2 D3 D4 D1 D2 D3 D4 D5
176e716630dSMartin Matuska * +------+------+------+------+ +------+------+------+------+------+
177e716630dSMartin Matuska * | | | | | | | | | | |
178e716630dSMartin Matuska * | A | A | A | A | | A | A | A | A | B |
179e716630dSMartin Matuska * | 1| 2| 3| 4| | 1| 2| 3| 4| 5|
180e716630dSMartin Matuska * +------+------+------+------+ +------+------+------+------+------+
181e716630dSMartin Matuska * | | | | | | | | | | |
182e716630dSMartin Matuska * | B | B | C | C | | B | C | C | C | C |
183e716630dSMartin Matuska * | 5| 6| 7| 8| | 6| 7| 8| 9| 10|
184e716630dSMartin Matuska * +------+------+------+------+ +------+------+------+------+------+
185e716630dSMartin Matuska * | | | | | | | | | | |
186e716630dSMartin Matuska * | C | C | D | D | | D | D | E | E | E |
187e716630dSMartin Matuska * | 9| 10| 11| 12| | 11| 12| 13| 14| 15|
188e716630dSMartin Matuska * +------+------+------+------+ +------+------+------+------+------+
189e716630dSMartin Matuska * | | | | | | | | | | |
190e716630dSMartin Matuska * | E | E | E | E | --> | E | F | F | G | G |
191e716630dSMartin Matuska * | 13| 14| 15| 16| | 16| 17| 18|p 19| 20|
192e716630dSMartin Matuska * +------+------+------+------+ +------+------+------+------+------+
193e716630dSMartin Matuska * | | | | | | | | | | |
194e716630dSMartin Matuska * | F | F | G | G | | G | G | H | H | H |
195e716630dSMartin Matuska * | 17| 18| 19| 20| | 21| 22| 23| 24| 25|
196e716630dSMartin Matuska * +------+------+------+------+ +------+------+------+------+------+
197e716630dSMartin Matuska * | | | | | | | | | | |
198e716630dSMartin Matuska * | G | G | H | H | | H | I | I | J | J |
199e716630dSMartin Matuska * | 21| 22| 23| 24| | 26| 27| 28| 29| 30|
200e716630dSMartin Matuska * +------+------+------+------+ +------+------+------+------+------+
201e716630dSMartin Matuska * | | | | | | | | | | |
202e716630dSMartin Matuska * | H | H | I | I | | J | J | | | K |
203e716630dSMartin Matuska * | 25| 26| 27| 28| | 31| 32| 33| 34| 35|
204e716630dSMartin Matuska * +------+------+------+------+ +------+------+------+------+------+
205e716630dSMartin Matuska *
206e716630dSMartin Matuska * This reflow approach has several advantages. There is no need to read or
207e716630dSMartin Matuska * modify the block pointers or recompute any block checksums. The reflow
208e716630dSMartin Matuska * doesn’t need to know where the parity sectors reside. We can read and write
209e716630dSMartin Matuska * data sequentially and the copy can occur in a background thread in open
210e716630dSMartin Matuska * context. The design also allows for fast discovery of what data to copy.
211e716630dSMartin Matuska *
212e716630dSMartin Matuska * The VDEV metaslabs are processed, one at a time, to copy the block data to
213e716630dSMartin Matuska * have it flow across all the disks. The metaslab is disabled for allocations
214e716630dSMartin Matuska * during the copy. As an optimization, we only copy the allocated data which
215e716630dSMartin Matuska * can be determined by looking at the metaslab range tree. During the copy we
216e716630dSMartin Matuska * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still
217e716630dSMartin Matuska * need to be able to survive losing parity count disks). This means we
218e716630dSMartin Matuska * cannot overwrite data during the reflow that would be needed if a disk is
219e716630dSMartin Matuska * lost.
220e716630dSMartin Matuska *
221e716630dSMartin Matuska * After the reflow completes, all newly-written blocks will have the new
222e716630dSMartin Matuska * layout, i.e., they will have the parity to data ratio implied by the new
223e716630dSMartin Matuska * number of disks in the RAIDZ group. Even though the reflow copies all of
224e716630dSMartin Matuska * the allocated space (data and parity), it is only rearranged, not changed.
225e716630dSMartin Matuska *
226e716630dSMartin Matuska * This act of reflowing the data has a few implications about blocks
227e716630dSMartin Matuska * that were written before the reflow completes:
228e716630dSMartin Matuska *
229e716630dSMartin Matuska * - Old blocks will still use the same amount of space (i.e., they will have
230e716630dSMartin Matuska * the parity to data ratio implied by the old number of disks in the RAIDZ
231e716630dSMartin Matuska * group).
232e716630dSMartin Matuska * - Reading old blocks will be slightly slower than before the reflow, for
233e716630dSMartin Matuska * two reasons. First, we will have to read from all disks in the RAIDZ
234e716630dSMartin Matuska * VDEV, rather than being able to skip the children that contain only
235e716630dSMartin Matuska * parity of this block (because the data of a single block is now spread
236e716630dSMartin Matuska * out across all the disks). Second, in most cases there will be an extra
237e716630dSMartin Matuska * bcopy, needed to rearrange the data back to its original layout in memory.
238e716630dSMartin Matuska *
239e716630dSMartin Matuska * == Scratch Area ==
240e716630dSMartin Matuska *
241e716630dSMartin Matuska * As we copy the block data, we can only progress to the point that writes
242e716630dSMartin Matuska * will not overlap with blocks whose progress has not yet been recorded on
243e716630dSMartin Matuska * disk. Since partially-copied rows are always read from the old location,
244e716630dSMartin Matuska * we need to stop one row before the sector-wise overlap, to prevent any
245e716630dSMartin Matuska * row-wise overlap. For example, in the diagram above, when we reflow sector
 * B6 it will overwrite the original location for B5.
247e716630dSMartin Matuska *
248e716630dSMartin Matuska * To get around this, a scratch space is used so that we can start copying
249e716630dSMartin Matuska * without risking data loss by overlapping the row. As an added benefit, it
250e716630dSMartin Matuska * improves performance at the beginning of the reflow, but that small perf
251e716630dSMartin Matuska * boost wouldn't be worth the complexity on its own.
252e716630dSMartin Matuska *
253e716630dSMartin Matuska * Ideally we want to copy at least 2 * (new_width)^2 so that we have a
254e716630dSMartin Matuska * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max
255e716630dSMartin Matuska * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice
 * the widths will likely be single digits so we can get a substantial chunk
257e716630dSMartin Matuska * size using only a few MB of scratch per disk.
258e716630dSMartin Matuska *
259e716630dSMartin Matuska * The scratch area is persisted to disk which holds a large amount of reflowed
260e716630dSMartin Matuska * state. We can always read the partially written stripes when a disk fails or
261e716630dSMartin Matuska * the copy is interrupted (crash) during the initial copying phase and also
262e716630dSMartin Matuska * get past a small chunk size restriction. At a minimum, the scratch space
263e716630dSMartin Matuska * must be large enough to get us to the point that one row does not overlap
264e716630dSMartin Matuska * itself when moved (i.e new_width^2). But going larger is even better. We
265e716630dSMartin Matuska * use the 3.5 MiB reserved "boot" space that resides after the ZFS disk labels
266e716630dSMartin Matuska * as our scratch space to handle overwriting the initial part of the VDEV.
267e716630dSMartin Matuska *
268e716630dSMartin Matuska * 0 256K 512K 4M
269e716630dSMartin Matuska * +------+------+-----------------------+-----------------------------
270e716630dSMartin Matuska * | VDEV | VDEV | Boot Block (3.5M) | Allocatable space ...
271e716630dSMartin Matuska * | L0 | L1 | Reserved | (Metaslabs)
272e716630dSMartin Matuska * +------+------+-----------------------+-------------------------------
273e716630dSMartin Matuska * Scratch Area
274e716630dSMartin Matuska *
275e716630dSMartin Matuska * == Reflow Progress Updates ==
276e716630dSMartin Matuska * After the initial scratch-based reflow, the expansion process works
277e716630dSMartin Matuska * similarly to device removal. We create a new open context thread which
278e716630dSMartin Matuska * reflows the data, and periodically kicks off sync tasks to update logical
279e716630dSMartin Matuska * state. In this case, state is the committed progress (offset of next data
280e716630dSMartin Matuska * to copy). We need to persist the completed offset on disk, so that if we
281e716630dSMartin Matuska * crash we know which format each VDEV offset is in.
282e716630dSMartin Matuska *
283e716630dSMartin Matuska * == Time Dependent Geometry ==
284e716630dSMartin Matuska *
285e716630dSMartin Matuska * In non-expanded RAIDZ, blocks are read from disk in a column by column
286e716630dSMartin Matuska * fashion. For a multi-row block, the second sector is in the first column
287e716630dSMartin Matuska * not in the second column. This allows us to issue full reads for each
288e716630dSMartin Matuska * column directly into the request buffer. The block data is thus laid out
289e716630dSMartin Matuska * sequentially in a column-by-column fashion.
290e716630dSMartin Matuska *
291e716630dSMartin Matuska * For example, in the before expansion diagram above, one logical block might
292e716630dSMartin Matuska * be sectors G19-H26. The parity is in G19,H23; and the data is in
293e716630dSMartin Matuska * G20,H24,G21,H25,G22,H26.
294e716630dSMartin Matuska *
295e716630dSMartin Matuska * After a block is reflowed, the sectors that were all in the original column
296e716630dSMartin Matuska * data can now reside in different columns. When reading from an expanded
297e716630dSMartin Matuska * VDEV, we need to know the logical stripe width for each block so we can
298e716630dSMartin Matuska * reconstitute the block’s data after the reads are completed. Likewise,
299e716630dSMartin Matuska * when we perform the combinatorial reconstruction we need to know the
300e716630dSMartin Matuska * original width so we can retry combinations from the past layouts.
301e716630dSMartin Matuska *
302e716630dSMartin Matuska * Time dependent geometry is what we call having blocks with different layouts
303e716630dSMartin Matuska * (stripe widths) in the same VDEV. This time-dependent geometry uses the
304e716630dSMartin Matuska * block’s birth time (+ the time expansion ended) to establish the correct
305e716630dSMartin Matuska * width for a given block. After an expansion completes, we record the time
306e716630dSMartin Matuska * for blocks written with a particular width (geometry).
307e716630dSMartin Matuska *
308e716630dSMartin Matuska * == On Disk Format Changes ==
309e716630dSMartin Matuska *
310e716630dSMartin Matuska * New pool feature flag, 'raidz_expansion' whose reference count is the number
311e716630dSMartin Matuska * of RAIDZ VDEVs that have been expanded.
312e716630dSMartin Matuska *
313e716630dSMartin Matuska * The blocks on expanded RAIDZ VDEV can have different logical stripe widths.
314e716630dSMartin Matuska *
 * Since the uberblock can point to arbitrary blocks, which might be on the
 * expanding RAIDZ, and might or might not have been expanded, we need to know
 * which way a block is laid out before reading it. This info is the next
318e716630dSMartin Matuska * offset that needs to be reflowed and we persist that in the uberblock, in
319e716630dSMartin Matuska * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev label.
320e716630dSMartin Matuska * After the expansion is complete, we then use the raidz_expand_txgs array
321e716630dSMartin Matuska * (see below) to determine how to read a block and the ub_raidz_reflow_info
 * field is no longer required.
323e716630dSMartin Matuska *
324e716630dSMartin Matuska * The uberblock's ub_raidz_reflow_info field also holds the scratch space
325e716630dSMartin Matuska * state (i.e., active or not) which is also required before reading a block
326e716630dSMartin Matuska * during the initial phase of reflowing the data.
327e716630dSMartin Matuska *
328e716630dSMartin Matuska * The top-level RAIDZ VDEV has two new entries in the nvlist:
329e716630dSMartin Matuska *
330e716630dSMartin Matuska * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here
331e716630dSMartin Matuska * and used after the expansion is complete to
332e716630dSMartin Matuska * determine how to read a raidz block
333e716630dSMartin Matuska * 'raidz_expanding' boolean: present during reflow and removed after completion
334e716630dSMartin Matuska * used during a spa import to resume an unfinished
335e716630dSMartin Matuska * expansion
336e716630dSMartin Matuska *
337e716630dSMartin Matuska * And finally the VDEVs top zap adds the following informational entries:
338e716630dSMartin Matuska * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE
339e716630dSMartin Matuska * VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME
340e716630dSMartin Matuska * VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME
341e716630dSMartin Matuska * VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED
342e716630dSMartin Matuska */
343e716630dSMartin Matuska
344e716630dSMartin Matuska /*
345e716630dSMartin Matuska * For testing only: pause the raidz expansion after reflowing this amount.
346e716630dSMartin Matuska * (accessed by ZTS and ztest)
347e716630dSMartin Matuska */
348e716630dSMartin Matuska #ifdef _KERNEL
349e716630dSMartin Matuska static
350e716630dSMartin Matuska #endif /* _KERNEL */
351e716630dSMartin Matuska unsigned long raidz_expand_max_reflow_bytes = 0;
352e716630dSMartin Matuska
353e716630dSMartin Matuska /*
354e716630dSMartin Matuska * For testing only: pause the raidz expansion at a certain point.
355e716630dSMartin Matuska */
356e716630dSMartin Matuska uint_t raidz_expand_pause_point = 0;
357e716630dSMartin Matuska
358e716630dSMartin Matuska /*
359e716630dSMartin Matuska * Maximum amount of copy io's outstanding at once.
360e716630dSMartin Matuska */
36117aab35aSMartin Matuska #ifdef _ILP32
36217aab35aSMartin Matuska static unsigned long raidz_expand_max_copy_bytes = SPA_MAXBLOCKSIZE;
36317aab35aSMartin Matuska #else
364e716630dSMartin Matuska static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE;
36517aab35aSMartin Matuska #endif
366e716630dSMartin Matuska
367e716630dSMartin Matuska /*
368e716630dSMartin Matuska * Apply raidz map abds aggregation if the number of rows in the map is equal
369e716630dSMartin Matuska * or greater than the value below.
370e716630dSMartin Matuska */
371e716630dSMartin Matuska static unsigned long raidz_io_aggregate_rows = 4;
372e716630dSMartin Matuska
373e716630dSMartin Matuska /*
374e716630dSMartin Matuska * Automatically start a pool scrub when a RAIDZ expansion completes in
375e716630dSMartin Matuska * order to verify the checksums of all blocks which have been copied
376e716630dSMartin Matuska * during the expansion. Automatic scrubbing is enabled by default and
377e716630dSMartin Matuska * is strongly recommended.
378e716630dSMartin Matuska */
379e716630dSMartin Matuska static int zfs_scrub_after_expand = 1;
380e716630dSMartin Matuska
3817877fdebSMatt Macy static void
vdev_raidz_row_free(raidz_row_t * rr)3827877fdebSMatt Macy vdev_raidz_row_free(raidz_row_t *rr)
383eda14cbcSMatt Macy {
384184c1b94SMartin Matuska for (int c = 0; c < rr->rr_cols; c++) {
385184c1b94SMartin Matuska raidz_col_t *rc = &rr->rr_col[c];
386eda14cbcSMatt Macy
387184c1b94SMartin Matuska if (rc->rc_size != 0)
388184c1b94SMartin Matuska abd_free(rc->rc_abd);
389184c1b94SMartin Matuska if (rc->rc_orig_data != NULL)
390f9693befSMartin Matuska abd_free(rc->rc_orig_data);
391eda14cbcSMatt Macy }
392eda14cbcSMatt Macy
3937877fdebSMatt Macy if (rr->rr_abd_empty != NULL)
3947877fdebSMatt Macy abd_free(rr->rr_abd_empty);
395eda14cbcSMatt Macy
3967877fdebSMatt Macy kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols]));
3977877fdebSMatt Macy }
3987877fdebSMatt Macy
3997877fdebSMatt Macy void
vdev_raidz_map_free(raidz_map_t * rm)4007877fdebSMatt Macy vdev_raidz_map_free(raidz_map_t *rm)
4017877fdebSMatt Macy {
4027877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++)
4037877fdebSMatt Macy vdev_raidz_row_free(rm->rm_row[i]);
4047877fdebSMatt Macy
405e716630dSMartin Matuska if (rm->rm_nphys_cols) {
406e716630dSMartin Matuska for (int i = 0; i < rm->rm_nphys_cols; i++) {
407e716630dSMartin Matuska if (rm->rm_phys_col[i].rc_abd != NULL)
408e716630dSMartin Matuska abd_free(rm->rm_phys_col[i].rc_abd);
409e716630dSMartin Matuska }
410e716630dSMartin Matuska
411e716630dSMartin Matuska kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) *
412e716630dSMartin Matuska rm->rm_nphys_cols);
413e716630dSMartin Matuska }
414e716630dSMartin Matuska
415e716630dSMartin Matuska ASSERT3P(rm->rm_lr, ==, NULL);
4167877fdebSMatt Macy kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
417eda14cbcSMatt Macy }
418eda14cbcSMatt Macy
419eda14cbcSMatt Macy static void
vdev_raidz_map_free_vsd(zio_t * zio)420eda14cbcSMatt Macy vdev_raidz_map_free_vsd(zio_t *zio)
421eda14cbcSMatt Macy {
422eda14cbcSMatt Macy raidz_map_t *rm = zio->io_vsd;
423eda14cbcSMatt Macy
424eda14cbcSMatt Macy vdev_raidz_map_free(rm);
425eda14cbcSMatt Macy }
426eda14cbcSMatt Macy
427e716630dSMartin Matuska static int
vdev_raidz_reflow_compare(const void * x1,const void * x2)428e716630dSMartin Matuska vdev_raidz_reflow_compare(const void *x1, const void *x2)
429e716630dSMartin Matuska {
430e716630dSMartin Matuska const reflow_node_t *l = x1;
431e716630dSMartin Matuska const reflow_node_t *r = x2;
432e716630dSMartin Matuska
433e716630dSMartin Matuska return (TREE_CMP(l->re_txg, r->re_txg));
434e716630dSMartin Matuska }
435e716630dSMartin Matuska
436f9693befSMartin Matuska const zio_vsd_ops_t vdev_raidz_vsd_ops = {
437eda14cbcSMatt Macy .vsd_free = vdev_raidz_map_free_vsd,
438eda14cbcSMatt Macy };
439eda14cbcSMatt Macy
440e716630dSMartin Matuska raidz_row_t *
vdev_raidz_row_alloc(int cols,zio_t * zio)44187bf66d4SMartin Matuska vdev_raidz_row_alloc(int cols, zio_t *zio)
442e716630dSMartin Matuska {
443e716630dSMartin Matuska raidz_row_t *rr =
444e716630dSMartin Matuska kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP);
445e716630dSMartin Matuska
446e716630dSMartin Matuska rr->rr_cols = cols;
447e716630dSMartin Matuska rr->rr_scols = cols;
448e716630dSMartin Matuska
449e716630dSMartin Matuska for (int c = 0; c < cols; c++) {
450e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c];
451e716630dSMartin Matuska rc->rc_shadow_devidx = INT_MAX;
452e716630dSMartin Matuska rc->rc_shadow_offset = UINT64_MAX;
45387bf66d4SMartin Matuska /*
45487bf66d4SMartin Matuska * We can not allow self healing to take place for Direct I/O
45587bf66d4SMartin Matuska * reads. There is nothing that stops the buffer contents from
45687bf66d4SMartin Matuska * being manipulated while the I/O is in flight. It is possible
45787bf66d4SMartin Matuska * that the checksum could be verified on the buffer and then
45887bf66d4SMartin Matuska * the contents of that buffer are manipulated afterwards. This
45987bf66d4SMartin Matuska * could lead to bad data being written out during self
46087bf66d4SMartin Matuska * healing.
46187bf66d4SMartin Matuska */
46287bf66d4SMartin Matuska if (!(zio->io_flags & ZIO_FLAG_DIO_READ))
463e716630dSMartin Matuska rc->rc_allow_repair = 1;
464e716630dSMartin Matuska }
465e716630dSMartin Matuska return (rr);
466e716630dSMartin Matuska }
467e716630dSMartin Matuska
46881b22a98SMartin Matuska static void
vdev_raidz_map_alloc_write(zio_t * zio,raidz_map_t * rm,uint64_t ashift)46981b22a98SMartin Matuska vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift)
47081b22a98SMartin Matuska {
47181b22a98SMartin Matuska int c;
47281b22a98SMartin Matuska int nwrapped = 0;
47381b22a98SMartin Matuska uint64_t off = 0;
47481b22a98SMartin Matuska raidz_row_t *rr = rm->rm_row[0];
47581b22a98SMartin Matuska
47681b22a98SMartin Matuska ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
47781b22a98SMartin Matuska ASSERT3U(rm->rm_nrows, ==, 1);
47881b22a98SMartin Matuska
47981b22a98SMartin Matuska /*
48081b22a98SMartin Matuska * Pad any parity columns with additional space to account for skip
48181b22a98SMartin Matuska * sectors.
48281b22a98SMartin Matuska */
48381b22a98SMartin Matuska if (rm->rm_skipstart < rr->rr_firstdatacol) {
48481b22a98SMartin Matuska ASSERT0(rm->rm_skipstart);
48581b22a98SMartin Matuska nwrapped = rm->rm_nskip;
48681b22a98SMartin Matuska } else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) {
48781b22a98SMartin Matuska nwrapped =
48881b22a98SMartin Matuska (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols;
48981b22a98SMartin Matuska }
49081b22a98SMartin Matuska
49181b22a98SMartin Matuska /*
49281b22a98SMartin Matuska * Optional single skip sectors (rc_size == 0) will be handled in
49381b22a98SMartin Matuska * vdev_raidz_io_start_write().
49481b22a98SMartin Matuska */
49581b22a98SMartin Matuska int skipped = rr->rr_scols - rr->rr_cols;
49681b22a98SMartin Matuska
49781b22a98SMartin Matuska /* Allocate buffers for the parity columns */
49881b22a98SMartin Matuska for (c = 0; c < rr->rr_firstdatacol; c++) {
49981b22a98SMartin Matuska raidz_col_t *rc = &rr->rr_col[c];
50081b22a98SMartin Matuska
50181b22a98SMartin Matuska /*
50281b22a98SMartin Matuska * Parity columns will pad out a linear ABD to account for
50381b22a98SMartin Matuska * the skip sector. A linear ABD is used here because
50481b22a98SMartin Matuska * parity calculations use the ABD buffer directly to calculate
50581b22a98SMartin Matuska * parity. This avoids doing a memcpy back to the ABD after the
50681b22a98SMartin Matuska * parity has been calculated. By issuing the parity column
50781b22a98SMartin Matuska * with the skip sector we can reduce contention on the child
50881b22a98SMartin Matuska * VDEV queue locks (vq_lock).
50981b22a98SMartin Matuska */
51081b22a98SMartin Matuska if (c < nwrapped) {
51181b22a98SMartin Matuska rc->rc_abd = abd_alloc_linear(
51281b22a98SMartin Matuska rc->rc_size + (1ULL << ashift), B_FALSE);
51381b22a98SMartin Matuska abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift);
51481b22a98SMartin Matuska skipped++;
51581b22a98SMartin Matuska } else {
51681b22a98SMartin Matuska rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
51781b22a98SMartin Matuska }
51881b22a98SMartin Matuska }
51981b22a98SMartin Matuska
52081b22a98SMartin Matuska for (off = 0; c < rr->rr_cols; c++) {
52181b22a98SMartin Matuska raidz_col_t *rc = &rr->rr_col[c];
52281b22a98SMartin Matuska abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct,
52381b22a98SMartin Matuska zio->io_abd, off, rc->rc_size);
52481b22a98SMartin Matuska
52581b22a98SMartin Matuska /*
52681b22a98SMartin Matuska * Generate I/O for skip sectors to improve aggregation
52781b22a98SMartin Matuska * continuity. We will use gang ABD's to reduce contention
52881b22a98SMartin Matuska * on the child VDEV queue locks (vq_lock) by issuing
52981b22a98SMartin Matuska * a single I/O that contains the data and skip sector.
53081b22a98SMartin Matuska *
53181b22a98SMartin Matuska * It is important to make sure that rc_size is not updated
53281b22a98SMartin Matuska * even though we are adding a skip sector to the ABD. When
53381b22a98SMartin Matuska * calculating the parity in vdev_raidz_generate_parity_row()
53481b22a98SMartin Matuska * the rc_size is used to iterate through the ABD's. We can
53581b22a98SMartin Matuska * not have zero'd out skip sectors used for calculating
53681b22a98SMartin Matuska * parity for raidz, because those same sectors are not used
53781b22a98SMartin Matuska * during reconstruction.
53881b22a98SMartin Matuska */
53981b22a98SMartin Matuska if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) {
54081b22a98SMartin Matuska rc->rc_abd = abd_alloc_gang();
54181b22a98SMartin Matuska abd_gang_add(rc->rc_abd, abd, B_TRUE);
54281b22a98SMartin Matuska abd_gang_add(rc->rc_abd,
54381b22a98SMartin Matuska abd_get_zeros(1ULL << ashift), B_TRUE);
54481b22a98SMartin Matuska skipped++;
54581b22a98SMartin Matuska } else {
54681b22a98SMartin Matuska rc->rc_abd = abd;
54781b22a98SMartin Matuska }
54881b22a98SMartin Matuska off += rc->rc_size;
54981b22a98SMartin Matuska }
55081b22a98SMartin Matuska
55181b22a98SMartin Matuska ASSERT3U(off, ==, zio->io_size);
55281b22a98SMartin Matuska ASSERT3S(skipped, ==, rm->rm_nskip);
55381b22a98SMartin Matuska }
55481b22a98SMartin Matuska
55581b22a98SMartin Matuska static void
vdev_raidz_map_alloc_read(zio_t * zio,raidz_map_t * rm)55681b22a98SMartin Matuska vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm)
55781b22a98SMartin Matuska {
55881b22a98SMartin Matuska int c;
55981b22a98SMartin Matuska raidz_row_t *rr = rm->rm_row[0];
56081b22a98SMartin Matuska
56181b22a98SMartin Matuska ASSERT3U(rm->rm_nrows, ==, 1);
56281b22a98SMartin Matuska
56381b22a98SMartin Matuska /* Allocate buffers for the parity columns */
56481b22a98SMartin Matuska for (c = 0; c < rr->rr_firstdatacol; c++)
56581b22a98SMartin Matuska rr->rr_col[c].rc_abd =
56681b22a98SMartin Matuska abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);
56781b22a98SMartin Matuska
56881b22a98SMartin Matuska for (uint64_t off = 0; c < rr->rr_cols; c++) {
56981b22a98SMartin Matuska raidz_col_t *rc = &rr->rr_col[c];
57081b22a98SMartin Matuska rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
57181b22a98SMartin Matuska zio->io_abd, off, rc->rc_size);
57281b22a98SMartin Matuska off += rc->rc_size;
57381b22a98SMartin Matuska }
57481b22a98SMartin Matuska }
57581b22a98SMartin Matuska
576eda14cbcSMatt Macy /*
577eda14cbcSMatt Macy * Divides the IO evenly across all child vdevs; usually, dcols is
578eda14cbcSMatt Macy * the number of children in the target vdev.
579eda14cbcSMatt Macy *
580eda14cbcSMatt Macy * Avoid inlining the function to keep vdev_raidz_io_start(), which
581eda14cbcSMatt Macy * is this functions only caller, as small as possible on the stack.
582eda14cbcSMatt Macy */
583eda14cbcSMatt Macy noinline raidz_map_t *
vdev_raidz_map_alloc(zio_t * zio,uint64_t ashift,uint64_t dcols,uint64_t nparity)584eda14cbcSMatt Macy vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
585eda14cbcSMatt Macy uint64_t nparity)
586eda14cbcSMatt Macy {
5877877fdebSMatt Macy raidz_row_t *rr;
588eda14cbcSMatt Macy /* The starting RAIDZ (parent) vdev sector of the block. */
589eda14cbcSMatt Macy uint64_t b = zio->io_offset >> ashift;
590eda14cbcSMatt Macy /* The zio's size in units of the vdev's minimum sector size. */
591eda14cbcSMatt Macy uint64_t s = zio->io_size >> ashift;
592eda14cbcSMatt Macy /* The first column for this stripe. */
593eda14cbcSMatt Macy uint64_t f = b % dcols;
594eda14cbcSMatt Macy /* The starting byte offset on each child vdev. */
595eda14cbcSMatt Macy uint64_t o = (b / dcols) << ashift;
596e716630dSMartin Matuska uint64_t acols, scols;
597eda14cbcSMatt Macy
5987877fdebSMatt Macy raidz_map_t *rm =
5997877fdebSMatt Macy kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
6007877fdebSMatt Macy rm->rm_nrows = 1;
6017877fdebSMatt Macy
602eda14cbcSMatt Macy /*
603eda14cbcSMatt Macy * "Quotient": The number of data sectors for this stripe on all but
604eda14cbcSMatt Macy * the "big column" child vdevs that also contain "remainder" data.
605eda14cbcSMatt Macy */
606e716630dSMartin Matuska uint64_t q = s / (dcols - nparity);
607eda14cbcSMatt Macy
608eda14cbcSMatt Macy /*
609eda14cbcSMatt Macy * "Remainder": The number of partial stripe data sectors in this I/O.
610eda14cbcSMatt Macy * This will add a sector to some, but not all, child vdevs.
611eda14cbcSMatt Macy */
612e716630dSMartin Matuska uint64_t r = s - q * (dcols - nparity);
613eda14cbcSMatt Macy
614eda14cbcSMatt Macy /* The number of "big columns" - those which contain remainder data. */
615e716630dSMartin Matuska uint64_t bc = (r == 0 ? 0 : r + nparity);
616eda14cbcSMatt Macy
617eda14cbcSMatt Macy /*
618eda14cbcSMatt Macy * The total number of data and parity sectors associated with
619eda14cbcSMatt Macy * this I/O.
620eda14cbcSMatt Macy */
621e716630dSMartin Matuska uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
622eda14cbcSMatt Macy
6237877fdebSMatt Macy /*
6247877fdebSMatt Macy * acols: The columns that will be accessed.
6257877fdebSMatt Macy * scols: The columns that will be accessed or skipped.
6267877fdebSMatt Macy */
627eda14cbcSMatt Macy if (q == 0) {
628eda14cbcSMatt Macy /* Our I/O request doesn't span all child vdevs. */
629eda14cbcSMatt Macy acols = bc;
630eda14cbcSMatt Macy scols = MIN(dcols, roundup(bc, nparity + 1));
631eda14cbcSMatt Macy } else {
632eda14cbcSMatt Macy acols = dcols;
633eda14cbcSMatt Macy scols = dcols;
634eda14cbcSMatt Macy }
635eda14cbcSMatt Macy
636eda14cbcSMatt Macy ASSERT3U(acols, <=, scols);
63787bf66d4SMartin Matuska rr = vdev_raidz_row_alloc(scols, zio);
6387877fdebSMatt Macy rm->rm_row[0] = rr;
6397877fdebSMatt Macy rr->rr_cols = acols;
6407877fdebSMatt Macy rr->rr_bigcols = bc;
6417877fdebSMatt Macy rr->rr_firstdatacol = nparity;
6427877fdebSMatt Macy #ifdef ZFS_DEBUG
6437877fdebSMatt Macy rr->rr_offset = zio->io_offset;
6447877fdebSMatt Macy rr->rr_size = zio->io_size;
6457877fdebSMatt Macy #endif
646eda14cbcSMatt Macy
647e716630dSMartin Matuska uint64_t asize = 0;
648eda14cbcSMatt Macy
649e716630dSMartin Matuska for (uint64_t c = 0; c < scols; c++) {
6507877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c];
651e716630dSMartin Matuska uint64_t col = f + c;
652e716630dSMartin Matuska uint64_t coff = o;
653eda14cbcSMatt Macy if (col >= dcols) {
654eda14cbcSMatt Macy col -= dcols;
655eda14cbcSMatt Macy coff += 1ULL << ashift;
656eda14cbcSMatt Macy }
6577877fdebSMatt Macy rc->rc_devidx = col;
6587877fdebSMatt Macy rc->rc_offset = coff;
659eda14cbcSMatt Macy
660eda14cbcSMatt Macy if (c >= acols)
6617877fdebSMatt Macy rc->rc_size = 0;
662eda14cbcSMatt Macy else if (c < bc)
6637877fdebSMatt Macy rc->rc_size = (q + 1) << ashift;
664eda14cbcSMatt Macy else
6657877fdebSMatt Macy rc->rc_size = q << ashift;
666eda14cbcSMatt Macy
6677877fdebSMatt Macy asize += rc->rc_size;
668eda14cbcSMatt Macy }
669eda14cbcSMatt Macy
670eda14cbcSMatt Macy ASSERT3U(asize, ==, tot << ashift);
671eda14cbcSMatt Macy rm->rm_nskip = roundup(tot, nparity + 1) - tot;
6727877fdebSMatt Macy rm->rm_skipstart = bc;
673eda14cbcSMatt Macy
674eda14cbcSMatt Macy /*
675eda14cbcSMatt Macy * If all data stored spans all columns, there's a danger that parity
676eda14cbcSMatt Macy * will always be on the same device and, since parity isn't read
677eda14cbcSMatt Macy * during normal operation, that device's I/O bandwidth won't be
678eda14cbcSMatt Macy * used effectively. We therefore switch the parity every 1MB.
679eda14cbcSMatt Macy *
680eda14cbcSMatt Macy * ... at least that was, ostensibly, the theory. As a practical
681eda14cbcSMatt Macy * matter unless we juggle the parity between all devices evenly, we
682eda14cbcSMatt Macy * won't see any benefit. Further, occasional writes that aren't a
683eda14cbcSMatt Macy * multiple of the LCM of the number of children and the minimum
684eda14cbcSMatt Macy * stripe width are sufficient to avoid pessimal behavior.
685eda14cbcSMatt Macy * Unfortunately, this decision created an implicit on-disk format
686eda14cbcSMatt Macy * requirement that we need to support for all eternity, but only
687eda14cbcSMatt Macy * for single-parity RAID-Z.
688eda14cbcSMatt Macy *
689eda14cbcSMatt Macy * If we intend to skip a sector in the zeroth column for padding
690eda14cbcSMatt Macy * we must make sure to note this swap. We will never intend to
691eda14cbcSMatt Macy * skip the first column since at least one data and one parity
692eda14cbcSMatt Macy * column must appear in each row.
693eda14cbcSMatt Macy */
6947877fdebSMatt Macy ASSERT(rr->rr_cols >= 2);
6957877fdebSMatt Macy ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
696eda14cbcSMatt Macy
6977877fdebSMatt Macy if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
698e716630dSMartin Matuska uint64_t devidx = rr->rr_col[0].rc_devidx;
6997877fdebSMatt Macy o = rr->rr_col[0].rc_offset;
7007877fdebSMatt Macy rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
7017877fdebSMatt Macy rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
7027877fdebSMatt Macy rr->rr_col[1].rc_devidx = devidx;
7037877fdebSMatt Macy rr->rr_col[1].rc_offset = o;
704eda14cbcSMatt Macy if (rm->rm_skipstart == 0)
705eda14cbcSMatt Macy rm->rm_skipstart = 1;
706eda14cbcSMatt Macy }
707eda14cbcSMatt Macy
70881b22a98SMartin Matuska if (zio->io_type == ZIO_TYPE_WRITE) {
70981b22a98SMartin Matuska vdev_raidz_map_alloc_write(zio, rm, ashift);
71081b22a98SMartin Matuska } else {
71181b22a98SMartin Matuska vdev_raidz_map_alloc_read(zio, rm);
71281b22a98SMartin Matuska }
713e716630dSMartin Matuska /* init RAIDZ parity ops */
714e716630dSMartin Matuska rm->rm_ops = vdev_raidz_math_get_ops();
71581b22a98SMartin Matuska
716e716630dSMartin Matuska return (rm);
717e716630dSMartin Matuska }
718e716630dSMartin Matuska
719e716630dSMartin Matuska /*
720e716630dSMartin Matuska * Everything before reflow_offset_synced should have been moved to the new
721e716630dSMartin Matuska * location (read and write completed). However, this may not yet be reflected
722e716630dSMartin Matuska * in the on-disk format (e.g. raidz_reflow_sync() has been called but the
723e716630dSMartin Matuska * uberblock has not yet been written). If reflow is not in progress,
724e716630dSMartin Matuska * reflow_offset_synced should be UINT64_MAX. For each row, if the row is
725e716630dSMartin Matuska * entirely before reflow_offset_synced, it will come from the new location.
726e716630dSMartin Matuska * Otherwise this row will come from the old location. Therefore, rows that
727e716630dSMartin Matuska * straddle the reflow_offset_synced will come from the old location.
728e716630dSMartin Matuska *
729e716630dSMartin Matuska * For writes, reflow_offset_next is the next offset to copy. If a sector has
730e716630dSMartin Matuska * been copied, but not yet reflected in the on-disk progress
731e716630dSMartin Matuska * (reflow_offset_synced), it will also be written to the new (already copied)
732e716630dSMartin Matuska * offset.
733e716630dSMartin Matuska */
734e716630dSMartin Matuska noinline raidz_map_t *
vdev_raidz_map_alloc_expanded(zio_t * zio,uint64_t ashift,uint64_t physical_cols,uint64_t logical_cols,uint64_t nparity,uint64_t reflow_offset_synced,uint64_t reflow_offset_next,boolean_t use_scratch)735e716630dSMartin Matuska vdev_raidz_map_alloc_expanded(zio_t *zio,
736e716630dSMartin Matuska uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
737e716630dSMartin Matuska uint64_t nparity, uint64_t reflow_offset_synced,
738e716630dSMartin Matuska uint64_t reflow_offset_next, boolean_t use_scratch)
739e716630dSMartin Matuska {
740e716630dSMartin Matuska abd_t *abd = zio->io_abd;
741e716630dSMartin Matuska uint64_t offset = zio->io_offset;
742e716630dSMartin Matuska uint64_t size = zio->io_size;
743e716630dSMartin Matuska
744e716630dSMartin Matuska /* The zio's size in units of the vdev's minimum sector size. */
745e716630dSMartin Matuska uint64_t s = size >> ashift;
746e716630dSMartin Matuska
747e716630dSMartin Matuska /*
748e716630dSMartin Matuska * "Quotient": The number of data sectors for this stripe on all but
749e716630dSMartin Matuska * the "big column" child vdevs that also contain "remainder" data.
750e716630dSMartin Matuska * AKA "full rows"
751e716630dSMartin Matuska */
752e716630dSMartin Matuska uint64_t q = s / (logical_cols - nparity);
753e716630dSMartin Matuska
754e716630dSMartin Matuska /*
755e716630dSMartin Matuska * "Remainder": The number of partial stripe data sectors in this I/O.
756e716630dSMartin Matuska * This will add a sector to some, but not all, child vdevs.
757e716630dSMartin Matuska */
758e716630dSMartin Matuska uint64_t r = s - q * (logical_cols - nparity);
759e716630dSMartin Matuska
760e716630dSMartin Matuska /* The number of "big columns" - those which contain remainder data. */
761e716630dSMartin Matuska uint64_t bc = (r == 0 ? 0 : r + nparity);
762e716630dSMartin Matuska
763e716630dSMartin Matuska /*
764e716630dSMartin Matuska * The total number of data and parity sectors associated with
765e716630dSMartin Matuska * this I/O.
766e716630dSMartin Matuska */
767e716630dSMartin Matuska uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
768e716630dSMartin Matuska
769e716630dSMartin Matuska /* How many rows contain data (not skip) */
770e716630dSMartin Matuska uint64_t rows = howmany(tot, logical_cols);
771e716630dSMartin Matuska int cols = MIN(tot, logical_cols);
772e716630dSMartin Matuska
773e716630dSMartin Matuska raidz_map_t *rm =
774e716630dSMartin Matuska kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
775e716630dSMartin Matuska KM_SLEEP);
776e716630dSMartin Matuska rm->rm_nrows = rows;
777e716630dSMartin Matuska rm->rm_nskip = roundup(tot, nparity + 1) - tot;
778e716630dSMartin Matuska rm->rm_skipstart = bc;
779e716630dSMartin Matuska uint64_t asize = 0;
780e716630dSMartin Matuska
781e716630dSMartin Matuska for (uint64_t row = 0; row < rows; row++) {
782e716630dSMartin Matuska boolean_t row_use_scratch = B_FALSE;
78387bf66d4SMartin Matuska raidz_row_t *rr = vdev_raidz_row_alloc(cols, zio);
784e716630dSMartin Matuska rm->rm_row[row] = rr;
785e716630dSMartin Matuska
786e716630dSMartin Matuska /* The starting RAIDZ (parent) vdev sector of the row. */
787e716630dSMartin Matuska uint64_t b = (offset >> ashift) + row * logical_cols;
788e716630dSMartin Matuska
789e716630dSMartin Matuska /*
790e716630dSMartin Matuska * If we are in the middle of a reflow, and the copying has
791e716630dSMartin Matuska * not yet completed for any part of this row, then use the
792e716630dSMartin Matuska * old location of this row. Note that reflow_offset_synced
793e716630dSMartin Matuska * reflects the i/o that's been completed, because it's
794e716630dSMartin Matuska * updated by a synctask, after zio_wait(spa_txg_zio[]).
795e716630dSMartin Matuska * This is sufficient for our check, even if that progress
796e716630dSMartin Matuska * has not yet been recorded to disk (reflected in
797e716630dSMartin Matuska * spa_ubsync). Also note that we consider the last row to
798e716630dSMartin Matuska * be "full width" (`cols`-wide rather than `bc`-wide) for
799e716630dSMartin Matuska * this calculation. This causes a tiny bit of unnecessary
800e716630dSMartin Matuska * double-writes but is safe and simpler to calculate.
801e716630dSMartin Matuska */
802e716630dSMartin Matuska int row_phys_cols = physical_cols;
803e716630dSMartin Matuska if (b + cols > reflow_offset_synced >> ashift)
804e716630dSMartin Matuska row_phys_cols--;
805e716630dSMartin Matuska else if (use_scratch)
806e716630dSMartin Matuska row_use_scratch = B_TRUE;
807e716630dSMartin Matuska
808e716630dSMartin Matuska /* starting child of this row */
809e716630dSMartin Matuska uint64_t child_id = b % row_phys_cols;
810e716630dSMartin Matuska /* The starting byte offset on each child vdev. */
811e716630dSMartin Matuska uint64_t child_offset = (b / row_phys_cols) << ashift;
812e716630dSMartin Matuska
813e716630dSMartin Matuska /*
814e716630dSMartin Matuska * Note, rr_cols is the entire width of the block, even
815e716630dSMartin Matuska * if this row is shorter. This is needed because parity
816e716630dSMartin Matuska * generation (for Q and R) needs to know the entire width,
817e716630dSMartin Matuska * because it treats the short row as though it was
818e716630dSMartin Matuska * full-width (and the "phantom" sectors were zero-filled).
819e716630dSMartin Matuska *
820e716630dSMartin Matuska * Another approach to this would be to set cols shorter
821e716630dSMartin Matuska * (to just the number of columns that we might do i/o to)
822e716630dSMartin Matuska * and have another mechanism to tell the parity generation
823e716630dSMartin Matuska * about the "entire width". Reconstruction (at least
824e716630dSMartin Matuska * vdev_raidz_reconstruct_general()) would also need to
825e716630dSMartin Matuska * know about the "entire width".
826e716630dSMartin Matuska */
827e716630dSMartin Matuska rr->rr_firstdatacol = nparity;
828e716630dSMartin Matuska #ifdef ZFS_DEBUG
829e716630dSMartin Matuska /*
830e716630dSMartin Matuska * note: rr_size is PSIZE, not ASIZE
831e716630dSMartin Matuska */
832e716630dSMartin Matuska rr->rr_offset = b << ashift;
833e716630dSMartin Matuska rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift;
834e716630dSMartin Matuska #endif
835e716630dSMartin Matuska
836e716630dSMartin Matuska for (int c = 0; c < rr->rr_cols; c++, child_id++) {
837e716630dSMartin Matuska if (child_id >= row_phys_cols) {
838e716630dSMartin Matuska child_id -= row_phys_cols;
839e716630dSMartin Matuska child_offset += 1ULL << ashift;
840e716630dSMartin Matuska }
841e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c];
842e716630dSMartin Matuska rc->rc_devidx = child_id;
843e716630dSMartin Matuska rc->rc_offset = child_offset;
844e716630dSMartin Matuska
845e716630dSMartin Matuska /*
846e716630dSMartin Matuska * Get this from the scratch space if appropriate.
847e716630dSMartin Matuska * This only happens if we crashed in the middle of
848e716630dSMartin Matuska * raidz_reflow_scratch_sync() (while it's running,
849e716630dSMartin Matuska * the rangelock prevents us from doing concurrent
850e716630dSMartin Matuska * io), and even then only during zpool import or
851e716630dSMartin Matuska * when the pool is imported readonly.
852e716630dSMartin Matuska */
853e716630dSMartin Matuska if (row_use_scratch)
854e716630dSMartin Matuska rc->rc_offset -= VDEV_BOOT_SIZE;
855e716630dSMartin Matuska
856e716630dSMartin Matuska uint64_t dc = c - rr->rr_firstdatacol;
857e716630dSMartin Matuska if (c < rr->rr_firstdatacol) {
858e716630dSMartin Matuska rc->rc_size = 1ULL << ashift;
859e716630dSMartin Matuska
860e716630dSMartin Matuska /*
861e716630dSMartin Matuska * Parity sectors' rc_abd's are set below
862e716630dSMartin Matuska * after determining if this is an aggregation.
863e716630dSMartin Matuska */
864e716630dSMartin Matuska } else if (row == rows - 1 && bc != 0 && c >= bc) {
865e716630dSMartin Matuska /*
866e716630dSMartin Matuska * Past the end of the block (even including
867e716630dSMartin Matuska * skip sectors). This sector is part of the
868e716630dSMartin Matuska * map so that we have full rows for p/q parity
869e716630dSMartin Matuska * generation.
870e716630dSMartin Matuska */
871e716630dSMartin Matuska rc->rc_size = 0;
872e716630dSMartin Matuska rc->rc_abd = NULL;
873e716630dSMartin Matuska } else {
874e716630dSMartin Matuska /* "data column" (col excluding parity) */
875e716630dSMartin Matuska uint64_t off;
876e716630dSMartin Matuska
877e716630dSMartin Matuska if (c < bc || r == 0) {
878e716630dSMartin Matuska off = dc * rows + row;
879e716630dSMartin Matuska } else {
880e716630dSMartin Matuska off = r * rows +
881e716630dSMartin Matuska (dc - r) * (rows - 1) + row;
882e716630dSMartin Matuska }
883e716630dSMartin Matuska rc->rc_size = 1ULL << ashift;
884e716630dSMartin Matuska rc->rc_abd = abd_get_offset_struct(
885e716630dSMartin Matuska &rc->rc_abdstruct, abd, off << ashift,
886e716630dSMartin Matuska rc->rc_size);
887e716630dSMartin Matuska }
888e716630dSMartin Matuska
889e716630dSMartin Matuska if (rc->rc_size == 0)
890e716630dSMartin Matuska continue;
891e716630dSMartin Matuska
892e716630dSMartin Matuska /*
893e716630dSMartin Matuska * If any part of this row is in both old and new
894e716630dSMartin Matuska * locations, the primary location is the old
895e716630dSMartin Matuska * location. If this sector was already copied to the
896e716630dSMartin Matuska * new location, we need to also write to the new,
897e716630dSMartin Matuska * "shadow" location.
898e716630dSMartin Matuska *
899e716630dSMartin Matuska * Note, `row_phys_cols != physical_cols` indicates
900e716630dSMartin Matuska * that the primary location is the old location.
901e716630dSMartin Matuska * `b+c < reflow_offset_next` indicates that the copy
902e716630dSMartin Matuska * to the new location has been initiated. We know
903e716630dSMartin Matuska * that the copy has completed because we have the
904e716630dSMartin Matuska * rangelock, which is held exclusively while the
905e716630dSMartin Matuska * copy is in progress.
906e716630dSMartin Matuska */
907e716630dSMartin Matuska if (row_use_scratch ||
908e716630dSMartin Matuska (row_phys_cols != physical_cols &&
909e716630dSMartin Matuska b + c < reflow_offset_next >> ashift)) {
910e716630dSMartin Matuska rc->rc_shadow_devidx = (b + c) % physical_cols;
911e716630dSMartin Matuska rc->rc_shadow_offset =
912e716630dSMartin Matuska ((b + c) / physical_cols) << ashift;
913e716630dSMartin Matuska if (row_use_scratch)
914e716630dSMartin Matuska rc->rc_shadow_offset -= VDEV_BOOT_SIZE;
915e716630dSMartin Matuska }
916e716630dSMartin Matuska
917e716630dSMartin Matuska asize += rc->rc_size;
918e716630dSMartin Matuska }
919e716630dSMartin Matuska
920e716630dSMartin Matuska /*
921e716630dSMartin Matuska * See comment in vdev_raidz_map_alloc()
922e716630dSMartin Matuska */
923e716630dSMartin Matuska if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
924e716630dSMartin Matuska (offset & (1ULL << 20))) {
925e716630dSMartin Matuska ASSERT(rr->rr_cols >= 2);
926e716630dSMartin Matuska ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
927e716630dSMartin Matuska
928e716630dSMartin Matuska int devidx0 = rr->rr_col[0].rc_devidx;
929e716630dSMartin Matuska uint64_t offset0 = rr->rr_col[0].rc_offset;
930e716630dSMartin Matuska int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx;
931e716630dSMartin Matuska uint64_t shadow_offset0 =
932e716630dSMartin Matuska rr->rr_col[0].rc_shadow_offset;
933e716630dSMartin Matuska
934e716630dSMartin Matuska rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
935e716630dSMartin Matuska rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
936e716630dSMartin Matuska rr->rr_col[0].rc_shadow_devidx =
937e716630dSMartin Matuska rr->rr_col[1].rc_shadow_devidx;
938e716630dSMartin Matuska rr->rr_col[0].rc_shadow_offset =
939e716630dSMartin Matuska rr->rr_col[1].rc_shadow_offset;
940e716630dSMartin Matuska
941e716630dSMartin Matuska rr->rr_col[1].rc_devidx = devidx0;
942e716630dSMartin Matuska rr->rr_col[1].rc_offset = offset0;
943e716630dSMartin Matuska rr->rr_col[1].rc_shadow_devidx = shadow_devidx0;
944e716630dSMartin Matuska rr->rr_col[1].rc_shadow_offset = shadow_offset0;
945e716630dSMartin Matuska }
946e716630dSMartin Matuska }
947e716630dSMartin Matuska ASSERT3U(asize, ==, tot << ashift);
948e716630dSMartin Matuska
949e716630dSMartin Matuska /*
950e716630dSMartin Matuska * Determine if the block is contiguous, in which case we can use
951e716630dSMartin Matuska * an aggregation.
952e716630dSMartin Matuska */
953e716630dSMartin Matuska if (rows >= raidz_io_aggregate_rows) {
954e716630dSMartin Matuska rm->rm_nphys_cols = physical_cols;
955e716630dSMartin Matuska rm->rm_phys_col =
956e716630dSMartin Matuska kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols,
957e716630dSMartin Matuska KM_SLEEP);
958e716630dSMartin Matuska
959e716630dSMartin Matuska /*
960e716630dSMartin Matuska * Determine the aggregate io's offset and size, and check
961e716630dSMartin Matuska * that the io is contiguous.
962e716630dSMartin Matuska */
963e716630dSMartin Matuska for (int i = 0;
964e716630dSMartin Matuska i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) {
965e716630dSMartin Matuska raidz_row_t *rr = rm->rm_row[i];
966e716630dSMartin Matuska for (int c = 0; c < rr->rr_cols; c++) {
967e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c];
968e716630dSMartin Matuska raidz_col_t *prc =
969e716630dSMartin Matuska &rm->rm_phys_col[rc->rc_devidx];
970e716630dSMartin Matuska
971e716630dSMartin Matuska if (rc->rc_size == 0)
972e716630dSMartin Matuska continue;
973e716630dSMartin Matuska
974e716630dSMartin Matuska if (prc->rc_size == 0) {
975e716630dSMartin Matuska ASSERT0(prc->rc_offset);
976e716630dSMartin Matuska prc->rc_offset = rc->rc_offset;
977e716630dSMartin Matuska } else if (prc->rc_offset + prc->rc_size !=
978e716630dSMartin Matuska rc->rc_offset) {
979e716630dSMartin Matuska /*
980e716630dSMartin Matuska * This block is not contiguous and
981e716630dSMartin Matuska * therefore can't be aggregated.
982e716630dSMartin Matuska * This is expected to be rare, so
983e716630dSMartin Matuska * the cost of allocating and then
984e716630dSMartin Matuska * freeing rm_phys_col is not
985e716630dSMartin Matuska * significant.
986e716630dSMartin Matuska */
987e716630dSMartin Matuska kmem_free(rm->rm_phys_col,
988e716630dSMartin Matuska sizeof (raidz_col_t) *
989e716630dSMartin Matuska rm->rm_nphys_cols);
990e716630dSMartin Matuska rm->rm_phys_col = NULL;
991e716630dSMartin Matuska rm->rm_nphys_cols = 0;
992e716630dSMartin Matuska break;
993e716630dSMartin Matuska }
994e716630dSMartin Matuska prc->rc_size += rc->rc_size;
995e716630dSMartin Matuska }
996e716630dSMartin Matuska }
997e716630dSMartin Matuska }
998e716630dSMartin Matuska if (rm->rm_phys_col != NULL) {
999e716630dSMartin Matuska /*
1000e716630dSMartin Matuska * Allocate aggregate ABD's.
1001e716630dSMartin Matuska */
1002e716630dSMartin Matuska for (int i = 0; i < rm->rm_nphys_cols; i++) {
1003e716630dSMartin Matuska raidz_col_t *prc = &rm->rm_phys_col[i];
1004e716630dSMartin Matuska
1005e716630dSMartin Matuska prc->rc_devidx = i;
1006e716630dSMartin Matuska
1007e716630dSMartin Matuska if (prc->rc_size == 0)
1008e716630dSMartin Matuska continue;
1009e716630dSMartin Matuska
1010e716630dSMartin Matuska prc->rc_abd =
1011e716630dSMartin Matuska abd_alloc_linear(rm->rm_phys_col[i].rc_size,
1012e716630dSMartin Matuska B_FALSE);
1013e716630dSMartin Matuska }
1014e716630dSMartin Matuska
1015e716630dSMartin Matuska /*
1016e716630dSMartin Matuska * Point the parity abd's into the aggregate abd's.
1017e716630dSMartin Matuska */
1018e716630dSMartin Matuska for (int i = 0; i < rm->rm_nrows; i++) {
1019e716630dSMartin Matuska raidz_row_t *rr = rm->rm_row[i];
1020e716630dSMartin Matuska for (int c = 0; c < rr->rr_firstdatacol; c++) {
1021e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c];
1022e716630dSMartin Matuska raidz_col_t *prc =
1023e716630dSMartin Matuska &rm->rm_phys_col[rc->rc_devidx];
1024e716630dSMartin Matuska rc->rc_abd =
1025e716630dSMartin Matuska abd_get_offset_struct(&rc->rc_abdstruct,
1026e716630dSMartin Matuska prc->rc_abd,
1027e716630dSMartin Matuska rc->rc_offset - prc->rc_offset,
1028e716630dSMartin Matuska rc->rc_size);
1029e716630dSMartin Matuska }
1030e716630dSMartin Matuska }
1031e716630dSMartin Matuska } else {
1032e716630dSMartin Matuska /*
1033e716630dSMartin Matuska * Allocate new abd's for the parity sectors.
1034e716630dSMartin Matuska */
1035e716630dSMartin Matuska for (int i = 0; i < rm->rm_nrows; i++) {
1036e716630dSMartin Matuska raidz_row_t *rr = rm->rm_row[i];
1037e716630dSMartin Matuska for (int c = 0; c < rr->rr_firstdatacol; c++) {
1038e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c];
1039e716630dSMartin Matuska rc->rc_abd =
1040e716630dSMartin Matuska abd_alloc_linear(rc->rc_size,
1041e716630dSMartin Matuska B_TRUE);
1042e716630dSMartin Matuska }
1043e716630dSMartin Matuska }
1044e716630dSMartin Matuska }
1045eda14cbcSMatt Macy /* init RAIDZ parity ops */
1046eda14cbcSMatt Macy rm->rm_ops = vdev_raidz_math_get_ops();
1047eda14cbcSMatt Macy
1048eda14cbcSMatt Macy return (rm);
1049eda14cbcSMatt Macy }
1050eda14cbcSMatt Macy
/*
 * Cursor state handed to the vdev_raidz_p*_func() callbacks through their
 * `private` argument.  Each non-NULL pointer marks the next 64-bit word of
 * the corresponding parity buffer to update; the callbacks advance the
 * pointers as they consume source data.
 */
struct pqr_struct {
	uint64_t *p;	/* next word of P parity */
	uint64_t *q;	/* next word of Q parity, NULL when unused */
	uint64_t *r;	/* next word of R parity, NULL when unused */
};
1056eda14cbcSMatt Macy
1057eda14cbcSMatt Macy static int
vdev_raidz_p_func(void * buf,size_t size,void * private)1058eda14cbcSMatt Macy vdev_raidz_p_func(void *buf, size_t size, void *private)
1059eda14cbcSMatt Macy {
1060eda14cbcSMatt Macy struct pqr_struct *pqr = private;
1061eda14cbcSMatt Macy const uint64_t *src = buf;
1062e716630dSMartin Matuska int cnt = size / sizeof (src[0]);
1063eda14cbcSMatt Macy
1064eda14cbcSMatt Macy ASSERT(pqr->p && !pqr->q && !pqr->r);
1065eda14cbcSMatt Macy
1066e716630dSMartin Matuska for (int i = 0; i < cnt; i++, src++, pqr->p++)
1067eda14cbcSMatt Macy *pqr->p ^= *src;
1068eda14cbcSMatt Macy
1069eda14cbcSMatt Macy return (0);
1070eda14cbcSMatt Macy }
1071eda14cbcSMatt Macy
1072eda14cbcSMatt Macy static int
vdev_raidz_pq_func(void * buf,size_t size,void * private)1073eda14cbcSMatt Macy vdev_raidz_pq_func(void *buf, size_t size, void *private)
1074eda14cbcSMatt Macy {
1075eda14cbcSMatt Macy struct pqr_struct *pqr = private;
1076eda14cbcSMatt Macy const uint64_t *src = buf;
1077eda14cbcSMatt Macy uint64_t mask;
1078e716630dSMartin Matuska int cnt = size / sizeof (src[0]);
1079eda14cbcSMatt Macy
1080eda14cbcSMatt Macy ASSERT(pqr->p && pqr->q && !pqr->r);
1081eda14cbcSMatt Macy
1082e716630dSMartin Matuska for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
1083eda14cbcSMatt Macy *pqr->p ^= *src;
1084eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
1085eda14cbcSMatt Macy *pqr->q ^= *src;
1086eda14cbcSMatt Macy }
1087eda14cbcSMatt Macy
1088eda14cbcSMatt Macy return (0);
1089eda14cbcSMatt Macy }
1090eda14cbcSMatt Macy
1091eda14cbcSMatt Macy static int
vdev_raidz_pqr_func(void * buf,size_t size,void * private)1092eda14cbcSMatt Macy vdev_raidz_pqr_func(void *buf, size_t size, void *private)
1093eda14cbcSMatt Macy {
1094eda14cbcSMatt Macy struct pqr_struct *pqr = private;
1095eda14cbcSMatt Macy const uint64_t *src = buf;
1096eda14cbcSMatt Macy uint64_t mask;
1097e716630dSMartin Matuska int cnt = size / sizeof (src[0]);
1098eda14cbcSMatt Macy
1099eda14cbcSMatt Macy ASSERT(pqr->p && pqr->q && pqr->r);
1100eda14cbcSMatt Macy
1101e716630dSMartin Matuska for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
1102eda14cbcSMatt Macy *pqr->p ^= *src;
1103eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
1104eda14cbcSMatt Macy *pqr->q ^= *src;
1105eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
1106eda14cbcSMatt Macy *pqr->r ^= *src;
1107eda14cbcSMatt Macy }
1108eda14cbcSMatt Macy
1109eda14cbcSMatt Macy return (0);
1110eda14cbcSMatt Macy }
1111eda14cbcSMatt Macy
1112eda14cbcSMatt Macy static void
vdev_raidz_generate_parity_p(raidz_row_t * rr)11137877fdebSMatt Macy vdev_raidz_generate_parity_p(raidz_row_t *rr)
1114eda14cbcSMatt Macy {
11157877fdebSMatt Macy uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1116eda14cbcSMatt Macy
11177877fdebSMatt Macy for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
11187877fdebSMatt Macy abd_t *src = rr->rr_col[c].rc_abd;
1119eda14cbcSMatt Macy
11207877fdebSMatt Macy if (c == rr->rr_firstdatacol) {
11217877fdebSMatt Macy abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1122eda14cbcSMatt Macy } else {
1123eda14cbcSMatt Macy struct pqr_struct pqr = { p, NULL, NULL };
11247877fdebSMatt Macy (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1125eda14cbcSMatt Macy vdev_raidz_p_func, &pqr);
1126eda14cbcSMatt Macy }
1127eda14cbcSMatt Macy }
1128eda14cbcSMatt Macy }
1129eda14cbcSMatt Macy
1130eda14cbcSMatt Macy static void
vdev_raidz_generate_parity_pq(raidz_row_t * rr)11317877fdebSMatt Macy vdev_raidz_generate_parity_pq(raidz_row_t *rr)
1132eda14cbcSMatt Macy {
11337877fdebSMatt Macy uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
11347877fdebSMatt Macy uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
11357877fdebSMatt Macy uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
11367877fdebSMatt Macy ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
11377877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1138eda14cbcSMatt Macy
11397877fdebSMatt Macy for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
11407877fdebSMatt Macy abd_t *src = rr->rr_col[c].rc_abd;
1141eda14cbcSMatt Macy
11427877fdebSMatt Macy uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
1143eda14cbcSMatt Macy
11447877fdebSMatt Macy if (c == rr->rr_firstdatacol) {
1145eda14cbcSMatt Macy ASSERT(ccnt == pcnt || ccnt == 0);
11467877fdebSMatt Macy abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
11477877fdebSMatt Macy (void) memcpy(q, p, rr->rr_col[c].rc_size);
1148eda14cbcSMatt Macy
11497877fdebSMatt Macy for (uint64_t i = ccnt; i < pcnt; i++) {
1150eda14cbcSMatt Macy p[i] = 0;
1151eda14cbcSMatt Macy q[i] = 0;
1152eda14cbcSMatt Macy }
1153eda14cbcSMatt Macy } else {
1154eda14cbcSMatt Macy struct pqr_struct pqr = { p, q, NULL };
1155eda14cbcSMatt Macy
1156eda14cbcSMatt Macy ASSERT(ccnt <= pcnt);
11577877fdebSMatt Macy (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1158eda14cbcSMatt Macy vdev_raidz_pq_func, &pqr);
1159eda14cbcSMatt Macy
1160eda14cbcSMatt Macy /*
1161eda14cbcSMatt Macy * Treat short columns as though they are full of 0s.
1162eda14cbcSMatt Macy * Note that there's therefore nothing needed for P.
1163eda14cbcSMatt Macy */
11647877fdebSMatt Macy uint64_t mask;
11657877fdebSMatt Macy for (uint64_t i = ccnt; i < pcnt; i++) {
1166eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(q[i], mask);
1167eda14cbcSMatt Macy }
1168eda14cbcSMatt Macy }
1169eda14cbcSMatt Macy }
1170eda14cbcSMatt Macy }
1171eda14cbcSMatt Macy
1172eda14cbcSMatt Macy static void
vdev_raidz_generate_parity_pqr(raidz_row_t * rr)11737877fdebSMatt Macy vdev_raidz_generate_parity_pqr(raidz_row_t *rr)
1174eda14cbcSMatt Macy {
11757877fdebSMatt Macy uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
11767877fdebSMatt Macy uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
11777877fdebSMatt Macy uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd);
11787877fdebSMatt Macy uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
11797877fdebSMatt Macy ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
11807877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_Q].rc_size);
11817877fdebSMatt Macy ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
11827877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_R].rc_size);
1183eda14cbcSMatt Macy
11847877fdebSMatt Macy for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
11857877fdebSMatt Macy abd_t *src = rr->rr_col[c].rc_abd;
1186eda14cbcSMatt Macy
11877877fdebSMatt Macy uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
1188eda14cbcSMatt Macy
11897877fdebSMatt Macy if (c == rr->rr_firstdatacol) {
1190eda14cbcSMatt Macy ASSERT(ccnt == pcnt || ccnt == 0);
11917877fdebSMatt Macy abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
11927877fdebSMatt Macy (void) memcpy(q, p, rr->rr_col[c].rc_size);
11937877fdebSMatt Macy (void) memcpy(r, p, rr->rr_col[c].rc_size);
1194eda14cbcSMatt Macy
11957877fdebSMatt Macy for (uint64_t i = ccnt; i < pcnt; i++) {
1196eda14cbcSMatt Macy p[i] = 0;
1197eda14cbcSMatt Macy q[i] = 0;
1198eda14cbcSMatt Macy r[i] = 0;
1199eda14cbcSMatt Macy }
1200eda14cbcSMatt Macy } else {
1201eda14cbcSMatt Macy struct pqr_struct pqr = { p, q, r };
1202eda14cbcSMatt Macy
1203eda14cbcSMatt Macy ASSERT(ccnt <= pcnt);
12047877fdebSMatt Macy (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1205eda14cbcSMatt Macy vdev_raidz_pqr_func, &pqr);
1206eda14cbcSMatt Macy
1207eda14cbcSMatt Macy /*
1208eda14cbcSMatt Macy * Treat short columns as though they are full of 0s.
1209eda14cbcSMatt Macy * Note that there's therefore nothing needed for P.
1210eda14cbcSMatt Macy */
12117877fdebSMatt Macy uint64_t mask;
12127877fdebSMatt Macy for (uint64_t i = ccnt; i < pcnt; i++) {
1213eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(q[i], mask);
1214eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_4(r[i], mask);
1215eda14cbcSMatt Macy }
1216eda14cbcSMatt Macy }
1217eda14cbcSMatt Macy }
1218eda14cbcSMatt Macy }
1219eda14cbcSMatt Macy
1220eda14cbcSMatt Macy /*
1221eda14cbcSMatt Macy * Generate RAID parity in the first virtual columns according to the number of
1222eda14cbcSMatt Macy * parity columns available.
1223eda14cbcSMatt Macy */
1224eda14cbcSMatt Macy void
vdev_raidz_generate_parity_row(raidz_map_t * rm,raidz_row_t * rr)12257877fdebSMatt Macy vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr)
1226eda14cbcSMatt Macy {
1227e716630dSMartin Matuska if (rr->rr_cols == 0) {
1228e716630dSMartin Matuska /*
1229e716630dSMartin Matuska * We are handling this block one row at a time (because
1230e716630dSMartin Matuska * this block has a different logical vs physical width,
1231e716630dSMartin Matuska * due to RAIDZ expansion), and this is a pad-only row,
1232e716630dSMartin Matuska * which has no parity.
1233e716630dSMartin Matuska */
1234e716630dSMartin Matuska return;
1235e716630dSMartin Matuska }
12367877fdebSMatt Macy
1237eda14cbcSMatt Macy /* Generate using the new math implementation */
12387877fdebSMatt Macy if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL)
1239eda14cbcSMatt Macy return;
1240eda14cbcSMatt Macy
12417877fdebSMatt Macy switch (rr->rr_firstdatacol) {
1242eda14cbcSMatt Macy case 1:
12437877fdebSMatt Macy vdev_raidz_generate_parity_p(rr);
1244eda14cbcSMatt Macy break;
1245eda14cbcSMatt Macy case 2:
12467877fdebSMatt Macy vdev_raidz_generate_parity_pq(rr);
1247eda14cbcSMatt Macy break;
1248eda14cbcSMatt Macy case 3:
12497877fdebSMatt Macy vdev_raidz_generate_parity_pqr(rr);
1250eda14cbcSMatt Macy break;
1251eda14cbcSMatt Macy default:
1252eda14cbcSMatt Macy cmn_err(CE_PANIC, "invalid RAID-Z configuration");
1253eda14cbcSMatt Macy }
1254eda14cbcSMatt Macy }
1255eda14cbcSMatt Macy
12567877fdebSMatt Macy void
vdev_raidz_generate_parity(raidz_map_t * rm)12577877fdebSMatt Macy vdev_raidz_generate_parity(raidz_map_t *rm)
12587877fdebSMatt Macy {
12597877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) {
12607877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i];
12617877fdebSMatt Macy vdev_raidz_generate_parity_row(rm, rr);
12627877fdebSMatt Macy }
12637877fdebSMatt Macy }
12647877fdebSMatt Macy
/*
 * abd_iterate_func2() callback used by vdev_raidz_reconstruct_p().
 *
 * XORs the chunk at sbuf into the chunk at dbuf, one 64-bit word at a
 * time.  Only whole words are processed; any trailing bytes of size that
 * do not fill a 64-bit word are left untouched.  sbuf is only read.
 * private is unused.
 *
 * Always returns 0.
 */
static int
vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
{
	(void) private;
	uint64_t *dst = dbuf;
	const uint64_t *src = sbuf;
	/* Keep the count in size_t: narrowing it to int could truncate. */
	size_t nwords = size / sizeof (src[0]);

	for (size_t w = 0; w < nwords; w++)
		dst[w] ^= src[w];

	return (0);
}
1279eda14cbcSMatt Macy
/*
 * abd_iterate_func2() callback used by vdev_raidz_reconstruct_q() to fold
 * one surviving data column into the column being rebuilt.
 *
 * For each whole 64-bit word, the destination word is first passed
 * through VDEV_RAIDZ_64MUL_2() (defined elsewhere; by name, a
 * multiply-by-2 of the packed bytes) and then XORed with the matching
 * source word.  sbuf is only read.  private is unused.
 *
 * Always returns 0.
 */
static int
vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
    void *private)
{
	(void) private;
	uint64_t *dst = dbuf;
	const uint64_t *src = sbuf;
	uint64_t mask;	/* scratch operand required by the MUL macro */
	/* Keep the count in size_t: narrowing it to int could truncate. */
	size_t nwords = size / sizeof (dst[0]);

	for (size_t w = 0; w < nwords; w++) {
		VDEV_RAIDZ_64MUL_2(dst[w], mask);
		dst[w] ^= src[w];
	}

	return (0);
}
1297eda14cbcSMatt Macy
/*
 * abd_iterate_func() callback used by vdev_raidz_reconstruct_q() for the
 * part of the target column that extends past a shorter source column.
 *
 * Applies the same VDEV_RAIDZ_64MUL_2() step to each whole 64-bit word
 * of buf as vdev_raidz_reconst_q_pre_func() applies to its destination,
 * but without the XOR, i.e. as if the source word were zero.  private is
 * unused.
 *
 * Always returns 0.
 */
static int
vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
{
	(void) private;
	uint64_t *dst = buf;
	uint64_t mask;	/* scratch operand required by the MUL macro */
	/* Keep the count in size_t: narrowing it to int could truncate. */
	size_t nwords = size / sizeof (dst[0]);

	for (size_t w = 0; w < nwords; w++) {
		VDEV_RAIDZ_64MUL_2(dst[w], mask);
	}

	return (0);
}
1313eda14cbcSMatt Macy
/*
 * State handed to vdev_raidz_reconst_q_post_func() through the private
 * argument of abd_iterate_func().
 */
struct reconst_q_struct {
	uint64_t *q;	/* next Q parity word to consume; advanced by the cb */
	int exp;	/* exponent passed to vdev_raidz_exp2() for each byte */
};
1318eda14cbcSMatt Macy
1319eda14cbcSMatt Macy static int
vdev_raidz_reconst_q_post_func(void * buf,size_t size,void * private)1320eda14cbcSMatt Macy vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
1321eda14cbcSMatt Macy {
1322eda14cbcSMatt Macy struct reconst_q_struct *rq = private;
1323eda14cbcSMatt Macy uint64_t *dst = buf;
1324eda14cbcSMatt Macy int cnt = size / sizeof (dst[0]);
1325eda14cbcSMatt Macy
1326eda14cbcSMatt Macy for (int i = 0; i < cnt; i++, dst++, rq->q++) {
1327eda14cbcSMatt Macy int j;
1328eda14cbcSMatt Macy uint8_t *b;
1329eda14cbcSMatt Macy
1330eda14cbcSMatt Macy *dst ^= *rq->q;
1331eda14cbcSMatt Macy for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
1332eda14cbcSMatt Macy *b = vdev_raidz_exp2(*b, rq->exp);
1333eda14cbcSMatt Macy }
1334eda14cbcSMatt Macy }
1335eda14cbcSMatt Macy
1336eda14cbcSMatt Macy return (0);
1337eda14cbcSMatt Macy }
1338eda14cbcSMatt Macy
/*
 * State handed to vdev_raidz_reconst_pq_func() and
 * vdev_raidz_reconst_pq_tail_func() through the private argument of the
 * abd iterators.  The four byte cursors are advanced in lock step by the
 * callbacks, one byte per byte reconstructed.
 */
struct reconst_pq_struct {
	uint8_t *p;	/* saved (real) P parity */
	uint8_t *q;	/* saved (real) Q parity */
	uint8_t *pxy;	/* P computed with columns x and y taken as zero */
	uint8_t *qxy;	/* Q computed with columns x and y taken as zero */
	int aexp;	/* exponent applied to (P ^ Pxy) */
	int bexp;	/* exponent applied to (Q ^ Qxy) */
};
1347eda14cbcSMatt Macy
1348eda14cbcSMatt Macy static int
vdev_raidz_reconst_pq_func(void * xbuf,void * ybuf,size_t size,void * private)1349eda14cbcSMatt Macy vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
1350eda14cbcSMatt Macy {
1351eda14cbcSMatt Macy struct reconst_pq_struct *rpq = private;
1352eda14cbcSMatt Macy uint8_t *xd = xbuf;
1353eda14cbcSMatt Macy uint8_t *yd = ybuf;
1354eda14cbcSMatt Macy
1355eda14cbcSMatt Macy for (int i = 0; i < size;
1356eda14cbcSMatt Macy i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
1357eda14cbcSMatt Macy *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
1358eda14cbcSMatt Macy vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
1359eda14cbcSMatt Macy *yd = *rpq->p ^ *rpq->pxy ^ *xd;
1360eda14cbcSMatt Macy }
1361eda14cbcSMatt Macy
1362eda14cbcSMatt Macy return (0);
1363eda14cbcSMatt Macy }
1364eda14cbcSMatt Macy
1365eda14cbcSMatt Macy static int
vdev_raidz_reconst_pq_tail_func(void * xbuf,size_t size,void * private)1366eda14cbcSMatt Macy vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
1367eda14cbcSMatt Macy {
1368eda14cbcSMatt Macy struct reconst_pq_struct *rpq = private;
1369eda14cbcSMatt Macy uint8_t *xd = xbuf;
1370eda14cbcSMatt Macy
1371eda14cbcSMatt Macy for (int i = 0; i < size;
1372eda14cbcSMatt Macy i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
1373eda14cbcSMatt Macy /* same operation as vdev_raidz_reconst_pq_func() on xd */
1374eda14cbcSMatt Macy *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
1375eda14cbcSMatt Macy vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
1376eda14cbcSMatt Macy }
1377eda14cbcSMatt Macy
1378eda14cbcSMatt Macy return (0);
1379eda14cbcSMatt Macy }
1380eda14cbcSMatt Macy
1381f9693befSMartin Matuska static void
vdev_raidz_reconstruct_p(raidz_row_t * rr,int * tgts,int ntgts)13827877fdebSMatt Macy vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts)
1383eda14cbcSMatt Macy {
1384eda14cbcSMatt Macy int x = tgts[0];
1385eda14cbcSMatt Macy abd_t *dst, *src;
1386eda14cbcSMatt Macy
1387e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1388e716630dSMartin Matuska zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x);
1389e716630dSMartin Matuska
13907877fdebSMatt Macy ASSERT3U(ntgts, ==, 1);
13917877fdebSMatt Macy ASSERT3U(x, >=, rr->rr_firstdatacol);
13927877fdebSMatt Macy ASSERT3U(x, <, rr->rr_cols);
1393eda14cbcSMatt Macy
13947877fdebSMatt Macy ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size);
1395eda14cbcSMatt Macy
13967877fdebSMatt Macy src = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
13977877fdebSMatt Macy dst = rr->rr_col[x].rc_abd;
1398eda14cbcSMatt Macy
13997877fdebSMatt Macy abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size);
1400eda14cbcSMatt Macy
14017877fdebSMatt Macy for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
14027877fdebSMatt Macy uint64_t size = MIN(rr->rr_col[x].rc_size,
14037877fdebSMatt Macy rr->rr_col[c].rc_size);
1404eda14cbcSMatt Macy
14057877fdebSMatt Macy src = rr->rr_col[c].rc_abd;
1406eda14cbcSMatt Macy
1407eda14cbcSMatt Macy if (c == x)
1408eda14cbcSMatt Macy continue;
1409eda14cbcSMatt Macy
1410eda14cbcSMatt Macy (void) abd_iterate_func2(dst, src, 0, 0, size,
1411eda14cbcSMatt Macy vdev_raidz_reconst_p_func, NULL);
1412eda14cbcSMatt Macy }
1413eda14cbcSMatt Macy }
1414eda14cbcSMatt Macy
1415f9693befSMartin Matuska static void
vdev_raidz_reconstruct_q(raidz_row_t * rr,int * tgts,int ntgts)14167877fdebSMatt Macy vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
1417eda14cbcSMatt Macy {
1418eda14cbcSMatt Macy int x = tgts[0];
1419eda14cbcSMatt Macy int c, exp;
1420eda14cbcSMatt Macy abd_t *dst, *src;
1421eda14cbcSMatt Macy
1422e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1423e716630dSMartin Matuska zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x);
1424e716630dSMartin Matuska
1425eda14cbcSMatt Macy ASSERT(ntgts == 1);
1426eda14cbcSMatt Macy
14277877fdebSMatt Macy ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1428eda14cbcSMatt Macy
14297877fdebSMatt Macy for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
14307877fdebSMatt Macy uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size,
14317877fdebSMatt Macy rr->rr_col[c].rc_size);
1432eda14cbcSMatt Macy
14337877fdebSMatt Macy src = rr->rr_col[c].rc_abd;
14347877fdebSMatt Macy dst = rr->rr_col[x].rc_abd;
1435eda14cbcSMatt Macy
14367877fdebSMatt Macy if (c == rr->rr_firstdatacol) {
1437eda14cbcSMatt Macy abd_copy(dst, src, size);
14387877fdebSMatt Macy if (rr->rr_col[x].rc_size > size) {
1439eda14cbcSMatt Macy abd_zero_off(dst, size,
14407877fdebSMatt Macy rr->rr_col[x].rc_size - size);
14417877fdebSMatt Macy }
1442eda14cbcSMatt Macy } else {
14437877fdebSMatt Macy ASSERT3U(size, <=, rr->rr_col[x].rc_size);
1444eda14cbcSMatt Macy (void) abd_iterate_func2(dst, src, 0, 0, size,
1445eda14cbcSMatt Macy vdev_raidz_reconst_q_pre_func, NULL);
1446eda14cbcSMatt Macy (void) abd_iterate_func(dst,
14477877fdebSMatt Macy size, rr->rr_col[x].rc_size - size,
1448eda14cbcSMatt Macy vdev_raidz_reconst_q_pre_tail_func, NULL);
1449eda14cbcSMatt Macy }
1450eda14cbcSMatt Macy }
1451eda14cbcSMatt Macy
14527877fdebSMatt Macy src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
14537877fdebSMatt Macy dst = rr->rr_col[x].rc_abd;
14547877fdebSMatt Macy exp = 255 - (rr->rr_cols - 1 - x);
1455eda14cbcSMatt Macy
1456eda14cbcSMatt Macy struct reconst_q_struct rq = { abd_to_buf(src), exp };
14577877fdebSMatt Macy (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size,
1458eda14cbcSMatt Macy vdev_raidz_reconst_q_post_func, &rq);
1459eda14cbcSMatt Macy }
1460eda14cbcSMatt Macy
1461f9693befSMartin Matuska static void
vdev_raidz_reconstruct_pq(raidz_row_t * rr,int * tgts,int ntgts)14627877fdebSMatt Macy vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
1463eda14cbcSMatt Macy {
1464eda14cbcSMatt Macy uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
1465eda14cbcSMatt Macy abd_t *pdata, *qdata;
1466eda14cbcSMatt Macy uint64_t xsize, ysize;
1467eda14cbcSMatt Macy int x = tgts[0];
1468eda14cbcSMatt Macy int y = tgts[1];
1469eda14cbcSMatt Macy abd_t *xd, *yd;
1470eda14cbcSMatt Macy
1471e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1472e716630dSMartin Matuska zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y);
1473e716630dSMartin Matuska
1474eda14cbcSMatt Macy ASSERT(ntgts == 2);
1475eda14cbcSMatt Macy ASSERT(x < y);
14767877fdebSMatt Macy ASSERT(x >= rr->rr_firstdatacol);
14777877fdebSMatt Macy ASSERT(y < rr->rr_cols);
1478eda14cbcSMatt Macy
14797877fdebSMatt Macy ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size);
1480eda14cbcSMatt Macy
1481eda14cbcSMatt Macy /*
1482eda14cbcSMatt Macy * Move the parity data aside -- we're going to compute parity as
1483eda14cbcSMatt Macy * though columns x and y were full of zeros -- Pxy and Qxy. We want to
1484eda14cbcSMatt Macy * reuse the parity generation mechanism without trashing the actual
1485eda14cbcSMatt Macy * parity so we make those columns appear to be full of zeros by
1486eda14cbcSMatt Macy * setting their lengths to zero.
1487eda14cbcSMatt Macy */
14887877fdebSMatt Macy pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
14897877fdebSMatt Macy qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
14907877fdebSMatt Macy xsize = rr->rr_col[x].rc_size;
14917877fdebSMatt Macy ysize = rr->rr_col[y].rc_size;
1492eda14cbcSMatt Macy
14937877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_P].rc_abd =
14947877fdebSMatt Macy abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
14957877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_Q].rc_abd =
14967877fdebSMatt Macy abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
14977877fdebSMatt Macy rr->rr_col[x].rc_size = 0;
14987877fdebSMatt Macy rr->rr_col[y].rc_size = 0;
1499eda14cbcSMatt Macy
15007877fdebSMatt Macy vdev_raidz_generate_parity_pq(rr);
1501eda14cbcSMatt Macy
15027877fdebSMatt Macy rr->rr_col[x].rc_size = xsize;
15037877fdebSMatt Macy rr->rr_col[y].rc_size = ysize;
1504eda14cbcSMatt Macy
1505eda14cbcSMatt Macy p = abd_to_buf(pdata);
1506eda14cbcSMatt Macy q = abd_to_buf(qdata);
15077877fdebSMatt Macy pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
15087877fdebSMatt Macy qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
15097877fdebSMatt Macy xd = rr->rr_col[x].rc_abd;
15107877fdebSMatt Macy yd = rr->rr_col[y].rc_abd;
1511eda14cbcSMatt Macy
1512eda14cbcSMatt Macy /*
1513eda14cbcSMatt Macy * We now have:
1514eda14cbcSMatt Macy * Pxy = P + D_x + D_y
1515eda14cbcSMatt Macy * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
1516eda14cbcSMatt Macy *
1517eda14cbcSMatt Macy * We can then solve for D_x:
1518eda14cbcSMatt Macy * D_x = A * (P + Pxy) + B * (Q + Qxy)
1519eda14cbcSMatt Macy * where
1520eda14cbcSMatt Macy * A = 2^(x - y) * (2^(x - y) + 1)^-1
1521eda14cbcSMatt Macy * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
1522eda14cbcSMatt Macy *
1523eda14cbcSMatt Macy * With D_x in hand, we can easily solve for D_y:
1524eda14cbcSMatt Macy * D_y = P + Pxy + D_x
1525eda14cbcSMatt Macy */
1526eda14cbcSMatt Macy
1527eda14cbcSMatt Macy a = vdev_raidz_pow2[255 + x - y];
15287877fdebSMatt Macy b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)];
1529eda14cbcSMatt Macy tmp = 255 - vdev_raidz_log2[a ^ 1];
1530eda14cbcSMatt Macy
1531eda14cbcSMatt Macy aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
1532eda14cbcSMatt Macy bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
1533eda14cbcSMatt Macy
1534eda14cbcSMatt Macy ASSERT3U(xsize, >=, ysize);
1535eda14cbcSMatt Macy struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
1536eda14cbcSMatt Macy
1537eda14cbcSMatt Macy (void) abd_iterate_func2(xd, yd, 0, 0, ysize,
1538eda14cbcSMatt Macy vdev_raidz_reconst_pq_func, &rpq);
1539eda14cbcSMatt Macy (void) abd_iterate_func(xd, ysize, xsize - ysize,
1540eda14cbcSMatt Macy vdev_raidz_reconst_pq_tail_func, &rpq);
1541eda14cbcSMatt Macy
15427877fdebSMatt Macy abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
15437877fdebSMatt Macy abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1544eda14cbcSMatt Macy
1545eda14cbcSMatt Macy /*
1546eda14cbcSMatt Macy * Restore the saved parity data.
1547eda14cbcSMatt Macy */
15487877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata;
15497877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;
1550eda14cbcSMatt Macy }
1551eda14cbcSMatt Macy
1552eda14cbcSMatt Macy /*
1553eda14cbcSMatt Macy * In the general case of reconstruction, we must solve the system of linear
1554eda14cbcSMatt Macy * equations defined by the coefficients used to generate parity as well as
1555eda14cbcSMatt Macy * the contents of the data and parity disks. This can be expressed with
1556eda14cbcSMatt Macy * vectors for the original data (D) and the actual data (d) and parity (p)
1557eda14cbcSMatt Macy * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
1558eda14cbcSMatt Macy *
 *           __   __                     __     __
 *           |     |   __     __         |  p_0  |
 *           |  V  |   |  D_0  |         | p_m-1 |
 *           |     | x |   :   |    =    |  d_0  |
 *           |  I  |   | D_n-1 |         |   :   |
 *           |     |   ~~     ~~         | d_n-1 |
 *           ~~   ~~                     ~~     ~~
1566eda14cbcSMatt Macy *
1567eda14cbcSMatt Macy * I is simply a square identity matrix of size n, and V is a vandermonde
1568eda14cbcSMatt Macy * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity, speedy
 * computation, and linear separability.
1571eda14cbcSMatt Macy *
 *      __                     __                     __     __
 *      |   1   ..  1   1   1   |                     |  p_0  |
 *      | 2^n-1 ..  4   2   1   |   __     __         |   :   |
 *      | 4^n-1 .. 16   4   1   |   |  D_0  |         | p_m-1 |
 *      |   1   ..  0   0   0   |   |  D_1  |         |  d_0  |
 *      |   0   ..  0   0   0   | x |  D_2  |    =    |  d_1  |
 *      |   :       :   :   :   |   |   :   |         |  d_2  |
 *      |   0   ..  1   0   0   |   | D_n-1 |         |   :   |
 *      |   0   ..  0   1   0   |   ~~     ~~         |   :   |
 *      |   0   ..  0   0   1   |                     | d_n-1 |
 *      ~~                     ~~                     ~~     ~~
 *
1584eda14cbcSMatt Macy * Note that I, V, d, and p are known. To compute D, we must invert the
1585eda14cbcSMatt Macy * matrix and use the known data and parity values to reconstruct the unknown
1586eda14cbcSMatt Macy * data values. We begin by removing the rows in V|I and d|p that correspond
1587eda14cbcSMatt Macy * to failed or missing columns; we then make V|I square (n x n) and d|p
1588eda14cbcSMatt Macy * sized n by removing rows corresponding to unused parity from the bottom up
1589eda14cbcSMatt Macy * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
1590eda14cbcSMatt Macy * using Gauss-Jordan elimination. In the example below we use m=3 parity
1591eda14cbcSMatt Macy * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *           __                               __
 *           |   1   1   1   1   1   1   1   1 |
 *           | 128  64  32  16   8   4   2   1 | <-----+-+-- missing disks
 *           |  19 205 116  29  64  16   4   1 |      / /
 *           |   1   0   0   0   0   0   0   0 |     / /
 *           |   0   1   0   0   0   0   0   0 | <--' /
 *  (V|I)  = |   0   0   1   0   0   0   0   0 | <---'
 *           |   0   0   0   1   0   0   0   0 |
 *           |   0   0   0   0   1   0   0   0 |
 *           |   0   0   0   0   0   1   0   0 |
 *           |   0   0   0   0   0   0   1   0 |
 *           |   0   0   0   0   0   0   0   1 |
 *           ~~                               ~~
 *           __                               __
 *           |   1   1   1   1   1   1   1   1 |
 *           | 128  64  32  16   8   4   2   1 |
 *           |  19 205 116  29  64  16   4   1 |
 *           |   1   0   0   0   0   0   0   0 |
 *           |   0   1   0   0   0   0   0   0 |
 *  (V|I)' = |   0   0   1   0   0   0   0   0 |
 *           |   0   0   0   1   0   0   0   0 |
 *           |   0   0   0   0   1   0   0   0 |
 *           |   0   0   0   0   0   1   0   0 |
 *           |   0   0   0   0   0   0   1   0 |
 *           |   0   0   0   0   0   0   0   1 |
 *           ~~                               ~~
1618eda14cbcSMatt Macy *
1619eda14cbcSMatt Macy * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1620eda14cbcSMatt Macy * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1621eda14cbcSMatt Macy * matrix is not singular.
 * __                                                                 __
 * |   1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0 |
 * |  19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0 |
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 * ~~                                                                 ~~
 * __                                                                 __
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |   1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0 |
 * |  19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0 |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 * ~~                                                                 ~~
 * __                                                                 __
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1 |
 * |   0 205 116   0   0   0   0   0     0   1  19  29  64  16   4   1 |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 * ~~                                                                 ~~
 * __                                                                 __
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1 |
 * |   0   0 185   0   0   0   0   0   205   1 222 208 141 221 201 204 |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 * ~~                                                                 ~~
 * __                                                                 __
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1 |
 * |   0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209 |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 * ~~                                                                 ~~
 * __                                                                 __
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |   0   1   0   0   0   0   0   0   167 100   5  41 159 169 217 208 |
 * |   0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209 |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 * ~~                                                                 ~~
 *             __                               __
 *             |   0   0   1   0   0   0   0   0 |
 *             | 167 100   5  41 159 169 217 208 |
 *             | 166 100   4  40 158 168 216 209 |
 * (V|I)'^-1 = |   0   0   0   1   0   0   0   0 |
 *             |   0   0   0   0   1   0   0   0 |
 *             |   0   0   0   0   0   1   0   0 |
 *             |   0   0   0   0   0   0   1   0 |
 *             |   0   0   0   0   0   0   0   1 |
 *             ~~                               ~~
1692eda14cbcSMatt Macy *
1693eda14cbcSMatt Macy * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1694eda14cbcSMatt Macy * of the missing data.
1695eda14cbcSMatt Macy *
1696eda14cbcSMatt Macy * As is apparent from the example above, the only non-trivial rows in the
1697eda14cbcSMatt Macy * inverse matrix correspond to the data disks that we're trying to
1698eda14cbcSMatt Macy * reconstruct. Indeed, those are the only rows we need as the others would
1699eda14cbcSMatt Macy * only be useful for reconstructing data known or assumed to be valid. For
1700eda14cbcSMatt Macy * that reason, we only build the coefficients in the rows that correspond to
1701eda14cbcSMatt Macy * targeted columns.
1702eda14cbcSMatt Macy */
1703eda14cbcSMatt Macy
1704eda14cbcSMatt Macy static void
vdev_raidz_matrix_init(raidz_row_t * rr,int n,int nmap,int * map,uint8_t ** rows)17057877fdebSMatt Macy vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map,
1706eda14cbcSMatt Macy uint8_t **rows)
1707eda14cbcSMatt Macy {
1708eda14cbcSMatt Macy int i, j;
1709eda14cbcSMatt Macy int pow;
1710eda14cbcSMatt Macy
17117877fdebSMatt Macy ASSERT(n == rr->rr_cols - rr->rr_firstdatacol);
1712eda14cbcSMatt Macy
1713eda14cbcSMatt Macy /*
1714eda14cbcSMatt Macy * Fill in the missing rows of interest.
1715eda14cbcSMatt Macy */
1716eda14cbcSMatt Macy for (i = 0; i < nmap; i++) {
1717eda14cbcSMatt Macy ASSERT3S(0, <=, map[i]);
1718eda14cbcSMatt Macy ASSERT3S(map[i], <=, 2);
1719eda14cbcSMatt Macy
1720eda14cbcSMatt Macy pow = map[i] * n;
1721eda14cbcSMatt Macy if (pow > 255)
1722eda14cbcSMatt Macy pow -= 255;
1723eda14cbcSMatt Macy ASSERT(pow <= 255);
1724eda14cbcSMatt Macy
1725eda14cbcSMatt Macy for (j = 0; j < n; j++) {
1726eda14cbcSMatt Macy pow -= map[i];
1727eda14cbcSMatt Macy if (pow < 0)
1728eda14cbcSMatt Macy pow += 255;
1729eda14cbcSMatt Macy rows[i][j] = vdev_raidz_pow2[pow];
1730eda14cbcSMatt Macy }
1731eda14cbcSMatt Macy }
1732eda14cbcSMatt Macy }
1733eda14cbcSMatt Macy
1734eda14cbcSMatt Macy static void
vdev_raidz_matrix_invert(raidz_row_t * rr,int n,int nmissing,int * missing,uint8_t ** rows,uint8_t ** invrows,const uint8_t * used)17357877fdebSMatt Macy vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing,
1736eda14cbcSMatt Macy uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1737eda14cbcSMatt Macy {
1738eda14cbcSMatt Macy int i, j, ii, jj;
1739eda14cbcSMatt Macy uint8_t log;
1740eda14cbcSMatt Macy
1741eda14cbcSMatt Macy /*
1742eda14cbcSMatt Macy * Assert that the first nmissing entries from the array of used
1743eda14cbcSMatt Macy * columns correspond to parity columns and that subsequent entries
1744eda14cbcSMatt Macy * correspond to data columns.
1745eda14cbcSMatt Macy */
1746eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) {
17477877fdebSMatt Macy ASSERT3S(used[i], <, rr->rr_firstdatacol);
1748eda14cbcSMatt Macy }
1749eda14cbcSMatt Macy for (; i < n; i++) {
17507877fdebSMatt Macy ASSERT3S(used[i], >=, rr->rr_firstdatacol);
1751eda14cbcSMatt Macy }
1752eda14cbcSMatt Macy
1753eda14cbcSMatt Macy /*
1754eda14cbcSMatt Macy * First initialize the storage where we'll compute the inverse rows.
1755eda14cbcSMatt Macy */
1756eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) {
1757eda14cbcSMatt Macy for (j = 0; j < n; j++) {
1758eda14cbcSMatt Macy invrows[i][j] = (i == j) ? 1 : 0;
1759eda14cbcSMatt Macy }
1760eda14cbcSMatt Macy }
1761eda14cbcSMatt Macy
1762eda14cbcSMatt Macy /*
1763eda14cbcSMatt Macy * Subtract all trivial rows from the rows of consequence.
1764eda14cbcSMatt Macy */
1765eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) {
1766eda14cbcSMatt Macy for (j = nmissing; j < n; j++) {
17677877fdebSMatt Macy ASSERT3U(used[j], >=, rr->rr_firstdatacol);
17687877fdebSMatt Macy jj = used[j] - rr->rr_firstdatacol;
1769eda14cbcSMatt Macy ASSERT3S(jj, <, n);
1770eda14cbcSMatt Macy invrows[i][j] = rows[i][jj];
1771eda14cbcSMatt Macy rows[i][jj] = 0;
1772eda14cbcSMatt Macy }
1773eda14cbcSMatt Macy }
1774eda14cbcSMatt Macy
1775eda14cbcSMatt Macy /*
1776eda14cbcSMatt Macy * For each of the rows of interest, we must normalize it and subtract
1777eda14cbcSMatt Macy * a multiple of it from the other rows.
1778eda14cbcSMatt Macy */
1779eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) {
1780eda14cbcSMatt Macy for (j = 0; j < missing[i]; j++) {
1781eda14cbcSMatt Macy ASSERT0(rows[i][j]);
1782eda14cbcSMatt Macy }
1783eda14cbcSMatt Macy ASSERT3U(rows[i][missing[i]], !=, 0);
1784eda14cbcSMatt Macy
1785eda14cbcSMatt Macy /*
1786eda14cbcSMatt Macy * Compute the inverse of the first element and multiply each
1787eda14cbcSMatt Macy * element in the row by that value.
1788eda14cbcSMatt Macy */
1789eda14cbcSMatt Macy log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1790eda14cbcSMatt Macy
1791eda14cbcSMatt Macy for (j = 0; j < n; j++) {
1792eda14cbcSMatt Macy rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1793eda14cbcSMatt Macy invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1794eda14cbcSMatt Macy }
1795eda14cbcSMatt Macy
1796eda14cbcSMatt Macy for (ii = 0; ii < nmissing; ii++) {
1797eda14cbcSMatt Macy if (i == ii)
1798eda14cbcSMatt Macy continue;
1799eda14cbcSMatt Macy
1800eda14cbcSMatt Macy ASSERT3U(rows[ii][missing[i]], !=, 0);
1801eda14cbcSMatt Macy
1802eda14cbcSMatt Macy log = vdev_raidz_log2[rows[ii][missing[i]]];
1803eda14cbcSMatt Macy
1804eda14cbcSMatt Macy for (j = 0; j < n; j++) {
1805eda14cbcSMatt Macy rows[ii][j] ^=
1806eda14cbcSMatt Macy vdev_raidz_exp2(rows[i][j], log);
1807eda14cbcSMatt Macy invrows[ii][j] ^=
1808eda14cbcSMatt Macy vdev_raidz_exp2(invrows[i][j], log);
1809eda14cbcSMatt Macy }
1810eda14cbcSMatt Macy }
1811eda14cbcSMatt Macy }
1812eda14cbcSMatt Macy
1813eda14cbcSMatt Macy /*
1814eda14cbcSMatt Macy * Verify that the data that is left in the rows are properly part of
1815eda14cbcSMatt Macy * an identity matrix.
1816eda14cbcSMatt Macy */
1817eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) {
1818eda14cbcSMatt Macy for (j = 0; j < n; j++) {
1819eda14cbcSMatt Macy if (j == missing[i]) {
1820eda14cbcSMatt Macy ASSERT3U(rows[i][j], ==, 1);
1821eda14cbcSMatt Macy } else {
1822eda14cbcSMatt Macy ASSERT0(rows[i][j]);
1823eda14cbcSMatt Macy }
1824eda14cbcSMatt Macy }
1825eda14cbcSMatt Macy }
1826eda14cbcSMatt Macy }
1827eda14cbcSMatt Macy
1828eda14cbcSMatt Macy static void
vdev_raidz_matrix_reconstruct(raidz_row_t * rr,int n,int nmissing,int * missing,uint8_t ** invrows,const uint8_t * used)18297877fdebSMatt Macy vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
1830eda14cbcSMatt Macy int *missing, uint8_t **invrows, const uint8_t *used)
1831eda14cbcSMatt Macy {
1832eda14cbcSMatt Macy int i, j, x, cc, c;
1833eda14cbcSMatt Macy uint8_t *src;
1834eda14cbcSMatt Macy uint64_t ccount;
1835eda14cbcSMatt Macy uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL };
1836eda14cbcSMatt Macy uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 };
1837eda14cbcSMatt Macy uint8_t log = 0;
1838eda14cbcSMatt Macy uint8_t val;
1839eda14cbcSMatt Macy int ll;
1840eda14cbcSMatt Macy uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1841eda14cbcSMatt Macy uint8_t *p, *pp;
1842eda14cbcSMatt Macy size_t psize;
1843eda14cbcSMatt Macy
1844eda14cbcSMatt Macy psize = sizeof (invlog[0][0]) * n * nmissing;
1845eda14cbcSMatt Macy p = kmem_alloc(psize, KM_SLEEP);
1846eda14cbcSMatt Macy
1847eda14cbcSMatt Macy for (pp = p, i = 0; i < nmissing; i++) {
1848eda14cbcSMatt Macy invlog[i] = pp;
1849eda14cbcSMatt Macy pp += n;
1850eda14cbcSMatt Macy }
1851eda14cbcSMatt Macy
1852eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) {
1853eda14cbcSMatt Macy for (j = 0; j < n; j++) {
1854eda14cbcSMatt Macy ASSERT3U(invrows[i][j], !=, 0);
1855eda14cbcSMatt Macy invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1856eda14cbcSMatt Macy }
1857eda14cbcSMatt Macy }
1858eda14cbcSMatt Macy
1859eda14cbcSMatt Macy for (i = 0; i < n; i++) {
1860eda14cbcSMatt Macy c = used[i];
18617877fdebSMatt Macy ASSERT3U(c, <, rr->rr_cols);
1862eda14cbcSMatt Macy
18637877fdebSMatt Macy ccount = rr->rr_col[c].rc_size;
18647877fdebSMatt Macy ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0);
18657877fdebSMatt Macy if (ccount == 0)
18667877fdebSMatt Macy continue;
18677877fdebSMatt Macy src = abd_to_buf(rr->rr_col[c].rc_abd);
1868eda14cbcSMatt Macy for (j = 0; j < nmissing; j++) {
18697877fdebSMatt Macy cc = missing[j] + rr->rr_firstdatacol;
18707877fdebSMatt Macy ASSERT3U(cc, >=, rr->rr_firstdatacol);
18717877fdebSMatt Macy ASSERT3U(cc, <, rr->rr_cols);
1872eda14cbcSMatt Macy ASSERT3U(cc, !=, c);
1873eda14cbcSMatt Macy
18747877fdebSMatt Macy dcount[j] = rr->rr_col[cc].rc_size;
18757877fdebSMatt Macy if (dcount[j] != 0)
18767877fdebSMatt Macy dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd);
1877eda14cbcSMatt Macy }
1878eda14cbcSMatt Macy
1879eda14cbcSMatt Macy for (x = 0; x < ccount; x++, src++) {
1880eda14cbcSMatt Macy if (*src != 0)
1881eda14cbcSMatt Macy log = vdev_raidz_log2[*src];
1882eda14cbcSMatt Macy
1883eda14cbcSMatt Macy for (cc = 0; cc < nmissing; cc++) {
1884eda14cbcSMatt Macy if (x >= dcount[cc])
1885eda14cbcSMatt Macy continue;
1886eda14cbcSMatt Macy
1887eda14cbcSMatt Macy if (*src == 0) {
1888eda14cbcSMatt Macy val = 0;
1889eda14cbcSMatt Macy } else {
1890eda14cbcSMatt Macy if ((ll = log + invlog[cc][i]) >= 255)
1891eda14cbcSMatt Macy ll -= 255;
1892eda14cbcSMatt Macy val = vdev_raidz_pow2[ll];
1893eda14cbcSMatt Macy }
1894eda14cbcSMatt Macy
1895eda14cbcSMatt Macy if (i == 0)
1896eda14cbcSMatt Macy dst[cc][x] = val;
1897eda14cbcSMatt Macy else
1898eda14cbcSMatt Macy dst[cc][x] ^= val;
1899eda14cbcSMatt Macy }
1900eda14cbcSMatt Macy }
1901eda14cbcSMatt Macy }
1902eda14cbcSMatt Macy
1903eda14cbcSMatt Macy kmem_free(p, psize);
1904eda14cbcSMatt Macy }
1905eda14cbcSMatt Macy
1906f9693befSMartin Matuska static void
vdev_raidz_reconstruct_general(raidz_row_t * rr,int * tgts,int ntgts)19077877fdebSMatt Macy vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
1908eda14cbcSMatt Macy {
1909b985c9caSMartin Matuska int i, c, t, tt;
1910b985c9caSMartin Matuska unsigned int n;
1911b985c9caSMartin Matuska unsigned int nmissing_rows;
1912eda14cbcSMatt Macy int missing_rows[VDEV_RAIDZ_MAXPARITY];
1913eda14cbcSMatt Macy int parity_map[VDEV_RAIDZ_MAXPARITY];
1914eda14cbcSMatt Macy uint8_t *p, *pp;
1915eda14cbcSMatt Macy size_t psize;
1916eda14cbcSMatt Macy uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1917eda14cbcSMatt Macy uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1918eda14cbcSMatt Macy uint8_t *used;
1919eda14cbcSMatt Macy
1920eda14cbcSMatt Macy abd_t **bufs = NULL;
1921eda14cbcSMatt Macy
1922e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1923e716630dSMartin Matuska zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts);
1924eda14cbcSMatt Macy /*
1925eda14cbcSMatt Macy * Matrix reconstruction can't use scatter ABDs yet, so we allocate
19267877fdebSMatt Macy * temporary linear ABDs if any non-linear ABDs are found.
1927eda14cbcSMatt Macy */
19287877fdebSMatt Macy for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) {
1929e716630dSMartin Matuska ASSERT(rr->rr_col[i].rc_abd != NULL);
19307877fdebSMatt Macy if (!abd_is_linear(rr->rr_col[i].rc_abd)) {
19317877fdebSMatt Macy bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *),
19327877fdebSMatt Macy KM_PUSHPAGE);
1933eda14cbcSMatt Macy
19347877fdebSMatt Macy for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
19357877fdebSMatt Macy raidz_col_t *col = &rr->rr_col[c];
1936eda14cbcSMatt Macy
1937eda14cbcSMatt Macy bufs[c] = col->rc_abd;
19387877fdebSMatt Macy if (bufs[c] != NULL) {
19397877fdebSMatt Macy col->rc_abd = abd_alloc_linear(
19407877fdebSMatt Macy col->rc_size, B_TRUE);
19417877fdebSMatt Macy abd_copy(col->rc_abd, bufs[c],
19427877fdebSMatt Macy col->rc_size);
1943eda14cbcSMatt Macy }
1944eda14cbcSMatt Macy }
1945eda14cbcSMatt Macy
19467877fdebSMatt Macy break;
19477877fdebSMatt Macy }
19487877fdebSMatt Macy }
19497877fdebSMatt Macy
19507877fdebSMatt Macy n = rr->rr_cols - rr->rr_firstdatacol;
1951eda14cbcSMatt Macy
1952eda14cbcSMatt Macy /*
1953eda14cbcSMatt Macy * Figure out which data columns are missing.
1954eda14cbcSMatt Macy */
1955eda14cbcSMatt Macy nmissing_rows = 0;
1956eda14cbcSMatt Macy for (t = 0; t < ntgts; t++) {
19577877fdebSMatt Macy if (tgts[t] >= rr->rr_firstdatacol) {
1958eda14cbcSMatt Macy missing_rows[nmissing_rows++] =
19597877fdebSMatt Macy tgts[t] - rr->rr_firstdatacol;
1960eda14cbcSMatt Macy }
1961eda14cbcSMatt Macy }
1962eda14cbcSMatt Macy
1963eda14cbcSMatt Macy /*
1964eda14cbcSMatt Macy * Figure out which parity columns to use to help generate the missing
1965eda14cbcSMatt Macy * data columns.
1966eda14cbcSMatt Macy */
1967eda14cbcSMatt Macy for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1968eda14cbcSMatt Macy ASSERT(tt < ntgts);
19697877fdebSMatt Macy ASSERT(c < rr->rr_firstdatacol);
1970eda14cbcSMatt Macy
1971eda14cbcSMatt Macy /*
1972eda14cbcSMatt Macy * Skip any targeted parity columns.
1973eda14cbcSMatt Macy */
1974eda14cbcSMatt Macy if (c == tgts[tt]) {
1975eda14cbcSMatt Macy tt++;
1976eda14cbcSMatt Macy continue;
1977eda14cbcSMatt Macy }
1978eda14cbcSMatt Macy
1979eda14cbcSMatt Macy parity_map[i] = c;
1980eda14cbcSMatt Macy i++;
1981eda14cbcSMatt Macy }
1982eda14cbcSMatt Macy
1983eda14cbcSMatt Macy psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1984eda14cbcSMatt Macy nmissing_rows * n + sizeof (used[0]) * n;
1985eda14cbcSMatt Macy p = kmem_alloc(psize, KM_SLEEP);
1986eda14cbcSMatt Macy
1987eda14cbcSMatt Macy for (pp = p, i = 0; i < nmissing_rows; i++) {
1988eda14cbcSMatt Macy rows[i] = pp;
1989eda14cbcSMatt Macy pp += n;
1990eda14cbcSMatt Macy invrows[i] = pp;
1991eda14cbcSMatt Macy pp += n;
1992eda14cbcSMatt Macy }
1993eda14cbcSMatt Macy used = pp;
1994eda14cbcSMatt Macy
1995eda14cbcSMatt Macy for (i = 0; i < nmissing_rows; i++) {
1996eda14cbcSMatt Macy used[i] = parity_map[i];
1997eda14cbcSMatt Macy }
1998eda14cbcSMatt Macy
19997877fdebSMatt Macy for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
2000eda14cbcSMatt Macy if (tt < nmissing_rows &&
20017877fdebSMatt Macy c == missing_rows[tt] + rr->rr_firstdatacol) {
2002eda14cbcSMatt Macy tt++;
2003eda14cbcSMatt Macy continue;
2004eda14cbcSMatt Macy }
2005eda14cbcSMatt Macy
2006eda14cbcSMatt Macy ASSERT3S(i, <, n);
2007eda14cbcSMatt Macy used[i] = c;
2008eda14cbcSMatt Macy i++;
2009eda14cbcSMatt Macy }
2010eda14cbcSMatt Macy
2011eda14cbcSMatt Macy /*
2012eda14cbcSMatt Macy * Initialize the interesting rows of the matrix.
2013eda14cbcSMatt Macy */
20147877fdebSMatt Macy vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows);
2015eda14cbcSMatt Macy
2016eda14cbcSMatt Macy /*
2017eda14cbcSMatt Macy * Invert the matrix.
2018eda14cbcSMatt Macy */
20197877fdebSMatt Macy vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows,
2020eda14cbcSMatt Macy invrows, used);
2021eda14cbcSMatt Macy
2022eda14cbcSMatt Macy /*
2023eda14cbcSMatt Macy * Reconstruct the missing data using the generated matrix.
2024eda14cbcSMatt Macy */
20257877fdebSMatt Macy vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows,
2026eda14cbcSMatt Macy invrows, used);
2027eda14cbcSMatt Macy
2028eda14cbcSMatt Macy kmem_free(p, psize);
2029eda14cbcSMatt Macy
2030eda14cbcSMatt Macy /*
2031eda14cbcSMatt Macy * copy back from temporary linear abds and free them
2032eda14cbcSMatt Macy */
2033eda14cbcSMatt Macy if (bufs) {
20347877fdebSMatt Macy for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
20357877fdebSMatt Macy raidz_col_t *col = &rr->rr_col[c];
2036eda14cbcSMatt Macy
20377877fdebSMatt Macy if (bufs[c] != NULL) {
2038eda14cbcSMatt Macy abd_copy(bufs[c], col->rc_abd, col->rc_size);
2039eda14cbcSMatt Macy abd_free(col->rc_abd);
20407877fdebSMatt Macy }
2041eda14cbcSMatt Macy col->rc_abd = bufs[c];
2042eda14cbcSMatt Macy }
20437877fdebSMatt Macy kmem_free(bufs, rr->rr_cols * sizeof (abd_t *));
2044eda14cbcSMatt Macy }
2045eda14cbcSMatt Macy }
2046eda14cbcSMatt Macy
2047f9693befSMartin Matuska static void
vdev_raidz_reconstruct_row(raidz_map_t * rm,raidz_row_t * rr,const int * t,int nt)20487877fdebSMatt Macy vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
20497877fdebSMatt Macy const int *t, int nt)
2050eda14cbcSMatt Macy {
2051eda14cbcSMatt Macy int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
2052eda14cbcSMatt Macy int ntgts;
2053eda14cbcSMatt Macy int i, c, ret;
2054eda14cbcSMatt Macy int nbadparity, nbaddata;
2055eda14cbcSMatt Macy int parity_valid[VDEV_RAIDZ_MAXPARITY];
2056eda14cbcSMatt Macy
2057e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
2058e716630dSMartin Matuska zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)",
2059e716630dSMartin Matuska rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata,
2060e716630dSMartin Matuska (int)rr->rr_missingparity);
2061e716630dSMartin Matuska }
2062e716630dSMartin Matuska
20637877fdebSMatt Macy nbadparity = rr->rr_firstdatacol;
20647877fdebSMatt Macy nbaddata = rr->rr_cols - nbadparity;
2065eda14cbcSMatt Macy ntgts = 0;
20667877fdebSMatt Macy for (i = 0, c = 0; c < rr->rr_cols; c++) {
2067e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
2068e716630dSMartin Matuska zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u "
2069e716630dSMartin Matuska "offset=%llx error=%u)",
2070e716630dSMartin Matuska rr, c, (int)rr->rr_col[c].rc_devidx,
2071e716630dSMartin Matuska (long long)rr->rr_col[c].rc_offset,
2072e716630dSMartin Matuska (int)rr->rr_col[c].rc_error);
2073e716630dSMartin Matuska }
20747877fdebSMatt Macy if (c < rr->rr_firstdatacol)
2075eda14cbcSMatt Macy parity_valid[c] = B_FALSE;
2076eda14cbcSMatt Macy
2077eda14cbcSMatt Macy if (i < nt && c == t[i]) {
2078eda14cbcSMatt Macy tgts[ntgts++] = c;
2079eda14cbcSMatt Macy i++;
20807877fdebSMatt Macy } else if (rr->rr_col[c].rc_error != 0) {
2081eda14cbcSMatt Macy tgts[ntgts++] = c;
20827877fdebSMatt Macy } else if (c >= rr->rr_firstdatacol) {
2083eda14cbcSMatt Macy nbaddata--;
2084eda14cbcSMatt Macy } else {
2085eda14cbcSMatt Macy parity_valid[c] = B_TRUE;
2086eda14cbcSMatt Macy nbadparity--;
2087eda14cbcSMatt Macy }
2088eda14cbcSMatt Macy }
2089eda14cbcSMatt Macy
2090eda14cbcSMatt Macy ASSERT(ntgts >= nt);
2091eda14cbcSMatt Macy ASSERT(nbaddata >= 0);
2092eda14cbcSMatt Macy ASSERT(nbaddata + nbadparity == ntgts);
2093eda14cbcSMatt Macy
2094eda14cbcSMatt Macy dt = &tgts[nbadparity];
2095eda14cbcSMatt Macy
2096eda14cbcSMatt Macy /* Reconstruct using the new math implementation */
20977877fdebSMatt Macy ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata);
2098eda14cbcSMatt Macy if (ret != RAIDZ_ORIGINAL_IMPL)
2099f9693befSMartin Matuska return;
2100eda14cbcSMatt Macy
2101eda14cbcSMatt Macy /*
2102eda14cbcSMatt Macy * See if we can use any of our optimized reconstruction routines.
2103eda14cbcSMatt Macy */
2104eda14cbcSMatt Macy switch (nbaddata) {
2105eda14cbcSMatt Macy case 1:
2106f9693befSMartin Matuska if (parity_valid[VDEV_RAIDZ_P]) {
2107f9693befSMartin Matuska vdev_raidz_reconstruct_p(rr, dt, 1);
2108f9693befSMartin Matuska return;
2109f9693befSMartin Matuska }
2110eda14cbcSMatt Macy
21117877fdebSMatt Macy ASSERT(rr->rr_firstdatacol > 1);
2112eda14cbcSMatt Macy
2113f9693befSMartin Matuska if (parity_valid[VDEV_RAIDZ_Q]) {
2114f9693befSMartin Matuska vdev_raidz_reconstruct_q(rr, dt, 1);
2115f9693befSMartin Matuska return;
2116f9693befSMartin Matuska }
2117eda14cbcSMatt Macy
21187877fdebSMatt Macy ASSERT(rr->rr_firstdatacol > 2);
2119eda14cbcSMatt Macy break;
2120eda14cbcSMatt Macy
2121eda14cbcSMatt Macy case 2:
21227877fdebSMatt Macy ASSERT(rr->rr_firstdatacol > 1);
2123eda14cbcSMatt Macy
2124eda14cbcSMatt Macy if (parity_valid[VDEV_RAIDZ_P] &&
2125f9693befSMartin Matuska parity_valid[VDEV_RAIDZ_Q]) {
2126f9693befSMartin Matuska vdev_raidz_reconstruct_pq(rr, dt, 2);
2127f9693befSMartin Matuska return;
2128f9693befSMartin Matuska }
2129eda14cbcSMatt Macy
21307877fdebSMatt Macy ASSERT(rr->rr_firstdatacol > 2);
2131eda14cbcSMatt Macy
2132eda14cbcSMatt Macy break;
2133eda14cbcSMatt Macy }
2134eda14cbcSMatt Macy
2135f9693befSMartin Matuska vdev_raidz_reconstruct_general(rr, tgts, ntgts);
2136eda14cbcSMatt Macy }
2137eda14cbcSMatt Macy
2138eda14cbcSMatt Macy static int
vdev_raidz_open(vdev_t * vd,uint64_t * asize,uint64_t * max_asize,uint64_t * logical_ashift,uint64_t * physical_ashift)2139eda14cbcSMatt Macy vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
2140eda14cbcSMatt Macy uint64_t *logical_ashift, uint64_t *physical_ashift)
2141eda14cbcSMatt Macy {
21427877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd;
21437877fdebSMatt Macy uint64_t nparity = vdrz->vd_nparity;
2144eda14cbcSMatt Macy int c;
2145eda14cbcSMatt Macy int lasterror = 0;
2146eda14cbcSMatt Macy int numerrors = 0;
2147eda14cbcSMatt Macy
2148eda14cbcSMatt Macy ASSERT(nparity > 0);
2149eda14cbcSMatt Macy
2150eda14cbcSMatt Macy if (nparity > VDEV_RAIDZ_MAXPARITY ||
2151eda14cbcSMatt Macy vd->vdev_children < nparity + 1) {
2152eda14cbcSMatt Macy vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
2153eda14cbcSMatt Macy return (SET_ERROR(EINVAL));
2154eda14cbcSMatt Macy }
2155eda14cbcSMatt Macy
2156eda14cbcSMatt Macy vdev_open_children(vd);
2157eda14cbcSMatt Macy
2158eda14cbcSMatt Macy for (c = 0; c < vd->vdev_children; c++) {
21597877fdebSMatt Macy vdev_t *cvd = vd->vdev_child[c];
2160eda14cbcSMatt Macy
2161eda14cbcSMatt Macy if (cvd->vdev_open_error != 0) {
2162eda14cbcSMatt Macy lasterror = cvd->vdev_open_error;
2163eda14cbcSMatt Macy numerrors++;
2164eda14cbcSMatt Macy continue;
2165eda14cbcSMatt Macy }
2166eda14cbcSMatt Macy
2167eda14cbcSMatt Macy *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
2168eda14cbcSMatt Macy *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
2169eda14cbcSMatt Macy *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
2170c7046f76SMartin Matuska }
2171c7046f76SMartin Matuska for (c = 0; c < vd->vdev_children; c++) {
2172c7046f76SMartin Matuska vdev_t *cvd = vd->vdev_child[c];
2173c7046f76SMartin Matuska
2174c7046f76SMartin Matuska if (cvd->vdev_open_error != 0)
2175c7046f76SMartin Matuska continue;
2176c7046f76SMartin Matuska *physical_ashift = vdev_best_ashift(*logical_ashift,
2177c7046f76SMartin Matuska *physical_ashift, cvd->vdev_physical_ashift);
2178eda14cbcSMatt Macy }
2179eda14cbcSMatt Macy
2180e716630dSMartin Matuska if (vd->vdev_rz_expanding) {
2181e716630dSMartin Matuska *asize *= vd->vdev_children - 1;
2182e716630dSMartin Matuska *max_asize *= vd->vdev_children - 1;
2183e716630dSMartin Matuska
2184e716630dSMartin Matuska vd->vdev_min_asize = *asize;
2185e716630dSMartin Matuska } else {
2186eda14cbcSMatt Macy *asize *= vd->vdev_children;
2187eda14cbcSMatt Macy *max_asize *= vd->vdev_children;
2188e716630dSMartin Matuska }
2189eda14cbcSMatt Macy
2190eda14cbcSMatt Macy if (numerrors > nparity) {
2191eda14cbcSMatt Macy vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
2192eda14cbcSMatt Macy return (lasterror);
2193eda14cbcSMatt Macy }
2194eda14cbcSMatt Macy
2195eda14cbcSMatt Macy return (0);
2196eda14cbcSMatt Macy }
2197eda14cbcSMatt Macy
2198eda14cbcSMatt Macy static void
vdev_raidz_close(vdev_t * vd)2199eda14cbcSMatt Macy vdev_raidz_close(vdev_t *vd)
2200eda14cbcSMatt Macy {
22017877fdebSMatt Macy for (int c = 0; c < vd->vdev_children; c++) {
22027877fdebSMatt Macy if (vd->vdev_child[c] != NULL)
2203eda14cbcSMatt Macy vdev_close(vd->vdev_child[c]);
2204eda14cbcSMatt Macy }
22057877fdebSMatt Macy }
2206eda14cbcSMatt Macy
2207e716630dSMartin Matuska /*
2208e716630dSMartin Matuska * Return the logical width to use, given the txg in which the allocation
2209783d3ff6SMartin Matuska * happened. Note that BP_GET_BIRTH() is usually the txg in which the
2210e716630dSMartin Matuska * BP was allocated. Remapped BP's (that were relocated due to device
2211783d3ff6SMartin Matuska * removal, see remap_blkptr_cb()), will have a more recent physical birth
2212783d3ff6SMartin Matuska * which reflects when the BP was relocated, but we can ignore these because
2213783d3ff6SMartin Matuska * they can't be on RAIDZ (device removal doesn't support RAIDZ).
2214e716630dSMartin Matuska */
2215eda14cbcSMatt Macy static uint64_t
vdev_raidz_get_logical_width(vdev_raidz_t * vdrz,uint64_t txg)2216e716630dSMartin Matuska vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg)
2217e716630dSMartin Matuska {
2218e716630dSMartin Matuska reflow_node_t lookup = {
2219e716630dSMartin Matuska .re_txg = txg,
2220e716630dSMartin Matuska };
2221e716630dSMartin Matuska avl_index_t where;
2222e716630dSMartin Matuska
2223e716630dSMartin Matuska uint64_t width;
2224e716630dSMartin Matuska mutex_enter(&vdrz->vd_expand_lock);
2225e716630dSMartin Matuska reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where);
2226e716630dSMartin Matuska if (re != NULL) {
2227e716630dSMartin Matuska width = re->re_logical_width;
2228e716630dSMartin Matuska } else {
2229e716630dSMartin Matuska re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE);
2230e716630dSMartin Matuska if (re != NULL)
2231e716630dSMartin Matuska width = re->re_logical_width;
2232e716630dSMartin Matuska else
2233e716630dSMartin Matuska width = vdrz->vd_original_width;
2234e716630dSMartin Matuska }
2235e716630dSMartin Matuska mutex_exit(&vdrz->vd_expand_lock);
2236e716630dSMartin Matuska return (width);
2237e716630dSMartin Matuska }
2238071ab5a1SMartin Matuska /*
2239071ab5a1SMartin Matuska * This code converts an asize into the largest psize that can safely be written
2240071ab5a1SMartin Matuska * to an allocation of that size for this vdev.
2241071ab5a1SMartin Matuska *
2242071ab5a1SMartin Matuska * Note that this function will not take into account the effect of gang
2243071ab5a1SMartin Matuska * headers, which also modify the ASIZE of the DVAs. It is purely a reverse of
2244071ab5a1SMartin Matuska * the psize_to_asize function.
2245071ab5a1SMartin Matuska */
2246071ab5a1SMartin Matuska static uint64_t
vdev_raidz_asize_to_psize(vdev_t * vd,uint64_t asize,uint64_t txg)2247071ab5a1SMartin Matuska vdev_raidz_asize_to_psize(vdev_t *vd, uint64_t asize, uint64_t txg)
2248071ab5a1SMartin Matuska {
2249071ab5a1SMartin Matuska vdev_raidz_t *vdrz = vd->vdev_tsd;
2250071ab5a1SMartin Matuska uint64_t psize;
2251071ab5a1SMartin Matuska uint64_t ashift = vd->vdev_top->vdev_ashift;
2252071ab5a1SMartin Matuska uint64_t cols = vdrz->vd_original_width;
2253071ab5a1SMartin Matuska uint64_t nparity = vdrz->vd_nparity;
2254071ab5a1SMartin Matuska
2255071ab5a1SMartin Matuska cols = vdev_raidz_get_logical_width(vdrz, txg);
2256071ab5a1SMartin Matuska
2257071ab5a1SMartin Matuska ASSERT0(asize % (1 << ashift));
2258071ab5a1SMartin Matuska
2259071ab5a1SMartin Matuska psize = (asize >> ashift);
2260071ab5a1SMartin Matuska psize -= nparity * DIV_ROUND_UP(psize, cols);
2261071ab5a1SMartin Matuska psize <<= ashift;
2262071ab5a1SMartin Matuska
2263071ab5a1SMartin Matuska return (asize);
2264071ab5a1SMartin Matuska }
2265e716630dSMartin Matuska
2266e716630dSMartin Matuska /*
2267e716630dSMartin Matuska * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated
2268e716630dSMartin Matuska * more space due to the lower data-to-parity ratio. In this case it's
2269e716630dSMartin Matuska * important to pass in the correct txg. Note that vdev_gang_header_asize()
2270e716630dSMartin Matuska * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE,
2271e716630dSMartin Matuska * regardless of txg. This is assured because for a single data sector, we
2272e716630dSMartin Matuska * allocate P+1 sectors regardless of width ("cols", which is at least P+1).
2273e716630dSMartin Matuska */
2274e716630dSMartin Matuska static uint64_t
vdev_raidz_psize_to_asize(vdev_t * vd,uint64_t psize,uint64_t txg)2275071ab5a1SMartin Matuska vdev_raidz_psize_to_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
2276eda14cbcSMatt Macy {
22777877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd;
2278eda14cbcSMatt Macy uint64_t asize;
2279eda14cbcSMatt Macy uint64_t ashift = vd->vdev_top->vdev_ashift;
2280e716630dSMartin Matuska uint64_t cols = vdrz->vd_original_width;
22817877fdebSMatt Macy uint64_t nparity = vdrz->vd_nparity;
2282eda14cbcSMatt Macy
2283e716630dSMartin Matuska cols = vdev_raidz_get_logical_width(vdrz, txg);
2284e716630dSMartin Matuska
2285eda14cbcSMatt Macy asize = ((psize - 1) >> ashift) + 1;
2286eda14cbcSMatt Macy asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
2287eda14cbcSMatt Macy asize = roundup(asize, nparity + 1) << ashift;
2288eda14cbcSMatt Macy
2289e716630dSMartin Matuska #ifdef ZFS_DEBUG
2290e716630dSMartin Matuska uint64_t asize_new = ((psize - 1) >> ashift) + 1;
2291e716630dSMartin Matuska uint64_t ncols_new = vdrz->vd_physical_width;
2292e716630dSMartin Matuska asize_new += nparity * ((asize_new + ncols_new - nparity - 1) /
2293e716630dSMartin Matuska (ncols_new - nparity));
2294e716630dSMartin Matuska asize_new = roundup(asize_new, nparity + 1) << ashift;
2295e716630dSMartin Matuska VERIFY3U(asize_new, <=, asize);
2296e716630dSMartin Matuska #endif
2297e716630dSMartin Matuska
2298eda14cbcSMatt Macy return (asize);
2299eda14cbcSMatt Macy }
2300eda14cbcSMatt Macy
23017877fdebSMatt Macy /*
23027877fdebSMatt Macy * The allocatable space for a raidz vdev is N * sizeof(smallest child)
23037877fdebSMatt Macy * so each child must provide at least 1/Nth of its asize.
23047877fdebSMatt Macy */
23057877fdebSMatt Macy static uint64_t
vdev_raidz_min_asize(vdev_t * vd)23067877fdebSMatt Macy vdev_raidz_min_asize(vdev_t *vd)
23077877fdebSMatt Macy {
23087877fdebSMatt Macy return ((vd->vdev_min_asize + vd->vdev_children - 1) /
23097877fdebSMatt Macy vd->vdev_children);
23107877fdebSMatt Macy }
23117877fdebSMatt Macy
23127877fdebSMatt Macy void
vdev_raidz_child_done(zio_t * zio)2313eda14cbcSMatt Macy vdev_raidz_child_done(zio_t *zio)
2314eda14cbcSMatt Macy {
2315eda14cbcSMatt Macy raidz_col_t *rc = zio->io_private;
2316eda14cbcSMatt Macy
231781b22a98SMartin Matuska ASSERT3P(rc->rc_abd, !=, NULL);
2318eda14cbcSMatt Macy rc->rc_error = zio->io_error;
2319eda14cbcSMatt Macy rc->rc_tried = 1;
2320eda14cbcSMatt Macy rc->rc_skipped = 0;
2321eda14cbcSMatt Macy }
2322eda14cbcSMatt Macy
2323eda14cbcSMatt Macy static void
vdev_raidz_shadow_child_done(zio_t * zio)2324e716630dSMartin Matuska vdev_raidz_shadow_child_done(zio_t *zio)
2325eda14cbcSMatt Macy {
2326e716630dSMartin Matuska raidz_col_t *rc = zio->io_private;
2327eda14cbcSMatt Macy
2328e716630dSMartin Matuska rc->rc_shadow_error = zio->io_error;
2329e716630dSMartin Matuska }
2330e716630dSMartin Matuska
2331e716630dSMartin Matuska static void
vdev_raidz_io_verify(zio_t * zio,raidz_map_t * rm,raidz_row_t * rr,int col)2332e716630dSMartin Matuska vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col)
2333e716630dSMartin Matuska {
2334e716630dSMartin Matuska (void) rm;
2335e716630dSMartin Matuska #ifdef ZFS_DEBUG
2336b59a0cdeSMartin Matuska zfs_range_seg64_t logical_rs, physical_rs, remain_rs;
23377877fdebSMatt Macy logical_rs.rs_start = rr->rr_offset;
2338eda14cbcSMatt Macy logical_rs.rs_end = logical_rs.rs_start +
2339071ab5a1SMartin Matuska vdev_raidz_psize_to_asize(zio->io_vd, rr->rr_size,
2340783d3ff6SMartin Matuska BP_GET_BIRTH(zio->io_bp));
2341eda14cbcSMatt Macy
23427877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[col];
2343e716630dSMartin Matuska vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
2344eda14cbcSMatt Macy
23457877fdebSMatt Macy vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
23467877fdebSMatt Macy ASSERT(vdev_xlate_is_empty(&remain_rs));
2347e716630dSMartin Matuska if (vdev_xlate_is_empty(&physical_rs)) {
2348e716630dSMartin Matuska /*
2349e716630dSMartin Matuska * If we are in the middle of expansion, the
2350e716630dSMartin Matuska * physical->logical mapping is changing so vdev_xlate()
2351e716630dSMartin Matuska * can't give us a reliable answer.
2352e716630dSMartin Matuska */
2353e716630dSMartin Matuska return;
2354e716630dSMartin Matuska }
2355eda14cbcSMatt Macy ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
2356eda14cbcSMatt Macy ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
2357eda14cbcSMatt Macy /*
2358eda14cbcSMatt Macy * It would be nice to assert that rs_end is equal
2359eda14cbcSMatt Macy * to rc_offset + rc_size but there might be an
2360eda14cbcSMatt Macy * optional I/O at the end that is not accounted in
2361eda14cbcSMatt Macy * rc_size.
2362eda14cbcSMatt Macy */
2363eda14cbcSMatt Macy if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
2364eda14cbcSMatt Macy ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
2365e716630dSMartin Matuska rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift));
2366eda14cbcSMatt Macy } else {
2367eda14cbcSMatt Macy ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
2368eda14cbcSMatt Macy }
2369eda14cbcSMatt Macy #endif
2370eda14cbcSMatt Macy }
2371eda14cbcSMatt Macy
23727877fdebSMatt Macy static void
vdev_raidz_io_start_write(zio_t * zio,raidz_row_t * rr)2373e716630dSMartin Matuska vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr)
23747877fdebSMatt Macy {
23757877fdebSMatt Macy vdev_t *vd = zio->io_vd;
23767877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd;
23777877fdebSMatt Macy
23787877fdebSMatt Macy vdev_raidz_generate_parity_row(rm, rr);
23797877fdebSMatt Macy
238081b22a98SMartin Matuska for (int c = 0; c < rr->rr_scols; c++) {
23817877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c];
238281b22a98SMartin Matuska vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
23837877fdebSMatt Macy
23847877fdebSMatt Macy /* Verify physical to logical translation */
2385e716630dSMartin Matuska vdev_raidz_io_verify(zio, rm, rr, c);
23867877fdebSMatt Macy
2387e716630dSMartin Matuska if (rc->rc_size == 0)
2388e716630dSMartin Matuska continue;
2389e716630dSMartin Matuska
2390e716630dSMartin Matuska ASSERT3U(rc->rc_offset + rc->rc_size, <,
2391e716630dSMartin Matuska cvd->vdev_psize - VDEV_LABEL_END_SIZE);
2392e716630dSMartin Matuska
239381b22a98SMartin Matuska ASSERT3P(rc->rc_abd, !=, NULL);
23947877fdebSMatt Macy zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
239581b22a98SMartin Matuska rc->rc_offset, rc->rc_abd,
239681b22a98SMartin Matuska abd_get_size(rc->rc_abd), zio->io_type,
239781b22a98SMartin Matuska zio->io_priority, 0, vdev_raidz_child_done, rc));
2398e716630dSMartin Matuska
2399e716630dSMartin Matuska if (rc->rc_shadow_devidx != INT_MAX) {
2400e716630dSMartin Matuska vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx];
2401e716630dSMartin Matuska
2402e716630dSMartin Matuska ASSERT3U(
2403e716630dSMartin Matuska rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <,
2404e716630dSMartin Matuska cvd2->vdev_psize - VDEV_LABEL_END_SIZE);
2405e716630dSMartin Matuska
2406e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(zio, NULL, cvd2,
2407e716630dSMartin Matuska rc->rc_shadow_offset, rc->rc_abd,
2408e716630dSMartin Matuska abd_get_size(rc->rc_abd),
2409e716630dSMartin Matuska zio->io_type, zio->io_priority, 0,
2410e716630dSMartin Matuska vdev_raidz_shadow_child_done, rc));
241181b22a98SMartin Matuska }
24127877fdebSMatt Macy }
24137877fdebSMatt Macy }
24147877fdebSMatt Macy
2415e716630dSMartin Matuska /*
2416e716630dSMartin Matuska * Generate optional I/Os for skip sectors to improve aggregation contiguity.
2417e716630dSMartin Matuska * This only works for vdev_raidz_map_alloc() (not _expanded()).
2418e716630dSMartin Matuska */
24197877fdebSMatt Macy static void
raidz_start_skip_writes(zio_t * zio)2420e716630dSMartin Matuska raidz_start_skip_writes(zio_t *zio)
2421e716630dSMartin Matuska {
2422e716630dSMartin Matuska vdev_t *vd = zio->io_vd;
2423e716630dSMartin Matuska uint64_t ashift = vd->vdev_top->vdev_ashift;
2424e716630dSMartin Matuska raidz_map_t *rm = zio->io_vsd;
2425e716630dSMartin Matuska ASSERT3U(rm->rm_nrows, ==, 1);
2426e716630dSMartin Matuska raidz_row_t *rr = rm->rm_row[0];
2427e716630dSMartin Matuska for (int c = 0; c < rr->rr_scols; c++) {
2428e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c];
2429e716630dSMartin Matuska vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2430e716630dSMartin Matuska if (rc->rc_size != 0)
2431e716630dSMartin Matuska continue;
2432e716630dSMartin Matuska ASSERT3P(rc->rc_abd, ==, NULL);
2433e716630dSMartin Matuska
2434e716630dSMartin Matuska ASSERT3U(rc->rc_offset, <,
2435e716630dSMartin Matuska cvd->vdev_psize - VDEV_LABEL_END_SIZE);
2436e716630dSMartin Matuska
2437e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset,
2438e716630dSMartin Matuska NULL, 1ULL << ashift, zio->io_type, zio->io_priority,
2439e716630dSMartin Matuska ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
2440e716630dSMartin Matuska }
2441e716630dSMartin Matuska }
2442e716630dSMartin Matuska
2443e716630dSMartin Matuska static void
vdev_raidz_io_start_read_row(zio_t * zio,raidz_row_t * rr,boolean_t forceparity)2444e716630dSMartin Matuska vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
24457877fdebSMatt Macy {
24467877fdebSMatt Macy vdev_t *vd = zio->io_vd;
24477877fdebSMatt Macy
24487877fdebSMatt Macy /*
24497877fdebSMatt Macy * Iterate over the columns in reverse order so that we hit the parity
24507877fdebSMatt Macy * last -- any errors along the way will force us to read the parity.
24517877fdebSMatt Macy */
24527877fdebSMatt Macy for (int c = rr->rr_cols - 1; c >= 0; c--) {
24537877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c];
24547877fdebSMatt Macy if (rc->rc_size == 0)
24557877fdebSMatt Macy continue;
24567877fdebSMatt Macy vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
24577877fdebSMatt Macy if (!vdev_readable(cvd)) {
24587877fdebSMatt Macy if (c >= rr->rr_firstdatacol)
24597877fdebSMatt Macy rr->rr_missingdata++;
24607877fdebSMatt Macy else
24617877fdebSMatt Macy rr->rr_missingparity++;
24627877fdebSMatt Macy rc->rc_error = SET_ERROR(ENXIO);
24637877fdebSMatt Macy rc->rc_tried = 1; /* don't even try */
24647877fdebSMatt Macy rc->rc_skipped = 1;
24657877fdebSMatt Macy continue;
24667877fdebSMatt Macy }
24677877fdebSMatt Macy if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
24687877fdebSMatt Macy if (c >= rr->rr_firstdatacol)
24697877fdebSMatt Macy rr->rr_missingdata++;
24707877fdebSMatt Macy else
24717877fdebSMatt Macy rr->rr_missingparity++;
24727877fdebSMatt Macy rc->rc_error = SET_ERROR(ESTALE);
24737877fdebSMatt Macy rc->rc_skipped = 1;
24747877fdebSMatt Macy continue;
24757877fdebSMatt Macy }
2476e716630dSMartin Matuska if (forceparity ||
2477e716630dSMartin Matuska c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
24787877fdebSMatt Macy (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
24797877fdebSMatt Macy zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
24807877fdebSMatt Macy rc->rc_offset, rc->rc_abd, rc->rc_size,
24817877fdebSMatt Macy zio->io_type, zio->io_priority, 0,
24827877fdebSMatt Macy vdev_raidz_child_done, rc));
24837877fdebSMatt Macy }
24847877fdebSMatt Macy }
24857877fdebSMatt Macy }
24867877fdebSMatt Macy
2487e716630dSMartin Matuska static void
vdev_raidz_io_start_read_phys_cols(zio_t * zio,raidz_map_t * rm)2488e716630dSMartin Matuska vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm)
2489e716630dSMartin Matuska {
2490e716630dSMartin Matuska vdev_t *vd = zio->io_vd;
2491e716630dSMartin Matuska
2492e716630dSMartin Matuska for (int i = 0; i < rm->rm_nphys_cols; i++) {
2493e716630dSMartin Matuska raidz_col_t *prc = &rm->rm_phys_col[i];
2494e716630dSMartin Matuska if (prc->rc_size == 0)
2495e716630dSMartin Matuska continue;
2496e716630dSMartin Matuska
2497e716630dSMartin Matuska ASSERT3U(prc->rc_devidx, ==, i);
2498e716630dSMartin Matuska vdev_t *cvd = vd->vdev_child[i];
2499e716630dSMartin Matuska if (!vdev_readable(cvd)) {
2500e716630dSMartin Matuska prc->rc_error = SET_ERROR(ENXIO);
2501e716630dSMartin Matuska prc->rc_tried = 1; /* don't even try */
2502e716630dSMartin Matuska prc->rc_skipped = 1;
2503e716630dSMartin Matuska continue;
2504e716630dSMartin Matuska }
2505e716630dSMartin Matuska if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2506e716630dSMartin Matuska prc->rc_error = SET_ERROR(ESTALE);
2507e716630dSMartin Matuska prc->rc_skipped = 1;
2508e716630dSMartin Matuska continue;
2509e716630dSMartin Matuska }
2510e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2511e716630dSMartin Matuska prc->rc_offset, prc->rc_abd, prc->rc_size,
2512e716630dSMartin Matuska zio->io_type, zio->io_priority, 0,
2513e716630dSMartin Matuska vdev_raidz_child_done, prc));
2514e716630dSMartin Matuska }
2515e716630dSMartin Matuska }
2516e716630dSMartin Matuska
2517e716630dSMartin Matuska static void
vdev_raidz_io_start_read(zio_t * zio,raidz_map_t * rm)2518e716630dSMartin Matuska vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm)
2519e716630dSMartin Matuska {
2520e716630dSMartin Matuska /*
2521e716630dSMartin Matuska * If there are multiple rows, we will be hitting
2522e716630dSMartin Matuska * all disks, so go ahead and read the parity so
2523e716630dSMartin Matuska * that we are reading in decent size chunks.
2524e716630dSMartin Matuska */
2525e716630dSMartin Matuska boolean_t forceparity = rm->rm_nrows > 1;
2526e716630dSMartin Matuska
2527e716630dSMartin Matuska if (rm->rm_phys_col) {
2528e716630dSMartin Matuska vdev_raidz_io_start_read_phys_cols(zio, rm);
2529e716630dSMartin Matuska } else {
2530e716630dSMartin Matuska for (int i = 0; i < rm->rm_nrows; i++) {
2531e716630dSMartin Matuska raidz_row_t *rr = rm->rm_row[i];
2532e716630dSMartin Matuska vdev_raidz_io_start_read_row(zio, rr, forceparity);
2533e716630dSMartin Matuska }
2534e716630dSMartin Matuska }
2535e716630dSMartin Matuska }
2536e716630dSMartin Matuska
2537eda14cbcSMatt Macy /*
2538eda14cbcSMatt Macy * Start an IO operation on a RAIDZ VDev
2539eda14cbcSMatt Macy *
2540eda14cbcSMatt Macy * Outline:
2541eda14cbcSMatt Macy * - For write operations:
2542eda14cbcSMatt Macy * 1. Generate the parity data
2543eda14cbcSMatt Macy * 2. Create child zio write operations to each column's vdev, for both
2544eda14cbcSMatt Macy * data and parity.
2545eda14cbcSMatt Macy * 3. If the column skips any sectors for padding, create optional dummy
2546eda14cbcSMatt Macy * write zio children for those areas to improve aggregation continuity.
2547eda14cbcSMatt Macy * - For read operations:
2548eda14cbcSMatt Macy * 1. Create child zio read operations to each data column's vdev to read
2549eda14cbcSMatt Macy * the range of data required for zio.
2550eda14cbcSMatt Macy * 2. If this is a scrub or resilver operation, or if any of the data
2551eda14cbcSMatt Macy * vdevs have had errors, then create zio read operations to the parity
2552eda14cbcSMatt Macy * columns' VDevs as well.
2553eda14cbcSMatt Macy */
2554eda14cbcSMatt Macy static void
vdev_raidz_io_start(zio_t * zio)2555eda14cbcSMatt Macy vdev_raidz_io_start(zio_t *zio)
2556eda14cbcSMatt Macy {
2557eda14cbcSMatt Macy vdev_t *vd = zio->io_vd;
2558eda14cbcSMatt Macy vdev_t *tvd = vd->vdev_top;
25597877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd;
2560e716630dSMartin Matuska raidz_map_t *rm;
2561eda14cbcSMatt Macy
2562e716630dSMartin Matuska uint64_t logical_width = vdev_raidz_get_logical_width(vdrz,
2563783d3ff6SMartin Matuska BP_GET_BIRTH(zio->io_bp));
2564e716630dSMartin Matuska if (logical_width != vdrz->vd_physical_width) {
2565e716630dSMartin Matuska zfs_locked_range_t *lr = NULL;
2566e716630dSMartin Matuska uint64_t synced_offset = UINT64_MAX;
2567e716630dSMartin Matuska uint64_t next_offset = UINT64_MAX;
2568e716630dSMartin Matuska boolean_t use_scratch = B_FALSE;
2569e716630dSMartin Matuska /*
2570e716630dSMartin Matuska * Note: when the expansion is completing, we set
2571e716630dSMartin Matuska * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync())
2572e716630dSMartin Matuska * in a later txg than when we last update spa_ubsync's state
2573e716630dSMartin Matuska * (see the end of spa_raidz_expand_thread()). Therefore we
2574e716630dSMartin Matuska * may see vre_state!=SCANNING before
2575e716630dSMartin Matuska * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected
2576e716630dSMartin Matuska * on disk, but the copying progress has been synced to disk
2577e716630dSMartin Matuska * (and reflected in spa_ubsync). In this case it's fine to
2578e716630dSMartin Matuska * treat the expansion as completed, since if we crash there's
2579e716630dSMartin Matuska * no additional copying to do.
2580e716630dSMartin Matuska */
2581e716630dSMartin Matuska if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
2582e716630dSMartin Matuska ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==,
2583e716630dSMartin Matuska &vdrz->vn_vre);
2584e716630dSMartin Matuska lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock,
2585e716630dSMartin Matuska zio->io_offset, zio->io_size, RL_READER);
2586e716630dSMartin Matuska use_scratch =
2587e716630dSMartin Matuska (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) ==
2588e716630dSMartin Matuska RRSS_SCRATCH_VALID);
2589e716630dSMartin Matuska synced_offset =
2590e716630dSMartin Matuska RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync);
2591e716630dSMartin Matuska next_offset = vdrz->vn_vre.vre_offset;
2592e716630dSMartin Matuska /*
2593e716630dSMartin Matuska * If we haven't resumed expanding since importing the
2594e716630dSMartin Matuska * pool, vre_offset won't have been set yet. In
2595e716630dSMartin Matuska * this case the next offset to be copied is the same
2596e716630dSMartin Matuska * as what was synced.
2597e716630dSMartin Matuska */
2598e716630dSMartin Matuska if (next_offset == UINT64_MAX) {
2599e716630dSMartin Matuska next_offset = synced_offset;
2600e716630dSMartin Matuska }
2601e716630dSMartin Matuska }
2602e716630dSMartin Matuska if (use_scratch) {
2603e716630dSMartin Matuska zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced="
2604e716630dSMartin Matuska "%lld next_offset=%lld use_scratch=%u",
2605e716630dSMartin Matuska zio,
2606e716630dSMartin Matuska zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ",
2607e716630dSMartin Matuska (long long)zio->io_offset,
2608e716630dSMartin Matuska (long long)synced_offset,
2609e716630dSMartin Matuska (long long)next_offset,
2610e716630dSMartin Matuska use_scratch);
2611e716630dSMartin Matuska }
2612e716630dSMartin Matuska
2613e716630dSMartin Matuska rm = vdev_raidz_map_alloc_expanded(zio,
2614e716630dSMartin Matuska tvd->vdev_ashift, vdrz->vd_physical_width,
2615e716630dSMartin Matuska logical_width, vdrz->vd_nparity,
2616e716630dSMartin Matuska synced_offset, next_offset, use_scratch);
2617e716630dSMartin Matuska rm->rm_lr = lr;
2618e716630dSMartin Matuska } else {
2619e716630dSMartin Matuska rm = vdev_raidz_map_alloc(zio,
2620e716630dSMartin Matuska tvd->vdev_ashift, logical_width, vdrz->vd_nparity);
2621e716630dSMartin Matuska }
2622e716630dSMartin Matuska rm->rm_original_width = vdrz->vd_original_width;
2623e716630dSMartin Matuska
2624f9693befSMartin Matuska zio->io_vsd = rm;
2625f9693befSMartin Matuska zio->io_vsd_ops = &vdev_raidz_vsd_ops;
2626eda14cbcSMatt Macy if (zio->io_type == ZIO_TYPE_WRITE) {
2627e716630dSMartin Matuska for (int i = 0; i < rm->rm_nrows; i++) {
2628e716630dSMartin Matuska vdev_raidz_io_start_write(zio, rm->rm_row[i]);
2629e716630dSMartin Matuska }
2630e716630dSMartin Matuska
2631e716630dSMartin Matuska if (logical_width == vdrz->vd_physical_width) {
2632e716630dSMartin Matuska raidz_start_skip_writes(zio);
2633e716630dSMartin Matuska }
26347877fdebSMatt Macy } else {
2635eda14cbcSMatt Macy ASSERT(zio->io_type == ZIO_TYPE_READ);
2636e716630dSMartin Matuska vdev_raidz_io_start_read(zio, rm);
2637eda14cbcSMatt Macy }
2638eda14cbcSMatt Macy
2639eda14cbcSMatt Macy zio_execute(zio);
2640eda14cbcSMatt Macy }
2641eda14cbcSMatt Macy
2642eda14cbcSMatt Macy /*
2643eda14cbcSMatt Macy * Report a checksum error for a child of a RAID-Z device.
2644eda14cbcSMatt Macy */
2645e92ffd9bSMartin Matuska void
vdev_raidz_checksum_error(zio_t * zio,raidz_col_t * rc,abd_t * bad_data)2646e92ffd9bSMartin Matuska vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
2647eda14cbcSMatt Macy {
2648eda14cbcSMatt Macy vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
2649eda14cbcSMatt Macy
26507877fdebSMatt Macy if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) &&
26517877fdebSMatt Macy zio->io_priority != ZIO_PRIORITY_REBUILD) {
2652eda14cbcSMatt Macy zio_bad_cksum_t zbc;
2653eda14cbcSMatt Macy raidz_map_t *rm = zio->io_vsd;
2654eda14cbcSMatt Macy
2655eda14cbcSMatt Macy zbc.zbc_has_cksum = 0;
2656eda14cbcSMatt Macy zbc.zbc_injected = rm->rm_ecksuminjected;
2657eda14cbcSMatt Macy
26582c48331dSMatt Macy mutex_enter(&vd->vdev_stat_lock);
26592c48331dSMatt Macy vd->vdev_stat.vs_checksum_errors++;
26602c48331dSMatt Macy mutex_exit(&vd->vdev_stat_lock);
2661bb2d13b6SMartin Matuska (void) zfs_ereport_post_checksum(zio->io_spa, vd,
2662bb2d13b6SMartin Matuska &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
2663bb2d13b6SMartin Matuska rc->rc_abd, bad_data, &zbc);
26642c48331dSMatt Macy }
2665eda14cbcSMatt Macy }
2666eda14cbcSMatt Macy
2667eda14cbcSMatt Macy /*
2668eda14cbcSMatt Macy * We keep track of whether or not there were any injected errors, so that
2669eda14cbcSMatt Macy * any ereports we generate can note it.
2670eda14cbcSMatt Macy */
2671eda14cbcSMatt Macy static int
raidz_checksum_verify(zio_t * zio)2672eda14cbcSMatt Macy raidz_checksum_verify(zio_t *zio)
2673eda14cbcSMatt Macy {
2674315ee00fSMartin Matuska zio_bad_cksum_t zbc = {0};
2675eda14cbcSMatt Macy raidz_map_t *rm = zio->io_vsd;
2676eda14cbcSMatt Macy
2677eda14cbcSMatt Macy int ret = zio_checksum_error(zio, &zbc);
267887bf66d4SMartin Matuska /*
267987bf66d4SMartin Matuska * Any Direct I/O read that has a checksum error must be treated as
268087bf66d4SMartin Matuska * suspicious as the contents of the buffer could be getting
268187bf66d4SMartin Matuska * manipulated while the I/O is taking place. The checksum verify error
268287bf66d4SMartin Matuska * will be reported to the top-level RAIDZ VDEV.
268387bf66d4SMartin Matuska */
268487bf66d4SMartin Matuska if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) {
268587bf66d4SMartin Matuska zio->io_error = ret;
268687bf66d4SMartin Matuska zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
268787bf66d4SMartin Matuska zio_dio_chksum_verify_error_report(zio);
268887bf66d4SMartin Matuska zio_checksum_verified(zio);
268987bf66d4SMartin Matuska return (0);
269087bf66d4SMartin Matuska }
269187bf66d4SMartin Matuska
2692eda14cbcSMatt Macy if (ret != 0 && zbc.zbc_injected != 0)
2693eda14cbcSMatt Macy rm->rm_ecksuminjected = 1;
2694eda14cbcSMatt Macy
2695eda14cbcSMatt Macy return (ret);
2696eda14cbcSMatt Macy }
2697eda14cbcSMatt Macy
2698eda14cbcSMatt Macy /*
2699eda14cbcSMatt Macy * Generate the parity from the data columns. If we tried and were able to
2700eda14cbcSMatt Macy * read the parity without error, verify that the generated parity matches the
2701eda14cbcSMatt Macy * data we read. If it doesn't, we fire off a checksum error. Return the
27027877fdebSMatt Macy * number of such failures.
2703eda14cbcSMatt Macy */
2704eda14cbcSMatt Macy static int
raidz_parity_verify(zio_t * zio,raidz_row_t * rr)27057877fdebSMatt Macy raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
2706eda14cbcSMatt Macy {
2707eda14cbcSMatt Macy abd_t *orig[VDEV_RAIDZ_MAXPARITY];
2708eda14cbcSMatt Macy int c, ret = 0;
27097877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd;
2710eda14cbcSMatt Macy raidz_col_t *rc;
2711eda14cbcSMatt Macy
2712eda14cbcSMatt Macy blkptr_t *bp = zio->io_bp;
2713eda14cbcSMatt Macy enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
2714eda14cbcSMatt Macy (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
2715eda14cbcSMatt Macy
2716eda14cbcSMatt Macy if (checksum == ZIO_CHECKSUM_NOPARITY)
2717eda14cbcSMatt Macy return (ret);
2718eda14cbcSMatt Macy
27197877fdebSMatt Macy for (c = 0; c < rr->rr_firstdatacol; c++) {
27207877fdebSMatt Macy rc = &rr->rr_col[c];
2721eda14cbcSMatt Macy if (!rc->rc_tried || rc->rc_error != 0)
2722eda14cbcSMatt Macy continue;
2723eda14cbcSMatt Macy
2724a0b956f5SMartin Matuska orig[c] = rc->rc_abd;
2725a0b956f5SMartin Matuska ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size);
2726a0b956f5SMartin Matuska rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
2727eda14cbcSMatt Macy }
2728eda14cbcSMatt Macy
27297877fdebSMatt Macy /*
2730e92ffd9bSMartin Matuska * Verify any empty sectors are zero filled to ensure the parity
2731e92ffd9bSMartin Matuska * is calculated correctly even if these non-data sectors are damaged.
2732e92ffd9bSMartin Matuska */
2733e92ffd9bSMartin Matuska if (rr->rr_nempty && rr->rr_abd_empty != NULL)
2734e92ffd9bSMartin Matuska ret += vdev_draid_map_verify_empty(zio, rr);
2735e92ffd9bSMartin Matuska
2736e92ffd9bSMartin Matuska /*
27377877fdebSMatt Macy * Regenerates parity even for !tried||rc_error!=0 columns. This
27387877fdebSMatt Macy * isn't harmful but it does have the side effect of fixing stuff
27397877fdebSMatt Macy * we didn't realize was necessary (i.e. even if we return 0).
27407877fdebSMatt Macy */
27417877fdebSMatt Macy vdev_raidz_generate_parity_row(rm, rr);
2742eda14cbcSMatt Macy
27437877fdebSMatt Macy for (c = 0; c < rr->rr_firstdatacol; c++) {
27447877fdebSMatt Macy rc = &rr->rr_col[c];
27457877fdebSMatt Macy
2746eda14cbcSMatt Macy if (!rc->rc_tried || rc->rc_error != 0)
2747eda14cbcSMatt Macy continue;
27487877fdebSMatt Macy
2749eda14cbcSMatt Macy if (abd_cmp(orig[c], rc->rc_abd) != 0) {
2750e716630dSMartin Matuska zfs_dbgmsg("found error on col=%u devidx=%u off %llx",
2751e716630dSMartin Matuska c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset);
2752e92ffd9bSMartin Matuska vdev_raidz_checksum_error(zio, rc, orig[c]);
2753eda14cbcSMatt Macy rc->rc_error = SET_ERROR(ECKSUM);
2754eda14cbcSMatt Macy ret++;
2755eda14cbcSMatt Macy }
2756eda14cbcSMatt Macy abd_free(orig[c]);
2757eda14cbcSMatt Macy }
2758eda14cbcSMatt Macy
2759eda14cbcSMatt Macy return (ret);
2760eda14cbcSMatt Macy }
2761eda14cbcSMatt Macy
2762eda14cbcSMatt Macy static int
vdev_raidz_worst_error(raidz_row_t * rr)27637877fdebSMatt Macy vdev_raidz_worst_error(raidz_row_t *rr)
2764eda14cbcSMatt Macy {
2765eda14cbcSMatt Macy int error = 0;
2766eda14cbcSMatt Macy
2767e716630dSMartin Matuska for (int c = 0; c < rr->rr_cols; c++) {
27687877fdebSMatt Macy error = zio_worst_error(error, rr->rr_col[c].rc_error);
2769e716630dSMartin Matuska error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error);
2770e716630dSMartin Matuska }
2771eda14cbcSMatt Macy
2772eda14cbcSMatt Macy return (error);
2773eda14cbcSMatt Macy }
2774eda14cbcSMatt Macy
2775eda14cbcSMatt Macy static void
vdev_raidz_io_done_verified(zio_t * zio,raidz_row_t * rr)27767877fdebSMatt Macy vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
2777eda14cbcSMatt Macy {
2778eda14cbcSMatt Macy int unexpected_errors = 0;
2779eda14cbcSMatt Macy int parity_errors = 0;
2780eda14cbcSMatt Macy int parity_untried = 0;
2781eda14cbcSMatt Macy int data_errors = 0;
2782eda14cbcSMatt Macy
27837877fdebSMatt Macy ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
2784eda14cbcSMatt Macy
27857877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) {
27867877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c];
2787eda14cbcSMatt Macy
2788eda14cbcSMatt Macy if (rc->rc_error) {
27897877fdebSMatt Macy if (c < rr->rr_firstdatacol)
2790eda14cbcSMatt Macy parity_errors++;
2791eda14cbcSMatt Macy else
2792eda14cbcSMatt Macy data_errors++;
2793eda14cbcSMatt Macy
2794eda14cbcSMatt Macy if (!rc->rc_skipped)
2795eda14cbcSMatt Macy unexpected_errors++;
27967877fdebSMatt Macy } else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
2797eda14cbcSMatt Macy parity_untried++;
2798eda14cbcSMatt Macy }
2799a0b956f5SMartin Matuska
2800a0b956f5SMartin Matuska if (rc->rc_force_repair)
2801a0b956f5SMartin Matuska unexpected_errors++;
2802eda14cbcSMatt Macy }
2803eda14cbcSMatt Macy
2804eda14cbcSMatt Macy /*
28057877fdebSMatt Macy * If we read more parity disks than were used for
28067877fdebSMatt Macy * reconstruction, confirm that the other parity disks produced
28077877fdebSMatt Macy * correct data.
28087877fdebSMatt Macy *
28097877fdebSMatt Macy * Note that we also regenerate parity when resilvering so we
28107877fdebSMatt Macy * can write it out to failed devices later.
28117877fdebSMatt Macy */
28127877fdebSMatt Macy if (parity_errors + parity_untried <
28137877fdebSMatt Macy rr->rr_firstdatacol - data_errors ||
28147877fdebSMatt Macy (zio->io_flags & ZIO_FLAG_RESILVER)) {
28157877fdebSMatt Macy int n = raidz_parity_verify(zio, rr);
28167877fdebSMatt Macy unexpected_errors += n;
28177877fdebSMatt Macy }
28187877fdebSMatt Macy
28197877fdebSMatt Macy if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
28207877fdebSMatt Macy (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) {
28217877fdebSMatt Macy /*
28227877fdebSMatt Macy * Use the good data we have in hand to repair damaged children.
28237877fdebSMatt Macy */
28247877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) {
28257877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c];
28267877fdebSMatt Macy vdev_t *vd = zio->io_vd;
28277877fdebSMatt Macy vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
28287877fdebSMatt Macy
282916038816SMartin Matuska if (!rc->rc_allow_repair) {
283016038816SMartin Matuska continue;
283116038816SMartin Matuska } else if (!rc->rc_force_repair &&
283216038816SMartin Matuska (rc->rc_error == 0 || rc->rc_size == 0)) {
28337877fdebSMatt Macy continue;
28347877fdebSMatt Macy }
283587bf66d4SMartin Matuska /*
283687bf66d4SMartin Matuska * We do not allow self healing for Direct I/O reads.
283787bf66d4SMartin Matuska * See comment in vdev_raid_row_alloc().
283887bf66d4SMartin Matuska */
283987bf66d4SMartin Matuska ASSERT0(zio->io_flags & ZIO_FLAG_DIO_READ);
28407877fdebSMatt Macy
2841e716630dSMartin Matuska zfs_dbgmsg("zio=%px repairing c=%u devidx=%u "
2842e716630dSMartin Matuska "offset=%llx",
2843e716630dSMartin Matuska zio, c, rc->rc_devidx, (long long)rc->rc_offset);
2844e716630dSMartin Matuska
28457877fdebSMatt Macy zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
28467877fdebSMatt Macy rc->rc_offset, rc->rc_abd, rc->rc_size,
28477877fdebSMatt Macy ZIO_TYPE_WRITE,
28487877fdebSMatt Macy zio->io_priority == ZIO_PRIORITY_REBUILD ?
28497877fdebSMatt Macy ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
28507877fdebSMatt Macy ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
28517877fdebSMatt Macy ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
28527877fdebSMatt Macy }
28537877fdebSMatt Macy }
2854e716630dSMartin Matuska
2855e716630dSMartin Matuska /*
2856e716630dSMartin Matuska * Scrub or resilver i/o's: overwrite any shadow locations with the
2857e716630dSMartin Matuska * good data. This ensures that if we've already copied this sector,
2858e716630dSMartin Matuska * it will be corrected if it was damaged. This writes more than is
2859e716630dSMartin Matuska * necessary, but since expansion is paused during scrub/resilver, at
2860e716630dSMartin Matuska * most a single row will have a shadow location.
2861e716630dSMartin Matuska */
2862e716630dSMartin Matuska if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2863e716630dSMartin Matuska (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) {
2864e716630dSMartin Matuska for (int c = 0; c < rr->rr_cols; c++) {
2865e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c];
2866e716630dSMartin Matuska vdev_t *vd = zio->io_vd;
2867e716630dSMartin Matuska
2868e716630dSMartin Matuska if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0)
2869e716630dSMartin Matuska continue;
2870e716630dSMartin Matuska vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx];
2871e716630dSMartin Matuska
2872e716630dSMartin Matuska /*
2873e716630dSMartin Matuska * Note: We don't want to update the repair stats
2874e716630dSMartin Matuska * because that would incorrectly indicate that there
2875e716630dSMartin Matuska * was bad data to repair, which we aren't sure about.
2876e716630dSMartin Matuska * By clearing the SCAN_THREAD flag, we prevent this
2877e716630dSMartin Matuska * from happening, despite having the REPAIR flag set.
2878e716630dSMartin Matuska * We need to set SELF_HEAL so that this i/o can't be
2879e716630dSMartin Matuska * bypassed by zio_vdev_io_start().
2880e716630dSMartin Matuska */
2881e716630dSMartin Matuska zio_t *cio = zio_vdev_child_io(zio, NULL, cvd,
2882e716630dSMartin Matuska rc->rc_shadow_offset, rc->rc_abd, rc->rc_size,
2883e716630dSMartin Matuska ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2884e716630dSMartin Matuska ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
2885e716630dSMartin Matuska NULL, NULL);
2886e716630dSMartin Matuska cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD;
2887e716630dSMartin Matuska zio_nowait(cio);
2888e716630dSMartin Matuska }
2889e716630dSMartin Matuska }
28907877fdebSMatt Macy }
28917877fdebSMatt Macy
28927877fdebSMatt Macy static void
raidz_restore_orig_data(raidz_map_t * rm)28937877fdebSMatt Macy raidz_restore_orig_data(raidz_map_t *rm)
28947877fdebSMatt Macy {
28957877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) {
28967877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i];
28977877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) {
28987877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c];
28997877fdebSMatt Macy if (rc->rc_need_orig_restore) {
2900f9693befSMartin Matuska abd_copy(rc->rc_abd,
29017877fdebSMatt Macy rc->rc_orig_data, rc->rc_size);
29027877fdebSMatt Macy rc->rc_need_orig_restore = B_FALSE;
29037877fdebSMatt Macy }
29047877fdebSMatt Macy }
29057877fdebSMatt Macy }
29067877fdebSMatt Macy }
29077877fdebSMatt Macy
29087877fdebSMatt Macy /*
2909e716630dSMartin Matuska * During raidz_reconstruct() for expanded VDEV, we need special consideration
2910e716630dSMartin Matuska * failure simulations. See note in raidz_reconstruct() on simulating failure
2911e716630dSMartin Matuska * of a pre-expansion device.
2912e716630dSMartin Matuska *
2913e716630dSMartin Matuska * Treating logical child i as failed, return TRUE if the given column should
2914e716630dSMartin Matuska * be treated as failed. The idea of logical children allows us to imagine
2915e716630dSMartin Matuska * that a disk silently failed before a RAIDZ expansion (reads from this disk
2916e716630dSMartin Matuska * succeed but return the wrong data). Since the expansion doesn't verify
2917e716630dSMartin Matuska * checksums, the incorrect data will be moved to new locations spread among
2918e716630dSMartin Matuska * the children (going diagonally across them).
2919e716630dSMartin Matuska *
2920e716630dSMartin Matuska * Higher "logical child failures" (values of `i`) indicate these
2921e716630dSMartin Matuska * "pre-expansion failures". The first physical_width values imagine that a
2922e716630dSMartin Matuska * current child failed; the next physical_width-1 values imagine that a
2923e716630dSMartin Matuska * child failed before the most recent expansion; the next physical_width-2
2924e716630dSMartin Matuska * values imagine a child failed in the expansion before that, etc.
2925e716630dSMartin Matuska */
2926e716630dSMartin Matuska static boolean_t
raidz_simulate_failure(int physical_width,int original_width,int ashift,int i,raidz_col_t * rc)2927e716630dSMartin Matuska raidz_simulate_failure(int physical_width, int original_width, int ashift,
2928e716630dSMartin Matuska int i, raidz_col_t *rc)
2929e716630dSMartin Matuska {
2930e716630dSMartin Matuska uint64_t sector_id =
2931e716630dSMartin Matuska physical_width * (rc->rc_offset >> ashift) +
2932e716630dSMartin Matuska rc->rc_devidx;
2933e716630dSMartin Matuska
2934e716630dSMartin Matuska for (int w = physical_width; w >= original_width; w--) {
2935e716630dSMartin Matuska if (i < w) {
2936e716630dSMartin Matuska return (sector_id % w == i);
2937e716630dSMartin Matuska } else {
2938e716630dSMartin Matuska i -= w;
2939e716630dSMartin Matuska }
2940e716630dSMartin Matuska }
2941e716630dSMartin Matuska ASSERT(!"invalid logical child id");
2942e716630dSMartin Matuska return (B_FALSE);
2943e716630dSMartin Matuska }
2944e716630dSMartin Matuska
2945e716630dSMartin Matuska /*
29467877fdebSMatt Macy * returns EINVAL if reconstruction of the block will not be possible
29477877fdebSMatt Macy * returns ECKSUM if this specific reconstruction failed
29487877fdebSMatt Macy * returns 0 on successful reconstruction
29497877fdebSMatt Macy */
29507877fdebSMatt Macy static int
raidz_reconstruct(zio_t * zio,int * ltgts,int ntgts,int nparity)29517877fdebSMatt Macy raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
29527877fdebSMatt Macy {
29537877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd;
2954e716630dSMartin Matuska int physical_width = zio->io_vd->vdev_children;
2955e716630dSMartin Matuska int original_width = (rm->rm_original_width != 0) ?
2956e716630dSMartin Matuska rm->rm_original_width : physical_width;
2957e716630dSMartin Matuska int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT;
2958e716630dSMartin Matuska
2959e716630dSMartin Matuska if (dbgmsg) {
2960e716630dSMartin Matuska zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u "
2961e716630dSMartin Matuska "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts);
2962e716630dSMartin Matuska }
29637877fdebSMatt Macy
29647877fdebSMatt Macy /* Reconstruct each row */
29657877fdebSMatt Macy for (int r = 0; r < rm->rm_nrows; r++) {
29667877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[r];
29677877fdebSMatt Macy int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */
29687877fdebSMatt Macy int t = 0;
29697877fdebSMatt Macy int dead = 0;
29707877fdebSMatt Macy int dead_data = 0;
29717877fdebSMatt Macy
2972e716630dSMartin Matuska if (dbgmsg)
2973e716630dSMartin Matuska zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r);
2974e716630dSMartin Matuska
29757877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) {
29767877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c];
29777877fdebSMatt Macy ASSERT0(rc->rc_need_orig_restore);
29787877fdebSMatt Macy if (rc->rc_error != 0) {
29797877fdebSMatt Macy dead++;
29807877fdebSMatt Macy if (c >= nparity)
29817877fdebSMatt Macy dead_data++;
29827877fdebSMatt Macy continue;
29837877fdebSMatt Macy }
29847877fdebSMatt Macy if (rc->rc_size == 0)
29857877fdebSMatt Macy continue;
29867877fdebSMatt Macy for (int lt = 0; lt < ntgts; lt++) {
2987e716630dSMartin Matuska if (raidz_simulate_failure(physical_width,
2988e716630dSMartin Matuska original_width,
2989e716630dSMartin Matuska zio->io_vd->vdev_top->vdev_ashift,
2990e716630dSMartin Matuska ltgts[lt], rc)) {
29917877fdebSMatt Macy if (rc->rc_orig_data == NULL) {
29927877fdebSMatt Macy rc->rc_orig_data =
2993f9693befSMartin Matuska abd_alloc_linear(
2994f9693befSMartin Matuska rc->rc_size, B_TRUE);
2995f9693befSMartin Matuska abd_copy(rc->rc_orig_data,
29967877fdebSMatt Macy rc->rc_abd, rc->rc_size);
29977877fdebSMatt Macy }
29987877fdebSMatt Macy rc->rc_need_orig_restore = B_TRUE;
29997877fdebSMatt Macy
30007877fdebSMatt Macy dead++;
30017877fdebSMatt Macy if (c >= nparity)
30027877fdebSMatt Macy dead_data++;
3003e716630dSMartin Matuska /*
3004e716630dSMartin Matuska * Note: simulating failure of a
3005e716630dSMartin Matuska * pre-expansion device can hit more
3006e716630dSMartin Matuska * than one column, in which case we
3007e716630dSMartin Matuska * might try to simulate more failures
3008e716630dSMartin Matuska * than can be reconstructed, which is
3009e716630dSMartin Matuska * also more than the size of my_tgts.
3010e716630dSMartin Matuska * This check prevents accessing past
3011e716630dSMartin Matuska * the end of my_tgts. The "dead >
3012e716630dSMartin Matuska * nparity" check below will fail this
3013e716630dSMartin Matuska * reconstruction attempt.
3014e716630dSMartin Matuska */
3015e716630dSMartin Matuska if (t < VDEV_RAIDZ_MAXPARITY) {
30167877fdebSMatt Macy my_tgts[t++] = c;
3017e716630dSMartin Matuska if (dbgmsg) {
3018e716630dSMartin Matuska zfs_dbgmsg("simulating "
3019e716630dSMartin Matuska "failure of col %u "
3020e716630dSMartin Matuska "devidx %u", c,
3021e716630dSMartin Matuska (int)rc->rc_devidx);
3022e716630dSMartin Matuska }
3023e716630dSMartin Matuska }
30247877fdebSMatt Macy break;
30257877fdebSMatt Macy }
30267877fdebSMatt Macy }
30277877fdebSMatt Macy }
30287877fdebSMatt Macy if (dead > nparity) {
30297877fdebSMatt Macy /* reconstruction not possible */
3030e716630dSMartin Matuska if (dbgmsg) {
3031e716630dSMartin Matuska zfs_dbgmsg("reconstruction not possible; "
3032e716630dSMartin Matuska "too many failures");
3033e716630dSMartin Matuska }
30347877fdebSMatt Macy raidz_restore_orig_data(rm);
30357877fdebSMatt Macy return (EINVAL);
30367877fdebSMatt Macy }
30377877fdebSMatt Macy if (dead_data > 0)
3038f9693befSMartin Matuska vdev_raidz_reconstruct_row(rm, rr, my_tgts, t);
30397877fdebSMatt Macy }
30407877fdebSMatt Macy
30417877fdebSMatt Macy /* Check for success */
30427877fdebSMatt Macy if (raidz_checksum_verify(zio) == 0) {
304387bf66d4SMartin Matuska if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
304487bf66d4SMartin Matuska return (0);
30457877fdebSMatt Macy
30467877fdebSMatt Macy /* Reconstruction succeeded - report errors */
30477877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) {
30487877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i];
30497877fdebSMatt Macy
30507877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) {
30517877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c];
30527877fdebSMatt Macy if (rc->rc_need_orig_restore) {
30537877fdebSMatt Macy /*
30547877fdebSMatt Macy * Note: if this is a parity column,
30557877fdebSMatt Macy * we don't really know if it's wrong.
30567877fdebSMatt Macy * We need to let
30577877fdebSMatt Macy * vdev_raidz_io_done_verified() check
30587877fdebSMatt Macy * it, and if we set rc_error, it will
30597877fdebSMatt Macy * think that it is a "known" error
30607877fdebSMatt Macy * that doesn't need to be checked
30617877fdebSMatt Macy * or corrected.
30627877fdebSMatt Macy */
30637877fdebSMatt Macy if (rc->rc_error == 0 &&
30647877fdebSMatt Macy c >= rr->rr_firstdatacol) {
3065e92ffd9bSMartin Matuska vdev_raidz_checksum_error(zio,
3066f9693befSMartin Matuska rc, rc->rc_orig_data);
30677877fdebSMatt Macy rc->rc_error =
30687877fdebSMatt Macy SET_ERROR(ECKSUM);
30697877fdebSMatt Macy }
30707877fdebSMatt Macy rc->rc_need_orig_restore = B_FALSE;
30717877fdebSMatt Macy }
30727877fdebSMatt Macy }
30737877fdebSMatt Macy
30747877fdebSMatt Macy vdev_raidz_io_done_verified(zio, rr);
30757877fdebSMatt Macy }
30767877fdebSMatt Macy
30777877fdebSMatt Macy zio_checksum_verified(zio);
30787877fdebSMatt Macy
3079e716630dSMartin Matuska if (dbgmsg) {
3080e716630dSMartin Matuska zfs_dbgmsg("reconstruction successful "
3081e716630dSMartin Matuska "(checksum verified)");
3082e716630dSMartin Matuska }
30837877fdebSMatt Macy return (0);
30847877fdebSMatt Macy }
30857877fdebSMatt Macy
30867877fdebSMatt Macy /* Reconstruction failed - restore original data */
30877877fdebSMatt Macy raidz_restore_orig_data(rm);
3088e716630dSMartin Matuska if (dbgmsg) {
3089e716630dSMartin Matuska zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum "
3090e716630dSMartin Matuska "failed", zio);
3091e716630dSMartin Matuska }
30927877fdebSMatt Macy return (ECKSUM);
30937877fdebSMatt Macy }
30947877fdebSMatt Macy
30957877fdebSMatt Macy /*
30967877fdebSMatt Macy * Iterate over all combinations of N bad vdevs and attempt a reconstruction.
30977877fdebSMatt Macy * Note that the algorithm below is non-optimal because it doesn't take into
30987877fdebSMatt Macy * account how reconstruction is actually performed. For example, with
30997877fdebSMatt Macy * triple-parity RAID-Z the reconstruction procedure is the same if column 4
31007877fdebSMatt Macy * is targeted as invalid as if columns 1 and 4 are targeted since in both
31017877fdebSMatt Macy * cases we'd only use parity information in column 0.
31027877fdebSMatt Macy *
31037877fdebSMatt Macy * The order that we find the various possible combinations of failed
31047877fdebSMatt Macy * disks is dictated by these rules:
31057877fdebSMatt Macy * - Examine each "slot" (the "i" in tgts[i])
3106e716630dSMartin Matuska * - Try to increment this slot (tgts[i] += 1)
31077877fdebSMatt Macy * - if we can't increment because it runs into the next slot,
31087877fdebSMatt Macy * reset our slot to the minimum, and examine the next slot
31097877fdebSMatt Macy *
31107877fdebSMatt Macy * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
31117877fdebSMatt Macy * 3 columns to reconstruct), we will generate the following sequence:
31127877fdebSMatt Macy *
31137877fdebSMatt Macy * STATE ACTION
31147877fdebSMatt Macy * 0 1 2 special case: skip since these are all parity
31157877fdebSMatt Macy * 0 1 3 first slot: reset to 0; middle slot: increment to 2
31167877fdebSMatt Macy * 0 2 3 first slot: increment to 1
31177877fdebSMatt Macy * 1 2 3 first: reset to 0; middle: reset to 1; last: increment to 4
31187877fdebSMatt Macy * 0 1 4 first: reset to 0; middle: increment to 2
31197877fdebSMatt Macy * 0 2 4 first: increment to 1
31207877fdebSMatt Macy * 1 2 4 first: reset to 0; middle: increment to 3
31217877fdebSMatt Macy * 0 3 4 first: increment to 1
31227877fdebSMatt Macy * 1 3 4 first: increment to 2
31237877fdebSMatt Macy * 2 3 4 first: reset to 0; middle: reset to 1; last: increment to 5
31247877fdebSMatt Macy * 0 1 5 first: reset to 0; middle: increment to 2
31257877fdebSMatt Macy * 0 2 5 first: increment to 1
31267877fdebSMatt Macy * 1 2 5 first: reset to 0; middle: increment to 3
31277877fdebSMatt Macy * 0 3 5 first: increment to 1
31287877fdebSMatt Macy * 1 3 5 first: increment to 2
31297877fdebSMatt Macy * 2 3 5 first: reset to 0; middle: increment to 4
31307877fdebSMatt Macy * 0 4 5 first: increment to 1
31317877fdebSMatt Macy * 1 4 5 first: increment to 2
31327877fdebSMatt Macy * 2 4 5 first: increment to 3
31337877fdebSMatt Macy * 3 4 5 done
31347877fdebSMatt Macy *
313516038816SMartin Matuska * This strategy works for dRAID but is less efficient when there are a large
31367877fdebSMatt Macy * number of child vdevs and therefore permutations to check. Furthermore,
3137e716630dSMartin Matuska * since the raidz_map_t rows likely do not overlap, reconstruction would be
31387877fdebSMatt Macy * possible as long as there are no more than nparity data errors per row.
31397877fdebSMatt Macy * These additional permutations are not currently checked but could be as
31407877fdebSMatt Macy * a future improvement.
3141e716630dSMartin Matuska *
3142e716630dSMartin Matuska * Returns 0 on success, ECKSUM on failure.
31437877fdebSMatt Macy */
31447877fdebSMatt Macy static int
vdev_raidz_combrec(zio_t * zio)31457877fdebSMatt Macy vdev_raidz_combrec(zio_t *zio)
31467877fdebSMatt Macy {
31477877fdebSMatt Macy int nparity = vdev_get_nparity(zio->io_vd);
31487877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd;
3149e716630dSMartin Matuska int physical_width = zio->io_vd->vdev_children;
3150e716630dSMartin Matuska int original_width = (rm->rm_original_width != 0) ?
3151e716630dSMartin Matuska rm->rm_original_width : physical_width;
31527877fdebSMatt Macy
31537877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) {
31547877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i];
31557877fdebSMatt Macy int total_errors = 0;
31567877fdebSMatt Macy
31577877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) {
31587877fdebSMatt Macy if (rr->rr_col[c].rc_error)
31597877fdebSMatt Macy total_errors++;
31607877fdebSMatt Macy }
31617877fdebSMatt Macy
31627877fdebSMatt Macy if (total_errors > nparity)
31637877fdebSMatt Macy return (vdev_raidz_worst_error(rr));
31647877fdebSMatt Macy }
31657877fdebSMatt Macy
31667877fdebSMatt Macy for (int num_failures = 1; num_failures <= nparity; num_failures++) {
31677877fdebSMatt Macy int tstore[VDEV_RAIDZ_MAXPARITY + 2];
31687877fdebSMatt Macy int *ltgts = &tstore[1]; /* value is logical child ID */
31697877fdebSMatt Macy
3170e716630dSMartin Matuska
3171e716630dSMartin Matuska /*
3172e716630dSMartin Matuska * Determine number of logical children, n. See comment
3173e716630dSMartin Matuska * above raidz_simulate_failure().
3174e716630dSMartin Matuska */
3175e716630dSMartin Matuska int n = 0;
3176e716630dSMartin Matuska for (int w = physical_width;
3177e716630dSMartin Matuska w >= original_width; w--) {
3178e716630dSMartin Matuska n += w;
3179e716630dSMartin Matuska }
31807877fdebSMatt Macy
31817877fdebSMatt Macy ASSERT3U(num_failures, <=, nparity);
31827877fdebSMatt Macy ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);
31837877fdebSMatt Macy
31847877fdebSMatt Macy /* Handle corner cases in combrec logic */
31857877fdebSMatt Macy ltgts[-1] = -1;
31867877fdebSMatt Macy for (int i = 0; i < num_failures; i++) {
31877877fdebSMatt Macy ltgts[i] = i;
31887877fdebSMatt Macy }
31897877fdebSMatt Macy ltgts[num_failures] = n;
31907877fdebSMatt Macy
31917877fdebSMatt Macy for (;;) {
31927877fdebSMatt Macy int err = raidz_reconstruct(zio, ltgts, num_failures,
31937877fdebSMatt Macy nparity);
31947877fdebSMatt Macy if (err == EINVAL) {
31957877fdebSMatt Macy /*
31967877fdebSMatt Macy * Reconstruction not possible with this #
31977877fdebSMatt Macy * failures; try more failures.
31987877fdebSMatt Macy */
31997877fdebSMatt Macy break;
32007877fdebSMatt Macy } else if (err == 0)
32017877fdebSMatt Macy return (0);
32027877fdebSMatt Macy
32037877fdebSMatt Macy /* Compute next targets to try */
32047877fdebSMatt Macy for (int t = 0; ; t++) {
32057877fdebSMatt Macy ASSERT3U(t, <, num_failures);
32067877fdebSMatt Macy ltgts[t]++;
32077877fdebSMatt Macy if (ltgts[t] == n) {
32087877fdebSMatt Macy /* try more failures */
32097877fdebSMatt Macy ASSERT3U(t, ==, num_failures - 1);
3210e716630dSMartin Matuska if (zfs_flags &
3211e716630dSMartin Matuska ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
3212e716630dSMartin Matuska zfs_dbgmsg("reconstruction "
3213e716630dSMartin Matuska "failed for num_failures="
3214e716630dSMartin Matuska "%u; tried all "
3215e716630dSMartin Matuska "combinations",
3216e716630dSMartin Matuska num_failures);
3217e716630dSMartin Matuska }
32187877fdebSMatt Macy break;
32197877fdebSMatt Macy }
32207877fdebSMatt Macy
32217877fdebSMatt Macy ASSERT3U(ltgts[t], <, n);
32227877fdebSMatt Macy ASSERT3U(ltgts[t], <=, ltgts[t + 1]);
32237877fdebSMatt Macy
32247877fdebSMatt Macy /*
32257877fdebSMatt Macy * If that spot is available, we're done here.
32267877fdebSMatt Macy * Try the next combination.
32277877fdebSMatt Macy */
32287877fdebSMatt Macy if (ltgts[t] != ltgts[t + 1])
3229e716630dSMartin Matuska break; // found next combination
32307877fdebSMatt Macy
32317877fdebSMatt Macy /*
32327877fdebSMatt Macy * Otherwise, reset this tgt to the minimum,
32337877fdebSMatt Macy * and move on to the next tgt.
32347877fdebSMatt Macy */
32357877fdebSMatt Macy ltgts[t] = ltgts[t - 1] + 1;
32367877fdebSMatt Macy ASSERT3U(ltgts[t], ==, t);
32377877fdebSMatt Macy }
32387877fdebSMatt Macy
32397877fdebSMatt Macy /* Increase the number of failures and keep trying. */
32407877fdebSMatt Macy if (ltgts[num_failures - 1] == n)
32417877fdebSMatt Macy break;
32427877fdebSMatt Macy }
32437877fdebSMatt Macy }
3244e716630dSMartin Matuska if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
3245e716630dSMartin Matuska zfs_dbgmsg("reconstruction failed for all num_failures");
32467877fdebSMatt Macy return (ECKSUM);
32477877fdebSMatt Macy }
32487877fdebSMatt Macy
32497877fdebSMatt Macy void
vdev_raidz_reconstruct(raidz_map_t * rm,const int * t,int nt)32507877fdebSMatt Macy vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
32517877fdebSMatt Macy {
32527877fdebSMatt Macy for (uint64_t row = 0; row < rm->rm_nrows; row++) {
32537877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[row];
32547877fdebSMatt Macy vdev_raidz_reconstruct_row(rm, rr, t, nt);
32557877fdebSMatt Macy }
32567877fdebSMatt Macy }
32577877fdebSMatt Macy
32587877fdebSMatt Macy /*
32597877fdebSMatt Macy * Complete a write IO operation on a RAIDZ VDev
32607877fdebSMatt Macy *
32617877fdebSMatt Macy * Outline:
32627877fdebSMatt Macy * 1. Check for errors on the child IOs.
32637877fdebSMatt Macy * 2. Return, setting an error code if too few child VDevs were written
32647877fdebSMatt Macy * to reconstruct the data later. Note that partial writes are
32657877fdebSMatt Macy * considered successful if they can be reconstructed at all.
32667877fdebSMatt Macy */
32677877fdebSMatt Macy static void
vdev_raidz_io_done_write_impl(zio_t * zio,raidz_row_t * rr)32687877fdebSMatt Macy vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
32697877fdebSMatt Macy {
3270e716630dSMartin Matuska int normal_errors = 0;
3271e716630dSMartin Matuska int shadow_errors = 0;
32727877fdebSMatt Macy
32737877fdebSMatt Macy ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
32747877fdebSMatt Macy ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
32757877fdebSMatt Macy ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
32767877fdebSMatt Macy
32777877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) {
32787877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c];
32797877fdebSMatt Macy
3280e716630dSMartin Matuska if (rc->rc_error != 0) {
32817877fdebSMatt Macy ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
3282e716630dSMartin Matuska normal_errors++;
3283e716630dSMartin Matuska }
3284e716630dSMartin Matuska if (rc->rc_shadow_error != 0) {
3285e716630dSMartin Matuska ASSERT(rc->rc_shadow_error != ECKSUM);
3286e716630dSMartin Matuska shadow_errors++;
32877877fdebSMatt Macy }
32887877fdebSMatt Macy }
32897877fdebSMatt Macy
32907877fdebSMatt Macy /*
32917877fdebSMatt Macy * Treat partial writes as a success. If we couldn't write enough
3292e716630dSMartin Matuska * columns to reconstruct the data, the I/O failed. Otherwise, good
3293e716630dSMartin Matuska * enough. Note that in the case of a shadow write (during raidz
3294e716630dSMartin Matuska * expansion), depending on if we crash, either the normal (old) or
3295e716630dSMartin Matuska * shadow (new) location may become the "real" version of the block,
3296e716630dSMartin Matuska * so both locations must have sufficient redundancy.
3297eda14cbcSMatt Macy *
3298eda14cbcSMatt Macy * Now that we support write reallocation, it would be better
3299eda14cbcSMatt Macy * to treat partial failure as real failure unless there are
3300eda14cbcSMatt Macy * no non-degraded top-level vdevs left, and not update DTLs
3301eda14cbcSMatt Macy * if we intend to reallocate.
3302eda14cbcSMatt Macy */
3303e716630dSMartin Matuska if (normal_errors > rr->rr_firstdatacol ||
3304e716630dSMartin Matuska shadow_errors > rr->rr_firstdatacol) {
33057877fdebSMatt Macy zio->io_error = zio_worst_error(zio->io_error,
33067877fdebSMatt Macy vdev_raidz_worst_error(rr));
33077877fdebSMatt Macy }
3308eda14cbcSMatt Macy }
3309eda14cbcSMatt Macy
3310f9693befSMartin Matuska static void
vdev_raidz_io_done_reconstruct_known_missing(zio_t * zio,raidz_map_t * rm,raidz_row_t * rr)33117877fdebSMatt Macy vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
33127877fdebSMatt Macy raidz_row_t *rr)
33137877fdebSMatt Macy {
33147877fdebSMatt Macy int parity_errors = 0;
33157877fdebSMatt Macy int parity_untried = 0;
33167877fdebSMatt Macy int data_errors = 0;
33177877fdebSMatt Macy int total_errors = 0;
33187877fdebSMatt Macy
33197877fdebSMatt Macy ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
33207877fdebSMatt Macy ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
33217877fdebSMatt Macy
33227877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) {
33237877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c];
33247877fdebSMatt Macy
3325a0b956f5SMartin Matuska /*
3326a0b956f5SMartin Matuska * If scrubbing and a replacing/sparing child vdev determined
3327a0b956f5SMartin Matuska * that not all of its children have an identical copy of the
3328a0b956f5SMartin Matuska * data, then clear the error so the column is treated like
3329a0b956f5SMartin Matuska * any other read and force a repair to correct the damage.
3330a0b956f5SMartin Matuska */
3331a0b956f5SMartin Matuska if (rc->rc_error == ECKSUM) {
3332a0b956f5SMartin Matuska ASSERT(zio->io_flags & ZIO_FLAG_SCRUB);
3333a0b956f5SMartin Matuska vdev_raidz_checksum_error(zio, rc, rc->rc_abd);
3334a0b956f5SMartin Matuska rc->rc_force_repair = 1;
3335a0b956f5SMartin Matuska rc->rc_error = 0;
3336a0b956f5SMartin Matuska }
33377877fdebSMatt Macy
3338a0b956f5SMartin Matuska if (rc->rc_error) {
33397877fdebSMatt Macy if (c < rr->rr_firstdatacol)
33407877fdebSMatt Macy parity_errors++;
33417877fdebSMatt Macy else
33427877fdebSMatt Macy data_errors++;
33437877fdebSMatt Macy
33447877fdebSMatt Macy total_errors++;
33457877fdebSMatt Macy } else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
33467877fdebSMatt Macy parity_untried++;
33477877fdebSMatt Macy }
33487877fdebSMatt Macy }
3349eda14cbcSMatt Macy
3350eda14cbcSMatt Macy /*
33517877fdebSMatt Macy * If there were data errors and the number of errors we saw was
33527877fdebSMatt Macy * correctable -- less than or equal to the number of parity disks read
33537877fdebSMatt Macy * -- reconstruct based on the missing data.
3354eda14cbcSMatt Macy */
33557877fdebSMatt Macy if (data_errors != 0 &&
33567877fdebSMatt Macy total_errors <= rr->rr_firstdatacol - parity_untried) {
3357eda14cbcSMatt Macy /*
3358eda14cbcSMatt Macy * We either attempt to read all the parity columns or
3359eda14cbcSMatt Macy * none of them. If we didn't try to read parity, we
3360eda14cbcSMatt Macy * wouldn't be here in the correctable case. There must
3361eda14cbcSMatt Macy * also have been fewer parity errors than parity
3362eda14cbcSMatt Macy * columns or, again, we wouldn't be in this code path.
3363eda14cbcSMatt Macy */
3364eda14cbcSMatt Macy ASSERT(parity_untried == 0);
33657877fdebSMatt Macy ASSERT(parity_errors < rr->rr_firstdatacol);
3366eda14cbcSMatt Macy
3367eda14cbcSMatt Macy /*
3368eda14cbcSMatt Macy * Identify the data columns that reported an error.
3369eda14cbcSMatt Macy */
33707877fdebSMatt Macy int n = 0;
33717877fdebSMatt Macy int tgts[VDEV_RAIDZ_MAXPARITY];
33727877fdebSMatt Macy for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
33737877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c];
3374eda14cbcSMatt Macy if (rc->rc_error != 0) {
3375eda14cbcSMatt Macy ASSERT(n < VDEV_RAIDZ_MAXPARITY);
3376eda14cbcSMatt Macy tgts[n++] = c;
3377eda14cbcSMatt Macy }
3378eda14cbcSMatt Macy }
3379eda14cbcSMatt Macy
33807877fdebSMatt Macy ASSERT(rr->rr_firstdatacol >= n);
3381eda14cbcSMatt Macy
3382f9693befSMartin Matuska vdev_raidz_reconstruct_row(rm, rr, tgts, n);
3383eda14cbcSMatt Macy }
3384eda14cbcSMatt Macy }
3385eda14cbcSMatt Macy
3386eda14cbcSMatt Macy /*
33877877fdebSMatt Macy * Return the number of reads issued.
3388eda14cbcSMatt Macy */
33897877fdebSMatt Macy static int
vdev_raidz_read_all(zio_t * zio,raidz_row_t * rr)33907877fdebSMatt Macy vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
33917877fdebSMatt Macy {
33927877fdebSMatt Macy vdev_t *vd = zio->io_vd;
33937877fdebSMatt Macy int nread = 0;
3394eda14cbcSMatt Macy
33957877fdebSMatt Macy rr->rr_missingdata = 0;
33967877fdebSMatt Macy rr->rr_missingparity = 0;
33977877fdebSMatt Macy
33987877fdebSMatt Macy /*
33997877fdebSMatt Macy * If this rows contains empty sectors which are not required
34007877fdebSMatt Macy * for a normal read then allocate an ABD for them now so they
34017877fdebSMatt Macy * may be read, verified, and any needed repairs performed.
34027877fdebSMatt Macy */
3403e716630dSMartin Matuska if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL)
34047877fdebSMatt Macy vdev_draid_map_alloc_empty(zio, rr);
34057877fdebSMatt Macy
34067877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) {
34077877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c];
34087877fdebSMatt Macy if (rc->rc_tried || rc->rc_size == 0)
3409eda14cbcSMatt Macy continue;
3410eda14cbcSMatt Macy
3411eda14cbcSMatt Macy zio_nowait(zio_vdev_child_io(zio, NULL,
3412eda14cbcSMatt Macy vd->vdev_child[rc->rc_devidx],
3413eda14cbcSMatt Macy rc->rc_offset, rc->rc_abd, rc->rc_size,
3414eda14cbcSMatt Macy zio->io_type, zio->io_priority, 0,
3415eda14cbcSMatt Macy vdev_raidz_child_done, rc));
34167877fdebSMatt Macy nread++;
34177877fdebSMatt Macy }
34187877fdebSMatt Macy return (nread);
3419eda14cbcSMatt Macy }
3420eda14cbcSMatt Macy
3421eda14cbcSMatt Macy /*
34227877fdebSMatt Macy * We're here because either there were too many errors to even attempt
34237877fdebSMatt Macy * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec()
34247877fdebSMatt Macy * failed. In either case, there is enough bad data to prevent reconstruction.
34257877fdebSMatt Macy * Start checksum ereports for all children which haven't failed.
3426eda14cbcSMatt Macy */
34277877fdebSMatt Macy static void
vdev_raidz_io_done_unrecoverable(zio_t * zio)34287877fdebSMatt Macy vdev_raidz_io_done_unrecoverable(zio_t *zio)
34297877fdebSMatt Macy {
34307877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd;
3431eda14cbcSMatt Macy
34327877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) {
34337877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i];
3434eda14cbcSMatt Macy
34357877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) {
34367877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c];
34377877fdebSMatt Macy vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
34387877fdebSMatt Macy
34392c48331dSMatt Macy if (rc->rc_error != 0)
34402c48331dSMatt Macy continue;
34412c48331dSMatt Macy
3442eda14cbcSMatt Macy zio_bad_cksum_t zbc;
3443eda14cbcSMatt Macy zbc.zbc_has_cksum = 0;
34442c48331dSMatt Macy zbc.zbc_injected = rm->rm_ecksuminjected;
3445eda14cbcSMatt Macy mutex_enter(&cvd->vdev_stat_lock);
3446eda14cbcSMatt Macy cvd->vdev_stat.vs_checksum_errors++;
3447eda14cbcSMatt Macy mutex_exit(&cvd->vdev_stat_lock);
3448bb2d13b6SMartin Matuska (void) zfs_ereport_start_checksum(zio->io_spa,
3449bb2d13b6SMartin Matuska cvd, &zio->io_bookmark, zio, rc->rc_offset,
3450bb2d13b6SMartin Matuska rc->rc_size, &zbc);
3451eda14cbcSMatt Macy }
3452eda14cbcSMatt Macy }
3453eda14cbcSMatt Macy }
3454eda14cbcSMatt Macy
34557877fdebSMatt Macy void
vdev_raidz_io_done(zio_t * zio)34567877fdebSMatt Macy vdev_raidz_io_done(zio_t *zio)
34577877fdebSMatt Macy {
34587877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd;
34597877fdebSMatt Macy
3460e716630dSMartin Matuska ASSERT(zio->io_bp != NULL);
34617877fdebSMatt Macy if (zio->io_type == ZIO_TYPE_WRITE) {
34627877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) {
34637877fdebSMatt Macy vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
34647877fdebSMatt Macy }
34657877fdebSMatt Macy } else {
3466e716630dSMartin Matuska if (rm->rm_phys_col) {
3467e716630dSMartin Matuska /*
3468e716630dSMartin Matuska * This is an aggregated read. Copy the data and status
3469e716630dSMartin Matuska * from the aggregate abd's to the individual rows.
3470e716630dSMartin Matuska */
3471e716630dSMartin Matuska for (int i = 0; i < rm->rm_nrows; i++) {
3472e716630dSMartin Matuska raidz_row_t *rr = rm->rm_row[i];
3473e716630dSMartin Matuska
3474e716630dSMartin Matuska for (int c = 0; c < rr->rr_cols; c++) {
3475e716630dSMartin Matuska raidz_col_t *rc = &rr->rr_col[c];
3476e716630dSMartin Matuska if (rc->rc_tried || rc->rc_size == 0)
3477e716630dSMartin Matuska continue;
3478e716630dSMartin Matuska
3479e716630dSMartin Matuska raidz_col_t *prc =
3480e716630dSMartin Matuska &rm->rm_phys_col[rc->rc_devidx];
3481e716630dSMartin Matuska rc->rc_error = prc->rc_error;
3482e716630dSMartin Matuska rc->rc_tried = prc->rc_tried;
3483e716630dSMartin Matuska rc->rc_skipped = prc->rc_skipped;
3484e716630dSMartin Matuska if (c >= rr->rr_firstdatacol) {
3485e716630dSMartin Matuska /*
3486e716630dSMartin Matuska * Note: this is slightly faster
3487e716630dSMartin Matuska * than using abd_copy_off().
3488e716630dSMartin Matuska */
3489e716630dSMartin Matuska char *physbuf = abd_to_buf(
3490e716630dSMartin Matuska prc->rc_abd);
3491e716630dSMartin Matuska void *physloc = physbuf +
3492e716630dSMartin Matuska rc->rc_offset -
3493e716630dSMartin Matuska prc->rc_offset;
3494e716630dSMartin Matuska
3495e716630dSMartin Matuska abd_copy_from_buf(rc->rc_abd,
3496e716630dSMartin Matuska physloc, rc->rc_size);
3497e716630dSMartin Matuska }
3498e716630dSMartin Matuska }
3499e716630dSMartin Matuska }
3500e716630dSMartin Matuska }
3501e716630dSMartin Matuska
35027877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) {
35037877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i];
35047877fdebSMatt Macy vdev_raidz_io_done_reconstruct_known_missing(zio,
35057877fdebSMatt Macy rm, rr);
35067877fdebSMatt Macy }
35077877fdebSMatt Macy
35087877fdebSMatt Macy if (raidz_checksum_verify(zio) == 0) {
350987bf66d4SMartin Matuska if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
351087bf66d4SMartin Matuska goto done;
351187bf66d4SMartin Matuska
35127877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) {
35137877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i];
35147877fdebSMatt Macy vdev_raidz_io_done_verified(zio, rr);
35157877fdebSMatt Macy }
3516eda14cbcSMatt Macy zio_checksum_verified(zio);
35177877fdebSMatt Macy } else {
3518eda14cbcSMatt Macy /*
35197877fdebSMatt Macy * A sequential resilver has no checksum which makes
35207877fdebSMatt Macy * combinatoral reconstruction impossible. This code
35217877fdebSMatt Macy * path is unreachable since raidz_checksum_verify()
35227877fdebSMatt Macy * has no checksum to verify and must succeed.
3523eda14cbcSMatt Macy */
35247877fdebSMatt Macy ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD);
3525eda14cbcSMatt Macy
35267877fdebSMatt Macy /*
35277877fdebSMatt Macy * This isn't a typical situation -- either we got a
35287877fdebSMatt Macy * read error or a child silently returned bad data.
35297877fdebSMatt Macy * Read every block so we can try again with as much
35307877fdebSMatt Macy * data and parity as we can track down. If we've
35317877fdebSMatt Macy * already been through once before, all children will
35327877fdebSMatt Macy * be marked as tried so we'll proceed to combinatorial
35337877fdebSMatt Macy * reconstruction.
35347877fdebSMatt Macy */
35357877fdebSMatt Macy int nread = 0;
35367877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) {
35377877fdebSMatt Macy nread += vdev_raidz_read_all(zio,
35387877fdebSMatt Macy rm->rm_row[i]);
35397877fdebSMatt Macy }
35407877fdebSMatt Macy if (nread != 0) {
35417877fdebSMatt Macy /*
35427877fdebSMatt Macy * Normally our stage is VDEV_IO_DONE, but if
35437877fdebSMatt Macy * we've already called redone(), it will have
35447877fdebSMatt Macy * changed to VDEV_IO_START, in which case we
35457877fdebSMatt Macy * don't want to call redone() again.
35467877fdebSMatt Macy */
35477877fdebSMatt Macy if (zio->io_stage != ZIO_STAGE_VDEV_IO_START)
35487877fdebSMatt Macy zio_vdev_io_redone(zio);
35497877fdebSMatt Macy return;
35507877fdebSMatt Macy }
3551e716630dSMartin Matuska /*
3552e716630dSMartin Matuska * It would be too expensive to try every possible
3553e716630dSMartin Matuska * combination of failed sectors in every row, so
3554e716630dSMartin Matuska * instead we try every combination of failed current or
3555e716630dSMartin Matuska * past physical disk. This means that if the incorrect
3556e716630dSMartin Matuska * sectors were all on Nparity disks at any point in the
3557e716630dSMartin Matuska * past, we will find the correct data. The only known
3558e716630dSMartin Matuska * case where this is less durable than a non-expanded
3559e716630dSMartin Matuska * RAIDZ, is if we have a silent failure during
3560e716630dSMartin Matuska * expansion. In that case, one block could be
3561e716630dSMartin Matuska * partially in the old format and partially in the
3562e716630dSMartin Matuska * new format, so we'd lost some sectors from the old
3563e716630dSMartin Matuska * format and some from the new format.
3564e716630dSMartin Matuska *
3565e716630dSMartin Matuska * e.g. logical_width=4 physical_width=6
3566e716630dSMartin Matuska * the 15 (6+5+4) possible failed disks are:
3567e716630dSMartin Matuska * width=6 child=0
3568e716630dSMartin Matuska * width=6 child=1
3569e716630dSMartin Matuska * width=6 child=2
3570e716630dSMartin Matuska * width=6 child=3
3571e716630dSMartin Matuska * width=6 child=4
3572e716630dSMartin Matuska * width=6 child=5
3573e716630dSMartin Matuska * width=5 child=0
3574e716630dSMartin Matuska * width=5 child=1
3575e716630dSMartin Matuska * width=5 child=2
3576e716630dSMartin Matuska * width=5 child=3
3577e716630dSMartin Matuska * width=5 child=4
3578e716630dSMartin Matuska * width=4 child=0
3579e716630dSMartin Matuska * width=4 child=1
3580e716630dSMartin Matuska * width=4 child=2
3581e716630dSMartin Matuska * width=4 child=3
3582e716630dSMartin Matuska * And we will try every combination of Nparity of these
3583e716630dSMartin Matuska * failing.
3584e716630dSMartin Matuska *
3585e716630dSMartin Matuska * As a first pass, we can generate every combo,
3586e716630dSMartin Matuska * and try reconstructing, ignoring any known
3587e716630dSMartin Matuska * failures. If any row has too many known + simulated
3588e716630dSMartin Matuska * failures, then we bail on reconstructing with this
3589e716630dSMartin Matuska * number of simulated failures. As an improvement,
3590e716630dSMartin Matuska * we could detect the number of whole known failures
3591e716630dSMartin Matuska * (i.e. we have known failures on these disks for
3592e716630dSMartin Matuska * every row; the disks never succeeded), and
3593e716630dSMartin Matuska * subtract that from the max # failures to simulate.
3594e716630dSMartin Matuska * We could go even further like the current
3595e716630dSMartin Matuska * combrec code, but that doesn't seem like it
3596e716630dSMartin Matuska * gains us very much. If we simulate a failure
3597e716630dSMartin Matuska * that is also a known failure, that's fine.
3598e716630dSMartin Matuska */
35997877fdebSMatt Macy zio->io_error = vdev_raidz_combrec(zio);
36007877fdebSMatt Macy if (zio->io_error == ECKSUM &&
36017877fdebSMatt Macy !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
36027877fdebSMatt Macy vdev_raidz_io_done_unrecoverable(zio);
36037877fdebSMatt Macy }
3604eda14cbcSMatt Macy }
3605eda14cbcSMatt Macy }
360687bf66d4SMartin Matuska done:
3607e716630dSMartin Matuska if (rm->rm_lr != NULL) {
3608e716630dSMartin Matuska zfs_rangelock_exit(rm->rm_lr);
3609e716630dSMartin Matuska rm->rm_lr = NULL;
3610e716630dSMartin Matuska }
3611eda14cbcSMatt Macy }
3612eda14cbcSMatt Macy
3613eda14cbcSMatt Macy static void
vdev_raidz_state_change(vdev_t * vd,int faulted,int degraded)3614eda14cbcSMatt Macy vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
3615eda14cbcSMatt Macy {
36167877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd;
36177877fdebSMatt Macy if (faulted > vdrz->vd_nparity)
3618eda14cbcSMatt Macy vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3619eda14cbcSMatt Macy VDEV_AUX_NO_REPLICAS);
3620eda14cbcSMatt Macy else if (degraded + faulted != 0)
3621eda14cbcSMatt Macy vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
3622eda14cbcSMatt Macy else
3623eda14cbcSMatt Macy vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
3624eda14cbcSMatt Macy }
3625eda14cbcSMatt Macy
3626eda14cbcSMatt Macy /*
3627eda14cbcSMatt Macy * Determine if any portion of the provided block resides on a child vdev
3628eda14cbcSMatt Macy * with a dirty DTL and therefore needs to be resilvered. The function
3629eda14cbcSMatt Macy * assumes that at least one DTL is dirty which implies that full stripe
3630eda14cbcSMatt Macy * width blocks must be resilvered.
3631eda14cbcSMatt Macy */
3632eda14cbcSMatt Macy static boolean_t
vdev_raidz_need_resilver(vdev_t * vd,const dva_t * dva,size_t psize,uint64_t phys_birth)36337877fdebSMatt Macy vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
36347877fdebSMatt Macy uint64_t phys_birth)
3635eda14cbcSMatt Macy {
36367877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd;
3637e716630dSMartin Matuska
3638e716630dSMartin Matuska /*
3639e716630dSMartin Matuska * If we're in the middle of a RAIDZ expansion, this block may be in
3640e716630dSMartin Matuska * the old and/or new location. For simplicity, always resilver it.
3641e716630dSMartin Matuska */
3642e716630dSMartin Matuska if (vdrz->vn_vre.vre_state == DSS_SCANNING)
3643e716630dSMartin Matuska return (B_TRUE);
3644e716630dSMartin Matuska
3645eda14cbcSMatt Macy uint64_t dcols = vd->vdev_children;
36467877fdebSMatt Macy uint64_t nparity = vdrz->vd_nparity;
3647eda14cbcSMatt Macy uint64_t ashift = vd->vdev_top->vdev_ashift;
3648eda14cbcSMatt Macy /* The starting RAIDZ (parent) vdev sector of the block. */
36497877fdebSMatt Macy uint64_t b = DVA_GET_OFFSET(dva) >> ashift;
3650eda14cbcSMatt Macy /* The zio's size in units of the vdev's minimum sector size. */
3651eda14cbcSMatt Macy uint64_t s = ((psize - 1) >> ashift) + 1;
3652eda14cbcSMatt Macy /* The first column for this stripe. */
3653eda14cbcSMatt Macy uint64_t f = b % dcols;
3654eda14cbcSMatt Macy
36557877fdebSMatt Macy /* Unreachable by sequential resilver. */
36567877fdebSMatt Macy ASSERT3U(phys_birth, !=, TXG_UNKNOWN);
36577877fdebSMatt Macy
36587877fdebSMatt Macy if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
36597877fdebSMatt Macy return (B_FALSE);
36607877fdebSMatt Macy
3661eda14cbcSMatt Macy if (s + nparity >= dcols)
3662eda14cbcSMatt Macy return (B_TRUE);
3663eda14cbcSMatt Macy
3664eda14cbcSMatt Macy for (uint64_t c = 0; c < s + nparity; c++) {
3665eda14cbcSMatt Macy uint64_t devidx = (f + c) % dcols;
3666eda14cbcSMatt Macy vdev_t *cvd = vd->vdev_child[devidx];
3667eda14cbcSMatt Macy
3668eda14cbcSMatt Macy /*
3669eda14cbcSMatt Macy * dsl_scan_need_resilver() already checked vd with
3670eda14cbcSMatt Macy * vdev_dtl_contains(). So here just check cvd with
3671eda14cbcSMatt Macy * vdev_dtl_empty(), cheaper and a good approximation.
3672eda14cbcSMatt Macy */
3673eda14cbcSMatt Macy if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
3674eda14cbcSMatt Macy return (B_TRUE);
3675eda14cbcSMatt Macy }
3676eda14cbcSMatt Macy
3677eda14cbcSMatt Macy return (B_FALSE);
3678eda14cbcSMatt Macy }
3679eda14cbcSMatt Macy
3680eda14cbcSMatt Macy static void
vdev_raidz_xlate(vdev_t * cvd,const zfs_range_seg64_t * logical_rs,zfs_range_seg64_t * physical_rs,zfs_range_seg64_t * remain_rs)3681b59a0cdeSMartin Matuska vdev_raidz_xlate(vdev_t *cvd, const zfs_range_seg64_t *logical_rs,
3682b59a0cdeSMartin Matuska zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs)
3683eda14cbcSMatt Macy {
3684e92ffd9bSMartin Matuska (void) remain_rs;
3685e92ffd9bSMartin Matuska
3686eda14cbcSMatt Macy vdev_t *raidvd = cvd->vdev_parent;
3687eda14cbcSMatt Macy ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
3688eda14cbcSMatt Macy
3689e716630dSMartin Matuska vdev_raidz_t *vdrz = raidvd->vdev_tsd;
3690e716630dSMartin Matuska
3691e716630dSMartin Matuska if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
3692e716630dSMartin Matuska /*
3693e716630dSMartin Matuska * We're in the middle of expansion, in which case the
3694e716630dSMartin Matuska * translation is in flux. Any answer we give may be wrong
3695e716630dSMartin Matuska * by the time we return, so it isn't safe for the caller to
3696e716630dSMartin Matuska * act on it. Therefore we say that this range isn't present
3697e716630dSMartin Matuska * on any children. The only consumers of this are "zpool
3698e716630dSMartin Matuska * initialize" and trimming, both of which are "best effort"
3699e716630dSMartin Matuska * anyway.
3700e716630dSMartin Matuska */
3701e716630dSMartin Matuska physical_rs->rs_start = physical_rs->rs_end = 0;
3702e716630dSMartin Matuska remain_rs->rs_start = remain_rs->rs_end = 0;
3703e716630dSMartin Matuska return;
3704e716630dSMartin Matuska }
3705e716630dSMartin Matuska
3706e716630dSMartin Matuska uint64_t width = vdrz->vd_physical_width;
3707eda14cbcSMatt Macy uint64_t tgt_col = cvd->vdev_id;
3708eda14cbcSMatt Macy uint64_t ashift = raidvd->vdev_top->vdev_ashift;
3709eda14cbcSMatt Macy
3710eda14cbcSMatt Macy /* make sure the offsets are block-aligned */
37117877fdebSMatt Macy ASSERT0(logical_rs->rs_start % (1 << ashift));
37127877fdebSMatt Macy ASSERT0(logical_rs->rs_end % (1 << ashift));
37137877fdebSMatt Macy uint64_t b_start = logical_rs->rs_start >> ashift;
37147877fdebSMatt Macy uint64_t b_end = logical_rs->rs_end >> ashift;
3715eda14cbcSMatt Macy
3716eda14cbcSMatt Macy uint64_t start_row = 0;
3717eda14cbcSMatt Macy if (b_start > tgt_col) /* avoid underflow */
3718eda14cbcSMatt Macy start_row = ((b_start - tgt_col - 1) / width) + 1;
3719eda14cbcSMatt Macy
3720eda14cbcSMatt Macy uint64_t end_row = 0;
3721eda14cbcSMatt Macy if (b_end > tgt_col)
3722eda14cbcSMatt Macy end_row = ((b_end - tgt_col - 1) / width) + 1;
3723eda14cbcSMatt Macy
37247877fdebSMatt Macy physical_rs->rs_start = start_row << ashift;
37257877fdebSMatt Macy physical_rs->rs_end = end_row << ashift;
3726eda14cbcSMatt Macy
37277877fdebSMatt Macy ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start);
37287877fdebSMatt Macy ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
37297877fdebSMatt Macy logical_rs->rs_end - logical_rs->rs_start);
37307877fdebSMatt Macy }
37317877fdebSMatt Macy
3732e716630dSMartin Matuska static void
raidz_reflow_sync(void * arg,dmu_tx_t * tx)3733e716630dSMartin Matuska raidz_reflow_sync(void *arg, dmu_tx_t *tx)
3734e716630dSMartin Matuska {
3735e716630dSMartin Matuska spa_t *spa = arg;
3736e716630dSMartin Matuska int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
3737e716630dSMartin Matuska vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
3738e716630dSMartin Matuska
3739e716630dSMartin Matuska /*
3740e716630dSMartin Matuska * Ensure there are no i/os to the range that is being committed.
3741e716630dSMartin Matuska */
3742e716630dSMartin Matuska uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock);
3743e716630dSMartin Matuska ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset);
3744e716630dSMartin Matuska
3745e716630dSMartin Matuska mutex_enter(&vre->vre_lock);
3746e716630dSMartin Matuska uint64_t new_offset =
3747e716630dSMartin Matuska MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset);
3748e716630dSMartin Matuska /*
3749e716630dSMartin Matuska * We should not have committed anything that failed.
3750e716630dSMartin Matuska */
3751e716630dSMartin Matuska VERIFY3U(vre->vre_failed_offset, >=, old_offset);
3752e716630dSMartin Matuska mutex_exit(&vre->vre_lock);
3753e716630dSMartin Matuska
3754e716630dSMartin Matuska zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
3755e716630dSMartin Matuska old_offset, new_offset - old_offset,
3756e716630dSMartin Matuska RL_WRITER);
3757e716630dSMartin Matuska
3758e716630dSMartin Matuska /*
3759e716630dSMartin Matuska * Update the uberblock that will be written when this txg completes.
3760e716630dSMartin Matuska */
3761e716630dSMartin Matuska RAIDZ_REFLOW_SET(&spa->spa_uberblock,
3762e716630dSMartin Matuska RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset);
3763e716630dSMartin Matuska vre->vre_offset_pertxg[txgoff] = 0;
3764e716630dSMartin Matuska zfs_rangelock_exit(lr);
3765e716630dSMartin Matuska
3766e716630dSMartin Matuska mutex_enter(&vre->vre_lock);
3767e716630dSMartin Matuska vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff];
3768e716630dSMartin Matuska vre->vre_bytes_copied_pertxg[txgoff] = 0;
3769e716630dSMartin Matuska mutex_exit(&vre->vre_lock);
3770e716630dSMartin Matuska
3771e716630dSMartin Matuska vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
3772e716630dSMartin Matuska VERIFY0(zap_update(spa->spa_meta_objset,
3773e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
3774e716630dSMartin Matuska sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx));
3775e716630dSMartin Matuska }
3776e716630dSMartin Matuska
3777e716630dSMartin Matuska static void
raidz_reflow_complete_sync(void * arg,dmu_tx_t * tx)3778e716630dSMartin Matuska raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx)
3779e716630dSMartin Matuska {
3780e716630dSMartin Matuska spa_t *spa = arg;
3781e716630dSMartin Matuska vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
3782e716630dSMartin Matuska vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
3783e716630dSMartin Matuska vdev_raidz_t *vdrz = raidvd->vdev_tsd;
3784e716630dSMartin Matuska
3785e716630dSMartin Matuska for (int i = 0; i < TXG_SIZE; i++)
3786e716630dSMartin Matuska VERIFY0(vre->vre_offset_pertxg[i]);
3787e716630dSMartin Matuska
3788e716630dSMartin Matuska reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
3789e716630dSMartin Matuska re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES;
3790e716630dSMartin Matuska re->re_logical_width = vdrz->vd_physical_width;
3791e716630dSMartin Matuska mutex_enter(&vdrz->vd_expand_lock);
3792e716630dSMartin Matuska avl_add(&vdrz->vd_expand_txgs, re);
3793e716630dSMartin Matuska mutex_exit(&vdrz->vd_expand_lock);
3794e716630dSMartin Matuska
3795e716630dSMartin Matuska vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
3796e716630dSMartin Matuska
3797e716630dSMartin Matuska /*
3798e716630dSMartin Matuska * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS
3799e716630dSMartin Matuska * will get written (based on vd_expand_txgs).
3800e716630dSMartin Matuska */
3801e716630dSMartin Matuska vdev_config_dirty(vd);
3802e716630dSMartin Matuska
3803e716630dSMartin Matuska /*
3804e716630dSMartin Matuska * Before we change vre_state, the on-disk state must reflect that we
3805e716630dSMartin Matuska * have completed all copying, so that vdev_raidz_io_start() can use
3806e716630dSMartin Matuska * vre_state to determine if the reflow is in progress. See also the
3807e716630dSMartin Matuska * end of spa_raidz_expand_thread().
3808e716630dSMartin Matuska */
3809e716630dSMartin Matuska VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==,
3810e716630dSMartin Matuska raidvd->vdev_ms_count << raidvd->vdev_ms_shift);
3811e716630dSMartin Matuska
3812e716630dSMartin Matuska vre->vre_end_time = gethrestime_sec();
3813e716630dSMartin Matuska vre->vre_state = DSS_FINISHED;
3814e716630dSMartin Matuska
3815e716630dSMartin Matuska uint64_t state = vre->vre_state;
3816e716630dSMartin Matuska VERIFY0(zap_update(spa->spa_meta_objset,
3817e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
3818e716630dSMartin Matuska sizeof (state), 1, &state, tx));
3819e716630dSMartin Matuska
3820e716630dSMartin Matuska uint64_t end_time = vre->vre_end_time;
3821e716630dSMartin Matuska VERIFY0(zap_update(spa->spa_meta_objset,
3822e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
3823e716630dSMartin Matuska sizeof (end_time), 1, &end_time, tx));
3824e716630dSMartin Matuska
3825e716630dSMartin Matuska spa->spa_uberblock.ub_raidz_reflow_info = 0;
3826e716630dSMartin Matuska
3827e716630dSMartin Matuska spa_history_log_internal(spa, "raidz vdev expansion completed", tx,
3828e716630dSMartin Matuska "%s vdev %llu new width %llu", spa_name(spa),
3829e716630dSMartin Matuska (unsigned long long)vd->vdev_id,
3830e716630dSMartin Matuska (unsigned long long)vd->vdev_children);
3831e716630dSMartin Matuska
3832e716630dSMartin Matuska spa->spa_raidz_expand = NULL;
3833e716630dSMartin Matuska raidvd->vdev_rz_expanding = B_FALSE;
3834e716630dSMartin Matuska
3835e716630dSMartin Matuska spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
3836e716630dSMartin Matuska spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
3837e716630dSMartin Matuska spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
3838e716630dSMartin Matuska
3839e716630dSMartin Matuska spa_notify_waiters(spa);
3840e716630dSMartin Matuska
3841e716630dSMartin Matuska /*
3842e716630dSMartin Matuska * While we're in syncing context take the opportunity to
3843e716630dSMartin Matuska * setup a scrub. All the data has been sucessfully copied
3844e716630dSMartin Matuska * but we have not validated any checksums.
3845e716630dSMartin Matuska */
384617aab35aSMartin Matuska setup_sync_arg_t setup_sync_arg = {
384717aab35aSMartin Matuska .func = POOL_SCAN_SCRUB,
384817aab35aSMartin Matuska .txgstart = 0,
384917aab35aSMartin Matuska .txgend = 0,
385017aab35aSMartin Matuska };
385117aab35aSMartin Matuska if (zfs_scrub_after_expand &&
385217aab35aSMartin Matuska dsl_scan_setup_check(&setup_sync_arg.func, tx) == 0) {
385317aab35aSMartin Matuska dsl_scan_setup_sync(&setup_sync_arg, tx);
385417aab35aSMartin Matuska }
3855e716630dSMartin Matuska }
3856e716630dSMartin Matuska
3857e716630dSMartin Matuska /*
385817aab35aSMartin Matuska * State of one copy batch.
3859e716630dSMartin Matuska */
3860e716630dSMartin Matuska typedef struct raidz_reflow_arg {
386117aab35aSMartin Matuska vdev_raidz_expand_t *rra_vre; /* Global expantion state. */
386217aab35aSMartin Matuska zfs_locked_range_t *rra_lr; /* Range lock of this batch. */
386317aab35aSMartin Matuska uint64_t rra_txg; /* TXG of this batch. */
386417aab35aSMartin Matuska uint_t rra_ashift; /* Ashift of the vdev. */
386517aab35aSMartin Matuska uint32_t rra_tbd; /* Number of in-flight ZIOs. */
386617aab35aSMartin Matuska uint32_t rra_writes; /* Number of write ZIOs. */
386717aab35aSMartin Matuska zio_t *rra_zio[]; /* Write ZIO pointers. */
3868e716630dSMartin Matuska } raidz_reflow_arg_t;
3869e716630dSMartin Matuska
3870e716630dSMartin Matuska /*
387117aab35aSMartin Matuska * Write of the new location on one child is done. Once all of them are done
387217aab35aSMartin Matuska * we can unlock and free everything.
3873e716630dSMartin Matuska */
3874e716630dSMartin Matuska static void
raidz_reflow_write_done(zio_t * zio)3875e716630dSMartin Matuska raidz_reflow_write_done(zio_t *zio)
3876e716630dSMartin Matuska {
3877e716630dSMartin Matuska raidz_reflow_arg_t *rra = zio->io_private;
3878e716630dSMartin Matuska vdev_raidz_expand_t *vre = rra->rra_vre;
3879e716630dSMartin Matuska
3880e716630dSMartin Matuska abd_free(zio->io_abd);
3881e716630dSMartin Matuska
3882e716630dSMartin Matuska mutex_enter(&vre->vre_lock);
3883e716630dSMartin Matuska if (zio->io_error != 0) {
3884e716630dSMartin Matuska /* Force a reflow pause on errors */
3885e716630dSMartin Matuska vre->vre_failed_offset =
3886e716630dSMartin Matuska MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
3887e716630dSMartin Matuska }
3888e716630dSMartin Matuska ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size);
3889e716630dSMartin Matuska vre->vre_outstanding_bytes -= zio->io_size;
3890e716630dSMartin Matuska if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length <
3891e716630dSMartin Matuska vre->vre_failed_offset) {
3892e716630dSMartin Matuska vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] +=
3893e716630dSMartin Matuska zio->io_size;
3894e716630dSMartin Matuska }
3895e716630dSMartin Matuska cv_signal(&vre->vre_cv);
389617aab35aSMartin Matuska boolean_t done = (--rra->rra_tbd == 0);
3897e716630dSMartin Matuska mutex_exit(&vre->vre_lock);
3898e716630dSMartin Matuska
389917aab35aSMartin Matuska if (!done)
390017aab35aSMartin Matuska return;
3901e716630dSMartin Matuska spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
390217aab35aSMartin Matuska zfs_rangelock_exit(rra->rra_lr);
390317aab35aSMartin Matuska kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * rra->rra_writes);
3904e716630dSMartin Matuska }
3905e716630dSMartin Matuska
3906e716630dSMartin Matuska /*
390717aab35aSMartin Matuska * Read of the old location on one child is done. Once all of them are done
390817aab35aSMartin Matuska * writes should have all the data and we can issue them.
3909e716630dSMartin Matuska */
3910e716630dSMartin Matuska static void
raidz_reflow_read_done(zio_t * zio)3911e716630dSMartin Matuska raidz_reflow_read_done(zio_t *zio)
3912e716630dSMartin Matuska {
3913e716630dSMartin Matuska raidz_reflow_arg_t *rra = zio->io_private;
3914e716630dSMartin Matuska vdev_raidz_expand_t *vre = rra->rra_vre;
3915e716630dSMartin Matuska
391617aab35aSMartin Matuska /* Reads of only one block use write ABDs. For bigger free gangs. */
391717aab35aSMartin Matuska if (zio->io_size > (1 << rra->rra_ashift))
391817aab35aSMartin Matuska abd_free(zio->io_abd);
391917aab35aSMartin Matuska
3920e716630dSMartin Matuska /*
3921e716630dSMartin Matuska * If the read failed, or if it was done on a vdev that is not fully
3922e716630dSMartin Matuska * healthy (e.g. a child that has a resilver in progress), we may not
3923e716630dSMartin Matuska * have the correct data. Note that it's OK if the write proceeds.
3924e716630dSMartin Matuska * It may write garbage but the location is otherwise unused and we
3925e716630dSMartin Matuska * will retry later due to vre_failed_offset.
3926e716630dSMartin Matuska */
3927e716630dSMartin Matuska if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) {
3928e716630dSMartin Matuska zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu "
3929e716630dSMartin Matuska "err=%u partial_dtl_empty=%u missing_dtl_empty=%u",
3930e716630dSMartin Matuska (long long)rra->rra_lr->lr_offset,
3931e716630dSMartin Matuska (long long)rra->rra_lr->lr_length,
3932e716630dSMartin Matuska (long long)rra->rra_txg,
3933e716630dSMartin Matuska zio->io_error,
3934e716630dSMartin Matuska vdev_dtl_empty(zio->io_vd, DTL_PARTIAL),
3935e716630dSMartin Matuska vdev_dtl_empty(zio->io_vd, DTL_MISSING));
3936e716630dSMartin Matuska mutex_enter(&vre->vre_lock);
3937e716630dSMartin Matuska /* Force a reflow pause on errors */
3938e716630dSMartin Matuska vre->vre_failed_offset =
3939e716630dSMartin Matuska MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
3940e716630dSMartin Matuska mutex_exit(&vre->vre_lock);
3941e716630dSMartin Matuska }
3942e716630dSMartin Matuska
394317aab35aSMartin Matuska if (atomic_dec_32_nv(&rra->rra_tbd) > 0)
394417aab35aSMartin Matuska return;
3945dd215568SMartin Matuska uint32_t writes = rra->rra_tbd = rra->rra_writes;
3946dd215568SMartin Matuska for (uint64_t i = 0; i < writes; i++)
394717aab35aSMartin Matuska zio_nowait(rra->rra_zio[i]);
3948e716630dSMartin Matuska }
3949e716630dSMartin Matuska
3950e716630dSMartin Matuska static void
raidz_reflow_record_progress(vdev_raidz_expand_t * vre,uint64_t offset,dmu_tx_t * tx)3951e716630dSMartin Matuska raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset,
3952e716630dSMartin Matuska dmu_tx_t *tx)
3953e716630dSMartin Matuska {
3954e716630dSMartin Matuska int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
3955e716630dSMartin Matuska spa_t *spa = dmu_tx_pool(tx)->dp_spa;
3956e716630dSMartin Matuska
3957e716630dSMartin Matuska if (offset == 0)
3958e716630dSMartin Matuska return;
3959e716630dSMartin Matuska
3960e716630dSMartin Matuska mutex_enter(&vre->vre_lock);
3961e716630dSMartin Matuska ASSERT3U(vre->vre_offset, <=, offset);
3962e716630dSMartin Matuska vre->vre_offset = offset;
3963e716630dSMartin Matuska mutex_exit(&vre->vre_lock);
3964e716630dSMartin Matuska
3965e716630dSMartin Matuska if (vre->vre_offset_pertxg[txgoff] == 0) {
3966e716630dSMartin Matuska dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync,
3967e716630dSMartin Matuska spa, tx);
3968e716630dSMartin Matuska }
3969e716630dSMartin Matuska vre->vre_offset_pertxg[txgoff] = offset;
3970e716630dSMartin Matuska }
3971e716630dSMartin Matuska
3972e716630dSMartin Matuska static boolean_t
vdev_raidz_expand_child_replacing(vdev_t * raidz_vd)3973e716630dSMartin Matuska vdev_raidz_expand_child_replacing(vdev_t *raidz_vd)
3974e716630dSMartin Matuska {
3975e716630dSMartin Matuska for (int i = 0; i < raidz_vd->vdev_children; i++) {
3976e716630dSMartin Matuska /* Quick check if a child is being replaced */
3977e716630dSMartin Matuska if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf)
3978e716630dSMartin Matuska return (B_TRUE);
3979e716630dSMartin Matuska }
3980e716630dSMartin Matuska return (B_FALSE);
3981e716630dSMartin Matuska }
3982e716630dSMartin Matuska
3983e716630dSMartin Matuska static boolean_t
raidz_reflow_impl(vdev_t * vd,vdev_raidz_expand_t * vre,zfs_range_tree_t * rt,dmu_tx_t * tx)3984b59a0cdeSMartin Matuska raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, zfs_range_tree_t *rt,
3985e716630dSMartin Matuska dmu_tx_t *tx)
3986e716630dSMartin Matuska {
3987e716630dSMartin Matuska spa_t *spa = vd->vdev_spa;
398817aab35aSMartin Matuska uint_t ashift = vd->vdev_top->vdev_ashift;
3989e716630dSMartin Matuska
3990b59a0cdeSMartin Matuska zfs_range_seg_t *rs = zfs_range_tree_first(rt);
399117aab35aSMartin Matuska if (rt == NULL)
3992e716630dSMartin Matuska return (B_FALSE);
3993b59a0cdeSMartin Matuska uint64_t offset = zfs_rs_get_start(rs, rt);
3994e716630dSMartin Matuska ASSERT(IS_P2ALIGNED(offset, 1 << ashift));
3995b59a0cdeSMartin Matuska uint64_t size = zfs_rs_get_end(rs, rt) - offset;
3996e716630dSMartin Matuska ASSERT3U(size, >=, 1 << ashift);
399717aab35aSMartin Matuska ASSERT(IS_P2ALIGNED(size, 1 << ashift));
3998e716630dSMartin Matuska
3999e716630dSMartin Matuska uint64_t blkid = offset >> ashift;
400017aab35aSMartin Matuska uint_t old_children = vd->vdev_children - 1;
4001e716630dSMartin Matuska
4002e716630dSMartin Matuska /*
4003e716630dSMartin Matuska * We can only progress to the point that writes will not overlap
4004e716630dSMartin Matuska * with blocks whose progress has not yet been recorded on disk.
4005e716630dSMartin Matuska * Since partially-copied rows are still read from the old location,
4006e716630dSMartin Matuska * we need to stop one row before the sector-wise overlap, to prevent
4007e716630dSMartin Matuska * row-wise overlap.
4008e716630dSMartin Matuska *
4009e716630dSMartin Matuska * Note that even if we are skipping over a large unallocated region,
4010e716630dSMartin Matuska * we can't move the on-disk progress to `offset`, because concurrent
4011e716630dSMartin Matuska * writes/allocations could still use the currently-unallocated
4012e716630dSMartin Matuska * region.
4013e716630dSMartin Matuska */
4014e716630dSMartin Matuska uint64_t ubsync_blkid =
4015e716630dSMartin Matuska RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift;
4016e716630dSMartin Matuska uint64_t next_overwrite_blkid = ubsync_blkid +
4017e716630dSMartin Matuska ubsync_blkid / old_children - old_children;
4018e716630dSMartin Matuska VERIFY3U(next_overwrite_blkid, >, ubsync_blkid);
4019e716630dSMartin Matuska if (blkid >= next_overwrite_blkid) {
4020e716630dSMartin Matuska raidz_reflow_record_progress(vre,
4021e716630dSMartin Matuska next_overwrite_blkid << ashift, tx);
4022e716630dSMartin Matuska return (B_TRUE);
4023e716630dSMartin Matuska }
4024e716630dSMartin Matuska
402517aab35aSMartin Matuska size = MIN(size, raidz_expand_max_copy_bytes);
402617aab35aSMartin Matuska size = MIN(size, (uint64_t)old_children *
402717aab35aSMartin Matuska MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE));
402817aab35aSMartin Matuska size = MAX(size, 1 << ashift);
402917aab35aSMartin Matuska uint_t blocks = MIN(size >> ashift, next_overwrite_blkid - blkid);
403017aab35aSMartin Matuska size = (uint64_t)blocks << ashift;
4031e716630dSMartin Matuska
4032b59a0cdeSMartin Matuska zfs_range_tree_remove(rt, offset, size);
403317aab35aSMartin Matuska
403417aab35aSMartin Matuska uint_t reads = MIN(blocks, old_children);
403517aab35aSMartin Matuska uint_t writes = MIN(blocks, vd->vdev_children);
403617aab35aSMartin Matuska raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra) +
403717aab35aSMartin Matuska sizeof (zio_t *) * writes, KM_SLEEP);
4038e716630dSMartin Matuska rra->rra_vre = vre;
4039e716630dSMartin Matuska rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock,
404017aab35aSMartin Matuska offset, size, RL_WRITER);
4041e716630dSMartin Matuska rra->rra_txg = dmu_tx_get_txg(tx);
404217aab35aSMartin Matuska rra->rra_ashift = ashift;
404317aab35aSMartin Matuska rra->rra_tbd = reads;
404417aab35aSMartin Matuska rra->rra_writes = writes;
4045e716630dSMartin Matuska
404617aab35aSMartin Matuska raidz_reflow_record_progress(vre, offset + size, tx);
4047e716630dSMartin Matuska
4048e716630dSMartin Matuska /*
4049e716630dSMartin Matuska * SCL_STATE will be released when the read and write are done,
4050e716630dSMartin Matuska * by raidz_reflow_write_done().
4051e716630dSMartin Matuska */
4052e716630dSMartin Matuska spa_config_enter(spa, SCL_STATE, spa, RW_READER);
4053e716630dSMartin Matuska
4054e716630dSMartin Matuska /* check if a replacing vdev was added, if so treat it as an error */
4055e716630dSMartin Matuska if (vdev_raidz_expand_child_replacing(vd)) {
4056e716630dSMartin Matuska zfs_dbgmsg("replacing vdev encountered, reflow paused at "
4057e716630dSMartin Matuska "offset=%llu txg=%llu",
4058e716630dSMartin Matuska (long long)rra->rra_lr->lr_offset,
4059e716630dSMartin Matuska (long long)rra->rra_txg);
4060e716630dSMartin Matuska
4061e716630dSMartin Matuska mutex_enter(&vre->vre_lock);
4062e716630dSMartin Matuska vre->vre_failed_offset =
4063e716630dSMartin Matuska MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
4064e716630dSMartin Matuska cv_signal(&vre->vre_cv);
4065e716630dSMartin Matuska mutex_exit(&vre->vre_lock);
4066e716630dSMartin Matuska
4067e716630dSMartin Matuska /* drop everything we acquired */
4068e716630dSMartin Matuska spa_config_exit(spa, SCL_STATE, spa);
406917aab35aSMartin Matuska zfs_rangelock_exit(rra->rra_lr);
407017aab35aSMartin Matuska kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * writes);
4071e716630dSMartin Matuska return (B_TRUE);
4072e716630dSMartin Matuska }
4073e716630dSMartin Matuska
407417aab35aSMartin Matuska mutex_enter(&vre->vre_lock);
407517aab35aSMartin Matuska vre->vre_outstanding_bytes += size;
407617aab35aSMartin Matuska mutex_exit(&vre->vre_lock);
4077e716630dSMartin Matuska
407817aab35aSMartin Matuska /* Allocate ABD and ZIO for each child we write. */
407917aab35aSMartin Matuska int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
408017aab35aSMartin Matuska zio_t *pio = spa->spa_txg_zio[txgoff];
408117aab35aSMartin Matuska uint_t b = blocks / vd->vdev_children;
408217aab35aSMartin Matuska uint_t bb = blocks % vd->vdev_children;
408317aab35aSMartin Matuska for (uint_t i = 0; i < writes; i++) {
408417aab35aSMartin Matuska uint_t n = b + (i < bb);
408517aab35aSMartin Matuska abd_t *abd = abd_alloc_for_io(n << ashift, B_FALSE);
408617aab35aSMartin Matuska rra->rra_zio[i] = zio_vdev_child_io(pio, NULL,
408717aab35aSMartin Matuska vd->vdev_child[(blkid + i) % vd->vdev_children],
408817aab35aSMartin Matuska ((blkid + i) / vd->vdev_children) << ashift,
408917aab35aSMartin Matuska abd, n << ashift, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
409017aab35aSMartin Matuska ZIO_FLAG_CANFAIL, raidz_reflow_write_done, rra);
409117aab35aSMartin Matuska }
409217aab35aSMartin Matuska
409317aab35aSMartin Matuska /*
409417aab35aSMartin Matuska * Allocate and issue ZIO for each child we read. For reads of only
409517aab35aSMartin Matuska * one block we can use respective writer ABDs, since they will also
409617aab35aSMartin Matuska * have only one block. For bigger reads create gang ABDs and fill
409717aab35aSMartin Matuska * them with respective blocks from writer ABDs.
409817aab35aSMartin Matuska */
409917aab35aSMartin Matuska b = blocks / old_children;
410017aab35aSMartin Matuska bb = blocks % old_children;
410117aab35aSMartin Matuska for (uint_t i = 0; i < reads; i++) {
410217aab35aSMartin Matuska uint_t n = b + (i < bb);
410317aab35aSMartin Matuska abd_t *abd;
410417aab35aSMartin Matuska if (n > 1) {
410517aab35aSMartin Matuska abd = abd_alloc_gang();
410617aab35aSMartin Matuska for (uint_t j = 0; j < n; j++) {
410717aab35aSMartin Matuska uint_t b = j * old_children + i;
410817aab35aSMartin Matuska abd_t *cabd = abd_get_offset_size(
410917aab35aSMartin Matuska rra->rra_zio[b % vd->vdev_children]->io_abd,
411017aab35aSMartin Matuska (b / vd->vdev_children) << ashift,
411117aab35aSMartin Matuska 1 << ashift);
411217aab35aSMartin Matuska abd_gang_add(abd, cabd, B_TRUE);
411317aab35aSMartin Matuska }
411417aab35aSMartin Matuska } else {
411517aab35aSMartin Matuska abd = rra->rra_zio[i]->io_abd;
411617aab35aSMartin Matuska }
411717aab35aSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL,
411817aab35aSMartin Matuska vd->vdev_child[(blkid + i) % old_children],
411917aab35aSMartin Matuska ((blkid + i) / old_children) << ashift, abd,
412017aab35aSMartin Matuska n << ashift, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
412117aab35aSMartin Matuska ZIO_FLAG_CANFAIL, raidz_reflow_read_done, rra));
412217aab35aSMartin Matuska }
4123e716630dSMartin Matuska
4124e716630dSMartin Matuska return (B_FALSE);
4125e716630dSMartin Matuska }
4126e716630dSMartin Matuska
4127e716630dSMartin Matuska /*
4128e716630dSMartin Matuska * For testing (ztest specific)
4129e716630dSMartin Matuska */
4130e716630dSMartin Matuska static void
raidz_expand_pause(uint_t pause_point)4131e716630dSMartin Matuska raidz_expand_pause(uint_t pause_point)
4132e716630dSMartin Matuska {
4133e716630dSMartin Matuska while (raidz_expand_pause_point != 0 &&
4134e716630dSMartin Matuska raidz_expand_pause_point <= pause_point)
4135e716630dSMartin Matuska delay(hz);
4136e716630dSMartin Matuska }
4137e716630dSMartin Matuska
4138e716630dSMartin Matuska static void
raidz_scratch_child_done(zio_t * zio)4139e716630dSMartin Matuska raidz_scratch_child_done(zio_t *zio)
4140e716630dSMartin Matuska {
4141e716630dSMartin Matuska zio_t *pio = zio->io_private;
4142e716630dSMartin Matuska
4143e716630dSMartin Matuska mutex_enter(&pio->io_lock);
4144e716630dSMartin Matuska pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
4145e716630dSMartin Matuska mutex_exit(&pio->io_lock);
4146e716630dSMartin Matuska }
4147e716630dSMartin Matuska
4148e716630dSMartin Matuska /*
4149e716630dSMartin Matuska * Reflow the beginning portion of the vdev into an intermediate scratch area
4150e716630dSMartin Matuska * in memory and on disk. This operation must be persisted on disk before we
4151e716630dSMartin Matuska * proceed to overwrite the beginning portion with the reflowed data.
4152e716630dSMartin Matuska *
4153e716630dSMartin Matuska * This multi-step task can fail to complete if disk errors are encountered
4154e716630dSMartin Matuska * and we can return here after a pause (waiting for disk to become healthy).
4155e716630dSMartin Matuska */
4156e716630dSMartin Matuska static void
raidz_reflow_scratch_sync(void * arg,dmu_tx_t * tx)4157e716630dSMartin Matuska raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
4158e716630dSMartin Matuska {
4159e716630dSMartin Matuska vdev_raidz_expand_t *vre = arg;
4160e716630dSMartin Matuska spa_t *spa = dmu_tx_pool(tx)->dp_spa;
4161e716630dSMartin Matuska zio_t *pio;
4162e716630dSMartin Matuska int error;
4163e716630dSMartin Matuska
4164e716630dSMartin Matuska spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4165e716630dSMartin Matuska vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4166e716630dSMartin Matuska int ashift = raidvd->vdev_ashift;
4167aca928a5SMartin Matuska uint64_t write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << ashift,
4168aca928a5SMartin Matuska uint64_t);
4169e716630dSMartin Matuska uint64_t logical_size = write_size * raidvd->vdev_children;
4170e716630dSMartin Matuska uint64_t read_size =
4171e716630dSMartin Matuska P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)),
4172e716630dSMartin Matuska 1 << ashift);
4173e716630dSMartin Matuska
4174e716630dSMartin Matuska /*
4175e716630dSMartin Matuska * The scratch space must be large enough to get us to the point
4176e716630dSMartin Matuska * that one row does not overlap itself when moved. This is checked
4177e716630dSMartin Matuska * by vdev_raidz_attach_check().
4178e716630dSMartin Matuska */
4179e716630dSMartin Matuska VERIFY3U(write_size, >=, raidvd->vdev_children << ashift);
4180e716630dSMartin Matuska VERIFY3U(write_size, <=, VDEV_BOOT_SIZE);
4181e716630dSMartin Matuska VERIFY3U(write_size, <=, read_size);
4182e716630dSMartin Matuska
4183e716630dSMartin Matuska zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
4184e716630dSMartin Matuska 0, logical_size, RL_WRITER);
4185e716630dSMartin Matuska
4186e716630dSMartin Matuska abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
4187e716630dSMartin Matuska KM_SLEEP);
4188e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) {
4189e716630dSMartin Matuska abds[i] = abd_alloc_linear(read_size, B_FALSE);
4190e716630dSMartin Matuska }
4191e716630dSMartin Matuska
4192e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1);
4193e716630dSMartin Matuska
4194e716630dSMartin Matuska /*
4195e716630dSMartin Matuska * If we have already written the scratch area then we must read from
4196e716630dSMartin Matuska * there, since new writes were redirected there while we were paused
4197e716630dSMartin Matuska * or the original location may have been partially overwritten with
4198e716630dSMartin Matuska * reflowed data.
4199e716630dSMartin Matuska */
4200e716630dSMartin Matuska if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) {
4201e716630dSMartin Matuska VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size);
4202e716630dSMartin Matuska /*
4203e716630dSMartin Matuska * Read from scratch space.
4204e716630dSMartin Matuska */
4205e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4206e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) {
4207e716630dSMartin Matuska /*
4208e716630dSMartin Matuska * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE
4209e716630dSMartin Matuska * to the offset to calculate the physical offset to
4210e716630dSMartin Matuska * write to. Passing in a negative offset makes us
4211e716630dSMartin Matuska * access the scratch area.
4212e716630dSMartin Matuska */
4213e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL,
4214e716630dSMartin Matuska raidvd->vdev_child[i],
4215e716630dSMartin Matuska VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
421617aab35aSMartin Matuska write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
4217e716630dSMartin Matuska ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
4218e716630dSMartin Matuska }
4219e716630dSMartin Matuska error = zio_wait(pio);
4220e716630dSMartin Matuska if (error != 0) {
4221e716630dSMartin Matuska zfs_dbgmsg("reflow: error %d reading scratch location",
4222e716630dSMartin Matuska error);
4223e716630dSMartin Matuska goto io_error_exit;
4224e716630dSMartin Matuska }
4225e716630dSMartin Matuska goto overwrite;
4226e716630dSMartin Matuska }
4227e716630dSMartin Matuska
4228e716630dSMartin Matuska /*
4229e716630dSMartin Matuska * Read from original location.
4230e716630dSMartin Matuska */
4231e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4232e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children - 1; i++) {
4233e716630dSMartin Matuska ASSERT0(vdev_is_dead(raidvd->vdev_child[i]));
4234e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4235e716630dSMartin Matuska 0, abds[i], read_size, ZIO_TYPE_READ,
423617aab35aSMartin Matuska ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
4237e716630dSMartin Matuska raidz_scratch_child_done, pio));
4238e716630dSMartin Matuska }
4239e716630dSMartin Matuska error = zio_wait(pio);
4240e716630dSMartin Matuska if (error != 0) {
4241e716630dSMartin Matuska zfs_dbgmsg("reflow: error %d reading original location", error);
4242e716630dSMartin Matuska io_error_exit:
4243e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++)
4244e716630dSMartin Matuska abd_free(abds[i]);
4245e716630dSMartin Matuska kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4246e716630dSMartin Matuska zfs_rangelock_exit(lr);
4247e716630dSMartin Matuska spa_config_exit(spa, SCL_STATE, FTAG);
4248e716630dSMartin Matuska return;
4249e716630dSMartin Matuska }
4250e716630dSMartin Matuska
4251e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2);
4252e716630dSMartin Matuska
4253e716630dSMartin Matuska /*
4254e716630dSMartin Matuska * Reflow in memory.
4255e716630dSMartin Matuska */
4256e716630dSMartin Matuska uint64_t logical_sectors = logical_size >> ashift;
4257e716630dSMartin Matuska for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) {
4258e716630dSMartin Matuska int oldchild = i % (raidvd->vdev_children - 1);
4259e716630dSMartin Matuska uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift;
4260e716630dSMartin Matuska
4261e716630dSMartin Matuska int newchild = i % raidvd->vdev_children;
4262e716630dSMartin Matuska uint64_t newoff = (i / raidvd->vdev_children) << ashift;
4263e716630dSMartin Matuska
4264e716630dSMartin Matuska /* a single sector should not be copying over itself */
4265e716630dSMartin Matuska ASSERT(!(newchild == oldchild && newoff == oldoff));
4266e716630dSMartin Matuska
4267e716630dSMartin Matuska abd_copy_off(abds[newchild], abds[oldchild],
4268e716630dSMartin Matuska newoff, oldoff, 1 << ashift);
4269e716630dSMartin Matuska }
4270e716630dSMartin Matuska
4271e716630dSMartin Matuska /*
4272e716630dSMartin Matuska * Verify that we filled in everything we intended to (write_size on
4273e716630dSMartin Matuska * each child).
4274e716630dSMartin Matuska */
4275e716630dSMartin Matuska VERIFY0(logical_sectors % raidvd->vdev_children);
4276e716630dSMartin Matuska VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==,
4277e716630dSMartin Matuska write_size);
4278e716630dSMartin Matuska
4279e716630dSMartin Matuska /*
4280e716630dSMartin Matuska * Write to scratch location (boot area).
4281e716630dSMartin Matuska */
4282e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4283e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) {
4284e716630dSMartin Matuska /*
4285e716630dSMartin Matuska * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
4286e716630dSMartin Matuska * the offset to calculate the physical offset to write to.
4287e716630dSMartin Matuska * Passing in a negative offset lets us access the boot area.
4288e716630dSMartin Matuska */
4289e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4290e716630dSMartin Matuska VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
429117aab35aSMartin Matuska write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
4292e716630dSMartin Matuska ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
4293e716630dSMartin Matuska }
4294e716630dSMartin Matuska error = zio_wait(pio);
4295e716630dSMartin Matuska if (error != 0) {
4296e716630dSMartin Matuska zfs_dbgmsg("reflow: error %d writing scratch location", error);
4297e716630dSMartin Matuska goto io_error_exit;
4298e716630dSMartin Matuska }
4299e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, 0);
4300e716630dSMartin Matuska zio_flush(pio, raidvd);
4301e716630dSMartin Matuska zio_wait(pio);
4302e716630dSMartin Matuska
4303e716630dSMartin Matuska zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area",
4304e716630dSMartin Matuska (long long)logical_size);
4305e716630dSMartin Matuska
4306e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3);
4307e716630dSMartin Matuska
4308e716630dSMartin Matuska /*
4309e716630dSMartin Matuska * Update uberblock to indicate that scratch space is valid. This is
4310e716630dSMartin Matuska * needed because after this point, the real location may be
4311e716630dSMartin Matuska * overwritten. If we crash, we need to get the data from the
4312e716630dSMartin Matuska * scratch space, rather than the real location.
4313e716630dSMartin Matuska *
4314e716630dSMartin Matuska * Note: ub_timestamp is bumped so that vdev_uberblock_compare()
4315e716630dSMartin Matuska * will prefer this uberblock.
4316e716630dSMartin Matuska */
4317e716630dSMartin Matuska RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size);
4318e716630dSMartin Matuska spa->spa_ubsync.ub_timestamp++;
4319e716630dSMartin Matuska ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4320e716630dSMartin Matuska &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4321e716630dSMartin Matuska if (spa_multihost(spa))
4322e716630dSMartin Matuska mmp_update_uberblock(spa, &spa->spa_ubsync);
4323e716630dSMartin Matuska
4324e716630dSMartin Matuska zfs_dbgmsg("reflow: uberblock updated "
4325e716630dSMartin Matuska "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)",
4326e716630dSMartin Matuska (long long)spa->spa_ubsync.ub_txg,
4327e716630dSMartin Matuska (long long)logical_size,
4328e716630dSMartin Matuska (long long)spa->spa_ubsync.ub_timestamp);
4329e716630dSMartin Matuska
4330e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID);
4331e716630dSMartin Matuska
4332e716630dSMartin Matuska /*
4333e716630dSMartin Matuska * Overwrite with reflow'ed data.
4334e716630dSMartin Matuska */
4335e716630dSMartin Matuska overwrite:
4336e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4337e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) {
4338e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4339e716630dSMartin Matuska 0, abds[i], write_size, ZIO_TYPE_WRITE,
434017aab35aSMartin Matuska ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
4341e716630dSMartin Matuska raidz_scratch_child_done, pio));
4342e716630dSMartin Matuska }
4343e716630dSMartin Matuska error = zio_wait(pio);
4344e716630dSMartin Matuska if (error != 0) {
4345e716630dSMartin Matuska /*
4346e716630dSMartin Matuska * When we exit early here and drop the range lock, new
4347e716630dSMartin Matuska * writes will go into the scratch area so we'll need to
4348e716630dSMartin Matuska * read from there when we return after pausing.
4349e716630dSMartin Matuska */
4350e716630dSMartin Matuska zfs_dbgmsg("reflow: error %d writing real location", error);
4351e716630dSMartin Matuska /*
4352e716630dSMartin Matuska * Update the uberblock that is written when this txg completes.
4353e716630dSMartin Matuska */
4354e716630dSMartin Matuska RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID,
4355e716630dSMartin Matuska logical_size);
4356e716630dSMartin Matuska goto io_error_exit;
4357e716630dSMartin Matuska }
4358e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, 0);
4359e716630dSMartin Matuska zio_flush(pio, raidvd);
4360e716630dSMartin Matuska zio_wait(pio);
4361e716630dSMartin Matuska
4362e716630dSMartin Matuska zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location",
4363e716630dSMartin Matuska (long long)logical_size);
4364e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++)
4365e716630dSMartin Matuska abd_free(abds[i]);
4366e716630dSMartin Matuska kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4367e716630dSMartin Matuska
4368e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED);
4369e716630dSMartin Matuska
4370e716630dSMartin Matuska /*
4371e716630dSMartin Matuska * Update uberblock to indicate that the initial part has been
4372e716630dSMartin Matuska * reflow'ed. This is needed because after this point (when we exit
4373e716630dSMartin Matuska * the rangelock), we allow regular writes to this region, which will
4374e716630dSMartin Matuska * be written to the new location only (because reflow_offset_next ==
4375e716630dSMartin Matuska * reflow_offset_synced). If we crashed and re-copied from the
4376e716630dSMartin Matuska * scratch space, we would lose the regular writes.
4377e716630dSMartin Matuska */
4378e716630dSMartin Matuska RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED,
4379e716630dSMartin Matuska logical_size);
4380e716630dSMartin Matuska spa->spa_ubsync.ub_timestamp++;
4381e716630dSMartin Matuska ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4382e716630dSMartin Matuska &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4383e716630dSMartin Matuska if (spa_multihost(spa))
4384e716630dSMartin Matuska mmp_update_uberblock(spa, &spa->spa_ubsync);
4385e716630dSMartin Matuska
4386e716630dSMartin Matuska zfs_dbgmsg("reflow: uberblock updated "
4387e716630dSMartin Matuska "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
4388e716630dSMartin Matuska (long long)spa->spa_ubsync.ub_txg,
4389e716630dSMartin Matuska (long long)logical_size,
4390e716630dSMartin Matuska (long long)spa->spa_ubsync.ub_timestamp);
4391e716630dSMartin Matuska
4392e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1);
4393e716630dSMartin Matuska
4394e716630dSMartin Matuska /*
4395e716630dSMartin Matuska * Update progress.
4396e716630dSMartin Matuska */
4397e716630dSMartin Matuska vre->vre_offset = logical_size;
4398e716630dSMartin Matuska zfs_rangelock_exit(lr);
4399e716630dSMartin Matuska spa_config_exit(spa, SCL_STATE, FTAG);
4400e716630dSMartin Matuska
4401e716630dSMartin Matuska int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4402e716630dSMartin Matuska vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
4403e716630dSMartin Matuska vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
4404e716630dSMartin Matuska /*
4405e716630dSMartin Matuska * Note - raidz_reflow_sync() will update the uberblock state to
4406e716630dSMartin Matuska * RRSS_SCRATCH_INVALID_SYNCED_REFLOW
4407e716630dSMartin Matuska */
4408e716630dSMartin Matuska raidz_reflow_sync(spa, tx);
4409e716630dSMartin Matuska
4410e716630dSMartin Matuska raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2);
4411e716630dSMartin Matuska }
4412e716630dSMartin Matuska
4413e716630dSMartin Matuska /*
4414e716630dSMartin Matuska * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work
4415e716630dSMartin Matuska * here. No other i/o can be in progress, so we don't need the vre_rangelock.
4416e716630dSMartin Matuska */
4417e716630dSMartin Matuska void
vdev_raidz_reflow_copy_scratch(spa_t * spa)4418e716630dSMartin Matuska vdev_raidz_reflow_copy_scratch(spa_t *spa)
4419e716630dSMartin Matuska {
4420e716630dSMartin Matuska vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4421e716630dSMartin Matuska uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock);
4422e716630dSMartin Matuska ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID);
4423e716630dSMartin Matuska
4424e716630dSMartin Matuska spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4425e716630dSMartin Matuska vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4426e716630dSMartin Matuska ASSERT0(logical_size % raidvd->vdev_children);
4427e716630dSMartin Matuska uint64_t write_size = logical_size / raidvd->vdev_children;
4428e716630dSMartin Matuska
4429e716630dSMartin Matuska zio_t *pio;
4430e716630dSMartin Matuska
4431e716630dSMartin Matuska /*
4432e716630dSMartin Matuska * Read from scratch space.
4433e716630dSMartin Matuska */
4434e716630dSMartin Matuska abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
4435e716630dSMartin Matuska KM_SLEEP);
4436e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) {
4437e716630dSMartin Matuska abds[i] = abd_alloc_linear(write_size, B_FALSE);
4438e716630dSMartin Matuska }
4439e716630dSMartin Matuska
4440e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, 0);
4441e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) {
4442e716630dSMartin Matuska /*
4443e716630dSMartin Matuska * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
4444e716630dSMartin Matuska * the offset to calculate the physical offset to write to.
4445e716630dSMartin Matuska * Passing in a negative offset lets us access the boot area.
4446e716630dSMartin Matuska */
4447e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4448e716630dSMartin Matuska VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
444917aab35aSMartin Matuska write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 0,
4450e716630dSMartin Matuska raidz_scratch_child_done, pio));
4451e716630dSMartin Matuska }
4452e716630dSMartin Matuska zio_wait(pio);
4453e716630dSMartin Matuska
4454e716630dSMartin Matuska /*
4455e716630dSMartin Matuska * Overwrite real location with reflow'ed data.
4456e716630dSMartin Matuska */
4457e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, 0);
4458e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++) {
4459e716630dSMartin Matuska zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4460e716630dSMartin Matuska 0, abds[i], write_size, ZIO_TYPE_WRITE,
446117aab35aSMartin Matuska ZIO_PRIORITY_REMOVAL, 0,
4462e716630dSMartin Matuska raidz_scratch_child_done, pio));
4463e716630dSMartin Matuska }
4464e716630dSMartin Matuska zio_wait(pio);
4465e716630dSMartin Matuska pio = zio_root(spa, NULL, NULL, 0);
4466e716630dSMartin Matuska zio_flush(pio, raidvd);
4467e716630dSMartin Matuska zio_wait(pio);
4468e716630dSMartin Matuska
4469e716630dSMartin Matuska zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) "
4470e716630dSMartin Matuska "to real location", (long long)logical_size);
4471e716630dSMartin Matuska
4472e716630dSMartin Matuska for (int i = 0; i < raidvd->vdev_children; i++)
4473e716630dSMartin Matuska abd_free(abds[i]);
4474e716630dSMartin Matuska kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4475e716630dSMartin Matuska
4476e716630dSMartin Matuska /*
4477e716630dSMartin Matuska * Update uberblock.
4478e716630dSMartin Matuska */
4479e716630dSMartin Matuska RAIDZ_REFLOW_SET(&spa->spa_ubsync,
4480e716630dSMartin Matuska RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size);
4481e716630dSMartin Matuska spa->spa_ubsync.ub_timestamp++;
4482e716630dSMartin Matuska VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4483e716630dSMartin Matuska &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4484e716630dSMartin Matuska if (spa_multihost(spa))
4485e716630dSMartin Matuska mmp_update_uberblock(spa, &spa->spa_ubsync);
4486e716630dSMartin Matuska
4487e716630dSMartin Matuska zfs_dbgmsg("reflow recovery: uberblock updated "
4488e716630dSMartin Matuska "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
4489e716630dSMartin Matuska (long long)spa->spa_ubsync.ub_txg,
4490e716630dSMartin Matuska (long long)logical_size,
4491e716630dSMartin Matuska (long long)spa->spa_ubsync.ub_timestamp);
4492e716630dSMartin Matuska
4493e716630dSMartin Matuska dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool,
4494e716630dSMartin Matuska spa_first_txg(spa));
4495e716630dSMartin Matuska int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4496e716630dSMartin Matuska vre->vre_offset = logical_size;
4497e716630dSMartin Matuska vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
4498e716630dSMartin Matuska vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
4499e716630dSMartin Matuska /*
4500e716630dSMartin Matuska * Note that raidz_reflow_sync() will update the uberblock once more
4501e716630dSMartin Matuska */
4502e716630dSMartin Matuska raidz_reflow_sync(spa, tx);
4503e716630dSMartin Matuska
4504e716630dSMartin Matuska dmu_tx_commit(tx);
4505e716630dSMartin Matuska
4506e716630dSMartin Matuska spa_config_exit(spa, SCL_STATE, FTAG);
4507e716630dSMartin Matuska }
4508e716630dSMartin Matuska
4509e716630dSMartin Matuska static boolean_t
spa_raidz_expand_thread_check(void * arg,zthr_t * zthr)4510e716630dSMartin Matuska spa_raidz_expand_thread_check(void *arg, zthr_t *zthr)
4511e716630dSMartin Matuska {
4512e716630dSMartin Matuska (void) zthr;
4513e716630dSMartin Matuska spa_t *spa = arg;
4514e716630dSMartin Matuska
4515e716630dSMartin Matuska return (spa->spa_raidz_expand != NULL &&
4516e716630dSMartin Matuska !spa->spa_raidz_expand->vre_waiting_for_resilver);
4517e716630dSMartin Matuska }
4518e716630dSMartin Matuska
4519e716630dSMartin Matuska /*
4520e716630dSMartin Matuska * RAIDZ expansion background thread
4521e716630dSMartin Matuska *
4522e716630dSMartin Matuska * Can be called multiple times if the reflow is paused
4523e716630dSMartin Matuska */
4524e716630dSMartin Matuska static void
spa_raidz_expand_thread(void * arg,zthr_t * zthr)4525e716630dSMartin Matuska spa_raidz_expand_thread(void *arg, zthr_t *zthr)
4526e716630dSMartin Matuska {
4527e716630dSMartin Matuska spa_t *spa = arg;
4528e716630dSMartin Matuska vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4529e716630dSMartin Matuska
4530e716630dSMartin Matuska if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID)
4531e716630dSMartin Matuska vre->vre_offset = 0;
4532e716630dSMartin Matuska else
4533e716630dSMartin Matuska vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync);
4534e716630dSMartin Matuska
	/* Reflow the beginning portion using the scratch area */
4536e716630dSMartin Matuska if (vre->vre_offset == 0) {
4537e716630dSMartin Matuska VERIFY0(dsl_sync_task(spa_name(spa),
4538e716630dSMartin Matuska NULL, raidz_reflow_scratch_sync,
4539e716630dSMartin Matuska vre, 0, ZFS_SPACE_CHECK_NONE));
4540e716630dSMartin Matuska
4541e716630dSMartin Matuska /* if we encountered errors then pause */
4542e716630dSMartin Matuska if (vre->vre_offset == 0) {
4543e716630dSMartin Matuska mutex_enter(&vre->vre_lock);
4544e716630dSMartin Matuska vre->vre_waiting_for_resilver = B_TRUE;
4545e716630dSMartin Matuska mutex_exit(&vre->vre_lock);
4546e716630dSMartin Matuska return;
4547e716630dSMartin Matuska }
4548e716630dSMartin Matuska }
4549e716630dSMartin Matuska
4550e716630dSMartin Matuska spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4551e716630dSMartin Matuska vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4552e716630dSMartin Matuska
4553e716630dSMartin Matuska uint64_t guid = raidvd->vdev_guid;
4554e716630dSMartin Matuska
4555e716630dSMartin Matuska /* Iterate over all the remaining metaslabs */
4556e716630dSMartin Matuska for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift;
4557e716630dSMartin Matuska i < raidvd->vdev_ms_count &&
4558e716630dSMartin Matuska !zthr_iscancelled(zthr) &&
4559e716630dSMartin Matuska vre->vre_failed_offset == UINT64_MAX; i++) {
4560e716630dSMartin Matuska metaslab_t *msp = raidvd->vdev_ms[i];
4561e716630dSMartin Matuska
4562e716630dSMartin Matuska metaslab_disable(msp);
4563e716630dSMartin Matuska mutex_enter(&msp->ms_lock);
4564e716630dSMartin Matuska
4565e716630dSMartin Matuska /*
4566e716630dSMartin Matuska * The metaslab may be newly created (for the expanded
4567e716630dSMartin Matuska * space), in which case its trees won't exist yet,
4568e716630dSMartin Matuska * so we need to bail out early.
4569e716630dSMartin Matuska */
4570e716630dSMartin Matuska if (msp->ms_new) {
4571e716630dSMartin Matuska mutex_exit(&msp->ms_lock);
4572e716630dSMartin Matuska metaslab_enable(msp, B_FALSE, B_FALSE);
4573e716630dSMartin Matuska continue;
4574e716630dSMartin Matuska }
4575e716630dSMartin Matuska
4576e716630dSMartin Matuska VERIFY0(metaslab_load(msp));
4577e716630dSMartin Matuska
4578e716630dSMartin Matuska /*
4579e716630dSMartin Matuska * We want to copy everything except the free (allocatable)
4580e716630dSMartin Matuska * space. Note that there may be a little bit more free
4581e716630dSMartin Matuska * space (e.g. in ms_defer), and it's fine to copy that too.
4582e716630dSMartin Matuska */
458317aab35aSMartin Matuska uint64_t shift, start;
4584b59a0cdeSMartin Matuska zfs_range_seg_type_t type = metaslab_calculate_range_tree_type(
458517aab35aSMartin Matuska raidvd, msp, &start, &shift);
4586b59a0cdeSMartin Matuska zfs_range_tree_t *rt = zfs_range_tree_create(NULL, type, NULL,
458717aab35aSMartin Matuska start, shift);
4588b59a0cdeSMartin Matuska zfs_range_tree_add(rt, msp->ms_start, msp->ms_size);
4589b59a0cdeSMartin Matuska zfs_range_tree_walk(msp->ms_allocatable, zfs_range_tree_remove,
4590b59a0cdeSMartin Matuska rt);
4591e716630dSMartin Matuska mutex_exit(&msp->ms_lock);
4592e716630dSMartin Matuska
4593e716630dSMartin Matuska /*
4594e716630dSMartin Matuska * Force the last sector of each metaslab to be copied. This
4595e716630dSMartin Matuska * ensures that we advance the on-disk progress to the end of
4596e716630dSMartin Matuska * this metaslab while the metaslab is disabled. Otherwise, we
4597e716630dSMartin Matuska * could move past this metaslab without advancing the on-disk
4598e716630dSMartin Matuska * progress, and then an allocation to this metaslab would not
4599e716630dSMartin Matuska * be copied.
4600e716630dSMartin Matuska */
4601e716630dSMartin Matuska int sectorsz = 1 << raidvd->vdev_ashift;
4602e716630dSMartin Matuska uint64_t ms_last_offset = msp->ms_start +
4603e716630dSMartin Matuska msp->ms_size - sectorsz;
4604b59a0cdeSMartin Matuska if (!zfs_range_tree_contains(rt, ms_last_offset, sectorsz)) {
4605b59a0cdeSMartin Matuska zfs_range_tree_add(rt, ms_last_offset, sectorsz);
4606e716630dSMartin Matuska }
4607e716630dSMartin Matuska
4608e716630dSMartin Matuska /*
4609e716630dSMartin Matuska * When we are resuming from a paused expansion (i.e.
		 * when importing a pool with an expansion in progress),
4611e716630dSMartin Matuska * discard any state that we have already processed.
4612e716630dSMartin Matuska */
461317aab35aSMartin Matuska if (vre->vre_offset > msp->ms_start) {
4614b59a0cdeSMartin Matuska zfs_range_tree_clear(rt, msp->ms_start,
461517aab35aSMartin Matuska vre->vre_offset - msp->ms_start);
461617aab35aSMartin Matuska }
4617e716630dSMartin Matuska
4618e716630dSMartin Matuska while (!zthr_iscancelled(zthr) &&
4619b59a0cdeSMartin Matuska !zfs_range_tree_is_empty(rt) &&
4620e716630dSMartin Matuska vre->vre_failed_offset == UINT64_MAX) {
4621e716630dSMartin Matuska
4622e716630dSMartin Matuska /*
4623e716630dSMartin Matuska * We need to periodically drop the config lock so that
4624e716630dSMartin Matuska * writers can get in. Additionally, we can't wait
4625e716630dSMartin Matuska * for a txg to sync while holding a config lock
4626e716630dSMartin Matuska * (since a waiting writer could cause a 3-way deadlock
4627e716630dSMartin Matuska * with the sync thread, which also gets a config
4628e716630dSMartin Matuska * lock for reader). So we can't hold the config lock
4629e716630dSMartin Matuska * while calling dmu_tx_assign().
4630e716630dSMartin Matuska */
4631e716630dSMartin Matuska spa_config_exit(spa, SCL_CONFIG, FTAG);
4632e716630dSMartin Matuska
4633e716630dSMartin Matuska /*
4634e716630dSMartin Matuska * If requested, pause the reflow when the amount
4635e716630dSMartin Matuska * specified by raidz_expand_max_reflow_bytes is reached
4636e716630dSMartin Matuska *
4637e716630dSMartin Matuska * This pause is only used during testing or debugging.
4638e716630dSMartin Matuska */
4639e716630dSMartin Matuska while (raidz_expand_max_reflow_bytes != 0 &&
4640e716630dSMartin Matuska raidz_expand_max_reflow_bytes <=
4641e716630dSMartin Matuska vre->vre_bytes_copied && !zthr_iscancelled(zthr)) {
4642e716630dSMartin Matuska delay(hz);
4643e716630dSMartin Matuska }
4644e716630dSMartin Matuska
4645e716630dSMartin Matuska mutex_enter(&vre->vre_lock);
4646e716630dSMartin Matuska while (vre->vre_outstanding_bytes >
4647e716630dSMartin Matuska raidz_expand_max_copy_bytes) {
4648e716630dSMartin Matuska cv_wait(&vre->vre_cv, &vre->vre_lock);
4649e716630dSMartin Matuska }
4650e716630dSMartin Matuska mutex_exit(&vre->vre_lock);
4651e716630dSMartin Matuska
4652e716630dSMartin Matuska dmu_tx_t *tx =
4653e716630dSMartin Matuska dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
4654e716630dSMartin Matuska
4655*b1c1ee44SMartin Matuska VERIFY0(dmu_tx_assign(tx,
4656*b1c1ee44SMartin Matuska DMU_TX_WAIT | DMU_TX_SUSPEND));
4657e716630dSMartin Matuska uint64_t txg = dmu_tx_get_txg(tx);
4658e716630dSMartin Matuska
4659e716630dSMartin Matuska /*
4660e716630dSMartin Matuska * Reacquire the vdev_config lock. Theoretically, the
4661e716630dSMartin Matuska * vdev_t that we're expanding may have changed.
4662e716630dSMartin Matuska */
4663e716630dSMartin Matuska spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4664e716630dSMartin Matuska raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4665e716630dSMartin Matuska
4666e716630dSMartin Matuska boolean_t needsync =
4667e716630dSMartin Matuska raidz_reflow_impl(raidvd, vre, rt, tx);
4668e716630dSMartin Matuska
4669e716630dSMartin Matuska dmu_tx_commit(tx);
4670e716630dSMartin Matuska
4671e716630dSMartin Matuska if (needsync) {
4672e716630dSMartin Matuska spa_config_exit(spa, SCL_CONFIG, FTAG);
4673e716630dSMartin Matuska txg_wait_synced(spa->spa_dsl_pool, txg);
4674e716630dSMartin Matuska spa_config_enter(spa, SCL_CONFIG, FTAG,
4675e716630dSMartin Matuska RW_READER);
4676e716630dSMartin Matuska }
4677e716630dSMartin Matuska }
4678e716630dSMartin Matuska
4679e716630dSMartin Matuska spa_config_exit(spa, SCL_CONFIG, FTAG);
4680e716630dSMartin Matuska
4681e716630dSMartin Matuska metaslab_enable(msp, B_FALSE, B_FALSE);
4682b59a0cdeSMartin Matuska zfs_range_tree_vacate(rt, NULL, NULL);
4683b59a0cdeSMartin Matuska zfs_range_tree_destroy(rt);
4684e716630dSMartin Matuska
4685e716630dSMartin Matuska spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4686e716630dSMartin Matuska raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4687e716630dSMartin Matuska }
4688e716630dSMartin Matuska
4689e716630dSMartin Matuska spa_config_exit(spa, SCL_CONFIG, FTAG);
4690e716630dSMartin Matuska
4691e716630dSMartin Matuska /*
4692e716630dSMartin Matuska * The txg_wait_synced() here ensures that all reflow zio's have
4693e716630dSMartin Matuska * completed, and vre_failed_offset has been set if necessary. It
4694e716630dSMartin Matuska * also ensures that the progress of the last raidz_reflow_sync() is
4695e716630dSMartin Matuska * written to disk before raidz_reflow_complete_sync() changes the
4696e716630dSMartin Matuska * in-memory vre_state. vdev_raidz_io_start() uses vre_state to
4697e716630dSMartin Matuska * determine if a reflow is in progress, in which case we may need to
4698e716630dSMartin Matuska * write to both old and new locations. Therefore we can only change
4699e716630dSMartin Matuska * vre_state once this is not necessary, which is once the on-disk
4700e716630dSMartin Matuska * progress (in spa_ubsync) has been set past any possible writes (to
4701e716630dSMartin Matuska * the end of the last metaslab).
4702e716630dSMartin Matuska */
4703e716630dSMartin Matuska txg_wait_synced(spa->spa_dsl_pool, 0);
4704e716630dSMartin Matuska
4705e716630dSMartin Matuska if (!zthr_iscancelled(zthr) &&
4706e716630dSMartin Matuska vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) {
4707e716630dSMartin Matuska /*
4708e716630dSMartin Matuska * We are not being canceled or paused, so the reflow must be
4709e716630dSMartin Matuska * complete. In that case also mark it as completed on disk.
4710e716630dSMartin Matuska */
4711e716630dSMartin Matuska ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX);
4712e716630dSMartin Matuska VERIFY0(dsl_sync_task(spa_name(spa), NULL,
4713e716630dSMartin Matuska raidz_reflow_complete_sync, spa,
4714e716630dSMartin Matuska 0, ZFS_SPACE_CHECK_NONE));
4715e716630dSMartin Matuska (void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL);
4716e716630dSMartin Matuska } else {
4717e716630dSMartin Matuska /*
4718e716630dSMartin Matuska * Wait for all copy zio's to complete and for all the
4719e716630dSMartin Matuska * raidz_reflow_sync() synctasks to be run.
4720e716630dSMartin Matuska */
4721e716630dSMartin Matuska spa_history_log_internal(spa, "reflow pause",
4722e716630dSMartin Matuska NULL, "offset=%llu failed_offset=%lld",
4723e716630dSMartin Matuska (long long)vre->vre_offset,
4724e716630dSMartin Matuska (long long)vre->vre_failed_offset);
4725e716630dSMartin Matuska mutex_enter(&vre->vre_lock);
4726e716630dSMartin Matuska if (vre->vre_failed_offset != UINT64_MAX) {
4727e716630dSMartin Matuska /*
4728e716630dSMartin Matuska * Reset progress so that we will retry everything
4729e716630dSMartin Matuska * after the point that something failed.
4730e716630dSMartin Matuska */
4731e716630dSMartin Matuska vre->vre_offset = vre->vre_failed_offset;
4732e716630dSMartin Matuska vre->vre_failed_offset = UINT64_MAX;
4733e716630dSMartin Matuska vre->vre_waiting_for_resilver = B_TRUE;
4734e716630dSMartin Matuska }
4735e716630dSMartin Matuska mutex_exit(&vre->vre_lock);
4736e716630dSMartin Matuska }
4737e716630dSMartin Matuska }
4738e716630dSMartin Matuska
4739e716630dSMartin Matuska void
spa_start_raidz_expansion_thread(spa_t * spa)4740e716630dSMartin Matuska spa_start_raidz_expansion_thread(spa_t *spa)
4741e716630dSMartin Matuska {
4742e716630dSMartin Matuska ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL);
4743e716630dSMartin Matuska spa->spa_raidz_expand_zthr = zthr_create("raidz_expand",
4744e716630dSMartin Matuska spa_raidz_expand_thread_check, spa_raidz_expand_thread,
4745e716630dSMartin Matuska spa, defclsyspri);
4746e716630dSMartin Matuska }
4747e716630dSMartin Matuska
4748e716630dSMartin Matuska void
raidz_dtl_reassessed(vdev_t * vd)4749e716630dSMartin Matuska raidz_dtl_reassessed(vdev_t *vd)
4750e716630dSMartin Matuska {
4751e716630dSMartin Matuska spa_t *spa = vd->vdev_spa;
4752e716630dSMartin Matuska if (spa->spa_raidz_expand != NULL) {
4753e716630dSMartin Matuska vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4754e716630dSMartin Matuska /*
4755e716630dSMartin Matuska * we get called often from vdev_dtl_reassess() so make
4756e716630dSMartin Matuska * sure it's our vdev and any replacing is complete
4757e716630dSMartin Matuska */
4758e716630dSMartin Matuska if (vd->vdev_top->vdev_id == vre->vre_vdev_id &&
4759e716630dSMartin Matuska !vdev_raidz_expand_child_replacing(vd->vdev_top)) {
4760e716630dSMartin Matuska mutex_enter(&vre->vre_lock);
4761e716630dSMartin Matuska if (vre->vre_waiting_for_resilver) {
4762e716630dSMartin Matuska vdev_dbgmsg(vd, "DTL reassessed, "
4763e716630dSMartin Matuska "continuing raidz expansion");
4764e716630dSMartin Matuska vre->vre_waiting_for_resilver = B_FALSE;
4765e716630dSMartin Matuska zthr_wakeup(spa->spa_raidz_expand_zthr);
4766e716630dSMartin Matuska }
4767e716630dSMartin Matuska mutex_exit(&vre->vre_lock);
4768e716630dSMartin Matuska }
4769e716630dSMartin Matuska }
4770e716630dSMartin Matuska }
4771e716630dSMartin Matuska
4772e716630dSMartin Matuska int
vdev_raidz_attach_check(vdev_t * new_child)4773e716630dSMartin Matuska vdev_raidz_attach_check(vdev_t *new_child)
4774e716630dSMartin Matuska {
4775e716630dSMartin Matuska vdev_t *raidvd = new_child->vdev_parent;
4776e716630dSMartin Matuska uint64_t new_children = raidvd->vdev_children;
4777e716630dSMartin Matuska
4778e716630dSMartin Matuska /*
4779e716630dSMartin Matuska * We use the "boot" space as scratch space to handle overwriting the
4780e716630dSMartin Matuska * initial part of the vdev. If it is too small, then this expansion
4781e716630dSMartin Matuska * is not allowed. This would be very unusual (e.g. ashift > 13 and
4782e716630dSMartin Matuska * >200 children).
4783e716630dSMartin Matuska */
4784e716630dSMartin Matuska if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) {
4785e716630dSMartin Matuska return (EINVAL);
4786e716630dSMartin Matuska }
4787e716630dSMartin Matuska return (0);
4788e716630dSMartin Matuska }
4789e716630dSMartin Matuska
4790e716630dSMartin Matuska void
vdev_raidz_attach_sync(void * arg,dmu_tx_t * tx)4791e716630dSMartin Matuska vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx)
4792e716630dSMartin Matuska {
4793e716630dSMartin Matuska vdev_t *new_child = arg;
4794e716630dSMartin Matuska spa_t *spa = new_child->vdev_spa;
4795e716630dSMartin Matuska vdev_t *raidvd = new_child->vdev_parent;
4796e716630dSMartin Matuska vdev_raidz_t *vdrz = raidvd->vdev_tsd;
4797e716630dSMartin Matuska ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops);
4798e716630dSMartin Matuska ASSERT3P(raidvd->vdev_top, ==, raidvd);
4799e716630dSMartin Matuska ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width);
4800e716630dSMartin Matuska ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1);
4801e716630dSMartin Matuska ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==,
4802e716630dSMartin Matuska new_child);
4803e716630dSMartin Matuska
4804e716630dSMartin Matuska spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx);
4805e716630dSMartin Matuska
4806e716630dSMartin Matuska vdrz->vd_physical_width++;
4807e716630dSMartin Matuska
4808e716630dSMartin Matuska VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info);
4809e716630dSMartin Matuska vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id;
4810e716630dSMartin Matuska vdrz->vn_vre.vre_offset = 0;
4811e716630dSMartin Matuska vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
4812e716630dSMartin Matuska spa->spa_raidz_expand = &vdrz->vn_vre;
4813e716630dSMartin Matuska zthr_wakeup(spa->spa_raidz_expand_zthr);
4814e716630dSMartin Matuska
4815e716630dSMartin Matuska /*
4816e716630dSMartin Matuska * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get
4817e716630dSMartin Matuska * written to the config.
4818e716630dSMartin Matuska */
4819e716630dSMartin Matuska vdev_config_dirty(raidvd);
4820e716630dSMartin Matuska
4821e716630dSMartin Matuska vdrz->vn_vre.vre_start_time = gethrestime_sec();
4822e716630dSMartin Matuska vdrz->vn_vre.vre_end_time = 0;
4823e716630dSMartin Matuska vdrz->vn_vre.vre_state = DSS_SCANNING;
4824e716630dSMartin Matuska vdrz->vn_vre.vre_bytes_copied = 0;
4825e716630dSMartin Matuska
4826e716630dSMartin Matuska uint64_t state = vdrz->vn_vre.vre_state;
4827e716630dSMartin Matuska VERIFY0(zap_update(spa->spa_meta_objset,
4828e716630dSMartin Matuska raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
4829e716630dSMartin Matuska sizeof (state), 1, &state, tx));
4830e716630dSMartin Matuska
4831e716630dSMartin Matuska uint64_t start_time = vdrz->vn_vre.vre_start_time;
4832e716630dSMartin Matuska VERIFY0(zap_update(spa->spa_meta_objset,
4833e716630dSMartin Matuska raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
4834e716630dSMartin Matuska sizeof (start_time), 1, &start_time, tx));
4835e716630dSMartin Matuska
4836e716630dSMartin Matuska (void) zap_remove(spa->spa_meta_objset,
4837e716630dSMartin Matuska raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx);
4838e716630dSMartin Matuska (void) zap_remove(spa->spa_meta_objset,
4839e716630dSMartin Matuska raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx);
4840e716630dSMartin Matuska
4841e716630dSMartin Matuska spa_history_log_internal(spa, "raidz vdev expansion started", tx,
4842e716630dSMartin Matuska "%s vdev %llu new width %llu", spa_name(spa),
4843e716630dSMartin Matuska (unsigned long long)raidvd->vdev_id,
4844e716630dSMartin Matuska (unsigned long long)raidvd->vdev_children);
4845e716630dSMartin Matuska }
4846e716630dSMartin Matuska
4847e716630dSMartin Matuska int
vdev_raidz_load(vdev_t * vd)4848e716630dSMartin Matuska vdev_raidz_load(vdev_t *vd)
4849e716630dSMartin Matuska {
4850e716630dSMartin Matuska vdev_raidz_t *vdrz = vd->vdev_tsd;
4851e716630dSMartin Matuska int err;
4852e716630dSMartin Matuska
4853e716630dSMartin Matuska uint64_t state = DSS_NONE;
4854e716630dSMartin Matuska uint64_t start_time = 0;
4855e716630dSMartin Matuska uint64_t end_time = 0;
4856e716630dSMartin Matuska uint64_t bytes_copied = 0;
4857e716630dSMartin Matuska
4858e716630dSMartin Matuska if (vd->vdev_top_zap != 0) {
4859e716630dSMartin Matuska err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4860e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
4861e716630dSMartin Matuska sizeof (state), 1, &state);
4862e716630dSMartin Matuska if (err != 0 && err != ENOENT)
4863e716630dSMartin Matuska return (err);
4864e716630dSMartin Matuska
4865e716630dSMartin Matuska err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4866e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
4867e716630dSMartin Matuska sizeof (start_time), 1, &start_time);
4868e716630dSMartin Matuska if (err != 0 && err != ENOENT)
4869e716630dSMartin Matuska return (err);
4870e716630dSMartin Matuska
4871e716630dSMartin Matuska err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4872e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
4873e716630dSMartin Matuska sizeof (end_time), 1, &end_time);
4874e716630dSMartin Matuska if (err != 0 && err != ENOENT)
4875e716630dSMartin Matuska return (err);
4876e716630dSMartin Matuska
4877e716630dSMartin Matuska err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4878e716630dSMartin Matuska vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
4879e716630dSMartin Matuska sizeof (bytes_copied), 1, &bytes_copied);
4880e716630dSMartin Matuska if (err != 0 && err != ENOENT)
4881e716630dSMartin Matuska return (err);
4882e716630dSMartin Matuska }
4883e716630dSMartin Matuska
4884e716630dSMartin Matuska /*
4885e716630dSMartin Matuska * If we are in the middle of expansion, vre_state should have
4886e716630dSMartin Matuska * already been set by vdev_raidz_init().
4887e716630dSMartin Matuska */
4888e716630dSMartin Matuska EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING);
4889e716630dSMartin Matuska vdrz->vn_vre.vre_state = (dsl_scan_state_t)state;
4890e716630dSMartin Matuska vdrz->vn_vre.vre_start_time = start_time;
4891e716630dSMartin Matuska vdrz->vn_vre.vre_end_time = end_time;
4892e716630dSMartin Matuska vdrz->vn_vre.vre_bytes_copied = bytes_copied;
4893e716630dSMartin Matuska
4894e716630dSMartin Matuska return (0);
4895e716630dSMartin Matuska }
4896e716630dSMartin Matuska
4897e716630dSMartin Matuska int
spa_raidz_expand_get_stats(spa_t * spa,pool_raidz_expand_stat_t * pres)4898e716630dSMartin Matuska spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres)
4899e716630dSMartin Matuska {
4900e716630dSMartin Matuska vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4901e716630dSMartin Matuska
4902e716630dSMartin Matuska if (vre == NULL) {
4903e716630dSMartin Matuska /* no removal in progress; find most recent completed */
4904e716630dSMartin Matuska for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
4905e716630dSMartin Matuska vdev_t *vd = spa->spa_root_vdev->vdev_child[c];
4906e716630dSMartin Matuska if (vd->vdev_ops == &vdev_raidz_ops) {
4907e716630dSMartin Matuska vdev_raidz_t *vdrz = vd->vdev_tsd;
4908e716630dSMartin Matuska
4909e716630dSMartin Matuska if (vdrz->vn_vre.vre_end_time != 0 &&
4910e716630dSMartin Matuska (vre == NULL ||
4911e716630dSMartin Matuska vdrz->vn_vre.vre_end_time >
4912e716630dSMartin Matuska vre->vre_end_time)) {
4913e716630dSMartin Matuska vre = &vdrz->vn_vre;
4914e716630dSMartin Matuska }
4915e716630dSMartin Matuska }
4916e716630dSMartin Matuska }
4917e716630dSMartin Matuska }
4918e716630dSMartin Matuska
4919e716630dSMartin Matuska if (vre == NULL) {
4920e716630dSMartin Matuska return (SET_ERROR(ENOENT));
4921e716630dSMartin Matuska }
4922e716630dSMartin Matuska
4923e716630dSMartin Matuska pres->pres_state = vre->vre_state;
4924e716630dSMartin Matuska pres->pres_expanding_vdev = vre->vre_vdev_id;
4925e716630dSMartin Matuska
4926e716630dSMartin Matuska vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
4927e716630dSMartin Matuska pres->pres_to_reflow = vd->vdev_stat.vs_alloc;
4928e716630dSMartin Matuska
4929e716630dSMartin Matuska mutex_enter(&vre->vre_lock);
4930e716630dSMartin Matuska pres->pres_reflowed = vre->vre_bytes_copied;
4931e716630dSMartin Matuska for (int i = 0; i < TXG_SIZE; i++)
4932e716630dSMartin Matuska pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i];
4933e716630dSMartin Matuska mutex_exit(&vre->vre_lock);
4934e716630dSMartin Matuska
4935e716630dSMartin Matuska pres->pres_start_time = vre->vre_start_time;
4936e716630dSMartin Matuska pres->pres_end_time = vre->vre_end_time;
4937e716630dSMartin Matuska pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver;
4938e716630dSMartin Matuska
4939e716630dSMartin Matuska return (0);
4940e716630dSMartin Matuska }
4941e716630dSMartin Matuska
49427877fdebSMatt Macy /*
49437877fdebSMatt Macy * Initialize private RAIDZ specific fields from the nvlist.
49447877fdebSMatt Macy */
49457877fdebSMatt Macy static int
vdev_raidz_init(spa_t * spa,nvlist_t * nv,void ** tsd)49467877fdebSMatt Macy vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
49477877fdebSMatt Macy {
49487877fdebSMatt Macy uint_t children;
49497877fdebSMatt Macy nvlist_t **child;
49507877fdebSMatt Macy int error = nvlist_lookup_nvlist_array(nv,
49517877fdebSMatt Macy ZPOOL_CONFIG_CHILDREN, &child, &children);
49527877fdebSMatt Macy if (error != 0)
49537877fdebSMatt Macy return (SET_ERROR(EINVAL));
49547877fdebSMatt Macy
4955e716630dSMartin Matuska uint64_t nparity;
49567877fdebSMatt Macy if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) {
49577877fdebSMatt Macy if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
49587877fdebSMatt Macy return (SET_ERROR(EINVAL));
49597877fdebSMatt Macy
49607877fdebSMatt Macy /*
49617877fdebSMatt Macy * Previous versions could only support 1 or 2 parity
49627877fdebSMatt Macy * device.
49637877fdebSMatt Macy */
49647877fdebSMatt Macy if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2)
49657877fdebSMatt Macy return (SET_ERROR(EINVAL));
49667877fdebSMatt Macy else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3)
49677877fdebSMatt Macy return (SET_ERROR(EINVAL));
49687877fdebSMatt Macy } else {
49697877fdebSMatt Macy /*
49707877fdebSMatt Macy * We require the parity to be specified for SPAs that
49717877fdebSMatt Macy * support multiple parity levels.
49727877fdebSMatt Macy */
49737877fdebSMatt Macy if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
49747877fdebSMatt Macy return (SET_ERROR(EINVAL));
49757877fdebSMatt Macy
49767877fdebSMatt Macy /*
49777877fdebSMatt Macy * Otherwise, we default to 1 parity device for RAID-Z.
49787877fdebSMatt Macy */
49797877fdebSMatt Macy nparity = 1;
49807877fdebSMatt Macy }
49817877fdebSMatt Macy
4982e716630dSMartin Matuska vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
4983e716630dSMartin Matuska vdrz->vn_vre.vre_vdev_id = -1;
4984e716630dSMartin Matuska vdrz->vn_vre.vre_offset = UINT64_MAX;
4985e716630dSMartin Matuska vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
4986e716630dSMartin Matuska mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL);
4987e716630dSMartin Matuska cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL);
4988e716630dSMartin Matuska zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL);
4989e716630dSMartin Matuska mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL);
4990e716630dSMartin Matuska avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare,
4991e716630dSMartin Matuska sizeof (reflow_node_t), offsetof(reflow_node_t, re_link));
4992e716630dSMartin Matuska
4993e716630dSMartin Matuska vdrz->vd_physical_width = children;
49947877fdebSMatt Macy vdrz->vd_nparity = nparity;
49957877fdebSMatt Macy
4996e716630dSMartin Matuska /* note, the ID does not exist when creating a pool */
4997e716630dSMartin Matuska (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
4998e716630dSMartin Matuska &vdrz->vn_vre.vre_vdev_id);
4999e716630dSMartin Matuska
5000e716630dSMartin Matuska boolean_t reflow_in_progress =
5001e716630dSMartin Matuska nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
5002e716630dSMartin Matuska if (reflow_in_progress) {
5003e716630dSMartin Matuska spa->spa_raidz_expand = &vdrz->vn_vre;
5004e716630dSMartin Matuska vdrz->vn_vre.vre_state = DSS_SCANNING;
5005e716630dSMartin Matuska }
5006e716630dSMartin Matuska
5007e716630dSMartin Matuska vdrz->vd_original_width = children;
5008e716630dSMartin Matuska uint64_t *txgs;
5009e716630dSMartin Matuska unsigned int txgs_size = 0;
5010e716630dSMartin Matuska error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
5011e716630dSMartin Matuska &txgs, &txgs_size);
5012e716630dSMartin Matuska if (error == 0) {
5013e716630dSMartin Matuska for (int i = 0; i < txgs_size; i++) {
5014e716630dSMartin Matuska reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
5015e716630dSMartin Matuska re->re_txg = txgs[txgs_size - i - 1];
5016e716630dSMartin Matuska re->re_logical_width = vdrz->vd_physical_width - i;
5017e716630dSMartin Matuska
5018e716630dSMartin Matuska if (reflow_in_progress)
5019e716630dSMartin Matuska re->re_logical_width--;
5020e716630dSMartin Matuska
5021e716630dSMartin Matuska avl_add(&vdrz->vd_expand_txgs, re);
5022e716630dSMartin Matuska }
5023e716630dSMartin Matuska
5024e716630dSMartin Matuska vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size;
5025e716630dSMartin Matuska }
5026e716630dSMartin Matuska if (reflow_in_progress) {
5027e716630dSMartin Matuska vdrz->vd_original_width--;
5028e716630dSMartin Matuska zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions",
5029e716630dSMartin Matuska children, txgs_size);
5030e716630dSMartin Matuska }
5031e716630dSMartin Matuska
50327877fdebSMatt Macy *tsd = vdrz;
50337877fdebSMatt Macy
50347877fdebSMatt Macy return (0);
50357877fdebSMatt Macy }
50367877fdebSMatt Macy
50377877fdebSMatt Macy static void
vdev_raidz_fini(vdev_t * vd)50387877fdebSMatt Macy vdev_raidz_fini(vdev_t *vd)
50397877fdebSMatt Macy {
5040e716630dSMartin Matuska vdev_raidz_t *vdrz = vd->vdev_tsd;
5041e716630dSMartin Matuska if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre)
5042e716630dSMartin Matuska vd->vdev_spa->spa_raidz_expand = NULL;
5043e716630dSMartin Matuska reflow_node_t *re;
5044e716630dSMartin Matuska void *cookie = NULL;
5045e716630dSMartin Matuska avl_tree_t *tree = &vdrz->vd_expand_txgs;
5046e716630dSMartin Matuska while ((re = avl_destroy_nodes(tree, &cookie)) != NULL)
5047e716630dSMartin Matuska kmem_free(re, sizeof (*re));
5048e716630dSMartin Matuska avl_destroy(&vdrz->vd_expand_txgs);
5049e716630dSMartin Matuska mutex_destroy(&vdrz->vd_expand_lock);
5050e716630dSMartin Matuska mutex_destroy(&vdrz->vn_vre.vre_lock);
5051e716630dSMartin Matuska cv_destroy(&vdrz->vn_vre.vre_cv);
5052e716630dSMartin Matuska zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock);
5053e716630dSMartin Matuska kmem_free(vdrz, sizeof (*vdrz));
50547877fdebSMatt Macy }
50557877fdebSMatt Macy
50567877fdebSMatt Macy /*
50577877fdebSMatt Macy * Add RAIDZ specific fields to the config nvlist.
50587877fdebSMatt Macy */
50597877fdebSMatt Macy static void
vdev_raidz_config_generate(vdev_t * vd,nvlist_t * nv)50607877fdebSMatt Macy vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
50617877fdebSMatt Macy {
50627877fdebSMatt Macy ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
50637877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd;
50647877fdebSMatt Macy
50657877fdebSMatt Macy /*
50667877fdebSMatt Macy * Make sure someone hasn't managed to sneak a fancy new vdev
50677877fdebSMatt Macy * into a crufty old storage pool.
50687877fdebSMatt Macy */
50697877fdebSMatt Macy ASSERT(vdrz->vd_nparity == 1 ||
50707877fdebSMatt Macy (vdrz->vd_nparity <= 2 &&
50717877fdebSMatt Macy spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) ||
50727877fdebSMatt Macy (vdrz->vd_nparity <= 3 &&
50737877fdebSMatt Macy spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3));
50747877fdebSMatt Macy
50757877fdebSMatt Macy /*
50767877fdebSMatt Macy * Note that we'll add these even on storage pools where they
50777877fdebSMatt Macy * aren't strictly required -- older software will just ignore
50787877fdebSMatt Macy * it.
50797877fdebSMatt Macy */
50807877fdebSMatt Macy fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);
5081e716630dSMartin Matuska
5082e716630dSMartin Matuska if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
5083e716630dSMartin Matuska fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
5084e716630dSMartin Matuska }
5085e716630dSMartin Matuska
5086e716630dSMartin Matuska mutex_enter(&vdrz->vd_expand_lock);
5087e716630dSMartin Matuska if (!avl_is_empty(&vdrz->vd_expand_txgs)) {
5088e716630dSMartin Matuska uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs);
5089e716630dSMartin Matuska uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count,
5090e716630dSMartin Matuska KM_SLEEP);
5091e716630dSMartin Matuska uint64_t i = 0;
5092e716630dSMartin Matuska
5093e716630dSMartin Matuska for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs);
5094e716630dSMartin Matuska re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) {
5095e716630dSMartin Matuska txgs[i++] = re->re_txg;
5096e716630dSMartin Matuska }
5097e716630dSMartin Matuska
5098e716630dSMartin Matuska fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
5099e716630dSMartin Matuska txgs, count);
5100e716630dSMartin Matuska
5101e716630dSMartin Matuska kmem_free(txgs, sizeof (uint64_t) * count);
5102e716630dSMartin Matuska }
5103e716630dSMartin Matuska mutex_exit(&vdrz->vd_expand_lock);
51047877fdebSMatt Macy }
51057877fdebSMatt Macy
51067877fdebSMatt Macy static uint64_t
vdev_raidz_nparity(vdev_t * vd)51077877fdebSMatt Macy vdev_raidz_nparity(vdev_t *vd)
51087877fdebSMatt Macy {
51097877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd;
51107877fdebSMatt Macy return (vdrz->vd_nparity);
51117877fdebSMatt Macy }
51127877fdebSMatt Macy
51137877fdebSMatt Macy static uint64_t
vdev_raidz_ndisks(vdev_t * vd)51147877fdebSMatt Macy vdev_raidz_ndisks(vdev_t *vd)
51157877fdebSMatt Macy {
51167877fdebSMatt Macy return (vd->vdev_children);
5117eda14cbcSMatt Macy }
5118eda14cbcSMatt Macy
5119eda14cbcSMatt Macy vdev_ops_t vdev_raidz_ops = {
51207877fdebSMatt Macy .vdev_op_init = vdev_raidz_init,
51217877fdebSMatt Macy .vdev_op_fini = vdev_raidz_fini,
5122eda14cbcSMatt Macy .vdev_op_open = vdev_raidz_open,
5123eda14cbcSMatt Macy .vdev_op_close = vdev_raidz_close,
5124071ab5a1SMartin Matuska .vdev_op_psize_to_asize = vdev_raidz_psize_to_asize,
5125071ab5a1SMartin Matuska .vdev_op_asize_to_psize = vdev_raidz_asize_to_psize,
51267877fdebSMatt Macy .vdev_op_min_asize = vdev_raidz_min_asize,
51277877fdebSMatt Macy .vdev_op_min_alloc = NULL,
5128eda14cbcSMatt Macy .vdev_op_io_start = vdev_raidz_io_start,
5129eda14cbcSMatt Macy .vdev_op_io_done = vdev_raidz_io_done,
5130eda14cbcSMatt Macy .vdev_op_state_change = vdev_raidz_state_change,
5131eda14cbcSMatt Macy .vdev_op_need_resilver = vdev_raidz_need_resilver,
5132eda14cbcSMatt Macy .vdev_op_hold = NULL,
5133eda14cbcSMatt Macy .vdev_op_rele = NULL,
5134eda14cbcSMatt Macy .vdev_op_remap = NULL,
5135eda14cbcSMatt Macy .vdev_op_xlate = vdev_raidz_xlate,
51367877fdebSMatt Macy .vdev_op_rebuild_asize = NULL,
51377877fdebSMatt Macy .vdev_op_metaslab_init = NULL,
51387877fdebSMatt Macy .vdev_op_config_generate = vdev_raidz_config_generate,
51397877fdebSMatt Macy .vdev_op_nparity = vdev_raidz_nparity,
51407877fdebSMatt Macy .vdev_op_ndisks = vdev_raidz_ndisks,
5141eda14cbcSMatt Macy .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */
5142eda14cbcSMatt Macy .vdev_op_leaf = B_FALSE /* not a leaf vdev */
5143eda14cbcSMatt Macy };
5144e716630dSMartin Matuska
5145e716630dSMartin Matuska ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW,
5146e716630dSMartin Matuska "For testing, pause RAIDZ expansion after reflowing this many bytes");
5147e716630dSMartin Matuska ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW,
5148e716630dSMartin Matuska "Max amount of concurrent i/o for RAIDZ expansion");
5149e716630dSMartin Matuska ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW,
5150e716630dSMartin Matuska "For expanded RAIDZ, aggregate reads that have more rows than this");
5151e716630dSMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW,
5152e716630dSMartin Matuska "For expanded RAIDZ, automatically start a pool scrub when expansion "
5153e716630dSMartin Matuska "completes");
5154