1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3 * CDDL HEADER START
4 *
5 * The contents of this file are subject to the terms of the
6 * Common Development and Distribution License (the "License").
7 * You may not use this file except in compliance with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or https://opensource.org/licenses/CDDL-1.0.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22
23 /*
24 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
25 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
26 * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
27 */
28
29 #include <sys/zfs_context.h>
30 #include <sys/spa.h>
31 #include <sys/spa_impl.h>
32 #include <sys/zap.h>
33 #include <sys/vdev_impl.h>
34 #include <sys/metaslab_impl.h>
35 #include <sys/zio.h>
36 #include <sys/zio_checksum.h>
37 #include <sys/dmu_tx.h>
38 #include <sys/abd.h>
39 #include <sys/zfs_rlock.h>
40 #include <sys/fs/zfs.h>
41 #include <sys/fm/fs/zfs.h>
42 #include <sys/vdev_raidz.h>
43 #include <sys/vdev_raidz_impl.h>
44 #include <sys/vdev_draid.h>
45 #include <sys/uberblock_impl.h>
46 #include <sys/dsl_scan.h>
47
48 #ifdef ZFS_DEBUG
49 #include <sys/vdev.h> /* For vdev_xlate() in vdev_raidz_io_verify() */
50 #endif
51
52 /*
53 * Virtual device vector for RAID-Z.
54 *
55 * This vdev supports single, double, and triple parity. For single parity,
56 * we use a simple XOR of all the data columns. For double or triple parity,
57 * we use a special case of Reed-Solomon coding. This extends the
58 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
59 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
60 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
61 * former is also based. The latter is designed to provide higher performance
62 * for writes.
63 *
64 * Note that the Plank paper claimed to support arbitrary N+M, but was then
65 * amended six years later identifying a critical flaw that invalidates its
66 * claims. Nevertheless, the technique can be adapted to work for up to
67 * triple parity. For additional parity, the amendment "Note: Correction to
68 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
69 * is viable, but the additional complexity means that write performance will
70 * suffer.
71 *
72 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
74 * can be expressed with a single byte. Briefly, the operations on the
75 * field are defined as follows:
76 *
77 * o addition (+) is represented by a bitwise XOR
78 * o subtraction (-) is therefore identical to addition: A + B = A - B
79 * o multiplication of A by 2 is defined by the following bitwise expression:
80 *
81 * (A * 2)_7 = A_6
82 * (A * 2)_6 = A_5
83 * (A * 2)_5 = A_4
84 * (A * 2)_4 = A_3 + A_7
85 * (A * 2)_3 = A_2 + A_7
86 * (A * 2)_2 = A_1 + A_7
87 * (A * 2)_1 = A_0
88 * (A * 2)_0 = A_7
89 *
90 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
91 * As an aside, this multiplication is derived from the error correcting
92 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
93 *
94 * Observe that any number in the field (except for 0) can be expressed as a
95 * power of 2 -- a generator for the field. We store a table of the powers of
96 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
97 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
98 * than field addition). The inverse of a field element A (A^-1) is therefore
99 * A ^ (255 - 1) = A^254.
100 *
101 * The up-to-three parity columns, P, Q, R over several data columns,
102 * D_0, ... D_n-1, can be expressed by field operations:
103 *
104 * P = D_0 + D_1 + ... + D_n-2 + D_n-1
105 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
106 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
107 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
108 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
109 *
110 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
111 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
112 * independent coefficients. (There are no additional coefficients that have
113 * this property which is why the uncorrected Plank method breaks down.)
114 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
117 */
118
/* Positions of the P, Q and R parity columns within a row. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Galois field multiplication of a single byte by 2 and by 4: shift left by
 * one and, when bit 7 of the input was set, fold in 0x1d.  The result is not
 * truncated to eight bits here.
 */
#define	VDEV_RAIDZ_MUL_2(x)	((((x) & 0x80) != 0 ? 0x1d : 0) ^ ((x) << 1))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
125
/*
 * Field multiplication applied to all eight bytes of a 64-bit value at once
 * instead of one byte at a time.  The top bit of every byte is gathered into
 * "mask", each set top bit is widened into a full 0xff byte, and that mask
 * selects the bytes which receive the XOR with 0x1d after the shift.
 *
 * Both arguments must be lvalues; "mask" is clobbered.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = ((mask) & 0x1d1d1d1d1d1d1d1dULL) ^ \
	    (((x) << 1) & 0xfefefefefefefefeULL); \
}

/* Multiply by 4 by multiplying by 2 twice. */
#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
145
146
147 /*
148 * Big Theory Statement for how a RAIDZ VDEV is expanded
149 *
150 * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion
151 * works with all three RAIDZ parity choices, including RAIDZ1, 2, or 3. VDEVs
152 * that have been previously expanded can be expanded again.
153 *
154 * The RAIDZ VDEV must be healthy (must be able to write to all the drives in
155 * the VDEV) when an expansion starts. And the expansion will pause if any
156 * disk in the VDEV fails, and resume once the VDEV is healthy again. All other
157 * operations on the pool can continue while an expansion is in progress (e.g.
 * read/write, snapshot, zpool add, etc), except zpool checkpoint, zpool trim,
 * and zpool initialize, which can't be run during an expansion. Following a
160 * reboot or export/import, the expansion resumes where it left off.
161 *
162 * == Reflowing the Data ==
163 *
164 * The expansion involves reflowing (copying) the data from the current set
165 * of disks to spread it across the new set which now has one more disk. This
166 * reflow operation is similar to reflowing text when the column width of a
167 * text editor window is expanded. The text doesn’t change but the location of
168 * the text changes to accommodate the new width. An example reflow result for
169 * a 4-wide RAIDZ1 to a 5-wide is shown below.
170 *
171 * Reflow End State
172 * Each letter indicates a parity group (logical stripe)
173 *
174 * Before expansion After Expansion
175 * D1 D2 D3 D4 D1 D2 D3 D4 D5
176 * +------+------+------+------+ +------+------+------+------+------+
177 * | | | | | | | | | | |
178 * | A | A | A | A | | A | A | A | A | B |
179 * | 1| 2| 3| 4| | 1| 2| 3| 4| 5|
180 * +------+------+------+------+ +------+------+------+------+------+
181 * | | | | | | | | | | |
182 * | B | B | C | C | | B | C | C | C | C |
183 * | 5| 6| 7| 8| | 6| 7| 8| 9| 10|
184 * +------+------+------+------+ +------+------+------+------+------+
185 * | | | | | | | | | | |
186 * | C | C | D | D | | D | D | E | E | E |
187 * | 9| 10| 11| 12| | 11| 12| 13| 14| 15|
188 * +------+------+------+------+ +------+------+------+------+------+
189 * | | | | | | | | | | |
190 * | E | E | E | E | --> | E | F | F | G | G |
191 * | 13| 14| 15| 16| | 16| 17| 18|p 19| 20|
192 * +------+------+------+------+ +------+------+------+------+------+
193 * | | | | | | | | | | |
194 * | F | F | G | G | | G | G | H | H | H |
195 * | 17| 18| 19| 20| | 21| 22| 23| 24| 25|
196 * +------+------+------+------+ +------+------+------+------+------+
197 * | | | | | | | | | | |
198 * | G | G | H | H | | H | I | I | J | J |
199 * | 21| 22| 23| 24| | 26| 27| 28| 29| 30|
200 * +------+------+------+------+ +------+------+------+------+------+
201 * | | | | | | | | | | |
202 * | H | H | I | I | | J | J | | | K |
203 * | 25| 26| 27| 28| | 31| 32| 33| 34| 35|
204 * +------+------+------+------+ +------+------+------+------+------+
205 *
206 * This reflow approach has several advantages. There is no need to read or
207 * modify the block pointers or recompute any block checksums. The reflow
208 * doesn’t need to know where the parity sectors reside. We can read and write
209 * data sequentially and the copy can occur in a background thread in open
210 * context. The design also allows for fast discovery of what data to copy.
211 *
212 * The VDEV metaslabs are processed, one at a time, to copy the block data to
213 * have it flow across all the disks. The metaslab is disabled for allocations
214 * during the copy. As an optimization, we only copy the allocated data which
215 * can be determined by looking at the metaslab range tree. During the copy we
216 * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still
217 * need to be able to survive losing parity count disks). This means we
218 * cannot overwrite data during the reflow that would be needed if a disk is
219 * lost.
220 *
221 * After the reflow completes, all newly-written blocks will have the new
222 * layout, i.e., they will have the parity to data ratio implied by the new
223 * number of disks in the RAIDZ group. Even though the reflow copies all of
224 * the allocated space (data and parity), it is only rearranged, not changed.
225 *
226 * This act of reflowing the data has a few implications about blocks
227 * that were written before the reflow completes:
228 *
229 * - Old blocks will still use the same amount of space (i.e., they will have
230 * the parity to data ratio implied by the old number of disks in the RAIDZ
231 * group).
232 * - Reading old blocks will be slightly slower than before the reflow, for
233 * two reasons. First, we will have to read from all disks in the RAIDZ
234 * VDEV, rather than being able to skip the children that contain only
235 * parity of this block (because the data of a single block is now spread
236 * out across all the disks). Second, in most cases there will be an extra
237 * bcopy, needed to rearrange the data back to its original layout in memory.
238 *
239 * == Scratch Area ==
240 *
241 * As we copy the block data, we can only progress to the point that writes
242 * will not overlap with blocks whose progress has not yet been recorded on
243 * disk. Since partially-copied rows are always read from the old location,
244 * we need to stop one row before the sector-wise overlap, to prevent any
245 * row-wise overlap. For example, in the diagram above, when we reflow sector
 * B6 it will overwrite the original location for B5.
247 *
248 * To get around this, a scratch space is used so that we can start copying
249 * without risking data loss by overlapping the row. As an added benefit, it
250 * improves performance at the beginning of the reflow, but that small perf
251 * boost wouldn't be worth the complexity on its own.
252 *
253 * Ideally we want to copy at least 2 * (new_width)^2 so that we have a
254 * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max
255 * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice
 * the widths will likely be single digits so we can get a substantial chunk
257 * size using only a few MB of scratch per disk.
258 *
259 * The scratch area is persisted to disk which holds a large amount of reflowed
260 * state. We can always read the partially written stripes when a disk fails or
261 * the copy is interrupted (crash) during the initial copying phase and also
262 * get past a small chunk size restriction. At a minimum, the scratch space
263 * must be large enough to get us to the point that one row does not overlap
 * itself when moved (i.e., new_width^2). But going larger is even better. We
265 * use the 3.5 MiB reserved "boot" space that resides after the ZFS disk labels
266 * as our scratch space to handle overwriting the initial part of the VDEV.
267 *
268 * 0 256K 512K 4M
269 * +------+------+-----------------------+-----------------------------
270 * | VDEV | VDEV | Boot Block (3.5M) | Allocatable space ...
271 * | L0 | L1 | Reserved | (Metaslabs)
272 * +------+------+-----------------------+-------------------------------
273 * Scratch Area
274 *
275 * == Reflow Progress Updates ==
276 * After the initial scratch-based reflow, the expansion process works
277 * similarly to device removal. We create a new open context thread which
278 * reflows the data, and periodically kicks off sync tasks to update logical
279 * state. In this case, state is the committed progress (offset of next data
280 * to copy). We need to persist the completed offset on disk, so that if we
281 * crash we know which format each VDEV offset is in.
282 *
283 * == Time Dependent Geometry ==
284 *
285 * In non-expanded RAIDZ, blocks are read from disk in a column by column
286 * fashion. For a multi-row block, the second sector is in the first column
287 * not in the second column. This allows us to issue full reads for each
288 * column directly into the request buffer. The block data is thus laid out
289 * sequentially in a column-by-column fashion.
290 *
291 * For example, in the before expansion diagram above, one logical block might
292 * be sectors G19-H26. The parity is in G19,H23; and the data is in
293 * G20,H24,G21,H25,G22,H26.
294 *
295 * After a block is reflowed, the sectors that were all in the original column
296 * data can now reside in different columns. When reading from an expanded
297 * VDEV, we need to know the logical stripe width for each block so we can
298 * reconstitute the block’s data after the reads are completed. Likewise,
299 * when we perform the combinatorial reconstruction we need to know the
300 * original width so we can retry combinations from the past layouts.
301 *
302 * Time dependent geometry is what we call having blocks with different layouts
303 * (stripe widths) in the same VDEV. This time-dependent geometry uses the
304 * block’s birth time (+ the time expansion ended) to establish the correct
305 * width for a given block. After an expansion completes, we record the time
306 * for blocks written with a particular width (geometry).
307 *
308 * == On Disk Format Changes ==
309 *
310 * New pool feature flag, 'raidz_expansion' whose reference count is the number
311 * of RAIDZ VDEVs that have been expanded.
312 *
313 * The blocks on expanded RAIDZ VDEV can have different logical stripe widths.
314 *
 * Since the uberblock can point to arbitrary blocks, which might be on the
 * expanding RAIDZ, and might or might not have been expanded, we need to know
 * which way a block is laid out before reading it. This info is the next
 * offset that needs to be reflowed and we persist that in the uberblock, in
 * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev label.
 * After the expansion is complete, we then use the raidz_expand_txgs array
 * (see below) to determine how to read a block and the ub_raidz_reflow_info
 * field is no longer required.
323 *
324 * The uberblock's ub_raidz_reflow_info field also holds the scratch space
325 * state (i.e., active or not) which is also required before reading a block
326 * during the initial phase of reflowing the data.
327 *
328 * The top-level RAIDZ VDEV has two new entries in the nvlist:
329 *
330 * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here
331 * and used after the expansion is complete to
332 * determine how to read a raidz block
333 * 'raidz_expanding' boolean: present during reflow and removed after completion
334 * used during a spa import to resume an unfinished
335 * expansion
336 *
337 * And finally the VDEVs top zap adds the following informational entries:
338 * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE
339 * VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME
340 * VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME
341 * VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED
342 */
343
344 /*
345 * For testing only: pause the raidz expansion after reflowing this amount.
346 * (accessed by ZTS and ztest)
347 */
348 #ifdef _KERNEL
349 static
350 #endif /* _KERNEL */
351 unsigned long raidz_expand_max_reflow_bytes = 0;
352
353 /*
354 * For testing only: pause the raidz expansion at a certain point.
355 */
356 uint_t raidz_expand_pause_point = 0;
357
358 /*
359 * Maximum amount of copy io's outstanding at once.
360 */
361 #ifdef _ILP32
362 static unsigned long raidz_expand_max_copy_bytes = SPA_MAXBLOCKSIZE;
363 #else
364 static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE;
365 #endif
366
367 /*
368 * Apply raidz map abds aggregation if the number of rows in the map is equal
369 * or greater than the value below.
370 */
371 static unsigned long raidz_io_aggregate_rows = 4;
372
373 /*
374 * Automatically start a pool scrub when a RAIDZ expansion completes in
375 * order to verify the checksums of all blocks which have been copied
376 * during the expansion. Automatic scrubbing is enabled by default and
377 * is strongly recommended.
378 */
379 static int zfs_scrub_after_expand = 1;
380
381 static void
vdev_raidz_row_free(raidz_row_t * rr)382 vdev_raidz_row_free(raidz_row_t *rr)
383 {
384 for (int c = 0; c < rr->rr_cols; c++) {
385 raidz_col_t *rc = &rr->rr_col[c];
386
387 if (rc->rc_size != 0)
388 abd_free(rc->rc_abd);
389 if (rc->rc_orig_data != NULL)
390 abd_free(rc->rc_orig_data);
391 }
392
393 if (rr->rr_abd_empty != NULL)
394 abd_free(rr->rr_abd_empty);
395
396 kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols]));
397 }
398
399 void
vdev_raidz_map_free(raidz_map_t * rm)400 vdev_raidz_map_free(raidz_map_t *rm)
401 {
402 for (int i = 0; i < rm->rm_nrows; i++)
403 vdev_raidz_row_free(rm->rm_row[i]);
404
405 if (rm->rm_nphys_cols) {
406 for (int i = 0; i < rm->rm_nphys_cols; i++) {
407 if (rm->rm_phys_col[i].rc_abd != NULL)
408 abd_free(rm->rm_phys_col[i].rc_abd);
409 }
410
411 kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) *
412 rm->rm_nphys_cols);
413 }
414
415 ASSERT0P(rm->rm_lr);
416 kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
417 }
418
419 static void
vdev_raidz_map_free_vsd(zio_t * zio)420 vdev_raidz_map_free_vsd(zio_t *zio)
421 {
422 raidz_map_t *rm = zio->io_vsd;
423
424 vdev_raidz_map_free(rm);
425 }
426
427 static int
vdev_raidz_reflow_compare(const void * x1,const void * x2)428 vdev_raidz_reflow_compare(const void *x1, const void *x2)
429 {
430 const reflow_node_t *l = x1;
431 const reflow_node_t *r = x2;
432
433 return (TREE_CMP(l->re_txg, r->re_txg));
434 }
435
436 const zio_vsd_ops_t vdev_raidz_vsd_ops = {
437 .vsd_free = vdev_raidz_map_free_vsd,
438 };
439
440 raidz_row_t *
vdev_raidz_row_alloc(int cols,zio_t * zio)441 vdev_raidz_row_alloc(int cols, zio_t *zio)
442 {
443 raidz_row_t *rr =
444 kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP);
445
446 rr->rr_cols = cols;
447 rr->rr_scols = cols;
448
449 for (int c = 0; c < cols; c++) {
450 raidz_col_t *rc = &rr->rr_col[c];
451 rc->rc_shadow_devidx = INT_MAX;
452 rc->rc_shadow_offset = UINT64_MAX;
453 /*
454 * We can not allow self healing to take place for Direct I/O
455 * reads. There is nothing that stops the buffer contents from
456 * being manipulated while the I/O is in flight. It is possible
457 * that the checksum could be verified on the buffer and then
458 * the contents of that buffer are manipulated afterwards. This
459 * could lead to bad data being written out during self
460 * healing.
461 */
462 if (!(zio->io_flags & ZIO_FLAG_DIO_READ))
463 rc->rc_allow_repair = 1;
464 }
465 return (rr);
466 }
467
468 static void
vdev_raidz_map_alloc_write(zio_t * zio,raidz_map_t * rm,uint64_t ashift)469 vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift)
470 {
471 int c;
472 int nwrapped = 0;
473 uint64_t off = 0;
474 raidz_row_t *rr = rm->rm_row[0];
475
476 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
477 ASSERT3U(rm->rm_nrows, ==, 1);
478
479 /*
480 * Pad any parity columns with additional space to account for skip
481 * sectors.
482 */
483 if (rm->rm_skipstart < rr->rr_firstdatacol) {
484 ASSERT0(rm->rm_skipstart);
485 nwrapped = rm->rm_nskip;
486 } else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) {
487 nwrapped =
488 (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols;
489 }
490
491 /*
492 * Optional single skip sectors (rc_size == 0) will be handled in
493 * vdev_raidz_io_start_write().
494 */
495 int skipped = rr->rr_scols - rr->rr_cols;
496
497 /* Allocate buffers for the parity columns */
498 for (c = 0; c < rr->rr_firstdatacol; c++) {
499 raidz_col_t *rc = &rr->rr_col[c];
500
501 /*
502 * Parity columns will pad out a linear ABD to account for
503 * the skip sector. A linear ABD is used here because
504 * parity calculations use the ABD buffer directly to calculate
505 * parity. This avoids doing a memcpy back to the ABD after the
506 * parity has been calculated. By issuing the parity column
507 * with the skip sector we can reduce contention on the child
508 * VDEV queue locks (vq_lock).
509 */
510 if (c < nwrapped) {
511 rc->rc_abd = abd_alloc_linear(
512 rc->rc_size + (1ULL << ashift), B_FALSE);
513 abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift);
514 skipped++;
515 } else {
516 rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
517 }
518 }
519
520 for (off = 0; c < rr->rr_cols; c++) {
521 raidz_col_t *rc = &rr->rr_col[c];
522 abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct,
523 zio->io_abd, off, rc->rc_size);
524
525 /*
526 * Generate I/O for skip sectors to improve aggregation
527 * continuity. We will use gang ABD's to reduce contention
528 * on the child VDEV queue locks (vq_lock) by issuing
529 * a single I/O that contains the data and skip sector.
530 *
531 * It is important to make sure that rc_size is not updated
532 * even though we are adding a skip sector to the ABD. When
533 * calculating the parity in vdev_raidz_generate_parity_row()
534 * the rc_size is used to iterate through the ABD's. We can
535 * not have zero'd out skip sectors used for calculating
536 * parity for raidz, because those same sectors are not used
537 * during reconstruction.
538 */
539 if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) {
540 rc->rc_abd = abd_alloc_gang();
541 abd_gang_add(rc->rc_abd, abd, B_TRUE);
542 abd_gang_add(rc->rc_abd,
543 abd_get_zeros(1ULL << ashift), B_TRUE);
544 skipped++;
545 } else {
546 rc->rc_abd = abd;
547 }
548 off += rc->rc_size;
549 }
550
551 ASSERT3U(off, ==, zio->io_size);
552 ASSERT3S(skipped, ==, rm->rm_nskip);
553 }
554
555 static void
vdev_raidz_map_alloc_read(zio_t * zio,raidz_map_t * rm)556 vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm)
557 {
558 int c;
559 raidz_row_t *rr = rm->rm_row[0];
560
561 ASSERT3U(rm->rm_nrows, ==, 1);
562
563 /* Allocate buffers for the parity columns */
564 for (c = 0; c < rr->rr_firstdatacol; c++)
565 rr->rr_col[c].rc_abd =
566 abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);
567
568 for (uint64_t off = 0; c < rr->rr_cols; c++) {
569 raidz_col_t *rc = &rr->rr_col[c];
570 rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
571 zio->io_abd, off, rc->rc_size);
572 off += rc->rc_size;
573 }
574 }
575
576 /*
577 * Divides the IO evenly across all child vdevs; usually, dcols is
578 * the number of children in the target vdev.
579 *
580 * Avoid inlining the function to keep vdev_raidz_io_start(), which
581 * is this functions only caller, as small as possible on the stack.
582 */
583 noinline raidz_map_t *
vdev_raidz_map_alloc(zio_t * zio,uint64_t ashift,uint64_t dcols,uint64_t nparity)584 vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
585 uint64_t nparity)
586 {
587 raidz_row_t *rr;
588 /* The starting RAIDZ (parent) vdev sector of the block. */
589 uint64_t b = zio->io_offset >> ashift;
590 /* The zio's size in units of the vdev's minimum sector size. */
591 uint64_t s = zio->io_size >> ashift;
592 /* The first column for this stripe. */
593 uint64_t f = b % dcols;
594 /* The starting byte offset on each child vdev. */
595 uint64_t o = (b / dcols) << ashift;
596 uint64_t acols, scols;
597
598 raidz_map_t *rm =
599 kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
600 rm->rm_nrows = 1;
601
602 /*
603 * "Quotient": The number of data sectors for this stripe on all but
604 * the "big column" child vdevs that also contain "remainder" data.
605 */
606 uint64_t q = s / (dcols - nparity);
607
608 /*
609 * "Remainder": The number of partial stripe data sectors in this I/O.
610 * This will add a sector to some, but not all, child vdevs.
611 */
612 uint64_t r = s - q * (dcols - nparity);
613
614 /* The number of "big columns" - those which contain remainder data. */
615 uint64_t bc = (r == 0 ? 0 : r + nparity);
616
617 /*
618 * The total number of data and parity sectors associated with
619 * this I/O.
620 */
621 uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
622
623 /*
624 * acols: The columns that will be accessed.
625 * scols: The columns that will be accessed or skipped.
626 */
627 if (q == 0) {
628 /* Our I/O request doesn't span all child vdevs. */
629 acols = bc;
630 scols = MIN(dcols, roundup(bc, nparity + 1));
631 } else {
632 acols = dcols;
633 scols = dcols;
634 }
635
636 ASSERT3U(acols, <=, scols);
637 rr = vdev_raidz_row_alloc(scols, zio);
638 rm->rm_row[0] = rr;
639 rr->rr_cols = acols;
640 rr->rr_bigcols = bc;
641 rr->rr_firstdatacol = nparity;
642 #ifdef ZFS_DEBUG
643 rr->rr_offset = zio->io_offset;
644 rr->rr_size = zio->io_size;
645 #endif
646
647 uint64_t asize = 0;
648
649 for (uint64_t c = 0; c < scols; c++) {
650 raidz_col_t *rc = &rr->rr_col[c];
651 uint64_t col = f + c;
652 uint64_t coff = o;
653 if (col >= dcols) {
654 col -= dcols;
655 coff += 1ULL << ashift;
656 }
657 rc->rc_devidx = col;
658 rc->rc_offset = coff;
659
660 if (c >= acols)
661 rc->rc_size = 0;
662 else if (c < bc)
663 rc->rc_size = (q + 1) << ashift;
664 else
665 rc->rc_size = q << ashift;
666
667 asize += rc->rc_size;
668 }
669
670 ASSERT3U(asize, ==, tot << ashift);
671 rm->rm_nskip = roundup(tot, nparity + 1) - tot;
672 rm->rm_skipstart = bc;
673
674 /*
675 * If all data stored spans all columns, there's a danger that parity
676 * will always be on the same device and, since parity isn't read
677 * during normal operation, that device's I/O bandwidth won't be
678 * used effectively. We therefore switch the parity every 1MB.
679 *
680 * ... at least that was, ostensibly, the theory. As a practical
681 * matter unless we juggle the parity between all devices evenly, we
682 * won't see any benefit. Further, occasional writes that aren't a
683 * multiple of the LCM of the number of children and the minimum
684 * stripe width are sufficient to avoid pessimal behavior.
685 * Unfortunately, this decision created an implicit on-disk format
686 * requirement that we need to support for all eternity, but only
687 * for single-parity RAID-Z.
688 *
689 * If we intend to skip a sector in the zeroth column for padding
690 * we must make sure to note this swap. We will never intend to
691 * skip the first column since at least one data and one parity
692 * column must appear in each row.
693 */
694 ASSERT(rr->rr_cols >= 2);
695 ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
696
697 if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
698 uint64_t devidx = rr->rr_col[0].rc_devidx;
699 o = rr->rr_col[0].rc_offset;
700 rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
701 rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
702 rr->rr_col[1].rc_devidx = devidx;
703 rr->rr_col[1].rc_offset = o;
704 if (rm->rm_skipstart == 0)
705 rm->rm_skipstart = 1;
706 }
707
708 if (zio->io_type == ZIO_TYPE_WRITE) {
709 vdev_raidz_map_alloc_write(zio, rm, ashift);
710 } else {
711 vdev_raidz_map_alloc_read(zio, rm);
712 }
713 /* init RAIDZ parity ops */
714 rm->rm_ops = vdev_raidz_math_get_ops();
715
716 return (rm);
717 }
718
719 /*
720 * Everything before reflow_offset_synced should have been moved to the new
721 * location (read and write completed). However, this may not yet be reflected
722 * in the on-disk format (e.g. raidz_reflow_sync() has been called but the
723 * uberblock has not yet been written). If reflow is not in progress,
724 * reflow_offset_synced should be UINT64_MAX. For each row, if the row is
725 * entirely before reflow_offset_synced, it will come from the new location.
726 * Otherwise this row will come from the old location. Therefore, rows that
727 * straddle the reflow_offset_synced will come from the old location.
728 *
729 * For writes, reflow_offset_next is the next offset to copy. If a sector has
730 * been copied, but not yet reflected in the on-disk progress
731 * (reflow_offset_synced), it will also be written to the new (already copied)
732 * offset.
733 */
734 noinline raidz_map_t *
vdev_raidz_map_alloc_expanded(zio_t * zio,uint64_t ashift,uint64_t physical_cols,uint64_t logical_cols,uint64_t nparity,uint64_t reflow_offset_synced,uint64_t reflow_offset_next,boolean_t use_scratch)735 vdev_raidz_map_alloc_expanded(zio_t *zio,
736 uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
737 uint64_t nparity, uint64_t reflow_offset_synced,
738 uint64_t reflow_offset_next, boolean_t use_scratch)
739 {
740 abd_t *abd = zio->io_abd;
741 uint64_t offset = zio->io_offset;
742 uint64_t size = zio->io_size;
743
744 /* The zio's size in units of the vdev's minimum sector size. */
745 uint64_t s = size >> ashift;
746
747 /*
748 * "Quotient": The number of data sectors for this stripe on all but
749 * the "big column" child vdevs that also contain "remainder" data.
750 * AKA "full rows"
751 */
752 uint64_t q = s / (logical_cols - nparity);
753
754 /*
755 * "Remainder": The number of partial stripe data sectors in this I/O.
756 * This will add a sector to some, but not all, child vdevs.
757 */
758 uint64_t r = s - q * (logical_cols - nparity);
759
760 /* The number of "big columns" - those which contain remainder data. */
761 uint64_t bc = (r == 0 ? 0 : r + nparity);
762
763 /*
764 * The total number of data and parity sectors associated with
765 * this I/O.
766 */
767 uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
768
769 /* How many rows contain data (not skip) */
770 uint64_t rows = howmany(tot, logical_cols);
771 int cols = MIN(tot, logical_cols);
772
773 raidz_map_t *rm =
774 kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
775 KM_SLEEP);
776 rm->rm_nrows = rows;
777 rm->rm_nskip = roundup(tot, nparity + 1) - tot;
778 rm->rm_skipstart = bc;
779 uint64_t asize = 0;
780
781 for (uint64_t row = 0; row < rows; row++) {
782 boolean_t row_use_scratch = B_FALSE;
783 raidz_row_t *rr = vdev_raidz_row_alloc(cols, zio);
784 rm->rm_row[row] = rr;
785
786 /* The starting RAIDZ (parent) vdev sector of the row. */
787 uint64_t b = (offset >> ashift) + row * logical_cols;
788
789 /*
790 * If we are in the middle of a reflow, and the copying has
791 * not yet completed for any part of this row, then use the
792 * old location of this row. Note that reflow_offset_synced
793 * reflects the i/o that's been completed, because it's
794 * updated by a synctask, after zio_wait(spa_txg_zio[]).
795 * This is sufficient for our check, even if that progress
796 * has not yet been recorded to disk (reflected in
797 * spa_ubsync). Also note that we consider the last row to
798 * be "full width" (`cols`-wide rather than `bc`-wide) for
799 * this calculation. This causes a tiny bit of unnecessary
800 * double-writes but is safe and simpler to calculate.
801 */
802 int row_phys_cols = physical_cols;
803 if (b + cols > reflow_offset_synced >> ashift)
804 row_phys_cols--;
805 else if (use_scratch)
806 row_use_scratch = B_TRUE;
807
808 /* starting child of this row */
809 uint64_t child_id = b % row_phys_cols;
810 /* The starting byte offset on each child vdev. */
811 uint64_t child_offset = (b / row_phys_cols) << ashift;
812
813 /*
814 * Note, rr_cols is the entire width of the block, even
815 * if this row is shorter. This is needed because parity
816 * generation (for Q and R) needs to know the entire width,
817 * because it treats the short row as though it was
818 * full-width (and the "phantom" sectors were zero-filled).
819 *
820 * Another approach to this would be to set cols shorter
821 * (to just the number of columns that we might do i/o to)
822 * and have another mechanism to tell the parity generation
823 * about the "entire width". Reconstruction (at least
824 * vdev_raidz_reconstruct_general()) would also need to
825 * know about the "entire width".
826 */
827 rr->rr_firstdatacol = nparity;
828 #ifdef ZFS_DEBUG
829 /*
830 * note: rr_size is PSIZE, not ASIZE
831 */
832 rr->rr_offset = b << ashift;
833 rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift;
834 #endif
835
836 for (int c = 0; c < rr->rr_cols; c++, child_id++) {
837 if (child_id >= row_phys_cols) {
838 child_id -= row_phys_cols;
839 child_offset += 1ULL << ashift;
840 }
841 raidz_col_t *rc = &rr->rr_col[c];
842 rc->rc_devidx = child_id;
843 rc->rc_offset = child_offset;
844
845 /*
846 * Get this from the scratch space if appropriate.
847 * This only happens if we crashed in the middle of
848 * raidz_reflow_scratch_sync() (while it's running,
849 * the rangelock prevents us from doing concurrent
850 * io), and even then only during zpool import or
851 * when the pool is imported readonly.
852 */
853 if (row_use_scratch)
854 rc->rc_offset -= VDEV_BOOT_SIZE;
855
856 uint64_t dc = c - rr->rr_firstdatacol;
857 if (c < rr->rr_firstdatacol) {
858 rc->rc_size = 1ULL << ashift;
859
860 /*
861 * Parity sectors' rc_abd's are set below
862 * after determining if this is an aggregation.
863 */
864 } else if (row == rows - 1 && bc != 0 && c >= bc) {
865 /*
866 * Past the end of the block (even including
867 * skip sectors). This sector is part of the
868 * map so that we have full rows for p/q parity
869 * generation.
870 */
871 rc->rc_size = 0;
872 rc->rc_abd = NULL;
873 } else {
874 /* "data column" (col excluding parity) */
875 uint64_t off;
876
877 if (c < bc || r == 0) {
878 off = dc * rows + row;
879 } else {
880 off = r * rows +
881 (dc - r) * (rows - 1) + row;
882 }
883 rc->rc_size = 1ULL << ashift;
884 rc->rc_abd = abd_get_offset_struct(
885 &rc->rc_abdstruct, abd, off << ashift,
886 rc->rc_size);
887 }
888
889 if (rc->rc_size == 0)
890 continue;
891
892 /*
893 * If any part of this row is in both old and new
894 * locations, the primary location is the old
895 * location. If this sector was already copied to the
896 * new location, we need to also write to the new,
897 * "shadow" location.
898 *
899 * Note, `row_phys_cols != physical_cols` indicates
900 * that the primary location is the old location.
901 * `b+c < reflow_offset_next` indicates that the copy
902 * to the new location has been initiated. We know
903 * that the copy has completed because we have the
904 * rangelock, which is held exclusively while the
905 * copy is in progress.
906 */
907 if (row_use_scratch ||
908 (row_phys_cols != physical_cols &&
909 b + c < reflow_offset_next >> ashift)) {
910 rc->rc_shadow_devidx = (b + c) % physical_cols;
911 rc->rc_shadow_offset =
912 ((b + c) / physical_cols) << ashift;
913 if (row_use_scratch)
914 rc->rc_shadow_offset -= VDEV_BOOT_SIZE;
915 }
916
917 asize += rc->rc_size;
918 }
919
920 /*
921 * See comment in vdev_raidz_map_alloc()
922 */
923 if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
924 (offset & (1ULL << 20))) {
925 ASSERT(rr->rr_cols >= 2);
926 ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
927
928 int devidx0 = rr->rr_col[0].rc_devidx;
929 uint64_t offset0 = rr->rr_col[0].rc_offset;
930 int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx;
931 uint64_t shadow_offset0 =
932 rr->rr_col[0].rc_shadow_offset;
933
934 rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
935 rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
936 rr->rr_col[0].rc_shadow_devidx =
937 rr->rr_col[1].rc_shadow_devidx;
938 rr->rr_col[0].rc_shadow_offset =
939 rr->rr_col[1].rc_shadow_offset;
940
941 rr->rr_col[1].rc_devidx = devidx0;
942 rr->rr_col[1].rc_offset = offset0;
943 rr->rr_col[1].rc_shadow_devidx = shadow_devidx0;
944 rr->rr_col[1].rc_shadow_offset = shadow_offset0;
945 }
946 }
947 ASSERT3U(asize, ==, tot << ashift);
948
949 /*
950 * Determine if the block is contiguous, in which case we can use
951 * an aggregation.
952 */
953 if (rows >= raidz_io_aggregate_rows) {
954 rm->rm_nphys_cols = physical_cols;
955 rm->rm_phys_col =
956 kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols,
957 KM_SLEEP);
958
959 /*
960 * Determine the aggregate io's offset and size, and check
961 * that the io is contiguous.
962 */
963 for (int i = 0;
964 i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) {
965 raidz_row_t *rr = rm->rm_row[i];
966 for (int c = 0; c < rr->rr_cols; c++) {
967 raidz_col_t *rc = &rr->rr_col[c];
968 raidz_col_t *prc =
969 &rm->rm_phys_col[rc->rc_devidx];
970
971 if (rc->rc_size == 0)
972 continue;
973
974 if (prc->rc_size == 0) {
975 ASSERT0(prc->rc_offset);
976 prc->rc_offset = rc->rc_offset;
977 } else if (prc->rc_offset + prc->rc_size !=
978 rc->rc_offset) {
979 /*
980 * This block is not contiguous and
981 * therefore can't be aggregated.
982 * This is expected to be rare, so
983 * the cost of allocating and then
984 * freeing rm_phys_col is not
985 * significant.
986 */
987 kmem_free(rm->rm_phys_col,
988 sizeof (raidz_col_t) *
989 rm->rm_nphys_cols);
990 rm->rm_phys_col = NULL;
991 rm->rm_nphys_cols = 0;
992 break;
993 }
994 prc->rc_size += rc->rc_size;
995 }
996 }
997 }
998 if (rm->rm_phys_col != NULL) {
999 /*
1000 * Allocate aggregate ABD's.
1001 */
1002 for (int i = 0; i < rm->rm_nphys_cols; i++) {
1003 raidz_col_t *prc = &rm->rm_phys_col[i];
1004
1005 prc->rc_devidx = i;
1006
1007 if (prc->rc_size == 0)
1008 continue;
1009
1010 prc->rc_abd =
1011 abd_alloc_linear(rm->rm_phys_col[i].rc_size,
1012 B_FALSE);
1013 }
1014
1015 /*
1016 * Point the parity abd's into the aggregate abd's.
1017 */
1018 for (int i = 0; i < rm->rm_nrows; i++) {
1019 raidz_row_t *rr = rm->rm_row[i];
1020 for (int c = 0; c < rr->rr_firstdatacol; c++) {
1021 raidz_col_t *rc = &rr->rr_col[c];
1022 raidz_col_t *prc =
1023 &rm->rm_phys_col[rc->rc_devidx];
1024 rc->rc_abd =
1025 abd_get_offset_struct(&rc->rc_abdstruct,
1026 prc->rc_abd,
1027 rc->rc_offset - prc->rc_offset,
1028 rc->rc_size);
1029 }
1030 }
1031 } else {
1032 /*
1033 * Allocate new abd's for the parity sectors.
1034 */
1035 for (int i = 0; i < rm->rm_nrows; i++) {
1036 raidz_row_t *rr = rm->rm_row[i];
1037 for (int c = 0; c < rr->rr_firstdatacol; c++) {
1038 raidz_col_t *rc = &rr->rr_col[c];
1039 rc->rc_abd =
1040 abd_alloc_linear(rc->rc_size,
1041 B_TRUE);
1042 }
1043 }
1044 }
1045 /* init RAIDZ parity ops */
1046 rm->rm_ops = vdev_raidz_math_get_ops();
1047
1048 return (rm);
1049 }
1050
/*
 * Cursor state handed to vdev_raidz_p_func(), vdev_raidz_pq_func() and
 * vdev_raidz_pqr_func() through their "private" argument.  Each callback
 * advances the pointers it uses by one word per source word consumed and
 * asserts that the pointers it does not use are NULL.
 */
struct pqr_struct {
	uint64_t *p;	/* next P parity word to update */
	uint64_t *q;	/* next Q parity word to update (NULL for P only) */
	uint64_t *r;	/* next R parity word to update (NULL for P, PQ) */
};
1056
1057 static int
vdev_raidz_p_func(void * buf,size_t size,void * private)1058 vdev_raidz_p_func(void *buf, size_t size, void *private)
1059 {
1060 struct pqr_struct *pqr = private;
1061 const uint64_t *src = buf;
1062 int cnt = size / sizeof (src[0]);
1063
1064 ASSERT(pqr->p && !pqr->q && !pqr->r);
1065
1066 for (int i = 0; i < cnt; i++, src++, pqr->p++)
1067 *pqr->p ^= *src;
1068
1069 return (0);
1070 }
1071
1072 static int
vdev_raidz_pq_func(void * buf,size_t size,void * private)1073 vdev_raidz_pq_func(void *buf, size_t size, void *private)
1074 {
1075 struct pqr_struct *pqr = private;
1076 const uint64_t *src = buf;
1077 uint64_t mask;
1078 int cnt = size / sizeof (src[0]);
1079
1080 ASSERT(pqr->p && pqr->q && !pqr->r);
1081
1082 for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
1083 *pqr->p ^= *src;
1084 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
1085 *pqr->q ^= *src;
1086 }
1087
1088 return (0);
1089 }
1090
1091 static int
vdev_raidz_pqr_func(void * buf,size_t size,void * private)1092 vdev_raidz_pqr_func(void *buf, size_t size, void *private)
1093 {
1094 struct pqr_struct *pqr = private;
1095 const uint64_t *src = buf;
1096 uint64_t mask;
1097 int cnt = size / sizeof (src[0]);
1098
1099 ASSERT(pqr->p && pqr->q && pqr->r);
1100
1101 for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
1102 *pqr->p ^= *src;
1103 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
1104 *pqr->q ^= *src;
1105 VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
1106 *pqr->r ^= *src;
1107 }
1108
1109 return (0);
1110 }
1111
1112 static void
vdev_raidz_generate_parity_p(raidz_row_t * rr)1113 vdev_raidz_generate_parity_p(raidz_row_t *rr)
1114 {
1115 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1116
1117 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1118 abd_t *src = rr->rr_col[c].rc_abd;
1119
1120 if (c == rr->rr_firstdatacol) {
1121 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1122 } else {
1123 struct pqr_struct pqr = { p, NULL, NULL };
1124 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1125 vdev_raidz_p_func, &pqr);
1126 }
1127 }
1128 }
1129
1130 static void
vdev_raidz_generate_parity_pq(raidz_row_t * rr)1131 vdev_raidz_generate_parity_pq(raidz_row_t *rr)
1132 {
1133 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1134 uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1135 uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
1136 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1137 rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1138
1139 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1140 abd_t *src = rr->rr_col[c].rc_abd;
1141
1142 uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
1143
1144 if (c == rr->rr_firstdatacol) {
1145 ASSERT(ccnt == pcnt || ccnt == 0);
1146 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1147 (void) memcpy(q, p, rr->rr_col[c].rc_size);
1148
1149 for (uint64_t i = ccnt; i < pcnt; i++) {
1150 p[i] = 0;
1151 q[i] = 0;
1152 }
1153 } else {
1154 struct pqr_struct pqr = { p, q, NULL };
1155
1156 ASSERT(ccnt <= pcnt);
1157 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1158 vdev_raidz_pq_func, &pqr);
1159
1160 /*
1161 * Treat short columns as though they are full of 0s.
1162 * Note that there's therefore nothing needed for P.
1163 */
1164 uint64_t mask;
1165 for (uint64_t i = ccnt; i < pcnt; i++) {
1166 VDEV_RAIDZ_64MUL_2(q[i], mask);
1167 }
1168 }
1169 }
1170 }
1171
1172 static void
vdev_raidz_generate_parity_pqr(raidz_row_t * rr)1173 vdev_raidz_generate_parity_pqr(raidz_row_t *rr)
1174 {
1175 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1176 uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1177 uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd);
1178 uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
1179 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1180 rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1181 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1182 rr->rr_col[VDEV_RAIDZ_R].rc_size);
1183
1184 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1185 abd_t *src = rr->rr_col[c].rc_abd;
1186
1187 uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
1188
1189 if (c == rr->rr_firstdatacol) {
1190 ASSERT(ccnt == pcnt || ccnt == 0);
1191 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1192 (void) memcpy(q, p, rr->rr_col[c].rc_size);
1193 (void) memcpy(r, p, rr->rr_col[c].rc_size);
1194
1195 for (uint64_t i = ccnt; i < pcnt; i++) {
1196 p[i] = 0;
1197 q[i] = 0;
1198 r[i] = 0;
1199 }
1200 } else {
1201 struct pqr_struct pqr = { p, q, r };
1202
1203 ASSERT(ccnt <= pcnt);
1204 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1205 vdev_raidz_pqr_func, &pqr);
1206
1207 /*
1208 * Treat short columns as though they are full of 0s.
1209 * Note that there's therefore nothing needed for P.
1210 */
1211 uint64_t mask;
1212 for (uint64_t i = ccnt; i < pcnt; i++) {
1213 VDEV_RAIDZ_64MUL_2(q[i], mask);
1214 VDEV_RAIDZ_64MUL_4(r[i], mask);
1215 }
1216 }
1217 }
1218 }
1219
1220 /*
1221 * Generate RAID parity in the first virtual columns according to the number of
1222 * parity columns available.
1223 */
1224 void
vdev_raidz_generate_parity_row(raidz_map_t * rm,raidz_row_t * rr)1225 vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr)
1226 {
1227 if (rr->rr_cols == 0) {
1228 /*
1229 * We are handling this block one row at a time (because
1230 * this block has a different logical vs physical width,
1231 * due to RAIDZ expansion), and this is a pad-only row,
1232 * which has no parity.
1233 */
1234 return;
1235 }
1236
1237 /* Generate using the new math implementation */
1238 if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL)
1239 return;
1240
1241 switch (rr->rr_firstdatacol) {
1242 case 1:
1243 vdev_raidz_generate_parity_p(rr);
1244 break;
1245 case 2:
1246 vdev_raidz_generate_parity_pq(rr);
1247 break;
1248 case 3:
1249 vdev_raidz_generate_parity_pqr(rr);
1250 break;
1251 default:
1252 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
1253 }
1254 }
1255
1256 void
vdev_raidz_generate_parity(raidz_map_t * rm)1257 vdev_raidz_generate_parity(raidz_map_t *rm)
1258 {
1259 for (int i = 0; i < rm->rm_nrows; i++) {
1260 raidz_row_t *rr = rm->rm_row[i];
1261 vdev_raidz_generate_parity_row(rm, rr);
1262 }
1263 }
1264
/*
 * abd_iterate_func2() callback used for P reconstruction: XOR the source
 * chunk into the destination chunk, one 64-bit word at a time.
 *
 * dbuf    - destination chunk (updated in place)
 * sbuf    - source chunk
 * size    - chunk length in bytes; only whole 64-bit words are processed
 * private - unused
 *
 * Always returns 0.
 */
static int
vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
{
	uint64_t *out = dbuf;
	uint64_t *in = sbuf;
	int nwords = size / sizeof (*in);

	(void) private;

	for (int left = nwords; left > 0; left--) {
		*out ^= *in;
		out++;
		in++;
	}

	return (0);
}
1279
/*
 * abd_iterate_func2() callback used while rebuilding a column from Q: for
 * every 64-bit word, run the destination word through VDEV_RAIDZ_64MUL_2()
 * and then XOR the matching source word into it.
 *
 * dbuf    - destination chunk (updated in place)
 * sbuf    - source chunk
 * size    - chunk length in bytes; only whole 64-bit words are processed
 * private - unused
 *
 * Always returns 0.
 */
static int
vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
    void *private)
{
	uint64_t *acc = dbuf;
	uint64_t *data = sbuf;
	int nwords = size / sizeof (acc[0]);
	uint64_t mask;

	(void) private;

	for (int w = 0; w < nwords; w++) {
		VDEV_RAIDZ_64MUL_2(acc[w], mask);
		acc[w] ^= data[w];
	}

	return (0);
}
1297
/*
 * abd_iterate_func() callback for the part of the destination column that
 * extends past a shorter source column.  It applies the same
 * VDEV_RAIDZ_64MUL_2() step as vdev_raidz_reconst_q_pre_func(), but there
 * is no source word to XOR in.
 *
 * buf     - destination chunk (updated in place)
 * size    - chunk length in bytes; only whole 64-bit words are processed
 * private - unused
 *
 * Always returns 0.
 */
static int
vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
{
	uint64_t *acc = buf;
	int nwords = size / sizeof (acc[0]);
	uint64_t mask;

	(void) private;

	for (int w = 0; w < nwords; w++) {
		VDEV_RAIDZ_64MUL_2(acc[w], mask);
	}

	return (0);
}
1313
/*
 * State handed to vdev_raidz_reconst_q_post_func() through its "private"
 * argument.
 */
struct reconst_q_struct {
	uint64_t *q;	/* next Q parity word; advanced by the callback */
	int exp;	/* exponent passed to vdev_raidz_exp2() per byte */
};
1318
1319 static int
vdev_raidz_reconst_q_post_func(void * buf,size_t size,void * private)1320 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
1321 {
1322 struct reconst_q_struct *rq = private;
1323 uint64_t *dst = buf;
1324 int cnt = size / sizeof (dst[0]);
1325
1326 for (int i = 0; i < cnt; i++, dst++, rq->q++) {
1327 int j;
1328 uint8_t *b;
1329
1330 *dst ^= *rq->q;
1331 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
1332 *b = vdev_raidz_exp2(*b, rq->exp);
1333 }
1334 }
1335
1336 return (0);
1337 }
1338
/*
 * State handed to vdev_raidz_reconst_pq_func() and
 * vdev_raidz_reconst_pq_tail_func() through their "private" argument.
 * All four byte pointers are advanced by the callbacks, one byte per byte
 * reconstructed.
 */
struct reconst_pq_struct {
	uint8_t *p;	/* cursor into the P parity buffer */
	uint8_t *q;	/* cursor into the Q parity buffer */
	uint8_t *pxy;	/* cursor into the Pxy buffer */
	uint8_t *qxy;	/* cursor into the Qxy buffer */
	int aexp;	/* vdev_raidz_exp2() exponent applied to P ^ Pxy */
	int bexp;	/* vdev_raidz_exp2() exponent applied to Q ^ Qxy */
};
1347
1348 static int
vdev_raidz_reconst_pq_func(void * xbuf,void * ybuf,size_t size,void * private)1349 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
1350 {
1351 struct reconst_pq_struct *rpq = private;
1352 uint8_t *xd = xbuf;
1353 uint8_t *yd = ybuf;
1354
1355 for (int i = 0; i < size;
1356 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
1357 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
1358 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
1359 *yd = *rpq->p ^ *rpq->pxy ^ *xd;
1360 }
1361
1362 return (0);
1363 }
1364
1365 static int
vdev_raidz_reconst_pq_tail_func(void * xbuf,size_t size,void * private)1366 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
1367 {
1368 struct reconst_pq_struct *rpq = private;
1369 uint8_t *xd = xbuf;
1370
1371 for (int i = 0; i < size;
1372 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
1373 /* same operation as vdev_raidz_reconst_pq_func() on xd */
1374 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
1375 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
1376 }
1377
1378 return (0);
1379 }
1380
1381 static void
vdev_raidz_reconstruct_p(raidz_row_t * rr,int * tgts,int ntgts)1382 vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts)
1383 {
1384 int x = tgts[0];
1385 abd_t *dst, *src;
1386
1387 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1388 zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x);
1389
1390 ASSERT3U(ntgts, ==, 1);
1391 ASSERT3U(x, >=, rr->rr_firstdatacol);
1392 ASSERT3U(x, <, rr->rr_cols);
1393
1394 ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size);
1395
1396 src = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
1397 dst = rr->rr_col[x].rc_abd;
1398
1399 abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size);
1400
1401 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1402 uint64_t size = MIN(rr->rr_col[x].rc_size,
1403 rr->rr_col[c].rc_size);
1404
1405 src = rr->rr_col[c].rc_abd;
1406
1407 if (c == x)
1408 continue;
1409
1410 (void) abd_iterate_func2(dst, src, 0, 0, size,
1411 vdev_raidz_reconst_p_func, NULL);
1412 }
1413 }
1414
1415 static void
vdev_raidz_reconstruct_q(raidz_row_t * rr,int * tgts,int ntgts)1416 vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
1417 {
1418 int x = tgts[0];
1419 int c, exp;
1420 abd_t *dst, *src;
1421
1422 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1423 zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x);
1424
1425 ASSERT(ntgts == 1);
1426
1427 ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1428
1429 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1430 uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size,
1431 rr->rr_col[c].rc_size);
1432
1433 src = rr->rr_col[c].rc_abd;
1434 dst = rr->rr_col[x].rc_abd;
1435
1436 if (c == rr->rr_firstdatacol) {
1437 abd_copy(dst, src, size);
1438 if (rr->rr_col[x].rc_size > size) {
1439 abd_zero_off(dst, size,
1440 rr->rr_col[x].rc_size - size);
1441 }
1442 } else {
1443 ASSERT3U(size, <=, rr->rr_col[x].rc_size);
1444 (void) abd_iterate_func2(dst, src, 0, 0, size,
1445 vdev_raidz_reconst_q_pre_func, NULL);
1446 (void) abd_iterate_func(dst,
1447 size, rr->rr_col[x].rc_size - size,
1448 vdev_raidz_reconst_q_pre_tail_func, NULL);
1449 }
1450 }
1451
1452 src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
1453 dst = rr->rr_col[x].rc_abd;
1454 exp = 255 - (rr->rr_cols - 1 - x);
1455
1456 struct reconst_q_struct rq = { abd_to_buf(src), exp };
1457 (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size,
1458 vdev_raidz_reconst_q_post_func, &rq);
1459 }
1460
1461 static void
vdev_raidz_reconstruct_pq(raidz_row_t * rr,int * tgts,int ntgts)1462 vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
1463 {
1464 uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
1465 abd_t *pdata, *qdata;
1466 uint64_t xsize, ysize;
1467 int x = tgts[0];
1468 int y = tgts[1];
1469 abd_t *xd, *yd;
1470
1471 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1472 zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y);
1473
1474 ASSERT(ntgts == 2);
1475 ASSERT(x < y);
1476 ASSERT(x >= rr->rr_firstdatacol);
1477 ASSERT(y < rr->rr_cols);
1478
1479 ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size);
1480
1481 /*
1482 * Move the parity data aside -- we're going to compute parity as
1483 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
1484 * reuse the parity generation mechanism without trashing the actual
1485 * parity so we make those columns appear to be full of zeros by
1486 * setting their lengths to zero.
1487 */
1488 pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
1489 qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
1490 xsize = rr->rr_col[x].rc_size;
1491 ysize = rr->rr_col[y].rc_size;
1492
1493 rr->rr_col[VDEV_RAIDZ_P].rc_abd =
1494 abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
1495 rr->rr_col[VDEV_RAIDZ_Q].rc_abd =
1496 abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
1497 rr->rr_col[x].rc_size = 0;
1498 rr->rr_col[y].rc_size = 0;
1499
1500 vdev_raidz_generate_parity_pq(rr);
1501
1502 rr->rr_col[x].rc_size = xsize;
1503 rr->rr_col[y].rc_size = ysize;
1504
1505 p = abd_to_buf(pdata);
1506 q = abd_to_buf(qdata);
1507 pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1508 qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1509 xd = rr->rr_col[x].rc_abd;
1510 yd = rr->rr_col[y].rc_abd;
1511
1512 /*
1513 * We now have:
1514 * Pxy = P + D_x + D_y
1515 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
1516 *
1517 * We can then solve for D_x:
1518 * D_x = A * (P + Pxy) + B * (Q + Qxy)
1519 * where
1520 * A = 2^(x - y) * (2^(x - y) + 1)^-1
1521 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
1522 *
1523 * With D_x in hand, we can easily solve for D_y:
1524 * D_y = P + Pxy + D_x
1525 */
1526
1527 a = vdev_raidz_pow2[255 + x - y];
1528 b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)];
1529 tmp = 255 - vdev_raidz_log2[a ^ 1];
1530
1531 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
1532 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
1533
1534 ASSERT3U(xsize, >=, ysize);
1535 struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
1536
1537 (void) abd_iterate_func2(xd, yd, 0, 0, ysize,
1538 vdev_raidz_reconst_pq_func, &rpq);
1539 (void) abd_iterate_func(xd, ysize, xsize - ysize,
1540 vdev_raidz_reconst_pq_tail_func, &rpq);
1541
1542 abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1543 abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1544
1545 /*
1546 * Restore the saved parity data.
1547 */
1548 rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata;
1549 rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;
1550 }
1551
1552 /*
1553 * In the general case of reconstruction, we must solve the system of linear
1554 * equations defined by the coefficients used to generate parity as well as
1555 * the contents of the data and parity disks. This can be expressed with
1556 * vectors for the original data (D) and the actual data (d) and parity (p)
1557 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
1558 *
1559 * __ __ __ __
1560 * | | __ __ | p_0 |
1561 * | V | | D_0 | | p_m-1 |
1562 * | | x | : | = | d_0 |
1563 * | I | | D_n-1 | | : |
1564 * | | ~~ ~~ | d_n-1 |
1565 * ~~ ~~ ~~ ~~
1566 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity and speedy
 * computation as well as linear separability.
1571 *
1572 * __ __ __ __
1573 * | 1 .. 1 1 1 | | p_0 |
1574 * | 2^n-1 .. 4 2 1 | __ __ | : |
1575 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
1576 * | 1 .. 0 0 0 | | D_1 | | d_0 |
1577 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
1578 * | : : : : | | : | | d_2 |
1579 * | 0 .. 1 0 0 | | D_n-1 | | : |
1580 * | 0 .. 0 1 0 | ~~ ~~ | : |
1581 * | 0 .. 0 0 1 | | d_n-1 |
1582 * ~~ ~~ ~~ ~~
1583 *
1584 * Note that I, V, d, and p are known. To compute D, we must invert the
1585 * matrix and use the known data and parity values to reconstruct the unknown
1586 * data values. We begin by removing the rows in V|I and d|p that correspond
1587 * to failed or missing columns; we then make V|I square (n x n) and d|p
1588 * sized n by removing rows corresponding to unused parity from the bottom up
1589 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
1590 * using Gauss-Jordan elimination. In the example below we use m=3 parity
1591 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
1592 * __ __
1593 * | 1 1 1 1 1 1 1 1 |
1594 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
1595 * | 19 205 116 29 64 16 4 1 | / /
1596 * | 1 0 0 0 0 0 0 0 | / /
1597 * | 0 1 0 0 0 0 0 0 | <--' /
1598 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
1599 * | 0 0 0 1 0 0 0 0 |
1600 * | 0 0 0 0 1 0 0 0 |
1601 * | 0 0 0 0 0 1 0 0 |
1602 * | 0 0 0 0 0 0 1 0 |
1603 * | 0 0 0 0 0 0 0 1 |
1604 * ~~ ~~
 *           __                               __
 *           |   1   1   1   1   1   1   1   1 |
 *           |  19 205 116  29  64  16   4   1 |
 *           |   1   0   0   0   0   0   0   0 |
 *  (V|I)' = |   0   0   0   1   0   0   0   0 |
 *           |   0   0   0   0   1   0   0   0 |
 *           |   0   0   0   0   0   1   0   0 |
 *           |   0   0   0   0   0   0   1   0 |
 *           |   0   0   0   0   0   0   0   1 |
 *           ~~                               ~~
1618 *
1619 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1620 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1621 * matrix is not singular.
1622 * __ __
1623 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1624 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1625 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1626 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1627 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1628 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1629 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1630 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1631 * ~~ ~~
1632 * __ __
1633 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1634 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1635 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1636 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1637 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1638 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1639 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1640 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1641 * ~~ ~~
1642 * __ __
1643 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1644 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1645 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
1646 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1647 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1648 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1649 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1650 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1651 * ~~ ~~
1652 * __ __
1653 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1654 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1655 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
1656 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1657 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1658 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1659 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1660 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1661 * ~~ ~~
1662 * __ __
1663 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1664 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1665 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1666 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1667 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1668 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1669 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1670 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1671 * ~~ ~~
1672 * __ __
1673 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1674 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
1675 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1676 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1677 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1678 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1679 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1680 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1681 * ~~ ~~
1682 * __ __
1683 * | 0 0 1 0 0 0 0 0 |
1684 * | 167 100 5 41 159 169 217 208 |
1685 * | 166 100 4 40 158 168 216 209 |
1686 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
1687 * | 0 0 0 0 1 0 0 0 |
1688 * | 0 0 0 0 0 1 0 0 |
1689 * | 0 0 0 0 0 0 1 0 |
1690 * | 0 0 0 0 0 0 0 1 |
1691 * ~~ ~~
1692 *
1693 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1694 * of the missing data.
1695 *
1696 * As is apparent from the example above, the only non-trivial rows in the
1697 * inverse matrix correspond to the data disks that we're trying to
1698 * reconstruct. Indeed, those are the only rows we need as the others would
1699 * only be useful for reconstructing data known or assumed to be valid. For
1700 * that reason, we only build the coefficients in the rows that correspond to
1701 * targeted columns.
1702 */
1703
1704 static void
vdev_raidz_matrix_init(raidz_row_t * rr,int n,int nmap,int * map,uint8_t ** rows)1705 vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map,
1706 uint8_t **rows)
1707 {
1708 int i, j;
1709 int pow;
1710
1711 ASSERT(n == rr->rr_cols - rr->rr_firstdatacol);
1712
1713 /*
1714 * Fill in the missing rows of interest.
1715 */
1716 for (i = 0; i < nmap; i++) {
1717 ASSERT3S(0, <=, map[i]);
1718 ASSERT3S(map[i], <=, 2);
1719
1720 pow = map[i] * n;
1721 if (pow > 255)
1722 pow -= 255;
1723 ASSERT(pow <= 255);
1724
1725 for (j = 0; j < n; j++) {
1726 pow -= map[i];
1727 if (pow < 0)
1728 pow += 255;
1729 rows[i][j] = vdev_raidz_pow2[pow];
1730 }
1731 }
1732 }
1733
1734 static void
vdev_raidz_matrix_invert(raidz_row_t * rr,int n,int nmissing,int * missing,uint8_t ** rows,uint8_t ** invrows,const uint8_t * used)1735 vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing,
1736 uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1737 {
1738 int i, j, ii, jj;
1739 uint8_t log;
1740
1741 /*
1742 * Assert that the first nmissing entries from the array of used
1743 * columns correspond to parity columns and that subsequent entries
1744 * correspond to data columns.
1745 */
1746 for (i = 0; i < nmissing; i++) {
1747 ASSERT3S(used[i], <, rr->rr_firstdatacol);
1748 }
1749 for (; i < n; i++) {
1750 ASSERT3S(used[i], >=, rr->rr_firstdatacol);
1751 }
1752
1753 /*
1754 * First initialize the storage where we'll compute the inverse rows.
1755 */
1756 for (i = 0; i < nmissing; i++) {
1757 for (j = 0; j < n; j++) {
1758 invrows[i][j] = (i == j) ? 1 : 0;
1759 }
1760 }
1761
1762 /*
1763 * Subtract all trivial rows from the rows of consequence.
1764 */
1765 for (i = 0; i < nmissing; i++) {
1766 for (j = nmissing; j < n; j++) {
1767 ASSERT3U(used[j], >=, rr->rr_firstdatacol);
1768 jj = used[j] - rr->rr_firstdatacol;
1769 ASSERT3S(jj, <, n);
1770 invrows[i][j] = rows[i][jj];
1771 rows[i][jj] = 0;
1772 }
1773 }
1774
1775 /*
1776 * For each of the rows of interest, we must normalize it and subtract
1777 * a multiple of it from the other rows.
1778 */
1779 for (i = 0; i < nmissing; i++) {
1780 for (j = 0; j < missing[i]; j++) {
1781 ASSERT0(rows[i][j]);
1782 }
1783 ASSERT3U(rows[i][missing[i]], !=, 0);
1784
1785 /*
1786 * Compute the inverse of the first element and multiply each
1787 * element in the row by that value.
1788 */
1789 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1790
1791 for (j = 0; j < n; j++) {
1792 rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1793 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1794 }
1795
1796 for (ii = 0; ii < nmissing; ii++) {
1797 if (i == ii)
1798 continue;
1799
1800 ASSERT3U(rows[ii][missing[i]], !=, 0);
1801
1802 log = vdev_raidz_log2[rows[ii][missing[i]]];
1803
1804 for (j = 0; j < n; j++) {
1805 rows[ii][j] ^=
1806 vdev_raidz_exp2(rows[i][j], log);
1807 invrows[ii][j] ^=
1808 vdev_raidz_exp2(invrows[i][j], log);
1809 }
1810 }
1811 }
1812
1813 /*
1814 * Verify that the data that is left in the rows are properly part of
1815 * an identity matrix.
1816 */
1817 for (i = 0; i < nmissing; i++) {
1818 for (j = 0; j < n; j++) {
1819 if (j == missing[i]) {
1820 ASSERT3U(rows[i][j], ==, 1);
1821 } else {
1822 ASSERT0(rows[i][j]);
1823 }
1824 }
1825 }
1826 }
1827
1828 static void
vdev_raidz_matrix_reconstruct(raidz_row_t * rr,int n,int nmissing,int * missing,uint8_t ** invrows,const uint8_t * used)1829 vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
1830 int *missing, uint8_t **invrows, const uint8_t *used)
1831 {
1832 int i, j, x, cc, c;
1833 uint8_t *src;
1834 uint64_t ccount;
1835 uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL };
1836 uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 };
1837 uint8_t log = 0;
1838 uint8_t val;
1839 int ll;
1840 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1841 uint8_t *p, *pp;
1842 size_t psize;
1843
1844 psize = sizeof (invlog[0][0]) * n * nmissing;
1845 p = kmem_alloc(psize, KM_SLEEP);
1846
1847 for (pp = p, i = 0; i < nmissing; i++) {
1848 invlog[i] = pp;
1849 pp += n;
1850 }
1851
1852 for (i = 0; i < nmissing; i++) {
1853 for (j = 0; j < n; j++) {
1854 ASSERT3U(invrows[i][j], !=, 0);
1855 invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1856 }
1857 }
1858
1859 for (i = 0; i < n; i++) {
1860 c = used[i];
1861 ASSERT3U(c, <, rr->rr_cols);
1862
1863 ccount = rr->rr_col[c].rc_size;
1864 ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0);
1865 if (ccount == 0)
1866 continue;
1867 src = abd_to_buf(rr->rr_col[c].rc_abd);
1868 for (j = 0; j < nmissing; j++) {
1869 cc = missing[j] + rr->rr_firstdatacol;
1870 ASSERT3U(cc, >=, rr->rr_firstdatacol);
1871 ASSERT3U(cc, <, rr->rr_cols);
1872 ASSERT3U(cc, !=, c);
1873
1874 dcount[j] = rr->rr_col[cc].rc_size;
1875 if (dcount[j] != 0)
1876 dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd);
1877 }
1878
1879 for (x = 0; x < ccount; x++, src++) {
1880 if (*src != 0)
1881 log = vdev_raidz_log2[*src];
1882
1883 for (cc = 0; cc < nmissing; cc++) {
1884 if (x >= dcount[cc])
1885 continue;
1886
1887 if (*src == 0) {
1888 val = 0;
1889 } else {
1890 if ((ll = log + invlog[cc][i]) >= 255)
1891 ll -= 255;
1892 val = vdev_raidz_pow2[ll];
1893 }
1894
1895 if (i == 0)
1896 dst[cc][x] = val;
1897 else
1898 dst[cc][x] ^= val;
1899 }
1900 }
1901 }
1902
1903 kmem_free(p, psize);
1904 }
1905
1906 static void
vdev_raidz_reconstruct_general(raidz_row_t * rr,int * tgts,int ntgts)1907 vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
1908 {
1909 int i, c, t, tt;
1910 unsigned int n;
1911 unsigned int nmissing_rows;
1912 int missing_rows[VDEV_RAIDZ_MAXPARITY];
1913 int parity_map[VDEV_RAIDZ_MAXPARITY];
1914 uint8_t *p, *pp;
1915 size_t psize;
1916 uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1917 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1918 uint8_t *used;
1919
1920 abd_t **bufs = NULL;
1921
1922 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1923 zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts);
1924 /*
1925 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
1926 * temporary linear ABDs if any non-linear ABDs are found.
1927 */
1928 for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) {
1929 ASSERT(rr->rr_col[i].rc_abd != NULL);
1930 if (!abd_is_linear(rr->rr_col[i].rc_abd)) {
1931 bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *),
1932 KM_PUSHPAGE);
1933
1934 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1935 raidz_col_t *col = &rr->rr_col[c];
1936
1937 bufs[c] = col->rc_abd;
1938 if (bufs[c] != NULL) {
1939 col->rc_abd = abd_alloc_linear(
1940 col->rc_size, B_TRUE);
1941 abd_copy(col->rc_abd, bufs[c],
1942 col->rc_size);
1943 }
1944 }
1945
1946 break;
1947 }
1948 }
1949
1950 n = rr->rr_cols - rr->rr_firstdatacol;
1951
1952 /*
1953 * Figure out which data columns are missing.
1954 */
1955 nmissing_rows = 0;
1956 for (t = 0; t < ntgts; t++) {
1957 if (tgts[t] >= rr->rr_firstdatacol) {
1958 missing_rows[nmissing_rows++] =
1959 tgts[t] - rr->rr_firstdatacol;
1960 }
1961 }
1962
1963 /*
1964 * Figure out which parity columns to use to help generate the missing
1965 * data columns.
1966 */
1967 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1968 ASSERT(tt < ntgts);
1969 ASSERT(c < rr->rr_firstdatacol);
1970
1971 /*
1972 * Skip any targeted parity columns.
1973 */
1974 if (c == tgts[tt]) {
1975 tt++;
1976 continue;
1977 }
1978
1979 parity_map[i] = c;
1980 i++;
1981 }
1982
1983 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1984 nmissing_rows * n + sizeof (used[0]) * n;
1985 p = kmem_alloc(psize, KM_SLEEP);
1986
1987 for (pp = p, i = 0; i < nmissing_rows; i++) {
1988 rows[i] = pp;
1989 pp += n;
1990 invrows[i] = pp;
1991 pp += n;
1992 }
1993 used = pp;
1994
1995 for (i = 0; i < nmissing_rows; i++) {
1996 used[i] = parity_map[i];
1997 }
1998
1999 for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
2000 if (tt < nmissing_rows &&
2001 c == missing_rows[tt] + rr->rr_firstdatacol) {
2002 tt++;
2003 continue;
2004 }
2005
2006 ASSERT3S(i, <, n);
2007 used[i] = c;
2008 i++;
2009 }
2010
2011 /*
2012 * Initialize the interesting rows of the matrix.
2013 */
2014 vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows);
2015
2016 /*
2017 * Invert the matrix.
2018 */
2019 vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows,
2020 invrows, used);
2021
2022 /*
2023 * Reconstruct the missing data using the generated matrix.
2024 */
2025 vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows,
2026 invrows, used);
2027
2028 kmem_free(p, psize);
2029
2030 /*
2031 * copy back from temporary linear abds and free them
2032 */
2033 if (bufs) {
2034 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
2035 raidz_col_t *col = &rr->rr_col[c];
2036
2037 if (bufs[c] != NULL) {
2038 abd_copy(bufs[c], col->rc_abd, col->rc_size);
2039 abd_free(col->rc_abd);
2040 }
2041 col->rc_abd = bufs[c];
2042 }
2043 kmem_free(bufs, rr->rr_cols * sizeof (abd_t *));
2044 }
2045 }
2046
2047 static void
vdev_raidz_reconstruct_row(raidz_map_t * rm,raidz_row_t * rr,const int * t,int nt)2048 vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
2049 const int *t, int nt)
2050 {
2051 int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
2052 int ntgts;
2053 int i, c, ret;
2054 int nbadparity, nbaddata;
2055 int parity_valid[VDEV_RAIDZ_MAXPARITY];
2056
2057 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
2058 zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)",
2059 rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata,
2060 (int)rr->rr_missingparity);
2061 }
2062
2063 nbadparity = rr->rr_firstdatacol;
2064 nbaddata = rr->rr_cols - nbadparity;
2065 ntgts = 0;
2066 for (i = 0, c = 0; c < rr->rr_cols; c++) {
2067 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
2068 zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u "
2069 "offset=%llx error=%u)",
2070 rr, c, (int)rr->rr_col[c].rc_devidx,
2071 (long long)rr->rr_col[c].rc_offset,
2072 (int)rr->rr_col[c].rc_error);
2073 }
2074 if (c < rr->rr_firstdatacol)
2075 parity_valid[c] = B_FALSE;
2076
2077 if (i < nt && c == t[i]) {
2078 tgts[ntgts++] = c;
2079 i++;
2080 } else if (rr->rr_col[c].rc_error != 0) {
2081 tgts[ntgts++] = c;
2082 } else if (c >= rr->rr_firstdatacol) {
2083 nbaddata--;
2084 } else {
2085 parity_valid[c] = B_TRUE;
2086 nbadparity--;
2087 }
2088 }
2089
2090 ASSERT(ntgts >= nt);
2091 ASSERT(nbaddata >= 0);
2092 ASSERT(nbaddata + nbadparity == ntgts);
2093
2094 dt = &tgts[nbadparity];
2095
2096 /* Reconstruct using the new math implementation */
2097 ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata);
2098 if (ret != RAIDZ_ORIGINAL_IMPL)
2099 return;
2100
2101 /*
2102 * See if we can use any of our optimized reconstruction routines.
2103 */
2104 switch (nbaddata) {
2105 case 1:
2106 if (parity_valid[VDEV_RAIDZ_P]) {
2107 vdev_raidz_reconstruct_p(rr, dt, 1);
2108 return;
2109 }
2110
2111 ASSERT(rr->rr_firstdatacol > 1);
2112
2113 if (parity_valid[VDEV_RAIDZ_Q]) {
2114 vdev_raidz_reconstruct_q(rr, dt, 1);
2115 return;
2116 }
2117
2118 ASSERT(rr->rr_firstdatacol > 2);
2119 break;
2120
2121 case 2:
2122 ASSERT(rr->rr_firstdatacol > 1);
2123
2124 if (parity_valid[VDEV_RAIDZ_P] &&
2125 parity_valid[VDEV_RAIDZ_Q]) {
2126 vdev_raidz_reconstruct_pq(rr, dt, 2);
2127 return;
2128 }
2129
2130 ASSERT(rr->rr_firstdatacol > 2);
2131
2132 break;
2133 }
2134
2135 vdev_raidz_reconstruct_general(rr, tgts, ntgts);
2136 }
2137
2138 static int
vdev_raidz_open(vdev_t * vd,uint64_t * asize,uint64_t * max_asize,uint64_t * logical_ashift,uint64_t * physical_ashift)2139 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
2140 uint64_t *logical_ashift, uint64_t *physical_ashift)
2141 {
2142 vdev_raidz_t *vdrz = vd->vdev_tsd;
2143 uint64_t nparity = vdrz->vd_nparity;
2144 int c;
2145 int lasterror = 0;
2146 int numerrors = 0;
2147
2148 ASSERT(nparity > 0);
2149
2150 if (nparity > VDEV_RAIDZ_MAXPARITY ||
2151 vd->vdev_children < nparity + 1) {
2152 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
2153 return (SET_ERROR(EINVAL));
2154 }
2155
2156 vdev_open_children(vd);
2157
2158 for (c = 0; c < vd->vdev_children; c++) {
2159 vdev_t *cvd = vd->vdev_child[c];
2160
2161 if (cvd->vdev_open_error != 0) {
2162 lasterror = cvd->vdev_open_error;
2163 numerrors++;
2164 continue;
2165 }
2166
2167 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
2168 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
2169 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
2170 }
2171 for (c = 0; c < vd->vdev_children; c++) {
2172 vdev_t *cvd = vd->vdev_child[c];
2173
2174 if (cvd->vdev_open_error != 0)
2175 continue;
2176 *physical_ashift = vdev_best_ashift(*logical_ashift,
2177 *physical_ashift, cvd->vdev_physical_ashift);
2178 }
2179
2180 if (vd->vdev_rz_expanding) {
2181 *asize *= vd->vdev_children - 1;
2182 *max_asize *= vd->vdev_children - 1;
2183
2184 vd->vdev_min_asize = *asize;
2185 } else {
2186 *asize *= vd->vdev_children;
2187 *max_asize *= vd->vdev_children;
2188 }
2189
2190 if (numerrors > nparity) {
2191 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
2192 return (lasterror);
2193 }
2194
2195 return (0);
2196 }
2197
2198 static void
vdev_raidz_close(vdev_t * vd)2199 vdev_raidz_close(vdev_t *vd)
2200 {
2201 for (int c = 0; c < vd->vdev_children; c++) {
2202 if (vd->vdev_child[c] != NULL)
2203 vdev_close(vd->vdev_child[c]);
2204 }
2205 }
2206
2207 /*
2208 * Return the logical width to use, given the txg in which the allocation
2209 * happened.
2210 */
2211 static uint64_t
vdev_raidz_get_logical_width(vdev_raidz_t * vdrz,uint64_t txg)2212 vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg)
2213 {
2214 reflow_node_t lookup = {
2215 .re_txg = txg,
2216 };
2217 avl_index_t where;
2218
2219 uint64_t width;
2220 mutex_enter(&vdrz->vd_expand_lock);
2221 reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where);
2222 if (re != NULL) {
2223 width = re->re_logical_width;
2224 } else {
2225 re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE);
2226 if (re != NULL)
2227 width = re->re_logical_width;
2228 else
2229 width = vdrz->vd_original_width;
2230 }
2231 mutex_exit(&vdrz->vd_expand_lock);
2232 return (width);
2233 }
2234 /*
2235 * This code converts an asize into the largest psize that can safely be written
2236 * to an allocation of that size for this vdev.
2237 *
2238 * Note that this function will not take into account the effect of gang
2239 * headers, which also modify the ASIZE of the DVAs. It is purely a reverse of
2240 * the psize_to_asize function.
2241 */
2242 static uint64_t
vdev_raidz_asize_to_psize(vdev_t * vd,uint64_t asize,uint64_t txg)2243 vdev_raidz_asize_to_psize(vdev_t *vd, uint64_t asize, uint64_t txg)
2244 {
2245 vdev_raidz_t *vdrz = vd->vdev_tsd;
2246 uint64_t psize;
2247 uint64_t ashift = vd->vdev_top->vdev_ashift;
2248 uint64_t nparity = vdrz->vd_nparity;
2249
2250 uint64_t cols = vdev_raidz_get_logical_width(vdrz, txg);
2251
2252 ASSERT0(asize % (1 << ashift));
2253
2254 psize = (asize >> ashift);
2255 /*
2256 * If the roundup to nparity + 1 caused us to spill into a new row, we
2257 * need to ignore that row entirely (since it can't store data or
2258 * parity).
2259 */
2260 uint64_t rows = psize / cols;
2261 psize = psize - (rows * cols) <= nparity ? rows * cols : psize;
2262 /* Subtract out parity sectors for each row storing data. */
2263 psize -= nparity * DIV_ROUND_UP(psize, cols);
2264 psize <<= ashift;
2265
2266 return (psize);
2267 }
2268
2269 /*
2270 * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated
2271 * more space due to the lower data-to-parity ratio. In this case it's
2272 * important to pass in the correct txg. Note that vdev_gang_header_asize()
2273 * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE,
2274 * regardless of txg. This is assured because for a single data sector, we
2275 * allocate P+1 sectors regardless of width ("cols", which is at least P+1).
2276 */
2277 static uint64_t
vdev_raidz_psize_to_asize(vdev_t * vd,uint64_t psize,uint64_t txg)2278 vdev_raidz_psize_to_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
2279 {
2280 vdev_raidz_t *vdrz = vd->vdev_tsd;
2281 uint64_t asize;
2282 uint64_t ashift = vd->vdev_top->vdev_ashift;
2283 uint64_t nparity = vdrz->vd_nparity;
2284
2285 uint64_t cols = vdev_raidz_get_logical_width(vdrz, txg);
2286
2287 asize = ((psize - 1) >> ashift) + 1;
2288 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
2289 asize = roundup(asize, nparity + 1) << ashift;
2290
2291 #ifdef ZFS_DEBUG
2292 uint64_t asize_new = ((psize - 1) >> ashift) + 1;
2293 uint64_t ncols_new = vdrz->vd_physical_width;
2294 asize_new += nparity * ((asize_new + ncols_new - nparity - 1) /
2295 (ncols_new - nparity));
2296 asize_new = roundup(asize_new, nparity + 1) << ashift;
2297 VERIFY3U(asize_new, <=, asize);
2298 #endif
2299
2300 return (asize);
2301 }
2302
2303 /*
2304 * The allocatable space for a raidz vdev is N * sizeof(smallest child)
2305 * so each child must provide at least 1/Nth of its asize.
2306 */
2307 static uint64_t
vdev_raidz_min_asize(vdev_t * vd)2308 vdev_raidz_min_asize(vdev_t *vd)
2309 {
2310 return ((vd->vdev_min_asize + vd->vdev_children - 1) /
2311 vd->vdev_children);
2312 }
2313
2314 void
vdev_raidz_child_done(zio_t * zio)2315 vdev_raidz_child_done(zio_t *zio)
2316 {
2317 raidz_col_t *rc = zio->io_private;
2318
2319 ASSERT3P(rc->rc_abd, !=, NULL);
2320 rc->rc_error = zio->io_error;
2321 rc->rc_tried = 1;
2322 rc->rc_skipped = 0;
2323 }
2324
2325 static void
vdev_raidz_shadow_child_done(zio_t * zio)2326 vdev_raidz_shadow_child_done(zio_t *zio)
2327 {
2328 raidz_col_t *rc = zio->io_private;
2329
2330 rc->rc_shadow_error = zio->io_error;
2331 }
2332
2333 static void
vdev_raidz_io_verify(zio_t * zio,raidz_map_t * rm,raidz_row_t * rr,int col)2334 vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col)
2335 {
2336 (void) rm;
2337 #ifdef ZFS_DEBUG
2338 zfs_range_seg64_t logical_rs, physical_rs, remain_rs;
2339 logical_rs.rs_start = rr->rr_offset;
2340 logical_rs.rs_end = logical_rs.rs_start +
2341 vdev_raidz_psize_to_asize(zio->io_vd, rr->rr_size,
2342 BP_GET_PHYSICAL_BIRTH(zio->io_bp));
2343
2344 raidz_col_t *rc = &rr->rr_col[col];
2345 vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
2346
2347 vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
2348 ASSERT(vdev_xlate_is_empty(&remain_rs));
2349 if (vdev_xlate_is_empty(&physical_rs)) {
2350 /*
2351 * If we are in the middle of expansion, the
2352 * physical->logical mapping is changing so vdev_xlate()
2353 * can't give us a reliable answer.
2354 */
2355 return;
2356 }
2357 ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
2358 ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
2359 /*
2360 * It would be nice to assert that rs_end is equal
2361 * to rc_offset + rc_size but there might be an
2362 * optional I/O at the end that is not accounted in
2363 * rc_size.
2364 */
2365 if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
2366 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
2367 rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift));
2368 } else {
2369 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
2370 }
2371 #endif
2372 }
2373
2374 static void
vdev_raidz_io_start_write(zio_t * zio,raidz_row_t * rr)2375 vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr)
2376 {
2377 vdev_t *vd = zio->io_vd;
2378 raidz_map_t *rm = zio->io_vsd;
2379
2380 vdev_raidz_generate_parity_row(rm, rr);
2381
2382 for (int c = 0; c < rr->rr_scols; c++) {
2383 raidz_col_t *rc = &rr->rr_col[c];
2384 vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2385
2386 /* Verify physical to logical translation */
2387 vdev_raidz_io_verify(zio, rm, rr, c);
2388
2389 if (rc->rc_size == 0)
2390 continue;
2391
2392 ASSERT3U(rc->rc_offset + rc->rc_size, <,
2393 cvd->vdev_psize - VDEV_LABEL_END_SIZE);
2394
2395 ASSERT3P(rc->rc_abd, !=, NULL);
2396 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2397 rc->rc_offset, rc->rc_abd,
2398 abd_get_size(rc->rc_abd), zio->io_type,
2399 zio->io_priority, 0, vdev_raidz_child_done, rc));
2400
2401 if (rc->rc_shadow_devidx != INT_MAX) {
2402 vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx];
2403
2404 ASSERT3U(
2405 rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <,
2406 cvd2->vdev_psize - VDEV_LABEL_END_SIZE);
2407
2408 zio_nowait(zio_vdev_child_io(zio, NULL, cvd2,
2409 rc->rc_shadow_offset, rc->rc_abd,
2410 abd_get_size(rc->rc_abd),
2411 zio->io_type, zio->io_priority, 0,
2412 vdev_raidz_shadow_child_done, rc));
2413 }
2414 }
2415 }
2416
2417 /*
2418 * Generate optional I/Os for skip sectors to improve aggregation contiguity.
2419 * This only works for vdev_raidz_map_alloc() (not _expanded()).
2420 */
2421 static void
raidz_start_skip_writes(zio_t * zio)2422 raidz_start_skip_writes(zio_t *zio)
2423 {
2424 vdev_t *vd = zio->io_vd;
2425 uint64_t ashift = vd->vdev_top->vdev_ashift;
2426 raidz_map_t *rm = zio->io_vsd;
2427 ASSERT3U(rm->rm_nrows, ==, 1);
2428 raidz_row_t *rr = rm->rm_row[0];
2429 for (int c = 0; c < rr->rr_scols; c++) {
2430 raidz_col_t *rc = &rr->rr_col[c];
2431 vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2432 if (rc->rc_size != 0)
2433 continue;
2434 ASSERT0P(rc->rc_abd);
2435
2436 ASSERT3U(rc->rc_offset, <,
2437 cvd->vdev_psize - VDEV_LABEL_END_SIZE);
2438
2439 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset,
2440 NULL, 1ULL << ashift, zio->io_type, zio->io_priority,
2441 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
2442 }
2443 }
2444
2445 static void
vdev_raidz_io_start_read_row(zio_t * zio,raidz_row_t * rr,boolean_t forceparity)2446 vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
2447 {
2448 vdev_t *vd = zio->io_vd;
2449
2450 /*
2451 * Iterate over the columns in reverse order so that we hit the parity
2452 * last -- any errors along the way will force us to read the parity.
2453 */
2454 for (int c = rr->rr_cols - 1; c >= 0; c--) {
2455 raidz_col_t *rc = &rr->rr_col[c];
2456 if (rc->rc_size == 0)
2457 continue;
2458 vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2459 if (!vdev_readable(cvd)) {
2460 if (c >= rr->rr_firstdatacol)
2461 rr->rr_missingdata++;
2462 else
2463 rr->rr_missingparity++;
2464 rc->rc_error = SET_ERROR(ENXIO);
2465 rc->rc_tried = 1; /* don't even try */
2466 rc->rc_skipped = 1;
2467 continue;
2468 }
2469 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2470 if (c >= rr->rr_firstdatacol)
2471 rr->rr_missingdata++;
2472 else
2473 rr->rr_missingparity++;
2474 rc->rc_error = SET_ERROR(ESTALE);
2475 rc->rc_skipped = 1;
2476 continue;
2477 }
2478 if (forceparity ||
2479 c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
2480 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
2481 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2482 rc->rc_offset, rc->rc_abd, rc->rc_size,
2483 zio->io_type, zio->io_priority, 0,
2484 vdev_raidz_child_done, rc));
2485 }
2486 }
2487 }
2488
2489 static void
vdev_raidz_io_start_read_phys_cols(zio_t * zio,raidz_map_t * rm)2490 vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm)
2491 {
2492 vdev_t *vd = zio->io_vd;
2493
2494 for (int i = 0; i < rm->rm_nphys_cols; i++) {
2495 raidz_col_t *prc = &rm->rm_phys_col[i];
2496 if (prc->rc_size == 0)
2497 continue;
2498
2499 ASSERT3U(prc->rc_devidx, ==, i);
2500 vdev_t *cvd = vd->vdev_child[i];
2501 if (!vdev_readable(cvd)) {
2502 prc->rc_error = SET_ERROR(ENXIO);
2503 prc->rc_tried = 1; /* don't even try */
2504 prc->rc_skipped = 1;
2505 continue;
2506 }
2507 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2508 prc->rc_error = SET_ERROR(ESTALE);
2509 prc->rc_skipped = 1;
2510 continue;
2511 }
2512 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2513 prc->rc_offset, prc->rc_abd, prc->rc_size,
2514 zio->io_type, zio->io_priority, 0,
2515 vdev_raidz_child_done, prc));
2516 }
2517 }
2518
2519 static void
vdev_raidz_io_start_read(zio_t * zio,raidz_map_t * rm)2520 vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm)
2521 {
2522 /*
2523 * If there are multiple rows, we will be hitting
2524 * all disks, so go ahead and read the parity so
2525 * that we are reading in decent size chunks.
2526 */
2527 boolean_t forceparity = rm->rm_nrows > 1;
2528
2529 if (rm->rm_phys_col) {
2530 vdev_raidz_io_start_read_phys_cols(zio, rm);
2531 } else {
2532 for (int i = 0; i < rm->rm_nrows; i++) {
2533 raidz_row_t *rr = rm->rm_row[i];
2534 vdev_raidz_io_start_read_row(zio, rr, forceparity);
2535 }
2536 }
2537 }
2538
2539 /*
2540 * Start an IO operation on a RAIDZ VDev
2541 *
2542 * Outline:
2543 * - For write operations:
2544 * 1. Generate the parity data
2545 * 2. Create child zio write operations to each column's vdev, for both
2546 * data and parity.
2547 * 3. If the column skips any sectors for padding, create optional dummy
2548 * write zio children for those areas to improve aggregation continuity.
2549 * - For read operations:
2550 * 1. Create child zio read operations to each data column's vdev to read
2551 * the range of data required for zio.
2552 * 2. If this is a scrub or resilver operation, or if any of the data
2553 * vdevs have had errors, then create zio read operations to the parity
2554 * columns' VDevs as well.
2555 */
2556 static void
vdev_raidz_io_start(zio_t * zio)2557 vdev_raidz_io_start(zio_t *zio)
2558 {
2559 vdev_t *vd = zio->io_vd;
2560 vdev_t *tvd = vd->vdev_top;
2561 vdev_raidz_t *vdrz = vd->vdev_tsd;
2562 raidz_map_t *rm;
2563
2564 uint64_t logical_width = vdev_raidz_get_logical_width(vdrz,
2565 BP_GET_PHYSICAL_BIRTH(zio->io_bp));
2566 if (logical_width != vdrz->vd_physical_width) {
2567 zfs_locked_range_t *lr = NULL;
2568 uint64_t synced_offset = UINT64_MAX;
2569 uint64_t next_offset = UINT64_MAX;
2570 boolean_t use_scratch = B_FALSE;
2571 /*
2572 * Note: when the expansion is completing, we set
2573 * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync())
2574 * in a later txg than when we last update spa_ubsync's state
2575 * (see the end of spa_raidz_expand_thread()). Therefore we
2576 * may see vre_state!=SCANNING before
2577 * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected
2578 * on disk, but the copying progress has been synced to disk
2579 * (and reflected in spa_ubsync). In this case it's fine to
2580 * treat the expansion as completed, since if we crash there's
2581 * no additional copying to do.
2582 */
2583 if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
2584 ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==,
2585 &vdrz->vn_vre);
2586 lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock,
2587 zio->io_offset, zio->io_size, RL_READER);
2588 use_scratch =
2589 (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) ==
2590 RRSS_SCRATCH_VALID);
2591 synced_offset =
2592 RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync);
2593 next_offset = vdrz->vn_vre.vre_offset;
2594 /*
2595 * If we haven't resumed expanding since importing the
2596 * pool, vre_offset won't have been set yet. In
2597 * this case the next offset to be copied is the same
2598 * as what was synced.
2599 */
2600 if (next_offset == UINT64_MAX) {
2601 next_offset = synced_offset;
2602 }
2603 }
2604 if (use_scratch) {
2605 zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced="
2606 "%lld next_offset=%lld use_scratch=%u",
2607 zio,
2608 zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ",
2609 (long long)zio->io_offset,
2610 (long long)synced_offset,
2611 (long long)next_offset,
2612 use_scratch);
2613 }
2614
2615 rm = vdev_raidz_map_alloc_expanded(zio,
2616 tvd->vdev_ashift, vdrz->vd_physical_width,
2617 logical_width, vdrz->vd_nparity,
2618 synced_offset, next_offset, use_scratch);
2619 rm->rm_lr = lr;
2620 } else {
2621 rm = vdev_raidz_map_alloc(zio,
2622 tvd->vdev_ashift, logical_width, vdrz->vd_nparity);
2623 }
2624 rm->rm_original_width = vdrz->vd_original_width;
2625
2626 zio->io_vsd = rm;
2627 zio->io_vsd_ops = &vdev_raidz_vsd_ops;
2628 if (zio->io_type == ZIO_TYPE_WRITE) {
2629 for (int i = 0; i < rm->rm_nrows; i++) {
2630 vdev_raidz_io_start_write(zio, rm->rm_row[i]);
2631 }
2632
2633 if (logical_width == vdrz->vd_physical_width) {
2634 raidz_start_skip_writes(zio);
2635 }
2636 } else {
2637 ASSERT(zio->io_type == ZIO_TYPE_READ);
2638 vdev_raidz_io_start_read(zio, rm);
2639 }
2640
2641 zio_execute(zio);
2642 }
2643
2644 /*
2645 * Report a checksum error for a child of a RAID-Z device.
2646 */
2647 void
vdev_raidz_checksum_error(zio_t * zio,raidz_col_t * rc,abd_t * bad_data)2648 vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
2649 {
2650 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
2651
2652 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) &&
2653 zio->io_priority != ZIO_PRIORITY_REBUILD) {
2654 zio_bad_cksum_t zbc;
2655 raidz_map_t *rm = zio->io_vsd;
2656
2657 zbc.zbc_has_cksum = 0;
2658 zbc.zbc_injected = rm->rm_ecksuminjected;
2659
2660 mutex_enter(&vd->vdev_stat_lock);
2661 vd->vdev_stat.vs_checksum_errors++;
2662 mutex_exit(&vd->vdev_stat_lock);
2663 (void) zfs_ereport_post_checksum(zio->io_spa, vd,
2664 &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
2665 rc->rc_abd, bad_data, &zbc);
2666 }
2667 }
2668
2669 /*
2670 * We keep track of whether or not there were any injected errors, so that
2671 * any ereports we generate can note it.
2672 */
2673 static int
raidz_checksum_verify(zio_t * zio)2674 raidz_checksum_verify(zio_t *zio)
2675 {
2676 zio_bad_cksum_t zbc = {0};
2677 raidz_map_t *rm = zio->io_vsd;
2678
2679 int ret = zio_checksum_error(zio, &zbc);
2680 /*
2681 * Any Direct I/O read that has a checksum error must be treated as
2682 * suspicious as the contents of the buffer could be getting
2683 * manipulated while the I/O is taking place. The checksum verify error
2684 * will be reported to the top-level RAIDZ VDEV.
2685 */
2686 if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) {
2687 zio->io_error = ret;
2688 zio->io_post |= ZIO_POST_DIO_CHKSUM_ERR;
2689 zio_dio_chksum_verify_error_report(zio);
2690 zio_checksum_verified(zio);
2691 return (0);
2692 }
2693
2694 if (ret != 0 && zbc.zbc_injected != 0)
2695 rm->rm_ecksuminjected = 1;
2696
2697 return (ret);
2698 }
2699
2700 /*
2701 * Generate the parity from the data columns. If we tried and were able to
2702 * read the parity without error, verify that the generated parity matches the
2703 * data we read. If it doesn't, we fire off a checksum error. Return the
2704 * number of such failures.
2705 */
2706 static int
raidz_parity_verify(zio_t * zio,raidz_row_t * rr)2707 raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
2708 {
2709 abd_t *orig[VDEV_RAIDZ_MAXPARITY];
2710 int c, ret = 0;
2711 raidz_map_t *rm = zio->io_vsd;
2712 raidz_col_t *rc;
2713
2714 blkptr_t *bp = zio->io_bp;
2715 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
2716 (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
2717
2718 if (checksum == ZIO_CHECKSUM_NOPARITY)
2719 return (ret);
2720
2721 for (c = 0; c < rr->rr_firstdatacol; c++) {
2722 rc = &rr->rr_col[c];
2723 if (!rc->rc_tried || rc->rc_error != 0)
2724 continue;
2725
2726 orig[c] = rc->rc_abd;
2727 ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size);
2728 rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
2729 }
2730
2731 /*
2732 * Verify any empty sectors are zero filled to ensure the parity
2733 * is calculated correctly even if these non-data sectors are damaged.
2734 */
2735 if (rr->rr_nempty && rr->rr_abd_empty != NULL)
2736 ret += vdev_draid_map_verify_empty(zio, rr);
2737
2738 /*
2739 * Regenerates parity even for !tried||rc_error!=0 columns. This
2740 * isn't harmful but it does have the side effect of fixing stuff
2741 * we didn't realize was necessary (i.e. even if we return 0).
2742 */
2743 vdev_raidz_generate_parity_row(rm, rr);
2744
2745 for (c = 0; c < rr->rr_firstdatacol; c++) {
2746 rc = &rr->rr_col[c];
2747
2748 if (!rc->rc_tried || rc->rc_error != 0)
2749 continue;
2750
2751 if (abd_cmp(orig[c], rc->rc_abd) != 0) {
2752 zfs_dbgmsg("found error on col=%u devidx=%u off %llx",
2753 c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset);
2754 vdev_raidz_checksum_error(zio, rc, orig[c]);
2755 rc->rc_error = SET_ERROR(ECKSUM);
2756 ret++;
2757 }
2758 abd_free(orig[c]);
2759 }
2760
2761 return (ret);
2762 }
2763
2764 static int
vdev_raidz_worst_error(raidz_row_t * rr)2765 vdev_raidz_worst_error(raidz_row_t *rr)
2766 {
2767 int error = 0;
2768
2769 for (int c = 0; c < rr->rr_cols; c++) {
2770 error = zio_worst_error(error, rr->rr_col[c].rc_error);
2771 error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error);
2772 }
2773
2774 return (error);
2775 }
2776
2777 static void
vdev_raidz_io_done_verified(zio_t * zio,raidz_row_t * rr)2778 vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
2779 {
2780 int unexpected_errors = 0;
2781 int parity_errors = 0;
2782 int parity_untried = 0;
2783 int data_errors = 0;
2784
2785 ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
2786
2787 for (int c = 0; c < rr->rr_cols; c++) {
2788 raidz_col_t *rc = &rr->rr_col[c];
2789
2790 if (rc->rc_error) {
2791 if (c < rr->rr_firstdatacol)
2792 parity_errors++;
2793 else
2794 data_errors++;
2795
2796 if (!rc->rc_skipped)
2797 unexpected_errors++;
2798 } else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
2799 parity_untried++;
2800 }
2801
2802 if (rc->rc_force_repair)
2803 unexpected_errors++;
2804 }
2805
2806 /*
2807 * If we read more parity disks than were used for
2808 * reconstruction, confirm that the other parity disks produced
2809 * correct data.
2810 *
2811 * Note that we also regenerate parity when resilvering so we
2812 * can write it out to failed devices later.
2813 */
2814 if (parity_errors + parity_untried <
2815 rr->rr_firstdatacol - data_errors ||
2816 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2817 int n = raidz_parity_verify(zio, rr);
2818 unexpected_errors += n;
2819 }
2820
2821 if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2822 (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2823 /*
2824 * Use the good data we have in hand to repair damaged children.
2825 */
2826 for (int c = 0; c < rr->rr_cols; c++) {
2827 raidz_col_t *rc = &rr->rr_col[c];
2828 vdev_t *vd = zio->io_vd;
2829 vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2830
2831 if (!rc->rc_allow_repair) {
2832 continue;
2833 } else if (!rc->rc_force_repair &&
2834 (rc->rc_error == 0 || rc->rc_size == 0)) {
2835 continue;
2836 }
2837 /*
2838 * We do not allow self healing for Direct I/O reads.
2839 * See comment in vdev_raid_row_alloc().
2840 */
2841 ASSERT0(zio->io_flags & ZIO_FLAG_DIO_READ);
2842
2843 zfs_dbgmsg("zio=%px repairing c=%u devidx=%u "
2844 "offset=%llx",
2845 zio, c, rc->rc_devidx, (long long)rc->rc_offset);
2846
2847 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2848 rc->rc_offset, rc->rc_abd, rc->rc_size,
2849 ZIO_TYPE_WRITE,
2850 zio->io_priority == ZIO_PRIORITY_REBUILD ?
2851 ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
2852 ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2853 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2854 }
2855 }
2856
2857 /*
2858 * Scrub or resilver i/o's: overwrite any shadow locations with the
2859 * good data. This ensures that if we've already copied this sector,
2860 * it will be corrected if it was damaged. This writes more than is
2861 * necessary, but since expansion is paused during scrub/resilver, at
2862 * most a single row will have a shadow location.
2863 */
2864 if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2865 (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) {
2866 for (int c = 0; c < rr->rr_cols; c++) {
2867 raidz_col_t *rc = &rr->rr_col[c];
2868 vdev_t *vd = zio->io_vd;
2869
2870 if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0)
2871 continue;
2872 vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx];
2873
2874 /*
2875 * Note: We don't want to update the repair stats
2876 * because that would incorrectly indicate that there
2877 * was bad data to repair, which we aren't sure about.
2878 * By clearing the SCAN_THREAD flag, we prevent this
2879 * from happening, despite having the REPAIR flag set.
2880 * We need to set SELF_HEAL so that this i/o can't be
2881 * bypassed by zio_vdev_io_start().
2882 */
2883 zio_t *cio = zio_vdev_child_io(zio, NULL, cvd,
2884 rc->rc_shadow_offset, rc->rc_abd, rc->rc_size,
2885 ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2886 ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
2887 NULL, NULL);
2888 cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD;
2889 zio_nowait(cio);
2890 }
2891 }
2892 }
2893
2894 static void
raidz_restore_orig_data(raidz_map_t * rm)2895 raidz_restore_orig_data(raidz_map_t *rm)
2896 {
2897 for (int i = 0; i < rm->rm_nrows; i++) {
2898 raidz_row_t *rr = rm->rm_row[i];
2899 for (int c = 0; c < rr->rr_cols; c++) {
2900 raidz_col_t *rc = &rr->rr_col[c];
2901 if (rc->rc_need_orig_restore) {
2902 abd_copy(rc->rc_abd,
2903 rc->rc_orig_data, rc->rc_size);
2904 rc->rc_need_orig_restore = B_FALSE;
2905 }
2906 }
2907 }
2908 }
2909
2910 /*
2911 * During raidz_reconstruct() for expanded VDEV, we need special consideration
2912 * failure simulations. See note in raidz_reconstruct() on simulating failure
2913 * of a pre-expansion device.
2914 *
2915 * Treating logical child i as failed, return TRUE if the given column should
2916 * be treated as failed. The idea of logical children allows us to imagine
2917 * that a disk silently failed before a RAIDZ expansion (reads from this disk
2918 * succeed but return the wrong data). Since the expansion doesn't verify
2919 * checksums, the incorrect data will be moved to new locations spread among
2920 * the children (going diagonally across them).
2921 *
2922 * Higher "logical child failures" (values of `i`) indicate these
2923 * "pre-expansion failures". The first physical_width values imagine that a
2924 * current child failed; the next physical_width-1 values imagine that a
2925 * child failed before the most recent expansion; the next physical_width-2
2926 * values imagine a child failed in the expansion before that, etc.
2927 */
2928 static boolean_t
raidz_simulate_failure(int physical_width,int original_width,int ashift,int i,raidz_col_t * rc)2929 raidz_simulate_failure(int physical_width, int original_width, int ashift,
2930 int i, raidz_col_t *rc)
2931 {
2932 uint64_t sector_id =
2933 physical_width * (rc->rc_offset >> ashift) +
2934 rc->rc_devidx;
2935
2936 for (int w = physical_width; w >= original_width; w--) {
2937 if (i < w) {
2938 return (sector_id % w == i);
2939 } else {
2940 i -= w;
2941 }
2942 }
2943 ASSERT(!"invalid logical child id");
2944 return (B_FALSE);
2945 }
2946
2947 /*
2948 * returns EINVAL if reconstruction of the block will not be possible
2949 * returns ECKSUM if this specific reconstruction failed
2950 * returns 0 on successful reconstruction
2951 */
2952 static int
raidz_reconstruct(zio_t * zio,int * ltgts,int ntgts,int nparity)2953 raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
2954 {
2955 raidz_map_t *rm = zio->io_vsd;
2956 int physical_width = zio->io_vd->vdev_children;
2957 int original_width = (rm->rm_original_width != 0) ?
2958 rm->rm_original_width : physical_width;
2959 int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT;
2960
2961 if (dbgmsg) {
2962 zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u "
2963 "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts);
2964 }
2965
2966 /* Reconstruct each row */
2967 for (int r = 0; r < rm->rm_nrows; r++) {
2968 raidz_row_t *rr = rm->rm_row[r];
2969 int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */
2970 int t = 0;
2971 int dead = 0;
2972 int dead_data = 0;
2973
2974 if (dbgmsg)
2975 zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r);
2976
2977 for (int c = 0; c < rr->rr_cols; c++) {
2978 raidz_col_t *rc = &rr->rr_col[c];
2979 ASSERT0(rc->rc_need_orig_restore);
2980 if (rc->rc_error != 0) {
2981 dead++;
2982 if (c >= nparity)
2983 dead_data++;
2984 continue;
2985 }
2986 if (rc->rc_size == 0)
2987 continue;
2988 for (int lt = 0; lt < ntgts; lt++) {
2989 if (raidz_simulate_failure(physical_width,
2990 original_width,
2991 zio->io_vd->vdev_top->vdev_ashift,
2992 ltgts[lt], rc)) {
2993 if (rc->rc_orig_data == NULL) {
2994 rc->rc_orig_data =
2995 abd_alloc_linear(
2996 rc->rc_size, B_TRUE);
2997 abd_copy(rc->rc_orig_data,
2998 rc->rc_abd, rc->rc_size);
2999 }
3000 rc->rc_need_orig_restore = B_TRUE;
3001
3002 dead++;
3003 if (c >= nparity)
3004 dead_data++;
3005 /*
3006 * Note: simulating failure of a
3007 * pre-expansion device can hit more
3008 * than one column, in which case we
3009 * might try to simulate more failures
3010 * than can be reconstructed, which is
3011 * also more than the size of my_tgts.
3012 * This check prevents accessing past
3013 * the end of my_tgts. The "dead >
3014 * nparity" check below will fail this
3015 * reconstruction attempt.
3016 */
3017 if (t < VDEV_RAIDZ_MAXPARITY) {
3018 my_tgts[t++] = c;
3019 if (dbgmsg) {
3020 zfs_dbgmsg("simulating "
3021 "failure of col %u "
3022 "devidx %u", c,
3023 (int)rc->rc_devidx);
3024 }
3025 }
3026 break;
3027 }
3028 }
3029 }
3030 if (dead > nparity) {
3031 /* reconstruction not possible */
3032 if (dbgmsg) {
3033 zfs_dbgmsg("reconstruction not possible; "
3034 "too many failures");
3035 }
3036 raidz_restore_orig_data(rm);
3037 return (EINVAL);
3038 }
3039 if (dead_data > 0)
3040 vdev_raidz_reconstruct_row(rm, rr, my_tgts, t);
3041 }
3042
3043 /* Check for success */
3044 if (raidz_checksum_verify(zio) == 0) {
3045 if (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR)
3046 return (0);
3047
3048 /* Reconstruction succeeded - report errors */
3049 for (int i = 0; i < rm->rm_nrows; i++) {
3050 raidz_row_t *rr = rm->rm_row[i];
3051
3052 for (int c = 0; c < rr->rr_cols; c++) {
3053 raidz_col_t *rc = &rr->rr_col[c];
3054 if (rc->rc_need_orig_restore) {
3055 /*
3056 * Note: if this is a parity column,
3057 * we don't really know if it's wrong.
3058 * We need to let
3059 * vdev_raidz_io_done_verified() check
3060 * it, and if we set rc_error, it will
3061 * think that it is a "known" error
3062 * that doesn't need to be checked
3063 * or corrected.
3064 */
3065 if (rc->rc_error == 0 &&
3066 c >= rr->rr_firstdatacol) {
3067 vdev_raidz_checksum_error(zio,
3068 rc, rc->rc_orig_data);
3069 rc->rc_error =
3070 SET_ERROR(ECKSUM);
3071 }
3072 rc->rc_need_orig_restore = B_FALSE;
3073 }
3074 }
3075
3076 vdev_raidz_io_done_verified(zio, rr);
3077 }
3078
3079 zio_checksum_verified(zio);
3080
3081 if (dbgmsg) {
3082 zfs_dbgmsg("reconstruction successful "
3083 "(checksum verified)");
3084 }
3085 return (0);
3086 }
3087
3088 /* Reconstruction failed - restore original data */
3089 raidz_restore_orig_data(rm);
3090 if (dbgmsg) {
3091 zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum "
3092 "failed", zio);
3093 }
3094 return (ECKSUM);
3095 }
3096
3097 /*
3098 * Iterate over all combinations of N bad vdevs and attempt a reconstruction.
3099 * Note that the algorithm below is non-optimal because it doesn't take into
3100 * account how reconstruction is actually performed. For example, with
3101 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
3102 * is targeted as invalid as if columns 1 and 4 are targeted since in both
3103 * cases we'd only use parity information in column 0.
3104 *
3105 * The order that we find the various possible combinations of failed
3106 * disks is dictated by these rules:
3107 * - Examine each "slot" (the "i" in tgts[i])
3108 * - Try to increment this slot (tgts[i] += 1)
3109 * - if we can't increment because it runs into the next slot,
3110 * reset our slot to the minimum, and examine the next slot
3111 *
3112 * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
3113 * 3 columns to reconstruct), we will generate the following sequence:
3114 *
3115 * STATE ACTION
3116 * 0 1 2 special case: skip since these are all parity
3117 * 0 1 3 first slot: reset to 0; middle slot: increment to 2
3118 * 0 2 3 first slot: increment to 1
3119 * 1 2 3 first: reset to 0; middle: reset to 1; last: increment to 4
3120 * 0 1 4 first: reset to 0; middle: increment to 2
3121 * 0 2 4 first: increment to 1
3122 * 1 2 4 first: reset to 0; middle: increment to 3
3123 * 0 3 4 first: increment to 1
3124 * 1 3 4 first: increment to 2
3125 * 2 3 4 first: reset to 0; middle: reset to 1; last: increment to 5
3126 * 0 1 5 first: reset to 0; middle: increment to 2
3127 * 0 2 5 first: increment to 1
3128 * 1 2 5 first: reset to 0; middle: increment to 3
3129 * 0 3 5 first: increment to 1
3130 * 1 3 5 first: increment to 2
3131 * 2 3 5 first: reset to 0; middle: increment to 4
3132 * 0 4 5 first: increment to 1
3133 * 1 4 5 first: increment to 2
3134 * 2 4 5 first: increment to 3
3135 * 3 4 5 done
3136 *
3137 * This strategy works for dRAID but is less efficient when there are a large
3138 * number of child vdevs and therefore permutations to check. Furthermore,
3139 * since the raidz_map_t rows likely do not overlap, reconstruction would be
3140 * possible as long as there are no more than nparity data errors per row.
3141 * These additional permutations are not currently checked but could be as
3142 * a future improvement.
3143 *
3144 * Returns 0 on success, ECKSUM on failure.
3145 */
3146 static int
vdev_raidz_combrec(zio_t * zio)3147 vdev_raidz_combrec(zio_t *zio)
3148 {
3149 int nparity = vdev_get_nparity(zio->io_vd);
3150 raidz_map_t *rm = zio->io_vsd;
3151 int physical_width = zio->io_vd->vdev_children;
3152 int original_width = (rm->rm_original_width != 0) ?
3153 rm->rm_original_width : physical_width;
3154
3155 for (int i = 0; i < rm->rm_nrows; i++) {
3156 raidz_row_t *rr = rm->rm_row[i];
3157 int total_errors = 0;
3158
3159 for (int c = 0; c < rr->rr_cols; c++) {
3160 if (rr->rr_col[c].rc_error)
3161 total_errors++;
3162 }
3163
3164 if (total_errors > nparity)
3165 return (vdev_raidz_worst_error(rr));
3166 }
3167
3168 for (int num_failures = 1; num_failures <= nparity; num_failures++) {
3169 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
3170 int *ltgts = &tstore[1]; /* value is logical child ID */
3171
3172
3173 /*
3174 * Determine number of logical children, n. See comment
3175 * above raidz_simulate_failure().
3176 */
3177 int n = 0;
3178 for (int w = physical_width;
3179 w >= original_width; w--) {
3180 n += w;
3181 }
3182
3183 ASSERT3U(num_failures, <=, nparity);
3184 ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);
3185
3186 /* Handle corner cases in combrec logic */
3187 ltgts[-1] = -1;
3188 for (int i = 0; i < num_failures; i++) {
3189 ltgts[i] = i;
3190 }
3191 ltgts[num_failures] = n;
3192
3193 for (;;) {
3194 int err = raidz_reconstruct(zio, ltgts, num_failures,
3195 nparity);
3196 if (err == EINVAL) {
3197 /*
3198 * Reconstruction not possible with this #
3199 * failures; try more failures.
3200 */
3201 break;
3202 } else if (err == 0)
3203 return (0);
3204
3205 /* Compute next targets to try */
3206 for (int t = 0; ; t++) {
3207 ASSERT3U(t, <, num_failures);
3208 ltgts[t]++;
3209 if (ltgts[t] == n) {
3210 /* try more failures */
3211 ASSERT3U(t, ==, num_failures - 1);
3212 if (zfs_flags &
3213 ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
3214 zfs_dbgmsg("reconstruction "
3215 "failed for num_failures="
3216 "%u; tried all "
3217 "combinations",
3218 num_failures);
3219 }
3220 break;
3221 }
3222
3223 ASSERT3U(ltgts[t], <, n);
3224 ASSERT3U(ltgts[t], <=, ltgts[t + 1]);
3225
3226 /*
3227 * If that spot is available, we're done here.
3228 * Try the next combination.
3229 */
3230 if (ltgts[t] != ltgts[t + 1])
3231 break; // found next combination
3232
3233 /*
3234 * Otherwise, reset this tgt to the minimum,
3235 * and move on to the next tgt.
3236 */
3237 ltgts[t] = ltgts[t - 1] + 1;
3238 ASSERT3U(ltgts[t], ==, t);
3239 }
3240
3241 /* Increase the number of failures and keep trying. */
3242 if (ltgts[num_failures - 1] == n)
3243 break;
3244 }
3245 }
3246 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
3247 zfs_dbgmsg("reconstruction failed for all num_failures");
3248 return (ECKSUM);
3249 }
3250
3251 void
vdev_raidz_reconstruct(raidz_map_t * rm,const int * t,int nt)3252 vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
3253 {
3254 for (uint64_t row = 0; row < rm->rm_nrows; row++) {
3255 raidz_row_t *rr = rm->rm_row[row];
3256 vdev_raidz_reconstruct_row(rm, rr, t, nt);
3257 }
3258 }
3259
3260 /*
3261 * Complete a write IO operation on a RAIDZ VDev
3262 *
3263 * Outline:
3264 * 1. Check for errors on the child IOs.
3265 * 2. Return, setting an error code if too few child VDevs were written
3266 * to reconstruct the data later. Note that partial writes are
3267 * considered successful if they can be reconstructed at all.
3268 */
3269 static void
vdev_raidz_io_done_write_impl(zio_t * zio,raidz_row_t * rr)3270 vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
3271 {
3272 int normal_errors = 0;
3273 int shadow_errors = 0;
3274
3275 ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
3276 ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
3277 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
3278
3279 for (int c = 0; c < rr->rr_cols; c++) {
3280 raidz_col_t *rc = &rr->rr_col[c];
3281
3282 if (rc->rc_error != 0) {
3283 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
3284 normal_errors++;
3285 }
3286 if (rc->rc_shadow_error != 0) {
3287 ASSERT(rc->rc_shadow_error != ECKSUM);
3288 shadow_errors++;
3289 }
3290 }
3291
3292 /*
3293 * Treat partial writes as a success. If we couldn't write enough
3294 * columns to reconstruct the data, the I/O failed. Otherwise, good
3295 * enough. Note that in the case of a shadow write (during raidz
3296 * expansion), depending on if we crash, either the normal (old) or
3297 * shadow (new) location may become the "real" version of the block,
3298 * so both locations must have sufficient redundancy.
3299 *
3300 * Now that we support write reallocation, it would be better
3301 * to treat partial failure as real failure unless there are
3302 * no non-degraded top-level vdevs left, and not update DTLs
3303 * if we intend to reallocate.
3304 */
3305 if (normal_errors > rr->rr_firstdatacol ||
3306 shadow_errors > rr->rr_firstdatacol) {
3307 zio->io_error = zio_worst_error(zio->io_error,
3308 vdev_raidz_worst_error(rr));
3309 }
3310 }
3311
3312 static void
vdev_raidz_io_done_reconstruct_known_missing(zio_t * zio,raidz_map_t * rm,raidz_row_t * rr)3313 vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
3314 raidz_row_t *rr)
3315 {
3316 int parity_errors = 0;
3317 int parity_untried = 0;
3318 int data_errors = 0;
3319 int total_errors = 0;
3320
3321 ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
3322 ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
3323
3324 for (int c = 0; c < rr->rr_cols; c++) {
3325 raidz_col_t *rc = &rr->rr_col[c];
3326
3327 /*
3328 * If scrubbing and a replacing/sparing child vdev determined
3329 * that not all of its children have an identical copy of the
3330 * data, then clear the error so the column is treated like
3331 * any other read and force a repair to correct the damage.
3332 */
3333 if (rc->rc_error == ECKSUM) {
3334 ASSERT(zio->io_flags & ZIO_FLAG_SCRUB);
3335 vdev_raidz_checksum_error(zio, rc, rc->rc_abd);
3336 rc->rc_force_repair = 1;
3337 rc->rc_error = 0;
3338 }
3339
3340 if (rc->rc_error) {
3341 if (c < rr->rr_firstdatacol)
3342 parity_errors++;
3343 else
3344 data_errors++;
3345
3346 total_errors++;
3347 } else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
3348 parity_untried++;
3349 }
3350 }
3351
3352 /*
3353 * If there were data errors and the number of errors we saw was
3354 * correctable -- less than or equal to the number of parity disks read
3355 * -- reconstruct based on the missing data.
3356 */
3357 if (data_errors != 0 &&
3358 total_errors <= rr->rr_firstdatacol - parity_untried) {
3359 /*
3360 * We either attempt to read all the parity columns or
3361 * none of them. If we didn't try to read parity, we
3362 * wouldn't be here in the correctable case. There must
3363 * also have been fewer parity errors than parity
3364 * columns or, again, we wouldn't be in this code path.
3365 */
3366 ASSERT0(parity_untried);
3367 ASSERT(parity_errors < rr->rr_firstdatacol);
3368
3369 /*
3370 * Identify the data columns that reported an error.
3371 */
3372 int n = 0;
3373 int tgts[VDEV_RAIDZ_MAXPARITY];
3374 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
3375 raidz_col_t *rc = &rr->rr_col[c];
3376 if (rc->rc_error != 0) {
3377 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
3378 tgts[n++] = c;
3379 }
3380 }
3381
3382 ASSERT(rr->rr_firstdatacol >= n);
3383
3384 vdev_raidz_reconstruct_row(rm, rr, tgts, n);
3385 }
3386 }
3387
3388 /*
3389 * Return the number of reads issued.
3390 */
3391 static int
vdev_raidz_read_all(zio_t * zio,raidz_row_t * rr)3392 vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
3393 {
3394 vdev_t *vd = zio->io_vd;
3395 int nread = 0;
3396
3397 rr->rr_missingdata = 0;
3398 rr->rr_missingparity = 0;
3399
3400 /*
3401 * If this rows contains empty sectors which are not required
3402 * for a normal read then allocate an ABD for them now so they
3403 * may be read, verified, and any needed repairs performed.
3404 */
3405 if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL)
3406 vdev_draid_map_alloc_empty(zio, rr);
3407
3408 for (int c = 0; c < rr->rr_cols; c++) {
3409 raidz_col_t *rc = &rr->rr_col[c];
3410 if (rc->rc_tried || rc->rc_size == 0)
3411 continue;
3412
3413 zio_nowait(zio_vdev_child_io(zio, NULL,
3414 vd->vdev_child[rc->rc_devidx],
3415 rc->rc_offset, rc->rc_abd, rc->rc_size,
3416 zio->io_type, zio->io_priority, 0,
3417 vdev_raidz_child_done, rc));
3418 nread++;
3419 }
3420 return (nread);
3421 }
3422
3423 /*
3424 * We're here because either there were too many errors to even attempt
3425 * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec()
3426 * failed. In either case, there is enough bad data to prevent reconstruction.
3427 * Start checksum ereports for all children which haven't failed.
3428 */
3429 static void
vdev_raidz_io_done_unrecoverable(zio_t * zio)3430 vdev_raidz_io_done_unrecoverable(zio_t *zio)
3431 {
3432 raidz_map_t *rm = zio->io_vsd;
3433
3434 for (int i = 0; i < rm->rm_nrows; i++) {
3435 raidz_row_t *rr = rm->rm_row[i];
3436
3437 for (int c = 0; c < rr->rr_cols; c++) {
3438 raidz_col_t *rc = &rr->rr_col[c];
3439 vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
3440
3441 if (rc->rc_error != 0)
3442 continue;
3443
3444 zio_bad_cksum_t zbc;
3445 zbc.zbc_has_cksum = 0;
3446 zbc.zbc_injected = rm->rm_ecksuminjected;
3447 mutex_enter(&cvd->vdev_stat_lock);
3448 cvd->vdev_stat.vs_checksum_errors++;
3449 mutex_exit(&cvd->vdev_stat_lock);
3450 (void) zfs_ereport_start_checksum(zio->io_spa,
3451 cvd, &zio->io_bookmark, zio, rc->rc_offset,
3452 rc->rc_size, &zbc);
3453 }
3454 }
3455 }
3456
3457 void
vdev_raidz_io_done(zio_t * zio)3458 vdev_raidz_io_done(zio_t *zio)
3459 {
3460 raidz_map_t *rm = zio->io_vsd;
3461
3462 ASSERT(zio->io_bp != NULL);
3463 if (zio->io_type == ZIO_TYPE_WRITE) {
3464 for (int i = 0; i < rm->rm_nrows; i++) {
3465 vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
3466 }
3467 } else {
3468 if (rm->rm_phys_col) {
3469 /*
3470 * This is an aggregated read. Copy the data and status
3471 * from the aggregate abd's to the individual rows.
3472 */
3473 for (int i = 0; i < rm->rm_nrows; i++) {
3474 raidz_row_t *rr = rm->rm_row[i];
3475
3476 for (int c = 0; c < rr->rr_cols; c++) {
3477 raidz_col_t *rc = &rr->rr_col[c];
3478 if (rc->rc_tried || rc->rc_size == 0)
3479 continue;
3480
3481 raidz_col_t *prc =
3482 &rm->rm_phys_col[rc->rc_devidx];
3483 rc->rc_error = prc->rc_error;
3484 rc->rc_tried = prc->rc_tried;
3485 rc->rc_skipped = prc->rc_skipped;
3486 if (c >= rr->rr_firstdatacol) {
3487 /*
3488 * Note: this is slightly faster
3489 * than using abd_copy_off().
3490 */
3491 char *physbuf = abd_to_buf(
3492 prc->rc_abd);
3493 void *physloc = physbuf +
3494 rc->rc_offset -
3495 prc->rc_offset;
3496
3497 abd_copy_from_buf(rc->rc_abd,
3498 physloc, rc->rc_size);
3499 }
3500 }
3501 }
3502 }
3503
3504 for (int i = 0; i < rm->rm_nrows; i++) {
3505 raidz_row_t *rr = rm->rm_row[i];
3506 vdev_raidz_io_done_reconstruct_known_missing(zio,
3507 rm, rr);
3508 }
3509
3510 if (raidz_checksum_verify(zio) == 0) {
3511 if (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR)
3512 goto done;
3513
3514 for (int i = 0; i < rm->rm_nrows; i++) {
3515 raidz_row_t *rr = rm->rm_row[i];
3516 vdev_raidz_io_done_verified(zio, rr);
3517 }
3518 zio_checksum_verified(zio);
3519 } else {
3520 /*
3521 * A sequential resilver has no checksum which makes
3522 * combinatoral reconstruction impossible. This code
3523 * path is unreachable since raidz_checksum_verify()
3524 * has no checksum to verify and must succeed.
3525 */
3526 ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD);
3527
3528 /*
3529 * This isn't a typical situation -- either we got a
3530 * read error or a child silently returned bad data.
3531 * Read every block so we can try again with as much
3532 * data and parity as we can track down. If we've
3533 * already been through once before, all children will
3534 * be marked as tried so we'll proceed to combinatorial
3535 * reconstruction.
3536 */
3537 int nread = 0;
3538 for (int i = 0; i < rm->rm_nrows; i++) {
3539 nread += vdev_raidz_read_all(zio,
3540 rm->rm_row[i]);
3541 }
3542 if (nread != 0) {
3543 /*
3544 * Normally our stage is VDEV_IO_DONE, but if
3545 * we've already called redone(), it will have
3546 * changed to VDEV_IO_START, in which case we
3547 * don't want to call redone() again.
3548 */
3549 if (zio->io_stage != ZIO_STAGE_VDEV_IO_START)
3550 zio_vdev_io_redone(zio);
3551 return;
3552 }
3553 /*
3554 * It would be too expensive to try every possible
3555 * combination of failed sectors in every row, so
3556 * instead we try every combination of failed current or
3557 * past physical disk. This means that if the incorrect
3558 * sectors were all on Nparity disks at any point in the
3559 * past, we will find the correct data. The only known
3560 * case where this is less durable than a non-expanded
3561 * RAIDZ, is if we have a silent failure during
3562 * expansion. In that case, one block could be
3563 * partially in the old format and partially in the
3564 * new format, so we'd lost some sectors from the old
3565 * format and some from the new format.
3566 *
3567 * e.g. logical_width=4 physical_width=6
3568 * the 15 (6+5+4) possible failed disks are:
3569 * width=6 child=0
3570 * width=6 child=1
3571 * width=6 child=2
3572 * width=6 child=3
3573 * width=6 child=4
3574 * width=6 child=5
3575 * width=5 child=0
3576 * width=5 child=1
3577 * width=5 child=2
3578 * width=5 child=3
3579 * width=5 child=4
3580 * width=4 child=0
3581 * width=4 child=1
3582 * width=4 child=2
3583 * width=4 child=3
3584 * And we will try every combination of Nparity of these
3585 * failing.
3586 *
3587 * As a first pass, we can generate every combo,
3588 * and try reconstructing, ignoring any known
3589 * failures. If any row has too many known + simulated
3590 * failures, then we bail on reconstructing with this
3591 * number of simulated failures. As an improvement,
3592 * we could detect the number of whole known failures
3593 * (i.e. we have known failures on these disks for
3594 * every row; the disks never succeeded), and
3595 * subtract that from the max # failures to simulate.
3596 * We could go even further like the current
3597 * combrec code, but that doesn't seem like it
3598 * gains us very much. If we simulate a failure
3599 * that is also a known failure, that's fine.
3600 */
3601 zio->io_error = vdev_raidz_combrec(zio);
3602 if (zio->io_error == ECKSUM &&
3603 !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
3604 vdev_raidz_io_done_unrecoverable(zio);
3605 }
3606 }
3607 }
3608 done:
3609 if (rm->rm_lr != NULL) {
3610 zfs_rangelock_exit(rm->rm_lr);
3611 rm->rm_lr = NULL;
3612 }
3613 }
3614
3615 static void
vdev_raidz_state_change(vdev_t * vd,int faulted,int degraded)3616 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
3617 {
3618 vdev_raidz_t *vdrz = vd->vdev_tsd;
3619 if (faulted > vdrz->vd_nparity)
3620 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3621 VDEV_AUX_NO_REPLICAS);
3622 else if (degraded + faulted != 0)
3623 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
3624 else
3625 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
3626 }
3627
3628 /*
3629 * Determine if any portion of the provided block resides on a child vdev
3630 * with a dirty DTL and therefore needs to be resilvered. The function
3631 * assumes that at least one DTL is dirty which implies that full stripe
3632 * width blocks must be resilvered.
3633 */
3634 static boolean_t
vdev_raidz_need_resilver(vdev_t * vd,const dva_t * dva,size_t psize,uint64_t phys_birth)3635 vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
3636 uint64_t phys_birth)
3637 {
3638 vdev_raidz_t *vdrz = vd->vdev_tsd;
3639
3640 /*
3641 * If we're in the middle of a RAIDZ expansion, this block may be in
3642 * the old and/or new location. For simplicity, always resilver it.
3643 */
3644 if (vdrz->vn_vre.vre_state == DSS_SCANNING)
3645 return (B_TRUE);
3646
3647 uint64_t dcols = vd->vdev_children;
3648 uint64_t nparity = vdrz->vd_nparity;
3649 uint64_t ashift = vd->vdev_top->vdev_ashift;
3650 /* The starting RAIDZ (parent) vdev sector of the block. */
3651 uint64_t b = DVA_GET_OFFSET(dva) >> ashift;
3652 /* The zio's size in units of the vdev's minimum sector size. */
3653 uint64_t s = ((psize - 1) >> ashift) + 1;
3654 /* The first column for this stripe. */
3655 uint64_t f = b % dcols;
3656
3657 /* Unreachable by sequential resilver. */
3658 ASSERT3U(phys_birth, !=, TXG_UNKNOWN);
3659
3660 if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
3661 return (B_FALSE);
3662
3663 if (s + nparity >= dcols)
3664 return (B_TRUE);
3665
3666 for (uint64_t c = 0; c < s + nparity; c++) {
3667 uint64_t devidx = (f + c) % dcols;
3668 vdev_t *cvd = vd->vdev_child[devidx];
3669
3670 /*
3671 * dsl_scan_need_resilver() already checked vd with
3672 * vdev_dtl_contains(). So here just check cvd with
3673 * vdev_dtl_empty(), cheaper and a good approximation.
3674 */
3675 if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
3676 return (B_TRUE);
3677 }
3678
3679 return (B_FALSE);
3680 }
3681
3682 static void
vdev_raidz_xlate(vdev_t * cvd,const zfs_range_seg64_t * logical_rs,zfs_range_seg64_t * physical_rs,zfs_range_seg64_t * remain_rs)3683 vdev_raidz_xlate(vdev_t *cvd, const zfs_range_seg64_t *logical_rs,
3684 zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs)
3685 {
3686 (void) remain_rs;
3687
3688 vdev_t *raidvd = cvd->vdev_parent;
3689 ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
3690
3691 vdev_raidz_t *vdrz = raidvd->vdev_tsd;
3692
3693 if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
3694 /*
3695 * We're in the middle of expansion, in which case the
3696 * translation is in flux. Any answer we give may be wrong
3697 * by the time we return, so it isn't safe for the caller to
3698 * act on it. Therefore we say that this range isn't present
3699 * on any children. The only consumers of this are "zpool
3700 * initialize" and trimming, both of which are "best effort"
3701 * anyway.
3702 */
3703 physical_rs->rs_start = physical_rs->rs_end = 0;
3704 remain_rs->rs_start = remain_rs->rs_end = 0;
3705 return;
3706 }
3707
3708 uint64_t width = vdrz->vd_physical_width;
3709 uint64_t tgt_col = cvd->vdev_id;
3710 uint64_t ashift = raidvd->vdev_top->vdev_ashift;
3711
3712 /* make sure the offsets are block-aligned */
3713 ASSERT0(logical_rs->rs_start % (1 << ashift));
3714 ASSERT0(logical_rs->rs_end % (1 << ashift));
3715 uint64_t b_start = logical_rs->rs_start >> ashift;
3716 uint64_t b_end = logical_rs->rs_end >> ashift;
3717
3718 uint64_t start_row = 0;
3719 if (b_start > tgt_col) /* avoid underflow */
3720 start_row = ((b_start - tgt_col - 1) / width) + 1;
3721
3722 uint64_t end_row = 0;
3723 if (b_end > tgt_col)
3724 end_row = ((b_end - tgt_col - 1) / width) + 1;
3725
3726 physical_rs->rs_start = start_row << ashift;
3727 physical_rs->rs_end = end_row << ashift;
3728
3729 ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start);
3730 ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
3731 logical_rs->rs_end - logical_rs->rs_start);
3732 }
3733
3734 static void
raidz_reflow_sync(void * arg,dmu_tx_t * tx)3735 raidz_reflow_sync(void *arg, dmu_tx_t *tx)
3736 {
3737 spa_t *spa = arg;
3738 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
3739 vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
3740
3741 /*
3742 * Ensure there are no i/os to the range that is being committed.
3743 */
3744 uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock);
3745 ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset);
3746
3747 mutex_enter(&vre->vre_lock);
3748 uint64_t new_offset =
3749 MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset);
3750 /*
3751 * We should not have committed anything that failed.
3752 */
3753 VERIFY3U(vre->vre_failed_offset, >=, old_offset);
3754 mutex_exit(&vre->vre_lock);
3755
3756 zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
3757 old_offset, new_offset - old_offset,
3758 RL_WRITER);
3759
3760 /*
3761 * Update the uberblock that will be written when this txg completes.
3762 */
3763 RAIDZ_REFLOW_SET(&spa->spa_uberblock,
3764 RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset);
3765 vre->vre_offset_pertxg[txgoff] = 0;
3766 zfs_rangelock_exit(lr);
3767
3768 mutex_enter(&vre->vre_lock);
3769 vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff];
3770 vre->vre_bytes_copied_pertxg[txgoff] = 0;
3771 mutex_exit(&vre->vre_lock);
3772
3773 vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
3774 VERIFY0(zap_update(spa->spa_meta_objset,
3775 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
3776 sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx));
3777 }
3778
3779 static void
raidz_reflow_complete_sync(void * arg,dmu_tx_t * tx)3780 raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx)
3781 {
3782 spa_t *spa = arg;
3783 vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
3784 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
3785 vdev_raidz_t *vdrz = raidvd->vdev_tsd;
3786
3787 for (int i = 0; i < TXG_SIZE; i++)
3788 VERIFY0(vre->vre_offset_pertxg[i]);
3789
3790 reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
3791 re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES;
3792 re->re_logical_width = vdrz->vd_physical_width;
3793 mutex_enter(&vdrz->vd_expand_lock);
3794 avl_add(&vdrz->vd_expand_txgs, re);
3795 mutex_exit(&vdrz->vd_expand_lock);
3796
3797 vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
3798
3799 /*
3800 * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS
3801 * will get written (based on vd_expand_txgs).
3802 */
3803 vdev_config_dirty(vd);
3804
3805 /*
3806 * Before we change vre_state, the on-disk state must reflect that we
3807 * have completed all copying, so that vdev_raidz_io_start() can use
3808 * vre_state to determine if the reflow is in progress. See also the
3809 * end of spa_raidz_expand_thread().
3810 */
3811 VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==,
3812 raidvd->vdev_ms_count << raidvd->vdev_ms_shift);
3813
3814 vre->vre_end_time = gethrestime_sec();
3815 vre->vre_state = DSS_FINISHED;
3816
3817 uint64_t state = vre->vre_state;
3818 VERIFY0(zap_update(spa->spa_meta_objset,
3819 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
3820 sizeof (state), 1, &state, tx));
3821
3822 uint64_t end_time = vre->vre_end_time;
3823 VERIFY0(zap_update(spa->spa_meta_objset,
3824 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
3825 sizeof (end_time), 1, &end_time, tx));
3826
3827 spa->spa_uberblock.ub_raidz_reflow_info = 0;
3828
3829 spa_history_log_internal(spa, "raidz vdev expansion completed", tx,
3830 "%s vdev %llu new width %llu", spa_name(spa),
3831 (unsigned long long)vd->vdev_id,
3832 (unsigned long long)vd->vdev_children);
3833
3834 spa->spa_raidz_expand = NULL;
3835 raidvd->vdev_rz_expanding = B_FALSE;
3836
3837 spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
3838 spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
3839 spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
3840
3841 spa_notify_waiters(spa);
3842
3843 /*
3844 * While we're in syncing context take the opportunity to
3845 * setup a scrub. All the data has been sucessfully copied
3846 * but we have not validated any checksums.
3847 */
3848 setup_sync_arg_t setup_sync_arg = {
3849 .func = POOL_SCAN_SCRUB,
3850 .txgstart = 0,
3851 .txgend = 0,
3852 };
3853 if (zfs_scrub_after_expand &&
3854 dsl_scan_setup_check(&setup_sync_arg.func, tx) == 0) {
3855 dsl_scan_setup_sync(&setup_sync_arg, tx);
3856 }
3857 }
3858
3859 /*
3860 * State of one copy batch.
3861 */
3862 typedef struct raidz_reflow_arg {
3863 vdev_raidz_expand_t *rra_vre; /* Global expantion state. */
3864 zfs_locked_range_t *rra_lr; /* Range lock of this batch. */
3865 uint64_t rra_txg; /* TXG of this batch. */
3866 uint_t rra_ashift; /* Ashift of the vdev. */
3867 uint32_t rra_tbd; /* Number of in-flight ZIOs. */
3868 uint32_t rra_writes; /* Number of write ZIOs. */
3869 zio_t *rra_zio[]; /* Write ZIO pointers. */
3870 } raidz_reflow_arg_t;
3871
3872 /*
3873 * Write of the new location on one child is done. Once all of them are done
3874 * we can unlock and free everything.
3875 */
3876 static void
raidz_reflow_write_done(zio_t * zio)3877 raidz_reflow_write_done(zio_t *zio)
3878 {
3879 raidz_reflow_arg_t *rra = zio->io_private;
3880 vdev_raidz_expand_t *vre = rra->rra_vre;
3881
3882 abd_free(zio->io_abd);
3883
3884 mutex_enter(&vre->vre_lock);
3885 if (zio->io_error != 0) {
3886 /* Force a reflow pause on errors */
3887 vre->vre_failed_offset =
3888 MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
3889 }
3890 ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size);
3891 vre->vre_outstanding_bytes -= zio->io_size;
3892 if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length <
3893 vre->vre_failed_offset) {
3894 vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] +=
3895 zio->io_size;
3896 }
3897 cv_signal(&vre->vre_cv);
3898 boolean_t done = (--rra->rra_tbd == 0);
3899 mutex_exit(&vre->vre_lock);
3900
3901 if (!done)
3902 return;
3903 spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
3904 zfs_rangelock_exit(rra->rra_lr);
3905 kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * rra->rra_writes);
3906 }
3907
3908 /*
3909 * Read of the old location on one child is done. Once all of them are done
3910 * writes should have all the data and we can issue them.
3911 */
3912 static void
raidz_reflow_read_done(zio_t * zio)3913 raidz_reflow_read_done(zio_t *zio)
3914 {
3915 raidz_reflow_arg_t *rra = zio->io_private;
3916 vdev_raidz_expand_t *vre = rra->rra_vre;
3917
3918 /* Reads of only one block use write ABDs. For bigger free gangs. */
3919 if (zio->io_size > (1 << rra->rra_ashift))
3920 abd_free(zio->io_abd);
3921
3922 /*
3923 * If the read failed, or if it was done on a vdev that is not fully
3924 * healthy (e.g. a child that has a resilver in progress), we may not
3925 * have the correct data. Note that it's OK if the write proceeds.
3926 * It may write garbage but the location is otherwise unused and we
3927 * will retry later due to vre_failed_offset.
3928 */
3929 if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) {
3930 zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu "
3931 "err=%u partial_dtl_empty=%u missing_dtl_empty=%u",
3932 (long long)rra->rra_lr->lr_offset,
3933 (long long)rra->rra_lr->lr_length,
3934 (long long)rra->rra_txg,
3935 zio->io_error,
3936 vdev_dtl_empty(zio->io_vd, DTL_PARTIAL),
3937 vdev_dtl_empty(zio->io_vd, DTL_MISSING));
3938 mutex_enter(&vre->vre_lock);
3939 /* Force a reflow pause on errors */
3940 vre->vre_failed_offset =
3941 MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
3942 mutex_exit(&vre->vre_lock);
3943 }
3944
3945 if (atomic_dec_32_nv(&rra->rra_tbd) > 0)
3946 return;
3947 uint32_t writes = rra->rra_tbd = rra->rra_writes;
3948 for (uint64_t i = 0; i < writes; i++)
3949 zio_nowait(rra->rra_zio[i]);
3950 }
3951
3952 static void
raidz_reflow_record_progress(vdev_raidz_expand_t * vre,uint64_t offset,dmu_tx_t * tx)3953 raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset,
3954 dmu_tx_t *tx)
3955 {
3956 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
3957 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
3958
3959 if (offset == 0)
3960 return;
3961
3962 mutex_enter(&vre->vre_lock);
3963 ASSERT3U(vre->vre_offset, <=, offset);
3964 vre->vre_offset = offset;
3965 mutex_exit(&vre->vre_lock);
3966
3967 if (vre->vre_offset_pertxg[txgoff] == 0) {
3968 dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync,
3969 spa, tx);
3970 }
3971 vre->vre_offset_pertxg[txgoff] = offset;
3972 }
3973
3974 static boolean_t
vdev_raidz_expand_child_replacing(vdev_t * raidz_vd)3975 vdev_raidz_expand_child_replacing(vdev_t *raidz_vd)
3976 {
3977 for (int i = 0; i < raidz_vd->vdev_children; i++) {
3978 /* Quick check if a child is being replaced */
3979 if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf)
3980 return (B_TRUE);
3981 }
3982 return (B_FALSE);
3983 }
3984
3985 static boolean_t
raidz_reflow_impl(vdev_t * vd,vdev_raidz_expand_t * vre,zfs_range_tree_t * rt,dmu_tx_t * tx)3986 raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, zfs_range_tree_t *rt,
3987 dmu_tx_t *tx)
3988 {
3989 spa_t *spa = vd->vdev_spa;
3990 uint_t ashift = vd->vdev_top->vdev_ashift;
3991
3992 zfs_range_seg_t *rs = zfs_range_tree_first(rt);
3993 if (rt == NULL)
3994 return (B_FALSE);
3995 uint64_t offset = zfs_rs_get_start(rs, rt);
3996 ASSERT(IS_P2ALIGNED(offset, 1 << ashift));
3997 uint64_t size = zfs_rs_get_end(rs, rt) - offset;
3998 ASSERT3U(size, >=, 1 << ashift);
3999 ASSERT(IS_P2ALIGNED(size, 1 << ashift));
4000
4001 uint64_t blkid = offset >> ashift;
4002 uint_t old_children = vd->vdev_children - 1;
4003
4004 /*
4005 * We can only progress to the point that writes will not overlap
4006 * with blocks whose progress has not yet been recorded on disk.
4007 * Since partially-copied rows are still read from the old location,
4008 * we need to stop one row before the sector-wise overlap, to prevent
4009 * row-wise overlap.
4010 *
4011 * Note that even if we are skipping over a large unallocated region,
4012 * we can't move the on-disk progress to `offset`, because concurrent
4013 * writes/allocations could still use the currently-unallocated
4014 * region.
4015 */
4016 uint64_t ubsync_blkid =
4017 RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift;
4018 uint64_t next_overwrite_blkid = ubsync_blkid +
4019 ubsync_blkid / old_children - old_children;
4020 VERIFY3U(next_overwrite_blkid, >, ubsync_blkid);
4021 if (blkid >= next_overwrite_blkid) {
4022 raidz_reflow_record_progress(vre,
4023 next_overwrite_blkid << ashift, tx);
4024 return (B_TRUE);
4025 }
4026
4027 size = MIN(size, raidz_expand_max_copy_bytes);
4028 size = MIN(size, (uint64_t)old_children *
4029 MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE));
4030 size = MAX(size, 1 << ashift);
4031 uint_t blocks = MIN(size >> ashift, next_overwrite_blkid - blkid);
4032 size = (uint64_t)blocks << ashift;
4033
4034 zfs_range_tree_remove(rt, offset, size);
4035
4036 uint_t reads = MIN(blocks, old_children);
4037 uint_t writes = MIN(blocks, vd->vdev_children);
4038 raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra) +
4039 sizeof (zio_t *) * writes, KM_SLEEP);
4040 rra->rra_vre = vre;
4041 rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock,
4042 offset, size, RL_WRITER);
4043 rra->rra_txg = dmu_tx_get_txg(tx);
4044 rra->rra_ashift = ashift;
4045 rra->rra_tbd = reads;
4046 rra->rra_writes = writes;
4047
4048 raidz_reflow_record_progress(vre, offset + size, tx);
4049
4050 /*
4051 * SCL_STATE will be released when the read and write are done,
4052 * by raidz_reflow_write_done().
4053 */
4054 spa_config_enter(spa, SCL_STATE, spa, RW_READER);
4055
4056 /* check if a replacing vdev was added, if so treat it as an error */
4057 if (vdev_raidz_expand_child_replacing(vd)) {
4058 zfs_dbgmsg("replacing vdev encountered, reflow paused at "
4059 "offset=%llu txg=%llu",
4060 (long long)rra->rra_lr->lr_offset,
4061 (long long)rra->rra_txg);
4062
4063 mutex_enter(&vre->vre_lock);
4064 vre->vre_failed_offset =
4065 MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
4066 cv_signal(&vre->vre_cv);
4067 mutex_exit(&vre->vre_lock);
4068
4069 /* drop everything we acquired */
4070 spa_config_exit(spa, SCL_STATE, spa);
4071 zfs_rangelock_exit(rra->rra_lr);
4072 kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * writes);
4073 return (B_TRUE);
4074 }
4075
4076 mutex_enter(&vre->vre_lock);
4077 vre->vre_outstanding_bytes += size;
4078 mutex_exit(&vre->vre_lock);
4079
4080 /* Allocate ABD and ZIO for each child we write. */
4081 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4082 zio_t *pio = spa->spa_txg_zio[txgoff];
4083 uint_t b = blocks / vd->vdev_children;
4084 uint_t bb = blocks % vd->vdev_children;
4085 for (uint_t i = 0; i < writes; i++) {
4086 uint_t n = b + (i < bb);
4087 abd_t *abd = abd_alloc_for_io(n << ashift, B_FALSE);
4088 rra->rra_zio[i] = zio_vdev_child_io(pio, NULL,
4089 vd->vdev_child[(blkid + i) % vd->vdev_children],
4090 ((blkid + i) / vd->vdev_children) << ashift,
4091 abd, n << ashift, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
4092 ZIO_FLAG_CANFAIL, raidz_reflow_write_done, rra);
4093 }
4094
4095 /*
4096 * Allocate and issue ZIO for each child we read. For reads of only
4097 * one block we can use respective writer ABDs, since they will also
4098 * have only one block. For bigger reads create gang ABDs and fill
4099 * them with respective blocks from writer ABDs.
4100 */
4101 b = blocks / old_children;
4102 bb = blocks % old_children;
4103 for (uint_t i = 0; i < reads; i++) {
4104 uint_t n = b + (i < bb);
4105 abd_t *abd;
4106 if (n > 1) {
4107 abd = abd_alloc_gang();
4108 for (uint_t j = 0; j < n; j++) {
4109 uint_t b = j * old_children + i;
4110 abd_t *cabd = abd_get_offset_size(
4111 rra->rra_zio[b % vd->vdev_children]->io_abd,
4112 (b / vd->vdev_children) << ashift,
4113 1 << ashift);
4114 abd_gang_add(abd, cabd, B_TRUE);
4115 }
4116 } else {
4117 abd = rra->rra_zio[i]->io_abd;
4118 }
4119 zio_nowait(zio_vdev_child_io(pio, NULL,
4120 vd->vdev_child[(blkid + i) % old_children],
4121 ((blkid + i) / old_children) << ashift, abd,
4122 n << ashift, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
4123 ZIO_FLAG_CANFAIL, raidz_reflow_read_done, rra));
4124 }
4125
4126 return (B_FALSE);
4127 }
4128
4129 /*
4130 * For testing (ztest specific)
4131 */
4132 static void
raidz_expand_pause(uint_t pause_point)4133 raidz_expand_pause(uint_t pause_point)
4134 {
4135 while (raidz_expand_pause_point != 0 &&
4136 raidz_expand_pause_point <= pause_point)
4137 delay(hz);
4138 }
4139
4140 static void
raidz_scratch_child_done(zio_t * zio)4141 raidz_scratch_child_done(zio_t *zio)
4142 {
4143 zio_t *pio = zio->io_private;
4144
4145 mutex_enter(&pio->io_lock);
4146 pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
4147 mutex_exit(&pio->io_lock);
4148 }
4149
4150 /*
4151 * Reflow the beginning portion of the vdev into an intermediate scratch area
4152 * in memory and on disk. This operation must be persisted on disk before we
4153 * proceed to overwrite the beginning portion with the reflowed data.
4154 *
4155 * This multi-step task can fail to complete if disk errors are encountered
4156 * and we can return here after a pause (waiting for disk to become healthy).
4157 */
4158 static void
raidz_reflow_scratch_sync(void * arg,dmu_tx_t * tx)4159 raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
4160 {
4161 vdev_raidz_expand_t *vre = arg;
4162 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
4163 zio_t *pio;
4164 int error;
4165
4166 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4167 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4168 int ashift = raidvd->vdev_ashift;
4169 uint64_t write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << ashift,
4170 uint64_t);
4171 uint64_t logical_size = write_size * raidvd->vdev_children;
4172 uint64_t read_size =
4173 P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)),
4174 1 << ashift);
4175
4176 /*
4177 * The scratch space must be large enough to get us to the point
4178 * that one row does not overlap itself when moved. This is checked
4179 * by vdev_raidz_attach_check().
4180 */
4181 VERIFY3U(write_size, >=, raidvd->vdev_children << ashift);
4182 VERIFY3U(write_size, <=, VDEV_BOOT_SIZE);
4183 VERIFY3U(write_size, <=, read_size);
4184
4185 zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
4186 0, logical_size, RL_WRITER);
4187
4188 abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
4189 KM_SLEEP);
4190 for (int i = 0; i < raidvd->vdev_children; i++) {
4191 abds[i] = abd_alloc_linear(read_size, B_FALSE);
4192 }
4193
4194 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1);
4195
4196 /*
4197 * If we have already written the scratch area then we must read from
4198 * there, since new writes were redirected there while we were paused
4199 * or the original location may have been partially overwritten with
4200 * reflowed data.
4201 */
4202 if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) {
4203 VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size);
4204 /*
4205 * Read from scratch space.
4206 */
4207 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4208 for (int i = 0; i < raidvd->vdev_children; i++) {
4209 /*
4210 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE
4211 * to the offset to calculate the physical offset to
4212 * write to. Passing in a negative offset makes us
4213 * access the scratch area.
4214 */
4215 zio_nowait(zio_vdev_child_io(pio, NULL,
4216 raidvd->vdev_child[i],
4217 VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4218 write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
4219 ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
4220 }
4221 error = zio_wait(pio);
4222 if (error != 0) {
4223 zfs_dbgmsg("reflow: error %d reading scratch location",
4224 error);
4225 goto io_error_exit;
4226 }
4227 goto overwrite;
4228 }
4229
4230 /*
4231 * Read from original location.
4232 */
4233 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4234 for (int i = 0; i < raidvd->vdev_children - 1; i++) {
4235 ASSERT0(vdev_is_dead(raidvd->vdev_child[i]));
4236 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4237 0, abds[i], read_size, ZIO_TYPE_READ,
4238 ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
4239 raidz_scratch_child_done, pio));
4240 }
4241 error = zio_wait(pio);
4242 if (error != 0) {
4243 zfs_dbgmsg("reflow: error %d reading original location", error);
4244 io_error_exit:
4245 for (int i = 0; i < raidvd->vdev_children; i++)
4246 abd_free(abds[i]);
4247 kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4248 zfs_rangelock_exit(lr);
4249 spa_config_exit(spa, SCL_STATE, FTAG);
4250 return;
4251 }
4252
4253 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2);
4254
4255 /*
4256 * Reflow in memory.
4257 */
4258 uint64_t logical_sectors = logical_size >> ashift;
4259 for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) {
4260 int oldchild = i % (raidvd->vdev_children - 1);
4261 uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift;
4262
4263 int newchild = i % raidvd->vdev_children;
4264 uint64_t newoff = (i / raidvd->vdev_children) << ashift;
4265
4266 /* a single sector should not be copying over itself */
4267 ASSERT(!(newchild == oldchild && newoff == oldoff));
4268
4269 abd_copy_off(abds[newchild], abds[oldchild],
4270 newoff, oldoff, 1 << ashift);
4271 }
4272
4273 /*
4274 * Verify that we filled in everything we intended to (write_size on
4275 * each child).
4276 */
4277 VERIFY0(logical_sectors % raidvd->vdev_children);
4278 VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==,
4279 write_size);
4280
4281 /*
4282 * Write to scratch location (boot area).
4283 */
4284 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4285 for (int i = 0; i < raidvd->vdev_children; i++) {
4286 /*
4287 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
4288 * the offset to calculate the physical offset to write to.
4289 * Passing in a negative offset lets us access the boot area.
4290 */
4291 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4292 VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4293 write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
4294 ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
4295 }
4296 error = zio_wait(pio);
4297 if (error != 0) {
4298 zfs_dbgmsg("reflow: error %d writing scratch location", error);
4299 goto io_error_exit;
4300 }
4301 pio = zio_root(spa, NULL, NULL, 0);
4302 zio_flush(pio, raidvd);
4303 zio_wait(pio);
4304
4305 zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area",
4306 (long long)logical_size);
4307
4308 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3);
4309
4310 /*
4311 * Update uberblock to indicate that scratch space is valid. This is
4312 * needed because after this point, the real location may be
4313 * overwritten. If we crash, we need to get the data from the
4314 * scratch space, rather than the real location.
4315 *
4316 * Note: ub_timestamp is bumped so that vdev_uberblock_compare()
4317 * will prefer this uberblock.
4318 */
4319 RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size);
4320 spa->spa_ubsync.ub_timestamp++;
4321 ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4322 &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4323 if (spa_multihost(spa))
4324 mmp_update_uberblock(spa, &spa->spa_ubsync);
4325
4326 zfs_dbgmsg("reflow: uberblock updated "
4327 "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)",
4328 (long long)spa->spa_ubsync.ub_txg,
4329 (long long)logical_size,
4330 (long long)spa->spa_ubsync.ub_timestamp);
4331
4332 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID);
4333
4334 /*
4335 * Overwrite with reflow'ed data.
4336 */
4337 overwrite:
4338 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4339 for (int i = 0; i < raidvd->vdev_children; i++) {
4340 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4341 0, abds[i], write_size, ZIO_TYPE_WRITE,
4342 ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
4343 raidz_scratch_child_done, pio));
4344 }
4345 error = zio_wait(pio);
4346 if (error != 0) {
4347 /*
4348 * When we exit early here and drop the range lock, new
4349 * writes will go into the scratch area so we'll need to
4350 * read from there when we return after pausing.
4351 */
4352 zfs_dbgmsg("reflow: error %d writing real location", error);
4353 /*
4354 * Update the uberblock that is written when this txg completes.
4355 */
4356 RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID,
4357 logical_size);
4358 goto io_error_exit;
4359 }
4360 pio = zio_root(spa, NULL, NULL, 0);
4361 zio_flush(pio, raidvd);
4362 zio_wait(pio);
4363
4364 zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location",
4365 (long long)logical_size);
4366 for (int i = 0; i < raidvd->vdev_children; i++)
4367 abd_free(abds[i]);
4368 kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4369
4370 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED);
4371
4372 /*
4373 * Update uberblock to indicate that the initial part has been
4374 * reflow'ed. This is needed because after this point (when we exit
4375 * the rangelock), we allow regular writes to this region, which will
4376 * be written to the new location only (because reflow_offset_next ==
4377 * reflow_offset_synced). If we crashed and re-copied from the
4378 * scratch space, we would lose the regular writes.
4379 */
4380 RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED,
4381 logical_size);
4382 spa->spa_ubsync.ub_timestamp++;
4383 ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4384 &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4385 if (spa_multihost(spa))
4386 mmp_update_uberblock(spa, &spa->spa_ubsync);
4387
4388 zfs_dbgmsg("reflow: uberblock updated "
4389 "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
4390 (long long)spa->spa_ubsync.ub_txg,
4391 (long long)logical_size,
4392 (long long)spa->spa_ubsync.ub_timestamp);
4393
4394 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1);
4395
4396 /*
4397 * Update progress.
4398 */
4399 vre->vre_offset = logical_size;
4400 zfs_rangelock_exit(lr);
4401 spa_config_exit(spa, SCL_STATE, FTAG);
4402
4403 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4404 vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
4405 vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
4406 /*
4407 * Note - raidz_reflow_sync() will update the uberblock state to
4408 * RRSS_SCRATCH_INVALID_SYNCED_REFLOW
4409 */
4410 raidz_reflow_sync(spa, tx);
4411
4412 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2);
4413 }
4414
4415 /*
4416 * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work
4417 * here. No other i/o can be in progress, so we don't need the vre_rangelock.
4418 */
4419 void
vdev_raidz_reflow_copy_scratch(spa_t * spa)4420 vdev_raidz_reflow_copy_scratch(spa_t *spa)
4421 {
4422 vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4423 uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock);
4424 ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID);
4425
4426 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4427 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4428 ASSERT0(logical_size % raidvd->vdev_children);
4429 uint64_t write_size = logical_size / raidvd->vdev_children;
4430
4431 zio_t *pio;
4432
4433 /*
4434 * Read from scratch space.
4435 */
4436 abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
4437 KM_SLEEP);
4438 for (int i = 0; i < raidvd->vdev_children; i++) {
4439 abds[i] = abd_alloc_linear(write_size, B_FALSE);
4440 }
4441
4442 pio = zio_root(spa, NULL, NULL, 0);
4443 for (int i = 0; i < raidvd->vdev_children; i++) {
4444 /*
4445 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
4446 * the offset to calculate the physical offset to write to.
4447 * Passing in a negative offset lets us access the boot area.
4448 */
4449 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4450 VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4451 write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 0,
4452 raidz_scratch_child_done, pio));
4453 }
4454 zio_wait(pio);
4455
4456 /*
4457 * Overwrite real location with reflow'ed data.
4458 */
4459 pio = zio_root(spa, NULL, NULL, 0);
4460 for (int i = 0; i < raidvd->vdev_children; i++) {
4461 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4462 0, abds[i], write_size, ZIO_TYPE_WRITE,
4463 ZIO_PRIORITY_REMOVAL, 0,
4464 raidz_scratch_child_done, pio));
4465 }
4466 zio_wait(pio);
4467 pio = zio_root(spa, NULL, NULL, 0);
4468 zio_flush(pio, raidvd);
4469 zio_wait(pio);
4470
4471 zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) "
4472 "to real location", (long long)logical_size);
4473
4474 for (int i = 0; i < raidvd->vdev_children; i++)
4475 abd_free(abds[i]);
4476 kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4477
4478 /*
4479 * Update uberblock.
4480 */
4481 RAIDZ_REFLOW_SET(&spa->spa_ubsync,
4482 RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size);
4483 spa->spa_ubsync.ub_timestamp++;
4484 VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4485 &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4486 if (spa_multihost(spa))
4487 mmp_update_uberblock(spa, &spa->spa_ubsync);
4488
4489 zfs_dbgmsg("reflow recovery: uberblock updated "
4490 "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
4491 (long long)spa->spa_ubsync.ub_txg,
4492 (long long)logical_size,
4493 (long long)spa->spa_ubsync.ub_timestamp);
4494
4495 dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool,
4496 spa_first_txg(spa));
4497 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4498 vre->vre_offset = logical_size;
4499 vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
4500 vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
4501 /*
4502 * Note that raidz_reflow_sync() will update the uberblock once more
4503 */
4504 raidz_reflow_sync(spa, tx);
4505
4506 dmu_tx_commit(tx);
4507
4508 spa_config_exit(spa, SCL_STATE, FTAG);
4509 }
4510
4511 static boolean_t
spa_raidz_expand_thread_check(void * arg,zthr_t * zthr)4512 spa_raidz_expand_thread_check(void *arg, zthr_t *zthr)
4513 {
4514 (void) zthr;
4515 spa_t *spa = arg;
4516
4517 return (spa->spa_raidz_expand != NULL &&
4518 !spa->spa_raidz_expand->vre_waiting_for_resilver);
4519 }
4520
4521 /*
4522 * RAIDZ expansion background thread
4523 *
4524 * Can be called multiple times if the reflow is paused
4525 */
4526 static void
spa_raidz_expand_thread(void * arg,zthr_t * zthr)4527 spa_raidz_expand_thread(void *arg, zthr_t *zthr)
4528 {
4529 spa_t *spa = arg;
4530 vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4531
4532 if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID)
4533 vre->vre_offset = 0;
4534 else
4535 vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync);
4536
4537 /* Reflow the begining portion using the scratch area */
4538 if (vre->vre_offset == 0) {
4539 VERIFY0(dsl_sync_task(spa_name(spa),
4540 NULL, raidz_reflow_scratch_sync,
4541 vre, 0, ZFS_SPACE_CHECK_NONE));
4542
4543 /* if we encountered errors then pause */
4544 if (vre->vre_offset == 0) {
4545 mutex_enter(&vre->vre_lock);
4546 vre->vre_waiting_for_resilver = B_TRUE;
4547 mutex_exit(&vre->vre_lock);
4548 return;
4549 }
4550 }
4551
4552 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4553 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4554
4555 uint64_t guid = raidvd->vdev_guid;
4556
4557 /* Iterate over all the remaining metaslabs */
4558 for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift;
4559 i < raidvd->vdev_ms_count &&
4560 !zthr_iscancelled(zthr) &&
4561 vre->vre_failed_offset == UINT64_MAX; i++) {
4562 metaslab_t *msp = raidvd->vdev_ms[i];
4563
4564 metaslab_disable(msp);
4565 mutex_enter(&msp->ms_lock);
4566
4567 /*
4568 * The metaslab may be newly created (for the expanded
4569 * space), in which case its trees won't exist yet,
4570 * so we need to bail out early.
4571 */
4572 if (msp->ms_new) {
4573 mutex_exit(&msp->ms_lock);
4574 metaslab_enable(msp, B_FALSE, B_FALSE);
4575 continue;
4576 }
4577
4578 VERIFY0(metaslab_load(msp));
4579
4580 /*
4581 * We want to copy everything except the free (allocatable)
4582 * space. Note that there may be a little bit more free
4583 * space (e.g. in ms_defer), and it's fine to copy that too.
4584 */
4585 uint64_t shift, start;
4586 zfs_range_seg_type_t type = metaslab_calculate_range_tree_type(
4587 raidvd, msp, &start, &shift);
4588 zfs_range_tree_t *rt = zfs_range_tree_create_flags(
4589 NULL, type, NULL, start, shift, ZFS_RT_F_DYN_NAME,
4590 metaslab_rt_name(msp->ms_group, msp,
4591 "spa_raidz_expand_thread:rt"));
4592 zfs_range_tree_add(rt, msp->ms_start, msp->ms_size);
4593 zfs_range_tree_walk(msp->ms_allocatable, zfs_range_tree_remove,
4594 rt);
4595 mutex_exit(&msp->ms_lock);
4596
4597 /*
4598 * Force the last sector of each metaslab to be copied. This
4599 * ensures that we advance the on-disk progress to the end of
4600 * this metaslab while the metaslab is disabled. Otherwise, we
4601 * could move past this metaslab without advancing the on-disk
4602 * progress, and then an allocation to this metaslab would not
4603 * be copied.
4604 */
4605 int sectorsz = 1 << raidvd->vdev_ashift;
4606 uint64_t ms_last_offset = msp->ms_start +
4607 msp->ms_size - sectorsz;
4608 if (!zfs_range_tree_contains(rt, ms_last_offset, sectorsz)) {
4609 zfs_range_tree_add(rt, ms_last_offset, sectorsz);
4610 }
4611
4612 /*
4613 * When we are resuming from a paused expansion (i.e.
4614 * when importing a pool with a expansion in progress),
4615 * discard any state that we have already processed.
4616 */
4617 if (vre->vre_offset > msp->ms_start) {
4618 zfs_range_tree_clear(rt, msp->ms_start,
4619 vre->vre_offset - msp->ms_start);
4620 }
4621
4622 while (!zthr_iscancelled(zthr) &&
4623 !zfs_range_tree_is_empty(rt) &&
4624 vre->vre_failed_offset == UINT64_MAX) {
4625
4626 /*
4627 * We need to periodically drop the config lock so that
4628 * writers can get in. Additionally, we can't wait
4629 * for a txg to sync while holding a config lock
4630 * (since a waiting writer could cause a 3-way deadlock
4631 * with the sync thread, which also gets a config
4632 * lock for reader). So we can't hold the config lock
4633 * while calling dmu_tx_assign().
4634 */
4635 spa_config_exit(spa, SCL_CONFIG, FTAG);
4636
4637 /*
4638 * If requested, pause the reflow when the amount
4639 * specified by raidz_expand_max_reflow_bytes is reached
4640 *
4641 * This pause is only used during testing or debugging.
4642 */
4643 while (raidz_expand_max_reflow_bytes != 0 &&
4644 raidz_expand_max_reflow_bytes <=
4645 vre->vre_bytes_copied && !zthr_iscancelled(zthr)) {
4646 delay(hz);
4647 }
4648
4649 mutex_enter(&vre->vre_lock);
4650 while (vre->vre_outstanding_bytes >
4651 raidz_expand_max_copy_bytes) {
4652 cv_wait(&vre->vre_cv, &vre->vre_lock);
4653 }
4654 mutex_exit(&vre->vre_lock);
4655
4656 dmu_tx_t *tx =
4657 dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
4658
4659 VERIFY0(dmu_tx_assign(tx,
4660 DMU_TX_WAIT | DMU_TX_SUSPEND));
4661 uint64_t txg = dmu_tx_get_txg(tx);
4662
4663 /*
4664 * Reacquire the vdev_config lock. Theoretically, the
4665 * vdev_t that we're expanding may have changed.
4666 */
4667 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4668 raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4669
4670 boolean_t needsync =
4671 raidz_reflow_impl(raidvd, vre, rt, tx);
4672
4673 dmu_tx_commit(tx);
4674
4675 if (needsync) {
4676 spa_config_exit(spa, SCL_CONFIG, FTAG);
4677 txg_wait_synced(spa->spa_dsl_pool, txg);
4678 spa_config_enter(spa, SCL_CONFIG, FTAG,
4679 RW_READER);
4680 }
4681 }
4682
4683 spa_config_exit(spa, SCL_CONFIG, FTAG);
4684
4685 metaslab_enable(msp, B_FALSE, B_FALSE);
4686 zfs_range_tree_vacate(rt, NULL, NULL);
4687 zfs_range_tree_destroy(rt);
4688
4689 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4690 raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4691 }
4692
4693 spa_config_exit(spa, SCL_CONFIG, FTAG);
4694
4695 /*
4696 * The txg_wait_synced() here ensures that all reflow zio's have
4697 * completed, and vre_failed_offset has been set if necessary. It
4698 * also ensures that the progress of the last raidz_reflow_sync() is
4699 * written to disk before raidz_reflow_complete_sync() changes the
4700 * in-memory vre_state. vdev_raidz_io_start() uses vre_state to
4701 * determine if a reflow is in progress, in which case we may need to
4702 * write to both old and new locations. Therefore we can only change
4703 * vre_state once this is not necessary, which is once the on-disk
4704 * progress (in spa_ubsync) has been set past any possible writes (to
4705 * the end of the last metaslab).
4706 */
4707 txg_wait_synced(spa->spa_dsl_pool, 0);
4708
4709 if (!zthr_iscancelled(zthr) &&
4710 vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) {
4711 /*
4712 * We are not being canceled or paused, so the reflow must be
4713 * complete. In that case also mark it as completed on disk.
4714 */
4715 ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX);
4716 VERIFY0(dsl_sync_task(spa_name(spa), NULL,
4717 raidz_reflow_complete_sync, spa,
4718 0, ZFS_SPACE_CHECK_NONE));
4719 (void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL);
4720 } else {
4721 /*
4722 * Wait for all copy zio's to complete and for all the
4723 * raidz_reflow_sync() synctasks to be run.
4724 */
4725 spa_history_log_internal(spa, "reflow pause",
4726 NULL, "offset=%llu failed_offset=%lld",
4727 (long long)vre->vre_offset,
4728 (long long)vre->vre_failed_offset);
4729 mutex_enter(&vre->vre_lock);
4730 if (vre->vre_failed_offset != UINT64_MAX) {
4731 /*
4732 * Reset progress so that we will retry everything
4733 * after the point that something failed.
4734 */
4735 vre->vre_offset = vre->vre_failed_offset;
4736 vre->vre_failed_offset = UINT64_MAX;
4737 vre->vre_waiting_for_resilver = B_TRUE;
4738 }
4739 mutex_exit(&vre->vre_lock);
4740 }
4741 }
4742
4743 void
spa_start_raidz_expansion_thread(spa_t * spa)4744 spa_start_raidz_expansion_thread(spa_t *spa)
4745 {
4746 ASSERT0P(spa->spa_raidz_expand_zthr);
4747 spa->spa_raidz_expand_zthr = zthr_create("raidz_expand",
4748 spa_raidz_expand_thread_check, spa_raidz_expand_thread,
4749 spa, defclsyspri);
4750 }
4751
4752 void
raidz_dtl_reassessed(vdev_t * vd)4753 raidz_dtl_reassessed(vdev_t *vd)
4754 {
4755 spa_t *spa = vd->vdev_spa;
4756 if (spa->spa_raidz_expand != NULL) {
4757 vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4758 /*
4759 * we get called often from vdev_dtl_reassess() so make
4760 * sure it's our vdev and any replacing is complete
4761 */
4762 if (vd->vdev_top->vdev_id == vre->vre_vdev_id &&
4763 !vdev_raidz_expand_child_replacing(vd->vdev_top)) {
4764 mutex_enter(&vre->vre_lock);
4765 if (vre->vre_waiting_for_resilver) {
4766 vdev_dbgmsg(vd, "DTL reassessed, "
4767 "continuing raidz expansion");
4768 vre->vre_waiting_for_resilver = B_FALSE;
4769 zthr_wakeup(spa->spa_raidz_expand_zthr);
4770 }
4771 mutex_exit(&vre->vre_lock);
4772 }
4773 }
4774 }
4775
4776 int
vdev_raidz_attach_check(vdev_t * new_child)4777 vdev_raidz_attach_check(vdev_t *new_child)
4778 {
4779 vdev_t *raidvd = new_child->vdev_parent;
4780 uint64_t new_children = raidvd->vdev_children;
4781
4782 /*
4783 * We use the "boot" space as scratch space to handle overwriting the
4784 * initial part of the vdev. If it is too small, then this expansion
4785 * is not allowed. This would be very unusual (e.g. ashift > 13 and
4786 * >200 children).
4787 */
4788 if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) {
4789 return (EINVAL);
4790 }
4791 return (0);
4792 }
4793
4794 void
vdev_raidz_attach_sync(void * arg,dmu_tx_t * tx)4795 vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx)
4796 {
4797 vdev_t *new_child = arg;
4798 spa_t *spa = new_child->vdev_spa;
4799 vdev_t *raidvd = new_child->vdev_parent;
4800 vdev_raidz_t *vdrz = raidvd->vdev_tsd;
4801 ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops);
4802 ASSERT3P(raidvd->vdev_top, ==, raidvd);
4803 ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width);
4804 ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1);
4805 ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==,
4806 new_child);
4807
4808 spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx);
4809
4810 vdrz->vd_physical_width++;
4811
4812 VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info);
4813 vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id;
4814 vdrz->vn_vre.vre_offset = 0;
4815 vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
4816 spa->spa_raidz_expand = &vdrz->vn_vre;
4817 zthr_wakeup(spa->spa_raidz_expand_zthr);
4818
4819 /*
4820 * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get
4821 * written to the config.
4822 */
4823 vdev_config_dirty(raidvd);
4824
4825 vdrz->vn_vre.vre_start_time = gethrestime_sec();
4826 vdrz->vn_vre.vre_end_time = 0;
4827 vdrz->vn_vre.vre_state = DSS_SCANNING;
4828 vdrz->vn_vre.vre_bytes_copied = 0;
4829
4830 uint64_t state = vdrz->vn_vre.vre_state;
4831 VERIFY0(zap_update(spa->spa_meta_objset,
4832 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
4833 sizeof (state), 1, &state, tx));
4834
4835 uint64_t start_time = vdrz->vn_vre.vre_start_time;
4836 VERIFY0(zap_update(spa->spa_meta_objset,
4837 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
4838 sizeof (start_time), 1, &start_time, tx));
4839
4840 (void) zap_remove(spa->spa_meta_objset,
4841 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx);
4842 (void) zap_remove(spa->spa_meta_objset,
4843 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx);
4844
4845 spa_history_log_internal(spa, "raidz vdev expansion started", tx,
4846 "%s vdev %llu new width %llu", spa_name(spa),
4847 (unsigned long long)raidvd->vdev_id,
4848 (unsigned long long)raidvd->vdev_children);
4849 }
4850
4851 int
vdev_raidz_load(vdev_t * vd)4852 vdev_raidz_load(vdev_t *vd)
4853 {
4854 vdev_raidz_t *vdrz = vd->vdev_tsd;
4855 int err;
4856
4857 uint64_t state = DSS_NONE;
4858 uint64_t start_time = 0;
4859 uint64_t end_time = 0;
4860 uint64_t bytes_copied = 0;
4861
4862 if (vd->vdev_top_zap != 0) {
4863 err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4864 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
4865 sizeof (state), 1, &state);
4866 if (err != 0 && err != ENOENT)
4867 return (err);
4868
4869 err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4870 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
4871 sizeof (start_time), 1, &start_time);
4872 if (err != 0 && err != ENOENT)
4873 return (err);
4874
4875 err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4876 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
4877 sizeof (end_time), 1, &end_time);
4878 if (err != 0 && err != ENOENT)
4879 return (err);
4880
4881 err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4882 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
4883 sizeof (bytes_copied), 1, &bytes_copied);
4884 if (err != 0 && err != ENOENT)
4885 return (err);
4886 }
4887
4888 /*
4889 * If we are in the middle of expansion, vre_state should have
4890 * already been set by vdev_raidz_init().
4891 */
4892 EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING);
4893 vdrz->vn_vre.vre_state = (dsl_scan_state_t)state;
4894 vdrz->vn_vre.vre_start_time = start_time;
4895 vdrz->vn_vre.vre_end_time = end_time;
4896 vdrz->vn_vre.vre_bytes_copied = bytes_copied;
4897
4898 return (0);
4899 }
4900
4901 int
spa_raidz_expand_get_stats(spa_t * spa,pool_raidz_expand_stat_t * pres)4902 spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres)
4903 {
4904 vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4905
4906 if (vre == NULL) {
4907 /* no removal in progress; find most recent completed */
4908 for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
4909 vdev_t *vd = spa->spa_root_vdev->vdev_child[c];
4910 if (vd->vdev_ops == &vdev_raidz_ops) {
4911 vdev_raidz_t *vdrz = vd->vdev_tsd;
4912
4913 if (vdrz->vn_vre.vre_end_time != 0 &&
4914 (vre == NULL ||
4915 vdrz->vn_vre.vre_end_time >
4916 vre->vre_end_time)) {
4917 vre = &vdrz->vn_vre;
4918 }
4919 }
4920 }
4921 }
4922
4923 if (vre == NULL) {
4924 return (SET_ERROR(ENOENT));
4925 }
4926
4927 pres->pres_state = vre->vre_state;
4928 pres->pres_expanding_vdev = vre->vre_vdev_id;
4929
4930 vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
4931 pres->pres_to_reflow = vd->vdev_stat.vs_alloc;
4932
4933 mutex_enter(&vre->vre_lock);
4934 pres->pres_reflowed = vre->vre_bytes_copied;
4935 for (int i = 0; i < TXG_SIZE; i++)
4936 pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i];
4937 mutex_exit(&vre->vre_lock);
4938
4939 pres->pres_start_time = vre->vre_start_time;
4940 pres->pres_end_time = vre->vre_end_time;
4941 pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver;
4942
4943 return (0);
4944 }
4945
4946 /*
4947 * Initialize private RAIDZ specific fields from the nvlist.
4948 */
4949 static int
vdev_raidz_init(spa_t * spa,nvlist_t * nv,void ** tsd)4950 vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
4951 {
4952 uint_t children;
4953 nvlist_t **child;
4954 int error = nvlist_lookup_nvlist_array(nv,
4955 ZPOOL_CONFIG_CHILDREN, &child, &children);
4956 if (error != 0)
4957 return (SET_ERROR(EINVAL));
4958
4959 uint64_t nparity;
4960 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) {
4961 if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
4962 return (SET_ERROR(EINVAL));
4963
4964 /*
4965 * Previous versions could only support 1 or 2 parity
4966 * device.
4967 */
4968 if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2)
4969 return (SET_ERROR(EINVAL));
4970 else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3)
4971 return (SET_ERROR(EINVAL));
4972 } else {
4973 /*
4974 * We require the parity to be specified for SPAs that
4975 * support multiple parity levels.
4976 */
4977 if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
4978 return (SET_ERROR(EINVAL));
4979
4980 /*
4981 * Otherwise, we default to 1 parity device for RAID-Z.
4982 */
4983 nparity = 1;
4984 }
4985
4986 vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
4987 vdrz->vn_vre.vre_vdev_id = -1;
4988 vdrz->vn_vre.vre_offset = UINT64_MAX;
4989 vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
4990 mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL);
4991 cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL);
4992 zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL);
4993 mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL);
4994 avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare,
4995 sizeof (reflow_node_t), offsetof(reflow_node_t, re_link));
4996
4997 vdrz->vd_physical_width = children;
4998 vdrz->vd_nparity = nparity;
4999
5000 /* note, the ID does not exist when creating a pool */
5001 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
5002 &vdrz->vn_vre.vre_vdev_id);
5003
5004 boolean_t reflow_in_progress =
5005 nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
5006 if (reflow_in_progress) {
5007 spa->spa_raidz_expand = &vdrz->vn_vre;
5008 vdrz->vn_vre.vre_state = DSS_SCANNING;
5009 }
5010
5011 vdrz->vd_original_width = children;
5012 uint64_t *txgs;
5013 unsigned int txgs_size = 0;
5014 error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
5015 &txgs, &txgs_size);
5016 if (error == 0) {
5017 for (int i = 0; i < txgs_size; i++) {
5018 reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
5019 re->re_txg = txgs[txgs_size - i - 1];
5020 re->re_logical_width = vdrz->vd_physical_width - i;
5021
5022 if (reflow_in_progress)
5023 re->re_logical_width--;
5024
5025 avl_add(&vdrz->vd_expand_txgs, re);
5026 }
5027
5028 vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size;
5029 }
5030 if (reflow_in_progress) {
5031 vdrz->vd_original_width--;
5032 zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions",
5033 children, txgs_size);
5034 }
5035
5036 *tsd = vdrz;
5037
5038 return (0);
5039 }
5040
5041 static void
vdev_raidz_fini(vdev_t * vd)5042 vdev_raidz_fini(vdev_t *vd)
5043 {
5044 vdev_raidz_t *vdrz = vd->vdev_tsd;
5045 if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre)
5046 vd->vdev_spa->spa_raidz_expand = NULL;
5047 reflow_node_t *re;
5048 void *cookie = NULL;
5049 avl_tree_t *tree = &vdrz->vd_expand_txgs;
5050 while ((re = avl_destroy_nodes(tree, &cookie)) != NULL)
5051 kmem_free(re, sizeof (*re));
5052 avl_destroy(&vdrz->vd_expand_txgs);
5053 mutex_destroy(&vdrz->vd_expand_lock);
5054 mutex_destroy(&vdrz->vn_vre.vre_lock);
5055 cv_destroy(&vdrz->vn_vre.vre_cv);
5056 zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock);
5057 kmem_free(vdrz, sizeof (*vdrz));
5058 }
5059
5060 /*
5061 * Add RAIDZ specific fields to the config nvlist.
5062 */
5063 static void
vdev_raidz_config_generate(vdev_t * vd,nvlist_t * nv)5064 vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
5065 {
5066 ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
5067 vdev_raidz_t *vdrz = vd->vdev_tsd;
5068
5069 /*
5070 * Make sure someone hasn't managed to sneak a fancy new vdev
5071 * into a crufty old storage pool.
5072 */
5073 ASSERT(vdrz->vd_nparity == 1 ||
5074 (vdrz->vd_nparity <= 2 &&
5075 spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) ||
5076 (vdrz->vd_nparity <= 3 &&
5077 spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3));
5078
5079 /*
5080 * Note that we'll add these even on storage pools where they
5081 * aren't strictly required -- older software will just ignore
5082 * it.
5083 */
5084 fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);
5085
5086 if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
5087 fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
5088 }
5089
5090 mutex_enter(&vdrz->vd_expand_lock);
5091 if (!avl_is_empty(&vdrz->vd_expand_txgs)) {
5092 uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs);
5093 uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count,
5094 KM_SLEEP);
5095 uint64_t i = 0;
5096
5097 for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs);
5098 re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) {
5099 txgs[i++] = re->re_txg;
5100 }
5101
5102 fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
5103 txgs, count);
5104
5105 kmem_free(txgs, sizeof (uint64_t) * count);
5106 }
5107 mutex_exit(&vdrz->vd_expand_lock);
5108 }
5109
5110 static uint64_t
vdev_raidz_nparity(vdev_t * vd)5111 vdev_raidz_nparity(vdev_t *vd)
5112 {
5113 vdev_raidz_t *vdrz = vd->vdev_tsd;
5114 return (vdrz->vd_nparity);
5115 }
5116
5117 static uint64_t
vdev_raidz_ndisks(vdev_t * vd)5118 vdev_raidz_ndisks(vdev_t *vd)
5119 {
5120 return (vd->vdev_children);
5121 }
5122
/*
 * Operations vector for the RAID-Z vdev type.  Each member points at the
 * RAID-Z handler for that operation; members set to NULL are operations
 * for which this vdev type supplies no handler.
 */
vdev_ops_t vdev_raidz_ops = {
	.vdev_op_init = vdev_raidz_init,
	.vdev_op_fini = vdev_raidz_fini,
	.vdev_op_open = vdev_raidz_open,
	.vdev_op_close = vdev_raidz_close,
	.vdev_op_psize_to_asize = vdev_raidz_psize_to_asize,
	.vdev_op_asize_to_psize = vdev_raidz_asize_to_psize,
	.vdev_op_min_asize = vdev_raidz_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_raidz_io_start,
	.vdev_op_io_done = vdev_raidz_io_done,
	.vdev_op_state_change = vdev_raidz_state_change,
	.vdev_op_need_resilver = vdev_raidz_need_resilver,
	.vdev_op_hold = NULL,
	.vdev_op_rele = NULL,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_raidz_xlate,
	.vdev_op_rebuild_asize = NULL,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = vdev_raidz_config_generate,
	.vdev_op_nparity = vdev_raidz_nparity,
	.vdev_op_ndisks = vdev_raidz_ndisks,
	.vdev_op_type = VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
};
5148
/*
 * Module parameters related to RAIDZ expansion.  The last argument of each
 * declaration is the parameter's description string.
 */
ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW,
	"For testing, pause RAIDZ expansion after reflowing this many bytes");
ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW,
	"Max amount of concurrent i/o for RAIDZ expansion");
ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW,
	"For expanded RAIDZ, aggregate reads that have more rows than this");
ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW,
	"For expanded RAIDZ, automatically start a pool scrub when expansion "
	"completes");
5158