xref: /freebsd/sys/contrib/openzfs/module/zfs/vdev_raidz.c (revision f9590540c524607d22fa7e718c758725c4365375)
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or https://opensource.org/licenses/CDDL-1.0.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
25  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
26  * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
27  * Copyright (c) 2025, Klara, Inc.
28  * Copyright (c) 2026, Wasabi Technologies, Inc.
29  */
30 
31 #include <sys/zfs_context.h>
32 #include <sys/spa.h>
33 #include <sys/spa_impl.h>
34 #include <sys/zap.h>
35 #include <sys/vdev_impl.h>
36 #include <sys/metaslab_impl.h>
37 #include <sys/zio.h>
38 #include <sys/zio_checksum.h>
39 #include <sys/dmu_tx.h>
40 #include <sys/abd.h>
41 #include <sys/zfs_rlock.h>
42 #include <sys/fs/zfs.h>
43 #include <sys/fm/fs/zfs.h>
44 #include <sys/vdev_raidz.h>
45 #include <sys/vdev_raidz_impl.h>
46 #include <sys/vdev_draid.h>
47 #include <sys/uberblock_impl.h>
48 #include <sys/dsl_scan.h>
49 
50 #ifdef ZFS_DEBUG
51 #include <sys/vdev.h>	/* For vdev_xlate() in vdev_raidz_io_verify() */
52 #endif
53 
54 /*
55  * Virtual device vector for RAID-Z.
56  *
57  * This vdev supports single, double, and triple parity. For single parity,
58  * we use a simple XOR of all the data columns. For double or triple parity,
59  * we use a special case of Reed-Solomon coding. This extends the
60  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
61  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
62  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
63  * former is also based. The latter is designed to provide higher performance
64  * for writes.
65  *
66  * Note that the Plank paper claimed to support arbitrary N+M, but was then
67  * amended six years later identifying a critical flaw that invalidates its
68  * claims. Nevertheless, the technique can be adapted to work for up to
69  * triple parity. For additional parity, the amendment "Note: Correction to
70  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
71  * is viable, but the additional complexity means that write performance will
72  * suffer.
73  *
74  * All of the methods above operate on a Galois field, defined over the
75  * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
76  * can be expressed with a single byte. Briefly, the operations on the
77  * field are defined as follows:
78  *
79  *   o addition (+) is represented by a bitwise XOR
80  *   o subtraction (-) is therefore identical to addition: A + B = A - B
81  *   o multiplication of A by 2 is defined by the following bitwise expression:
82  *
83  *	(A * 2)_7 = A_6
84  *	(A * 2)_6 = A_5
85  *	(A * 2)_5 = A_4
86  *	(A * 2)_4 = A_3 + A_7
87  *	(A * 2)_3 = A_2 + A_7
88  *	(A * 2)_2 = A_1 + A_7
89  *	(A * 2)_1 = A_0
90  *	(A * 2)_0 = A_7
91  *
92  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
93  * As an aside, this multiplication is derived from the error correcting
94  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
95  *
96  * Observe that any number in the field (except for 0) can be expressed as a
97  * power of 2 -- a generator for the field. We store a table of the powers of
98  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
99  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
100  * than field addition). The inverse of a field element A (A^-1) is therefore
101  * A ^ (255 - 1) = A^254.
102  *
103  * The up-to-three parity columns, P, Q, R over several data columns,
104  * D_0, ... D_n-1, can be expressed by field operations:
105  *
106  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
107  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
108  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
109  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
110  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
111  *
112  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
113  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
114  * independent coefficients. (There are no additional coefficients that have
115  * this property which is why the uncorrected Plank method breaks down.)
116  *
117  * See the reconstruction code below for how P, Q and R can be used individually
118  * or in concert to recover missing data columns.
119  */
120 
121 #define	VDEV_RAIDZ_P		0
122 #define	VDEV_RAIDZ_Q		1
123 #define	VDEV_RAIDZ_R		2
124 
125 #define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
126 #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
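
/*
 * Worked example (illustrative only, not used by the code): parity for a
 * one-byte-wide "stripe" with three data columns, using the field rules
 * described above and the multiply-by-2 macro.  P is a plain XOR, while Q is
 * the Horner-style evaluation of the closed form Q = 4*D_0 + 2*D_1 + D_2:
 *
 *	uint8_t d0 = 0xd0, d1 = 0x22, d2 = 0x55;
 *	uint8_t p = d0 ^ d1 ^ d2;
 *	uint8_t q = VDEV_RAIDZ_MUL_2(d0) ^ d1;
 *	q = VDEV_RAIDZ_MUL_2(q) ^ d2;
 *
 * which yields p == 0xa7 and q == 0x76.  Evaluating 4*D_0 + 2*D_1 + D_2
 * directly with VDEV_RAIDZ_MUL_4() produces the same Q.
 */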
127 
128 /*
129  * We provide a mechanism to perform the field multiplication operation on a
130  * 64-bit value all at once rather than a byte at a time. This works by
131  * creating a mask from the top bit in each byte and using that to
132  * conditionally apply the XOR of 0x1d.
133  */
134 #define	VDEV_RAIDZ_64MUL_2(x, mask) \
135 { \
136 	(mask) = (x) & 0x8080808080808080ULL; \
137 	(mask) = ((mask) << 1) - ((mask) >> 7); \
138 	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
139 	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
140 }
141 
142 #define	VDEV_RAIDZ_64MUL_4(x, mask) \
143 { \
144 	VDEV_RAIDZ_64MUL_2((x), mask); \
145 	VDEV_RAIDZ_64MUL_2((x), mask); \
146 }
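
/*
 * Illustrative sanity check (assumed, not part of this file): the 64-bit
 * macros above are just VDEV_RAIDZ_MUL_2() applied to all eight packed bytes
 * at once, which could be verified byte by byte:
 *
 *	uint64_t x = 0xd0221155aabb00ffULL, y = x, mask;
 *	VDEV_RAIDZ_64MUL_2(y, mask);
 *	for (int i = 0; i < 8; i++) {
 *		uint8_t b = (x >> (8 * i)) & 0xff;
 *		ASSERT3U((y >> (8 * i)) & 0xff, ==,
 *		    (uint8_t)VDEV_RAIDZ_MUL_2(b));
 *	}
 */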
147 
148 
149 /*
150  * Big Theory Statement for how a RAIDZ VDEV is expanded
151  *
152  * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion
153  * works with all three RAIDZ parity choices, including RAIDZ1, 2, or 3. VDEVs
154  * that have been previously expanded can be expanded again.
155  *
156  * The RAIDZ VDEV must be healthy (must be able to write to all the drives in
157  * the VDEV) when an expansion starts. The expansion will pause if any
158  * disk in the VDEV fails, and resume once the VDEV is healthy again. All other
159  * operations on the pool can continue while an expansion is in progress (e.g.
160  * read/write, snapshot, zpool add, etc.), except zpool checkpoint, zpool trim,
161  * and zpool initialize, which can't be run during an expansion.  Following a
162  * reboot or export/import, the expansion resumes where it left off.
163  *
164  * == Reflowing the Data ==
165  *
166  * The expansion involves reflowing (copying) the data from the current set
167  * of disks to spread it across the new set which now has one more disk. This
168  * reflow operation is similar to reflowing text when the column width of a
169  * text editor window is expanded. The text doesn’t change but the location of
170  * the text changes to accommodate the new width. An example reflow result for
171  * a 4-wide RAIDZ1 to a 5-wide is shown below.
172  *
173  *                            Reflow End State
174  *            Each letter indicates a parity group (logical stripe)
175  *
176  *         Before expansion                         After Expansion
177  *     D1     D2     D3     D4               D1     D2     D3     D4     D5
178  *  +------+------+------+------+         +------+------+------+------+------+
179  *  |      |      |      |      |         |      |      |      |      |      |
180  *  |  A   |  A   |  A   |  A   |         |  A   |  A   |  A   |  A   |  B   |
181  *  |     1|     2|     3|     4|         |     1|     2|     3|     4|     5|
182  *  +------+------+------+------+         +------+------+------+------+------+
183  *  |      |      |      |      |         |      |      |      |      |      |
184  *  |  B   |  B   |  C   |  C   |         |  B   |  C   |  C   |  C   |  C   |
185  *  |     5|     6|     7|     8|         |     6|     7|     8|     9|    10|
186  *  +------+------+------+------+         +------+------+------+------+------+
187  *  |      |      |      |      |         |      |      |      |      |      |
188  *  |  C   |  C   |  D   |  D   |         |  D   |  D   |  E   |  E   |  E   |
189  *  |     9|    10|    11|    12|         |    11|    12|    13|    14|    15|
190  *  +------+------+------+------+         +------+------+------+------+------+
191  *  |      |      |      |      |         |      |      |      |      |      |
192  *  |  E   |  E   |  E   |  E   |   -->   |  E   |  F   |  F   |  G   |  G   |
193  *  |    13|    14|    15|    16|         |    16|    17|    18|    19|    20|
194  *  +------+------+------+------+         +------+------+------+------+------+
195  *  |      |      |      |      |         |      |      |      |      |      |
196  *  |  F   |  F   |  G   |  G   |         |  G   |  G   |  H   |  H   |  H   |
197  *  |    17|    18|    19|    20|         |    21|    22|    23|    24|    25|
198  *  +------+------+------+------+         +------+------+------+------+------+
199  *  |      |      |      |      |         |      |      |      |      |      |
200  *  |  G   |  G   |  H   |  H   |         |  H   |  I   |  I   |  J   |  J   |
201  *  |    21|    22|    23|    24|         |    26|    27|    28|    29|    30|
202  *  +------+------+------+------+         +------+------+------+------+------+
203  *  |      |      |      |      |         |      |      |      |      |      |
204  *  |  H   |  H   |  I   |  I   |         |  J   |  J   |      |      |  K   |
205  *  |    25|    26|    27|    28|         |    31|    32|    33|    34|    35|
206  *  +------+------+------+------+         +------+------+------+------+------+
207  *
208  * This reflow approach has several advantages. There is no need to read or
209  * modify the block pointers or recompute any block checksums.  The reflow
210  * doesn’t need to know where the parity sectors reside. We can read and write
211  * data sequentially and the copy can occur in a background thread in open
212  * context. The design also allows for fast discovery of what data to copy.
213  *
214  * The VDEV metaslabs are processed, one at a time, to copy the block data to
215  * have it flow across all the disks. The metaslab is disabled for allocations
216  * during the copy. As an optimization, we only copy the allocated data which
217  * can be determined by looking at the metaslab range tree. During the copy we
218  * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still
219  * need to be able to survive losing parity count disks).  This means we
220  * cannot overwrite data during the reflow that would be needed if a disk is
221  * lost.
222  *
223  * After the reflow completes, all newly-written blocks will have the new
224  * layout, i.e., they will have the parity to data ratio implied by the new
225  * number of disks in the RAIDZ group.  Even though the reflow copies all of
226  * the allocated space (data and parity), it is only rearranged, not changed.
227  *
228  * This act of reflowing the data has a few implications about blocks
229  * that were written before the reflow completes:
230  *
231  *  - Old blocks will still use the same amount of space (i.e., they will have
232  *    the parity to data ratio implied by the old number of disks in the RAIDZ
233  *    group).
234  *  - Reading old blocks will be slightly slower than before the reflow, for
235  *    two reasons. First, we will have to read from all disks in the RAIDZ
236  *    VDEV, rather than being able to skip the children that contain only
237  *    parity of this block (because the data of a single block is now spread
238  *    out across all the disks).  Second, in most cases there will be an extra
239  *    bcopy, needed to rearrange the data back to its original layout in memory.
240  *
241  * == Scratch Area ==
242  *
243  * As we copy the block data, we can only progress to the point that writes
244  * will not overlap with blocks whose progress has not yet been recorded on
245  * disk.  Since partially-copied rows are always read from the old location,
246  * we need to stop one row before the sector-wise overlap, to prevent any
247  * row-wise overlap. For example, in the diagram above, when we reflow sector
248  * B6 it will overwrite the original location for B5.
249  *
250  * To get around this, a scratch space is used so that we can start copying
251  * without risking data loss by overlapping the row. As an added benefit, it
252  * improves performance at the beginning of the reflow, but that small perf
253  * boost wouldn't be worth the complexity on its own.
254  *
255  * Ideally we want to copy at least 2 * (new_width)^2 so that we have a
256  * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max
257  * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice
258  * the widths will likely be single digits so we can get a substantial chunk
259  * size using only a few MB of scratch per disk.
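 *
 * For example, with 4 KiB sectors and a more typical new_width of 10, the
 * 2 * (new_width)^2 target is only 200 sectors in total, i.e. 20 sectors
 * (80 KiB) of scratch per disk.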
260  *
261  * The scratch area is persisted to disk, which lets it hold a large amount of
262  * reflowed state. We can always read the partially written stripes when a disk
263  * fails or the copy is interrupted (crash) during the initial copying phase,
264  * and it also gets us past a small chunk size restriction.  At a minimum, the
265  * scratch space must be large enough that one row does not overlap itself when
266  * moved (i.e., new_width^2), but going larger is even better. We
267  * use the 3.5 MiB reserved "boot" space that resides after the ZFS disk labels
268  * as our scratch space to handle overwriting the initial part of the VDEV.
269  *
270  *	0     256K   512K                    4M
271  *	+------+------+-----------------------+-----------------------------
272  *	| VDEV | VDEV |   Boot Block (3.5M)   |  Allocatable space ...
273  *	|  L0  |  L1  |       Reserved        |     (Metaslabs)
274  *	+------+------+-----------------------+-------------------------------
275  *                        Scratch Area
276  *
277  * == Reflow Progress Updates ==
278  * After the initial scratch-based reflow, the expansion process works
279  * similarly to device removal. We create a new open context thread which
280  * reflows the data, and periodically kicks off sync tasks to update logical
281  * state. In this case, state is the committed progress (offset of next data
282  * to copy). We need to persist the completed offset on disk, so that if we
283  * crash we know which format each VDEV offset is in.
284  *
285  * == Time Dependent Geometry ==
286  *
287  * In non-expanded RAIDZ, blocks are read from disk in a column by column
288  * fashion. For a multi-row block, the second sector is in the first column
289  * not in the second column. This allows us to issue full reads for each
290  * column directly into the request buffer. The block data is thus laid out
291  * sequentially in a column-by-column fashion.
292  *
293  * For example, in the before expansion diagram above, one logical block might
294  * be sectors G19-H26. The parity is in G19,H23; and the data is in
295  * G20,H24,G21,H25,G22,H26.
296  *
297  * After a block is reflowed, the sectors that were all in the original column
298  * data can now reside in different columns. When reading from an expanded
299  * VDEV, we need to know the logical stripe width for each block so we can
300  * reconstitute the block’s data after the reads are completed. Likewise,
301  * when we perform the combinatorial reconstruction we need to know the
302  * original width so we can retry combinations from the past layouts.
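 *
 * In the diagram above, for instance, the block G19-H26 keeps its logical
 * stripe width of 4 after the reflow even though its sectors end up spread
 * across all five children, so reads (and combinatorial reconstruction) of
 * that block must still treat it as a 4-wide layout.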
303  *
304  * Time dependent geometry is what we call having blocks with different layouts
305  * (stripe widths) in the same VDEV. This time-dependent geometry uses the
306  * block’s birth time (+ the time expansion ended) to establish the correct
307  * width for a given block. After an expansion completes, we record the time
308  * for blocks written with a particular width (geometry).
309  *
310  * == On Disk Format Changes ==
311  *
312  * New pool feature flag, 'raidz_expansion' whose reference count is the number
313  * of RAIDZ VDEVs that have been expanded.
314  *
315  * The blocks on an expanded RAIDZ VDEV can have different logical stripe widths.
316  *
317  * The uberblock can point to arbitrary blocks, which might be on the expanding
318  * RAIDZ and might or might not have been expanded yet, so we need to know which
319  * way a block is laid out before reading it. This info is the next offset that
320  * needs to be reflowed, and we persist it in the uberblock, in the new
321  * ub_raidz_reflow_info field, as opposed to the MOS or the vdev label. After
322  * the expansion is complete, we then use the raidz_expand_txgs array (see
323  * below) to determine how to read a block, and the ub_raidz_reflow_info field
324  * is no longer required.
325  *
326  * The uberblock's ub_raidz_reflow_info field also holds the scratch space
327  * state (i.e., active or not) which is also required before reading a block
328  * during the initial phase of reflowing the data.
329  *
330  * The top-level RAIDZ VDEV has two new entries in the nvlist:
331  *
332  * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here
333  *                            and used after the expansion is complete to
334  *                            determine how to read a raidz block
335  * 'raidz_expanding' boolean: present during reflow and removed after completion
336  *                            used during a spa import to resume an unfinished
337  *                            expansion
338  *
339  * And finally, the VDEV's top zap adds the following informational entries:
340  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE
341  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME
342  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME
343  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED
344  */
345 
346 /*
347  * For testing only: pause the raidz expansion after reflowing this amount.
348  * (accessed by ZTS and ztest)
349  */
350 #ifdef	_KERNEL
351 static
352 #endif	/* _KERNEL */
353 unsigned long raidz_expand_max_reflow_bytes = 0;
354 
355 /*
356  * For testing only: pause the raidz expansion at a certain point.
357  */
358 uint_t raidz_expand_pause_point = 0;
359 
360 /*
361  * This represents the duration (in seconds) for which a slow drive sits out reads.
362  */
363 static unsigned long vdev_read_sit_out_secs = 600;
364 
365 /*
366  * How often each RAID-Z and dRAID vdev will check for slow disk outliers.
367  * Increasing this interval will reduce the sensitivity of detection (since all
368  * I/Os since the last check are included in the statistics), but will slow the
369  * response to a disk developing a problem.
370  *
371  * Defaults to once per second; setting extremely small values may cause
372  * negative performance effects.
373  */
374 static hrtime_t vdev_raidz_outlier_check_interval_ms = 1000;
375 
376 /*
377  * When performing slow outlier checks for RAID-Z and dRAID vdevs, this value is
378  * used to determine how far out an outlier must be before it counts as an event
379  * worth considering.
380  *
381  * Smaller values will result in more aggressive sitting out of disks that may
382  * have problems, but may significantly increase the rate of spurious sit-outs.
383  */
384 static uint32_t vdev_raidz_outlier_insensitivity = 50;
385 
386 /*
387  * Maximum amount of copy I/O outstanding at once.
388  */
389 #ifdef _ILP32
390 static unsigned long raidz_expand_max_copy_bytes = SPA_MAXBLOCKSIZE;
391 #else
392 static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE;
393 #endif
394 
395 /*
396  * Apply raidz map abds aggregation if the number of rows in the map is equal to
397  * or greater than the value below.
398  */
399 static unsigned long raidz_io_aggregate_rows = 4;
400 
401 /*
402  * Automatically start a pool scrub when a RAIDZ expansion completes in
403  * order to verify the checksums of all blocks which have been copied
404  * during the expansion.  Automatic scrubbing is enabled by default and
405  * is strongly recommended.
406  */
407 static int zfs_scrub_after_expand = 1;
408 
409 /*
410  * If there are errors when writing, but few enough that the data is still
411  * recoverable, ZFS used to silently move on, leaving the data less than 100%
412  * redundant. If this tunable is set, we instead issue a read after such a
413  * write, allowing the normal error recovery process to handle it.
414  *
415  * NOTE: Currently applies only to raidz and draid.
416  */
417 static int zfs_scrub_partial_writes = 1;
418 
419 static void
420 vdev_raidz_row_free(raidz_row_t *rr)
421 {
422 	for (int c = 0; c < rr->rr_cols; c++) {
423 		raidz_col_t *rc = &rr->rr_col[c];
424 
425 		if (rc->rc_size != 0)
426 			abd_free(rc->rc_abd);
427 		if (rc->rc_orig_data != NULL)
428 			abd_free(rc->rc_orig_data);
429 	}
430 
431 	if (rr->rr_abd_empty != NULL)
432 		abd_free(rr->rr_abd_empty);
433 
434 	kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols]));
435 }
436 
437 void
438 vdev_raidz_map_free(raidz_map_t *rm)
439 {
440 	for (int i = 0; i < rm->rm_nrows; i++)
441 		vdev_raidz_row_free(rm->rm_row[i]);
442 
443 	if (rm->rm_nphys_cols) {
444 		for (int i = 0; i < rm->rm_nphys_cols; i++) {
445 			if (rm->rm_phys_col[i].rc_abd != NULL)
446 				abd_free(rm->rm_phys_col[i].rc_abd);
447 		}
448 
449 		kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) *
450 		    rm->rm_nphys_cols);
451 	}
452 
453 	ASSERT0P(rm->rm_lr);
454 	kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
455 }
456 
457 static void
458 vdev_raidz_map_free_vsd(zio_t *zio)
459 {
460 	raidz_map_t *rm = zio->io_vsd;
461 
462 	vdev_raidz_map_free(rm);
463 }
464 
465 static int
466 vdev_raidz_reflow_compare(const void *x1, const void *x2)
467 {
468 	const reflow_node_t *l = x1;
469 	const reflow_node_t *r = x2;
470 
471 	return (TREE_CMP(l->re_txg, r->re_txg));
472 }
473 
474 const zio_vsd_ops_t vdev_raidz_vsd_ops = {
475 	.vsd_free = vdev_raidz_map_free_vsd,
476 };
477 
478 raidz_row_t *
479 vdev_raidz_row_alloc(int cols, zio_t *zio)
480 {
481 	raidz_row_t *rr =
482 	    kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP);
483 
484 	rr->rr_cols = cols;
485 	rr->rr_scols = cols;
486 
487 	for (int c = 0; c < cols; c++) {
488 		raidz_col_t *rc = &rr->rr_col[c];
489 		rc->rc_shadow_devidx = INT_MAX;
490 		rc->rc_shadow_offset = UINT64_MAX;
491 		/*
492 		 * We can not allow self healing to take place for Direct I/O
493 		 * reads. There is nothing that stops the buffer contents from
494 		 * being manipulated while the I/O is in flight. It is possible
495 		 * that the checksum could be verified on the buffer and then
496 		 * the contents of that buffer are manipulated afterwards. This
497 		 * could lead to bad data being written out during self
498 		 * healing.
499 		 */
500 		if (!(zio->io_flags & ZIO_FLAG_DIO_READ))
501 			rc->rc_allow_repair = 1;
502 	}
503 	return (rr);
504 }
505 
506 static void
507 vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift)
508 {
509 	int c;
510 	int nwrapped = 0;
511 	uint64_t off = 0;
512 	raidz_row_t *rr = rm->rm_row[0];
513 
514 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
515 	ASSERT3U(rm->rm_nrows, ==, 1);
516 
517 	/*
518 	 * Pad any parity columns with additional space to account for skip
519 	 * sectors.
520 	 */
521 	if (rm->rm_skipstart < rr->rr_firstdatacol) {
522 		ASSERT0(rm->rm_skipstart);
523 		nwrapped = rm->rm_nskip;
524 	} else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) {
525 		nwrapped =
526 		    (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols;
527 	}
528 
529 	/*
530 	 * Optional single skip sectors (rc_size == 0) will be handled in
531 	 * vdev_raidz_io_start_write().
532 	 */
533 	int skipped = rr->rr_scols - rr->rr_cols;
534 
535 	/* Allocate buffers for the parity columns */
536 	for (c = 0; c < rr->rr_firstdatacol; c++) {
537 		raidz_col_t *rc = &rr->rr_col[c];
538 
539 		/*
540 		 * Parity columns will pad out a linear ABD to account for
541 		 * the skip sector. A linear ABD is used here because
542 		 * parity calculations use the ABD buffer directly to calculate
543 		 * parity. This avoids doing a memcpy back to the ABD after the
544 		 * parity has been calculated. By issuing the parity column
545 		 * with the skip sector we can reduce contention on the child
546 		 * VDEV queue locks (vq_lock).
547 		 */
548 		if (c < nwrapped) {
549 			rc->rc_abd = abd_alloc_linear(
550 			    rc->rc_size + (1ULL << ashift), B_FALSE);
551 			abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift);
552 			skipped++;
553 		} else {
554 			rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
555 		}
556 	}
557 
558 	for (off = 0; c < rr->rr_cols; c++) {
559 		raidz_col_t *rc = &rr->rr_col[c];
560 		abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct,
561 		    zio->io_abd, off, rc->rc_size);
562 
563 		/*
564 		 * Generate I/O for skip sectors to improve aggregation
565 		 * continuity. We will use gang ABD's to reduce contention
566 		 * on the child VDEV queue locks (vq_lock) by issuing
567 		 * a single I/O that contains the data and skip sector.
568 		 *
569 		 * It is important to make sure that rc_size is not updated
570 		 * even though we are adding a skip sector to the ABD. When
571 		 * calculating the parity in vdev_raidz_generate_parity_row()
572 		 * the rc_size is used to iterate through the ABD's. We can
573 		 * not have zero'd out skip sectors used for calculating
574 		 * parity for raidz, because those same sectors are not used
575 		 * during reconstruction.
576 		 */
577 		if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) {
578 			rc->rc_abd = abd_alloc_gang();
579 			abd_gang_add(rc->rc_abd, abd, B_TRUE);
580 			abd_gang_add(rc->rc_abd,
581 			    abd_get_zeros(1ULL << ashift), B_TRUE);
582 			skipped++;
583 		} else {
584 			rc->rc_abd = abd;
585 		}
586 		off += rc->rc_size;
587 	}
588 
589 	ASSERT3U(off, ==, zio->io_size);
590 	ASSERT3S(skipped, ==, rm->rm_nskip);
591 }
592 
593 static void
594 vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm)
595 {
596 	int c;
597 	raidz_row_t *rr = rm->rm_row[0];
598 
599 	ASSERT3U(rm->rm_nrows, ==, 1);
600 
601 	/* Allocate buffers for the parity columns */
602 	for (c = 0; c < rr->rr_firstdatacol; c++)
603 		rr->rr_col[c].rc_abd =
604 		    abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);
605 
606 	for (uint64_t off = 0; c < rr->rr_cols; c++) {
607 		raidz_col_t *rc = &rr->rr_col[c];
608 		rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
609 		    zio->io_abd, off, rc->rc_size);
610 		off += rc->rc_size;
611 	}
612 }
613 
614 /*
615  * Divides the IO evenly across all child vdevs; usually, dcols is
616  * the number of children in the target vdev.
617  *
618  * Avoid inlining the function to keep vdev_raidz_io_start(), which
619  * is this function's only caller, as small as possible on the stack.
620  */
621 noinline raidz_map_t *
622 vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
623     uint64_t nparity)
624 {
625 	raidz_row_t *rr;
626 	/* The starting RAIDZ (parent) vdev sector of the block. */
627 	uint64_t b = zio->io_offset >> ashift;
628 	/* The zio's size in units of the vdev's minimum sector size. */
629 	uint64_t s = zio->io_size >> ashift;
630 	/* The first column for this stripe. */
631 	uint64_t f = b % dcols;
632 	/* The starting byte offset on each child vdev. */
633 	uint64_t o = (b / dcols) << ashift;
634 	uint64_t acols, scols;
635 
636 	raidz_map_t *rm =
637 	    kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
638 	rm->rm_nrows = 1;
639 
640 	/*
641 	 * "Quotient": The number of data sectors for this stripe on all but
642 	 * the "big column" child vdevs that also contain "remainder" data.
643 	 */
644 	uint64_t q = s / (dcols - nparity);
645 
646 	/*
647 	 * "Remainder": The number of partial stripe data sectors in this I/O.
648 	 * This will add a sector to some, but not all, child vdevs.
649 	 */
650 	uint64_t r = s - q * (dcols - nparity);
651 
652 	/* The number of "big columns" - those which contain remainder data. */
653 	uint64_t bc = (r == 0 ? 0 : r + nparity);
654 
655 	/*
656 	 * The total number of data and parity sectors associated with
657 	 * this I/O.
658 	 */
659 	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
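	/*
	 * Worked example (illustrative): a 24K write to a 5-wide RAIDZ1 with
	 * ashift=12 gives s = 6, q = 1, r = 2, bc = 3 and tot = 8; columns
	 * 0-2 each get q + 1 = 2 sectors, columns 3-4 get q = 1, and no skip
	 * sectors are needed since tot is already a multiple of nparity + 1.
	 */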
660 
661 	/*
662 	 * acols: The columns that will be accessed.
663 	 * scols: The columns that will be accessed or skipped.
664 	 */
665 	if (q == 0) {
666 		/* Our I/O request doesn't span all child vdevs. */
667 		acols = bc;
668 		scols = MIN(dcols, roundup(bc, nparity + 1));
669 	} else {
670 		acols = dcols;
671 		scols = dcols;
672 	}
673 
674 	ASSERT3U(acols, <=, scols);
675 	rr = vdev_raidz_row_alloc(scols, zio);
676 	rm->rm_row[0] = rr;
677 	rr->rr_cols = acols;
678 	rr->rr_bigcols = bc;
679 	rr->rr_firstdatacol = nparity;
680 #ifdef ZFS_DEBUG
681 	rr->rr_offset = zio->io_offset;
682 	rr->rr_size = zio->io_size;
683 #endif
684 
685 	uint64_t asize = 0;
686 
687 	for (uint64_t c = 0; c < scols; c++) {
688 		raidz_col_t *rc = &rr->rr_col[c];
689 		uint64_t col = f + c;
690 		uint64_t coff = o;
691 		if (col >= dcols) {
692 			col -= dcols;
693 			coff += 1ULL << ashift;
694 		}
695 		rc->rc_devidx = col;
696 		rc->rc_offset = coff;
697 
698 		if (c >= acols)
699 			rc->rc_size = 0;
700 		else if (c < bc)
701 			rc->rc_size = (q + 1) << ashift;
702 		else
703 			rc->rc_size = q << ashift;
704 
705 		asize += rc->rc_size;
706 	}
707 
708 	ASSERT3U(asize, ==, tot << ashift);
709 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
710 	rm->rm_skipstart = bc;
711 
712 	/*
713 	 * If all data stored spans all columns, there's a danger that parity
714 	 * will always be on the same device and, since parity isn't read
715 	 * during normal operation, that device's I/O bandwidth won't be
716 	 * used effectively. We therefore switch the parity every 1MB.
717 	 *
718 	 * ... at least that was, ostensibly, the theory. As a practical
719 	 * matter unless we juggle the parity between all devices evenly, we
720 	 * won't see any benefit. Further, occasional writes that aren't a
721 	 * multiple of the LCM of the number of children and the minimum
722 	 * stripe width are sufficient to avoid pessimal behavior.
723 	 * Unfortunately, this decision created an implicit on-disk format
724 	 * requirement that we need to support for all eternity, but only
725 	 * for single-parity RAID-Z.
726 	 *
727 	 * If we intend to skip a sector in the zeroth column for padding
728 	 * we must make sure to note this swap. We will never intend to
729 	 * skip the first column since at least one data and one parity
730 	 * column must appear in each row.
731 	 */
732 	ASSERT(rr->rr_cols >= 2);
733 	ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
734 
735 	if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
736 		uint64_t devidx = rr->rr_col[0].rc_devidx;
737 		o = rr->rr_col[0].rc_offset;
738 		rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
739 		rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
740 		rr->rr_col[1].rc_devidx = devidx;
741 		rr->rr_col[1].rc_offset = o;
742 		if (rm->rm_skipstart == 0)
743 			rm->rm_skipstart = 1;
744 	}
745 
746 	if (zio->io_type == ZIO_TYPE_WRITE) {
747 		vdev_raidz_map_alloc_write(zio, rm, ashift);
748 	} else {
749 		vdev_raidz_map_alloc_read(zio, rm);
750 	}
751 	/* init RAIDZ parity ops */
752 	rm->rm_ops = vdev_raidz_math_get_ops();
753 
754 	return (rm);
755 }
756 
757 /*
758  * Everything before reflow_offset_synced should have been moved to the new
759  * location (read and write completed).  However, this may not yet be reflected
760  * in the on-disk format (e.g. raidz_reflow_sync() has been called but the
761  * uberblock has not yet been written). If reflow is not in progress,
762  * reflow_offset_synced should be UINT64_MAX. For each row, if the row is
763  * entirely before reflow_offset_synced, it will come from the new location.
764  * Otherwise this row will come from the old location.  Therefore, rows that
765  * straddle the reflow_offset_synced will come from the old location.
766  *
767  * For writes, reflow_offset_next is the next offset to copy.  If a sector has
768  * been copied, but not yet reflected in the on-disk progress
769  * (reflow_offset_synced), it will also be written to the new (already copied)
770  * offset.
771  */
772 noinline raidz_map_t *
773 vdev_raidz_map_alloc_expanded(zio_t *zio,
774     uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
775     uint64_t nparity, uint64_t reflow_offset_synced,
776     uint64_t reflow_offset_next, boolean_t use_scratch)
777 {
778 	abd_t *abd = zio->io_abd;
779 	uint64_t offset = zio->io_offset;
780 	uint64_t size = zio->io_size;
781 
782 	/* The zio's size in units of the vdev's minimum sector size. */
783 	uint64_t s = size >> ashift;
784 
785 	/*
786 	 * "Quotient": The number of data sectors for this stripe on all but
787 	 * the "big column" child vdevs that also contain "remainder" data.
788 	 * AKA "full rows"
789 	 */
790 	uint64_t q = s / (logical_cols - nparity);
791 
792 	/*
793 	 * "Remainder": The number of partial stripe data sectors in this I/O.
794 	 * This will add a sector to some, but not all, child vdevs.
795 	 */
796 	uint64_t r = s - q * (logical_cols - nparity);
797 
798 	/* The number of "big columns" - those which contain remainder data. */
799 	uint64_t bc = (r == 0 ? 0 : r + nparity);
800 
801 	/*
802 	 * The total number of data and parity sectors associated with
803 	 * this I/O.
804 	 */
805 	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
806 
807 	/* How many rows contain data (not skip) */
808 	uint64_t rows = howmany(tot, logical_cols);
809 	int cols = MIN(tot, logical_cols);
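	/*
	 * Worked example (illustrative): the same 24K, ashift=12 RAIDZ1 write
	 * on a logical width of 5 still has tot = 8, so it maps to
	 * rows = howmany(8, 5) = 2 rows of cols = 5 columns, with the second
	 * row only partially populated.
	 */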
810 
811 	raidz_map_t *rm =
812 	    kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
813 	    KM_SLEEP);
814 	rm->rm_nrows = rows;
815 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
816 	rm->rm_skipstart = bc;
817 	uint64_t asize = 0;
818 
819 	for (uint64_t row = 0; row < rows; row++) {
820 		boolean_t row_use_scratch = B_FALSE;
821 		raidz_row_t *rr = vdev_raidz_row_alloc(cols, zio);
822 		rm->rm_row[row] = rr;
823 
824 		/* The starting RAIDZ (parent) vdev sector of the row. */
825 		uint64_t b = (offset >> ashift) + row * logical_cols;
826 
827 		/*
828 		 * If we are in the middle of a reflow, and the copying has
829 		 * not yet completed for any part of this row, then use the
830 		 * old location of this row.  Note that reflow_offset_synced
831 		 * reflects the i/o that's been completed, because it's
832 		 * updated by a synctask, after zio_wait(spa_txg_zio[]).
833 		 * This is sufficient for our check, even if that progress
834 		 * has not yet been recorded to disk (reflected in
835 		 * spa_ubsync).  Also note that we consider the last row to
836 		 * be "full width" (`cols`-wide rather than `bc`-wide) for
837 		 * this calculation. This causes a tiny bit of unnecessary
838 		 * double-writes but is safe and simpler to calculate.
839 		 */
840 		int row_phys_cols = physical_cols;
841 		if (b + cols > reflow_offset_synced >> ashift)
842 			row_phys_cols--;
843 		else if (use_scratch)
844 			row_use_scratch = B_TRUE;
845 
846 		/* starting child of this row */
847 		uint64_t child_id = b % row_phys_cols;
848 		/* The starting byte offset on each child vdev. */
849 		uint64_t child_offset = (b / row_phys_cols) << ashift;
850 
851 		/*
852 		 * Note, rr_cols is the entire width of the block, even
853 		 * if this row is shorter.  This is needed because parity
854 		 * generation (for Q and R) needs to know the entire width,
855 		 * because it treats the short row as though it was
856 		 * full-width (and the "phantom" sectors were zero-filled).
857 		 *
858 		 * Another approach to this would be to set cols shorter
859 		 * (to just the number of columns that we might do i/o to)
860 		 * and have another mechanism to tell the parity generation
861 		 * about the "entire width".  Reconstruction (at least
862 		 * vdev_raidz_reconstruct_general()) would also need to
863 		 * know about the "entire width".
864 		 */
865 		rr->rr_firstdatacol = nparity;
866 #ifdef ZFS_DEBUG
867 		/*
868 		 * note: rr_size is PSIZE, not ASIZE
869 		 */
870 		rr->rr_offset = b << ashift;
871 		rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift;
872 #endif
873 
874 		for (int c = 0; c < rr->rr_cols; c++, child_id++) {
875 			if (child_id >= row_phys_cols) {
876 				child_id -= row_phys_cols;
877 				child_offset += 1ULL << ashift;
878 			}
879 			raidz_col_t *rc = &rr->rr_col[c];
880 			rc->rc_devidx = child_id;
881 			rc->rc_offset = child_offset;
882 
883 			/*
884 			 * Get this from the scratch space if appropriate.
885 			 * This only happens if we crashed in the middle of
886 			 * raidz_reflow_scratch_sync() (while it's running,
887 			 * the rangelock prevents us from doing concurrent
888 			 * io), and even then only during zpool import or
889 			 * when the pool is imported readonly.
890 			 */
891 			if (row_use_scratch)
892 				rc->rc_offset -= VDEV_BOOT_SIZE;
893 
894 			uint64_t dc = c - rr->rr_firstdatacol;
895 			if (c < rr->rr_firstdatacol) {
896 				rc->rc_size = 1ULL << ashift;
897 
898 				/*
899 				 * Parity sectors' rc_abd's are set below
900 				 * after determining if this is an aggregation.
901 				 */
902 			} else if (row == rows - 1 && bc != 0 && c >= bc) {
903 				/*
904 				 * Past the end of the block (even including
905 				 * skip sectors).  This sector is part of the
906 				 * map so that we have full rows for p/q parity
907 				 * generation.
908 				 */
909 				rc->rc_size = 0;
910 				rc->rc_abd = NULL;
911 			} else {
912 				/* "data column" (col excluding parity) */
913 				uint64_t off;
914 
915 				if (c < bc || r == 0) {
916 					off = dc * rows + row;
917 				} else {
918 					off = r * rows +
919 					    (dc - r) * (rows - 1) + row;
920 				}
921 				rc->rc_size = 1ULL << ashift;
922 				rc->rc_abd = abd_get_offset_struct(
923 				    &rc->rc_abdstruct, abd, off << ashift,
924 				    rc->rc_size);
925 			}
926 
927 			if (rc->rc_size == 0)
928 				continue;
929 
930 			/*
931 			 * If any part of this row is in both old and new
932 			 * locations, the primary location is the old
933 			 * location. If this sector was already copied to the
934 			 * new location, we need to also write to the new,
935 			 * "shadow" location.
936 			 *
937 			 * Note, `row_phys_cols != physical_cols` indicates
938 			 * that the primary location is the old location.
939 			 * `b+c < reflow_offset_next` indicates that the copy
940 			 * to the new location has been initiated. We know
941 			 * that the copy has completed because we have the
942 			 * rangelock, which is held exclusively while the
943 			 * copy is in progress.
944 			 */
945 			if (row_use_scratch ||
946 			    (row_phys_cols != physical_cols &&
947 			    b + c < reflow_offset_next >> ashift)) {
948 				rc->rc_shadow_devidx = (b + c) % physical_cols;
949 				rc->rc_shadow_offset =
950 				    ((b + c) / physical_cols) << ashift;
951 				if (row_use_scratch)
952 					rc->rc_shadow_offset -= VDEV_BOOT_SIZE;
953 			}
954 
955 			asize += rc->rc_size;
956 		}
957 
958 		/*
959 		 * See comment in vdev_raidz_map_alloc()
960 		 */
961 		if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
962 		    (offset & (1ULL << 20))) {
963 			ASSERT(rr->rr_cols >= 2);
964 			ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
965 
966 			int devidx0 = rr->rr_col[0].rc_devidx;
967 			uint64_t offset0 = rr->rr_col[0].rc_offset;
968 			int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx;
969 			uint64_t shadow_offset0 =
970 			    rr->rr_col[0].rc_shadow_offset;
971 
972 			rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
973 			rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
974 			rr->rr_col[0].rc_shadow_devidx =
975 			    rr->rr_col[1].rc_shadow_devidx;
976 			rr->rr_col[0].rc_shadow_offset =
977 			    rr->rr_col[1].rc_shadow_offset;
978 
979 			rr->rr_col[1].rc_devidx = devidx0;
980 			rr->rr_col[1].rc_offset = offset0;
981 			rr->rr_col[1].rc_shadow_devidx = shadow_devidx0;
982 			rr->rr_col[1].rc_shadow_offset = shadow_offset0;
983 		}
984 	}
985 	ASSERT3U(asize, ==, tot << ashift);
986 
987 	/*
988 	 * Determine if the block is contiguous, in which case we can use
989 	 * an aggregation.
990 	 */
991 	if (rows >= raidz_io_aggregate_rows) {
992 		rm->rm_nphys_cols = physical_cols;
993 		rm->rm_phys_col =
994 		    kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols,
995 		    KM_SLEEP);
996 
997 		/*
998 		 * Determine the aggregate io's offset and size, and check
999 		 * that the io is contiguous.
1000 		 */
1001 		for (int i = 0;
1002 		    i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) {
1003 			raidz_row_t *rr = rm->rm_row[i];
1004 			for (int c = 0; c < rr->rr_cols; c++) {
1005 				raidz_col_t *rc = &rr->rr_col[c];
1006 				raidz_col_t *prc =
1007 				    &rm->rm_phys_col[rc->rc_devidx];
1008 
1009 				if (rc->rc_size == 0)
1010 					continue;
1011 
1012 				if (prc->rc_size == 0) {
1013 					ASSERT0(prc->rc_offset);
1014 					prc->rc_offset = rc->rc_offset;
1015 				} else if (prc->rc_offset + prc->rc_size !=
1016 				    rc->rc_offset) {
1017 					/*
1018 					 * This block is not contiguous and
1019 					 * therefore can't be aggregated.
1020 					 * This is expected to be rare, so
1021 					 * the cost of allocating and then
1022 					 * freeing rm_phys_col is not
1023 					 * significant.
1024 					 */
1025 					kmem_free(rm->rm_phys_col,
1026 					    sizeof (raidz_col_t) *
1027 					    rm->rm_nphys_cols);
1028 					rm->rm_phys_col = NULL;
1029 					rm->rm_nphys_cols = 0;
1030 					break;
1031 				}
1032 				prc->rc_size += rc->rc_size;
1033 			}
1034 		}
1035 	}
1036 	if (rm->rm_phys_col != NULL) {
1037 		/*
1038 		 * Allocate aggregate ABD's.
1039 		 */
1040 		for (int i = 0; i < rm->rm_nphys_cols; i++) {
1041 			raidz_col_t *prc = &rm->rm_phys_col[i];
1042 
1043 			prc->rc_devidx = i;
1044 
1045 			if (prc->rc_size == 0)
1046 				continue;
1047 
1048 			prc->rc_abd =
1049 			    abd_alloc_linear(rm->rm_phys_col[i].rc_size,
1050 			    B_FALSE);
1051 		}
1052 
1053 		/*
1054 		 * Point the parity abd's into the aggregate abd's.
1055 		 */
1056 		for (int i = 0; i < rm->rm_nrows; i++) {
1057 			raidz_row_t *rr = rm->rm_row[i];
1058 			for (int c = 0; c < rr->rr_firstdatacol; c++) {
1059 				raidz_col_t *rc = &rr->rr_col[c];
1060 				raidz_col_t *prc =
1061 				    &rm->rm_phys_col[rc->rc_devidx];
1062 				rc->rc_abd =
1063 				    abd_get_offset_struct(&rc->rc_abdstruct,
1064 				    prc->rc_abd,
1065 				    rc->rc_offset - prc->rc_offset,
1066 				    rc->rc_size);
1067 			}
1068 		}
1069 	} else {
1070 		/*
1071 		 * Allocate new abd's for the parity sectors.
1072 		 */
1073 		for (int i = 0; i < rm->rm_nrows; i++) {
1074 			raidz_row_t *rr = rm->rm_row[i];
1075 			for (int c = 0; c < rr->rr_firstdatacol; c++) {
1076 				raidz_col_t *rc = &rr->rr_col[c];
1077 				rc->rc_abd =
1078 				    abd_alloc_linear(rc->rc_size,
1079 				    B_TRUE);
1080 			}
1081 		}
1082 	}
1083 	/* init RAIDZ parity ops */
1084 	rm->rm_ops = vdev_raidz_math_get_ops();
1085 
1086 	return (rm);
1087 }
1088 
1089 struct pqr_struct {
1090 	uint64_t *p;
1091 	uint64_t *q;
1092 	uint64_t *r;
1093 };
1094 
1095 static int
1096 vdev_raidz_p_func(void *buf, size_t size, void *private)
1097 {
1098 	struct pqr_struct *pqr = private;
1099 	const uint64_t *src = buf;
1100 	int cnt = size / sizeof (src[0]);
1101 
1102 	ASSERT(pqr->p && !pqr->q && !pqr->r);
1103 
1104 	for (int i = 0; i < cnt; i++, src++, pqr->p++)
1105 		*pqr->p ^= *src;
1106 
1107 	return (0);
1108 }
1109 
1110 static int
1111 vdev_raidz_pq_func(void *buf, size_t size, void *private)
1112 {
1113 	struct pqr_struct *pqr = private;
1114 	const uint64_t *src = buf;
1115 	uint64_t mask;
1116 	int cnt = size / sizeof (src[0]);
1117 
1118 	ASSERT(pqr->p && pqr->q && !pqr->r);
1119 
1120 	for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
1121 		*pqr->p ^= *src;
1122 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
1123 		*pqr->q ^= *src;
1124 	}
1125 
1126 	return (0);
1127 }
1128 
1129 static int
1130 vdev_raidz_pqr_func(void *buf, size_t size, void *private)
1131 {
1132 	struct pqr_struct *pqr = private;
1133 	const uint64_t *src = buf;
1134 	uint64_t mask;
1135 	int cnt = size / sizeof (src[0]);
1136 
1137 	ASSERT(pqr->p && pqr->q && pqr->r);
1138 
1139 	for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
1140 		*pqr->p ^= *src;
1141 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
1142 		*pqr->q ^= *src;
1143 		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
1144 		*pqr->r ^= *src;
1145 	}
1146 
1147 	return (0);
1148 }
1149 
1150 static void
1151 vdev_raidz_generate_parity_p(raidz_row_t *rr)
1152 {
1153 	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1154 
1155 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1156 		abd_t *src = rr->rr_col[c].rc_abd;
1157 
1158 		if (c == rr->rr_firstdatacol) {
1159 			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1160 		} else {
1161 			struct pqr_struct pqr = { p, NULL, NULL };
1162 			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1163 			    vdev_raidz_p_func, &pqr);
1164 		}
1165 	}
1166 }
1167 
1168 static void
1169 vdev_raidz_generate_parity_pq(raidz_row_t *rr)
1170 {
1171 	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1172 	uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1173 	uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
1174 	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1175 	    rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1176 
1177 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1178 		abd_t *src = rr->rr_col[c].rc_abd;
1179 
1180 		uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
1181 
1182 		if (c == rr->rr_firstdatacol) {
1183 			ASSERT(ccnt == pcnt || ccnt == 0);
1184 			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1185 			(void) memcpy(q, p, rr->rr_col[c].rc_size);
1186 
1187 			for (uint64_t i = ccnt; i < pcnt; i++) {
1188 				p[i] = 0;
1189 				q[i] = 0;
1190 			}
1191 		} else {
1192 			struct pqr_struct pqr = { p, q, NULL };
1193 
1194 			ASSERT(ccnt <= pcnt);
1195 			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1196 			    vdev_raidz_pq_func, &pqr);
1197 
1198 			/*
1199 			 * Treat short columns as though they are full of 0s.
1200 			 * Note that there's therefore nothing needed for P.
1201 			 */
1202 			uint64_t mask;
1203 			for (uint64_t i = ccnt; i < pcnt; i++) {
1204 				VDEV_RAIDZ_64MUL_2(q[i], mask);
1205 			}
1206 		}
1207 	}
1208 }
1209 
1210 static void
1211 vdev_raidz_generate_parity_pqr(raidz_row_t *rr)
1212 {
1213 	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1214 	uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1215 	uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd);
1216 	uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
1217 	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1218 	    rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1219 	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1220 	    rr->rr_col[VDEV_RAIDZ_R].rc_size);
1221 
1222 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1223 		abd_t *src = rr->rr_col[c].rc_abd;
1224 
1225 		uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
1226 
1227 		if (c == rr->rr_firstdatacol) {
1228 			ASSERT(ccnt == pcnt || ccnt == 0);
1229 			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1230 			(void) memcpy(q, p, rr->rr_col[c].rc_size);
1231 			(void) memcpy(r, p, rr->rr_col[c].rc_size);
1232 
1233 			for (uint64_t i = ccnt; i < pcnt; i++) {
1234 				p[i] = 0;
1235 				q[i] = 0;
1236 				r[i] = 0;
1237 			}
1238 		} else {
1239 			struct pqr_struct pqr = { p, q, r };
1240 
1241 			ASSERT(ccnt <= pcnt);
1242 			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1243 			    vdev_raidz_pqr_func, &pqr);
1244 
1245 			/*
1246 			 * Treat short columns as though they are full of 0s.
1247 			 * Note that there's therefore nothing needed for P.
1248 			 */
1249 			uint64_t mask;
1250 			for (uint64_t i = ccnt; i < pcnt; i++) {
1251 				VDEV_RAIDZ_64MUL_2(q[i], mask);
1252 				VDEV_RAIDZ_64MUL_4(r[i], mask);
1253 			}
1254 		}
1255 	}
1256 }
1257 
1258 /*
1259  * Generate RAID parity in the first virtual columns according to the number of
1260  * parity columns available.
1261  */
1262 void
1263 vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr)
1264 {
1265 	if (rr->rr_cols == 0) {
1266 		/*
1267 		 * We are handling this block one row at a time (because
1268 		 * this block has a different logical vs physical width,
1269 		 * due to RAIDZ expansion), and this is a pad-only row,
1270 		 * which has no parity.
1271 		 */
1272 		return;
1273 	}
1274 
1275 	/* Generate using the new math implementation */
1276 	if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL)
1277 		return;
1278 
1279 	switch (rr->rr_firstdatacol) {
1280 	case 1:
1281 		vdev_raidz_generate_parity_p(rr);
1282 		break;
1283 	case 2:
1284 		vdev_raidz_generate_parity_pq(rr);
1285 		break;
1286 	case 3:
1287 		vdev_raidz_generate_parity_pqr(rr);
1288 		break;
1289 	default:
1290 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
1291 	}
1292 }
1293 
1294 void
1295 vdev_raidz_generate_parity(raidz_map_t *rm)
1296 {
1297 	for (int i = 0; i < rm->rm_nrows; i++) {
1298 		raidz_row_t *rr = rm->rm_row[i];
1299 		vdev_raidz_generate_parity_row(rm, rr);
1300 	}
1301 }
1302 
1303 static int
1304 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
1305 {
1306 	(void) private;
1307 	uint64_t *dst = dbuf;
1308 	uint64_t *src = sbuf;
1309 	int cnt = size / sizeof (src[0]);
1310 
1311 	for (int i = 0; i < cnt; i++) {
1312 		dst[i] ^= src[i];
1313 	}
1314 
1315 	return (0);
1316 }
1317 
1318 static int
1319 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
1320     void *private)
1321 {
1322 	(void) private;
1323 	uint64_t *dst = dbuf;
1324 	uint64_t *src = sbuf;
1325 	uint64_t mask;
1326 	int cnt = size / sizeof (dst[0]);
1327 
1328 	for (int i = 0; i < cnt; i++, dst++, src++) {
1329 		VDEV_RAIDZ_64MUL_2(*dst, mask);
1330 		*dst ^= *src;
1331 	}
1332 
1333 	return (0);
1334 }
1335 
1336 static int
1337 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
1338 {
1339 	(void) private;
1340 	uint64_t *dst = buf;
1341 	uint64_t mask;
1342 	int cnt = size / sizeof (dst[0]);
1343 
1344 	for (int i = 0; i < cnt; i++, dst++) {
1345 		/* same operation as vdev_raidz_reconst_q_pre_func() on dst */
1346 		VDEV_RAIDZ_64MUL_2(*dst, mask);
1347 	}
1348 
1349 	return (0);
1350 }
1351 
1352 struct reconst_q_struct {
1353 	uint64_t *q;
1354 	int exp;
1355 };
1356 
1357 static int
1358 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
1359 {
1360 	struct reconst_q_struct *rq = private;
1361 	uint64_t *dst = buf;
1362 	int cnt = size / sizeof (dst[0]);
1363 
1364 	for (int i = 0; i < cnt; i++, dst++, rq->q++) {
1365 		int j;
1366 		uint8_t *b;
1367 
1368 		*dst ^= *rq->q;
1369 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
1370 			*b = vdev_raidz_exp2(*b, rq->exp);
1371 		}
1372 	}
1373 
1374 	return (0);
1375 }
1376 
1377 struct reconst_pq_struct {
1378 	uint8_t *p;
1379 	uint8_t *q;
1380 	uint8_t *pxy;
1381 	uint8_t *qxy;
1382 	int aexp;
1383 	int bexp;
1384 };
1385 
1386 static int
1387 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
1388 {
1389 	struct reconst_pq_struct *rpq = private;
1390 	uint8_t *xd = xbuf;
1391 	uint8_t *yd = ybuf;
1392 
1393 	for (int i = 0; i < size;
1394 	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
1395 		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
1396 		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
1397 		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
1398 	}
1399 
1400 	return (0);
1401 }
1402 
1403 static int
1404 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
1405 {
1406 	struct reconst_pq_struct *rpq = private;
1407 	uint8_t *xd = xbuf;
1408 
1409 	for (int i = 0; i < size;
1410 	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
1411 		/* same operation as vdev_raidz_reconst_pq_func() on xd */
1412 		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
1413 		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
1414 	}
1415 
1416 	return (0);
1417 }
1418 
1419 static void
1420 vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts)
1421 {
1422 	int x = tgts[0];
1423 	abd_t *dst, *src;
1424 
1425 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1426 		zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x);
1427 
1428 	ASSERT3U(ntgts, ==, 1);
1429 	ASSERT3U(x, >=, rr->rr_firstdatacol);
1430 	ASSERT3U(x, <, rr->rr_cols);
1431 
1432 	ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size);
1433 
1434 	src = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
1435 	dst = rr->rr_col[x].rc_abd;
1436 
1437 	abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size);
1438 
1439 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1440 		uint64_t size = MIN(rr->rr_col[x].rc_size,
1441 		    rr->rr_col[c].rc_size);
1442 
1443 		src = rr->rr_col[c].rc_abd;
1444 
1445 		if (c == x)
1446 			continue;
1447 
1448 		(void) abd_iterate_func2(dst, src, 0, 0, size,
1449 		    vdev_raidz_reconst_p_func, NULL);
1450 	}
1451 }
1452 
1453 static void
1454 vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
1455 {
1456 	int x = tgts[0];
1457 	int c, exp;
1458 	abd_t *dst, *src;
1459 
1460 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1461 		zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x);
1462 
1463 	ASSERT(ntgts == 1);
1464 
1465 	ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1466 
1467 	for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1468 		uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size,
1469 		    rr->rr_col[c].rc_size);
1470 
1471 		src = rr->rr_col[c].rc_abd;
1472 		dst = rr->rr_col[x].rc_abd;
1473 
1474 		if (c == rr->rr_firstdatacol) {
1475 			abd_copy(dst, src, size);
1476 			if (rr->rr_col[x].rc_size > size) {
1477 				abd_zero_off(dst, size,
1478 				    rr->rr_col[x].rc_size - size);
1479 			}
1480 		} else {
1481 			ASSERT3U(size, <=, rr->rr_col[x].rc_size);
1482 			(void) abd_iterate_func2(dst, src, 0, 0, size,
1483 			    vdev_raidz_reconst_q_pre_func, NULL);
1484 			(void) abd_iterate_func(dst,
1485 			    size, rr->rr_col[x].rc_size - size,
1486 			    vdev_raidz_reconst_q_pre_tail_func, NULL);
1487 		}
1488 	}
1489 
1490 	src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
1491 	dst = rr->rr_col[x].rc_abd;
1492 	exp = 255 - (rr->rr_cols - 1 - x);
1493 
1494 	struct reconst_q_struct rq = { abd_to_buf(src), exp };
1495 	(void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size,
1496 	    vdev_raidz_reconst_q_post_func, &rq);
1497 }
1498 
1499 static void
1500 vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
1501 {
1502 	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
1503 	abd_t *pdata, *qdata;
1504 	uint64_t xsize, ysize;
1505 	int x = tgts[0];
1506 	int y = tgts[1];
1507 	abd_t *xd, *yd;
1508 
1509 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1510 		zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y);
1511 
1512 	ASSERT(ntgts == 2);
1513 	ASSERT(x < y);
1514 	ASSERT(x >= rr->rr_firstdatacol);
1515 	ASSERT(y < rr->rr_cols);
1516 
1517 	ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size);
1518 
1519 	/*
1520 	 * Move the parity data aside -- we're going to compute parity as
1521 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
1522 	 * reuse the parity generation mechanism without trashing the actual
1523 	 * parity so we make those columns appear to be full of zeros by
1524 	 * setting their lengths to zero.
1525 	 */
1526 	pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
1527 	qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
1528 	xsize = rr->rr_col[x].rc_size;
1529 	ysize = rr->rr_col[y].rc_size;
1530 
1531 	rr->rr_col[VDEV_RAIDZ_P].rc_abd =
1532 	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
1533 	rr->rr_col[VDEV_RAIDZ_Q].rc_abd =
1534 	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
1535 	rr->rr_col[x].rc_size = 0;
1536 	rr->rr_col[y].rc_size = 0;
1537 
1538 	vdev_raidz_generate_parity_pq(rr);
1539 
1540 	rr->rr_col[x].rc_size = xsize;
1541 	rr->rr_col[y].rc_size = ysize;
1542 
1543 	p = abd_to_buf(pdata);
1544 	q = abd_to_buf(qdata);
1545 	pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1546 	qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1547 	xd = rr->rr_col[x].rc_abd;
1548 	yd = rr->rr_col[y].rc_abd;
1549 
1550 	/*
1551 	 * We now have:
1552 	 *	Pxy = P + D_x + D_y
1553 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
1554 	 *
1555 	 * We can then solve for D_x:
1556 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
1557 	 * where
1558 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
1559 	 *	B = (2^(ndevs - 1 - x) * (2^(x - y) + 1))^-1
1560 	 *
1561 	 * With D_x in hand, we can easily solve for D_y:
1562 	 *	D_y = P + Pxy + D_x
1563 	 */
1564 
1565 	a = vdev_raidz_pow2[255 + x - y];
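	/*
	 * In the GF(2^8) log/exp tables 2^255 == 1, so an index of 255 - k
	 * selects the multiplicative inverse of 2^k.  Below, a == 2^(x - y),
	 * b == (2^(ndevs - 1 - x))^-1, tmp is the log of (a + 1)^-1, and
	 * aexp/bexp are the logs of the A and B coefficients derived above.
	 */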
1566 	b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)];
1567 	tmp = 255 - vdev_raidz_log2[a ^ 1];
1568 
1569 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
1570 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
1571 
1572 	ASSERT3U(xsize, >=, ysize);
1573 	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
1574 
1575 	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
1576 	    vdev_raidz_reconst_pq_func, &rpq);
1577 	(void) abd_iterate_func(xd, ysize, xsize - ysize,
1578 	    vdev_raidz_reconst_pq_tail_func, &rpq);
1579 
1580 	abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1581 	abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1582 
1583 	/*
1584 	 * Restore the saved parity data.
1585 	 */
1586 	rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata;
1587 	rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;
1588 }
1589 
1590 /*
1591  * In the general case of reconstruction, we must solve the system of linear
1592  * equations defined by the coefficients used to generate parity as well as
1593  * the contents of the data and parity disks. This can be expressed with
1594  * vectors for the original data (D) and the actual data (d) and parity (p)
1595  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
1596  *
1597  *            __   __                     __     __
1598  *            |     |         __     __   |  p_0  |
1599  *            |  V  |         |  D_0  |   | p_m-1 |
1600  *            |     |    x    |   :   | = |  d_0  |
1601  *            |  I  |         | D_n-1 |   |   :   |
1602  *            |     |         ~~     ~~   | d_n-1 |
1603  *            ~~   ~~                     ~~     ~~
1604  *
1605  * I is simply a square identity matrix of size n, and V is a Vandermonde
1606  * matrix defined by the coefficients we chose for the various parity columns
1607  * (1, 2, 4). Note that these values were chosen for their simplicity, speed
1608  * of computation, and linear separability.
1609  *
1610  *      __               __               __     __
1611  *      |   1   ..  1 1 1 |               |  p_0  |
1612  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
1613  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
1614  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
1615  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
1616  *      |   :       : : : |   |   :   |   |  d_2  |
1617  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
1618  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
1619  *      |   0   ..  0 0 1 |               | d_n-1 |
1620  *      ~~               ~~               ~~     ~~
1621  *
1622  * Note that I, V, d, and p are known. To compute D, we must invert the
1623  * matrix and use the known data and parity values to reconstruct the unknown
1624  * data values. We begin by removing the rows in V|I and d|p that correspond
1625  * to failed or missing columns; we then make V|I square (n x n) and d|p
1626  * sized n by removing rows corresponding to unused parity from the bottom up
1627  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
1628  * using Gauss-Jordan elimination. In the example below we use m=3 parity
1629  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
1630  *           __                               __
1631  *           |  1   1   1   1   1   1   1   1  |
1632  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
1633  *           |  19 205 116  29  64  16  4   1  |      / /
1634  *           |  1   0   0   0   0   0   0   0  |     / /
1635  *           |  0   1   0   0   0   0   0   0  | <--' /
1636  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
1637  *           |  0   0   0   1   0   0   0   0  |
1638  *           |  0   0   0   0   1   0   0   0  |
1639  *           |  0   0   0   0   0   1   0   0  |
1640  *           |  0   0   0   0   0   0   1   0  |
1641  *           |  0   0   0   0   0   0   0   1  |
1642  *           ~~                               ~~
1643  *           __                               __
1644  *           |  1   1   1   1   1   1   1   1  |
1645  *           | 128  64  32  16  8   4   2   1  |
1646  *           |  19 205 116  29  64  16  4   1  |
1647  *           |  1   0   0   0   0   0   0   0  |
1648  *           |  0   1   0   0   0   0   0   0  |
1649  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
1650  *           |  0   0   0   1   0   0   0   0  |
1651  *           |  0   0   0   0   1   0   0   0  |
1652  *           |  0   0   0   0   0   1   0   0  |
1653  *           |  0   0   0   0   0   0   1   0  |
1654  *           |  0   0   0   0   0   0   0   1  |
1655  *           ~~                               ~~
1656  *
1657  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1658  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1659  * matrix is not singular.
1660  * __                                                                 __
1661  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1662  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1663  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1664  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1665  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1666  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1667  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1668  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1669  * ~~                                                                 ~~
1670  * __                                                                 __
1671  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1672  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1673  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1674  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1675  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1676  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1677  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1678  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1679  * ~~                                                                 ~~
1680  * __                                                                 __
1681  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1682  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1683  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1684  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1685  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1686  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1687  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1688  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1689  * ~~                                                                 ~~
1690  * __                                                                 __
1691  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1692  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1693  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1694  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1695  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1696  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1697  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1698  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1699  * ~~                                                                 ~~
1700  * __                                                                 __
1701  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1702  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1703  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1704  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1705  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1706  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1707  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1708  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1709  * ~~                                                                 ~~
1710  * __                                                                 __
1711  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1712  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1713  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1714  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1715  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1716  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1717  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1718  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1719  * ~~                                                                 ~~
1720  *                   __                               __
1721  *                   |  0   0   1   0   0   0   0   0  |
1722  *                   | 167 100  5   41 159 169 217 208 |
1723  *                   | 166 100  4   40 158 168 216 209 |
1724  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1725  *                   |  0   0   0   0   1   0   0   0  |
1726  *                   |  0   0   0   0   0   1   0   0  |
1727  *                   |  0   0   0   0   0   0   1   0  |
1728  *                   |  0   0   0   0   0   0   0   1  |
1729  *                   ~~                               ~~
1730  *
1731  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1732  * of the missing data.
1733  *
1734  * As is apparent from the example above, the only non-trivial rows in the
1735  * inverse matrix correspond to the data disks that we're trying to
1736  * reconstruct. Indeed, those are the only rows we need as the others would
1737  * only be useful for reconstructing data known or assumed to be valid. For
1738  * that reason, we only build the coefficients in the rows that correspond to
1739  * targeted columns.
1740  */
1741 
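/*
 * The three functions below implement the scheme described above:
 * vdev_raidz_matrix_init() fills in the Vandermonde rows of (V|I)' for the
 * parity columns actually used, vdev_raidz_matrix_invert() performs the
 * Gauss-Jordan elimination, and vdev_raidz_matrix_reconstruct() multiplies
 * the surviving columns by the inverted rows to regenerate the missing data.
 *
 * All of the element arithmetic is done through power/log tables.  The
 * following is an illustrative userland sketch (not compiled into this file,
 * and gf_init()/gf_mul() are hypothetical names) of how such tables can be
 * built for GF(2^8) with the x^8 + x^4 + x^3 + x^2 + 1 (0x11d) polynomial
 * used by the RAID-6 construction referenced at the top of this file, and
 * how a multiply then becomes a table lookup of summed logs:
 */
#if 0
#include <stdint.h>

static uint8_t gf_pow2[256];	/* gf_pow2[i] == 2^i in GF(2^8) */
static uint8_t gf_log2[256];	/* gf_log2[2^i] == i; gf_log2[0] unused */

static void
gf_init(void)
{
	unsigned v = 1;

	for (int i = 0; i < 255; i++) {
		gf_pow2[i] = v;
		gf_log2[v] = i;
		v <<= 1;
		if (v & 0x100)
			v ^= 0x11d;	/* reduce modulo the field polynomial */
	}
	gf_pow2[255] = gf_pow2[0];	/* 2^255 == 1, so exponents wrap at 255 */
}

static uint8_t
gf_mul(uint8_t a, uint8_t b)
{
	if (a == 0 || b == 0)
		return (0);
	return (gf_pow2[(gf_log2[a] + gf_log2[b]) % 255]);
}
#endif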
1742 static void
1743 vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map,
1744     uint8_t **rows)
1745 {
1746 	int i, j;
1747 	int pow;
1748 
1749 	ASSERT(n == rr->rr_cols - rr->rr_firstdatacol);
1750 
1751 	/*
1752 	 * Fill in the missing rows of interest.
1753 	 */
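	/*
	 * Row i is the Vandermonde row for parity column map[i] (0 for P,
	 * 1 for Q, 2 for R): rows[i][j] = (2^map[i])^(n - 1 - j), with the
	 * exponent kept in [0, 255) because 2^255 == 1 in GF(2^8).
	 */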
1754 	for (i = 0; i < nmap; i++) {
1755 		ASSERT3S(0, <=, map[i]);
1756 		ASSERT3S(map[i], <=, 2);
1757 
1758 		pow = map[i] * n;
1759 		if (pow > 255)
1760 			pow -= 255;
1761 		ASSERT(pow <= 255);
1762 
1763 		for (j = 0; j < n; j++) {
1764 			pow -= map[i];
1765 			if (pow < 0)
1766 				pow += 255;
1767 			rows[i][j] = vdev_raidz_pow2[pow];
1768 		}
1769 	}
1770 }
1771 
1772 static void
1773 vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing,
1774     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1775 {
1776 	int i, j, ii, jj;
1777 	uint8_t log;
1778 
1779 	/*
1780 	 * Assert that the first nmissing entries from the array of used
1781 	 * columns correspond to parity columns and that subsequent entries
1782 	 * correspond to data columns.
1783 	 */
1784 	for (i = 0; i < nmissing; i++) {
1785 		ASSERT3S(used[i], <, rr->rr_firstdatacol);
1786 	}
1787 	for (; i < n; i++) {
1788 		ASSERT3S(used[i], >=, rr->rr_firstdatacol);
1789 	}
1790 
1791 	/*
1792 	 * First initialize the storage where we'll compute the inverse rows.
1793 	 */
1794 	for (i = 0; i < nmissing; i++) {
1795 		for (j = 0; j < n; j++) {
1796 			invrows[i][j] = (i == j) ? 1 : 0;
1797 		}
1798 	}
1799 
1800 	/*
1801 	 * Subtract all trivial rows from the rows of consequence.
1802 	 */
1803 	for (i = 0; i < nmissing; i++) {
1804 		for (j = nmissing; j < n; j++) {
1805 			ASSERT3U(used[j], >=, rr->rr_firstdatacol);
1806 			jj = used[j] - rr->rr_firstdatacol;
1807 			ASSERT3S(jj, <, n);
1808 			invrows[i][j] = rows[i][jj];
1809 			rows[i][jj] = 0;
1810 		}
1811 	}
1812 
1813 	/*
1814 	 * For each of the rows of interest, we must normalize it and subtract
1815 	 * a multiple of it from the other rows.
1816 	 */
1817 	for (i = 0; i < nmissing; i++) {
1818 		for (j = 0; j < missing[i]; j++) {
1819 			ASSERT0(rows[i][j]);
1820 		}
1821 		ASSERT3U(rows[i][missing[i]], !=, 0);
1822 
1823 		/*
1824 		 * Compute the inverse of the first element and multiply each
1825 		 * element in the row by that value.
1826 		 */
1827 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1828 
1829 		for (j = 0; j < n; j++) {
1830 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1831 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1832 		}
1833 
1834 		for (ii = 0; ii < nmissing; ii++) {
1835 			if (i == ii)
1836 				continue;
1837 
1838 			ASSERT3U(rows[ii][missing[i]], !=, 0);
1839 
1840 			log = vdev_raidz_log2[rows[ii][missing[i]]];
1841 
1842 			for (j = 0; j < n; j++) {
1843 				rows[ii][j] ^=
1844 				    vdev_raidz_exp2(rows[i][j], log);
1845 				invrows[ii][j] ^=
1846 				    vdev_raidz_exp2(invrows[i][j], log);
1847 			}
1848 		}
1849 	}
1850 
1851 	/*
1852 	 * Verify that the data left in the rows properly forms part of
1853 	 * an identity matrix.
1854 	 */
1855 	for (i = 0; i < nmissing; i++) {
1856 		for (j = 0; j < n; j++) {
1857 			if (j == missing[i]) {
1858 				ASSERT3U(rows[i][j], ==, 1);
1859 			} else {
1860 				ASSERT0(rows[i][j]);
1861 			}
1862 		}
1863 	}
1864 }
1865 
1866 static void
1867 vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
1868     int *missing, uint8_t **invrows, const uint8_t *used)
1869 {
1870 	int i, j, x, cc, c;
1871 	uint8_t *src;
1872 	uint64_t ccount;
1873 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL };
1874 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 };
1875 	uint8_t log = 0;
1876 	uint8_t val;
1877 	int ll;
1878 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1879 	uint8_t *p, *pp;
1880 	size_t psize;
1881 
1882 	psize = sizeof (invlog[0][0]) * n * nmissing;
1883 	p = kmem_alloc(psize, KM_SLEEP);
1884 
1885 	for (pp = p, i = 0; i < nmissing; i++) {
1886 		invlog[i] = pp;
1887 		pp += n;
1888 	}
1889 
1890 	for (i = 0; i < nmissing; i++) {
1891 		for (j = 0; j < n; j++) {
1892 			ASSERT3U(invrows[i][j], !=, 0);
1893 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1894 		}
1895 	}
1896 
1897 	for (i = 0; i < n; i++) {
1898 		c = used[i];
1899 		ASSERT3U(c, <, rr->rr_cols);
1900 
1901 		ccount = rr->rr_col[c].rc_size;
1902 		ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0);
1903 		if (ccount == 0)
1904 			continue;
1905 		src = abd_to_buf(rr->rr_col[c].rc_abd);
1906 		for (j = 0; j < nmissing; j++) {
1907 			cc = missing[j] + rr->rr_firstdatacol;
1908 			ASSERT3U(cc, >=, rr->rr_firstdatacol);
1909 			ASSERT3U(cc, <, rr->rr_cols);
1910 			ASSERT3U(cc, !=, c);
1911 
1912 			dcount[j] = rr->rr_col[cc].rc_size;
1913 			if (dcount[j] != 0)
1914 				dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd);
1915 		}
1916 
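		/*
		 * Accumulate src * invrows[cc][i] into each missing column,
		 * doing the GF(2^8) multiply in the log domain: add the two
		 * logs, reduce mod 255, and look the result up in the power
		 * table.  A zero source byte contributes nothing.
		 */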
1917 		for (x = 0; x < ccount; x++, src++) {
1918 			if (*src != 0)
1919 				log = vdev_raidz_log2[*src];
1920 
1921 			for (cc = 0; cc < nmissing; cc++) {
1922 				if (x >= dcount[cc])
1923 					continue;
1924 
1925 				if (*src == 0) {
1926 					val = 0;
1927 				} else {
1928 					if ((ll = log + invlog[cc][i]) >= 255)
1929 						ll -= 255;
1930 					val = vdev_raidz_pow2[ll];
1931 				}
1932 
1933 				if (i == 0)
1934 					dst[cc][x] = val;
1935 				else
1936 					dst[cc][x] ^= val;
1937 			}
1938 		}
1939 	}
1940 
1941 	kmem_free(p, psize);
1942 }
1943 
1944 static void
1945 vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
1946 {
1947 	int i, c, t, tt;
1948 	unsigned int n;
1949 	unsigned int nmissing_rows;
1950 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1951 	int parity_map[VDEV_RAIDZ_MAXPARITY];
1952 	uint8_t *p, *pp;
1953 	size_t psize;
1954 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1955 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1956 	uint8_t *used;
1957 
1958 	abd_t **bufs = NULL;
1959 
1960 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1961 		zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts);
1962 	/*
1963 	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
1964 	 * temporary linear ABDs if any non-linear ABDs are found.
1965 	 */
1966 	for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) {
1967 		ASSERT(rr->rr_col[i].rc_abd != NULL);
1968 		if (!abd_is_linear(rr->rr_col[i].rc_abd)) {
1969 			bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *),
1970 			    KM_PUSHPAGE);
1971 
1972 			for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1973 				raidz_col_t *col = &rr->rr_col[c];
1974 
1975 				bufs[c] = col->rc_abd;
1976 				if (bufs[c] != NULL) {
1977 					col->rc_abd = abd_alloc_linear(
1978 					    col->rc_size, B_TRUE);
1979 					abd_copy(col->rc_abd, bufs[c],
1980 					    col->rc_size);
1981 				}
1982 			}
1983 
1984 			break;
1985 		}
1986 	}
1987 
1988 	n = rr->rr_cols - rr->rr_firstdatacol;
1989 
1990 	/*
1991 	 * Figure out which data columns are missing.
1992 	 */
1993 	nmissing_rows = 0;
1994 	for (t = 0; t < ntgts; t++) {
1995 		if (tgts[t] >= rr->rr_firstdatacol) {
1996 			missing_rows[nmissing_rows++] =
1997 			    tgts[t] - rr->rr_firstdatacol;
1998 		}
1999 	}
2000 
2001 	/*
2002 	 * Figure out which parity columns to use to help generate the missing
2003 	 * data columns.
2004 	 */
2005 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
2006 		ASSERT(tt < ntgts);
2007 		ASSERT(c < rr->rr_firstdatacol);
2008 
2009 		/*
2010 		 * Skip any targeted parity columns.
2011 		 */
2012 		if (c == tgts[tt]) {
2013 			tt++;
2014 			continue;
2015 		}
2016 
2017 		parity_map[i] = c;
2018 		i++;
2019 	}
2020 
2021 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
2022 	    nmissing_rows * n + sizeof (used[0]) * n;
2023 	p = kmem_alloc(psize, KM_SLEEP);
2024 
2025 	for (pp = p, i = 0; i < nmissing_rows; i++) {
2026 		rows[i] = pp;
2027 		pp += n;
2028 		invrows[i] = pp;
2029 		pp += n;
2030 	}
2031 	used = pp;
2032 
2033 	for (i = 0; i < nmissing_rows; i++) {
2034 		used[i] = parity_map[i];
2035 	}
2036 
2037 	for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
2038 		if (tt < nmissing_rows &&
2039 		    c == missing_rows[tt] + rr->rr_firstdatacol) {
2040 			tt++;
2041 			continue;
2042 		}
2043 
2044 		ASSERT3S(i, <, n);
2045 		used[i] = c;
2046 		i++;
2047 	}
2048 
2049 	/*
2050 	 * Initialize the interesting rows of the matrix.
2051 	 */
2052 	vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows);
2053 
2054 	/*
2055 	 * Invert the matrix.
2056 	 */
2057 	vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows,
2058 	    invrows, used);
2059 
2060 	/*
2061 	 * Reconstruct the missing data using the generated matrix.
2062 	 */
2063 	vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows,
2064 	    invrows, used);
2065 
2066 	kmem_free(p, psize);
2067 
2068 	/*
2069 	 * copy back from temporary linear abds and free them
2070 	 */
2071 	if (bufs) {
2072 		for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
2073 			raidz_col_t *col = &rr->rr_col[c];
2074 
2075 			if (bufs[c] != NULL) {
2076 				abd_copy(bufs[c], col->rc_abd, col->rc_size);
2077 				abd_free(col->rc_abd);
2078 			}
2079 			col->rc_abd = bufs[c];
2080 		}
2081 		kmem_free(bufs, rr->rr_cols * sizeof (abd_t *));
2082 	}
2083 }
2084 
2085 static void
2086 vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
2087     const int *t, int nt)
2088 {
2089 	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
2090 	int ntgts;
2091 	int i, c, ret;
2092 	int nbadparity, nbaddata;
2093 	int parity_valid[VDEV_RAIDZ_MAXPARITY];
2094 
2095 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
2096 		zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)",
2097 		    rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata,
2098 		    (int)rr->rr_missingparity);
2099 	}
2100 
2101 	nbadparity = rr->rr_firstdatacol;
2102 	nbaddata = rr->rr_cols - nbadparity;
2103 	ntgts = 0;
2104 	for (i = 0, c = 0; c < rr->rr_cols; c++) {
2105 		if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
2106 			zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u "
2107 			    "offset=%llx error=%u)",
2108 			    rr, c, (int)rr->rr_col[c].rc_devidx,
2109 			    (long long)rr->rr_col[c].rc_offset,
2110 			    (int)rr->rr_col[c].rc_error);
2111 		}
2112 		if (c < rr->rr_firstdatacol)
2113 			parity_valid[c] = B_FALSE;
2114 
2115 		if (i < nt && c == t[i]) {
2116 			tgts[ntgts++] = c;
2117 			i++;
2118 		} else if (rr->rr_col[c].rc_error != 0) {
2119 			tgts[ntgts++] = c;
2120 		} else if (c >= rr->rr_firstdatacol) {
2121 			nbaddata--;
2122 		} else {
2123 			parity_valid[c] = B_TRUE;
2124 			nbadparity--;
2125 		}
2126 	}
2127 
2128 	ASSERT(ntgts >= nt);
2129 	ASSERT(nbaddata >= 0);
2130 	ASSERT(nbaddata + nbadparity == ntgts);
2131 
2132 	dt = &tgts[nbadparity];
2133 
2134 	/* Reconstruct using the new math implementation */
2135 	ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata);
2136 	if (ret != RAIDZ_ORIGINAL_IMPL)
2137 		return;
2138 
2139 	/*
2140 	 * See if we can use any of our optimized reconstruction routines.
2141 	 */
2142 	switch (nbaddata) {
2143 	case 1:
2144 		if (parity_valid[VDEV_RAIDZ_P]) {
2145 			vdev_raidz_reconstruct_p(rr, dt, 1);
2146 			return;
2147 		}
2148 
2149 		ASSERT(rr->rr_firstdatacol > 1);
2150 
2151 		if (parity_valid[VDEV_RAIDZ_Q]) {
2152 			vdev_raidz_reconstruct_q(rr, dt, 1);
2153 			return;
2154 		}
2155 
2156 		ASSERT(rr->rr_firstdatacol > 2);
2157 		break;
2158 
2159 	case 2:
2160 		ASSERT(rr->rr_firstdatacol > 1);
2161 
2162 		if (parity_valid[VDEV_RAIDZ_P] &&
2163 		    parity_valid[VDEV_RAIDZ_Q]) {
2164 			vdev_raidz_reconstruct_pq(rr, dt, 2);
2165 			return;
2166 		}
2167 
2168 		ASSERT(rr->rr_firstdatacol > 2);
2169 
2170 		break;
2171 	}
2172 
2173 	vdev_raidz_reconstruct_general(rr, tgts, ntgts);
2174 }
2175 
2176 static int
2177 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
2178     uint64_t *logical_ashift, uint64_t *physical_ashift)
2179 {
2180 	vdev_raidz_t *vdrz = vd->vdev_tsd;
2181 	uint64_t nparity = vdrz->vd_nparity;
2182 	int c;
2183 	int lasterror = 0;
2184 	int numerrors = 0;
2185 
2186 	ASSERT(nparity > 0);
2187 
2188 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
2189 	    vd->vdev_children < nparity + 1) {
2190 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
2191 		return (SET_ERROR(EINVAL));
2192 	}
2193 
2194 	vdev_open_children(vd);
2195 
2196 	for (c = 0; c < vd->vdev_children; c++) {
2197 		vdev_t *cvd = vd->vdev_child[c];
2198 
2199 		if (cvd->vdev_open_error != 0) {
2200 			lasterror = cvd->vdev_open_error;
2201 			numerrors++;
2202 			continue;
2203 		}
2204 
2205 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
2206 		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
2207 		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
2208 	}
2209 	for (c = 0; c < vd->vdev_children; c++) {
2210 		vdev_t *cvd = vd->vdev_child[c];
2211 
2212 		if (cvd->vdev_open_error != 0)
2213 			continue;
2214 		*physical_ashift = vdev_best_ashift(*logical_ashift,
2215 		    *physical_ashift, cvd->vdev_physical_ashift);
2216 	}
2217 
2218 	if (vd->vdev_rz_expanding) {
2219 		*asize *= vd->vdev_children - 1;
2220 		*max_asize *= vd->vdev_children - 1;
2221 
2222 		vd->vdev_min_asize = *asize;
2223 	} else {
2224 		*asize *= vd->vdev_children;
2225 		*max_asize *= vd->vdev_children;
2226 	}
2227 
2228 	if (numerrors > nparity) {
2229 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
2230 		return (lasterror);
2231 	}
2232 
2233 	return (0);
2234 }
2235 
2236 static void
2237 vdev_raidz_close(vdev_t *vd)
2238 {
2239 	for (int c = 0; c < vd->vdev_children; c++) {
2240 		if (vd->vdev_child[c] != NULL)
2241 			vdev_close(vd->vdev_child[c]);
2242 	}
2243 }
2244 
2245 /*
2246  * Return the logical width to use, given the txg in which the allocation
2247  * happened.
2248  */
2249 static uint64_t
2250 vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg)
2251 {
2252 	reflow_node_t lookup = {
2253 		.re_txg = txg,
2254 	};
2255 	avl_index_t where;
2256 
2257 	uint64_t width;
2258 	mutex_enter(&vdrz->vd_expand_lock);
2259 	reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where);
2260 	if (re != NULL) {
2261 		width = re->re_logical_width;
2262 	} else {
2263 		re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE);
2264 		if (re != NULL)
2265 			width = re->re_logical_width;
2266 		else
2267 			width = vdrz->vd_original_width;
2268 	}
2269 	mutex_exit(&vdrz->vd_expand_lock);
2270 	return (width);
2271 }
2272 /*
2273  * This code converts an asize into the largest psize that can safely be written
2274  * to an allocation of that size for this vdev.
2275  *
2276  * Note that this function will not take into account the effect of gang
2277  * headers, which also modify the ASIZE of the DVAs. It is purely a reverse of
2278  * the psize_to_asize function.
2279  */
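/*
 * Illustrative example (not derived from any particular pool): on a 4-wide
 * raidz1, an asize of 6 sectors keeps all 6 (the 2 sectors in the second
 * row can hold data plus parity) and subtracts DIV_ROUND_UP(6, 4) = 2
 * parity sectors, for a psize of 4 sectors.  An asize of 5 sectors drops
 * the lone sector that spilled into the second row and yields 3 sectors.
 */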
2280 static uint64_t
2281 vdev_raidz_asize_to_psize(vdev_t *vd, uint64_t asize, uint64_t txg)
2282 {
2283 	vdev_raidz_t *vdrz = vd->vdev_tsd;
2284 	uint64_t psize;
2285 	uint64_t ashift = vd->vdev_top->vdev_ashift;
2286 	uint64_t nparity = vdrz->vd_nparity;
2287 
2288 	uint64_t cols = vdev_raidz_get_logical_width(vdrz, txg);
2289 
2290 	ASSERT0(asize % (1 << ashift));
2291 
2292 	psize = (asize >> ashift);
2293 	/*
2294 	 * If the roundup to nparity + 1 caused us to spill into a new row, we
2295 	 * need to ignore that row entirely (since it can't store data or
2296 	 * parity).
2297 	 */
2298 	uint64_t rows = psize / cols;
2299 	psize = psize - (rows * cols) <= nparity ? rows * cols : psize;
2300 	/*  Subtract out parity sectors for each row storing data. */
2301 	psize -= nparity * DIV_ROUND_UP(psize, cols);
2302 	psize <<= ashift;
2303 
2304 	return (psize);
2305 }
2306 
2307 /*
2308  * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated
2309  * more space due to the lower data-to-parity ratio.  In this case it's
2310  * important to pass in the correct txg.  Note that vdev_gang_header_asize()
2311  * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE,
2312  * regardless of txg.  This is assured because for a single data sector, we
2313  * allocate P+1 sectors regardless of width ("cols", which is at least P+1).
2314  */
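/*
 * Illustrative example: on a 4-wide raidz1, a psize of 4 sectors needs
 * DIV_ROUND_UP(4, 3) = 2 parity sectors, and the 6-sector total is already
 * a multiple of nparity + 1, so the asize is 6 sectors (the inverse of the
 * vdev_raidz_asize_to_psize() example above).
 */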
2315 static uint64_t
2316 vdev_raidz_psize_to_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
2317 {
2318 	vdev_raidz_t *vdrz = vd->vdev_tsd;
2319 	uint64_t asize;
2320 	uint64_t ashift = vd->vdev_top->vdev_ashift;
2321 	uint64_t nparity = vdrz->vd_nparity;
2322 
2323 	uint64_t cols = vdev_raidz_get_logical_width(vdrz, txg);
2324 
2325 	asize = ((psize - 1) >> ashift) + 1;
2326 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
2327 	asize = roundup(asize, nparity + 1) << ashift;
2328 
2329 #ifdef ZFS_DEBUG
2330 	uint64_t asize_new = ((psize - 1) >> ashift) + 1;
2331 	uint64_t ncols_new = vdrz->vd_physical_width;
2332 	asize_new += nparity * ((asize_new + ncols_new - nparity - 1) /
2333 	    (ncols_new - nparity));
2334 	asize_new = roundup(asize_new, nparity + 1) << ashift;
2335 	VERIFY3U(asize_new, <=, asize);
2336 #endif
2337 
2338 	return (asize);
2339 }
2340 
2341 /*
2342  * The allocatable space for a raidz vdev is N * sizeof(smallest child)
2343  * so each child must provide at least 1/Nth of its asize.
2344  */
2345 static uint64_t
2346 vdev_raidz_min_asize(vdev_t *vd)
2347 {
2348 	return ((vd->vdev_min_asize + vd->vdev_children - 1) /
2349 	    vd->vdev_children);
2350 }
2351 
2352 /*
2353  * Return B_TRUE if a read should be skipped because the child disk is
2354  * too slow.
2355  * vdev_child_slow_outlier() looks for outliers based on disk
2356  * latency from the most recent child reads.  Here we're checking if,
2357  * over time, a disk has been an outlier too many times and is
2358  * now in a sit out period.
2359  */
2360 boolean_t
2361 vdev_sit_out_reads(vdev_t *vd, zio_flag_t io_flags)
2362 {
2363 	if (vdev_read_sit_out_secs == 0)
2364 		return (B_FALSE);
2365 
2366 	/* Avoid skipping a data column read when scrubbing */
2367 	if (io_flags & ZIO_FLAG_SCRUB)
2368 		return (B_FALSE);
2369 
2370 	if (!vd->vdev_ops->vdev_op_leaf) {
2371 		boolean_t sitting = B_FALSE;
2372 		for (int c = 0; c < vd->vdev_children; c++) {
2373 			sitting |= vdev_sit_out_reads(vd->vdev_child[c],
2374 			    io_flags);
2375 		}
2376 		return (sitting);
2377 	}
2378 
2379 	if (vd->vdev_read_sit_out_expire >= gethrestime_sec())
2380 		return (B_TRUE);
2381 
2382 	vd->vdev_read_sit_out_expire = 0;
2383 
2384 	return (B_FALSE);
2385 }
2386 
2387 void
2388 vdev_raidz_child_done(zio_t *zio)
2389 {
2390 	raidz_col_t *rc = zio->io_private;
2391 
2392 	ASSERT3P(rc->rc_abd, !=, NULL);
2393 	rc->rc_error = zio->io_error;
2394 	rc->rc_tried = 1;
2395 	rc->rc_skipped = 0;
2396 }
2397 
2398 static void
2399 vdev_raidz_shadow_child_done(zio_t *zio)
2400 {
2401 	raidz_col_t *rc = zio->io_private;
2402 
2403 	rc->rc_shadow_error = zio->io_error;
2404 }
2405 
2406 static void
2407 vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col)
2408 {
2409 	(void) rm;
2410 #ifdef ZFS_DEBUG
2411 	zfs_range_seg64_t logical_rs, physical_rs, remain_rs;
2412 	logical_rs.rs_start = rr->rr_offset;
2413 	logical_rs.rs_end = logical_rs.rs_start +
2414 	    vdev_raidz_psize_to_asize(zio->io_vd, rr->rr_size,
2415 	    BP_GET_PHYSICAL_BIRTH(zio->io_bp));
2416 
2417 	raidz_col_t *rc = &rr->rr_col[col];
2418 	vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
2419 
2420 	vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
2421 	ASSERT(vdev_xlate_is_empty(&remain_rs));
2422 	if (vdev_xlate_is_empty(&physical_rs)) {
2423 		/*
2424 		 * If we are in the middle of expansion, the
2425 		 * physical->logical mapping is changing so vdev_xlate()
2426 		 * can't give us a reliable answer.
2427 		 */
2428 		return;
2429 	}
2430 	ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
2431 	ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
2432 	/*
2433 	 * It would be nice to assert that rs_end is equal
2434 	 * to rc_offset + rc_size but there might be an
2435 	 * optional I/O at the end that is not accounted in
2436 	 * rc_size.
2437 	 */
2438 	if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
2439 		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
2440 		    rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift));
2441 	} else {
2442 		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
2443 	}
2444 #endif
2445 }
2446 
2447 static void
2448 vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr)
2449 {
2450 	vdev_t *vd = zio->io_vd;
2451 	raidz_map_t *rm = zio->io_vsd;
2452 
2453 	vdev_raidz_generate_parity_row(rm, rr);
2454 
2455 	for (int c = 0; c < rr->rr_scols; c++) {
2456 		raidz_col_t *rc = &rr->rr_col[c];
2457 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2458 
2459 		/* Verify physical to logical translation */
2460 		vdev_raidz_io_verify(zio, rm, rr, c);
2461 
2462 		if (rc->rc_size == 0)
2463 			continue;
2464 
2465 		ASSERT3U(rc->rc_offset + rc->rc_size, <,
2466 		    cvd->vdev_psize - VDEV_LABEL_END_SIZE);
2467 
2468 		ASSERT3P(rc->rc_abd, !=, NULL);
2469 		zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2470 		    rc->rc_offset, rc->rc_abd,
2471 		    abd_get_size(rc->rc_abd), zio->io_type,
2472 		    zio->io_priority, 0, vdev_raidz_child_done, rc));
2473 
2474 		if (rc->rc_shadow_devidx != INT_MAX) {
2475 			vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx];
2476 
2477 			ASSERT3U(
2478 			    rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <,
2479 			    cvd2->vdev_psize - VDEV_LABEL_END_SIZE);
2480 
2481 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd2,
2482 			    rc->rc_shadow_offset, rc->rc_abd,
2483 			    abd_get_size(rc->rc_abd),
2484 			    zio->io_type, zio->io_priority, 0,
2485 			    vdev_raidz_shadow_child_done, rc));
2486 		}
2487 	}
2488 }
2489 
2490 /*
2491  * Generate optional I/Os for skip sectors to improve aggregation contiguity.
2492  * This only works for vdev_raidz_map_alloc() (not _expanded()).
2493  */
2494 static void
2495 raidz_start_skip_writes(zio_t *zio)
2496 {
2497 	vdev_t *vd = zio->io_vd;
2498 	uint64_t ashift = vd->vdev_top->vdev_ashift;
2499 	raidz_map_t *rm = zio->io_vsd;
2500 	ASSERT3U(rm->rm_nrows, ==, 1);
2501 	raidz_row_t *rr = rm->rm_row[0];
2502 	for (int c = 0; c < rr->rr_scols; c++) {
2503 		raidz_col_t *rc = &rr->rr_col[c];
2504 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2505 		if (rc->rc_size != 0)
2506 			continue;
2507 		ASSERT0P(rc->rc_abd);
2508 
2509 		ASSERT3U(rc->rc_offset, <,
2510 		    cvd->vdev_psize - VDEV_LABEL_END_SIZE);
2511 
2512 		zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset,
2513 		    NULL, 1ULL << ashift, zio->io_type, zio->io_priority,
2514 		    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
2515 	}
2516 }
2517 
2518 static void
2519 vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
2520 {
2521 	vdev_t *vd = zio->io_vd;
2522 
2523 	/*
2524 	 * Iterate over the columns in reverse order so that we hit the parity
2525 	 * last -- any errors along the way will force us to read the parity.
2526 	 */
2527 	for (int c = rr->rr_cols - 1; c >= 0; c--) {
2528 		raidz_col_t *rc = &rr->rr_col[c];
2529 		if (rc->rc_size == 0)
2530 			continue;
2531 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2532 		if (!vdev_readable(cvd)) {
2533 			if (c >= rr->rr_firstdatacol)
2534 				rr->rr_missingdata++;
2535 			else
2536 				rr->rr_missingparity++;
2537 			rc->rc_error = SET_ERROR(ENXIO);
2538 			rc->rc_tried = 1;	/* don't even try */
2539 			rc->rc_skipped = 1;
2540 			continue;
2541 		}
2542 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2543 			if (c >= rr->rr_firstdatacol)
2544 				rr->rr_missingdata++;
2545 			else
2546 				rr->rr_missingparity++;
2547 			rc->rc_error = SET_ERROR(ESTALE);
2548 			rc->rc_skipped = 1;
2549 			continue;
2550 		}
2551 
2552 		if (vdev_sit_out_reads(cvd, zio->io_flags)) {
2553 			rr->rr_outlier_cnt++;
2554 			ASSERT0(rc->rc_latency_outlier);
2555 			rc->rc_latency_outlier = 1;
2556 		}
2557 	}
2558 
2559 	/*
2560 	 * When the row contains a latency outlier and sufficient parity
2561 	 * exists to reconstruct the column data, then skip reading the
2562 	 * known slow child vdev as a performance optimization.
2563 	 */
2564 	if (rr->rr_outlier_cnt > 0 &&
2565 	    (rr->rr_firstdatacol - rr->rr_missingparity) >=
2566 	    (rr->rr_missingdata + 1)) {
2567 
2568 		for (int c = rr->rr_cols - 1; c >= 0; c--) {
2569 			raidz_col_t *rc = &rr->rr_col[c];
2570 
2571 			if (rc->rc_error == 0 && rc->rc_latency_outlier) {
2572 				if (c >= rr->rr_firstdatacol)
2573 					rr->rr_missingdata++;
2574 				else
2575 					rr->rr_missingparity++;
2576 				rc->rc_error = SET_ERROR(EAGAIN);
2577 				rc->rc_skipped = 1;
2578 				break;
2579 			}
2580 		}
2581 	}
2582 
2583 	for (int c = rr->rr_cols - 1; c >= 0; c--) {
2584 		raidz_col_t *rc = &rr->rr_col[c];
2585 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2586 
2587 		if (rc->rc_error || rc->rc_size == 0)
2588 			continue;
2589 
2590 		if (forceparity ||
2591 		    c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
2592 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
2593 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2594 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
2595 			    zio->io_type, zio->io_priority, 0,
2596 			    vdev_raidz_child_done, rc));
2597 		}
2598 	}
2599 }
2600 
2601 static void
2602 vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm)
2603 {
2604 	vdev_t *vd = zio->io_vd;
2605 
2606 	for (int i = 0; i < rm->rm_nphys_cols; i++) {
2607 		raidz_col_t *prc = &rm->rm_phys_col[i];
2608 		if (prc->rc_size == 0)
2609 			continue;
2610 
2611 		ASSERT3U(prc->rc_devidx, ==, i);
2612 		vdev_t *cvd = vd->vdev_child[i];
2613 
2614 		if (!vdev_readable(cvd)) {
2615 			prc->rc_error = SET_ERROR(ENXIO);
2616 			prc->rc_tried = 1;	/* don't even try */
2617 			prc->rc_skipped = 1;
2618 			continue;
2619 		}
2620 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2621 			prc->rc_error = SET_ERROR(ESTALE);
2622 			prc->rc_skipped = 1;
2623 			continue;
2624 		}
2625 		zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2626 		    prc->rc_offset, prc->rc_abd, prc->rc_size,
2627 		    zio->io_type, zio->io_priority, 0,
2628 		    vdev_raidz_child_done, prc));
2629 	}
2630 }
2631 
2632 static void
2633 vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm)
2634 {
2635 	/*
2636 	 * If there are multiple rows, we will be hitting
2637 	 * all disks, so go ahead and read the parity so
2638 	 * that we are reading in decent size chunks.
2639 	 */
2640 	boolean_t forceparity = rm->rm_nrows > 1;
2641 
2642 	if (rm->rm_phys_col) {
2643 		vdev_raidz_io_start_read_phys_cols(zio, rm);
2644 	} else {
2645 		for (int i = 0; i < rm->rm_nrows; i++) {
2646 			raidz_row_t *rr = rm->rm_row[i];
2647 			vdev_raidz_io_start_read_row(zio, rr, forceparity);
2648 		}
2649 	}
2650 }
2651 
2652 /*
2653  * Start an IO operation on a RAIDZ VDev
2654  *
2655  * Outline:
2656  * - For write operations:
2657  *   1. Generate the parity data
2658  *   2. Create child zio write operations to each column's vdev, for both
2659  *      data and parity.
2660  *   3. If the column skips any sectors for padding, create optional dummy
2661  *      write zio children for those areas to improve aggregation continuity.
2662  * - For read operations:
2663  *   1. Create child zio read operations to each data column's vdev to read
2664  *      the range of data required for zio.
2665  *   2. If this is a scrub or resilver operation, or if any of the data
2666  *      vdevs have had errors, then create zio read operations to the parity
2667  *      columns' VDevs as well.
2668  */
2669 static void
2670 vdev_raidz_io_start(zio_t *zio)
2671 {
2672 	vdev_t *vd = zio->io_vd;
2673 	vdev_t *tvd = vd->vdev_top;
2674 	vdev_raidz_t *vdrz = vd->vdev_tsd;
2675 	raidz_map_t *rm;
2676 
2677 	uint64_t logical_width = vdev_raidz_get_logical_width(vdrz,
2678 	    BP_GET_PHYSICAL_BIRTH(zio->io_bp));
2679 	if (logical_width != vdrz->vd_physical_width) {
2680 		zfs_locked_range_t *lr = NULL;
2681 		uint64_t synced_offset = UINT64_MAX;
2682 		uint64_t next_offset = UINT64_MAX;
2683 		boolean_t use_scratch = B_FALSE;
2684 		/*
2685 		 * Note: when the expansion is completing, we set
2686 		 * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync())
2687 		 * in a later txg than when we last update spa_ubsync's state
2688 		 * (see the end of spa_raidz_expand_thread()).  Therefore we
2689 		 * may see vre_state!=SCANNING before
2690 		 * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected
2691 		 * on disk, but the copying progress has been synced to disk
2692 		 * (and reflected in spa_ubsync).  In this case it's fine to
2693 		 * treat the expansion as completed, since if we crash there's
2694 		 * no additional copying to do.
2695 		 */
2696 		if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
2697 			ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==,
2698 			    &vdrz->vn_vre);
2699 			lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock,
2700 			    zio->io_offset, zio->io_size, RL_READER);
2701 			use_scratch =
2702 			    (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) ==
2703 			    RRSS_SCRATCH_VALID);
2704 			synced_offset =
2705 			    RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync);
2706 			next_offset = vdrz->vn_vre.vre_offset;
2707 			/*
2708 			 * If we haven't resumed expanding since importing the
2709 			 * pool, vre_offset won't have been set yet.  In
2710 			 * this case the next offset to be copied is the same
2711 			 * as what was synced.
2712 			 */
2713 			if (next_offset == UINT64_MAX) {
2714 				next_offset = synced_offset;
2715 			}
2716 		}
2717 
2718 		rm = vdev_raidz_map_alloc_expanded(zio,
2719 		    tvd->vdev_ashift, vdrz->vd_physical_width,
2720 		    logical_width, vdrz->vd_nparity,
2721 		    synced_offset, next_offset, use_scratch);
2722 		rm->rm_lr = lr;
2723 	} else {
2724 		rm = vdev_raidz_map_alloc(zio,
2725 		    tvd->vdev_ashift, logical_width, vdrz->vd_nparity);
2726 	}
2727 	rm->rm_original_width = vdrz->vd_original_width;
2728 
2729 	zio->io_vsd = rm;
2730 	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
2731 	if (zio->io_type == ZIO_TYPE_WRITE) {
2732 		for (int i = 0; i < rm->rm_nrows; i++) {
2733 			vdev_raidz_io_start_write(zio, rm->rm_row[i]);
2734 		}
2735 
2736 		if (logical_width == vdrz->vd_physical_width) {
2737 			raidz_start_skip_writes(zio);
2738 		}
2739 	} else {
2740 		ASSERT(zio->io_type == ZIO_TYPE_READ);
2741 		vdev_raidz_io_start_read(zio, rm);
2742 	}
2743 
2744 	zio_execute(zio);
2745 }
2746 
2747 /*
2748  * Report a checksum error for a child of a RAID-Z device.
2749  */
2750 void
2751 vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
2752 {
2753 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
2754 
2755 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) &&
2756 	    zio->io_priority != ZIO_PRIORITY_REBUILD) {
2757 		zio_bad_cksum_t zbc;
2758 		raidz_map_t *rm = zio->io_vsd;
2759 
2760 		zbc.zbc_has_cksum = 0;
2761 		zbc.zbc_injected = rm->rm_ecksuminjected;
2762 
2763 		mutex_enter(&vd->vdev_stat_lock);
2764 		vd->vdev_stat.vs_checksum_errors++;
2765 		mutex_exit(&vd->vdev_stat_lock);
2766 		(void) zfs_ereport_post_checksum(zio->io_spa, vd,
2767 		    &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
2768 		    rc->rc_abd, bad_data, &zbc);
2769 	}
2770 }
2771 
2772 /*
2773  * We keep track of whether or not there were any injected errors, so that
2774  * any ereports we generate can note it.
2775  */
2776 static int
2777 raidz_checksum_verify(zio_t *zio)
2778 {
2779 	zio_bad_cksum_t zbc = {0};
2780 	raidz_map_t *rm = zio->io_vsd;
2781 
2782 	int ret = zio_checksum_error(zio, &zbc);
2783 	/*
2784 	 * Any Direct I/O read that has a checksum error must be treated as
2785 	 * suspicious as the contents of the buffer could be getting
2786 	 * manipulated while the I/O is taking place. The checksum verify error
2787 	 * will be reported to the top-level RAIDZ VDEV.
2788 	 */
2789 	if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) {
2790 		zio->io_error = ret;
2791 		zio->io_post |= ZIO_POST_DIO_CHKSUM_ERR;
2792 		zio_dio_chksum_verify_error_report(zio);
2793 		zio_checksum_verified(zio);
2794 		return (0);
2795 	}
2796 
2797 	if (ret != 0 && zbc.zbc_injected != 0)
2798 		rm->rm_ecksuminjected = 1;
2799 
2800 	return (ret);
2801 }
2802 
2803 /*
2804  * Generate the parity from the data columns. If we tried and were able to
2805  * read the parity without error, verify that the generated parity matches the
2806  * data we read. If it doesn't, we fire off a checksum error. Return the
2807  * number of such failures.
2808  */
2809 static int
2810 raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
2811 {
2812 	abd_t *orig[VDEV_RAIDZ_MAXPARITY];
2813 	int c, ret = 0;
2814 	raidz_map_t *rm = zio->io_vsd;
2815 	raidz_col_t *rc;
2816 
2817 	blkptr_t *bp = zio->io_bp;
2818 	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
2819 	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
2820 
2821 	if (checksum == ZIO_CHECKSUM_NOPARITY)
2822 		return (ret);
2823 
2824 	for (c = 0; c < rr->rr_firstdatacol; c++) {
2825 		rc = &rr->rr_col[c];
2826 		if (!rc->rc_tried || rc->rc_error != 0)
2827 			continue;
2828 
2829 		orig[c] = rc->rc_abd;
2830 		ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size);
2831 		rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
2832 	}
2833 
2834 	/*
2835 	 * Verify any empty sectors are zero filled to ensure the parity
2836 	 * is calculated correctly even if these non-data sectors are damaged.
2837 	 */
2838 	if (rr->rr_nempty && rr->rr_abd_empty != NULL)
2839 		ret += vdev_draid_map_verify_empty(zio, rr);
2840 
2841 	/*
2842 	 * Regenerates parity even for !tried||rc_error!=0 columns.  This
2843 	 * isn't harmful but it does have the side effect of repairing
2844 	 * parity columns we didn't realize needed it (i.e. even if we return 0).
2845 	 */
2846 	vdev_raidz_generate_parity_row(rm, rr);
2847 
2848 	for (c = 0; c < rr->rr_firstdatacol; c++) {
2849 		rc = &rr->rr_col[c];
2850 
2851 		if (!rc->rc_tried || rc->rc_error != 0)
2852 			continue;
2853 
2854 		if (abd_cmp(orig[c], rc->rc_abd) != 0) {
2855 			vdev_raidz_checksum_error(zio, rc, orig[c]);
2856 			rc->rc_error = SET_ERROR(ECKSUM);
2857 			ret++;
2858 		}
2859 		abd_free(orig[c]);
2860 	}
2861 
2862 	return (ret);
2863 }
2864 
2865 static int
2866 vdev_raidz_worst_error(raidz_row_t *rr)
2867 {
2868 	int error = 0;
2869 
2870 	for (int c = 0; c < rr->rr_cols; c++) {
2871 		error = zio_worst_error(error, rr->rr_col[c].rc_error);
2872 		error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error);
2873 	}
2874 
2875 	return (error);
2876 }
2877 
2878 /*
2879  * Find the median value from a set of n values
2880  */
2881 static uint64_t
2882 latency_median_value(const uint64_t *data, size_t n)
2883 {
2884 	uint64_t m;
2885 
2886 	if (n % 2 == 0)
2887 		m = (data[(n >> 1) - 1] + data[n >> 1]) >> 1;
2888 	else
2889 		m = data[((n + 1) >> 1) - 1];
2890 
2891 	return (m);
2892 }
2893 
2894 /*
2895  * Calculate the outlier fence from a set of n latency values
2896  *
2897  * fence = Q3 + vdev_raidz_outlier_insensitivity x (Q3 - Q1)
2898  */
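/*
 * Illustrative example (arbitrary time units): for the sorted latencies
 * { 1, 1, 2, 2, 3, 40 }, Q1 = 1, Q3 = 3, and IQR = 2.  With the default
 * insensitivity of 50 (see vdev_child_slow_outlier() below) the fence is
 * 3 + 50 * 2 = 103, so only a child far slower than its peers is treated
 * as an outlier.
 */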
2899 static uint64_t
2900 latency_quartiles_fence(const uint64_t *data, size_t n, uint64_t *iqr)
2901 {
2902 	uint64_t q1 = latency_median_value(&data[0], n >> 1);
2903 	uint64_t q3 = latency_median_value(&data[(n + 1) >> 1], n >> 1);
2904 
2905 	/*
2906 	 * To avoid detecting false positive outliers when N is small and
2907 	 * the latency values are very close, make sure the IQR
2908 	 * is at least 25% of Q1.
2909 	 */
2910 	*iqr = MAX(q3 - q1, q1 / 4);
2911 
2912 	return (q3 + (*iqr * vdev_raidz_outlier_insensitivity));
2913 }
2914 #define	LAT_CHILDREN_MIN	5
2915 #define	LAT_OUTLIER_LIMIT	20
2916 
2917 static int
2918 latency_compare(const void *arg1, const void *arg2)
2919 {
2920 	const uint64_t *l1 = (uint64_t *)arg1;
2921 	const uint64_t *l2 = (uint64_t *)arg2;
2922 
2923 	return (TREE_CMP(*l1, *l2));
2924 }
2925 
2926 void
2927 vdev_raidz_sit_child(vdev_t *svd, uint64_t secs)
2928 {
2929 	for (int c = 0; c < svd->vdev_children; c++)
2930 		vdev_raidz_sit_child(svd->vdev_child[c], secs);
2931 
2932 	if (!svd->vdev_ops->vdev_op_leaf)
2933 		return;
2934 
2935 	/* Begin a sit out period for this slow drive */
2936 	svd->vdev_read_sit_out_expire = gethrestime_sec() +
2937 	    secs;
2938 
2939 	/* Count each slow io period */
2940 	mutex_enter(&svd->vdev_stat_lock);
2941 	svd->vdev_stat.vs_slow_ios++;
2942 	mutex_exit(&svd->vdev_stat_lock);
2943 }
2944 
2945 void
2946 vdev_raidz_unsit_child(vdev_t *vd)
2947 {
2948 	for (int c = 0; c < vd->vdev_children; c++)
2949 		vdev_raidz_unsit_child(vd->vdev_child[c]);
2950 
2951 	if (!vd->vdev_ops->vdev_op_leaf)
2952 		return;
2953 
2954 	vd->vdev_read_sit_out_expire = 0;
2955 }
2956 
2957 /*
2958  * Check for any latency outlier from the latest set of child reads.
2959  *
2960  * Uses a Tukey's fence, with K = 50, for detecting extreme outliers. This
2961  * rule defines extreme outliers as data points outside the fence of the
2962  * third quartile plus fifty times the Interquartile Range (IQR). This range
2963  * is the distance between the first and third quartile.
2964  *
2965  * Fifty is an extremely large value for Tukey's fence, but the outliers we're
2966  * attempting to detect here are orders of magnitude larger than the
2967  * median. This large value should capture any truly faulty disk quickly,
2968  * without causing spurious sit-outs.
2969  *
2970  * To further avoid spurious sit-outs, vdevs must be detected multiple times
2971  * as an outlier before they are sat, and outlier counts will gradually decay.
2972  * Every nchildren times we have detected an outlier, we subtract 2 from the
2973  * outlier count of all children. If detected outliers are close to uniformly
2974  * distributed, this will result in the outlier count remaining close to 0
2975  * (in expectation; over long enough time-scales, spurious sit-outs are still
2976  * possible).
2977  */
2978 static void
2979 vdev_child_slow_outlier(zio_t *zio)
2980 {
2981 	vdev_t *vd = zio->io_vd;
2982 	if (!vd->vdev_autosit || vdev_read_sit_out_secs == 0 ||
2983 	    vd->vdev_children < LAT_CHILDREN_MIN)
2984 		return;
2985 
2986 	hrtime_t now = getlrtime();
2987 	uint64_t last = atomic_load_64(&vd->vdev_last_latency_check);
2988 
2989 	if ((now - last) < MSEC2NSEC(vdev_raidz_outlier_check_interval_ms))
2990 		return;
2991 
2992 	/* Allow a single winner when there are racing callers. */
2993 	if (atomic_cas_64(&vd->vdev_last_latency_check, last, now) != last)
2994 		return;
2995 
2996 	int children = vd->vdev_children;
2997 	uint64_t *lat_data = kmem_alloc(sizeof (uint64_t) * children, KM_SLEEP);
2998 
2999 	for (int c = 0; c < children; c++) {
3000 		vdev_t *cvd = vd->vdev_child[c];
3001 		if (cvd->vdev_prev_histo == NULL) {
3002 			mutex_enter(&cvd->vdev_stat_lock);
3003 			size_t size =
3004 			    sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]);
3005 			cvd->vdev_prev_histo = kmem_zalloc(size, KM_SLEEP);
3006 			memcpy(cvd->vdev_prev_histo,
3007 			    cvd->vdev_stat_ex.vsx_disk_histo[ZIO_TYPE_READ],
3008 			    size);
3009 			mutex_exit(&cvd->vdev_stat_lock);
3010 		}
3011 	}
3012 	uint64_t max = 0;
3013 	vdev_t *svd = NULL;
3014 	uint_t sitouts = 0;
3015 	boolean_t skip = B_FALSE, svd_sitting = B_FALSE;
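	/*
	 * Estimate each child's recent average read latency from its latency
	 * histogram: take the delta of each bucket since the previous check,
	 * weight it by the bucket's nominal latency (1 << i), and divide by
	 * the number of reads observed in the interval.
	 */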
3016 	for (int c = 0; c < children; c++) {
3017 		vdev_t *cvd = vd->vdev_child[c];
3018 		boolean_t sitting = vdev_sit_out_reads(cvd, 0) ||
3019 		    cvd->vdev_state != VDEV_STATE_HEALTHY;
3020 
3021 		/* We can't sit out more disks than we have parity */
3022 		if (sitting && ++sitouts >= vdev_get_nparity(vd))
3023 			skip = B_TRUE;
3024 
3025 		mutex_enter(&cvd->vdev_stat_lock);
3026 
3027 		uint64_t *prev_histo = cvd->vdev_prev_histo;
3028 		uint64_t *histo =
3029 		    cvd->vdev_stat_ex.vsx_disk_histo[ZIO_TYPE_READ];
3030 		if (skip) {
3031 			size_t size =
3032 			    sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]);
3033 			memcpy(prev_histo, histo, size);
3034 			mutex_exit(&cvd->vdev_stat_lock);
3035 			continue;
3036 		}
3037 		uint64_t count = 0;
3038 		lat_data[c] = 0;
3039 		for (int i = 0; i < VDEV_L_HISTO_BUCKETS; i++) {
3040 			uint64_t this_count = histo[i] - prev_histo[i];
3041 			lat_data[c] += (1ULL << i) * this_count;
3042 			count += this_count;
3043 		}
3044 		size_t size = sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]);
3045 		memcpy(prev_histo, histo, size);
3046 		mutex_exit(&cvd->vdev_stat_lock);
3047 		lat_data[c] /= MAX(1, count);
3048 
3049 		/* Wait until all disks have been read from */
3050 		if (lat_data[c] == 0 && !sitting) {
3051 			skip = B_TRUE;
3052 			continue;
3053 		}
3054 
3055 		/* Keep track of the vdev with largest value */
3056 		if (lat_data[c] > max) {
3057 			max = lat_data[c];
3058 			svd = cvd;
3059 			svd_sitting = sitting;
3060 		}
3061 	}
3062 
3063 	if (skip) {
3064 		kmem_free(lat_data, sizeof (uint64_t) * children);
3065 		return;
3066 	}
3067 
3068 	qsort((void *)lat_data, children, sizeof (uint64_t), latency_compare);
3069 
3070 	uint64_t iqr;
3071 	uint64_t fence = latency_quartiles_fence(lat_data, children, &iqr);
3072 
3073 	ASSERT3U(lat_data[children - 1], ==, max);
3074 	if (max > fence && !svd_sitting) {
3075 		ASSERT3U(iqr, >, 0);
3076 		uint64_t incr = MAX(1, MIN((max - fence) / iqr,
3077 		    LAT_OUTLIER_LIMIT / 4));
3078 		vd->vdev_outlier_count += incr;
3079 		if (vd->vdev_outlier_count >= children) {
3080 			for (int c = 0; c < children; c++) {
3081 				vdev_t *cvd = vd->vdev_child[c];
3082 				cvd->vdev_outlier_count -= 2;
3083 				cvd->vdev_outlier_count = MAX(0,
3084 				    cvd->vdev_outlier_count);
3085 			}
3086 			vd->vdev_outlier_count = 0;
3087 		}
3088 		/*
3089 		 * Keep track of how many times this child has had
3090 		 * an outlier read. A disk that persistently has a
3091 		 * higher outlier count than its peers will be
3092 		 * considered a slow disk.
3093 		 */
3094 		svd->vdev_outlier_count += incr;
3095 		if (svd->vdev_outlier_count > LAT_OUTLIER_LIMIT) {
3096 			ASSERT0(svd->vdev_read_sit_out_expire);
3097 			vdev_raidz_sit_child(svd, vdev_read_sit_out_secs);
3098 			(void) zfs_ereport_post(FM_EREPORT_ZFS_SITOUT,
3099 			    zio->io_spa, svd, NULL, NULL, 0);
3100 			vdev_dbgmsg(svd, "begin read sit out for %d secs",
3101 			    (int)vdev_read_sit_out_secs);
3102 
3103 			for (int c = 0; c < vd->vdev_children; c++)
3104 				vd->vdev_child[c]->vdev_outlier_count = 0;
3105 		}
3106 	}
3107 
3108 	kmem_free(lat_data, sizeof (uint64_t) * children);
3109 }
3110 
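/*
 * Post-process a read whose data has been verified (or regenerated).  Tally
 * parity and data errors, verify any extra parity that was read (and
 * regenerate parity when resilvering), then use the known-good data to issue
 * repair writes to damaged children and, during scrub/resilver, to any shadow
 * (expansion) locations.
 */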
3111 static void
3112 vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
3113 {
3114 	int unexpected_errors = 0;
3115 	int parity_errors = 0;
3116 	int parity_untried = 0;
3117 	int data_errors = 0;
3118 	zio_flag_t add_flags = 0;
3119 
3120 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
3121 
3122 	for (int c = 0; c < rr->rr_cols; c++) {
3123 		raidz_col_t *rc = &rr->rr_col[c];
3124 
3125 		if (rc->rc_error) {
3126 			if (c < rr->rr_firstdatacol)
3127 				parity_errors++;
3128 			else
3129 				data_errors++;
3130 
3131 			if (!rc->rc_skipped)
3132 				unexpected_errors++;
3133 		} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
3134 			parity_untried++;
3135 		}
3136 
3137 		if (rc->rc_force_repair)
3138 			unexpected_errors++;
3139 	}
3140 
3141 	/*
3142 	 * If we read more parity disks than were used for
3143 	 * reconstruction, confirm that the other parity disks produced
3144 	 * correct data.
3145 	 *
3146 	 * Note that we also regenerate parity when resilvering so we
3147 	 * can write it out to failed devices later.
3148 	 */
3149 	boolean_t parity_verify = (parity_errors + parity_untried) <
3150 	    (rr->rr_firstdatacol - data_errors);
3151 	if (parity_verify || (zio->io_flags & ZIO_FLAG_RESILVER)) {
3152 		int n = raidz_parity_verify(zio, rr);
3153 		/*
3154 		 * In Reed-Solomon encoding, if we have ndata+1 columns and
3155 		 * the parity doesn't match, it means the data integrity is
3156 		 * compromised. We shouldn't try to repair anything in this
3157 		 * case.
3158 		 */
3159 		if (parity_verify && n > 0 &&
3160 		    zio->io_priority == ZIO_PRIORITY_REBUILD)
3161 			return;
3162 		/*
3163 		 * If we have only ndata columns, the data integrity will
3164 		 * be checked by the checksums normally, but not in case
3165 		 * of rebuild when we don't have checksums. In this case,
3166 		 * we add ZIO_FLAG_SPECULATIVE and try to not spread
3167 		 * unverified data. For example, when the target vdev happens
3168 		 * to be the mirroring spare vdev, we would repair only that
3169 		 * child in it which is being rebuilt.
3170 		 */
3171 		if (!parity_verify && zio->io_priority == ZIO_PRIORITY_REBUILD)
3172 			add_flags |= ZIO_FLAG_SPECULATIVE;
3173 		unexpected_errors += n;
3174 	}
3175 
3176 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
3177 	    (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) {
3178 		/*
3179 		 * Use the good data we have in hand to repair damaged children.
3180 		 */
3181 		for (int c = 0; c < rr->rr_cols; c++) {
3182 			raidz_col_t *rc = &rr->rr_col[c];
3183 			vdev_t *vd = zio->io_vd;
3184 			vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
3185 
3186 			if (!rc->rc_allow_repair) {
3187 				continue;
3188 			} else if (!rc->rc_force_repair &&
3189 			    (rc->rc_error == 0 || rc->rc_size == 0)) {
3190 				continue;
3191 			}
3192 			/*
3193 			 * We do not allow self healing for Direct I/O reads.
3194 			 * See comment in vdev_raid_row_alloc().
3195 			 */
3196 			ASSERT0(zio->io_flags & ZIO_FLAG_DIO_READ);
3197 
3198 			/*
3199 			 * When the target vdev is draid spare, we should clear
3200 			 * ZIO_FLAG_SPECULATIVE. First, if that draid spare maps
3201 			 * to another spare having an online/degraded disk, that
3202 			 * disk must be repaired also. Otherwise, the scrub will
3203 			 * detect a lot of cksum errors later. Second, since it
3204 			 * is draid spare, there is no harm in updating its
3205 			 * content on any vdev it maps to because the space is
3206 			 * reserved as a spare anyway.
3207 			 */
3208 			zio_flag_t aflags = add_flags;
3209 			if (rc->rc_tgt_is_dspare)
3210 				aflags &= ~ZIO_FLAG_SPECULATIVE;
3211 
3212 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
3213 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
3214 			    ZIO_TYPE_WRITE,
3215 			    zio->io_priority == ZIO_PRIORITY_REBUILD ?
3216 			    ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
3217 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
3218 			    ZIO_FLAG_SELF_HEAL : 0) | aflags, NULL, NULL));
3219 		}
3220 	}
3221 
3222 	/*
3223 	 * Scrub or resilver i/o's: overwrite any shadow locations with the
3224 	 * good data.  This ensures that if we've already copied this sector,
3225 	 * it will be corrected if it was damaged.  This writes more than is
3226 	 * necessary, but since expansion is paused during scrub/resilver, at
3227 	 * most a single row will have a shadow location.
3228 	 */
3229 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
3230 	    (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) {
3231 		for (int c = 0; c < rr->rr_cols; c++) {
3232 			raidz_col_t *rc = &rr->rr_col[c];
3233 			vdev_t *vd = zio->io_vd;
3234 
3235 			if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0)
3236 				continue;
3237 			vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx];
3238 
3239 			/*
3240 			 * Note: We don't want to update the repair stats
3241 			 * because that would incorrectly indicate that there
3242 			 * was bad data to repair, which we aren't sure about.
3243 			 * By clearing the SCAN_THREAD flag, we prevent this
3244 			 * from happening, despite having the REPAIR flag set.
3245 			 * We need to set SELF_HEAL so that this i/o can't be
3246 			 * bypassed by zio_vdev_io_start().
3247 			 */
3248 			zio_t *cio = zio_vdev_child_io(zio, NULL, cvd,
3249 			    rc->rc_shadow_offset, rc->rc_abd, rc->rc_size,
3250 			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
3251 			    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
3252 			    NULL, NULL);
3253 			cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD;
3254 			zio_nowait(cio);
3255 		}
3256 	}
3257 }
3258 
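/*
 * Undo a simulated failure: copy the preserved original contents back into
 * every column flagged rc_need_orig_restore and clear the flag.
 */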
3259 static void
3260 raidz_restore_orig_data(raidz_map_t *rm)
3261 {
3262 	for (int i = 0; i < rm->rm_nrows; i++) {
3263 		raidz_row_t *rr = rm->rm_row[i];
3264 		for (int c = 0; c < rr->rr_cols; c++) {
3265 			raidz_col_t *rc = &rr->rr_col[c];
3266 			if (rc->rc_need_orig_restore) {
3267 				abd_copy(rc->rc_abd,
3268 				    rc->rc_orig_data, rc->rc_size);
3269 				rc->rc_need_orig_restore = B_FALSE;
3270 			}
3271 		}
3272 	}
3273 }
3274 
3275 /*
3276  * During raidz_reconstruct() for an expanded VDEV, we need special
3277  * consideration for failure simulations.  See note in raidz_reconstruct()
3278  * on simulating failure of a pre-expansion device.
3279  *
3280  * Treating logical child i as failed, return TRUE if the given column should
3281  * be treated as failed.  The idea of logical children allows us to imagine
3282  * that a disk silently failed before a RAIDZ expansion (reads from this disk
3283  * succeed but return the wrong data).  Since the expansion doesn't verify
3284  * checksums, the incorrect data will be moved to new locations spread among
3285  * the children (going diagonally across them).
3286  *
3287  * Higher "logical child failures" (values of `i`) indicate these
3288  * "pre-expansion failures".  The first physical_width values imagine that a
3289  * current child failed; the next physical_width-1 values imagine that a
3290  * child failed before the most recent expansion; the next physical_width-2
3291  * values imagine a child failed in the expansion before that, etc.
3292  */
3293 static boolean_t
3294 raidz_simulate_failure(int physical_width, int original_width, int ashift,
3295     int i, raidz_col_t *rc)
3296 {
3297 	uint64_t sector_id =
3298 	    physical_width * (rc->rc_offset >> ashift) +
3299 	    rc->rc_devidx;
3300 
3301 	for (int w = physical_width; w >= original_width; w--) {
3302 		if (i < w) {
3303 			return (sector_id % w == i);
3304 		} else {
3305 			i -= w;
3306 		}
3307 	}
3308 	ASSERT(!"invalid logical child id");
3309 	return (B_FALSE);
3310 }
3311 
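/*
 * For example, with physical_width = 5 and original_width = 3 there are
 * 5 + 4 + 3 = 12 logical children: i = 0..4 simulate failure of a current
 * (5-wide) child, i = 5..8 a child of the previous 4-wide layout, and
 * i = 9..11 a child of the original 3-wide layout.  In each case the column
 * is considered failed when its linear sector index modulo that width equals
 * the re-based value of i.
 */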
3312 /*
3313  * returns EINVAL if reconstruction of the block will not be possible
3314  * returns ECKSUM if this specific reconstruction failed
3315  * returns 0 on successful reconstruction
3316  */
3317 static int
3318 raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
3319 {
3320 	vdev_t *vd = zio->io_vd;
3321 	raidz_map_t *rm = zio->io_vsd;
3322 	int physical_width = vd->vdev_children;
3323 	int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT;
3324 
3325 	if (vd->vdev_ops == &vdev_draid_ops) {
3326 		vdev_draid_config_t *vdc = vd->vdev_tsd;
3327 		physical_width = vdc->vdc_children;
3328 	}
3329 
3330 	int original_width = (rm->rm_original_width != 0) ?
3331 	    rm->rm_original_width : physical_width;
3332 
3333 	if (dbgmsg) {
3334 		zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u "
3335 		    "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts);
3336 	}
3337 
3338 	/* Reconstruct each row */
3339 	for (int r = 0; r < rm->rm_nrows; r++) {
3340 		raidz_row_t *rr = rm->rm_row[r];
3341 		int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */
3342 		int t = 0;
3343 		int dead = 0;
3344 		int dead_data = 0;
3345 
3346 		if (dbgmsg)
3347 			zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r);
3348 
3349 		for (int c = 0; c < rr->rr_cols; c++) {
3350 			raidz_col_t *rc = &rr->rr_col[c];
3351 			ASSERT0(rc->rc_need_orig_restore);
3352 			if (rc->rc_error != 0) {
3353 				dead++;
3354 				if (c >= nparity)
3355 					dead_data++;
3356 				continue;
3357 			}
3358 			if (rc->rc_size == 0)
3359 				continue;
3360 			for (int lt = 0; lt < ntgts; lt++) {
3361 				if (raidz_simulate_failure(physical_width,
3362 				    original_width,
3363 				    zio->io_vd->vdev_top->vdev_ashift,
3364 				    ltgts[lt], rc)) {
3365 					if (rc->rc_orig_data == NULL) {
3366 						rc->rc_orig_data =
3367 						    abd_alloc_linear(
3368 						    rc->rc_size, B_TRUE);
3369 						abd_copy(rc->rc_orig_data,
3370 						    rc->rc_abd, rc->rc_size);
3371 					}
3372 					rc->rc_need_orig_restore = B_TRUE;
3373 
3374 					dead++;
3375 					if (c >= nparity)
3376 						dead_data++;
3377 					/*
3378 					 * Note: simulating failure of a
3379 					 * pre-expansion device can hit more
3380 					 * than one column, in which case we
3381 					 * might try to simulate more failures
3382 					 * than can be reconstructed, which is
3383 					 * also more than the size of my_tgts.
3384 					 * This check prevents accessing past
3385 					 * the end of my_tgts.  The "dead >
3386 					 * nparity" check below will fail this
3387 					 * reconstruction attempt.
3388 					 */
3389 					if (t < VDEV_RAIDZ_MAXPARITY) {
3390 						my_tgts[t++] = c;
3391 						if (dbgmsg) {
3392 							zfs_dbgmsg("simulating "
3393 							    "failure of col %u "
3394 							    "devidx %u", c,
3395 							    (int)rc->rc_devidx);
3396 						}
3397 					}
3398 					break;
3399 				}
3400 			}
3401 		}
3402 		if (dead > nparity) {
3403 			/* reconstruction not possible */
3404 			if (dbgmsg) {
3405 				zfs_dbgmsg("reconstruction not possible; "
3406 				    "too many failures");
3407 			}
3408 			raidz_restore_orig_data(rm);
3409 			return (EINVAL);
3410 		}
3411 		if (dead_data > 0)
3412 			vdev_raidz_reconstruct_row(rm, rr, my_tgts, t);
3413 	}
3414 
3415 	/* Check for success */
3416 	if (raidz_checksum_verify(zio) == 0) {
3417 		if (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR)
3418 			return (0);
3419 
3420 		/* Reconstruction succeeded - report errors */
3421 		for (int i = 0; i < rm->rm_nrows; i++) {
3422 			raidz_row_t *rr = rm->rm_row[i];
3423 
3424 			for (int c = 0; c < rr->rr_cols; c++) {
3425 				raidz_col_t *rc = &rr->rr_col[c];
3426 				if (rc->rc_need_orig_restore) {
3427 					/*
3428 					 * Note: if this is a parity column,
3429 					 * we don't really know if it's wrong.
3430 					 * We need to let
3431 					 * vdev_raidz_io_done_verified() check
3432 					 * it, and if we set rc_error, it will
3433 					 * think that it is a "known" error
3434 					 * that doesn't need to be checked
3435 					 * or corrected.
3436 					 */
3437 					if (rc->rc_error == 0 &&
3438 					    c >= rr->rr_firstdatacol) {
3439 						vdev_raidz_checksum_error(zio,
3440 						    rc, rc->rc_orig_data);
3441 						rc->rc_error =
3442 						    SET_ERROR(ECKSUM);
3443 					}
3444 					rc->rc_need_orig_restore = B_FALSE;
3445 				}
3446 			}
3447 
3448 			vdev_raidz_io_done_verified(zio, rr);
3449 		}
3450 
3451 		zio_checksum_verified(zio);
3452 
3453 		if (dbgmsg) {
3454 			zfs_dbgmsg("reconstruction successful "
3455 			    "(checksum verified)");
3456 		}
3457 		return (0);
3458 	}
3459 
3460 	/* Reconstruction failed - restore original data */
3461 	raidz_restore_orig_data(rm);
3462 	if (dbgmsg) {
3463 		zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum "
3464 		    "failed", zio);
3465 	}
3466 	return (ECKSUM);
3467 }
3468 
3469 /*
3470  * Iterate over all combinations of N bad vdevs and attempt a reconstruction.
3471  * Note that the algorithm below is non-optimal because it doesn't take into
3472  * account how reconstruction is actually performed. For example, with
3473  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
3474  * is targeted as invalid as if columns 1 and 4 are targeted since in both
3475  * cases we'd only use parity information in column 0.
3476  *
3477  * The order that we find the various possible combinations of failed
3478  * disks is dictated by these rules:
3479  * - Examine each "slot" (the "i" in tgts[i])
3480  *   - Try to increment this slot (tgts[i] += 1)
3481  *   - if we can't increment because it runs into the next slot,
3482  *     reset our slot to the minimum, and examine the next slot
3483  *
3484  *  For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
3485  *  3 columns to reconstruct), we will generate the following sequence:
3486  *
3487  *  STATE        ACTION
3488  *  0 1 2        special case: skip since these are all parity
3489  *  0 1   3      first slot: reset to 0; middle slot: increment to 2
3490  *  0   2 3      first slot: increment to 1
3491  *    1 2 3      first: reset to 0; middle: reset to 1; last: increment to 4
3492  *  0 1     4    first: reset to 0; middle: increment to 2
3493  *  0   2   4    first: increment to 1
3494  *    1 2   4    first: reset to 0; middle: increment to 3
3495  *  0     3 4    first: increment to 1
3496  *    1   3 4    first: increment to 2
3497  *      2 3 4    first: reset to 0; middle: reset to 1; last: increment to 5
3498  *  0 1       5  first: reset to 0; middle: increment to 2
3499  *  0   2     5  first: increment to 1
3500  *    1 2     5  first: reset to 0; middle: increment to 3
3501  *  0     3   5  first: increment to 1
3502  *    1   3   5  first: increment to 2
3503  *      2 3   5  first: reset to 0; middle: increment to 4
3504  *  0       4 5  first: increment to 1
3505  *    1     4 5  first: increment to 2
3506  *      2   4 5  first: increment to 3
3507  *        3 4 5  done
3508  *
3509  * This strategy works for dRAID but is less efficient when there are a large
3510  * number of child vdevs and therefore permutations to check. Furthermore,
3511  * since the raidz_map_t rows likely do not overlap, reconstruction would be
3512  * possible as long as there are no more than nparity data errors per row.
3513  * These additional permutations are not currently checked but could be as
3514  * a future improvement.
3515  *
3516  * Returns 0 on success, ECKSUM on failure.
3517  */
3518 static int
3519 vdev_raidz_combrec(zio_t *zio)
3520 {
3521 	vdev_t *vd = zio->io_vd;
3522 	int nparity = vdev_get_nparity(vd);
3523 	raidz_map_t *rm = zio->io_vsd;
3524 	int physical_width = zio->io_vd->vdev_children;
3525 
3526 	if (vd->vdev_ops == &vdev_draid_ops) {
3527 		vdev_draid_config_t *vdc = vd->vdev_tsd;
3528 		nparity = vdc->vdc_nparity;
3529 		physical_width = vdc->vdc_children;
3530 	}
3531 
3532 	int original_width = (rm->rm_original_width != 0) ?
3533 	    rm->rm_original_width : physical_width;
3534 
3535 	for (int i = 0; i < rm->rm_nrows; i++) {
3536 		raidz_row_t *rr = rm->rm_row[i];
3537 		int total_errors = 0;
3538 
3539 		for (int c = 0; c < rr->rr_cols; c++) {
3540 			if (rr->rr_col[c].rc_error)
3541 				total_errors++;
3542 		}
3543 
3544 		if (total_errors > nparity)
3545 			return (vdev_raidz_worst_error(rr));
3546 	}
3547 
3548 	for (int num_failures = 1; num_failures <= nparity; num_failures++) {
3549 		int tstore[VDEV_RAIDZ_MAXPARITY + 2];
3550 		int *ltgts = &tstore[1]; /* value is logical child ID */
3551 
3552 
3553 		/*
3554 		 * Determine number of logical children, n.  See comment
3555 		 * above raidz_simulate_failure().
3556 		 */
3557 		int n = 0;
3558 		for (int w = physical_width;
3559 		    w >= original_width; w--) {
3560 			n += w;
3561 		}
3562 
3563 		ASSERT3U(num_failures, <=, nparity);
3564 		ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);
3565 
3566 		/* Handle corner cases in combrec logic */
3567 		ltgts[-1] = -1;
3568 		for (int i = 0; i < num_failures; i++) {
3569 			ltgts[i] = i;
3570 		}
3571 		ltgts[num_failures] = n;
3572 
3573 		for (;;) {
3574 			int err = raidz_reconstruct(zio, ltgts, num_failures,
3575 			    nparity);
3576 			if (err == EINVAL) {
3577 				/*
3578 				 * Reconstruction not possible with this #
3579 				 * failures; try more failures.
3580 				 */
3581 				break;
3582 			} else if (err == 0)
3583 				return (0);
3584 
3585 			/* Compute next targets to try */
3586 			for (int t = 0; ; t++) {
3587 				ASSERT3U(t, <, num_failures);
3588 				ltgts[t]++;
3589 				if (ltgts[t] == n) {
3590 					/* try more failures */
3591 					ASSERT3U(t, ==, num_failures - 1);
3592 					if (zfs_flags &
3593 					    ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
3594 						zfs_dbgmsg("reconstruction "
3595 						    "failed for num_failures="
3596 						    "%u; tried all "
3597 						    "combinations",
3598 						    num_failures);
3599 					}
3600 					break;
3601 				}
3602 
3603 				ASSERT3U(ltgts[t], <, n);
3604 				ASSERT3U(ltgts[t], <=, ltgts[t + 1]);
3605 
3606 				/*
3607 				 * If that spot is available, we're done here.
3608 				 * Try the next combination.
3609 				 */
3610 				if (ltgts[t] != ltgts[t + 1])
3611 					break; // found next combination
3612 
3613 				/*
3614 				 * Otherwise, reset this tgt to the minimum,
3615 				 * and move on to the next tgt.
3616 				 */
3617 				ltgts[t] = ltgts[t - 1] + 1;
3618 				ASSERT3U(ltgts[t], ==, t);
3619 			}
3620 
3621 			/* Increase the number of failures and keep trying. */
3622 			if (ltgts[num_failures - 1] == n)
3623 				break;
3624 		}
3625 	}
3626 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
3627 		zfs_dbgmsg("reconstruction failed for all num_failures");
3628 	return (ECKSUM);
3629 }
3630 
3631 void
3632 vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
3633 {
3634 	for (uint64_t row = 0; row < rm->rm_nrows; row++) {
3635 		raidz_row_t *rr = rm->rm_row[row];
3636 		vdev_raidz_reconstruct_row(rm, rr, t, nt);
3637 	}
3638 }
3639 
3640 /*
3641  * Complete a write IO operation on a RAIDZ VDev
3642  *
3643  * Outline:
3644  *   1. Check for errors on the child IOs.
3645  *   2. Return, setting an error code if too few child VDevs were written
3646  *      to reconstruct the data later.  Note that partial writes are
3647  *      considered successful if they can be reconstructed at all.
3648  */
3649 static void
3650 vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
3651 {
3652 	int normal_errors = 0;
3653 	int shadow_errors = 0;
3654 	int retryable_errors = 0;
3655 
3656 	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
3657 	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
3658 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
3659 
3660 	for (int c = 0; c < rr->rr_cols; c++) {
3661 		raidz_col_t *rc = &rr->rr_col[c];
3662 
3663 		if (rc->rc_error != 0) {
3664 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
3665 			normal_errors++;
3666 		}
3667 		if (rc->rc_shadow_error != 0) {
3668 			ASSERT(rc->rc_shadow_error != ECKSUM);
3669 			shadow_errors++;
3670 		}
3671 		if (rc->rc_error || rc->rc_shadow_error) {
3672 			vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
3673 			if (!(vdev_is_dead(cvd) || cvd->vdev_cant_write))
3674 				retryable_errors++;
3675 		}
3676 	}
3677 
3678 	/*
3679 	 * Treat partial writes as a success. If we couldn't write enough
3680 	 * columns to reconstruct the data, the I/O failed.  Otherwise, good
3681 	 * enough.  Note that in the case of a shadow write (during raidz
3682 	 * expansion), depending on if we crash, either the normal (old) or
3683 	 * shadow (new) location may become the "real" version of the block,
3684 	 * so both locations must have sufficient redundancy.
3685 	 *
3686 	 * Now that we support write reallocation, it would be better
3687 	 * to treat partial failure as real failure unless there are
3688 	 * no non-degraded top-level vdevs left, and not update DTLs
3689 	 * if we intend to reallocate.
3690 	 */
3691 	if (normal_errors > rr->rr_firstdatacol ||
3692 	    shadow_errors > rr->rr_firstdatacol) {
3693 		zio->io_error = zio_worst_error(zio->io_error,
3694 		    vdev_raidz_worst_error(rr));
3695 	} else if (retryable_errors && zfs_scrub_partial_writes) {
3696 		zio->io_flags |= ZIO_FLAG_POSTREAD;
3697 	}
3698 }
3699 
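/*
 * After all child reads have completed, reconstruct the data columns of this
 * row that reported errors, provided the total error count is within what the
 * parity that was read can correct.
 */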
3700 static void
3701 vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
3702     raidz_row_t *rr)
3703 {
3704 	int parity_errors = 0;
3705 	int parity_untried = 0;
3706 	int data_errors = 0;
3707 	int total_errors = 0;
3708 
3709 	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
3710 	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
3711 
3712 	for (int c = 0; c < rr->rr_cols; c++) {
3713 		raidz_col_t *rc = &rr->rr_col[c];
3714 
3715 		/*
3716 		 * If scrubbing and a replacing/sparing child vdev determined
3717 		 * that not all of its children have an identical copy of the
3718 		 * data, then clear the error so the column is treated like
3719 		 * any other read and force a repair to correct the damage.
3720 		 */
3721 		if (rc->rc_error == ECKSUM) {
3722 			ASSERT(zio->io_flags & ZIO_FLAG_SCRUB);
3723 			vdev_raidz_checksum_error(zio, rc, rc->rc_abd);
3724 			rc->rc_force_repair = 1;
3725 			rc->rc_error = 0;
3726 		}
3727 
3728 		if (rc->rc_error) {
3729 			if (c < rr->rr_firstdatacol)
3730 				parity_errors++;
3731 			else
3732 				data_errors++;
3733 
3734 			total_errors++;
3735 		} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
3736 			parity_untried++;
3737 		}
3738 	}
3739 
3740 	/*
3741 	 * If there were data errors and the number of errors we saw was
3742 	 * correctable -- less than or equal to the number of parity disks read
3743 	 * -- reconstruct based on the missing data.
3744 	 */
3745 	if (data_errors != 0 &&
3746 	    total_errors <= rr->rr_firstdatacol - parity_untried) {
3747 		/*
3748 		 * We either attempt to read all the parity columns or
3749 		 * none of them. If we didn't try to read parity, we
3750 		 * wouldn't be here in the correctable case. There must
3751 		 * also have been fewer parity errors than parity
3752 		 * columns or, again, we wouldn't be in this code path.
3753 		 */
3754 		ASSERT0(parity_untried);
3755 		ASSERT(parity_errors < rr->rr_firstdatacol);
3756 
3757 		/*
3758 		 * Identify the data columns that reported an error.
3759 		 */
3760 		int n = 0;
3761 		int tgts[VDEV_RAIDZ_MAXPARITY];
3762 		for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
3763 			raidz_col_t *rc = &rr->rr_col[c];
3764 			if (rc->rc_error != 0) {
3765 				ASSERT(n < VDEV_RAIDZ_MAXPARITY);
3766 				tgts[n++] = c;
3767 			}
3768 		}
3769 
3770 		ASSERT(rr->rr_firstdatacol >= n);
3771 
3772 		vdev_raidz_reconstruct_row(rm, rr, tgts, n);
3773 	}
3774 }
3775 
3776 /*
3777  * Return the number of reads issued.
3778  */
3779 static int
3780 vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
3781 {
3782 	vdev_t *vd = zio->io_vd;
3783 	int nread = 0;
3784 
3785 	rr->rr_missingdata = 0;
3786 	rr->rr_missingparity = 0;
3787 
3788 	/*
3789 	 * If this row contains empty sectors which are not required
3790 	 * for a normal read then allocate an ABD for them now so they
3791 	 * may be read, verified, and any needed repairs performed.
3792 	 */
3793 	if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL)
3794 		vdev_draid_map_alloc_empty(zio, rr);
3795 
3796 	for (int c = 0; c < rr->rr_cols; c++) {
3797 		raidz_col_t *rc = &rr->rr_col[c];
3798 		if (rc->rc_tried || rc->rc_size == 0)
3799 			continue;
3800 
3801 		zio_nowait(zio_vdev_child_io(zio, NULL,
3802 		    vd->vdev_child[rc->rc_devidx],
3803 		    rc->rc_offset, rc->rc_abd, rc->rc_size,
3804 		    zio->io_type, zio->io_priority, 0,
3805 		    vdev_raidz_child_done, rc));
3806 		nread++;
3807 	}
3808 	return (nread);
3809 }
3810 
3811 /*
3812  * We're here because either there were too many errors to even attempt
3813  * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec()
3814  * failed. In either case, there is enough bad data to prevent reconstruction.
3815  * Start checksum ereports for all children which haven't failed.
3816  */
3817 static void
3818 vdev_raidz_io_done_unrecoverable(zio_t *zio)
3819 {
3820 	raidz_map_t *rm = zio->io_vsd;
3821 
3822 	for (int i = 0; i < rm->rm_nrows; i++) {
3823 		raidz_row_t *rr = rm->rm_row[i];
3824 
3825 		for (int c = 0; c < rr->rr_cols; c++) {
3826 			raidz_col_t *rc = &rr->rr_col[c];
3827 			vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
3828 
3829 			if (rc->rc_error != 0)
3830 				continue;
3831 
3832 			zio_bad_cksum_t zbc;
3833 			zbc.zbc_has_cksum = 0;
3834 			zbc.zbc_injected = rm->rm_ecksuminjected;
3835 			mutex_enter(&cvd->vdev_stat_lock);
3836 			cvd->vdev_stat.vs_checksum_errors++;
3837 			mutex_exit(&cvd->vdev_stat_lock);
3838 			(void) zfs_ereport_start_checksum(zio->io_spa,
3839 			    cvd, &zio->io_bookmark, zio, rc->rc_offset,
3840 			    rc->rc_size, &zbc);
3841 		}
3842 	}
3843 }
3844 
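/*
 * I/O completion for a RAIDZ map.  Writes are checked per row for sufficient
 * redundancy.  For reads: copy any aggregated data back to the rows,
 * reconstruct columns with known errors, and verify the checksum.  If the
 * checksum fails, first retry by reading every column (data and parity), and
 * if that still does not verify, fall back to combinatorial reconstruction
 * via vdev_raidz_combrec().
 */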
3845 void
3846 vdev_raidz_io_done(zio_t *zio)
3847 {
3848 	raidz_map_t *rm = zio->io_vsd;
3849 
3850 	ASSERT(zio->io_bp != NULL);
3851 	if (zio->io_type == ZIO_TYPE_WRITE) {
3852 		for (int i = 0; i < rm->rm_nrows; i++) {
3853 			vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
3854 		}
3855 	} else {
3856 		if (rm->rm_phys_col) {
3857 			/*
3858 			 * This is an aggregated read.  Copy the data and status
3859 			 * from the aggregate abd's to the individual rows.
3860 			 */
3861 			for (int i = 0; i < rm->rm_nrows; i++) {
3862 				raidz_row_t *rr = rm->rm_row[i];
3863 
3864 				for (int c = 0; c < rr->rr_cols; c++) {
3865 					raidz_col_t *rc = &rr->rr_col[c];
3866 					if (rc->rc_tried || rc->rc_size == 0)
3867 						continue;
3868 
3869 					raidz_col_t *prc =
3870 					    &rm->rm_phys_col[rc->rc_devidx];
3871 					rc->rc_error = prc->rc_error;
3872 					rc->rc_tried = prc->rc_tried;
3873 					rc->rc_skipped = prc->rc_skipped;
3874 					if (c >= rr->rr_firstdatacol) {
3875 						/*
3876 						 * Note: this is slightly faster
3877 						 * than using abd_copy_off().
3878 						 */
3879 						char *physbuf = abd_to_buf(
3880 						    prc->rc_abd);
3881 						void *physloc = physbuf +
3882 						    rc->rc_offset -
3883 						    prc->rc_offset;
3884 
3885 						abd_copy_from_buf(rc->rc_abd,
3886 						    physloc, rc->rc_size);
3887 					}
3888 				}
3889 			}
3890 		}
3891 
3892 		for (int i = 0; i < rm->rm_nrows; i++) {
3893 			raidz_row_t *rr = rm->rm_row[i];
3894 			vdev_raidz_io_done_reconstruct_known_missing(zio,
3895 			    rm, rr);
3896 		}
3897 
3898 		if (raidz_checksum_verify(zio) == 0) {
3899 			if (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR)
3900 				goto done;
3901 
3902 			for (int i = 0; i < rm->rm_nrows; i++) {
3903 				raidz_row_t *rr = rm->rm_row[i];
3904 				vdev_raidz_io_done_verified(zio, rr);
3905 			}
3906 			/* Periodically check for a read outlier */
3907 			if (zio->io_type == ZIO_TYPE_READ)
3908 				vdev_child_slow_outlier(zio);
3909 			zio_checksum_verified(zio);
3910 		} else {
3911 			/*
3912 			 * A sequential resilver has no checksum which makes
3913 			 * combinatorial reconstruction impossible. This code
3914 			 * path is unreachable since raidz_checksum_verify()
3915 			 * has no checksum to verify and must succeed.
3916 			 */
3917 			ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD);
3918 
3919 			/*
3920 			 * This isn't a typical situation -- either we got a
3921 			 * read error or a child silently returned bad data.
3922 			 * Read every block so we can try again with as much
3923 			 * data and parity as we can track down. If we've
3924 			 * already been through once before, all children will
3925 			 * be marked as tried so we'll proceed to combinatorial
3926 			 * reconstruction.
3927 			 */
3928 			int nread = 0;
3929 			for (int i = 0; i < rm->rm_nrows; i++) {
3930 				nread += vdev_raidz_read_all(zio,
3931 				    rm->rm_row[i]);
3932 			}
3933 			if (nread != 0) {
3934 				/*
3935 				 * Normally our stage is VDEV_IO_DONE, but if
3936 				 * we've already called redone(), it will have
3937 				 * changed to VDEV_IO_START, in which case we
3938 				 * don't want to call redone() again.
3939 				 */
3940 				if (zio->io_stage != ZIO_STAGE_VDEV_IO_START)
3941 					zio_vdev_io_redone(zio);
3942 				return;
3943 			}
3944 			/*
3945 			 * It would be too expensive to try every possible
3946 			 * combination of failed sectors in every row, so
3947 			 * instead we try every combination of failed current or
3948 			 * past physical disk. This means that if the incorrect
3949 			 * sectors were all on Nparity disks at any point in the
3950 			 * past, we will find the correct data.  The only known
3951 			 * case where this is less durable than a non-expanded
3952 			 * RAIDZ, is if we have a silent failure during
3953 			 * expansion.  In that case, one block could be
3954 			 * partially in the old format and partially in the
3955 			 * new format, so we'd lose some sectors from the old
3956 			 * format and some from the new format.
3957 			 *
3958 			 * e.g. logical_width=4 physical_width=6
3959 			 * the 15 (6+5+4) possible failed disks are:
3960 			 * width=6 child=0
3961 			 * width=6 child=1
3962 			 * width=6 child=2
3963 			 * width=6 child=3
3964 			 * width=6 child=4
3965 			 * width=6 child=5
3966 			 * width=5 child=0
3967 			 * width=5 child=1
3968 			 * width=5 child=2
3969 			 * width=5 child=3
3970 			 * width=5 child=4
3971 			 * width=4 child=0
3972 			 * width=4 child=1
3973 			 * width=4 child=2
3974 			 * width=4 child=3
3975 			 * And we will try every combination of Nparity of these
3976 			 * failing.
3977 			 *
3978 			 * As a first pass, we can generate every combo,
3979 			 * and try reconstructing, ignoring any known
3980 			 * failures.  If any row has too many known + simulated
3981 			 * failures, then we bail on reconstructing with this
3982 			 * number of simulated failures.  As an improvement,
3983 			 * we could detect the number of whole known failures
3984 			 * (i.e. we have known failures on these disks for
3985 			 * every row; the disks never succeeded), and
3986 			 * subtract that from the max # failures to simulate.
3987 			 * We could go even further like the current
3988 			 * combrec code, but that doesn't seem like it
3989 			 * gains us very much.  If we simulate a failure
3990 			 * that is also a known failure, that's fine.
3991 			 */
3992 			zio->io_error = vdev_raidz_combrec(zio);
3993 			if (zio->io_error == ECKSUM &&
3994 			    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
3995 				vdev_raidz_io_done_unrecoverable(zio);
3996 			}
3997 		}
3998 	}
3999 done:
4000 	if (rm->rm_lr != NULL) {
4001 		zfs_rangelock_exit(rm->rm_lr);
4002 		rm->rm_lr = NULL;
4003 	}
4004 }
4005 
4006 static void
4007 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
4008 {
4009 	vdev_raidz_t *vdrz = vd->vdev_tsd;
4010 	if (faulted > vdrz->vd_nparity)
4011 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
4012 		    VDEV_AUX_NO_REPLICAS);
4013 	else if (degraded + faulted != 0)
4014 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
4015 	else
4016 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
4017 }
4018 
4019 /*
4020  * Determine if any portion of the provided block resides on a child vdev
4021  * with a dirty DTL and therefore needs to be resilvered.  The function
4022  * assumes that at least one DTL is dirty which implies that full stripe
4023  * width blocks must be resilvered.
4024  */
4025 static boolean_t
4026 vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
4027     uint64_t phys_birth)
4028 {
4029 	vdev_raidz_t *vdrz = vd->vdev_tsd;
4030 
4031 	/*
4032 	 * If we're in the middle of a RAIDZ expansion, this block may be in
4033 	 * the old and/or new location.  For simplicity, always resilver it.
4034 	 */
4035 	if (vdrz->vn_vre.vre_state == DSS_SCANNING)
4036 		return (B_TRUE);
4037 
4038 	uint64_t dcols = vd->vdev_children;
4039 	uint64_t nparity = vdrz->vd_nparity;
4040 	uint64_t ashift = vd->vdev_top->vdev_ashift;
4041 	/* The starting RAIDZ (parent) vdev sector of the block. */
4042 	uint64_t b = DVA_GET_OFFSET(dva) >> ashift;
4043 	/* The zio's size in units of the vdev's minimum sector size. */
4044 	uint64_t s = ((psize - 1) >> ashift) + 1;
4045 	/* The first column for this stripe. */
4046 	uint64_t f = b % dcols;
4047 
4048 	/* Unreachable by sequential resilver. */
4049 	ASSERT3U(phys_birth, !=, TXG_UNKNOWN);
4050 
4051 	if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
4052 		return (B_FALSE);
4053 
4054 	if (s + nparity >= dcols)
4055 		return (B_TRUE);
4056 
4057 	for (uint64_t c = 0; c < s + nparity; c++) {
4058 		uint64_t devidx = (f + c) % dcols;
4059 		vdev_t *cvd = vd->vdev_child[devidx];
4060 
4061 		/*
4062 		 * dsl_scan_need_resilver() already checked vd with
4063 		 * vdev_dtl_contains(). So here just check cvd with
4064 		 * vdev_dtl_empty(), cheaper and a good approximation.
4065 		 */
4066 		if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
4067 			return (B_TRUE);
4068 	}
4069 
4070 	return (B_FALSE);
4071 }
4072 
4073 static void
4074 vdev_raidz_xlate(vdev_t *cvd, const zfs_range_seg64_t *logical_rs,
4075     zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs)
4076 {
4077 	(void) remain_rs;
4078 
4079 	vdev_t *raidvd = cvd->vdev_parent;
4080 	ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
4081 
4082 	vdev_raidz_t *vdrz = raidvd->vdev_tsd;
4083 
4084 	if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
4085 		/*
4086 		 * We're in the middle of expansion, in which case the
4087 		 * translation is in flux.  Any answer we give may be wrong
4088 		 * by the time we return, so it isn't safe for the caller to
4089 		 * act on it.  Therefore we say that this range isn't present
4090 		 * on any children.  The only consumers of this are "zpool
4091 		 * initialize" and trimming, both of which are "best effort"
4092 		 * anyway.
4093 		 */
4094 		physical_rs->rs_start = physical_rs->rs_end = 0;
4095 		remain_rs->rs_start = remain_rs->rs_end = 0;
4096 		return;
4097 	}
4098 
4099 	uint64_t width = vdrz->vd_physical_width;
4100 	uint64_t tgt_col = cvd->vdev_id;
4101 	uint64_t ashift = raidvd->vdev_top->vdev_ashift;
4102 
4103 	/* make sure the offsets are block-aligned */
4104 	ASSERT0(logical_rs->rs_start % (1 << ashift));
4105 	ASSERT0(logical_rs->rs_end % (1 << ashift));
4106 	uint64_t b_start = logical_rs->rs_start >> ashift;
4107 	uint64_t b_end = logical_rs->rs_end >> ashift;
4108 
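	/*
	 * Child tgt_col stores the parent sectors b with b % width == tgt_col,
	 * so start_row/end_row count how many such sectors fall below b_start
	 * and b_end.  For example, with width = 4, tgt_col = 1 and
	 * b_start = 10, this child holds parent sectors 1, 5 and 9 below
	 * b_start, giving start_row = ((10 - 1 - 1) / 4) + 1 = 3.
	 */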
4109 	uint64_t start_row = 0;
4110 	if (b_start > tgt_col) /* avoid underflow */
4111 		start_row = ((b_start - tgt_col - 1) / width) + 1;
4112 
4113 	uint64_t end_row = 0;
4114 	if (b_end > tgt_col)
4115 		end_row = ((b_end - tgt_col - 1) / width) + 1;
4116 
4117 	physical_rs->rs_start = start_row << ashift;
4118 	physical_rs->rs_end = end_row << ashift;
4119 
4120 	ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start);
4121 	ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
4122 	    logical_rs->rs_end - logical_rs->rs_start);
4123 }
4124 
4125 static void
4126 raidz_reflow_sync(void *arg, dmu_tx_t *tx)
4127 {
4128 	spa_t *spa = arg;
4129 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4130 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4131 
4132 	/*
4133 	 * Ensure there are no i/os to the range that is being committed.
4134 	 */
4135 	uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock);
4136 	ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset);
4137 
4138 	mutex_enter(&vre->vre_lock);
4139 	uint64_t new_offset =
4140 	    MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset);
4141 	/*
4142 	 * We should not have committed anything that failed.
4143 	 */
4144 	VERIFY3U(vre->vre_failed_offset, >=, old_offset);
4145 	mutex_exit(&vre->vre_lock);
4146 
4147 	zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
4148 	    old_offset, new_offset - old_offset,
4149 	    RL_WRITER);
4150 
4151 	/*
4152 	 * Update the uberblock that will be written when this txg completes.
4153 	 */
4154 	RAIDZ_REFLOW_SET(&spa->spa_uberblock,
4155 	    RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset);
4156 	vre->vre_offset_pertxg[txgoff] = 0;
4157 	zfs_rangelock_exit(lr);
4158 
4159 	mutex_enter(&vre->vre_lock);
4160 	vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff];
4161 	vre->vre_bytes_copied_pertxg[txgoff] = 0;
4162 	mutex_exit(&vre->vre_lock);
4163 
4164 	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
4165 	VERIFY0(zap_update(spa->spa_meta_objset,
4166 	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
4167 	    sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx));
4168 }
4169 
4170 static void
4171 raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx)
4172 {
4173 	spa_t *spa = arg;
4174 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4175 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4176 	vdev_raidz_t *vdrz = raidvd->vdev_tsd;
4177 
4178 	for (int i = 0; i < TXG_SIZE; i++)
4179 		VERIFY0(vre->vre_offset_pertxg[i]);
4180 
4181 	reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
4182 	re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES;
4183 	re->re_logical_width = vdrz->vd_physical_width;
4184 	mutex_enter(&vdrz->vd_expand_lock);
4185 	avl_add(&vdrz->vd_expand_txgs, re);
4186 	mutex_exit(&vdrz->vd_expand_lock);
4187 
4188 	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
4189 
4190 	/*
4191 	 * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS
4192 	 * will get written (based on vd_expand_txgs).
4193 	 */
4194 	vdev_config_dirty(vd);
4195 
4196 	/*
4197 	 * Before we change vre_state, the on-disk state must reflect that we
4198 	 * have completed all copying, so that vdev_raidz_io_start() can use
4199 	 * vre_state to determine if the reflow is in progress.  See also the
4200 	 * end of spa_raidz_expand_thread().
4201 	 */
4202 	VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==,
4203 	    raidvd->vdev_ms_count << raidvd->vdev_ms_shift);
4204 
4205 	vre->vre_end_time = gethrestime_sec();
4206 	vre->vre_state = DSS_FINISHED;
4207 
4208 	uint64_t state = vre->vre_state;
4209 	VERIFY0(zap_update(spa->spa_meta_objset,
4210 	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
4211 	    sizeof (state), 1, &state, tx));
4212 
4213 	uint64_t end_time = vre->vre_end_time;
4214 	VERIFY0(zap_update(spa->spa_meta_objset,
4215 	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
4216 	    sizeof (end_time), 1, &end_time, tx));
4217 
4218 	spa->spa_uberblock.ub_raidz_reflow_info = 0;
4219 
4220 	spa_history_log_internal(spa, "raidz vdev expansion completed",  tx,
4221 	    "%s vdev %llu new width %llu", spa_name(spa),
4222 	    (unsigned long long)vd->vdev_id,
4223 	    (unsigned long long)vd->vdev_children);
4224 
4225 	spa->spa_raidz_expand = NULL;
4226 	raidvd->vdev_rz_expanding = B_FALSE;
4227 
4228 	spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
4229 	spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
4230 	spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
4231 
4232 	spa_notify_waiters(spa);
4233 
4234 	/*
4235 	 * While we're in syncing context take the opportunity to
4236 	 * set up a scrub. All the data has been successfully copied
4237 	 * but we have not validated any checksums.
4238 	 */
4239 	setup_sync_arg_t setup_sync_arg = {
4240 		.func = POOL_SCAN_SCRUB,
4241 		.txgstart = 0,
4242 		.txgend = 0,
4243 	};
4244 	if (zfs_scrub_after_expand &&
4245 	    dsl_scan_setup_check(&setup_sync_arg.func, tx) == 0) {
4246 		dsl_scan_setup_sync(&setup_sync_arg, tx);
4247 	}
4248 }
4249 
4250 /*
4251  * State of one copy batch.
4252  */
4253 typedef struct raidz_reflow_arg {
4254 	vdev_raidz_expand_t *rra_vre;	/* Global expansion state. */
4255 	zfs_locked_range_t *rra_lr;	/* Range lock of this batch. */
4256 	uint64_t rra_txg;	/* TXG of this batch. */
4257 	uint_t rra_ashift;	/* Ashift of the vdev. */
4258 	uint32_t rra_tbd;	/* Number of in-flight ZIOs. */
4259 	uint32_t rra_writes;	/* Number of write ZIOs. */
4260 	zio_t *rra_zio[];	/* Write ZIO pointers. */
4261 } raidz_reflow_arg_t;
4262 
4263 /*
4264  * Write of the new location on one child is done.  Once all of them are done
4265  * we can unlock and free everything.
4266  */
4267 static void
4268 raidz_reflow_write_done(zio_t *zio)
4269 {
4270 	raidz_reflow_arg_t *rra = zio->io_private;
4271 	vdev_raidz_expand_t *vre = rra->rra_vre;
4272 
4273 	abd_free(zio->io_abd);
4274 
4275 	mutex_enter(&vre->vre_lock);
4276 	if (zio->io_error != 0) {
4277 		/* Force a reflow pause on errors */
4278 		vre->vre_failed_offset =
4279 		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
4280 	}
4281 	ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size);
4282 	vre->vre_outstanding_bytes -= zio->io_size;
4283 	if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length <
4284 	    vre->vre_failed_offset) {
4285 		vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] +=
4286 		    zio->io_size;
4287 	}
4288 	cv_signal(&vre->vre_cv);
4289 	boolean_t done = (--rra->rra_tbd == 0);
4290 	mutex_exit(&vre->vre_lock);
4291 
4292 	if (!done)
4293 		return;
4294 	spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
4295 	zfs_rangelock_exit(rra->rra_lr);
4296 	kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * rra->rra_writes);
4297 }
4298 
4299 /*
4300  * Read of the old location on one child is done.  Once all of them are done
4301  * writes should have all the data and we can issue them.
4302  */
4303 static void
4304 raidz_reflow_read_done(zio_t *zio)
4305 {
4306 	raidz_reflow_arg_t *rra = zio->io_private;
4307 	vdev_raidz_expand_t *vre = rra->rra_vre;
4308 
4309 	/* Single-block reads borrow write ABDs; free gang ABDs of bigger reads. */
4310 	if (zio->io_size > (1 << rra->rra_ashift))
4311 		abd_free(zio->io_abd);
4312 
4313 	/*
4314 	 * If the read failed, or if it was done on a vdev that is not fully
4315 	 * healthy (e.g. a child that has a resilver in progress), we may not
4316 	 * have the correct data.  Note that it's OK if the write proceeds.
4317 	 * It may write garbage but the location is otherwise unused and we
4318 	 * will retry later due to vre_failed_offset.
4319 	 */
4320 	if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) {
4321 		zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu "
4322 		    "err=%u partial_dtl_empty=%u missing_dtl_empty=%u",
4323 		    (long long)rra->rra_lr->lr_offset,
4324 		    (long long)rra->rra_lr->lr_length,
4325 		    (long long)rra->rra_txg,
4326 		    zio->io_error,
4327 		    vdev_dtl_empty(zio->io_vd, DTL_PARTIAL),
4328 		    vdev_dtl_empty(zio->io_vd, DTL_MISSING));
4329 		mutex_enter(&vre->vre_lock);
4330 		/* Force a reflow pause on errors */
4331 		vre->vre_failed_offset =
4332 		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
4333 		mutex_exit(&vre->vre_lock);
4334 	}
4335 
4336 	if (atomic_dec_32_nv(&rra->rra_tbd) > 0)
4337 		return;
4338 	uint32_t writes = rra->rra_tbd = rra->rra_writes;
4339 	for (uint64_t i = 0; i < writes; i++)
4340 		zio_nowait(rra->rra_zio[i]);
4341 }
4342 
4343 static void
4344 raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset,
4345     dmu_tx_t *tx)
4346 {
4347 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4348 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
4349 
4350 	if (offset == 0)
4351 		return;
4352 
4353 	mutex_enter(&vre->vre_lock);
4354 	ASSERT3U(vre->vre_offset, <=, offset);
4355 	vre->vre_offset = offset;
4356 	mutex_exit(&vre->vre_lock);
4357 
4358 	if (vre->vre_offset_pertxg[txgoff] == 0) {
4359 		dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync,
4360 		    spa, tx);
4361 	}
4362 	vre->vre_offset_pertxg[txgoff] = offset;
4363 }
4364 
4365 static boolean_t
4366 vdev_raidz_expand_child_replacing(vdev_t *raidz_vd)
4367 {
4368 	for (int i = 0; i < raidz_vd->vdev_children; i++) {
4369 		/* Quick check if a child is being replaced */
4370 		if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf)
4371 			return (B_TRUE);
4372 	}
4373 	return (B_FALSE);
4374 }
4375 
4376 static boolean_t
4377 raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, zfs_range_tree_t *rt,
4378     dmu_tx_t *tx)
4379 {
4380 	spa_t *spa = vd->vdev_spa;
4381 	uint_t ashift = vd->vdev_top->vdev_ashift;
4382 
4383 	zfs_range_seg_t *rs = zfs_range_tree_first(rt);
4384 	if (rt == NULL)
4385 	if (rs == NULL)
4386 	uint64_t offset = zfs_rs_get_start(rs, rt);
4387 	ASSERT(IS_P2ALIGNED(offset, 1 << ashift));
4388 	uint64_t size = zfs_rs_get_end(rs, rt) - offset;
4389 	ASSERT3U(size, >=, 1 << ashift);
4390 	ASSERT(IS_P2ALIGNED(size, 1 << ashift));
4391 
4392 	uint64_t blkid = offset >> ashift;
4393 	uint_t old_children = vd->vdev_children - 1;
4394 
4395 	/*
4396 	 * We can only progress to the point that writes will not overlap
4397 	 * with blocks whose progress has not yet been recorded on disk.
4398 	 * Since partially-copied rows are still read from the old location,
4399 	 * we need to stop one row before the sector-wise overlap, to prevent
4400 	 * row-wise overlap.
4401 	 *
4402 	 * Note that even if we are skipping over a large unallocated region,
4403 	 * we can't move the on-disk progress to `offset`, because concurrent
4404 	 * writes/allocations could still use the currently-unallocated
4405 	 * region.
4406 	 */
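	/*
	 * For example, with vdev_children = 5 (old_children = 4) and recorded
	 * progress ubsync_blkid = 1000, next_overwrite_blkid is
	 * 1000 + 1000 / 4 - 4 = 1246: writes for logical sectors below 1246
	 * touch new-layout rows up to 1245 / 5 = 249, safely short of the
	 * first uncopied old-layout row at 1000 / 4 = 250.
	 */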
4407 	uint64_t ubsync_blkid =
4408 	    RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift;
4409 	uint64_t next_overwrite_blkid = ubsync_blkid +
4410 	    ubsync_blkid / old_children - old_children;
4411 	VERIFY3U(next_overwrite_blkid, >, ubsync_blkid);
4412 	if (blkid >= next_overwrite_blkid) {
4413 		raidz_reflow_record_progress(vre,
4414 		    next_overwrite_blkid << ashift, tx);
4415 		return (B_TRUE);
4416 	}
4417 
4418 	size = MIN(size, raidz_expand_max_copy_bytes);
4419 	size = MIN(size, (uint64_t)old_children *
4420 	    MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE));
4421 	size = MAX(size, 1 << ashift);
4422 	uint_t blocks = MIN(size >> ashift, next_overwrite_blkid - blkid);
4423 	size = (uint64_t)blocks << ashift;
4424 
4425 	zfs_range_tree_remove(rt, offset, size);
4426 
4427 	uint_t reads = MIN(blocks, old_children);
4428 	uint_t writes = MIN(blocks, vd->vdev_children);
4429 	raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra) +
4430 	    sizeof (zio_t *) * writes, KM_SLEEP);
4431 	rra->rra_vre = vre;
4432 	rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock,
4433 	    offset, size, RL_WRITER);
4434 	rra->rra_txg = dmu_tx_get_txg(tx);
4435 	rra->rra_ashift = ashift;
4436 	rra->rra_tbd = reads;
4437 	rra->rra_writes = writes;
4438 
4439 	raidz_reflow_record_progress(vre, offset + size, tx);
4440 
4441 	/*
4442 	 * SCL_STATE will be released when the read and write are done,
4443 	 * by raidz_reflow_write_done().
4444 	 */
4445 	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
4446 
4447 	/* check if a replacing vdev was added, if so treat it as an error */
4448 	if (vdev_raidz_expand_child_replacing(vd)) {
4449 		zfs_dbgmsg("replacing vdev encountered, reflow paused at "
4450 		    "offset=%llu txg=%llu",
4451 		    (long long)rra->rra_lr->lr_offset,
4452 		    (long long)rra->rra_txg);
4453 
4454 		mutex_enter(&vre->vre_lock);
4455 		vre->vre_failed_offset =
4456 		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
4457 		cv_signal(&vre->vre_cv);
4458 		mutex_exit(&vre->vre_lock);
4459 
4460 		/* drop everything we acquired */
4461 		spa_config_exit(spa, SCL_STATE, spa);
4462 		zfs_rangelock_exit(rra->rra_lr);
4463 		kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * writes);
4464 		return (B_TRUE);
4465 	}
4466 
4467 	mutex_enter(&vre->vre_lock);
4468 	vre->vre_outstanding_bytes += size;
4469 	mutex_exit(&vre->vre_lock);
4470 
4471 	/* Allocate ABD and ZIO for each child we write. */
4472 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4473 	zio_t *pio = spa->spa_txg_zio[txgoff];
4474 	uint_t b = blocks / vd->vdev_children;
4475 	uint_t bb = blocks % vd->vdev_children;
4476 	for (uint_t i = 0; i < writes; i++) {
4477 		uint_t n = b + (i < bb);
4478 		abd_t *abd = abd_alloc_for_io(n << ashift, B_FALSE);
4479 		rra->rra_zio[i] = zio_vdev_child_io(pio, NULL,
4480 		    vd->vdev_child[(blkid + i) % vd->vdev_children],
4481 		    ((blkid + i) / vd->vdev_children) << ashift,
4482 		    abd, n << ashift, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
4483 		    ZIO_FLAG_CANFAIL, raidz_reflow_write_done, rra);
4484 	}
4485 
4486 	/*
4487 	 * Allocate and issue ZIO for each child we read.  For reads of only
4488 	 * one block we can use respective writer ABDs, since they will also
4489 	 * have only one block.  For bigger reads create gang ABDs and fill
4490 	 * them with respective blocks from writer ABDs.
4491 	 */
4492 	b = blocks / old_children;
4493 	bb = blocks % old_children;
4494 	for (uint_t i = 0; i < reads; i++) {
4495 		uint_t n = b + (i < bb);
4496 		abd_t *abd;
4497 		if (n > 1) {
4498 			abd = abd_alloc_gang();
4499 			for (uint_t j = 0; j < n; j++) {
4500 				uint_t b = j * old_children + i;
4501 				abd_t *cabd = abd_get_offset_size(
4502 				    rra->rra_zio[b % vd->vdev_children]->io_abd,
4503 				    (b / vd->vdev_children) << ashift,
4504 				    1 << ashift);
4505 				abd_gang_add(abd, cabd, B_TRUE);
4506 			}
4507 		} else {
4508 			abd = rra->rra_zio[i]->io_abd;
4509 		}
4510 		zio_nowait(zio_vdev_child_io(pio, NULL,
4511 		    vd->vdev_child[(blkid + i) % old_children],
4512 		    ((blkid + i) / old_children) << ashift, abd,
4513 		    n << ashift, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
4514 		    ZIO_FLAG_CANFAIL, raidz_reflow_read_done, rra));
4515 	}
4516 
4517 	return (B_FALSE);
4518 }
4519 
4520 /*
4521  * For testing (ztest specific)
4522  */
4523 static void
4524 raidz_expand_pause(uint_t pause_point)
4525 {
4526 	while (raidz_expand_pause_point != 0 &&
4527 	    raidz_expand_pause_point <= pause_point)
4528 		delay(hz);
4529 }
4530 
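/*
 * Completion callback for the scratch-area child I/Os issued by
 * raidz_reflow_scratch_sync(); propagate the worst child error to the parent.
 */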
4531 static void
4532 raidz_scratch_child_done(zio_t *zio)
4533 {
4534 	zio_t *pio = zio->io_private;
4535 
4536 	mutex_enter(&pio->io_lock);
4537 	pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
4538 	mutex_exit(&pio->io_lock);
4539 }
4540 
4541 /*
4542  * Reflow the beginning portion of the vdev into an intermediate scratch area
4543  * in memory and on disk. This operation must be persisted on disk before we
4544  * proceed to overwrite the beginning portion with the reflowed data.
4545  *
4546  * This multi-step task can fail to complete if disk errors are encountered
4547  * and we can return here after a pause (waiting for disk to become healthy).
4548  */
4549 static void
4550 raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
4551 {
4552 	vdev_raidz_expand_t *vre = arg;
4553 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
4554 	zio_t *pio;
4555 	int error;
4556 
4557 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4558 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4559 	int ashift = raidvd->vdev_ashift;
4560 	uint64_t write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << ashift,
4561 	    uint64_t);
4562 	uint64_t logical_size = write_size * raidvd->vdev_children;
4563 	uint64_t read_size =
4564 	    P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)),
4565 	    1 << ashift);
4566 
4567 	/*
4568 	 * The scratch space must be large enough to get us to the point
4569 	 * that one row does not overlap itself when moved.  This is checked
4570 	 * by vdev_raidz_attach_check().
4571 	 */
4572 	VERIFY3U(write_size, >=, raidvd->vdev_children << ashift);
4573 	VERIFY3U(write_size, <=, VDEV_BOOT_SIZE);
4574 	VERIFY3U(write_size, <=, read_size);
4575 
4576 	zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
4577 	    0, logical_size, RL_WRITER);
4578 
4579 	abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
4580 	    KM_SLEEP);
4581 	for (int i = 0; i < raidvd->vdev_children; i++) {
4582 		abds[i] = abd_alloc_linear(read_size, B_FALSE);
4583 	}
4584 
4585 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1);
4586 
4587 	/*
4588 	 * If we have already written the scratch area then we must read from
4589 	 * there, since new writes were redirected there while we were paused
4590 	 * or the original location may have been partially overwritten with
4591 	 * reflowed data.
4592 	 */
4593 	if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) {
4594 		VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size);
4595 		/*
4596 		 * Read from scratch space.
4597 		 */
4598 		pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4599 		for (int i = 0; i < raidvd->vdev_children; i++) {
4600 			/*
4601 			 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE
4602 			 * to the offset to calculate the physical offset to
4603 			 * write to.  Passing in a negative offset makes us
4604 			 * access the scratch area.
4605 			 */
4606 			zio_nowait(zio_vdev_child_io(pio, NULL,
4607 			    raidvd->vdev_child[i],
4608 			    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4609 			    write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
4610 			    ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
4611 		}
4612 		error = zio_wait(pio);
4613 		if (error != 0) {
4614 			zfs_dbgmsg("reflow: error %d reading scratch location",
4615 			    error);
4616 			goto io_error_exit;
4617 		}
4618 		goto overwrite;
4619 	}
4620 
4621 	/*
4622 	 * Read from original location.
4623 	 */
4624 	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4625 	for (int i = 0; i < raidvd->vdev_children - 1; i++) {
4626 		ASSERT0(vdev_is_dead(raidvd->vdev_child[i]));
4627 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4628 		    0, abds[i], read_size, ZIO_TYPE_READ,
4629 		    ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
4630 		    raidz_scratch_child_done, pio));
4631 	}
4632 	error = zio_wait(pio);
4633 	if (error != 0) {
4634 		zfs_dbgmsg("reflow: error %d reading original location", error);
4635 io_error_exit:
4636 		for (int i = 0; i < raidvd->vdev_children; i++)
4637 			abd_free(abds[i]);
4638 		kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4639 		zfs_rangelock_exit(lr);
4640 		spa_config_exit(spa, SCL_STATE, FTAG);
4641 		return;
4642 	}
4643 
4644 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2);
4645 
4646 	/*
4647 	 * Reflow in memory.
4648 	 */
4649 	uint64_t logical_sectors = logical_size >> ashift;
4650 	for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) {
4651 		int oldchild = i % (raidvd->vdev_children - 1);
4652 		uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift;
4653 
4654 		int newchild = i % raidvd->vdev_children;
4655 		uint64_t newoff = (i / raidvd->vdev_children) << ashift;
4656 
4657 		/* a single sector should not be copying over itself */
4658 		ASSERT(!(newchild == oldchild && newoff == oldoff));
4659 
4660 		abd_copy_off(abds[newchild], abds[oldchild],
4661 		    newoff, oldoff, 1 << ashift);
4662 	}
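	/*
	 * Worked example (illustrative only): with vdev_children = 4 (old
	 * width 3) and one sector = 1 << ashift:
	 *
	 *   logical sector 3: old (child 3 % 3 = 0, row 3 / 3 = 1)
	 *                     new (child 3 % 4 = 3, row 3 / 4 = 0)
	 *   logical sector 4: old (child 1, row 1), new (child 0, row 1)
	 *
	 * Sectors 0 .. vdev_children - 2 occupy the same (child, row) in both
	 * layouts, which is why the loop above starts at vdev_children - 1.
	 */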
4663 
4664 	/*
4665 	 * Verify that we filled in everything we intended to (write_size on
4666 	 * each child).
4667 	 */
4668 	VERIFY0(logical_sectors % raidvd->vdev_children);
4669 	VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==,
4670 	    write_size);
4671 
4672 	/*
4673 	 * Write to scratch location (boot area).
4674 	 */
4675 	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4676 	for (int i = 0; i < raidvd->vdev_children; i++) {
4677 		/*
4678 		 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
4679 		 * the offset to calculate the physical offset to write to.
4680 		 * Passing in a negative offset lets us access the boot area.
4681 		 */
4682 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4683 		    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4684 		    write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
4685 		    ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
4686 	}
4687 	error = zio_wait(pio);
4688 	if (error != 0) {
4689 		zfs_dbgmsg("reflow: error %d writing scratch location", error);
4690 		goto io_error_exit;
4691 	}
4692 	pio = zio_root(spa, NULL, NULL, 0);
4693 	zio_flush(pio, raidvd);
4694 	zio_wait(pio);
4695 
4696 	zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area",
4697 	    (long long)logical_size);
4698 
4699 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3);
4700 
4701 	/*
4702 	 * Update uberblock to indicate that scratch space is valid.  This is
4703 	 * needed because after this point, the real location may be
4704 	 * overwritten.  If we crash, we need to get the data from the
4705 	 * scratch space, rather than the real location.
4706 	 *
4707 	 * Note: ub_timestamp is bumped so that vdev_uberblock_compare()
4708 	 * will prefer this uberblock.
4709 	 */
4710 	RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size);
4711 	spa->spa_ubsync.ub_timestamp++;
4712 	ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4713 	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4714 	if (spa_multihost(spa))
4715 		mmp_update_uberblock(spa, &spa->spa_ubsync);
4716 
4717 	zfs_dbgmsg("reflow: uberblock updated "
4718 	    "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)",
4719 	    (long long)spa->spa_ubsync.ub_txg,
4720 	    (long long)logical_size,
4721 	    (long long)spa->spa_ubsync.ub_timestamp);
4722 
4723 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID);
4724 
4725 	/*
4726 	 * Overwrite with reflow'ed data.
4727 	 */
4728 overwrite:
4729 	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4730 	for (int i = 0; i < raidvd->vdev_children; i++) {
4731 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4732 		    0, abds[i], write_size, ZIO_TYPE_WRITE,
4733 		    ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
4734 		    raidz_scratch_child_done, pio));
4735 	}
4736 	error = zio_wait(pio);
4737 	if (error != 0) {
4738 		/*
4739 		 * When we exit early here and drop the range lock, new
4740 		 * writes will go into the scratch area so we'll need to
4741 		 * read from there when we return after pausing.
4742 		 */
4743 		zfs_dbgmsg("reflow: error %d writing real location", error);
4744 		/*
4745 		 * Update the uberblock that is written when this txg completes.
4746 		 */
4747 		RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID,
4748 		    logical_size);
4749 		goto io_error_exit;
4750 	}
4751 	pio = zio_root(spa, NULL, NULL, 0);
4752 	zio_flush(pio, raidvd);
4753 	zio_wait(pio);
4754 
4755 	zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location",
4756 	    (long long)logical_size);
4757 	for (int i = 0; i < raidvd->vdev_children; i++)
4758 		abd_free(abds[i]);
4759 	kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4760 
4761 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED);
4762 
4763 	/*
4764 	 * Update uberblock to indicate that the initial part has been
4765 	 * reflow'ed.  This is needed because after this point (when we exit
4766 	 * the rangelock), we allow regular writes to this region, which will
4767 	 * be written to the new location only (because reflow_offset_next ==
4768 	 * reflow_offset_synced).  If we crashed and re-copied from the
4769 	 * scratch space, we would lose the regular writes.
4770 	 */
4771 	RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED,
4772 	    logical_size);
4773 	spa->spa_ubsync.ub_timestamp++;
4774 	ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4775 	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4776 	if (spa_multihost(spa))
4777 		mmp_update_uberblock(spa, &spa->spa_ubsync);
4778 
4779 	zfs_dbgmsg("reflow: uberblock updated "
4780 	    "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
4781 	    (long long)spa->spa_ubsync.ub_txg,
4782 	    (long long)logical_size,
4783 	    (long long)spa->spa_ubsync.ub_timestamp);
4784 
4785 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1);
4786 
4787 	/*
4788 	 * Update progress.
4789 	 */
4790 	vre->vre_offset = logical_size;
4791 	zfs_rangelock_exit(lr);
4792 	spa_config_exit(spa, SCL_STATE, FTAG);
4793 
4794 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4795 	vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
4796 	vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
4797 	/*
4798 	 * Note - raidz_reflow_sync() will update the uberblock state to
4799 	 * RRSS_SCRATCH_INVALID_SYNCED_REFLOW
4800 	 */
4801 	raidz_reflow_sync(spa, tx);
4802 
4803 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2);
4804 }
4805 
4806 /*
4807  * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work
4808  * here.  No other i/o can be in progress, so we don't need the vre_rangelock.
4809  */
4810 void
4811 vdev_raidz_reflow_copy_scratch(spa_t *spa)
4812 {
4813 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4814 	uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock);
4815 	ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID);
4816 
4817 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4818 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4819 	ASSERT0(logical_size % raidvd->vdev_children);
4820 	uint64_t write_size = logical_size / raidvd->vdev_children;
4821 
4822 	zio_t *pio;
4823 
4824 	/*
4825 	 * Read from scratch space.
4826 	 */
4827 	abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
4828 	    KM_SLEEP);
4829 	for (int i = 0; i < raidvd->vdev_children; i++) {
4830 		abds[i] = abd_alloc_linear(write_size, B_FALSE);
4831 	}
4832 
4833 	pio = zio_root(spa, NULL, NULL, 0);
4834 	for (int i = 0; i < raidvd->vdev_children; i++) {
4835 		/*
4836 		 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
4837 		 * the offset to calculate the physical offset to read from.
4838 		 * Passing in a negative offset lets us access the boot area.
4839 		 */
4840 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4841 		    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4842 		    write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 0,
4843 		    raidz_scratch_child_done, pio));
4844 	}
4845 	zio_wait(pio);
4846 
4847 	/*
4848 	 * Overwrite real location with reflow'ed data.
4849 	 */
4850 	pio = zio_root(spa, NULL, NULL, 0);
4851 	for (int i = 0; i < raidvd->vdev_children; i++) {
4852 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4853 		    0, abds[i], write_size, ZIO_TYPE_WRITE,
4854 		    ZIO_PRIORITY_REMOVAL, 0,
4855 		    raidz_scratch_child_done, pio));
4856 	}
4857 	zio_wait(pio);
4858 	pio = zio_root(spa, NULL, NULL, 0);
4859 	zio_flush(pio, raidvd);
4860 	zio_wait(pio);
4861 
4862 	zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) "
4863 	    "to real location", (long long)logical_size);
4864 
4865 	for (int i = 0; i < raidvd->vdev_children; i++)
4866 		abd_free(abds[i]);
4867 	kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4868 
4869 	/*
4870 	 * Update uberblock.
4871 	 */
4872 	RAIDZ_REFLOW_SET(&spa->spa_ubsync,
4873 	    RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size);
4874 	spa->spa_ubsync.ub_timestamp++;
4875 	VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4876 	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4877 	if (spa_multihost(spa))
4878 		mmp_update_uberblock(spa, &spa->spa_ubsync);
4879 
4880 	zfs_dbgmsg("reflow recovery: uberblock updated "
4881 	    "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
4882 	    (long long)spa->spa_ubsync.ub_txg,
4883 	    (long long)logical_size,
4884 	    (long long)spa->spa_ubsync.ub_timestamp);
4885 
4886 	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool,
4887 	    spa_first_txg(spa));
4888 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4889 	vre->vre_offset = logical_size;
4890 	vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
4891 	vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
4892 	/*
4893 	 * Note that raidz_reflow_sync() will update the uberblock once more
4894 	 */
4895 	raidz_reflow_sync(spa, tx);
4896 
4897 	dmu_tx_commit(tx);
4898 
4899 	spa_config_exit(spa, SCL_STATE, FTAG);
4900 }
4901 
4902 static boolean_t
4903 spa_raidz_expand_thread_check(void *arg, zthr_t *zthr)
4904 {
4905 	(void) zthr;
4906 	spa_t *spa = arg;
4907 
4908 	return (spa->spa_raidz_expand != NULL &&
4909 	    !spa->spa_raidz_expand->vre_waiting_for_resilver);
4910 }
4911 
4912 /*
4913  * RAIDZ expansion background thread
4914  *
4915  * Can be called multiple times if the reflow is paused
4916  */
4917 static void
4918 spa_raidz_expand_thread(void *arg, zthr_t *zthr)
4919 {
4920 	spa_t *spa = arg;
4921 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4922 
4923 	if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID)
4924 		vre->vre_offset = 0;
4925 	else
4926 		vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync);
4927 
4928 	/* Reflow the beginning portion using the scratch area */
4929 	if (vre->vre_offset == 0) {
4930 		VERIFY0(dsl_sync_task(spa_name(spa),
4931 		    NULL, raidz_reflow_scratch_sync,
4932 		    vre, 0, ZFS_SPACE_CHECK_NONE));
4933 
4934 		/* if we encountered errors then pause */
4935 		if (vre->vre_offset == 0) {
4936 			mutex_enter(&vre->vre_lock);
4937 			vre->vre_waiting_for_resilver = B_TRUE;
4938 			mutex_exit(&vre->vre_lock);
4939 			return;
4940 		}
4941 	}
4942 
4943 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4944 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4945 
4946 	uint64_t guid = raidvd->vdev_guid;
4947 
4948 	/* Iterate over all the remaining metaslabs */
4949 	for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift;
4950 	    i < raidvd->vdev_ms_count &&
4951 	    !zthr_iscancelled(zthr) &&
4952 	    vre->vre_failed_offset == UINT64_MAX; i++) {
4953 		metaslab_t *msp = raidvd->vdev_ms[i];
4954 
4955 		metaslab_disable(msp);
4956 		mutex_enter(&msp->ms_lock);
4957 
4958 		/*
4959 		 * The metaslab may be newly created (for the expanded
4960 		 * space), in which case its trees won't exist yet,
4961 		 * so we need to bail out early.
4962 		 */
4963 		if (msp->ms_new) {
4964 			mutex_exit(&msp->ms_lock);
4965 			metaslab_enable(msp, B_FALSE, B_FALSE);
4966 			continue;
4967 		}
4968 
4969 		VERIFY0(metaslab_load(msp));
4970 
4971 		/*
4972 		 * We want to copy everything except the free (allocatable)
4973 		 * space.  Note that there may be a little bit more free
4974 		 * space (e.g. in ms_defer), and it's fine to copy that too.
4975 		 */
4976 		uint64_t shift, start;
4977 		zfs_range_seg_type_t type = metaslab_calculate_range_tree_type(
4978 		    raidvd, msp, &start, &shift);
4979 		zfs_range_tree_t *rt = zfs_range_tree_create_flags(
4980 		    NULL, type, NULL, start, shift, ZFS_RT_F_DYN_NAME,
4981 		    metaslab_rt_name(msp->ms_group, msp,
4982 		    "spa_raidz_expand_thread:rt"));
4983 		zfs_range_tree_add(rt, msp->ms_start, msp->ms_size);
4984 		zfs_range_tree_walk(msp->ms_allocatable, zfs_range_tree_remove,
4985 		    rt);
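		/*
		 * Illustration (hypothetical ranges): if this metaslab covers
		 * [0, 1G) and ms_allocatable holds [100M, 200M) and
		 * [600M, 650M), rt now contains [0, 100M), [200M, 600M) and
		 * [650M, 1G), i.e. the allocated space that must be copied.
		 */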
4986 		mutex_exit(&msp->ms_lock);
4987 
4988 		/*
4989 		 * Force the last sector of each metaslab to be copied.  This
4990 		 * ensures that we advance the on-disk progress to the end of
4991 		 * this metaslab while the metaslab is disabled.  Otherwise, we
4992 		 * could move past this metaslab without advancing the on-disk
4993 		 * progress, and then an allocation to this metaslab would not
4994 		 * be copied.
4995 		 */
4996 		int sectorsz = 1 << raidvd->vdev_ashift;
4997 		uint64_t ms_last_offset = msp->ms_start +
4998 		    msp->ms_size - sectorsz;
4999 		if (!zfs_range_tree_contains(rt, ms_last_offset, sectorsz)) {
5000 			zfs_range_tree_add(rt, ms_last_offset, sectorsz);
5001 		}
5002 
5003 		/*
5004 		 * When we are resuming from a paused expansion (i.e.
5005 		 * when importing a pool with an expansion in progress),
5006 		 * discard any state that we have already processed.
5007 		 */
5008 		if (vre->vre_offset > msp->ms_start) {
5009 			zfs_range_tree_clear(rt, msp->ms_start,
5010 			    vre->vre_offset - msp->ms_start);
5011 		}
5012 
5013 		while (!zthr_iscancelled(zthr) &&
5014 		    !zfs_range_tree_is_empty(rt) &&
5015 		    vre->vre_failed_offset == UINT64_MAX) {
5016 
5017 			/*
5018 			 * We need to periodically drop the config lock so that
5019 			 * writers can get in.  Additionally, we can't wait
5020 			 * for a txg to sync while holding a config lock
5021 			 * (since a waiting writer could cause a 3-way deadlock
5022 			 * with the sync thread, which also gets a config
5023 			 * lock for reader).  So we can't hold the config lock
5024 			 * while calling dmu_tx_assign().
5025 			 */
5026 			spa_config_exit(spa, SCL_CONFIG, FTAG);
5027 
5028 			/*
5029 			 * If requested, pause the reflow when the amount
5030 			 * specified by raidz_expand_max_reflow_bytes is reached
5031 			 *
5032 			 * This pause is only used during testing or debugging.
5033 			 */
5034 			while (raidz_expand_max_reflow_bytes != 0 &&
5035 			    raidz_expand_max_reflow_bytes <=
5036 			    vre->vre_bytes_copied && !zthr_iscancelled(zthr)) {
5037 				delay(hz);
5038 			}
5039 
5040 			mutex_enter(&vre->vre_lock);
5041 			while (vre->vre_outstanding_bytes >
5042 			    raidz_expand_max_copy_bytes) {
5043 				cv_wait(&vre->vre_cv, &vre->vre_lock);
5044 			}
5045 			mutex_exit(&vre->vre_lock);
5046 
5047 			dmu_tx_t *tx =
5048 			    dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
5049 
5050 			VERIFY0(dmu_tx_assign(tx,
5051 			    DMU_TX_WAIT | DMU_TX_SUSPEND));
5052 			uint64_t txg = dmu_tx_get_txg(tx);
5053 
5054 			/*
5055 			 * Reacquire the vdev_config lock.  Theoretically, the
5056 			 * vdev_t that we're expanding may have changed.
5057 			 */
5058 			spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
5059 			raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
5060 
5061 			boolean_t needsync =
5062 			    raidz_reflow_impl(raidvd, vre, rt, tx);
5063 
5064 			dmu_tx_commit(tx);
5065 
5066 			if (needsync) {
5067 				spa_config_exit(spa, SCL_CONFIG, FTAG);
5068 				txg_wait_synced(spa->spa_dsl_pool, txg);
5069 				spa_config_enter(spa, SCL_CONFIG, FTAG,
5070 				    RW_READER);
5071 			}
5072 		}
5073 
5074 		spa_config_exit(spa, SCL_CONFIG, FTAG);
5075 
5076 		metaslab_enable(msp, B_FALSE, B_FALSE);
5077 		zfs_range_tree_vacate(rt, NULL, NULL);
5078 		zfs_range_tree_destroy(rt);
5079 
5080 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
5081 		raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
5082 	}
5083 
5084 	spa_config_exit(spa, SCL_CONFIG, FTAG);
5085 
5086 	/*
5087 	 * The txg_wait_synced() here ensures that all reflow zio's have
5088 	 * completed, and vre_failed_offset has been set if necessary.  It
5089 	 * also ensures that the progress of the last raidz_reflow_sync() is
5090 	 * written to disk before raidz_reflow_complete_sync() changes the
5091 	 * in-memory vre_state.  vdev_raidz_io_start() uses vre_state to
5092 	 * determine if a reflow is in progress, in which case we may need to
5093 	 * write to both old and new locations.  Therefore we can only change
5094 	 * vre_state once this is not necessary, which is once the on-disk
5095 	 * progress (in spa_ubsync) has been set past any possible writes (to
5096 	 * the end of the last metaslab).
5097 	 */
5098 	txg_wait_synced(spa->spa_dsl_pool, 0);
5099 
5100 	if (!zthr_iscancelled(zthr) &&
5101 	    vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) {
5102 		/*
5103 		 * We are not being canceled or paused, so the reflow must be
5104 		 * complete. In that case also mark it as completed on disk.
5105 		 */
5106 		ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX);
5107 		VERIFY0(dsl_sync_task(spa_name(spa), NULL,
5108 		    raidz_reflow_complete_sync, spa,
5109 		    0, ZFS_SPACE_CHECK_NONE));
5110 		(void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL);
5111 	} else {
5112 		/*
5113 		 * Wait for all copy zio's to complete and for all the
5114 		 * raidz_reflow_sync() synctasks to be run.
5115 		 */
5116 		spa_history_log_internal(spa, "reflow pause",
5117 		    NULL, "offset=%llu failed_offset=%lld",
5118 		    (long long)vre->vre_offset,
5119 		    (long long)vre->vre_failed_offset);
5120 		mutex_enter(&vre->vre_lock);
5121 		if (vre->vre_failed_offset != UINT64_MAX) {
5122 			/*
5123 			 * Reset progress so that we will retry everything
5124 			 * after the point that something failed.
5125 			 */
5126 			vre->vre_offset = vre->vre_failed_offset;
5127 			vre->vre_failed_offset = UINT64_MAX;
5128 			vre->vre_waiting_for_resilver = B_TRUE;
5129 		}
5130 		mutex_exit(&vre->vre_lock);
5131 	}
5132 }
5133 
5134 void
5135 spa_start_raidz_expansion_thread(spa_t *spa)
5136 {
5137 	ASSERT0P(spa->spa_raidz_expand_zthr);
5138 	spa->spa_raidz_expand_zthr = zthr_create("raidz_expand",
5139 	    spa_raidz_expand_thread_check, spa_raidz_expand_thread,
5140 	    spa, defclsyspri);
5141 }
5142 
5143 void
5144 raidz_dtl_reassessed(vdev_t *vd)
5145 {
5146 	spa_t *spa = vd->vdev_spa;
5147 	if (spa->spa_raidz_expand != NULL) {
5148 		vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
5149 		/*
5150 		 * we get called often from vdev_dtl_reassess() so make
5151 		 * sure it's our vdev and any replacing is complete
5152 		 */
5153 		if (vd->vdev_top->vdev_id == vre->vre_vdev_id &&
5154 		    !vdev_raidz_expand_child_replacing(vd->vdev_top)) {
5155 			mutex_enter(&vre->vre_lock);
5156 			if (vre->vre_waiting_for_resilver) {
5157 				vdev_dbgmsg(vd, "DTL reassessed, "
5158 				    "continuing raidz expansion");
5159 				vre->vre_waiting_for_resilver = B_FALSE;
5160 				zthr_wakeup(spa->spa_raidz_expand_zthr);
5161 			}
5162 			mutex_exit(&vre->vre_lock);
5163 		}
5164 	}
5165 }
5166 
5167 int
5168 vdev_raidz_attach_check(vdev_t *new_child)
5169 {
5170 	vdev_t *raidvd = new_child->vdev_parent;
5171 	uint64_t new_children = raidvd->vdev_children;
5172 
5173 	/*
5174 	 * We use the "boot" space as scratch space to handle overwriting the
5175 	 * initial part of the vdev.  If it is too small, then this expansion
5176 	 * is not allowed.  This would be very unusual (e.g. ashift > 13 and
5177 	 * >200 children).
5178 	 */
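	/*
	 * Rough numbers (assuming the usual 3.5 MiB VDEV_BOOT_SIZE): with
	 * ashift = 14 (16 KiB sectors) the check below allows at most
	 * 3.5 MiB / 16 KiB = 224 children; with ashift = 12 it allows 896.
	 */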
5179 	if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) {
5180 		return (EINVAL);
5181 	}
5182 	return (0);
5183 }
5184 
5185 void
5186 vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx)
5187 {
5188 	vdev_t *new_child = arg;
5189 	spa_t *spa = new_child->vdev_spa;
5190 	vdev_t *raidvd = new_child->vdev_parent;
5191 	vdev_raidz_t *vdrz = raidvd->vdev_tsd;
5192 	ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops);
5193 	ASSERT3P(raidvd->vdev_top, ==, raidvd);
5194 	ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width);
5195 	ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1);
5196 	ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==,
5197 	    new_child);
5198 
5199 	spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx);
5200 
5201 	vdrz->vd_physical_width++;
5202 
5203 	VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info);
5204 	vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id;
5205 	vdrz->vn_vre.vre_offset = 0;
5206 	vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
5207 	spa->spa_raidz_expand = &vdrz->vn_vre;
5208 	zthr_wakeup(spa->spa_raidz_expand_zthr);
5209 
5210 	/*
5211 	 * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get
5212 	 * written to the config.
5213 	 */
5214 	vdev_config_dirty(raidvd);
5215 
5216 	vdrz->vn_vre.vre_start_time = gethrestime_sec();
5217 	vdrz->vn_vre.vre_end_time = 0;
5218 	vdrz->vn_vre.vre_state = DSS_SCANNING;
5219 	vdrz->vn_vre.vre_bytes_copied = 0;
5220 
5221 	uint64_t state = vdrz->vn_vre.vre_state;
5222 	VERIFY0(zap_update(spa->spa_meta_objset,
5223 	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
5224 	    sizeof (state), 1, &state, tx));
5225 
5226 	uint64_t start_time = vdrz->vn_vre.vre_start_time;
5227 	VERIFY0(zap_update(spa->spa_meta_objset,
5228 	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
5229 	    sizeof (start_time), 1, &start_time, tx));
5230 
5231 	(void) zap_remove(spa->spa_meta_objset,
5232 	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx);
5233 	(void) zap_remove(spa->spa_meta_objset,
5234 	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx);
5235 
5236 	spa_history_log_internal(spa, "raidz vdev expansion started",  tx,
5237 	    "%s vdev %llu new width %llu", spa_name(spa),
5238 	    (unsigned long long)raidvd->vdev_id,
5239 	    (unsigned long long)raidvd->vdev_children);
5240 }
5241 
5242 int
5243 vdev_raidz_load(vdev_t *vd)
5244 {
5245 	vdev_raidz_t *vdrz = vd->vdev_tsd;
5246 	int err;
5247 
5248 	uint64_t state = DSS_NONE;
5249 	uint64_t start_time = 0;
5250 	uint64_t end_time = 0;
5251 	uint64_t bytes_copied = 0;
5252 
5253 	if (vd->vdev_top_zap != 0) {
5254 		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
5255 		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
5256 		    sizeof (state), 1, &state);
5257 		if (err != 0 && err != ENOENT)
5258 			return (err);
5259 
5260 		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
5261 		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
5262 		    sizeof (start_time), 1, &start_time);
5263 		if (err != 0 && err != ENOENT)
5264 			return (err);
5265 
5266 		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
5267 		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
5268 		    sizeof (end_time), 1, &end_time);
5269 		if (err != 0 && err != ENOENT)
5270 			return (err);
5271 
5272 		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
5273 		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
5274 		    sizeof (bytes_copied), 1, &bytes_copied);
5275 		if (err != 0 && err != ENOENT)
5276 			return (err);
5277 	}
5278 
5279 	/*
5280 	 * If we are in the middle of expansion, vre_state should have
5281 	 * already been set by vdev_raidz_init().
5282 	 */
5283 	EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING);
5284 	vdrz->vn_vre.vre_state = (dsl_scan_state_t)state;
5285 	vdrz->vn_vre.vre_start_time = start_time;
5286 	vdrz->vn_vre.vre_end_time = end_time;
5287 	vdrz->vn_vre.vre_bytes_copied = bytes_copied;
5288 
5289 	return (0);
5290 }
5291 
5292 int
5293 spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres)
5294 {
5295 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
5296 
5297 	if (vre == NULL) {
5298 		/* no expansion in progress; find most recent completed */
5299 		for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
5300 			vdev_t *vd = spa->spa_root_vdev->vdev_child[c];
5301 			if (vd->vdev_ops == &vdev_raidz_ops) {
5302 				vdev_raidz_t *vdrz = vd->vdev_tsd;
5303 
5304 				if (vdrz->vn_vre.vre_end_time != 0 &&
5305 				    (vre == NULL ||
5306 				    vdrz->vn_vre.vre_end_time >
5307 				    vre->vre_end_time)) {
5308 					vre = &vdrz->vn_vre;
5309 				}
5310 			}
5311 		}
5312 	}
5313 
5314 	if (vre == NULL) {
5315 		return (SET_ERROR(ENOENT));
5316 	}
5317 
5318 	pres->pres_state = vre->vre_state;
5319 	pres->pres_expanding_vdev = vre->vre_vdev_id;
5320 
5321 	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
5322 	pres->pres_to_reflow = vd->vdev_stat.vs_alloc;
5323 
5324 	mutex_enter(&vre->vre_lock);
5325 	pres->pres_reflowed = vre->vre_bytes_copied;
5326 	for (int i = 0; i < TXG_SIZE; i++)
5327 		pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i];
5328 	mutex_exit(&vre->vre_lock);
5329 
5330 	pres->pres_start_time = vre->vre_start_time;
5331 	pres->pres_end_time = vre->vre_end_time;
5332 	pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver;
5333 
5334 	return (0);
5335 }
5336 
5337 /*
5338  * Initialize private RAIDZ specific fields from the nvlist.
5339  */
5340 static int
5341 vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
5342 {
5343 	uint_t children;
5344 	nvlist_t **child;
5345 	int error = nvlist_lookup_nvlist_array(nv,
5346 	    ZPOOL_CONFIG_CHILDREN, &child, &children);
5347 	if (error != 0)
5348 		return (SET_ERROR(EINVAL));
5349 
5350 	uint64_t nparity;
5351 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) {
5352 		if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
5353 			return (SET_ERROR(EINVAL));
5354 
5355 		/*
5356 		 * Previous versions could only support 1 or 2 parity
5357 		 * devices.
5358 		 */
5359 		if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2)
5360 			return (SET_ERROR(EINVAL));
5361 		else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3)
5362 			return (SET_ERROR(EINVAL));
5363 	} else {
5364 		/*
5365 		 * We require the parity to be specified for SPAs that
5366 		 * support multiple parity levels.
5367 		 */
5368 		if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
5369 			return (SET_ERROR(EINVAL));
5370 
5371 		/*
5372 		 * Otherwise, we default to 1 parity device for RAID-Z.
5373 		 */
5374 		nparity = 1;
5375 	}
5376 
5377 	vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
5378 	vdrz->vn_vre.vre_vdev_id = -1;
5379 	vdrz->vn_vre.vre_offset = UINT64_MAX;
5380 	vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
5381 	mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL);
5382 	cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL);
5383 	zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL);
5384 	mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL);
5385 	avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare,
5386 	    sizeof (reflow_node_t), offsetof(reflow_node_t, re_link));
5387 
5388 	vdrz->vd_physical_width = children;
5389 	vdrz->vd_nparity = nparity;
5390 
5391 	/* note, the ID does not exist when creating a pool */
5392 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
5393 	    &vdrz->vn_vre.vre_vdev_id);
5394 
5395 	boolean_t reflow_in_progress =
5396 	    nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
5397 	if (reflow_in_progress) {
5398 		spa->spa_raidz_expand = &vdrz->vn_vre;
5399 		vdrz->vn_vre.vre_state = DSS_SCANNING;
5400 	}
5401 
5402 	vdrz->vd_original_width = children;
5403 	uint64_t *txgs;
5404 	unsigned int txgs_size = 0;
5405 	error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
5406 	    &txgs, &txgs_size);
5407 	if (error == 0) {
5408 		for (int i = 0; i < txgs_size; i++) {
5409 			reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
5410 			re->re_txg = txgs[txgs_size - i - 1];
5411 			re->re_logical_width = vdrz->vd_physical_width - i;
5412 
5413 			if (reflow_in_progress)
5414 				re->re_logical_width--;
5415 
5416 			avl_add(&vdrz->vd_expand_txgs, re);
5417 		}
5418 
5419 		vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size;
5420 	}
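	/*
	 * Worked example (hypothetical): with vd_physical_width = 5, two
	 * entries in ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS and no reflow in
	 * progress, the loop above records (txgs[1], width 5) and
	 * (txgs[0], width 4), and vd_original_width becomes 3.
	 */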
5421 	if (reflow_in_progress) {
5422 		vdrz->vd_original_width--;
5423 		zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions",
5424 		    children, txgs_size);
5425 	}
5426 
5427 	*tsd = vdrz;
5428 
5429 	return (0);
5430 }
5431 
5432 static void
5433 vdev_raidz_fini(vdev_t *vd)
5434 {
5435 	vdev_raidz_t *vdrz = vd->vdev_tsd;
5436 	if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre)
5437 		vd->vdev_spa->spa_raidz_expand = NULL;
5438 	reflow_node_t *re;
5439 	void *cookie = NULL;
5440 	avl_tree_t *tree = &vdrz->vd_expand_txgs;
5441 	while ((re = avl_destroy_nodes(tree, &cookie)) != NULL)
5442 		kmem_free(re, sizeof (*re));
5443 	avl_destroy(&vdrz->vd_expand_txgs);
5444 	mutex_destroy(&vdrz->vd_expand_lock);
5445 	mutex_destroy(&vdrz->vn_vre.vre_lock);
5446 	cv_destroy(&vdrz->vn_vre.vre_cv);
5447 	zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock);
5448 	kmem_free(vdrz, sizeof (*vdrz));
5449 }
5450 
5451 /*
5452  * Add RAIDZ specific fields to the config nvlist.
5453  */
5454 static void
5455 vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
5456 {
5457 	ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
5458 	vdev_raidz_t *vdrz = vd->vdev_tsd;
5459 
5460 	/*
5461 	 * Make sure someone hasn't managed to sneak a fancy new vdev
5462 	 * into a crufty old storage pool.
5463 	 */
5464 	ASSERT(vdrz->vd_nparity == 1 ||
5465 	    (vdrz->vd_nparity <= 2 &&
5466 	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) ||
5467 	    (vdrz->vd_nparity <= 3 &&
5468 	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3));
5469 
5470 	/*
5471 	 * Note that we'll add these even on storage pools where they
5472 	 * aren't strictly required -- older software will just ignore
5473 	 * it.
5474 	 */
5475 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);
5476 
5477 	if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
5478 		fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
5479 	}
5480 
5481 	mutex_enter(&vdrz->vd_expand_lock);
5482 	if (!avl_is_empty(&vdrz->vd_expand_txgs)) {
5483 		uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs);
5484 		uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count,
5485 		    KM_SLEEP);
5486 		uint64_t i = 0;
5487 
5488 		for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs);
5489 		    re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) {
5490 			txgs[i++] = re->re_txg;
5491 		}
5492 
5493 		fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
5494 		    txgs, count);
5495 
5496 		kmem_free(txgs, sizeof (uint64_t) * count);
5497 	}
5498 	mutex_exit(&vdrz->vd_expand_lock);
5499 }
5500 
5501 static uint64_t
5502 vdev_raidz_nparity(vdev_t *vd)
5503 {
5504 	vdev_raidz_t *vdrz = vd->vdev_tsd;
5505 	return (vdrz->vd_nparity);
5506 }
5507 
5508 static uint64_t
5509 vdev_raidz_ndisks(vdev_t *vd)
5510 {
5511 	return (vd->vdev_children);
5512 }
5513 
5514 vdev_ops_t vdev_raidz_ops = {
5515 	.vdev_op_init = vdev_raidz_init,
5516 	.vdev_op_fini = vdev_raidz_fini,
5517 	.vdev_op_open = vdev_raidz_open,
5518 	.vdev_op_close = vdev_raidz_close,
5519 	.vdev_op_psize_to_asize = vdev_raidz_psize_to_asize,
5520 	.vdev_op_asize_to_psize = vdev_raidz_asize_to_psize,
5521 	.vdev_op_min_asize = vdev_raidz_min_asize,
5522 	.vdev_op_min_alloc = NULL,
5523 	.vdev_op_io_start = vdev_raidz_io_start,
5524 	.vdev_op_io_done = vdev_raidz_io_done,
5525 	.vdev_op_state_change = vdev_raidz_state_change,
5526 	.vdev_op_need_resilver = vdev_raidz_need_resilver,
5527 	.vdev_op_hold = NULL,
5528 	.vdev_op_rele = NULL,
5529 	.vdev_op_remap = NULL,
5530 	.vdev_op_xlate = vdev_raidz_xlate,
5531 	.vdev_op_rebuild_asize = NULL,
5532 	.vdev_op_metaslab_init = NULL,
5533 	.vdev_op_config_generate = vdev_raidz_config_generate,
5534 	.vdev_op_nparity = vdev_raidz_nparity,
5535 	.vdev_op_ndisks = vdev_raidz_ndisks,
5536 	.vdev_op_type = VDEV_TYPE_RAIDZ,	/* name of this vdev type */
5537 	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
5538 };
5539 
5540 ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW,
5541 	"For testing, pause RAIDZ expansion after reflowing this many bytes");
5542 ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW,
5543 	"Max amount of concurrent i/o for RAIDZ expansion");
5544 ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW,
5545 	"For expanded RAIDZ, aggregate reads that have more rows than this");
5546 ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW,
5547 	"For expanded RAIDZ, automatically start a pool scrub when expansion "
5548 	"completes");
5549 ZFS_MODULE_PARAM(zfs, zfs_, scrub_partial_writes, INT, ZMOD_RW,
5550 	"Issue reads after writes with recoverable failures to ensure "
5551 	"integrity");
5552 ZFS_MODULE_PARAM(zfs_vdev, vdev_, read_sit_out_secs, ULONG, ZMOD_RW,
5553 	"Raidz/draid slow disk sit out time period in seconds");
5554 ZFS_MODULE_PARAM(zfs_vdev, vdev_, raidz_outlier_check_interval_ms, U64,
5555 	ZMOD_RW, "Interval to check for slow raidz/draid children");
5556 ZFS_MODULE_PARAM(zfs_vdev, vdev_, raidz_outlier_insensitivity, UINT,
5557 	ZMOD_RW, "How insensitive the slow raidz/draid child check should be");
5558 /* END CSTYLED */
5559