xref: /freebsd/sys/contrib/openzfs/module/zfs/vdev_raidz.c (revision 081f0e38e3324dd3ca63a2d5036a15f52e6cf858)
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or https://opensource.org/licenses/CDDL-1.0.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
25  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
26  * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
27  * Copyright (c) 2025, Klara, Inc.
28  */
29 
30 #include <sys/zfs_context.h>
31 #include <sys/spa.h>
32 #include <sys/spa_impl.h>
33 #include <sys/zap.h>
34 #include <sys/vdev_impl.h>
35 #include <sys/metaslab_impl.h>
36 #include <sys/zio.h>
37 #include <sys/zio_checksum.h>
38 #include <sys/dmu_tx.h>
39 #include <sys/abd.h>
40 #include <sys/zfs_rlock.h>
41 #include <sys/fs/zfs.h>
42 #include <sys/fm/fs/zfs.h>
43 #include <sys/vdev_raidz.h>
44 #include <sys/vdev_raidz_impl.h>
45 #include <sys/vdev_draid.h>
46 #include <sys/uberblock_impl.h>
47 #include <sys/dsl_scan.h>
48 
49 #ifdef ZFS_DEBUG
50 #include <sys/vdev.h>	/* For vdev_xlate() in vdev_raidz_io_verify() */
51 #endif
52 
53 /*
54  * Virtual device vector for RAID-Z.
55  *
56  * This vdev supports single, double, and triple parity. For single parity,
57  * we use a simple XOR of all the data columns. For double or triple parity,
58  * we use a special case of Reed-Solomon coding. This extends the
59  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
60  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
61  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
62  * former is also based. The latter is designed to provide higher performance
63  * for writes.
64  *
65  * Note that the Plank paper claimed to support arbitrary N+M, but was then
66  * amended six years later identifying a critical flaw that invalidates its
67  * claims. Nevertheless, the technique can be adapted to work for up to
68  * triple parity. For additional parity, the amendment "Note: Correction to
69  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
70  * is viable, but the additional complexity means that write performance will
71  * suffer.
72  *
73  * All of the methods above operate on a Galois field, defined over the
74  * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
75  * can be expressed with a single byte. Briefly, the operations on the
76  * field are defined as follows:
77  *
78  *   o addition (+) is represented by a bitwise XOR
79  *   o subtraction (-) is therefore identical to addition: A + B = A - B
80  *   o multiplication of A by 2 is defined by the following bitwise expression:
81  *
82  *	(A * 2)_7 = A_6
83  *	(A * 2)_6 = A_5
84  *	(A * 2)_5 = A_4
85  *	(A * 2)_4 = A_3 + A_7
86  *	(A * 2)_3 = A_2 + A_7
87  *	(A * 2)_2 = A_1 + A_7
88  *	(A * 2)_1 = A_0
89  *	(A * 2)_0 = A_7
90  *
91  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
92  * As an aside, this multiplication is derived from the error correcting
93  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
94  *
95  * Observe that any number in the field (except for 0) can be expressed as a
96  * power of 2 -- a generator for the field. We store a table of the powers of
97  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
98  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
99  * than field addition). The inverse of a field element A (A^-1) is therefore
100  * A ^ (255 - 1) = A^254.
101  *
102  * The up-to-three parity columns, P, Q, R over several data columns,
103  * D_0, ... D_n-1, can be expressed by field operations:
104  *
105  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
106  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
107  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
108  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
109  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
110  *
111  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
112  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
113  * independent coefficients. (There are no additional coefficients that have
114  * this property which is why the uncorrected Plank method breaks down.)
115  *
116  * See the reconstruction code below for how P, Q and R can be used
117  * individually or in concert to recover missing data columns.
118  */
119 
/* Numeric identifiers for the three parity columns (P, Q and R). */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Byte-wise field multiplication by 2 and by 4.  A set bit 7 selects the
 * 0x1d reduction term, which is XORed into the value shifted left by one.
 * The result is not truncated to 8 bits, and (x) is evaluated more than once.
 */
#define	VDEV_RAIDZ_MUL_2(x)	((((x) & 0x80) ? 0x1d : 0) ^ ((x) << 1))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2((VDEV_RAIDZ_MUL_2(x))))
126 
/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 *
 * (x) is updated in place and (mask) is used as scratch storage; both must
 * be side-effect-free lvalues because they are evaluated more than once.
 * On return every byte of (mask) is 0xff where the corresponding input byte
 * had its top bit set and 0x00 otherwise.
 *
 * The bodies are wrapped in do { } while (0) so that an invocation followed
 * by a semicolon forms exactly one statement and can be used safely as the
 * unbraced body of an if/else.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
do { \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
} while (0)

/* Multiply each byte by 4 by applying the multiply-by-2 step twice. */
#define	VDEV_RAIDZ_64MUL_4(x, mask) \
do { \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
} while (0)
146 
147 
148 /*
149  * Big Theory Statement for how a RAIDZ VDEV is expanded
150  *
151  * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion
152  * works with all three RAIDZ parity choices, including RAIDZ1, 2, or 3. VDEVs
153  * that have been previously expanded can be expanded again.
154  *
155  * The RAIDZ VDEV must be healthy (must be able to write to all the drives in
156  * the VDEV) when an expansion starts.  And the expansion will pause if any
157  * disk in the VDEV fails, and resume once the VDEV is healthy again. All other
158  * operations on the pool can continue while an expansion is in progress (e.g.
159  * read/write, snapshot, zpool add, etc). Except zpool checkpoint, zpool trim,
160  * and zpool initialize which can't be run during an expansion.  Following a
161  * reboot or export/import, the expansion resumes where it left off.
162  *
163  * == Reflowing the Data ==
164  *
165  * The expansion involves reflowing (copying) the data from the current set
166  * of disks to spread it across the new set which now has one more disk. This
167  * reflow operation is similar to reflowing text when the column width of a
168  * text editor window is expanded. The text doesn’t change but the location of
169  * the text changes to accommodate the new width. An example reflow result for
170  * a 4-wide RAIDZ1 to a 5-wide is shown below.
171  *
172  *                            Reflow End State
173  *            Each letter indicates a parity group (logical stripe)
174  *
175  *         Before expansion                         After Expansion
176  *     D1     D2     D3     D4               D1     D2     D3     D4     D5
177  *  +------+------+------+------+         +------+------+------+------+------+
178  *  |      |      |      |      |         |      |      |      |      |      |
179  *  |  A   |  A   |  A   |  A   |         |  A   |  A   |  A   |  A   |  B   |
180  *  |     1|     2|     3|     4|         |     1|     2|     3|     4|     5|
181  *  +------+------+------+------+         +------+------+------+------+------+
182  *  |      |      |      |      |         |      |      |      |      |      |
183  *  |  B   |  B   |  C   |  C   |         |  B   |  C   |  C   |  C   |  C   |
184  *  |     5|     6|     7|     8|         |     6|     7|     8|     9|    10|
185  *  +------+------+------+------+         +------+------+------+------+------+
186  *  |      |      |      |      |         |      |      |      |      |      |
187  *  |  C   |  C   |  D   |  D   |         |  D   |  D   |  E   |  E   |  E   |
188  *  |     9|    10|    11|    12|         |    11|    12|    13|    14|    15|
189  *  +------+------+------+------+         +------+------+------+------+------+
190  *  |      |      |      |      |         |      |      |      |      |      |
191  *  |  E   |  E   |  E   |  E   |   -->   |  E   |  F   |  F   |  G   |  G   |
192  *  |    13|    14|    15|    16|         |    16|    17|    18|p   19|    20|
193  *  +------+------+------+------+         +------+------+------+------+------+
194  *  |      |      |      |      |         |      |      |      |      |      |
195  *  |  F   |  F   |  G   |  G   |         |  G   |  G   |  H   |  H   |  H   |
196  *  |    17|    18|    19|    20|         |    21|    22|    23|    24|    25|
197  *  +------+------+------+------+         +------+------+------+------+------+
198  *  |      |      |      |      |         |      |      |      |      |      |
199  *  |  G   |  G   |  H   |  H   |         |  H   |  I   |  I   |  J   |  J   |
200  *  |    21|    22|    23|    24|         |    26|    27|    28|    29|    30|
201  *  +------+------+------+------+         +------+------+------+------+------+
202  *  |      |      |      |      |         |      |      |      |      |      |
203  *  |  H   |  H   |  I   |  I   |         |  J   |  J   |      |      |  K   |
204  *  |    25|    26|    27|    28|         |    31|    32|    33|    34|    35|
205  *  +------+------+------+------+         +------+------+------+------+------+
206  *
207  * This reflow approach has several advantages. There is no need to read or
208  * modify the block pointers or recompute any block checksums.  The reflow
209  * doesn’t need to know where the parity sectors reside. We can read and write
210  * data sequentially and the copy can occur in a background thread in open
211  * context. The design also allows for fast discovery of what data to copy.
212  *
213  * The VDEV metaslabs are processed, one at a time, to copy the block data to
214  * have it flow across all the disks. The metaslab is disabled for allocations
215  * during the copy. As an optimization, we only copy the allocated data which
216  * can be determined by looking at the metaslab range tree. During the copy we
217  * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still
218  * need to be able to survive losing parity count disks).  This means we
219  * cannot overwrite data during the reflow that would be needed if a disk is
220  * lost.
221  *
222  * After the reflow completes, all newly-written blocks will have the new
223  * layout, i.e., they will have the parity to data ratio implied by the new
224  * number of disks in the RAIDZ group.  Even though the reflow copies all of
225  * the allocated space (data and parity), it is only rearranged, not changed.
226  *
227  * This act of reflowing the data has a few implications about blocks
228  * that were written before the reflow completes:
229  *
230  *  - Old blocks will still use the same amount of space (i.e., they will have
231  *    the parity to data ratio implied by the old number of disks in the RAIDZ
232  *    group).
233  *  - Reading old blocks will be slightly slower than before the reflow, for
234  *    two reasons. First, we will have to read from all disks in the RAIDZ
235  *    VDEV, rather than being able to skip the children that contain only
236  *    parity of this block (because the data of a single block is now spread
237  *    out across all the disks).  Second, in most cases there will be an extra
238  *    bcopy, needed to rearrange the data back to its original layout in memory.
239  *
240  * == Scratch Area ==
241  *
242  * As we copy the block data, we can only progress to the point that writes
243  * will not overlap with blocks whose progress has not yet been recorded on
244  * disk.  Since partially-copied rows are always read from the old location,
245  * we need to stop one row before the sector-wise overlap, to prevent any
246  * row-wise overlap. For example, in the diagram above, when we reflow sector
247  * B6 it will overwrite the original location for B5.
248  *
249  * To get around this, a scratch space is used so that we can start copying
250  * without risking data loss by overlapping the row. As an added benefit, it
251  * improves performance at the beginning of the reflow, but that small perf
252  * boost wouldn't be worth the complexity on its own.
253  *
254  * Ideally we want to copy at least 2 * (new_width)^2 so that we have a
255  * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max
256  * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice
257  * the widths will likely be single digits so we can get a substantial chunk
258  * size using only a few MB of scratch per disk.
259  *
260  * The scratch area is persisted to disk which holds a large amount of reflowed
261  * state. We can always read the partially written stripes when a disk fails or
262  * the copy is interrupted (crash) during the initial copying phase and also
263  * get past a small chunk size restriction.  At a minimum, the scratch space
264  * must be large enough to get us to the point that one row does not overlap
265  * itself when moved (i.e new_width^2).  But going larger is even better. We
266  * use the 3.5 MiB reserved "boot" space that resides after the ZFS disk labels
267  * as our scratch space to handle overwriting the initial part of the VDEV.
268  *
269  *	0     256K   512K                    4M
270  *	+------+------+-----------------------+-----------------------------
271  *	| VDEV | VDEV |   Boot Block (3.5M)   |  Allocatable space ...
272  *	|  L0  |  L1  |       Reserved        |     (Metaslabs)
273  *	+------+------+-----------------------+-------------------------------
274  *                        Scratch Area
275  *
276  * == Reflow Progress Updates ==
277  * After the initial scratch-based reflow, the expansion process works
278  * similarly to device removal. We create a new open context thread which
279  * reflows the data, and periodically kicks off sync tasks to update logical
280  * state. In this case, state is the committed progress (offset of next data
281  * to copy). We need to persist the completed offset on disk, so that if we
282  * crash we know which format each VDEV offset is in.
283  *
284  * == Time Dependent Geometry ==
285  *
286  * In non-expanded RAIDZ, blocks are read from disk in a column by column
287  * fashion. For a multi-row block, the second sector is in the first column
288  * not in the second column. This allows us to issue full reads for each
289  * column directly into the request buffer. The block data is thus laid out
290  * sequentially in a column-by-column fashion.
291  *
292  * For example, in the before expansion diagram above, one logical block might
293  * be sectors G19-H26. The parity is in G19,H23; and the data is in
294  * G20,H24,G21,H25,G22,H26.
295  *
296  * After a block is reflowed, the sectors that were all in the original column
297  * data can now reside in different columns. When reading from an expanded
298  * VDEV, we need to know the logical stripe width for each block so we can
299  * reconstitute the block’s data after the reads are completed. Likewise,
300  * when we perform the combinatorial reconstruction we need to know the
301  * original width so we can retry combinations from the past layouts.
302  *
303  * Time dependent geometry is what we call having blocks with different layouts
304  * (stripe widths) in the same VDEV. This time-dependent geometry uses the
305  * block’s birth time (+ the time expansion ended) to establish the correct
306  * width for a given block. After an expansion completes, we record the time
307  * for blocks written with a particular width (geometry).
308  *
309  * == On Disk Format Changes ==
310  *
311  * New pool feature flag, 'raidz_expansion' whose reference count is the number
312  * of RAIDZ VDEVs that have been expanded.
313  *
314  * The blocks on expanded RAIDZ VDEV can have different logical stripe widths.
315  *
316  * Since the uberblock can point to arbitrary blocks, which might be on the
317  * expanding RAIDZ, and might or might not have been expanded, we need to know
318  * which way a block is laid out before reading it. This info is the next
319  * offset that needs to be reflowed and we persist that in the uberblock, in
320  * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev label.
321  * After the expansion is complete, we then use the raidz_expand_txgs array
322  * (see below) to determine how to read a block and the ub_raidz_reflow_info
323  * field is no longer required.
324  *
325  * The uberblock's ub_raidz_reflow_info field also holds the scratch space
326  * state (i.e., active or not) which is also required before reading a block
327  * during the initial phase of reflowing the data.
328  *
329  * The top-level RAIDZ VDEV has two new entries in the nvlist:
330  *
331  * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here
332  *                            and used after the expansion is complete to
333  *                            determine how to read a raidz block
334  * 'raidz_expanding' boolean: present during reflow and removed after completion
335  *                            used during a spa import to resume an unfinished
336  *                            expansion
337  *
338  * And finally the VDEVs top zap adds the following informational entries:
339  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE
340  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME
341  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME
342  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED
343  */
344 
345 /*
346  * For testing only: pause the raidz expansion after reflowing this amount.
347  * (accessed by ZTS and ztest)
348  */
349 #ifdef	_KERNEL
350 static
351 #endif	/* _KERNEL */
352 unsigned long raidz_expand_max_reflow_bytes = 0;
353 
354 /*
355  * For testing only: pause the raidz expansion at a certain point.
356  */
357 uint_t raidz_expand_pause_point = 0;
358 
359 /*
360  * This represents the duration for a slow drive read sit out.
361  */
362 static unsigned long vdev_read_sit_out_secs = 600;
363 
364 /*
365  * How often each RAID-Z and dRAID vdev will check for slow disk outliers.
366  * Increasing this interval will reduce the sensitivity of detection (since all
367  * I/Os since the last check are included in the statistics), but will slow the
368  * response to a disk developing a problem.
369  *
370  * Defaults to once per second; setting extremely small values may cause
371  * negative performance effects.
372  */
373 static hrtime_t vdev_raidz_outlier_check_interval_ms = 1000;
374 
375 /*
376  * When performing slow outlier checks for RAID-Z and dRAID vdevs, this value is
377  * used to determine how far out an outlier must be before it counts as an event
378  * worth consdering.
379  *
380  * Smaller values will result in more aggressive sitting out of disks that may
381  * have problems, but may significantly increase the rate of spurious sit-outs.
382  */
383 static uint32_t vdev_raidz_outlier_insensitivity = 50;
384 
385 /*
386  * Maximum amount of copy io's outstanding at once.
387  */
388 #ifdef _ILP32
389 static unsigned long raidz_expand_max_copy_bytes = SPA_MAXBLOCKSIZE;
390 #else
391 static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE;
392 #endif
393 
394 /*
395  * Apply raidz map abds aggregation if the number of rows in the map is equal
396  * or greater than the value below.
397  */
398 static unsigned long raidz_io_aggregate_rows = 4;
399 
400 /*
401  * Automatically start a pool scrub when a RAIDZ expansion completes in
402  * order to verify the checksums of all blocks which have been copied
403  * during the expansion.  Automatic scrubbing is enabled by default and
404  * is strongly recommended.
405  */
406 static int zfs_scrub_after_expand = 1;
407 
408 static void
vdev_raidz_row_free(raidz_row_t * rr)409 vdev_raidz_row_free(raidz_row_t *rr)
410 {
411 	for (int c = 0; c < rr->rr_cols; c++) {
412 		raidz_col_t *rc = &rr->rr_col[c];
413 
414 		if (rc->rc_size != 0)
415 			abd_free(rc->rc_abd);
416 		if (rc->rc_orig_data != NULL)
417 			abd_free(rc->rc_orig_data);
418 	}
419 
420 	if (rr->rr_abd_empty != NULL)
421 		abd_free(rr->rr_abd_empty);
422 
423 	kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols]));
424 }
425 
426 void
vdev_raidz_map_free(raidz_map_t * rm)427 vdev_raidz_map_free(raidz_map_t *rm)
428 {
429 	for (int i = 0; i < rm->rm_nrows; i++)
430 		vdev_raidz_row_free(rm->rm_row[i]);
431 
432 	if (rm->rm_nphys_cols) {
433 		for (int i = 0; i < rm->rm_nphys_cols; i++) {
434 			if (rm->rm_phys_col[i].rc_abd != NULL)
435 				abd_free(rm->rm_phys_col[i].rc_abd);
436 		}
437 
438 		kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) *
439 		    rm->rm_nphys_cols);
440 	}
441 
442 	ASSERT0P(rm->rm_lr);
443 	kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
444 }
445 
446 static void
vdev_raidz_map_free_vsd(zio_t * zio)447 vdev_raidz_map_free_vsd(zio_t *zio)
448 {
449 	raidz_map_t *rm = zio->io_vsd;
450 
451 	vdev_raidz_map_free(rm);
452 }
453 
454 static int
vdev_raidz_reflow_compare(const void * x1,const void * x2)455 vdev_raidz_reflow_compare(const void *x1, const void *x2)
456 {
457 	const reflow_node_t *l = x1;
458 	const reflow_node_t *r = x2;
459 
460 	return (TREE_CMP(l->re_txg, r->re_txg));
461 }
462 
463 const zio_vsd_ops_t vdev_raidz_vsd_ops = {
464 	.vsd_free = vdev_raidz_map_free_vsd,
465 };
466 
467 raidz_row_t *
vdev_raidz_row_alloc(int cols,zio_t * zio)468 vdev_raidz_row_alloc(int cols, zio_t *zio)
469 {
470 	raidz_row_t *rr =
471 	    kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP);
472 
473 	rr->rr_cols = cols;
474 	rr->rr_scols = cols;
475 
476 	for (int c = 0; c < cols; c++) {
477 		raidz_col_t *rc = &rr->rr_col[c];
478 		rc->rc_shadow_devidx = INT_MAX;
479 		rc->rc_shadow_offset = UINT64_MAX;
480 		/*
481 		 * We can not allow self healing to take place for Direct I/O
482 		 * reads. There is nothing that stops the buffer contents from
483 		 * being manipulated while the I/O is in flight. It is possible
484 		 * that the checksum could be verified on the buffer and then
485 		 * the contents of that buffer are manipulated afterwards. This
486 		 * could lead to bad data being written out during self
487 		 * healing.
488 		 */
489 		if (!(zio->io_flags & ZIO_FLAG_DIO_READ))
490 			rc->rc_allow_repair = 1;
491 	}
492 	return (rr);
493 }
494 
495 static void
vdev_raidz_map_alloc_write(zio_t * zio,raidz_map_t * rm,uint64_t ashift)496 vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift)
497 {
498 	int c;
499 	int nwrapped = 0;
500 	uint64_t off = 0;
501 	raidz_row_t *rr = rm->rm_row[0];
502 
503 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
504 	ASSERT3U(rm->rm_nrows, ==, 1);
505 
506 	/*
507 	 * Pad any parity columns with additional space to account for skip
508 	 * sectors.
509 	 */
510 	if (rm->rm_skipstart < rr->rr_firstdatacol) {
511 		ASSERT0(rm->rm_skipstart);
512 		nwrapped = rm->rm_nskip;
513 	} else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) {
514 		nwrapped =
515 		    (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols;
516 	}
517 
518 	/*
519 	 * Optional single skip sectors (rc_size == 0) will be handled in
520 	 * vdev_raidz_io_start_write().
521 	 */
522 	int skipped = rr->rr_scols - rr->rr_cols;
523 
524 	/* Allocate buffers for the parity columns */
525 	for (c = 0; c < rr->rr_firstdatacol; c++) {
526 		raidz_col_t *rc = &rr->rr_col[c];
527 
528 		/*
529 		 * Parity columns will pad out a linear ABD to account for
530 		 * the skip sector. A linear ABD is used here because
531 		 * parity calculations use the ABD buffer directly to calculate
532 		 * parity. This avoids doing a memcpy back to the ABD after the
533 		 * parity has been calculated. By issuing the parity column
534 		 * with the skip sector we can reduce contention on the child
535 		 * VDEV queue locks (vq_lock).
536 		 */
537 		if (c < nwrapped) {
538 			rc->rc_abd = abd_alloc_linear(
539 			    rc->rc_size + (1ULL << ashift), B_FALSE);
540 			abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift);
541 			skipped++;
542 		} else {
543 			rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
544 		}
545 	}
546 
547 	for (off = 0; c < rr->rr_cols; c++) {
548 		raidz_col_t *rc = &rr->rr_col[c];
549 		abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct,
550 		    zio->io_abd, off, rc->rc_size);
551 
552 		/*
553 		 * Generate I/O for skip sectors to improve aggregation
554 		 * continuity. We will use gang ABD's to reduce contention
555 		 * on the child VDEV queue locks (vq_lock) by issuing
556 		 * a single I/O that contains the data and skip sector.
557 		 *
558 		 * It is important to make sure that rc_size is not updated
559 		 * even though we are adding a skip sector to the ABD. When
560 		 * calculating the parity in vdev_raidz_generate_parity_row()
561 		 * the rc_size is used to iterate through the ABD's. We can
562 		 * not have zero'd out skip sectors used for calculating
563 		 * parity for raidz, because those same sectors are not used
564 		 * during reconstruction.
565 		 */
566 		if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) {
567 			rc->rc_abd = abd_alloc_gang();
568 			abd_gang_add(rc->rc_abd, abd, B_TRUE);
569 			abd_gang_add(rc->rc_abd,
570 			    abd_get_zeros(1ULL << ashift), B_TRUE);
571 			skipped++;
572 		} else {
573 			rc->rc_abd = abd;
574 		}
575 		off += rc->rc_size;
576 	}
577 
578 	ASSERT3U(off, ==, zio->io_size);
579 	ASSERT3S(skipped, ==, rm->rm_nskip);
580 }
581 
582 static void
vdev_raidz_map_alloc_read(zio_t * zio,raidz_map_t * rm)583 vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm)
584 {
585 	int c;
586 	raidz_row_t *rr = rm->rm_row[0];
587 
588 	ASSERT3U(rm->rm_nrows, ==, 1);
589 
590 	/* Allocate buffers for the parity columns */
591 	for (c = 0; c < rr->rr_firstdatacol; c++)
592 		rr->rr_col[c].rc_abd =
593 		    abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);
594 
595 	for (uint64_t off = 0; c < rr->rr_cols; c++) {
596 		raidz_col_t *rc = &rr->rr_col[c];
597 		rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
598 		    zio->io_abd, off, rc->rc_size);
599 		off += rc->rc_size;
600 	}
601 }
602 
603 /*
604  * Divides the IO evenly across all child vdevs; usually, dcols is
605  * the number of children in the target vdev.
606  *
607  * Avoid inlining the function to keep vdev_raidz_io_start(), which
608  * is this functions only caller, as small as possible on the stack.
609  */
610 noinline raidz_map_t *
vdev_raidz_map_alloc(zio_t * zio,uint64_t ashift,uint64_t dcols,uint64_t nparity)611 vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
612     uint64_t nparity)
613 {
614 	raidz_row_t *rr;
615 	/* The starting RAIDZ (parent) vdev sector of the block. */
616 	uint64_t b = zio->io_offset >> ashift;
617 	/* The zio's size in units of the vdev's minimum sector size. */
618 	uint64_t s = zio->io_size >> ashift;
619 	/* The first column for this stripe. */
620 	uint64_t f = b % dcols;
621 	/* The starting byte offset on each child vdev. */
622 	uint64_t o = (b / dcols) << ashift;
623 	uint64_t acols, scols;
624 
625 	raidz_map_t *rm =
626 	    kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
627 	rm->rm_nrows = 1;
628 
629 	/*
630 	 * "Quotient": The number of data sectors for this stripe on all but
631 	 * the "big column" child vdevs that also contain "remainder" data.
632 	 */
633 	uint64_t q = s / (dcols - nparity);
634 
635 	/*
636 	 * "Remainder": The number of partial stripe data sectors in this I/O.
637 	 * This will add a sector to some, but not all, child vdevs.
638 	 */
639 	uint64_t r = s - q * (dcols - nparity);
640 
641 	/* The number of "big columns" - those which contain remainder data. */
642 	uint64_t bc = (r == 0 ? 0 : r + nparity);
643 
644 	/*
645 	 * The total number of data and parity sectors associated with
646 	 * this I/O.
647 	 */
648 	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
649 
650 	/*
651 	 * acols: The columns that will be accessed.
652 	 * scols: The columns that will be accessed or skipped.
653 	 */
654 	if (q == 0) {
655 		/* Our I/O request doesn't span all child vdevs. */
656 		acols = bc;
657 		scols = MIN(dcols, roundup(bc, nparity + 1));
658 	} else {
659 		acols = dcols;
660 		scols = dcols;
661 	}
662 
663 	ASSERT3U(acols, <=, scols);
664 	rr = vdev_raidz_row_alloc(scols, zio);
665 	rm->rm_row[0] = rr;
666 	rr->rr_cols = acols;
667 	rr->rr_bigcols = bc;
668 	rr->rr_firstdatacol = nparity;
669 #ifdef ZFS_DEBUG
670 	rr->rr_offset = zio->io_offset;
671 	rr->rr_size = zio->io_size;
672 #endif
673 
674 	uint64_t asize = 0;
675 
676 	for (uint64_t c = 0; c < scols; c++) {
677 		raidz_col_t *rc = &rr->rr_col[c];
678 		uint64_t col = f + c;
679 		uint64_t coff = o;
680 		if (col >= dcols) {
681 			col -= dcols;
682 			coff += 1ULL << ashift;
683 		}
684 		rc->rc_devidx = col;
685 		rc->rc_offset = coff;
686 
687 		if (c >= acols)
688 			rc->rc_size = 0;
689 		else if (c < bc)
690 			rc->rc_size = (q + 1) << ashift;
691 		else
692 			rc->rc_size = q << ashift;
693 
694 		asize += rc->rc_size;
695 	}
696 
697 	ASSERT3U(asize, ==, tot << ashift);
698 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
699 	rm->rm_skipstart = bc;
700 
701 	/*
702 	 * If all data stored spans all columns, there's a danger that parity
703 	 * will always be on the same device and, since parity isn't read
704 	 * during normal operation, that device's I/O bandwidth won't be
705 	 * used effectively. We therefore switch the parity every 1MB.
706 	 *
707 	 * ... at least that was, ostensibly, the theory. As a practical
708 	 * matter unless we juggle the parity between all devices evenly, we
709 	 * won't see any benefit. Further, occasional writes that aren't a
710 	 * multiple of the LCM of the number of children and the minimum
711 	 * stripe width are sufficient to avoid pessimal behavior.
712 	 * Unfortunately, this decision created an implicit on-disk format
713 	 * requirement that we need to support for all eternity, but only
714 	 * for single-parity RAID-Z.
715 	 *
716 	 * If we intend to skip a sector in the zeroth column for padding
717 	 * we must make sure to note this swap. We will never intend to
718 	 * skip the first column since at least one data and one parity
719 	 * column must appear in each row.
720 	 */
721 	ASSERT(rr->rr_cols >= 2);
722 	ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
723 
724 	if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
725 		uint64_t devidx = rr->rr_col[0].rc_devidx;
726 		o = rr->rr_col[0].rc_offset;
727 		rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
728 		rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
729 		rr->rr_col[1].rc_devidx = devidx;
730 		rr->rr_col[1].rc_offset = o;
731 		if (rm->rm_skipstart == 0)
732 			rm->rm_skipstart = 1;
733 	}
734 
735 	if (zio->io_type == ZIO_TYPE_WRITE) {
736 		vdev_raidz_map_alloc_write(zio, rm, ashift);
737 	} else {
738 		vdev_raidz_map_alloc_read(zio, rm);
739 	}
740 	/* init RAIDZ parity ops */
741 	rm->rm_ops = vdev_raidz_math_get_ops();
742 
743 	return (rm);
744 }
745 
/*
 * Build the raidz_map_t for a zio whose block is laid out with a logical
 * width (logical_cols) that may differ from the vdev's current physical
 * width (physical_cols).  The map holds one raidz_row_t per logical row, and
 * each row is mapped independently to either the old or the new location.
 *
 * Everything before reflow_offset_synced should have been moved to the new
 * location (read and write completed).  However, this may not yet be reflected
 * in the on-disk format (e.g. raidz_reflow_sync() has been called but the
 * uberblock has not yet been written). If reflow is not in progress,
 * reflow_offset_synced should be UINT64_MAX. For each row, if the row is
 * entirely before reflow_offset_synced, it will come from the new location.
 * Otherwise this row will come from the old location.  Therefore, rows that
 * straddle the reflow_offset_synced will come from the old location.
 *
 * For writes, reflow_offset_next is the next offset to copy.  If a sector has
 * been copied, but not yet reflected in the on-disk progress
 * (reflow_offset_synced), it will also be written to the new (already copied)
 * offset.
 *
 * Parameters:
 *	zio			I/O supplying io_abd, io_offset and io_size.
 *	ashift			Sector-size shift; byte values are converted
 *				to sectors with ">> ashift".
 *	physical_cols		Column count used for rows in the new
 *				location; rows in the old location use
 *				physical_cols - 1.
 *	logical_cols		Width used to split the block into rows.
 *	nparity			Number of parity columns at the start of
 *				each row (stored in rr_firstdatacol).
 *	reflow_offset_synced	See above.
 *	reflow_offset_next	See above.
 *	use_scratch		If set, rows entirely before
 *				reflow_offset_synced have their primary and
 *				shadow offsets reduced by VDEV_BOOT_SIZE.
 *
 * Returns the newly allocated map with rm_ops already initialized.
 */
noinline raidz_map_t *
vdev_raidz_map_alloc_expanded(zio_t *zio,
    uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
    uint64_t nparity, uint64_t reflow_offset_synced,
    uint64_t reflow_offset_next, boolean_t use_scratch)
{
	abd_t *abd = zio->io_abd;
	uint64_t offset = zio->io_offset;
	uint64_t size = zio->io_size;

	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> ashift;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 * AKA "full rows"
	 */
	uint64_t q = s / (logical_cols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	uint64_t r = s - q * (logical_cols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	uint64_t bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* How many rows contain data (not skip) */
	uint64_t rows = howmany(tot, logical_cols);
	/* A block with fewer than logical_cols sectors gets a narrower map. */
	int cols = MIN(tot, logical_cols);

	/* The map is allocated with room for `rows` row pointers. */
	raidz_map_t *rm =
	    kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
	    KM_SLEEP);
	rm->rm_nrows = rows;
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	rm->rm_skipstart = bc;
	/* Running total of mapped bytes; checked against `tot` below. */
	uint64_t asize = 0;

	for (uint64_t row = 0; row < rows; row++) {
		boolean_t row_use_scratch = B_FALSE;
		raidz_row_t *rr = vdev_raidz_row_alloc(cols, zio);
		rm->rm_row[row] = rr;

		/* The starting RAIDZ (parent) vdev sector of the row. */
		uint64_t b = (offset >> ashift) + row * logical_cols;

		/*
		 * If we are in the middle of a reflow, and the copying has
		 * not yet completed for any part of this row, then use the
		 * old location of this row.  Note that reflow_offset_synced
		 * reflects the i/o that's been completed, because it's
		 * updated by a synctask, after zio_wait(spa_txg_zio[]).
		 * This is sufficient for our check, even if that progress
		 * has not yet been recorded to disk (reflected in
		 * spa_ubsync).  Also note that we consider the last row to
		 * be "full width" (`cols`-wide rather than `bc`-wide) for
		 * this calculation. This causes a tiny bit of unnecessary
		 * double-writes but is safe and simpler to calculate.
		 */
		int row_phys_cols = physical_cols;
		if (b + cols > reflow_offset_synced >> ashift)
			row_phys_cols--;
		else if (use_scratch)
			row_use_scratch = B_TRUE;

		/* starting child of this row */
		uint64_t child_id = b % row_phys_cols;
		/* The starting byte offset on each child vdev. */
		uint64_t child_offset = (b / row_phys_cols) << ashift;

		/*
		 * Note, rr_cols is the entire width of the block, even
		 * if this row is shorter.  This is needed because parity
		 * generation (for Q and R) needs to know the entire width,
		 * because it treats the short row as though it was
		 * full-width (and the "phantom" sectors were zero-filled).
		 *
		 * Another approach to this would be to set cols shorter
		 * (to just the number of columns that we might do i/o to)
		 * and have another mechanism to tell the parity generation
		 * about the "entire width".  Reconstruction (at least
		 * vdev_raidz_reconstruct_general()) would also need to
		 * know about the "entire width".
		 */
		rr->rr_firstdatacol = nparity;
#ifdef ZFS_DEBUG
		/*
		 * note: rr_size is PSIZE, not ASIZE
		 */
		rr->rr_offset = b << ashift;
		rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift;
#endif

		/*
		 * Walk the columns of this row, wrapping to the next sector
		 * on child 0 when we run off the last child.
		 */
		for (int c = 0; c < rr->rr_cols; c++, child_id++) {
			if (child_id >= row_phys_cols) {
				child_id -= row_phys_cols;
				child_offset += 1ULL << ashift;
			}
			raidz_col_t *rc = &rr->rr_col[c];
			rc->rc_devidx = child_id;
			rc->rc_offset = child_offset;

			/*
			 * Get this from the scratch space if appropriate.
			 * This only happens if we crashed in the middle of
			 * raidz_reflow_scratch_sync() (while it's running,
			 * the rangelock prevents us from doing concurrent
			 * io), and even then only during zpool import or
			 * when the pool is imported readonly.
			 */
			if (row_use_scratch)
				rc->rc_offset -= VDEV_BOOT_SIZE;

			/* Only meaningful for data columns (c >= nparity). */
			uint64_t dc = c - rr->rr_firstdatacol;
			if (c < rr->rr_firstdatacol) {
				rc->rc_size = 1ULL << ashift;

				/*
				 * Parity sectors' rc_abd's are set below
				 * after determining if this is an aggregation.
				 */
			} else if (row == rows - 1 && bc != 0 && c >= bc) {
				/*
				 * Past the end of the block (even including
				 * skip sectors).  This sector is part of the
				 * map so that we have full rows for p/q parity
				 * generation.
				 */
				rc->rc_size = 0;
				rc->rc_abd = NULL;
			} else {
				/* "data column" (col excluding parity) */
				uint64_t off;

				/*
				 * `off` is this sector's index within the
				 * zio's abd.  The first `r` data columns
				 * hold `rows` sectors each; the remaining
				 * ones hold `rows - 1`.
				 */
				if (c < bc || r == 0) {
					off = dc * rows + row;
				} else {
					off = r * rows +
					    (dc - r) * (rows - 1) + row;
				}
				rc->rc_size = 1ULL << ashift;
				rc->rc_abd = abd_get_offset_struct(
				    &rc->rc_abdstruct, abd, off << ashift,
				    rc->rc_size);
			}

			if (rc->rc_size == 0)
				continue;

			/*
			 * If any part of this row is in both old and new
			 * locations, the primary location is the old
			 * location. If this sector was already copied to the
			 * new location, we need to also write to the new,
			 * "shadow" location.
			 *
			 * Note, `row_phys_cols != physical_cols` indicates
			 * that the primary location is the old location.
			 * `b+c < reflow_offset_next` indicates that the copy
			 * to the new location has been initiated. We know
			 * that the copy has completed because we have the
			 * rangelock, which is held exclusively while the
			 * copy is in progress.
			 */
			if (row_use_scratch ||
			    (row_phys_cols != physical_cols &&
			    b + c < reflow_offset_next >> ashift)) {
				rc->rc_shadow_devidx = (b + c) % physical_cols;
				rc->rc_shadow_offset =
				    ((b + c) / physical_cols) << ashift;
				if (row_use_scratch)
					rc->rc_shadow_offset -= VDEV_BOOT_SIZE;
			}

			asize += rc->rc_size;
		}

		/*
		 * See comment in vdev_raidz_map_alloc()
		 *
		 * For single parity only, when bit 20 of the I/O offset is
		 * set, columns 0 and 1 exchange their device index and
		 * offset, including the shadow device index and offset.
		 */
		if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
		    (offset & (1ULL << 20))) {
			ASSERT(rr->rr_cols >= 2);
			ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);

			int devidx0 = rr->rr_col[0].rc_devidx;
			uint64_t offset0 = rr->rr_col[0].rc_offset;
			int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx;
			uint64_t shadow_offset0 =
			    rr->rr_col[0].rc_shadow_offset;

			rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
			rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
			rr->rr_col[0].rc_shadow_devidx =
			    rr->rr_col[1].rc_shadow_devidx;
			rr->rr_col[0].rc_shadow_offset =
			    rr->rr_col[1].rc_shadow_offset;

			rr->rr_col[1].rc_devidx = devidx0;
			rr->rr_col[1].rc_offset = offset0;
			rr->rr_col[1].rc_shadow_devidx = shadow_devidx0;
			rr->rr_col[1].rc_shadow_offset = shadow_offset0;
		}
	}
	ASSERT3U(asize, ==, tot << ashift);

	/*
	 * Determine if the block is contiguous, in which case we can use
	 * an aggregation.
	 */
	if (rows >= raidz_io_aggregate_rows) {
		rm->rm_nphys_cols = physical_cols;
		rm->rm_phys_col =
		    kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols,
		    KM_SLEEP);

		/*
		 * Determine the aggregate io's offset and size, and check
		 * that the io is contiguous.
		 *
		 * The outer loop stops as soon as rm_phys_col has been
		 * freed below; the `break` only leaves the inner loop.
		 */
		for (int i = 0;
		    i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) {
			raidz_row_t *rr = rm->rm_row[i];
			for (int c = 0; c < rr->rr_cols; c++) {
				raidz_col_t *rc = &rr->rr_col[c];
				/* Per-device accumulator for this column. */
				raidz_col_t *prc =
				    &rm->rm_phys_col[rc->rc_devidx];

				if (rc->rc_size == 0)
					continue;

				if (prc->rc_size == 0) {
					/* First sector seen on this device. */
					ASSERT0(prc->rc_offset);
					prc->rc_offset = rc->rc_offset;
				} else if (prc->rc_offset + prc->rc_size !=
				    rc->rc_offset) {
					/*
					 * This block is not contiguous and
					 * therefore can't be aggregated.
					 * This is expected to be rare, so
					 * the cost of allocating and then
					 * freeing rm_phys_col is not
					 * significant.
					 */
					kmem_free(rm->rm_phys_col,
					    sizeof (raidz_col_t) *
					    rm->rm_nphys_cols);
					rm->rm_phys_col = NULL;
					rm->rm_nphys_cols = 0;
					break;
				}
				prc->rc_size += rc->rc_size;
			}
		}
	}
	if (rm->rm_phys_col != NULL) {
		/*
		 * Allocate aggregate ABD's.
		 */
		for (int i = 0; i < rm->rm_nphys_cols; i++) {
			raidz_col_t *prc = &rm->rm_phys_col[i];

			prc->rc_devidx = i;

			/* Devices this block does not touch get no ABD. */
			if (prc->rc_size == 0)
				continue;

			prc->rc_abd =
			    abd_alloc_linear(rm->rm_phys_col[i].rc_size,
			    B_FALSE);
		}

		/*
		 * Point the parity abd's into the aggregate abd's.
		 */
		for (int i = 0; i < rm->rm_nrows; i++) {
			raidz_row_t *rr = rm->rm_row[i];
			for (int c = 0; c < rr->rr_firstdatacol; c++) {
				raidz_col_t *rc = &rr->rr_col[c];
				raidz_col_t *prc =
				    &rm->rm_phys_col[rc->rc_devidx];
				rc->rc_abd =
				    abd_get_offset_struct(&rc->rc_abdstruct,
				    prc->rc_abd,
				    rc->rc_offset - prc->rc_offset,
				    rc->rc_size);
			}
		}
	} else {
		/*
		 * Allocate new abd's for the parity sectors.
		 */
		for (int i = 0; i < rm->rm_nrows; i++) {
			raidz_row_t *rr = rm->rm_row[i];
			for (int c = 0; c < rr->rr_firstdatacol; c++) {
				raidz_col_t *rc = &rr->rr_col[c];
				rc->rc_abd =
				    abd_alloc_linear(rc->rc_size,
				    B_TRUE);
			}
		}
	}
	/* init RAIDZ parity ops */
	rm->rm_ops = vdev_raidz_math_get_ops();

	return (rm);
}
1077 
/*
 * Cursor state handed to the parity-generation callbacks below.  Each
 * callback advances the pointers it uses, so the positions persist across
 * successive invocations.  Unused parities are left NULL.  The member order
 * matters: the structure is filled in with positional initializers.
 */
struct pqr_struct {
	uint64_t *p;	/* current position in the P parity buffer */
	uint64_t *q;	/* current position in the Q parity buffer, or NULL */
	uint64_t *r;	/* current position in the R parity buffer, or NULL */
};
1083 
1084 static int
vdev_raidz_p_func(void * buf,size_t size,void * private)1085 vdev_raidz_p_func(void *buf, size_t size, void *private)
1086 {
1087 	struct pqr_struct *pqr = private;
1088 	const uint64_t *src = buf;
1089 	int cnt = size / sizeof (src[0]);
1090 
1091 	ASSERT(pqr->p && !pqr->q && !pqr->r);
1092 
1093 	for (int i = 0; i < cnt; i++, src++, pqr->p++)
1094 		*pqr->p ^= *src;
1095 
1096 	return (0);
1097 }
1098 
1099 static int
vdev_raidz_pq_func(void * buf,size_t size,void * private)1100 vdev_raidz_pq_func(void *buf, size_t size, void *private)
1101 {
1102 	struct pqr_struct *pqr = private;
1103 	const uint64_t *src = buf;
1104 	uint64_t mask;
1105 	int cnt = size / sizeof (src[0]);
1106 
1107 	ASSERT(pqr->p && pqr->q && !pqr->r);
1108 
1109 	for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
1110 		*pqr->p ^= *src;
1111 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
1112 		*pqr->q ^= *src;
1113 	}
1114 
1115 	return (0);
1116 }
1117 
1118 static int
vdev_raidz_pqr_func(void * buf,size_t size,void * private)1119 vdev_raidz_pqr_func(void *buf, size_t size, void *private)
1120 {
1121 	struct pqr_struct *pqr = private;
1122 	const uint64_t *src = buf;
1123 	uint64_t mask;
1124 	int cnt = size / sizeof (src[0]);
1125 
1126 	ASSERT(pqr->p && pqr->q && pqr->r);
1127 
1128 	for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
1129 		*pqr->p ^= *src;
1130 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
1131 		*pqr->q ^= *src;
1132 		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
1133 		*pqr->r ^= *src;
1134 	}
1135 
1136 	return (0);
1137 }
1138 
1139 static void
vdev_raidz_generate_parity_p(raidz_row_t * rr)1140 vdev_raidz_generate_parity_p(raidz_row_t *rr)
1141 {
1142 	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1143 
1144 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1145 		abd_t *src = rr->rr_col[c].rc_abd;
1146 
1147 		if (c == rr->rr_firstdatacol) {
1148 			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1149 		} else {
1150 			struct pqr_struct pqr = { p, NULL, NULL };
1151 			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1152 			    vdev_raidz_p_func, &pqr);
1153 		}
1154 	}
1155 }
1156 
1157 static void
vdev_raidz_generate_parity_pq(raidz_row_t * rr)1158 vdev_raidz_generate_parity_pq(raidz_row_t *rr)
1159 {
1160 	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1161 	uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1162 	uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
1163 	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1164 	    rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1165 
1166 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1167 		abd_t *src = rr->rr_col[c].rc_abd;
1168 
1169 		uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
1170 
1171 		if (c == rr->rr_firstdatacol) {
1172 			ASSERT(ccnt == pcnt || ccnt == 0);
1173 			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1174 			(void) memcpy(q, p, rr->rr_col[c].rc_size);
1175 
1176 			for (uint64_t i = ccnt; i < pcnt; i++) {
1177 				p[i] = 0;
1178 				q[i] = 0;
1179 			}
1180 		} else {
1181 			struct pqr_struct pqr = { p, q, NULL };
1182 
1183 			ASSERT(ccnt <= pcnt);
1184 			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1185 			    vdev_raidz_pq_func, &pqr);
1186 
1187 			/*
1188 			 * Treat short columns as though they are full of 0s.
1189 			 * Note that there's therefore nothing needed for P.
1190 			 */
1191 			uint64_t mask;
1192 			for (uint64_t i = ccnt; i < pcnt; i++) {
1193 				VDEV_RAIDZ_64MUL_2(q[i], mask);
1194 			}
1195 		}
1196 	}
1197 }
1198 
1199 static void
vdev_raidz_generate_parity_pqr(raidz_row_t * rr)1200 vdev_raidz_generate_parity_pqr(raidz_row_t *rr)
1201 {
1202 	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1203 	uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1204 	uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd);
1205 	uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
1206 	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1207 	    rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1208 	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1209 	    rr->rr_col[VDEV_RAIDZ_R].rc_size);
1210 
1211 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1212 		abd_t *src = rr->rr_col[c].rc_abd;
1213 
1214 		uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
1215 
1216 		if (c == rr->rr_firstdatacol) {
1217 			ASSERT(ccnt == pcnt || ccnt == 0);
1218 			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1219 			(void) memcpy(q, p, rr->rr_col[c].rc_size);
1220 			(void) memcpy(r, p, rr->rr_col[c].rc_size);
1221 
1222 			for (uint64_t i = ccnt; i < pcnt; i++) {
1223 				p[i] = 0;
1224 				q[i] = 0;
1225 				r[i] = 0;
1226 			}
1227 		} else {
1228 			struct pqr_struct pqr = { p, q, r };
1229 
1230 			ASSERT(ccnt <= pcnt);
1231 			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1232 			    vdev_raidz_pqr_func, &pqr);
1233 
1234 			/*
1235 			 * Treat short columns as though they are full of 0s.
1236 			 * Note that there's therefore nothing needed for P.
1237 			 */
1238 			uint64_t mask;
1239 			for (uint64_t i = ccnt; i < pcnt; i++) {
1240 				VDEV_RAIDZ_64MUL_2(q[i], mask);
1241 				VDEV_RAIDZ_64MUL_4(r[i], mask);
1242 			}
1243 		}
1244 	}
1245 }
1246 
1247 /*
1248  * Generate RAID parity in the first virtual columns according to the number of
1249  * parity columns available.
1250  */
1251 void
vdev_raidz_generate_parity_row(raidz_map_t * rm,raidz_row_t * rr)1252 vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr)
1253 {
1254 	if (rr->rr_cols == 0) {
1255 		/*
1256 		 * We are handling this block one row at a time (because
1257 		 * this block has a different logical vs physical width,
1258 		 * due to RAIDZ expansion), and this is a pad-only row,
1259 		 * which has no parity.
1260 		 */
1261 		return;
1262 	}
1263 
1264 	/* Generate using the new math implementation */
1265 	if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL)
1266 		return;
1267 
1268 	switch (rr->rr_firstdatacol) {
1269 	case 1:
1270 		vdev_raidz_generate_parity_p(rr);
1271 		break;
1272 	case 2:
1273 		vdev_raidz_generate_parity_pq(rr);
1274 		break;
1275 	case 3:
1276 		vdev_raidz_generate_parity_pqr(rr);
1277 		break;
1278 	default:
1279 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
1280 	}
1281 }
1282 
1283 void
vdev_raidz_generate_parity(raidz_map_t * rm)1284 vdev_raidz_generate_parity(raidz_map_t *rm)
1285 {
1286 	for (int i = 0; i < rm->rm_nrows; i++) {
1287 		raidz_row_t *rr = rm->rm_row[i];
1288 		vdev_raidz_generate_parity_row(rm, rr);
1289 	}
1290 }
1291 
/*
 * Two-buffer iteration callback for P reconstruction: XOR `size` bytes of
 * sbuf into dbuf, processed as 64-bit words.  `private` is unused.  Always
 * returns 0.
 */
static int
vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
{
	(void) private;
	uint64_t *out = dbuf;
	uint64_t *in = sbuf;
	int remaining = size / sizeof (in[0]);

	while (remaining-- > 0) {
		*out ^= *in;
		out++;
		in++;
	}

	return (0);
}
1306 
/*
 * Two-buffer iteration callback for Q reconstruction.  Each 64-bit word of
 * dbuf is multiplied by 2 (VDEV_RAIDZ_64MUL_2) and then XORed with the
 * matching word of sbuf.  `private` is unused.  Always returns 0.
 */
static int
vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
    void *private)
{
	(void) private;
	uint64_t *out = dbuf;
	uint64_t *in = sbuf;
	int remaining = size / sizeof (out[0]);
	uint64_t mask;

	while (remaining-- > 0) {
		VDEV_RAIDZ_64MUL_2(*out, mask);
		*out ^= *in;
		out++;
		in++;
	}

	return (0);
}
1324 
/*
 * Single-buffer counterpart of vdev_raidz_reconst_q_pre_func(), used where
 * there is no source data to mix in: each 64-bit word of buf is only
 * multiplied by 2.  `private` is unused.  Always returns 0.
 */
static int
vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
{
	(void) private;
	uint64_t *out = buf;
	int remaining = size / sizeof (out[0]);
	uint64_t mask;

	while (remaining-- > 0) {
		/* same operation as vdev_raidz_reconst_q_pre_func() on dst */
		VDEV_RAIDZ_64MUL_2(*out, mask);
		out++;
	}

	return (0);
}
1340 
/*
 * State for vdev_raidz_reconst_q_post_func().  The member order matters:
 * the structure is filled in with a positional initializer.
 */
struct reconst_q_struct {
	uint64_t *q;	/* cursor into the Q parity buffer; advanced per word */
	int exp;	/* exponent passed to vdev_raidz_exp2() for each byte */
};
1345 
1346 static int
vdev_raidz_reconst_q_post_func(void * buf,size_t size,void * private)1347 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
1348 {
1349 	struct reconst_q_struct *rq = private;
1350 	uint64_t *dst = buf;
1351 	int cnt = size / sizeof (dst[0]);
1352 
1353 	for (int i = 0; i < cnt; i++, dst++, rq->q++) {
1354 		int j;
1355 		uint8_t *b;
1356 
1357 		*dst ^= *rq->q;
1358 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
1359 			*b = vdev_raidz_exp2(*b, rq->exp);
1360 		}
1361 	}
1362 
1363 	return (0);
1364 }
1365 
/*
 * State for the vdev_raidz_reconstruct_pq() callbacks.  All four pointers
 * are byte cursors that the callbacks advance.  The member order matters:
 * the structure is filled in with a positional initializer.
 */
struct reconst_pq_struct {
	uint8_t *p;	/* actual P parity */
	uint8_t *q;	/* actual Q parity */
	uint8_t *pxy;	/* P computed with columns x and y treated as zero */
	uint8_t *qxy;	/* Q computed with columns x and y treated as zero */
	int aexp;	/* exponent applied to (P ^ Pxy) */
	int bexp;	/* exponent applied to (Q ^ Qxy) */
};
1374 
1375 static int
vdev_raidz_reconst_pq_func(void * xbuf,void * ybuf,size_t size,void * private)1376 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
1377 {
1378 	struct reconst_pq_struct *rpq = private;
1379 	uint8_t *xd = xbuf;
1380 	uint8_t *yd = ybuf;
1381 
1382 	for (int i = 0; i < size;
1383 	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
1384 		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
1385 		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
1386 		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
1387 	}
1388 
1389 	return (0);
1390 }
1391 
1392 static int
vdev_raidz_reconst_pq_tail_func(void * xbuf,size_t size,void * private)1393 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
1394 {
1395 	struct reconst_pq_struct *rpq = private;
1396 	uint8_t *xd = xbuf;
1397 
1398 	for (int i = 0; i < size;
1399 	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
1400 		/* same operation as vdev_raidz_reconst_pq_func() on xd */
1401 		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
1402 		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
1403 	}
1404 
1405 	return (0);
1406 }
1407 
1408 static void
vdev_raidz_reconstruct_p(raidz_row_t * rr,int * tgts,int ntgts)1409 vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts)
1410 {
1411 	int x = tgts[0];
1412 	abd_t *dst, *src;
1413 
1414 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1415 		zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x);
1416 
1417 	ASSERT3U(ntgts, ==, 1);
1418 	ASSERT3U(x, >=, rr->rr_firstdatacol);
1419 	ASSERT3U(x, <, rr->rr_cols);
1420 
1421 	ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size);
1422 
1423 	src = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
1424 	dst = rr->rr_col[x].rc_abd;
1425 
1426 	abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size);
1427 
1428 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1429 		uint64_t size = MIN(rr->rr_col[x].rc_size,
1430 		    rr->rr_col[c].rc_size);
1431 
1432 		src = rr->rr_col[c].rc_abd;
1433 
1434 		if (c == x)
1435 			continue;
1436 
1437 		(void) abd_iterate_func2(dst, src, 0, 0, size,
1438 		    vdev_raidz_reconst_p_func, NULL);
1439 	}
1440 }
1441 
1442 static void
vdev_raidz_reconstruct_q(raidz_row_t * rr,int * tgts,int ntgts)1443 vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
1444 {
1445 	int x = tgts[0];
1446 	int c, exp;
1447 	abd_t *dst, *src;
1448 
1449 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1450 		zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x);
1451 
1452 	ASSERT(ntgts == 1);
1453 
1454 	ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1455 
1456 	for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1457 		uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size,
1458 		    rr->rr_col[c].rc_size);
1459 
1460 		src = rr->rr_col[c].rc_abd;
1461 		dst = rr->rr_col[x].rc_abd;
1462 
1463 		if (c == rr->rr_firstdatacol) {
1464 			abd_copy(dst, src, size);
1465 			if (rr->rr_col[x].rc_size > size) {
1466 				abd_zero_off(dst, size,
1467 				    rr->rr_col[x].rc_size - size);
1468 			}
1469 		} else {
1470 			ASSERT3U(size, <=, rr->rr_col[x].rc_size);
1471 			(void) abd_iterate_func2(dst, src, 0, 0, size,
1472 			    vdev_raidz_reconst_q_pre_func, NULL);
1473 			(void) abd_iterate_func(dst,
1474 			    size, rr->rr_col[x].rc_size - size,
1475 			    vdev_raidz_reconst_q_pre_tail_func, NULL);
1476 		}
1477 	}
1478 
1479 	src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
1480 	dst = rr->rr_col[x].rc_abd;
1481 	exp = 255 - (rr->rr_cols - 1 - x);
1482 
1483 	struct reconst_q_struct rq = { abd_to_buf(src), exp };
1484 	(void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size,
1485 	    vdev_raidz_reconst_q_post_func, &rq);
1486 }
1487 
/*
 * Rebuild two data columns, x = tgts[0] and y = tgts[1] (x < y), using both
 * P and Q parity.  The row's P and Q buffers and the sizes of columns x and
 * y are temporarily replaced while Pxy/Qxy are generated; all of them are
 * restored before returning.
 */
static void
vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
	abd_t *pdata, *qdata;
	uint64_t xsize, ysize;
	int x = tgts[0];
	int y = tgts[1];
	abd_t *xd, *yd;

	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
		zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y);

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rr->rr_firstdatacol);
	ASSERT(y < rr->rr_cols);

	ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
	qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
	xsize = rr->rr_col[x].rc_size;
	ysize = rr->rr_col[y].rc_size;

	/* Scratch buffers that receive Pxy and Qxy; freed below. */
	rr->rr_col[VDEV_RAIDZ_P].rc_abd =
	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
	rr->rr_col[VDEV_RAIDZ_Q].rc_abd =
	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
	rr->rr_col[x].rc_size = 0;
	rr->rr_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rr);

	/* Put the real column sizes back now that Pxy/Qxy exist. */
	rr->rr_col[x].rc_size = xsize;
	rr->rr_col[y].rc_size = ysize;

	p = abd_to_buf(pdata);
	q = abd_to_buf(qdata);
	pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
	qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
	xd = rr->rr_col[x].rc_abd;
	yd = rr->rr_col[y].rc_abd;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	ASSERT3U(xsize, >=, ysize);
	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };

	/*
	 * The first ysize bytes yield both D_x and D_y; the remaining
	 * xsize - ysize bytes only have a D_x to recover.  Both calls share
	 * rpq, so the tail continues from where the first call left the
	 * parity cursors.
	 */
	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
	    vdev_raidz_reconst_pq_func, &rpq);
	(void) abd_iterate_func(xd, ysize, xsize - ysize,
	    vdev_raidz_reconst_pq_tail_func, &rpq);

	abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
	abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);

	/*
	 * Restore the saved parity data.
	 */
	rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata;
	rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;
}
1578 
1579 /*
1580  * In the general case of reconstruction, we must solve the system of linear
1581  * equations defined by the coefficients used to generate parity as well as
1582  * the contents of the data and parity disks. This can be expressed with
1583  * vectors for the original data (D) and the actual data (d) and parity (p)
1584  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
1585  *
1586  *            __   __                     __     __
1587  *            |     |         __     __   |  p_0  |
1588  *            |  V  |         |  D_0  |   | p_m-1 |
1589  *            |     |    x    |   :   | = |  d_0  |
1590  *            |  I  |         | D_n-1 |   |   :   |
1591  *            |     |         ~~     ~~   | d_n-1 |
1592  *            ~~   ~~                     ~~     ~~
1593  *
1594  * I is simply a square identity matrix of size n, and V is a vandermonde
1595  * matrix defined by the coefficients we chose for the various parity columns
1596  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
1597  * computation as well as linear separability.
1598  *
1599  *      __               __               __     __
1600  *      |   1   ..  1 1 1 |               |  p_0  |
1601  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
1602  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
1603  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
1604  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
1605  *      |   :       : : : |   |   :   |   |  d_2  |
1606  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
1607  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
1608  *      |   0   ..  0 0 1 |               | d_n-1 |
1609  *      ~~               ~~               ~~     ~~
1610  *
1611  * Note that I, V, d, and p are known. To compute D, we must invert the
1612  * matrix and use the known data and parity values to reconstruct the unknown
1613  * data values. We begin by removing the rows in V|I and d|p that correspond
1614  * to failed or missing columns; we then make V|I square (n x n) and d|p
1615  * sized n by removing rows corresponding to unused parity from the bottom up
1616  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
1617  * using Gauss-Jordan elimination. In the example below we use m=3 parity
1618  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
1619  *           __                               __
1620  *           |  1   1   1   1   1   1   1   1  |
1621  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
1622  *           |  19 205 116  29  64  16  4   1  |      / /
1623  *           |  1   0   0   0   0   0   0   0  |     / /
1624  *           |  0   1   0   0   0   0   0   0  | <--' /
1625  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
1626  *           |  0   0   0   1   0   0   0   0  |
1627  *           |  0   0   0   0   1   0   0   0  |
1628  *           |  0   0   0   0   0   1   0   0  |
1629  *           |  0   0   0   0   0   0   1   0  |
1630  *           |  0   0   0   0   0   0   0   1  |
1631  *           ~~                               ~~
1632  *           __                               __
1633  *           |  1   1   1   1   1   1   1   1  |
1634  *           | 128  64  32  16  8   4   2   1  |
1635  *           |  19 205 116  29  64  16  4   1  |
1636  *           |  1   0   0   0   0   0   0   0  |
1637  *           |  0   1   0   0   0   0   0   0  |
1638  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
1639  *           |  0   0   0   1   0   0   0   0  |
1640  *           |  0   0   0   0   1   0   0   0  |
1641  *           |  0   0   0   0   0   1   0   0  |
1642  *           |  0   0   0   0   0   0   1   0  |
1643  *           |  0   0   0   0   0   0   0   1  |
1644  *           ~~                               ~~
1645  *
1646  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1647  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1648  * matrix is not singular.
1649  * __                                                                 __
1650  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1651  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1652  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1653  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1654  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1655  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1656  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1657  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1658  * ~~                                                                 ~~
1659  * __                                                                 __
1660  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1661  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1662  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1663  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1664  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1665  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1666  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1667  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1668  * ~~                                                                 ~~
1669  * __                                                                 __
1670  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1671  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1672  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1673  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1674  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1675  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1676  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1677  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1678  * ~~                                                                 ~~
1679  * __                                                                 __
1680  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1681  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1682  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1683  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1684  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1685  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1686  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1687  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1688  * ~~                                                                 ~~
1689  * __                                                                 __
1690  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1691  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1692  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1693  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1694  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1695  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1696  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1697  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1698  * ~~                                                                 ~~
1699  * __                                                                 __
1700  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1701  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1702  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1703  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1704  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1705  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1706  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1707  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1708  * ~~                                                                 ~~
1709  *                   __                               __
1710  *                   |  0   0   1   0   0   0   0   0  |
1711  *                   | 167 100  5   41 159 169 217 208 |
1712  *                   | 166 100  4   40 158 168 216 209 |
1713  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1714  *                   |  0   0   0   0   1   0   0   0  |
1715  *                   |  0   0   0   0   0   1   0   0  |
1716  *                   |  0   0   0   0   0   0   1   0  |
1717  *                   |  0   0   0   0   0   0   0   1  |
1718  *                   ~~                               ~~
1719  *
1720  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1721  * of the missing data.
1722  *
1723  * As is apparent from the example above, the only non-trivial rows in the
1724  * inverse matrix correspond to the data disks that we're trying to
1725  * reconstruct. Indeed, those are the only rows we need as the others would
1726  * only be useful for reconstructing data known or assumed to be valid. For
1727  * that reason, we only build the coefficients in the rows that correspond to
1728  * targeted columns.
1729  */
1730 
1731 static void
vdev_raidz_matrix_init(raidz_row_t * rr,int n,int nmap,int * map,uint8_t ** rows)1732 vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map,
1733     uint8_t **rows)
1734 {
1735 	int i, j;
1736 	int pow;
1737 
1738 	ASSERT(n == rr->rr_cols - rr->rr_firstdatacol);
1739 
1740 	/*
1741 	 * Fill in the missing rows of interest.
1742 	 */
1743 	for (i = 0; i < nmap; i++) {
1744 		ASSERT3S(0, <=, map[i]);
1745 		ASSERT3S(map[i], <=, 2);
1746 
1747 		pow = map[i] * n;
1748 		if (pow > 255)
1749 			pow -= 255;
1750 		ASSERT(pow <= 255);
1751 
1752 		for (j = 0; j < n; j++) {
1753 			pow -= map[i];
1754 			if (pow < 0)
1755 				pow += 255;
1756 			rows[i][j] = vdev_raidz_pow2[pow];
1757 		}
1758 	}
1759 }
1760 
1761 static void
vdev_raidz_matrix_invert(raidz_row_t * rr,int n,int nmissing,int * missing,uint8_t ** rows,uint8_t ** invrows,const uint8_t * used)1762 vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing,
1763     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1764 {
1765 	int i, j, ii, jj;
1766 	uint8_t log;
1767 
1768 	/*
1769 	 * Assert that the first nmissing entries from the array of used
1770 	 * columns correspond to parity columns and that subsequent entries
1771 	 * correspond to data columns.
1772 	 */
1773 	for (i = 0; i < nmissing; i++) {
1774 		ASSERT3S(used[i], <, rr->rr_firstdatacol);
1775 	}
1776 	for (; i < n; i++) {
1777 		ASSERT3S(used[i], >=, rr->rr_firstdatacol);
1778 	}
1779 
1780 	/*
1781 	 * First initialize the storage where we'll compute the inverse rows.
1782 	 */
1783 	for (i = 0; i < nmissing; i++) {
1784 		for (j = 0; j < n; j++) {
1785 			invrows[i][j] = (i == j) ? 1 : 0;
1786 		}
1787 	}
1788 
1789 	/*
1790 	 * Subtract all trivial rows from the rows of consequence.
1791 	 */
1792 	for (i = 0; i < nmissing; i++) {
1793 		for (j = nmissing; j < n; j++) {
1794 			ASSERT3U(used[j], >=, rr->rr_firstdatacol);
1795 			jj = used[j] - rr->rr_firstdatacol;
1796 			ASSERT3S(jj, <, n);
1797 			invrows[i][j] = rows[i][jj];
1798 			rows[i][jj] = 0;
1799 		}
1800 	}
1801 
1802 	/*
1803 	 * For each of the rows of interest, we must normalize it and subtract
1804 	 * a multiple of it from the other rows.
1805 	 */
1806 	for (i = 0; i < nmissing; i++) {
1807 		for (j = 0; j < missing[i]; j++) {
1808 			ASSERT0(rows[i][j]);
1809 		}
1810 		ASSERT3U(rows[i][missing[i]], !=, 0);
1811 
1812 		/*
1813 		 * Compute the inverse of the first element and multiply each
1814 		 * element in the row by that value.
1815 		 */
1816 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1817 
1818 		for (j = 0; j < n; j++) {
1819 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1820 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1821 		}
1822 
1823 		for (ii = 0; ii < nmissing; ii++) {
1824 			if (i == ii)
1825 				continue;
1826 
1827 			ASSERT3U(rows[ii][missing[i]], !=, 0);
1828 
1829 			log = vdev_raidz_log2[rows[ii][missing[i]]];
1830 
1831 			for (j = 0; j < n; j++) {
1832 				rows[ii][j] ^=
1833 				    vdev_raidz_exp2(rows[i][j], log);
1834 				invrows[ii][j] ^=
1835 				    vdev_raidz_exp2(invrows[i][j], log);
1836 			}
1837 		}
1838 	}
1839 
1840 	/*
1841 	 * Verify that the data that is left in the rows are properly part of
1842 	 * an identity matrix.
1843 	 */
1844 	for (i = 0; i < nmissing; i++) {
1845 		for (j = 0; j < n; j++) {
1846 			if (j == missing[i]) {
1847 				ASSERT3U(rows[i][j], ==, 1);
1848 			} else {
1849 				ASSERT0(rows[i][j]);
1850 			}
1851 		}
1852 	}
1853 }
1854 
1855 static void
vdev_raidz_matrix_reconstruct(raidz_row_t * rr,int n,int nmissing,int * missing,uint8_t ** invrows,const uint8_t * used)1856 vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
1857     int *missing, uint8_t **invrows, const uint8_t *used)
1858 {
1859 	int i, j, x, cc, c;
1860 	uint8_t *src;
1861 	uint64_t ccount;
1862 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL };
1863 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 };
1864 	uint8_t log = 0;
1865 	uint8_t val;
1866 	int ll;
1867 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1868 	uint8_t *p, *pp;
1869 	size_t psize;
1870 
1871 	psize = sizeof (invlog[0][0]) * n * nmissing;
1872 	p = kmem_alloc(psize, KM_SLEEP);
1873 
1874 	for (pp = p, i = 0; i < nmissing; i++) {
1875 		invlog[i] = pp;
1876 		pp += n;
1877 	}
1878 
1879 	for (i = 0; i < nmissing; i++) {
1880 		for (j = 0; j < n; j++) {
1881 			ASSERT3U(invrows[i][j], !=, 0);
1882 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1883 		}
1884 	}
1885 
1886 	for (i = 0; i < n; i++) {
1887 		c = used[i];
1888 		ASSERT3U(c, <, rr->rr_cols);
1889 
1890 		ccount = rr->rr_col[c].rc_size;
1891 		ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0);
1892 		if (ccount == 0)
1893 			continue;
1894 		src = abd_to_buf(rr->rr_col[c].rc_abd);
1895 		for (j = 0; j < nmissing; j++) {
1896 			cc = missing[j] + rr->rr_firstdatacol;
1897 			ASSERT3U(cc, >=, rr->rr_firstdatacol);
1898 			ASSERT3U(cc, <, rr->rr_cols);
1899 			ASSERT3U(cc, !=, c);
1900 
1901 			dcount[j] = rr->rr_col[cc].rc_size;
1902 			if (dcount[j] != 0)
1903 				dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd);
1904 		}
1905 
1906 		for (x = 0; x < ccount; x++, src++) {
1907 			if (*src != 0)
1908 				log = vdev_raidz_log2[*src];
1909 
1910 			for (cc = 0; cc < nmissing; cc++) {
1911 				if (x >= dcount[cc])
1912 					continue;
1913 
1914 				if (*src == 0) {
1915 					val = 0;
1916 				} else {
1917 					if ((ll = log + invlog[cc][i]) >= 255)
1918 						ll -= 255;
1919 					val = vdev_raidz_pow2[ll];
1920 				}
1921 
1922 				if (i == 0)
1923 					dst[cc][x] = val;
1924 				else
1925 					dst[cc][x] ^= val;
1926 			}
1927 		}
1928 	}
1929 
1930 	kmem_free(p, psize);
1931 }
1932 
1933 static void
vdev_raidz_reconstruct_general(raidz_row_t * rr,int * tgts,int ntgts)1934 vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
1935 {
1936 	int i, c, t, tt;
1937 	unsigned int n;
1938 	unsigned int nmissing_rows;
1939 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1940 	int parity_map[VDEV_RAIDZ_MAXPARITY];
1941 	uint8_t *p, *pp;
1942 	size_t psize;
1943 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1944 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1945 	uint8_t *used;
1946 
1947 	abd_t **bufs = NULL;
1948 
1949 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1950 		zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts);
1951 	/*
1952 	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
1953 	 * temporary linear ABDs if any non-linear ABDs are found.
1954 	 */
1955 	for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) {
1956 		ASSERT(rr->rr_col[i].rc_abd != NULL);
1957 		if (!abd_is_linear(rr->rr_col[i].rc_abd)) {
1958 			bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *),
1959 			    KM_PUSHPAGE);
1960 
1961 			for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1962 				raidz_col_t *col = &rr->rr_col[c];
1963 
1964 				bufs[c] = col->rc_abd;
1965 				if (bufs[c] != NULL) {
1966 					col->rc_abd = abd_alloc_linear(
1967 					    col->rc_size, B_TRUE);
1968 					abd_copy(col->rc_abd, bufs[c],
1969 					    col->rc_size);
1970 				}
1971 			}
1972 
1973 			break;
1974 		}
1975 	}
1976 
1977 	n = rr->rr_cols - rr->rr_firstdatacol;
1978 
1979 	/*
1980 	 * Figure out which data columns are missing.
1981 	 */
1982 	nmissing_rows = 0;
1983 	for (t = 0; t < ntgts; t++) {
1984 		if (tgts[t] >= rr->rr_firstdatacol) {
1985 			missing_rows[nmissing_rows++] =
1986 			    tgts[t] - rr->rr_firstdatacol;
1987 		}
1988 	}
1989 
1990 	/*
1991 	 * Figure out which parity columns to use to help generate the missing
1992 	 * data columns.
1993 	 */
1994 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1995 		ASSERT(tt < ntgts);
1996 		ASSERT(c < rr->rr_firstdatacol);
1997 
1998 		/*
1999 		 * Skip any targeted parity columns.
2000 		 */
2001 		if (c == tgts[tt]) {
2002 			tt++;
2003 			continue;
2004 		}
2005 
2006 		parity_map[i] = c;
2007 		i++;
2008 	}
2009 
2010 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
2011 	    nmissing_rows * n + sizeof (used[0]) * n;
2012 	p = kmem_alloc(psize, KM_SLEEP);
2013 
2014 	for (pp = p, i = 0; i < nmissing_rows; i++) {
2015 		rows[i] = pp;
2016 		pp += n;
2017 		invrows[i] = pp;
2018 		pp += n;
2019 	}
2020 	used = pp;
2021 
2022 	for (i = 0; i < nmissing_rows; i++) {
2023 		used[i] = parity_map[i];
2024 	}
2025 
2026 	for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
2027 		if (tt < nmissing_rows &&
2028 		    c == missing_rows[tt] + rr->rr_firstdatacol) {
2029 			tt++;
2030 			continue;
2031 		}
2032 
2033 		ASSERT3S(i, <, n);
2034 		used[i] = c;
2035 		i++;
2036 	}
2037 
2038 	/*
2039 	 * Initialize the interesting rows of the matrix.
2040 	 */
2041 	vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows);
2042 
2043 	/*
2044 	 * Invert the matrix.
2045 	 */
2046 	vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows,
2047 	    invrows, used);
2048 
2049 	/*
2050 	 * Reconstruct the missing data using the generated matrix.
2051 	 */
2052 	vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows,
2053 	    invrows, used);
2054 
2055 	kmem_free(p, psize);
2056 
2057 	/*
2058 	 * copy back from temporary linear abds and free them
2059 	 */
2060 	if (bufs) {
2061 		for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
2062 			raidz_col_t *col = &rr->rr_col[c];
2063 
2064 			if (bufs[c] != NULL) {
2065 				abd_copy(bufs[c], col->rc_abd, col->rc_size);
2066 				abd_free(col->rc_abd);
2067 			}
2068 			col->rc_abd = bufs[c];
2069 		}
2070 		kmem_free(bufs, rr->rr_cols * sizeof (abd_t *));
2071 	}
2072 }
2073 
2074 static void
vdev_raidz_reconstruct_row(raidz_map_t * rm,raidz_row_t * rr,const int * t,int nt)2075 vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
2076     const int *t, int nt)
2077 {
2078 	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
2079 	int ntgts;
2080 	int i, c, ret;
2081 	int nbadparity, nbaddata;
2082 	int parity_valid[VDEV_RAIDZ_MAXPARITY];
2083 
2084 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
2085 		zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)",
2086 		    rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata,
2087 		    (int)rr->rr_missingparity);
2088 	}
2089 
2090 	nbadparity = rr->rr_firstdatacol;
2091 	nbaddata = rr->rr_cols - nbadparity;
2092 	ntgts = 0;
2093 	for (i = 0, c = 0; c < rr->rr_cols; c++) {
2094 		if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
2095 			zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u "
2096 			    "offset=%llx error=%u)",
2097 			    rr, c, (int)rr->rr_col[c].rc_devidx,
2098 			    (long long)rr->rr_col[c].rc_offset,
2099 			    (int)rr->rr_col[c].rc_error);
2100 		}
2101 		if (c < rr->rr_firstdatacol)
2102 			parity_valid[c] = B_FALSE;
2103 
2104 		if (i < nt && c == t[i]) {
2105 			tgts[ntgts++] = c;
2106 			i++;
2107 		} else if (rr->rr_col[c].rc_error != 0) {
2108 			tgts[ntgts++] = c;
2109 		} else if (c >= rr->rr_firstdatacol) {
2110 			nbaddata--;
2111 		} else {
2112 			parity_valid[c] = B_TRUE;
2113 			nbadparity--;
2114 		}
2115 	}
2116 
2117 	ASSERT(ntgts >= nt);
2118 	ASSERT(nbaddata >= 0);
2119 	ASSERT(nbaddata + nbadparity == ntgts);
2120 
2121 	dt = &tgts[nbadparity];
2122 
2123 	/* Reconstruct using the new math implementation */
2124 	ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata);
2125 	if (ret != RAIDZ_ORIGINAL_IMPL)
2126 		return;
2127 
2128 	/*
2129 	 * See if we can use any of our optimized reconstruction routines.
2130 	 */
2131 	switch (nbaddata) {
2132 	case 1:
2133 		if (parity_valid[VDEV_RAIDZ_P]) {
2134 			vdev_raidz_reconstruct_p(rr, dt, 1);
2135 			return;
2136 		}
2137 
2138 		ASSERT(rr->rr_firstdatacol > 1);
2139 
2140 		if (parity_valid[VDEV_RAIDZ_Q]) {
2141 			vdev_raidz_reconstruct_q(rr, dt, 1);
2142 			return;
2143 		}
2144 
2145 		ASSERT(rr->rr_firstdatacol > 2);
2146 		break;
2147 
2148 	case 2:
2149 		ASSERT(rr->rr_firstdatacol > 1);
2150 
2151 		if (parity_valid[VDEV_RAIDZ_P] &&
2152 		    parity_valid[VDEV_RAIDZ_Q]) {
2153 			vdev_raidz_reconstruct_pq(rr, dt, 2);
2154 			return;
2155 		}
2156 
2157 		ASSERT(rr->rr_firstdatacol > 2);
2158 
2159 		break;
2160 	}
2161 
2162 	vdev_raidz_reconstruct_general(rr, tgts, ntgts);
2163 }
2164 
2165 static int
vdev_raidz_open(vdev_t * vd,uint64_t * asize,uint64_t * max_asize,uint64_t * logical_ashift,uint64_t * physical_ashift)2166 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
2167     uint64_t *logical_ashift, uint64_t *physical_ashift)
2168 {
2169 	vdev_raidz_t *vdrz = vd->vdev_tsd;
2170 	uint64_t nparity = vdrz->vd_nparity;
2171 	int c;
2172 	int lasterror = 0;
2173 	int numerrors = 0;
2174 
2175 	ASSERT(nparity > 0);
2176 
2177 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
2178 	    vd->vdev_children < nparity + 1) {
2179 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
2180 		return (SET_ERROR(EINVAL));
2181 	}
2182 
2183 	vdev_open_children(vd);
2184 
2185 	for (c = 0; c < vd->vdev_children; c++) {
2186 		vdev_t *cvd = vd->vdev_child[c];
2187 
2188 		if (cvd->vdev_open_error != 0) {
2189 			lasterror = cvd->vdev_open_error;
2190 			numerrors++;
2191 			continue;
2192 		}
2193 
2194 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
2195 		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
2196 		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
2197 	}
2198 	for (c = 0; c < vd->vdev_children; c++) {
2199 		vdev_t *cvd = vd->vdev_child[c];
2200 
2201 		if (cvd->vdev_open_error != 0)
2202 			continue;
2203 		*physical_ashift = vdev_best_ashift(*logical_ashift,
2204 		    *physical_ashift, cvd->vdev_physical_ashift);
2205 	}
2206 
2207 	if (vd->vdev_rz_expanding) {
2208 		*asize *= vd->vdev_children - 1;
2209 		*max_asize *= vd->vdev_children - 1;
2210 
2211 		vd->vdev_min_asize = *asize;
2212 	} else {
2213 		*asize *= vd->vdev_children;
2214 		*max_asize *= vd->vdev_children;
2215 	}
2216 
2217 	if (numerrors > nparity) {
2218 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
2219 		return (lasterror);
2220 	}
2221 
2222 	return (0);
2223 }
2224 
2225 static void
vdev_raidz_close(vdev_t * vd)2226 vdev_raidz_close(vdev_t *vd)
2227 {
2228 	for (int c = 0; c < vd->vdev_children; c++) {
2229 		if (vd->vdev_child[c] != NULL)
2230 			vdev_close(vd->vdev_child[c]);
2231 	}
2232 }
2233 
2234 /*
2235  * Return the logical width to use, given the txg in which the allocation
2236  * happened.
2237  */
2238 static uint64_t
vdev_raidz_get_logical_width(vdev_raidz_t * vdrz,uint64_t txg)2239 vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg)
2240 {
2241 	reflow_node_t lookup = {
2242 		.re_txg = txg,
2243 	};
2244 	avl_index_t where;
2245 
2246 	uint64_t width;
2247 	mutex_enter(&vdrz->vd_expand_lock);
2248 	reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where);
2249 	if (re != NULL) {
2250 		width = re->re_logical_width;
2251 	} else {
2252 		re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE);
2253 		if (re != NULL)
2254 			width = re->re_logical_width;
2255 		else
2256 			width = vdrz->vd_original_width;
2257 	}
2258 	mutex_exit(&vdrz->vd_expand_lock);
2259 	return (width);
2260 }
2261 /*
2262  * This code converts an asize into the largest psize that can safely be written
2263  * to an allocation of that size for this vdev.
2264  *
2265  * Note that this function will not take into account the effect of gang
2266  * headers, which also modify the ASIZE of the DVAs. It is purely a reverse of
2267  * the psize_to_asize function.
2268  */
2269 static uint64_t
vdev_raidz_asize_to_psize(vdev_t * vd,uint64_t asize,uint64_t txg)2270 vdev_raidz_asize_to_psize(vdev_t *vd, uint64_t asize, uint64_t txg)
2271 {
2272 	vdev_raidz_t *vdrz = vd->vdev_tsd;
2273 	uint64_t psize;
2274 	uint64_t ashift = vd->vdev_top->vdev_ashift;
2275 	uint64_t nparity = vdrz->vd_nparity;
2276 
2277 	uint64_t cols = vdev_raidz_get_logical_width(vdrz, txg);
2278 
2279 	ASSERT0(asize % (1 << ashift));
2280 
2281 	psize = (asize >> ashift);
2282 	/*
2283 	 * If the roundup to nparity + 1 caused us to spill into a new row, we
2284 	 * need to ignore that row entirely (since it can't store data or
2285 	 * parity).
2286 	 */
2287 	uint64_t rows = psize / cols;
2288 	psize = psize - (rows * cols) <= nparity ? rows * cols : psize;
2289 	/*  Subtract out parity sectors for each row storing data. */
2290 	psize -= nparity * DIV_ROUND_UP(psize, cols);
2291 	psize <<= ashift;
2292 
2293 	return (psize);
2294 }
2295 
2296 /*
2297  * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated
2298  * more space due to the lower data-to-parity ratio.  In this case it's
2299  * important to pass in the correct txg.  Note that vdev_gang_header_asize()
2300  * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE,
2301  * regardless of txg.  This is assured because for a single data sector, we
2302  * allocate P+1 sectors regardless of width ("cols", which is at least P+1).
2303  */
2304 static uint64_t
vdev_raidz_psize_to_asize(vdev_t * vd,uint64_t psize,uint64_t txg)2305 vdev_raidz_psize_to_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
2306 {
2307 	vdev_raidz_t *vdrz = vd->vdev_tsd;
2308 	uint64_t asize;
2309 	uint64_t ashift = vd->vdev_top->vdev_ashift;
2310 	uint64_t nparity = vdrz->vd_nparity;
2311 
2312 	uint64_t cols = vdev_raidz_get_logical_width(vdrz, txg);
2313 
2314 	asize = ((psize - 1) >> ashift) + 1;
2315 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
2316 	asize = roundup(asize, nparity + 1) << ashift;
2317 
2318 #ifdef ZFS_DEBUG
2319 	uint64_t asize_new = ((psize - 1) >> ashift) + 1;
2320 	uint64_t ncols_new = vdrz->vd_physical_width;
2321 	asize_new += nparity * ((asize_new + ncols_new - nparity - 1) /
2322 	    (ncols_new - nparity));
2323 	asize_new = roundup(asize_new, nparity + 1) << ashift;
2324 	VERIFY3U(asize_new, <=, asize);
2325 #endif
2326 
2327 	return (asize);
2328 }
2329 
2330 /*
2331  * The allocatable space for a raidz vdev is N * sizeof(smallest child)
2332  * so each child must provide at least 1/Nth of its asize.
2333  */
2334 static uint64_t
vdev_raidz_min_asize(vdev_t * vd)2335 vdev_raidz_min_asize(vdev_t *vd)
2336 {
2337 	return ((vd->vdev_min_asize + vd->vdev_children - 1) /
2338 	    vd->vdev_children);
2339 }
2340 
2341 /*
2342  * return B_TRUE if a read should be skipped due to being too slow.
2343  *
2344  * In vdev_child_slow_outlier() it looks for outliers based on disk
2345  * latency from the most recent child reads.  Here we're checking if,
2346  * over time, a disk has has been an outlier too many times and is
2347  * now in a sit out period.
2348  */
2349 boolean_t
vdev_sit_out_reads(vdev_t * vd,zio_flag_t io_flags)2350 vdev_sit_out_reads(vdev_t *vd, zio_flag_t io_flags)
2351 {
2352 	if (vdev_read_sit_out_secs == 0)
2353 		return (B_FALSE);
2354 
2355 	/* Avoid skipping a data column read when scrubbing */
2356 	if (io_flags & ZIO_FLAG_SCRUB)
2357 		return (B_FALSE);
2358 
2359 	if (!vd->vdev_ops->vdev_op_leaf) {
2360 		boolean_t sitting = B_FALSE;
2361 		for (int c = 0; c < vd->vdev_children; c++) {
2362 			sitting |= vdev_sit_out_reads(vd->vdev_child[c],
2363 			    io_flags);
2364 		}
2365 		return (sitting);
2366 	}
2367 
2368 	if (vd->vdev_read_sit_out_expire >= gethrestime_sec())
2369 		return (B_TRUE);
2370 
2371 	vd->vdev_read_sit_out_expire = 0;
2372 
2373 	return (B_FALSE);
2374 }
2375 
2376 void
vdev_raidz_child_done(zio_t * zio)2377 vdev_raidz_child_done(zio_t *zio)
2378 {
2379 	raidz_col_t *rc = zio->io_private;
2380 
2381 	ASSERT3P(rc->rc_abd, !=, NULL);
2382 	rc->rc_error = zio->io_error;
2383 	rc->rc_tried = 1;
2384 	rc->rc_skipped = 0;
2385 }
2386 
2387 static void
vdev_raidz_shadow_child_done(zio_t * zio)2388 vdev_raidz_shadow_child_done(zio_t *zio)
2389 {
2390 	raidz_col_t *rc = zio->io_private;
2391 
2392 	rc->rc_shadow_error = zio->io_error;
2393 }
2394 
2395 static void
vdev_raidz_io_verify(zio_t * zio,raidz_map_t * rm,raidz_row_t * rr,int col)2396 vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col)
2397 {
2398 	(void) rm;
2399 #ifdef ZFS_DEBUG
2400 	zfs_range_seg64_t logical_rs, physical_rs, remain_rs;
2401 	logical_rs.rs_start = rr->rr_offset;
2402 	logical_rs.rs_end = logical_rs.rs_start +
2403 	    vdev_raidz_psize_to_asize(zio->io_vd, rr->rr_size,
2404 	    BP_GET_PHYSICAL_BIRTH(zio->io_bp));
2405 
2406 	raidz_col_t *rc = &rr->rr_col[col];
2407 	vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
2408 
2409 	vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
2410 	ASSERT(vdev_xlate_is_empty(&remain_rs));
2411 	if (vdev_xlate_is_empty(&physical_rs)) {
2412 		/*
2413 		 * If we are in the middle of expansion, the
2414 		 * physical->logical mapping is changing so vdev_xlate()
2415 		 * can't give us a reliable answer.
2416 		 */
2417 		return;
2418 	}
2419 	ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
2420 	ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
2421 	/*
2422 	 * It would be nice to assert that rs_end is equal
2423 	 * to rc_offset + rc_size but there might be an
2424 	 * optional I/O at the end that is not accounted in
2425 	 * rc_size.
2426 	 */
2427 	if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
2428 		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
2429 		    rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift));
2430 	} else {
2431 		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
2432 	}
2433 #endif
2434 }
2435 
2436 static void
vdev_raidz_io_start_write(zio_t * zio,raidz_row_t * rr)2437 vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr)
2438 {
2439 	vdev_t *vd = zio->io_vd;
2440 	raidz_map_t *rm = zio->io_vsd;
2441 
2442 	vdev_raidz_generate_parity_row(rm, rr);
2443 
2444 	for (int c = 0; c < rr->rr_scols; c++) {
2445 		raidz_col_t *rc = &rr->rr_col[c];
2446 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2447 
2448 		/* Verify physical to logical translation */
2449 		vdev_raidz_io_verify(zio, rm, rr, c);
2450 
2451 		if (rc->rc_size == 0)
2452 			continue;
2453 
2454 		ASSERT3U(rc->rc_offset + rc->rc_size, <,
2455 		    cvd->vdev_psize - VDEV_LABEL_END_SIZE);
2456 
2457 		ASSERT3P(rc->rc_abd, !=, NULL);
2458 		zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2459 		    rc->rc_offset, rc->rc_abd,
2460 		    abd_get_size(rc->rc_abd), zio->io_type,
2461 		    zio->io_priority, 0, vdev_raidz_child_done, rc));
2462 
2463 		if (rc->rc_shadow_devidx != INT_MAX) {
2464 			vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx];
2465 
2466 			ASSERT3U(
2467 			    rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <,
2468 			    cvd2->vdev_psize - VDEV_LABEL_END_SIZE);
2469 
2470 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd2,
2471 			    rc->rc_shadow_offset, rc->rc_abd,
2472 			    abd_get_size(rc->rc_abd),
2473 			    zio->io_type, zio->io_priority, 0,
2474 			    vdev_raidz_shadow_child_done, rc));
2475 		}
2476 	}
2477 }
2478 
2479 /*
2480  * Generate optional I/Os for skip sectors to improve aggregation contiguity.
2481  * This only works for vdev_raidz_map_alloc() (not _expanded()).
2482  */
2483 static void
raidz_start_skip_writes(zio_t * zio)2484 raidz_start_skip_writes(zio_t *zio)
2485 {
2486 	vdev_t *vd = zio->io_vd;
2487 	uint64_t ashift = vd->vdev_top->vdev_ashift;
2488 	raidz_map_t *rm = zio->io_vsd;
2489 	ASSERT3U(rm->rm_nrows, ==, 1);
2490 	raidz_row_t *rr = rm->rm_row[0];
2491 	for (int c = 0; c < rr->rr_scols; c++) {
2492 		raidz_col_t *rc = &rr->rr_col[c];
2493 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2494 		if (rc->rc_size != 0)
2495 			continue;
2496 		ASSERT0P(rc->rc_abd);
2497 
2498 		ASSERT3U(rc->rc_offset, <,
2499 		    cvd->vdev_psize - VDEV_LABEL_END_SIZE);
2500 
2501 		zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset,
2502 		    NULL, 1ULL << ashift, zio->io_type, zio->io_priority,
2503 		    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
2504 	}
2505 }
2506 
2507 static void
vdev_raidz_io_start_read_row(zio_t * zio,raidz_row_t * rr,boolean_t forceparity)2508 vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
2509 {
2510 	vdev_t *vd = zio->io_vd;
2511 
2512 	/*
2513 	 * Iterate over the columns in reverse order so that we hit the parity
2514 	 * last -- any errors along the way will force us to read the parity.
2515 	 */
2516 	for (int c = rr->rr_cols - 1; c >= 0; c--) {
2517 		raidz_col_t *rc = &rr->rr_col[c];
2518 		if (rc->rc_size == 0)
2519 			continue;
2520 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2521 		if (!vdev_readable(cvd)) {
2522 			if (c >= rr->rr_firstdatacol)
2523 				rr->rr_missingdata++;
2524 			else
2525 				rr->rr_missingparity++;
2526 			rc->rc_error = SET_ERROR(ENXIO);
2527 			rc->rc_tried = 1;	/* don't even try */
2528 			rc->rc_skipped = 1;
2529 			continue;
2530 		}
2531 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2532 			if (c >= rr->rr_firstdatacol)
2533 				rr->rr_missingdata++;
2534 			else
2535 				rr->rr_missingparity++;
2536 			rc->rc_error = SET_ERROR(ESTALE);
2537 			rc->rc_skipped = 1;
2538 			continue;
2539 		}
2540 
2541 		if (vdev_sit_out_reads(cvd, zio->io_flags)) {
2542 			rr->rr_outlier_cnt++;
2543 			ASSERT0(rc->rc_latency_outlier);
2544 			rc->rc_latency_outlier = 1;
2545 		}
2546 	}
2547 
2548 	/*
2549 	 * When the row contains a latency outlier and sufficient parity
2550 	 * exists to reconstruct the column data, then skip reading the
2551 	 * known slow child vdev as a performance optimization.
2552 	 */
2553 	if (rr->rr_outlier_cnt > 0 &&
2554 	    (rr->rr_firstdatacol - rr->rr_missingparity) >=
2555 	    (rr->rr_missingdata + 1)) {
2556 
2557 		for (int c = rr->rr_cols - 1; c >= 0; c--) {
2558 			raidz_col_t *rc = &rr->rr_col[c];
2559 
2560 			if (rc->rc_error == 0 && rc->rc_latency_outlier) {
2561 				if (c >= rr->rr_firstdatacol)
2562 					rr->rr_missingdata++;
2563 				else
2564 					rr->rr_missingparity++;
2565 				rc->rc_error = SET_ERROR(EAGAIN);
2566 				rc->rc_skipped = 1;
2567 				break;
2568 			}
2569 		}
2570 	}
2571 
2572 	for (int c = rr->rr_cols - 1; c >= 0; c--) {
2573 		raidz_col_t *rc = &rr->rr_col[c];
2574 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2575 
2576 		if (rc->rc_error || rc->rc_size == 0)
2577 			continue;
2578 
2579 		if (forceparity ||
2580 		    c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
2581 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
2582 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2583 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
2584 			    zio->io_type, zio->io_priority, 0,
2585 			    vdev_raidz_child_done, rc));
2586 		}
2587 	}
2588 }
2589 
2590 static void
vdev_raidz_io_start_read_phys_cols(zio_t * zio,raidz_map_t * rm)2591 vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm)
2592 {
2593 	vdev_t *vd = zio->io_vd;
2594 
2595 	for (int i = 0; i < rm->rm_nphys_cols; i++) {
2596 		raidz_col_t *prc = &rm->rm_phys_col[i];
2597 		if (prc->rc_size == 0)
2598 			continue;
2599 
2600 		ASSERT3U(prc->rc_devidx, ==, i);
2601 		vdev_t *cvd = vd->vdev_child[i];
2602 
2603 		if (!vdev_readable(cvd)) {
2604 			prc->rc_error = SET_ERROR(ENXIO);
2605 			prc->rc_tried = 1;	/* don't even try */
2606 			prc->rc_skipped = 1;
2607 			continue;
2608 		}
2609 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2610 			prc->rc_error = SET_ERROR(ESTALE);
2611 			prc->rc_skipped = 1;
2612 			continue;
2613 		}
2614 		zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2615 		    prc->rc_offset, prc->rc_abd, prc->rc_size,
2616 		    zio->io_type, zio->io_priority, 0,
2617 		    vdev_raidz_child_done, prc));
2618 	}
2619 }
2620 
2621 static void
vdev_raidz_io_start_read(zio_t * zio,raidz_map_t * rm)2622 vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm)
2623 {
2624 	/*
2625 	 * If there are multiple rows, we will be hitting
2626 	 * all disks, so go ahead and read the parity so
2627 	 * that we are reading in decent size chunks.
2628 	 */
2629 	boolean_t forceparity = rm->rm_nrows > 1;
2630 
2631 	if (rm->rm_phys_col) {
2632 		vdev_raidz_io_start_read_phys_cols(zio, rm);
2633 	} else {
2634 		for (int i = 0; i < rm->rm_nrows; i++) {
2635 			raidz_row_t *rr = rm->rm_row[i];
2636 			vdev_raidz_io_start_read_row(zio, rr, forceparity);
2637 		}
2638 	}
2639 }
2640 
2641 /*
2642  * Start an IO operation on a RAIDZ VDev
2643  *
2644  * Outline:
2645  * - For write operations:
2646  *   1. Generate the parity data
2647  *   2. Create child zio write operations to each column's vdev, for both
2648  *      data and parity.
2649  *   3. If the column skips any sectors for padding, create optional dummy
2650  *      write zio children for those areas to improve aggregation continuity.
2651  * - For read operations:
2652  *   1. Create child zio read operations to each data column's vdev to read
2653  *      the range of data required for zio.
2654  *   2. If this is a scrub or resilver operation, or if any of the data
2655  *      vdevs have had errors, then create zio read operations to the parity
2656  *      columns' VDevs as well.
2657  */
2658 static void
vdev_raidz_io_start(zio_t * zio)2659 vdev_raidz_io_start(zio_t *zio)
2660 {
2661 	vdev_t *vd = zio->io_vd;
2662 	vdev_t *tvd = vd->vdev_top;
2663 	vdev_raidz_t *vdrz = vd->vdev_tsd;
2664 	raidz_map_t *rm;
2665 
2666 	uint64_t logical_width = vdev_raidz_get_logical_width(vdrz,
2667 	    BP_GET_PHYSICAL_BIRTH(zio->io_bp));
2668 	if (logical_width != vdrz->vd_physical_width) {
2669 		zfs_locked_range_t *lr = NULL;
2670 		uint64_t synced_offset = UINT64_MAX;
2671 		uint64_t next_offset = UINT64_MAX;
2672 		boolean_t use_scratch = B_FALSE;
2673 		/*
2674 		 * Note: when the expansion is completing, we set
2675 		 * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync())
2676 		 * in a later txg than when we last update spa_ubsync's state
2677 		 * (see the end of spa_raidz_expand_thread()).  Therefore we
2678 		 * may see vre_state!=SCANNING before
2679 		 * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected
2680 		 * on disk, but the copying progress has been synced to disk
2681 		 * (and reflected in spa_ubsync).  In this case it's fine to
2682 		 * treat the expansion as completed, since if we crash there's
2683 		 * no additional copying to do.
2684 		 */
2685 		if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
2686 			ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==,
2687 			    &vdrz->vn_vre);
2688 			lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock,
2689 			    zio->io_offset, zio->io_size, RL_READER);
2690 			use_scratch =
2691 			    (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) ==
2692 			    RRSS_SCRATCH_VALID);
2693 			synced_offset =
2694 			    RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync);
2695 			next_offset = vdrz->vn_vre.vre_offset;
2696 			/*
2697 			 * If we haven't resumed expanding since importing the
2698 			 * pool, vre_offset won't have been set yet.  In
2699 			 * this case the next offset to be copied is the same
2700 			 * as what was synced.
2701 			 */
2702 			if (next_offset == UINT64_MAX) {
2703 				next_offset = synced_offset;
2704 			}
2705 		}
2706 		if (use_scratch) {
2707 			zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced="
2708 			    "%lld next_offset=%lld use_scratch=%u",
2709 			    zio,
2710 			    zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ",
2711 			    (long long)zio->io_offset,
2712 			    (long long)synced_offset,
2713 			    (long long)next_offset,
2714 			    use_scratch);
2715 		}
2716 
2717 		rm = vdev_raidz_map_alloc_expanded(zio,
2718 		    tvd->vdev_ashift, vdrz->vd_physical_width,
2719 		    logical_width, vdrz->vd_nparity,
2720 		    synced_offset, next_offset, use_scratch);
2721 		rm->rm_lr = lr;
2722 	} else {
2723 		rm = vdev_raidz_map_alloc(zio,
2724 		    tvd->vdev_ashift, logical_width, vdrz->vd_nparity);
2725 	}
2726 	rm->rm_original_width = vdrz->vd_original_width;
2727 
2728 	zio->io_vsd = rm;
2729 	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
2730 	if (zio->io_type == ZIO_TYPE_WRITE) {
2731 		for (int i = 0; i < rm->rm_nrows; i++) {
2732 			vdev_raidz_io_start_write(zio, rm->rm_row[i]);
2733 		}
2734 
2735 		if (logical_width == vdrz->vd_physical_width) {
2736 			raidz_start_skip_writes(zio);
2737 		}
2738 	} else {
2739 		ASSERT(zio->io_type == ZIO_TYPE_READ);
2740 		vdev_raidz_io_start_read(zio, rm);
2741 	}
2742 
2743 	zio_execute(zio);
2744 }
2745 
2746 /*
2747  * Report a checksum error for a child of a RAID-Z device.
2748  */
2749 void
vdev_raidz_checksum_error(zio_t * zio,raidz_col_t * rc,abd_t * bad_data)2750 vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
2751 {
2752 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
2753 
2754 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) &&
2755 	    zio->io_priority != ZIO_PRIORITY_REBUILD) {
2756 		zio_bad_cksum_t zbc;
2757 		raidz_map_t *rm = zio->io_vsd;
2758 
2759 		zbc.zbc_has_cksum = 0;
2760 		zbc.zbc_injected = rm->rm_ecksuminjected;
2761 
2762 		mutex_enter(&vd->vdev_stat_lock);
2763 		vd->vdev_stat.vs_checksum_errors++;
2764 		mutex_exit(&vd->vdev_stat_lock);
2765 		(void) zfs_ereport_post_checksum(zio->io_spa, vd,
2766 		    &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
2767 		    rc->rc_abd, bad_data, &zbc);
2768 	}
2769 }
2770 
2771 /*
2772  * We keep track of whether or not there were any injected errors, so that
2773  * any ereports we generate can note it.
2774  */
2775 static int
raidz_checksum_verify(zio_t * zio)2776 raidz_checksum_verify(zio_t *zio)
2777 {
2778 	zio_bad_cksum_t zbc = {0};
2779 	raidz_map_t *rm = zio->io_vsd;
2780 
2781 	int ret = zio_checksum_error(zio, &zbc);
2782 	/*
2783 	 * Any Direct I/O read that has a checksum error must be treated as
2784 	 * suspicious as the contents of the buffer could be getting
2785 	 * manipulated while the I/O is taking place. The checksum verify error
2786 	 * will be reported to the top-level RAIDZ VDEV.
2787 	 */
2788 	if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) {
2789 		zio->io_error = ret;
2790 		zio->io_post |= ZIO_POST_DIO_CHKSUM_ERR;
2791 		zio_dio_chksum_verify_error_report(zio);
2792 		zio_checksum_verified(zio);
2793 		return (0);
2794 	}
2795 
2796 	if (ret != 0 && zbc.zbc_injected != 0)
2797 		rm->rm_ecksuminjected = 1;
2798 
2799 	return (ret);
2800 }
2801 
2802 /*
2803  * Generate the parity from the data columns. If we tried and were able to
2804  * read the parity without error, verify that the generated parity matches the
2805  * data we read. If it doesn't, we fire off a checksum error. Return the
2806  * number of such failures.
2807  */
2808 static int
raidz_parity_verify(zio_t * zio,raidz_row_t * rr)2809 raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
2810 {
2811 	abd_t *orig[VDEV_RAIDZ_MAXPARITY];
2812 	int c, ret = 0;
2813 	raidz_map_t *rm = zio->io_vsd;
2814 	raidz_col_t *rc;
2815 
2816 	blkptr_t *bp = zio->io_bp;
2817 	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
2818 	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
2819 
2820 	if (checksum == ZIO_CHECKSUM_NOPARITY)
2821 		return (ret);
2822 
2823 	for (c = 0; c < rr->rr_firstdatacol; c++) {
2824 		rc = &rr->rr_col[c];
2825 		if (!rc->rc_tried || rc->rc_error != 0)
2826 			continue;
2827 
2828 		orig[c] = rc->rc_abd;
2829 		ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size);
2830 		rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
2831 	}
2832 
2833 	/*
2834 	 * Verify any empty sectors are zero filled to ensure the parity
2835 	 * is calculated correctly even if these non-data sectors are damaged.
2836 	 */
2837 	if (rr->rr_nempty && rr->rr_abd_empty != NULL)
2838 		ret += vdev_draid_map_verify_empty(zio, rr);
2839 
2840 	/*
2841 	 * Regenerates parity even for !tried||rc_error!=0 columns.  This
2842 	 * isn't harmful but it does have the side effect of fixing stuff
2843 	 * we didn't realize was necessary (i.e. even if we return 0).
2844 	 */
2845 	vdev_raidz_generate_parity_row(rm, rr);
2846 
2847 	for (c = 0; c < rr->rr_firstdatacol; c++) {
2848 		rc = &rr->rr_col[c];
2849 
2850 		if (!rc->rc_tried || rc->rc_error != 0)
2851 			continue;
2852 
2853 		if (abd_cmp(orig[c], rc->rc_abd) != 0) {
2854 			zfs_dbgmsg("found error on col=%u devidx=%u off %llx",
2855 			    c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset);
2856 			vdev_raidz_checksum_error(zio, rc, orig[c]);
2857 			rc->rc_error = SET_ERROR(ECKSUM);
2858 			ret++;
2859 		}
2860 		abd_free(orig[c]);
2861 	}
2862 
2863 	return (ret);
2864 }
2865 
2866 static int
vdev_raidz_worst_error(raidz_row_t * rr)2867 vdev_raidz_worst_error(raidz_row_t *rr)
2868 {
2869 	int error = 0;
2870 
2871 	for (int c = 0; c < rr->rr_cols; c++) {
2872 		error = zio_worst_error(error, rr->rr_col[c].rc_error);
2873 		error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error);
2874 	}
2875 
2876 	return (error);
2877 }
2878 
/*
 * Find the median value from a set of n values.
 *
 * data must hold n values in ascending order.  For an odd n the middle
 * element is returned; for an even n the result is the mean of the two
 * middle elements, rounded down.  An empty set yields 0 rather than
 * reading before the start of the array.
 */
static uint64_t
latency_median_value(const uint64_t *data, size_t n)
{
	if (n == 0)
		return (0);

	size_t mid = n >> 1;

	if ((n & 1) != 0)
		return (data[mid]);

	uint64_t lo = data[mid - 1];
	uint64_t hi = data[mid];

	/*
	 * floor((lo + hi) / 2) computed without forming lo + hi, which
	 * could wrap around for very large values.
	 */
	return ((lo >> 1) + (hi >> 1) + (lo & hi & 1));
}
2894 
2895 /*
2896  * Calculate the outlier fence from a set of n latency values
2897  *
2898  * fence = Q3 + vdev_raidz_outlier_insensitivity x (Q3 - Q1)
2899  */
2900 static uint64_t
latency_quartiles_fence(const uint64_t * data,size_t n,uint64_t * iqr)2901 latency_quartiles_fence(const uint64_t *data, size_t n, uint64_t *iqr)
2902 {
2903 	uint64_t q1 = latency_median_value(&data[0], n >> 1);
2904 	uint64_t q3 = latency_median_value(&data[(n + 1) >> 1], n >> 1);
2905 
2906 	/*
2907 	 * To avoid detecting false positive outliers when N is small and
2908 	 * and the latencies values are very close, make sure the IQR
2909 	 * is at least 25% larger than Q1.
2910 	 */
2911 	*iqr = MAX(q3 - q1, q1 / 4);
2912 
2913 	return (q3 + (*iqr * vdev_raidz_outlier_insensitivity));
2914 }
/* Outlier detection is not attempted on vdevs with fewer children. */
#define	LAT_CHILDREN_MIN	5
/* A child whose outlier count exceeds this limit begins a read sit out. */
#define	LAT_OUTLIER_LIMIT	20

/*
 * qsort() comparison callback ordering two uint64_t latency values in
 * ascending order.
 */
static int
latency_compare(const void *arg1, const void *arg2)
{
	const uint64_t *lhs = arg1;
	const uint64_t *rhs = arg2;

	return (TREE_CMP(*lhs, *rhs));
}
2926 
2927 void
vdev_raidz_sit_child(vdev_t * svd,uint64_t secs)2928 vdev_raidz_sit_child(vdev_t *svd, uint64_t secs)
2929 {
2930 	for (int c = 0; c < svd->vdev_children; c++)
2931 		vdev_raidz_sit_child(svd->vdev_child[c], secs);
2932 
2933 	if (!svd->vdev_ops->vdev_op_leaf)
2934 		return;
2935 
2936 	/* Begin a sit out period for this slow drive */
2937 	svd->vdev_read_sit_out_expire = gethrestime_sec() +
2938 	    secs;
2939 
2940 	/* Count each slow io period */
2941 	mutex_enter(&svd->vdev_stat_lock);
2942 	svd->vdev_stat.vs_slow_ios++;
2943 	mutex_exit(&svd->vdev_stat_lock);
2944 }
2945 
2946 void
vdev_raidz_unsit_child(vdev_t * vd)2947 vdev_raidz_unsit_child(vdev_t *vd)
2948 {
2949 	for (int c = 0; c < vd->vdev_children; c++)
2950 		vdev_raidz_unsit_child(vd->vdev_child[c]);
2951 
2952 	if (!vd->vdev_ops->vdev_op_leaf)
2953 		return;
2954 
2955 	vd->vdev_read_sit_out_expire = 0;
2956 }
2957 
2958 /*
2959  * Check for any latency outlier from latest set of child reads.
2960  *
2961  * Uses a Tukey's fence, with K = 50, for detecting extreme outliers. This
2962  * rule defines extreme outliers as data points outside the fence of the
2963  * third quartile plus fifty times the Interquartile Range (IQR). This range
2964  * is the distance between the first and third quartile.
2965  *
2966  * Fifty is an extremely large value for Tukey's fence, but the outliers we're
2967  * attempting to detect here are orders of magnitude times larger than the
2968  * median. This large value should capture any truly fault disk quickly,
2969  * without causing spurious sit-outs.
2970  *
2971  * To further avoid spurious sit-outs, vdevs must be detected multiple times
2972  * as an outlier before they are sat, and outlier counts will gradually decay.
2973  * Every nchildren times we have detected an outlier, we subtract 2 from the
2974  * outlier count of all children. If detected outliers are close to uniformly
2975  * distributed, this will result in the outlier count remaining close to 0
2976  * (in expectation; over long enough time-scales, spurious sit-outs are still
2977  * possible).
2978  */
2979 static void
vdev_child_slow_outlier(zio_t * zio)2980 vdev_child_slow_outlier(zio_t *zio)
2981 {
2982 	vdev_t *vd = zio->io_vd;
2983 	if (!vd->vdev_autosit || vdev_read_sit_out_secs == 0 ||
2984 	    vd->vdev_children < LAT_CHILDREN_MIN)
2985 		return;
2986 
2987 	hrtime_t now = getlrtime();
2988 	uint64_t last = atomic_load_64(&vd->vdev_last_latency_check);
2989 
2990 	if ((now - last) < MSEC2NSEC(vdev_raidz_outlier_check_interval_ms))
2991 		return;
2992 
2993 	/* Allow a single winner when there are racing callers. */
2994 	if (atomic_cas_64(&vd->vdev_last_latency_check, last, now) != last)
2995 		return;
2996 
2997 	int children = vd->vdev_children;
2998 	uint64_t *lat_data = kmem_alloc(sizeof (uint64_t) * children, KM_SLEEP);
2999 
3000 	for (int c = 0; c < children; c++) {
3001 		vdev_t *cvd = vd->vdev_child[c];
3002 		if (cvd->vdev_prev_histo == NULL) {
3003 			mutex_enter(&cvd->vdev_stat_lock);
3004 			size_t size =
3005 			    sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]);
3006 			cvd->vdev_prev_histo = kmem_zalloc(size, KM_SLEEP);
3007 			memcpy(cvd->vdev_prev_histo,
3008 			    cvd->vdev_stat_ex.vsx_disk_histo[ZIO_TYPE_READ],
3009 			    size);
3010 			mutex_exit(&cvd->vdev_stat_lock);
3011 		}
3012 	}
3013 	uint64_t max = 0;
3014 	vdev_t *svd = NULL;
3015 	uint_t sitouts = 0;
3016 	boolean_t skip = B_FALSE, svd_sitting = B_FALSE;
3017 	for (int c = 0; c < children; c++) {
3018 		vdev_t *cvd = vd->vdev_child[c];
3019 		boolean_t sitting = vdev_sit_out_reads(cvd, 0) ||
3020 		    cvd->vdev_state != VDEV_STATE_HEALTHY;
3021 
3022 		/* We can't sit out more disks than we have parity */
3023 		if (sitting && ++sitouts >= vdev_get_nparity(vd))
3024 			skip = B_TRUE;
3025 
3026 		mutex_enter(&cvd->vdev_stat_lock);
3027 
3028 		uint64_t *prev_histo = cvd->vdev_prev_histo;
3029 		uint64_t *histo =
3030 		    cvd->vdev_stat_ex.vsx_disk_histo[ZIO_TYPE_READ];
3031 		if (skip) {
3032 			size_t size =
3033 			    sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]);
3034 			memcpy(prev_histo, histo, size);
3035 			mutex_exit(&cvd->vdev_stat_lock);
3036 			continue;
3037 		}
3038 		uint64_t count = 0;
3039 		lat_data[c] = 0;
3040 		for (int i = 0; i < VDEV_L_HISTO_BUCKETS; i++) {
3041 			uint64_t this_count = histo[i] - prev_histo[i];
3042 			lat_data[c] += (1ULL << i) * this_count;
3043 			count += this_count;
3044 		}
3045 		size_t size = sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]);
3046 		memcpy(prev_histo, histo, size);
3047 		mutex_exit(&cvd->vdev_stat_lock);
3048 		lat_data[c] /= MAX(1, count);
3049 
3050 		/* Wait until all disks have been read from */
3051 		if (lat_data[c] == 0 && !sitting) {
3052 			skip = B_TRUE;
3053 			continue;
3054 		}
3055 
3056 		/* Keep track of the vdev with largest value */
3057 		if (lat_data[c] > max) {
3058 			max = lat_data[c];
3059 			svd = cvd;
3060 			svd_sitting = sitting;
3061 		}
3062 	}
3063 
3064 	if (skip) {
3065 		kmem_free(lat_data, sizeof (uint64_t) * children);
3066 		return;
3067 	}
3068 
3069 	qsort((void *)lat_data, children, sizeof (uint64_t), latency_compare);
3070 
3071 	uint64_t iqr;
3072 	uint64_t fence = latency_quartiles_fence(lat_data, children, &iqr);
3073 
3074 	ASSERT3U(lat_data[children - 1], ==, max);
3075 	if (max > fence && !svd_sitting) {
3076 		ASSERT3U(iqr, >, 0);
3077 		uint64_t incr = MAX(1, MIN((max - fence) / iqr,
3078 		    LAT_OUTLIER_LIMIT / 4));
3079 		vd->vdev_outlier_count += incr;
3080 		if (vd->vdev_outlier_count >= children) {
3081 			for (int c = 0; c < children; c++) {
3082 				vdev_t *cvd = vd->vdev_child[c];
3083 				cvd->vdev_outlier_count -= 2;
3084 				cvd->vdev_outlier_count = MAX(0,
3085 				    cvd->vdev_outlier_count);
3086 			}
3087 			vd->vdev_outlier_count = 0;
3088 		}
3089 		/*
3090 		 * Keep track of how many times this child has had
3091 		 * an outlier read. A disk that persitently has a
3092 		 * higher than peers outlier count will be considered
3093 		 * a slow disk.
3094 		 */
3095 		svd->vdev_outlier_count += incr;
3096 		if (svd->vdev_outlier_count > LAT_OUTLIER_LIMIT) {
3097 			ASSERT0(svd->vdev_read_sit_out_expire);
3098 			vdev_raidz_sit_child(svd, vdev_read_sit_out_secs);
3099 			(void) zfs_ereport_post(FM_EREPORT_ZFS_SITOUT,
3100 			    zio->io_spa, svd, NULL, NULL, 0);
3101 			vdev_dbgmsg(svd, "begin read sit out for %d secs",
3102 			    (int)vdev_read_sit_out_secs);
3103 
3104 			for (int c = 0; c < vd->vdev_children; c++)
3105 				vd->vdev_child[c]->vdev_outlier_count = 0;
3106 		}
3107 	}
3108 
3109 	kmem_free(lat_data, sizeof (uint64_t) * children);
3110 }
3111 
3112 static void
vdev_raidz_io_done_verified(zio_t * zio,raidz_row_t * rr)3113 vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
3114 {
3115 	int unexpected_errors = 0;
3116 	int parity_errors = 0;
3117 	int parity_untried = 0;
3118 	int data_errors = 0;
3119 
3120 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
3121 
3122 	for (int c = 0; c < rr->rr_cols; c++) {
3123 		raidz_col_t *rc = &rr->rr_col[c];
3124 
3125 		if (rc->rc_error) {
3126 			if (c < rr->rr_firstdatacol)
3127 				parity_errors++;
3128 			else
3129 				data_errors++;
3130 
3131 			if (!rc->rc_skipped)
3132 				unexpected_errors++;
3133 		} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
3134 			parity_untried++;
3135 		}
3136 
3137 		if (rc->rc_force_repair)
3138 			unexpected_errors++;
3139 	}
3140 
3141 	/*
3142 	 * If we read more parity disks than were used for
3143 	 * reconstruction, confirm that the other parity disks produced
3144 	 * correct data.
3145 	 *
3146 	 * Note that we also regenerate parity when resilvering so we
3147 	 * can write it out to failed devices later.
3148 	 */
3149 	if (parity_errors + parity_untried <
3150 	    rr->rr_firstdatacol - data_errors ||
3151 	    (zio->io_flags & ZIO_FLAG_RESILVER)) {
3152 		int n = raidz_parity_verify(zio, rr);
3153 		unexpected_errors += n;
3154 	}
3155 
3156 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
3157 	    (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) {
3158 		/*
3159 		 * Use the good data we have in hand to repair damaged children.
3160 		 */
3161 		for (int c = 0; c < rr->rr_cols; c++) {
3162 			raidz_col_t *rc = &rr->rr_col[c];
3163 			vdev_t *vd = zio->io_vd;
3164 			vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
3165 
3166 			if (!rc->rc_allow_repair) {
3167 				continue;
3168 			} else if (!rc->rc_force_repair &&
3169 			    (rc->rc_error == 0 || rc->rc_size == 0)) {
3170 				continue;
3171 			}
3172 			/*
3173 			 * We do not allow self healing for Direct I/O reads.
3174 			 * See comment in vdev_raid_row_alloc().
3175 			 */
3176 			ASSERT0(zio->io_flags & ZIO_FLAG_DIO_READ);
3177 
3178 			zfs_dbgmsg("zio=%px repairing c=%u devidx=%u "
3179 			    "offset=%llx",
3180 			    zio, c, rc->rc_devidx, (long long)rc->rc_offset);
3181 
3182 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
3183 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
3184 			    ZIO_TYPE_WRITE,
3185 			    zio->io_priority == ZIO_PRIORITY_REBUILD ?
3186 			    ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
3187 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
3188 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
3189 		}
3190 	}
3191 
3192 	/*
3193 	 * Scrub or resilver i/o's: overwrite any shadow locations with the
3194 	 * good data.  This ensures that if we've already copied this sector,
3195 	 * it will be corrected if it was damaged.  This writes more than is
3196 	 * necessary, but since expansion is paused during scrub/resilver, at
3197 	 * most a single row will have a shadow location.
3198 	 */
3199 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
3200 	    (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) {
3201 		for (int c = 0; c < rr->rr_cols; c++) {
3202 			raidz_col_t *rc = &rr->rr_col[c];
3203 			vdev_t *vd = zio->io_vd;
3204 
3205 			if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0)
3206 				continue;
3207 			vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx];
3208 
3209 			/*
3210 			 * Note: We don't want to update the repair stats
3211 			 * because that would incorrectly indicate that there
3212 			 * was bad data to repair, which we aren't sure about.
3213 			 * By clearing the SCAN_THREAD flag, we prevent this
3214 			 * from happening, despite having the REPAIR flag set.
3215 			 * We need to set SELF_HEAL so that this i/o can't be
3216 			 * bypassed by zio_vdev_io_start().
3217 			 */
3218 			zio_t *cio = zio_vdev_child_io(zio, NULL, cvd,
3219 			    rc->rc_shadow_offset, rc->rc_abd, rc->rc_size,
3220 			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
3221 			    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
3222 			    NULL, NULL);
3223 			cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD;
3224 			zio_nowait(cio);
3225 		}
3226 	}
3227 }
3228 
3229 static void
raidz_restore_orig_data(raidz_map_t * rm)3230 raidz_restore_orig_data(raidz_map_t *rm)
3231 {
3232 	for (int i = 0; i < rm->rm_nrows; i++) {
3233 		raidz_row_t *rr = rm->rm_row[i];
3234 		for (int c = 0; c < rr->rr_cols; c++) {
3235 			raidz_col_t *rc = &rr->rr_col[c];
3236 			if (rc->rc_need_orig_restore) {
3237 				abd_copy(rc->rc_abd,
3238 				    rc->rc_orig_data, rc->rc_size);
3239 				rc->rc_need_orig_restore = B_FALSE;
3240 			}
3241 		}
3242 	}
3243 }
3244 
3245 /*
3246  * During raidz_reconstruct() for expanded VDEV, we need special consideration
3247  * failure simulations.  See note in raidz_reconstruct() on simulating failure
3248  * of a pre-expansion device.
3249  *
3250  * Treating logical child i as failed, return TRUE if the given column should
3251  * be treated as failed.  The idea of logical children allows us to imagine
3252  * that a disk silently failed before a RAIDZ expansion (reads from this disk
3253  * succeed but return the wrong data).  Since the expansion doesn't verify
3254  * checksums, the incorrect data will be moved to new locations spread among
3255  * the children (going diagonally across them).
3256  *
3257  * Higher "logical child failures" (values of `i`) indicate these
3258  * "pre-expansion failures".  The first physical_width values imagine that a
3259  * current child failed; the next physical_width-1 values imagine that a
3260  * child failed before the most recent expansion; the next physical_width-2
3261  * values imagine a child failed in the expansion before that, etc.
3262  */
3263 static boolean_t
raidz_simulate_failure(int physical_width,int original_width,int ashift,int i,raidz_col_t * rc)3264 raidz_simulate_failure(int physical_width, int original_width, int ashift,
3265     int i, raidz_col_t *rc)
3266 {
3267 	uint64_t sector_id =
3268 	    physical_width * (rc->rc_offset >> ashift) +
3269 	    rc->rc_devidx;
3270 
3271 	for (int w = physical_width; w >= original_width; w--) {
3272 		if (i < w) {
3273 			return (sector_id % w == i);
3274 		} else {
3275 			i -= w;
3276 		}
3277 	}
3278 	ASSERT(!"invalid logical child id");
3279 	return (B_FALSE);
3280 }
3281 
3282 /*
3283  * returns EINVAL if reconstruction of the block will not be possible
3284  * returns ECKSUM if this specific reconstruction failed
3285  * returns 0 on successful reconstruction
3286  */
3287 static int
raidz_reconstruct(zio_t * zio,int * ltgts,int ntgts,int nparity)3288 raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
3289 {
3290 	raidz_map_t *rm = zio->io_vsd;
3291 	int physical_width = zio->io_vd->vdev_children;
3292 	int original_width = (rm->rm_original_width != 0) ?
3293 	    rm->rm_original_width : physical_width;
3294 	int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT;
3295 
3296 	if (dbgmsg) {
3297 		zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u "
3298 		    "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts);
3299 	}
3300 
3301 	/* Reconstruct each row */
3302 	for (int r = 0; r < rm->rm_nrows; r++) {
3303 		raidz_row_t *rr = rm->rm_row[r];
3304 		int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */
3305 		int t = 0;
3306 		int dead = 0;
3307 		int dead_data = 0;
3308 
3309 		if (dbgmsg)
3310 			zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r);
3311 
3312 		for (int c = 0; c < rr->rr_cols; c++) {
3313 			raidz_col_t *rc = &rr->rr_col[c];
3314 			ASSERT0(rc->rc_need_orig_restore);
3315 			if (rc->rc_error != 0) {
3316 				dead++;
3317 				if (c >= nparity)
3318 					dead_data++;
3319 				continue;
3320 			}
3321 			if (rc->rc_size == 0)
3322 				continue;
3323 			for (int lt = 0; lt < ntgts; lt++) {
3324 				if (raidz_simulate_failure(physical_width,
3325 				    original_width,
3326 				    zio->io_vd->vdev_top->vdev_ashift,
3327 				    ltgts[lt], rc)) {
3328 					if (rc->rc_orig_data == NULL) {
3329 						rc->rc_orig_data =
3330 						    abd_alloc_linear(
3331 						    rc->rc_size, B_TRUE);
3332 						abd_copy(rc->rc_orig_data,
3333 						    rc->rc_abd, rc->rc_size);
3334 					}
3335 					rc->rc_need_orig_restore = B_TRUE;
3336 
3337 					dead++;
3338 					if (c >= nparity)
3339 						dead_data++;
3340 					/*
3341 					 * Note: simulating failure of a
3342 					 * pre-expansion device can hit more
3343 					 * than one column, in which case we
3344 					 * might try to simulate more failures
3345 					 * than can be reconstructed, which is
3346 					 * also more than the size of my_tgts.
3347 					 * This check prevents accessing past
3348 					 * the end of my_tgts.  The "dead >
3349 					 * nparity" check below will fail this
3350 					 * reconstruction attempt.
3351 					 */
3352 					if (t < VDEV_RAIDZ_MAXPARITY) {
3353 						my_tgts[t++] = c;
3354 						if (dbgmsg) {
3355 							zfs_dbgmsg("simulating "
3356 							    "failure of col %u "
3357 							    "devidx %u", c,
3358 							    (int)rc->rc_devidx);
3359 						}
3360 					}
3361 					break;
3362 				}
3363 			}
3364 		}
3365 		if (dead > nparity) {
3366 			/* reconstruction not possible */
3367 			if (dbgmsg) {
3368 				zfs_dbgmsg("reconstruction not possible; "
3369 				    "too many failures");
3370 			}
3371 			raidz_restore_orig_data(rm);
3372 			return (EINVAL);
3373 		}
3374 		if (dead_data > 0)
3375 			vdev_raidz_reconstruct_row(rm, rr, my_tgts, t);
3376 	}
3377 
3378 	/* Check for success */
3379 	if (raidz_checksum_verify(zio) == 0) {
3380 		if (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR)
3381 			return (0);
3382 
3383 		/* Reconstruction succeeded - report errors */
3384 		for (int i = 0; i < rm->rm_nrows; i++) {
3385 			raidz_row_t *rr = rm->rm_row[i];
3386 
3387 			for (int c = 0; c < rr->rr_cols; c++) {
3388 				raidz_col_t *rc = &rr->rr_col[c];
3389 				if (rc->rc_need_orig_restore) {
3390 					/*
3391 					 * Note: if this is a parity column,
3392 					 * we don't really know if it's wrong.
3393 					 * We need to let
3394 					 * vdev_raidz_io_done_verified() check
3395 					 * it, and if we set rc_error, it will
3396 					 * think that it is a "known" error
3397 					 * that doesn't need to be checked
3398 					 * or corrected.
3399 					 */
3400 					if (rc->rc_error == 0 &&
3401 					    c >= rr->rr_firstdatacol) {
3402 						vdev_raidz_checksum_error(zio,
3403 						    rc, rc->rc_orig_data);
3404 						rc->rc_error =
3405 						    SET_ERROR(ECKSUM);
3406 					}
3407 					rc->rc_need_orig_restore = B_FALSE;
3408 				}
3409 			}
3410 
3411 			vdev_raidz_io_done_verified(zio, rr);
3412 		}
3413 
3414 		zio_checksum_verified(zio);
3415 
3416 		if (dbgmsg) {
3417 			zfs_dbgmsg("reconstruction successful "
3418 			    "(checksum verified)");
3419 		}
3420 		return (0);
3421 	}
3422 
3423 	/* Reconstruction failed - restore original data */
3424 	raidz_restore_orig_data(rm);
3425 	if (dbgmsg) {
3426 		zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum "
3427 		    "failed", zio);
3428 	}
3429 	return (ECKSUM);
3430 }
3431 
3432 /*
3433  * Iterate over all combinations of N bad vdevs and attempt a reconstruction.
3434  * Note that the algorithm below is non-optimal because it doesn't take into
3435  * account how reconstruction is actually performed. For example, with
3436  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
3437  * is targeted as invalid as if columns 1 and 4 are targeted since in both
3438  * cases we'd only use parity information in column 0.
3439  *
3440  * The order that we find the various possible combinations of failed
3441  * disks is dictated by these rules:
3442  * - Examine each "slot" (the "i" in tgts[i])
3443  *   - Try to increment this slot (tgts[i] += 1)
3444  *   - if we can't increment because it runs into the next slot,
3445  *     reset our slot to the minimum, and examine the next slot
3446  *
3447  *  For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
3448  *  3 columns to reconstruct), we will generate the following sequence:
3449  *
3450  *  STATE        ACTION
3451  *  0 1 2        special case: skip since these are all parity
3452  *  0 1   3      first slot: reset to 0; middle slot: increment to 2
3453  *  0   2 3      first slot: increment to 1
3454  *    1 2 3      first: reset to 0; middle: reset to 1; last: increment to 4
3455  *  0 1     4    first: reset to 0; middle: increment to 2
3456  *  0   2   4    first: increment to 1
3457  *    1 2   4    first: reset to 0; middle: increment to 3
3458  *  0     3 4    first: increment to 1
3459  *    1   3 4    first: increment to 2
3460  *      2 3 4    first: reset to 0; middle: reset to 1; last: increment to 5
3461  *  0 1       5  first: reset to 0; middle: increment to 2
3462  *  0   2     5  first: increment to 1
3463  *    1 2     5  first: reset to 0; middle: increment to 3
3464  *  0     3   5  first: increment to 1
3465  *    1   3   5  first: increment to 2
3466  *      2 3   5  first: reset to 0; middle: increment to 4
3467  *  0       4 5  first: increment to 1
3468  *    1     4 5  first: increment to 2
3469  *      2   4 5  first: increment to 3
3470  *        3 4 5  done
3471  *
3472  * This strategy works for dRAID but is less efficient when there are a large
3473  * number of child vdevs and therefore permutations to check. Furthermore,
3474  * since the raidz_map_t rows likely do not overlap, reconstruction would be
3475  * possible as long as there are no more than nparity data errors per row.
3476  * These additional permutations are not currently checked but could be as
3477  * a future improvement.
3478  *
3479  * Returns 0 on success, ECKSUM on failure.
3480  */
3481 static int
vdev_raidz_combrec(zio_t * zio)3482 vdev_raidz_combrec(zio_t *zio)
3483 {
3484 	int nparity = vdev_get_nparity(zio->io_vd);
3485 	raidz_map_t *rm = zio->io_vsd;
3486 	int physical_width = zio->io_vd->vdev_children;
3487 	int original_width = (rm->rm_original_width != 0) ?
3488 	    rm->rm_original_width : physical_width;
3489 
3490 	for (int i = 0; i < rm->rm_nrows; i++) {
3491 		raidz_row_t *rr = rm->rm_row[i];
3492 		int total_errors = 0;
3493 
3494 		for (int c = 0; c < rr->rr_cols; c++) {
3495 			if (rr->rr_col[c].rc_error)
3496 				total_errors++;
3497 		}
3498 
3499 		if (total_errors > nparity)
3500 			return (vdev_raidz_worst_error(rr));
3501 	}
3502 
3503 	for (int num_failures = 1; num_failures <= nparity; num_failures++) {
3504 		int tstore[VDEV_RAIDZ_MAXPARITY + 2];
3505 		int *ltgts = &tstore[1]; /* value is logical child ID */
3506 
3507 
3508 		/*
3509 		 * Determine number of logical children, n.  See comment
3510 		 * above raidz_simulate_failure().
3511 		 */
3512 		int n = 0;
3513 		for (int w = physical_width;
3514 		    w >= original_width; w--) {
3515 			n += w;
3516 		}
3517 
3518 		ASSERT3U(num_failures, <=, nparity);
3519 		ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);
3520 
3521 		/* Handle corner cases in combrec logic */
3522 		ltgts[-1] = -1;
3523 		for (int i = 0; i < num_failures; i++) {
3524 			ltgts[i] = i;
3525 		}
3526 		ltgts[num_failures] = n;
3527 
3528 		for (;;) {
3529 			int err = raidz_reconstruct(zio, ltgts, num_failures,
3530 			    nparity);
3531 			if (err == EINVAL) {
3532 				/*
3533 				 * Reconstruction not possible with this #
3534 				 * failures; try more failures.
3535 				 */
3536 				break;
3537 			} else if (err == 0)
3538 				return (0);
3539 
3540 			/* Compute next targets to try */
3541 			for (int t = 0; ; t++) {
3542 				ASSERT3U(t, <, num_failures);
3543 				ltgts[t]++;
3544 				if (ltgts[t] == n) {
3545 					/* try more failures */
3546 					ASSERT3U(t, ==, num_failures - 1);
3547 					if (zfs_flags &
3548 					    ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
3549 						zfs_dbgmsg("reconstruction "
3550 						    "failed for num_failures="
3551 						    "%u; tried all "
3552 						    "combinations",
3553 						    num_failures);
3554 					}
3555 					break;
3556 				}
3557 
3558 				ASSERT3U(ltgts[t], <, n);
3559 				ASSERT3U(ltgts[t], <=, ltgts[t + 1]);
3560 
3561 				/*
3562 				 * If that spot is available, we're done here.
3563 				 * Try the next combination.
3564 				 */
3565 				if (ltgts[t] != ltgts[t + 1])
3566 					break; // found next combination
3567 
3568 				/*
3569 				 * Otherwise, reset this tgt to the minimum,
3570 				 * and move on to the next tgt.
3571 				 */
3572 				ltgts[t] = ltgts[t - 1] + 1;
3573 				ASSERT3U(ltgts[t], ==, t);
3574 			}
3575 
3576 			/* Increase the number of failures and keep trying. */
3577 			if (ltgts[num_failures - 1] == n)
3578 				break;
3579 		}
3580 	}
3581 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
3582 		zfs_dbgmsg("reconstruction failed for all num_failures");
3583 	return (ECKSUM);
3584 }
3585 
3586 void
vdev_raidz_reconstruct(raidz_map_t * rm,const int * t,int nt)3587 vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
3588 {
3589 	for (uint64_t row = 0; row < rm->rm_nrows; row++) {
3590 		raidz_row_t *rr = rm->rm_row[row];
3591 		vdev_raidz_reconstruct_row(rm, rr, t, nt);
3592 	}
3593 }
3594 
3595 /*
3596  * Complete a write IO operation on a RAIDZ VDev
3597  *
3598  * Outline:
3599  *   1. Check for errors on the child IOs.
3600  *   2. Return, setting an error code if too few child VDevs were written
3601  *      to reconstruct the data later.  Note that partial writes are
3602  *      considered successful if they can be reconstructed at all.
3603  */
3604 static void
vdev_raidz_io_done_write_impl(zio_t * zio,raidz_row_t * rr)3605 vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
3606 {
3607 	int normal_errors = 0;
3608 	int shadow_errors = 0;
3609 
3610 	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
3611 	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
3612 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
3613 
3614 	for (int c = 0; c < rr->rr_cols; c++) {
3615 		raidz_col_t *rc = &rr->rr_col[c];
3616 
3617 		if (rc->rc_error != 0) {
3618 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
3619 			normal_errors++;
3620 		}
3621 		if (rc->rc_shadow_error != 0) {
3622 			ASSERT(rc->rc_shadow_error != ECKSUM);
3623 			shadow_errors++;
3624 		}
3625 	}
3626 
3627 	/*
3628 	 * Treat partial writes as a success. If we couldn't write enough
3629 	 * columns to reconstruct the data, the I/O failed.  Otherwise, good
3630 	 * enough.  Note that in the case of a shadow write (during raidz
3631 	 * expansion), depending on if we crash, either the normal (old) or
3632 	 * shadow (new) location may become the "real" version of the block,
3633 	 * so both locations must have sufficient redundancy.
3634 	 *
3635 	 * Now that we support write reallocation, it would be better
3636 	 * to treat partial failure as real failure unless there are
3637 	 * no non-degraded top-level vdevs left, and not update DTLs
3638 	 * if we intend to reallocate.
3639 	 */
3640 	if (normal_errors > rr->rr_firstdatacol ||
3641 	    shadow_errors > rr->rr_firstdatacol) {
3642 		zio->io_error = zio_worst_error(zio->io_error,
3643 		    vdev_raidz_worst_error(rr));
3644 	}
3645 }
3646 
3647 static void
vdev_raidz_io_done_reconstruct_known_missing(zio_t * zio,raidz_map_t * rm,raidz_row_t * rr)3648 vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
3649     raidz_row_t *rr)
3650 {
3651 	int parity_errors = 0;
3652 	int parity_untried = 0;
3653 	int data_errors = 0;
3654 	int total_errors = 0;
3655 
3656 	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
3657 	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
3658 
3659 	for (int c = 0; c < rr->rr_cols; c++) {
3660 		raidz_col_t *rc = &rr->rr_col[c];
3661 
3662 		/*
3663 		 * If scrubbing and a replacing/sparing child vdev determined
3664 		 * that not all of its children have an identical copy of the
3665 		 * data, then clear the error so the column is treated like
3666 		 * any other read and force a repair to correct the damage.
3667 		 */
3668 		if (rc->rc_error == ECKSUM) {
3669 			ASSERT(zio->io_flags & ZIO_FLAG_SCRUB);
3670 			vdev_raidz_checksum_error(zio, rc, rc->rc_abd);
3671 			rc->rc_force_repair = 1;
3672 			rc->rc_error = 0;
3673 		}
3674 
3675 		if (rc->rc_error) {
3676 			if (c < rr->rr_firstdatacol)
3677 				parity_errors++;
3678 			else
3679 				data_errors++;
3680 
3681 			total_errors++;
3682 		} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
3683 			parity_untried++;
3684 		}
3685 	}
3686 
3687 	/*
3688 	 * If there were data errors and the number of errors we saw was
3689 	 * correctable -- less than or equal to the number of parity disks read
3690 	 * -- reconstruct based on the missing data.
3691 	 */
3692 	if (data_errors != 0 &&
3693 	    total_errors <= rr->rr_firstdatacol - parity_untried) {
3694 		/*
3695 		 * We either attempt to read all the parity columns or
3696 		 * none of them. If we didn't try to read parity, we
3697 		 * wouldn't be here in the correctable case. There must
3698 		 * also have been fewer parity errors than parity
3699 		 * columns or, again, we wouldn't be in this code path.
3700 		 */
3701 		ASSERT0(parity_untried);
3702 		ASSERT(parity_errors < rr->rr_firstdatacol);
3703 
3704 		/*
3705 		 * Identify the data columns that reported an error.
3706 		 */
3707 		int n = 0;
3708 		int tgts[VDEV_RAIDZ_MAXPARITY];
3709 		for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
3710 			raidz_col_t *rc = &rr->rr_col[c];
3711 			if (rc->rc_error != 0) {
3712 				ASSERT(n < VDEV_RAIDZ_MAXPARITY);
3713 				tgts[n++] = c;
3714 			}
3715 		}
3716 
3717 		ASSERT(rr->rr_firstdatacol >= n);
3718 
3719 		vdev_raidz_reconstruct_row(rm, rr, tgts, n);
3720 	}
3721 }
3722 
3723 /*
3724  * Return the number of reads issued.
3725  */
3726 static int
vdev_raidz_read_all(zio_t * zio,raidz_row_t * rr)3727 vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
3728 {
3729 	vdev_t *vd = zio->io_vd;
3730 	int nread = 0;
3731 
3732 	rr->rr_missingdata = 0;
3733 	rr->rr_missingparity = 0;
3734 
3735 	/*
3736 	 * If this rows contains empty sectors which are not required
3737 	 * for a normal read then allocate an ABD for them now so they
3738 	 * may be read, verified, and any needed repairs performed.
3739 	 */
3740 	if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL)
3741 		vdev_draid_map_alloc_empty(zio, rr);
3742 
3743 	for (int c = 0; c < rr->rr_cols; c++) {
3744 		raidz_col_t *rc = &rr->rr_col[c];
3745 		if (rc->rc_tried || rc->rc_size == 0)
3746 			continue;
3747 
3748 		zio_nowait(zio_vdev_child_io(zio, NULL,
3749 		    vd->vdev_child[rc->rc_devidx],
3750 		    rc->rc_offset, rc->rc_abd, rc->rc_size,
3751 		    zio->io_type, zio->io_priority, 0,
3752 		    vdev_raidz_child_done, rc));
3753 		nread++;
3754 	}
3755 	return (nread);
3756 }
3757 
3758 /*
3759  * We're here because either there were too many errors to even attempt
3760  * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec()
3761  * failed. In either case, there is enough bad data to prevent reconstruction.
3762  * Start checksum ereports for all children which haven't failed.
3763  */
3764 static void
vdev_raidz_io_done_unrecoverable(zio_t * zio)3765 vdev_raidz_io_done_unrecoverable(zio_t *zio)
3766 {
3767 	raidz_map_t *rm = zio->io_vsd;
3768 
3769 	for (int i = 0; i < rm->rm_nrows; i++) {
3770 		raidz_row_t *rr = rm->rm_row[i];
3771 
3772 		for (int c = 0; c < rr->rr_cols; c++) {
3773 			raidz_col_t *rc = &rr->rr_col[c];
3774 			vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
3775 
3776 			if (rc->rc_error != 0)
3777 				continue;
3778 
3779 			zio_bad_cksum_t zbc;
3780 			zbc.zbc_has_cksum = 0;
3781 			zbc.zbc_injected = rm->rm_ecksuminjected;
3782 			mutex_enter(&cvd->vdev_stat_lock);
3783 			cvd->vdev_stat.vs_checksum_errors++;
3784 			mutex_exit(&cvd->vdev_stat_lock);
3785 			(void) zfs_ereport_start_checksum(zio->io_spa,
3786 			    cvd, &zio->io_bookmark, zio, rc->rc_offset,
3787 			    rc->rc_size, &zbc);
3788 		}
3789 	}
3790 }
3791 
3792 void
vdev_raidz_io_done(zio_t * zio)3793 vdev_raidz_io_done(zio_t *zio)
3794 {
3795 	raidz_map_t *rm = zio->io_vsd;
3796 
3797 	ASSERT(zio->io_bp != NULL);
3798 	if (zio->io_type == ZIO_TYPE_WRITE) {
3799 		for (int i = 0; i < rm->rm_nrows; i++) {
3800 			vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
3801 		}
3802 	} else {
3803 		if (rm->rm_phys_col) {
3804 			/*
3805 			 * This is an aggregated read.  Copy the data and status
3806 			 * from the aggregate abd's to the individual rows.
3807 			 */
3808 			for (int i = 0; i < rm->rm_nrows; i++) {
3809 				raidz_row_t *rr = rm->rm_row[i];
3810 
3811 				for (int c = 0; c < rr->rr_cols; c++) {
3812 					raidz_col_t *rc = &rr->rr_col[c];
3813 					if (rc->rc_tried || rc->rc_size == 0)
3814 						continue;
3815 
3816 					raidz_col_t *prc =
3817 					    &rm->rm_phys_col[rc->rc_devidx];
3818 					rc->rc_error = prc->rc_error;
3819 					rc->rc_tried = prc->rc_tried;
3820 					rc->rc_skipped = prc->rc_skipped;
3821 					if (c >= rr->rr_firstdatacol) {
3822 						/*
3823 						 * Note: this is slightly faster
3824 						 * than using abd_copy_off().
3825 						 */
3826 						char *physbuf = abd_to_buf(
3827 						    prc->rc_abd);
3828 						void *physloc = physbuf +
3829 						    rc->rc_offset -
3830 						    prc->rc_offset;
3831 
3832 						abd_copy_from_buf(rc->rc_abd,
3833 						    physloc, rc->rc_size);
3834 					}
3835 				}
3836 			}
3837 		}
3838 
3839 		for (int i = 0; i < rm->rm_nrows; i++) {
3840 			raidz_row_t *rr = rm->rm_row[i];
3841 			vdev_raidz_io_done_reconstruct_known_missing(zio,
3842 			    rm, rr);
3843 		}
3844 
3845 		if (raidz_checksum_verify(zio) == 0) {
3846 			if (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR)
3847 				goto done;
3848 
3849 			for (int i = 0; i < rm->rm_nrows; i++) {
3850 				raidz_row_t *rr = rm->rm_row[i];
3851 				vdev_raidz_io_done_verified(zio, rr);
3852 			}
3853 			/* Periodically check for a read outlier */
3854 			if (zio->io_type == ZIO_TYPE_READ)
3855 				vdev_child_slow_outlier(zio);
3856 			zio_checksum_verified(zio);
3857 		} else {
3858 			/*
3859 			 * A sequential resilver has no checksum which makes
3860 			 * combinatoral reconstruction impossible. This code
3861 			 * path is unreachable since raidz_checksum_verify()
3862 			 * has no checksum to verify and must succeed.
3863 			 */
3864 			ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD);
3865 
3866 			/*
3867 			 * This isn't a typical situation -- either we got a
3868 			 * read error or a child silently returned bad data.
3869 			 * Read every block so we can try again with as much
3870 			 * data and parity as we can track down. If we've
3871 			 * already been through once before, all children will
3872 			 * be marked as tried so we'll proceed to combinatorial
3873 			 * reconstruction.
3874 			 */
3875 			int nread = 0;
3876 			for (int i = 0; i < rm->rm_nrows; i++) {
3877 				nread += vdev_raidz_read_all(zio,
3878 				    rm->rm_row[i]);
3879 			}
3880 			if (nread != 0) {
3881 				/*
3882 				 * Normally our stage is VDEV_IO_DONE, but if
3883 				 * we've already called redone(), it will have
3884 				 * changed to VDEV_IO_START, in which case we
3885 				 * don't want to call redone() again.
3886 				 */
3887 				if (zio->io_stage != ZIO_STAGE_VDEV_IO_START)
3888 					zio_vdev_io_redone(zio);
3889 				return;
3890 			}
3891 			/*
3892 			 * It would be too expensive to try every possible
3893 			 * combination of failed sectors in every row, so
3894 			 * instead we try every combination of failed current or
3895 			 * past physical disk. This means that if the incorrect
3896 			 * sectors were all on Nparity disks at any point in the
3897 			 * past, we will find the correct data.  The only known
3898 			 * case where this is less durable than a non-expanded
3899 			 * RAIDZ, is if we have a silent failure during
3900 			 * expansion.  In that case, one block could be
3901 			 * partially in the old format and partially in the
3902 			 * new format, so we'd lost some sectors from the old
3903 			 * format and some from the new format.
3904 			 *
3905 			 * e.g. logical_width=4 physical_width=6
3906 			 * the 15 (6+5+4) possible failed disks are:
3907 			 * width=6 child=0
3908 			 * width=6 child=1
3909 			 * width=6 child=2
3910 			 * width=6 child=3
3911 			 * width=6 child=4
3912 			 * width=6 child=5
3913 			 * width=5 child=0
3914 			 * width=5 child=1
3915 			 * width=5 child=2
3916 			 * width=5 child=3
3917 			 * width=5 child=4
3918 			 * width=4 child=0
3919 			 * width=4 child=1
3920 			 * width=4 child=2
3921 			 * width=4 child=3
3922 			 * And we will try every combination of Nparity of these
3923 			 * failing.
3924 			 *
3925 			 * As a first pass, we can generate every combo,
3926 			 * and try reconstructing, ignoring any known
3927 			 * failures.  If any row has too many known + simulated
3928 			 * failures, then we bail on reconstructing with this
3929 			 * number of simulated failures.  As an improvement,
3930 			 * we could detect the number of whole known failures
3931 			 * (i.e. we have known failures on these disks for
3932 			 * every row; the disks never succeeded), and
3933 			 * subtract that from the max # failures to simulate.
3934 			 * We could go even further like the current
3935 			 * combrec code, but that doesn't seem like it
3936 			 * gains us very much.  If we simulate a failure
3937 			 * that is also a known failure, that's fine.
3938 			 */
3939 			zio->io_error = vdev_raidz_combrec(zio);
3940 			if (zio->io_error == ECKSUM &&
3941 			    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
3942 				vdev_raidz_io_done_unrecoverable(zio);
3943 			}
3944 		}
3945 	}
3946 done:
3947 	if (rm->rm_lr != NULL) {
3948 		zfs_rangelock_exit(rm->rm_lr);
3949 		rm->rm_lr = NULL;
3950 	}
3951 }
3952 
3953 static void
vdev_raidz_state_change(vdev_t * vd,int faulted,int degraded)3954 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
3955 {
3956 	vdev_raidz_t *vdrz = vd->vdev_tsd;
3957 	if (faulted > vdrz->vd_nparity)
3958 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3959 		    VDEV_AUX_NO_REPLICAS);
3960 	else if (degraded + faulted != 0)
3961 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
3962 	else
3963 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
3964 }
3965 
3966 /*
3967  * Determine if any portion of the provided block resides on a child vdev
3968  * with a dirty DTL and therefore needs to be resilvered.  The function
3969  * assumes that at least one DTL is dirty which implies that full stripe
3970  * width blocks must be resilvered.
3971  */
3972 static boolean_t
vdev_raidz_need_resilver(vdev_t * vd,const dva_t * dva,size_t psize,uint64_t phys_birth)3973 vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
3974     uint64_t phys_birth)
3975 {
3976 	vdev_raidz_t *vdrz = vd->vdev_tsd;
3977 
3978 	/*
3979 	 * If we're in the middle of a RAIDZ expansion, this block may be in
3980 	 * the old and/or new location.  For simplicity, always resilver it.
3981 	 */
3982 	if (vdrz->vn_vre.vre_state == DSS_SCANNING)
3983 		return (B_TRUE);
3984 
3985 	uint64_t dcols = vd->vdev_children;
3986 	uint64_t nparity = vdrz->vd_nparity;
3987 	uint64_t ashift = vd->vdev_top->vdev_ashift;
3988 	/* The starting RAIDZ (parent) vdev sector of the block. */
3989 	uint64_t b = DVA_GET_OFFSET(dva) >> ashift;
3990 	/* The zio's size in units of the vdev's minimum sector size. */
3991 	uint64_t s = ((psize - 1) >> ashift) + 1;
3992 	/* The first column for this stripe. */
3993 	uint64_t f = b % dcols;
3994 
3995 	/* Unreachable by sequential resilver. */
3996 	ASSERT3U(phys_birth, !=, TXG_UNKNOWN);
3997 
3998 	if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
3999 		return (B_FALSE);
4000 
4001 	if (s + nparity >= dcols)
4002 		return (B_TRUE);
4003 
4004 	for (uint64_t c = 0; c < s + nparity; c++) {
4005 		uint64_t devidx = (f + c) % dcols;
4006 		vdev_t *cvd = vd->vdev_child[devidx];
4007 
4008 		/*
4009 		 * dsl_scan_need_resilver() already checked vd with
4010 		 * vdev_dtl_contains(). So here just check cvd with
4011 		 * vdev_dtl_empty(), cheaper and a good approximation.
4012 		 */
4013 		if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
4014 			return (B_TRUE);
4015 	}
4016 
4017 	return (B_FALSE);
4018 }
4019 
4020 static void
vdev_raidz_xlate(vdev_t * cvd,const zfs_range_seg64_t * logical_rs,zfs_range_seg64_t * physical_rs,zfs_range_seg64_t * remain_rs)4021 vdev_raidz_xlate(vdev_t *cvd, const zfs_range_seg64_t *logical_rs,
4022     zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs)
4023 {
4024 	(void) remain_rs;
4025 
4026 	vdev_t *raidvd = cvd->vdev_parent;
4027 	ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
4028 
4029 	vdev_raidz_t *vdrz = raidvd->vdev_tsd;
4030 
4031 	if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
4032 		/*
4033 		 * We're in the middle of expansion, in which case the
4034 		 * translation is in flux.  Any answer we give may be wrong
4035 		 * by the time we return, so it isn't safe for the caller to
4036 		 * act on it.  Therefore we say that this range isn't present
4037 		 * on any children.  The only consumers of this are "zpool
4038 		 * initialize" and trimming, both of which are "best effort"
4039 		 * anyway.
4040 		 */
4041 		physical_rs->rs_start = physical_rs->rs_end = 0;
4042 		remain_rs->rs_start = remain_rs->rs_end = 0;
4043 		return;
4044 	}
4045 
4046 	uint64_t width = vdrz->vd_physical_width;
4047 	uint64_t tgt_col = cvd->vdev_id;
4048 	uint64_t ashift = raidvd->vdev_top->vdev_ashift;
4049 
4050 	/* make sure the offsets are block-aligned */
4051 	ASSERT0(logical_rs->rs_start % (1 << ashift));
4052 	ASSERT0(logical_rs->rs_end % (1 << ashift));
4053 	uint64_t b_start = logical_rs->rs_start >> ashift;
4054 	uint64_t b_end = logical_rs->rs_end >> ashift;
4055 
4056 	uint64_t start_row = 0;
4057 	if (b_start > tgt_col) /* avoid underflow */
4058 		start_row = ((b_start - tgt_col - 1) / width) + 1;
4059 
4060 	uint64_t end_row = 0;
4061 	if (b_end > tgt_col)
4062 		end_row = ((b_end - tgt_col - 1) / width) + 1;
4063 
4064 	physical_rs->rs_start = start_row << ashift;
4065 	physical_rs->rs_end = end_row << ashift;
4066 
4067 	ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start);
4068 	ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
4069 	    logical_rs->rs_end - logical_rs->rs_start);
4070 }
4071 
4072 static void
raidz_reflow_sync(void * arg,dmu_tx_t * tx)4073 raidz_reflow_sync(void *arg, dmu_tx_t *tx)
4074 {
4075 	spa_t *spa = arg;
4076 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4077 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4078 
4079 	/*
4080 	 * Ensure there are no i/os to the range that is being committed.
4081 	 */
4082 	uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock);
4083 	ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset);
4084 
4085 	mutex_enter(&vre->vre_lock);
4086 	uint64_t new_offset =
4087 	    MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset);
4088 	/*
4089 	 * We should not have committed anything that failed.
4090 	 */
4091 	VERIFY3U(vre->vre_failed_offset, >=, old_offset);
4092 	mutex_exit(&vre->vre_lock);
4093 
4094 	zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
4095 	    old_offset, new_offset - old_offset,
4096 	    RL_WRITER);
4097 
4098 	/*
4099 	 * Update the uberblock that will be written when this txg completes.
4100 	 */
4101 	RAIDZ_REFLOW_SET(&spa->spa_uberblock,
4102 	    RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset);
4103 	vre->vre_offset_pertxg[txgoff] = 0;
4104 	zfs_rangelock_exit(lr);
4105 
4106 	mutex_enter(&vre->vre_lock);
4107 	vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff];
4108 	vre->vre_bytes_copied_pertxg[txgoff] = 0;
4109 	mutex_exit(&vre->vre_lock);
4110 
4111 	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
4112 	VERIFY0(zap_update(spa->spa_meta_objset,
4113 	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
4114 	    sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx));
4115 }
4116 
4117 static void
raidz_reflow_complete_sync(void * arg,dmu_tx_t * tx)4118 raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx)
4119 {
4120 	spa_t *spa = arg;
4121 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4122 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4123 	vdev_raidz_t *vdrz = raidvd->vdev_tsd;
4124 
4125 	for (int i = 0; i < TXG_SIZE; i++)
4126 		VERIFY0(vre->vre_offset_pertxg[i]);
4127 
4128 	reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
4129 	re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES;
4130 	re->re_logical_width = vdrz->vd_physical_width;
4131 	mutex_enter(&vdrz->vd_expand_lock);
4132 	avl_add(&vdrz->vd_expand_txgs, re);
4133 	mutex_exit(&vdrz->vd_expand_lock);
4134 
4135 	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
4136 
4137 	/*
4138 	 * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS
4139 	 * will get written (based on vd_expand_txgs).
4140 	 */
4141 	vdev_config_dirty(vd);
4142 
4143 	/*
4144 	 * Before we change vre_state, the on-disk state must reflect that we
4145 	 * have completed all copying, so that vdev_raidz_io_start() can use
4146 	 * vre_state to determine if the reflow is in progress.  See also the
4147 	 * end of spa_raidz_expand_thread().
4148 	 */
4149 	VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==,
4150 	    raidvd->vdev_ms_count << raidvd->vdev_ms_shift);
4151 
4152 	vre->vre_end_time = gethrestime_sec();
4153 	vre->vre_state = DSS_FINISHED;
4154 
4155 	uint64_t state = vre->vre_state;
4156 	VERIFY0(zap_update(spa->spa_meta_objset,
4157 	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
4158 	    sizeof (state), 1, &state, tx));
4159 
4160 	uint64_t end_time = vre->vre_end_time;
4161 	VERIFY0(zap_update(spa->spa_meta_objset,
4162 	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
4163 	    sizeof (end_time), 1, &end_time, tx));
4164 
4165 	spa->spa_uberblock.ub_raidz_reflow_info = 0;
4166 
4167 	spa_history_log_internal(spa, "raidz vdev expansion completed",  tx,
4168 	    "%s vdev %llu new width %llu", spa_name(spa),
4169 	    (unsigned long long)vd->vdev_id,
4170 	    (unsigned long long)vd->vdev_children);
4171 
4172 	spa->spa_raidz_expand = NULL;
4173 	raidvd->vdev_rz_expanding = B_FALSE;
4174 
4175 	spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
4176 	spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
4177 	spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
4178 
4179 	spa_notify_waiters(spa);
4180 
4181 	/*
4182 	 * While we're in syncing context take the opportunity to
4183 	 * setup a scrub. All the data has been sucessfully copied
4184 	 * but we have not validated any checksums.
4185 	 */
4186 	setup_sync_arg_t setup_sync_arg = {
4187 		.func = POOL_SCAN_SCRUB,
4188 		.txgstart = 0,
4189 		.txgend = 0,
4190 	};
4191 	if (zfs_scrub_after_expand &&
4192 	    dsl_scan_setup_check(&setup_sync_arg.func, tx) == 0) {
4193 		dsl_scan_setup_sync(&setup_sync_arg, tx);
4194 	}
4195 }
4196 
4197 /*
4198  * State of one copy batch.
4199  */
4200 typedef struct raidz_reflow_arg {
4201 	vdev_raidz_expand_t *rra_vre;	/* Global expantion state. */
4202 	zfs_locked_range_t *rra_lr;	/* Range lock of this batch. */
4203 	uint64_t rra_txg;	/* TXG of this batch. */
4204 	uint_t rra_ashift;	/* Ashift of the vdev. */
4205 	uint32_t rra_tbd;	/* Number of in-flight ZIOs. */
4206 	uint32_t rra_writes;	/* Number of write ZIOs. */
4207 	zio_t *rra_zio[];	/* Write ZIO pointers. */
4208 } raidz_reflow_arg_t;
4209 
4210 /*
4211  * Write of the new location on one child is done.  Once all of them are done
4212  * we can unlock and free everything.
4213  */
4214 static void
raidz_reflow_write_done(zio_t * zio)4215 raidz_reflow_write_done(zio_t *zio)
4216 {
4217 	raidz_reflow_arg_t *rra = zio->io_private;
4218 	vdev_raidz_expand_t *vre = rra->rra_vre;
4219 
4220 	abd_free(zio->io_abd);
4221 
4222 	mutex_enter(&vre->vre_lock);
4223 	if (zio->io_error != 0) {
4224 		/* Force a reflow pause on errors */
4225 		vre->vre_failed_offset =
4226 		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
4227 	}
4228 	ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size);
4229 	vre->vre_outstanding_bytes -= zio->io_size;
4230 	if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length <
4231 	    vre->vre_failed_offset) {
4232 		vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] +=
4233 		    zio->io_size;
4234 	}
4235 	cv_signal(&vre->vre_cv);
4236 	boolean_t done = (--rra->rra_tbd == 0);
4237 	mutex_exit(&vre->vre_lock);
4238 
4239 	if (!done)
4240 		return;
4241 	spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
4242 	zfs_rangelock_exit(rra->rra_lr);
4243 	kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * rra->rra_writes);
4244 }
4245 
4246 /*
4247  * Read of the old location on one child is done.  Once all of them are done
4248  * writes should have all the data and we can issue them.
4249  */
4250 static void
raidz_reflow_read_done(zio_t * zio)4251 raidz_reflow_read_done(zio_t *zio)
4252 {
4253 	raidz_reflow_arg_t *rra = zio->io_private;
4254 	vdev_raidz_expand_t *vre = rra->rra_vre;
4255 
4256 	/* Reads of only one block use write ABDs.  For bigger free gangs. */
4257 	if (zio->io_size > (1 << rra->rra_ashift))
4258 		abd_free(zio->io_abd);
4259 
4260 	/*
4261 	 * If the read failed, or if it was done on a vdev that is not fully
4262 	 * healthy (e.g. a child that has a resilver in progress), we may not
4263 	 * have the correct data.  Note that it's OK if the write proceeds.
4264 	 * It may write garbage but the location is otherwise unused and we
4265 	 * will retry later due to vre_failed_offset.
4266 	 */
4267 	if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) {
4268 		zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu "
4269 		    "err=%u partial_dtl_empty=%u missing_dtl_empty=%u",
4270 		    (long long)rra->rra_lr->lr_offset,
4271 		    (long long)rra->rra_lr->lr_length,
4272 		    (long long)rra->rra_txg,
4273 		    zio->io_error,
4274 		    vdev_dtl_empty(zio->io_vd, DTL_PARTIAL),
4275 		    vdev_dtl_empty(zio->io_vd, DTL_MISSING));
4276 		mutex_enter(&vre->vre_lock);
4277 		/* Force a reflow pause on errors */
4278 		vre->vre_failed_offset =
4279 		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
4280 		mutex_exit(&vre->vre_lock);
4281 	}
4282 
4283 	if (atomic_dec_32_nv(&rra->rra_tbd) > 0)
4284 		return;
4285 	uint32_t writes = rra->rra_tbd = rra->rra_writes;
4286 	for (uint64_t i = 0; i < writes; i++)
4287 		zio_nowait(rra->rra_zio[i]);
4288 }
4289 
4290 static void
raidz_reflow_record_progress(vdev_raidz_expand_t * vre,uint64_t offset,dmu_tx_t * tx)4291 raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset,
4292     dmu_tx_t *tx)
4293 {
4294 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4295 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
4296 
4297 	if (offset == 0)
4298 		return;
4299 
4300 	mutex_enter(&vre->vre_lock);
4301 	ASSERT3U(vre->vre_offset, <=, offset);
4302 	vre->vre_offset = offset;
4303 	mutex_exit(&vre->vre_lock);
4304 
4305 	if (vre->vre_offset_pertxg[txgoff] == 0) {
4306 		dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync,
4307 		    spa, tx);
4308 	}
4309 	vre->vre_offset_pertxg[txgoff] = offset;
4310 }
4311 
4312 static boolean_t
vdev_raidz_expand_child_replacing(vdev_t * raidz_vd)4313 vdev_raidz_expand_child_replacing(vdev_t *raidz_vd)
4314 {
4315 	for (int i = 0; i < raidz_vd->vdev_children; i++) {
4316 		/* Quick check if a child is being replaced */
4317 		if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf)
4318 			return (B_TRUE);
4319 	}
4320 	return (B_FALSE);
4321 }
4322 
4323 static boolean_t
raidz_reflow_impl(vdev_t * vd,vdev_raidz_expand_t * vre,zfs_range_tree_t * rt,dmu_tx_t * tx)4324 raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, zfs_range_tree_t *rt,
4325     dmu_tx_t *tx)
4326 {
4327 	spa_t *spa = vd->vdev_spa;
4328 	uint_t ashift = vd->vdev_top->vdev_ashift;
4329 
4330 	zfs_range_seg_t *rs = zfs_range_tree_first(rt);
4331 	if (rt == NULL)
4332 		return (B_FALSE);
4333 	uint64_t offset = zfs_rs_get_start(rs, rt);
4334 	ASSERT(IS_P2ALIGNED(offset, 1 << ashift));
4335 	uint64_t size = zfs_rs_get_end(rs, rt) - offset;
4336 	ASSERT3U(size, >=, 1 << ashift);
4337 	ASSERT(IS_P2ALIGNED(size, 1 << ashift));
4338 
4339 	uint64_t blkid = offset >> ashift;
4340 	uint_t old_children = vd->vdev_children - 1;
4341 
4342 	/*
4343 	 * We can only progress to the point that writes will not overlap
4344 	 * with blocks whose progress has not yet been recorded on disk.
4345 	 * Since partially-copied rows are still read from the old location,
4346 	 * we need to stop one row before the sector-wise overlap, to prevent
4347 	 * row-wise overlap.
4348 	 *
4349 	 * Note that even if we are skipping over a large unallocated region,
4350 	 * we can't move the on-disk progress to `offset`, because concurrent
4351 	 * writes/allocations could still use the currently-unallocated
4352 	 * region.
4353 	 */
4354 	uint64_t ubsync_blkid =
4355 	    RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift;
4356 	uint64_t next_overwrite_blkid = ubsync_blkid +
4357 	    ubsync_blkid / old_children - old_children;
4358 	VERIFY3U(next_overwrite_blkid, >, ubsync_blkid);
4359 	if (blkid >= next_overwrite_blkid) {
4360 		raidz_reflow_record_progress(vre,
4361 		    next_overwrite_blkid << ashift, tx);
4362 		return (B_TRUE);
4363 	}
4364 
4365 	size = MIN(size, raidz_expand_max_copy_bytes);
4366 	size = MIN(size, (uint64_t)old_children *
4367 	    MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE));
4368 	size = MAX(size, 1 << ashift);
4369 	uint_t blocks = MIN(size >> ashift, next_overwrite_blkid - blkid);
4370 	size = (uint64_t)blocks << ashift;
4371 
4372 	zfs_range_tree_remove(rt, offset, size);
4373 
4374 	uint_t reads = MIN(blocks, old_children);
4375 	uint_t writes = MIN(blocks, vd->vdev_children);
4376 	raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra) +
4377 	    sizeof (zio_t *) * writes, KM_SLEEP);
4378 	rra->rra_vre = vre;
4379 	rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock,
4380 	    offset, size, RL_WRITER);
4381 	rra->rra_txg = dmu_tx_get_txg(tx);
4382 	rra->rra_ashift = ashift;
4383 	rra->rra_tbd = reads;
4384 	rra->rra_writes = writes;
4385 
4386 	raidz_reflow_record_progress(vre, offset + size, tx);
4387 
4388 	/*
4389 	 * SCL_STATE will be released when the read and write are done,
4390 	 * by raidz_reflow_write_done().
4391 	 */
4392 	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
4393 
4394 	/* check if a replacing vdev was added, if so treat it as an error */
4395 	if (vdev_raidz_expand_child_replacing(vd)) {
4396 		zfs_dbgmsg("replacing vdev encountered, reflow paused at "
4397 		    "offset=%llu txg=%llu",
4398 		    (long long)rra->rra_lr->lr_offset,
4399 		    (long long)rra->rra_txg);
4400 
4401 		mutex_enter(&vre->vre_lock);
4402 		vre->vre_failed_offset =
4403 		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
4404 		cv_signal(&vre->vre_cv);
4405 		mutex_exit(&vre->vre_lock);
4406 
4407 		/* drop everything we acquired */
4408 		spa_config_exit(spa, SCL_STATE, spa);
4409 		zfs_rangelock_exit(rra->rra_lr);
4410 		kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * writes);
4411 		return (B_TRUE);
4412 	}
4413 
4414 	mutex_enter(&vre->vre_lock);
4415 	vre->vre_outstanding_bytes += size;
4416 	mutex_exit(&vre->vre_lock);
4417 
4418 	/* Allocate ABD and ZIO for each child we write. */
4419 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4420 	zio_t *pio = spa->spa_txg_zio[txgoff];
4421 	uint_t b = blocks / vd->vdev_children;
4422 	uint_t bb = blocks % vd->vdev_children;
4423 	for (uint_t i = 0; i < writes; i++) {
4424 		uint_t n = b + (i < bb);
4425 		abd_t *abd = abd_alloc_for_io(n << ashift, B_FALSE);
4426 		rra->rra_zio[i] = zio_vdev_child_io(pio, NULL,
4427 		    vd->vdev_child[(blkid + i) % vd->vdev_children],
4428 		    ((blkid + i) / vd->vdev_children) << ashift,
4429 		    abd, n << ashift, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
4430 		    ZIO_FLAG_CANFAIL, raidz_reflow_write_done, rra);
4431 	}
4432 
4433 	/*
4434 	 * Allocate and issue ZIO for each child we read.  For reads of only
4435 	 * one block we can use respective writer ABDs, since they will also
4436 	 * have only one block.  For bigger reads create gang ABDs and fill
4437 	 * them with respective blocks from writer ABDs.
4438 	 */
4439 	b = blocks / old_children;
4440 	bb = blocks % old_children;
4441 	for (uint_t i = 0; i < reads; i++) {
4442 		uint_t n = b + (i < bb);
4443 		abd_t *abd;
4444 		if (n > 1) {
4445 			abd = abd_alloc_gang();
4446 			for (uint_t j = 0; j < n; j++) {
4447 				uint_t b = j * old_children + i;
4448 				abd_t *cabd = abd_get_offset_size(
4449 				    rra->rra_zio[b % vd->vdev_children]->io_abd,
4450 				    (b / vd->vdev_children) << ashift,
4451 				    1 << ashift);
4452 				abd_gang_add(abd, cabd, B_TRUE);
4453 			}
4454 		} else {
4455 			abd = rra->rra_zio[i]->io_abd;
4456 		}
4457 		zio_nowait(zio_vdev_child_io(pio, NULL,
4458 		    vd->vdev_child[(blkid + i) % old_children],
4459 		    ((blkid + i) / old_children) << ashift, abd,
4460 		    n << ashift, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
4461 		    ZIO_FLAG_CANFAIL, raidz_reflow_read_done, rra));
4462 	}
4463 
4464 	return (B_FALSE);
4465 }
4466 
4467 /*
4468  * For testing (ztest specific)
4469  */
4470 static void
raidz_expand_pause(uint_t pause_point)4471 raidz_expand_pause(uint_t pause_point)
4472 {
4473 	while (raidz_expand_pause_point != 0 &&
4474 	    raidz_expand_pause_point <= pause_point)
4475 		delay(hz);
4476 }
4477 
4478 static void
raidz_scratch_child_done(zio_t * zio)4479 raidz_scratch_child_done(zio_t *zio)
4480 {
4481 	zio_t *pio = zio->io_private;
4482 
4483 	mutex_enter(&pio->io_lock);
4484 	pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
4485 	mutex_exit(&pio->io_lock);
4486 }
4487 
4488 /*
4489  * Reflow the beginning portion of the vdev into an intermediate scratch area
4490  * in memory and on disk. This operation must be persisted on disk before we
4491  * proceed to overwrite the beginning portion with the reflowed data.
4492  *
4493  * This multi-step task can fail to complete if disk errors are encountered
4494  * and we can return here after a pause (waiting for disk to become healthy).
4495  */
4496 static void
raidz_reflow_scratch_sync(void * arg,dmu_tx_t * tx)4497 raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
4498 {
4499 	vdev_raidz_expand_t *vre = arg;
4500 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
4501 	zio_t *pio;
4502 	int error;
4503 
4504 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4505 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4506 	int ashift = raidvd->vdev_ashift;
4507 	uint64_t write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << ashift,
4508 	    uint64_t);
4509 	uint64_t logical_size = write_size * raidvd->vdev_children;
4510 	uint64_t read_size =
4511 	    P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)),
4512 	    1 << ashift);
4513 
4514 	/*
4515 	 * The scratch space must be large enough to get us to the point
4516 	 * that one row does not overlap itself when moved.  This is checked
4517 	 * by vdev_raidz_attach_check().
4518 	 */
4519 	VERIFY3U(write_size, >=, raidvd->vdev_children << ashift);
4520 	VERIFY3U(write_size, <=, VDEV_BOOT_SIZE);
4521 	VERIFY3U(write_size, <=, read_size);
4522 
4523 	zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
4524 	    0, logical_size, RL_WRITER);
4525 
4526 	abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
4527 	    KM_SLEEP);
4528 	for (int i = 0; i < raidvd->vdev_children; i++) {
4529 		abds[i] = abd_alloc_linear(read_size, B_FALSE);
4530 	}
4531 
4532 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1);
4533 
4534 	/*
4535 	 * If we have already written the scratch area then we must read from
4536 	 * there, since new writes were redirected there while we were paused
4537 	 * or the original location may have been partially overwritten with
4538 	 * reflowed data.
4539 	 */
4540 	if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) {
4541 		VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size);
4542 		/*
4543 		 * Read from scratch space.
4544 		 */
4545 		pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4546 		for (int i = 0; i < raidvd->vdev_children; i++) {
4547 			/*
4548 			 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE
4549 			 * to the offset to calculate the physical offset to
4550 			 * write to.  Passing in a negative offset makes us
4551 			 * access the scratch area.
4552 			 */
4553 			zio_nowait(zio_vdev_child_io(pio, NULL,
4554 			    raidvd->vdev_child[i],
4555 			    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4556 			    write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
4557 			    ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
4558 		}
4559 		error = zio_wait(pio);
4560 		if (error != 0) {
4561 			zfs_dbgmsg("reflow: error %d reading scratch location",
4562 			    error);
4563 			goto io_error_exit;
4564 		}
4565 		goto overwrite;
4566 	}
4567 
4568 	/*
4569 	 * Read from original location.
4570 	 */
4571 	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4572 	for (int i = 0; i < raidvd->vdev_children - 1; i++) {
4573 		ASSERT0(vdev_is_dead(raidvd->vdev_child[i]));
4574 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4575 		    0, abds[i], read_size, ZIO_TYPE_READ,
4576 		    ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
4577 		    raidz_scratch_child_done, pio));
4578 	}
4579 	error = zio_wait(pio);
4580 	if (error != 0) {
4581 		zfs_dbgmsg("reflow: error %d reading original location", error);
4582 io_error_exit:
4583 		for (int i = 0; i < raidvd->vdev_children; i++)
4584 			abd_free(abds[i]);
4585 		kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4586 		zfs_rangelock_exit(lr);
4587 		spa_config_exit(spa, SCL_STATE, FTAG);
4588 		return;
4589 	}
4590 
4591 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2);
4592 
4593 	/*
4594 	 * Reflow in memory.
4595 	 */
4596 	uint64_t logical_sectors = logical_size >> ashift;
4597 	for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) {
4598 		int oldchild = i % (raidvd->vdev_children - 1);
4599 		uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift;
4600 
4601 		int newchild = i % raidvd->vdev_children;
4602 		uint64_t newoff = (i / raidvd->vdev_children) << ashift;
4603 
4604 		/* a single sector should not be copying over itself */
4605 		ASSERT(!(newchild == oldchild && newoff == oldoff));
4606 
4607 		abd_copy_off(abds[newchild], abds[oldchild],
4608 		    newoff, oldoff, 1 << ashift);
4609 	}
4610 
4611 	/*
4612 	 * Verify that we filled in everything we intended to (write_size on
4613 	 * each child).
4614 	 */
4615 	VERIFY0(logical_sectors % raidvd->vdev_children);
4616 	VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==,
4617 	    write_size);
4618 
4619 	/*
4620 	 * Write to scratch location (boot area).
4621 	 */
4622 	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4623 	for (int i = 0; i < raidvd->vdev_children; i++) {
4624 		/*
4625 		 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
4626 		 * the offset to calculate the physical offset to write to.
4627 		 * Passing in a negative offset lets us access the boot area.
4628 		 */
4629 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4630 		    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4631 		    write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
4632 		    ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
4633 	}
4634 	error = zio_wait(pio);
4635 	if (error != 0) {
4636 		zfs_dbgmsg("reflow: error %d writing scratch location", error);
4637 		goto io_error_exit;
4638 	}
4639 	pio = zio_root(spa, NULL, NULL, 0);
4640 	zio_flush(pio, raidvd);
4641 	zio_wait(pio);
4642 
4643 	zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area",
4644 	    (long long)logical_size);
4645 
4646 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3);
4647 
4648 	/*
4649 	 * Update uberblock to indicate that scratch space is valid.  This is
4650 	 * needed because after this point, the real location may be
4651 	 * overwritten.  If we crash, we need to get the data from the
4652 	 * scratch space, rather than the real location.
4653 	 *
4654 	 * Note: ub_timestamp is bumped so that vdev_uberblock_compare()
4655 	 * will prefer this uberblock.
4656 	 */
4657 	RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size);
4658 	spa->spa_ubsync.ub_timestamp++;
4659 	ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4660 	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4661 	if (spa_multihost(spa))
4662 		mmp_update_uberblock(spa, &spa->spa_ubsync);
4663 
4664 	zfs_dbgmsg("reflow: uberblock updated "
4665 	    "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)",
4666 	    (long long)spa->spa_ubsync.ub_txg,
4667 	    (long long)logical_size,
4668 	    (long long)spa->spa_ubsync.ub_timestamp);
4669 
4670 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID);
4671 
4672 	/*
4673 	 * Overwrite with reflow'ed data.
4674 	 */
4675 overwrite:
4676 	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4677 	for (int i = 0; i < raidvd->vdev_children; i++) {
4678 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4679 		    0, abds[i], write_size, ZIO_TYPE_WRITE,
4680 		    ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
4681 		    raidz_scratch_child_done, pio));
4682 	}
4683 	error = zio_wait(pio);
4684 	if (error != 0) {
4685 		/*
4686 		 * When we exit early here and drop the range lock, new
4687 		 * writes will go into the scratch area so we'll need to
4688 		 * read from there when we return after pausing.
4689 		 */
4690 		zfs_dbgmsg("reflow: error %d writing real location", error);
4691 		/*
4692 		 * Update the uberblock that is written when this txg completes.
4693 		 */
4694 		RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID,
4695 		    logical_size);
4696 		goto io_error_exit;
4697 	}
4698 	pio = zio_root(spa, NULL, NULL, 0);
4699 	zio_flush(pio, raidvd);
4700 	zio_wait(pio);
4701 
4702 	zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location",
4703 	    (long long)logical_size);
4704 	for (int i = 0; i < raidvd->vdev_children; i++)
4705 		abd_free(abds[i]);
4706 	kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4707 
4708 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED);
4709 
4710 	/*
4711 	 * Update uberblock to indicate that the initial part has been
4712 	 * reflow'ed.  This is needed because after this point (when we exit
4713 	 * the rangelock), we allow regular writes to this region, which will
4714 	 * be written to the new location only (because reflow_offset_next ==
4715 	 * reflow_offset_synced).  If we crashed and re-copied from the
4716 	 * scratch space, we would lose the regular writes.
4717 	 */
4718 	RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED,
4719 	    logical_size);
4720 	spa->spa_ubsync.ub_timestamp++;
4721 	ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4722 	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4723 	if (spa_multihost(spa))
4724 		mmp_update_uberblock(spa, &spa->spa_ubsync);
4725 
4726 	zfs_dbgmsg("reflow: uberblock updated "
4727 	    "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
4728 	    (long long)spa->spa_ubsync.ub_txg,
4729 	    (long long)logical_size,
4730 	    (long long)spa->spa_ubsync.ub_timestamp);
4731 
4732 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1);
4733 
4734 	/*
4735 	 * Update progress.
4736 	 */
4737 	vre->vre_offset = logical_size;
4738 	zfs_rangelock_exit(lr);
4739 	spa_config_exit(spa, SCL_STATE, FTAG);
4740 
4741 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4742 	vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
4743 	vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
4744 	/*
4745 	 * Note - raidz_reflow_sync() will update the uberblock state to
4746 	 * RRSS_SCRATCH_INVALID_SYNCED_REFLOW
4747 	 */
4748 	raidz_reflow_sync(spa, tx);
4749 
4750 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2);
4751 }
4752 
4753 /*
4754  * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work
4755  * here.  No other i/o can be in progress, so we don't need the vre_rangelock.
4756  */
4757 void
vdev_raidz_reflow_copy_scratch(spa_t * spa)4758 vdev_raidz_reflow_copy_scratch(spa_t *spa)
4759 {
4760 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4761 	uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock);
4762 	ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID);
4763 
4764 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4765 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4766 	ASSERT0(logical_size % raidvd->vdev_children);
4767 	uint64_t write_size = logical_size / raidvd->vdev_children;
4768 
4769 	zio_t *pio;
4770 
4771 	/*
4772 	 * Read from scratch space.
4773 	 */
4774 	abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
4775 	    KM_SLEEP);
4776 	for (int i = 0; i < raidvd->vdev_children; i++) {
4777 		abds[i] = abd_alloc_linear(write_size, B_FALSE);
4778 	}
4779 
4780 	pio = zio_root(spa, NULL, NULL, 0);
4781 	for (int i = 0; i < raidvd->vdev_children; i++) {
4782 		/*
4783 		 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
4784 		 * the offset to calculate the physical offset to write to.
4785 		 * Passing in a negative offset lets us access the boot area.
4786 		 */
4787 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4788 		    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4789 		    write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 0,
4790 		    raidz_scratch_child_done, pio));
4791 	}
4792 	zio_wait(pio);
4793 
4794 	/*
4795 	 * Overwrite real location with reflow'ed data.
4796 	 */
4797 	pio = zio_root(spa, NULL, NULL, 0);
4798 	for (int i = 0; i < raidvd->vdev_children; i++) {
4799 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4800 		    0, abds[i], write_size, ZIO_TYPE_WRITE,
4801 		    ZIO_PRIORITY_REMOVAL, 0,
4802 		    raidz_scratch_child_done, pio));
4803 	}
4804 	zio_wait(pio);
4805 	pio = zio_root(spa, NULL, NULL, 0);
4806 	zio_flush(pio, raidvd);
4807 	zio_wait(pio);
4808 
4809 	zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) "
4810 	    "to real location", (long long)logical_size);
4811 
4812 	for (int i = 0; i < raidvd->vdev_children; i++)
4813 		abd_free(abds[i]);
4814 	kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4815 
4816 	/*
4817 	 * Update uberblock.
4818 	 */
4819 	RAIDZ_REFLOW_SET(&spa->spa_ubsync,
4820 	    RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size);
4821 	spa->spa_ubsync.ub_timestamp++;
4822 	VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4823 	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4824 	if (spa_multihost(spa))
4825 		mmp_update_uberblock(spa, &spa->spa_ubsync);
4826 
4827 	zfs_dbgmsg("reflow recovery: uberblock updated "
4828 	    "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
4829 	    (long long)spa->spa_ubsync.ub_txg,
4830 	    (long long)logical_size,
4831 	    (long long)spa->spa_ubsync.ub_timestamp);
4832 
4833 	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool,
4834 	    spa_first_txg(spa));
4835 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4836 	vre->vre_offset = logical_size;
4837 	vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
4838 	vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
4839 	/*
4840 	 * Note that raidz_reflow_sync() will update the uberblock once more
4841 	 */
4842 	raidz_reflow_sync(spa, tx);
4843 
4844 	dmu_tx_commit(tx);
4845 
4846 	spa_config_exit(spa, SCL_STATE, FTAG);
4847 }
4848 
4849 static boolean_t
spa_raidz_expand_thread_check(void * arg,zthr_t * zthr)4850 spa_raidz_expand_thread_check(void *arg, zthr_t *zthr)
4851 {
4852 	(void) zthr;
4853 	spa_t *spa = arg;
4854 
4855 	return (spa->spa_raidz_expand != NULL &&
4856 	    !spa->spa_raidz_expand->vre_waiting_for_resilver);
4857 }
4858 
4859 /*
4860  * RAIDZ expansion background thread
4861  *
4862  * Can be called multiple times if the reflow is paused
4863  */
4864 static void
spa_raidz_expand_thread(void * arg,zthr_t * zthr)4865 spa_raidz_expand_thread(void *arg, zthr_t *zthr)
4866 {
4867 	spa_t *spa = arg;
4868 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4869 
4870 	if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID)
4871 		vre->vre_offset = 0;
4872 	else
4873 		vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync);
4874 
4875 	/* Reflow the begining portion using the scratch area */
4876 	if (vre->vre_offset == 0) {
4877 		VERIFY0(dsl_sync_task(spa_name(spa),
4878 		    NULL, raidz_reflow_scratch_sync,
4879 		    vre, 0, ZFS_SPACE_CHECK_NONE));
4880 
4881 		/* if we encountered errors then pause */
4882 		if (vre->vre_offset == 0) {
4883 			mutex_enter(&vre->vre_lock);
4884 			vre->vre_waiting_for_resilver = B_TRUE;
4885 			mutex_exit(&vre->vre_lock);
4886 			return;
4887 		}
4888 	}
4889 
4890 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4891 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4892 
4893 	uint64_t guid = raidvd->vdev_guid;
4894 
4895 	/* Iterate over all the remaining metaslabs */
4896 	for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift;
4897 	    i < raidvd->vdev_ms_count &&
4898 	    !zthr_iscancelled(zthr) &&
4899 	    vre->vre_failed_offset == UINT64_MAX; i++) {
4900 		metaslab_t *msp = raidvd->vdev_ms[i];
4901 
4902 		metaslab_disable(msp);
4903 		mutex_enter(&msp->ms_lock);
4904 
4905 		/*
4906 		 * The metaslab may be newly created (for the expanded
4907 		 * space), in which case its trees won't exist yet,
4908 		 * so we need to bail out early.
4909 		 */
4910 		if (msp->ms_new) {
4911 			mutex_exit(&msp->ms_lock);
4912 			metaslab_enable(msp, B_FALSE, B_FALSE);
4913 			continue;
4914 		}
4915 
4916 		VERIFY0(metaslab_load(msp));
4917 
4918 		/*
4919 		 * We want to copy everything except the free (allocatable)
4920 		 * space.  Note that there may be a little bit more free
4921 		 * space (e.g. in ms_defer), and it's fine to copy that too.
4922 		 */
4923 		uint64_t shift, start;
4924 		zfs_range_seg_type_t type = metaslab_calculate_range_tree_type(
4925 		    raidvd, msp, &start, &shift);
4926 		zfs_range_tree_t *rt = zfs_range_tree_create_flags(
4927 		    NULL, type, NULL, start, shift, ZFS_RT_F_DYN_NAME,
4928 		    metaslab_rt_name(msp->ms_group, msp,
4929 		    "spa_raidz_expand_thread:rt"));
4930 		zfs_range_tree_add(rt, msp->ms_start, msp->ms_size);
4931 		zfs_range_tree_walk(msp->ms_allocatable, zfs_range_tree_remove,
4932 		    rt);
4933 		mutex_exit(&msp->ms_lock);
4934 
4935 		/*
4936 		 * Force the last sector of each metaslab to be copied.  This
4937 		 * ensures that we advance the on-disk progress to the end of
4938 		 * this metaslab while the metaslab is disabled.  Otherwise, we
4939 		 * could move past this metaslab without advancing the on-disk
4940 		 * progress, and then an allocation to this metaslab would not
4941 		 * be copied.
4942 		 */
4943 		int sectorsz = 1 << raidvd->vdev_ashift;
4944 		uint64_t ms_last_offset = msp->ms_start +
4945 		    msp->ms_size - sectorsz;
4946 		if (!zfs_range_tree_contains(rt, ms_last_offset, sectorsz)) {
4947 			zfs_range_tree_add(rt, ms_last_offset, sectorsz);
4948 		}
4949 
4950 		/*
4951 		 * When we are resuming from a paused expansion (i.e.
4952 		 * when importing a pool with a expansion in progress),
4953 		 * discard any state that we have already processed.
4954 		 */
4955 		if (vre->vre_offset > msp->ms_start) {
4956 			zfs_range_tree_clear(rt, msp->ms_start,
4957 			    vre->vre_offset - msp->ms_start);
4958 		}
4959 
4960 		while (!zthr_iscancelled(zthr) &&
4961 		    !zfs_range_tree_is_empty(rt) &&
4962 		    vre->vre_failed_offset == UINT64_MAX) {
4963 
4964 			/*
4965 			 * We need to periodically drop the config lock so that
4966 			 * writers can get in.  Additionally, we can't wait
4967 			 * for a txg to sync while holding a config lock
4968 			 * (since a waiting writer could cause a 3-way deadlock
4969 			 * with the sync thread, which also gets a config
4970 			 * lock for reader).  So we can't hold the config lock
4971 			 * while calling dmu_tx_assign().
4972 			 */
4973 			spa_config_exit(spa, SCL_CONFIG, FTAG);
4974 
4975 			/*
4976 			 * If requested, pause the reflow when the amount
4977 			 * specified by raidz_expand_max_reflow_bytes is reached
4978 			 *
4979 			 * This pause is only used during testing or debugging.
4980 			 */
4981 			while (raidz_expand_max_reflow_bytes != 0 &&
4982 			    raidz_expand_max_reflow_bytes <=
4983 			    vre->vre_bytes_copied && !zthr_iscancelled(zthr)) {
4984 				delay(hz);
4985 			}
4986 
4987 			mutex_enter(&vre->vre_lock);
4988 			while (vre->vre_outstanding_bytes >
4989 			    raidz_expand_max_copy_bytes) {
4990 				cv_wait(&vre->vre_cv, &vre->vre_lock);
4991 			}
4992 			mutex_exit(&vre->vre_lock);
4993 
4994 			dmu_tx_t *tx =
4995 			    dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
4996 
4997 			VERIFY0(dmu_tx_assign(tx,
4998 			    DMU_TX_WAIT | DMU_TX_SUSPEND));
4999 			uint64_t txg = dmu_tx_get_txg(tx);
5000 
5001 			/*
5002 			 * Reacquire the vdev_config lock.  Theoretically, the
5003 			 * vdev_t that we're expanding may have changed.
5004 			 */
5005 			spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
5006 			raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
5007 
5008 			boolean_t needsync =
5009 			    raidz_reflow_impl(raidvd, vre, rt, tx);
5010 
5011 			dmu_tx_commit(tx);
5012 
5013 			if (needsync) {
5014 				spa_config_exit(spa, SCL_CONFIG, FTAG);
5015 				txg_wait_synced(spa->spa_dsl_pool, txg);
5016 				spa_config_enter(spa, SCL_CONFIG, FTAG,
5017 				    RW_READER);
5018 			}
5019 		}
5020 
5021 		spa_config_exit(spa, SCL_CONFIG, FTAG);
5022 
5023 		metaslab_enable(msp, B_FALSE, B_FALSE);
5024 		zfs_range_tree_vacate(rt, NULL, NULL);
5025 		zfs_range_tree_destroy(rt);
5026 
5027 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
5028 		raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
5029 	}
5030 
5031 	spa_config_exit(spa, SCL_CONFIG, FTAG);
5032 
5033 	/*
5034 	 * The txg_wait_synced() here ensures that all reflow zio's have
5035 	 * completed, and vre_failed_offset has been set if necessary.  It
5036 	 * also ensures that the progress of the last raidz_reflow_sync() is
5037 	 * written to disk before raidz_reflow_complete_sync() changes the
5038 	 * in-memory vre_state.  vdev_raidz_io_start() uses vre_state to
5039 	 * determine if a reflow is in progress, in which case we may need to
5040 	 * write to both old and new locations.  Therefore we can only change
5041 	 * vre_state once this is not necessary, which is once the on-disk
5042 	 * progress (in spa_ubsync) has been set past any possible writes (to
5043 	 * the end of the last metaslab).
5044 	 */
5045 	txg_wait_synced(spa->spa_dsl_pool, 0);
5046 
5047 	if (!zthr_iscancelled(zthr) &&
5048 	    vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) {
5049 		/*
5050 		 * We are not being canceled or paused, so the reflow must be
5051 		 * complete. In that case also mark it as completed on disk.
5052 		 */
5053 		ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX);
5054 		VERIFY0(dsl_sync_task(spa_name(spa), NULL,
5055 		    raidz_reflow_complete_sync, spa,
5056 		    0, ZFS_SPACE_CHECK_NONE));
5057 		(void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL);
5058 	} else {
5059 		/*
5060 		 * Wait for all copy zio's to complete and for all the
5061 		 * raidz_reflow_sync() synctasks to be run.
5062 		 */
5063 		spa_history_log_internal(spa, "reflow pause",
5064 		    NULL, "offset=%llu failed_offset=%lld",
5065 		    (long long)vre->vre_offset,
5066 		    (long long)vre->vre_failed_offset);
5067 		mutex_enter(&vre->vre_lock);
5068 		if (vre->vre_failed_offset != UINT64_MAX) {
5069 			/*
5070 			 * Reset progress so that we will retry everything
5071 			 * after the point that something failed.
5072 			 */
5073 			vre->vre_offset = vre->vre_failed_offset;
5074 			vre->vre_failed_offset = UINT64_MAX;
5075 			vre->vre_waiting_for_resilver = B_TRUE;
5076 		}
5077 		mutex_exit(&vre->vre_lock);
5078 	}
5079 }
5080 
5081 void
spa_start_raidz_expansion_thread(spa_t * spa)5082 spa_start_raidz_expansion_thread(spa_t *spa)
5083 {
5084 	ASSERT0P(spa->spa_raidz_expand_zthr);
5085 	spa->spa_raidz_expand_zthr = zthr_create("raidz_expand",
5086 	    spa_raidz_expand_thread_check, spa_raidz_expand_thread,
5087 	    spa, defclsyspri);
5088 }
5089 
5090 void
raidz_dtl_reassessed(vdev_t * vd)5091 raidz_dtl_reassessed(vdev_t *vd)
5092 {
5093 	spa_t *spa = vd->vdev_spa;
5094 	if (spa->spa_raidz_expand != NULL) {
5095 		vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
5096 		/*
5097 		 * we get called often from vdev_dtl_reassess() so make
5098 		 * sure it's our vdev and any replacing is complete
5099 		 */
5100 		if (vd->vdev_top->vdev_id == vre->vre_vdev_id &&
5101 		    !vdev_raidz_expand_child_replacing(vd->vdev_top)) {
5102 			mutex_enter(&vre->vre_lock);
5103 			if (vre->vre_waiting_for_resilver) {
5104 				vdev_dbgmsg(vd, "DTL reassessed, "
5105 				    "continuing raidz expansion");
5106 				vre->vre_waiting_for_resilver = B_FALSE;
5107 				zthr_wakeup(spa->spa_raidz_expand_zthr);
5108 			}
5109 			mutex_exit(&vre->vre_lock);
5110 		}
5111 	}
5112 }
5113 
5114 int
vdev_raidz_attach_check(vdev_t * new_child)5115 vdev_raidz_attach_check(vdev_t *new_child)
5116 {
5117 	vdev_t *raidvd = new_child->vdev_parent;
5118 	uint64_t new_children = raidvd->vdev_children;
5119 
5120 	/*
5121 	 * We use the "boot" space as scratch space to handle overwriting the
5122 	 * initial part of the vdev.  If it is too small, then this expansion
5123 	 * is not allowed.  This would be very unusual (e.g. ashift > 13 and
5124 	 * >200 children).
5125 	 */
5126 	if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) {
5127 		return (EINVAL);
5128 	}
5129 	return (0);
5130 }
5131 
5132 void
vdev_raidz_attach_sync(void * arg,dmu_tx_t * tx)5133 vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx)
5134 {
5135 	vdev_t *new_child = arg;
5136 	spa_t *spa = new_child->vdev_spa;
5137 	vdev_t *raidvd = new_child->vdev_parent;
5138 	vdev_raidz_t *vdrz = raidvd->vdev_tsd;
5139 	ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops);
5140 	ASSERT3P(raidvd->vdev_top, ==, raidvd);
5141 	ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width);
5142 	ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1);
5143 	ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==,
5144 	    new_child);
5145 
5146 	spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx);
5147 
5148 	vdrz->vd_physical_width++;
5149 
5150 	VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info);
5151 	vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id;
5152 	vdrz->vn_vre.vre_offset = 0;
5153 	vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
5154 	spa->spa_raidz_expand = &vdrz->vn_vre;
5155 	zthr_wakeup(spa->spa_raidz_expand_zthr);
5156 
5157 	/*
5158 	 * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get
5159 	 * written to the config.
5160 	 */
5161 	vdev_config_dirty(raidvd);
5162 
5163 	vdrz->vn_vre.vre_start_time = gethrestime_sec();
5164 	vdrz->vn_vre.vre_end_time = 0;
5165 	vdrz->vn_vre.vre_state = DSS_SCANNING;
5166 	vdrz->vn_vre.vre_bytes_copied = 0;
5167 
5168 	uint64_t state = vdrz->vn_vre.vre_state;
5169 	VERIFY0(zap_update(spa->spa_meta_objset,
5170 	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
5171 	    sizeof (state), 1, &state, tx));
5172 
5173 	uint64_t start_time = vdrz->vn_vre.vre_start_time;
5174 	VERIFY0(zap_update(spa->spa_meta_objset,
5175 	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
5176 	    sizeof (start_time), 1, &start_time, tx));
5177 
5178 	(void) zap_remove(spa->spa_meta_objset,
5179 	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx);
5180 	(void) zap_remove(spa->spa_meta_objset,
5181 	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx);
5182 
5183 	spa_history_log_internal(spa, "raidz vdev expansion started",  tx,
5184 	    "%s vdev %llu new width %llu", spa_name(spa),
5185 	    (unsigned long long)raidvd->vdev_id,
5186 	    (unsigned long long)raidvd->vdev_children);
5187 }
5188 
5189 int
vdev_raidz_load(vdev_t * vd)5190 vdev_raidz_load(vdev_t *vd)
5191 {
5192 	vdev_raidz_t *vdrz = vd->vdev_tsd;
5193 	int err;
5194 
5195 	uint64_t state = DSS_NONE;
5196 	uint64_t start_time = 0;
5197 	uint64_t end_time = 0;
5198 	uint64_t bytes_copied = 0;
5199 
5200 	if (vd->vdev_top_zap != 0) {
5201 		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
5202 		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
5203 		    sizeof (state), 1, &state);
5204 		if (err != 0 && err != ENOENT)
5205 			return (err);
5206 
5207 		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
5208 		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
5209 		    sizeof (start_time), 1, &start_time);
5210 		if (err != 0 && err != ENOENT)
5211 			return (err);
5212 
5213 		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
5214 		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
5215 		    sizeof (end_time), 1, &end_time);
5216 		if (err != 0 && err != ENOENT)
5217 			return (err);
5218 
5219 		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
5220 		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
5221 		    sizeof (bytes_copied), 1, &bytes_copied);
5222 		if (err != 0 && err != ENOENT)
5223 			return (err);
5224 	}
5225 
5226 	/*
5227 	 * If we are in the middle of expansion, vre_state should have
5228 	 * already been set by vdev_raidz_init().
5229 	 */
5230 	EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING);
5231 	vdrz->vn_vre.vre_state = (dsl_scan_state_t)state;
5232 	vdrz->vn_vre.vre_start_time = start_time;
5233 	vdrz->vn_vre.vre_end_time = end_time;
5234 	vdrz->vn_vre.vre_bytes_copied = bytes_copied;
5235 
5236 	return (0);
5237 }
5238 
5239 int
spa_raidz_expand_get_stats(spa_t * spa,pool_raidz_expand_stat_t * pres)5240 spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres)
5241 {
5242 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
5243 
5244 	if (vre == NULL) {
5245 		/* no removal in progress; find most recent completed */
5246 		for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
5247 			vdev_t *vd = spa->spa_root_vdev->vdev_child[c];
5248 			if (vd->vdev_ops == &vdev_raidz_ops) {
5249 				vdev_raidz_t *vdrz = vd->vdev_tsd;
5250 
5251 				if (vdrz->vn_vre.vre_end_time != 0 &&
5252 				    (vre == NULL ||
5253 				    vdrz->vn_vre.vre_end_time >
5254 				    vre->vre_end_time)) {
5255 					vre = &vdrz->vn_vre;
5256 				}
5257 			}
5258 		}
5259 	}
5260 
5261 	if (vre == NULL) {
5262 		return (SET_ERROR(ENOENT));
5263 	}
5264 
5265 	pres->pres_state = vre->vre_state;
5266 	pres->pres_expanding_vdev = vre->vre_vdev_id;
5267 
5268 	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
5269 	pres->pres_to_reflow = vd->vdev_stat.vs_alloc;
5270 
5271 	mutex_enter(&vre->vre_lock);
5272 	pres->pres_reflowed = vre->vre_bytes_copied;
5273 	for (int i = 0; i < TXG_SIZE; i++)
5274 		pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i];
5275 	mutex_exit(&vre->vre_lock);
5276 
5277 	pres->pres_start_time = vre->vre_start_time;
5278 	pres->pres_end_time = vre->vre_end_time;
5279 	pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver;
5280 
5281 	return (0);
5282 }
5283 
5284 /*
5285  * Initialize private RAIDZ specific fields from the nvlist.
5286  */
5287 static int
vdev_raidz_init(spa_t * spa,nvlist_t * nv,void ** tsd)5288 vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
5289 {
5290 	uint_t children;
5291 	nvlist_t **child;
5292 	int error = nvlist_lookup_nvlist_array(nv,
5293 	    ZPOOL_CONFIG_CHILDREN, &child, &children);
5294 	if (error != 0)
5295 		return (SET_ERROR(EINVAL));
5296 
5297 	uint64_t nparity;
5298 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) {
5299 		if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
5300 			return (SET_ERROR(EINVAL));
5301 
5302 		/*
5303 		 * Previous versions could only support 1 or 2 parity
5304 		 * device.
5305 		 */
5306 		if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2)
5307 			return (SET_ERROR(EINVAL));
5308 		else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3)
5309 			return (SET_ERROR(EINVAL));
5310 	} else {
5311 		/*
5312 		 * We require the parity to be specified for SPAs that
5313 		 * support multiple parity levels.
5314 		 */
5315 		if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
5316 			return (SET_ERROR(EINVAL));
5317 
5318 		/*
5319 		 * Otherwise, we default to 1 parity device for RAID-Z.
5320 		 */
5321 		nparity = 1;
5322 	}
5323 
5324 	vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
5325 	vdrz->vn_vre.vre_vdev_id = -1;
5326 	vdrz->vn_vre.vre_offset = UINT64_MAX;
5327 	vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
5328 	mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL);
5329 	cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL);
5330 	zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL);
5331 	mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL);
5332 	avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare,
5333 	    sizeof (reflow_node_t), offsetof(reflow_node_t, re_link));
5334 
5335 	vdrz->vd_physical_width = children;
5336 	vdrz->vd_nparity = nparity;
5337 
5338 	/* note, the ID does not exist when creating a pool */
5339 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
5340 	    &vdrz->vn_vre.vre_vdev_id);
5341 
5342 	boolean_t reflow_in_progress =
5343 	    nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
5344 	if (reflow_in_progress) {
5345 		spa->spa_raidz_expand = &vdrz->vn_vre;
5346 		vdrz->vn_vre.vre_state = DSS_SCANNING;
5347 	}
5348 
5349 	vdrz->vd_original_width = children;
5350 	uint64_t *txgs;
5351 	unsigned int txgs_size = 0;
5352 	error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
5353 	    &txgs, &txgs_size);
5354 	if (error == 0) {
5355 		for (int i = 0; i < txgs_size; i++) {
5356 			reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
5357 			re->re_txg = txgs[txgs_size - i - 1];
5358 			re->re_logical_width = vdrz->vd_physical_width - i;
5359 
5360 			if (reflow_in_progress)
5361 				re->re_logical_width--;
5362 
5363 			avl_add(&vdrz->vd_expand_txgs, re);
5364 		}
5365 
5366 		vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size;
5367 	}
5368 	if (reflow_in_progress) {
5369 		vdrz->vd_original_width--;
5370 		zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions",
5371 		    children, txgs_size);
5372 	}
5373 
5374 	*tsd = vdrz;
5375 
5376 	return (0);
5377 }
5378 
5379 static void
vdev_raidz_fini(vdev_t * vd)5380 vdev_raidz_fini(vdev_t *vd)
5381 {
5382 	vdev_raidz_t *vdrz = vd->vdev_tsd;
5383 	if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre)
5384 		vd->vdev_spa->spa_raidz_expand = NULL;
5385 	reflow_node_t *re;
5386 	void *cookie = NULL;
5387 	avl_tree_t *tree = &vdrz->vd_expand_txgs;
5388 	while ((re = avl_destroy_nodes(tree, &cookie)) != NULL)
5389 		kmem_free(re, sizeof (*re));
5390 	avl_destroy(&vdrz->vd_expand_txgs);
5391 	mutex_destroy(&vdrz->vd_expand_lock);
5392 	mutex_destroy(&vdrz->vn_vre.vre_lock);
5393 	cv_destroy(&vdrz->vn_vre.vre_cv);
5394 	zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock);
5395 	kmem_free(vdrz, sizeof (*vdrz));
5396 }
5397 
5398 /*
5399  * Add RAIDZ specific fields to the config nvlist.
5400  */
5401 static void
vdev_raidz_config_generate(vdev_t * vd,nvlist_t * nv)5402 vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
5403 {
5404 	ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
5405 	vdev_raidz_t *vdrz = vd->vdev_tsd;
5406 
5407 	/*
5408 	 * Make sure someone hasn't managed to sneak a fancy new vdev
5409 	 * into a crufty old storage pool.
5410 	 */
5411 	ASSERT(vdrz->vd_nparity == 1 ||
5412 	    (vdrz->vd_nparity <= 2 &&
5413 	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) ||
5414 	    (vdrz->vd_nparity <= 3 &&
5415 	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3));
5416 
5417 	/*
5418 	 * Note that we'll add these even on storage pools where they
5419 	 * aren't strictly required -- older software will just ignore
5420 	 * it.
5421 	 */
5422 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);
5423 
5424 	if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
5425 		fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
5426 	}
5427 
5428 	mutex_enter(&vdrz->vd_expand_lock);
5429 	if (!avl_is_empty(&vdrz->vd_expand_txgs)) {
5430 		uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs);
5431 		uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count,
5432 		    KM_SLEEP);
5433 		uint64_t i = 0;
5434 
5435 		for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs);
5436 		    re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) {
5437 			txgs[i++] = re->re_txg;
5438 		}
5439 
5440 		fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
5441 		    txgs, count);
5442 
5443 		kmem_free(txgs, sizeof (uint64_t) * count);
5444 	}
5445 	mutex_exit(&vdrz->vd_expand_lock);
5446 }
5447 
5448 static uint64_t
vdev_raidz_nparity(vdev_t * vd)5449 vdev_raidz_nparity(vdev_t *vd)
5450 {
5451 	vdev_raidz_t *vdrz = vd->vdev_tsd;
5452 	return (vdrz->vd_nparity);
5453 }
5454 
5455 static uint64_t
vdev_raidz_ndisks(vdev_t * vd)5456 vdev_raidz_ndisks(vdev_t *vd)
5457 {
5458 	return (vd->vdev_children);
5459 }
5460 
/*
 * Operations vector for RAID-Z vdevs.  Each member points at the RAID-Z
 * implementation of that operation; members set to NULL have no
 * RAID-Z specific implementation in this table.
 */
vdev_ops_t vdev_raidz_ops = {
	.vdev_op_init = vdev_raidz_init,
	.vdev_op_fini = vdev_raidz_fini,
	.vdev_op_open = vdev_raidz_open,
	.vdev_op_close = vdev_raidz_close,
	.vdev_op_psize_to_asize = vdev_raidz_psize_to_asize,
	.vdev_op_asize_to_psize = vdev_raidz_asize_to_psize,
	.vdev_op_min_asize = vdev_raidz_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_raidz_io_start,
	.vdev_op_io_done = vdev_raidz_io_done,
	.vdev_op_state_change = vdev_raidz_state_change,
	.vdev_op_need_resilver = vdev_raidz_need_resilver,
	.vdev_op_hold = NULL,
	.vdev_op_rele = NULL,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_raidz_xlate,
	.vdev_op_rebuild_asize = NULL,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = vdev_raidz_config_generate,
	.vdev_op_nparity = vdev_raidz_nparity,
	.vdev_op_ndisks = vdev_raidz_ndisks,
	.vdev_op_type = VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
};
5486 
/*
 * Module parameters for RAIDZ expansion and for the raidz/draid slow
 * child handling.  The trailing string of each entry is its
 * human-readable description.
 *
 * NOTE(review): no matching "BEGIN CSTYLED" marker is visible directly
 * above this list -- confirm whether the "END CSTYLED" below is stale.
 */
ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW,
	"For testing, pause RAIDZ expansion after reflowing this many bytes");
ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW,
	"Max amount of concurrent i/o for RAIDZ expansion");
ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW,
	"For expanded RAIDZ, aggregate reads that have more rows than this");
ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW,
	"For expanded RAIDZ, automatically start a pool scrub when expansion "
	"completes");
ZFS_MODULE_PARAM(zfs_vdev, vdev_, read_sit_out_secs, ULONG, ZMOD_RW,
	"Raidz/draid slow disk sit out time period in seconds");
ZFS_MODULE_PARAM(zfs_vdev, vdev_, raidz_outlier_check_interval_ms, U64,
	ZMOD_RW, "Interval to check for slow raidz/draid children");
ZFS_MODULE_PARAM(zfs_vdev, vdev_, raidz_outlier_insensitivity, UINT,
	ZMOD_RW, "How insensitive the slow raidz/draid child check should be");
/* END CSTYLED */
5503