xref: /freebsd/sys/contrib/openzfs/module/zfs/vdev_raidz.c (revision 071ab5a1f3cbfd29c8fbec27f7e619418adaf074)
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or https://opensource.org/licenses/CDDL-1.0.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
25  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
26  * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
27  */
28 
29 #include <sys/zfs_context.h>
30 #include <sys/spa.h>
31 #include <sys/spa_impl.h>
32 #include <sys/zap.h>
33 #include <sys/vdev_impl.h>
34 #include <sys/metaslab_impl.h>
35 #include <sys/zio.h>
36 #include <sys/zio_checksum.h>
37 #include <sys/dmu_tx.h>
38 #include <sys/abd.h>
39 #include <sys/zfs_rlock.h>
40 #include <sys/fs/zfs.h>
41 #include <sys/fm/fs/zfs.h>
42 #include <sys/vdev_raidz.h>
43 #include <sys/vdev_raidz_impl.h>
44 #include <sys/vdev_draid.h>
45 #include <sys/uberblock_impl.h>
46 #include <sys/dsl_scan.h>
47 
48 #ifdef ZFS_DEBUG
49 #include <sys/vdev.h>	/* For vdev_xlate() in vdev_raidz_io_verify() */
50 #endif
51 
52 /*
53  * Virtual device vector for RAID-Z.
54  *
55  * This vdev supports single, double, and triple parity. For single parity,
56  * we use a simple XOR of all the data columns. For double or triple parity,
57  * we use a special case of Reed-Solomon coding. This extends the
58  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
59  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
60  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
61  * former is also based. The latter is designed to provide higher performance
62  * for writes.
63  *
64  * Note that the Plank paper claimed to support arbitrary N+M, but was then
65  * amended six years later identifying a critical flaw that invalidates its
66  * claims. Nevertheless, the technique can be adapted to work for up to
67  * triple parity. For additional parity, the amendment "Note: Correction to
68  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
69  * is viable, but the additional complexity means that write performance will
70  * suffer.
71  *
72  * All of the methods above operate on a Galois field, defined over the
73  * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all
74  * elements can be expressed with a single byte. Briefly, the operations on
75  * the field are defined as follows:
76  *
77  *   o addition (+) is represented by a bitwise XOR
78  *   o subtraction (-) is therefore identical to addition: A + B = A - B
79  *   o multiplication of A by 2 is defined by the following bitwise expression:
80  *
81  *	(A * 2)_7 = A_6
82  *	(A * 2)_6 = A_5
83  *	(A * 2)_5 = A_4
84  *	(A * 2)_4 = A_3 + A_7
85  *	(A * 2)_3 = A_2 + A_7
86  *	(A * 2)_2 = A_1 + A_7
87  *	(A * 2)_1 = A_0
88  *	(A * 2)_0 = A_7
89  *
90  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
91  * As an aside, this multiplication is derived from the error correcting
92  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
93  *
94  * Observe that any number in the field (except for 0) can be expressed as a
95  * power of 2 -- a generator for the field. We store a table of the powers of
96  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
97  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
98  * than field addition). The inverse of a field element A (A^-1) is therefore
99  * A ^ (255 - 1) = A^254.
100  *
101  * The up-to-three parity columns, P, Q, R over several data columns,
102  * D_0, ... D_n-1, can be expressed by field operations:
103  *
104  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
105  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
106  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
107  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
108  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
109  *
110  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
111  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
112  * independent coefficients. (There are no additional coefficients that have
113  * this property which is why the uncorrected Plank method breaks down.)
114  *
115  * See the reconstruction code below for how P, Q and R can be used individually
116  * or in concert to recover missing data columns.
117  */
118 
/* Indices of the (up to three) parity columns. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Field multiplication of a single byte value by 2 and by 4.  Multiplying
 * by 2 is a left shift, folding in 0x1d whenever the top bit (0x80) of the
 * input was set.  The result is not truncated here: it can carry a bit
 * above bit 7, so store it into a byte-wide type to obtain the field
 * element.  Note that the argument is evaluated more than once.
 */
#define	VDEV_RAIDZ_MUL_2(x)	((((x) & 0x80) ? 0x1d : 0) ^ ((x) << 1))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2((VDEV_RAIDZ_MUL_2(x))))
125 
/*
 * Field multiplication applied to a 64-bit value all at once, i.e. to the
 * eight bytes packed in it, rather than a byte at a time.  A mask is built
 * from the top bit of each byte: after the shift/subtract step, every byte
 * whose top bit was set is 0xff in the mask and every other byte is 0x00.
 * That mask is then used to conditionally apply the XOR of 0x1d per byte.
 *
 *   x     - 64-bit lvalue, multiplied in place (evaluated more than once)
 *   mask  - 64-bit scratch lvalue, clobbered
 *
 * Both macros expand to a single statement (do { } while (0)) so they can be
 * used safely as the body of an unbraced if/else; callers terminate the
 * invocation with a semicolon as for any other statement.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
do { \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
} while (0)

/* Multiply by 4: two successive multiplications by 2. */
#define	VDEV_RAIDZ_64MUL_4(x, mask) \
do { \
	VDEV_RAIDZ_64MUL_2((x), (mask)); \
	VDEV_RAIDZ_64MUL_2((x), (mask)); \
} while (0)
145 
146 
147 /*
148  * Big Theory Statement for how a RAIDZ VDEV is expanded
149  *
150  * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion
151  * works with all three RAIDZ parity choices, including RAIDZ1, 2, or 3. VDEVs
152  * that have been previously expanded can be expanded again.
153  *
154  * The RAIDZ VDEV must be healthy (must be able to write to all the drives in
155  * the VDEV) when an expansion starts.  And the expansion will pause if any
156  * disk in the VDEV fails, and resume once the VDEV is healthy again. All other
157  * operations on the pool can continue while an expansion is in progress (e.g.
158  * read/write, snapshot, zpool add, etc). Except zpool checkpoint, zpool trim,
159  * and zpool initialize which can't be run during an expansion.  Following a
160  * reboot or export/import, the expansion resumes where it left off.
161  *
162  * == Reflowing the Data ==
163  *
164  * The expansion involves reflowing (copying) the data from the current set
165  * of disks to spread it across the new set which now has one more disk. This
166  * reflow operation is similar to reflowing text when the column width of a
167  * text editor window is expanded. The text doesn’t change but the location of
168  * the text changes to accommodate the new width. An example reflow result for
169  * a 4-wide RAIDZ1 to a 5-wide is shown below.
170  *
171  *                            Reflow End State
172  *            Each letter indicates a parity group (logical stripe)
173  *
174  *         Before expansion                         After Expansion
175  *     D1     D2     D3     D4               D1     D2     D3     D4     D5
176  *  +------+------+------+------+         +------+------+------+------+------+
177  *  |      |      |      |      |         |      |      |      |      |      |
178  *  |  A   |  A   |  A   |  A   |         |  A   |  A   |  A   |  A   |  B   |
179  *  |     1|     2|     3|     4|         |     1|     2|     3|     4|     5|
180  *  +------+------+------+------+         +------+------+------+------+------+
181  *  |      |      |      |      |         |      |      |      |      |      |
182  *  |  B   |  B   |  C   |  C   |         |  B   |  C   |  C   |  C   |  C   |
183  *  |     5|     6|     7|     8|         |     6|     7|     8|     9|    10|
184  *  +------+------+------+------+         +------+------+------+------+------+
185  *  |      |      |      |      |         |      |      |      |      |      |
186  *  |  C   |  C   |  D   |  D   |         |  D   |  D   |  E   |  E   |  E   |
187  *  |     9|    10|    11|    12|         |    11|    12|    13|    14|    15|
188  *  +------+------+------+------+         +------+------+------+------+------+
189  *  |      |      |      |      |         |      |      |      |      |      |
190  *  |  E   |  E   |  E   |  E   |   -->   |  E   |  F   |  F   |  G   |  G   |
191  *  |    13|    14|    15|    16|         |    16|    17|    18|p   19|    20|
192  *  +------+------+------+------+         +------+------+------+------+------+
193  *  |      |      |      |      |         |      |      |      |      |      |
194  *  |  F   |  F   |  G   |  G   |         |  G   |  G   |  H   |  H   |  H   |
195  *  |    17|    18|    19|    20|         |    21|    22|    23|    24|    25|
196  *  +------+------+------+------+         +------+------+------+------+------+
197  *  |      |      |      |      |         |      |      |      |      |      |
198  *  |  G   |  G   |  H   |  H   |         |  H   |  I   |  I   |  J   |  J   |
199  *  |    21|    22|    23|    24|         |    26|    27|    28|    29|    30|
200  *  +------+------+------+------+         +------+------+------+------+------+
201  *  |      |      |      |      |         |      |      |      |      |      |
202  *  |  H   |  H   |  I   |  I   |         |  J   |  J   |      |      |  K   |
203  *  |    25|    26|    27|    28|         |    31|    32|    33|    34|    35|
204  *  +------+------+------+------+         +------+------+------+------+------+
205  *
206  * This reflow approach has several advantages. There is no need to read or
207  * modify the block pointers or recompute any block checksums.  The reflow
208  * doesn’t need to know where the parity sectors reside. We can read and write
209  * data sequentially and the copy can occur in a background thread in open
210  * context. The design also allows for fast discovery of what data to copy.
211  *
212  * The VDEV metaslabs are processed, one at a time, to copy the block data to
213  * have it flow across all the disks. The metaslab is disabled for allocations
214  * during the copy. As an optimization, we only copy the allocated data which
215  * can be determined by looking at the metaslab range tree. During the copy we
216  * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still
217  * need to be able to survive losing parity count disks).  This means we
218  * cannot overwrite data during the reflow that would be needed if a disk is
219  * lost.
220  *
221  * After the reflow completes, all newly-written blocks will have the new
222  * layout, i.e., they will have the parity to data ratio implied by the new
223  * number of disks in the RAIDZ group.  Even though the reflow copies all of
224  * the allocated space (data and parity), it is only rearranged, not changed.
225  *
226  * This act of reflowing the data has a few implications about blocks
227  * that were written before the reflow completes:
228  *
229  *  - Old blocks will still use the same amount of space (i.e., they will have
230  *    the parity to data ratio implied by the old number of disks in the RAIDZ
231  *    group).
232  *  - Reading old blocks will be slightly slower than before the reflow, for
233  *    two reasons. First, we will have to read from all disks in the RAIDZ
234  *    VDEV, rather than being able to skip the children that contain only
235  *    parity of this block (because the data of a single block is now spread
236  *    out across all the disks).  Second, in most cases there will be an extra
237  *    bcopy, needed to rearrange the data back to its original layout in memory.
238  *
239  * == Scratch Area ==
240  *
241  * As we copy the block data, we can only progress to the point that writes
242  * will not overlap with blocks whose progress has not yet been recorded on
243  * disk.  Since partially-copied rows are always read from the old location,
244  * we need to stop one row before the sector-wise overlap, to prevent any
245  * row-wise overlap. For example, in the diagram above, when we reflow sector
246  * B6 it will overwrite the original location for B5.
247  *
248  * To get around this, a scratch space is used so that we can start copying
249  * without risking data loss by overlapping the row. As an added benefit, it
250  * improves performance at the beginning of the reflow, but that small perf
251  * boost wouldn't be worth the complexity on its own.
252  *
253  * Ideally we want to copy at least 2 * (new_width)^2 so that we have a
254  * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max
255  * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice
256  * the widths will likely be single digits so we can get a substantial chunk
257  * size using only a few MB of scratch per disk.
258  *
259  * The scratch area is persisted to disk which holds a large amount of reflowed
260  * state. We can always read the partially written stripes when a disk fails or
261  * the copy is interrupted (crash) during the initial copying phase and also
262  * get past a small chunk size restriction.  At a minimum, the scratch space
263  * must be large enough to get us to the point that one row does not overlap
264  * itself when moved (i.e new_width^2).  But going larger is even better. We
265  * use the 3.5 MiB reserved "boot" space that resides after the ZFS disk labels
266  * as our scratch space to handle overwriting the initial part of the VDEV.
267  *
268  *	0     256K   512K                    4M
269  *	+------+------+-----------------------+-----------------------------
270  *	| VDEV | VDEV |   Boot Block (3.5M)   |  Allocatable space ...
271  *	|  L0  |  L1  |       Reserved        |     (Metaslabs)
272  *	+------+------+-----------------------+-------------------------------
273  *                        Scratch Area
274  *
275  * == Reflow Progress Updates ==
276  * After the initial scratch-based reflow, the expansion process works
277  * similarly to device removal. We create a new open context thread which
278  * reflows the data, and periodically kicks off sync tasks to update logical
279  * state. In this case, state is the committed progress (offset of next data
280  * to copy). We need to persist the completed offset on disk, so that if we
281  * crash we know which format each VDEV offset is in.
282  *
283  * == Time Dependent Geometry ==
284  *
285  * In non-expanded RAIDZ, blocks are read from disk in a column by column
286  * fashion. For a multi-row block, the second sector is in the first column
287  * not in the second column. This allows us to issue full reads for each
288  * column directly into the request buffer. The block data is thus laid out
289  * sequentially in a column-by-column fashion.
290  *
291  * For example, in the before expansion diagram above, one logical block might
292  * be sectors G19-H26. The parity is in G19,H23; and the data is in
293  * G20,H24,G21,H25,G22,H26.
294  *
295  * After a block is reflowed, the sectors that were all in the original column
296  * data can now reside in different columns. When reading from an expanded
297  * VDEV, we need to know the logical stripe width for each block so we can
298  * reconstitute the block’s data after the reads are completed. Likewise,
299  * when we perform the combinatorial reconstruction we need to know the
300  * original width so we can retry combinations from the past layouts.
301  *
302  * Time dependent geometry is what we call having blocks with different layouts
303  * (stripe widths) in the same VDEV. This time-dependent geometry uses the
304  * block’s birth time (+ the time expansion ended) to establish the correct
305  * width for a given block. After an expansion completes, we record the time
306  * for blocks written with a particular width (geometry).
307  *
308  * == On Disk Format Changes ==
309  *
310  * New pool feature flag, 'raidz_expansion' whose reference count is the number
311  * of RAIDZ VDEVs that have been expanded.
312  *
313  * The blocks on expanded RAIDZ VDEV can have different logical stripe widths.
314  *
315  * Since the uberblock can point to arbitrary blocks, which might be on the
316  * expanding RAIDZ, and might or might not have been expanded, we need to know
317  * which way a block is laid out before reading it. This info is the next
318  * offset that needs to be reflowed and we persist that in the uberblock, in
319  * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev label.
320  * After the expansion is complete, we then use the raidz_expand_txgs array
321  * (see below) to determine how to read a block and the ub_raidz_reflow_info
322  * field is no longer required.
323  *
324  * The uberblock's ub_raidz_reflow_info field also holds the scratch space
325  * state (i.e., active or not) which is also required before reading a block
326  * during the initial phase of reflowing the data.
327  *
328  * The top-level RAIDZ VDEV has two new entries in the nvlist:
329  *
330  * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here
331  *                            and used after the expansion is complete to
332  *                            determine how to read a raidz block
333  * 'raidz_expanding' boolean: present during reflow and removed after completion
334  *                            used during a spa import to resume an unfinished
335  *                            expansion
336  *
337  * And finally the VDEVs top zap adds the following informational entries:
338  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE
339  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME
340  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME
341  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED
342  */
343 
344 /*
345  * For testing only: pause the raidz expansion after reflowing this amount.
346  * (accessed by ZTS and ztest)
347  */
348 #ifdef	_KERNEL
349 static
350 #endif	/* _KERNEL */
351 unsigned long raidz_expand_max_reflow_bytes = 0;
352 
353 /*
354  * For testing only: pause the raidz expansion at a certain point.
355  */
356 uint_t raidz_expand_pause_point = 0;
357 
358 /*
359  * Maximum amount of copy io's outstanding at once.
360  */
361 #ifdef _ILP32
362 static unsigned long raidz_expand_max_copy_bytes = SPA_MAXBLOCKSIZE;
363 #else
364 static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE;
365 #endif
366 
367 /*
368  * Apply raidz map abds aggregation if the number of rows in the map is equal
369  * or greater than the value below.
370  */
371 static unsigned long raidz_io_aggregate_rows = 4;
372 
373 /*
374  * Automatically start a pool scrub when a RAIDZ expansion completes in
375  * order to verify the checksums of all blocks which have been copied
376  * during the expansion.  Automatic scrubbing is enabled by default and
377  * is strongly recommended.
378  */
379 static int zfs_scrub_after_expand = 1;
380 
381 static void
382 vdev_raidz_row_free(raidz_row_t *rr)
383 {
384 	for (int c = 0; c < rr->rr_cols; c++) {
385 		raidz_col_t *rc = &rr->rr_col[c];
386 
387 		if (rc->rc_size != 0)
388 			abd_free(rc->rc_abd);
389 		if (rc->rc_orig_data != NULL)
390 			abd_free(rc->rc_orig_data);
391 	}
392 
393 	if (rr->rr_abd_empty != NULL)
394 		abd_free(rr->rr_abd_empty);
395 
396 	kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols]));
397 }
398 
399 void
400 vdev_raidz_map_free(raidz_map_t *rm)
401 {
402 	for (int i = 0; i < rm->rm_nrows; i++)
403 		vdev_raidz_row_free(rm->rm_row[i]);
404 
405 	if (rm->rm_nphys_cols) {
406 		for (int i = 0; i < rm->rm_nphys_cols; i++) {
407 			if (rm->rm_phys_col[i].rc_abd != NULL)
408 				abd_free(rm->rm_phys_col[i].rc_abd);
409 		}
410 
411 		kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) *
412 		    rm->rm_nphys_cols);
413 	}
414 
415 	ASSERT3P(rm->rm_lr, ==, NULL);
416 	kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
417 }
418 
419 static void
420 vdev_raidz_map_free_vsd(zio_t *zio)
421 {
422 	raidz_map_t *rm = zio->io_vsd;
423 
424 	vdev_raidz_map_free(rm);
425 }
426 
427 static int
428 vdev_raidz_reflow_compare(const void *x1, const void *x2)
429 {
430 	const reflow_node_t *l = x1;
431 	const reflow_node_t *r = x2;
432 
433 	return (TREE_CMP(l->re_txg, r->re_txg));
434 }
435 
436 const zio_vsd_ops_t vdev_raidz_vsd_ops = {
437 	.vsd_free = vdev_raidz_map_free_vsd,
438 };
439 
440 raidz_row_t *
441 vdev_raidz_row_alloc(int cols, zio_t *zio)
442 {
443 	raidz_row_t *rr =
444 	    kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP);
445 
446 	rr->rr_cols = cols;
447 	rr->rr_scols = cols;
448 
449 	for (int c = 0; c < cols; c++) {
450 		raidz_col_t *rc = &rr->rr_col[c];
451 		rc->rc_shadow_devidx = INT_MAX;
452 		rc->rc_shadow_offset = UINT64_MAX;
453 		/*
454 		 * We can not allow self healing to take place for Direct I/O
455 		 * reads. There is nothing that stops the buffer contents from
456 		 * being manipulated while the I/O is in flight. It is possible
457 		 * that the checksum could be verified on the buffer and then
458 		 * the contents of that buffer are manipulated afterwards. This
459 		 * could lead to bad data being written out during self
460 		 * healing.
461 		 */
462 		if (!(zio->io_flags & ZIO_FLAG_DIO_READ))
463 			rc->rc_allow_repair = 1;
464 	}
465 	return (rr);
466 }
467 
468 static void
469 vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift)
470 {
471 	int c;
472 	int nwrapped = 0;
473 	uint64_t off = 0;
474 	raidz_row_t *rr = rm->rm_row[0];
475 
476 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
477 	ASSERT3U(rm->rm_nrows, ==, 1);
478 
479 	/*
480 	 * Pad any parity columns with additional space to account for skip
481 	 * sectors.
482 	 */
483 	if (rm->rm_skipstart < rr->rr_firstdatacol) {
484 		ASSERT0(rm->rm_skipstart);
485 		nwrapped = rm->rm_nskip;
486 	} else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) {
487 		nwrapped =
488 		    (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols;
489 	}
490 
491 	/*
492 	 * Optional single skip sectors (rc_size == 0) will be handled in
493 	 * vdev_raidz_io_start_write().
494 	 */
495 	int skipped = rr->rr_scols - rr->rr_cols;
496 
497 	/* Allocate buffers for the parity columns */
498 	for (c = 0; c < rr->rr_firstdatacol; c++) {
499 		raidz_col_t *rc = &rr->rr_col[c];
500 
501 		/*
502 		 * Parity columns will pad out a linear ABD to account for
503 		 * the skip sector. A linear ABD is used here because
504 		 * parity calculations use the ABD buffer directly to calculate
505 		 * parity. This avoids doing a memcpy back to the ABD after the
506 		 * parity has been calculated. By issuing the parity column
507 		 * with the skip sector we can reduce contention on the child
508 		 * VDEV queue locks (vq_lock).
509 		 */
510 		if (c < nwrapped) {
511 			rc->rc_abd = abd_alloc_linear(
512 			    rc->rc_size + (1ULL << ashift), B_FALSE);
513 			abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift);
514 			skipped++;
515 		} else {
516 			rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
517 		}
518 	}
519 
520 	for (off = 0; c < rr->rr_cols; c++) {
521 		raidz_col_t *rc = &rr->rr_col[c];
522 		abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct,
523 		    zio->io_abd, off, rc->rc_size);
524 
525 		/*
526 		 * Generate I/O for skip sectors to improve aggregation
527 		 * continuity. We will use gang ABD's to reduce contention
528 		 * on the child VDEV queue locks (vq_lock) by issuing
529 		 * a single I/O that contains the data and skip sector.
530 		 *
531 		 * It is important to make sure that rc_size is not updated
532 		 * even though we are adding a skip sector to the ABD. When
533 		 * calculating the parity in vdev_raidz_generate_parity_row()
534 		 * the rc_size is used to iterate through the ABD's. We can
535 		 * not have zero'd out skip sectors used for calculating
536 		 * parity for raidz, because those same sectors are not used
537 		 * during reconstruction.
538 		 */
539 		if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) {
540 			rc->rc_abd = abd_alloc_gang();
541 			abd_gang_add(rc->rc_abd, abd, B_TRUE);
542 			abd_gang_add(rc->rc_abd,
543 			    abd_get_zeros(1ULL << ashift), B_TRUE);
544 			skipped++;
545 		} else {
546 			rc->rc_abd = abd;
547 		}
548 		off += rc->rc_size;
549 	}
550 
551 	ASSERT3U(off, ==, zio->io_size);
552 	ASSERT3S(skipped, ==, rm->rm_nskip);
553 }
554 
555 static void
556 vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm)
557 {
558 	int c;
559 	raidz_row_t *rr = rm->rm_row[0];
560 
561 	ASSERT3U(rm->rm_nrows, ==, 1);
562 
563 	/* Allocate buffers for the parity columns */
564 	for (c = 0; c < rr->rr_firstdatacol; c++)
565 		rr->rr_col[c].rc_abd =
566 		    abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);
567 
568 	for (uint64_t off = 0; c < rr->rr_cols; c++) {
569 		raidz_col_t *rc = &rr->rr_col[c];
570 		rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
571 		    zio->io_abd, off, rc->rc_size);
572 		off += rc->rc_size;
573 	}
574 }
575 
576 /*
577  * Divides the IO evenly across all child vdevs; usually, dcols is
578  * the number of children in the target vdev.
579  *
580  * Avoid inlining the function to keep vdev_raidz_io_start(), which
581  * is this functions only caller, as small as possible on the stack.
582  */
583 noinline raidz_map_t *
584 vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
585     uint64_t nparity)
586 {
587 	raidz_row_t *rr;
588 	/* The starting RAIDZ (parent) vdev sector of the block. */
589 	uint64_t b = zio->io_offset >> ashift;
590 	/* The zio's size in units of the vdev's minimum sector size. */
591 	uint64_t s = zio->io_size >> ashift;
592 	/* The first column for this stripe. */
593 	uint64_t f = b % dcols;
594 	/* The starting byte offset on each child vdev. */
595 	uint64_t o = (b / dcols) << ashift;
596 	uint64_t acols, scols;
597 
598 	raidz_map_t *rm =
599 	    kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
600 	rm->rm_nrows = 1;
601 
602 	/*
603 	 * "Quotient": The number of data sectors for this stripe on all but
604 	 * the "big column" child vdevs that also contain "remainder" data.
605 	 */
606 	uint64_t q = s / (dcols - nparity);
607 
608 	/*
609 	 * "Remainder": The number of partial stripe data sectors in this I/O.
610 	 * This will add a sector to some, but not all, child vdevs.
611 	 */
612 	uint64_t r = s - q * (dcols - nparity);
613 
614 	/* The number of "big columns" - those which contain remainder data. */
615 	uint64_t bc = (r == 0 ? 0 : r + nparity);
616 
617 	/*
618 	 * The total number of data and parity sectors associated with
619 	 * this I/O.
620 	 */
621 	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
622 
623 	/*
624 	 * acols: The columns that will be accessed.
625 	 * scols: The columns that will be accessed or skipped.
626 	 */
627 	if (q == 0) {
628 		/* Our I/O request doesn't span all child vdevs. */
629 		acols = bc;
630 		scols = MIN(dcols, roundup(bc, nparity + 1));
631 	} else {
632 		acols = dcols;
633 		scols = dcols;
634 	}
635 
636 	ASSERT3U(acols, <=, scols);
637 	rr = vdev_raidz_row_alloc(scols, zio);
638 	rm->rm_row[0] = rr;
639 	rr->rr_cols = acols;
640 	rr->rr_bigcols = bc;
641 	rr->rr_firstdatacol = nparity;
642 #ifdef ZFS_DEBUG
643 	rr->rr_offset = zio->io_offset;
644 	rr->rr_size = zio->io_size;
645 #endif
646 
647 	uint64_t asize = 0;
648 
649 	for (uint64_t c = 0; c < scols; c++) {
650 		raidz_col_t *rc = &rr->rr_col[c];
651 		uint64_t col = f + c;
652 		uint64_t coff = o;
653 		if (col >= dcols) {
654 			col -= dcols;
655 			coff += 1ULL << ashift;
656 		}
657 		rc->rc_devidx = col;
658 		rc->rc_offset = coff;
659 
660 		if (c >= acols)
661 			rc->rc_size = 0;
662 		else if (c < bc)
663 			rc->rc_size = (q + 1) << ashift;
664 		else
665 			rc->rc_size = q << ashift;
666 
667 		asize += rc->rc_size;
668 	}
669 
670 	ASSERT3U(asize, ==, tot << ashift);
671 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
672 	rm->rm_skipstart = bc;
673 
674 	/*
675 	 * If all data stored spans all columns, there's a danger that parity
676 	 * will always be on the same device and, since parity isn't read
677 	 * during normal operation, that device's I/O bandwidth won't be
678 	 * used effectively. We therefore switch the parity every 1MB.
679 	 *
680 	 * ... at least that was, ostensibly, the theory. As a practical
681 	 * matter unless we juggle the parity between all devices evenly, we
682 	 * won't see any benefit. Further, occasional writes that aren't a
683 	 * multiple of the LCM of the number of children and the minimum
684 	 * stripe width are sufficient to avoid pessimal behavior.
685 	 * Unfortunately, this decision created an implicit on-disk format
686 	 * requirement that we need to support for all eternity, but only
687 	 * for single-parity RAID-Z.
688 	 *
689 	 * If we intend to skip a sector in the zeroth column for padding
690 	 * we must make sure to note this swap. We will never intend to
691 	 * skip the first column since at least one data and one parity
692 	 * column must appear in each row.
693 	 */
694 	ASSERT(rr->rr_cols >= 2);
695 	ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
696 
697 	if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
698 		uint64_t devidx = rr->rr_col[0].rc_devidx;
699 		o = rr->rr_col[0].rc_offset;
700 		rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
701 		rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
702 		rr->rr_col[1].rc_devidx = devidx;
703 		rr->rr_col[1].rc_offset = o;
704 		if (rm->rm_skipstart == 0)
705 			rm->rm_skipstart = 1;
706 	}
707 
708 	if (zio->io_type == ZIO_TYPE_WRITE) {
709 		vdev_raidz_map_alloc_write(zio, rm, ashift);
710 	} else {
711 		vdev_raidz_map_alloc_read(zio, rm);
712 	}
713 	/* init RAIDZ parity ops */
714 	rm->rm_ops = vdev_raidz_math_get_ops();
715 
716 	return (rm);
717 }
718 
/*
 * Build the raidz_map_t describing how a zio maps onto the children of a
 * RAID-Z vdev whose physical width may differ from the block's logical
 * width because of RAID-Z expansion (reflow).
 *
 * Everything before reflow_offset_synced should have been moved to the new
 * location (read and write completed).  However, this may not yet be reflected
 * in the on-disk format (e.g. raidz_reflow_sync() has been called but the
 * uberblock has not yet been written). If reflow is not in progress,
 * reflow_offset_synced should be UINT64_MAX. For each row, if the row is
 * entirely before reflow_offset_synced, it will come from the new location.
 * Otherwise this row will come from the old location.  Therefore, rows that
 * straddle the reflow_offset_synced will come from the old location.
 *
 * For writes, reflow_offset_next is the next offset to copy.  If a sector has
 * been copied, but not yet reflected in the on-disk progress
 * (reflow_offset_synced), it will also be written to the new (already copied)
 * offset.
 *
 * Parameters:
 *	zio			supplies io_abd, io_offset and io_size
 *	ashift			log2 of the sector size; sizes and offsets
 *				are converted to sectors with >> ashift
 *	physical_cols		number of children in the new location; rows
 *				still in the old location use one fewer
 *	logical_cols		width used to compute the block's geometry
 *				(q, r, bc, tot, rows below)
 *	nparity			parity columns per row (rr_firstdatacol)
 *	reflow_offset_synced	see above
 *	reflow_offset_next	see above
 *	use_scratch		if set, rows entirely before
 *				reflow_offset_synced have VDEV_BOOT_SIZE
 *				subtracted from their offsets (scratch space)
 *
 * Returns the map, allocated with kmem_zalloc(KM_SLEEP), with one
 * raidz_row_t per row and rm_ops initialized.
 */
noinline raidz_map_t *
vdev_raidz_map_alloc_expanded(zio_t *zio,
    uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
    uint64_t nparity, uint64_t reflow_offset_synced,
    uint64_t reflow_offset_next, boolean_t use_scratch)
{
	abd_t *abd = zio->io_abd;
	uint64_t offset = zio->io_offset;
	uint64_t size = zio->io_size;

	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> ashift;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 * AKA "full rows"
	 */
	uint64_t q = s / (logical_cols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	uint64_t r = s - q * (logical_cols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	uint64_t bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* How many rows contain data (not skip) */
	uint64_t rows = howmany(tot, logical_cols);
	/* Width of each row: a block smaller than one row is only tot wide. */
	int cols = MIN(tot, logical_cols);

	/* The map ends in a variable-length array of `rows` row pointers. */
	raidz_map_t *rm =
	    kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
	    KM_SLEEP);
	rm->rm_nrows = rows;
	/* Sectors needed to round tot up to a multiple of (nparity + 1). */
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	rm->rm_skipstart = bc;
	/* Running total of column sizes; checked against tot after the loop. */
	uint64_t asize = 0;

	for (uint64_t row = 0; row < rows; row++) {
		boolean_t row_use_scratch = B_FALSE;
		raidz_row_t *rr = vdev_raidz_row_alloc(cols, zio);
		rm->rm_row[row] = rr;

		/* The starting RAIDZ (parent) vdev sector of the row. */
		uint64_t b = (offset >> ashift) + row * logical_cols;

		/*
		 * If we are in the middle of a reflow, and the copying has
		 * not yet completed for any part of this row, then use the
		 * old location of this row.  Note that reflow_offset_synced
		 * reflects the i/o that's been completed, because it's
		 * updated by a synctask, after zio_wait(spa_txg_zio[]).
		 * This is sufficient for our check, even if that progress
		 * has not yet been recorded to disk (reflected in
		 * spa_ubsync).  Also note that we consider the last row to
		 * be "full width" (`cols`-wide rather than `bc`-wide) for
		 * this calculation. This causes a tiny bit of unnecessary
		 * double-writes but is safe and simpler to calculate.
		 */
		int row_phys_cols = physical_cols;
		if (b + cols > reflow_offset_synced >> ashift)
			row_phys_cols--;
		else if (use_scratch)
			row_use_scratch = B_TRUE;

		/* starting child of this row */
		uint64_t child_id = b % row_phys_cols;
		/* The starting byte offset on each child vdev. */
		uint64_t child_offset = (b / row_phys_cols) << ashift;

		/*
		 * Note, rr_cols is the entire width of the block, even
		 * if this row is shorter.  This is needed because parity
		 * generation (for Q and R) needs to know the entire width,
		 * because it treats the short row as though it was
		 * full-width (and the "phantom" sectors were zero-filled).
		 *
		 * Another approach to this would be to set cols shorter
		 * (to just the number of columns that we might do i/o to)
		 * and have another mechanism to tell the parity generation
		 * about the "entire width".  Reconstruction (at least
		 * vdev_raidz_reconstruct_general()) would also need to
		 * know about the "entire width".
		 */
		rr->rr_firstdatacol = nparity;
#ifdef ZFS_DEBUG
		/*
		 * note: rr_size is PSIZE, not ASIZE
		 */
		rr->rr_offset = b << ashift;
		rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift;
#endif

		/*
		 * Walk the row's columns, assigning consecutive children;
		 * wrapping past the last child advances to the next sector
		 * on each child.
		 */
		for (int c = 0; c < rr->rr_cols; c++, child_id++) {
			if (child_id >= row_phys_cols) {
				child_id -= row_phys_cols;
				child_offset += 1ULL << ashift;
			}
			raidz_col_t *rc = &rr->rr_col[c];
			rc->rc_devidx = child_id;
			rc->rc_offset = child_offset;

			/*
			 * Get this from the scratch space if appropriate.
			 * This only happens if we crashed in the middle of
			 * raidz_reflow_scratch_sync() (while it's running,
			 * the rangelock prevents us from doing concurrent
			 * io), and even then only during zpool import or
			 * when the pool is imported readonly.
			 */
			if (row_use_scratch)
				rc->rc_offset -= VDEV_BOOT_SIZE;

			/*
			 * Index among the data columns.  This wraps for
			 * parity columns (c < rr_firstdatacol), but it is
			 * only used in the data-column branch below.
			 */
			uint64_t dc = c - rr->rr_firstdatacol;
			if (c < rr->rr_firstdatacol) {
				rc->rc_size = 1ULL << ashift;

				/*
				 * Parity sectors' rc_abd's are set below
				 * after determining if this is an aggregation.
				 */
			} else if (row == rows - 1 && bc != 0 && c >= bc) {
				/*
				 * Past the end of the block (even including
				 * skip sectors).  This sector is part of the
				 * map so that we have full rows for p/q parity
				 * generation.
				 */
				rc->rc_size = 0;
				rc->rc_abd = NULL;
			} else {
				/* "data column" (col excluding parity) */
				uint64_t off;

				/*
				 * Sector index of this column's data within
				 * the zio's abd.  Big columns (and all
				 * columns when r == 0) each hold `rows`
				 * sectors; the columns after them hold
				 * `rows - 1`.
				 */
				if (c < bc || r == 0) {
					off = dc * rows + row;
				} else {
					off = r * rows +
					    (dc - r) * (rows - 1) + row;
				}
				rc->rc_size = 1ULL << ashift;
				rc->rc_abd = abd_get_offset_struct(
				    &rc->rc_abdstruct, abd, off << ashift,
				    rc->rc_size);
			}

			if (rc->rc_size == 0)
				continue;

			/*
			 * If any part of this row is in both old and new
			 * locations, the primary location is the old
			 * location. If this sector was already copied to the
			 * new location, we need to also write to the new,
			 * "shadow" location.
			 *
			 * Note, `row_phys_cols != physical_cols` indicates
			 * that the primary location is the old location.
			 * `b+c < reflow_offset_next` indicates that the copy
			 * to the new location has been initiated. We know
			 * that the copy has completed because we have the
			 * rangelock, which is held exclusively while the
			 * copy is in progress.
			 */
			if (row_use_scratch ||
			    (row_phys_cols != physical_cols &&
			    b + c < reflow_offset_next >> ashift)) {
				rc->rc_shadow_devidx = (b + c) % physical_cols;
				rc->rc_shadow_offset =
				    ((b + c) / physical_cols) << ashift;
				if (row_use_scratch)
					rc->rc_shadow_offset -= VDEV_BOOT_SIZE;
			}

			asize += rc->rc_size;
		}

		/*
		 * See comment in vdev_raidz_map_alloc()
		 *
		 * For single parity, when bit 20 of the zio offset is set,
		 * columns 0 and 1 exchange their device index and offset
		 * (and their shadow counterparts).
		 */
		if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
		    (offset & (1ULL << 20))) {
			ASSERT(rr->rr_cols >= 2);
			ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);

			int devidx0 = rr->rr_col[0].rc_devidx;
			uint64_t offset0 = rr->rr_col[0].rc_offset;
			int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx;
			uint64_t shadow_offset0 =
			    rr->rr_col[0].rc_shadow_offset;

			rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
			rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
			rr->rr_col[0].rc_shadow_devidx =
			    rr->rr_col[1].rc_shadow_devidx;
			rr->rr_col[0].rc_shadow_offset =
			    rr->rr_col[1].rc_shadow_offset;

			rr->rr_col[1].rc_devidx = devidx0;
			rr->rr_col[1].rc_offset = offset0;
			rr->rr_col[1].rc_shadow_devidx = shadow_devidx0;
			rr->rr_col[1].rc_shadow_offset = shadow_offset0;
		}
	}
	/* Every non-empty column is one sector, so the sizes must sum to tot. */
	ASSERT3U(asize, ==, tot << ashift);

	/*
	 * Determine if the block is contiguous, in which case we can use
	 * an aggregation.
	 */
	if (rows >= raidz_io_aggregate_rows) {
		rm->rm_nphys_cols = physical_cols;
		rm->rm_phys_col =
		    kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols,
		    KM_SLEEP);

		/*
		 * Determine the aggregate io's offset and size, and check
		 * that the io is contiguous.
		 *
		 * The inner `break` only leaves the column loop; the outer
		 * loop then stops because rm_phys_col has been set to NULL.
		 */
		for (int i = 0;
		    i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) {
			raidz_row_t *rr = rm->rm_row[i];
			for (int c = 0; c < rr->rr_cols; c++) {
				raidz_col_t *rc = &rr->rr_col[c];
				raidz_col_t *prc =
				    &rm->rm_phys_col[rc->rc_devidx];

				if (rc->rc_size == 0)
					continue;

				if (prc->rc_size == 0) {
					ASSERT0(prc->rc_offset);
					prc->rc_offset = rc->rc_offset;
				} else if (prc->rc_offset + prc->rc_size !=
				    rc->rc_offset) {
					/*
					 * This block is not contiguous and
					 * therefore can't be aggregated.
					 * This is expected to be rare, so
					 * the cost of allocating and then
					 * freeing rm_phys_col is not
					 * significant.
					 */
					kmem_free(rm->rm_phys_col,
					    sizeof (raidz_col_t) *
					    rm->rm_nphys_cols);
					rm->rm_phys_col = NULL;
					rm->rm_nphys_cols = 0;
					break;
				}
				prc->rc_size += rc->rc_size;
			}
		}
	}
	if (rm->rm_phys_col != NULL) {
		/*
		 * Allocate aggregate ABD's.
		 */
		for (int i = 0; i < rm->rm_nphys_cols; i++) {
			raidz_col_t *prc = &rm->rm_phys_col[i];

			prc->rc_devidx = i;

			/* No column of this block landed on this child. */
			if (prc->rc_size == 0)
				continue;

			prc->rc_abd =
			    abd_alloc_linear(rm->rm_phys_col[i].rc_size,
			    B_FALSE);
		}

		/*
		 * Point the parity abd's into the aggregate abd's.
		 */
		for (int i = 0; i < rm->rm_nrows; i++) {
			raidz_row_t *rr = rm->rm_row[i];
			for (int c = 0; c < rr->rr_firstdatacol; c++) {
				raidz_col_t *rc = &rr->rr_col[c];
				raidz_col_t *prc =
				    &rm->rm_phys_col[rc->rc_devidx];
				rc->rc_abd =
				    abd_get_offset_struct(&rc->rc_abdstruct,
				    prc->rc_abd,
				    rc->rc_offset - prc->rc_offset,
				    rc->rc_size);
			}
		}
	} else {
		/*
		 * Allocate new abd's for the parity sectors.
		 */
		for (int i = 0; i < rm->rm_nrows; i++) {
			raidz_row_t *rr = rm->rm_row[i];
			for (int c = 0; c < rr->rr_firstdatacol; c++) {
				raidz_col_t *rc = &rr->rr_col[c];
				rc->rc_abd =
				    abd_alloc_linear(rc->rc_size,
				    B_TRUE);
			}
		}
	}
	/* init RAIDZ parity ops */
	rm->rm_ops = vdev_raidz_math_get_ops();

	return (rm);
}
1050 
/*
 * Write cursors into the parity buffers, passed as the `private` argument
 * of the vdev_raidz_p_func(), vdev_raidz_pq_func() and
 * vdev_raidz_pqr_func() callbacks.  Each callback advances the cursors it
 * uses by one 64-bit word per data word consumed.  Unused cursors are NULL.
 * The field order matters: users initialize this struct positionally.
 */
struct pqr_struct {
	uint64_t *p;	/* next P word to update; always set */
	uint64_t *q;	/* next Q word; NULL for P-only generation */
	uint64_t *r;	/* next R word; NULL unless generating P, Q and R */
};
1056 
1057 static int
1058 vdev_raidz_p_func(void *buf, size_t size, void *private)
1059 {
1060 	struct pqr_struct *pqr = private;
1061 	const uint64_t *src = buf;
1062 	int cnt = size / sizeof (src[0]);
1063 
1064 	ASSERT(pqr->p && !pqr->q && !pqr->r);
1065 
1066 	for (int i = 0; i < cnt; i++, src++, pqr->p++)
1067 		*pqr->p ^= *src;
1068 
1069 	return (0);
1070 }
1071 
1072 static int
1073 vdev_raidz_pq_func(void *buf, size_t size, void *private)
1074 {
1075 	struct pqr_struct *pqr = private;
1076 	const uint64_t *src = buf;
1077 	uint64_t mask;
1078 	int cnt = size / sizeof (src[0]);
1079 
1080 	ASSERT(pqr->p && pqr->q && !pqr->r);
1081 
1082 	for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
1083 		*pqr->p ^= *src;
1084 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
1085 		*pqr->q ^= *src;
1086 	}
1087 
1088 	return (0);
1089 }
1090 
1091 static int
1092 vdev_raidz_pqr_func(void *buf, size_t size, void *private)
1093 {
1094 	struct pqr_struct *pqr = private;
1095 	const uint64_t *src = buf;
1096 	uint64_t mask;
1097 	int cnt = size / sizeof (src[0]);
1098 
1099 	ASSERT(pqr->p && pqr->q && pqr->r);
1100 
1101 	for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
1102 		*pqr->p ^= *src;
1103 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
1104 		*pqr->q ^= *src;
1105 		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
1106 		*pqr->r ^= *src;
1107 	}
1108 
1109 	return (0);
1110 }
1111 
1112 static void
1113 vdev_raidz_generate_parity_p(raidz_row_t *rr)
1114 {
1115 	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1116 
1117 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1118 		abd_t *src = rr->rr_col[c].rc_abd;
1119 
1120 		if (c == rr->rr_firstdatacol) {
1121 			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1122 		} else {
1123 			struct pqr_struct pqr = { p, NULL, NULL };
1124 			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1125 			    vdev_raidz_p_func, &pqr);
1126 		}
1127 	}
1128 }
1129 
1130 static void
1131 vdev_raidz_generate_parity_pq(raidz_row_t *rr)
1132 {
1133 	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1134 	uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1135 	uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
1136 	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1137 	    rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1138 
1139 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1140 		abd_t *src = rr->rr_col[c].rc_abd;
1141 
1142 		uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
1143 
1144 		if (c == rr->rr_firstdatacol) {
1145 			ASSERT(ccnt == pcnt || ccnt == 0);
1146 			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1147 			(void) memcpy(q, p, rr->rr_col[c].rc_size);
1148 
1149 			for (uint64_t i = ccnt; i < pcnt; i++) {
1150 				p[i] = 0;
1151 				q[i] = 0;
1152 			}
1153 		} else {
1154 			struct pqr_struct pqr = { p, q, NULL };
1155 
1156 			ASSERT(ccnt <= pcnt);
1157 			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1158 			    vdev_raidz_pq_func, &pqr);
1159 
1160 			/*
1161 			 * Treat short columns as though they are full of 0s.
1162 			 * Note that there's therefore nothing needed for P.
1163 			 */
1164 			uint64_t mask;
1165 			for (uint64_t i = ccnt; i < pcnt; i++) {
1166 				VDEV_RAIDZ_64MUL_2(q[i], mask);
1167 			}
1168 		}
1169 	}
1170 }
1171 
1172 static void
1173 vdev_raidz_generate_parity_pqr(raidz_row_t *rr)
1174 {
1175 	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1176 	uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1177 	uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd);
1178 	uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
1179 	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1180 	    rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1181 	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1182 	    rr->rr_col[VDEV_RAIDZ_R].rc_size);
1183 
1184 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1185 		abd_t *src = rr->rr_col[c].rc_abd;
1186 
1187 		uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
1188 
1189 		if (c == rr->rr_firstdatacol) {
1190 			ASSERT(ccnt == pcnt || ccnt == 0);
1191 			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1192 			(void) memcpy(q, p, rr->rr_col[c].rc_size);
1193 			(void) memcpy(r, p, rr->rr_col[c].rc_size);
1194 
1195 			for (uint64_t i = ccnt; i < pcnt; i++) {
1196 				p[i] = 0;
1197 				q[i] = 0;
1198 				r[i] = 0;
1199 			}
1200 		} else {
1201 			struct pqr_struct pqr = { p, q, r };
1202 
1203 			ASSERT(ccnt <= pcnt);
1204 			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1205 			    vdev_raidz_pqr_func, &pqr);
1206 
1207 			/*
1208 			 * Treat short columns as though they are full of 0s.
1209 			 * Note that there's therefore nothing needed for P.
1210 			 */
1211 			uint64_t mask;
1212 			for (uint64_t i = ccnt; i < pcnt; i++) {
1213 				VDEV_RAIDZ_64MUL_2(q[i], mask);
1214 				VDEV_RAIDZ_64MUL_4(r[i], mask);
1215 			}
1216 		}
1217 	}
1218 }
1219 
1220 /*
1221  * Generate RAID parity in the first virtual columns according to the number of
1222  * parity columns available.
1223  */
1224 void
1225 vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr)
1226 {
1227 	if (rr->rr_cols == 0) {
1228 		/*
1229 		 * We are handling this block one row at a time (because
1230 		 * this block has a different logical vs physical width,
1231 		 * due to RAIDZ expansion), and this is a pad-only row,
1232 		 * which has no parity.
1233 		 */
1234 		return;
1235 	}
1236 
1237 	/* Generate using the new math implementation */
1238 	if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL)
1239 		return;
1240 
1241 	switch (rr->rr_firstdatacol) {
1242 	case 1:
1243 		vdev_raidz_generate_parity_p(rr);
1244 		break;
1245 	case 2:
1246 		vdev_raidz_generate_parity_pq(rr);
1247 		break;
1248 	case 3:
1249 		vdev_raidz_generate_parity_pqr(rr);
1250 		break;
1251 	default:
1252 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
1253 	}
1254 }
1255 
1256 void
1257 vdev_raidz_generate_parity(raidz_map_t *rm)
1258 {
1259 	for (int i = 0; i < rm->rm_nrows; i++) {
1260 		raidz_row_t *rr = rm->rm_row[i];
1261 		vdev_raidz_generate_parity_row(rm, rr);
1262 	}
1263 }
1264 
/*
 * abd_iterate_func2() callback used by vdev_raidz_reconstruct_p(): XOR the
 * source chunk into the destination chunk, one 64-bit word at a time.
 * Only whole words are processed; `private` is unused.  Always returns 0.
 */
static int
vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
{
	(void) private;
	uint64_t *out = dbuf;
	uint64_t *in = sbuf;
	int nwords = size / sizeof (in[0]);

	for (int left = nwords; left > 0; left--)
		*out++ ^= *in++;

	return (0);
}
1279 
/*
 * abd_iterate_func2() callback used by vdev_raidz_reconstruct_q(): for each
 * 64-bit word, multiply the destination word by 2 (VDEV_RAIDZ_64MUL_2) and
 * then XOR in the matching source word.  `private` is unused.  Always
 * returns 0.
 */
static int
vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
    void *private)
{
	(void) private;
	uint64_t *out = dbuf;
	uint64_t *in = sbuf;
	uint64_t mask;
	int nwords = size / sizeof (out[0]);

	for (int w = 0; w < nwords; w++) {
		VDEV_RAIDZ_64MUL_2(out[w], mask);
		out[w] ^= in[w];
	}

	return (0);
}
1297 
/*
 * abd_iterate_func() callback used by vdev_raidz_reconstruct_q() for the
 * part of the destination that has no matching source data: multiply each
 * 64-bit word by 2.  This is the destination half of
 * vdev_raidz_reconst_q_pre_func().  `private` is unused.  Always returns 0.
 */
static int
vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
{
	(void) private;
	uint64_t *out = buf;
	uint64_t mask;
	int nwords = size / sizeof (out[0]);

	for (int w = 0; w < nwords; w++) {
		VDEV_RAIDZ_64MUL_2(out[w], mask);
	}

	return (0);
}
1313 
/*
 * State passed as `private` to vdev_raidz_reconst_q_post_func().  The field
 * order matters: the struct is initialized positionally.
 */
struct reconst_q_struct {
	uint64_t *q;	/* next Q parity word to read; advanced per word */
	int exp;	/* exponent passed to vdev_raidz_exp2() for each byte */
};
1318 
1319 static int
1320 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
1321 {
1322 	struct reconst_q_struct *rq = private;
1323 	uint64_t *dst = buf;
1324 	int cnt = size / sizeof (dst[0]);
1325 
1326 	for (int i = 0; i < cnt; i++, dst++, rq->q++) {
1327 		int j;
1328 		uint8_t *b;
1329 
1330 		*dst ^= *rq->q;
1331 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
1332 			*b = vdev_raidz_exp2(*b, rq->exp);
1333 		}
1334 	}
1335 
1336 	return (0);
1337 }
1338 
/*
 * State passed as `private` to vdev_raidz_reconst_pq_func() and
 * vdev_raidz_reconst_pq_tail_func().  The four byte cursors advance by one
 * per byte processed.  The field order matters: the struct is initialized
 * positionally.
 */
struct reconst_pq_struct {
	uint8_t *p;	/* saved (actual) P parity */
	uint8_t *q;	/* saved (actual) Q parity */
	uint8_t *pxy;	/* P recomputed with columns x and y treated as zero */
	uint8_t *qxy;	/* Q recomputed with columns x and y treated as zero */
	int aexp;	/* exponent applied to (P ^ Pxy) via vdev_raidz_exp2() */
	int bexp;	/* exponent applied to (Q ^ Qxy) via vdev_raidz_exp2() */
};
1347 
1348 static int
1349 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
1350 {
1351 	struct reconst_pq_struct *rpq = private;
1352 	uint8_t *xd = xbuf;
1353 	uint8_t *yd = ybuf;
1354 
1355 	for (int i = 0; i < size;
1356 	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
1357 		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
1358 		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
1359 		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
1360 	}
1361 
1362 	return (0);
1363 }
1364 
1365 static int
1366 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
1367 {
1368 	struct reconst_pq_struct *rpq = private;
1369 	uint8_t *xd = xbuf;
1370 
1371 	for (int i = 0; i < size;
1372 	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
1373 		/* same operation as vdev_raidz_reconst_pq_func() on xd */
1374 		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
1375 		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
1376 	}
1377 
1378 	return (0);
1379 }
1380 
1381 static void
1382 vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts)
1383 {
1384 	int x = tgts[0];
1385 	abd_t *dst, *src;
1386 
1387 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1388 		zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x);
1389 
1390 	ASSERT3U(ntgts, ==, 1);
1391 	ASSERT3U(x, >=, rr->rr_firstdatacol);
1392 	ASSERT3U(x, <, rr->rr_cols);
1393 
1394 	ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size);
1395 
1396 	src = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
1397 	dst = rr->rr_col[x].rc_abd;
1398 
1399 	abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size);
1400 
1401 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1402 		uint64_t size = MIN(rr->rr_col[x].rc_size,
1403 		    rr->rr_col[c].rc_size);
1404 
1405 		src = rr->rr_col[c].rc_abd;
1406 
1407 		if (c == x)
1408 			continue;
1409 
1410 		(void) abd_iterate_func2(dst, src, 0, 0, size,
1411 		    vdev_raidz_reconst_p_func, NULL);
1412 	}
1413 }
1414 
1415 static void
1416 vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
1417 {
1418 	int x = tgts[0];
1419 	int c, exp;
1420 	abd_t *dst, *src;
1421 
1422 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1423 		zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x);
1424 
1425 	ASSERT(ntgts == 1);
1426 
1427 	ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1428 
1429 	for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1430 		uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size,
1431 		    rr->rr_col[c].rc_size);
1432 
1433 		src = rr->rr_col[c].rc_abd;
1434 		dst = rr->rr_col[x].rc_abd;
1435 
1436 		if (c == rr->rr_firstdatacol) {
1437 			abd_copy(dst, src, size);
1438 			if (rr->rr_col[x].rc_size > size) {
1439 				abd_zero_off(dst, size,
1440 				    rr->rr_col[x].rc_size - size);
1441 			}
1442 		} else {
1443 			ASSERT3U(size, <=, rr->rr_col[x].rc_size);
1444 			(void) abd_iterate_func2(dst, src, 0, 0, size,
1445 			    vdev_raidz_reconst_q_pre_func, NULL);
1446 			(void) abd_iterate_func(dst,
1447 			    size, rr->rr_col[x].rc_size - size,
1448 			    vdev_raidz_reconst_q_pre_tail_func, NULL);
1449 		}
1450 	}
1451 
1452 	src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
1453 	dst = rr->rr_col[x].rc_abd;
1454 	exp = 255 - (rr->rr_cols - 1 - x);
1455 
1456 	struct reconst_q_struct rq = { abd_to_buf(src), exp };
1457 	(void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size,
1458 	    vdev_raidz_reconst_q_post_func, &rq);
1459 }
1460 
1461 static void
1462 vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
1463 {
1464 	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
1465 	abd_t *pdata, *qdata;
1466 	uint64_t xsize, ysize;
1467 	int x = tgts[0];
1468 	int y = tgts[1];
1469 	abd_t *xd, *yd;
1470 
1471 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1472 		zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y);
1473 
1474 	ASSERT(ntgts == 2);
1475 	ASSERT(x < y);
1476 	ASSERT(x >= rr->rr_firstdatacol);
1477 	ASSERT(y < rr->rr_cols);
1478 
1479 	ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size);
1480 
1481 	/*
1482 	 * Move the parity data aside -- we're going to compute parity as
1483 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
1484 	 * reuse the parity generation mechanism without trashing the actual
1485 	 * parity so we make those columns appear to be full of zeros by
1486 	 * setting their lengths to zero.
1487 	 */
1488 	pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
1489 	qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
1490 	xsize = rr->rr_col[x].rc_size;
1491 	ysize = rr->rr_col[y].rc_size;
1492 
1493 	rr->rr_col[VDEV_RAIDZ_P].rc_abd =
1494 	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
1495 	rr->rr_col[VDEV_RAIDZ_Q].rc_abd =
1496 	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
1497 	rr->rr_col[x].rc_size = 0;
1498 	rr->rr_col[y].rc_size = 0;
1499 
1500 	vdev_raidz_generate_parity_pq(rr);
1501 
1502 	rr->rr_col[x].rc_size = xsize;
1503 	rr->rr_col[y].rc_size = ysize;
1504 
1505 	p = abd_to_buf(pdata);
1506 	q = abd_to_buf(qdata);
1507 	pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1508 	qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1509 	xd = rr->rr_col[x].rc_abd;
1510 	yd = rr->rr_col[y].rc_abd;
1511 
1512 	/*
1513 	 * We now have:
1514 	 *	Pxy = P + D_x + D_y
1515 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
1516 	 *
1517 	 * We can then solve for D_x:
1518 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
1519 	 * where
1520 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
1521 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
1522 	 *
1523 	 * With D_x in hand, we can easily solve for D_y:
1524 	 *	D_y = P + Pxy + D_x
1525 	 */
1526 
1527 	a = vdev_raidz_pow2[255 + x - y];
1528 	b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)];
1529 	tmp = 255 - vdev_raidz_log2[a ^ 1];
1530 
1531 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
1532 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
1533 
1534 	ASSERT3U(xsize, >=, ysize);
1535 	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
1536 
1537 	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
1538 	    vdev_raidz_reconst_pq_func, &rpq);
1539 	(void) abd_iterate_func(xd, ysize, xsize - ysize,
1540 	    vdev_raidz_reconst_pq_tail_func, &rpq);
1541 
1542 	abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1543 	abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1544 
1545 	/*
1546 	 * Restore the saved parity data.
1547 	 */
1548 	rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata;
1549 	rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;
1550 }
1551 
1552 /*
1553  * In the general case of reconstruction, we must solve the system of linear
1554  * equations defined by the coefficients used to generate parity as well as
1555  * the contents of the data and parity disks. This can be expressed with
1556  * vectors for the original data (D) and the actual data (d) and parity (p)
1557  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
1558  *
1559  *            __   __                     __     __
1560  *            |     |         __     __   |  p_0  |
1561  *            |  V  |         |  D_0  |   | p_m-1 |
1562  *            |     |    x    |   :   | = |  d_0  |
1563  *            |  I  |         | D_n-1 |   |   :   |
1564  *            |     |         ~~     ~~   | d_n-1 |
1565  *            ~~   ~~                     ~~     ~~
1566  *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity and speedy
 * computation as well as linear separability.
1571  *
1572  *      __               __               __     __
1573  *      |   1   ..  1 1 1 |               |  p_0  |
1574  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
1575  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
1576  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
1577  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
1578  *      |   :       : : : |   |   :   |   |  d_2  |
1579  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
1580  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
1581  *      |   0   ..  0 0 1 |               | d_n-1 |
1582  *      ~~               ~~               ~~     ~~
1583  *
1584  * Note that I, V, d, and p are known. To compute D, we must invert the
1585  * matrix and use the known data and parity values to reconstruct the unknown
1586  * data values. We begin by removing the rows in V|I and d|p that correspond
1587  * to failed or missing columns; we then make V|I square (n x n) and d|p
1588  * sized n by removing rows corresponding to unused parity from the bottom up
1589  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
1590  * using Gauss-Jordan elimination. In the example below we use m=3 parity
1591  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
1592  *           __                               __
1593  *           |  1   1   1   1   1   1   1   1  |
1594  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
1595  *           |  19 205 116  29  64  16  4   1  |      / /
1596  *           |  1   0   0   0   0   0   0   0  |     / /
1597  *           |  0   1   0   0   0   0   0   0  | <--' /
1598  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
1599  *           |  0   0   0   1   0   0   0   0  |
1600  *           |  0   0   0   0   1   0   0   0  |
1601  *           |  0   0   0   0   0   1   0   0  |
1602  *           |  0   0   0   0   0   0   1   0  |
1603  *           |  0   0   0   0   0   0   0   1  |
1604  *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           |  19 205 116  29  64  16  4   1  |
 *           |  1   0   0   0   0   0   0   0  |
 *  (V|I)' = |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *
1618  *
1619  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1620  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1621  * matrix is not singular.
1622  * __                                                                 __
1623  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1624  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1625  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1626  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1627  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1628  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1629  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1630  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1631  * ~~                                                                 ~~
1632  * __                                                                 __
1633  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1634  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1635  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1636  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1637  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1638  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1639  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1640  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1641  * ~~                                                                 ~~
1642  * __                                                                 __
1643  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1644  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1645  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1646  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1647  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1648  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1649  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1650  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1651  * ~~                                                                 ~~
1652  * __                                                                 __
1653  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1654  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1655  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1656  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1657  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1658  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1659  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1660  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1661  * ~~                                                                 ~~
1662  * __                                                                 __
1663  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1664  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1665  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1666  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1667  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1668  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1669  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1670  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1671  * ~~                                                                 ~~
1672  * __                                                                 __
1673  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1674  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1675  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1676  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1677  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1678  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1679  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1680  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1681  * ~~                                                                 ~~
1682  *                   __                               __
1683  *                   |  0   0   1   0   0   0   0   0  |
1684  *                   | 167 100  5   41 159 169 217 208 |
1685  *                   | 166 100  4   40 158 168 216 209 |
1686  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1687  *                   |  0   0   0   0   1   0   0   0  |
1688  *                   |  0   0   0   0   0   1   0   0  |
1689  *                   |  0   0   0   0   0   0   1   0  |
1690  *                   |  0   0   0   0   0   0   0   1  |
1691  *                   ~~                               ~~
1692  *
1693  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1694  * of the missing data.
1695  *
1696  * As is apparent from the example above, the only non-trivial rows in the
1697  * inverse matrix correspond to the data disks that we're trying to
1698  * reconstruct. Indeed, those are the only rows we need as the others would
1699  * only be useful for reconstructing data known or assumed to be valid. For
1700  * that reason, we only build the coefficients in the rows that correspond to
1701  * targeted columns.
1702  */
1703 
1704 static void
1705 vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map,
1706     uint8_t **rows)
1707 {
1708 	int i, j;
1709 	int pow;
1710 
1711 	ASSERT(n == rr->rr_cols - rr->rr_firstdatacol);
1712 
1713 	/*
1714 	 * Fill in the missing rows of interest.
1715 	 */
1716 	for (i = 0; i < nmap; i++) {
1717 		ASSERT3S(0, <=, map[i]);
1718 		ASSERT3S(map[i], <=, 2);
1719 
1720 		pow = map[i] * n;
1721 		if (pow > 255)
1722 			pow -= 255;
1723 		ASSERT(pow <= 255);
1724 
1725 		for (j = 0; j < n; j++) {
1726 			pow -= map[i];
1727 			if (pow < 0)
1728 				pow += 255;
1729 			rows[i][j] = vdev_raidz_pow2[pow];
1730 		}
1731 	}
1732 }
1733 
1734 static void
1735 vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing,
1736     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1737 {
1738 	int i, j, ii, jj;
1739 	uint8_t log;
1740 
1741 	/*
1742 	 * Assert that the first nmissing entries from the array of used
1743 	 * columns correspond to parity columns and that subsequent entries
1744 	 * correspond to data columns.
1745 	 */
1746 	for (i = 0; i < nmissing; i++) {
1747 		ASSERT3S(used[i], <, rr->rr_firstdatacol);
1748 	}
1749 	for (; i < n; i++) {
1750 		ASSERT3S(used[i], >=, rr->rr_firstdatacol);
1751 	}
1752 
1753 	/*
1754 	 * First initialize the storage where we'll compute the inverse rows.
1755 	 */
1756 	for (i = 0; i < nmissing; i++) {
1757 		for (j = 0; j < n; j++) {
1758 			invrows[i][j] = (i == j) ? 1 : 0;
1759 		}
1760 	}
1761 
1762 	/*
1763 	 * Subtract all trivial rows from the rows of consequence.
1764 	 */
1765 	for (i = 0; i < nmissing; i++) {
1766 		for (j = nmissing; j < n; j++) {
1767 			ASSERT3U(used[j], >=, rr->rr_firstdatacol);
1768 			jj = used[j] - rr->rr_firstdatacol;
1769 			ASSERT3S(jj, <, n);
1770 			invrows[i][j] = rows[i][jj];
1771 			rows[i][jj] = 0;
1772 		}
1773 	}
1774 
1775 	/*
1776 	 * For each of the rows of interest, we must normalize it and subtract
1777 	 * a multiple of it from the other rows.
1778 	 */
1779 	for (i = 0; i < nmissing; i++) {
1780 		for (j = 0; j < missing[i]; j++) {
1781 			ASSERT0(rows[i][j]);
1782 		}
1783 		ASSERT3U(rows[i][missing[i]], !=, 0);
1784 
1785 		/*
1786 		 * Compute the inverse of the first element and multiply each
1787 		 * element in the row by that value.
1788 		 */
1789 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1790 
1791 		for (j = 0; j < n; j++) {
1792 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1793 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1794 		}
1795 
1796 		for (ii = 0; ii < nmissing; ii++) {
1797 			if (i == ii)
1798 				continue;
1799 
1800 			ASSERT3U(rows[ii][missing[i]], !=, 0);
1801 
1802 			log = vdev_raidz_log2[rows[ii][missing[i]]];
1803 
1804 			for (j = 0; j < n; j++) {
1805 				rows[ii][j] ^=
1806 				    vdev_raidz_exp2(rows[i][j], log);
1807 				invrows[ii][j] ^=
1808 				    vdev_raidz_exp2(invrows[i][j], log);
1809 			}
1810 		}
1811 	}
1812 
1813 	/*
1814 	 * Verify that the data that is left in the rows are properly part of
1815 	 * an identity matrix.
1816 	 */
1817 	for (i = 0; i < nmissing; i++) {
1818 		for (j = 0; j < n; j++) {
1819 			if (j == missing[i]) {
1820 				ASSERT3U(rows[i][j], ==, 1);
1821 			} else {
1822 				ASSERT0(rows[i][j]);
1823 			}
1824 		}
1825 	}
1826 }
1827 
1828 static void
1829 vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
1830     int *missing, uint8_t **invrows, const uint8_t *used)
1831 {
1832 	int i, j, x, cc, c;
1833 	uint8_t *src;
1834 	uint64_t ccount;
1835 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL };
1836 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 };
1837 	uint8_t log = 0;
1838 	uint8_t val;
1839 	int ll;
1840 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1841 	uint8_t *p, *pp;
1842 	size_t psize;
1843 
1844 	psize = sizeof (invlog[0][0]) * n * nmissing;
1845 	p = kmem_alloc(psize, KM_SLEEP);
1846 
1847 	for (pp = p, i = 0; i < nmissing; i++) {
1848 		invlog[i] = pp;
1849 		pp += n;
1850 	}
1851 
1852 	for (i = 0; i < nmissing; i++) {
1853 		for (j = 0; j < n; j++) {
1854 			ASSERT3U(invrows[i][j], !=, 0);
1855 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1856 		}
1857 	}
1858 
1859 	for (i = 0; i < n; i++) {
1860 		c = used[i];
1861 		ASSERT3U(c, <, rr->rr_cols);
1862 
1863 		ccount = rr->rr_col[c].rc_size;
1864 		ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0);
1865 		if (ccount == 0)
1866 			continue;
1867 		src = abd_to_buf(rr->rr_col[c].rc_abd);
1868 		for (j = 0; j < nmissing; j++) {
1869 			cc = missing[j] + rr->rr_firstdatacol;
1870 			ASSERT3U(cc, >=, rr->rr_firstdatacol);
1871 			ASSERT3U(cc, <, rr->rr_cols);
1872 			ASSERT3U(cc, !=, c);
1873 
1874 			dcount[j] = rr->rr_col[cc].rc_size;
1875 			if (dcount[j] != 0)
1876 				dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd);
1877 		}
1878 
1879 		for (x = 0; x < ccount; x++, src++) {
1880 			if (*src != 0)
1881 				log = vdev_raidz_log2[*src];
1882 
1883 			for (cc = 0; cc < nmissing; cc++) {
1884 				if (x >= dcount[cc])
1885 					continue;
1886 
1887 				if (*src == 0) {
1888 					val = 0;
1889 				} else {
1890 					if ((ll = log + invlog[cc][i]) >= 255)
1891 						ll -= 255;
1892 					val = vdev_raidz_pow2[ll];
1893 				}
1894 
1895 				if (i == 0)
1896 					dst[cc][x] = val;
1897 				else
1898 					dst[cc][x] ^= val;
1899 			}
1900 		}
1901 	}
1902 
1903 	kmem_free(p, psize);
1904 }
1905 
1906 static void
1907 vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
1908 {
1909 	int i, c, t, tt;
1910 	unsigned int n;
1911 	unsigned int nmissing_rows;
1912 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1913 	int parity_map[VDEV_RAIDZ_MAXPARITY];
1914 	uint8_t *p, *pp;
1915 	size_t psize;
1916 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1917 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1918 	uint8_t *used;
1919 
1920 	abd_t **bufs = NULL;
1921 
1922 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1923 		zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts);
1924 	/*
1925 	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
1926 	 * temporary linear ABDs if any non-linear ABDs are found.
1927 	 */
1928 	for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) {
1929 		ASSERT(rr->rr_col[i].rc_abd != NULL);
1930 		if (!abd_is_linear(rr->rr_col[i].rc_abd)) {
1931 			bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *),
1932 			    KM_PUSHPAGE);
1933 
1934 			for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1935 				raidz_col_t *col = &rr->rr_col[c];
1936 
1937 				bufs[c] = col->rc_abd;
1938 				if (bufs[c] != NULL) {
1939 					col->rc_abd = abd_alloc_linear(
1940 					    col->rc_size, B_TRUE);
1941 					abd_copy(col->rc_abd, bufs[c],
1942 					    col->rc_size);
1943 				}
1944 			}
1945 
1946 			break;
1947 		}
1948 	}
1949 
1950 	n = rr->rr_cols - rr->rr_firstdatacol;
1951 
1952 	/*
1953 	 * Figure out which data columns are missing.
1954 	 */
1955 	nmissing_rows = 0;
1956 	for (t = 0; t < ntgts; t++) {
1957 		if (tgts[t] >= rr->rr_firstdatacol) {
1958 			missing_rows[nmissing_rows++] =
1959 			    tgts[t] - rr->rr_firstdatacol;
1960 		}
1961 	}
1962 
1963 	/*
1964 	 * Figure out which parity columns to use to help generate the missing
1965 	 * data columns.
1966 	 */
1967 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1968 		ASSERT(tt < ntgts);
1969 		ASSERT(c < rr->rr_firstdatacol);
1970 
1971 		/*
1972 		 * Skip any targeted parity columns.
1973 		 */
1974 		if (c == tgts[tt]) {
1975 			tt++;
1976 			continue;
1977 		}
1978 
1979 		parity_map[i] = c;
1980 		i++;
1981 	}
1982 
1983 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1984 	    nmissing_rows * n + sizeof (used[0]) * n;
1985 	p = kmem_alloc(psize, KM_SLEEP);
1986 
1987 	for (pp = p, i = 0; i < nmissing_rows; i++) {
1988 		rows[i] = pp;
1989 		pp += n;
1990 		invrows[i] = pp;
1991 		pp += n;
1992 	}
1993 	used = pp;
1994 
1995 	for (i = 0; i < nmissing_rows; i++) {
1996 		used[i] = parity_map[i];
1997 	}
1998 
1999 	for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
2000 		if (tt < nmissing_rows &&
2001 		    c == missing_rows[tt] + rr->rr_firstdatacol) {
2002 			tt++;
2003 			continue;
2004 		}
2005 
2006 		ASSERT3S(i, <, n);
2007 		used[i] = c;
2008 		i++;
2009 	}
2010 
2011 	/*
2012 	 * Initialize the interesting rows of the matrix.
2013 	 */
2014 	vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows);
2015 
2016 	/*
2017 	 * Invert the matrix.
2018 	 */
2019 	vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows,
2020 	    invrows, used);
2021 
2022 	/*
2023 	 * Reconstruct the missing data using the generated matrix.
2024 	 */
2025 	vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows,
2026 	    invrows, used);
2027 
2028 	kmem_free(p, psize);
2029 
2030 	/*
2031 	 * copy back from temporary linear abds and free them
2032 	 */
2033 	if (bufs) {
2034 		for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
2035 			raidz_col_t *col = &rr->rr_col[c];
2036 
2037 			if (bufs[c] != NULL) {
2038 				abd_copy(bufs[c], col->rc_abd, col->rc_size);
2039 				abd_free(col->rc_abd);
2040 			}
2041 			col->rc_abd = bufs[c];
2042 		}
2043 		kmem_free(bufs, rr->rr_cols * sizeof (abd_t *));
2044 	}
2045 }
2046 
2047 static void
2048 vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
2049     const int *t, int nt)
2050 {
2051 	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
2052 	int ntgts;
2053 	int i, c, ret;
2054 	int nbadparity, nbaddata;
2055 	int parity_valid[VDEV_RAIDZ_MAXPARITY];
2056 
2057 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
2058 		zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)",
2059 		    rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata,
2060 		    (int)rr->rr_missingparity);
2061 	}
2062 
2063 	nbadparity = rr->rr_firstdatacol;
2064 	nbaddata = rr->rr_cols - nbadparity;
2065 	ntgts = 0;
2066 	for (i = 0, c = 0; c < rr->rr_cols; c++) {
2067 		if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
2068 			zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u "
2069 			    "offset=%llx error=%u)",
2070 			    rr, c, (int)rr->rr_col[c].rc_devidx,
2071 			    (long long)rr->rr_col[c].rc_offset,
2072 			    (int)rr->rr_col[c].rc_error);
2073 		}
2074 		if (c < rr->rr_firstdatacol)
2075 			parity_valid[c] = B_FALSE;
2076 
2077 		if (i < nt && c == t[i]) {
2078 			tgts[ntgts++] = c;
2079 			i++;
2080 		} else if (rr->rr_col[c].rc_error != 0) {
2081 			tgts[ntgts++] = c;
2082 		} else if (c >= rr->rr_firstdatacol) {
2083 			nbaddata--;
2084 		} else {
2085 			parity_valid[c] = B_TRUE;
2086 			nbadparity--;
2087 		}
2088 	}
2089 
2090 	ASSERT(ntgts >= nt);
2091 	ASSERT(nbaddata >= 0);
2092 	ASSERT(nbaddata + nbadparity == ntgts);
2093 
2094 	dt = &tgts[nbadparity];
2095 
2096 	/* Reconstruct using the new math implementation */
2097 	ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata);
2098 	if (ret != RAIDZ_ORIGINAL_IMPL)
2099 		return;
2100 
2101 	/*
2102 	 * See if we can use any of our optimized reconstruction routines.
2103 	 */
2104 	switch (nbaddata) {
2105 	case 1:
2106 		if (parity_valid[VDEV_RAIDZ_P]) {
2107 			vdev_raidz_reconstruct_p(rr, dt, 1);
2108 			return;
2109 		}
2110 
2111 		ASSERT(rr->rr_firstdatacol > 1);
2112 
2113 		if (parity_valid[VDEV_RAIDZ_Q]) {
2114 			vdev_raidz_reconstruct_q(rr, dt, 1);
2115 			return;
2116 		}
2117 
2118 		ASSERT(rr->rr_firstdatacol > 2);
2119 		break;
2120 
2121 	case 2:
2122 		ASSERT(rr->rr_firstdatacol > 1);
2123 
2124 		if (parity_valid[VDEV_RAIDZ_P] &&
2125 		    parity_valid[VDEV_RAIDZ_Q]) {
2126 			vdev_raidz_reconstruct_pq(rr, dt, 2);
2127 			return;
2128 		}
2129 
2130 		ASSERT(rr->rr_firstdatacol > 2);
2131 
2132 		break;
2133 	}
2134 
2135 	vdev_raidz_reconstruct_general(rr, tgts, ntgts);
2136 }
2137 
2138 static int
2139 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
2140     uint64_t *logical_ashift, uint64_t *physical_ashift)
2141 {
2142 	vdev_raidz_t *vdrz = vd->vdev_tsd;
2143 	uint64_t nparity = vdrz->vd_nparity;
2144 	int c;
2145 	int lasterror = 0;
2146 	int numerrors = 0;
2147 
2148 	ASSERT(nparity > 0);
2149 
2150 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
2151 	    vd->vdev_children < nparity + 1) {
2152 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
2153 		return (SET_ERROR(EINVAL));
2154 	}
2155 
2156 	vdev_open_children(vd);
2157 
2158 	for (c = 0; c < vd->vdev_children; c++) {
2159 		vdev_t *cvd = vd->vdev_child[c];
2160 
2161 		if (cvd->vdev_open_error != 0) {
2162 			lasterror = cvd->vdev_open_error;
2163 			numerrors++;
2164 			continue;
2165 		}
2166 
2167 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
2168 		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
2169 		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
2170 	}
2171 	for (c = 0; c < vd->vdev_children; c++) {
2172 		vdev_t *cvd = vd->vdev_child[c];
2173 
2174 		if (cvd->vdev_open_error != 0)
2175 			continue;
2176 		*physical_ashift = vdev_best_ashift(*logical_ashift,
2177 		    *physical_ashift, cvd->vdev_physical_ashift);
2178 	}
2179 
2180 	if (vd->vdev_rz_expanding) {
2181 		*asize *= vd->vdev_children - 1;
2182 		*max_asize *= vd->vdev_children - 1;
2183 
2184 		vd->vdev_min_asize = *asize;
2185 	} else {
2186 		*asize *= vd->vdev_children;
2187 		*max_asize *= vd->vdev_children;
2188 	}
2189 
2190 	if (numerrors > nparity) {
2191 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
2192 		return (lasterror);
2193 	}
2194 
2195 	return (0);
2196 }
2197 
2198 static void
2199 vdev_raidz_close(vdev_t *vd)
2200 {
2201 	for (int c = 0; c < vd->vdev_children; c++) {
2202 		if (vd->vdev_child[c] != NULL)
2203 			vdev_close(vd->vdev_child[c]);
2204 	}
2205 }
2206 
2207 /*
2208  * Return the logical width to use, given the txg in which the allocation
2209  * happened.  Note that BP_GET_BIRTH() is usually the txg in which the
2210  * BP was allocated.  Remapped BP's (that were relocated due to device
2211  * removal, see remap_blkptr_cb()), will have a more recent physical birth
2212  * which reflects when the BP was relocated, but we can ignore these because
2213  * they can't be on RAIDZ (device removal doesn't support RAIDZ).
2214  */
2215 static uint64_t
2216 vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg)
2217 {
2218 	reflow_node_t lookup = {
2219 		.re_txg = txg,
2220 	};
2221 	avl_index_t where;
2222 
2223 	uint64_t width;
2224 	mutex_enter(&vdrz->vd_expand_lock);
2225 	reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where);
2226 	if (re != NULL) {
2227 		width = re->re_logical_width;
2228 	} else {
2229 		re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE);
2230 		if (re != NULL)
2231 			width = re->re_logical_width;
2232 		else
2233 			width = vdrz->vd_original_width;
2234 	}
2235 	mutex_exit(&vdrz->vd_expand_lock);
2236 	return (width);
2237 }
2238 /*
2239  * This code converts an asize into the largest psize that can safely be written
2240  * to an allocation of that size for this vdev.
2241  *
2242  * Note that this function will not take into account the effect of gang
2243  * headers, which also modify the ASIZE of the DVAs. It is purely a reverse of
2244  * the psize_to_asize function.
2245  */
2246 static uint64_t
2247 vdev_raidz_asize_to_psize(vdev_t *vd, uint64_t asize, uint64_t txg)
2248 {
2249 	vdev_raidz_t *vdrz = vd->vdev_tsd;
2250 	uint64_t psize;
2251 	uint64_t ashift = vd->vdev_top->vdev_ashift;
2252 	uint64_t cols = vdrz->vd_original_width;
2253 	uint64_t nparity = vdrz->vd_nparity;
2254 
2255 	cols = vdev_raidz_get_logical_width(vdrz, txg);
2256 
2257 	ASSERT0(asize % (1 << ashift));
2258 
2259 	psize = (asize >> ashift);
2260 	psize -= nparity * DIV_ROUND_UP(psize, cols);
2261 	psize <<= ashift;
2262 
2263 	return (asize);
2264 }
2265 
2266 /*
2267  * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated
2268  * more space due to the lower data-to-parity ratio.  In this case it's
2269  * important to pass in the correct txg.  Note that vdev_gang_header_asize()
2270  * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE,
2271  * regardless of txg.  This is assured because for a single data sector, we
2272  * allocate P+1 sectors regardless of width ("cols", which is at least P+1).
2273  */
2274 static uint64_t
2275 vdev_raidz_psize_to_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
2276 {
2277 	vdev_raidz_t *vdrz = vd->vdev_tsd;
2278 	uint64_t asize;
2279 	uint64_t ashift = vd->vdev_top->vdev_ashift;
2280 	uint64_t cols = vdrz->vd_original_width;
2281 	uint64_t nparity = vdrz->vd_nparity;
2282 
2283 	cols = vdev_raidz_get_logical_width(vdrz, txg);
2284 
2285 	asize = ((psize - 1) >> ashift) + 1;
2286 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
2287 	asize = roundup(asize, nparity + 1) << ashift;
2288 
2289 #ifdef ZFS_DEBUG
2290 	uint64_t asize_new = ((psize - 1) >> ashift) + 1;
2291 	uint64_t ncols_new = vdrz->vd_physical_width;
2292 	asize_new += nparity * ((asize_new + ncols_new - nparity - 1) /
2293 	    (ncols_new - nparity));
2294 	asize_new = roundup(asize_new, nparity + 1) << ashift;
2295 	VERIFY3U(asize_new, <=, asize);
2296 #endif
2297 
2298 	return (asize);
2299 }
2300 
2301 /*
2302  * The allocatable space for a raidz vdev is N * sizeof(smallest child)
2303  * so each child must provide at least 1/Nth of its asize.
2304  */
2305 static uint64_t
2306 vdev_raidz_min_asize(vdev_t *vd)
2307 {
2308 	return ((vd->vdev_min_asize + vd->vdev_children - 1) /
2309 	    vd->vdev_children);
2310 }
2311 
2312 void
2313 vdev_raidz_child_done(zio_t *zio)
2314 {
2315 	raidz_col_t *rc = zio->io_private;
2316 
2317 	ASSERT3P(rc->rc_abd, !=, NULL);
2318 	rc->rc_error = zio->io_error;
2319 	rc->rc_tried = 1;
2320 	rc->rc_skipped = 0;
2321 }
2322 
2323 static void
2324 vdev_raidz_shadow_child_done(zio_t *zio)
2325 {
2326 	raidz_col_t *rc = zio->io_private;
2327 
2328 	rc->rc_shadow_error = zio->io_error;
2329 }
2330 
2331 static void
2332 vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col)
2333 {
2334 	(void) rm;
2335 #ifdef ZFS_DEBUG
2336 	zfs_range_seg64_t logical_rs, physical_rs, remain_rs;
2337 	logical_rs.rs_start = rr->rr_offset;
2338 	logical_rs.rs_end = logical_rs.rs_start +
2339 	    vdev_raidz_psize_to_asize(zio->io_vd, rr->rr_size,
2340 	    BP_GET_BIRTH(zio->io_bp));
2341 
2342 	raidz_col_t *rc = &rr->rr_col[col];
2343 	vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
2344 
2345 	vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
2346 	ASSERT(vdev_xlate_is_empty(&remain_rs));
2347 	if (vdev_xlate_is_empty(&physical_rs)) {
2348 		/*
2349 		 * If we are in the middle of expansion, the
2350 		 * physical->logical mapping is changing so vdev_xlate()
2351 		 * can't give us a reliable answer.
2352 		 */
2353 		return;
2354 	}
2355 	ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
2356 	ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
2357 	/*
2358 	 * It would be nice to assert that rs_end is equal
2359 	 * to rc_offset + rc_size but there might be an
2360 	 * optional I/O at the end that is not accounted in
2361 	 * rc_size.
2362 	 */
2363 	if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
2364 		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
2365 		    rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift));
2366 	} else {
2367 		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
2368 	}
2369 #endif
2370 }
2371 
2372 static void
2373 vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr)
2374 {
2375 	vdev_t *vd = zio->io_vd;
2376 	raidz_map_t *rm = zio->io_vsd;
2377 
2378 	vdev_raidz_generate_parity_row(rm, rr);
2379 
2380 	for (int c = 0; c < rr->rr_scols; c++) {
2381 		raidz_col_t *rc = &rr->rr_col[c];
2382 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2383 
2384 		/* Verify physical to logical translation */
2385 		vdev_raidz_io_verify(zio, rm, rr, c);
2386 
2387 		if (rc->rc_size == 0)
2388 			continue;
2389 
2390 		ASSERT3U(rc->rc_offset + rc->rc_size, <,
2391 		    cvd->vdev_psize - VDEV_LABEL_END_SIZE);
2392 
2393 		ASSERT3P(rc->rc_abd, !=, NULL);
2394 		zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2395 		    rc->rc_offset, rc->rc_abd,
2396 		    abd_get_size(rc->rc_abd), zio->io_type,
2397 		    zio->io_priority, 0, vdev_raidz_child_done, rc));
2398 
2399 		if (rc->rc_shadow_devidx != INT_MAX) {
2400 			vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx];
2401 
2402 			ASSERT3U(
2403 			    rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <,
2404 			    cvd2->vdev_psize - VDEV_LABEL_END_SIZE);
2405 
2406 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd2,
2407 			    rc->rc_shadow_offset, rc->rc_abd,
2408 			    abd_get_size(rc->rc_abd),
2409 			    zio->io_type, zio->io_priority, 0,
2410 			    vdev_raidz_shadow_child_done, rc));
2411 		}
2412 	}
2413 }
2414 
2415 /*
2416  * Generate optional I/Os for skip sectors to improve aggregation contiguity.
2417  * This only works for vdev_raidz_map_alloc() (not _expanded()).
2418  */
2419 static void
2420 raidz_start_skip_writes(zio_t *zio)
2421 {
2422 	vdev_t *vd = zio->io_vd;
2423 	uint64_t ashift = vd->vdev_top->vdev_ashift;
2424 	raidz_map_t *rm = zio->io_vsd;
2425 	ASSERT3U(rm->rm_nrows, ==, 1);
2426 	raidz_row_t *rr = rm->rm_row[0];
2427 	for (int c = 0; c < rr->rr_scols; c++) {
2428 		raidz_col_t *rc = &rr->rr_col[c];
2429 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2430 		if (rc->rc_size != 0)
2431 			continue;
2432 		ASSERT3P(rc->rc_abd, ==, NULL);
2433 
2434 		ASSERT3U(rc->rc_offset, <,
2435 		    cvd->vdev_psize - VDEV_LABEL_END_SIZE);
2436 
2437 		zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset,
2438 		    NULL, 1ULL << ashift, zio->io_type, zio->io_priority,
2439 		    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
2440 	}
2441 }
2442 
2443 static void
2444 vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
2445 {
2446 	vdev_t *vd = zio->io_vd;
2447 
2448 	/*
2449 	 * Iterate over the columns in reverse order so that we hit the parity
2450 	 * last -- any errors along the way will force us to read the parity.
2451 	 */
2452 	for (int c = rr->rr_cols - 1; c >= 0; c--) {
2453 		raidz_col_t *rc = &rr->rr_col[c];
2454 		if (rc->rc_size == 0)
2455 			continue;
2456 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2457 		if (!vdev_readable(cvd)) {
2458 			if (c >= rr->rr_firstdatacol)
2459 				rr->rr_missingdata++;
2460 			else
2461 				rr->rr_missingparity++;
2462 			rc->rc_error = SET_ERROR(ENXIO);
2463 			rc->rc_tried = 1;	/* don't even try */
2464 			rc->rc_skipped = 1;
2465 			continue;
2466 		}
2467 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2468 			if (c >= rr->rr_firstdatacol)
2469 				rr->rr_missingdata++;
2470 			else
2471 				rr->rr_missingparity++;
2472 			rc->rc_error = SET_ERROR(ESTALE);
2473 			rc->rc_skipped = 1;
2474 			continue;
2475 		}
2476 		if (forceparity ||
2477 		    c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
2478 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
2479 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2480 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
2481 			    zio->io_type, zio->io_priority, 0,
2482 			    vdev_raidz_child_done, rc));
2483 		}
2484 	}
2485 }
2486 
2487 static void
2488 vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm)
2489 {
2490 	vdev_t *vd = zio->io_vd;
2491 
2492 	for (int i = 0; i < rm->rm_nphys_cols; i++) {
2493 		raidz_col_t *prc = &rm->rm_phys_col[i];
2494 		if (prc->rc_size == 0)
2495 			continue;
2496 
2497 		ASSERT3U(prc->rc_devidx, ==, i);
2498 		vdev_t *cvd = vd->vdev_child[i];
2499 		if (!vdev_readable(cvd)) {
2500 			prc->rc_error = SET_ERROR(ENXIO);
2501 			prc->rc_tried = 1;	/* don't even try */
2502 			prc->rc_skipped = 1;
2503 			continue;
2504 		}
2505 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2506 			prc->rc_error = SET_ERROR(ESTALE);
2507 			prc->rc_skipped = 1;
2508 			continue;
2509 		}
2510 		zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2511 		    prc->rc_offset, prc->rc_abd, prc->rc_size,
2512 		    zio->io_type, zio->io_priority, 0,
2513 		    vdev_raidz_child_done, prc));
2514 	}
2515 }
2516 
2517 static void
2518 vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm)
2519 {
2520 	/*
2521 	 * If there are multiple rows, we will be hitting
2522 	 * all disks, so go ahead and read the parity so
2523 	 * that we are reading in decent size chunks.
2524 	 */
2525 	boolean_t forceparity = rm->rm_nrows > 1;
2526 
2527 	if (rm->rm_phys_col) {
2528 		vdev_raidz_io_start_read_phys_cols(zio, rm);
2529 	} else {
2530 		for (int i = 0; i < rm->rm_nrows; i++) {
2531 			raidz_row_t *rr = rm->rm_row[i];
2532 			vdev_raidz_io_start_read_row(zio, rr, forceparity);
2533 		}
2534 	}
2535 }
2536 
2537 /*
2538  * Start an IO operation on a RAIDZ VDev
2539  *
2540  * Outline:
2541  * - For write operations:
2542  *   1. Generate the parity data
2543  *   2. Create child zio write operations to each column's vdev, for both
2544  *      data and parity.
2545  *   3. If the column skips any sectors for padding, create optional dummy
2546  *      write zio children for those areas to improve aggregation continuity.
2547  * - For read operations:
2548  *   1. Create child zio read operations to each data column's vdev to read
2549  *      the range of data required for zio.
2550  *   2. If this is a scrub or resilver operation, or if any of the data
2551  *      vdevs have had errors, then create zio read operations to the parity
2552  *      columns' VDevs as well.
2553  */
2554 static void
2555 vdev_raidz_io_start(zio_t *zio)
2556 {
2557 	vdev_t *vd = zio->io_vd;
2558 	vdev_t *tvd = vd->vdev_top;
2559 	vdev_raidz_t *vdrz = vd->vdev_tsd;
2560 	raidz_map_t *rm;
2561 
2562 	uint64_t logical_width = vdev_raidz_get_logical_width(vdrz,
2563 	    BP_GET_BIRTH(zio->io_bp));
2564 	if (logical_width != vdrz->vd_physical_width) {
2565 		zfs_locked_range_t *lr = NULL;
2566 		uint64_t synced_offset = UINT64_MAX;
2567 		uint64_t next_offset = UINT64_MAX;
2568 		boolean_t use_scratch = B_FALSE;
2569 		/*
2570 		 * Note: when the expansion is completing, we set
2571 		 * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync())
2572 		 * in a later txg than when we last update spa_ubsync's state
2573 		 * (see the end of spa_raidz_expand_thread()).  Therefore we
2574 		 * may see vre_state!=SCANNING before
2575 		 * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected
2576 		 * on disk, but the copying progress has been synced to disk
2577 		 * (and reflected in spa_ubsync).  In this case it's fine to
2578 		 * treat the expansion as completed, since if we crash there's
2579 		 * no additional copying to do.
2580 		 */
2581 		if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
2582 			ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==,
2583 			    &vdrz->vn_vre);
2584 			lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock,
2585 			    zio->io_offset, zio->io_size, RL_READER);
2586 			use_scratch =
2587 			    (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) ==
2588 			    RRSS_SCRATCH_VALID);
2589 			synced_offset =
2590 			    RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync);
2591 			next_offset = vdrz->vn_vre.vre_offset;
2592 			/*
2593 			 * If we haven't resumed expanding since importing the
2594 			 * pool, vre_offset won't have been set yet.  In
2595 			 * this case the next offset to be copied is the same
2596 			 * as what was synced.
2597 			 */
2598 			if (next_offset == UINT64_MAX) {
2599 				next_offset = synced_offset;
2600 			}
2601 		}
2602 		if (use_scratch) {
2603 			zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced="
2604 			    "%lld next_offset=%lld use_scratch=%u",
2605 			    zio,
2606 			    zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ",
2607 			    (long long)zio->io_offset,
2608 			    (long long)synced_offset,
2609 			    (long long)next_offset,
2610 			    use_scratch);
2611 		}
2612 
2613 		rm = vdev_raidz_map_alloc_expanded(zio,
2614 		    tvd->vdev_ashift, vdrz->vd_physical_width,
2615 		    logical_width, vdrz->vd_nparity,
2616 		    synced_offset, next_offset, use_scratch);
2617 		rm->rm_lr = lr;
2618 	} else {
2619 		rm = vdev_raidz_map_alloc(zio,
2620 		    tvd->vdev_ashift, logical_width, vdrz->vd_nparity);
2621 	}
2622 	rm->rm_original_width = vdrz->vd_original_width;
2623 
2624 	zio->io_vsd = rm;
2625 	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
2626 	if (zio->io_type == ZIO_TYPE_WRITE) {
2627 		for (int i = 0; i < rm->rm_nrows; i++) {
2628 			vdev_raidz_io_start_write(zio, rm->rm_row[i]);
2629 		}
2630 
2631 		if (logical_width == vdrz->vd_physical_width) {
2632 			raidz_start_skip_writes(zio);
2633 		}
2634 	} else {
2635 		ASSERT(zio->io_type == ZIO_TYPE_READ);
2636 		vdev_raidz_io_start_read(zio, rm);
2637 	}
2638 
2639 	zio_execute(zio);
2640 }
2641 
2642 /*
2643  * Report a checksum error for a child of a RAID-Z device.
2644  */
2645 void
2646 vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
2647 {
2648 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
2649 
2650 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) &&
2651 	    zio->io_priority != ZIO_PRIORITY_REBUILD) {
2652 		zio_bad_cksum_t zbc;
2653 		raidz_map_t *rm = zio->io_vsd;
2654 
2655 		zbc.zbc_has_cksum = 0;
2656 		zbc.zbc_injected = rm->rm_ecksuminjected;
2657 
2658 		mutex_enter(&vd->vdev_stat_lock);
2659 		vd->vdev_stat.vs_checksum_errors++;
2660 		mutex_exit(&vd->vdev_stat_lock);
2661 		(void) zfs_ereport_post_checksum(zio->io_spa, vd,
2662 		    &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
2663 		    rc->rc_abd, bad_data, &zbc);
2664 	}
2665 }
2666 
2667 /*
2668  * We keep track of whether or not there were any injected errors, so that
2669  * any ereports we generate can note it.
2670  */
2671 static int
2672 raidz_checksum_verify(zio_t *zio)
2673 {
2674 	zio_bad_cksum_t zbc = {0};
2675 	raidz_map_t *rm = zio->io_vsd;
2676 
2677 	int ret = zio_checksum_error(zio, &zbc);
2678 	/*
2679 	 * Any Direct I/O read that has a checksum error must be treated as
2680 	 * suspicious as the contents of the buffer could be getting
2681 	 * manipulated while the I/O is taking place. The checksum verify error
2682 	 * will be reported to the top-level RAIDZ VDEV.
2683 	 */
2684 	if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) {
2685 		zio->io_error = ret;
2686 		zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
2687 		zio_dio_chksum_verify_error_report(zio);
2688 		zio_checksum_verified(zio);
2689 		return (0);
2690 	}
2691 
2692 	if (ret != 0 && zbc.zbc_injected != 0)
2693 		rm->rm_ecksuminjected = 1;
2694 
2695 	return (ret);
2696 }
2697 
2698 /*
2699  * Generate the parity from the data columns. If we tried and were able to
2700  * read the parity without error, verify that the generated parity matches the
2701  * data we read. If it doesn't, we fire off a checksum error. Return the
2702  * number of such failures.
2703  */
2704 static int
2705 raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
2706 {
2707 	abd_t *orig[VDEV_RAIDZ_MAXPARITY];
2708 	int c, ret = 0;
2709 	raidz_map_t *rm = zio->io_vsd;
2710 	raidz_col_t *rc;
2711 
2712 	blkptr_t *bp = zio->io_bp;
2713 	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
2714 	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
2715 
2716 	if (checksum == ZIO_CHECKSUM_NOPARITY)
2717 		return (ret);
2718 
2719 	for (c = 0; c < rr->rr_firstdatacol; c++) {
2720 		rc = &rr->rr_col[c];
2721 		if (!rc->rc_tried || rc->rc_error != 0)
2722 			continue;
2723 
2724 		orig[c] = rc->rc_abd;
2725 		ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size);
2726 		rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
2727 	}
2728 
2729 	/*
2730 	 * Verify any empty sectors are zero filled to ensure the parity
2731 	 * is calculated correctly even if these non-data sectors are damaged.
2732 	 */
2733 	if (rr->rr_nempty && rr->rr_abd_empty != NULL)
2734 		ret += vdev_draid_map_verify_empty(zio, rr);
2735 
2736 	/*
2737 	 * Regenerates parity even for !tried||rc_error!=0 columns.  This
2738 	 * isn't harmful but it does have the side effect of fixing stuff
2739 	 * we didn't realize was necessary (i.e. even if we return 0).
2740 	 */
2741 	vdev_raidz_generate_parity_row(rm, rr);
2742 
2743 	for (c = 0; c < rr->rr_firstdatacol; c++) {
2744 		rc = &rr->rr_col[c];
2745 
2746 		if (!rc->rc_tried || rc->rc_error != 0)
2747 			continue;
2748 
2749 		if (abd_cmp(orig[c], rc->rc_abd) != 0) {
2750 			zfs_dbgmsg("found error on col=%u devidx=%u off %llx",
2751 			    c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset);
2752 			vdev_raidz_checksum_error(zio, rc, orig[c]);
2753 			rc->rc_error = SET_ERROR(ECKSUM);
2754 			ret++;
2755 		}
2756 		abd_free(orig[c]);
2757 	}
2758 
2759 	return (ret);
2760 }
2761 
2762 static int
2763 vdev_raidz_worst_error(raidz_row_t *rr)
2764 {
2765 	int error = 0;
2766 
2767 	for (int c = 0; c < rr->rr_cols; c++) {
2768 		error = zio_worst_error(error, rr->rr_col[c].rc_error);
2769 		error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error);
2770 	}
2771 
2772 	return (error);
2773 }
2774 
2775 static void
2776 vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
2777 {
2778 	int unexpected_errors = 0;
2779 	int parity_errors = 0;
2780 	int parity_untried = 0;
2781 	int data_errors = 0;
2782 
2783 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
2784 
2785 	for (int c = 0; c < rr->rr_cols; c++) {
2786 		raidz_col_t *rc = &rr->rr_col[c];
2787 
2788 		if (rc->rc_error) {
2789 			if (c < rr->rr_firstdatacol)
2790 				parity_errors++;
2791 			else
2792 				data_errors++;
2793 
2794 			if (!rc->rc_skipped)
2795 				unexpected_errors++;
2796 		} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
2797 			parity_untried++;
2798 		}
2799 
2800 		if (rc->rc_force_repair)
2801 			unexpected_errors++;
2802 	}
2803 
2804 	/*
2805 	 * If we read more parity disks than were used for
2806 	 * reconstruction, confirm that the other parity disks produced
2807 	 * correct data.
2808 	 *
2809 	 * Note that we also regenerate parity when resilvering so we
2810 	 * can write it out to failed devices later.
2811 	 */
2812 	if (parity_errors + parity_untried <
2813 	    rr->rr_firstdatacol - data_errors ||
2814 	    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2815 		int n = raidz_parity_verify(zio, rr);
2816 		unexpected_errors += n;
2817 	}
2818 
2819 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2820 	    (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2821 		/*
2822 		 * Use the good data we have in hand to repair damaged children.
2823 		 */
2824 		for (int c = 0; c < rr->rr_cols; c++) {
2825 			raidz_col_t *rc = &rr->rr_col[c];
2826 			vdev_t *vd = zio->io_vd;
2827 			vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2828 
2829 			if (!rc->rc_allow_repair) {
2830 				continue;
2831 			} else if (!rc->rc_force_repair &&
2832 			    (rc->rc_error == 0 || rc->rc_size == 0)) {
2833 				continue;
2834 			}
2835 			/*
2836 			 * We do not allow self healing for Direct I/O reads.
2837 			 * See comment in vdev_raid_row_alloc().
2838 			 */
2839 			ASSERT0(zio->io_flags & ZIO_FLAG_DIO_READ);
2840 
2841 			zfs_dbgmsg("zio=%px repairing c=%u devidx=%u "
2842 			    "offset=%llx",
2843 			    zio, c, rc->rc_devidx, (long long)rc->rc_offset);
2844 
2845 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2846 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
2847 			    ZIO_TYPE_WRITE,
2848 			    zio->io_priority == ZIO_PRIORITY_REBUILD ?
2849 			    ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
2850 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2851 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2852 		}
2853 	}
2854 
2855 	/*
2856 	 * Scrub or resilver i/o's: overwrite any shadow locations with the
2857 	 * good data.  This ensures that if we've already copied this sector,
2858 	 * it will be corrected if it was damaged.  This writes more than is
2859 	 * necessary, but since expansion is paused during scrub/resilver, at
2860 	 * most a single row will have a shadow location.
2861 	 */
2862 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2863 	    (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) {
2864 		for (int c = 0; c < rr->rr_cols; c++) {
2865 			raidz_col_t *rc = &rr->rr_col[c];
2866 			vdev_t *vd = zio->io_vd;
2867 
2868 			if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0)
2869 				continue;
2870 			vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx];
2871 
2872 			/*
2873 			 * Note: We don't want to update the repair stats
2874 			 * because that would incorrectly indicate that there
2875 			 * was bad data to repair, which we aren't sure about.
2876 			 * By clearing the SCAN_THREAD flag, we prevent this
2877 			 * from happening, despite having the REPAIR flag set.
2878 			 * We need to set SELF_HEAL so that this i/o can't be
2879 			 * bypassed by zio_vdev_io_start().
2880 			 */
2881 			zio_t *cio = zio_vdev_child_io(zio, NULL, cvd,
2882 			    rc->rc_shadow_offset, rc->rc_abd, rc->rc_size,
2883 			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2884 			    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
2885 			    NULL, NULL);
2886 			cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD;
2887 			zio_nowait(cio);
2888 		}
2889 	}
2890 }
2891 
2892 static void
2893 raidz_restore_orig_data(raidz_map_t *rm)
2894 {
2895 	for (int i = 0; i < rm->rm_nrows; i++) {
2896 		raidz_row_t *rr = rm->rm_row[i];
2897 		for (int c = 0; c < rr->rr_cols; c++) {
2898 			raidz_col_t *rc = &rr->rr_col[c];
2899 			if (rc->rc_need_orig_restore) {
2900 				abd_copy(rc->rc_abd,
2901 				    rc->rc_orig_data, rc->rc_size);
2902 				rc->rc_need_orig_restore = B_FALSE;
2903 			}
2904 		}
2905 	}
2906 }
2907 
2908 /*
2909  * During raidz_reconstruct() for expanded VDEV, we need special consideration
2910  * failure simulations.  See note in raidz_reconstruct() on simulating failure
2911  * of a pre-expansion device.
2912  *
2913  * Treating logical child i as failed, return TRUE if the given column should
2914  * be treated as failed.  The idea of logical children allows us to imagine
2915  * that a disk silently failed before a RAIDZ expansion (reads from this disk
2916  * succeed but return the wrong data).  Since the expansion doesn't verify
2917  * checksums, the incorrect data will be moved to new locations spread among
2918  * the children (going diagonally across them).
2919  *
2920  * Higher "logical child failures" (values of `i`) indicate these
2921  * "pre-expansion failures".  The first physical_width values imagine that a
2922  * current child failed; the next physical_width-1 values imagine that a
2923  * child failed before the most recent expansion; the next physical_width-2
2924  * values imagine a child failed in the expansion before that, etc.
2925  */
2926 static boolean_t
2927 raidz_simulate_failure(int physical_width, int original_width, int ashift,
2928     int i, raidz_col_t *rc)
2929 {
2930 	uint64_t sector_id =
2931 	    physical_width * (rc->rc_offset >> ashift) +
2932 	    rc->rc_devidx;
2933 
2934 	for (int w = physical_width; w >= original_width; w--) {
2935 		if (i < w) {
2936 			return (sector_id % w == i);
2937 		} else {
2938 			i -= w;
2939 		}
2940 	}
2941 	ASSERT(!"invalid logical child id");
2942 	return (B_FALSE);
2943 }
2944 
2945 /*
2946  * returns EINVAL if reconstruction of the block will not be possible
2947  * returns ECKSUM if this specific reconstruction failed
2948  * returns 0 on successful reconstruction
2949  */
2950 static int
2951 raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
2952 {
2953 	raidz_map_t *rm = zio->io_vsd;
2954 	int physical_width = zio->io_vd->vdev_children;
2955 	int original_width = (rm->rm_original_width != 0) ?
2956 	    rm->rm_original_width : physical_width;
2957 	int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT;
2958 
2959 	if (dbgmsg) {
2960 		zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u "
2961 		    "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts);
2962 	}
2963 
2964 	/* Reconstruct each row */
2965 	for (int r = 0; r < rm->rm_nrows; r++) {
2966 		raidz_row_t *rr = rm->rm_row[r];
2967 		int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */
2968 		int t = 0;
2969 		int dead = 0;
2970 		int dead_data = 0;
2971 
2972 		if (dbgmsg)
2973 			zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r);
2974 
2975 		for (int c = 0; c < rr->rr_cols; c++) {
2976 			raidz_col_t *rc = &rr->rr_col[c];
2977 			ASSERT0(rc->rc_need_orig_restore);
2978 			if (rc->rc_error != 0) {
2979 				dead++;
2980 				if (c >= nparity)
2981 					dead_data++;
2982 				continue;
2983 			}
2984 			if (rc->rc_size == 0)
2985 				continue;
2986 			for (int lt = 0; lt < ntgts; lt++) {
2987 				if (raidz_simulate_failure(physical_width,
2988 				    original_width,
2989 				    zio->io_vd->vdev_top->vdev_ashift,
2990 				    ltgts[lt], rc)) {
2991 					if (rc->rc_orig_data == NULL) {
2992 						rc->rc_orig_data =
2993 						    abd_alloc_linear(
2994 						    rc->rc_size, B_TRUE);
2995 						abd_copy(rc->rc_orig_data,
2996 						    rc->rc_abd, rc->rc_size);
2997 					}
2998 					rc->rc_need_orig_restore = B_TRUE;
2999 
3000 					dead++;
3001 					if (c >= nparity)
3002 						dead_data++;
3003 					/*
3004 					 * Note: simulating failure of a
3005 					 * pre-expansion device can hit more
3006 					 * than one column, in which case we
3007 					 * might try to simulate more failures
3008 					 * than can be reconstructed, which is
3009 					 * also more than the size of my_tgts.
3010 					 * This check prevents accessing past
3011 					 * the end of my_tgts.  The "dead >
3012 					 * nparity" check below will fail this
3013 					 * reconstruction attempt.
3014 					 */
3015 					if (t < VDEV_RAIDZ_MAXPARITY) {
3016 						my_tgts[t++] = c;
3017 						if (dbgmsg) {
3018 							zfs_dbgmsg("simulating "
3019 							    "failure of col %u "
3020 							    "devidx %u", c,
3021 							    (int)rc->rc_devidx);
3022 						}
3023 					}
3024 					break;
3025 				}
3026 			}
3027 		}
3028 		if (dead > nparity) {
3029 			/* reconstruction not possible */
3030 			if (dbgmsg) {
3031 				zfs_dbgmsg("reconstruction not possible; "
3032 				    "too many failures");
3033 			}
3034 			raidz_restore_orig_data(rm);
3035 			return (EINVAL);
3036 		}
3037 		if (dead_data > 0)
3038 			vdev_raidz_reconstruct_row(rm, rr, my_tgts, t);
3039 	}
3040 
3041 	/* Check for success */
3042 	if (raidz_checksum_verify(zio) == 0) {
3043 		if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
3044 			return (0);
3045 
3046 		/* Reconstruction succeeded - report errors */
3047 		for (int i = 0; i < rm->rm_nrows; i++) {
3048 			raidz_row_t *rr = rm->rm_row[i];
3049 
3050 			for (int c = 0; c < rr->rr_cols; c++) {
3051 				raidz_col_t *rc = &rr->rr_col[c];
3052 				if (rc->rc_need_orig_restore) {
3053 					/*
3054 					 * Note: if this is a parity column,
3055 					 * we don't really know if it's wrong.
3056 					 * We need to let
3057 					 * vdev_raidz_io_done_verified() check
3058 					 * it, and if we set rc_error, it will
3059 					 * think that it is a "known" error
3060 					 * that doesn't need to be checked
3061 					 * or corrected.
3062 					 */
3063 					if (rc->rc_error == 0 &&
3064 					    c >= rr->rr_firstdatacol) {
3065 						vdev_raidz_checksum_error(zio,
3066 						    rc, rc->rc_orig_data);
3067 						rc->rc_error =
3068 						    SET_ERROR(ECKSUM);
3069 					}
3070 					rc->rc_need_orig_restore = B_FALSE;
3071 				}
3072 			}
3073 
3074 			vdev_raidz_io_done_verified(zio, rr);
3075 		}
3076 
3077 		zio_checksum_verified(zio);
3078 
3079 		if (dbgmsg) {
3080 			zfs_dbgmsg("reconstruction successful "
3081 			    "(checksum verified)");
3082 		}
3083 		return (0);
3084 	}
3085 
3086 	/* Reconstruction failed - restore original data */
3087 	raidz_restore_orig_data(rm);
3088 	if (dbgmsg) {
3089 		zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum "
3090 		    "failed", zio);
3091 	}
3092 	return (ECKSUM);
3093 }
3094 
3095 /*
3096  * Iterate over all combinations of N bad vdevs and attempt a reconstruction.
3097  * Note that the algorithm below is non-optimal because it doesn't take into
3098  * account how reconstruction is actually performed. For example, with
3099  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
3100  * is targeted as invalid as if columns 1 and 4 are targeted since in both
3101  * cases we'd only use parity information in column 0.
3102  *
3103  * The order that we find the various possible combinations of failed
3104  * disks is dictated by these rules:
3105  * - Examine each "slot" (the "i" in tgts[i])
3106  *   - Try to increment this slot (tgts[i] += 1)
3107  *   - if we can't increment because it runs into the next slot,
3108  *     reset our slot to the minimum, and examine the next slot
3109  *
3110  *  For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
3111  *  3 columns to reconstruct), we will generate the following sequence:
3112  *
3113  *  STATE        ACTION
3114  *  0 1 2        special case: skip since these are all parity
3115  *  0 1   3      first slot: reset to 0; middle slot: increment to 2
3116  *  0   2 3      first slot: increment to 1
3117  *    1 2 3      first: reset to 0; middle: reset to 1; last: increment to 4
3118  *  0 1     4    first: reset to 0; middle: increment to 2
3119  *  0   2   4    first: increment to 1
3120  *    1 2   4    first: reset to 0; middle: increment to 3
3121  *  0     3 4    first: increment to 1
3122  *    1   3 4    first: increment to 2
3123  *      2 3 4    first: reset to 0; middle: reset to 1; last: increment to 5
3124  *  0 1       5  first: reset to 0; middle: increment to 2
3125  *  0   2     5  first: increment to 1
3126  *    1 2     5  first: reset to 0; middle: increment to 3
3127  *  0     3   5  first: increment to 1
3128  *    1   3   5  first: increment to 2
3129  *      2 3   5  first: reset to 0; middle: increment to 4
3130  *  0       4 5  first: increment to 1
3131  *    1     4 5  first: increment to 2
3132  *      2   4 5  first: increment to 3
3133  *        3 4 5  done
3134  *
3135  * This strategy works for dRAID but is less efficient when there are a large
3136  * number of child vdevs and therefore permutations to check. Furthermore,
3137  * since the raidz_map_t rows likely do not overlap, reconstruction would be
3138  * possible as long as there are no more than nparity data errors per row.
3139  * These additional permutations are not currently checked but could be as
3140  * a future improvement.
3141  *
3142  * Returns 0 on success, ECKSUM on failure.
3143  */
3144 static int
3145 vdev_raidz_combrec(zio_t *zio)
3146 {
3147 	int nparity = vdev_get_nparity(zio->io_vd);
3148 	raidz_map_t *rm = zio->io_vsd;
3149 	int physical_width = zio->io_vd->vdev_children;
3150 	int original_width = (rm->rm_original_width != 0) ?
3151 	    rm->rm_original_width : physical_width;
3152 
3153 	for (int i = 0; i < rm->rm_nrows; i++) {
3154 		raidz_row_t *rr = rm->rm_row[i];
3155 		int total_errors = 0;
3156 
3157 		for (int c = 0; c < rr->rr_cols; c++) {
3158 			if (rr->rr_col[c].rc_error)
3159 				total_errors++;
3160 		}
3161 
3162 		if (total_errors > nparity)
3163 			return (vdev_raidz_worst_error(rr));
3164 	}
3165 
3166 	for (int num_failures = 1; num_failures <= nparity; num_failures++) {
3167 		int tstore[VDEV_RAIDZ_MAXPARITY + 2];
3168 		int *ltgts = &tstore[1]; /* value is logical child ID */
3169 
3170 
3171 		/*
3172 		 * Determine number of logical children, n.  See comment
3173 		 * above raidz_simulate_failure().
3174 		 */
3175 		int n = 0;
3176 		for (int w = physical_width;
3177 		    w >= original_width; w--) {
3178 			n += w;
3179 		}
3180 
3181 		ASSERT3U(num_failures, <=, nparity);
3182 		ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);
3183 
3184 		/* Handle corner cases in combrec logic */
3185 		ltgts[-1] = -1;
3186 		for (int i = 0; i < num_failures; i++) {
3187 			ltgts[i] = i;
3188 		}
3189 		ltgts[num_failures] = n;
3190 
3191 		for (;;) {
3192 			int err = raidz_reconstruct(zio, ltgts, num_failures,
3193 			    nparity);
3194 			if (err == EINVAL) {
3195 				/*
3196 				 * Reconstruction not possible with this #
3197 				 * failures; try more failures.
3198 				 */
3199 				break;
3200 			} else if (err == 0)
3201 				return (0);
3202 
3203 			/* Compute next targets to try */
3204 			for (int t = 0; ; t++) {
3205 				ASSERT3U(t, <, num_failures);
3206 				ltgts[t]++;
3207 				if (ltgts[t] == n) {
3208 					/* try more failures */
3209 					ASSERT3U(t, ==, num_failures - 1);
3210 					if (zfs_flags &
3211 					    ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
3212 						zfs_dbgmsg("reconstruction "
3213 						    "failed for num_failures="
3214 						    "%u; tried all "
3215 						    "combinations",
3216 						    num_failures);
3217 					}
3218 					break;
3219 				}
3220 
3221 				ASSERT3U(ltgts[t], <, n);
3222 				ASSERT3U(ltgts[t], <=, ltgts[t + 1]);
3223 
3224 				/*
3225 				 * If that spot is available, we're done here.
3226 				 * Try the next combination.
3227 				 */
3228 				if (ltgts[t] != ltgts[t + 1])
3229 					break; // found next combination
3230 
3231 				/*
3232 				 * Otherwise, reset this tgt to the minimum,
3233 				 * and move on to the next tgt.
3234 				 */
3235 				ltgts[t] = ltgts[t - 1] + 1;
3236 				ASSERT3U(ltgts[t], ==, t);
3237 			}
3238 
3239 			/* Increase the number of failures and keep trying. */
3240 			if (ltgts[num_failures - 1] == n)
3241 				break;
3242 		}
3243 	}
3244 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
3245 		zfs_dbgmsg("reconstruction failed for all num_failures");
3246 	return (ECKSUM);
3247 }
3248 
3249 void
3250 vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
3251 {
3252 	for (uint64_t row = 0; row < rm->rm_nrows; row++) {
3253 		raidz_row_t *rr = rm->rm_row[row];
3254 		vdev_raidz_reconstruct_row(rm, rr, t, nt);
3255 	}
3256 }
3257 
3258 /*
3259  * Complete a write IO operation on a RAIDZ VDev
3260  *
3261  * Outline:
3262  *   1. Check for errors on the child IOs.
3263  *   2. Return, setting an error code if too few child VDevs were written
3264  *      to reconstruct the data later.  Note that partial writes are
3265  *      considered successful if they can be reconstructed at all.
3266  */
3267 static void
3268 vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
3269 {
3270 	int normal_errors = 0;
3271 	int shadow_errors = 0;
3272 
3273 	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
3274 	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
3275 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
3276 
3277 	for (int c = 0; c < rr->rr_cols; c++) {
3278 		raidz_col_t *rc = &rr->rr_col[c];
3279 
3280 		if (rc->rc_error != 0) {
3281 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
3282 			normal_errors++;
3283 		}
3284 		if (rc->rc_shadow_error != 0) {
3285 			ASSERT(rc->rc_shadow_error != ECKSUM);
3286 			shadow_errors++;
3287 		}
3288 	}
3289 
3290 	/*
3291 	 * Treat partial writes as a success. If we couldn't write enough
3292 	 * columns to reconstruct the data, the I/O failed.  Otherwise, good
3293 	 * enough.  Note that in the case of a shadow write (during raidz
3294 	 * expansion), depending on if we crash, either the normal (old) or
3295 	 * shadow (new) location may become the "real" version of the block,
3296 	 * so both locations must have sufficient redundancy.
3297 	 *
3298 	 * Now that we support write reallocation, it would be better
3299 	 * to treat partial failure as real failure unless there are
3300 	 * no non-degraded top-level vdevs left, and not update DTLs
3301 	 * if we intend to reallocate.
3302 	 */
3303 	if (normal_errors > rr->rr_firstdatacol ||
3304 	    shadow_errors > rr->rr_firstdatacol) {
3305 		zio->io_error = zio_worst_error(zio->io_error,
3306 		    vdev_raidz_worst_error(rr));
3307 	}
3308 }
3309 
3310 static void
3311 vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
3312     raidz_row_t *rr)
3313 {
3314 	int parity_errors = 0;
3315 	int parity_untried = 0;
3316 	int data_errors = 0;
3317 	int total_errors = 0;
3318 
3319 	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
3320 	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
3321 
3322 	for (int c = 0; c < rr->rr_cols; c++) {
3323 		raidz_col_t *rc = &rr->rr_col[c];
3324 
3325 		/*
3326 		 * If scrubbing and a replacing/sparing child vdev determined
3327 		 * that not all of its children have an identical copy of the
3328 		 * data, then clear the error so the column is treated like
3329 		 * any other read and force a repair to correct the damage.
3330 		 */
3331 		if (rc->rc_error == ECKSUM) {
3332 			ASSERT(zio->io_flags & ZIO_FLAG_SCRUB);
3333 			vdev_raidz_checksum_error(zio, rc, rc->rc_abd);
3334 			rc->rc_force_repair = 1;
3335 			rc->rc_error = 0;
3336 		}
3337 
3338 		if (rc->rc_error) {
3339 			if (c < rr->rr_firstdatacol)
3340 				parity_errors++;
3341 			else
3342 				data_errors++;
3343 
3344 			total_errors++;
3345 		} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
3346 			parity_untried++;
3347 		}
3348 	}
3349 
3350 	/*
3351 	 * If there were data errors and the number of errors we saw was
3352 	 * correctable -- less than or equal to the number of parity disks read
3353 	 * -- reconstruct based on the missing data.
3354 	 */
3355 	if (data_errors != 0 &&
3356 	    total_errors <= rr->rr_firstdatacol - parity_untried) {
3357 		/*
3358 		 * We either attempt to read all the parity columns or
3359 		 * none of them. If we didn't try to read parity, we
3360 		 * wouldn't be here in the correctable case. There must
3361 		 * also have been fewer parity errors than parity
3362 		 * columns or, again, we wouldn't be in this code path.
3363 		 */
3364 		ASSERT(parity_untried == 0);
3365 		ASSERT(parity_errors < rr->rr_firstdatacol);
3366 
3367 		/*
3368 		 * Identify the data columns that reported an error.
3369 		 */
3370 		int n = 0;
3371 		int tgts[VDEV_RAIDZ_MAXPARITY];
3372 		for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
3373 			raidz_col_t *rc = &rr->rr_col[c];
3374 			if (rc->rc_error != 0) {
3375 				ASSERT(n < VDEV_RAIDZ_MAXPARITY);
3376 				tgts[n++] = c;
3377 			}
3378 		}
3379 
3380 		ASSERT(rr->rr_firstdatacol >= n);
3381 
3382 		vdev_raidz_reconstruct_row(rm, rr, tgts, n);
3383 	}
3384 }
3385 
3386 /*
3387  * Return the number of reads issued.
3388  */
3389 static int
3390 vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
3391 {
3392 	vdev_t *vd = zio->io_vd;
3393 	int nread = 0;
3394 
3395 	rr->rr_missingdata = 0;
3396 	rr->rr_missingparity = 0;
3397 
3398 	/*
3399 	 * If this rows contains empty sectors which are not required
3400 	 * for a normal read then allocate an ABD for them now so they
3401 	 * may be read, verified, and any needed repairs performed.
3402 	 */
3403 	if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL)
3404 		vdev_draid_map_alloc_empty(zio, rr);
3405 
3406 	for (int c = 0; c < rr->rr_cols; c++) {
3407 		raidz_col_t *rc = &rr->rr_col[c];
3408 		if (rc->rc_tried || rc->rc_size == 0)
3409 			continue;
3410 
3411 		zio_nowait(zio_vdev_child_io(zio, NULL,
3412 		    vd->vdev_child[rc->rc_devidx],
3413 		    rc->rc_offset, rc->rc_abd, rc->rc_size,
3414 		    zio->io_type, zio->io_priority, 0,
3415 		    vdev_raidz_child_done, rc));
3416 		nread++;
3417 	}
3418 	return (nread);
3419 }
3420 
3421 /*
3422  * We're here because either there were too many errors to even attempt
3423  * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec()
3424  * failed. In either case, there is enough bad data to prevent reconstruction.
3425  * Start checksum ereports for all children which haven't failed.
3426  */
3427 static void
3428 vdev_raidz_io_done_unrecoverable(zio_t *zio)
3429 {
3430 	raidz_map_t *rm = zio->io_vsd;
3431 
3432 	for (int i = 0; i < rm->rm_nrows; i++) {
3433 		raidz_row_t *rr = rm->rm_row[i];
3434 
3435 		for (int c = 0; c < rr->rr_cols; c++) {
3436 			raidz_col_t *rc = &rr->rr_col[c];
3437 			vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
3438 
3439 			if (rc->rc_error != 0)
3440 				continue;
3441 
3442 			zio_bad_cksum_t zbc;
3443 			zbc.zbc_has_cksum = 0;
3444 			zbc.zbc_injected = rm->rm_ecksuminjected;
3445 			mutex_enter(&cvd->vdev_stat_lock);
3446 			cvd->vdev_stat.vs_checksum_errors++;
3447 			mutex_exit(&cvd->vdev_stat_lock);
3448 			(void) zfs_ereport_start_checksum(zio->io_spa,
3449 			    cvd, &zio->io_bookmark, zio, rc->rc_offset,
3450 			    rc->rc_size, &zbc);
3451 		}
3452 	}
3453 }
3454 
3455 void
3456 vdev_raidz_io_done(zio_t *zio)
3457 {
3458 	raidz_map_t *rm = zio->io_vsd;
3459 
3460 	ASSERT(zio->io_bp != NULL);
3461 	if (zio->io_type == ZIO_TYPE_WRITE) {
3462 		for (int i = 0; i < rm->rm_nrows; i++) {
3463 			vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
3464 		}
3465 	} else {
3466 		if (rm->rm_phys_col) {
3467 			/*
3468 			 * This is an aggregated read.  Copy the data and status
3469 			 * from the aggregate abd's to the individual rows.
3470 			 */
3471 			for (int i = 0; i < rm->rm_nrows; i++) {
3472 				raidz_row_t *rr = rm->rm_row[i];
3473 
3474 				for (int c = 0; c < rr->rr_cols; c++) {
3475 					raidz_col_t *rc = &rr->rr_col[c];
3476 					if (rc->rc_tried || rc->rc_size == 0)
3477 						continue;
3478 
3479 					raidz_col_t *prc =
3480 					    &rm->rm_phys_col[rc->rc_devidx];
3481 					rc->rc_error = prc->rc_error;
3482 					rc->rc_tried = prc->rc_tried;
3483 					rc->rc_skipped = prc->rc_skipped;
3484 					if (c >= rr->rr_firstdatacol) {
3485 						/*
3486 						 * Note: this is slightly faster
3487 						 * than using abd_copy_off().
3488 						 */
3489 						char *physbuf = abd_to_buf(
3490 						    prc->rc_abd);
3491 						void *physloc = physbuf +
3492 						    rc->rc_offset -
3493 						    prc->rc_offset;
3494 
3495 						abd_copy_from_buf(rc->rc_abd,
3496 						    physloc, rc->rc_size);
3497 					}
3498 				}
3499 			}
3500 		}
3501 
3502 		for (int i = 0; i < rm->rm_nrows; i++) {
3503 			raidz_row_t *rr = rm->rm_row[i];
3504 			vdev_raidz_io_done_reconstruct_known_missing(zio,
3505 			    rm, rr);
3506 		}
3507 
3508 		if (raidz_checksum_verify(zio) == 0) {
3509 			if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
3510 				goto done;
3511 
3512 			for (int i = 0; i < rm->rm_nrows; i++) {
3513 				raidz_row_t *rr = rm->rm_row[i];
3514 				vdev_raidz_io_done_verified(zio, rr);
3515 			}
3516 			zio_checksum_verified(zio);
3517 		} else {
3518 			/*
3519 			 * A sequential resilver has no checksum which makes
3520 			 * combinatoral reconstruction impossible. This code
3521 			 * path is unreachable since raidz_checksum_verify()
3522 			 * has no checksum to verify and must succeed.
3523 			 */
3524 			ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD);
3525 
3526 			/*
3527 			 * This isn't a typical situation -- either we got a
3528 			 * read error or a child silently returned bad data.
3529 			 * Read every block so we can try again with as much
3530 			 * data and parity as we can track down. If we've
3531 			 * already been through once before, all children will
3532 			 * be marked as tried so we'll proceed to combinatorial
3533 			 * reconstruction.
3534 			 */
3535 			int nread = 0;
3536 			for (int i = 0; i < rm->rm_nrows; i++) {
3537 				nread += vdev_raidz_read_all(zio,
3538 				    rm->rm_row[i]);
3539 			}
3540 			if (nread != 0) {
3541 				/*
3542 				 * Normally our stage is VDEV_IO_DONE, but if
3543 				 * we've already called redone(), it will have
3544 				 * changed to VDEV_IO_START, in which case we
3545 				 * don't want to call redone() again.
3546 				 */
3547 				if (zio->io_stage != ZIO_STAGE_VDEV_IO_START)
3548 					zio_vdev_io_redone(zio);
3549 				return;
3550 			}
3551 			/*
3552 			 * It would be too expensive to try every possible
3553 			 * combination of failed sectors in every row, so
3554 			 * instead we try every combination of failed current or
3555 			 * past physical disk. This means that if the incorrect
3556 			 * sectors were all on Nparity disks at any point in the
3557 			 * past, we will find the correct data.  The only known
3558 			 * case where this is less durable than a non-expanded
3559 			 * RAIDZ, is if we have a silent failure during
3560 			 * expansion.  In that case, one block could be
3561 			 * partially in the old format and partially in the
3562 			 * new format, so we'd lost some sectors from the old
3563 			 * format and some from the new format.
3564 			 *
3565 			 * e.g. logical_width=4 physical_width=6
3566 			 * the 15 (6+5+4) possible failed disks are:
3567 			 * width=6 child=0
3568 			 * width=6 child=1
3569 			 * width=6 child=2
3570 			 * width=6 child=3
3571 			 * width=6 child=4
3572 			 * width=6 child=5
3573 			 * width=5 child=0
3574 			 * width=5 child=1
3575 			 * width=5 child=2
3576 			 * width=5 child=3
3577 			 * width=5 child=4
3578 			 * width=4 child=0
3579 			 * width=4 child=1
3580 			 * width=4 child=2
3581 			 * width=4 child=3
3582 			 * And we will try every combination of Nparity of these
3583 			 * failing.
3584 			 *
3585 			 * As a first pass, we can generate every combo,
3586 			 * and try reconstructing, ignoring any known
3587 			 * failures.  If any row has too many known + simulated
3588 			 * failures, then we bail on reconstructing with this
3589 			 * number of simulated failures.  As an improvement,
3590 			 * we could detect the number of whole known failures
3591 			 * (i.e. we have known failures on these disks for
3592 			 * every row; the disks never succeeded), and
3593 			 * subtract that from the max # failures to simulate.
3594 			 * We could go even further like the current
3595 			 * combrec code, but that doesn't seem like it
3596 			 * gains us very much.  If we simulate a failure
3597 			 * that is also a known failure, that's fine.
3598 			 */
3599 			zio->io_error = vdev_raidz_combrec(zio);
3600 			if (zio->io_error == ECKSUM &&
3601 			    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
3602 				vdev_raidz_io_done_unrecoverable(zio);
3603 			}
3604 		}
3605 	}
3606 done:
3607 	if (rm->rm_lr != NULL) {
3608 		zfs_rangelock_exit(rm->rm_lr);
3609 		rm->rm_lr = NULL;
3610 	}
3611 }
3612 
3613 static void
3614 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
3615 {
3616 	vdev_raidz_t *vdrz = vd->vdev_tsd;
3617 	if (faulted > vdrz->vd_nparity)
3618 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3619 		    VDEV_AUX_NO_REPLICAS);
3620 	else if (degraded + faulted != 0)
3621 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
3622 	else
3623 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
3624 }
3625 
3626 /*
3627  * Determine if any portion of the provided block resides on a child vdev
3628  * with a dirty DTL and therefore needs to be resilvered.  The function
3629  * assumes that at least one DTL is dirty which implies that full stripe
3630  * width blocks must be resilvered.
3631  */
3632 static boolean_t
3633 vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
3634     uint64_t phys_birth)
3635 {
3636 	vdev_raidz_t *vdrz = vd->vdev_tsd;
3637 
3638 	/*
3639 	 * If we're in the middle of a RAIDZ expansion, this block may be in
3640 	 * the old and/or new location.  For simplicity, always resilver it.
3641 	 */
3642 	if (vdrz->vn_vre.vre_state == DSS_SCANNING)
3643 		return (B_TRUE);
3644 
3645 	uint64_t dcols = vd->vdev_children;
3646 	uint64_t nparity = vdrz->vd_nparity;
3647 	uint64_t ashift = vd->vdev_top->vdev_ashift;
3648 	/* The starting RAIDZ (parent) vdev sector of the block. */
3649 	uint64_t b = DVA_GET_OFFSET(dva) >> ashift;
3650 	/* The zio's size in units of the vdev's minimum sector size. */
3651 	uint64_t s = ((psize - 1) >> ashift) + 1;
3652 	/* The first column for this stripe. */
3653 	uint64_t f = b % dcols;
3654 
3655 	/* Unreachable by sequential resilver. */
3656 	ASSERT3U(phys_birth, !=, TXG_UNKNOWN);
3657 
3658 	if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
3659 		return (B_FALSE);
3660 
3661 	if (s + nparity >= dcols)
3662 		return (B_TRUE);
3663 
3664 	for (uint64_t c = 0; c < s + nparity; c++) {
3665 		uint64_t devidx = (f + c) % dcols;
3666 		vdev_t *cvd = vd->vdev_child[devidx];
3667 
3668 		/*
3669 		 * dsl_scan_need_resilver() already checked vd with
3670 		 * vdev_dtl_contains(). So here just check cvd with
3671 		 * vdev_dtl_empty(), cheaper and a good approximation.
3672 		 */
3673 		if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
3674 			return (B_TRUE);
3675 	}
3676 
3677 	return (B_FALSE);
3678 }
3679 
3680 static void
3681 vdev_raidz_xlate(vdev_t *cvd, const zfs_range_seg64_t *logical_rs,
3682     zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs)
3683 {
3684 	(void) remain_rs;
3685 
3686 	vdev_t *raidvd = cvd->vdev_parent;
3687 	ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
3688 
3689 	vdev_raidz_t *vdrz = raidvd->vdev_tsd;
3690 
3691 	if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
3692 		/*
3693 		 * We're in the middle of expansion, in which case the
3694 		 * translation is in flux.  Any answer we give may be wrong
3695 		 * by the time we return, so it isn't safe for the caller to
3696 		 * act on it.  Therefore we say that this range isn't present
3697 		 * on any children.  The only consumers of this are "zpool
3698 		 * initialize" and trimming, both of which are "best effort"
3699 		 * anyway.
3700 		 */
3701 		physical_rs->rs_start = physical_rs->rs_end = 0;
3702 		remain_rs->rs_start = remain_rs->rs_end = 0;
3703 		return;
3704 	}
3705 
3706 	uint64_t width = vdrz->vd_physical_width;
3707 	uint64_t tgt_col = cvd->vdev_id;
3708 	uint64_t ashift = raidvd->vdev_top->vdev_ashift;
3709 
3710 	/* make sure the offsets are block-aligned */
3711 	ASSERT0(logical_rs->rs_start % (1 << ashift));
3712 	ASSERT0(logical_rs->rs_end % (1 << ashift));
3713 	uint64_t b_start = logical_rs->rs_start >> ashift;
3714 	uint64_t b_end = logical_rs->rs_end >> ashift;
3715 
3716 	uint64_t start_row = 0;
3717 	if (b_start > tgt_col) /* avoid underflow */
3718 		start_row = ((b_start - tgt_col - 1) / width) + 1;
3719 
3720 	uint64_t end_row = 0;
3721 	if (b_end > tgt_col)
3722 		end_row = ((b_end - tgt_col - 1) / width) + 1;
3723 
3724 	physical_rs->rs_start = start_row << ashift;
3725 	physical_rs->rs_end = end_row << ashift;
3726 
3727 	ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start);
3728 	ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
3729 	    logical_rs->rs_end - logical_rs->rs_start);
3730 }
3731 
3732 static void
3733 raidz_reflow_sync(void *arg, dmu_tx_t *tx)
3734 {
3735 	spa_t *spa = arg;
3736 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
3737 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
3738 
3739 	/*
3740 	 * Ensure there are no i/os to the range that is being committed.
3741 	 */
3742 	uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock);
3743 	ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset);
3744 
3745 	mutex_enter(&vre->vre_lock);
3746 	uint64_t new_offset =
3747 	    MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset);
3748 	/*
3749 	 * We should not have committed anything that failed.
3750 	 */
3751 	VERIFY3U(vre->vre_failed_offset, >=, old_offset);
3752 	mutex_exit(&vre->vre_lock);
3753 
3754 	zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
3755 	    old_offset, new_offset - old_offset,
3756 	    RL_WRITER);
3757 
3758 	/*
3759 	 * Update the uberblock that will be written when this txg completes.
3760 	 */
3761 	RAIDZ_REFLOW_SET(&spa->spa_uberblock,
3762 	    RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset);
3763 	vre->vre_offset_pertxg[txgoff] = 0;
3764 	zfs_rangelock_exit(lr);
3765 
3766 	mutex_enter(&vre->vre_lock);
3767 	vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff];
3768 	vre->vre_bytes_copied_pertxg[txgoff] = 0;
3769 	mutex_exit(&vre->vre_lock);
3770 
3771 	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
3772 	VERIFY0(zap_update(spa->spa_meta_objset,
3773 	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
3774 	    sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx));
3775 }
3776 
3777 static void
3778 raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx)
3779 {
3780 	spa_t *spa = arg;
3781 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
3782 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
3783 	vdev_raidz_t *vdrz = raidvd->vdev_tsd;
3784 
3785 	for (int i = 0; i < TXG_SIZE; i++)
3786 		VERIFY0(vre->vre_offset_pertxg[i]);
3787 
3788 	reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
3789 	re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES;
3790 	re->re_logical_width = vdrz->vd_physical_width;
3791 	mutex_enter(&vdrz->vd_expand_lock);
3792 	avl_add(&vdrz->vd_expand_txgs, re);
3793 	mutex_exit(&vdrz->vd_expand_lock);
3794 
3795 	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
3796 
3797 	/*
3798 	 * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS
3799 	 * will get written (based on vd_expand_txgs).
3800 	 */
3801 	vdev_config_dirty(vd);
3802 
3803 	/*
3804 	 * Before we change vre_state, the on-disk state must reflect that we
3805 	 * have completed all copying, so that vdev_raidz_io_start() can use
3806 	 * vre_state to determine if the reflow is in progress.  See also the
3807 	 * end of spa_raidz_expand_thread().
3808 	 */
3809 	VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==,
3810 	    raidvd->vdev_ms_count << raidvd->vdev_ms_shift);
3811 
3812 	vre->vre_end_time = gethrestime_sec();
3813 	vre->vre_state = DSS_FINISHED;
3814 
3815 	uint64_t state = vre->vre_state;
3816 	VERIFY0(zap_update(spa->spa_meta_objset,
3817 	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
3818 	    sizeof (state), 1, &state, tx));
3819 
3820 	uint64_t end_time = vre->vre_end_time;
3821 	VERIFY0(zap_update(spa->spa_meta_objset,
3822 	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
3823 	    sizeof (end_time), 1, &end_time, tx));
3824 
3825 	spa->spa_uberblock.ub_raidz_reflow_info = 0;
3826 
3827 	spa_history_log_internal(spa, "raidz vdev expansion completed",  tx,
3828 	    "%s vdev %llu new width %llu", spa_name(spa),
3829 	    (unsigned long long)vd->vdev_id,
3830 	    (unsigned long long)vd->vdev_children);
3831 
3832 	spa->spa_raidz_expand = NULL;
3833 	raidvd->vdev_rz_expanding = B_FALSE;
3834 
3835 	spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
3836 	spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
3837 	spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
3838 
3839 	spa_notify_waiters(spa);
3840 
3841 	/*
3842 	 * While we're in syncing context take the opportunity to
3843 	 * setup a scrub. All the data has been sucessfully copied
3844 	 * but we have not validated any checksums.
3845 	 */
3846 	setup_sync_arg_t setup_sync_arg = {
3847 		.func = POOL_SCAN_SCRUB,
3848 		.txgstart = 0,
3849 		.txgend = 0,
3850 	};
3851 	if (zfs_scrub_after_expand &&
3852 	    dsl_scan_setup_check(&setup_sync_arg.func, tx) == 0) {
3853 		dsl_scan_setup_sync(&setup_sync_arg, tx);
3854 	}
3855 }
3856 
/*
 * State of one copy batch.  Allocated by raidz_reflow_impl() with room for
 * rra_writes trailing ZIO pointers and freed by raidz_reflow_write_done()
 * once the last write of the batch has completed.
 */
typedef struct raidz_reflow_arg {
	vdev_raidz_expand_t *rra_vre;	/* Global expansion state. */
	zfs_locked_range_t *rra_lr;	/* Range lock of this batch. */
	uint64_t rra_txg;	/* TXG of this batch. */
	uint_t rra_ashift;	/* Ashift of the vdev. */
	/*
	 * Number of in-flight ZIOs: initially the number of reads, then
	 * reset to rra_writes by raidz_reflow_read_done() when the writes
	 * are issued.
	 */
	uint32_t rra_tbd;	/* Number of in-flight ZIOs. */
	uint32_t rra_writes;	/* Number of write ZIOs. */
	zio_t *rra_zio[];	/* Write ZIO pointers (rra_writes entries). */
} raidz_reflow_arg_t;
3869 
3870 /*
3871  * Write of the new location on one child is done.  Once all of them are done
3872  * we can unlock and free everything.
3873  */
3874 static void
3875 raidz_reflow_write_done(zio_t *zio)
3876 {
3877 	raidz_reflow_arg_t *rra = zio->io_private;
3878 	vdev_raidz_expand_t *vre = rra->rra_vre;
3879 
3880 	abd_free(zio->io_abd);
3881 
3882 	mutex_enter(&vre->vre_lock);
3883 	if (zio->io_error != 0) {
3884 		/* Force a reflow pause on errors */
3885 		vre->vre_failed_offset =
3886 		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
3887 	}
3888 	ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size);
3889 	vre->vre_outstanding_bytes -= zio->io_size;
3890 	if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length <
3891 	    vre->vre_failed_offset) {
3892 		vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] +=
3893 		    zio->io_size;
3894 	}
3895 	cv_signal(&vre->vre_cv);
3896 	boolean_t done = (--rra->rra_tbd == 0);
3897 	mutex_exit(&vre->vre_lock);
3898 
3899 	if (!done)
3900 		return;
3901 	spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
3902 	zfs_rangelock_exit(rra->rra_lr);
3903 	kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * rra->rra_writes);
3904 }
3905 
3906 /*
3907  * Read of the old location on one child is done.  Once all of them are done
3908  * writes should have all the data and we can issue them.
3909  */
3910 static void
3911 raidz_reflow_read_done(zio_t *zio)
3912 {
3913 	raidz_reflow_arg_t *rra = zio->io_private;
3914 	vdev_raidz_expand_t *vre = rra->rra_vre;
3915 
3916 	/* Reads of only one block use write ABDs.  For bigger free gangs. */
3917 	if (zio->io_size > (1 << rra->rra_ashift))
3918 		abd_free(zio->io_abd);
3919 
3920 	/*
3921 	 * If the read failed, or if it was done on a vdev that is not fully
3922 	 * healthy (e.g. a child that has a resilver in progress), we may not
3923 	 * have the correct data.  Note that it's OK if the write proceeds.
3924 	 * It may write garbage but the location is otherwise unused and we
3925 	 * will retry later due to vre_failed_offset.
3926 	 */
3927 	if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) {
3928 		zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu "
3929 		    "err=%u partial_dtl_empty=%u missing_dtl_empty=%u",
3930 		    (long long)rra->rra_lr->lr_offset,
3931 		    (long long)rra->rra_lr->lr_length,
3932 		    (long long)rra->rra_txg,
3933 		    zio->io_error,
3934 		    vdev_dtl_empty(zio->io_vd, DTL_PARTIAL),
3935 		    vdev_dtl_empty(zio->io_vd, DTL_MISSING));
3936 		mutex_enter(&vre->vre_lock);
3937 		/* Force a reflow pause on errors */
3938 		vre->vre_failed_offset =
3939 		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
3940 		mutex_exit(&vre->vre_lock);
3941 	}
3942 
3943 	if (atomic_dec_32_nv(&rra->rra_tbd) > 0)
3944 		return;
3945 	uint32_t writes = rra->rra_tbd = rra->rra_writes;
3946 	for (uint64_t i = 0; i < writes; i++)
3947 		zio_nowait(rra->rra_zio[i]);
3948 }
3949 
3950 static void
3951 raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset,
3952     dmu_tx_t *tx)
3953 {
3954 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
3955 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
3956 
3957 	if (offset == 0)
3958 		return;
3959 
3960 	mutex_enter(&vre->vre_lock);
3961 	ASSERT3U(vre->vre_offset, <=, offset);
3962 	vre->vre_offset = offset;
3963 	mutex_exit(&vre->vre_lock);
3964 
3965 	if (vre->vre_offset_pertxg[txgoff] == 0) {
3966 		dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync,
3967 		    spa, tx);
3968 	}
3969 	vre->vre_offset_pertxg[txgoff] = offset;
3970 }
3971 
3972 static boolean_t
3973 vdev_raidz_expand_child_replacing(vdev_t *raidz_vd)
3974 {
3975 	for (int i = 0; i < raidz_vd->vdev_children; i++) {
3976 		/* Quick check if a child is being replaced */
3977 		if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf)
3978 			return (B_TRUE);
3979 	}
3980 	return (B_FALSE);
3981 }
3982 
3983 static boolean_t
3984 raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, zfs_range_tree_t *rt,
3985     dmu_tx_t *tx)
3986 {
3987 	spa_t *spa = vd->vdev_spa;
3988 	uint_t ashift = vd->vdev_top->vdev_ashift;
3989 
3990 	zfs_range_seg_t *rs = zfs_range_tree_first(rt);
3991 	if (rt == NULL)
3992 		return (B_FALSE);
3993 	uint64_t offset = zfs_rs_get_start(rs, rt);
3994 	ASSERT(IS_P2ALIGNED(offset, 1 << ashift));
3995 	uint64_t size = zfs_rs_get_end(rs, rt) - offset;
3996 	ASSERT3U(size, >=, 1 << ashift);
3997 	ASSERT(IS_P2ALIGNED(size, 1 << ashift));
3998 
3999 	uint64_t blkid = offset >> ashift;
4000 	uint_t old_children = vd->vdev_children - 1;
4001 
4002 	/*
4003 	 * We can only progress to the point that writes will not overlap
4004 	 * with blocks whose progress has not yet been recorded on disk.
4005 	 * Since partially-copied rows are still read from the old location,
4006 	 * we need to stop one row before the sector-wise overlap, to prevent
4007 	 * row-wise overlap.
4008 	 *
4009 	 * Note that even if we are skipping over a large unallocated region,
4010 	 * we can't move the on-disk progress to `offset`, because concurrent
4011 	 * writes/allocations could still use the currently-unallocated
4012 	 * region.
4013 	 */
4014 	uint64_t ubsync_blkid =
4015 	    RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift;
4016 	uint64_t next_overwrite_blkid = ubsync_blkid +
4017 	    ubsync_blkid / old_children - old_children;
4018 	VERIFY3U(next_overwrite_blkid, >, ubsync_blkid);
4019 	if (blkid >= next_overwrite_blkid) {
4020 		raidz_reflow_record_progress(vre,
4021 		    next_overwrite_blkid << ashift, tx);
4022 		return (B_TRUE);
4023 	}
4024 
4025 	size = MIN(size, raidz_expand_max_copy_bytes);
4026 	size = MIN(size, (uint64_t)old_children *
4027 	    MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE));
4028 	size = MAX(size, 1 << ashift);
4029 	uint_t blocks = MIN(size >> ashift, next_overwrite_blkid - blkid);
4030 	size = (uint64_t)blocks << ashift;
4031 
4032 	zfs_range_tree_remove(rt, offset, size);
4033 
4034 	uint_t reads = MIN(blocks, old_children);
4035 	uint_t writes = MIN(blocks, vd->vdev_children);
4036 	raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra) +
4037 	    sizeof (zio_t *) * writes, KM_SLEEP);
4038 	rra->rra_vre = vre;
4039 	rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock,
4040 	    offset, size, RL_WRITER);
4041 	rra->rra_txg = dmu_tx_get_txg(tx);
4042 	rra->rra_ashift = ashift;
4043 	rra->rra_tbd = reads;
4044 	rra->rra_writes = writes;
4045 
4046 	raidz_reflow_record_progress(vre, offset + size, tx);
4047 
4048 	/*
4049 	 * SCL_STATE will be released when the read and write are done,
4050 	 * by raidz_reflow_write_done().
4051 	 */
4052 	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
4053 
4054 	/* check if a replacing vdev was added, if so treat it as an error */
4055 	if (vdev_raidz_expand_child_replacing(vd)) {
4056 		zfs_dbgmsg("replacing vdev encountered, reflow paused at "
4057 		    "offset=%llu txg=%llu",
4058 		    (long long)rra->rra_lr->lr_offset,
4059 		    (long long)rra->rra_txg);
4060 
4061 		mutex_enter(&vre->vre_lock);
4062 		vre->vre_failed_offset =
4063 		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
4064 		cv_signal(&vre->vre_cv);
4065 		mutex_exit(&vre->vre_lock);
4066 
4067 		/* drop everything we acquired */
4068 		spa_config_exit(spa, SCL_STATE, spa);
4069 		zfs_rangelock_exit(rra->rra_lr);
4070 		kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * writes);
4071 		return (B_TRUE);
4072 	}
4073 
4074 	mutex_enter(&vre->vre_lock);
4075 	vre->vre_outstanding_bytes += size;
4076 	mutex_exit(&vre->vre_lock);
4077 
4078 	/* Allocate ABD and ZIO for each child we write. */
4079 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4080 	zio_t *pio = spa->spa_txg_zio[txgoff];
4081 	uint_t b = blocks / vd->vdev_children;
4082 	uint_t bb = blocks % vd->vdev_children;
4083 	for (uint_t i = 0; i < writes; i++) {
4084 		uint_t n = b + (i < bb);
4085 		abd_t *abd = abd_alloc_for_io(n << ashift, B_FALSE);
4086 		rra->rra_zio[i] = zio_vdev_child_io(pio, NULL,
4087 		    vd->vdev_child[(blkid + i) % vd->vdev_children],
4088 		    ((blkid + i) / vd->vdev_children) << ashift,
4089 		    abd, n << ashift, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
4090 		    ZIO_FLAG_CANFAIL, raidz_reflow_write_done, rra);
4091 	}
4092 
4093 	/*
4094 	 * Allocate and issue ZIO for each child we read.  For reads of only
4095 	 * one block we can use respective writer ABDs, since they will also
4096 	 * have only one block.  For bigger reads create gang ABDs and fill
4097 	 * them with respective blocks from writer ABDs.
4098 	 */
4099 	b = blocks / old_children;
4100 	bb = blocks % old_children;
4101 	for (uint_t i = 0; i < reads; i++) {
4102 		uint_t n = b + (i < bb);
4103 		abd_t *abd;
4104 		if (n > 1) {
4105 			abd = abd_alloc_gang();
4106 			for (uint_t j = 0; j < n; j++) {
4107 				uint_t b = j * old_children + i;
4108 				abd_t *cabd = abd_get_offset_size(
4109 				    rra->rra_zio[b % vd->vdev_children]->io_abd,
4110 				    (b / vd->vdev_children) << ashift,
4111 				    1 << ashift);
4112 				abd_gang_add(abd, cabd, B_TRUE);
4113 			}
4114 		} else {
4115 			abd = rra->rra_zio[i]->io_abd;
4116 		}
4117 		zio_nowait(zio_vdev_child_io(pio, NULL,
4118 		    vd->vdev_child[(blkid + i) % old_children],
4119 		    ((blkid + i) / old_children) << ashift, abd,
4120 		    n << ashift, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
4121 		    ZIO_FLAG_CANFAIL, raidz_reflow_read_done, rra));
4122 	}
4123 
4124 	return (B_FALSE);
4125 }
4126 
4127 /*
4128  * For testing (ztest specific)
4129  */
4130 static void
4131 raidz_expand_pause(uint_t pause_point)
4132 {
4133 	while (raidz_expand_pause_point != 0 &&
4134 	    raidz_expand_pause_point <= pause_point)
4135 		delay(hz);
4136 }
4137 
4138 static void
4139 raidz_scratch_child_done(zio_t *zio)
4140 {
4141 	zio_t *pio = zio->io_private;
4142 
4143 	mutex_enter(&pio->io_lock);
4144 	pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
4145 	mutex_exit(&pio->io_lock);
4146 }
4147 
4148 /*
4149  * Reflow the beginning portion of the vdev into an intermediate scratch area
4150  * in memory and on disk. This operation must be persisted on disk before we
4151  * proceed to overwrite the beginning portion with the reflowed data.
4152  *
4153  * This multi-step task can fail to complete if disk errors are encountered
4154  * and we can return here after a pause (waiting for disk to become healthy).
4155  */
4156 static void
4157 raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
4158 {
4159 	vdev_raidz_expand_t *vre = arg;
4160 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
4161 	zio_t *pio;
4162 	int error;
4163 
4164 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4165 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4166 	int ashift = raidvd->vdev_ashift;
4167 	uint64_t write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << ashift,
4168 	    uint64_t);
4169 	uint64_t logical_size = write_size * raidvd->vdev_children;
4170 	uint64_t read_size =
4171 	    P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)),
4172 	    1 << ashift);
4173 
4174 	/*
4175 	 * The scratch space must be large enough to get us to the point
4176 	 * that one row does not overlap itself when moved.  This is checked
4177 	 * by vdev_raidz_attach_check().
4178 	 */
4179 	VERIFY3U(write_size, >=, raidvd->vdev_children << ashift);
4180 	VERIFY3U(write_size, <=, VDEV_BOOT_SIZE);
4181 	VERIFY3U(write_size, <=, read_size);
4182 
4183 	zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
4184 	    0, logical_size, RL_WRITER);
4185 
4186 	abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
4187 	    KM_SLEEP);
4188 	for (int i = 0; i < raidvd->vdev_children; i++) {
4189 		abds[i] = abd_alloc_linear(read_size, B_FALSE);
4190 	}
4191 
4192 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1);
4193 
4194 	/*
4195 	 * If we have already written the scratch area then we must read from
4196 	 * there, since new writes were redirected there while we were paused
4197 	 * or the original location may have been partially overwritten with
4198 	 * reflowed data.
4199 	 */
4200 	if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) {
4201 		VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size);
4202 		/*
4203 		 * Read from scratch space.
4204 		 */
4205 		pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4206 		for (int i = 0; i < raidvd->vdev_children; i++) {
4207 			/*
4208 			 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE
4209 			 * to the offset to calculate the physical offset to
4210 			 * write to.  Passing in a negative offset makes us
4211 			 * access the scratch area.
4212 			 */
4213 			zio_nowait(zio_vdev_child_io(pio, NULL,
4214 			    raidvd->vdev_child[i],
4215 			    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4216 			    write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
4217 			    ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
4218 		}
4219 		error = zio_wait(pio);
4220 		if (error != 0) {
4221 			zfs_dbgmsg("reflow: error %d reading scratch location",
4222 			    error);
4223 			goto io_error_exit;
4224 		}
4225 		goto overwrite;
4226 	}
4227 
4228 	/*
4229 	 * Read from original location.
4230 	 */
4231 	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4232 	for (int i = 0; i < raidvd->vdev_children - 1; i++) {
4233 		ASSERT0(vdev_is_dead(raidvd->vdev_child[i]));
4234 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4235 		    0, abds[i], read_size, ZIO_TYPE_READ,
4236 		    ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
4237 		    raidz_scratch_child_done, pio));
4238 	}
4239 	error = zio_wait(pio);
4240 	if (error != 0) {
4241 		zfs_dbgmsg("reflow: error %d reading original location", error);
4242 io_error_exit:
4243 		for (int i = 0; i < raidvd->vdev_children; i++)
4244 			abd_free(abds[i]);
4245 		kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4246 		zfs_rangelock_exit(lr);
4247 		spa_config_exit(spa, SCL_STATE, FTAG);
4248 		return;
4249 	}
4250 
4251 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2);
4252 
4253 	/*
4254 	 * Reflow in memory.
4255 	 */
4256 	uint64_t logical_sectors = logical_size >> ashift;
4257 	for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) {
4258 		int oldchild = i % (raidvd->vdev_children - 1);
4259 		uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift;
4260 
4261 		int newchild = i % raidvd->vdev_children;
4262 		uint64_t newoff = (i / raidvd->vdev_children) << ashift;
4263 
4264 		/* a single sector should not be copying over itself */
4265 		ASSERT(!(newchild == oldchild && newoff == oldoff));
4266 
4267 		abd_copy_off(abds[newchild], abds[oldchild],
4268 		    newoff, oldoff, 1 << ashift);
4269 	}
4270 
4271 	/*
4272 	 * Verify that we filled in everything we intended to (write_size on
4273 	 * each child).
4274 	 */
4275 	VERIFY0(logical_sectors % raidvd->vdev_children);
4276 	VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==,
4277 	    write_size);
4278 
4279 	/*
4280 	 * Write to scratch location (boot area).
4281 	 */
4282 	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4283 	for (int i = 0; i < raidvd->vdev_children; i++) {
4284 		/*
4285 		 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
4286 		 * the offset to calculate the physical offset to write to.
4287 		 * Passing in a negative offset lets us access the boot area.
4288 		 */
4289 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4290 		    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4291 		    write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
4292 		    ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
4293 	}
4294 	error = zio_wait(pio);
4295 	if (error != 0) {
4296 		zfs_dbgmsg("reflow: error %d writing scratch location", error);
4297 		goto io_error_exit;
4298 	}
4299 	pio = zio_root(spa, NULL, NULL, 0);
4300 	zio_flush(pio, raidvd);
4301 	zio_wait(pio);
4302 
4303 	zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area",
4304 	    (long long)logical_size);
4305 
4306 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3);
4307 
4308 	/*
4309 	 * Update uberblock to indicate that scratch space is valid.  This is
4310 	 * needed because after this point, the real location may be
4311 	 * overwritten.  If we crash, we need to get the data from the
4312 	 * scratch space, rather than the real location.
4313 	 *
4314 	 * Note: ub_timestamp is bumped so that vdev_uberblock_compare()
4315 	 * will prefer this uberblock.
4316 	 */
4317 	RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size);
4318 	spa->spa_ubsync.ub_timestamp++;
4319 	ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4320 	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4321 	if (spa_multihost(spa))
4322 		mmp_update_uberblock(spa, &spa->spa_ubsync);
4323 
4324 	zfs_dbgmsg("reflow: uberblock updated "
4325 	    "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)",
4326 	    (long long)spa->spa_ubsync.ub_txg,
4327 	    (long long)logical_size,
4328 	    (long long)spa->spa_ubsync.ub_timestamp);
4329 
4330 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID);
4331 
4332 	/*
4333 	 * Overwrite with reflow'ed data.
4334 	 */
4335 overwrite:
4336 	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4337 	for (int i = 0; i < raidvd->vdev_children; i++) {
4338 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4339 		    0, abds[i], write_size, ZIO_TYPE_WRITE,
4340 		    ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
4341 		    raidz_scratch_child_done, pio));
4342 	}
4343 	error = zio_wait(pio);
4344 	if (error != 0) {
4345 		/*
4346 		 * When we exit early here and drop the range lock, new
4347 		 * writes will go into the scratch area so we'll need to
4348 		 * read from there when we return after pausing.
4349 		 */
4350 		zfs_dbgmsg("reflow: error %d writing real location", error);
4351 		/*
4352 		 * Update the uberblock that is written when this txg completes.
4353 		 */
4354 		RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID,
4355 		    logical_size);
4356 		goto io_error_exit;
4357 	}
4358 	pio = zio_root(spa, NULL, NULL, 0);
4359 	zio_flush(pio, raidvd);
4360 	zio_wait(pio);
4361 
4362 	zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location",
4363 	    (long long)logical_size);
4364 	for (int i = 0; i < raidvd->vdev_children; i++)
4365 		abd_free(abds[i]);
4366 	kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4367 
4368 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED);
4369 
4370 	/*
4371 	 * Update uberblock to indicate that the initial part has been
4372 	 * reflow'ed.  This is needed because after this point (when we exit
4373 	 * the rangelock), we allow regular writes to this region, which will
4374 	 * be written to the new location only (because reflow_offset_next ==
4375 	 * reflow_offset_synced).  If we crashed and re-copied from the
4376 	 * scratch space, we would lose the regular writes.
4377 	 */
4378 	RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED,
4379 	    logical_size);
4380 	spa->spa_ubsync.ub_timestamp++;
4381 	ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4382 	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4383 	if (spa_multihost(spa))
4384 		mmp_update_uberblock(spa, &spa->spa_ubsync);
4385 
4386 	zfs_dbgmsg("reflow: uberblock updated "
4387 	    "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
4388 	    (long long)spa->spa_ubsync.ub_txg,
4389 	    (long long)logical_size,
4390 	    (long long)spa->spa_ubsync.ub_timestamp);
4391 
4392 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1);
4393 
4394 	/*
4395 	 * Update progress.
4396 	 */
4397 	vre->vre_offset = logical_size;
4398 	zfs_rangelock_exit(lr);
4399 	spa_config_exit(spa, SCL_STATE, FTAG);
4400 
4401 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4402 	vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
4403 	vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
4404 	/*
4405 	 * Note - raidz_reflow_sync() will update the uberblock state to
4406 	 * RRSS_SCRATCH_INVALID_SYNCED_REFLOW
4407 	 */
4408 	raidz_reflow_sync(spa, tx);
4409 
4410 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2);
4411 }
4412 
4413 /*
4414  * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work
4415  * here.  No other i/o can be in progress, so we don't need the vre_rangelock.
4416  */
4417 void
4418 vdev_raidz_reflow_copy_scratch(spa_t *spa)
4419 {
4420 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4421 	uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock);
4422 	ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID);
4423 
4424 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4425 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4426 	ASSERT0(logical_size % raidvd->vdev_children);
4427 	uint64_t write_size = logical_size / raidvd->vdev_children;
4428 
4429 	zio_t *pio;
4430 
4431 	/*
4432 	 * Read from scratch space.
4433 	 */
4434 	abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
4435 	    KM_SLEEP);
4436 	for (int i = 0; i < raidvd->vdev_children; i++) {
4437 		abds[i] = abd_alloc_linear(write_size, B_FALSE);
4438 	}
4439 
4440 	pio = zio_root(spa, NULL, NULL, 0);
4441 	for (int i = 0; i < raidvd->vdev_children; i++) {
4442 		/*
4443 		 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
4444 		 * the offset to calculate the physical offset to write to.
4445 		 * Passing in a negative offset lets us access the boot area.
4446 		 */
4447 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4448 		    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4449 		    write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 0,
4450 		    raidz_scratch_child_done, pio));
4451 	}
4452 	zio_wait(pio);
4453 
4454 	/*
4455 	 * Overwrite real location with reflow'ed data.
4456 	 */
4457 	pio = zio_root(spa, NULL, NULL, 0);
4458 	for (int i = 0; i < raidvd->vdev_children; i++) {
4459 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4460 		    0, abds[i], write_size, ZIO_TYPE_WRITE,
4461 		    ZIO_PRIORITY_REMOVAL, 0,
4462 		    raidz_scratch_child_done, pio));
4463 	}
4464 	zio_wait(pio);
4465 	pio = zio_root(spa, NULL, NULL, 0);
4466 	zio_flush(pio, raidvd);
4467 	zio_wait(pio);
4468 
4469 	zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) "
4470 	    "to real location", (long long)logical_size);
4471 
4472 	for (int i = 0; i < raidvd->vdev_children; i++)
4473 		abd_free(abds[i]);
4474 	kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4475 
4476 	/*
4477 	 * Update uberblock.
4478 	 */
4479 	RAIDZ_REFLOW_SET(&spa->spa_ubsync,
4480 	    RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size);
4481 	spa->spa_ubsync.ub_timestamp++;
4482 	VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4483 	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4484 	if (spa_multihost(spa))
4485 		mmp_update_uberblock(spa, &spa->spa_ubsync);
4486 
4487 	zfs_dbgmsg("reflow recovery: uberblock updated "
4488 	    "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
4489 	    (long long)spa->spa_ubsync.ub_txg,
4490 	    (long long)logical_size,
4491 	    (long long)spa->spa_ubsync.ub_timestamp);
4492 
4493 	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool,
4494 	    spa_first_txg(spa));
4495 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4496 	vre->vre_offset = logical_size;
4497 	vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
4498 	vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
4499 	/*
4500 	 * Note that raidz_reflow_sync() will update the uberblock once more
4501 	 */
4502 	raidz_reflow_sync(spa, tx);
4503 
4504 	dmu_tx_commit(tx);
4505 
4506 	spa_config_exit(spa, SCL_STATE, FTAG);
4507 }
4508 
4509 static boolean_t
4510 spa_raidz_expand_thread_check(void *arg, zthr_t *zthr)
4511 {
4512 	(void) zthr;
4513 	spa_t *spa = arg;
4514 
4515 	return (spa->spa_raidz_expand != NULL &&
4516 	    !spa->spa_raidz_expand->vre_waiting_for_resilver);
4517 }
4518 
4519 /*
4520  * RAIDZ expansion background thread
4521  *
4522  * Can be called multiple times if the reflow is paused
4523  */
4524 static void
4525 spa_raidz_expand_thread(void *arg, zthr_t *zthr)
4526 {
4527 	spa_t *spa = arg;
4528 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4529 
4530 	if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID)
4531 		vre->vre_offset = 0;
4532 	else
4533 		vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync);
4534 
4535 	/* Reflow the begining portion using the scratch area */
4536 	if (vre->vre_offset == 0) {
4537 		VERIFY0(dsl_sync_task(spa_name(spa),
4538 		    NULL, raidz_reflow_scratch_sync,
4539 		    vre, 0, ZFS_SPACE_CHECK_NONE));
4540 
4541 		/* if we encountered errors then pause */
4542 		if (vre->vre_offset == 0) {
4543 			mutex_enter(&vre->vre_lock);
4544 			vre->vre_waiting_for_resilver = B_TRUE;
4545 			mutex_exit(&vre->vre_lock);
4546 			return;
4547 		}
4548 	}
4549 
4550 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4551 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4552 
4553 	uint64_t guid = raidvd->vdev_guid;
4554 
4555 	/* Iterate over all the remaining metaslabs */
4556 	for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift;
4557 	    i < raidvd->vdev_ms_count &&
4558 	    !zthr_iscancelled(zthr) &&
4559 	    vre->vre_failed_offset == UINT64_MAX; i++) {
4560 		metaslab_t *msp = raidvd->vdev_ms[i];
4561 
4562 		metaslab_disable(msp);
4563 		mutex_enter(&msp->ms_lock);
4564 
4565 		/*
4566 		 * The metaslab may be newly created (for the expanded
4567 		 * space), in which case its trees won't exist yet,
4568 		 * so we need to bail out early.
4569 		 */
4570 		if (msp->ms_new) {
4571 			mutex_exit(&msp->ms_lock);
4572 			metaslab_enable(msp, B_FALSE, B_FALSE);
4573 			continue;
4574 		}
4575 
4576 		VERIFY0(metaslab_load(msp));
4577 
4578 		/*
4579 		 * We want to copy everything except the free (allocatable)
4580 		 * space.  Note that there may be a little bit more free
4581 		 * space (e.g. in ms_defer), and it's fine to copy that too.
4582 		 */
4583 		uint64_t shift, start;
4584 		zfs_range_seg_type_t type = metaslab_calculate_range_tree_type(
4585 		    raidvd, msp, &start, &shift);
4586 		zfs_range_tree_t *rt = zfs_range_tree_create(NULL, type, NULL,
4587 		    start, shift);
4588 		zfs_range_tree_add(rt, msp->ms_start, msp->ms_size);
4589 		zfs_range_tree_walk(msp->ms_allocatable, zfs_range_tree_remove,
4590 		    rt);
4591 		mutex_exit(&msp->ms_lock);
4592 
4593 		/*
4594 		 * Force the last sector of each metaslab to be copied.  This
4595 		 * ensures that we advance the on-disk progress to the end of
4596 		 * this metaslab while the metaslab is disabled.  Otherwise, we
4597 		 * could move past this metaslab without advancing the on-disk
4598 		 * progress, and then an allocation to this metaslab would not
4599 		 * be copied.
4600 		 */
4601 		int sectorsz = 1 << raidvd->vdev_ashift;
4602 		uint64_t ms_last_offset = msp->ms_start +
4603 		    msp->ms_size - sectorsz;
4604 		if (!zfs_range_tree_contains(rt, ms_last_offset, sectorsz)) {
4605 			zfs_range_tree_add(rt, ms_last_offset, sectorsz);
4606 		}
4607 
4608 		/*
4609 		 * When we are resuming from a paused expansion (i.e.
4610 		 * when importing a pool with a expansion in progress),
4611 		 * discard any state that we have already processed.
4612 		 */
4613 		if (vre->vre_offset > msp->ms_start) {
4614 			zfs_range_tree_clear(rt, msp->ms_start,
4615 			    vre->vre_offset - msp->ms_start);
4616 		}
4617 
4618 		while (!zthr_iscancelled(zthr) &&
4619 		    !zfs_range_tree_is_empty(rt) &&
4620 		    vre->vre_failed_offset == UINT64_MAX) {
4621 
4622 			/*
4623 			 * We need to periodically drop the config lock so that
4624 			 * writers can get in.  Additionally, we can't wait
4625 			 * for a txg to sync while holding a config lock
4626 			 * (since a waiting writer could cause a 3-way deadlock
4627 			 * with the sync thread, which also gets a config
4628 			 * lock for reader).  So we can't hold the config lock
4629 			 * while calling dmu_tx_assign().
4630 			 */
4631 			spa_config_exit(spa, SCL_CONFIG, FTAG);
4632 
4633 			/*
4634 			 * If requested, pause the reflow when the amount
4635 			 * specified by raidz_expand_max_reflow_bytes is reached
4636 			 *
4637 			 * This pause is only used during testing or debugging.
4638 			 */
4639 			while (raidz_expand_max_reflow_bytes != 0 &&
4640 			    raidz_expand_max_reflow_bytes <=
4641 			    vre->vre_bytes_copied && !zthr_iscancelled(zthr)) {
4642 				delay(hz);
4643 			}
4644 
4645 			mutex_enter(&vre->vre_lock);
4646 			while (vre->vre_outstanding_bytes >
4647 			    raidz_expand_max_copy_bytes) {
4648 				cv_wait(&vre->vre_cv, &vre->vre_lock);
4649 			}
4650 			mutex_exit(&vre->vre_lock);
4651 
4652 			dmu_tx_t *tx =
4653 			    dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
4654 
4655 			VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT));
4656 			uint64_t txg = dmu_tx_get_txg(tx);
4657 
4658 			/*
4659 			 * Reacquire the vdev_config lock.  Theoretically, the
4660 			 * vdev_t that we're expanding may have changed.
4661 			 */
4662 			spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4663 			raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4664 
4665 			boolean_t needsync =
4666 			    raidz_reflow_impl(raidvd, vre, rt, tx);
4667 
4668 			dmu_tx_commit(tx);
4669 
4670 			if (needsync) {
4671 				spa_config_exit(spa, SCL_CONFIG, FTAG);
4672 				txg_wait_synced(spa->spa_dsl_pool, txg);
4673 				spa_config_enter(spa, SCL_CONFIG, FTAG,
4674 				    RW_READER);
4675 			}
4676 		}
4677 
4678 		spa_config_exit(spa, SCL_CONFIG, FTAG);
4679 
4680 		metaslab_enable(msp, B_FALSE, B_FALSE);
4681 		zfs_range_tree_vacate(rt, NULL, NULL);
4682 		zfs_range_tree_destroy(rt);
4683 
4684 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4685 		raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4686 	}
4687 
4688 	spa_config_exit(spa, SCL_CONFIG, FTAG);
4689 
4690 	/*
4691 	 * The txg_wait_synced() here ensures that all reflow zio's have
4692 	 * completed, and vre_failed_offset has been set if necessary.  It
4693 	 * also ensures that the progress of the last raidz_reflow_sync() is
4694 	 * written to disk before raidz_reflow_complete_sync() changes the
4695 	 * in-memory vre_state.  vdev_raidz_io_start() uses vre_state to
4696 	 * determine if a reflow is in progress, in which case we may need to
4697 	 * write to both old and new locations.  Therefore we can only change
4698 	 * vre_state once this is not necessary, which is once the on-disk
4699 	 * progress (in spa_ubsync) has been set past any possible writes (to
4700 	 * the end of the last metaslab).
4701 	 */
4702 	txg_wait_synced(spa->spa_dsl_pool, 0);
4703 
4704 	if (!zthr_iscancelled(zthr) &&
4705 	    vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) {
4706 		/*
4707 		 * We are not being canceled or paused, so the reflow must be
4708 		 * complete. In that case also mark it as completed on disk.
4709 		 */
4710 		ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX);
4711 		VERIFY0(dsl_sync_task(spa_name(spa), NULL,
4712 		    raidz_reflow_complete_sync, spa,
4713 		    0, ZFS_SPACE_CHECK_NONE));
4714 		(void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL);
4715 	} else {
4716 		/*
4717 		 * Wait for all copy zio's to complete and for all the
4718 		 * raidz_reflow_sync() synctasks to be run.
4719 		 */
4720 		spa_history_log_internal(spa, "reflow pause",
4721 		    NULL, "offset=%llu failed_offset=%lld",
4722 		    (long long)vre->vre_offset,
4723 		    (long long)vre->vre_failed_offset);
4724 		mutex_enter(&vre->vre_lock);
4725 		if (vre->vre_failed_offset != UINT64_MAX) {
4726 			/*
4727 			 * Reset progress so that we will retry everything
4728 			 * after the point that something failed.
4729 			 */
4730 			vre->vre_offset = vre->vre_failed_offset;
4731 			vre->vre_failed_offset = UINT64_MAX;
4732 			vre->vre_waiting_for_resilver = B_TRUE;
4733 		}
4734 		mutex_exit(&vre->vre_lock);
4735 	}
4736 }
4737 
4738 void
4739 spa_start_raidz_expansion_thread(spa_t *spa)
4740 {
4741 	ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL);
4742 	spa->spa_raidz_expand_zthr = zthr_create("raidz_expand",
4743 	    spa_raidz_expand_thread_check, spa_raidz_expand_thread,
4744 	    spa, defclsyspri);
4745 }
4746 
4747 void
4748 raidz_dtl_reassessed(vdev_t *vd)
4749 {
4750 	spa_t *spa = vd->vdev_spa;
4751 	if (spa->spa_raidz_expand != NULL) {
4752 		vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4753 		/*
4754 		 * we get called often from vdev_dtl_reassess() so make
4755 		 * sure it's our vdev and any replacing is complete
4756 		 */
4757 		if (vd->vdev_top->vdev_id == vre->vre_vdev_id &&
4758 		    !vdev_raidz_expand_child_replacing(vd->vdev_top)) {
4759 			mutex_enter(&vre->vre_lock);
4760 			if (vre->vre_waiting_for_resilver) {
4761 				vdev_dbgmsg(vd, "DTL reassessed, "
4762 				    "continuing raidz expansion");
4763 				vre->vre_waiting_for_resilver = B_FALSE;
4764 				zthr_wakeup(spa->spa_raidz_expand_zthr);
4765 			}
4766 			mutex_exit(&vre->vre_lock);
4767 		}
4768 	}
4769 }
4770 
4771 int
4772 vdev_raidz_attach_check(vdev_t *new_child)
4773 {
4774 	vdev_t *raidvd = new_child->vdev_parent;
4775 	uint64_t new_children = raidvd->vdev_children;
4776 
4777 	/*
4778 	 * We use the "boot" space as scratch space to handle overwriting the
4779 	 * initial part of the vdev.  If it is too small, then this expansion
4780 	 * is not allowed.  This would be very unusual (e.g. ashift > 13 and
4781 	 * >200 children).
4782 	 */
4783 	if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) {
4784 		return (EINVAL);
4785 	}
4786 	return (0);
4787 }
4788 
4789 void
4790 vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx)
4791 {
4792 	vdev_t *new_child = arg;
4793 	spa_t *spa = new_child->vdev_spa;
4794 	vdev_t *raidvd = new_child->vdev_parent;
4795 	vdev_raidz_t *vdrz = raidvd->vdev_tsd;
4796 	ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops);
4797 	ASSERT3P(raidvd->vdev_top, ==, raidvd);
4798 	ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width);
4799 	ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1);
4800 	ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==,
4801 	    new_child);
4802 
4803 	spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx);
4804 
4805 	vdrz->vd_physical_width++;
4806 
4807 	VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info);
4808 	vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id;
4809 	vdrz->vn_vre.vre_offset = 0;
4810 	vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
4811 	spa->spa_raidz_expand = &vdrz->vn_vre;
4812 	zthr_wakeup(spa->spa_raidz_expand_zthr);
4813 
4814 	/*
4815 	 * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get
4816 	 * written to the config.
4817 	 */
4818 	vdev_config_dirty(raidvd);
4819 
4820 	vdrz->vn_vre.vre_start_time = gethrestime_sec();
4821 	vdrz->vn_vre.vre_end_time = 0;
4822 	vdrz->vn_vre.vre_state = DSS_SCANNING;
4823 	vdrz->vn_vre.vre_bytes_copied = 0;
4824 
4825 	uint64_t state = vdrz->vn_vre.vre_state;
4826 	VERIFY0(zap_update(spa->spa_meta_objset,
4827 	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
4828 	    sizeof (state), 1, &state, tx));
4829 
4830 	uint64_t start_time = vdrz->vn_vre.vre_start_time;
4831 	VERIFY0(zap_update(spa->spa_meta_objset,
4832 	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
4833 	    sizeof (start_time), 1, &start_time, tx));
4834 
4835 	(void) zap_remove(spa->spa_meta_objset,
4836 	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx);
4837 	(void) zap_remove(spa->spa_meta_objset,
4838 	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx);
4839 
4840 	spa_history_log_internal(spa, "raidz vdev expansion started",  tx,
4841 	    "%s vdev %llu new width %llu", spa_name(spa),
4842 	    (unsigned long long)raidvd->vdev_id,
4843 	    (unsigned long long)raidvd->vdev_children);
4844 }
4845 
4846 int
4847 vdev_raidz_load(vdev_t *vd)
4848 {
4849 	vdev_raidz_t *vdrz = vd->vdev_tsd;
4850 	int err;
4851 
4852 	uint64_t state = DSS_NONE;
4853 	uint64_t start_time = 0;
4854 	uint64_t end_time = 0;
4855 	uint64_t bytes_copied = 0;
4856 
4857 	if (vd->vdev_top_zap != 0) {
4858 		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4859 		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
4860 		    sizeof (state), 1, &state);
4861 		if (err != 0 && err != ENOENT)
4862 			return (err);
4863 
4864 		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4865 		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
4866 		    sizeof (start_time), 1, &start_time);
4867 		if (err != 0 && err != ENOENT)
4868 			return (err);
4869 
4870 		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4871 		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
4872 		    sizeof (end_time), 1, &end_time);
4873 		if (err != 0 && err != ENOENT)
4874 			return (err);
4875 
4876 		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4877 		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
4878 		    sizeof (bytes_copied), 1, &bytes_copied);
4879 		if (err != 0 && err != ENOENT)
4880 			return (err);
4881 	}
4882 
4883 	/*
4884 	 * If we are in the middle of expansion, vre_state should have
4885 	 * already been set by vdev_raidz_init().
4886 	 */
4887 	EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING);
4888 	vdrz->vn_vre.vre_state = (dsl_scan_state_t)state;
4889 	vdrz->vn_vre.vre_start_time = start_time;
4890 	vdrz->vn_vre.vre_end_time = end_time;
4891 	vdrz->vn_vre.vre_bytes_copied = bytes_copied;
4892 
4893 	return (0);
4894 }
4895 
4896 int
4897 spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres)
4898 {
4899 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4900 
4901 	if (vre == NULL) {
4902 		/* no removal in progress; find most recent completed */
4903 		for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
4904 			vdev_t *vd = spa->spa_root_vdev->vdev_child[c];
4905 			if (vd->vdev_ops == &vdev_raidz_ops) {
4906 				vdev_raidz_t *vdrz = vd->vdev_tsd;
4907 
4908 				if (vdrz->vn_vre.vre_end_time != 0 &&
4909 				    (vre == NULL ||
4910 				    vdrz->vn_vre.vre_end_time >
4911 				    vre->vre_end_time)) {
4912 					vre = &vdrz->vn_vre;
4913 				}
4914 			}
4915 		}
4916 	}
4917 
4918 	if (vre == NULL) {
4919 		return (SET_ERROR(ENOENT));
4920 	}
4921 
4922 	pres->pres_state = vre->vre_state;
4923 	pres->pres_expanding_vdev = vre->vre_vdev_id;
4924 
4925 	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
4926 	pres->pres_to_reflow = vd->vdev_stat.vs_alloc;
4927 
4928 	mutex_enter(&vre->vre_lock);
4929 	pres->pres_reflowed = vre->vre_bytes_copied;
4930 	for (int i = 0; i < TXG_SIZE; i++)
4931 		pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i];
4932 	mutex_exit(&vre->vre_lock);
4933 
4934 	pres->pres_start_time = vre->vre_start_time;
4935 	pres->pres_end_time = vre->vre_end_time;
4936 	pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver;
4937 
4938 	return (0);
4939 }
4940 
4941 /*
4942  * Initialize private RAIDZ specific fields from the nvlist.
4943  */
4944 static int
4945 vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
4946 {
4947 	uint_t children;
4948 	nvlist_t **child;
4949 	int error = nvlist_lookup_nvlist_array(nv,
4950 	    ZPOOL_CONFIG_CHILDREN, &child, &children);
4951 	if (error != 0)
4952 		return (SET_ERROR(EINVAL));
4953 
4954 	uint64_t nparity;
4955 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) {
4956 		if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
4957 			return (SET_ERROR(EINVAL));
4958 
4959 		/*
4960 		 * Previous versions could only support 1 or 2 parity
4961 		 * device.
4962 		 */
4963 		if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2)
4964 			return (SET_ERROR(EINVAL));
4965 		else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3)
4966 			return (SET_ERROR(EINVAL));
4967 	} else {
4968 		/*
4969 		 * We require the parity to be specified for SPAs that
4970 		 * support multiple parity levels.
4971 		 */
4972 		if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
4973 			return (SET_ERROR(EINVAL));
4974 
4975 		/*
4976 		 * Otherwise, we default to 1 parity device for RAID-Z.
4977 		 */
4978 		nparity = 1;
4979 	}
4980 
4981 	vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
4982 	vdrz->vn_vre.vre_vdev_id = -1;
4983 	vdrz->vn_vre.vre_offset = UINT64_MAX;
4984 	vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
4985 	mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL);
4986 	cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL);
4987 	zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL);
4988 	mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL);
4989 	avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare,
4990 	    sizeof (reflow_node_t), offsetof(reflow_node_t, re_link));
4991 
4992 	vdrz->vd_physical_width = children;
4993 	vdrz->vd_nparity = nparity;
4994 
4995 	/* note, the ID does not exist when creating a pool */
4996 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
4997 	    &vdrz->vn_vre.vre_vdev_id);
4998 
4999 	boolean_t reflow_in_progress =
5000 	    nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
5001 	if (reflow_in_progress) {
5002 		spa->spa_raidz_expand = &vdrz->vn_vre;
5003 		vdrz->vn_vre.vre_state = DSS_SCANNING;
5004 	}
5005 
5006 	vdrz->vd_original_width = children;
5007 	uint64_t *txgs;
5008 	unsigned int txgs_size = 0;
5009 	error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
5010 	    &txgs, &txgs_size);
5011 	if (error == 0) {
5012 		for (int i = 0; i < txgs_size; i++) {
5013 			reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
5014 			re->re_txg = txgs[txgs_size - i - 1];
5015 			re->re_logical_width = vdrz->vd_physical_width - i;
5016 
5017 			if (reflow_in_progress)
5018 				re->re_logical_width--;
5019 
5020 			avl_add(&vdrz->vd_expand_txgs, re);
5021 		}
5022 
5023 		vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size;
5024 	}
5025 	if (reflow_in_progress) {
5026 		vdrz->vd_original_width--;
5027 		zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions",
5028 		    children, txgs_size);
5029 	}
5030 
5031 	*tsd = vdrz;
5032 
5033 	return (0);
5034 }
5035 
5036 static void
5037 vdev_raidz_fini(vdev_t *vd)
5038 {
5039 	vdev_raidz_t *vdrz = vd->vdev_tsd;
5040 	if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre)
5041 		vd->vdev_spa->spa_raidz_expand = NULL;
5042 	reflow_node_t *re;
5043 	void *cookie = NULL;
5044 	avl_tree_t *tree = &vdrz->vd_expand_txgs;
5045 	while ((re = avl_destroy_nodes(tree, &cookie)) != NULL)
5046 		kmem_free(re, sizeof (*re));
5047 	avl_destroy(&vdrz->vd_expand_txgs);
5048 	mutex_destroy(&vdrz->vd_expand_lock);
5049 	mutex_destroy(&vdrz->vn_vre.vre_lock);
5050 	cv_destroy(&vdrz->vn_vre.vre_cv);
5051 	zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock);
5052 	kmem_free(vdrz, sizeof (*vdrz));
5053 }
5054 
5055 /*
5056  * Add RAIDZ specific fields to the config nvlist.
5057  */
5058 static void
5059 vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
5060 {
5061 	ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
5062 	vdev_raidz_t *vdrz = vd->vdev_tsd;
5063 
5064 	/*
5065 	 * Make sure someone hasn't managed to sneak a fancy new vdev
5066 	 * into a crufty old storage pool.
5067 	 */
5068 	ASSERT(vdrz->vd_nparity == 1 ||
5069 	    (vdrz->vd_nparity <= 2 &&
5070 	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) ||
5071 	    (vdrz->vd_nparity <= 3 &&
5072 	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3));
5073 
5074 	/*
5075 	 * Note that we'll add these even on storage pools where they
5076 	 * aren't strictly required -- older software will just ignore
5077 	 * it.
5078 	 */
5079 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);
5080 
5081 	if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
5082 		fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
5083 	}
5084 
5085 	mutex_enter(&vdrz->vd_expand_lock);
5086 	if (!avl_is_empty(&vdrz->vd_expand_txgs)) {
5087 		uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs);
5088 		uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count,
5089 		    KM_SLEEP);
5090 		uint64_t i = 0;
5091 
5092 		for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs);
5093 		    re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) {
5094 			txgs[i++] = re->re_txg;
5095 		}
5096 
5097 		fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
5098 		    txgs, count);
5099 
5100 		kmem_free(txgs, sizeof (uint64_t) * count);
5101 	}
5102 	mutex_exit(&vdrz->vd_expand_lock);
5103 }
5104 
5105 static uint64_t
5106 vdev_raidz_nparity(vdev_t *vd)
5107 {
5108 	vdev_raidz_t *vdrz = vd->vdev_tsd;
5109 	return (vdrz->vd_nparity);
5110 }
5111 
5112 static uint64_t
5113 vdev_raidz_ndisks(vdev_t *vd)
5114 {
5115 	return (vd->vdev_children);
5116 }
5117 
5118 vdev_ops_t vdev_raidz_ops = {
5119 	.vdev_op_init = vdev_raidz_init,
5120 	.vdev_op_fini = vdev_raidz_fini,
5121 	.vdev_op_open = vdev_raidz_open,
5122 	.vdev_op_close = vdev_raidz_close,
5123 	.vdev_op_psize_to_asize = vdev_raidz_psize_to_asize,
5124 	.vdev_op_asize_to_psize = vdev_raidz_asize_to_psize,
5125 	.vdev_op_min_asize = vdev_raidz_min_asize,
5126 	.vdev_op_min_alloc = NULL,
5127 	.vdev_op_io_start = vdev_raidz_io_start,
5128 	.vdev_op_io_done = vdev_raidz_io_done,
5129 	.vdev_op_state_change = vdev_raidz_state_change,
5130 	.vdev_op_need_resilver = vdev_raidz_need_resilver,
5131 	.vdev_op_hold = NULL,
5132 	.vdev_op_rele = NULL,
5133 	.vdev_op_remap = NULL,
5134 	.vdev_op_xlate = vdev_raidz_xlate,
5135 	.vdev_op_rebuild_asize = NULL,
5136 	.vdev_op_metaslab_init = NULL,
5137 	.vdev_op_config_generate = vdev_raidz_config_generate,
5138 	.vdev_op_nparity = vdev_raidz_nparity,
5139 	.vdev_op_ndisks = vdev_raidz_ndisks,
5140 	.vdev_op_type = VDEV_TYPE_RAIDZ,	/* name of this vdev type */
5141 	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
5142 };
5143 
/*
 * Module parameters for RAIDZ expansion and aggregation, all declared
 * read-write (ZMOD_RW).  The string on each declaration is the parameter's
 * description.
 */
ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW,
	"For testing, pause RAIDZ expansion after reflowing this many bytes");
ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW,
	"Max amount of concurrent i/o for RAIDZ expansion");
ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW,
	"For expanded RAIDZ, aggregate reads that have more rows than this");
ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW,
	"For expanded RAIDZ, automatically start a pool scrub when expansion "
	"completes");
5153