xref: /freebsd/sys/contrib/openzfs/module/zfs/vdev_raidz.c (revision b64c5a0ace59af62eff52bfe110a521dc73c937b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or https://opensource.org/licenses/CDDL-1.0.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
25  * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
26  */
27 
28 #include <sys/zfs_context.h>
29 #include <sys/spa.h>
30 #include <sys/spa_impl.h>
31 #include <sys/zap.h>
32 #include <sys/vdev_impl.h>
33 #include <sys/metaslab_impl.h>
34 #include <sys/zio.h>
35 #include <sys/zio_checksum.h>
36 #include <sys/dmu_tx.h>
37 #include <sys/abd.h>
38 #include <sys/zfs_rlock.h>
39 #include <sys/fs/zfs.h>
40 #include <sys/fm/fs/zfs.h>
41 #include <sys/vdev_raidz.h>
42 #include <sys/vdev_raidz_impl.h>
43 #include <sys/vdev_draid.h>
44 #include <sys/uberblock_impl.h>
45 #include <sys/dsl_scan.h>
46 
47 #ifdef ZFS_DEBUG
48 #include <sys/vdev.h>	/* For vdev_xlate() in vdev_raidz_io_verify() */
49 #endif
50 
51 /*
52  * Virtual device vector for RAID-Z.
53  *
54  * This vdev supports single, double, and triple parity. For single parity,
55  * we use a simple XOR of all the data columns. For double or triple parity,
56  * we use a special case of Reed-Solomon coding. This extends the
57  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
58  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
59  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
60  * former is also based. The latter is designed to provide higher performance
61  * for writes.
62  *
63  * Note that the Plank paper claimed to support arbitrary N+M, but was then
64  * amended six years later identifying a critical flaw that invalidates its
65  * claims. Nevertheless, the technique can be adapted to work for up to
66  * triple parity. For additional parity, the amendment "Note: Correction to
67  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
68  * is viable, but the additional complexity means that write performance will
69  * suffer.
70  *
71  * All of the methods above operate on a Galois field, defined over the
72  * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
73  * can be expressed with a single byte. Briefly, the operations on the
74  * field are defined as follows:
75  *
76  *   o addition (+) is represented by a bitwise XOR
77  *   o subtraction (-) is therefore identical to addition: A + B = A - B
78  *   o multiplication of A by 2 is defined by the following bitwise expression:
79  *
80  *	(A * 2)_7 = A_6
81  *	(A * 2)_6 = A_5
82  *	(A * 2)_5 = A_4
83  *	(A * 2)_4 = A_3 + A_7
84  *	(A * 2)_3 = A_2 + A_7
85  *	(A * 2)_2 = A_1 + A_7
86  *	(A * 2)_1 = A_0
87  *	(A * 2)_0 = A_7
88  *
89  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
90  * As an aside, this multiplication is derived from the error correcting
91  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
92  *
93  * Observe that any number in the field (except for 0) can be expressed as a
94  * power of 2 -- a generator for the field. We store a table of the powers of
95  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
96  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
97  * than field addition). The inverse of a field element A (A^-1) is therefore
98  * A ^ (255 - 1) = A^254.
99  *
100  * The up-to-three parity columns, P, Q, R over several data columns,
101  * D_0, ... D_n-1, can be expressed by field operations:
102  *
103  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
104  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
105  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
106  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
107  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
108  *
109  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
110  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
111  * independent coefficients. (There are no additional coefficients that have
112  * this property which is why the uncorrected Plank method breaks down.)
113  *
114  * See the reconstruction code below for how P, Q and R can be used individually
115  * or in concert to recover missing data columns.
116  */
117 
/* Numeric identifiers for the three parity columns (P, Q and R). */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Field multiplication of a single byte value by 2 and by 4.
 *
 * The value is shifted left by one and, when bit 7 of the input is set,
 * 0x1d is XORed in.  The result is NOT truncated here: the expression has
 * the (promoted) type of the argument, so callers are expected to store it
 * into a byte-sized object.  The argument is evaluated more than once, so
 * it must be free of side effects.
 */
#define	VDEV_RAIDZ_MUL_2(x)	((((x) & 0x80) ? 0x1d : 0) ^ ((x) << 1))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
124 
/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 *
 * x    - 64-bit lvalue holding eight packed bytes; updated in place.
 * mask - 64-bit scratch lvalue; clobbered.
 *
 * Both arguments are evaluated several times and must be side-effect free.
 * The bodies are wrapped in do { } while (0) so that each macro expands to
 * exactly one statement and can be used safely in an unbraced if/else.
 * Every use must be terminated with a semicolon.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
do { \
	/* Isolate the top bit of every byte ... */ \
	(mask) = (x) & 0x8080808080808080ULL; \
	/* ... and widen each set top bit into a full 0xff byte. */ \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	/* Shift every byte left by one and fold in 0x1d where needed. */ \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
} while (0)

/* Multiply each packed byte by 4 by applying the multiply-by-2 step twice. */
#define	VDEV_RAIDZ_64MUL_4(x, mask) \
do { \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
} while (0)
144 
145 
146 /*
147  * Big Theory Statement for how a RAIDZ VDEV is expanded
148  *
149  * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion
150  * works with all three RAIDZ parity choices, including RAIDZ1, 2, or 3. VDEVs
151  * that have been previously expanded can be expanded again.
152  *
153  * The RAIDZ VDEV must be healthy (must be able to write to all the drives in
154  * the VDEV) when an expansion starts.  And the expansion will pause if any
155  * disk in the VDEV fails, and resume once the VDEV is healthy again. All other
156  * operations on the pool can continue while an expansion is in progress (e.g.
157  * read/write, snapshot, zpool add, etc), except zpool checkpoint, zpool trim,
158  * and zpool initialize, which can't be run during an expansion.  Following a
159  * reboot or export/import, the expansion resumes where it left off.
160  *
161  * == Reflowing the Data ==
162  *
163  * The expansion involves reflowing (copying) the data from the current set
164  * of disks to spread it across the new set which now has one more disk. This
165  * reflow operation is similar to reflowing text when the column width of a
166  * text editor window is expanded. The text doesn’t change but the location of
167  * the text changes to accommodate the new width. An example reflow result for
168  * a 4-wide RAIDZ1 to a 5-wide is shown below.
169  *
170  *                            Reflow End State
171  *            Each letter indicates a parity group (logical stripe)
172  *
173  *         Before expansion                         After Expansion
174  *     D1     D2     D3     D4               D1     D2     D3     D4     D5
175  *  +------+------+------+------+         +------+------+------+------+------+
176  *  |      |      |      |      |         |      |      |      |      |      |
177  *  |  A   |  A   |  A   |  A   |         |  A   |  A   |  A   |  A   |  B   |
178  *  |     1|     2|     3|     4|         |     1|     2|     3|     4|     5|
179  *  +------+------+------+------+         +------+------+------+------+------+
180  *  |      |      |      |      |         |      |      |      |      |      |
181  *  |  B   |  B   |  C   |  C   |         |  B   |  C   |  C   |  C   |  C   |
182  *  |     5|     6|     7|     8|         |     6|     7|     8|     9|    10|
183  *  +------+------+------+------+         +------+------+------+------+------+
184  *  |      |      |      |      |         |      |      |      |      |      |
185  *  |  C   |  C   |  D   |  D   |         |  D   |  D   |  E   |  E   |  E   |
186  *  |     9|    10|    11|    12|         |    11|    12|    13|    14|    15|
187  *  +------+------+------+------+         +------+------+------+------+------+
188  *  |      |      |      |      |         |      |      |      |      |      |
189  *  |  E   |  E   |  E   |  E   |   -->   |  E   |  F   |  F   |  G   |  G   |
190  *  |    13|    14|    15|    16|         |    16|    17|    18|p   19|    20|
191  *  +------+------+------+------+         +------+------+------+------+------+
192  *  |      |      |      |      |         |      |      |      |      |      |
193  *  |  F   |  F   |  G   |  G   |         |  G   |  G   |  H   |  H   |  H   |
194  *  |    17|    18|    19|    20|         |    21|    22|    23|    24|    25|
195  *  +------+------+------+------+         +------+------+------+------+------+
196  *  |      |      |      |      |         |      |      |      |      |      |
197  *  |  G   |  G   |  H   |  H   |         |  H   |  I   |  I   |  J   |  J   |
198  *  |    21|    22|    23|    24|         |    26|    27|    28|    29|    30|
199  *  +------+------+------+------+         +------+------+------+------+------+
200  *  |      |      |      |      |         |      |      |      |      |      |
201  *  |  H   |  H   |  I   |  I   |         |  J   |  J   |      |      |  K   |
202  *  |    25|    26|    27|    28|         |    31|    32|    33|    34|    35|
203  *  +------+------+------+------+         +------+------+------+------+------+
204  *
205  * This reflow approach has several advantages. There is no need to read or
206  * modify the block pointers or recompute any block checksums.  The reflow
207  * doesn’t need to know where the parity sectors reside. We can read and write
208  * data sequentially and the copy can occur in a background thread in open
209  * context. The design also allows for fast discovery of what data to copy.
210  *
211  * The VDEV metaslabs are processed, one at a time, to copy the block data to
212  * have it flow across all the disks. The metaslab is disabled for allocations
213  * during the copy. As an optimization, we only copy the allocated data which
214  * can be determined by looking at the metaslab range tree. During the copy we
215  * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still
216  * need to be able to survive losing parity count disks).  This means we
217  * cannot overwrite data during the reflow that would be needed if a disk is
218  * lost.
219  *
220  * After the reflow completes, all newly-written blocks will have the new
221  * layout, i.e., they will have the parity to data ratio implied by the new
222  * number of disks in the RAIDZ group.  Even though the reflow copies all of
223  * the allocated space (data and parity), it is only rearranged, not changed.
224  *
225  * This act of reflowing the data has a few implications about blocks
226  * that were written before the reflow completes:
227  *
228  *  - Old blocks will still use the same amount of space (i.e., they will have
229  *    the parity to data ratio implied by the old number of disks in the RAIDZ
230  *    group).
231  *  - Reading old blocks will be slightly slower than before the reflow, for
232  *    two reasons. First, we will have to read from all disks in the RAIDZ
233  *    VDEV, rather than being able to skip the children that contain only
234  *    parity of this block (because the data of a single block is now spread
235  *    out across all the disks).  Second, in most cases there will be an extra
236  *    bcopy, needed to rearrange the data back to its original layout in memory.
237  *
238  * == Scratch Area ==
239  *
240  * As we copy the block data, we can only progress to the point that writes
241  * will not overlap with blocks whose progress has not yet been recorded on
242  * disk.  Since partially-copied rows are always read from the old location,
243  * we need to stop one row before the sector-wise overlap, to prevent any
244  * row-wise overlap. For example, in the diagram above, when we reflow sector
245  * B6 it will overwrite the original location for B5.
246  *
247  * To get around this, a scratch space is used so that we can start copying
248  * without risking data loss by overlapping the row. As an added benefit, it
249  * improves performance at the beginning of the reflow, but that small perf
250  * boost wouldn't be worth the complexity on its own.
251  *
252  * Ideally we want to copy at least 2 * (new_width)^2 so that we have a
253  * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max
254  * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice
255  * the widths will likely be single digits so we can get a substantial chunk
256  * size using only a few MB of scratch per disk.
257  *
258  * The scratch area is persisted to disk which holds a large amount of reflowed
259  * state. We can always read the partially written stripes when a disk fails or
260  * the copy is interrupted (crash) during the initial copying phase and also
261  * get past a small chunk size restriction.  At a minimum, the scratch space
262  * must be large enough to get us to the point that one row does not overlap
263  * itself when moved (i.e new_width^2).  But going larger is even better. We
264  * use the 3.5 MiB reserved "boot" space that resides after the ZFS disk labels
265  * as our scratch space to handle overwriting the initial part of the VDEV.
266  *
267  *	0     256K   512K                    4M
268  *	+------+------+-----------------------+-----------------------------
269  *	| VDEV | VDEV |   Boot Block (3.5M)   |  Allocatable space ...
270  *	|  L0  |  L1  |       Reserved        |     (Metaslabs)
271  *	+------+------+-----------------------+-------------------------------
272  *                        Scratch Area
273  *
274  * == Reflow Progress Updates ==
275  * After the initial scratch-based reflow, the expansion process works
276  * similarly to device removal. We create a new open context thread which
277  * reflows the data, and periodically kicks off sync tasks to update logical
278  * state. In this case, state is the committed progress (offset of next data
279  * to copy). We need to persist the completed offset on disk, so that if we
280  * crash we know which format each VDEV offset is in.
281  *
282  * == Time Dependent Geometry ==
283  *
284  * In non-expanded RAIDZ, blocks are read from disk in a column by column
285  * fashion. For a multi-row block, the second sector is in the first column
286  * not in the second column. This allows us to issue full reads for each
287  * column directly into the request buffer. The block data is thus laid out
288  * sequentially in a column-by-column fashion.
289  *
290  * For example, in the before expansion diagram above, one logical block might
291  * be sectors G19-H26. The parity is in G19,H23; and the data is in
292  * G20,H24,G21,H25,G22,H26.
293  *
294  * After a block is reflowed, the sectors that were all in the original column
295  * data can now reside in different columns. When reading from an expanded
296  * VDEV, we need to know the logical stripe width for each block so we can
297  * reconstitute the block’s data after the reads are completed. Likewise,
298  * when we perform the combinatorial reconstruction we need to know the
299  * original width so we can retry combinations from the past layouts.
300  *
301  * Time dependent geometry is what we call having blocks with different layouts
302  * (stripe widths) in the same VDEV. This time-dependent geometry uses the
303  * block’s birth time (+ the time expansion ended) to establish the correct
304  * width for a given block. After an expansion completes, we record the time
305  * for blocks written with a particular width (geometry).
306  *
307  * == On Disk Format Changes ==
308  *
309  * New pool feature flag, 'raidz_expansion' whose reference count is the number
310  * of RAIDZ VDEVs that have been expanded.
311  *
312  * The blocks on expanded RAIDZ VDEV can have different logical stripe widths.
313  *
314  * Since the uberblock can point to arbitrary blocks, which might be on the
315  * expanding RAIDZ, and might or might not have been expanded, we need to know
316  * which way a block is laid out before reading it. This info is the next
317  * offset that needs to be reflowed and we persist that in the uberblock, in
318  * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev label.
319  * After the expansion is complete, we then use the raidz_expand_txgs array
320  * (see below) to determine how to read a block and the ub_raidz_reflow_info
321  * field no longer required.
322  *
323  * The uberblock's ub_raidz_reflow_info field also holds the scratch space
324  * state (i.e., active or not) which is also required before reading a block
325  * during the initial phase of reflowing the data.
326  *
327  * The top-level RAIDZ VDEV has two new entries in the nvlist:
328  *
329  * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here
330  *                            and used after the expansion is complete to
331  *                            determine how to read a raidz block
332  * 'raidz_expanding' boolean: present during reflow and removed after completion
333  *                            used during a spa import to resume an unfinished
334  *                            expansion
335  *
336  * And finally the VDEVs top zap adds the following informational entries:
337  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE
338  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME
339  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME
340  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED
341  */
342 
343 /*
344  * For testing only: pause the raidz expansion after reflowing this amount.
345  * (accessed by ZTS and ztest)
346  */
347 #ifdef	_KERNEL
348 static
349 #endif	/* _KERNEL */
350 unsigned long raidz_expand_max_reflow_bytes = 0;
351 
352 /*
353  * For testing only: pause the raidz expansion at a certain point.
354  */
355 uint_t raidz_expand_pause_point = 0;
356 
357 /*
358  * Maximum amount of copy io's outstanding at once.
359  */
360 #ifdef _ILP32
361 static unsigned long raidz_expand_max_copy_bytes = SPA_MAXBLOCKSIZE;
362 #else
363 static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE;
364 #endif
365 
366 /*
367  * Apply raidz map abds aggregation if the number of rows in the map is equal
368  * or greater than the value below.
369  */
370 static unsigned long raidz_io_aggregate_rows = 4;
371 
372 /*
373  * Automatically start a pool scrub when a RAIDZ expansion completes in
374  * order to verify the checksums of all blocks which have been copied
375  * during the expansion.  Automatic scrubbing is enabled by default and
376  * is strongly recommended.
377  */
378 static int zfs_scrub_after_expand = 1;
379 
380 static void
381 vdev_raidz_row_free(raidz_row_t *rr)
382 {
383 	for (int c = 0; c < rr->rr_cols; c++) {
384 		raidz_col_t *rc = &rr->rr_col[c];
385 
386 		if (rc->rc_size != 0)
387 			abd_free(rc->rc_abd);
388 		if (rc->rc_orig_data != NULL)
389 			abd_free(rc->rc_orig_data);
390 	}
391 
392 	if (rr->rr_abd_empty != NULL)
393 		abd_free(rr->rr_abd_empty);
394 
395 	kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols]));
396 }
397 
398 void
399 vdev_raidz_map_free(raidz_map_t *rm)
400 {
401 	for (int i = 0; i < rm->rm_nrows; i++)
402 		vdev_raidz_row_free(rm->rm_row[i]);
403 
404 	if (rm->rm_nphys_cols) {
405 		for (int i = 0; i < rm->rm_nphys_cols; i++) {
406 			if (rm->rm_phys_col[i].rc_abd != NULL)
407 				abd_free(rm->rm_phys_col[i].rc_abd);
408 		}
409 
410 		kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) *
411 		    rm->rm_nphys_cols);
412 	}
413 
414 	ASSERT3P(rm->rm_lr, ==, NULL);
415 	kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
416 }
417 
418 static void
419 vdev_raidz_map_free_vsd(zio_t *zio)
420 {
421 	raidz_map_t *rm = zio->io_vsd;
422 
423 	vdev_raidz_map_free(rm);
424 }
425 
426 static int
427 vdev_raidz_reflow_compare(const void *x1, const void *x2)
428 {
429 	const reflow_node_t *l = x1;
430 	const reflow_node_t *r = x2;
431 
432 	return (TREE_CMP(l->re_txg, r->re_txg));
433 }
434 
435 const zio_vsd_ops_t vdev_raidz_vsd_ops = {
436 	.vsd_free = vdev_raidz_map_free_vsd,
437 };
438 
439 raidz_row_t *
440 vdev_raidz_row_alloc(int cols, zio_t *zio)
441 {
442 	raidz_row_t *rr =
443 	    kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP);
444 
445 	rr->rr_cols = cols;
446 	rr->rr_scols = cols;
447 
448 	for (int c = 0; c < cols; c++) {
449 		raidz_col_t *rc = &rr->rr_col[c];
450 		rc->rc_shadow_devidx = INT_MAX;
451 		rc->rc_shadow_offset = UINT64_MAX;
452 		/*
453 		 * We can not allow self healing to take place for Direct I/O
454 		 * reads. There is nothing that stops the buffer contents from
455 		 * being manipulated while the I/O is in flight. It is possible
456 		 * that the checksum could be verified on the buffer and then
457 		 * the contents of that buffer are manipulated afterwards. This
458 		 * could lead to bad data being written out during self
459 		 * healing.
460 		 */
461 		if (!(zio->io_flags & ZIO_FLAG_DIO_READ))
462 			rc->rc_allow_repair = 1;
463 	}
464 	return (rr);
465 }
466 
467 static void
468 vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift)
469 {
470 	int c;
471 	int nwrapped = 0;
472 	uint64_t off = 0;
473 	raidz_row_t *rr = rm->rm_row[0];
474 
475 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
476 	ASSERT3U(rm->rm_nrows, ==, 1);
477 
478 	/*
479 	 * Pad any parity columns with additional space to account for skip
480 	 * sectors.
481 	 */
482 	if (rm->rm_skipstart < rr->rr_firstdatacol) {
483 		ASSERT0(rm->rm_skipstart);
484 		nwrapped = rm->rm_nskip;
485 	} else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) {
486 		nwrapped =
487 		    (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols;
488 	}
489 
490 	/*
491 	 * Optional single skip sectors (rc_size == 0) will be handled in
492 	 * vdev_raidz_io_start_write().
493 	 */
494 	int skipped = rr->rr_scols - rr->rr_cols;
495 
496 	/* Allocate buffers for the parity columns */
497 	for (c = 0; c < rr->rr_firstdatacol; c++) {
498 		raidz_col_t *rc = &rr->rr_col[c];
499 
500 		/*
501 		 * Parity columns will pad out a linear ABD to account for
502 		 * the skip sector. A linear ABD is used here because
503 		 * parity calculations use the ABD buffer directly to calculate
504 		 * parity. This avoids doing a memcpy back to the ABD after the
505 		 * parity has been calculated. By issuing the parity column
506 		 * with the skip sector we can reduce contention on the child
507 		 * VDEV queue locks (vq_lock).
508 		 */
509 		if (c < nwrapped) {
510 			rc->rc_abd = abd_alloc_linear(
511 			    rc->rc_size + (1ULL << ashift), B_FALSE);
512 			abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift);
513 			skipped++;
514 		} else {
515 			rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
516 		}
517 	}
518 
519 	for (off = 0; c < rr->rr_cols; c++) {
520 		raidz_col_t *rc = &rr->rr_col[c];
521 		abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct,
522 		    zio->io_abd, off, rc->rc_size);
523 
524 		/*
525 		 * Generate I/O for skip sectors to improve aggregation
526 		 * continuity. We will use gang ABD's to reduce contention
527 		 * on the child VDEV queue locks (vq_lock) by issuing
528 		 * a single I/O that contains the data and skip sector.
529 		 *
530 		 * It is important to make sure that rc_size is not updated
531 		 * even though we are adding a skip sector to the ABD. When
532 		 * calculating the parity in vdev_raidz_generate_parity_row()
533 		 * the rc_size is used to iterate through the ABD's. We can
534 		 * not have zero'd out skip sectors used for calculating
535 		 * parity for raidz, because those same sectors are not used
536 		 * during reconstruction.
537 		 */
538 		if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) {
539 			rc->rc_abd = abd_alloc_gang();
540 			abd_gang_add(rc->rc_abd, abd, B_TRUE);
541 			abd_gang_add(rc->rc_abd,
542 			    abd_get_zeros(1ULL << ashift), B_TRUE);
543 			skipped++;
544 		} else {
545 			rc->rc_abd = abd;
546 		}
547 		off += rc->rc_size;
548 	}
549 
550 	ASSERT3U(off, ==, zio->io_size);
551 	ASSERT3S(skipped, ==, rm->rm_nskip);
552 }
553 
554 static void
555 vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm)
556 {
557 	int c;
558 	raidz_row_t *rr = rm->rm_row[0];
559 
560 	ASSERT3U(rm->rm_nrows, ==, 1);
561 
562 	/* Allocate buffers for the parity columns */
563 	for (c = 0; c < rr->rr_firstdatacol; c++)
564 		rr->rr_col[c].rc_abd =
565 		    abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);
566 
567 	for (uint64_t off = 0; c < rr->rr_cols; c++) {
568 		raidz_col_t *rc = &rr->rr_col[c];
569 		rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
570 		    zio->io_abd, off, rc->rc_size);
571 		off += rc->rc_size;
572 	}
573 }
574 
575 /*
576  * Divides the IO evenly across all child vdevs; usually, dcols is
577  * the number of children in the target vdev.
578  *
579  * Avoid inlining the function to keep vdev_raidz_io_start(), which
580  * is this functions only caller, as small as possible on the stack.
581  */
582 noinline raidz_map_t *
583 vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
584     uint64_t nparity)
585 {
586 	raidz_row_t *rr;
587 	/* The starting RAIDZ (parent) vdev sector of the block. */
588 	uint64_t b = zio->io_offset >> ashift;
589 	/* The zio's size in units of the vdev's minimum sector size. */
590 	uint64_t s = zio->io_size >> ashift;
591 	/* The first column for this stripe. */
592 	uint64_t f = b % dcols;
593 	/* The starting byte offset on each child vdev. */
594 	uint64_t o = (b / dcols) << ashift;
595 	uint64_t acols, scols;
596 
597 	raidz_map_t *rm =
598 	    kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
599 	rm->rm_nrows = 1;
600 
601 	/*
602 	 * "Quotient": The number of data sectors for this stripe on all but
603 	 * the "big column" child vdevs that also contain "remainder" data.
604 	 */
605 	uint64_t q = s / (dcols - nparity);
606 
607 	/*
608 	 * "Remainder": The number of partial stripe data sectors in this I/O.
609 	 * This will add a sector to some, but not all, child vdevs.
610 	 */
611 	uint64_t r = s - q * (dcols - nparity);
612 
613 	/* The number of "big columns" - those which contain remainder data. */
614 	uint64_t bc = (r == 0 ? 0 : r + nparity);
615 
616 	/*
617 	 * The total number of data and parity sectors associated with
618 	 * this I/O.
619 	 */
620 	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
621 
622 	/*
623 	 * acols: The columns that will be accessed.
624 	 * scols: The columns that will be accessed or skipped.
625 	 */
626 	if (q == 0) {
627 		/* Our I/O request doesn't span all child vdevs. */
628 		acols = bc;
629 		scols = MIN(dcols, roundup(bc, nparity + 1));
630 	} else {
631 		acols = dcols;
632 		scols = dcols;
633 	}
634 
635 	ASSERT3U(acols, <=, scols);
636 	rr = vdev_raidz_row_alloc(scols, zio);
637 	rm->rm_row[0] = rr;
638 	rr->rr_cols = acols;
639 	rr->rr_bigcols = bc;
640 	rr->rr_firstdatacol = nparity;
641 #ifdef ZFS_DEBUG
642 	rr->rr_offset = zio->io_offset;
643 	rr->rr_size = zio->io_size;
644 #endif
645 
646 	uint64_t asize = 0;
647 
648 	for (uint64_t c = 0; c < scols; c++) {
649 		raidz_col_t *rc = &rr->rr_col[c];
650 		uint64_t col = f + c;
651 		uint64_t coff = o;
652 		if (col >= dcols) {
653 			col -= dcols;
654 			coff += 1ULL << ashift;
655 		}
656 		rc->rc_devidx = col;
657 		rc->rc_offset = coff;
658 
659 		if (c >= acols)
660 			rc->rc_size = 0;
661 		else if (c < bc)
662 			rc->rc_size = (q + 1) << ashift;
663 		else
664 			rc->rc_size = q << ashift;
665 
666 		asize += rc->rc_size;
667 	}
668 
669 	ASSERT3U(asize, ==, tot << ashift);
670 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
671 	rm->rm_skipstart = bc;
672 
673 	/*
674 	 * If all data stored spans all columns, there's a danger that parity
675 	 * will always be on the same device and, since parity isn't read
676 	 * during normal operation, that device's I/O bandwidth won't be
677 	 * used effectively. We therefore switch the parity every 1MB.
678 	 *
679 	 * ... at least that was, ostensibly, the theory. As a practical
680 	 * matter unless we juggle the parity between all devices evenly, we
681 	 * won't see any benefit. Further, occasional writes that aren't a
682 	 * multiple of the LCM of the number of children and the minimum
683 	 * stripe width are sufficient to avoid pessimal behavior.
684 	 * Unfortunately, this decision created an implicit on-disk format
685 	 * requirement that we need to support for all eternity, but only
686 	 * for single-parity RAID-Z.
687 	 *
688 	 * If we intend to skip a sector in the zeroth column for padding
689 	 * we must make sure to note this swap. We will never intend to
690 	 * skip the first column since at least one data and one parity
691 	 * column must appear in each row.
692 	 */
693 	ASSERT(rr->rr_cols >= 2);
694 	ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
695 
696 	if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
697 		uint64_t devidx = rr->rr_col[0].rc_devidx;
698 		o = rr->rr_col[0].rc_offset;
699 		rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
700 		rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
701 		rr->rr_col[1].rc_devidx = devidx;
702 		rr->rr_col[1].rc_offset = o;
703 		if (rm->rm_skipstart == 0)
704 			rm->rm_skipstart = 1;
705 	}
706 
707 	if (zio->io_type == ZIO_TYPE_WRITE) {
708 		vdev_raidz_map_alloc_write(zio, rm, ashift);
709 	} else {
710 		vdev_raidz_map_alloc_read(zio, rm);
711 	}
712 	/* init RAIDZ parity ops */
713 	rm->rm_ops = vdev_raidz_math_get_ops();
714 
715 	return (rm);
716 }
717 
718 /*
719  * Everything before reflow_offset_synced should have been moved to the new
720  * location (read and write completed).  However, this may not yet be reflected
721  * in the on-disk format (e.g. raidz_reflow_sync() has been called but the
722  * uberblock has not yet been written). If reflow is not in progress,
723  * reflow_offset_synced should be UINT64_MAX. For each row, if the row is
724  * entirely before reflow_offset_synced, it will come from the new location.
725  * Otherwise this row will come from the old location.  Therefore, rows that
726  * straddle the reflow_offset_synced will come from the old location.
727  *
728  * For writes, reflow_offset_next is the next offset to copy.  If a sector has
729  * been copied, but not yet reflected in the on-disk progress
730  * (reflow_offset_synced), it will also be written to the new (already copied)
731  * offset.
732  */
733 noinline raidz_map_t *
734 vdev_raidz_map_alloc_expanded(zio_t *zio,
735     uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
736     uint64_t nparity, uint64_t reflow_offset_synced,
737     uint64_t reflow_offset_next, boolean_t use_scratch)
738 {
739 	abd_t *abd = zio->io_abd;
740 	uint64_t offset = zio->io_offset;
741 	uint64_t size = zio->io_size;
742 
743 	/* The zio's size in units of the vdev's minimum sector size. */
744 	uint64_t s = size >> ashift;
745 
746 	/*
747 	 * "Quotient": The number of data sectors for this stripe on all but
748 	 * the "big column" child vdevs that also contain "remainder" data.
749 	 * AKA "full rows"
750 	 */
751 	uint64_t q = s / (logical_cols - nparity);
752 
753 	/*
754 	 * "Remainder": The number of partial stripe data sectors in this I/O.
755 	 * This will add a sector to some, but not all, child vdevs.
756 	 */
757 	uint64_t r = s - q * (logical_cols - nparity);
758 
759 	/* The number of "big columns" - those which contain remainder data. */
760 	uint64_t bc = (r == 0 ? 0 : r + nparity);
761 
762 	/*
763 	 * The total number of data and parity sectors associated with
764 	 * this I/O.
765 	 */
766 	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
767 
768 	/* How many rows contain data (not skip) */
769 	uint64_t rows = howmany(tot, logical_cols);
770 	int cols = MIN(tot, logical_cols);
771 
772 	raidz_map_t *rm =
773 	    kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
774 	    KM_SLEEP);
775 	rm->rm_nrows = rows;
776 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
777 	rm->rm_skipstart = bc;
778 	uint64_t asize = 0;
779 
780 	for (uint64_t row = 0; row < rows; row++) {
781 		boolean_t row_use_scratch = B_FALSE;
782 		raidz_row_t *rr = vdev_raidz_row_alloc(cols, zio);
783 		rm->rm_row[row] = rr;
784 
785 		/* The starting RAIDZ (parent) vdev sector of the row. */
786 		uint64_t b = (offset >> ashift) + row * logical_cols;
787 
788 		/*
789 		 * If we are in the middle of a reflow, and the copying has
790 		 * not yet completed for any part of this row, then use the
791 		 * old location of this row.  Note that reflow_offset_synced
792 		 * reflects the i/o that's been completed, because it's
793 		 * updated by a synctask, after zio_wait(spa_txg_zio[]).
794 		 * This is sufficient for our check, even if that progress
795 		 * has not yet been recorded to disk (reflected in
796 		 * spa_ubsync).  Also note that we consider the last row to
797 		 * be "full width" (`cols`-wide rather than `bc`-wide) for
798 		 * this calculation. This causes a tiny bit of unnecessary
799 		 * double-writes but is safe and simpler to calculate.
800 		 */
801 		int row_phys_cols = physical_cols;
802 		if (b + cols > reflow_offset_synced >> ashift)
803 			row_phys_cols--;
804 		else if (use_scratch)
805 			row_use_scratch = B_TRUE;
806 
807 		/* starting child of this row */
808 		uint64_t child_id = b % row_phys_cols;
809 		/* The starting byte offset on each child vdev. */
810 		uint64_t child_offset = (b / row_phys_cols) << ashift;
811 
812 		/*
813 		 * Note, rr_cols is the entire width of the block, even
814 		 * if this row is shorter.  This is needed because parity
815 		 * generation (for Q and R) needs to know the entire width,
816 		 * because it treats the short row as though it was
817 		 * full-width (and the "phantom" sectors were zero-filled).
818 		 *
819 		 * Another approach to this would be to set cols shorter
820 		 * (to just the number of columns that we might do i/o to)
821 		 * and have another mechanism to tell the parity generation
822 		 * about the "entire width".  Reconstruction (at least
823 		 * vdev_raidz_reconstruct_general()) would also need to
824 		 * know about the "entire width".
825 		 */
826 		rr->rr_firstdatacol = nparity;
827 #ifdef ZFS_DEBUG
828 		/*
829 		 * note: rr_size is PSIZE, not ASIZE
830 		 */
831 		rr->rr_offset = b << ashift;
832 		rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift;
833 #endif
834 
835 		for (int c = 0; c < rr->rr_cols; c++, child_id++) {
836 			if (child_id >= row_phys_cols) {
837 				child_id -= row_phys_cols;
838 				child_offset += 1ULL << ashift;
839 			}
840 			raidz_col_t *rc = &rr->rr_col[c];
841 			rc->rc_devidx = child_id;
842 			rc->rc_offset = child_offset;
843 
844 			/*
845 			 * Get this from the scratch space if appropriate.
846 			 * This only happens if we crashed in the middle of
847 			 * raidz_reflow_scratch_sync() (while it's running,
848 			 * the rangelock prevents us from doing concurrent
849 			 * io), and even then only during zpool import or
850 			 * when the pool is imported readonly.
851 			 */
852 			if (row_use_scratch)
853 				rc->rc_offset -= VDEV_BOOT_SIZE;
854 
855 			uint64_t dc = c - rr->rr_firstdatacol;
856 			if (c < rr->rr_firstdatacol) {
857 				rc->rc_size = 1ULL << ashift;
858 
859 				/*
860 				 * Parity sectors' rc_abd's are set below
861 				 * after determining if this is an aggregation.
862 				 */
863 			} else if (row == rows - 1 && bc != 0 && c >= bc) {
864 				/*
865 				 * Past the end of the block (even including
866 				 * skip sectors).  This sector is part of the
867 				 * map so that we have full rows for p/q parity
868 				 * generation.
869 				 */
870 				rc->rc_size = 0;
871 				rc->rc_abd = NULL;
872 			} else {
873 				/* "data column" (col excluding parity) */
874 				uint64_t off;
875 
876 				if (c < bc || r == 0) {
877 					off = dc * rows + row;
878 				} else {
879 					off = r * rows +
880 					    (dc - r) * (rows - 1) + row;
881 				}
882 				rc->rc_size = 1ULL << ashift;
883 				rc->rc_abd = abd_get_offset_struct(
884 				    &rc->rc_abdstruct, abd, off << ashift,
885 				    rc->rc_size);
886 			}
887 
888 			if (rc->rc_size == 0)
889 				continue;
890 
891 			/*
892 			 * If any part of this row is in both old and new
893 			 * locations, the primary location is the old
894 			 * location. If this sector was already copied to the
895 			 * new location, we need to also write to the new,
896 			 * "shadow" location.
897 			 *
898 			 * Note, `row_phys_cols != physical_cols` indicates
899 			 * that the primary location is the old location.
900 			 * `b+c < reflow_offset_next` indicates that the copy
901 			 * to the new location has been initiated. We know
902 			 * that the copy has completed because we have the
903 			 * rangelock, which is held exclusively while the
904 			 * copy is in progress.
905 			 */
906 			if (row_use_scratch ||
907 			    (row_phys_cols != physical_cols &&
908 			    b + c < reflow_offset_next >> ashift)) {
909 				rc->rc_shadow_devidx = (b + c) % physical_cols;
910 				rc->rc_shadow_offset =
911 				    ((b + c) / physical_cols) << ashift;
912 				if (row_use_scratch)
913 					rc->rc_shadow_offset -= VDEV_BOOT_SIZE;
914 			}
915 
916 			asize += rc->rc_size;
917 		}
918 
919 		/*
920 		 * See comment in vdev_raidz_map_alloc()
921 		 */
922 		if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
923 		    (offset & (1ULL << 20))) {
924 			ASSERT(rr->rr_cols >= 2);
925 			ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
926 
927 			int devidx0 = rr->rr_col[0].rc_devidx;
928 			uint64_t offset0 = rr->rr_col[0].rc_offset;
929 			int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx;
930 			uint64_t shadow_offset0 =
931 			    rr->rr_col[0].rc_shadow_offset;
932 
933 			rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
934 			rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
935 			rr->rr_col[0].rc_shadow_devidx =
936 			    rr->rr_col[1].rc_shadow_devidx;
937 			rr->rr_col[0].rc_shadow_offset =
938 			    rr->rr_col[1].rc_shadow_offset;
939 
940 			rr->rr_col[1].rc_devidx = devidx0;
941 			rr->rr_col[1].rc_offset = offset0;
942 			rr->rr_col[1].rc_shadow_devidx = shadow_devidx0;
943 			rr->rr_col[1].rc_shadow_offset = shadow_offset0;
944 		}
945 	}
946 	ASSERT3U(asize, ==, tot << ashift);
947 
948 	/*
949 	 * Determine if the block is contiguous, in which case we can use
950 	 * an aggregation.
951 	 */
952 	if (rows >= raidz_io_aggregate_rows) {
953 		rm->rm_nphys_cols = physical_cols;
954 		rm->rm_phys_col =
955 		    kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols,
956 		    KM_SLEEP);
957 
958 		/*
959 		 * Determine the aggregate io's offset and size, and check
960 		 * that the io is contiguous.
961 		 */
962 		for (int i = 0;
963 		    i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) {
964 			raidz_row_t *rr = rm->rm_row[i];
965 			for (int c = 0; c < rr->rr_cols; c++) {
966 				raidz_col_t *rc = &rr->rr_col[c];
967 				raidz_col_t *prc =
968 				    &rm->rm_phys_col[rc->rc_devidx];
969 
970 				if (rc->rc_size == 0)
971 					continue;
972 
973 				if (prc->rc_size == 0) {
974 					ASSERT0(prc->rc_offset);
975 					prc->rc_offset = rc->rc_offset;
976 				} else if (prc->rc_offset + prc->rc_size !=
977 				    rc->rc_offset) {
978 					/*
979 					 * This block is not contiguous and
980 					 * therefore can't be aggregated.
981 					 * This is expected to be rare, so
982 					 * the cost of allocating and then
983 					 * freeing rm_phys_col is not
984 					 * significant.
985 					 */
986 					kmem_free(rm->rm_phys_col,
987 					    sizeof (raidz_col_t) *
988 					    rm->rm_nphys_cols);
989 					rm->rm_phys_col = NULL;
990 					rm->rm_nphys_cols = 0;
991 					break;
992 				}
993 				prc->rc_size += rc->rc_size;
994 			}
995 		}
996 	}
997 	if (rm->rm_phys_col != NULL) {
998 		/*
999 		 * Allocate aggregate ABD's.
1000 		 */
1001 		for (int i = 0; i < rm->rm_nphys_cols; i++) {
1002 			raidz_col_t *prc = &rm->rm_phys_col[i];
1003 
1004 			prc->rc_devidx = i;
1005 
1006 			if (prc->rc_size == 0)
1007 				continue;
1008 
1009 			prc->rc_abd =
1010 			    abd_alloc_linear(rm->rm_phys_col[i].rc_size,
1011 			    B_FALSE);
1012 		}
1013 
1014 		/*
1015 		 * Point the parity abd's into the aggregate abd's.
1016 		 */
1017 		for (int i = 0; i < rm->rm_nrows; i++) {
1018 			raidz_row_t *rr = rm->rm_row[i];
1019 			for (int c = 0; c < rr->rr_firstdatacol; c++) {
1020 				raidz_col_t *rc = &rr->rr_col[c];
1021 				raidz_col_t *prc =
1022 				    &rm->rm_phys_col[rc->rc_devidx];
1023 				rc->rc_abd =
1024 				    abd_get_offset_struct(&rc->rc_abdstruct,
1025 				    prc->rc_abd,
1026 				    rc->rc_offset - prc->rc_offset,
1027 				    rc->rc_size);
1028 			}
1029 		}
1030 	} else {
1031 		/*
1032 		 * Allocate new abd's for the parity sectors.
1033 		 */
1034 		for (int i = 0; i < rm->rm_nrows; i++) {
1035 			raidz_row_t *rr = rm->rm_row[i];
1036 			for (int c = 0; c < rr->rr_firstdatacol; c++) {
1037 				raidz_col_t *rc = &rr->rr_col[c];
1038 				rc->rc_abd =
1039 				    abd_alloc_linear(rc->rc_size,
1040 				    B_TRUE);
1041 			}
1042 		}
1043 	}
1044 	/* init RAIDZ parity ops */
1045 	rm->rm_ops = vdev_raidz_math_get_ops();
1046 
1047 	return (rm);
1048 }
1049 
/*
 * Cursor state handed to the parity-generation callbacks
 * (vdev_raidz_p_func(), vdev_raidz_pq_func(), vdev_raidz_pqr_func()).
 * Each non-NULL member points at the next 64-bit word of the matching
 * parity buffer and is advanced by the callback; parities that are not
 * being generated are left NULL.  Callers fill this in with a positional
 * initializer, so the member order must not change.
 */
struct pqr_struct {
	uint64_t *p;	/* next word of the P parity buffer */
	uint64_t *q;	/* next word of the Q parity buffer, or NULL */
	uint64_t *r;	/* next word of the R parity buffer, or NULL */
};
1055 
1056 static int
1057 vdev_raidz_p_func(void *buf, size_t size, void *private)
1058 {
1059 	struct pqr_struct *pqr = private;
1060 	const uint64_t *src = buf;
1061 	int cnt = size / sizeof (src[0]);
1062 
1063 	ASSERT(pqr->p && !pqr->q && !pqr->r);
1064 
1065 	for (int i = 0; i < cnt; i++, src++, pqr->p++)
1066 		*pqr->p ^= *src;
1067 
1068 	return (0);
1069 }
1070 
1071 static int
1072 vdev_raidz_pq_func(void *buf, size_t size, void *private)
1073 {
1074 	struct pqr_struct *pqr = private;
1075 	const uint64_t *src = buf;
1076 	uint64_t mask;
1077 	int cnt = size / sizeof (src[0]);
1078 
1079 	ASSERT(pqr->p && pqr->q && !pqr->r);
1080 
1081 	for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
1082 		*pqr->p ^= *src;
1083 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
1084 		*pqr->q ^= *src;
1085 	}
1086 
1087 	return (0);
1088 }
1089 
1090 static int
1091 vdev_raidz_pqr_func(void *buf, size_t size, void *private)
1092 {
1093 	struct pqr_struct *pqr = private;
1094 	const uint64_t *src = buf;
1095 	uint64_t mask;
1096 	int cnt = size / sizeof (src[0]);
1097 
1098 	ASSERT(pqr->p && pqr->q && pqr->r);
1099 
1100 	for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
1101 		*pqr->p ^= *src;
1102 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
1103 		*pqr->q ^= *src;
1104 		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
1105 		*pqr->r ^= *src;
1106 	}
1107 
1108 	return (0);
1109 }
1110 
1111 static void
1112 vdev_raidz_generate_parity_p(raidz_row_t *rr)
1113 {
1114 	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1115 
1116 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1117 		abd_t *src = rr->rr_col[c].rc_abd;
1118 
1119 		if (c == rr->rr_firstdatacol) {
1120 			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1121 		} else {
1122 			struct pqr_struct pqr = { p, NULL, NULL };
1123 			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1124 			    vdev_raidz_p_func, &pqr);
1125 		}
1126 	}
1127 }
1128 
1129 static void
1130 vdev_raidz_generate_parity_pq(raidz_row_t *rr)
1131 {
1132 	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1133 	uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1134 	uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
1135 	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1136 	    rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1137 
1138 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1139 		abd_t *src = rr->rr_col[c].rc_abd;
1140 
1141 		uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
1142 
1143 		if (c == rr->rr_firstdatacol) {
1144 			ASSERT(ccnt == pcnt || ccnt == 0);
1145 			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1146 			(void) memcpy(q, p, rr->rr_col[c].rc_size);
1147 
1148 			for (uint64_t i = ccnt; i < pcnt; i++) {
1149 				p[i] = 0;
1150 				q[i] = 0;
1151 			}
1152 		} else {
1153 			struct pqr_struct pqr = { p, q, NULL };
1154 
1155 			ASSERT(ccnt <= pcnt);
1156 			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1157 			    vdev_raidz_pq_func, &pqr);
1158 
1159 			/*
1160 			 * Treat short columns as though they are full of 0s.
1161 			 * Note that there's therefore nothing needed for P.
1162 			 */
1163 			uint64_t mask;
1164 			for (uint64_t i = ccnt; i < pcnt; i++) {
1165 				VDEV_RAIDZ_64MUL_2(q[i], mask);
1166 			}
1167 		}
1168 	}
1169 }
1170 
1171 static void
1172 vdev_raidz_generate_parity_pqr(raidz_row_t *rr)
1173 {
1174 	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1175 	uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1176 	uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd);
1177 	uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
1178 	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1179 	    rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1180 	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1181 	    rr->rr_col[VDEV_RAIDZ_R].rc_size);
1182 
1183 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1184 		abd_t *src = rr->rr_col[c].rc_abd;
1185 
1186 		uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
1187 
1188 		if (c == rr->rr_firstdatacol) {
1189 			ASSERT(ccnt == pcnt || ccnt == 0);
1190 			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1191 			(void) memcpy(q, p, rr->rr_col[c].rc_size);
1192 			(void) memcpy(r, p, rr->rr_col[c].rc_size);
1193 
1194 			for (uint64_t i = ccnt; i < pcnt; i++) {
1195 				p[i] = 0;
1196 				q[i] = 0;
1197 				r[i] = 0;
1198 			}
1199 		} else {
1200 			struct pqr_struct pqr = { p, q, r };
1201 
1202 			ASSERT(ccnt <= pcnt);
1203 			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1204 			    vdev_raidz_pqr_func, &pqr);
1205 
1206 			/*
1207 			 * Treat short columns as though they are full of 0s.
1208 			 * Note that there's therefore nothing needed for P.
1209 			 */
1210 			uint64_t mask;
1211 			for (uint64_t i = ccnt; i < pcnt; i++) {
1212 				VDEV_RAIDZ_64MUL_2(q[i], mask);
1213 				VDEV_RAIDZ_64MUL_4(r[i], mask);
1214 			}
1215 		}
1216 	}
1217 }
1218 
1219 /*
1220  * Generate RAID parity in the first virtual columns according to the number of
1221  * parity columns available.
1222  */
1223 void
1224 vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr)
1225 {
1226 	if (rr->rr_cols == 0) {
1227 		/*
1228 		 * We are handling this block one row at a time (because
1229 		 * this block has a different logical vs physical width,
1230 		 * due to RAIDZ expansion), and this is a pad-only row,
1231 		 * which has no parity.
1232 		 */
1233 		return;
1234 	}
1235 
1236 	/* Generate using the new math implementation */
1237 	if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL)
1238 		return;
1239 
1240 	switch (rr->rr_firstdatacol) {
1241 	case 1:
1242 		vdev_raidz_generate_parity_p(rr);
1243 		break;
1244 	case 2:
1245 		vdev_raidz_generate_parity_pq(rr);
1246 		break;
1247 	case 3:
1248 		vdev_raidz_generate_parity_pqr(rr);
1249 		break;
1250 	default:
1251 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
1252 	}
1253 }
1254 
1255 void
1256 vdev_raidz_generate_parity(raidz_map_t *rm)
1257 {
1258 	for (int i = 0; i < rm->rm_nrows; i++) {
1259 		raidz_row_t *rr = rm->rm_row[i];
1260 		vdev_raidz_generate_parity_row(rm, rr);
1261 	}
1262 }
1263 
/*
 * abd_iterate_func2() callback for P reconstruction: XOR `size` bytes of
 * the source chunk into the destination chunk, one 64-bit word at a time.
 * Any trailing bytes that do not make up a whole word are left alone.
 * `private` is unused.  Always returns 0.
 */
static int
vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
{
	(void) private;
	uint64_t *out = dbuf;
	const uint64_t *in = sbuf;
	int nwords = size / sizeof (in[0]);

	while (nwords-- > 0)
		*out++ ^= *in++;

	return (0);
}
1278 
/*
 * abd_iterate_func2() callback for the first phase of Q reconstruction:
 * for every 64-bit word, run the destination through
 * VDEV_RAIDZ_64MUL_2() and then XOR the source word into it.
 * `private` is unused.  Always returns 0.
 */
static int
vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
    void *private)
{
	(void) private;
	uint64_t *out = dbuf;
	uint64_t *in = sbuf;
	int nwords = size / sizeof (out[0]);
	uint64_t mask;

	for (int w = 0; w < nwords; w++) {
		VDEV_RAIDZ_64MUL_2(out[w], mask);
		out[w] ^= in[w];
	}

	return (0);
}
1296 
/*
 * abd_iterate_func() callback for the part of the target column that has
 * no matching source data.  It applies the same VDEV_RAIDZ_64MUL_2() step
 * as vdev_raidz_reconst_q_pre_func() to each 64-bit word, with nothing to
 * XOR in.  `private` is unused.  Always returns 0.
 */
static int
vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
{
	(void) private;
	uint64_t *out = buf;
	int nwords = size / sizeof (out[0]);
	uint64_t mask;

	for (int w = 0; w < nwords; w++) {
		VDEV_RAIDZ_64MUL_2(out[w], mask);
	}

	return (0);
}
1312 
/*
 * State handed to vdev_raidz_reconst_q_post_func().  Filled in with a
 * positional initializer, so the member order must not change.
 */
struct reconst_q_struct {
	uint64_t *q;	/* next word of the Q parity; advanced by callback */
	int exp;	/* exponent passed to vdev_raidz_exp2() per byte */
};
1317 
1318 static int
1319 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
1320 {
1321 	struct reconst_q_struct *rq = private;
1322 	uint64_t *dst = buf;
1323 	int cnt = size / sizeof (dst[0]);
1324 
1325 	for (int i = 0; i < cnt; i++, dst++, rq->q++) {
1326 		int j;
1327 		uint8_t *b;
1328 
1329 		*dst ^= *rq->q;
1330 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
1331 			*b = vdev_raidz_exp2(*b, rq->exp);
1332 		}
1333 	}
1334 
1335 	return (0);
1336 }
1337 
/*
 * State handed to vdev_raidz_reconst_pq_func() and
 * vdev_raidz_reconst_pq_tail_func().  All four pointers are byte cursors
 * that the callbacks advance.  Filled in with a positional initializer,
 * so the member order must not change.
 */
struct reconst_pq_struct {
	uint8_t *p;	/* P parity as saved by vdev_raidz_reconstruct_pq() */
	uint8_t *q;	/* Q parity as saved by vdev_raidz_reconstruct_pq() */
	uint8_t *pxy;	/* P regenerated with columns x and y zero-length */
	uint8_t *qxy;	/* Q regenerated with columns x and y zero-length */
	int aexp;	/* exponent given to vdev_raidz_exp2() for P ^ Pxy */
	int bexp;	/* exponent given to vdev_raidz_exp2() for Q ^ Qxy */
};
1346 
1347 static int
1348 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
1349 {
1350 	struct reconst_pq_struct *rpq = private;
1351 	uint8_t *xd = xbuf;
1352 	uint8_t *yd = ybuf;
1353 
1354 	for (int i = 0; i < size;
1355 	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
1356 		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
1357 		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
1358 		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
1359 	}
1360 
1361 	return (0);
1362 }
1363 
1364 static int
1365 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
1366 {
1367 	struct reconst_pq_struct *rpq = private;
1368 	uint8_t *xd = xbuf;
1369 
1370 	for (int i = 0; i < size;
1371 	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
1372 		/* same operation as vdev_raidz_reconst_pq_func() on xd */
1373 		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
1374 		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
1375 	}
1376 
1377 	return (0);
1378 }
1379 
1380 static void
1381 vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts)
1382 {
1383 	int x = tgts[0];
1384 	abd_t *dst, *src;
1385 
1386 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1387 		zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x);
1388 
1389 	ASSERT3U(ntgts, ==, 1);
1390 	ASSERT3U(x, >=, rr->rr_firstdatacol);
1391 	ASSERT3U(x, <, rr->rr_cols);
1392 
1393 	ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size);
1394 
1395 	src = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
1396 	dst = rr->rr_col[x].rc_abd;
1397 
1398 	abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size);
1399 
1400 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1401 		uint64_t size = MIN(rr->rr_col[x].rc_size,
1402 		    rr->rr_col[c].rc_size);
1403 
1404 		src = rr->rr_col[c].rc_abd;
1405 
1406 		if (c == x)
1407 			continue;
1408 
1409 		(void) abd_iterate_func2(dst, src, 0, 0, size,
1410 		    vdev_raidz_reconst_p_func, NULL);
1411 	}
1412 }
1413 
1414 static void
1415 vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
1416 {
1417 	int x = tgts[0];
1418 	int c, exp;
1419 	abd_t *dst, *src;
1420 
1421 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1422 		zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x);
1423 
1424 	ASSERT(ntgts == 1);
1425 
1426 	ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1427 
1428 	for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1429 		uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size,
1430 		    rr->rr_col[c].rc_size);
1431 
1432 		src = rr->rr_col[c].rc_abd;
1433 		dst = rr->rr_col[x].rc_abd;
1434 
1435 		if (c == rr->rr_firstdatacol) {
1436 			abd_copy(dst, src, size);
1437 			if (rr->rr_col[x].rc_size > size) {
1438 				abd_zero_off(dst, size,
1439 				    rr->rr_col[x].rc_size - size);
1440 			}
1441 		} else {
1442 			ASSERT3U(size, <=, rr->rr_col[x].rc_size);
1443 			(void) abd_iterate_func2(dst, src, 0, 0, size,
1444 			    vdev_raidz_reconst_q_pre_func, NULL);
1445 			(void) abd_iterate_func(dst,
1446 			    size, rr->rr_col[x].rc_size - size,
1447 			    vdev_raidz_reconst_q_pre_tail_func, NULL);
1448 		}
1449 	}
1450 
1451 	src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
1452 	dst = rr->rr_col[x].rc_abd;
1453 	exp = 255 - (rr->rr_cols - 1 - x);
1454 
1455 	struct reconst_q_struct rq = { abd_to_buf(src), exp };
1456 	(void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size,
1457 	    vdev_raidz_reconst_q_post_func, &rq);
1458 }
1459 
1460 static void
1461 vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
1462 {
1463 	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
1464 	abd_t *pdata, *qdata;
1465 	uint64_t xsize, ysize;
1466 	int x = tgts[0];
1467 	int y = tgts[1];
1468 	abd_t *xd, *yd;
1469 
1470 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1471 		zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y);
1472 
1473 	ASSERT(ntgts == 2);
1474 	ASSERT(x < y);
1475 	ASSERT(x >= rr->rr_firstdatacol);
1476 	ASSERT(y < rr->rr_cols);
1477 
1478 	ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size);
1479 
1480 	/*
1481 	 * Move the parity data aside -- we're going to compute parity as
1482 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
1483 	 * reuse the parity generation mechanism without trashing the actual
1484 	 * parity so we make those columns appear to be full of zeros by
1485 	 * setting their lengths to zero.
1486 	 */
1487 	pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
1488 	qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
1489 	xsize = rr->rr_col[x].rc_size;
1490 	ysize = rr->rr_col[y].rc_size;
1491 
1492 	rr->rr_col[VDEV_RAIDZ_P].rc_abd =
1493 	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
1494 	rr->rr_col[VDEV_RAIDZ_Q].rc_abd =
1495 	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
1496 	rr->rr_col[x].rc_size = 0;
1497 	rr->rr_col[y].rc_size = 0;
1498 
1499 	vdev_raidz_generate_parity_pq(rr);
1500 
1501 	rr->rr_col[x].rc_size = xsize;
1502 	rr->rr_col[y].rc_size = ysize;
1503 
1504 	p = abd_to_buf(pdata);
1505 	q = abd_to_buf(qdata);
1506 	pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1507 	qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1508 	xd = rr->rr_col[x].rc_abd;
1509 	yd = rr->rr_col[y].rc_abd;
1510 
1511 	/*
1512 	 * We now have:
1513 	 *	Pxy = P + D_x + D_y
1514 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
1515 	 *
1516 	 * We can then solve for D_x:
1517 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
1518 	 * where
1519 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
1520 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
1521 	 *
1522 	 * With D_x in hand, we can easily solve for D_y:
1523 	 *	D_y = P + Pxy + D_x
1524 	 */
1525 
1526 	a = vdev_raidz_pow2[255 + x - y];
1527 	b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)];
1528 	tmp = 255 - vdev_raidz_log2[a ^ 1];
1529 
1530 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
1531 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
1532 
1533 	ASSERT3U(xsize, >=, ysize);
1534 	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
1535 
1536 	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
1537 	    vdev_raidz_reconst_pq_func, &rpq);
1538 	(void) abd_iterate_func(xd, ysize, xsize - ysize,
1539 	    vdev_raidz_reconst_pq_tail_func, &rpq);
1540 
1541 	abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1542 	abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1543 
1544 	/*
1545 	 * Restore the saved parity data.
1546 	 */
1547 	rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata;
1548 	rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;
1549 }
1550 
1551 /*
1552  * In the general case of reconstruction, we must solve the system of linear
1553  * equations defined by the coefficients used to generate parity as well as
1554  * the contents of the data and parity disks. This can be expressed with
1555  * vectors for the original data (D) and the actual data (d) and parity (p)
1556  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
1557  *
1558  *            __   __                     __     __
1559  *            |     |         __     __   |  p_0  |
1560  *            |  V  |         |  D_0  |   | p_m-1 |
1561  *            |     |    x    |   :   | = |  d_0  |
1562  *            |  I  |         | D_n-1 |   |   :   |
1563  *            |     |         ~~     ~~   | d_n-1 |
1564  *            ~~   ~~                     ~~     ~~
1565  *
1566  * I is simply a square identity matrix of size n, and V is a vandermonde
1567  * matrix defined by the coefficients we chose for the various parity columns
1568  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
1569  * computation as well as linear separability.
1570  *
1571  *      __               __               __     __
1572  *      |   1   ..  1 1 1 |               |  p_0  |
1573  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
1574  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
1575  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
1576  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
1577  *      |   :       : : : |   |   :   |   |  d_2  |
1578  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
1579  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
1580  *      |   0   ..  0 0 1 |               | d_n-1 |
1581  *      ~~               ~~               ~~     ~~
1582  *
1583  * Note that I, V, d, and p are known. To compute D, we must invert the
1584  * matrix and use the known data and parity values to reconstruct the unknown
1585  * data values. We begin by removing the rows in V|I and d|p that correspond
1586  * to failed or missing columns; we then make V|I square (n x n) and d|p
1587  * sized n by removing rows corresponding to unused parity from the bottom up
1588  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
1589  * using Gauss-Jordan elimination. In the example below we use m=3 parity
1590  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
1591  *           __                               __
1592  *           |  1   1   1   1   1   1   1   1  |
1593  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
1594  *           |  19 205 116  29  64  16  4   1  |      / /
1595  *           |  1   0   0   0   0   0   0   0  |     / /
1596  *           |  0   1   0   0   0   0   0   0  | <--' /
1597  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
1598  *           |  0   0   0   1   0   0   0   0  |
1599  *           |  0   0   0   0   1   0   0   0  |
1600  *           |  0   0   0   0   0   1   0   0  |
1601  *           |  0   0   0   0   0   0   1   0  |
1602  *           |  0   0   0   0   0   0   0   1  |
1603  *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           |  19 205 116  29  64  16  4   1  |
 *           |  1   0   0   0   0   0   0   0  |
 *  (V|I)' = |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
1617  *
1618  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1619  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1620  * matrix is not singular.
1621  * __                                                                 __
1622  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1623  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1624  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1625  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1626  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1627  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1628  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1629  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1630  * ~~                                                                 ~~
1631  * __                                                                 __
1632  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1633  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1634  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1635  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1636  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1637  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1638  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1639  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1640  * ~~                                                                 ~~
1641  * __                                                                 __
1642  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1643  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1644  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1645  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1646  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1647  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1648  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1649  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1650  * ~~                                                                 ~~
1651  * __                                                                 __
1652  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1653  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1654  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1655  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1656  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1657  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1658  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1659  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1660  * ~~                                                                 ~~
1661  * __                                                                 __
1662  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1663  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1664  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1665  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1666  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1667  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1668  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1669  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1670  * ~~                                                                 ~~
1671  * __                                                                 __
1672  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1673  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1674  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1675  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1676  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1677  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1678  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1679  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1680  * ~~                                                                 ~~
1681  *                   __                               __
1682  *                   |  0   0   1   0   0   0   0   0  |
1683  *                   | 167 100  5   41 159 169 217 208 |
1684  *                   | 166 100  4   40 158 168 216 209 |
1685  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1686  *                   |  0   0   0   0   1   0   0   0  |
1687  *                   |  0   0   0   0   0   1   0   0  |
1688  *                   |  0   0   0   0   0   0   1   0  |
1689  *                   |  0   0   0   0   0   0   0   1  |
1690  *                   ~~                               ~~
1691  *
1692  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1693  * of the missing data.
1694  *
1695  * As is apparent from the example above, the only non-trivial rows in the
1696  * inverse matrix correspond to the data disks that we're trying to
1697  * reconstruct. Indeed, those are the only rows we need as the others would
1698  * only be useful for reconstructing data known or assumed to be valid. For
1699  * that reason, we only build the coefficients in the rows that correspond to
1700  * targeted columns.
1701  */
1702 
1703 static void
1704 vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map,
1705     uint8_t **rows)
1706 {
1707 	int i, j;
1708 	int pow;
1709 
1710 	ASSERT(n == rr->rr_cols - rr->rr_firstdatacol);
1711 
1712 	/*
1713 	 * Fill in the missing rows of interest.
1714 	 */
1715 	for (i = 0; i < nmap; i++) {
1716 		ASSERT3S(0, <=, map[i]);
1717 		ASSERT3S(map[i], <=, 2);
1718 
1719 		pow = map[i] * n;
1720 		if (pow > 255)
1721 			pow -= 255;
1722 		ASSERT(pow <= 255);
1723 
1724 		for (j = 0; j < n; j++) {
1725 			pow -= map[i];
1726 			if (pow < 0)
1727 				pow += 255;
1728 			rows[i][j] = vdev_raidz_pow2[pow];
1729 		}
1730 	}
1731 }
1732 
1733 static void
1734 vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing,
1735     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1736 {
1737 	int i, j, ii, jj;
1738 	uint8_t log;
1739 
1740 	/*
1741 	 * Assert that the first nmissing entries from the array of used
1742 	 * columns correspond to parity columns and that subsequent entries
1743 	 * correspond to data columns.
1744 	 */
1745 	for (i = 0; i < nmissing; i++) {
1746 		ASSERT3S(used[i], <, rr->rr_firstdatacol);
1747 	}
1748 	for (; i < n; i++) {
1749 		ASSERT3S(used[i], >=, rr->rr_firstdatacol);
1750 	}
1751 
1752 	/*
1753 	 * First initialize the storage where we'll compute the inverse rows.
1754 	 */
1755 	for (i = 0; i < nmissing; i++) {
1756 		for (j = 0; j < n; j++) {
1757 			invrows[i][j] = (i == j) ? 1 : 0;
1758 		}
1759 	}
1760 
1761 	/*
1762 	 * Subtract all trivial rows from the rows of consequence.
1763 	 */
1764 	for (i = 0; i < nmissing; i++) {
1765 		for (j = nmissing; j < n; j++) {
1766 			ASSERT3U(used[j], >=, rr->rr_firstdatacol);
1767 			jj = used[j] - rr->rr_firstdatacol;
1768 			ASSERT3S(jj, <, n);
1769 			invrows[i][j] = rows[i][jj];
1770 			rows[i][jj] = 0;
1771 		}
1772 	}
1773 
1774 	/*
1775 	 * For each of the rows of interest, we must normalize it and subtract
1776 	 * a multiple of it from the other rows.
1777 	 */
1778 	for (i = 0; i < nmissing; i++) {
1779 		for (j = 0; j < missing[i]; j++) {
1780 			ASSERT0(rows[i][j]);
1781 		}
1782 		ASSERT3U(rows[i][missing[i]], !=, 0);
1783 
1784 		/*
1785 		 * Compute the inverse of the first element and multiply each
1786 		 * element in the row by that value.
1787 		 */
1788 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1789 
1790 		for (j = 0; j < n; j++) {
1791 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1792 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1793 		}
1794 
1795 		for (ii = 0; ii < nmissing; ii++) {
1796 			if (i == ii)
1797 				continue;
1798 
1799 			ASSERT3U(rows[ii][missing[i]], !=, 0);
1800 
1801 			log = vdev_raidz_log2[rows[ii][missing[i]]];
1802 
1803 			for (j = 0; j < n; j++) {
1804 				rows[ii][j] ^=
1805 				    vdev_raidz_exp2(rows[i][j], log);
1806 				invrows[ii][j] ^=
1807 				    vdev_raidz_exp2(invrows[i][j], log);
1808 			}
1809 		}
1810 	}
1811 
1812 	/*
1813 	 * Verify that the data that is left in the rows are properly part of
1814 	 * an identity matrix.
1815 	 */
1816 	for (i = 0; i < nmissing; i++) {
1817 		for (j = 0; j < n; j++) {
1818 			if (j == missing[i]) {
1819 				ASSERT3U(rows[i][j], ==, 1);
1820 			} else {
1821 				ASSERT0(rows[i][j]);
1822 			}
1823 		}
1824 	}
1825 }
1826 
1827 static void
1828 vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
1829     int *missing, uint8_t **invrows, const uint8_t *used)
1830 {
1831 	int i, j, x, cc, c;
1832 	uint8_t *src;
1833 	uint64_t ccount;
1834 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL };
1835 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 };
1836 	uint8_t log = 0;
1837 	uint8_t val;
1838 	int ll;
1839 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1840 	uint8_t *p, *pp;
1841 	size_t psize;
1842 
1843 	psize = sizeof (invlog[0][0]) * n * nmissing;
1844 	p = kmem_alloc(psize, KM_SLEEP);
1845 
1846 	for (pp = p, i = 0; i < nmissing; i++) {
1847 		invlog[i] = pp;
1848 		pp += n;
1849 	}
1850 
1851 	for (i = 0; i < nmissing; i++) {
1852 		for (j = 0; j < n; j++) {
1853 			ASSERT3U(invrows[i][j], !=, 0);
1854 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1855 		}
1856 	}
1857 
1858 	for (i = 0; i < n; i++) {
1859 		c = used[i];
1860 		ASSERT3U(c, <, rr->rr_cols);
1861 
1862 		ccount = rr->rr_col[c].rc_size;
1863 		ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0);
1864 		if (ccount == 0)
1865 			continue;
1866 		src = abd_to_buf(rr->rr_col[c].rc_abd);
1867 		for (j = 0; j < nmissing; j++) {
1868 			cc = missing[j] + rr->rr_firstdatacol;
1869 			ASSERT3U(cc, >=, rr->rr_firstdatacol);
1870 			ASSERT3U(cc, <, rr->rr_cols);
1871 			ASSERT3U(cc, !=, c);
1872 
1873 			dcount[j] = rr->rr_col[cc].rc_size;
1874 			if (dcount[j] != 0)
1875 				dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd);
1876 		}
1877 
1878 		for (x = 0; x < ccount; x++, src++) {
1879 			if (*src != 0)
1880 				log = vdev_raidz_log2[*src];
1881 
1882 			for (cc = 0; cc < nmissing; cc++) {
1883 				if (x >= dcount[cc])
1884 					continue;
1885 
1886 				if (*src == 0) {
1887 					val = 0;
1888 				} else {
1889 					if ((ll = log + invlog[cc][i]) >= 255)
1890 						ll -= 255;
1891 					val = vdev_raidz_pow2[ll];
1892 				}
1893 
1894 				if (i == 0)
1895 					dst[cc][x] = val;
1896 				else
1897 					dst[cc][x] ^= val;
1898 			}
1899 		}
1900 	}
1901 
1902 	kmem_free(p, psize);
1903 }
1904 
1905 static void
1906 vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
1907 {
1908 	int i, c, t, tt;
1909 	unsigned int n;
1910 	unsigned int nmissing_rows;
1911 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1912 	int parity_map[VDEV_RAIDZ_MAXPARITY];
1913 	uint8_t *p, *pp;
1914 	size_t psize;
1915 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1916 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1917 	uint8_t *used;
1918 
1919 	abd_t **bufs = NULL;
1920 
1921 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1922 		zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts);
1923 	/*
1924 	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
1925 	 * temporary linear ABDs if any non-linear ABDs are found.
1926 	 */
1927 	for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) {
1928 		ASSERT(rr->rr_col[i].rc_abd != NULL);
1929 		if (!abd_is_linear(rr->rr_col[i].rc_abd)) {
1930 			bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *),
1931 			    KM_PUSHPAGE);
1932 
1933 			for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1934 				raidz_col_t *col = &rr->rr_col[c];
1935 
1936 				bufs[c] = col->rc_abd;
1937 				if (bufs[c] != NULL) {
1938 					col->rc_abd = abd_alloc_linear(
1939 					    col->rc_size, B_TRUE);
1940 					abd_copy(col->rc_abd, bufs[c],
1941 					    col->rc_size);
1942 				}
1943 			}
1944 
1945 			break;
1946 		}
1947 	}
1948 
1949 	n = rr->rr_cols - rr->rr_firstdatacol;
1950 
1951 	/*
1952 	 * Figure out which data columns are missing.
1953 	 */
1954 	nmissing_rows = 0;
1955 	for (t = 0; t < ntgts; t++) {
1956 		if (tgts[t] >= rr->rr_firstdatacol) {
1957 			missing_rows[nmissing_rows++] =
1958 			    tgts[t] - rr->rr_firstdatacol;
1959 		}
1960 	}
1961 
1962 	/*
1963 	 * Figure out which parity columns to use to help generate the missing
1964 	 * data columns.
1965 	 */
1966 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1967 		ASSERT(tt < ntgts);
1968 		ASSERT(c < rr->rr_firstdatacol);
1969 
1970 		/*
1971 		 * Skip any targeted parity columns.
1972 		 */
1973 		if (c == tgts[tt]) {
1974 			tt++;
1975 			continue;
1976 		}
1977 
1978 		parity_map[i] = c;
1979 		i++;
1980 	}
1981 
1982 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1983 	    nmissing_rows * n + sizeof (used[0]) * n;
1984 	p = kmem_alloc(psize, KM_SLEEP);
1985 
1986 	for (pp = p, i = 0; i < nmissing_rows; i++) {
1987 		rows[i] = pp;
1988 		pp += n;
1989 		invrows[i] = pp;
1990 		pp += n;
1991 	}
1992 	used = pp;
1993 
1994 	for (i = 0; i < nmissing_rows; i++) {
1995 		used[i] = parity_map[i];
1996 	}
1997 
1998 	for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1999 		if (tt < nmissing_rows &&
2000 		    c == missing_rows[tt] + rr->rr_firstdatacol) {
2001 			tt++;
2002 			continue;
2003 		}
2004 
2005 		ASSERT3S(i, <, n);
2006 		used[i] = c;
2007 		i++;
2008 	}
2009 
2010 	/*
2011 	 * Initialize the interesting rows of the matrix.
2012 	 */
2013 	vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows);
2014 
2015 	/*
2016 	 * Invert the matrix.
2017 	 */
2018 	vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows,
2019 	    invrows, used);
2020 
2021 	/*
2022 	 * Reconstruct the missing data using the generated matrix.
2023 	 */
2024 	vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows,
2025 	    invrows, used);
2026 
2027 	kmem_free(p, psize);
2028 
2029 	/*
2030 	 * copy back from temporary linear abds and free them
2031 	 */
2032 	if (bufs) {
2033 		for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
2034 			raidz_col_t *col = &rr->rr_col[c];
2035 
2036 			if (bufs[c] != NULL) {
2037 				abd_copy(bufs[c], col->rc_abd, col->rc_size);
2038 				abd_free(col->rc_abd);
2039 			}
2040 			col->rc_abd = bufs[c];
2041 		}
2042 		kmem_free(bufs, rr->rr_cols * sizeof (abd_t *));
2043 	}
2044 }
2045 
2046 static void
2047 vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
2048     const int *t, int nt)
2049 {
2050 	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
2051 	int ntgts;
2052 	int i, c, ret;
2053 	int nbadparity, nbaddata;
2054 	int parity_valid[VDEV_RAIDZ_MAXPARITY];
2055 
2056 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
2057 		zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)",
2058 		    rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata,
2059 		    (int)rr->rr_missingparity);
2060 	}
2061 
2062 	nbadparity = rr->rr_firstdatacol;
2063 	nbaddata = rr->rr_cols - nbadparity;
2064 	ntgts = 0;
2065 	for (i = 0, c = 0; c < rr->rr_cols; c++) {
2066 		if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
2067 			zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u "
2068 			    "offset=%llx error=%u)",
2069 			    rr, c, (int)rr->rr_col[c].rc_devidx,
2070 			    (long long)rr->rr_col[c].rc_offset,
2071 			    (int)rr->rr_col[c].rc_error);
2072 		}
2073 		if (c < rr->rr_firstdatacol)
2074 			parity_valid[c] = B_FALSE;
2075 
2076 		if (i < nt && c == t[i]) {
2077 			tgts[ntgts++] = c;
2078 			i++;
2079 		} else if (rr->rr_col[c].rc_error != 0) {
2080 			tgts[ntgts++] = c;
2081 		} else if (c >= rr->rr_firstdatacol) {
2082 			nbaddata--;
2083 		} else {
2084 			parity_valid[c] = B_TRUE;
2085 			nbadparity--;
2086 		}
2087 	}
2088 
2089 	ASSERT(ntgts >= nt);
2090 	ASSERT(nbaddata >= 0);
2091 	ASSERT(nbaddata + nbadparity == ntgts);
2092 
2093 	dt = &tgts[nbadparity];
2094 
2095 	/* Reconstruct using the new math implementation */
2096 	ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata);
2097 	if (ret != RAIDZ_ORIGINAL_IMPL)
2098 		return;
2099 
2100 	/*
2101 	 * See if we can use any of our optimized reconstruction routines.
2102 	 */
2103 	switch (nbaddata) {
2104 	case 1:
2105 		if (parity_valid[VDEV_RAIDZ_P]) {
2106 			vdev_raidz_reconstruct_p(rr, dt, 1);
2107 			return;
2108 		}
2109 
2110 		ASSERT(rr->rr_firstdatacol > 1);
2111 
2112 		if (parity_valid[VDEV_RAIDZ_Q]) {
2113 			vdev_raidz_reconstruct_q(rr, dt, 1);
2114 			return;
2115 		}
2116 
2117 		ASSERT(rr->rr_firstdatacol > 2);
2118 		break;
2119 
2120 	case 2:
2121 		ASSERT(rr->rr_firstdatacol > 1);
2122 
2123 		if (parity_valid[VDEV_RAIDZ_P] &&
2124 		    parity_valid[VDEV_RAIDZ_Q]) {
2125 			vdev_raidz_reconstruct_pq(rr, dt, 2);
2126 			return;
2127 		}
2128 
2129 		ASSERT(rr->rr_firstdatacol > 2);
2130 
2131 		break;
2132 	}
2133 
2134 	vdev_raidz_reconstruct_general(rr, tgts, ntgts);
2135 }
2136 
2137 static int
2138 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
2139     uint64_t *logical_ashift, uint64_t *physical_ashift)
2140 {
2141 	vdev_raidz_t *vdrz = vd->vdev_tsd;
2142 	uint64_t nparity = vdrz->vd_nparity;
2143 	int c;
2144 	int lasterror = 0;
2145 	int numerrors = 0;
2146 
2147 	ASSERT(nparity > 0);
2148 
2149 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
2150 	    vd->vdev_children < nparity + 1) {
2151 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
2152 		return (SET_ERROR(EINVAL));
2153 	}
2154 
2155 	vdev_open_children(vd);
2156 
2157 	for (c = 0; c < vd->vdev_children; c++) {
2158 		vdev_t *cvd = vd->vdev_child[c];
2159 
2160 		if (cvd->vdev_open_error != 0) {
2161 			lasterror = cvd->vdev_open_error;
2162 			numerrors++;
2163 			continue;
2164 		}
2165 
2166 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
2167 		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
2168 		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
2169 	}
2170 	for (c = 0; c < vd->vdev_children; c++) {
2171 		vdev_t *cvd = vd->vdev_child[c];
2172 
2173 		if (cvd->vdev_open_error != 0)
2174 			continue;
2175 		*physical_ashift = vdev_best_ashift(*logical_ashift,
2176 		    *physical_ashift, cvd->vdev_physical_ashift);
2177 	}
2178 
2179 	if (vd->vdev_rz_expanding) {
2180 		*asize *= vd->vdev_children - 1;
2181 		*max_asize *= vd->vdev_children - 1;
2182 
2183 		vd->vdev_min_asize = *asize;
2184 	} else {
2185 		*asize *= vd->vdev_children;
2186 		*max_asize *= vd->vdev_children;
2187 	}
2188 
2189 	if (numerrors > nparity) {
2190 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
2191 		return (lasterror);
2192 	}
2193 
2194 	return (0);
2195 }
2196 
2197 static void
2198 vdev_raidz_close(vdev_t *vd)
2199 {
2200 	for (int c = 0; c < vd->vdev_children; c++) {
2201 		if (vd->vdev_child[c] != NULL)
2202 			vdev_close(vd->vdev_child[c]);
2203 	}
2204 }
2205 
2206 /*
2207  * Return the logical width to use, given the txg in which the allocation
2208  * happened.  Note that BP_GET_BIRTH() is usually the txg in which the
2209  * BP was allocated.  Remapped BP's (that were relocated due to device
2210  * removal, see remap_blkptr_cb()), will have a more recent physical birth
2211  * which reflects when the BP was relocated, but we can ignore these because
2212  * they can't be on RAIDZ (device removal doesn't support RAIDZ).
2213  */
2214 static uint64_t
2215 vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg)
2216 {
2217 	reflow_node_t lookup = {
2218 		.re_txg = txg,
2219 	};
2220 	avl_index_t where;
2221 
2222 	uint64_t width;
2223 	mutex_enter(&vdrz->vd_expand_lock);
2224 	reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where);
2225 	if (re != NULL) {
2226 		width = re->re_logical_width;
2227 	} else {
2228 		re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE);
2229 		if (re != NULL)
2230 			width = re->re_logical_width;
2231 		else
2232 			width = vdrz->vd_original_width;
2233 	}
2234 	mutex_exit(&vdrz->vd_expand_lock);
2235 	return (width);
2236 }
2237 
2238 /*
2239  * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated
2240  * more space due to the lower data-to-parity ratio.  In this case it's
2241  * important to pass in the correct txg.  Note that vdev_gang_header_asize()
2242  * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE,
2243  * regardless of txg.  This is assured because for a single data sector, we
2244  * allocate P+1 sectors regardless of width ("cols", which is at least P+1).
2245  */
2246 static uint64_t
2247 vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
2248 {
2249 	vdev_raidz_t *vdrz = vd->vdev_tsd;
2250 	uint64_t asize;
2251 	uint64_t ashift = vd->vdev_top->vdev_ashift;
2252 	uint64_t cols = vdrz->vd_original_width;
2253 	uint64_t nparity = vdrz->vd_nparity;
2254 
2255 	cols = vdev_raidz_get_logical_width(vdrz, txg);
2256 
2257 	asize = ((psize - 1) >> ashift) + 1;
2258 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
2259 	asize = roundup(asize, nparity + 1) << ashift;
2260 
2261 #ifdef ZFS_DEBUG
2262 	uint64_t asize_new = ((psize - 1) >> ashift) + 1;
2263 	uint64_t ncols_new = vdrz->vd_physical_width;
2264 	asize_new += nparity * ((asize_new + ncols_new - nparity - 1) /
2265 	    (ncols_new - nparity));
2266 	asize_new = roundup(asize_new, nparity + 1) << ashift;
2267 	VERIFY3U(asize_new, <=, asize);
2268 #endif
2269 
2270 	return (asize);
2271 }
2272 
2273 /*
2274  * The allocatable space for a raidz vdev is N * sizeof(smallest child)
2275  * so each child must provide at least 1/Nth of its asize.
2276  */
2277 static uint64_t
2278 vdev_raidz_min_asize(vdev_t *vd)
2279 {
2280 	return ((vd->vdev_min_asize + vd->vdev_children - 1) /
2281 	    vd->vdev_children);
2282 }
2283 
2284 void
2285 vdev_raidz_child_done(zio_t *zio)
2286 {
2287 	raidz_col_t *rc = zio->io_private;
2288 
2289 	ASSERT3P(rc->rc_abd, !=, NULL);
2290 	rc->rc_error = zio->io_error;
2291 	rc->rc_tried = 1;
2292 	rc->rc_skipped = 0;
2293 }
2294 
2295 static void
2296 vdev_raidz_shadow_child_done(zio_t *zio)
2297 {
2298 	raidz_col_t *rc = zio->io_private;
2299 
2300 	rc->rc_shadow_error = zio->io_error;
2301 }
2302 
2303 static void
2304 vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col)
2305 {
2306 	(void) rm;
2307 #ifdef ZFS_DEBUG
2308 	range_seg64_t logical_rs, physical_rs, remain_rs;
2309 	logical_rs.rs_start = rr->rr_offset;
2310 	logical_rs.rs_end = logical_rs.rs_start +
2311 	    vdev_raidz_asize(zio->io_vd, rr->rr_size,
2312 	    BP_GET_BIRTH(zio->io_bp));
2313 
2314 	raidz_col_t *rc = &rr->rr_col[col];
2315 	vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
2316 
2317 	vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
2318 	ASSERT(vdev_xlate_is_empty(&remain_rs));
2319 	if (vdev_xlate_is_empty(&physical_rs)) {
2320 		/*
2321 		 * If we are in the middle of expansion, the
2322 		 * physical->logical mapping is changing so vdev_xlate()
2323 		 * can't give us a reliable answer.
2324 		 */
2325 		return;
2326 	}
2327 	ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
2328 	ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
2329 	/*
2330 	 * It would be nice to assert that rs_end is equal
2331 	 * to rc_offset + rc_size but there might be an
2332 	 * optional I/O at the end that is not accounted in
2333 	 * rc_size.
2334 	 */
2335 	if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
2336 		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
2337 		    rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift));
2338 	} else {
2339 		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
2340 	}
2341 #endif
2342 }
2343 
2344 static void
2345 vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr)
2346 {
2347 	vdev_t *vd = zio->io_vd;
2348 	raidz_map_t *rm = zio->io_vsd;
2349 
2350 	vdev_raidz_generate_parity_row(rm, rr);
2351 
2352 	for (int c = 0; c < rr->rr_scols; c++) {
2353 		raidz_col_t *rc = &rr->rr_col[c];
2354 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2355 
2356 		/* Verify physical to logical translation */
2357 		vdev_raidz_io_verify(zio, rm, rr, c);
2358 
2359 		if (rc->rc_size == 0)
2360 			continue;
2361 
2362 		ASSERT3U(rc->rc_offset + rc->rc_size, <,
2363 		    cvd->vdev_psize - VDEV_LABEL_END_SIZE);
2364 
2365 		ASSERT3P(rc->rc_abd, !=, NULL);
2366 		zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2367 		    rc->rc_offset, rc->rc_abd,
2368 		    abd_get_size(rc->rc_abd), zio->io_type,
2369 		    zio->io_priority, 0, vdev_raidz_child_done, rc));
2370 
2371 		if (rc->rc_shadow_devidx != INT_MAX) {
2372 			vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx];
2373 
2374 			ASSERT3U(
2375 			    rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <,
2376 			    cvd2->vdev_psize - VDEV_LABEL_END_SIZE);
2377 
2378 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd2,
2379 			    rc->rc_shadow_offset, rc->rc_abd,
2380 			    abd_get_size(rc->rc_abd),
2381 			    zio->io_type, zio->io_priority, 0,
2382 			    vdev_raidz_shadow_child_done, rc));
2383 		}
2384 	}
2385 }
2386 
2387 /*
2388  * Generate optional I/Os for skip sectors to improve aggregation contiguity.
2389  * This only works for vdev_raidz_map_alloc() (not _expanded()).
2390  */
2391 static void
2392 raidz_start_skip_writes(zio_t *zio)
2393 {
2394 	vdev_t *vd = zio->io_vd;
2395 	uint64_t ashift = vd->vdev_top->vdev_ashift;
2396 	raidz_map_t *rm = zio->io_vsd;
2397 	ASSERT3U(rm->rm_nrows, ==, 1);
2398 	raidz_row_t *rr = rm->rm_row[0];
2399 	for (int c = 0; c < rr->rr_scols; c++) {
2400 		raidz_col_t *rc = &rr->rr_col[c];
2401 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2402 		if (rc->rc_size != 0)
2403 			continue;
2404 		ASSERT3P(rc->rc_abd, ==, NULL);
2405 
2406 		ASSERT3U(rc->rc_offset, <,
2407 		    cvd->vdev_psize - VDEV_LABEL_END_SIZE);
2408 
2409 		zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset,
2410 		    NULL, 1ULL << ashift, zio->io_type, zio->io_priority,
2411 		    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
2412 	}
2413 }
2414 
2415 static void
2416 vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
2417 {
2418 	vdev_t *vd = zio->io_vd;
2419 
2420 	/*
2421 	 * Iterate over the columns in reverse order so that we hit the parity
2422 	 * last -- any errors along the way will force us to read the parity.
2423 	 */
2424 	for (int c = rr->rr_cols - 1; c >= 0; c--) {
2425 		raidz_col_t *rc = &rr->rr_col[c];
2426 		if (rc->rc_size == 0)
2427 			continue;
2428 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2429 		if (!vdev_readable(cvd)) {
2430 			if (c >= rr->rr_firstdatacol)
2431 				rr->rr_missingdata++;
2432 			else
2433 				rr->rr_missingparity++;
2434 			rc->rc_error = SET_ERROR(ENXIO);
2435 			rc->rc_tried = 1;	/* don't even try */
2436 			rc->rc_skipped = 1;
2437 			continue;
2438 		}
2439 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2440 			if (c >= rr->rr_firstdatacol)
2441 				rr->rr_missingdata++;
2442 			else
2443 				rr->rr_missingparity++;
2444 			rc->rc_error = SET_ERROR(ESTALE);
2445 			rc->rc_skipped = 1;
2446 			continue;
2447 		}
2448 		if (forceparity ||
2449 		    c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
2450 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
2451 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2452 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
2453 			    zio->io_type, zio->io_priority, 0,
2454 			    vdev_raidz_child_done, rc));
2455 		}
2456 	}
2457 }
2458 
2459 static void
2460 vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm)
2461 {
2462 	vdev_t *vd = zio->io_vd;
2463 
2464 	for (int i = 0; i < rm->rm_nphys_cols; i++) {
2465 		raidz_col_t *prc = &rm->rm_phys_col[i];
2466 		if (prc->rc_size == 0)
2467 			continue;
2468 
2469 		ASSERT3U(prc->rc_devidx, ==, i);
2470 		vdev_t *cvd = vd->vdev_child[i];
2471 		if (!vdev_readable(cvd)) {
2472 			prc->rc_error = SET_ERROR(ENXIO);
2473 			prc->rc_tried = 1;	/* don't even try */
2474 			prc->rc_skipped = 1;
2475 			continue;
2476 		}
2477 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2478 			prc->rc_error = SET_ERROR(ESTALE);
2479 			prc->rc_skipped = 1;
2480 			continue;
2481 		}
2482 		zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2483 		    prc->rc_offset, prc->rc_abd, prc->rc_size,
2484 		    zio->io_type, zio->io_priority, 0,
2485 		    vdev_raidz_child_done, prc));
2486 	}
2487 }
2488 
2489 static void
2490 vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm)
2491 {
2492 	/*
2493 	 * If there are multiple rows, we will be hitting
2494 	 * all disks, so go ahead and read the parity so
2495 	 * that we are reading in decent size chunks.
2496 	 */
2497 	boolean_t forceparity = rm->rm_nrows > 1;
2498 
2499 	if (rm->rm_phys_col) {
2500 		vdev_raidz_io_start_read_phys_cols(zio, rm);
2501 	} else {
2502 		for (int i = 0; i < rm->rm_nrows; i++) {
2503 			raidz_row_t *rr = rm->rm_row[i];
2504 			vdev_raidz_io_start_read_row(zio, rr, forceparity);
2505 		}
2506 	}
2507 }
2508 
2509 /*
2510  * Start an IO operation on a RAIDZ VDev
2511  *
2512  * Outline:
2513  * - For write operations:
2514  *   1. Generate the parity data
2515  *   2. Create child zio write operations to each column's vdev, for both
2516  *      data and parity.
2517  *   3. If the column skips any sectors for padding, create optional dummy
2518  *      write zio children for those areas to improve aggregation continuity.
2519  * - For read operations:
2520  *   1. Create child zio read operations to each data column's vdev to read
2521  *      the range of data required for zio.
2522  *   2. If this is a scrub or resilver operation, or if any of the data
2523  *      vdevs have had errors, then create zio read operations to the parity
2524  *      columns' VDevs as well.
2525  */
2526 static void
2527 vdev_raidz_io_start(zio_t *zio)
2528 {
2529 	vdev_t *vd = zio->io_vd;
2530 	vdev_t *tvd = vd->vdev_top;
2531 	vdev_raidz_t *vdrz = vd->vdev_tsd;
2532 	raidz_map_t *rm;
2533 
2534 	uint64_t logical_width = vdev_raidz_get_logical_width(vdrz,
2535 	    BP_GET_BIRTH(zio->io_bp));
2536 	if (logical_width != vdrz->vd_physical_width) {
2537 		zfs_locked_range_t *lr = NULL;
2538 		uint64_t synced_offset = UINT64_MAX;
2539 		uint64_t next_offset = UINT64_MAX;
2540 		boolean_t use_scratch = B_FALSE;
2541 		/*
2542 		 * Note: when the expansion is completing, we set
2543 		 * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync())
2544 		 * in a later txg than when we last update spa_ubsync's state
2545 		 * (see the end of spa_raidz_expand_thread()).  Therefore we
2546 		 * may see vre_state!=SCANNING before
2547 		 * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected
2548 		 * on disk, but the copying progress has been synced to disk
2549 		 * (and reflected in spa_ubsync).  In this case it's fine to
2550 		 * treat the expansion as completed, since if we crash there's
2551 		 * no additional copying to do.
2552 		 */
2553 		if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
2554 			ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==,
2555 			    &vdrz->vn_vre);
2556 			lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock,
2557 			    zio->io_offset, zio->io_size, RL_READER);
2558 			use_scratch =
2559 			    (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) ==
2560 			    RRSS_SCRATCH_VALID);
2561 			synced_offset =
2562 			    RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync);
2563 			next_offset = vdrz->vn_vre.vre_offset;
2564 			/*
2565 			 * If we haven't resumed expanding since importing the
2566 			 * pool, vre_offset won't have been set yet.  In
2567 			 * this case the next offset to be copied is the same
2568 			 * as what was synced.
2569 			 */
2570 			if (next_offset == UINT64_MAX) {
2571 				next_offset = synced_offset;
2572 			}
2573 		}
2574 		if (use_scratch) {
2575 			zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced="
2576 			    "%lld next_offset=%lld use_scratch=%u",
2577 			    zio,
2578 			    zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ",
2579 			    (long long)zio->io_offset,
2580 			    (long long)synced_offset,
2581 			    (long long)next_offset,
2582 			    use_scratch);
2583 		}
2584 
2585 		rm = vdev_raidz_map_alloc_expanded(zio,
2586 		    tvd->vdev_ashift, vdrz->vd_physical_width,
2587 		    logical_width, vdrz->vd_nparity,
2588 		    synced_offset, next_offset, use_scratch);
2589 		rm->rm_lr = lr;
2590 	} else {
2591 		rm = vdev_raidz_map_alloc(zio,
2592 		    tvd->vdev_ashift, logical_width, vdrz->vd_nparity);
2593 	}
2594 	rm->rm_original_width = vdrz->vd_original_width;
2595 
2596 	zio->io_vsd = rm;
2597 	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
2598 	if (zio->io_type == ZIO_TYPE_WRITE) {
2599 		for (int i = 0; i < rm->rm_nrows; i++) {
2600 			vdev_raidz_io_start_write(zio, rm->rm_row[i]);
2601 		}
2602 
2603 		if (logical_width == vdrz->vd_physical_width) {
2604 			raidz_start_skip_writes(zio);
2605 		}
2606 	} else {
2607 		ASSERT(zio->io_type == ZIO_TYPE_READ);
2608 		vdev_raidz_io_start_read(zio, rm);
2609 	}
2610 
2611 	zio_execute(zio);
2612 }
2613 
2614 /*
2615  * Report a checksum error for a child of a RAID-Z device.
2616  */
2617 void
2618 vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
2619 {
2620 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
2621 
2622 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) &&
2623 	    zio->io_priority != ZIO_PRIORITY_REBUILD) {
2624 		zio_bad_cksum_t zbc;
2625 		raidz_map_t *rm = zio->io_vsd;
2626 
2627 		zbc.zbc_has_cksum = 0;
2628 		zbc.zbc_injected = rm->rm_ecksuminjected;
2629 
2630 		mutex_enter(&vd->vdev_stat_lock);
2631 		vd->vdev_stat.vs_checksum_errors++;
2632 		mutex_exit(&vd->vdev_stat_lock);
2633 		(void) zfs_ereport_post_checksum(zio->io_spa, vd,
2634 		    &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
2635 		    rc->rc_abd, bad_data, &zbc);
2636 	}
2637 }
2638 
2639 /*
2640  * We keep track of whether or not there were any injected errors, so that
2641  * any ereports we generate can note it.
2642  */
2643 static int
2644 raidz_checksum_verify(zio_t *zio)
2645 {
2646 	zio_bad_cksum_t zbc = {0};
2647 	raidz_map_t *rm = zio->io_vsd;
2648 
2649 	int ret = zio_checksum_error(zio, &zbc);
2650 	/*
2651 	 * Any Direct I/O read that has a checksum error must be treated as
2652 	 * suspicious as the contents of the buffer could be getting
2653 	 * manipulated while the I/O is taking place. The checksum verify error
2654 	 * will be reported to the top-level RAIDZ VDEV.
2655 	 */
2656 	if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) {
2657 		zio->io_error = ret;
2658 		zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
2659 		zio_dio_chksum_verify_error_report(zio);
2660 		zio_checksum_verified(zio);
2661 		return (0);
2662 	}
2663 
2664 	if (ret != 0 && zbc.zbc_injected != 0)
2665 		rm->rm_ecksuminjected = 1;
2666 
2667 	return (ret);
2668 }
2669 
2670 /*
2671  * Generate the parity from the data columns. If we tried and were able to
2672  * read the parity without error, verify that the generated parity matches the
2673  * data we read. If it doesn't, we fire off a checksum error. Return the
2674  * number of such failures.
2675  */
2676 static int
2677 raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
2678 {
2679 	abd_t *orig[VDEV_RAIDZ_MAXPARITY];
2680 	int c, ret = 0;
2681 	raidz_map_t *rm = zio->io_vsd;
2682 	raidz_col_t *rc;
2683 
2684 	blkptr_t *bp = zio->io_bp;
2685 	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
2686 	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
2687 
2688 	if (checksum == ZIO_CHECKSUM_NOPARITY)
2689 		return (ret);
2690 
2691 	for (c = 0; c < rr->rr_firstdatacol; c++) {
2692 		rc = &rr->rr_col[c];
2693 		if (!rc->rc_tried || rc->rc_error != 0)
2694 			continue;
2695 
2696 		orig[c] = rc->rc_abd;
2697 		ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size);
2698 		rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
2699 	}
2700 
2701 	/*
2702 	 * Verify any empty sectors are zero filled to ensure the parity
2703 	 * is calculated correctly even if these non-data sectors are damaged.
2704 	 */
2705 	if (rr->rr_nempty && rr->rr_abd_empty != NULL)
2706 		ret += vdev_draid_map_verify_empty(zio, rr);
2707 
2708 	/*
2709 	 * Regenerates parity even for !tried||rc_error!=0 columns.  This
2710 	 * isn't harmful but it does have the side effect of fixing stuff
2711 	 * we didn't realize was necessary (i.e. even if we return 0).
2712 	 */
2713 	vdev_raidz_generate_parity_row(rm, rr);
2714 
2715 	for (c = 0; c < rr->rr_firstdatacol; c++) {
2716 		rc = &rr->rr_col[c];
2717 
2718 		if (!rc->rc_tried || rc->rc_error != 0)
2719 			continue;
2720 
2721 		if (abd_cmp(orig[c], rc->rc_abd) != 0) {
2722 			zfs_dbgmsg("found error on col=%u devidx=%u off %llx",
2723 			    c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset);
2724 			vdev_raidz_checksum_error(zio, rc, orig[c]);
2725 			rc->rc_error = SET_ERROR(ECKSUM);
2726 			ret++;
2727 		}
2728 		abd_free(orig[c]);
2729 	}
2730 
2731 	return (ret);
2732 }
2733 
2734 static int
2735 vdev_raidz_worst_error(raidz_row_t *rr)
2736 {
2737 	int error = 0;
2738 
2739 	for (int c = 0; c < rr->rr_cols; c++) {
2740 		error = zio_worst_error(error, rr->rr_col[c].rc_error);
2741 		error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error);
2742 	}
2743 
2744 	return (error);
2745 }
2746 
2747 static void
2748 vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
2749 {
2750 	int unexpected_errors = 0;
2751 	int parity_errors = 0;
2752 	int parity_untried = 0;
2753 	int data_errors = 0;
2754 
2755 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
2756 
2757 	for (int c = 0; c < rr->rr_cols; c++) {
2758 		raidz_col_t *rc = &rr->rr_col[c];
2759 
2760 		if (rc->rc_error) {
2761 			if (c < rr->rr_firstdatacol)
2762 				parity_errors++;
2763 			else
2764 				data_errors++;
2765 
2766 			if (!rc->rc_skipped)
2767 				unexpected_errors++;
2768 		} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
2769 			parity_untried++;
2770 		}
2771 
2772 		if (rc->rc_force_repair)
2773 			unexpected_errors++;
2774 	}
2775 
2776 	/*
2777 	 * If we read more parity disks than were used for
2778 	 * reconstruction, confirm that the other parity disks produced
2779 	 * correct data.
2780 	 *
2781 	 * Note that we also regenerate parity when resilvering so we
2782 	 * can write it out to failed devices later.
2783 	 */
2784 	if (parity_errors + parity_untried <
2785 	    rr->rr_firstdatacol - data_errors ||
2786 	    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2787 		int n = raidz_parity_verify(zio, rr);
2788 		unexpected_errors += n;
2789 	}
2790 
2791 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2792 	    (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2793 		/*
2794 		 * Use the good data we have in hand to repair damaged children.
2795 		 */
2796 		for (int c = 0; c < rr->rr_cols; c++) {
2797 			raidz_col_t *rc = &rr->rr_col[c];
2798 			vdev_t *vd = zio->io_vd;
2799 			vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2800 
2801 			if (!rc->rc_allow_repair) {
2802 				continue;
2803 			} else if (!rc->rc_force_repair &&
2804 			    (rc->rc_error == 0 || rc->rc_size == 0)) {
2805 				continue;
2806 			}
2807 			/*
2808 			 * We do not allow self healing for Direct I/O reads.
2809 			 * See comment in vdev_raid_row_alloc().
2810 			 */
2811 			ASSERT0(zio->io_flags & ZIO_FLAG_DIO_READ);
2812 
2813 			zfs_dbgmsg("zio=%px repairing c=%u devidx=%u "
2814 			    "offset=%llx",
2815 			    zio, c, rc->rc_devidx, (long long)rc->rc_offset);
2816 
2817 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2818 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
2819 			    ZIO_TYPE_WRITE,
2820 			    zio->io_priority == ZIO_PRIORITY_REBUILD ?
2821 			    ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
2822 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2823 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2824 		}
2825 	}
2826 
2827 	/*
2828 	 * Scrub or resilver i/o's: overwrite any shadow locations with the
2829 	 * good data.  This ensures that if we've already copied this sector,
2830 	 * it will be corrected if it was damaged.  This writes more than is
2831 	 * necessary, but since expansion is paused during scrub/resilver, at
2832 	 * most a single row will have a shadow location.
2833 	 */
2834 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2835 	    (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) {
2836 		for (int c = 0; c < rr->rr_cols; c++) {
2837 			raidz_col_t *rc = &rr->rr_col[c];
2838 			vdev_t *vd = zio->io_vd;
2839 
2840 			if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0)
2841 				continue;
2842 			vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx];
2843 
2844 			/*
2845 			 * Note: We don't want to update the repair stats
2846 			 * because that would incorrectly indicate that there
2847 			 * was bad data to repair, which we aren't sure about.
2848 			 * By clearing the SCAN_THREAD flag, we prevent this
2849 			 * from happening, despite having the REPAIR flag set.
2850 			 * We need to set SELF_HEAL so that this i/o can't be
2851 			 * bypassed by zio_vdev_io_start().
2852 			 */
2853 			zio_t *cio = zio_vdev_child_io(zio, NULL, cvd,
2854 			    rc->rc_shadow_offset, rc->rc_abd, rc->rc_size,
2855 			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2856 			    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
2857 			    NULL, NULL);
2858 			cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD;
2859 			zio_nowait(cio);
2860 		}
2861 	}
2862 }
2863 
2864 static void
2865 raidz_restore_orig_data(raidz_map_t *rm)
2866 {
2867 	for (int i = 0; i < rm->rm_nrows; i++) {
2868 		raidz_row_t *rr = rm->rm_row[i];
2869 		for (int c = 0; c < rr->rr_cols; c++) {
2870 			raidz_col_t *rc = &rr->rr_col[c];
2871 			if (rc->rc_need_orig_restore) {
2872 				abd_copy(rc->rc_abd,
2873 				    rc->rc_orig_data, rc->rc_size);
2874 				rc->rc_need_orig_restore = B_FALSE;
2875 			}
2876 		}
2877 	}
2878 }
2879 
2880 /*
2881  * During raidz_reconstruct() for expanded VDEV, we need special consideration
2882  * failure simulations.  See note in raidz_reconstruct() on simulating failure
2883  * of a pre-expansion device.
2884  *
2885  * Treating logical child i as failed, return TRUE if the given column should
2886  * be treated as failed.  The idea of logical children allows us to imagine
2887  * that a disk silently failed before a RAIDZ expansion (reads from this disk
2888  * succeed but return the wrong data).  Since the expansion doesn't verify
2889  * checksums, the incorrect data will be moved to new locations spread among
2890  * the children (going diagonally across them).
2891  *
2892  * Higher "logical child failures" (values of `i`) indicate these
2893  * "pre-expansion failures".  The first physical_width values imagine that a
2894  * current child failed; the next physical_width-1 values imagine that a
2895  * child failed before the most recent expansion; the next physical_width-2
2896  * values imagine a child failed in the expansion before that, etc.
2897  */
2898 static boolean_t
2899 raidz_simulate_failure(int physical_width, int original_width, int ashift,
2900     int i, raidz_col_t *rc)
2901 {
2902 	uint64_t sector_id =
2903 	    physical_width * (rc->rc_offset >> ashift) +
2904 	    rc->rc_devidx;
2905 
2906 	for (int w = physical_width; w >= original_width; w--) {
2907 		if (i < w) {
2908 			return (sector_id % w == i);
2909 		} else {
2910 			i -= w;
2911 		}
2912 	}
2913 	ASSERT(!"invalid logical child id");
2914 	return (B_FALSE);
2915 }
2916 
2917 /*
2918  * returns EINVAL if reconstruction of the block will not be possible
2919  * returns ECKSUM if this specific reconstruction failed
2920  * returns 0 on successful reconstruction
2921  */
2922 static int
2923 raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
2924 {
2925 	raidz_map_t *rm = zio->io_vsd;
2926 	int physical_width = zio->io_vd->vdev_children;
2927 	int original_width = (rm->rm_original_width != 0) ?
2928 	    rm->rm_original_width : physical_width;
2929 	int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT;
2930 
2931 	if (dbgmsg) {
2932 		zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u "
2933 		    "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts);
2934 	}
2935 
2936 	/* Reconstruct each row */
2937 	for (int r = 0; r < rm->rm_nrows; r++) {
2938 		raidz_row_t *rr = rm->rm_row[r];
2939 		int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */
2940 		int t = 0;
2941 		int dead = 0;
2942 		int dead_data = 0;
2943 
2944 		if (dbgmsg)
2945 			zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r);
2946 
2947 		for (int c = 0; c < rr->rr_cols; c++) {
2948 			raidz_col_t *rc = &rr->rr_col[c];
2949 			ASSERT0(rc->rc_need_orig_restore);
2950 			if (rc->rc_error != 0) {
2951 				dead++;
2952 				if (c >= nparity)
2953 					dead_data++;
2954 				continue;
2955 			}
2956 			if (rc->rc_size == 0)
2957 				continue;
2958 			for (int lt = 0; lt < ntgts; lt++) {
2959 				if (raidz_simulate_failure(physical_width,
2960 				    original_width,
2961 				    zio->io_vd->vdev_top->vdev_ashift,
2962 				    ltgts[lt], rc)) {
2963 					if (rc->rc_orig_data == NULL) {
2964 						rc->rc_orig_data =
2965 						    abd_alloc_linear(
2966 						    rc->rc_size, B_TRUE);
2967 						abd_copy(rc->rc_orig_data,
2968 						    rc->rc_abd, rc->rc_size);
2969 					}
2970 					rc->rc_need_orig_restore = B_TRUE;
2971 
2972 					dead++;
2973 					if (c >= nparity)
2974 						dead_data++;
2975 					/*
2976 					 * Note: simulating failure of a
2977 					 * pre-expansion device can hit more
2978 					 * than one column, in which case we
2979 					 * might try to simulate more failures
2980 					 * than can be reconstructed, which is
2981 					 * also more than the size of my_tgts.
2982 					 * This check prevents accessing past
2983 					 * the end of my_tgts.  The "dead >
2984 					 * nparity" check below will fail this
2985 					 * reconstruction attempt.
2986 					 */
2987 					if (t < VDEV_RAIDZ_MAXPARITY) {
2988 						my_tgts[t++] = c;
2989 						if (dbgmsg) {
2990 							zfs_dbgmsg("simulating "
2991 							    "failure of col %u "
2992 							    "devidx %u", c,
2993 							    (int)rc->rc_devidx);
2994 						}
2995 					}
2996 					break;
2997 				}
2998 			}
2999 		}
3000 		if (dead > nparity) {
3001 			/* reconstruction not possible */
3002 			if (dbgmsg) {
3003 				zfs_dbgmsg("reconstruction not possible; "
3004 				    "too many failures");
3005 			}
3006 			raidz_restore_orig_data(rm);
3007 			return (EINVAL);
3008 		}
3009 		if (dead_data > 0)
3010 			vdev_raidz_reconstruct_row(rm, rr, my_tgts, t);
3011 	}
3012 
3013 	/* Check for success */
3014 	if (raidz_checksum_verify(zio) == 0) {
3015 		if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
3016 			return (0);
3017 
3018 		/* Reconstruction succeeded - report errors */
3019 		for (int i = 0; i < rm->rm_nrows; i++) {
3020 			raidz_row_t *rr = rm->rm_row[i];
3021 
3022 			for (int c = 0; c < rr->rr_cols; c++) {
3023 				raidz_col_t *rc = &rr->rr_col[c];
3024 				if (rc->rc_need_orig_restore) {
3025 					/*
3026 					 * Note: if this is a parity column,
3027 					 * we don't really know if it's wrong.
3028 					 * We need to let
3029 					 * vdev_raidz_io_done_verified() check
3030 					 * it, and if we set rc_error, it will
3031 					 * think that it is a "known" error
3032 					 * that doesn't need to be checked
3033 					 * or corrected.
3034 					 */
3035 					if (rc->rc_error == 0 &&
3036 					    c >= rr->rr_firstdatacol) {
3037 						vdev_raidz_checksum_error(zio,
3038 						    rc, rc->rc_orig_data);
3039 						rc->rc_error =
3040 						    SET_ERROR(ECKSUM);
3041 					}
3042 					rc->rc_need_orig_restore = B_FALSE;
3043 				}
3044 			}
3045 
3046 			vdev_raidz_io_done_verified(zio, rr);
3047 		}
3048 
3049 		zio_checksum_verified(zio);
3050 
3051 		if (dbgmsg) {
3052 			zfs_dbgmsg("reconstruction successful "
3053 			    "(checksum verified)");
3054 		}
3055 		return (0);
3056 	}
3057 
3058 	/* Reconstruction failed - restore original data */
3059 	raidz_restore_orig_data(rm);
3060 	if (dbgmsg) {
3061 		zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum "
3062 		    "failed", zio);
3063 	}
3064 	return (ECKSUM);
3065 }
3066 
3067 /*
3068  * Iterate over all combinations of N bad vdevs and attempt a reconstruction.
3069  * Note that the algorithm below is non-optimal because it doesn't take into
3070  * account how reconstruction is actually performed. For example, with
3071  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
3072  * is targeted as invalid as if columns 1 and 4 are targeted since in both
3073  * cases we'd only use parity information in column 0.
3074  *
3075  * The order that we find the various possible combinations of failed
3076  * disks is dictated by these rules:
3077  * - Examine each "slot" (the "i" in tgts[i])
3078  *   - Try to increment this slot (tgts[i] += 1)
3079  *   - if we can't increment because it runs into the next slot,
3080  *     reset our slot to the minimum, and examine the next slot
3081  *
3082  *  For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
3083  *  3 columns to reconstruct), we will generate the following sequence:
3084  *
3085  *  STATE        ACTION
3086  *  0 1 2        special case: skip since these are all parity
3087  *  0 1   3      first slot: reset to 0; middle slot: increment to 2
3088  *  0   2 3      first slot: increment to 1
3089  *    1 2 3      first: reset to 0; middle: reset to 1; last: increment to 4
3090  *  0 1     4    first: reset to 0; middle: increment to 2
3091  *  0   2   4    first: increment to 1
3092  *    1 2   4    first: reset to 0; middle: increment to 3
3093  *  0     3 4    first: increment to 1
3094  *    1   3 4    first: increment to 2
3095  *      2 3 4    first: reset to 0; middle: reset to 1; last: increment to 5
3096  *  0 1       5  first: reset to 0; middle: increment to 2
3097  *  0   2     5  first: increment to 1
3098  *    1 2     5  first: reset to 0; middle: increment to 3
3099  *  0     3   5  first: increment to 1
3100  *    1   3   5  first: increment to 2
3101  *      2 3   5  first: reset to 0; middle: increment to 4
3102  *  0       4 5  first: increment to 1
3103  *    1     4 5  first: increment to 2
3104  *      2   4 5  first: increment to 3
3105  *        3 4 5  done
3106  *
3107  * This strategy works for dRAID but is less efficient when there are a large
3108  * number of child vdevs and therefore permutations to check. Furthermore,
3109  * since the raidz_map_t rows likely do not overlap, reconstruction would be
3110  * possible as long as there are no more than nparity data errors per row.
3111  * These additional permutations are not currently checked but could be as
3112  * a future improvement.
3113  *
3114  * Returns 0 on success, ECKSUM on failure.
3115  */
3116 static int
3117 vdev_raidz_combrec(zio_t *zio)
3118 {
3119 	int nparity = vdev_get_nparity(zio->io_vd);
3120 	raidz_map_t *rm = zio->io_vsd;
3121 	int physical_width = zio->io_vd->vdev_children;
3122 	int original_width = (rm->rm_original_width != 0) ?
3123 	    rm->rm_original_width : physical_width;
3124 
3125 	for (int i = 0; i < rm->rm_nrows; i++) {
3126 		raidz_row_t *rr = rm->rm_row[i];
3127 		int total_errors = 0;
3128 
3129 		for (int c = 0; c < rr->rr_cols; c++) {
3130 			if (rr->rr_col[c].rc_error)
3131 				total_errors++;
3132 		}
3133 
3134 		if (total_errors > nparity)
3135 			return (vdev_raidz_worst_error(rr));
3136 	}
3137 
3138 	for (int num_failures = 1; num_failures <= nparity; num_failures++) {
3139 		int tstore[VDEV_RAIDZ_MAXPARITY + 2];
3140 		int *ltgts = &tstore[1]; /* value is logical child ID */
3141 
3142 
3143 		/*
3144 		 * Determine number of logical children, n.  See comment
3145 		 * above raidz_simulate_failure().
3146 		 */
3147 		int n = 0;
3148 		for (int w = physical_width;
3149 		    w >= original_width; w--) {
3150 			n += w;
3151 		}
3152 
3153 		ASSERT3U(num_failures, <=, nparity);
3154 		ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);
3155 
3156 		/* Handle corner cases in combrec logic */
3157 		ltgts[-1] = -1;
3158 		for (int i = 0; i < num_failures; i++) {
3159 			ltgts[i] = i;
3160 		}
3161 		ltgts[num_failures] = n;
3162 
3163 		for (;;) {
3164 			int err = raidz_reconstruct(zio, ltgts, num_failures,
3165 			    nparity);
3166 			if (err == EINVAL) {
3167 				/*
3168 				 * Reconstruction not possible with this #
3169 				 * failures; try more failures.
3170 				 */
3171 				break;
3172 			} else if (err == 0)
3173 				return (0);
3174 
3175 			/* Compute next targets to try */
3176 			for (int t = 0; ; t++) {
3177 				ASSERT3U(t, <, num_failures);
3178 				ltgts[t]++;
3179 				if (ltgts[t] == n) {
3180 					/* try more failures */
3181 					ASSERT3U(t, ==, num_failures - 1);
3182 					if (zfs_flags &
3183 					    ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
3184 						zfs_dbgmsg("reconstruction "
3185 						    "failed for num_failures="
3186 						    "%u; tried all "
3187 						    "combinations",
3188 						    num_failures);
3189 					}
3190 					break;
3191 				}
3192 
3193 				ASSERT3U(ltgts[t], <, n);
3194 				ASSERT3U(ltgts[t], <=, ltgts[t + 1]);
3195 
3196 				/*
3197 				 * If that spot is available, we're done here.
3198 				 * Try the next combination.
3199 				 */
3200 				if (ltgts[t] != ltgts[t + 1])
3201 					break; // found next combination
3202 
3203 				/*
3204 				 * Otherwise, reset this tgt to the minimum,
3205 				 * and move on to the next tgt.
3206 				 */
3207 				ltgts[t] = ltgts[t - 1] + 1;
3208 				ASSERT3U(ltgts[t], ==, t);
3209 			}
3210 
3211 			/* Increase the number of failures and keep trying. */
3212 			if (ltgts[num_failures - 1] == n)
3213 				break;
3214 		}
3215 	}
3216 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
3217 		zfs_dbgmsg("reconstruction failed for all num_failures");
3218 	return (ECKSUM);
3219 }
3220 
3221 void
3222 vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
3223 {
3224 	for (uint64_t row = 0; row < rm->rm_nrows; row++) {
3225 		raidz_row_t *rr = rm->rm_row[row];
3226 		vdev_raidz_reconstruct_row(rm, rr, t, nt);
3227 	}
3228 }
3229 
3230 /*
3231  * Complete a write IO operation on a RAIDZ VDev
3232  *
3233  * Outline:
3234  *   1. Check for errors on the child IOs.
3235  *   2. Return, setting an error code if too few child VDevs were written
3236  *      to reconstruct the data later.  Note that partial writes are
3237  *      considered successful if they can be reconstructed at all.
3238  */
3239 static void
3240 vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
3241 {
3242 	int normal_errors = 0;
3243 	int shadow_errors = 0;
3244 
3245 	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
3246 	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
3247 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
3248 
3249 	for (int c = 0; c < rr->rr_cols; c++) {
3250 		raidz_col_t *rc = &rr->rr_col[c];
3251 
3252 		if (rc->rc_error != 0) {
3253 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
3254 			normal_errors++;
3255 		}
3256 		if (rc->rc_shadow_error != 0) {
3257 			ASSERT(rc->rc_shadow_error != ECKSUM);
3258 			shadow_errors++;
3259 		}
3260 	}
3261 
3262 	/*
3263 	 * Treat partial writes as a success. If we couldn't write enough
3264 	 * columns to reconstruct the data, the I/O failed.  Otherwise, good
3265 	 * enough.  Note that in the case of a shadow write (during raidz
3266 	 * expansion), depending on if we crash, either the normal (old) or
3267 	 * shadow (new) location may become the "real" version of the block,
3268 	 * so both locations must have sufficient redundancy.
3269 	 *
3270 	 * Now that we support write reallocation, it would be better
3271 	 * to treat partial failure as real failure unless there are
3272 	 * no non-degraded top-level vdevs left, and not update DTLs
3273 	 * if we intend to reallocate.
3274 	 */
3275 	if (normal_errors > rr->rr_firstdatacol ||
3276 	    shadow_errors > rr->rr_firstdatacol) {
3277 		zio->io_error = zio_worst_error(zio->io_error,
3278 		    vdev_raidz_worst_error(rr));
3279 	}
3280 }
3281 
3282 static void
3283 vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
3284     raidz_row_t *rr)
3285 {
3286 	int parity_errors = 0;
3287 	int parity_untried = 0;
3288 	int data_errors = 0;
3289 	int total_errors = 0;
3290 
3291 	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
3292 	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
3293 
3294 	for (int c = 0; c < rr->rr_cols; c++) {
3295 		raidz_col_t *rc = &rr->rr_col[c];
3296 
3297 		/*
3298 		 * If scrubbing and a replacing/sparing child vdev determined
3299 		 * that not all of its children have an identical copy of the
3300 		 * data, then clear the error so the column is treated like
3301 		 * any other read and force a repair to correct the damage.
3302 		 */
3303 		if (rc->rc_error == ECKSUM) {
3304 			ASSERT(zio->io_flags & ZIO_FLAG_SCRUB);
3305 			vdev_raidz_checksum_error(zio, rc, rc->rc_abd);
3306 			rc->rc_force_repair = 1;
3307 			rc->rc_error = 0;
3308 		}
3309 
3310 		if (rc->rc_error) {
3311 			if (c < rr->rr_firstdatacol)
3312 				parity_errors++;
3313 			else
3314 				data_errors++;
3315 
3316 			total_errors++;
3317 		} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
3318 			parity_untried++;
3319 		}
3320 	}
3321 
3322 	/*
3323 	 * If there were data errors and the number of errors we saw was
3324 	 * correctable -- less than or equal to the number of parity disks read
3325 	 * -- reconstruct based on the missing data.
3326 	 */
3327 	if (data_errors != 0 &&
3328 	    total_errors <= rr->rr_firstdatacol - parity_untried) {
3329 		/*
3330 		 * We either attempt to read all the parity columns or
3331 		 * none of them. If we didn't try to read parity, we
3332 		 * wouldn't be here in the correctable case. There must
3333 		 * also have been fewer parity errors than parity
3334 		 * columns or, again, we wouldn't be in this code path.
3335 		 */
3336 		ASSERT(parity_untried == 0);
3337 		ASSERT(parity_errors < rr->rr_firstdatacol);
3338 
3339 		/*
3340 		 * Identify the data columns that reported an error.
3341 		 */
3342 		int n = 0;
3343 		int tgts[VDEV_RAIDZ_MAXPARITY];
3344 		for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
3345 			raidz_col_t *rc = &rr->rr_col[c];
3346 			if (rc->rc_error != 0) {
3347 				ASSERT(n < VDEV_RAIDZ_MAXPARITY);
3348 				tgts[n++] = c;
3349 			}
3350 		}
3351 
3352 		ASSERT(rr->rr_firstdatacol >= n);
3353 
3354 		vdev_raidz_reconstruct_row(rm, rr, tgts, n);
3355 	}
3356 }
3357 
3358 /*
3359  * Return the number of reads issued.
3360  */
3361 static int
3362 vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
3363 {
3364 	vdev_t *vd = zio->io_vd;
3365 	int nread = 0;
3366 
3367 	rr->rr_missingdata = 0;
3368 	rr->rr_missingparity = 0;
3369 
3370 	/*
3371 	 * If this rows contains empty sectors which are not required
3372 	 * for a normal read then allocate an ABD for them now so they
3373 	 * may be read, verified, and any needed repairs performed.
3374 	 */
3375 	if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL)
3376 		vdev_draid_map_alloc_empty(zio, rr);
3377 
3378 	for (int c = 0; c < rr->rr_cols; c++) {
3379 		raidz_col_t *rc = &rr->rr_col[c];
3380 		if (rc->rc_tried || rc->rc_size == 0)
3381 			continue;
3382 
3383 		zio_nowait(zio_vdev_child_io(zio, NULL,
3384 		    vd->vdev_child[rc->rc_devidx],
3385 		    rc->rc_offset, rc->rc_abd, rc->rc_size,
3386 		    zio->io_type, zio->io_priority, 0,
3387 		    vdev_raidz_child_done, rc));
3388 		nread++;
3389 	}
3390 	return (nread);
3391 }
3392 
3393 /*
3394  * We're here because either there were too many errors to even attempt
3395  * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec()
3396  * failed. In either case, there is enough bad data to prevent reconstruction.
3397  * Start checksum ereports for all children which haven't failed.
3398  */
3399 static void
3400 vdev_raidz_io_done_unrecoverable(zio_t *zio)
3401 {
3402 	raidz_map_t *rm = zio->io_vsd;
3403 
3404 	for (int i = 0; i < rm->rm_nrows; i++) {
3405 		raidz_row_t *rr = rm->rm_row[i];
3406 
3407 		for (int c = 0; c < rr->rr_cols; c++) {
3408 			raidz_col_t *rc = &rr->rr_col[c];
3409 			vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
3410 
3411 			if (rc->rc_error != 0)
3412 				continue;
3413 
3414 			zio_bad_cksum_t zbc;
3415 			zbc.zbc_has_cksum = 0;
3416 			zbc.zbc_injected = rm->rm_ecksuminjected;
3417 			mutex_enter(&cvd->vdev_stat_lock);
3418 			cvd->vdev_stat.vs_checksum_errors++;
3419 			mutex_exit(&cvd->vdev_stat_lock);
3420 			(void) zfs_ereport_start_checksum(zio->io_spa,
3421 			    cvd, &zio->io_bookmark, zio, rc->rc_offset,
3422 			    rc->rc_size, &zbc);
3423 		}
3424 	}
3425 }
3426 
3427 void
3428 vdev_raidz_io_done(zio_t *zio)
3429 {
3430 	raidz_map_t *rm = zio->io_vsd;
3431 
3432 	ASSERT(zio->io_bp != NULL);
3433 	if (zio->io_type == ZIO_TYPE_WRITE) {
3434 		for (int i = 0; i < rm->rm_nrows; i++) {
3435 			vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
3436 		}
3437 	} else {
3438 		if (rm->rm_phys_col) {
3439 			/*
3440 			 * This is an aggregated read.  Copy the data and status
3441 			 * from the aggregate abd's to the individual rows.
3442 			 */
3443 			for (int i = 0; i < rm->rm_nrows; i++) {
3444 				raidz_row_t *rr = rm->rm_row[i];
3445 
3446 				for (int c = 0; c < rr->rr_cols; c++) {
3447 					raidz_col_t *rc = &rr->rr_col[c];
3448 					if (rc->rc_tried || rc->rc_size == 0)
3449 						continue;
3450 
3451 					raidz_col_t *prc =
3452 					    &rm->rm_phys_col[rc->rc_devidx];
3453 					rc->rc_error = prc->rc_error;
3454 					rc->rc_tried = prc->rc_tried;
3455 					rc->rc_skipped = prc->rc_skipped;
3456 					if (c >= rr->rr_firstdatacol) {
3457 						/*
3458 						 * Note: this is slightly faster
3459 						 * than using abd_copy_off().
3460 						 */
3461 						char *physbuf = abd_to_buf(
3462 						    prc->rc_abd);
3463 						void *physloc = physbuf +
3464 						    rc->rc_offset -
3465 						    prc->rc_offset;
3466 
3467 						abd_copy_from_buf(rc->rc_abd,
3468 						    physloc, rc->rc_size);
3469 					}
3470 				}
3471 			}
3472 		}
3473 
3474 		for (int i = 0; i < rm->rm_nrows; i++) {
3475 			raidz_row_t *rr = rm->rm_row[i];
3476 			vdev_raidz_io_done_reconstruct_known_missing(zio,
3477 			    rm, rr);
3478 		}
3479 
3480 		if (raidz_checksum_verify(zio) == 0) {
3481 			if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
3482 				goto done;
3483 
3484 			for (int i = 0; i < rm->rm_nrows; i++) {
3485 				raidz_row_t *rr = rm->rm_row[i];
3486 				vdev_raidz_io_done_verified(zio, rr);
3487 			}
3488 			zio_checksum_verified(zio);
3489 		} else {
3490 			/*
3491 			 * A sequential resilver has no checksum which makes
3492 			 * combinatoral reconstruction impossible. This code
3493 			 * path is unreachable since raidz_checksum_verify()
3494 			 * has no checksum to verify and must succeed.
3495 			 */
3496 			ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD);
3497 
3498 			/*
3499 			 * This isn't a typical situation -- either we got a
3500 			 * read error or a child silently returned bad data.
3501 			 * Read every block so we can try again with as much
3502 			 * data and parity as we can track down. If we've
3503 			 * already been through once before, all children will
3504 			 * be marked as tried so we'll proceed to combinatorial
3505 			 * reconstruction.
3506 			 */
3507 			int nread = 0;
3508 			for (int i = 0; i < rm->rm_nrows; i++) {
3509 				nread += vdev_raidz_read_all(zio,
3510 				    rm->rm_row[i]);
3511 			}
3512 			if (nread != 0) {
3513 				/*
3514 				 * Normally our stage is VDEV_IO_DONE, but if
3515 				 * we've already called redone(), it will have
3516 				 * changed to VDEV_IO_START, in which case we
3517 				 * don't want to call redone() again.
3518 				 */
3519 				if (zio->io_stage != ZIO_STAGE_VDEV_IO_START)
3520 					zio_vdev_io_redone(zio);
3521 				return;
3522 			}
3523 			/*
3524 			 * It would be too expensive to try every possible
3525 			 * combination of failed sectors in every row, so
3526 			 * instead we try every combination of failed current or
3527 			 * past physical disk. This means that if the incorrect
3528 			 * sectors were all on Nparity disks at any point in the
3529 			 * past, we will find the correct data.  The only known
3530 			 * case where this is less durable than a non-expanded
3531 			 * RAIDZ, is if we have a silent failure during
3532 			 * expansion.  In that case, one block could be
3533 			 * partially in the old format and partially in the
3534 			 * new format, so we'd lost some sectors from the old
3535 			 * format and some from the new format.
3536 			 *
3537 			 * e.g. logical_width=4 physical_width=6
3538 			 * the 15 (6+5+4) possible failed disks are:
3539 			 * width=6 child=0
3540 			 * width=6 child=1
3541 			 * width=6 child=2
3542 			 * width=6 child=3
3543 			 * width=6 child=4
3544 			 * width=6 child=5
3545 			 * width=5 child=0
3546 			 * width=5 child=1
3547 			 * width=5 child=2
3548 			 * width=5 child=3
3549 			 * width=5 child=4
3550 			 * width=4 child=0
3551 			 * width=4 child=1
3552 			 * width=4 child=2
3553 			 * width=4 child=3
3554 			 * And we will try every combination of Nparity of these
3555 			 * failing.
3556 			 *
3557 			 * As a first pass, we can generate every combo,
3558 			 * and try reconstructing, ignoring any known
3559 			 * failures.  If any row has too many known + simulated
3560 			 * failures, then we bail on reconstructing with this
3561 			 * number of simulated failures.  As an improvement,
3562 			 * we could detect the number of whole known failures
3563 			 * (i.e. we have known failures on these disks for
3564 			 * every row; the disks never succeeded), and
3565 			 * subtract that from the max # failures to simulate.
3566 			 * We could go even further like the current
3567 			 * combrec code, but that doesn't seem like it
3568 			 * gains us very much.  If we simulate a failure
3569 			 * that is also a known failure, that's fine.
3570 			 */
3571 			zio->io_error = vdev_raidz_combrec(zio);
3572 			if (zio->io_error == ECKSUM &&
3573 			    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
3574 				vdev_raidz_io_done_unrecoverable(zio);
3575 			}
3576 		}
3577 	}
3578 done:
3579 	if (rm->rm_lr != NULL) {
3580 		zfs_rangelock_exit(rm->rm_lr);
3581 		rm->rm_lr = NULL;
3582 	}
3583 }
3584 
3585 static void
3586 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
3587 {
3588 	vdev_raidz_t *vdrz = vd->vdev_tsd;
3589 	if (faulted > vdrz->vd_nparity)
3590 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3591 		    VDEV_AUX_NO_REPLICAS);
3592 	else if (degraded + faulted != 0)
3593 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
3594 	else
3595 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
3596 }
3597 
3598 /*
3599  * Determine if any portion of the provided block resides on a child vdev
3600  * with a dirty DTL and therefore needs to be resilvered.  The function
3601  * assumes that at least one DTL is dirty which implies that full stripe
3602  * width blocks must be resilvered.
3603  */
3604 static boolean_t
3605 vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
3606     uint64_t phys_birth)
3607 {
3608 	vdev_raidz_t *vdrz = vd->vdev_tsd;
3609 
3610 	/*
3611 	 * If we're in the middle of a RAIDZ expansion, this block may be in
3612 	 * the old and/or new location.  For simplicity, always resilver it.
3613 	 */
3614 	if (vdrz->vn_vre.vre_state == DSS_SCANNING)
3615 		return (B_TRUE);
3616 
3617 	uint64_t dcols = vd->vdev_children;
3618 	uint64_t nparity = vdrz->vd_nparity;
3619 	uint64_t ashift = vd->vdev_top->vdev_ashift;
3620 	/* The starting RAIDZ (parent) vdev sector of the block. */
3621 	uint64_t b = DVA_GET_OFFSET(dva) >> ashift;
3622 	/* The zio's size in units of the vdev's minimum sector size. */
3623 	uint64_t s = ((psize - 1) >> ashift) + 1;
3624 	/* The first column for this stripe. */
3625 	uint64_t f = b % dcols;
3626 
3627 	/* Unreachable by sequential resilver. */
3628 	ASSERT3U(phys_birth, !=, TXG_UNKNOWN);
3629 
3630 	if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
3631 		return (B_FALSE);
3632 
3633 	if (s + nparity >= dcols)
3634 		return (B_TRUE);
3635 
3636 	for (uint64_t c = 0; c < s + nparity; c++) {
3637 		uint64_t devidx = (f + c) % dcols;
3638 		vdev_t *cvd = vd->vdev_child[devidx];
3639 
3640 		/*
3641 		 * dsl_scan_need_resilver() already checked vd with
3642 		 * vdev_dtl_contains(). So here just check cvd with
3643 		 * vdev_dtl_empty(), cheaper and a good approximation.
3644 		 */
3645 		if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
3646 			return (B_TRUE);
3647 	}
3648 
3649 	return (B_FALSE);
3650 }
3651 
3652 static void
3653 vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs,
3654     range_seg64_t *physical_rs, range_seg64_t *remain_rs)
3655 {
3656 	(void) remain_rs;
3657 
3658 	vdev_t *raidvd = cvd->vdev_parent;
3659 	ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
3660 
3661 	vdev_raidz_t *vdrz = raidvd->vdev_tsd;
3662 
3663 	if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
3664 		/*
3665 		 * We're in the middle of expansion, in which case the
3666 		 * translation is in flux.  Any answer we give may be wrong
3667 		 * by the time we return, so it isn't safe for the caller to
3668 		 * act on it.  Therefore we say that this range isn't present
3669 		 * on any children.  The only consumers of this are "zpool
3670 		 * initialize" and trimming, both of which are "best effort"
3671 		 * anyway.
3672 		 */
3673 		physical_rs->rs_start = physical_rs->rs_end = 0;
3674 		remain_rs->rs_start = remain_rs->rs_end = 0;
3675 		return;
3676 	}
3677 
3678 	uint64_t width = vdrz->vd_physical_width;
3679 	uint64_t tgt_col = cvd->vdev_id;
3680 	uint64_t ashift = raidvd->vdev_top->vdev_ashift;
3681 
3682 	/* make sure the offsets are block-aligned */
3683 	ASSERT0(logical_rs->rs_start % (1 << ashift));
3684 	ASSERT0(logical_rs->rs_end % (1 << ashift));
3685 	uint64_t b_start = logical_rs->rs_start >> ashift;
3686 	uint64_t b_end = logical_rs->rs_end >> ashift;
3687 
3688 	uint64_t start_row = 0;
3689 	if (b_start > tgt_col) /* avoid underflow */
3690 		start_row = ((b_start - tgt_col - 1) / width) + 1;
3691 
3692 	uint64_t end_row = 0;
3693 	if (b_end > tgt_col)
3694 		end_row = ((b_end - tgt_col - 1) / width) + 1;
3695 
3696 	physical_rs->rs_start = start_row << ashift;
3697 	physical_rs->rs_end = end_row << ashift;
3698 
3699 	ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start);
3700 	ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
3701 	    logical_rs->rs_end - logical_rs->rs_start);
3702 }
3703 
/*
 * Sync task that commits the reflow progress recorded for this txg.
 *
 * arg is the spa_t.  The offset committed is the smaller of the progress
 * recorded for this txg (vre_offset_pertxg[]) and vre_failed_offset, so
 * nothing at or beyond a failed copy is committed.  The in-core uberblock
 * (spa_uberblock) is updated while the range [old_offset, new_offset) is
 * held as writer, the bytes copied in this txg are folded into the running
 * total, and that total is stored in the top-level vdev's ZAP.
 */
static void
raidz_reflow_sync(void *arg, dmu_tx_t *tx)
{
	spa_t *spa = arg;
	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;

	/*
	 * Ensure there are no i/os to the range that is being committed.
	 */
	uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock);
	ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset);

	/* vre_failed_offset is updated by the i/o callbacks under vre_lock. */
	mutex_enter(&vre->vre_lock);
	uint64_t new_offset =
	    MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset);
	/*
	 * We should not have committed anything that failed.
	 */
	VERIFY3U(vre->vre_failed_offset, >=, old_offset);
	mutex_exit(&vre->vre_lock);

	zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
	    old_offset, new_offset - old_offset,
	    RL_WRITER);

	/*
	 * Update the uberblock that will be written when this txg completes.
	 */
	RAIDZ_REFLOW_SET(&spa->spa_uberblock,
	    RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset);
	/* Zero means "no progress recorded for this txg slot". */
	vre->vre_offset_pertxg[txgoff] = 0;
	zfs_rangelock_exit(lr);

	/* Fold this txg's copied bytes into the running total. */
	mutex_enter(&vre->vre_lock);
	vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff];
	vre->vre_bytes_copied_pertxg[txgoff] = 0;
	mutex_exit(&vre->vre_lock);

	/* Persist the running total in the top-level vdev's ZAP. */
	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
	VERIFY0(zap_update(spa->spa_meta_objset,
	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
	    sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx));
}
3748 
/*
 * Sync task that finalizes a completed reflow.  arg is the spa_t.
 *
 * Adds a reflow_node_t to vd_expand_txgs (re_txg is this txg plus
 * TXG_CONCURRENT_STATES, re_logical_width is the current physical width),
 * marks the expansion DSS_FINISHED in core and in the top-level vdev's ZAP,
 * clears the reflow info in the in-core uberblock, detaches the expansion
 * state from the spa, requests a restart of initialize/trim/autotrim, and
 * optionally sets up a scrub.
 */
static void
raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx)
{
	spa_t *spa = arg;
	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
	vdev_raidz_t *vdrz = raidvd->vdev_tsd;

	/* All per-txg progress must already have been committed. */
	for (int i = 0; i < TXG_SIZE; i++)
		VERIFY0(vre->vre_offset_pertxg[i]);

	reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
	re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES;
	re->re_logical_width = vdrz->vd_physical_width;
	mutex_enter(&vdrz->vd_expand_lock);
	avl_add(&vdrz->vd_expand_txgs, re);
	mutex_exit(&vdrz->vd_expand_lock);

	/* Same lookup (same spa and vdev id) as raidvd above. */
	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);

	/*
	 * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS
	 * will get written (based on vd_expand_txgs).
	 */
	vdev_config_dirty(vd);

	/*
	 * Before we change vre_state, the on-disk state must reflect that we
	 * have completed all copying, so that vdev_raidz_io_start() can use
	 * vre_state to determine if the reflow is in progress.  See also the
	 * end of spa_raidz_expand_thread().
	 */
	VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==,
	    raidvd->vdev_ms_count << raidvd->vdev_ms_shift);

	vre->vre_end_time = gethrestime_sec();
	vre->vre_state = DSS_FINISHED;

	/* Copied into uint64_t locals so zap_update() gets 8-byte values. */
	uint64_t state = vre->vre_state;
	VERIFY0(zap_update(spa->spa_meta_objset,
	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
	    sizeof (state), 1, &state, tx));

	uint64_t end_time = vre->vre_end_time;
	VERIFY0(zap_update(spa->spa_meta_objset,
	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
	    sizeof (end_time), 1, &end_time, tx));

	/* No reflow in progress any more. */
	spa->spa_uberblock.ub_raidz_reflow_info = 0;

	spa_history_log_internal(spa, "raidz vdev expansion completed",  tx,
	    "%s vdev %llu new width %llu", spa_name(spa),
	    (unsigned long long)vd->vdev_id,
	    (unsigned long long)vd->vdev_children);

	spa->spa_raidz_expand = NULL;
	raidvd->vdev_rz_expanding = B_FALSE;

	spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
	spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
	spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);

	spa_notify_waiters(spa);

	/*
	 * While we're in syncing context take the opportunity to
	 * setup a scrub. All the data has been successfully copied
	 * but we have not validated any checksums.
	 */
	setup_sync_arg_t setup_sync_arg = {
		.func = POOL_SCAN_SCRUB,
		.txgstart = 0,
		.txgend = 0,
	};
	if (zfs_scrub_after_expand &&
	    dsl_scan_setup_check(&setup_sync_arg.func, tx) == 0) {
		dsl_scan_setup_sync(&setup_sync_arg, tx);
	}
}
3828 
/*
 * State of one copy batch.
 *
 * The structure ends in a flexible array: it is allocated (and must be
 * freed) with room for rra_writes trailing ZIO pointers.
 */
typedef struct raidz_reflow_arg {
	vdev_raidz_expand_t *rra_vre;	/* Global expansion state. */
	zfs_locked_range_t *rra_lr;	/* Range lock of this batch. */
	uint64_t rra_txg;	/* TXG of this batch. */
	uint_t rra_ashift;	/* Ashift of the vdev. */
	uint32_t rra_tbd;	/* Number of in-flight ZIOs. */
	uint32_t rra_writes;	/* Number of write ZIOs. */
	zio_t *rra_zio[];	/* Write ZIO pointers. */
} raidz_reflow_arg_t;
3841 
/*
 * Write of the new location on one child is done.  Once all of them are done
 * we can unlock and free everything.
 *
 * zio->io_private is the batch's raidz_reflow_arg_t.  The completion that
 * drops rra_tbd to zero releases SCL_STATE and the batch's range lock and
 * frees the batch state.
 */
static void
raidz_reflow_write_done(zio_t *zio)
{
	raidz_reflow_arg_t *rra = zio->io_private;
	vdev_raidz_expand_t *vre = rra->rra_vre;

	/* Each write ZIO has its own ABD. */
	abd_free(zio->io_abd);

	mutex_enter(&vre->vre_lock);
	if (zio->io_error != 0) {
		/* Force a reflow pause on errors */
		vre->vre_failed_offset =
		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
	}
	ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size);
	vre->vre_outstanding_bytes -= zio->io_size;
	/*
	 * Only count the bytes as copied if the whole batch lies below the
	 * lowest failed offset.
	 */
	if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length <
	    vre->vre_failed_offset) {
		vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] +=
		    zio->io_size;
	}
	cv_signal(&vre->vre_cv);
	/* Decided under vre_lock so exactly one completion sees zero. */
	boolean_t done = (--rra->rra_tbd == 0);
	mutex_exit(&vre->vre_lock);

	if (!done)
		return;
	/* Last write of the batch: the spa itself is the SCL_STATE tag. */
	spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
	zfs_rangelock_exit(rra->rra_lr);
	/* Size includes the rra_writes trailing ZIO pointers. */
	kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * rra->rra_writes);
}
3877 
3878 /*
3879  * Read of the old location on one child is done.  Once all of them are done
3880  * writes should have all the data and we can issue them.
3881  */
3882 static void
3883 raidz_reflow_read_done(zio_t *zio)
3884 {
3885 	raidz_reflow_arg_t *rra = zio->io_private;
3886 	vdev_raidz_expand_t *vre = rra->rra_vre;
3887 
3888 	/* Reads of only one block use write ABDs.  For bigger free gangs. */
3889 	if (zio->io_size > (1 << rra->rra_ashift))
3890 		abd_free(zio->io_abd);
3891 
3892 	/*
3893 	 * If the read failed, or if it was done on a vdev that is not fully
3894 	 * healthy (e.g. a child that has a resilver in progress), we may not
3895 	 * have the correct data.  Note that it's OK if the write proceeds.
3896 	 * It may write garbage but the location is otherwise unused and we
3897 	 * will retry later due to vre_failed_offset.
3898 	 */
3899 	if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) {
3900 		zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu "
3901 		    "err=%u partial_dtl_empty=%u missing_dtl_empty=%u",
3902 		    (long long)rra->rra_lr->lr_offset,
3903 		    (long long)rra->rra_lr->lr_length,
3904 		    (long long)rra->rra_txg,
3905 		    zio->io_error,
3906 		    vdev_dtl_empty(zio->io_vd, DTL_PARTIAL),
3907 		    vdev_dtl_empty(zio->io_vd, DTL_MISSING));
3908 		mutex_enter(&vre->vre_lock);
3909 		/* Force a reflow pause on errors */
3910 		vre->vre_failed_offset =
3911 		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
3912 		mutex_exit(&vre->vre_lock);
3913 	}
3914 
3915 	if (atomic_dec_32_nv(&rra->rra_tbd) > 0)
3916 		return;
3917 	rra->rra_tbd = rra->rra_writes;
3918 	for (uint64_t i = 0; i < rra->rra_writes; i++)
3919 		zio_nowait(rra->rra_zio[i]);
3920 }
3921 
3922 static void
3923 raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset,
3924     dmu_tx_t *tx)
3925 {
3926 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
3927 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
3928 
3929 	if (offset == 0)
3930 		return;
3931 
3932 	mutex_enter(&vre->vre_lock);
3933 	ASSERT3U(vre->vre_offset, <=, offset);
3934 	vre->vre_offset = offset;
3935 	mutex_exit(&vre->vre_lock);
3936 
3937 	if (vre->vre_offset_pertxg[txgoff] == 0) {
3938 		dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync,
3939 		    spa, tx);
3940 	}
3941 	vre->vre_offset_pertxg[txgoff] = offset;
3942 }
3943 
3944 static boolean_t
3945 vdev_raidz_expand_child_replacing(vdev_t *raidz_vd)
3946 {
3947 	for (int i = 0; i < raidz_vd->vdev_children; i++) {
3948 		/* Quick check if a child is being replaced */
3949 		if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf)
3950 			return (B_TRUE);
3951 	}
3952 	return (B_FALSE);
3953 }
3954 
3955 static boolean_t
3956 raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt,
3957     dmu_tx_t *tx)
3958 {
3959 	spa_t *spa = vd->vdev_spa;
3960 	uint_t ashift = vd->vdev_top->vdev_ashift;
3961 
3962 	range_seg_t *rs = range_tree_first(rt);
3963 	if (rt == NULL)
3964 		return (B_FALSE);
3965 	uint64_t offset = rs_get_start(rs, rt);
3966 	ASSERT(IS_P2ALIGNED(offset, 1 << ashift));
3967 	uint64_t size = rs_get_end(rs, rt) - offset;
3968 	ASSERT3U(size, >=, 1 << ashift);
3969 	ASSERT(IS_P2ALIGNED(size, 1 << ashift));
3970 
3971 	uint64_t blkid = offset >> ashift;
3972 	uint_t old_children = vd->vdev_children - 1;
3973 
3974 	/*
3975 	 * We can only progress to the point that writes will not overlap
3976 	 * with blocks whose progress has not yet been recorded on disk.
3977 	 * Since partially-copied rows are still read from the old location,
3978 	 * we need to stop one row before the sector-wise overlap, to prevent
3979 	 * row-wise overlap.
3980 	 *
3981 	 * Note that even if we are skipping over a large unallocated region,
3982 	 * we can't move the on-disk progress to `offset`, because concurrent
3983 	 * writes/allocations could still use the currently-unallocated
3984 	 * region.
3985 	 */
3986 	uint64_t ubsync_blkid =
3987 	    RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift;
3988 	uint64_t next_overwrite_blkid = ubsync_blkid +
3989 	    ubsync_blkid / old_children - old_children;
3990 	VERIFY3U(next_overwrite_blkid, >, ubsync_blkid);
3991 	if (blkid >= next_overwrite_blkid) {
3992 		raidz_reflow_record_progress(vre,
3993 		    next_overwrite_blkid << ashift, tx);
3994 		return (B_TRUE);
3995 	}
3996 
3997 	size = MIN(size, raidz_expand_max_copy_bytes);
3998 	size = MIN(size, (uint64_t)old_children *
3999 	    MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE));
4000 	size = MAX(size, 1 << ashift);
4001 	uint_t blocks = MIN(size >> ashift, next_overwrite_blkid - blkid);
4002 	size = (uint64_t)blocks << ashift;
4003 
4004 	range_tree_remove(rt, offset, size);
4005 
4006 	uint_t reads = MIN(blocks, old_children);
4007 	uint_t writes = MIN(blocks, vd->vdev_children);
4008 	raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra) +
4009 	    sizeof (zio_t *) * writes, KM_SLEEP);
4010 	rra->rra_vre = vre;
4011 	rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock,
4012 	    offset, size, RL_WRITER);
4013 	rra->rra_txg = dmu_tx_get_txg(tx);
4014 	rra->rra_ashift = ashift;
4015 	rra->rra_tbd = reads;
4016 	rra->rra_writes = writes;
4017 
4018 	raidz_reflow_record_progress(vre, offset + size, tx);
4019 
4020 	/*
4021 	 * SCL_STATE will be released when the read and write are done,
4022 	 * by raidz_reflow_write_done().
4023 	 */
4024 	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
4025 
4026 	/* check if a replacing vdev was added, if so treat it as an error */
4027 	if (vdev_raidz_expand_child_replacing(vd)) {
4028 		zfs_dbgmsg("replacing vdev encountered, reflow paused at "
4029 		    "offset=%llu txg=%llu",
4030 		    (long long)rra->rra_lr->lr_offset,
4031 		    (long long)rra->rra_txg);
4032 
4033 		mutex_enter(&vre->vre_lock);
4034 		vre->vre_failed_offset =
4035 		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
4036 		cv_signal(&vre->vre_cv);
4037 		mutex_exit(&vre->vre_lock);
4038 
4039 		/* drop everything we acquired */
4040 		spa_config_exit(spa, SCL_STATE, spa);
4041 		zfs_rangelock_exit(rra->rra_lr);
4042 		kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * writes);
4043 		return (B_TRUE);
4044 	}
4045 
4046 	mutex_enter(&vre->vre_lock);
4047 	vre->vre_outstanding_bytes += size;
4048 	mutex_exit(&vre->vre_lock);
4049 
4050 	/* Allocate ABD and ZIO for each child we write. */
4051 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4052 	zio_t *pio = spa->spa_txg_zio[txgoff];
4053 	uint_t b = blocks / vd->vdev_children;
4054 	uint_t bb = blocks % vd->vdev_children;
4055 	for (uint_t i = 0; i < writes; i++) {
4056 		uint_t n = b + (i < bb);
4057 		abd_t *abd = abd_alloc_for_io(n << ashift, B_FALSE);
4058 		rra->rra_zio[i] = zio_vdev_child_io(pio, NULL,
4059 		    vd->vdev_child[(blkid + i) % vd->vdev_children],
4060 		    ((blkid + i) / vd->vdev_children) << ashift,
4061 		    abd, n << ashift, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
4062 		    ZIO_FLAG_CANFAIL, raidz_reflow_write_done, rra);
4063 	}
4064 
4065 	/*
4066 	 * Allocate and issue ZIO for each child we read.  For reads of only
4067 	 * one block we can use respective writer ABDs, since they will also
4068 	 * have only one block.  For bigger reads create gang ABDs and fill
4069 	 * them with respective blocks from writer ABDs.
4070 	 */
4071 	b = blocks / old_children;
4072 	bb = blocks % old_children;
4073 	for (uint_t i = 0; i < reads; i++) {
4074 		uint_t n = b + (i < bb);
4075 		abd_t *abd;
4076 		if (n > 1) {
4077 			abd = abd_alloc_gang();
4078 			for (uint_t j = 0; j < n; j++) {
4079 				uint_t b = j * old_children + i;
4080 				abd_t *cabd = abd_get_offset_size(
4081 				    rra->rra_zio[b % vd->vdev_children]->io_abd,
4082 				    (b / vd->vdev_children) << ashift,
4083 				    1 << ashift);
4084 				abd_gang_add(abd, cabd, B_TRUE);
4085 			}
4086 		} else {
4087 			abd = rra->rra_zio[i]->io_abd;
4088 		}
4089 		zio_nowait(zio_vdev_child_io(pio, NULL,
4090 		    vd->vdev_child[(blkid + i) % old_children],
4091 		    ((blkid + i) / old_children) << ashift, abd,
4092 		    n << ashift, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
4093 		    ZIO_FLAG_CANFAIL, raidz_reflow_read_done, rra));
4094 	}
4095 
4096 	return (B_FALSE);
4097 }
4098 
4099 /*
4100  * For testing (ztest specific)
4101  */
4102 static void
4103 raidz_expand_pause(uint_t pause_point)
4104 {
4105 	while (raidz_expand_pause_point != 0 &&
4106 	    raidz_expand_pause_point <= pause_point)
4107 		delay(hz);
4108 }
4109 
4110 static void
4111 raidz_scratch_child_done(zio_t *zio)
4112 {
4113 	zio_t *pio = zio->io_private;
4114 
4115 	mutex_enter(&pio->io_lock);
4116 	pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
4117 	mutex_exit(&pio->io_lock);
4118 }
4119 
4120 /*
4121  * Reflow the beginning portion of the vdev into an intermediate scratch area
4122  * in memory and on disk. This operation must be persisted on disk before we
4123  * proceed to overwrite the beginning portion with the reflowed data.
4124  *
4125  * This multi-step task can fail to complete if disk errors are encountered
4126  * and we can return here after a pause (waiting for disk to become healthy).
4127  */
4128 static void
4129 raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
4130 {
4131 	vdev_raidz_expand_t *vre = arg;
4132 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
4133 	zio_t *pio;
4134 	int error;
4135 
4136 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4137 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4138 	int ashift = raidvd->vdev_ashift;
4139 	uint64_t write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << ashift,
4140 	    uint64_t);
4141 	uint64_t logical_size = write_size * raidvd->vdev_children;
4142 	uint64_t read_size =
4143 	    P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)),
4144 	    1 << ashift);
4145 
4146 	/*
4147 	 * The scratch space must be large enough to get us to the point
4148 	 * that one row does not overlap itself when moved.  This is checked
4149 	 * by vdev_raidz_attach_check().
4150 	 */
4151 	VERIFY3U(write_size, >=, raidvd->vdev_children << ashift);
4152 	VERIFY3U(write_size, <=, VDEV_BOOT_SIZE);
4153 	VERIFY3U(write_size, <=, read_size);
4154 
4155 	zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
4156 	    0, logical_size, RL_WRITER);
4157 
4158 	abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
4159 	    KM_SLEEP);
4160 	for (int i = 0; i < raidvd->vdev_children; i++) {
4161 		abds[i] = abd_alloc_linear(read_size, B_FALSE);
4162 	}
4163 
4164 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1);
4165 
4166 	/*
4167 	 * If we have already written the scratch area then we must read from
4168 	 * there, since new writes were redirected there while we were paused
4169 	 * or the original location may have been partially overwritten with
4170 	 * reflowed data.
4171 	 */
4172 	if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) {
4173 		VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size);
4174 		/*
4175 		 * Read from scratch space.
4176 		 */
4177 		pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4178 		for (int i = 0; i < raidvd->vdev_children; i++) {
4179 			/*
4180 			 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE
4181 			 * to the offset to calculate the physical offset to
4182 			 * write to.  Passing in a negative offset makes us
4183 			 * access the scratch area.
4184 			 */
4185 			zio_nowait(zio_vdev_child_io(pio, NULL,
4186 			    raidvd->vdev_child[i],
4187 			    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4188 			    write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
4189 			    ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
4190 		}
4191 		error = zio_wait(pio);
4192 		if (error != 0) {
4193 			zfs_dbgmsg("reflow: error %d reading scratch location",
4194 			    error);
4195 			goto io_error_exit;
4196 		}
4197 		goto overwrite;
4198 	}
4199 
4200 	/*
4201 	 * Read from original location.
4202 	 */
4203 	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4204 	for (int i = 0; i < raidvd->vdev_children - 1; i++) {
4205 		ASSERT0(vdev_is_dead(raidvd->vdev_child[i]));
4206 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4207 		    0, abds[i], read_size, ZIO_TYPE_READ,
4208 		    ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
4209 		    raidz_scratch_child_done, pio));
4210 	}
4211 	error = zio_wait(pio);
4212 	if (error != 0) {
4213 		zfs_dbgmsg("reflow: error %d reading original location", error);
4214 io_error_exit:
4215 		for (int i = 0; i < raidvd->vdev_children; i++)
4216 			abd_free(abds[i]);
4217 		kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4218 		zfs_rangelock_exit(lr);
4219 		spa_config_exit(spa, SCL_STATE, FTAG);
4220 		return;
4221 	}
4222 
4223 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2);
4224 
4225 	/*
4226 	 * Reflow in memory.
4227 	 */
4228 	uint64_t logical_sectors = logical_size >> ashift;
4229 	for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) {
4230 		int oldchild = i % (raidvd->vdev_children - 1);
4231 		uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift;
4232 
4233 		int newchild = i % raidvd->vdev_children;
4234 		uint64_t newoff = (i / raidvd->vdev_children) << ashift;
4235 
4236 		/* a single sector should not be copying over itself */
4237 		ASSERT(!(newchild == oldchild && newoff == oldoff));
4238 
4239 		abd_copy_off(abds[newchild], abds[oldchild],
4240 		    newoff, oldoff, 1 << ashift);
4241 	}
4242 
4243 	/*
4244 	 * Verify that we filled in everything we intended to (write_size on
4245 	 * each child).
4246 	 */
4247 	VERIFY0(logical_sectors % raidvd->vdev_children);
4248 	VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==,
4249 	    write_size);
4250 
4251 	/*
4252 	 * Write to scratch location (boot area).
4253 	 */
4254 	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4255 	for (int i = 0; i < raidvd->vdev_children; i++) {
4256 		/*
4257 		 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
4258 		 * the offset to calculate the physical offset to write to.
4259 		 * Passing in a negative offset lets us access the boot area.
4260 		 */
4261 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4262 		    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4263 		    write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
4264 		    ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
4265 	}
4266 	error = zio_wait(pio);
4267 	if (error != 0) {
4268 		zfs_dbgmsg("reflow: error %d writing scratch location", error);
4269 		goto io_error_exit;
4270 	}
4271 	pio = zio_root(spa, NULL, NULL, 0);
4272 	zio_flush(pio, raidvd);
4273 	zio_wait(pio);
4274 
4275 	zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area",
4276 	    (long long)logical_size);
4277 
4278 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3);
4279 
4280 	/*
4281 	 * Update uberblock to indicate that scratch space is valid.  This is
4282 	 * needed because after this point, the real location may be
4283 	 * overwritten.  If we crash, we need to get the data from the
4284 	 * scratch space, rather than the real location.
4285 	 *
4286 	 * Note: ub_timestamp is bumped so that vdev_uberblock_compare()
4287 	 * will prefer this uberblock.
4288 	 */
4289 	RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size);
4290 	spa->spa_ubsync.ub_timestamp++;
4291 	ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4292 	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4293 	if (spa_multihost(spa))
4294 		mmp_update_uberblock(spa, &spa->spa_ubsync);
4295 
4296 	zfs_dbgmsg("reflow: uberblock updated "
4297 	    "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)",
4298 	    (long long)spa->spa_ubsync.ub_txg,
4299 	    (long long)logical_size,
4300 	    (long long)spa->spa_ubsync.ub_timestamp);
4301 
4302 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID);
4303 
4304 	/*
4305 	 * Overwrite with reflow'ed data.
4306 	 */
4307 overwrite:
4308 	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4309 	for (int i = 0; i < raidvd->vdev_children; i++) {
4310 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4311 		    0, abds[i], write_size, ZIO_TYPE_WRITE,
4312 		    ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
4313 		    raidz_scratch_child_done, pio));
4314 	}
4315 	error = zio_wait(pio);
4316 	if (error != 0) {
4317 		/*
4318 		 * When we exit early here and drop the range lock, new
4319 		 * writes will go into the scratch area so we'll need to
4320 		 * read from there when we return after pausing.
4321 		 */
4322 		zfs_dbgmsg("reflow: error %d writing real location", error);
4323 		/*
4324 		 * Update the uberblock that is written when this txg completes.
4325 		 */
4326 		RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID,
4327 		    logical_size);
4328 		goto io_error_exit;
4329 	}
4330 	pio = zio_root(spa, NULL, NULL, 0);
4331 	zio_flush(pio, raidvd);
4332 	zio_wait(pio);
4333 
4334 	zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location",
4335 	    (long long)logical_size);
4336 	for (int i = 0; i < raidvd->vdev_children; i++)
4337 		abd_free(abds[i]);
4338 	kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4339 
4340 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED);
4341 
4342 	/*
4343 	 * Update uberblock to indicate that the initial part has been
4344 	 * reflow'ed.  This is needed because after this point (when we exit
4345 	 * the rangelock), we allow regular writes to this region, which will
4346 	 * be written to the new location only (because reflow_offset_next ==
4347 	 * reflow_offset_synced).  If we crashed and re-copied from the
4348 	 * scratch space, we would lose the regular writes.
4349 	 */
4350 	RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED,
4351 	    logical_size);
4352 	spa->spa_ubsync.ub_timestamp++;
4353 	ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4354 	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4355 	if (spa_multihost(spa))
4356 		mmp_update_uberblock(spa, &spa->spa_ubsync);
4357 
4358 	zfs_dbgmsg("reflow: uberblock updated "
4359 	    "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
4360 	    (long long)spa->spa_ubsync.ub_txg,
4361 	    (long long)logical_size,
4362 	    (long long)spa->spa_ubsync.ub_timestamp);
4363 
4364 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1);
4365 
4366 	/*
4367 	 * Update progress.
4368 	 */
4369 	vre->vre_offset = logical_size;
4370 	zfs_rangelock_exit(lr);
4371 	spa_config_exit(spa, SCL_STATE, FTAG);
4372 
4373 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4374 	vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
4375 	vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
4376 	/*
4377 	 * Note - raidz_reflow_sync() will update the uberblock state to
4378 	 * RRSS_SCRATCH_INVALID_SYNCED_REFLOW
4379 	 */
4380 	raidz_reflow_sync(spa, tx);
4381 
4382 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2);
4383 }
4384 
/*
 * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work
 * here.  No other i/o can be in progress, so we don't need the vre_rangelock.
 *
 * Reads the reflowed data back from the scratch (boot) area of every child,
 * writes it to the real location, flushes, marks the scratch area as no
 * longer valid in the uberblock, and records the progress through
 * raidz_reflow_sync().
 */
void
vdev_raidz_reflow_copy_scratch(spa_t *spa)
{
	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
	/* The uberblock's reflow offset is the logical size held in scratch. */
	uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock);
	ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID);

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
	ASSERT0(logical_size % raidvd->vdev_children);
	/* Per-child share of the scratch data. */
	uint64_t write_size = logical_size / raidvd->vdev_children;

	zio_t *pio;

	/*
	 * Read from scratch space.
	 */
	abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
	    KM_SLEEP);
	for (int i = 0; i < raidvd->vdev_children; i++) {
		abds[i] = abd_alloc_linear(write_size, B_FALSE);
	}

	pio = zio_root(spa, NULL, NULL, 0);
	for (int i = 0; i < raidvd->vdev_children; i++) {
		/*
		 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
		 * the offset to calculate the physical offset to write to.
		 * Passing in a negative offset lets us access the boot area.
		 */
		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
		    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
		    write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 0,
		    raidz_scratch_child_done, pio));
	}
	/*
	 * NOTE(review): the result of this zio_wait(), and of the ones
	 * below, is not checked.  These zios are created without
	 * ZIO_FLAG_CANFAIL; confirm failures are handled by the zio layer.
	 */
	zio_wait(pio);

	/*
	 * Overwrite real location with reflow'ed data.
	 */
	pio = zio_root(spa, NULL, NULL, 0);
	for (int i = 0; i < raidvd->vdev_children; i++) {
		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
		    0, abds[i], write_size, ZIO_TYPE_WRITE,
		    ZIO_PRIORITY_REMOVAL, 0,
		    raidz_scratch_child_done, pio));
	}
	zio_wait(pio);
	/* Flush before the uberblock below declares scratch invalid. */
	pio = zio_root(spa, NULL, NULL, 0);
	zio_flush(pio, raidvd);
	zio_wait(pio);

	zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) "
	    "to real location", (long long)logical_size);

	for (int i = 0; i < raidvd->vdev_children; i++)
		abd_free(abds[i]);
	kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));

	/*
	 * Update uberblock.
	 */
	RAIDZ_REFLOW_SET(&spa->spa_ubsync,
	    RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size);
	spa->spa_ubsync.ub_timestamp++;
	VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
	if (spa_multihost(spa))
		mmp_update_uberblock(spa, &spa->spa_ubsync);

	zfs_dbgmsg("reflow recovery: uberblock updated "
	    "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
	    (long long)spa->spa_ubsync.ub_txg,
	    (long long)logical_size,
	    (long long)spa->spa_ubsync.ub_timestamp);

	/* Record the progress in the pool's first txg. */
	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool,
	    spa_first_txg(spa));
	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
	vre->vre_offset = logical_size;
	vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
	vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
	/*
	 * Note that raidz_reflow_sync() will update the uberblock once more
	 */
	raidz_reflow_sync(spa, tx);

	dmu_tx_commit(tx);

	spa_config_exit(spa, SCL_STATE, FTAG);
}
4480 
4481 static boolean_t
4482 spa_raidz_expand_thread_check(void *arg, zthr_t *zthr)
4483 {
4484 	(void) zthr;
4485 	spa_t *spa = arg;
4486 
4487 	return (spa->spa_raidz_expand != NULL &&
4488 	    !spa->spa_raidz_expand->vre_waiting_for_resilver);
4489 }
4490 
/*
 * RAIDZ expansion background thread
 *
 * Can be called multiple times if the reflow is paused
 *
 * arg is the spa_t being expanded.  zthr is polled with zthr_iscancelled()
 * so that the loops below stop when the thread is cancelled.
 *
 * Outline:
 *  1. Pick the starting offset from the reflow state in spa_ubsync.
 *  2. If starting at offset 0, run raidz_reflow_scratch_sync() as a sync
 *     task; if the offset is still 0 afterwards, set
 *     vre_waiting_for_resilver and return.
 *  3. Walk the metaslabs starting at the one containing vre_offset and
 *     hand everything that is not allocatable to raidz_reflow_impl().
 *  4. After a final txg_wait_synced(), either run
 *     raidz_reflow_complete_sync() or log a pause (rewinding vre_offset to
 *     vre_failed_offset if a failure was recorded).
 */
static void
spa_raidz_expand_thread(void *arg, zthr_t *zthr)
{
	spa_t *spa = arg;
	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;

	/*
	 * With a valid scratch area recorded in the uberblock we restart from
	 * offset 0; otherwise resume from the offset stored in the uberblock.
	 */
	if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID)
		vre->vre_offset = 0;
	else
		vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync);

	/* Reflow the beginning portion using the scratch area */
	if (vre->vre_offset == 0) {
		VERIFY0(dsl_sync_task(spa_name(spa),
		    NULL, raidz_reflow_scratch_sync,
		    vre, 0, ZFS_SPACE_CHECK_NONE));

		/* if we encountered errors then pause */
		if (vre->vre_offset == 0) {
			mutex_enter(&vre->vre_lock);
			vre->vre_waiting_for_resilver = B_TRUE;
			mutex_exit(&vre->vre_lock);
			return;
		}
	}

	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);

	/* Saved now; passed to vdev_online() if the reflow completes. */
	uint64_t guid = raidvd->vdev_guid;

	/* Iterate over all the remaining metaslabs */
	for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift;
	    i < raidvd->vdev_ms_count &&
	    !zthr_iscancelled(zthr) &&
	    vre->vre_failed_offset == UINT64_MAX; i++) {
		metaslab_t *msp = raidvd->vdev_ms[i];

		/*
		 * The metaslab stays disabled until the matching
		 * metaslab_enable() below (or in the ms_new early-out).
		 */
		metaslab_disable(msp);
		mutex_enter(&msp->ms_lock);

		/*
		 * The metaslab may be newly created (for the expanded
		 * space), in which case its trees won't exist yet,
		 * so we need to bail out early.
		 */
		if (msp->ms_new) {
			mutex_exit(&msp->ms_lock);
			metaslab_enable(msp, B_FALSE, B_FALSE);
			continue;
		}

		VERIFY0(metaslab_load(msp));

		/*
		 * We want to copy everything except the free (allocatable)
		 * space.  Note that there may be a little bit more free
		 * space (e.g. in ms_defer), and it's fine to copy that too.
		 */
		uint64_t shift, start;
		range_seg_type_t type = metaslab_calculate_range_tree_type(
		    raidvd, msp, &start, &shift);
		range_tree_t *rt = range_tree_create(NULL, type, NULL,
		    start, shift);
		/* Start with the whole metaslab, then subtract ms_allocatable. */
		range_tree_add(rt, msp->ms_start, msp->ms_size);
		range_tree_walk(msp->ms_allocatable, range_tree_remove, rt);
		mutex_exit(&msp->ms_lock);

		/*
		 * Force the last sector of each metaslab to be copied.  This
		 * ensures that we advance the on-disk progress to the end of
		 * this metaslab while the metaslab is disabled.  Otherwise, we
		 * could move past this metaslab without advancing the on-disk
		 * progress, and then an allocation to this metaslab would not
		 * be copied.
		 */
		int sectorsz = 1 << raidvd->vdev_ashift;
		uint64_t ms_last_offset = msp->ms_start +
		    msp->ms_size - sectorsz;
		if (!range_tree_contains(rt, ms_last_offset, sectorsz)) {
			range_tree_add(rt, ms_last_offset, sectorsz);
		}

		/*
		 * When we are resuming from a paused expansion (i.e.
		 * when importing a pool with an expansion in progress),
		 * discard any state that we have already processed.
		 */
		if (vre->vre_offset > msp->ms_start) {
			range_tree_clear(rt, msp->ms_start,
			    vre->vre_offset - msp->ms_start);
		}

		/*
		 * Copy loop for this metaslab.  Invariant: SCL_CONFIG is held
		 * as reader at the top and at the bottom of every iteration.
		 */
		while (!zthr_iscancelled(zthr) &&
		    !range_tree_is_empty(rt) &&
		    vre->vre_failed_offset == UINT64_MAX) {

			/*
			 * We need to periodically drop the config lock so that
			 * writers can get in.  Additionally, we can't wait
			 * for a txg to sync while holding a config lock
			 * (since a waiting writer could cause a 3-way deadlock
			 * with the sync thread, which also gets a config
			 * lock for reader).  So we can't hold the config lock
			 * while calling dmu_tx_assign().
			 */
			spa_config_exit(spa, SCL_CONFIG, FTAG);

			/*
			 * If requested, pause the reflow when the amount
			 * specified by raidz_expand_max_reflow_bytes is reached
			 *
			 * This pause is only used during testing or debugging.
			 */
			while (raidz_expand_max_reflow_bytes != 0 &&
			    raidz_expand_max_reflow_bytes <=
			    vre->vre_bytes_copied && !zthr_iscancelled(zthr)) {
				delay(hz);
			}

			/*
			 * Throttle: block on vre_cv while more than
			 * raidz_expand_max_copy_bytes are outstanding.
			 */
			mutex_enter(&vre->vre_lock);
			while (vre->vre_outstanding_bytes >
			    raidz_expand_max_copy_bytes) {
				cv_wait(&vre->vre_cv, &vre->vre_lock);
			}
			mutex_exit(&vre->vre_lock);

			dmu_tx_t *tx =
			    dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);

			VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
			uint64_t txg = dmu_tx_get_txg(tx);

			/*
			 * Reacquire the vdev_config lock.  Theoretically, the
			 * vdev_t that we're expanding may have changed.
			 */
			spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
			raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);

			boolean_t needsync =
			    raidz_reflow_impl(raidvd, vre, rt, tx);

			dmu_tx_commit(tx);

			/*
			 * raidz_reflow_impl() asked us to wait for this txg;
			 * the config lock is dropped around the wait for the
			 * reason given at the top of the loop.
			 */
			if (needsync) {
				spa_config_exit(spa, SCL_CONFIG, FTAG);
				txg_wait_synced(spa->spa_dsl_pool, txg);
				spa_config_enter(spa, SCL_CONFIG, FTAG,
				    RW_READER);
			}
		}

		/*
		 * Done with this metaslab: re-enable it and tear down the
		 * range tree with the config lock dropped, then re-take the
		 * lock and look the vdev up again for the next iteration.
		 */
		spa_config_exit(spa, SCL_CONFIG, FTAG);

		metaslab_enable(msp, B_FALSE, B_FALSE);
		range_tree_vacate(rt, NULL, NULL);
		range_tree_destroy(rt);

		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
		raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
	}

	spa_config_exit(spa, SCL_CONFIG, FTAG);

	/*
	 * The txg_wait_synced() here ensures that all reflow zio's have
	 * completed, and vre_failed_offset has been set if necessary.  It
	 * also ensures that the progress of the last raidz_reflow_sync() is
	 * written to disk before raidz_reflow_complete_sync() changes the
	 * in-memory vre_state.  vdev_raidz_io_start() uses vre_state to
	 * determine if a reflow is in progress, in which case we may need to
	 * write to both old and new locations.  Therefore we can only change
	 * vre_state once this is not necessary, which is once the on-disk
	 * progress (in spa_ubsync) has been set past any possible writes (to
	 * the end of the last metaslab).
	 */
	txg_wait_synced(spa->spa_dsl_pool, 0);

	/*
	 * Completion test: progress has reached the end of the last metaslab.
	 *
	 * NOTE(review): raidvd is dereferenced here after SCL_CONFIG was
	 * dropped above -- confirm the vdev cannot go away at this point.
	 */
	if (!zthr_iscancelled(zthr) &&
	    vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) {
		/*
		 * We are not being canceled or paused, so the reflow must be
		 * complete. In that case also mark it as completed on disk.
		 */
		ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX);
		VERIFY0(dsl_sync_task(spa_name(spa), NULL,
		    raidz_reflow_complete_sync, spa,
		    0, ZFS_SPACE_CHECK_NONE));
		/* Return value deliberately ignored. */
		(void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL);
	} else {
		/*
		 * Wait for all copy zio's to complete and for all the
		 * raidz_reflow_sync() synctasks to be run.
		 */
		spa_history_log_internal(spa, "reflow pause",
		    NULL, "offset=%llu failed_offset=%lld",
		    (long long)vre->vre_offset,
		    (long long)vre->vre_failed_offset);
		mutex_enter(&vre->vre_lock);
		if (vre->vre_failed_offset != UINT64_MAX) {
			/*
			 * Reset progress so that we will retry everything
			 * after the point that something failed.
			 */
			vre->vre_offset = vre->vre_failed_offset;
			vre->vre_failed_offset = UINT64_MAX;
			vre->vre_waiting_for_resilver = B_TRUE;
		}
		mutex_exit(&vre->vre_lock);
	}
}
4708 
4709 void
4710 spa_start_raidz_expansion_thread(spa_t *spa)
4711 {
4712 	ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL);
4713 	spa->spa_raidz_expand_zthr = zthr_create("raidz_expand",
4714 	    spa_raidz_expand_thread_check, spa_raidz_expand_thread,
4715 	    spa, defclsyspri);
4716 }
4717 
4718 void
4719 raidz_dtl_reassessed(vdev_t *vd)
4720 {
4721 	spa_t *spa = vd->vdev_spa;
4722 	if (spa->spa_raidz_expand != NULL) {
4723 		vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4724 		/*
4725 		 * we get called often from vdev_dtl_reassess() so make
4726 		 * sure it's our vdev and any replacing is complete
4727 		 */
4728 		if (vd->vdev_top->vdev_id == vre->vre_vdev_id &&
4729 		    !vdev_raidz_expand_child_replacing(vd->vdev_top)) {
4730 			mutex_enter(&vre->vre_lock);
4731 			if (vre->vre_waiting_for_resilver) {
4732 				vdev_dbgmsg(vd, "DTL reassessed, "
4733 				    "continuing raidz expansion");
4734 				vre->vre_waiting_for_resilver = B_FALSE;
4735 				zthr_wakeup(spa->spa_raidz_expand_zthr);
4736 			}
4737 			mutex_exit(&vre->vre_lock);
4738 		}
4739 	}
4740 }
4741 
4742 int
4743 vdev_raidz_attach_check(vdev_t *new_child)
4744 {
4745 	vdev_t *raidvd = new_child->vdev_parent;
4746 	uint64_t new_children = raidvd->vdev_children;
4747 
4748 	/*
4749 	 * We use the "boot" space as scratch space to handle overwriting the
4750 	 * initial part of the vdev.  If it is too small, then this expansion
4751 	 * is not allowed.  This would be very unusual (e.g. ashift > 13 and
4752 	 * >200 children).
4753 	 */
4754 	if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) {
4755 		return (EINVAL);
4756 	}
4757 	return (0);
4758 }
4759 
4760 void
4761 vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx)
4762 {
4763 	vdev_t *new_child = arg;
4764 	spa_t *spa = new_child->vdev_spa;
4765 	vdev_t *raidvd = new_child->vdev_parent;
4766 	vdev_raidz_t *vdrz = raidvd->vdev_tsd;
4767 	ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops);
4768 	ASSERT3P(raidvd->vdev_top, ==, raidvd);
4769 	ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width);
4770 	ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1);
4771 	ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==,
4772 	    new_child);
4773 
4774 	spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx);
4775 
4776 	vdrz->vd_physical_width++;
4777 
4778 	VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info);
4779 	vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id;
4780 	vdrz->vn_vre.vre_offset = 0;
4781 	vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
4782 	spa->spa_raidz_expand = &vdrz->vn_vre;
4783 	zthr_wakeup(spa->spa_raidz_expand_zthr);
4784 
4785 	/*
4786 	 * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get
4787 	 * written to the config.
4788 	 */
4789 	vdev_config_dirty(raidvd);
4790 
4791 	vdrz->vn_vre.vre_start_time = gethrestime_sec();
4792 	vdrz->vn_vre.vre_end_time = 0;
4793 	vdrz->vn_vre.vre_state = DSS_SCANNING;
4794 	vdrz->vn_vre.vre_bytes_copied = 0;
4795 
4796 	uint64_t state = vdrz->vn_vre.vre_state;
4797 	VERIFY0(zap_update(spa->spa_meta_objset,
4798 	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
4799 	    sizeof (state), 1, &state, tx));
4800 
4801 	uint64_t start_time = vdrz->vn_vre.vre_start_time;
4802 	VERIFY0(zap_update(spa->spa_meta_objset,
4803 	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
4804 	    sizeof (start_time), 1, &start_time, tx));
4805 
4806 	(void) zap_remove(spa->spa_meta_objset,
4807 	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx);
4808 	(void) zap_remove(spa->spa_meta_objset,
4809 	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx);
4810 
4811 	spa_history_log_internal(spa, "raidz vdev expansion started",  tx,
4812 	    "%s vdev %llu new width %llu", spa_name(spa),
4813 	    (unsigned long long)raidvd->vdev_id,
4814 	    (unsigned long long)raidvd->vdev_children);
4815 }
4816 
4817 int
4818 vdev_raidz_load(vdev_t *vd)
4819 {
4820 	vdev_raidz_t *vdrz = vd->vdev_tsd;
4821 	int err;
4822 
4823 	uint64_t state = DSS_NONE;
4824 	uint64_t start_time = 0;
4825 	uint64_t end_time = 0;
4826 	uint64_t bytes_copied = 0;
4827 
4828 	if (vd->vdev_top_zap != 0) {
4829 		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4830 		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
4831 		    sizeof (state), 1, &state);
4832 		if (err != 0 && err != ENOENT)
4833 			return (err);
4834 
4835 		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4836 		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
4837 		    sizeof (start_time), 1, &start_time);
4838 		if (err != 0 && err != ENOENT)
4839 			return (err);
4840 
4841 		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4842 		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
4843 		    sizeof (end_time), 1, &end_time);
4844 		if (err != 0 && err != ENOENT)
4845 			return (err);
4846 
4847 		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4848 		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
4849 		    sizeof (bytes_copied), 1, &bytes_copied);
4850 		if (err != 0 && err != ENOENT)
4851 			return (err);
4852 	}
4853 
4854 	/*
4855 	 * If we are in the middle of expansion, vre_state should have
4856 	 * already been set by vdev_raidz_init().
4857 	 */
4858 	EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING);
4859 	vdrz->vn_vre.vre_state = (dsl_scan_state_t)state;
4860 	vdrz->vn_vre.vre_start_time = start_time;
4861 	vdrz->vn_vre.vre_end_time = end_time;
4862 	vdrz->vn_vre.vre_bytes_copied = bytes_copied;
4863 
4864 	return (0);
4865 }
4866 
4867 int
4868 spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres)
4869 {
4870 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4871 
4872 	if (vre == NULL) {
4873 		/* no removal in progress; find most recent completed */
4874 		for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
4875 			vdev_t *vd = spa->spa_root_vdev->vdev_child[c];
4876 			if (vd->vdev_ops == &vdev_raidz_ops) {
4877 				vdev_raidz_t *vdrz = vd->vdev_tsd;
4878 
4879 				if (vdrz->vn_vre.vre_end_time != 0 &&
4880 				    (vre == NULL ||
4881 				    vdrz->vn_vre.vre_end_time >
4882 				    vre->vre_end_time)) {
4883 					vre = &vdrz->vn_vre;
4884 				}
4885 			}
4886 		}
4887 	}
4888 
4889 	if (vre == NULL) {
4890 		return (SET_ERROR(ENOENT));
4891 	}
4892 
4893 	pres->pres_state = vre->vre_state;
4894 	pres->pres_expanding_vdev = vre->vre_vdev_id;
4895 
4896 	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
4897 	pres->pres_to_reflow = vd->vdev_stat.vs_alloc;
4898 
4899 	mutex_enter(&vre->vre_lock);
4900 	pres->pres_reflowed = vre->vre_bytes_copied;
4901 	for (int i = 0; i < TXG_SIZE; i++)
4902 		pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i];
4903 	mutex_exit(&vre->vre_lock);
4904 
4905 	pres->pres_start_time = vre->vre_start_time;
4906 	pres->pres_end_time = vre->vre_end_time;
4907 	pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver;
4908 
4909 	return (0);
4910 }
4911 
4912 /*
4913  * Initialize private RAIDZ specific fields from the nvlist.
4914  */
4915 static int
4916 vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
4917 {
4918 	uint_t children;
4919 	nvlist_t **child;
4920 	int error = nvlist_lookup_nvlist_array(nv,
4921 	    ZPOOL_CONFIG_CHILDREN, &child, &children);
4922 	if (error != 0)
4923 		return (SET_ERROR(EINVAL));
4924 
4925 	uint64_t nparity;
4926 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) {
4927 		if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
4928 			return (SET_ERROR(EINVAL));
4929 
4930 		/*
4931 		 * Previous versions could only support 1 or 2 parity
4932 		 * device.
4933 		 */
4934 		if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2)
4935 			return (SET_ERROR(EINVAL));
4936 		else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3)
4937 			return (SET_ERROR(EINVAL));
4938 	} else {
4939 		/*
4940 		 * We require the parity to be specified for SPAs that
4941 		 * support multiple parity levels.
4942 		 */
4943 		if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
4944 			return (SET_ERROR(EINVAL));
4945 
4946 		/*
4947 		 * Otherwise, we default to 1 parity device for RAID-Z.
4948 		 */
4949 		nparity = 1;
4950 	}
4951 
4952 	vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
4953 	vdrz->vn_vre.vre_vdev_id = -1;
4954 	vdrz->vn_vre.vre_offset = UINT64_MAX;
4955 	vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
4956 	mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL);
4957 	cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL);
4958 	zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL);
4959 	mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL);
4960 	avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare,
4961 	    sizeof (reflow_node_t), offsetof(reflow_node_t, re_link));
4962 
4963 	vdrz->vd_physical_width = children;
4964 	vdrz->vd_nparity = nparity;
4965 
4966 	/* note, the ID does not exist when creating a pool */
4967 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
4968 	    &vdrz->vn_vre.vre_vdev_id);
4969 
4970 	boolean_t reflow_in_progress =
4971 	    nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
4972 	if (reflow_in_progress) {
4973 		spa->spa_raidz_expand = &vdrz->vn_vre;
4974 		vdrz->vn_vre.vre_state = DSS_SCANNING;
4975 	}
4976 
4977 	vdrz->vd_original_width = children;
4978 	uint64_t *txgs;
4979 	unsigned int txgs_size = 0;
4980 	error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
4981 	    &txgs, &txgs_size);
4982 	if (error == 0) {
4983 		for (int i = 0; i < txgs_size; i++) {
4984 			reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
4985 			re->re_txg = txgs[txgs_size - i - 1];
4986 			re->re_logical_width = vdrz->vd_physical_width - i;
4987 
4988 			if (reflow_in_progress)
4989 				re->re_logical_width--;
4990 
4991 			avl_add(&vdrz->vd_expand_txgs, re);
4992 		}
4993 
4994 		vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size;
4995 	}
4996 	if (reflow_in_progress) {
4997 		vdrz->vd_original_width--;
4998 		zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions",
4999 		    children, txgs_size);
5000 	}
5001 
5002 	*tsd = vdrz;
5003 
5004 	return (0);
5005 }
5006 
5007 static void
5008 vdev_raidz_fini(vdev_t *vd)
5009 {
5010 	vdev_raidz_t *vdrz = vd->vdev_tsd;
5011 	if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre)
5012 		vd->vdev_spa->spa_raidz_expand = NULL;
5013 	reflow_node_t *re;
5014 	void *cookie = NULL;
5015 	avl_tree_t *tree = &vdrz->vd_expand_txgs;
5016 	while ((re = avl_destroy_nodes(tree, &cookie)) != NULL)
5017 		kmem_free(re, sizeof (*re));
5018 	avl_destroy(&vdrz->vd_expand_txgs);
5019 	mutex_destroy(&vdrz->vd_expand_lock);
5020 	mutex_destroy(&vdrz->vn_vre.vre_lock);
5021 	cv_destroy(&vdrz->vn_vre.vre_cv);
5022 	zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock);
5023 	kmem_free(vdrz, sizeof (*vdrz));
5024 }
5025 
5026 /*
5027  * Add RAIDZ specific fields to the config nvlist.
5028  */
5029 static void
5030 vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
5031 {
5032 	ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
5033 	vdev_raidz_t *vdrz = vd->vdev_tsd;
5034 
5035 	/*
5036 	 * Make sure someone hasn't managed to sneak a fancy new vdev
5037 	 * into a crufty old storage pool.
5038 	 */
5039 	ASSERT(vdrz->vd_nparity == 1 ||
5040 	    (vdrz->vd_nparity <= 2 &&
5041 	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) ||
5042 	    (vdrz->vd_nparity <= 3 &&
5043 	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3));
5044 
5045 	/*
5046 	 * Note that we'll add these even on storage pools where they
5047 	 * aren't strictly required -- older software will just ignore
5048 	 * it.
5049 	 */
5050 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);
5051 
5052 	if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
5053 		fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
5054 	}
5055 
5056 	mutex_enter(&vdrz->vd_expand_lock);
5057 	if (!avl_is_empty(&vdrz->vd_expand_txgs)) {
5058 		uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs);
5059 		uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count,
5060 		    KM_SLEEP);
5061 		uint64_t i = 0;
5062 
5063 		for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs);
5064 		    re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) {
5065 			txgs[i++] = re->re_txg;
5066 		}
5067 
5068 		fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
5069 		    txgs, count);
5070 
5071 		kmem_free(txgs, sizeof (uint64_t) * count);
5072 	}
5073 	mutex_exit(&vdrz->vd_expand_lock);
5074 }
5075 
5076 static uint64_t
5077 vdev_raidz_nparity(vdev_t *vd)
5078 {
5079 	vdev_raidz_t *vdrz = vd->vdev_tsd;
5080 	return (vdrz->vd_nparity);
5081 }
5082 
5083 static uint64_t
5084 vdev_raidz_ndisks(vdev_t *vd)
5085 {
5086 	return (vd->vdev_children);
5087 }
5088 
/*
 * Operations vector for RAID-Z vdevs.
 *
 * Each member points at the RAID-Z handler for that operation; members set
 * to NULL have no RAID-Z specific handler in this table.
 */
vdev_ops_t vdev_raidz_ops = {
	.vdev_op_init = vdev_raidz_init,
	.vdev_op_fini = vdev_raidz_fini,
	.vdev_op_open = vdev_raidz_open,
	.vdev_op_close = vdev_raidz_close,
	.vdev_op_asize = vdev_raidz_asize,
	.vdev_op_min_asize = vdev_raidz_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_raidz_io_start,
	.vdev_op_io_done = vdev_raidz_io_done,
	.vdev_op_state_change = vdev_raidz_state_change,
	.vdev_op_need_resilver = vdev_raidz_need_resilver,
	.vdev_op_hold = NULL,
	.vdev_op_rele = NULL,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_raidz_xlate,
	.vdev_op_rebuild_asize = NULL,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = vdev_raidz_config_generate,
	.vdev_op_nparity = vdev_raidz_nparity,
	.vdev_op_ndisks = vdev_raidz_ndisks,
	.vdev_op_type = VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
};
5113 
/*
 * Module parameter declarations for the RAIDZ expansion tunables; the
 * trailing string of each entry is its description.
 *
 * NOTE(review): ZFS_MODULE_PARAM is defined outside this file; the exact
 * parameter names it generates from the prefix/name pair should be
 * confirmed against that macro.
 */
ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW,
	"For testing, pause RAIDZ expansion after reflowing this many bytes");
ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW,
	"Max amount of concurrent i/o for RAIDZ expansion");
ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW,
	"For expanded RAIDZ, aggregate reads that have more rows than this");
ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW,
	"For expanded RAIDZ, automatically start a pool scrub when expansion "
	"completes");
5123