xref: /freebsd/sys/contrib/openzfs/module/zfs/vdev_raidz.c (revision b9128a37faafede823eb456aa65a11ac69997284)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or https://opensource.org/licenses/CDDL-1.0.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
25  * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
26  */
27 
28 #include <sys/zfs_context.h>
29 #include <sys/spa.h>
30 #include <sys/spa_impl.h>
31 #include <sys/zap.h>
32 #include <sys/vdev_impl.h>
33 #include <sys/metaslab_impl.h>
34 #include <sys/zio.h>
35 #include <sys/zio_checksum.h>
36 #include <sys/dmu_tx.h>
37 #include <sys/abd.h>
38 #include <sys/zfs_rlock.h>
39 #include <sys/fs/zfs.h>
40 #include <sys/fm/fs/zfs.h>
41 #include <sys/vdev_raidz.h>
42 #include <sys/vdev_raidz_impl.h>
43 #include <sys/vdev_draid.h>
44 #include <sys/uberblock_impl.h>
45 #include <sys/dsl_scan.h>
46 
47 #ifdef ZFS_DEBUG
48 #include <sys/vdev.h>	/* For vdev_xlate() in vdev_raidz_io_verify() */
49 #endif
50 
51 /*
52  * Virtual device vector for RAID-Z.
53  *
54  * This vdev supports single, double, and triple parity. For single parity,
55  * we use a simple XOR of all the data columns. For double or triple parity,
56  * we use a special case of Reed-Solomon coding. This extends the
57  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
58  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
59  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
60  * former is also based. The latter is designed to provide higher performance
61  * for writes.
62  *
63  * Note that the Plank paper claimed to support arbitrary N+M, but was then
64  * amended six years later identifying a critical flaw that invalidates its
65  * claims. Nevertheless, the technique can be adapted to work for up to
66  * triple parity. For additional parity, the amendment "Note: Correction to
67  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
68  * is viable, but the additional complexity means that write performance will
69  * suffer.
70  *
71  * All of the methods above operate on a Galois field, defined over the
72  * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
73  * can be expressed with a single byte. Briefly, the operations on the
74  * field are defined as follows:
75  *
76  *   o addition (+) is represented by a bitwise XOR
77  *   o subtraction (-) is therefore identical to addition: A + B = A - B
78  *   o multiplication of A by 2 is defined by the following bitwise expression:
79  *
80  *	(A * 2)_7 = A_6
81  *	(A * 2)_6 = A_5
82  *	(A * 2)_5 = A_4
83  *	(A * 2)_4 = A_3 + A_7
84  *	(A * 2)_3 = A_2 + A_7
85  *	(A * 2)_2 = A_1 + A_7
86  *	(A * 2)_1 = A_0
87  *	(A * 2)_0 = A_7
88  *
89  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
90  * As an aside, this multiplication is derived from the error correcting
91  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
92  *
93  * Observe that any number in the field (except for 0) can be expressed as a
94  * power of 2 -- a generator for the field. We store a table of the powers of
95  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
96  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
97  * than field addition). The inverse of a field element A (A^-1) is therefore
98  * A ^ (255 - 1) = A^254.
99  *
100  * The up-to-three parity columns, P, Q, R over several data columns,
101  * D_0, ... D_n-1, can be expressed by field operations:
102  *
103  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
104  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
105  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
106  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
107  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
108  *
109  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
110  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
111  * independent coefficients. (There are no additional coefficients that have
112  * this property which is why the uncorrected Plank method breaks down.)
113  *
114  * See the reconstruction code below for how P, Q and R can used individually
115  * or in concert to recover missing data columns.
116  */
117 
118 #define	VDEV_RAIDZ_P		0
119 #define	VDEV_RAIDZ_Q		1
120 #define	VDEV_RAIDZ_R		2
121 
122 #define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
123 #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
124 
125 /*
126  * We provide a mechanism to perform the field multiplication operation on a
127  * 64-bit value all at once rather than a byte at a time. This works by
128  * creating a mask from the top bit in each byte and using that to
129  * conditionally apply the XOR of 0x1d.
130  */
131 #define	VDEV_RAIDZ_64MUL_2(x, mask) \
132 { \
133 	(mask) = (x) & 0x8080808080808080ULL; \
134 	(mask) = ((mask) << 1) - ((mask) >> 7); \
135 	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
136 	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
137 }
138 
139 #define	VDEV_RAIDZ_64MUL_4(x, mask) \
140 { \
141 	VDEV_RAIDZ_64MUL_2((x), mask); \
142 	VDEV_RAIDZ_64MUL_2((x), mask); \
143 }
144 
145 
146 /*
147  * Big Theory Statement for how a RAIDZ VDEV is expanded
148  *
149  * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion
150  * works with all three RAIDZ parity choices, including RAIDZ1, 2, or 3. VDEVs
151  * that have been previously expanded can be expanded again.
152  *
153  * The RAIDZ VDEV must be healthy (must be able to write to all the drives in
154  * the VDEV) when an expansion starts.  And the expansion will pause if any
155  * disk in the VDEV fails, and resume once the VDEV is healthy again. All other
156  * operations on the pool can continue while an expansion is in progress (e.g.
157  * read/write, snapshot, zpool add, etc). Except zpool checkpoint, zpool trim,
158  * and zpool initialize which can't be run during an expansion.  Following a
159  * reboot or export/import, the expansion resumes where it left off.
160  *
161  * == Reflowing the Data ==
162  *
163  * The expansion involves reflowing (copying) the data from the current set
164  * of disks to spread it across the new set which now has one more disk. This
165  * reflow operation is similar to reflowing text when the column width of a
166  * text editor window is expanded. The text doesn’t change but the location of
167  * the text changes to accommodate the new width. An example reflow result for
168  * a 4-wide RAIDZ1 to a 5-wide is shown below.
169  *
170  *                            Reflow End State
171  *            Each letter indicates a parity group (logical stripe)
172  *
173  *         Before expansion                         After Expansion
174  *     D1     D2     D3     D4               D1     D2     D3     D4     D5
175  *  +------+------+------+------+         +------+------+------+------+------+
176  *  |      |      |      |      |         |      |      |      |      |      |
177  *  |  A   |  A   |  A   |  A   |         |  A   |  A   |  A   |  A   |  B   |
178  *  |     1|     2|     3|     4|         |     1|     2|     3|     4|     5|
179  *  +------+------+------+------+         +------+------+------+------+------+
180  *  |      |      |      |      |         |      |      |      |      |      |
181  *  |  B   |  B   |  C   |  C   |         |  B   |  C   |  C   |  C   |  C   |
182  *  |     5|     6|     7|     8|         |     6|     7|     8|     9|    10|
183  *  +------+------+------+------+         +------+------+------+------+------+
184  *  |      |      |      |      |         |      |      |      |      |      |
185  *  |  C   |  C   |  D   |  D   |         |  D   |  D   |  E   |  E   |  E   |
186  *  |     9|    10|    11|    12|         |    11|    12|    13|    14|    15|
187  *  +------+------+------+------+         +------+------+------+------+------+
188  *  |      |      |      |      |         |      |      |      |      |      |
189  *  |  E   |  E   |  E   |  E   |   -->   |  E   |  F   |  F   |  G   |  G   |
190  *  |    13|    14|    15|    16|         |    16|    17|    18|p   19|    20|
191  *  +------+------+------+------+         +------+------+------+------+------+
192  *  |      |      |      |      |         |      |      |      |      |      |
193  *  |  F   |  F   |  G   |  G   |         |  G   |  G   |  H   |  H   |  H   |
194  *  |    17|    18|    19|    20|         |    21|    22|    23|    24|    25|
195  *  +------+------+------+------+         +------+------+------+------+------+
196  *  |      |      |      |      |         |      |      |      |      |      |
197  *  |  G   |  G   |  H   |  H   |         |  H   |  I   |  I   |  J   |  J   |
198  *  |    21|    22|    23|    24|         |    26|    27|    28|    29|    30|
199  *  +------+------+------+------+         +------+------+------+------+------+
200  *  |      |      |      |      |         |      |      |      |      |      |
201  *  |  H   |  H   |  I   |  I   |         |  J   |  J   |      |      |  K   |
202  *  |    25|    26|    27|    28|         |    31|    32|    33|    34|    35|
203  *  +------+------+------+------+         +------+------+------+------+------+
204  *
205  * This reflow approach has several advantages. There is no need to read or
206  * modify the block pointers or recompute any block checksums.  The reflow
207  * doesn’t need to know where the parity sectors reside. We can read and write
208  * data sequentially and the copy can occur in a background thread in open
209  * context. The design also allows for fast discovery of what data to copy.
210  *
211  * The VDEV metaslabs are processed, one at a time, to copy the block data to
212  * have it flow across all the disks. The metaslab is disabled for allocations
213  * during the copy. As an optimization, we only copy the allocated data which
214  * can be determined by looking at the metaslab range tree. During the copy we
215  * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still
216  * need to be able to survive losing parity count disks).  This means we
217  * cannot overwrite data during the reflow that would be needed if a disk is
218  * lost.
219  *
220  * After the reflow completes, all newly-written blocks will have the new
221  * layout, i.e., they will have the parity to data ratio implied by the new
222  * number of disks in the RAIDZ group.  Even though the reflow copies all of
223  * the allocated space (data and parity), it is only rearranged, not changed.
224  *
225  * This act of reflowing the data has a few implications about blocks
226  * that were written before the reflow completes:
227  *
228  *  - Old blocks will still use the same amount of space (i.e., they will have
229  *    the parity to data ratio implied by the old number of disks in the RAIDZ
230  *    group).
231  *  - Reading old blocks will be slightly slower than before the reflow, for
232  *    two reasons. First, we will have to read from all disks in the RAIDZ
233  *    VDEV, rather than being able to skip the children that contain only
234  *    parity of this block (because the data of a single block is now spread
235  *    out across all the disks).  Second, in most cases there will be an extra
236  *    bcopy, needed to rearrange the data back to its original layout in memory.
237  *
238  * == Scratch Area ==
239  *
240  * As we copy the block data, we can only progress to the point that writes
241  * will not overlap with blocks whose progress has not yet been recorded on
242  * disk.  Since partially-copied rows are always read from the old location,
243  * we need to stop one row before the sector-wise overlap, to prevent any
244  * row-wise overlap. For example, in the diagram above, when we reflow sector
245  * B6 it will overwite the original location for B5.
246  *
247  * To get around this, a scratch space is used so that we can start copying
248  * without risking data loss by overlapping the row. As an added benefit, it
249  * improves performance at the beginning of the reflow, but that small perf
250  * boost wouldn't be worth the complexity on its own.
251  *
252  * Ideally we want to copy at least 2 * (new_width)^2 so that we have a
253  * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max
254  * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice
255  * the widths will likely be single digits so we can get a substantial chuck
256  * size using only a few MB of scratch per disk.
257  *
258  * The scratch area is persisted to disk which holds a large amount of reflowed
259  * state. We can always read the partially written stripes when a disk fails or
260  * the copy is interrupted (crash) during the initial copying phase and also
261  * get past a small chunk size restriction.  At a minimum, the scratch space
262  * must be large enough to get us to the point that one row does not overlap
263  * itself when moved (i.e new_width^2).  But going larger is even better. We
264  * use the 3.5 MiB reserved "boot" space that resides after the ZFS disk labels
265  * as our scratch space to handle overwriting the initial part of the VDEV.
266  *
267  *	0     256K   512K                    4M
268  *	+------+------+-----------------------+-----------------------------
269  *	| VDEV | VDEV |   Boot Block (3.5M)   |  Allocatable space ...
270  *	|  L0  |  L1  |       Reserved        |     (Metaslabs)
271  *	+------+------+-----------------------+-------------------------------
272  *                        Scratch Area
273  *
274  * == Reflow Progress Updates ==
275  * After the initial scratch-based reflow, the expansion process works
276  * similarly to device removal. We create a new open context thread which
277  * reflows the data, and periodically kicks off sync tasks to update logical
278  * state. In this case, state is the committed progress (offset of next data
279  * to copy). We need to persist the completed offset on disk, so that if we
280  * crash we know which format each VDEV offset is in.
281  *
282  * == Time Dependent Geometry ==
283  *
284  * In non-expanded RAIDZ, blocks are read from disk in a column by column
285  * fashion. For a multi-row block, the second sector is in the first column
286  * not in the second column. This allows us to issue full reads for each
287  * column directly into the request buffer. The block data is thus laid out
288  * sequentially in a column-by-column fashion.
289  *
290  * For example, in the before expansion diagram above, one logical block might
291  * be sectors G19-H26. The parity is in G19,H23; and the data is in
292  * G20,H24,G21,H25,G22,H26.
293  *
294  * After a block is reflowed, the sectors that were all in the original column
295  * data can now reside in different columns. When reading from an expanded
296  * VDEV, we need to know the logical stripe width for each block so we can
297  * reconstitute the block’s data after the reads are completed. Likewise,
298  * when we perform the combinatorial reconstruction we need to know the
299  * original width so we can retry combinations from the past layouts.
300  *
301  * Time dependent geometry is what we call having blocks with different layouts
302  * (stripe widths) in the same VDEV. This time-dependent geometry uses the
303  * block’s birth time (+ the time expansion ended) to establish the correct
304  * width for a given block. After an expansion completes, we record the time
305  * for blocks written with a particular width (geometry).
306  *
307  * == On Disk Format Changes ==
308  *
309  * New pool feature flag, 'raidz_expansion' whose reference count is the number
310  * of RAIDZ VDEVs that have been expanded.
311  *
312  * The blocks on expanded RAIDZ VDEV can have different logical stripe widths.
313  *
314  * Since the uberblock can point to arbitrary blocks, which might be on the
315  * expanding RAIDZ, and might or might not have been expanded. We need to know
316  * which way a block is laid out before reading it. This info is the next
317  * offset that needs to be reflowed and we persist that in the uberblock, in
318  * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev label.
319  * After the expansion is complete, we then use the raidz_expand_txgs array
320  * (see below) to determine how to read a block and the ub_raidz_reflow_info
321  * field no longer required.
322  *
323  * The uberblock's ub_raidz_reflow_info field also holds the scratch space
324  * state (i.e., active or not) which is also required before reading a block
325  * during the initial phase of reflowing the data.
326  *
327  * The top-level RAIDZ VDEV has two new entries in the nvlist:
328  *
329  * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here
330  *                            and used after the expansion is complete to
331  *                            determine how to read a raidz block
332  * 'raidz_expanding' boolean: present during reflow and removed after completion
333  *                            used during a spa import to resume an unfinished
334  *                            expansion
335  *
336  * And finally the VDEVs top zap adds the following informational entries:
337  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE
338  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME
339  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME
340  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED
341  */
342 
343 /*
344  * For testing only: pause the raidz expansion after reflowing this amount.
345  * (accessed by ZTS and ztest)
346  */
347 #ifdef	_KERNEL
348 static
349 #endif	/* _KERNEL */
350 unsigned long raidz_expand_max_reflow_bytes = 0;
351 
352 /*
353  * For testing only: pause the raidz expansion at a certain point.
354  */
355 uint_t raidz_expand_pause_point = 0;
356 
357 /*
358  * Maximum amount of copy io's outstanding at once.
359  */
360 static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE;
361 
362 /*
363  * Apply raidz map abds aggregation if the number of rows in the map is equal
364  * or greater than the value below.
365  */
366 static unsigned long raidz_io_aggregate_rows = 4;
367 
368 /*
369  * Automatically start a pool scrub when a RAIDZ expansion completes in
370  * order to verify the checksums of all blocks which have been copied
371  * during the expansion.  Automatic scrubbing is enabled by default and
372  * is strongly recommended.
373  */
374 static int zfs_scrub_after_expand = 1;
375 
376 static void
377 vdev_raidz_row_free(raidz_row_t *rr)
378 {
379 	for (int c = 0; c < rr->rr_cols; c++) {
380 		raidz_col_t *rc = &rr->rr_col[c];
381 
382 		if (rc->rc_size != 0)
383 			abd_free(rc->rc_abd);
384 		if (rc->rc_orig_data != NULL)
385 			abd_free(rc->rc_orig_data);
386 	}
387 
388 	if (rr->rr_abd_empty != NULL)
389 		abd_free(rr->rr_abd_empty);
390 
391 	kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols]));
392 }
393 
394 void
395 vdev_raidz_map_free(raidz_map_t *rm)
396 {
397 	for (int i = 0; i < rm->rm_nrows; i++)
398 		vdev_raidz_row_free(rm->rm_row[i]);
399 
400 	if (rm->rm_nphys_cols) {
401 		for (int i = 0; i < rm->rm_nphys_cols; i++) {
402 			if (rm->rm_phys_col[i].rc_abd != NULL)
403 				abd_free(rm->rm_phys_col[i].rc_abd);
404 		}
405 
406 		kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) *
407 		    rm->rm_nphys_cols);
408 	}
409 
410 	ASSERT3P(rm->rm_lr, ==, NULL);
411 	kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
412 }
413 
414 static void
415 vdev_raidz_map_free_vsd(zio_t *zio)
416 {
417 	raidz_map_t *rm = zio->io_vsd;
418 
419 	vdev_raidz_map_free(rm);
420 }
421 
422 static int
423 vdev_raidz_reflow_compare(const void *x1, const void *x2)
424 {
425 	const reflow_node_t *l = x1;
426 	const reflow_node_t *r = x2;
427 
428 	return (TREE_CMP(l->re_txg, r->re_txg));
429 }
430 
431 const zio_vsd_ops_t vdev_raidz_vsd_ops = {
432 	.vsd_free = vdev_raidz_map_free_vsd,
433 };
434 
435 raidz_row_t *
436 vdev_raidz_row_alloc(int cols)
437 {
438 	raidz_row_t *rr =
439 	    kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP);
440 
441 	rr->rr_cols = cols;
442 	rr->rr_scols = cols;
443 
444 	for (int c = 0; c < cols; c++) {
445 		raidz_col_t *rc = &rr->rr_col[c];
446 		rc->rc_shadow_devidx = INT_MAX;
447 		rc->rc_shadow_offset = UINT64_MAX;
448 		rc->rc_allow_repair = 1;
449 	}
450 	return (rr);
451 }
452 
453 static void
454 vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift)
455 {
456 	int c;
457 	int nwrapped = 0;
458 	uint64_t off = 0;
459 	raidz_row_t *rr = rm->rm_row[0];
460 
461 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
462 	ASSERT3U(rm->rm_nrows, ==, 1);
463 
464 	/*
465 	 * Pad any parity columns with additional space to account for skip
466 	 * sectors.
467 	 */
468 	if (rm->rm_skipstart < rr->rr_firstdatacol) {
469 		ASSERT0(rm->rm_skipstart);
470 		nwrapped = rm->rm_nskip;
471 	} else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) {
472 		nwrapped =
473 		    (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols;
474 	}
475 
476 	/*
477 	 * Optional single skip sectors (rc_size == 0) will be handled in
478 	 * vdev_raidz_io_start_write().
479 	 */
480 	int skipped = rr->rr_scols - rr->rr_cols;
481 
482 	/* Allocate buffers for the parity columns */
483 	for (c = 0; c < rr->rr_firstdatacol; c++) {
484 		raidz_col_t *rc = &rr->rr_col[c];
485 
486 		/*
487 		 * Parity columns will pad out a linear ABD to account for
488 		 * the skip sector. A linear ABD is used here because
489 		 * parity calculations use the ABD buffer directly to calculate
490 		 * parity. This avoids doing a memcpy back to the ABD after the
491 		 * parity has been calculated. By issuing the parity column
492 		 * with the skip sector we can reduce contention on the child
493 		 * VDEV queue locks (vq_lock).
494 		 */
495 		if (c < nwrapped) {
496 			rc->rc_abd = abd_alloc_linear(
497 			    rc->rc_size + (1ULL << ashift), B_FALSE);
498 			abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift);
499 			skipped++;
500 		} else {
501 			rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
502 		}
503 	}
504 
505 	for (off = 0; c < rr->rr_cols; c++) {
506 		raidz_col_t *rc = &rr->rr_col[c];
507 		abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct,
508 		    zio->io_abd, off, rc->rc_size);
509 
510 		/*
511 		 * Generate I/O for skip sectors to improve aggregation
512 		 * continuity. We will use gang ABD's to reduce contention
513 		 * on the child VDEV queue locks (vq_lock) by issuing
514 		 * a single I/O that contains the data and skip sector.
515 		 *
516 		 * It is important to make sure that rc_size is not updated
517 		 * even though we are adding a skip sector to the ABD. When
518 		 * calculating the parity in vdev_raidz_generate_parity_row()
519 		 * the rc_size is used to iterate through the ABD's. We can
520 		 * not have zero'd out skip sectors used for calculating
521 		 * parity for raidz, because those same sectors are not used
522 		 * during reconstruction.
523 		 */
524 		if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) {
525 			rc->rc_abd = abd_alloc_gang();
526 			abd_gang_add(rc->rc_abd, abd, B_TRUE);
527 			abd_gang_add(rc->rc_abd,
528 			    abd_get_zeros(1ULL << ashift), B_TRUE);
529 			skipped++;
530 		} else {
531 			rc->rc_abd = abd;
532 		}
533 		off += rc->rc_size;
534 	}
535 
536 	ASSERT3U(off, ==, zio->io_size);
537 	ASSERT3S(skipped, ==, rm->rm_nskip);
538 }
539 
540 static void
541 vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm)
542 {
543 	int c;
544 	raidz_row_t *rr = rm->rm_row[0];
545 
546 	ASSERT3U(rm->rm_nrows, ==, 1);
547 
548 	/* Allocate buffers for the parity columns */
549 	for (c = 0; c < rr->rr_firstdatacol; c++)
550 		rr->rr_col[c].rc_abd =
551 		    abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);
552 
553 	for (uint64_t off = 0; c < rr->rr_cols; c++) {
554 		raidz_col_t *rc = &rr->rr_col[c];
555 		rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
556 		    zio->io_abd, off, rc->rc_size);
557 		off += rc->rc_size;
558 	}
559 }
560 
561 /*
562  * Divides the IO evenly across all child vdevs; usually, dcols is
563  * the number of children in the target vdev.
564  *
565  * Avoid inlining the function to keep vdev_raidz_io_start(), which
566  * is this functions only caller, as small as possible on the stack.
567  */
568 noinline raidz_map_t *
569 vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
570     uint64_t nparity)
571 {
572 	raidz_row_t *rr;
573 	/* The starting RAIDZ (parent) vdev sector of the block. */
574 	uint64_t b = zio->io_offset >> ashift;
575 	/* The zio's size in units of the vdev's minimum sector size. */
576 	uint64_t s = zio->io_size >> ashift;
577 	/* The first column for this stripe. */
578 	uint64_t f = b % dcols;
579 	/* The starting byte offset on each child vdev. */
580 	uint64_t o = (b / dcols) << ashift;
581 	uint64_t acols, scols;
582 
583 	raidz_map_t *rm =
584 	    kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
585 	rm->rm_nrows = 1;
586 
587 	/*
588 	 * "Quotient": The number of data sectors for this stripe on all but
589 	 * the "big column" child vdevs that also contain "remainder" data.
590 	 */
591 	uint64_t q = s / (dcols - nparity);
592 
593 	/*
594 	 * "Remainder": The number of partial stripe data sectors in this I/O.
595 	 * This will add a sector to some, but not all, child vdevs.
596 	 */
597 	uint64_t r = s - q * (dcols - nparity);
598 
599 	/* The number of "big columns" - those which contain remainder data. */
600 	uint64_t bc = (r == 0 ? 0 : r + nparity);
601 
602 	/*
603 	 * The total number of data and parity sectors associated with
604 	 * this I/O.
605 	 */
606 	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
607 
608 	/*
609 	 * acols: The columns that will be accessed.
610 	 * scols: The columns that will be accessed or skipped.
611 	 */
612 	if (q == 0) {
613 		/* Our I/O request doesn't span all child vdevs. */
614 		acols = bc;
615 		scols = MIN(dcols, roundup(bc, nparity + 1));
616 	} else {
617 		acols = dcols;
618 		scols = dcols;
619 	}
620 
621 	ASSERT3U(acols, <=, scols);
622 	rr = vdev_raidz_row_alloc(scols);
623 	rm->rm_row[0] = rr;
624 	rr->rr_cols = acols;
625 	rr->rr_bigcols = bc;
626 	rr->rr_firstdatacol = nparity;
627 #ifdef ZFS_DEBUG
628 	rr->rr_offset = zio->io_offset;
629 	rr->rr_size = zio->io_size;
630 #endif
631 
632 	uint64_t asize = 0;
633 
634 	for (uint64_t c = 0; c < scols; c++) {
635 		raidz_col_t *rc = &rr->rr_col[c];
636 		uint64_t col = f + c;
637 		uint64_t coff = o;
638 		if (col >= dcols) {
639 			col -= dcols;
640 			coff += 1ULL << ashift;
641 		}
642 		rc->rc_devidx = col;
643 		rc->rc_offset = coff;
644 
645 		if (c >= acols)
646 			rc->rc_size = 0;
647 		else if (c < bc)
648 			rc->rc_size = (q + 1) << ashift;
649 		else
650 			rc->rc_size = q << ashift;
651 
652 		asize += rc->rc_size;
653 	}
654 
655 	ASSERT3U(asize, ==, tot << ashift);
656 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
657 	rm->rm_skipstart = bc;
658 
659 	/*
660 	 * If all data stored spans all columns, there's a danger that parity
661 	 * will always be on the same device and, since parity isn't read
662 	 * during normal operation, that device's I/O bandwidth won't be
663 	 * used effectively. We therefore switch the parity every 1MB.
664 	 *
665 	 * ... at least that was, ostensibly, the theory. As a practical
666 	 * matter unless we juggle the parity between all devices evenly, we
667 	 * won't see any benefit. Further, occasional writes that aren't a
668 	 * multiple of the LCM of the number of children and the minimum
669 	 * stripe width are sufficient to avoid pessimal behavior.
670 	 * Unfortunately, this decision created an implicit on-disk format
671 	 * requirement that we need to support for all eternity, but only
672 	 * for single-parity RAID-Z.
673 	 *
674 	 * If we intend to skip a sector in the zeroth column for padding
675 	 * we must make sure to note this swap. We will never intend to
676 	 * skip the first column since at least one data and one parity
677 	 * column must appear in each row.
678 	 */
679 	ASSERT(rr->rr_cols >= 2);
680 	ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
681 
682 	if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
683 		uint64_t devidx = rr->rr_col[0].rc_devidx;
684 		o = rr->rr_col[0].rc_offset;
685 		rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
686 		rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
687 		rr->rr_col[1].rc_devidx = devidx;
688 		rr->rr_col[1].rc_offset = o;
689 		if (rm->rm_skipstart == 0)
690 			rm->rm_skipstart = 1;
691 	}
692 
693 	if (zio->io_type == ZIO_TYPE_WRITE) {
694 		vdev_raidz_map_alloc_write(zio, rm, ashift);
695 	} else {
696 		vdev_raidz_map_alloc_read(zio, rm);
697 	}
698 	/* init RAIDZ parity ops */
699 	rm->rm_ops = vdev_raidz_math_get_ops();
700 
701 	return (rm);
702 }
703 
704 /*
705  * Everything before reflow_offset_synced should have been moved to the new
706  * location (read and write completed).  However, this may not yet be reflected
707  * in the on-disk format (e.g. raidz_reflow_sync() has been called but the
708  * uberblock has not yet been written). If reflow is not in progress,
709  * reflow_offset_synced should be UINT64_MAX. For each row, if the row is
710  * entirely before reflow_offset_synced, it will come from the new location.
711  * Otherwise this row will come from the old location.  Therefore, rows that
712  * straddle the reflow_offset_synced will come from the old location.
713  *
714  * For writes, reflow_offset_next is the next offset to copy.  If a sector has
715  * been copied, but not yet reflected in the on-disk progress
716  * (reflow_offset_synced), it will also be written to the new (already copied)
717  * offset.
718  */
719 noinline raidz_map_t *
720 vdev_raidz_map_alloc_expanded(zio_t *zio,
721     uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
722     uint64_t nparity, uint64_t reflow_offset_synced,
723     uint64_t reflow_offset_next, boolean_t use_scratch)
724 {
725 	abd_t *abd = zio->io_abd;
726 	uint64_t offset = zio->io_offset;
727 	uint64_t size = zio->io_size;
728 
729 	/* The zio's size in units of the vdev's minimum sector size. */
730 	uint64_t s = size >> ashift;
731 
732 	/*
733 	 * "Quotient": The number of data sectors for this stripe on all but
734 	 * the "big column" child vdevs that also contain "remainder" data.
735 	 * AKA "full rows"
736 	 */
737 	uint64_t q = s / (logical_cols - nparity);
738 
739 	/*
740 	 * "Remainder": The number of partial stripe data sectors in this I/O.
741 	 * This will add a sector to some, but not all, child vdevs.
742 	 */
743 	uint64_t r = s - q * (logical_cols - nparity);
744 
745 	/* The number of "big columns" - those which contain remainder data. */
746 	uint64_t bc = (r == 0 ? 0 : r + nparity);
747 
748 	/*
749 	 * The total number of data and parity sectors associated with
750 	 * this I/O.
751 	 */
752 	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
753 
754 	/* How many rows contain data (not skip) */
755 	uint64_t rows = howmany(tot, logical_cols);
756 	int cols = MIN(tot, logical_cols);
757 
758 	raidz_map_t *rm =
759 	    kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
760 	    KM_SLEEP);
761 	rm->rm_nrows = rows;
762 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
763 	rm->rm_skipstart = bc;
764 	uint64_t asize = 0;
765 
766 	for (uint64_t row = 0; row < rows; row++) {
767 		boolean_t row_use_scratch = B_FALSE;
768 		raidz_row_t *rr = vdev_raidz_row_alloc(cols);
769 		rm->rm_row[row] = rr;
770 
771 		/* The starting RAIDZ (parent) vdev sector of the row. */
772 		uint64_t b = (offset >> ashift) + row * logical_cols;
773 
774 		/*
775 		 * If we are in the middle of a reflow, and the copying has
776 		 * not yet completed for any part of this row, then use the
777 		 * old location of this row.  Note that reflow_offset_synced
778 		 * reflects the i/o that's been completed, because it's
779 		 * updated by a synctask, after zio_wait(spa_txg_zio[]).
780 		 * This is sufficient for our check, even if that progress
781 		 * has not yet been recorded to disk (reflected in
782 		 * spa_ubsync).  Also note that we consider the last row to
783 		 * be "full width" (`cols`-wide rather than `bc`-wide) for
784 		 * this calculation. This causes a tiny bit of unnecessary
785 		 * double-writes but is safe and simpler to calculate.
786 		 */
787 		int row_phys_cols = physical_cols;
788 		if (b + cols > reflow_offset_synced >> ashift)
789 			row_phys_cols--;
790 		else if (use_scratch)
791 			row_use_scratch = B_TRUE;
792 
793 		/* starting child of this row */
794 		uint64_t child_id = b % row_phys_cols;
795 		/* The starting byte offset on each child vdev. */
796 		uint64_t child_offset = (b / row_phys_cols) << ashift;
797 
798 		/*
799 		 * Note, rr_cols is the entire width of the block, even
800 		 * if this row is shorter.  This is needed because parity
801 		 * generation (for Q and R) needs to know the entire width,
802 		 * because it treats the short row as though it was
803 		 * full-width (and the "phantom" sectors were zero-filled).
804 		 *
805 		 * Another approach to this would be to set cols shorter
806 		 * (to just the number of columns that we might do i/o to)
807 		 * and have another mechanism to tell the parity generation
808 		 * about the "entire width".  Reconstruction (at least
809 		 * vdev_raidz_reconstruct_general()) would also need to
810 		 * know about the "entire width".
811 		 */
812 		rr->rr_firstdatacol = nparity;
813 #ifdef ZFS_DEBUG
814 		/*
815 		 * note: rr_size is PSIZE, not ASIZE
816 		 */
817 		rr->rr_offset = b << ashift;
818 		rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift;
819 #endif
820 
821 		for (int c = 0; c < rr->rr_cols; c++, child_id++) {
822 			if (child_id >= row_phys_cols) {
823 				child_id -= row_phys_cols;
824 				child_offset += 1ULL << ashift;
825 			}
826 			raidz_col_t *rc = &rr->rr_col[c];
827 			rc->rc_devidx = child_id;
828 			rc->rc_offset = child_offset;
829 
830 			/*
831 			 * Get this from the scratch space if appropriate.
832 			 * This only happens if we crashed in the middle of
833 			 * raidz_reflow_scratch_sync() (while it's running,
834 			 * the rangelock prevents us from doing concurrent
835 			 * io), and even then only during zpool import or
836 			 * when the pool is imported readonly.
837 			 */
838 			if (row_use_scratch)
839 				rc->rc_offset -= VDEV_BOOT_SIZE;
840 
841 			uint64_t dc = c - rr->rr_firstdatacol;
842 			if (c < rr->rr_firstdatacol) {
843 				rc->rc_size = 1ULL << ashift;
844 
845 				/*
846 				 * Parity sectors' rc_abd's are set below
847 				 * after determining if this is an aggregation.
848 				 */
849 			} else if (row == rows - 1 && bc != 0 && c >= bc) {
850 				/*
851 				 * Past the end of the block (even including
852 				 * skip sectors).  This sector is part of the
853 				 * map so that we have full rows for p/q parity
854 				 * generation.
855 				 */
856 				rc->rc_size = 0;
857 				rc->rc_abd = NULL;
858 			} else {
859 				/* "data column" (col excluding parity) */
860 				uint64_t off;
861 
862 				if (c < bc || r == 0) {
863 					off = dc * rows + row;
864 				} else {
865 					off = r * rows +
866 					    (dc - r) * (rows - 1) + row;
867 				}
868 				rc->rc_size = 1ULL << ashift;
869 				rc->rc_abd = abd_get_offset_struct(
870 				    &rc->rc_abdstruct, abd, off << ashift,
871 				    rc->rc_size);
872 			}
873 
874 			if (rc->rc_size == 0)
875 				continue;
876 
877 			/*
878 			 * If any part of this row is in both old and new
879 			 * locations, the primary location is the old
880 			 * location. If this sector was already copied to the
881 			 * new location, we need to also write to the new,
882 			 * "shadow" location.
883 			 *
884 			 * Note, `row_phys_cols != physical_cols` indicates
885 			 * that the primary location is the old location.
886 			 * `b+c < reflow_offset_next` indicates that the copy
887 			 * to the new location has been initiated. We know
888 			 * that the copy has completed because we have the
889 			 * rangelock, which is held exclusively while the
890 			 * copy is in progress.
891 			 */
892 			if (row_use_scratch ||
893 			    (row_phys_cols != physical_cols &&
894 			    b + c < reflow_offset_next >> ashift)) {
895 				rc->rc_shadow_devidx = (b + c) % physical_cols;
896 				rc->rc_shadow_offset =
897 				    ((b + c) / physical_cols) << ashift;
898 				if (row_use_scratch)
899 					rc->rc_shadow_offset -= VDEV_BOOT_SIZE;
900 			}
901 
902 			asize += rc->rc_size;
903 		}
904 
905 		/*
906 		 * See comment in vdev_raidz_map_alloc()
907 		 */
908 		if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
909 		    (offset & (1ULL << 20))) {
910 			ASSERT(rr->rr_cols >= 2);
911 			ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
912 
913 			int devidx0 = rr->rr_col[0].rc_devidx;
914 			uint64_t offset0 = rr->rr_col[0].rc_offset;
915 			int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx;
916 			uint64_t shadow_offset0 =
917 			    rr->rr_col[0].rc_shadow_offset;
918 
919 			rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
920 			rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
921 			rr->rr_col[0].rc_shadow_devidx =
922 			    rr->rr_col[1].rc_shadow_devidx;
923 			rr->rr_col[0].rc_shadow_offset =
924 			    rr->rr_col[1].rc_shadow_offset;
925 
926 			rr->rr_col[1].rc_devidx = devidx0;
927 			rr->rr_col[1].rc_offset = offset0;
928 			rr->rr_col[1].rc_shadow_devidx = shadow_devidx0;
929 			rr->rr_col[1].rc_shadow_offset = shadow_offset0;
930 		}
931 	}
932 	ASSERT3U(asize, ==, tot << ashift);
933 
934 	/*
935 	 * Determine if the block is contiguous, in which case we can use
936 	 * an aggregation.
937 	 */
938 	if (rows >= raidz_io_aggregate_rows) {
939 		rm->rm_nphys_cols = physical_cols;
940 		rm->rm_phys_col =
941 		    kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols,
942 		    KM_SLEEP);
943 
944 		/*
945 		 * Determine the aggregate io's offset and size, and check
946 		 * that the io is contiguous.
947 		 */
948 		for (int i = 0;
949 		    i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) {
950 			raidz_row_t *rr = rm->rm_row[i];
951 			for (int c = 0; c < rr->rr_cols; c++) {
952 				raidz_col_t *rc = &rr->rr_col[c];
953 				raidz_col_t *prc =
954 				    &rm->rm_phys_col[rc->rc_devidx];
955 
956 				if (rc->rc_size == 0)
957 					continue;
958 
959 				if (prc->rc_size == 0) {
960 					ASSERT0(prc->rc_offset);
961 					prc->rc_offset = rc->rc_offset;
962 				} else if (prc->rc_offset + prc->rc_size !=
963 				    rc->rc_offset) {
964 					/*
965 					 * This block is not contiguous and
966 					 * therefore can't be aggregated.
967 					 * This is expected to be rare, so
968 					 * the cost of allocating and then
969 					 * freeing rm_phys_col is not
970 					 * significant.
971 					 */
972 					kmem_free(rm->rm_phys_col,
973 					    sizeof (raidz_col_t) *
974 					    rm->rm_nphys_cols);
975 					rm->rm_phys_col = NULL;
976 					rm->rm_nphys_cols = 0;
977 					break;
978 				}
979 				prc->rc_size += rc->rc_size;
980 			}
981 		}
982 	}
983 	if (rm->rm_phys_col != NULL) {
984 		/*
985 		 * Allocate aggregate ABD's.
986 		 */
987 		for (int i = 0; i < rm->rm_nphys_cols; i++) {
988 			raidz_col_t *prc = &rm->rm_phys_col[i];
989 
990 			prc->rc_devidx = i;
991 
992 			if (prc->rc_size == 0)
993 				continue;
994 
995 			prc->rc_abd =
996 			    abd_alloc_linear(rm->rm_phys_col[i].rc_size,
997 			    B_FALSE);
998 		}
999 
1000 		/*
1001 		 * Point the parity abd's into the aggregate abd's.
1002 		 */
1003 		for (int i = 0; i < rm->rm_nrows; i++) {
1004 			raidz_row_t *rr = rm->rm_row[i];
1005 			for (int c = 0; c < rr->rr_firstdatacol; c++) {
1006 				raidz_col_t *rc = &rr->rr_col[c];
1007 				raidz_col_t *prc =
1008 				    &rm->rm_phys_col[rc->rc_devidx];
1009 				rc->rc_abd =
1010 				    abd_get_offset_struct(&rc->rc_abdstruct,
1011 				    prc->rc_abd,
1012 				    rc->rc_offset - prc->rc_offset,
1013 				    rc->rc_size);
1014 			}
1015 		}
1016 	} else {
1017 		/*
1018 		 * Allocate new abd's for the parity sectors.
1019 		 */
1020 		for (int i = 0; i < rm->rm_nrows; i++) {
1021 			raidz_row_t *rr = rm->rm_row[i];
1022 			for (int c = 0; c < rr->rr_firstdatacol; c++) {
1023 				raidz_col_t *rc = &rr->rr_col[c];
1024 				rc->rc_abd =
1025 				    abd_alloc_linear(rc->rc_size,
1026 				    B_TRUE);
1027 			}
1028 		}
1029 	}
1030 	/* init RAIDZ parity ops */
1031 	rm->rm_ops = vdev_raidz_math_get_ops();
1032 
1033 	return (rm);
1034 }
1035 
1036 struct pqr_struct {
1037 	uint64_t *p;
1038 	uint64_t *q;
1039 	uint64_t *r;
1040 };
1041 
1042 static int
1043 vdev_raidz_p_func(void *buf, size_t size, void *private)
1044 {
1045 	struct pqr_struct *pqr = private;
1046 	const uint64_t *src = buf;
1047 	int cnt = size / sizeof (src[0]);
1048 
1049 	ASSERT(pqr->p && !pqr->q && !pqr->r);
1050 
1051 	for (int i = 0; i < cnt; i++, src++, pqr->p++)
1052 		*pqr->p ^= *src;
1053 
1054 	return (0);
1055 }
1056 
1057 static int
1058 vdev_raidz_pq_func(void *buf, size_t size, void *private)
1059 {
1060 	struct pqr_struct *pqr = private;
1061 	const uint64_t *src = buf;
1062 	uint64_t mask;
1063 	int cnt = size / sizeof (src[0]);
1064 
1065 	ASSERT(pqr->p && pqr->q && !pqr->r);
1066 
1067 	for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
1068 		*pqr->p ^= *src;
1069 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
1070 		*pqr->q ^= *src;
1071 	}
1072 
1073 	return (0);
1074 }
1075 
1076 static int
1077 vdev_raidz_pqr_func(void *buf, size_t size, void *private)
1078 {
1079 	struct pqr_struct *pqr = private;
1080 	const uint64_t *src = buf;
1081 	uint64_t mask;
1082 	int cnt = size / sizeof (src[0]);
1083 
1084 	ASSERT(pqr->p && pqr->q && pqr->r);
1085 
1086 	for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
1087 		*pqr->p ^= *src;
1088 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
1089 		*pqr->q ^= *src;
1090 		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
1091 		*pqr->r ^= *src;
1092 	}
1093 
1094 	return (0);
1095 }
1096 
1097 static void
1098 vdev_raidz_generate_parity_p(raidz_row_t *rr)
1099 {
1100 	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1101 
1102 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1103 		abd_t *src = rr->rr_col[c].rc_abd;
1104 
1105 		if (c == rr->rr_firstdatacol) {
1106 			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1107 		} else {
1108 			struct pqr_struct pqr = { p, NULL, NULL };
1109 			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1110 			    vdev_raidz_p_func, &pqr);
1111 		}
1112 	}
1113 }
1114 
1115 static void
1116 vdev_raidz_generate_parity_pq(raidz_row_t *rr)
1117 {
1118 	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1119 	uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1120 	uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
1121 	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1122 	    rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1123 
1124 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1125 		abd_t *src = rr->rr_col[c].rc_abd;
1126 
1127 		uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
1128 
1129 		if (c == rr->rr_firstdatacol) {
1130 			ASSERT(ccnt == pcnt || ccnt == 0);
1131 			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1132 			(void) memcpy(q, p, rr->rr_col[c].rc_size);
1133 
1134 			for (uint64_t i = ccnt; i < pcnt; i++) {
1135 				p[i] = 0;
1136 				q[i] = 0;
1137 			}
1138 		} else {
1139 			struct pqr_struct pqr = { p, q, NULL };
1140 
1141 			ASSERT(ccnt <= pcnt);
1142 			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1143 			    vdev_raidz_pq_func, &pqr);
1144 
1145 			/*
1146 			 * Treat short columns as though they are full of 0s.
1147 			 * Note that there's therefore nothing needed for P.
1148 			 */
1149 			uint64_t mask;
1150 			for (uint64_t i = ccnt; i < pcnt; i++) {
1151 				VDEV_RAIDZ_64MUL_2(q[i], mask);
1152 			}
1153 		}
1154 	}
1155 }
1156 
1157 static void
1158 vdev_raidz_generate_parity_pqr(raidz_row_t *rr)
1159 {
1160 	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1161 	uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1162 	uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd);
1163 	uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
1164 	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1165 	    rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1166 	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1167 	    rr->rr_col[VDEV_RAIDZ_R].rc_size);
1168 
1169 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1170 		abd_t *src = rr->rr_col[c].rc_abd;
1171 
1172 		uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
1173 
1174 		if (c == rr->rr_firstdatacol) {
1175 			ASSERT(ccnt == pcnt || ccnt == 0);
1176 			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1177 			(void) memcpy(q, p, rr->rr_col[c].rc_size);
1178 			(void) memcpy(r, p, rr->rr_col[c].rc_size);
1179 
1180 			for (uint64_t i = ccnt; i < pcnt; i++) {
1181 				p[i] = 0;
1182 				q[i] = 0;
1183 				r[i] = 0;
1184 			}
1185 		} else {
1186 			struct pqr_struct pqr = { p, q, r };
1187 
1188 			ASSERT(ccnt <= pcnt);
1189 			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1190 			    vdev_raidz_pqr_func, &pqr);
1191 
1192 			/*
1193 			 * Treat short columns as though they are full of 0s.
1194 			 * Note that there's therefore nothing needed for P.
1195 			 */
1196 			uint64_t mask;
1197 			for (uint64_t i = ccnt; i < pcnt; i++) {
1198 				VDEV_RAIDZ_64MUL_2(q[i], mask);
1199 				VDEV_RAIDZ_64MUL_4(r[i], mask);
1200 			}
1201 		}
1202 	}
1203 }
1204 
1205 /*
1206  * Generate RAID parity in the first virtual columns according to the number of
1207  * parity columns available.
1208  */
1209 void
1210 vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr)
1211 {
1212 	if (rr->rr_cols == 0) {
1213 		/*
1214 		 * We are handling this block one row at a time (because
1215 		 * this block has a different logical vs physical width,
1216 		 * due to RAIDZ expansion), and this is a pad-only row,
1217 		 * which has no parity.
1218 		 */
1219 		return;
1220 	}
1221 
1222 	/* Generate using the new math implementation */
1223 	if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL)
1224 		return;
1225 
1226 	switch (rr->rr_firstdatacol) {
1227 	case 1:
1228 		vdev_raidz_generate_parity_p(rr);
1229 		break;
1230 	case 2:
1231 		vdev_raidz_generate_parity_pq(rr);
1232 		break;
1233 	case 3:
1234 		vdev_raidz_generate_parity_pqr(rr);
1235 		break;
1236 	default:
1237 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
1238 	}
1239 }
1240 
1241 void
1242 vdev_raidz_generate_parity(raidz_map_t *rm)
1243 {
1244 	for (int i = 0; i < rm->rm_nrows; i++) {
1245 		raidz_row_t *rr = rm->rm_row[i];
1246 		vdev_raidz_generate_parity_row(rm, rr);
1247 	}
1248 }
1249 
1250 static int
1251 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
1252 {
1253 	(void) private;
1254 	uint64_t *dst = dbuf;
1255 	uint64_t *src = sbuf;
1256 	int cnt = size / sizeof (src[0]);
1257 
1258 	for (int i = 0; i < cnt; i++) {
1259 		dst[i] ^= src[i];
1260 	}
1261 
1262 	return (0);
1263 }
1264 
1265 static int
1266 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
1267     void *private)
1268 {
1269 	(void) private;
1270 	uint64_t *dst = dbuf;
1271 	uint64_t *src = sbuf;
1272 	uint64_t mask;
1273 	int cnt = size / sizeof (dst[0]);
1274 
1275 	for (int i = 0; i < cnt; i++, dst++, src++) {
1276 		VDEV_RAIDZ_64MUL_2(*dst, mask);
1277 		*dst ^= *src;
1278 	}
1279 
1280 	return (0);
1281 }
1282 
1283 static int
1284 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
1285 {
1286 	(void) private;
1287 	uint64_t *dst = buf;
1288 	uint64_t mask;
1289 	int cnt = size / sizeof (dst[0]);
1290 
1291 	for (int i = 0; i < cnt; i++, dst++) {
1292 		/* same operation as vdev_raidz_reconst_q_pre_func() on dst */
1293 		VDEV_RAIDZ_64MUL_2(*dst, mask);
1294 	}
1295 
1296 	return (0);
1297 }
1298 
1299 struct reconst_q_struct {
1300 	uint64_t *q;
1301 	int exp;
1302 };
1303 
1304 static int
1305 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
1306 {
1307 	struct reconst_q_struct *rq = private;
1308 	uint64_t *dst = buf;
1309 	int cnt = size / sizeof (dst[0]);
1310 
1311 	for (int i = 0; i < cnt; i++, dst++, rq->q++) {
1312 		int j;
1313 		uint8_t *b;
1314 
1315 		*dst ^= *rq->q;
1316 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
1317 			*b = vdev_raidz_exp2(*b, rq->exp);
1318 		}
1319 	}
1320 
1321 	return (0);
1322 }
1323 
1324 struct reconst_pq_struct {
1325 	uint8_t *p;
1326 	uint8_t *q;
1327 	uint8_t *pxy;
1328 	uint8_t *qxy;
1329 	int aexp;
1330 	int bexp;
1331 };
1332 
1333 static int
1334 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
1335 {
1336 	struct reconst_pq_struct *rpq = private;
1337 	uint8_t *xd = xbuf;
1338 	uint8_t *yd = ybuf;
1339 
1340 	for (int i = 0; i < size;
1341 	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
1342 		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
1343 		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
1344 		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
1345 	}
1346 
1347 	return (0);
1348 }
1349 
1350 static int
1351 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
1352 {
1353 	struct reconst_pq_struct *rpq = private;
1354 	uint8_t *xd = xbuf;
1355 
1356 	for (int i = 0; i < size;
1357 	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
1358 		/* same operation as vdev_raidz_reconst_pq_func() on xd */
1359 		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
1360 		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
1361 	}
1362 
1363 	return (0);
1364 }
1365 
1366 static void
1367 vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts)
1368 {
1369 	int x = tgts[0];
1370 	abd_t *dst, *src;
1371 
1372 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1373 		zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x);
1374 
1375 	ASSERT3U(ntgts, ==, 1);
1376 	ASSERT3U(x, >=, rr->rr_firstdatacol);
1377 	ASSERT3U(x, <, rr->rr_cols);
1378 
1379 	ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size);
1380 
1381 	src = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
1382 	dst = rr->rr_col[x].rc_abd;
1383 
1384 	abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size);
1385 
1386 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1387 		uint64_t size = MIN(rr->rr_col[x].rc_size,
1388 		    rr->rr_col[c].rc_size);
1389 
1390 		src = rr->rr_col[c].rc_abd;
1391 
1392 		if (c == x)
1393 			continue;
1394 
1395 		(void) abd_iterate_func2(dst, src, 0, 0, size,
1396 		    vdev_raidz_reconst_p_func, NULL);
1397 	}
1398 }
1399 
1400 static void
1401 vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
1402 {
1403 	int x = tgts[0];
1404 	int c, exp;
1405 	abd_t *dst, *src;
1406 
1407 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1408 		zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x);
1409 
1410 	ASSERT(ntgts == 1);
1411 
1412 	ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1413 
1414 	for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1415 		uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size,
1416 		    rr->rr_col[c].rc_size);
1417 
1418 		src = rr->rr_col[c].rc_abd;
1419 		dst = rr->rr_col[x].rc_abd;
1420 
1421 		if (c == rr->rr_firstdatacol) {
1422 			abd_copy(dst, src, size);
1423 			if (rr->rr_col[x].rc_size > size) {
1424 				abd_zero_off(dst, size,
1425 				    rr->rr_col[x].rc_size - size);
1426 			}
1427 		} else {
1428 			ASSERT3U(size, <=, rr->rr_col[x].rc_size);
1429 			(void) abd_iterate_func2(dst, src, 0, 0, size,
1430 			    vdev_raidz_reconst_q_pre_func, NULL);
1431 			(void) abd_iterate_func(dst,
1432 			    size, rr->rr_col[x].rc_size - size,
1433 			    vdev_raidz_reconst_q_pre_tail_func, NULL);
1434 		}
1435 	}
1436 
1437 	src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
1438 	dst = rr->rr_col[x].rc_abd;
1439 	exp = 255 - (rr->rr_cols - 1 - x);
1440 
1441 	struct reconst_q_struct rq = { abd_to_buf(src), exp };
1442 	(void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size,
1443 	    vdev_raidz_reconst_q_post_func, &rq);
1444 }
1445 
1446 static void
1447 vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
1448 {
1449 	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
1450 	abd_t *pdata, *qdata;
1451 	uint64_t xsize, ysize;
1452 	int x = tgts[0];
1453 	int y = tgts[1];
1454 	abd_t *xd, *yd;
1455 
1456 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1457 		zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y);
1458 
1459 	ASSERT(ntgts == 2);
1460 	ASSERT(x < y);
1461 	ASSERT(x >= rr->rr_firstdatacol);
1462 	ASSERT(y < rr->rr_cols);
1463 
1464 	ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size);
1465 
1466 	/*
1467 	 * Move the parity data aside -- we're going to compute parity as
1468 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
1469 	 * reuse the parity generation mechanism without trashing the actual
1470 	 * parity so we make those columns appear to be full of zeros by
1471 	 * setting their lengths to zero.
1472 	 */
1473 	pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
1474 	qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
1475 	xsize = rr->rr_col[x].rc_size;
1476 	ysize = rr->rr_col[y].rc_size;
1477 
1478 	rr->rr_col[VDEV_RAIDZ_P].rc_abd =
1479 	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
1480 	rr->rr_col[VDEV_RAIDZ_Q].rc_abd =
1481 	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
1482 	rr->rr_col[x].rc_size = 0;
1483 	rr->rr_col[y].rc_size = 0;
1484 
1485 	vdev_raidz_generate_parity_pq(rr);
1486 
1487 	rr->rr_col[x].rc_size = xsize;
1488 	rr->rr_col[y].rc_size = ysize;
1489 
1490 	p = abd_to_buf(pdata);
1491 	q = abd_to_buf(qdata);
1492 	pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1493 	qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1494 	xd = rr->rr_col[x].rc_abd;
1495 	yd = rr->rr_col[y].rc_abd;
1496 
1497 	/*
1498 	 * We now have:
1499 	 *	Pxy = P + D_x + D_y
1500 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
1501 	 *
1502 	 * We can then solve for D_x:
1503 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
1504 	 * where
1505 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
1506 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
1507 	 *
1508 	 * With D_x in hand, we can easily solve for D_y:
1509 	 *	D_y = P + Pxy + D_x
1510 	 */
1511 
1512 	a = vdev_raidz_pow2[255 + x - y];
1513 	b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)];
1514 	tmp = 255 - vdev_raidz_log2[a ^ 1];
1515 
1516 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
1517 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
1518 
1519 	ASSERT3U(xsize, >=, ysize);
1520 	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
1521 
1522 	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
1523 	    vdev_raidz_reconst_pq_func, &rpq);
1524 	(void) abd_iterate_func(xd, ysize, xsize - ysize,
1525 	    vdev_raidz_reconst_pq_tail_func, &rpq);
1526 
1527 	abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1528 	abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1529 
1530 	/*
1531 	 * Restore the saved parity data.
1532 	 */
1533 	rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata;
1534 	rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;
1535 }
1536 
1537 /*
1538  * In the general case of reconstruction, we must solve the system of linear
1539  * equations defined by the coefficients used to generate parity as well as
1540  * the contents of the data and parity disks. This can be expressed with
1541  * vectors for the original data (D) and the actual data (d) and parity (p)
1542  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
1543  *
1544  *            __   __                     __     __
1545  *            |     |         __     __   |  p_0  |
1546  *            |  V  |         |  D_0  |   | p_m-1 |
1547  *            |     |    x    |   :   | = |  d_0  |
1548  *            |  I  |         | D_n-1 |   |   :   |
1549  *            |     |         ~~     ~~   | d_n-1 |
1550  *            ~~   ~~                     ~~     ~~
1551  *
1552  * I is simply a square identity matrix of size n, and V is a vandermonde
1553  * matrix defined by the coefficients we chose for the various parity columns
1554  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
1555  * computation as well as linear separability.
1556  *
1557  *      __               __               __     __
1558  *      |   1   ..  1 1 1 |               |  p_0  |
1559  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
1560  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
1561  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
1562  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
1563  *      |   :       : : : |   |   :   |   |  d_2  |
1564  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
1565  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
1566  *      |   0   ..  0 0 1 |               | d_n-1 |
1567  *      ~~               ~~               ~~     ~~
1568  *
1569  * Note that I, V, d, and p are known. To compute D, we must invert the
1570  * matrix and use the known data and parity values to reconstruct the unknown
1571  * data values. We begin by removing the rows in V|I and d|p that correspond
1572  * to failed or missing columns; we then make V|I square (n x n) and d|p
1573  * sized n by removing rows corresponding to unused parity from the bottom up
1574  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
1575  * using Gauss-Jordan elimination. In the example below we use m=3 parity
1576  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
1577  *           __                               __
1578  *           |  1   1   1   1   1   1   1   1  |
1579  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
1580  *           |  19 205 116  29  64  16  4   1  |      / /
1581  *           |  1   0   0   0   0   0   0   0  |     / /
1582  *           |  0   1   0   0   0   0   0   0  | <--' /
1583  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
1584  *           |  0   0   0   1   0   0   0   0  |
1585  *           |  0   0   0   0   1   0   0   0  |
1586  *           |  0   0   0   0   0   1   0   0  |
1587  *           |  0   0   0   0   0   0   1   0  |
1588  *           |  0   0   0   0   0   0   0   1  |
1589  *           ~~                               ~~
1590  *           __                               __
1591  *           |  1   1   1   1   1   1   1   1  |
1592  *           | 128  64  32  16  8   4   2   1  |
1593  *           |  19 205 116  29  64  16  4   1  |
1594  *           |  1   0   0   0   0   0   0   0  |
1595  *           |  0   1   0   0   0   0   0   0  |
1596  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
1597  *           |  0   0   0   1   0   0   0   0  |
1598  *           |  0   0   0   0   1   0   0   0  |
1599  *           |  0   0   0   0   0   1   0   0  |
1600  *           |  0   0   0   0   0   0   1   0  |
1601  *           |  0   0   0   0   0   0   0   1  |
1602  *           ~~                               ~~
1603  *
1604  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1605  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1606  * matrix is not singular.
1607  * __                                                                 __
1608  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1609  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1610  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1611  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1612  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1613  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1614  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1615  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1616  * ~~                                                                 ~~
1617  * __                                                                 __
1618  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1619  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1620  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1621  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1622  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1623  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1624  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1625  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1626  * ~~                                                                 ~~
1627  * __                                                                 __
1628  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1629  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1630  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1631  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1632  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1633  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1634  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1635  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1636  * ~~                                                                 ~~
1637  * __                                                                 __
1638  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1639  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1640  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1641  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1642  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1643  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1644  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1645  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1646  * ~~                                                                 ~~
1647  * __                                                                 __
1648  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1649  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1650  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1651  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1652  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1653  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1654  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1655  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1656  * ~~                                                                 ~~
1657  * __                                                                 __
1658  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1659  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1660  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1661  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1662  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1663  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1664  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1665  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1666  * ~~                                                                 ~~
1667  *                   __                               __
1668  *                   |  0   0   1   0   0   0   0   0  |
1669  *                   | 167 100  5   41 159 169 217 208 |
1670  *                   | 166 100  4   40 158 168 216 209 |
1671  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1672  *                   |  0   0   0   0   1   0   0   0  |
1673  *                   |  0   0   0   0   0   1   0   0  |
1674  *                   |  0   0   0   0   0   0   1   0  |
1675  *                   |  0   0   0   0   0   0   0   1  |
1676  *                   ~~                               ~~
1677  *
1678  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1679  * of the missing data.
1680  *
1681  * As is apparent from the example above, the only non-trivial rows in the
1682  * inverse matrix correspond to the data disks that we're trying to
1683  * reconstruct. Indeed, those are the only rows we need as the others would
1684  * only be useful for reconstructing data known or assumed to be valid. For
1685  * that reason, we only build the coefficients in the rows that correspond to
1686  * targeted columns.
1687  */
1688 
1689 static void
1690 vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map,
1691     uint8_t **rows)
1692 {
1693 	int i, j;
1694 	int pow;
1695 
1696 	ASSERT(n == rr->rr_cols - rr->rr_firstdatacol);
1697 
1698 	/*
1699 	 * Fill in the missing rows of interest.
1700 	 */
1701 	for (i = 0; i < nmap; i++) {
1702 		ASSERT3S(0, <=, map[i]);
1703 		ASSERT3S(map[i], <=, 2);
1704 
1705 		pow = map[i] * n;
1706 		if (pow > 255)
1707 			pow -= 255;
1708 		ASSERT(pow <= 255);
1709 
1710 		for (j = 0; j < n; j++) {
1711 			pow -= map[i];
1712 			if (pow < 0)
1713 				pow += 255;
1714 			rows[i][j] = vdev_raidz_pow2[pow];
1715 		}
1716 	}
1717 }
1718 
1719 static void
1720 vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing,
1721     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1722 {
1723 	int i, j, ii, jj;
1724 	uint8_t log;
1725 
1726 	/*
1727 	 * Assert that the first nmissing entries from the array of used
1728 	 * columns correspond to parity columns and that subsequent entries
1729 	 * correspond to data columns.
1730 	 */
1731 	for (i = 0; i < nmissing; i++) {
1732 		ASSERT3S(used[i], <, rr->rr_firstdatacol);
1733 	}
1734 	for (; i < n; i++) {
1735 		ASSERT3S(used[i], >=, rr->rr_firstdatacol);
1736 	}
1737 
1738 	/*
1739 	 * First initialize the storage where we'll compute the inverse rows.
1740 	 */
1741 	for (i = 0; i < nmissing; i++) {
1742 		for (j = 0; j < n; j++) {
1743 			invrows[i][j] = (i == j) ? 1 : 0;
1744 		}
1745 	}
1746 
1747 	/*
1748 	 * Subtract all trivial rows from the rows of consequence.
1749 	 */
1750 	for (i = 0; i < nmissing; i++) {
1751 		for (j = nmissing; j < n; j++) {
1752 			ASSERT3U(used[j], >=, rr->rr_firstdatacol);
1753 			jj = used[j] - rr->rr_firstdatacol;
1754 			ASSERT3S(jj, <, n);
1755 			invrows[i][j] = rows[i][jj];
1756 			rows[i][jj] = 0;
1757 		}
1758 	}
1759 
1760 	/*
1761 	 * For each of the rows of interest, we must normalize it and subtract
1762 	 * a multiple of it from the other rows.
1763 	 */
1764 	for (i = 0; i < nmissing; i++) {
1765 		for (j = 0; j < missing[i]; j++) {
1766 			ASSERT0(rows[i][j]);
1767 		}
1768 		ASSERT3U(rows[i][missing[i]], !=, 0);
1769 
1770 		/*
1771 		 * Compute the inverse of the first element and multiply each
1772 		 * element in the row by that value.
1773 		 */
1774 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1775 
1776 		for (j = 0; j < n; j++) {
1777 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1778 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1779 		}
1780 
1781 		for (ii = 0; ii < nmissing; ii++) {
1782 			if (i == ii)
1783 				continue;
1784 
1785 			ASSERT3U(rows[ii][missing[i]], !=, 0);
1786 
1787 			log = vdev_raidz_log2[rows[ii][missing[i]]];
1788 
1789 			for (j = 0; j < n; j++) {
1790 				rows[ii][j] ^=
1791 				    vdev_raidz_exp2(rows[i][j], log);
1792 				invrows[ii][j] ^=
1793 				    vdev_raidz_exp2(invrows[i][j], log);
1794 			}
1795 		}
1796 	}
1797 
1798 	/*
1799 	 * Verify that the data that is left in the rows are properly part of
1800 	 * an identity matrix.
1801 	 */
1802 	for (i = 0; i < nmissing; i++) {
1803 		for (j = 0; j < n; j++) {
1804 			if (j == missing[i]) {
1805 				ASSERT3U(rows[i][j], ==, 1);
1806 			} else {
1807 				ASSERT0(rows[i][j]);
1808 			}
1809 		}
1810 	}
1811 }
1812 
1813 static void
1814 vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
1815     int *missing, uint8_t **invrows, const uint8_t *used)
1816 {
1817 	int i, j, x, cc, c;
1818 	uint8_t *src;
1819 	uint64_t ccount;
1820 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL };
1821 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 };
1822 	uint8_t log = 0;
1823 	uint8_t val;
1824 	int ll;
1825 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1826 	uint8_t *p, *pp;
1827 	size_t psize;
1828 
1829 	psize = sizeof (invlog[0][0]) * n * nmissing;
1830 	p = kmem_alloc(psize, KM_SLEEP);
1831 
1832 	for (pp = p, i = 0; i < nmissing; i++) {
1833 		invlog[i] = pp;
1834 		pp += n;
1835 	}
1836 
1837 	for (i = 0; i < nmissing; i++) {
1838 		for (j = 0; j < n; j++) {
1839 			ASSERT3U(invrows[i][j], !=, 0);
1840 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1841 		}
1842 	}
1843 
1844 	for (i = 0; i < n; i++) {
1845 		c = used[i];
1846 		ASSERT3U(c, <, rr->rr_cols);
1847 
1848 		ccount = rr->rr_col[c].rc_size;
1849 		ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0);
1850 		if (ccount == 0)
1851 			continue;
1852 		src = abd_to_buf(rr->rr_col[c].rc_abd);
1853 		for (j = 0; j < nmissing; j++) {
1854 			cc = missing[j] + rr->rr_firstdatacol;
1855 			ASSERT3U(cc, >=, rr->rr_firstdatacol);
1856 			ASSERT3U(cc, <, rr->rr_cols);
1857 			ASSERT3U(cc, !=, c);
1858 
1859 			dcount[j] = rr->rr_col[cc].rc_size;
1860 			if (dcount[j] != 0)
1861 				dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd);
1862 		}
1863 
1864 		for (x = 0; x < ccount; x++, src++) {
1865 			if (*src != 0)
1866 				log = vdev_raidz_log2[*src];
1867 
1868 			for (cc = 0; cc < nmissing; cc++) {
1869 				if (x >= dcount[cc])
1870 					continue;
1871 
1872 				if (*src == 0) {
1873 					val = 0;
1874 				} else {
1875 					if ((ll = log + invlog[cc][i]) >= 255)
1876 						ll -= 255;
1877 					val = vdev_raidz_pow2[ll];
1878 				}
1879 
1880 				if (i == 0)
1881 					dst[cc][x] = val;
1882 				else
1883 					dst[cc][x] ^= val;
1884 			}
1885 		}
1886 	}
1887 
1888 	kmem_free(p, psize);
1889 }
1890 
1891 static void
1892 vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
1893 {
1894 	int n, i, c, t, tt;
1895 	int nmissing_rows;
1896 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1897 	int parity_map[VDEV_RAIDZ_MAXPARITY];
1898 	uint8_t *p, *pp;
1899 	size_t psize;
1900 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1901 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1902 	uint8_t *used;
1903 
1904 	abd_t **bufs = NULL;
1905 
1906 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1907 		zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts);
1908 	/*
1909 	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
1910 	 * temporary linear ABDs if any non-linear ABDs are found.
1911 	 */
1912 	for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) {
1913 		ASSERT(rr->rr_col[i].rc_abd != NULL);
1914 		if (!abd_is_linear(rr->rr_col[i].rc_abd)) {
1915 			bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *),
1916 			    KM_PUSHPAGE);
1917 
1918 			for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1919 				raidz_col_t *col = &rr->rr_col[c];
1920 
1921 				bufs[c] = col->rc_abd;
1922 				if (bufs[c] != NULL) {
1923 					col->rc_abd = abd_alloc_linear(
1924 					    col->rc_size, B_TRUE);
1925 					abd_copy(col->rc_abd, bufs[c],
1926 					    col->rc_size);
1927 				}
1928 			}
1929 
1930 			break;
1931 		}
1932 	}
1933 
1934 	n = rr->rr_cols - rr->rr_firstdatacol;
1935 
1936 	/*
1937 	 * Figure out which data columns are missing.
1938 	 */
1939 	nmissing_rows = 0;
1940 	for (t = 0; t < ntgts; t++) {
1941 		if (tgts[t] >= rr->rr_firstdatacol) {
1942 			missing_rows[nmissing_rows++] =
1943 			    tgts[t] - rr->rr_firstdatacol;
1944 		}
1945 	}
1946 
1947 	/*
1948 	 * Figure out which parity columns to use to help generate the missing
1949 	 * data columns.
1950 	 */
1951 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1952 		ASSERT(tt < ntgts);
1953 		ASSERT(c < rr->rr_firstdatacol);
1954 
1955 		/*
1956 		 * Skip any targeted parity columns.
1957 		 */
1958 		if (c == tgts[tt]) {
1959 			tt++;
1960 			continue;
1961 		}
1962 
1963 		parity_map[i] = c;
1964 		i++;
1965 	}
1966 
1967 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1968 	    nmissing_rows * n + sizeof (used[0]) * n;
1969 	p = kmem_alloc(psize, KM_SLEEP);
1970 
1971 	for (pp = p, i = 0; i < nmissing_rows; i++) {
1972 		rows[i] = pp;
1973 		pp += n;
1974 		invrows[i] = pp;
1975 		pp += n;
1976 	}
1977 	used = pp;
1978 
1979 	for (i = 0; i < nmissing_rows; i++) {
1980 		used[i] = parity_map[i];
1981 	}
1982 
1983 	for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1984 		if (tt < nmissing_rows &&
1985 		    c == missing_rows[tt] + rr->rr_firstdatacol) {
1986 			tt++;
1987 			continue;
1988 		}
1989 
1990 		ASSERT3S(i, <, n);
1991 		used[i] = c;
1992 		i++;
1993 	}
1994 
1995 	/*
1996 	 * Initialize the interesting rows of the matrix.
1997 	 */
1998 	vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows);
1999 
2000 	/*
2001 	 * Invert the matrix.
2002 	 */
2003 	vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows,
2004 	    invrows, used);
2005 
2006 	/*
2007 	 * Reconstruct the missing data using the generated matrix.
2008 	 */
2009 	vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows,
2010 	    invrows, used);
2011 
2012 	kmem_free(p, psize);
2013 
2014 	/*
2015 	 * copy back from temporary linear abds and free them
2016 	 */
2017 	if (bufs) {
2018 		for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
2019 			raidz_col_t *col = &rr->rr_col[c];
2020 
2021 			if (bufs[c] != NULL) {
2022 				abd_copy(bufs[c], col->rc_abd, col->rc_size);
2023 				abd_free(col->rc_abd);
2024 			}
2025 			col->rc_abd = bufs[c];
2026 		}
2027 		kmem_free(bufs, rr->rr_cols * sizeof (abd_t *));
2028 	}
2029 }
2030 
2031 static void
2032 vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
2033     const int *t, int nt)
2034 {
2035 	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
2036 	int ntgts;
2037 	int i, c, ret;
2038 	int nbadparity, nbaddata;
2039 	int parity_valid[VDEV_RAIDZ_MAXPARITY];
2040 
2041 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
2042 		zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)",
2043 		    rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata,
2044 		    (int)rr->rr_missingparity);
2045 	}
2046 
2047 	nbadparity = rr->rr_firstdatacol;
2048 	nbaddata = rr->rr_cols - nbadparity;
2049 	ntgts = 0;
2050 	for (i = 0, c = 0; c < rr->rr_cols; c++) {
2051 		if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
2052 			zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u "
2053 			    "offset=%llx error=%u)",
2054 			    rr, c, (int)rr->rr_col[c].rc_devidx,
2055 			    (long long)rr->rr_col[c].rc_offset,
2056 			    (int)rr->rr_col[c].rc_error);
2057 		}
2058 		if (c < rr->rr_firstdatacol)
2059 			parity_valid[c] = B_FALSE;
2060 
2061 		if (i < nt && c == t[i]) {
2062 			tgts[ntgts++] = c;
2063 			i++;
2064 		} else if (rr->rr_col[c].rc_error != 0) {
2065 			tgts[ntgts++] = c;
2066 		} else if (c >= rr->rr_firstdatacol) {
2067 			nbaddata--;
2068 		} else {
2069 			parity_valid[c] = B_TRUE;
2070 			nbadparity--;
2071 		}
2072 	}
2073 
2074 	ASSERT(ntgts >= nt);
2075 	ASSERT(nbaddata >= 0);
2076 	ASSERT(nbaddata + nbadparity == ntgts);
2077 
2078 	dt = &tgts[nbadparity];
2079 
2080 	/* Reconstruct using the new math implementation */
2081 	ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata);
2082 	if (ret != RAIDZ_ORIGINAL_IMPL)
2083 		return;
2084 
2085 	/*
2086 	 * See if we can use any of our optimized reconstruction routines.
2087 	 */
2088 	switch (nbaddata) {
2089 	case 1:
2090 		if (parity_valid[VDEV_RAIDZ_P]) {
2091 			vdev_raidz_reconstruct_p(rr, dt, 1);
2092 			return;
2093 		}
2094 
2095 		ASSERT(rr->rr_firstdatacol > 1);
2096 
2097 		if (parity_valid[VDEV_RAIDZ_Q]) {
2098 			vdev_raidz_reconstruct_q(rr, dt, 1);
2099 			return;
2100 		}
2101 
2102 		ASSERT(rr->rr_firstdatacol > 2);
2103 		break;
2104 
2105 	case 2:
2106 		ASSERT(rr->rr_firstdatacol > 1);
2107 
2108 		if (parity_valid[VDEV_RAIDZ_P] &&
2109 		    parity_valid[VDEV_RAIDZ_Q]) {
2110 			vdev_raidz_reconstruct_pq(rr, dt, 2);
2111 			return;
2112 		}
2113 
2114 		ASSERT(rr->rr_firstdatacol > 2);
2115 
2116 		break;
2117 	}
2118 
2119 	vdev_raidz_reconstruct_general(rr, tgts, ntgts);
2120 }
2121 
2122 static int
2123 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
2124     uint64_t *logical_ashift, uint64_t *physical_ashift)
2125 {
2126 	vdev_raidz_t *vdrz = vd->vdev_tsd;
2127 	uint64_t nparity = vdrz->vd_nparity;
2128 	int c;
2129 	int lasterror = 0;
2130 	int numerrors = 0;
2131 
2132 	ASSERT(nparity > 0);
2133 
2134 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
2135 	    vd->vdev_children < nparity + 1) {
2136 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
2137 		return (SET_ERROR(EINVAL));
2138 	}
2139 
2140 	vdev_open_children(vd);
2141 
2142 	for (c = 0; c < vd->vdev_children; c++) {
2143 		vdev_t *cvd = vd->vdev_child[c];
2144 
2145 		if (cvd->vdev_open_error != 0) {
2146 			lasterror = cvd->vdev_open_error;
2147 			numerrors++;
2148 			continue;
2149 		}
2150 
2151 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
2152 		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
2153 		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
2154 	}
2155 	for (c = 0; c < vd->vdev_children; c++) {
2156 		vdev_t *cvd = vd->vdev_child[c];
2157 
2158 		if (cvd->vdev_open_error != 0)
2159 			continue;
2160 		*physical_ashift = vdev_best_ashift(*logical_ashift,
2161 		    *physical_ashift, cvd->vdev_physical_ashift);
2162 	}
2163 
2164 	if (vd->vdev_rz_expanding) {
2165 		*asize *= vd->vdev_children - 1;
2166 		*max_asize *= vd->vdev_children - 1;
2167 
2168 		vd->vdev_min_asize = *asize;
2169 	} else {
2170 		*asize *= vd->vdev_children;
2171 		*max_asize *= vd->vdev_children;
2172 	}
2173 
2174 	if (numerrors > nparity) {
2175 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
2176 		return (lasterror);
2177 	}
2178 
2179 	return (0);
2180 }
2181 
2182 static void
2183 vdev_raidz_close(vdev_t *vd)
2184 {
2185 	for (int c = 0; c < vd->vdev_children; c++) {
2186 		if (vd->vdev_child[c] != NULL)
2187 			vdev_close(vd->vdev_child[c]);
2188 	}
2189 }
2190 
2191 /*
2192  * Return the logical width to use, given the txg in which the allocation
2193  * happened.  Note that BP_GET_BIRTH() is usually the txg in which the
2194  * BP was allocated.  Remapped BP's (that were relocated due to device
2195  * removal, see remap_blkptr_cb()), will have a more recent physical birth
2196  * which reflects when the BP was relocated, but we can ignore these because
2197  * they can't be on RAIDZ (device removal doesn't support RAIDZ).
2198  */
2199 static uint64_t
2200 vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg)
2201 {
2202 	reflow_node_t lookup = {
2203 		.re_txg = txg,
2204 	};
2205 	avl_index_t where;
2206 
2207 	uint64_t width;
2208 	mutex_enter(&vdrz->vd_expand_lock);
2209 	reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where);
2210 	if (re != NULL) {
2211 		width = re->re_logical_width;
2212 	} else {
2213 		re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE);
2214 		if (re != NULL)
2215 			width = re->re_logical_width;
2216 		else
2217 			width = vdrz->vd_original_width;
2218 	}
2219 	mutex_exit(&vdrz->vd_expand_lock);
2220 	return (width);
2221 }
2222 
2223 /*
2224  * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated
2225  * more space due to the lower data-to-parity ratio.  In this case it's
2226  * important to pass in the correct txg.  Note that vdev_gang_header_asize()
2227  * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE,
2228  * regardless of txg.  This is assured because for a single data sector, we
2229  * allocate P+1 sectors regardless of width ("cols", which is at least P+1).
2230  */
2231 static uint64_t
2232 vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
2233 {
2234 	vdev_raidz_t *vdrz = vd->vdev_tsd;
2235 	uint64_t asize;
2236 	uint64_t ashift = vd->vdev_top->vdev_ashift;
2237 	uint64_t cols = vdrz->vd_original_width;
2238 	uint64_t nparity = vdrz->vd_nparity;
2239 
2240 	cols = vdev_raidz_get_logical_width(vdrz, txg);
2241 
2242 	asize = ((psize - 1) >> ashift) + 1;
2243 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
2244 	asize = roundup(asize, nparity + 1) << ashift;
2245 
2246 #ifdef ZFS_DEBUG
2247 	uint64_t asize_new = ((psize - 1) >> ashift) + 1;
2248 	uint64_t ncols_new = vdrz->vd_physical_width;
2249 	asize_new += nparity * ((asize_new + ncols_new - nparity - 1) /
2250 	    (ncols_new - nparity));
2251 	asize_new = roundup(asize_new, nparity + 1) << ashift;
2252 	VERIFY3U(asize_new, <=, asize);
2253 #endif
2254 
2255 	return (asize);
2256 }
2257 
2258 /*
2259  * The allocatable space for a raidz vdev is N * sizeof(smallest child)
2260  * so each child must provide at least 1/Nth of its asize.
2261  */
2262 static uint64_t
2263 vdev_raidz_min_asize(vdev_t *vd)
2264 {
2265 	return ((vd->vdev_min_asize + vd->vdev_children - 1) /
2266 	    vd->vdev_children);
2267 }
2268 
2269 void
2270 vdev_raidz_child_done(zio_t *zio)
2271 {
2272 	raidz_col_t *rc = zio->io_private;
2273 
2274 	ASSERT3P(rc->rc_abd, !=, NULL);
2275 	rc->rc_error = zio->io_error;
2276 	rc->rc_tried = 1;
2277 	rc->rc_skipped = 0;
2278 }
2279 
2280 static void
2281 vdev_raidz_shadow_child_done(zio_t *zio)
2282 {
2283 	raidz_col_t *rc = zio->io_private;
2284 
2285 	rc->rc_shadow_error = zio->io_error;
2286 }
2287 
2288 static void
2289 vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col)
2290 {
2291 	(void) rm;
2292 #ifdef ZFS_DEBUG
2293 	range_seg64_t logical_rs, physical_rs, remain_rs;
2294 	logical_rs.rs_start = rr->rr_offset;
2295 	logical_rs.rs_end = logical_rs.rs_start +
2296 	    vdev_raidz_asize(zio->io_vd, rr->rr_size,
2297 	    BP_GET_BIRTH(zio->io_bp));
2298 
2299 	raidz_col_t *rc = &rr->rr_col[col];
2300 	vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
2301 
2302 	vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
2303 	ASSERT(vdev_xlate_is_empty(&remain_rs));
2304 	if (vdev_xlate_is_empty(&physical_rs)) {
2305 		/*
2306 		 * If we are in the middle of expansion, the
2307 		 * physical->logical mapping is changing so vdev_xlate()
2308 		 * can't give us a reliable answer.
2309 		 */
2310 		return;
2311 	}
2312 	ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
2313 	ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
2314 	/*
2315 	 * It would be nice to assert that rs_end is equal
2316 	 * to rc_offset + rc_size but there might be an
2317 	 * optional I/O at the end that is not accounted in
2318 	 * rc_size.
2319 	 */
2320 	if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
2321 		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
2322 		    rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift));
2323 	} else {
2324 		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
2325 	}
2326 #endif
2327 }
2328 
2329 static void
2330 vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr)
2331 {
2332 	vdev_t *vd = zio->io_vd;
2333 	raidz_map_t *rm = zio->io_vsd;
2334 
2335 	vdev_raidz_generate_parity_row(rm, rr);
2336 
2337 	for (int c = 0; c < rr->rr_scols; c++) {
2338 		raidz_col_t *rc = &rr->rr_col[c];
2339 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2340 
2341 		/* Verify physical to logical translation */
2342 		vdev_raidz_io_verify(zio, rm, rr, c);
2343 
2344 		if (rc->rc_size == 0)
2345 			continue;
2346 
2347 		ASSERT3U(rc->rc_offset + rc->rc_size, <,
2348 		    cvd->vdev_psize - VDEV_LABEL_END_SIZE);
2349 
2350 		ASSERT3P(rc->rc_abd, !=, NULL);
2351 		zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2352 		    rc->rc_offset, rc->rc_abd,
2353 		    abd_get_size(rc->rc_abd), zio->io_type,
2354 		    zio->io_priority, 0, vdev_raidz_child_done, rc));
2355 
2356 		if (rc->rc_shadow_devidx != INT_MAX) {
2357 			vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx];
2358 
2359 			ASSERT3U(
2360 			    rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <,
2361 			    cvd2->vdev_psize - VDEV_LABEL_END_SIZE);
2362 
2363 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd2,
2364 			    rc->rc_shadow_offset, rc->rc_abd,
2365 			    abd_get_size(rc->rc_abd),
2366 			    zio->io_type, zio->io_priority, 0,
2367 			    vdev_raidz_shadow_child_done, rc));
2368 		}
2369 	}
2370 }
2371 
2372 /*
2373  * Generate optional I/Os for skip sectors to improve aggregation contiguity.
2374  * This only works for vdev_raidz_map_alloc() (not _expanded()).
2375  */
2376 static void
2377 raidz_start_skip_writes(zio_t *zio)
2378 {
2379 	vdev_t *vd = zio->io_vd;
2380 	uint64_t ashift = vd->vdev_top->vdev_ashift;
2381 	raidz_map_t *rm = zio->io_vsd;
2382 	ASSERT3U(rm->rm_nrows, ==, 1);
2383 	raidz_row_t *rr = rm->rm_row[0];
2384 	for (int c = 0; c < rr->rr_scols; c++) {
2385 		raidz_col_t *rc = &rr->rr_col[c];
2386 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2387 		if (rc->rc_size != 0)
2388 			continue;
2389 		ASSERT3P(rc->rc_abd, ==, NULL);
2390 
2391 		ASSERT3U(rc->rc_offset, <,
2392 		    cvd->vdev_psize - VDEV_LABEL_END_SIZE);
2393 
2394 		zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset,
2395 		    NULL, 1ULL << ashift, zio->io_type, zio->io_priority,
2396 		    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
2397 	}
2398 }
2399 
2400 static void
2401 vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
2402 {
2403 	vdev_t *vd = zio->io_vd;
2404 
2405 	/*
2406 	 * Iterate over the columns in reverse order so that we hit the parity
2407 	 * last -- any errors along the way will force us to read the parity.
2408 	 */
2409 	for (int c = rr->rr_cols - 1; c >= 0; c--) {
2410 		raidz_col_t *rc = &rr->rr_col[c];
2411 		if (rc->rc_size == 0)
2412 			continue;
2413 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2414 		if (!vdev_readable(cvd)) {
2415 			if (c >= rr->rr_firstdatacol)
2416 				rr->rr_missingdata++;
2417 			else
2418 				rr->rr_missingparity++;
2419 			rc->rc_error = SET_ERROR(ENXIO);
2420 			rc->rc_tried = 1;	/* don't even try */
2421 			rc->rc_skipped = 1;
2422 			continue;
2423 		}
2424 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2425 			if (c >= rr->rr_firstdatacol)
2426 				rr->rr_missingdata++;
2427 			else
2428 				rr->rr_missingparity++;
2429 			rc->rc_error = SET_ERROR(ESTALE);
2430 			rc->rc_skipped = 1;
2431 			continue;
2432 		}
2433 		if (forceparity ||
2434 		    c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
2435 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
2436 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2437 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
2438 			    zio->io_type, zio->io_priority, 0,
2439 			    vdev_raidz_child_done, rc));
2440 		}
2441 	}
2442 }
2443 
2444 static void
2445 vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm)
2446 {
2447 	vdev_t *vd = zio->io_vd;
2448 
2449 	for (int i = 0; i < rm->rm_nphys_cols; i++) {
2450 		raidz_col_t *prc = &rm->rm_phys_col[i];
2451 		if (prc->rc_size == 0)
2452 			continue;
2453 
2454 		ASSERT3U(prc->rc_devidx, ==, i);
2455 		vdev_t *cvd = vd->vdev_child[i];
2456 		if (!vdev_readable(cvd)) {
2457 			prc->rc_error = SET_ERROR(ENXIO);
2458 			prc->rc_tried = 1;	/* don't even try */
2459 			prc->rc_skipped = 1;
2460 			continue;
2461 		}
2462 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2463 			prc->rc_error = SET_ERROR(ESTALE);
2464 			prc->rc_skipped = 1;
2465 			continue;
2466 		}
2467 		zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2468 		    prc->rc_offset, prc->rc_abd, prc->rc_size,
2469 		    zio->io_type, zio->io_priority, 0,
2470 		    vdev_raidz_child_done, prc));
2471 	}
2472 }
2473 
2474 static void
2475 vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm)
2476 {
2477 	/*
2478 	 * If there are multiple rows, we will be hitting
2479 	 * all disks, so go ahead and read the parity so
2480 	 * that we are reading in decent size chunks.
2481 	 */
2482 	boolean_t forceparity = rm->rm_nrows > 1;
2483 
2484 	if (rm->rm_phys_col) {
2485 		vdev_raidz_io_start_read_phys_cols(zio, rm);
2486 	} else {
2487 		for (int i = 0; i < rm->rm_nrows; i++) {
2488 			raidz_row_t *rr = rm->rm_row[i];
2489 			vdev_raidz_io_start_read_row(zio, rr, forceparity);
2490 		}
2491 	}
2492 }
2493 
2494 /*
2495  * Start an IO operation on a RAIDZ VDev
2496  *
2497  * Outline:
2498  * - For write operations:
2499  *   1. Generate the parity data
2500  *   2. Create child zio write operations to each column's vdev, for both
2501  *      data and parity.
2502  *   3. If the column skips any sectors for padding, create optional dummy
2503  *      write zio children for those areas to improve aggregation continuity.
2504  * - For read operations:
2505  *   1. Create child zio read operations to each data column's vdev to read
2506  *      the range of data required for zio.
2507  *   2. If this is a scrub or resilver operation, or if any of the data
2508  *      vdevs have had errors, then create zio read operations to the parity
2509  *      columns' VDevs as well.
2510  */
2511 static void
2512 vdev_raidz_io_start(zio_t *zio)
2513 {
2514 	vdev_t *vd = zio->io_vd;
2515 	vdev_t *tvd = vd->vdev_top;
2516 	vdev_raidz_t *vdrz = vd->vdev_tsd;
2517 	raidz_map_t *rm;
2518 
2519 	uint64_t logical_width = vdev_raidz_get_logical_width(vdrz,
2520 	    BP_GET_BIRTH(zio->io_bp));
2521 	if (logical_width != vdrz->vd_physical_width) {
2522 		zfs_locked_range_t *lr = NULL;
2523 		uint64_t synced_offset = UINT64_MAX;
2524 		uint64_t next_offset = UINT64_MAX;
2525 		boolean_t use_scratch = B_FALSE;
2526 		/*
2527 		 * Note: when the expansion is completing, we set
2528 		 * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync())
2529 		 * in a later txg than when we last update spa_ubsync's state
2530 		 * (see the end of spa_raidz_expand_thread()).  Therefore we
2531 		 * may see vre_state!=SCANNING before
2532 		 * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected
2533 		 * on disk, but the copying progress has been synced to disk
2534 		 * (and reflected in spa_ubsync).  In this case it's fine to
2535 		 * treat the expansion as completed, since if we crash there's
2536 		 * no additional copying to do.
2537 		 */
2538 		if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
2539 			ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==,
2540 			    &vdrz->vn_vre);
2541 			lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock,
2542 			    zio->io_offset, zio->io_size, RL_READER);
2543 			use_scratch =
2544 			    (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) ==
2545 			    RRSS_SCRATCH_VALID);
2546 			synced_offset =
2547 			    RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync);
2548 			next_offset = vdrz->vn_vre.vre_offset;
2549 			/*
2550 			 * If we haven't resumed expanding since importing the
2551 			 * pool, vre_offset won't have been set yet.  In
2552 			 * this case the next offset to be copied is the same
2553 			 * as what was synced.
2554 			 */
2555 			if (next_offset == UINT64_MAX) {
2556 				next_offset = synced_offset;
2557 			}
2558 		}
2559 		if (use_scratch) {
2560 			zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced="
2561 			    "%lld next_offset=%lld use_scratch=%u",
2562 			    zio,
2563 			    zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ",
2564 			    (long long)zio->io_offset,
2565 			    (long long)synced_offset,
2566 			    (long long)next_offset,
2567 			    use_scratch);
2568 		}
2569 
2570 		rm = vdev_raidz_map_alloc_expanded(zio,
2571 		    tvd->vdev_ashift, vdrz->vd_physical_width,
2572 		    logical_width, vdrz->vd_nparity,
2573 		    synced_offset, next_offset, use_scratch);
2574 		rm->rm_lr = lr;
2575 	} else {
2576 		rm = vdev_raidz_map_alloc(zio,
2577 		    tvd->vdev_ashift, logical_width, vdrz->vd_nparity);
2578 	}
2579 	rm->rm_original_width = vdrz->vd_original_width;
2580 
2581 	zio->io_vsd = rm;
2582 	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
2583 	if (zio->io_type == ZIO_TYPE_WRITE) {
2584 		for (int i = 0; i < rm->rm_nrows; i++) {
2585 			vdev_raidz_io_start_write(zio, rm->rm_row[i]);
2586 		}
2587 
2588 		if (logical_width == vdrz->vd_physical_width) {
2589 			raidz_start_skip_writes(zio);
2590 		}
2591 	} else {
2592 		ASSERT(zio->io_type == ZIO_TYPE_READ);
2593 		vdev_raidz_io_start_read(zio, rm);
2594 	}
2595 
2596 	zio_execute(zio);
2597 }
2598 
2599 /*
2600  * Report a checksum error for a child of a RAID-Z device.
2601  */
2602 void
2603 vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
2604 {
2605 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
2606 
2607 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) &&
2608 	    zio->io_priority != ZIO_PRIORITY_REBUILD) {
2609 		zio_bad_cksum_t zbc;
2610 		raidz_map_t *rm = zio->io_vsd;
2611 
2612 		zbc.zbc_has_cksum = 0;
2613 		zbc.zbc_injected = rm->rm_ecksuminjected;
2614 
2615 		mutex_enter(&vd->vdev_stat_lock);
2616 		vd->vdev_stat.vs_checksum_errors++;
2617 		mutex_exit(&vd->vdev_stat_lock);
2618 		(void) zfs_ereport_post_checksum(zio->io_spa, vd,
2619 		    &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
2620 		    rc->rc_abd, bad_data, &zbc);
2621 	}
2622 }
2623 
2624 /*
2625  * We keep track of whether or not there were any injected errors, so that
2626  * any ereports we generate can note it.
2627  */
2628 static int
2629 raidz_checksum_verify(zio_t *zio)
2630 {
2631 	zio_bad_cksum_t zbc = {0};
2632 	raidz_map_t *rm = zio->io_vsd;
2633 
2634 	int ret = zio_checksum_error(zio, &zbc);
2635 	if (ret != 0 && zbc.zbc_injected != 0)
2636 		rm->rm_ecksuminjected = 1;
2637 
2638 	return (ret);
2639 }
2640 
2641 /*
2642  * Generate the parity from the data columns. If we tried and were able to
2643  * read the parity without error, verify that the generated parity matches the
2644  * data we read. If it doesn't, we fire off a checksum error. Return the
2645  * number of such failures.
2646  */
2647 static int
2648 raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
2649 {
2650 	abd_t *orig[VDEV_RAIDZ_MAXPARITY];
2651 	int c, ret = 0;
2652 	raidz_map_t *rm = zio->io_vsd;
2653 	raidz_col_t *rc;
2654 
2655 	blkptr_t *bp = zio->io_bp;
2656 	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
2657 	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
2658 
2659 	if (checksum == ZIO_CHECKSUM_NOPARITY)
2660 		return (ret);
2661 
2662 	for (c = 0; c < rr->rr_firstdatacol; c++) {
2663 		rc = &rr->rr_col[c];
2664 		if (!rc->rc_tried || rc->rc_error != 0)
2665 			continue;
2666 
2667 		orig[c] = rc->rc_abd;
2668 		ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size);
2669 		rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
2670 	}
2671 
2672 	/*
2673 	 * Verify any empty sectors are zero filled to ensure the parity
2674 	 * is calculated correctly even if these non-data sectors are damaged.
2675 	 */
2676 	if (rr->rr_nempty && rr->rr_abd_empty != NULL)
2677 		ret += vdev_draid_map_verify_empty(zio, rr);
2678 
2679 	/*
2680 	 * Regenerates parity even for !tried||rc_error!=0 columns.  This
2681 	 * isn't harmful but it does have the side effect of fixing stuff
2682 	 * we didn't realize was necessary (i.e. even if we return 0).
2683 	 */
2684 	vdev_raidz_generate_parity_row(rm, rr);
2685 
2686 	for (c = 0; c < rr->rr_firstdatacol; c++) {
2687 		rc = &rr->rr_col[c];
2688 
2689 		if (!rc->rc_tried || rc->rc_error != 0)
2690 			continue;
2691 
2692 		if (abd_cmp(orig[c], rc->rc_abd) != 0) {
2693 			zfs_dbgmsg("found error on col=%u devidx=%u off %llx",
2694 			    c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset);
2695 			vdev_raidz_checksum_error(zio, rc, orig[c]);
2696 			rc->rc_error = SET_ERROR(ECKSUM);
2697 			ret++;
2698 		}
2699 		abd_free(orig[c]);
2700 	}
2701 
2702 	return (ret);
2703 }
2704 
2705 static int
2706 vdev_raidz_worst_error(raidz_row_t *rr)
2707 {
2708 	int error = 0;
2709 
2710 	for (int c = 0; c < rr->rr_cols; c++) {
2711 		error = zio_worst_error(error, rr->rr_col[c].rc_error);
2712 		error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error);
2713 	}
2714 
2715 	return (error);
2716 }
2717 
2718 static void
2719 vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
2720 {
2721 	int unexpected_errors = 0;
2722 	int parity_errors = 0;
2723 	int parity_untried = 0;
2724 	int data_errors = 0;
2725 
2726 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
2727 
2728 	for (int c = 0; c < rr->rr_cols; c++) {
2729 		raidz_col_t *rc = &rr->rr_col[c];
2730 
2731 		if (rc->rc_error) {
2732 			if (c < rr->rr_firstdatacol)
2733 				parity_errors++;
2734 			else
2735 				data_errors++;
2736 
2737 			if (!rc->rc_skipped)
2738 				unexpected_errors++;
2739 		} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
2740 			parity_untried++;
2741 		}
2742 
2743 		if (rc->rc_force_repair)
2744 			unexpected_errors++;
2745 	}
2746 
2747 	/*
2748 	 * If we read more parity disks than were used for
2749 	 * reconstruction, confirm that the other parity disks produced
2750 	 * correct data.
2751 	 *
2752 	 * Note that we also regenerate parity when resilvering so we
2753 	 * can write it out to failed devices later.
2754 	 */
2755 	if (parity_errors + parity_untried <
2756 	    rr->rr_firstdatacol - data_errors ||
2757 	    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2758 		int n = raidz_parity_verify(zio, rr);
2759 		unexpected_errors += n;
2760 	}
2761 
2762 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2763 	    (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2764 		/*
2765 		 * Use the good data we have in hand to repair damaged children.
2766 		 */
2767 		for (int c = 0; c < rr->rr_cols; c++) {
2768 			raidz_col_t *rc = &rr->rr_col[c];
2769 			vdev_t *vd = zio->io_vd;
2770 			vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2771 
2772 			if (!rc->rc_allow_repair) {
2773 				continue;
2774 			} else if (!rc->rc_force_repair &&
2775 			    (rc->rc_error == 0 || rc->rc_size == 0)) {
2776 				continue;
2777 			}
2778 
2779 			zfs_dbgmsg("zio=%px repairing c=%u devidx=%u "
2780 			    "offset=%llx",
2781 			    zio, c, rc->rc_devidx, (long long)rc->rc_offset);
2782 
2783 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2784 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
2785 			    ZIO_TYPE_WRITE,
2786 			    zio->io_priority == ZIO_PRIORITY_REBUILD ?
2787 			    ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
2788 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2789 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2790 		}
2791 	}
2792 
2793 	/*
2794 	 * Scrub or resilver i/o's: overwrite any shadow locations with the
2795 	 * good data.  This ensures that if we've already copied this sector,
2796 	 * it will be corrected if it was damaged.  This writes more than is
2797 	 * necessary, but since expansion is paused during scrub/resilver, at
2798 	 * most a single row will have a shadow location.
2799 	 */
2800 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2801 	    (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) {
2802 		for (int c = 0; c < rr->rr_cols; c++) {
2803 			raidz_col_t *rc = &rr->rr_col[c];
2804 			vdev_t *vd = zio->io_vd;
2805 
2806 			if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0)
2807 				continue;
2808 			vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx];
2809 
2810 			/*
2811 			 * Note: We don't want to update the repair stats
2812 			 * because that would incorrectly indicate that there
2813 			 * was bad data to repair, which we aren't sure about.
2814 			 * By clearing the SCAN_THREAD flag, we prevent this
2815 			 * from happening, despite having the REPAIR flag set.
2816 			 * We need to set SELF_HEAL so that this i/o can't be
2817 			 * bypassed by zio_vdev_io_start().
2818 			 */
2819 			zio_t *cio = zio_vdev_child_io(zio, NULL, cvd,
2820 			    rc->rc_shadow_offset, rc->rc_abd, rc->rc_size,
2821 			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2822 			    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
2823 			    NULL, NULL);
2824 			cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD;
2825 			zio_nowait(cio);
2826 		}
2827 	}
2828 }
2829 
2830 static void
2831 raidz_restore_orig_data(raidz_map_t *rm)
2832 {
2833 	for (int i = 0; i < rm->rm_nrows; i++) {
2834 		raidz_row_t *rr = rm->rm_row[i];
2835 		for (int c = 0; c < rr->rr_cols; c++) {
2836 			raidz_col_t *rc = &rr->rr_col[c];
2837 			if (rc->rc_need_orig_restore) {
2838 				abd_copy(rc->rc_abd,
2839 				    rc->rc_orig_data, rc->rc_size);
2840 				rc->rc_need_orig_restore = B_FALSE;
2841 			}
2842 		}
2843 	}
2844 }
2845 
2846 /*
2847  * During raidz_reconstruct() for expanded VDEV, we need special consideration
2848  * failure simulations.  See note in raidz_reconstruct() on simulating failure
2849  * of a pre-expansion device.
2850  *
2851  * Treating logical child i as failed, return TRUE if the given column should
2852  * be treated as failed.  The idea of logical children allows us to imagine
2853  * that a disk silently failed before a RAIDZ expansion (reads from this disk
2854  * succeed but return the wrong data).  Since the expansion doesn't verify
2855  * checksums, the incorrect data will be moved to new locations spread among
2856  * the children (going diagonally across them).
2857  *
2858  * Higher "logical child failures" (values of `i`) indicate these
2859  * "pre-expansion failures".  The first physical_width values imagine that a
2860  * current child failed; the next physical_width-1 values imagine that a
2861  * child failed before the most recent expansion; the next physical_width-2
2862  * values imagine a child failed in the expansion before that, etc.
2863  */
2864 static boolean_t
2865 raidz_simulate_failure(int physical_width, int original_width, int ashift,
2866     int i, raidz_col_t *rc)
2867 {
2868 	uint64_t sector_id =
2869 	    physical_width * (rc->rc_offset >> ashift) +
2870 	    rc->rc_devidx;
2871 
2872 	for (int w = physical_width; w >= original_width; w--) {
2873 		if (i < w) {
2874 			return (sector_id % w == i);
2875 		} else {
2876 			i -= w;
2877 		}
2878 	}
2879 	ASSERT(!"invalid logical child id");
2880 	return (B_FALSE);
2881 }
2882 
2883 /*
2884  * returns EINVAL if reconstruction of the block will not be possible
2885  * returns ECKSUM if this specific reconstruction failed
2886  * returns 0 on successful reconstruction
2887  */
2888 static int
2889 raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
2890 {
2891 	raidz_map_t *rm = zio->io_vsd;
2892 	int physical_width = zio->io_vd->vdev_children;
2893 	int original_width = (rm->rm_original_width != 0) ?
2894 	    rm->rm_original_width : physical_width;
2895 	int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT;
2896 
2897 	if (dbgmsg) {
2898 		zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u "
2899 		    "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts);
2900 	}
2901 
2902 	/* Reconstruct each row */
2903 	for (int r = 0; r < rm->rm_nrows; r++) {
2904 		raidz_row_t *rr = rm->rm_row[r];
2905 		int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */
2906 		int t = 0;
2907 		int dead = 0;
2908 		int dead_data = 0;
2909 
2910 		if (dbgmsg)
2911 			zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r);
2912 
2913 		for (int c = 0; c < rr->rr_cols; c++) {
2914 			raidz_col_t *rc = &rr->rr_col[c];
2915 			ASSERT0(rc->rc_need_orig_restore);
2916 			if (rc->rc_error != 0) {
2917 				dead++;
2918 				if (c >= nparity)
2919 					dead_data++;
2920 				continue;
2921 			}
2922 			if (rc->rc_size == 0)
2923 				continue;
2924 			for (int lt = 0; lt < ntgts; lt++) {
2925 				if (raidz_simulate_failure(physical_width,
2926 				    original_width,
2927 				    zio->io_vd->vdev_top->vdev_ashift,
2928 				    ltgts[lt], rc)) {
2929 					if (rc->rc_orig_data == NULL) {
2930 						rc->rc_orig_data =
2931 						    abd_alloc_linear(
2932 						    rc->rc_size, B_TRUE);
2933 						abd_copy(rc->rc_orig_data,
2934 						    rc->rc_abd, rc->rc_size);
2935 					}
2936 					rc->rc_need_orig_restore = B_TRUE;
2937 
2938 					dead++;
2939 					if (c >= nparity)
2940 						dead_data++;
2941 					/*
2942 					 * Note: simulating failure of a
2943 					 * pre-expansion device can hit more
2944 					 * than one column, in which case we
2945 					 * might try to simulate more failures
2946 					 * than can be reconstructed, which is
2947 					 * also more than the size of my_tgts.
2948 					 * This check prevents accessing past
2949 					 * the end of my_tgts.  The "dead >
2950 					 * nparity" check below will fail this
2951 					 * reconstruction attempt.
2952 					 */
2953 					if (t < VDEV_RAIDZ_MAXPARITY) {
2954 						my_tgts[t++] = c;
2955 						if (dbgmsg) {
2956 							zfs_dbgmsg("simulating "
2957 							    "failure of col %u "
2958 							    "devidx %u", c,
2959 							    (int)rc->rc_devidx);
2960 						}
2961 					}
2962 					break;
2963 				}
2964 			}
2965 		}
2966 		if (dead > nparity) {
2967 			/* reconstruction not possible */
2968 			if (dbgmsg) {
2969 				zfs_dbgmsg("reconstruction not possible; "
2970 				    "too many failures");
2971 			}
2972 			raidz_restore_orig_data(rm);
2973 			return (EINVAL);
2974 		}
2975 		if (dead_data > 0)
2976 			vdev_raidz_reconstruct_row(rm, rr, my_tgts, t);
2977 	}
2978 
2979 	/* Check for success */
2980 	if (raidz_checksum_verify(zio) == 0) {
2981 
2982 		/* Reconstruction succeeded - report errors */
2983 		for (int i = 0; i < rm->rm_nrows; i++) {
2984 			raidz_row_t *rr = rm->rm_row[i];
2985 
2986 			for (int c = 0; c < rr->rr_cols; c++) {
2987 				raidz_col_t *rc = &rr->rr_col[c];
2988 				if (rc->rc_need_orig_restore) {
2989 					/*
2990 					 * Note: if this is a parity column,
2991 					 * we don't really know if it's wrong.
2992 					 * We need to let
2993 					 * vdev_raidz_io_done_verified() check
2994 					 * it, and if we set rc_error, it will
2995 					 * think that it is a "known" error
2996 					 * that doesn't need to be checked
2997 					 * or corrected.
2998 					 */
2999 					if (rc->rc_error == 0 &&
3000 					    c >= rr->rr_firstdatacol) {
3001 						vdev_raidz_checksum_error(zio,
3002 						    rc, rc->rc_orig_data);
3003 						rc->rc_error =
3004 						    SET_ERROR(ECKSUM);
3005 					}
3006 					rc->rc_need_orig_restore = B_FALSE;
3007 				}
3008 			}
3009 
3010 			vdev_raidz_io_done_verified(zio, rr);
3011 		}
3012 
3013 		zio_checksum_verified(zio);
3014 
3015 		if (dbgmsg) {
3016 			zfs_dbgmsg("reconstruction successful "
3017 			    "(checksum verified)");
3018 		}
3019 		return (0);
3020 	}
3021 
3022 	/* Reconstruction failed - restore original data */
3023 	raidz_restore_orig_data(rm);
3024 	if (dbgmsg) {
3025 		zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum "
3026 		    "failed", zio);
3027 	}
3028 	return (ECKSUM);
3029 }
3030 
3031 /*
3032  * Iterate over all combinations of N bad vdevs and attempt a reconstruction.
3033  * Note that the algorithm below is non-optimal because it doesn't take into
3034  * account how reconstruction is actually performed. For example, with
3035  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
3036  * is targeted as invalid as if columns 1 and 4 are targeted since in both
3037  * cases we'd only use parity information in column 0.
3038  *
3039  * The order that we find the various possible combinations of failed
3040  * disks is dictated by these rules:
3041  * - Examine each "slot" (the "i" in tgts[i])
3042  *   - Try to increment this slot (tgts[i] += 1)
3043  *   - if we can't increment because it runs into the next slot,
3044  *     reset our slot to the minimum, and examine the next slot
3045  *
3046  *  For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
3047  *  3 columns to reconstruct), we will generate the following sequence:
3048  *
3049  *  STATE        ACTION
3050  *  0 1 2        special case: skip since these are all parity
3051  *  0 1   3      first slot: reset to 0; middle slot: increment to 2
3052  *  0   2 3      first slot: increment to 1
3053  *    1 2 3      first: reset to 0; middle: reset to 1; last: increment to 4
3054  *  0 1     4    first: reset to 0; middle: increment to 2
3055  *  0   2   4    first: increment to 1
3056  *    1 2   4    first: reset to 0; middle: increment to 3
3057  *  0     3 4    first: increment to 1
3058  *    1   3 4    first: increment to 2
3059  *      2 3 4    first: reset to 0; middle: reset to 1; last: increment to 5
3060  *  0 1       5  first: reset to 0; middle: increment to 2
3061  *  0   2     5  first: increment to 1
3062  *    1 2     5  first: reset to 0; middle: increment to 3
3063  *  0     3   5  first: increment to 1
3064  *    1   3   5  first: increment to 2
3065  *      2 3   5  first: reset to 0; middle: increment to 4
3066  *  0       4 5  first: increment to 1
3067  *    1     4 5  first: increment to 2
3068  *      2   4 5  first: increment to 3
3069  *        3 4 5  done
3070  *
3071  * This strategy works for dRAID but is less efficient when there are a large
3072  * number of child vdevs and therefore permutations to check. Furthermore,
3073  * since the raidz_map_t rows likely do not overlap, reconstruction would be
3074  * possible as long as there are no more than nparity data errors per row.
3075  * These additional permutations are not currently checked but could be as
3076  * a future improvement.
3077  *
3078  * Returns 0 on success, ECKSUM on failure.
3079  */
3080 static int
3081 vdev_raidz_combrec(zio_t *zio)
3082 {
3083 	int nparity = vdev_get_nparity(zio->io_vd);
3084 	raidz_map_t *rm = zio->io_vsd;
3085 	int physical_width = zio->io_vd->vdev_children;
3086 	int original_width = (rm->rm_original_width != 0) ?
3087 	    rm->rm_original_width : physical_width;
3088 
3089 	for (int i = 0; i < rm->rm_nrows; i++) {
3090 		raidz_row_t *rr = rm->rm_row[i];
3091 		int total_errors = 0;
3092 
3093 		for (int c = 0; c < rr->rr_cols; c++) {
3094 			if (rr->rr_col[c].rc_error)
3095 				total_errors++;
3096 		}
3097 
3098 		if (total_errors > nparity)
3099 			return (vdev_raidz_worst_error(rr));
3100 	}
3101 
3102 	for (int num_failures = 1; num_failures <= nparity; num_failures++) {
3103 		int tstore[VDEV_RAIDZ_MAXPARITY + 2];
3104 		int *ltgts = &tstore[1]; /* value is logical child ID */
3105 
3106 
3107 		/*
3108 		 * Determine number of logical children, n.  See comment
3109 		 * above raidz_simulate_failure().
3110 		 */
3111 		int n = 0;
3112 		for (int w = physical_width;
3113 		    w >= original_width; w--) {
3114 			n += w;
3115 		}
3116 
3117 		ASSERT3U(num_failures, <=, nparity);
3118 		ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);
3119 
3120 		/* Handle corner cases in combrec logic */
3121 		ltgts[-1] = -1;
3122 		for (int i = 0; i < num_failures; i++) {
3123 			ltgts[i] = i;
3124 		}
3125 		ltgts[num_failures] = n;
3126 
3127 		for (;;) {
3128 			int err = raidz_reconstruct(zio, ltgts, num_failures,
3129 			    nparity);
3130 			if (err == EINVAL) {
3131 				/*
3132 				 * Reconstruction not possible with this #
3133 				 * failures; try more failures.
3134 				 */
3135 				break;
3136 			} else if (err == 0)
3137 				return (0);
3138 
3139 			/* Compute next targets to try */
3140 			for (int t = 0; ; t++) {
3141 				ASSERT3U(t, <, num_failures);
3142 				ltgts[t]++;
3143 				if (ltgts[t] == n) {
3144 					/* try more failures */
3145 					ASSERT3U(t, ==, num_failures - 1);
3146 					if (zfs_flags &
3147 					    ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
3148 						zfs_dbgmsg("reconstruction "
3149 						    "failed for num_failures="
3150 						    "%u; tried all "
3151 						    "combinations",
3152 						    num_failures);
3153 					}
3154 					break;
3155 				}
3156 
3157 				ASSERT3U(ltgts[t], <, n);
3158 				ASSERT3U(ltgts[t], <=, ltgts[t + 1]);
3159 
3160 				/*
3161 				 * If that spot is available, we're done here.
3162 				 * Try the next combination.
3163 				 */
3164 				if (ltgts[t] != ltgts[t + 1])
3165 					break; // found next combination
3166 
3167 				/*
3168 				 * Otherwise, reset this tgt to the minimum,
3169 				 * and move on to the next tgt.
3170 				 */
3171 				ltgts[t] = ltgts[t - 1] + 1;
3172 				ASSERT3U(ltgts[t], ==, t);
3173 			}
3174 
3175 			/* Increase the number of failures and keep trying. */
3176 			if (ltgts[num_failures - 1] == n)
3177 				break;
3178 		}
3179 	}
3180 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
3181 		zfs_dbgmsg("reconstruction failed for all num_failures");
3182 	return (ECKSUM);
3183 }
3184 
3185 void
3186 vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
3187 {
3188 	for (uint64_t row = 0; row < rm->rm_nrows; row++) {
3189 		raidz_row_t *rr = rm->rm_row[row];
3190 		vdev_raidz_reconstruct_row(rm, rr, t, nt);
3191 	}
3192 }
3193 
3194 /*
3195  * Complete a write IO operation on a RAIDZ VDev
3196  *
3197  * Outline:
3198  *   1. Check for errors on the child IOs.
3199  *   2. Return, setting an error code if too few child VDevs were written
3200  *      to reconstruct the data later.  Note that partial writes are
3201  *      considered successful if they can be reconstructed at all.
3202  */
3203 static void
3204 vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
3205 {
3206 	int normal_errors = 0;
3207 	int shadow_errors = 0;
3208 
3209 	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
3210 	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
3211 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
3212 
3213 	for (int c = 0; c < rr->rr_cols; c++) {
3214 		raidz_col_t *rc = &rr->rr_col[c];
3215 
3216 		if (rc->rc_error != 0) {
3217 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
3218 			normal_errors++;
3219 		}
3220 		if (rc->rc_shadow_error != 0) {
3221 			ASSERT(rc->rc_shadow_error != ECKSUM);
3222 			shadow_errors++;
3223 		}
3224 	}
3225 
3226 	/*
3227 	 * Treat partial writes as a success. If we couldn't write enough
3228 	 * columns to reconstruct the data, the I/O failed.  Otherwise, good
3229 	 * enough.  Note that in the case of a shadow write (during raidz
3230 	 * expansion), depending on if we crash, either the normal (old) or
3231 	 * shadow (new) location may become the "real" version of the block,
3232 	 * so both locations must have sufficient redundancy.
3233 	 *
3234 	 * Now that we support write reallocation, it would be better
3235 	 * to treat partial failure as real failure unless there are
3236 	 * no non-degraded top-level vdevs left, and not update DTLs
3237 	 * if we intend to reallocate.
3238 	 */
3239 	if (normal_errors > rr->rr_firstdatacol ||
3240 	    shadow_errors > rr->rr_firstdatacol) {
3241 		zio->io_error = zio_worst_error(zio->io_error,
3242 		    vdev_raidz_worst_error(rr));
3243 	}
3244 }
3245 
3246 static void
3247 vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
3248     raidz_row_t *rr)
3249 {
3250 	int parity_errors = 0;
3251 	int parity_untried = 0;
3252 	int data_errors = 0;
3253 	int total_errors = 0;
3254 
3255 	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
3256 	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
3257 
3258 	for (int c = 0; c < rr->rr_cols; c++) {
3259 		raidz_col_t *rc = &rr->rr_col[c];
3260 
3261 		/*
3262 		 * If scrubbing and a replacing/sparing child vdev determined
3263 		 * that not all of its children have an identical copy of the
3264 		 * data, then clear the error so the column is treated like
3265 		 * any other read and force a repair to correct the damage.
3266 		 */
3267 		if (rc->rc_error == ECKSUM) {
3268 			ASSERT(zio->io_flags & ZIO_FLAG_SCRUB);
3269 			vdev_raidz_checksum_error(zio, rc, rc->rc_abd);
3270 			rc->rc_force_repair = 1;
3271 			rc->rc_error = 0;
3272 		}
3273 
3274 		if (rc->rc_error) {
3275 			if (c < rr->rr_firstdatacol)
3276 				parity_errors++;
3277 			else
3278 				data_errors++;
3279 
3280 			total_errors++;
3281 		} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
3282 			parity_untried++;
3283 		}
3284 	}
3285 
3286 	/*
3287 	 * If there were data errors and the number of errors we saw was
3288 	 * correctable -- less than or equal to the number of parity disks read
3289 	 * -- reconstruct based on the missing data.
3290 	 */
3291 	if (data_errors != 0 &&
3292 	    total_errors <= rr->rr_firstdatacol - parity_untried) {
3293 		/*
3294 		 * We either attempt to read all the parity columns or
3295 		 * none of them. If we didn't try to read parity, we
3296 		 * wouldn't be here in the correctable case. There must
3297 		 * also have been fewer parity errors than parity
3298 		 * columns or, again, we wouldn't be in this code path.
3299 		 */
3300 		ASSERT(parity_untried == 0);
3301 		ASSERT(parity_errors < rr->rr_firstdatacol);
3302 
3303 		/*
3304 		 * Identify the data columns that reported an error.
3305 		 */
3306 		int n = 0;
3307 		int tgts[VDEV_RAIDZ_MAXPARITY];
3308 		for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
3309 			raidz_col_t *rc = &rr->rr_col[c];
3310 			if (rc->rc_error != 0) {
3311 				ASSERT(n < VDEV_RAIDZ_MAXPARITY);
3312 				tgts[n++] = c;
3313 			}
3314 		}
3315 
3316 		ASSERT(rr->rr_firstdatacol >= n);
3317 
3318 		vdev_raidz_reconstruct_row(rm, rr, tgts, n);
3319 	}
3320 }
3321 
3322 /*
3323  * Return the number of reads issued.
3324  */
3325 static int
3326 vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
3327 {
3328 	vdev_t *vd = zio->io_vd;
3329 	int nread = 0;
3330 
3331 	rr->rr_missingdata = 0;
3332 	rr->rr_missingparity = 0;
3333 
3334 	/*
3335 	 * If this rows contains empty sectors which are not required
3336 	 * for a normal read then allocate an ABD for them now so they
3337 	 * may be read, verified, and any needed repairs performed.
3338 	 */
3339 	if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL)
3340 		vdev_draid_map_alloc_empty(zio, rr);
3341 
3342 	for (int c = 0; c < rr->rr_cols; c++) {
3343 		raidz_col_t *rc = &rr->rr_col[c];
3344 		if (rc->rc_tried || rc->rc_size == 0)
3345 			continue;
3346 
3347 		zio_nowait(zio_vdev_child_io(zio, NULL,
3348 		    vd->vdev_child[rc->rc_devidx],
3349 		    rc->rc_offset, rc->rc_abd, rc->rc_size,
3350 		    zio->io_type, zio->io_priority, 0,
3351 		    vdev_raidz_child_done, rc));
3352 		nread++;
3353 	}
3354 	return (nread);
3355 }
3356 
3357 /*
3358  * We're here because either there were too many errors to even attempt
3359  * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec()
3360  * failed. In either case, there is enough bad data to prevent reconstruction.
3361  * Start checksum ereports for all children which haven't failed.
3362  */
3363 static void
3364 vdev_raidz_io_done_unrecoverable(zio_t *zio)
3365 {
3366 	raidz_map_t *rm = zio->io_vsd;
3367 
3368 	for (int i = 0; i < rm->rm_nrows; i++) {
3369 		raidz_row_t *rr = rm->rm_row[i];
3370 
3371 		for (int c = 0; c < rr->rr_cols; c++) {
3372 			raidz_col_t *rc = &rr->rr_col[c];
3373 			vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
3374 
3375 			if (rc->rc_error != 0)
3376 				continue;
3377 
3378 			zio_bad_cksum_t zbc;
3379 			zbc.zbc_has_cksum = 0;
3380 			zbc.zbc_injected = rm->rm_ecksuminjected;
3381 
3382 			mutex_enter(&cvd->vdev_stat_lock);
3383 			cvd->vdev_stat.vs_checksum_errors++;
3384 			mutex_exit(&cvd->vdev_stat_lock);
3385 			(void) zfs_ereport_start_checksum(zio->io_spa,
3386 			    cvd, &zio->io_bookmark, zio, rc->rc_offset,
3387 			    rc->rc_size, &zbc);
3388 		}
3389 	}
3390 }
3391 
3392 void
3393 vdev_raidz_io_done(zio_t *zio)
3394 {
3395 	raidz_map_t *rm = zio->io_vsd;
3396 
3397 	ASSERT(zio->io_bp != NULL);
3398 	if (zio->io_type == ZIO_TYPE_WRITE) {
3399 		for (int i = 0; i < rm->rm_nrows; i++) {
3400 			vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
3401 		}
3402 	} else {
3403 		if (rm->rm_phys_col) {
3404 			/*
3405 			 * This is an aggregated read.  Copy the data and status
3406 			 * from the aggregate abd's to the individual rows.
3407 			 */
3408 			for (int i = 0; i < rm->rm_nrows; i++) {
3409 				raidz_row_t *rr = rm->rm_row[i];
3410 
3411 				for (int c = 0; c < rr->rr_cols; c++) {
3412 					raidz_col_t *rc = &rr->rr_col[c];
3413 					if (rc->rc_tried || rc->rc_size == 0)
3414 						continue;
3415 
3416 					raidz_col_t *prc =
3417 					    &rm->rm_phys_col[rc->rc_devidx];
3418 					rc->rc_error = prc->rc_error;
3419 					rc->rc_tried = prc->rc_tried;
3420 					rc->rc_skipped = prc->rc_skipped;
3421 					if (c >= rr->rr_firstdatacol) {
3422 						/*
3423 						 * Note: this is slightly faster
3424 						 * than using abd_copy_off().
3425 						 */
3426 						char *physbuf = abd_to_buf(
3427 						    prc->rc_abd);
3428 						void *physloc = physbuf +
3429 						    rc->rc_offset -
3430 						    prc->rc_offset;
3431 
3432 						abd_copy_from_buf(rc->rc_abd,
3433 						    physloc, rc->rc_size);
3434 					}
3435 				}
3436 			}
3437 		}
3438 
3439 		for (int i = 0; i < rm->rm_nrows; i++) {
3440 			raidz_row_t *rr = rm->rm_row[i];
3441 			vdev_raidz_io_done_reconstruct_known_missing(zio,
3442 			    rm, rr);
3443 		}
3444 
3445 		if (raidz_checksum_verify(zio) == 0) {
3446 			for (int i = 0; i < rm->rm_nrows; i++) {
3447 				raidz_row_t *rr = rm->rm_row[i];
3448 				vdev_raidz_io_done_verified(zio, rr);
3449 			}
3450 			zio_checksum_verified(zio);
3451 		} else {
3452 			/*
3453 			 * A sequential resilver has no checksum which makes
3454 			 * combinatoral reconstruction impossible. This code
3455 			 * path is unreachable since raidz_checksum_verify()
3456 			 * has no checksum to verify and must succeed.
3457 			 */
3458 			ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD);
3459 
3460 			/*
3461 			 * This isn't a typical situation -- either we got a
3462 			 * read error or a child silently returned bad data.
3463 			 * Read every block so we can try again with as much
3464 			 * data and parity as we can track down. If we've
3465 			 * already been through once before, all children will
3466 			 * be marked as tried so we'll proceed to combinatorial
3467 			 * reconstruction.
3468 			 */
3469 			int nread = 0;
3470 			for (int i = 0; i < rm->rm_nrows; i++) {
3471 				nread += vdev_raidz_read_all(zio,
3472 				    rm->rm_row[i]);
3473 			}
3474 			if (nread != 0) {
3475 				/*
3476 				 * Normally our stage is VDEV_IO_DONE, but if
3477 				 * we've already called redone(), it will have
3478 				 * changed to VDEV_IO_START, in which case we
3479 				 * don't want to call redone() again.
3480 				 */
3481 				if (zio->io_stage != ZIO_STAGE_VDEV_IO_START)
3482 					zio_vdev_io_redone(zio);
3483 				return;
3484 			}
3485 			/*
3486 			 * It would be too expensive to try every possible
3487 			 * combination of failed sectors in every row, so
3488 			 * instead we try every combination of failed current or
3489 			 * past physical disk. This means that if the incorrect
3490 			 * sectors were all on Nparity disks at any point in the
3491 			 * past, we will find the correct data.  The only known
3492 			 * case where this is less durable than a non-expanded
3493 			 * RAIDZ, is if we have a silent failure during
3494 			 * expansion.  In that case, one block could be
3495 			 * partially in the old format and partially in the
3496 			 * new format, so we'd lost some sectors from the old
3497 			 * format and some from the new format.
3498 			 *
3499 			 * e.g. logical_width=4 physical_width=6
3500 			 * the 15 (6+5+4) possible failed disks are:
3501 			 * width=6 child=0
3502 			 * width=6 child=1
3503 			 * width=6 child=2
3504 			 * width=6 child=3
3505 			 * width=6 child=4
3506 			 * width=6 child=5
3507 			 * width=5 child=0
3508 			 * width=5 child=1
3509 			 * width=5 child=2
3510 			 * width=5 child=3
3511 			 * width=5 child=4
3512 			 * width=4 child=0
3513 			 * width=4 child=1
3514 			 * width=4 child=2
3515 			 * width=4 child=3
3516 			 * And we will try every combination of Nparity of these
3517 			 * failing.
3518 			 *
3519 			 * As a first pass, we can generate every combo,
3520 			 * and try reconstructing, ignoring any known
3521 			 * failures.  If any row has too many known + simulated
3522 			 * failures, then we bail on reconstructing with this
3523 			 * number of simulated failures.  As an improvement,
3524 			 * we could detect the number of whole known failures
3525 			 * (i.e. we have known failures on these disks for
3526 			 * every row; the disks never succeeded), and
3527 			 * subtract that from the max # failures to simulate.
3528 			 * We could go even further like the current
3529 			 * combrec code, but that doesn't seem like it
3530 			 * gains us very much.  If we simulate a failure
3531 			 * that is also a known failure, that's fine.
3532 			 */
3533 			zio->io_error = vdev_raidz_combrec(zio);
3534 			if (zio->io_error == ECKSUM &&
3535 			    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
3536 				vdev_raidz_io_done_unrecoverable(zio);
3537 			}
3538 		}
3539 	}
3540 	if (rm->rm_lr != NULL) {
3541 		zfs_rangelock_exit(rm->rm_lr);
3542 		rm->rm_lr = NULL;
3543 	}
3544 }
3545 
3546 static void
3547 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
3548 {
3549 	vdev_raidz_t *vdrz = vd->vdev_tsd;
3550 	if (faulted > vdrz->vd_nparity)
3551 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3552 		    VDEV_AUX_NO_REPLICAS);
3553 	else if (degraded + faulted != 0)
3554 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
3555 	else
3556 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
3557 }
3558 
3559 /*
3560  * Determine if any portion of the provided block resides on a child vdev
3561  * with a dirty DTL and therefore needs to be resilvered.  The function
3562  * assumes that at least one DTL is dirty which implies that full stripe
3563  * width blocks must be resilvered.
3564  */
3565 static boolean_t
3566 vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
3567     uint64_t phys_birth)
3568 {
3569 	vdev_raidz_t *vdrz = vd->vdev_tsd;
3570 
3571 	/*
3572 	 * If we're in the middle of a RAIDZ expansion, this block may be in
3573 	 * the old and/or new location.  For simplicity, always resilver it.
3574 	 */
3575 	if (vdrz->vn_vre.vre_state == DSS_SCANNING)
3576 		return (B_TRUE);
3577 
3578 	uint64_t dcols = vd->vdev_children;
3579 	uint64_t nparity = vdrz->vd_nparity;
3580 	uint64_t ashift = vd->vdev_top->vdev_ashift;
3581 	/* The starting RAIDZ (parent) vdev sector of the block. */
3582 	uint64_t b = DVA_GET_OFFSET(dva) >> ashift;
3583 	/* The zio's size in units of the vdev's minimum sector size. */
3584 	uint64_t s = ((psize - 1) >> ashift) + 1;
3585 	/* The first column for this stripe. */
3586 	uint64_t f = b % dcols;
3587 
3588 	/* Unreachable by sequential resilver. */
3589 	ASSERT3U(phys_birth, !=, TXG_UNKNOWN);
3590 
3591 	if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
3592 		return (B_FALSE);
3593 
3594 	if (s + nparity >= dcols)
3595 		return (B_TRUE);
3596 
3597 	for (uint64_t c = 0; c < s + nparity; c++) {
3598 		uint64_t devidx = (f + c) % dcols;
3599 		vdev_t *cvd = vd->vdev_child[devidx];
3600 
3601 		/*
3602 		 * dsl_scan_need_resilver() already checked vd with
3603 		 * vdev_dtl_contains(). So here just check cvd with
3604 		 * vdev_dtl_empty(), cheaper and a good approximation.
3605 		 */
3606 		if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
3607 			return (B_TRUE);
3608 	}
3609 
3610 	return (B_FALSE);
3611 }
3612 
3613 static void
3614 vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs,
3615     range_seg64_t *physical_rs, range_seg64_t *remain_rs)
3616 {
3617 	(void) remain_rs;
3618 
3619 	vdev_t *raidvd = cvd->vdev_parent;
3620 	ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
3621 
3622 	vdev_raidz_t *vdrz = raidvd->vdev_tsd;
3623 
3624 	if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
3625 		/*
3626 		 * We're in the middle of expansion, in which case the
3627 		 * translation is in flux.  Any answer we give may be wrong
3628 		 * by the time we return, so it isn't safe for the caller to
3629 		 * act on it.  Therefore we say that this range isn't present
3630 		 * on any children.  The only consumers of this are "zpool
3631 		 * initialize" and trimming, both of which are "best effort"
3632 		 * anyway.
3633 		 */
3634 		physical_rs->rs_start = physical_rs->rs_end = 0;
3635 		remain_rs->rs_start = remain_rs->rs_end = 0;
3636 		return;
3637 	}
3638 
3639 	uint64_t width = vdrz->vd_physical_width;
3640 	uint64_t tgt_col = cvd->vdev_id;
3641 	uint64_t ashift = raidvd->vdev_top->vdev_ashift;
3642 
3643 	/* make sure the offsets are block-aligned */
3644 	ASSERT0(logical_rs->rs_start % (1 << ashift));
3645 	ASSERT0(logical_rs->rs_end % (1 << ashift));
3646 	uint64_t b_start = logical_rs->rs_start >> ashift;
3647 	uint64_t b_end = logical_rs->rs_end >> ashift;
3648 
3649 	uint64_t start_row = 0;
3650 	if (b_start > tgt_col) /* avoid underflow */
3651 		start_row = ((b_start - tgt_col - 1) / width) + 1;
3652 
3653 	uint64_t end_row = 0;
3654 	if (b_end > tgt_col)
3655 		end_row = ((b_end - tgt_col - 1) / width) + 1;
3656 
3657 	physical_rs->rs_start = start_row << ashift;
3658 	physical_rs->rs_end = end_row << ashift;
3659 
3660 	ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start);
3661 	ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
3662 	    logical_rs->rs_end - logical_rs->rs_start);
3663 }
3664 
3665 static void
3666 raidz_reflow_sync(void *arg, dmu_tx_t *tx)
3667 {
3668 	spa_t *spa = arg;
3669 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
3670 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
3671 
3672 	/*
3673 	 * Ensure there are no i/os to the range that is being committed.
3674 	 */
3675 	uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock);
3676 	ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset);
3677 
3678 	mutex_enter(&vre->vre_lock);
3679 	uint64_t new_offset =
3680 	    MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset);
3681 	/*
3682 	 * We should not have committed anything that failed.
3683 	 */
3684 	VERIFY3U(vre->vre_failed_offset, >=, old_offset);
3685 	mutex_exit(&vre->vre_lock);
3686 
3687 	zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
3688 	    old_offset, new_offset - old_offset,
3689 	    RL_WRITER);
3690 
3691 	/*
3692 	 * Update the uberblock that will be written when this txg completes.
3693 	 */
3694 	RAIDZ_REFLOW_SET(&spa->spa_uberblock,
3695 	    RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset);
3696 	vre->vre_offset_pertxg[txgoff] = 0;
3697 	zfs_rangelock_exit(lr);
3698 
3699 	mutex_enter(&vre->vre_lock);
3700 	vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff];
3701 	vre->vre_bytes_copied_pertxg[txgoff] = 0;
3702 	mutex_exit(&vre->vre_lock);
3703 
3704 	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
3705 	VERIFY0(zap_update(spa->spa_meta_objset,
3706 	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
3707 	    sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx));
3708 }
3709 
3710 static void
3711 raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx)
3712 {
3713 	spa_t *spa = arg;
3714 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
3715 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
3716 	vdev_raidz_t *vdrz = raidvd->vdev_tsd;
3717 
3718 	for (int i = 0; i < TXG_SIZE; i++)
3719 		VERIFY0(vre->vre_offset_pertxg[i]);
3720 
3721 	reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
3722 	re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES;
3723 	re->re_logical_width = vdrz->vd_physical_width;
3724 	mutex_enter(&vdrz->vd_expand_lock);
3725 	avl_add(&vdrz->vd_expand_txgs, re);
3726 	mutex_exit(&vdrz->vd_expand_lock);
3727 
3728 	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
3729 
3730 	/*
3731 	 * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS
3732 	 * will get written (based on vd_expand_txgs).
3733 	 */
3734 	vdev_config_dirty(vd);
3735 
3736 	/*
3737 	 * Before we change vre_state, the on-disk state must reflect that we
3738 	 * have completed all copying, so that vdev_raidz_io_start() can use
3739 	 * vre_state to determine if the reflow is in progress.  See also the
3740 	 * end of spa_raidz_expand_thread().
3741 	 */
3742 	VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==,
3743 	    raidvd->vdev_ms_count << raidvd->vdev_ms_shift);
3744 
3745 	vre->vre_end_time = gethrestime_sec();
3746 	vre->vre_state = DSS_FINISHED;
3747 
3748 	uint64_t state = vre->vre_state;
3749 	VERIFY0(zap_update(spa->spa_meta_objset,
3750 	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
3751 	    sizeof (state), 1, &state, tx));
3752 
3753 	uint64_t end_time = vre->vre_end_time;
3754 	VERIFY0(zap_update(spa->spa_meta_objset,
3755 	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
3756 	    sizeof (end_time), 1, &end_time, tx));
3757 
3758 	spa->spa_uberblock.ub_raidz_reflow_info = 0;
3759 
3760 	spa_history_log_internal(spa, "raidz vdev expansion completed",  tx,
3761 	    "%s vdev %llu new width %llu", spa_name(spa),
3762 	    (unsigned long long)vd->vdev_id,
3763 	    (unsigned long long)vd->vdev_children);
3764 
3765 	spa->spa_raidz_expand = NULL;
3766 	raidvd->vdev_rz_expanding = B_FALSE;
3767 
3768 	spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
3769 	spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
3770 	spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
3771 
3772 	spa_notify_waiters(spa);
3773 
3774 	/*
3775 	 * While we're in syncing context take the opportunity to
3776 	 * setup a scrub. All the data has been sucessfully copied
3777 	 * but we have not validated any checksums.
3778 	 */
3779 	pool_scan_func_t func = POOL_SCAN_SCRUB;
3780 	if (zfs_scrub_after_expand && dsl_scan_setup_check(&func, tx) == 0)
3781 		dsl_scan_setup_sync(&func, tx);
3782 }
3783 
3784 /*
3785  * Struct for one copy zio.
3786  */
3787 typedef struct raidz_reflow_arg {
3788 	vdev_raidz_expand_t *rra_vre;
3789 	zfs_locked_range_t *rra_lr;
3790 	uint64_t rra_txg;
3791 } raidz_reflow_arg_t;
3792 
3793 /*
3794  * The write of the new location is done.
3795  */
3796 static void
3797 raidz_reflow_write_done(zio_t *zio)
3798 {
3799 	raidz_reflow_arg_t *rra = zio->io_private;
3800 	vdev_raidz_expand_t *vre = rra->rra_vre;
3801 
3802 	abd_free(zio->io_abd);
3803 
3804 	mutex_enter(&vre->vre_lock);
3805 	if (zio->io_error != 0) {
3806 		/* Force a reflow pause on errors */
3807 		vre->vre_failed_offset =
3808 		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
3809 	}
3810 	ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size);
3811 	vre->vre_outstanding_bytes -= zio->io_size;
3812 	if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length <
3813 	    vre->vre_failed_offset) {
3814 		vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] +=
3815 		    zio->io_size;
3816 	}
3817 	cv_signal(&vre->vre_cv);
3818 	mutex_exit(&vre->vre_lock);
3819 
3820 	zfs_rangelock_exit(rra->rra_lr);
3821 
3822 	kmem_free(rra, sizeof (*rra));
3823 	spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
3824 }
3825 
3826 /*
3827  * The read of the old location is done.  The parent zio is the write to
3828  * the new location.  Allow it to start.
3829  */
3830 static void
3831 raidz_reflow_read_done(zio_t *zio)
3832 {
3833 	raidz_reflow_arg_t *rra = zio->io_private;
3834 	vdev_raidz_expand_t *vre = rra->rra_vre;
3835 
3836 	/*
3837 	 * If the read failed, or if it was done on a vdev that is not fully
3838 	 * healthy (e.g. a child that has a resilver in progress), we may not
3839 	 * have the correct data.  Note that it's OK if the write proceeds.
3840 	 * It may write garbage but the location is otherwise unused and we
3841 	 * will retry later due to vre_failed_offset.
3842 	 */
3843 	if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) {
3844 		zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu "
3845 		    "err=%u partial_dtl_empty=%u missing_dtl_empty=%u",
3846 		    (long long)rra->rra_lr->lr_offset,
3847 		    (long long)rra->rra_lr->lr_length,
3848 		    (long long)rra->rra_txg,
3849 		    zio->io_error,
3850 		    vdev_dtl_empty(zio->io_vd, DTL_PARTIAL),
3851 		    vdev_dtl_empty(zio->io_vd, DTL_MISSING));
3852 		mutex_enter(&vre->vre_lock);
3853 		/* Force a reflow pause on errors */
3854 		vre->vre_failed_offset =
3855 		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
3856 		mutex_exit(&vre->vre_lock);
3857 	}
3858 
3859 	zio_nowait(zio_unique_parent(zio));
3860 }
3861 
3862 static void
3863 raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset,
3864     dmu_tx_t *tx)
3865 {
3866 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
3867 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
3868 
3869 	if (offset == 0)
3870 		return;
3871 
3872 	mutex_enter(&vre->vre_lock);
3873 	ASSERT3U(vre->vre_offset, <=, offset);
3874 	vre->vre_offset = offset;
3875 	mutex_exit(&vre->vre_lock);
3876 
3877 	if (vre->vre_offset_pertxg[txgoff] == 0) {
3878 		dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync,
3879 		    spa, tx);
3880 	}
3881 	vre->vre_offset_pertxg[txgoff] = offset;
3882 }
3883 
3884 static boolean_t
3885 vdev_raidz_expand_child_replacing(vdev_t *raidz_vd)
3886 {
3887 	for (int i = 0; i < raidz_vd->vdev_children; i++) {
3888 		/* Quick check if a child is being replaced */
3889 		if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf)
3890 			return (B_TRUE);
3891 	}
3892 	return (B_FALSE);
3893 }
3894 
3895 static boolean_t
3896 raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt,
3897     dmu_tx_t *tx)
3898 {
3899 	spa_t *spa = vd->vdev_spa;
3900 	int ashift = vd->vdev_top->vdev_ashift;
3901 	uint64_t offset, size;
3902 
3903 	if (!range_tree_find_in(rt, 0, vd->vdev_top->vdev_asize,
3904 	    &offset, &size)) {
3905 		return (B_FALSE);
3906 	}
3907 	ASSERT(IS_P2ALIGNED(offset, 1 << ashift));
3908 	ASSERT3U(size, >=, 1 << ashift);
3909 	uint64_t length = 1 << ashift;
3910 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
3911 
3912 	uint64_t blkid = offset >> ashift;
3913 
3914 	int old_children = vd->vdev_children - 1;
3915 
3916 	/*
3917 	 * We can only progress to the point that writes will not overlap
3918 	 * with blocks whose progress has not yet been recorded on disk.
3919 	 * Since partially-copied rows are still read from the old location,
3920 	 * we need to stop one row before the sector-wise overlap, to prevent
3921 	 * row-wise overlap.
3922 	 *
3923 	 * Note that even if we are skipping over a large unallocated region,
3924 	 * we can't move the on-disk progress to `offset`, because concurrent
3925 	 * writes/allocations could still use the currently-unallocated
3926 	 * region.
3927 	 */
3928 	uint64_t ubsync_blkid =
3929 	    RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift;
3930 	uint64_t next_overwrite_blkid = ubsync_blkid +
3931 	    ubsync_blkid / old_children - old_children;
3932 	VERIFY3U(next_overwrite_blkid, >, ubsync_blkid);
3933 
3934 	if (blkid >= next_overwrite_blkid) {
3935 		raidz_reflow_record_progress(vre,
3936 		    next_overwrite_blkid << ashift, tx);
3937 		return (B_TRUE);
3938 	}
3939 
3940 	range_tree_remove(rt, offset, length);
3941 
3942 	raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra), KM_SLEEP);
3943 	rra->rra_vre = vre;
3944 	rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock,
3945 	    offset, length, RL_WRITER);
3946 	rra->rra_txg = dmu_tx_get_txg(tx);
3947 
3948 	raidz_reflow_record_progress(vre, offset + length, tx);
3949 
3950 	mutex_enter(&vre->vre_lock);
3951 	vre->vre_outstanding_bytes += length;
3952 	mutex_exit(&vre->vre_lock);
3953 
3954 	/*
3955 	 * SCL_STATE will be released when the read and write are done,
3956 	 * by raidz_reflow_write_done().
3957 	 */
3958 	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
3959 
3960 	/* check if a replacing vdev was added, if so treat it as an error */
3961 	if (vdev_raidz_expand_child_replacing(vd)) {
3962 		zfs_dbgmsg("replacing vdev encountered, reflow paused at "
3963 		    "offset=%llu txg=%llu",
3964 		    (long long)rra->rra_lr->lr_offset,
3965 		    (long long)rra->rra_txg);
3966 
3967 		mutex_enter(&vre->vre_lock);
3968 		vre->vre_failed_offset =
3969 		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
3970 		cv_signal(&vre->vre_cv);
3971 		mutex_exit(&vre->vre_lock);
3972 
3973 		/* drop everything we acquired */
3974 		zfs_rangelock_exit(rra->rra_lr);
3975 		kmem_free(rra, sizeof (*rra));
3976 		spa_config_exit(spa, SCL_STATE, spa);
3977 		return (B_TRUE);
3978 	}
3979 
3980 	zio_t *pio = spa->spa_txg_zio[txgoff];
3981 	abd_t *abd = abd_alloc_for_io(length, B_FALSE);
3982 	zio_t *write_zio = zio_vdev_child_io(pio, NULL,
3983 	    vd->vdev_child[blkid % vd->vdev_children],
3984 	    (blkid / vd->vdev_children) << ashift,
3985 	    abd, length,
3986 	    ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
3987 	    ZIO_FLAG_CANFAIL,
3988 	    raidz_reflow_write_done, rra);
3989 
3990 	zio_nowait(zio_vdev_child_io(write_zio, NULL,
3991 	    vd->vdev_child[blkid % old_children],
3992 	    (blkid / old_children) << ashift,
3993 	    abd, length,
3994 	    ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
3995 	    ZIO_FLAG_CANFAIL,
3996 	    raidz_reflow_read_done, rra));
3997 
3998 	return (B_FALSE);
3999 }
4000 
4001 /*
4002  * For testing (ztest specific)
4003  */
4004 static void
4005 raidz_expand_pause(uint_t pause_point)
4006 {
4007 	while (raidz_expand_pause_point != 0 &&
4008 	    raidz_expand_pause_point <= pause_point)
4009 		delay(hz);
4010 }
4011 
4012 static void
4013 raidz_scratch_child_done(zio_t *zio)
4014 {
4015 	zio_t *pio = zio->io_private;
4016 
4017 	mutex_enter(&pio->io_lock);
4018 	pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
4019 	mutex_exit(&pio->io_lock);
4020 }
4021 
4022 /*
4023  * Reflow the beginning portion of the vdev into an intermediate scratch area
4024  * in memory and on disk. This operation must be persisted on disk before we
4025  * proceed to overwrite the beginning portion with the reflowed data.
4026  *
4027  * This multi-step task can fail to complete if disk errors are encountered
4028  * and we can return here after a pause (waiting for disk to become healthy).
4029  */
4030 static void
4031 raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
4032 {
4033 	vdev_raidz_expand_t *vre = arg;
4034 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
4035 	zio_t *pio;
4036 	int error;
4037 
4038 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4039 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4040 	int ashift = raidvd->vdev_ashift;
4041 	uint64_t write_size = P2ALIGN(VDEV_BOOT_SIZE, 1 << ashift);
4042 	uint64_t logical_size = write_size * raidvd->vdev_children;
4043 	uint64_t read_size =
4044 	    P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)),
4045 	    1 << ashift);
4046 
4047 	/*
4048 	 * The scratch space must be large enough to get us to the point
4049 	 * that one row does not overlap itself when moved.  This is checked
4050 	 * by vdev_raidz_attach_check().
4051 	 */
4052 	VERIFY3U(write_size, >=, raidvd->vdev_children << ashift);
4053 	VERIFY3U(write_size, <=, VDEV_BOOT_SIZE);
4054 	VERIFY3U(write_size, <=, read_size);
4055 
4056 	zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
4057 	    0, logical_size, RL_WRITER);
4058 
4059 	abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
4060 	    KM_SLEEP);
4061 	for (int i = 0; i < raidvd->vdev_children; i++) {
4062 		abds[i] = abd_alloc_linear(read_size, B_FALSE);
4063 	}
4064 
4065 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1);
4066 
4067 	/*
4068 	 * If we have already written the scratch area then we must read from
4069 	 * there, since new writes were redirected there while we were paused
4070 	 * or the original location may have been partially overwritten with
4071 	 * reflowed data.
4072 	 */
4073 	if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) {
4074 		VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size);
4075 		/*
4076 		 * Read from scratch space.
4077 		 */
4078 		pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4079 		for (int i = 0; i < raidvd->vdev_children; i++) {
4080 			/*
4081 			 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE
4082 			 * to the offset to calculate the physical offset to
4083 			 * write to.  Passing in a negative offset makes us
4084 			 * access the scratch area.
4085 			 */
4086 			zio_nowait(zio_vdev_child_io(pio, NULL,
4087 			    raidvd->vdev_child[i],
4088 			    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4089 			    write_size, ZIO_TYPE_READ, ZIO_PRIORITY_ASYNC_READ,
4090 			    ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
4091 		}
4092 		error = zio_wait(pio);
4093 		if (error != 0) {
4094 			zfs_dbgmsg("reflow: error %d reading scratch location",
4095 			    error);
4096 			goto io_error_exit;
4097 		}
4098 		goto overwrite;
4099 	}
4100 
4101 	/*
4102 	 * Read from original location.
4103 	 */
4104 	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4105 	for (int i = 0; i < raidvd->vdev_children - 1; i++) {
4106 		ASSERT0(vdev_is_dead(raidvd->vdev_child[i]));
4107 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4108 		    0, abds[i], read_size, ZIO_TYPE_READ,
4109 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
4110 		    raidz_scratch_child_done, pio));
4111 	}
4112 	error = zio_wait(pio);
4113 	if (error != 0) {
4114 		zfs_dbgmsg("reflow: error %d reading original location", error);
4115 io_error_exit:
4116 		for (int i = 0; i < raidvd->vdev_children; i++)
4117 			abd_free(abds[i]);
4118 		kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4119 		zfs_rangelock_exit(lr);
4120 		spa_config_exit(spa, SCL_STATE, FTAG);
4121 		return;
4122 	}
4123 
4124 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2);
4125 
4126 	/*
4127 	 * Reflow in memory.
4128 	 */
4129 	uint64_t logical_sectors = logical_size >> ashift;
4130 	for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) {
4131 		int oldchild = i % (raidvd->vdev_children - 1);
4132 		uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift;
4133 
4134 		int newchild = i % raidvd->vdev_children;
4135 		uint64_t newoff = (i / raidvd->vdev_children) << ashift;
4136 
4137 		/* a single sector should not be copying over itself */
4138 		ASSERT(!(newchild == oldchild && newoff == oldoff));
4139 
4140 		abd_copy_off(abds[newchild], abds[oldchild],
4141 		    newoff, oldoff, 1 << ashift);
4142 	}
4143 
4144 	/*
4145 	 * Verify that we filled in everything we intended to (write_size on
4146 	 * each child).
4147 	 */
4148 	VERIFY0(logical_sectors % raidvd->vdev_children);
4149 	VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==,
4150 	    write_size);
4151 
4152 	/*
4153 	 * Write to scratch location (boot area).
4154 	 */
4155 	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4156 	for (int i = 0; i < raidvd->vdev_children; i++) {
4157 		/*
4158 		 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
4159 		 * the offset to calculate the physical offset to write to.
4160 		 * Passing in a negative offset lets us access the boot area.
4161 		 */
4162 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4163 		    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4164 		    write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
4165 		    ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
4166 	}
4167 	error = zio_wait(pio);
4168 	if (error != 0) {
4169 		zfs_dbgmsg("reflow: error %d writing scratch location", error);
4170 		goto io_error_exit;
4171 	}
4172 	pio = zio_root(spa, NULL, NULL, 0);
4173 	zio_flush(pio, raidvd);
4174 	zio_wait(pio);
4175 
4176 	zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area",
4177 	    (long long)logical_size);
4178 
4179 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3);
4180 
4181 	/*
4182 	 * Update uberblock to indicate that scratch space is valid.  This is
4183 	 * needed because after this point, the real location may be
4184 	 * overwritten.  If we crash, we need to get the data from the
4185 	 * scratch space, rather than the real location.
4186 	 *
4187 	 * Note: ub_timestamp is bumped so that vdev_uberblock_compare()
4188 	 * will prefer this uberblock.
4189 	 */
4190 	RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size);
4191 	spa->spa_ubsync.ub_timestamp++;
4192 	ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4193 	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4194 	if (spa_multihost(spa))
4195 		mmp_update_uberblock(spa, &spa->spa_ubsync);
4196 
4197 	zfs_dbgmsg("reflow: uberblock updated "
4198 	    "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)",
4199 	    (long long)spa->spa_ubsync.ub_txg,
4200 	    (long long)logical_size,
4201 	    (long long)spa->spa_ubsync.ub_timestamp);
4202 
4203 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID);
4204 
4205 	/*
4206 	 * Overwrite with reflow'ed data.
4207 	 */
4208 overwrite:
4209 	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4210 	for (int i = 0; i < raidvd->vdev_children; i++) {
4211 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4212 		    0, abds[i], write_size, ZIO_TYPE_WRITE,
4213 		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL,
4214 		    raidz_scratch_child_done, pio));
4215 	}
4216 	error = zio_wait(pio);
4217 	if (error != 0) {
4218 		/*
4219 		 * When we exit early here and drop the range lock, new
4220 		 * writes will go into the scratch area so we'll need to
4221 		 * read from there when we return after pausing.
4222 		 */
4223 		zfs_dbgmsg("reflow: error %d writing real location", error);
4224 		/*
4225 		 * Update the uberblock that is written when this txg completes.
4226 		 */
4227 		RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID,
4228 		    logical_size);
4229 		goto io_error_exit;
4230 	}
4231 	pio = zio_root(spa, NULL, NULL, 0);
4232 	zio_flush(pio, raidvd);
4233 	zio_wait(pio);
4234 
4235 	zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location",
4236 	    (long long)logical_size);
4237 	for (int i = 0; i < raidvd->vdev_children; i++)
4238 		abd_free(abds[i]);
4239 	kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4240 
4241 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED);
4242 
4243 	/*
4244 	 * Update uberblock to indicate that the initial part has been
4245 	 * reflow'ed.  This is needed because after this point (when we exit
4246 	 * the rangelock), we allow regular writes to this region, which will
4247 	 * be written to the new location only (because reflow_offset_next ==
4248 	 * reflow_offset_synced).  If we crashed and re-copied from the
4249 	 * scratch space, we would lose the regular writes.
4250 	 */
4251 	RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED,
4252 	    logical_size);
4253 	spa->spa_ubsync.ub_timestamp++;
4254 	ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4255 	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4256 	if (spa_multihost(spa))
4257 		mmp_update_uberblock(spa, &spa->spa_ubsync);
4258 
4259 	zfs_dbgmsg("reflow: uberblock updated "
4260 	    "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
4261 	    (long long)spa->spa_ubsync.ub_txg,
4262 	    (long long)logical_size,
4263 	    (long long)spa->spa_ubsync.ub_timestamp);
4264 
4265 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1);
4266 
4267 	/*
4268 	 * Update progress.
4269 	 */
4270 	vre->vre_offset = logical_size;
4271 	zfs_rangelock_exit(lr);
4272 	spa_config_exit(spa, SCL_STATE, FTAG);
4273 
4274 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4275 	vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
4276 	vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
4277 	/*
4278 	 * Note - raidz_reflow_sync() will update the uberblock state to
4279 	 * RRSS_SCRATCH_INVALID_SYNCED_REFLOW
4280 	 */
4281 	raidz_reflow_sync(spa, tx);
4282 
4283 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2);
4284 }
4285 
4286 /*
4287  * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work
4288  * here.  No other i/o can be in progress, so we don't need the vre_rangelock.
4289  */
4290 void
4291 vdev_raidz_reflow_copy_scratch(spa_t *spa)
4292 {
4293 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4294 	uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock);
4295 	ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID);
4296 
4297 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4298 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4299 	ASSERT0(logical_size % raidvd->vdev_children);
4300 	uint64_t write_size = logical_size / raidvd->vdev_children;
4301 
4302 	zio_t *pio;
4303 
4304 	/*
4305 	 * Read from scratch space.
4306 	 */
4307 	abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
4308 	    KM_SLEEP);
4309 	for (int i = 0; i < raidvd->vdev_children; i++) {
4310 		abds[i] = abd_alloc_linear(write_size, B_FALSE);
4311 	}
4312 
4313 	pio = zio_root(spa, NULL, NULL, 0);
4314 	for (int i = 0; i < raidvd->vdev_children; i++) {
4315 		/*
4316 		 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
4317 		 * the offset to calculate the physical offset to write to.
4318 		 * Passing in a negative offset lets us access the boot area.
4319 		 */
4320 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4321 		    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4322 		    write_size, ZIO_TYPE_READ,
4323 		    ZIO_PRIORITY_ASYNC_READ, 0,
4324 		    raidz_scratch_child_done, pio));
4325 	}
4326 	zio_wait(pio);
4327 
4328 	/*
4329 	 * Overwrite real location with reflow'ed data.
4330 	 */
4331 	pio = zio_root(spa, NULL, NULL, 0);
4332 	for (int i = 0; i < raidvd->vdev_children; i++) {
4333 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4334 		    0, abds[i], write_size, ZIO_TYPE_WRITE,
4335 		    ZIO_PRIORITY_ASYNC_WRITE, 0,
4336 		    raidz_scratch_child_done, pio));
4337 	}
4338 	zio_wait(pio);
4339 	pio = zio_root(spa, NULL, NULL, 0);
4340 	zio_flush(pio, raidvd);
4341 	zio_wait(pio);
4342 
4343 	zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) "
4344 	    "to real location", (long long)logical_size);
4345 
4346 	for (int i = 0; i < raidvd->vdev_children; i++)
4347 		abd_free(abds[i]);
4348 	kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4349 
4350 	/*
4351 	 * Update uberblock.
4352 	 */
4353 	RAIDZ_REFLOW_SET(&spa->spa_ubsync,
4354 	    RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size);
4355 	spa->spa_ubsync.ub_timestamp++;
4356 	VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4357 	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4358 	if (spa_multihost(spa))
4359 		mmp_update_uberblock(spa, &spa->spa_ubsync);
4360 
4361 	zfs_dbgmsg("reflow recovery: uberblock updated "
4362 	    "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
4363 	    (long long)spa->spa_ubsync.ub_txg,
4364 	    (long long)logical_size,
4365 	    (long long)spa->spa_ubsync.ub_timestamp);
4366 
4367 	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool,
4368 	    spa_first_txg(spa));
4369 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4370 	vre->vre_offset = logical_size;
4371 	vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
4372 	vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
4373 	/*
4374 	 * Note that raidz_reflow_sync() will update the uberblock once more
4375 	 */
4376 	raidz_reflow_sync(spa, tx);
4377 
4378 	dmu_tx_commit(tx);
4379 
4380 	spa_config_exit(spa, SCL_STATE, FTAG);
4381 }
4382 
4383 static boolean_t
4384 spa_raidz_expand_thread_check(void *arg, zthr_t *zthr)
4385 {
4386 	(void) zthr;
4387 	spa_t *spa = arg;
4388 
4389 	return (spa->spa_raidz_expand != NULL &&
4390 	    !spa->spa_raidz_expand->vre_waiting_for_resilver);
4391 }
4392 
4393 /*
4394  * RAIDZ expansion background thread
4395  *
4396  * Can be called multiple times if the reflow is paused
4397  */
4398 static void
4399 spa_raidz_expand_thread(void *arg, zthr_t *zthr)
4400 {
4401 	spa_t *spa = arg;
4402 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4403 
4404 	if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID)
4405 		vre->vre_offset = 0;
4406 	else
4407 		vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync);
4408 
4409 	/* Reflow the begining portion using the scratch area */
4410 	if (vre->vre_offset == 0) {
4411 		VERIFY0(dsl_sync_task(spa_name(spa),
4412 		    NULL, raidz_reflow_scratch_sync,
4413 		    vre, 0, ZFS_SPACE_CHECK_NONE));
4414 
4415 		/* if we encountered errors then pause */
4416 		if (vre->vre_offset == 0) {
4417 			mutex_enter(&vre->vre_lock);
4418 			vre->vre_waiting_for_resilver = B_TRUE;
4419 			mutex_exit(&vre->vre_lock);
4420 			return;
4421 		}
4422 	}
4423 
4424 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4425 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4426 
4427 	uint64_t guid = raidvd->vdev_guid;
4428 
4429 	/* Iterate over all the remaining metaslabs */
4430 	for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift;
4431 	    i < raidvd->vdev_ms_count &&
4432 	    !zthr_iscancelled(zthr) &&
4433 	    vre->vre_failed_offset == UINT64_MAX; i++) {
4434 		metaslab_t *msp = raidvd->vdev_ms[i];
4435 
4436 		metaslab_disable(msp);
4437 		mutex_enter(&msp->ms_lock);
4438 
4439 		/*
4440 		 * The metaslab may be newly created (for the expanded
4441 		 * space), in which case its trees won't exist yet,
4442 		 * so we need to bail out early.
4443 		 */
4444 		if (msp->ms_new) {
4445 			mutex_exit(&msp->ms_lock);
4446 			metaslab_enable(msp, B_FALSE, B_FALSE);
4447 			continue;
4448 		}
4449 
4450 		VERIFY0(metaslab_load(msp));
4451 
4452 		/*
4453 		 * We want to copy everything except the free (allocatable)
4454 		 * space.  Note that there may be a little bit more free
4455 		 * space (e.g. in ms_defer), and it's fine to copy that too.
4456 		 */
4457 		range_tree_t *rt = range_tree_create(NULL, RANGE_SEG64,
4458 		    NULL, 0, 0);
4459 		range_tree_add(rt, msp->ms_start, msp->ms_size);
4460 		range_tree_walk(msp->ms_allocatable, range_tree_remove, rt);
4461 		mutex_exit(&msp->ms_lock);
4462 
4463 		/*
4464 		 * Force the last sector of each metaslab to be copied.  This
4465 		 * ensures that we advance the on-disk progress to the end of
4466 		 * this metaslab while the metaslab is disabled.  Otherwise, we
4467 		 * could move past this metaslab without advancing the on-disk
4468 		 * progress, and then an allocation to this metaslab would not
4469 		 * be copied.
4470 		 */
4471 		int sectorsz = 1 << raidvd->vdev_ashift;
4472 		uint64_t ms_last_offset = msp->ms_start +
4473 		    msp->ms_size - sectorsz;
4474 		if (!range_tree_contains(rt, ms_last_offset, sectorsz)) {
4475 			range_tree_add(rt, ms_last_offset, sectorsz);
4476 		}
4477 
4478 		/*
4479 		 * When we are resuming from a paused expansion (i.e.
4480 		 * when importing a pool with a expansion in progress),
4481 		 * discard any state that we have already processed.
4482 		 */
4483 		range_tree_clear(rt, 0, vre->vre_offset);
4484 
4485 		while (!zthr_iscancelled(zthr) &&
4486 		    !range_tree_is_empty(rt) &&
4487 		    vre->vre_failed_offset == UINT64_MAX) {
4488 
4489 			/*
4490 			 * We need to periodically drop the config lock so that
4491 			 * writers can get in.  Additionally, we can't wait
4492 			 * for a txg to sync while holding a config lock
4493 			 * (since a waiting writer could cause a 3-way deadlock
4494 			 * with the sync thread, which also gets a config
4495 			 * lock for reader).  So we can't hold the config lock
4496 			 * while calling dmu_tx_assign().
4497 			 */
4498 			spa_config_exit(spa, SCL_CONFIG, FTAG);
4499 
4500 			/*
4501 			 * If requested, pause the reflow when the amount
4502 			 * specified by raidz_expand_max_reflow_bytes is reached
4503 			 *
4504 			 * This pause is only used during testing or debugging.
4505 			 */
4506 			while (raidz_expand_max_reflow_bytes != 0 &&
4507 			    raidz_expand_max_reflow_bytes <=
4508 			    vre->vre_bytes_copied && !zthr_iscancelled(zthr)) {
4509 				delay(hz);
4510 			}
4511 
4512 			mutex_enter(&vre->vre_lock);
4513 			while (vre->vre_outstanding_bytes >
4514 			    raidz_expand_max_copy_bytes) {
4515 				cv_wait(&vre->vre_cv, &vre->vre_lock);
4516 			}
4517 			mutex_exit(&vre->vre_lock);
4518 
4519 			dmu_tx_t *tx =
4520 			    dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
4521 
4522 			VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
4523 			uint64_t txg = dmu_tx_get_txg(tx);
4524 
4525 			/*
4526 			 * Reacquire the vdev_config lock.  Theoretically, the
4527 			 * vdev_t that we're expanding may have changed.
4528 			 */
4529 			spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4530 			raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4531 
4532 			boolean_t needsync =
4533 			    raidz_reflow_impl(raidvd, vre, rt, tx);
4534 
4535 			dmu_tx_commit(tx);
4536 
4537 			if (needsync) {
4538 				spa_config_exit(spa, SCL_CONFIG, FTAG);
4539 				txg_wait_synced(spa->spa_dsl_pool, txg);
4540 				spa_config_enter(spa, SCL_CONFIG, FTAG,
4541 				    RW_READER);
4542 			}
4543 		}
4544 
4545 		spa_config_exit(spa, SCL_CONFIG, FTAG);
4546 
4547 		metaslab_enable(msp, B_FALSE, B_FALSE);
4548 		range_tree_vacate(rt, NULL, NULL);
4549 		range_tree_destroy(rt);
4550 
4551 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4552 		raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4553 	}
4554 
4555 	spa_config_exit(spa, SCL_CONFIG, FTAG);
4556 
4557 	/*
4558 	 * The txg_wait_synced() here ensures that all reflow zio's have
4559 	 * completed, and vre_failed_offset has been set if necessary.  It
4560 	 * also ensures that the progress of the last raidz_reflow_sync() is
4561 	 * written to disk before raidz_reflow_complete_sync() changes the
4562 	 * in-memory vre_state.  vdev_raidz_io_start() uses vre_state to
4563 	 * determine if a reflow is in progress, in which case we may need to
4564 	 * write to both old and new locations.  Therefore we can only change
4565 	 * vre_state once this is not necessary, which is once the on-disk
4566 	 * progress (in spa_ubsync) has been set past any possible writes (to
4567 	 * the end of the last metaslab).
4568 	 */
4569 	txg_wait_synced(spa->spa_dsl_pool, 0);
4570 
4571 	if (!zthr_iscancelled(zthr) &&
4572 	    vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) {
4573 		/*
4574 		 * We are not being canceled or paused, so the reflow must be
4575 		 * complete. In that case also mark it as completed on disk.
4576 		 */
4577 		ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX);
4578 		VERIFY0(dsl_sync_task(spa_name(spa), NULL,
4579 		    raidz_reflow_complete_sync, spa,
4580 		    0, ZFS_SPACE_CHECK_NONE));
4581 		(void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL);
4582 	} else {
4583 		/*
4584 		 * Wait for all copy zio's to complete and for all the
4585 		 * raidz_reflow_sync() synctasks to be run.
4586 		 */
4587 		spa_history_log_internal(spa, "reflow pause",
4588 		    NULL, "offset=%llu failed_offset=%lld",
4589 		    (long long)vre->vre_offset,
4590 		    (long long)vre->vre_failed_offset);
4591 		mutex_enter(&vre->vre_lock);
4592 		if (vre->vre_failed_offset != UINT64_MAX) {
4593 			/*
4594 			 * Reset progress so that we will retry everything
4595 			 * after the point that something failed.
4596 			 */
4597 			vre->vre_offset = vre->vre_failed_offset;
4598 			vre->vre_failed_offset = UINT64_MAX;
4599 			vre->vre_waiting_for_resilver = B_TRUE;
4600 		}
4601 		mutex_exit(&vre->vre_lock);
4602 	}
4603 }
4604 
4605 void
4606 spa_start_raidz_expansion_thread(spa_t *spa)
4607 {
4608 	ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL);
4609 	spa->spa_raidz_expand_zthr = zthr_create("raidz_expand",
4610 	    spa_raidz_expand_thread_check, spa_raidz_expand_thread,
4611 	    spa, defclsyspri);
4612 }
4613 
4614 void
4615 raidz_dtl_reassessed(vdev_t *vd)
4616 {
4617 	spa_t *spa = vd->vdev_spa;
4618 	if (spa->spa_raidz_expand != NULL) {
4619 		vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4620 		/*
4621 		 * we get called often from vdev_dtl_reassess() so make
4622 		 * sure it's our vdev and any replacing is complete
4623 		 */
4624 		if (vd->vdev_top->vdev_id == vre->vre_vdev_id &&
4625 		    !vdev_raidz_expand_child_replacing(vd->vdev_top)) {
4626 			mutex_enter(&vre->vre_lock);
4627 			if (vre->vre_waiting_for_resilver) {
4628 				vdev_dbgmsg(vd, "DTL reassessed, "
4629 				    "continuing raidz expansion");
4630 				vre->vre_waiting_for_resilver = B_FALSE;
4631 				zthr_wakeup(spa->spa_raidz_expand_zthr);
4632 			}
4633 			mutex_exit(&vre->vre_lock);
4634 		}
4635 	}
4636 }
4637 
4638 int
4639 vdev_raidz_attach_check(vdev_t *new_child)
4640 {
4641 	vdev_t *raidvd = new_child->vdev_parent;
4642 	uint64_t new_children = raidvd->vdev_children;
4643 
4644 	/*
4645 	 * We use the "boot" space as scratch space to handle overwriting the
4646 	 * initial part of the vdev.  If it is too small, then this expansion
4647 	 * is not allowed.  This would be very unusual (e.g. ashift > 13 and
4648 	 * >200 children).
4649 	 */
4650 	if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) {
4651 		return (EINVAL);
4652 	}
4653 	return (0);
4654 }
4655 
4656 void
4657 vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx)
4658 {
4659 	vdev_t *new_child = arg;
4660 	spa_t *spa = new_child->vdev_spa;
4661 	vdev_t *raidvd = new_child->vdev_parent;
4662 	vdev_raidz_t *vdrz = raidvd->vdev_tsd;
4663 	ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops);
4664 	ASSERT3P(raidvd->vdev_top, ==, raidvd);
4665 	ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width);
4666 	ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1);
4667 	ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==,
4668 	    new_child);
4669 
4670 	spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx);
4671 
4672 	vdrz->vd_physical_width++;
4673 
4674 	VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info);
4675 	vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id;
4676 	vdrz->vn_vre.vre_offset = 0;
4677 	vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
4678 	spa->spa_raidz_expand = &vdrz->vn_vre;
4679 	zthr_wakeup(spa->spa_raidz_expand_zthr);
4680 
4681 	/*
4682 	 * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get
4683 	 * written to the config.
4684 	 */
4685 	vdev_config_dirty(raidvd);
4686 
4687 	vdrz->vn_vre.vre_start_time = gethrestime_sec();
4688 	vdrz->vn_vre.vre_end_time = 0;
4689 	vdrz->vn_vre.vre_state = DSS_SCANNING;
4690 	vdrz->vn_vre.vre_bytes_copied = 0;
4691 
4692 	uint64_t state = vdrz->vn_vre.vre_state;
4693 	VERIFY0(zap_update(spa->spa_meta_objset,
4694 	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
4695 	    sizeof (state), 1, &state, tx));
4696 
4697 	uint64_t start_time = vdrz->vn_vre.vre_start_time;
4698 	VERIFY0(zap_update(spa->spa_meta_objset,
4699 	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
4700 	    sizeof (start_time), 1, &start_time, tx));
4701 
4702 	(void) zap_remove(spa->spa_meta_objset,
4703 	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx);
4704 	(void) zap_remove(spa->spa_meta_objset,
4705 	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx);
4706 
4707 	spa_history_log_internal(spa, "raidz vdev expansion started",  tx,
4708 	    "%s vdev %llu new width %llu", spa_name(spa),
4709 	    (unsigned long long)raidvd->vdev_id,
4710 	    (unsigned long long)raidvd->vdev_children);
4711 }
4712 
4713 int
4714 vdev_raidz_load(vdev_t *vd)
4715 {
4716 	vdev_raidz_t *vdrz = vd->vdev_tsd;
4717 	int err;
4718 
4719 	uint64_t state = DSS_NONE;
4720 	uint64_t start_time = 0;
4721 	uint64_t end_time = 0;
4722 	uint64_t bytes_copied = 0;
4723 
4724 	if (vd->vdev_top_zap != 0) {
4725 		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4726 		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
4727 		    sizeof (state), 1, &state);
4728 		if (err != 0 && err != ENOENT)
4729 			return (err);
4730 
4731 		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4732 		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
4733 		    sizeof (start_time), 1, &start_time);
4734 		if (err != 0 && err != ENOENT)
4735 			return (err);
4736 
4737 		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4738 		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
4739 		    sizeof (end_time), 1, &end_time);
4740 		if (err != 0 && err != ENOENT)
4741 			return (err);
4742 
4743 		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4744 		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
4745 		    sizeof (bytes_copied), 1, &bytes_copied);
4746 		if (err != 0 && err != ENOENT)
4747 			return (err);
4748 	}
4749 
4750 	/*
4751 	 * If we are in the middle of expansion, vre_state should have
4752 	 * already been set by vdev_raidz_init().
4753 	 */
4754 	EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING);
4755 	vdrz->vn_vre.vre_state = (dsl_scan_state_t)state;
4756 	vdrz->vn_vre.vre_start_time = start_time;
4757 	vdrz->vn_vre.vre_end_time = end_time;
4758 	vdrz->vn_vre.vre_bytes_copied = bytes_copied;
4759 
4760 	return (0);
4761 }
4762 
4763 int
4764 spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres)
4765 {
4766 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4767 
4768 	if (vre == NULL) {
4769 		/* no removal in progress; find most recent completed */
4770 		for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
4771 			vdev_t *vd = spa->spa_root_vdev->vdev_child[c];
4772 			if (vd->vdev_ops == &vdev_raidz_ops) {
4773 				vdev_raidz_t *vdrz = vd->vdev_tsd;
4774 
4775 				if (vdrz->vn_vre.vre_end_time != 0 &&
4776 				    (vre == NULL ||
4777 				    vdrz->vn_vre.vre_end_time >
4778 				    vre->vre_end_time)) {
4779 					vre = &vdrz->vn_vre;
4780 				}
4781 			}
4782 		}
4783 	}
4784 
4785 	if (vre == NULL) {
4786 		return (SET_ERROR(ENOENT));
4787 	}
4788 
4789 	pres->pres_state = vre->vre_state;
4790 	pres->pres_expanding_vdev = vre->vre_vdev_id;
4791 
4792 	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
4793 	pres->pres_to_reflow = vd->vdev_stat.vs_alloc;
4794 
4795 	mutex_enter(&vre->vre_lock);
4796 	pres->pres_reflowed = vre->vre_bytes_copied;
4797 	for (int i = 0; i < TXG_SIZE; i++)
4798 		pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i];
4799 	mutex_exit(&vre->vre_lock);
4800 
4801 	pres->pres_start_time = vre->vre_start_time;
4802 	pres->pres_end_time = vre->vre_end_time;
4803 	pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver;
4804 
4805 	return (0);
4806 }
4807 
4808 /*
4809  * Initialize private RAIDZ specific fields from the nvlist.
4810  */
4811 static int
4812 vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
4813 {
4814 	uint_t children;
4815 	nvlist_t **child;
4816 	int error = nvlist_lookup_nvlist_array(nv,
4817 	    ZPOOL_CONFIG_CHILDREN, &child, &children);
4818 	if (error != 0)
4819 		return (SET_ERROR(EINVAL));
4820 
4821 	uint64_t nparity;
4822 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) {
4823 		if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
4824 			return (SET_ERROR(EINVAL));
4825 
4826 		/*
4827 		 * Previous versions could only support 1 or 2 parity
4828 		 * device.
4829 		 */
4830 		if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2)
4831 			return (SET_ERROR(EINVAL));
4832 		else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3)
4833 			return (SET_ERROR(EINVAL));
4834 	} else {
4835 		/*
4836 		 * We require the parity to be specified for SPAs that
4837 		 * support multiple parity levels.
4838 		 */
4839 		if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
4840 			return (SET_ERROR(EINVAL));
4841 
4842 		/*
4843 		 * Otherwise, we default to 1 parity device for RAID-Z.
4844 		 */
4845 		nparity = 1;
4846 	}
4847 
4848 	vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
4849 	vdrz->vn_vre.vre_vdev_id = -1;
4850 	vdrz->vn_vre.vre_offset = UINT64_MAX;
4851 	vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
4852 	mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL);
4853 	cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL);
4854 	zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL);
4855 	mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL);
4856 	avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare,
4857 	    sizeof (reflow_node_t), offsetof(reflow_node_t, re_link));
4858 
4859 	vdrz->vd_physical_width = children;
4860 	vdrz->vd_nparity = nparity;
4861 
4862 	/* note, the ID does not exist when creating a pool */
4863 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
4864 	    &vdrz->vn_vre.vre_vdev_id);
4865 
4866 	boolean_t reflow_in_progress =
4867 	    nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
4868 	if (reflow_in_progress) {
4869 		spa->spa_raidz_expand = &vdrz->vn_vre;
4870 		vdrz->vn_vre.vre_state = DSS_SCANNING;
4871 	}
4872 
4873 	vdrz->vd_original_width = children;
4874 	uint64_t *txgs;
4875 	unsigned int txgs_size = 0;
4876 	error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
4877 	    &txgs, &txgs_size);
4878 	if (error == 0) {
4879 		for (int i = 0; i < txgs_size; i++) {
4880 			reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
4881 			re->re_txg = txgs[txgs_size - i - 1];
4882 			re->re_logical_width = vdrz->vd_physical_width - i;
4883 
4884 			if (reflow_in_progress)
4885 				re->re_logical_width--;
4886 
4887 			avl_add(&vdrz->vd_expand_txgs, re);
4888 		}
4889 
4890 		vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size;
4891 	}
4892 	if (reflow_in_progress) {
4893 		vdrz->vd_original_width--;
4894 		zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions",
4895 		    children, txgs_size);
4896 	}
4897 
4898 	*tsd = vdrz;
4899 
4900 	return (0);
4901 }
4902 
4903 static void
4904 vdev_raidz_fini(vdev_t *vd)
4905 {
4906 	vdev_raidz_t *vdrz = vd->vdev_tsd;
4907 	if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre)
4908 		vd->vdev_spa->spa_raidz_expand = NULL;
4909 	reflow_node_t *re;
4910 	void *cookie = NULL;
4911 	avl_tree_t *tree = &vdrz->vd_expand_txgs;
4912 	while ((re = avl_destroy_nodes(tree, &cookie)) != NULL)
4913 		kmem_free(re, sizeof (*re));
4914 	avl_destroy(&vdrz->vd_expand_txgs);
4915 	mutex_destroy(&vdrz->vd_expand_lock);
4916 	mutex_destroy(&vdrz->vn_vre.vre_lock);
4917 	cv_destroy(&vdrz->vn_vre.vre_cv);
4918 	zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock);
4919 	kmem_free(vdrz, sizeof (*vdrz));
4920 }
4921 
4922 /*
4923  * Add RAIDZ specific fields to the config nvlist.
4924  */
4925 static void
4926 vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
4927 {
4928 	ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
4929 	vdev_raidz_t *vdrz = vd->vdev_tsd;
4930 
4931 	/*
4932 	 * Make sure someone hasn't managed to sneak a fancy new vdev
4933 	 * into a crufty old storage pool.
4934 	 */
4935 	ASSERT(vdrz->vd_nparity == 1 ||
4936 	    (vdrz->vd_nparity <= 2 &&
4937 	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) ||
4938 	    (vdrz->vd_nparity <= 3 &&
4939 	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3));
4940 
4941 	/*
4942 	 * Note that we'll add these even on storage pools where they
4943 	 * aren't strictly required -- older software will just ignore
4944 	 * it.
4945 	 */
4946 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);
4947 
4948 	if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
4949 		fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
4950 	}
4951 
4952 	mutex_enter(&vdrz->vd_expand_lock);
4953 	if (!avl_is_empty(&vdrz->vd_expand_txgs)) {
4954 		uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs);
4955 		uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count,
4956 		    KM_SLEEP);
4957 		uint64_t i = 0;
4958 
4959 		for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs);
4960 		    re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) {
4961 			txgs[i++] = re->re_txg;
4962 		}
4963 
4964 		fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
4965 		    txgs, count);
4966 
4967 		kmem_free(txgs, sizeof (uint64_t) * count);
4968 	}
4969 	mutex_exit(&vdrz->vd_expand_lock);
4970 }
4971 
4972 static uint64_t
4973 vdev_raidz_nparity(vdev_t *vd)
4974 {
4975 	vdev_raidz_t *vdrz = vd->vdev_tsd;
4976 	return (vdrz->vd_nparity);
4977 }
4978 
4979 static uint64_t
4980 vdev_raidz_ndisks(vdev_t *vd)
4981 {
4982 	return (vd->vdev_children);
4983 }
4984 
4985 vdev_ops_t vdev_raidz_ops = {
4986 	.vdev_op_init = vdev_raidz_init,
4987 	.vdev_op_fini = vdev_raidz_fini,
4988 	.vdev_op_open = vdev_raidz_open,
4989 	.vdev_op_close = vdev_raidz_close,
4990 	.vdev_op_asize = vdev_raidz_asize,
4991 	.vdev_op_min_asize = vdev_raidz_min_asize,
4992 	.vdev_op_min_alloc = NULL,
4993 	.vdev_op_io_start = vdev_raidz_io_start,
4994 	.vdev_op_io_done = vdev_raidz_io_done,
4995 	.vdev_op_state_change = vdev_raidz_state_change,
4996 	.vdev_op_need_resilver = vdev_raidz_need_resilver,
4997 	.vdev_op_hold = NULL,
4998 	.vdev_op_rele = NULL,
4999 	.vdev_op_remap = NULL,
5000 	.vdev_op_xlate = vdev_raidz_xlate,
5001 	.vdev_op_rebuild_asize = NULL,
5002 	.vdev_op_metaslab_init = NULL,
5003 	.vdev_op_config_generate = vdev_raidz_config_generate,
5004 	.vdev_op_nparity = vdev_raidz_nparity,
5005 	.vdev_op_ndisks = vdev_raidz_ndisks,
5006 	.vdev_op_type = VDEV_TYPE_RAIDZ,	/* name of this vdev type */
5007 	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
5008 };
5009 
5010 /* BEGIN CSTYLED */
5011 ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW,
5012 	"For testing, pause RAIDZ expansion after reflowing this many bytes");
5013 ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW,
5014 	"Max amount of concurrent i/o for RAIDZ expansion");
5015 ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW,
5016 	"For expanded RAIDZ, aggregate reads that have more rows than this");
5017 ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW,
5018 	"For expanded RAIDZ, automatically start a pool scrub when expansion "
5019 	"completes");
5020 /* END CSTYLED */
5021