1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
25 * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
26 */
27
28 #include <sys/zfs_context.h>
29 #include <sys/spa.h>
30 #include <sys/spa_impl.h>
31 #include <sys/zap.h>
32 #include <sys/vdev_impl.h>
33 #include <sys/metaslab_impl.h>
34 #include <sys/zio.h>
35 #include <sys/zio_checksum.h>
36 #include <sys/dmu_tx.h>
37 #include <sys/abd.h>
38 #include <sys/zfs_rlock.h>
39 #include <sys/fs/zfs.h>
40 #include <sys/fm/fs/zfs.h>
41 #include <sys/vdev_raidz.h>
42 #include <sys/vdev_raidz_impl.h>
43 #include <sys/vdev_draid.h>
44 #include <sys/uberblock_impl.h>
45 #include <sys/dsl_scan.h>
46
47 #ifdef ZFS_DEBUG
48 #include <sys/vdev.h> /* For vdev_xlate() in vdev_raidz_io_verify() */
49 #endif
50
51 /*
52 * Virtual device vector for RAID-Z.
53 *
54 * This vdev supports single, double, and triple parity. For single parity,
55 * we use a simple XOR of all the data columns. For double or triple parity,
56 * we use a special case of Reed-Solomon coding. This extends the
57 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
58 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
59 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
60 * former is also based. The latter is designed to provide higher performance
61 * for writes.
62 *
63 * Note that the Plank paper claimed to support arbitrary N+M, but was then
64 * amended six years later identifying a critical flaw that invalidates its
65 * claims. Nevertheless, the technique can be adapted to work for up to
66 * triple parity. For additional parity, the amendment "Note: Correction to
67 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
68 * is viable, but the additional complexity means that write performance will
69 * suffer.
70 *
71 * All of the methods above operate on a Galois field, defined over the
72 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
73 * can be expressed with a single byte. Briefly, the operations on the
74 * field are defined as follows:
75 *
76 * o addition (+) is represented by a bitwise XOR
77 * o subtraction (-) is therefore identical to addition: A + B = A - B
78 * o multiplication of A by 2 is defined by the following bitwise expression:
79 *
80 * (A * 2)_7 = A_6
81 * (A * 2)_6 = A_5
82 * (A * 2)_5 = A_4
83 * (A * 2)_4 = A_3 + A_7
84 * (A * 2)_3 = A_2 + A_7
85 * (A * 2)_2 = A_1 + A_7
86 * (A * 2)_1 = A_0
87 * (A * 2)_0 = A_7
88 *
89 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
90 * As an aside, this multiplication is derived from the error correcting
91 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
92 *
93 * Observe that any number in the field (except for 0) can be expressed as a
94 * power of 2 -- a generator for the field. We store a table of the powers of
95 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
96 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
97 * than field addition). The inverse of a field element A (A^-1) is therefore
98 * A ^ (255 - 1) = A^254.
99 *
100 * The up-to-three parity columns, P, Q, R over several data columns,
101 * D_0, ... D_n-1, can be expressed by field operations:
102 *
103 * P = D_0 + D_1 + ... + D_n-2 + D_n-1
104 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
105 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
106 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
107 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
108 *
109 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
110 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
111 * independent coefficients. (There are no additional coefficients that have
112 * this property which is why the uncorrected Plank method breaks down.)
113 *
114 * See the reconstruction code below for how P, Q and R can be used individually
115 * or in concert to recover missing data columns.
116 */
117
118 #define VDEV_RAIDZ_P 0
119 #define VDEV_RAIDZ_Q 1
120 #define VDEV_RAIDZ_R 2
121
122 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
123 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
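
/*
 * For illustration only (not used by the driver): a full GF(2^8) multiply
 * can be built from the multiply-by-2 primitive above using the classic
 * shift-and-add ("Russian peasant") method.  In practice general
 * multiplication uses the vdev_raidz_pow2[]/vdev_raidz_log2[] lookup tables
 * described above; this sketch just makes the field arithmetic concrete.
 */
static inline uint8_t
vdev_raidz_gf_mul_sketch(uint8_t a, uint8_t b)
{
    uint8_t p = 0;

    while (b != 0) {
        if (b & 1)
            p ^= a;                         /* GF addition is XOR */
        a = (uint8_t)VDEV_RAIDZ_MUL_2(a);   /* GF multiplication by 2 */
        b >>= 1;
    }
    return (p);
}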
124
125 /*
126 * We provide a mechanism to perform the field multiplication operation on a
127 * 64-bit value all at once rather than a byte at a time. This works by
128 * creating a mask from the top bit in each byte and using that to
129 * conditionally apply the XOR of 0x1d.
130 */
131 #define VDEV_RAIDZ_64MUL_2(x, mask) \
132 { \
133 (mask) = (x) & 0x8080808080808080ULL; \
134 (mask) = ((mask) << 1) - ((mask) >> 7); \
135 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
136 ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
137 }
138
139 #define VDEV_RAIDZ_64MUL_4(x, mask) \
140 { \
141 VDEV_RAIDZ_64MUL_2((x), mask); \
142 VDEV_RAIDZ_64MUL_2((x), mask); \
143 }
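
/*
 * Sketch of the equivalence (illustration only, not used by the driver):
 * VDEV_RAIDZ_64MUL_2() above produces the same result as applying
 * VDEV_RAIDZ_MUL_2() independently to each of the eight bytes packed into
 * the 64-bit word; it just avoids unpacking them.
 */
static inline uint64_t
vdev_raidz_64mul_2_bytewise_sketch(uint64_t x)
{
    uint64_t y = 0;

    for (int i = 0; i < 8; i++) {
        uint8_t b = (x >> (i * 8)) & 0xff;
        y |= (uint64_t)(uint8_t)VDEV_RAIDZ_MUL_2(b) << (i * 8);
    }
    return (y);
}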
144
145
146 /*
147 * Big Theory Statement for how a RAIDZ VDEV is expanded
148 *
149 * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion
150 * works with all three RAIDZ parity choices (RAIDZ1, 2, and 3). VDEVs
151 * that have been previously expanded can be expanded again.
152 *
153 * The RAIDZ VDEV must be healthy (able to write to all the drives in
154 * the VDEV) when an expansion starts. The expansion will pause if any
155 * disk in the VDEV fails, and resume once the VDEV is healthy again. All other
156 * operations on the pool can continue while an expansion is in progress (e.g.
157 * read/write, snapshot, zpool add, etc.), except zpool checkpoint, zpool trim,
158 * and zpool initialize, which can't be run during an expansion. Following a
159 * reboot or export/import, the expansion resumes where it left off.
160 *
161 * == Reflowing the Data ==
162 *
163 * The expansion involves reflowing (copying) the data from the current set
164 * of disks to spread it across the new set which now has one more disk. This
165 * reflow operation is similar to reflowing text when the column width of a
166 * text editor window is expanded. The text doesn’t change but the location of
167 * the text changes to accommodate the new width. An example reflow result for
168 * a 4-wide RAIDZ1 to a 5-wide is shown below.
169 *
170 * Reflow End State
171 * Each letter indicates a parity group (logical stripe)
172 *
173 *       Before expansion                          After Expansion
174 *    D1     D2     D3     D4            D1     D2     D3     D4     D5
175 * +------+------+------+------+     +------+------+------+------+------+
176 * |      |      |      |      |     |      |      |      |      |      |
177 * |  A   |  A   |  A   |  A   |     |  A   |  A   |  A   |  A   |  B   |
178 * |     1|     2|     3|     4|     |     1|     2|     3|     4|     5|
179 * +------+------+------+------+     +------+------+------+------+------+
180 * |      |      |      |      |     |      |      |      |      |      |
181 * |  B   |  B   |  C   |  C   |     |  B   |  C   |  C   |  C   |  C   |
182 * |     5|     6|     7|     8|     |     6|     7|     8|     9|    10|
183 * +------+------+------+------+     +------+------+------+------+------+
184 * |      |      |      |      |     |      |      |      |      |      |
185 * |  C   |  C   |  D   |  D   |     |  D   |  D   |  E   |  E   |  E   |
186 * |     9|    10|    11|    12|     |    11|    12|    13|    14|    15|
187 * +------+------+------+------+     +------+------+------+------+------+
188 * |      |      |      |      |     |      |      |      |      |      |
189 * |  E   |  E   |  E   |  E   | --> |  E   |  F   |  F   |  G   |  G   |
190 * |    13|    14|    15|    16|     |    16|    17|    18|  p 19|    20|
191 * +------+------+------+------+     +------+------+------+------+------+
192 * |      |      |      |      |     |      |      |      |      |      |
193 * |  F   |  F   |  G   |  G   |     |  G   |  G   |  H   |  H   |  H   |
194 * |    17|    18|    19|    20|     |    21|    22|    23|    24|    25|
195 * +------+------+------+------+     +------+------+------+------+------+
196 * |      |      |      |      |     |      |      |      |      |      |
197 * |  G   |  G   |  H   |  H   |     |  H   |  I   |  I   |  J   |  J   |
198 * |    21|    22|    23|    24|     |    26|    27|    28|    29|    30|
199 * +------+------+------+------+     +------+------+------+------+------+
200 * |      |      |      |      |     |      |      |      |      |      |
201 * |  H   |  H   |  I   |  I   |     |  J   |  J   |      |      |  K   |
202 * |    25|    26|    27|    28|     |    31|    32|    33|    34|    35|
203 * +------+------+------+------+     +------+------+------+------+------+
204 *
205 * This reflow approach has several advantages. There is no need to read or
206 * modify the block pointers or recompute any block checksums. The reflow
207 * doesn’t need to know where the parity sectors reside. We can read and write
208 * data sequentially and the copy can occur in a background thread in open
209 * context. The design also allows for fast discovery of what data to copy.
210 *
211 * The VDEV metaslabs are processed, one at a time, to copy the block data to
212 * have it flow across all the disks. The metaslab is disabled for allocations
213 * during the copy. As an optimization, we only copy the allocated data which
214 * can be determined by looking at the metaslab range tree. During the copy we
215 * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still
216 * need to be able to survive losing parity count disks). This means we
217 * cannot overwrite data during the reflow that would be needed if a disk is
218 * lost.
219 *
220 * After the reflow completes, all newly-written blocks will have the new
221 * layout, i.e., they will have the parity to data ratio implied by the new
222 * number of disks in the RAIDZ group. Even though the reflow copies all of
223 * the allocated space (data and parity), it is only rearranged, not changed.
224 *
225 * This act of reflowing the data has a few implications about blocks
226 * that were written before the reflow completes:
227 *
228 * - Old blocks will still use the same amount of space (i.e., they will have
229 * the parity to data ratio implied by the old number of disks in the RAIDZ
230 * group).
231 * - Reading old blocks will be slightly slower than before the reflow, for
232 * two reasons. First, we will have to read from all disks in the RAIDZ
233 * VDEV, rather than being able to skip the children that contain only
234 * parity of this block (because the data of a single block is now spread
235 * out across all the disks). Second, in most cases there will be an extra
236 * bcopy, needed to rearrange the data back to its original layout in memory.
237 *
238 * == Scratch Area ==
239 *
240 * As we copy the block data, we can only progress to the point that writes
241 * will not overlap with blocks whose progress has not yet been recorded on
242 * disk. Since partially-copied rows are always read from the old location,
243 * we need to stop one row before the sector-wise overlap, to prevent any
244 * row-wise overlap. For example, in the diagram above, when we reflow sector
245 * B6 it will overwrite the original location for B5.
246 *
247 * To get around this, a scratch space is used so that we can start copying
248 * without risking data loss by overlapping the row. As an added benefit, it
249 * improves performance at the beginning of the reflow, but that small perf
250 * boost wouldn't be worth the complexity on its own.
251 *
252 * Ideally we want to copy at least 2 * (new_width)^2 so that we have a
253 * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max
254 * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice
255 * the widths will likely be single digits so we can get a substantial chunk
256 * size using only a few MB of scratch per disk.
257 *
258 * The scratch area is persisted to disk and holds a large amount of reflowed
259 * state, so we can always read the partially written stripes when a disk fails
260 * or the copy is interrupted (crash) during the initial copying phase, and also
261 * get past a small chunk size restriction. At a minimum, the scratch space
262 * must be large enough to get us to the point that one row does not overlap
263 * itself when moved (i.e. new_width^2). But going larger is even better. We
264 * use the 3.5 MiB reserved "boot" space that resides after the ZFS disk labels
265 * as our scratch space to handle overwriting the initial part of the VDEV.
266 *
267 * 0      256K   512K                    4M
268 * +------+------+-----------------------+-----------------------------
269 * | VDEV | VDEV |   Boot Block (3.5M)   | Allocatable space ...
270 * |  L0  |  L1  |       Reserved        |      (Metaslabs)
271 * +------+------+-----------------------+-------------------------------
272 *                     Scratch Area
273 *
274 * == Reflow Progress Updates ==
 *
275 * After the initial scratch-based reflow, the expansion process works
276 * similarly to device removal. We create a new open context thread which
277 * reflows the data, and periodically kicks off sync tasks to update logical
278 * state. In this case, state is the committed progress (offset of next data
279 * to copy). We need to persist the completed offset on disk, so that if we
280 * crash we know which format each VDEV offset is in.
281 *
282 * == Time Dependent Geometry ==
283 *
284 * In non-expanded RAIDZ, blocks are read from disk in a column by column
285 * fashion. For a multi-row block, the second sector is in the first column
286 * not in the second column. This allows us to issue full reads for each
287 * column directly into the request buffer. The block data is thus laid out
288 * sequentially in a column-by-column fashion.
289 *
290 * For example, in the before expansion diagram above, one logical block might
291 * be sectors G19-H26. The parity is in G19,H23; and the data is in
292 * G20,H24,G21,H25,G22,H26.
293 *
294 * After a block is reflowed, the sectors that were all in the original column
295 * data can now reside in different columns. When reading from an expanded
296 * VDEV, we need to know the logical stripe width for each block so we can
297 * reconstitute the block’s data after the reads are completed. Likewise,
298 * when we perform the combinatorial reconstruction we need to know the
299 * original width so we can retry combinations from the past layouts.
300 *
301 * Time dependent geometry is what we call having blocks with different layouts
302 * (stripe widths) in the same VDEV. This time-dependent geometry uses the
303 * block’s birth time (+ the time expansion ended) to establish the correct
304 * width for a given block. After an expansion completes, we record the time
305 * for blocks written with a particular width (geometry). A small sketch of
 * this lookup follows this comment block.
306 *
307 * == On Disk Format Changes ==
308 *
309 * A new pool feature flag, 'raidz_expansion', is added; its reference count
310 * is the number of RAIDZ VDEVs that have been expanded.
311 *
312 * The blocks on an expanded RAIDZ VDEV can have different logical stripe widths.
313 *
314 * Since the uberblock can point to arbitrary blocks, which might be on the
315 * expanding RAIDZ and might or might not have been expanded, we need to know
316 * which way a block is laid out before reading it. This info is the next
317 * offset that needs to be reflowed, and we persist that in the uberblock, in
318 * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev label.
319 * After the expansion is complete, we then use the raidz_expand_txgs array
320 * (see below) to determine how to read a block, and the ub_raidz_reflow_info
321 * field is no longer required.
322 *
323 * The uberblock's ub_raidz_reflow_info field also holds the scratch space
324 * state (i.e., active or not) which is also required before reading a block
325 * during the initial phase of reflowing the data.
326 *
327 * The top-level RAIDZ VDEV has two new entries in the nvlist:
328 *
329 * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here
330 * and used after the expansion is complete to
331 * determine how to read a raidz block
332 * 'raidz_expanding' boolean: present during reflow and removed after completion
333 * used during a spa import to resume an unfinished
334 * expansion
335 *
336 * And finally, the VDEV's top zap adds the following informational entries:
337 * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE
338 * VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME
339 * VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME
340 * VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED
341 */
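
/*
 * Illustrative sketch of the Time Dependent Geometry lookup described above
 * (this is not the driver's implementation, and the names below are
 * hypothetical): the logical stripe width used to read a block is derived
 * from the block's birth txg and the expansion completion txgs recorded in
 * the 'raidz_expand_txgs' array.  Every expansion that had already completed
 * when the block was born added one disk to the width it was written with;
 * the exact txg boundary handling is done by the real lookup code.
 */
static inline uint64_t
raidz_logical_width_sketch(uint64_t birth_txg, const uint64_t *expand_txgs,
    uint64_t nexpansions, uint64_t original_width)
{
    uint64_t width = original_width;

    for (uint64_t i = 0; i < nexpansions; i++) {
        if (expand_txgs[i] <= birth_txg)
            width++;
    }
    return (width);
}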
342
343 /*
344 * For testing only: pause the raidz expansion after reflowing this amount.
345 * (accessed by ZTS and ztest)
346 */
347 #ifdef _KERNEL
348 static
349 #endif /* _KERNEL */
350 unsigned long raidz_expand_max_reflow_bytes = 0;
351
352 /*
353 * For testing only: pause the raidz expansion at a certain point.
354 */
355 uint_t raidz_expand_pause_point = 0;
356
357 /*
358 * Maximum amount of copy I/O outstanding at once.
359 */
360 static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE;
361
362 /*
363 * Apply raidz map abds aggregation if the number of rows in the map is equal
364 * to or greater than the value below.
365 */
366 static unsigned long raidz_io_aggregate_rows = 4;
367
368 /*
369 * Automatically start a pool scrub when a RAIDZ expansion completes in
370 * order to verify the checksums of all blocks which have been copied
371 * during the expansion. Automatic scrubbing is enabled by default and
372 * is strongly recommended.
373 */
374 static int zfs_scrub_after_expand = 1;
375
376 static void
377 vdev_raidz_row_free(raidz_row_t *rr)
378 {
379 for (int c = 0; c < rr->rr_cols; c++) {
380 raidz_col_t *rc = &rr->rr_col[c];
381
382 if (rc->rc_size != 0)
383 abd_free(rc->rc_abd);
384 if (rc->rc_orig_data != NULL)
385 abd_free(rc->rc_orig_data);
386 }
387
388 if (rr->rr_abd_empty != NULL)
389 abd_free(rr->rr_abd_empty);
390
391 kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols]));
392 }
393
394 void
395 vdev_raidz_map_free(raidz_map_t *rm)
396 {
397 for (int i = 0; i < rm->rm_nrows; i++)
398 vdev_raidz_row_free(rm->rm_row[i]);
399
400 if (rm->rm_nphys_cols) {
401 for (int i = 0; i < rm->rm_nphys_cols; i++) {
402 if (rm->rm_phys_col[i].rc_abd != NULL)
403 abd_free(rm->rm_phys_col[i].rc_abd);
404 }
405
406 kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) *
407 rm->rm_nphys_cols);
408 }
409
410 ASSERT3P(rm->rm_lr, ==, NULL);
411 kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
412 }
413
414 static void
415 vdev_raidz_map_free_vsd(zio_t *zio)
416 {
417 raidz_map_t *rm = zio->io_vsd;
418
419 vdev_raidz_map_free(rm);
420 }
421
422 static int
423 vdev_raidz_reflow_compare(const void *x1, const void *x2)
424 {
425 const reflow_node_t *l = x1;
426 const reflow_node_t *r = x2;
427
428 return (TREE_CMP(l->re_txg, r->re_txg));
429 }
430
431 const zio_vsd_ops_t vdev_raidz_vsd_ops = {
432 .vsd_free = vdev_raidz_map_free_vsd,
433 };
434
435 raidz_row_t *
436 vdev_raidz_row_alloc(int cols, zio_t *zio)
437 {
438 raidz_row_t *rr =
439 kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP);
440
441 rr->rr_cols = cols;
442 rr->rr_scols = cols;
443
444 for (int c = 0; c < cols; c++) {
445 raidz_col_t *rc = &rr->rr_col[c];
446 rc->rc_shadow_devidx = INT_MAX;
447 rc->rc_shadow_offset = UINT64_MAX;
448 /*
449 * We can not allow self healing to take place for Direct I/O
450 * reads. There is nothing that stops the buffer contents from
451 * being manipulated while the I/O is in flight. It is possible
452 * that the checksum could be verified on the buffer and then
453 * the contents of that buffer are manipulated afterwards. This
454 * could lead to bad data being written out during self
455 * healing.
456 */
457 if (!(zio->io_flags & ZIO_FLAG_DIO_READ))
458 rc->rc_allow_repair = 1;
459 }
460 return (rr);
461 }
462
463 static void
464 vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift)
465 {
466 int c;
467 int nwrapped = 0;
468 uint64_t off = 0;
469 raidz_row_t *rr = rm->rm_row[0];
470
471 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
472 ASSERT3U(rm->rm_nrows, ==, 1);
473
474 /*
475 * Pad any parity columns with additional space to account for skip
476 * sectors.
477 */
478 if (rm->rm_skipstart < rr->rr_firstdatacol) {
479 ASSERT0(rm->rm_skipstart);
480 nwrapped = rm->rm_nskip;
481 } else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) {
482 nwrapped =
483 (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols;
484 }
485
486 /*
487 * Optional single skip sectors (rc_size == 0) will be handled in
488 * vdev_raidz_io_start_write().
489 */
490 int skipped = rr->rr_scols - rr->rr_cols;
491
492 /* Allocate buffers for the parity columns */
493 for (c = 0; c < rr->rr_firstdatacol; c++) {
494 raidz_col_t *rc = &rr->rr_col[c];
495
496 /*
497 * Parity columns will pad out a linear ABD to account for
498 * the skip sector. A linear ABD is used here because
499 * parity calculations use the ABD buffer directly to calculate
500 * parity. This avoids doing a memcpy back to the ABD after the
501 * parity has been calculated. By issuing the parity column
502 * with the skip sector we can reduce contention on the child
503 * VDEV queue locks (vq_lock).
504 */
505 if (c < nwrapped) {
506 rc->rc_abd = abd_alloc_linear(
507 rc->rc_size + (1ULL << ashift), B_FALSE);
508 abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift);
509 skipped++;
510 } else {
511 rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
512 }
513 }
514
515 for (off = 0; c < rr->rr_cols; c++) {
516 raidz_col_t *rc = &rr->rr_col[c];
517 abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct,
518 zio->io_abd, off, rc->rc_size);
519
520 /*
521 * Generate I/O for skip sectors to improve aggregation
522 * continuity. We will use gang ABD's to reduce contention
523 * on the child VDEV queue locks (vq_lock) by issuing
524 * a single I/O that contains the data and skip sector.
525 *
526 * It is important to make sure that rc_size is not updated
527 * even though we are adding a skip sector to the ABD. When
528 * calculating the parity in vdev_raidz_generate_parity_row()
529 * the rc_size is used to iterate through the ABD's. We can
530 * not have zero'd out skip sectors used for calculating
531 * parity for raidz, because those same sectors are not used
532 * during reconstruction.
533 */
534 if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) {
535 rc->rc_abd = abd_alloc_gang();
536 abd_gang_add(rc->rc_abd, abd, B_TRUE);
537 abd_gang_add(rc->rc_abd,
538 abd_get_zeros(1ULL << ashift), B_TRUE);
539 skipped++;
540 } else {
541 rc->rc_abd = abd;
542 }
543 off += rc->rc_size;
544 }
545
546 ASSERT3U(off, ==, zio->io_size);
547 ASSERT3S(skipped, ==, rm->rm_nskip);
548 }
549
550 static void
551 vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm)
552 {
553 int c;
554 raidz_row_t *rr = rm->rm_row[0];
555
556 ASSERT3U(rm->rm_nrows, ==, 1);
557
558 /* Allocate buffers for the parity columns */
559 for (c = 0; c < rr->rr_firstdatacol; c++)
560 rr->rr_col[c].rc_abd =
561 abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);
562
563 for (uint64_t off = 0; c < rr->rr_cols; c++) {
564 raidz_col_t *rc = &rr->rr_col[c];
565 rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
566 zio->io_abd, off, rc->rc_size);
567 off += rc->rc_size;
568 }
569 }
570
571 /*
572 * Divides the IO evenly across all child vdevs; usually, dcols is
573 * the number of children in the target vdev.
574 *
575 * Avoid inlining the function to keep vdev_raidz_io_start(), which
576 * is this function's only caller, as small as possible on the stack.
577 */
578 noinline raidz_map_t *
579 vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
580 uint64_t nparity)
581 {
582 raidz_row_t *rr;
583 /* The starting RAIDZ (parent) vdev sector of the block. */
584 uint64_t b = zio->io_offset >> ashift;
585 /* The zio's size in units of the vdev's minimum sector size. */
586 uint64_t s = zio->io_size >> ashift;
587 /* The first column for this stripe. */
588 uint64_t f = b % dcols;
589 /* The starting byte offset on each child vdev. */
590 uint64_t o = (b / dcols) << ashift;
591 uint64_t acols, scols;
592
593 raidz_map_t *rm =
594 kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
595 rm->rm_nrows = 1;
596
597 /*
598 * "Quotient": The number of data sectors for this stripe on all but
599 * the "big column" child vdevs that also contain "remainder" data.
600 */
601 uint64_t q = s / (dcols - nparity);
602
603 /*
604 * "Remainder": The number of partial stripe data sectors in this I/O.
605 * This will add a sector to some, but not all, child vdevs.
606 */
607 uint64_t r = s - q * (dcols - nparity);
608
609 /* The number of "big columns" - those which contain remainder data. */
610 uint64_t bc = (r == 0 ? 0 : r + nparity);
611
612 /*
613 * The total number of data and parity sectors associated with
614 * this I/O.
615 */
616 uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
617
618 /*
619 * acols: The columns that will be accessed.
620 * scols: The columns that will be accessed or skipped.
621 */
622 if (q == 0) {
623 /* Our I/O request doesn't span all child vdevs. */
624 acols = bc;
625 scols = MIN(dcols, roundup(bc, nparity + 1));
626 } else {
627 acols = dcols;
628 scols = dcols;
629 }
630
631 ASSERT3U(acols, <=, scols);
632 rr = vdev_raidz_row_alloc(scols, zio);
633 rm->rm_row[0] = rr;
634 rr->rr_cols = acols;
635 rr->rr_bigcols = bc;
636 rr->rr_firstdatacol = nparity;
637 #ifdef ZFS_DEBUG
638 rr->rr_offset = zio->io_offset;
639 rr->rr_size = zio->io_size;
640 #endif
641
642 uint64_t asize = 0;
643
644 for (uint64_t c = 0; c < scols; c++) {
645 raidz_col_t *rc = &rr->rr_col[c];
646 uint64_t col = f + c;
647 uint64_t coff = o;
648 if (col >= dcols) {
649 col -= dcols;
650 coff += 1ULL << ashift;
651 }
652 rc->rc_devidx = col;
653 rc->rc_offset = coff;
654
655 if (c >= acols)
656 rc->rc_size = 0;
657 else if (c < bc)
658 rc->rc_size = (q + 1) << ashift;
659 else
660 rc->rc_size = q << ashift;
661
662 asize += rc->rc_size;
663 }
664
665 ASSERT3U(asize, ==, tot << ashift);
666 rm->rm_nskip = roundup(tot, nparity + 1) - tot;
667 rm->rm_skipstart = bc;
668
669 /*
670 * If all data stored spans all columns, there's a danger that parity
671 * will always be on the same device and, since parity isn't read
672 * during normal operation, that device's I/O bandwidth won't be
673 * used effectively. We therefore switch the parity every 1MB.
674 *
675 * ... at least that was, ostensibly, the theory. As a practical
676 * matter unless we juggle the parity between all devices evenly, we
677 * won't see any benefit. Further, occasional writes that aren't a
678 * multiple of the LCM of the number of children and the minimum
679 * stripe width are sufficient to avoid pessimal behavior.
680 * Unfortunately, this decision created an implicit on-disk format
681 * requirement that we need to support for all eternity, but only
682 * for single-parity RAID-Z.
683 *
684 * If we intend to skip a sector in the zeroth column for padding
685 * we must make sure to note this swap. We will never intend to
686 * skip the first column since at least one data and one parity
687 * column must appear in each row.
688 */
689 ASSERT(rr->rr_cols >= 2);
690 ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
691
692 if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
693 uint64_t devidx = rr->rr_col[0].rc_devidx;
694 o = rr->rr_col[0].rc_offset;
695 rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
696 rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
697 rr->rr_col[1].rc_devidx = devidx;
698 rr->rr_col[1].rc_offset = o;
699 if (rm->rm_skipstart == 0)
700 rm->rm_skipstart = 1;
701 }
702
703 if (zio->io_type == ZIO_TYPE_WRITE) {
704 vdev_raidz_map_alloc_write(zio, rm, ashift);
705 } else {
706 vdev_raidz_map_alloc_read(zio, rm);
707 }
708 /* init RAIDZ parity ops */
709 rm->rm_ops = vdev_raidz_math_get_ops();
710
711 return (rm);
712 }
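
/*
 * Worked example of the geometry computed above (a sketch, not used by the
 * driver): for a 5-wide RAIDZ1 (dcols = 5, nparity = 1) and a zio of 5
 * sectors (s = 5): q = 5 / 4 = 1 full row, r = 1 remainder sector,
 * bc = r + nparity = 2 "big columns", tot = s + nparity * (q + 1) = 7
 * total sectors, rm_nskip = roundup(7, 2) - 7 = 1 skip sector, and
 * rm_skipstart = bc = 2.
 */
static inline uint64_t
vdev_raidz_tot_sectors_sketch(uint64_t s, uint64_t dcols, uint64_t nparity)
{
    uint64_t q = s / (dcols - nparity);
    uint64_t r = s - q * (dcols - nparity);

    /* Data plus parity sectors, excluding the skip (padding) sectors. */
    return (s + nparity * (q + (r == 0 ? 0 : 1)));
}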
713
714 /*
715 * Everything before reflow_offset_synced should have been moved to the new
716 * location (read and write completed). However, this may not yet be reflected
717 * in the on-disk format (e.g. raidz_reflow_sync() has been called but the
718 * uberblock has not yet been written). If reflow is not in progress,
719 * reflow_offset_synced should be UINT64_MAX. For each row, if the row is
720 * entirely before reflow_offset_synced, it will come from the new location.
721 * Otherwise this row will come from the old location. Therefore, rows that
722 * straddle the reflow_offset_synced will come from the old location.
723 *
724 * For writes, reflow_offset_next is the next offset to copy. If a sector has
725 * been copied, but not yet reflected in the on-disk progress
726 * (reflow_offset_synced), it will also be written to the new (already copied)
727 * offset.
728 */
729 noinline raidz_map_t *
730 vdev_raidz_map_alloc_expanded(zio_t *zio,
731 uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
732 uint64_t nparity, uint64_t reflow_offset_synced,
733 uint64_t reflow_offset_next, boolean_t use_scratch)
734 {
735 abd_t *abd = zio->io_abd;
736 uint64_t offset = zio->io_offset;
737 uint64_t size = zio->io_size;
738
739 /* The zio's size in units of the vdev's minimum sector size. */
740 uint64_t s = size >> ashift;
741
742 /*
743 * "Quotient": The number of data sectors for this stripe on all but
744 * the "big column" child vdevs that also contain "remainder" data.
745 * AKA "full rows"
746 */
747 uint64_t q = s / (logical_cols - nparity);
748
749 /*
750 * "Remainder": The number of partial stripe data sectors in this I/O.
751 * This will add a sector to some, but not all, child vdevs.
752 */
753 uint64_t r = s - q * (logical_cols - nparity);
754
755 /* The number of "big columns" - those which contain remainder data. */
756 uint64_t bc = (r == 0 ? 0 : r + nparity);
757
758 /*
759 * The total number of data and parity sectors associated with
760 * this I/O.
761 */
762 uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
763
764 /* How many rows contain data (not skip) */
765 uint64_t rows = howmany(tot, logical_cols);
766 int cols = MIN(tot, logical_cols);
767
768 raidz_map_t *rm =
769 kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
770 KM_SLEEP);
771 rm->rm_nrows = rows;
772 rm->rm_nskip = roundup(tot, nparity + 1) - tot;
773 rm->rm_skipstart = bc;
774 uint64_t asize = 0;
775
776 for (uint64_t row = 0; row < rows; row++) {
777 boolean_t row_use_scratch = B_FALSE;
778 raidz_row_t *rr = vdev_raidz_row_alloc(cols, zio);
779 rm->rm_row[row] = rr;
780
781 /* The starting RAIDZ (parent) vdev sector of the row. */
782 uint64_t b = (offset >> ashift) + row * logical_cols;
783
784 /*
785 * If we are in the middle of a reflow, and the copying has
786 * not yet completed for any part of this row, then use the
787 * old location of this row. Note that reflow_offset_synced
788 * reflects the i/o that's been completed, because it's
789 * updated by a synctask, after zio_wait(spa_txg_zio[]).
790 * This is sufficient for our check, even if that progress
791 * has not yet been recorded to disk (reflected in
792 * spa_ubsync). Also note that we consider the last row to
793 * be "full width" (`cols`-wide rather than `bc`-wide) for
794 * this calculation. This causes a tiny bit of unnecessary
795 * double-writes but is safe and simpler to calculate.
796 */
797 int row_phys_cols = physical_cols;
798 if (b + cols > reflow_offset_synced >> ashift)
799 row_phys_cols--;
800 else if (use_scratch)
801 row_use_scratch = B_TRUE;
802
803 /* starting child of this row */
804 uint64_t child_id = b % row_phys_cols;
805 /* The starting byte offset on each child vdev. */
806 uint64_t child_offset = (b / row_phys_cols) << ashift;
807
808 /*
809 * Note, rr_cols is the entire width of the block, even
810 * if this row is shorter. This is needed because parity
811 * generation (for Q and R) needs to know the entire width,
812 * because it treats the short row as though it was
813 * full-width (and the "phantom" sectors were zero-filled).
814 *
815 * Another approach to this would be to set cols shorter
816 * (to just the number of columns that we might do i/o to)
817 * and have another mechanism to tell the parity generation
818 * about the "entire width". Reconstruction (at least
819 * vdev_raidz_reconstruct_general()) would also need to
820 * know about the "entire width".
821 */
822 rr->rr_firstdatacol = nparity;
823 #ifdef ZFS_DEBUG
824 /*
825 * note: rr_size is PSIZE, not ASIZE
826 */
827 rr->rr_offset = b << ashift;
828 rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift;
829 #endif
830
831 for (int c = 0; c < rr->rr_cols; c++, child_id++) {
832 if (child_id >= row_phys_cols) {
833 child_id -= row_phys_cols;
834 child_offset += 1ULL << ashift;
835 }
836 raidz_col_t *rc = &rr->rr_col[c];
837 rc->rc_devidx = child_id;
838 rc->rc_offset = child_offset;
839
840 /*
841 * Get this from the scratch space if appropriate.
842 * This only happens if we crashed in the middle of
843 * raidz_reflow_scratch_sync() (while it's running,
844 * the rangelock prevents us from doing concurrent
845 * io), and even then only during zpool import or
846 * when the pool is imported readonly.
847 */
848 if (row_use_scratch)
849 rc->rc_offset -= VDEV_BOOT_SIZE;
850
851 uint64_t dc = c - rr->rr_firstdatacol;
852 if (c < rr->rr_firstdatacol) {
853 rc->rc_size = 1ULL << ashift;
854
855 /*
856 * Parity sectors' rc_abd's are set below
857 * after determining if this is an aggregation.
858 */
859 } else if (row == rows - 1 && bc != 0 && c >= bc) {
860 /*
861 * Past the end of the block (even including
862 * skip sectors). This sector is part of the
863 * map so that we have full rows for p/q parity
864 * generation.
865 */
866 rc->rc_size = 0;
867 rc->rc_abd = NULL;
868 } else {
869 /* "data column" (col excluding parity) */
870 uint64_t off;
871
872 if (c < bc || r == 0) {
873 off = dc * rows + row;
874 } else {
875 off = r * rows +
876 (dc - r) * (rows - 1) + row;
877 }
878 rc->rc_size = 1ULL << ashift;
879 rc->rc_abd = abd_get_offset_struct(
880 &rc->rc_abdstruct, abd, off << ashift,
881 rc->rc_size);
882 }
883
884 if (rc->rc_size == 0)
885 continue;
886
887 /*
888 * If any part of this row is in both old and new
889 * locations, the primary location is the old
890 * location. If this sector was already copied to the
891 * new location, we need to also write to the new,
892 * "shadow" location.
893 *
894 * Note, `row_phys_cols != physical_cols` indicates
895 * that the primary location is the old location.
896 * `b+c < reflow_offset_next` indicates that the copy
897 * to the new location has been initiated. We know
898 * that the copy has completed because we have the
899 * rangelock, which is held exclusively while the
900 * copy is in progress.
901 */
902 if (row_use_scratch ||
903 (row_phys_cols != physical_cols &&
904 b + c < reflow_offset_next >> ashift)) {
905 rc->rc_shadow_devidx = (b + c) % physical_cols;
906 rc->rc_shadow_offset =
907 ((b + c) / physical_cols) << ashift;
908 if (row_use_scratch)
909 rc->rc_shadow_offset -= VDEV_BOOT_SIZE;
910 }
911
912 asize += rc->rc_size;
913 }
914
915 /*
916 * See comment in vdev_raidz_map_alloc()
917 */
918 if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
919 (offset & (1ULL << 20))) {
920 ASSERT(rr->rr_cols >= 2);
921 ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
922
923 int devidx0 = rr->rr_col[0].rc_devidx;
924 uint64_t offset0 = rr->rr_col[0].rc_offset;
925 int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx;
926 uint64_t shadow_offset0 =
927 rr->rr_col[0].rc_shadow_offset;
928
929 rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
930 rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
931 rr->rr_col[0].rc_shadow_devidx =
932 rr->rr_col[1].rc_shadow_devidx;
933 rr->rr_col[0].rc_shadow_offset =
934 rr->rr_col[1].rc_shadow_offset;
935
936 rr->rr_col[1].rc_devidx = devidx0;
937 rr->rr_col[1].rc_offset = offset0;
938 rr->rr_col[1].rc_shadow_devidx = shadow_devidx0;
939 rr->rr_col[1].rc_shadow_offset = shadow_offset0;
940 }
941 }
942 ASSERT3U(asize, ==, tot << ashift);
943
944 /*
945 * Determine if the block is contiguous, in which case we can use
946 * an aggregation.
947 */
948 if (rows >= raidz_io_aggregate_rows) {
949 rm->rm_nphys_cols = physical_cols;
950 rm->rm_phys_col =
951 kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols,
952 KM_SLEEP);
953
954 /*
955 * Determine the aggregate io's offset and size, and check
956 * that the io is contiguous.
957 */
958 for (int i = 0;
959 i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) {
960 raidz_row_t *rr = rm->rm_row[i];
961 for (int c = 0; c < rr->rr_cols; c++) {
962 raidz_col_t *rc = &rr->rr_col[c];
963 raidz_col_t *prc =
964 &rm->rm_phys_col[rc->rc_devidx];
965
966 if (rc->rc_size == 0)
967 continue;
968
969 if (prc->rc_size == 0) {
970 ASSERT0(prc->rc_offset);
971 prc->rc_offset = rc->rc_offset;
972 } else if (prc->rc_offset + prc->rc_size !=
973 rc->rc_offset) {
974 /*
975 * This block is not contiguous and
976 * therefore can't be aggregated.
977 * This is expected to be rare, so
978 * the cost of allocating and then
979 * freeing rm_phys_col is not
980 * significant.
981 */
982 kmem_free(rm->rm_phys_col,
983 sizeof (raidz_col_t) *
984 rm->rm_nphys_cols);
985 rm->rm_phys_col = NULL;
986 rm->rm_nphys_cols = 0;
987 break;
988 }
989 prc->rc_size += rc->rc_size;
990 }
991 }
992 }
993 if (rm->rm_phys_col != NULL) {
994 /*
995 * Allocate aggregate ABD's.
996 */
997 for (int i = 0; i < rm->rm_nphys_cols; i++) {
998 raidz_col_t *prc = &rm->rm_phys_col[i];
999
1000 prc->rc_devidx = i;
1001
1002 if (prc->rc_size == 0)
1003 continue;
1004
1005 prc->rc_abd =
1006 abd_alloc_linear(rm->rm_phys_col[i].rc_size,
1007 B_FALSE);
1008 }
1009
1010 /*
1011 * Point the parity abd's into the aggregate abd's.
1012 */
1013 for (int i = 0; i < rm->rm_nrows; i++) {
1014 raidz_row_t *rr = rm->rm_row[i];
1015 for (int c = 0; c < rr->rr_firstdatacol; c++) {
1016 raidz_col_t *rc = &rr->rr_col[c];
1017 raidz_col_t *prc =
1018 &rm->rm_phys_col[rc->rc_devidx];
1019 rc->rc_abd =
1020 abd_get_offset_struct(&rc->rc_abdstruct,
1021 prc->rc_abd,
1022 rc->rc_offset - prc->rc_offset,
1023 rc->rc_size);
1024 }
1025 }
1026 } else {
1027 /*
1028 * Allocate new abd's for the parity sectors.
1029 */
1030 for (int i = 0; i < rm->rm_nrows; i++) {
1031 raidz_row_t *rr = rm->rm_row[i];
1032 for (int c = 0; c < rr->rr_firstdatacol; c++) {
1033 raidz_col_t *rc = &rr->rr_col[c];
1034 rc->rc_abd =
1035 abd_alloc_linear(rc->rc_size,
1036 B_TRUE);
1037 }
1038 }
1039 }
1040 /* init RAIDZ parity ops */
1041 rm->rm_ops = vdev_raidz_math_get_ops();
1042
1043 return (rm);
1044 }
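
/*
 * Compact restatement of the placement rules used above (illustration only;
 * the scratch-area special case is ignored here): a row is read from the old,
 * pre-expansion location unless every sector of the row lies strictly before
 * the synced reflow offset, so rows that straddle reflow_offset_synced still
 * come from the old location.
 */
static inline boolean_t
raidz_row_uses_old_location_sketch(uint64_t row_start_sector,
    uint64_t row_width, uint64_t reflow_offset_synced, uint64_t ashift)
{
    return ((row_start_sector + row_width) >
        (reflow_offset_synced >> ashift));
}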
1045
1046 struct pqr_struct {
1047 uint64_t *p;
1048 uint64_t *q;
1049 uint64_t *r;
1050 };
1051
1052 static int
1053 vdev_raidz_p_func(void *buf, size_t size, void *private)
1054 {
1055 struct pqr_struct *pqr = private;
1056 const uint64_t *src = buf;
1057 int cnt = size / sizeof (src[0]);
1058
1059 ASSERT(pqr->p && !pqr->q && !pqr->r);
1060
1061 for (int i = 0; i < cnt; i++, src++, pqr->p++)
1062 *pqr->p ^= *src;
1063
1064 return (0);
1065 }
1066
1067 static int
1068 vdev_raidz_pq_func(void *buf, size_t size, void *private)
1069 {
1070 struct pqr_struct *pqr = private;
1071 const uint64_t *src = buf;
1072 uint64_t mask;
1073 int cnt = size / sizeof (src[0]);
1074
1075 ASSERT(pqr->p && pqr->q && !pqr->r);
1076
1077 for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
1078 *pqr->p ^= *src;
1079 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
1080 *pqr->q ^= *src;
1081 }
1082
1083 return (0);
1084 }
1085
1086 static int
1087 vdev_raidz_pqr_func(void *buf, size_t size, void *private)
1088 {
1089 struct pqr_struct *pqr = private;
1090 const uint64_t *src = buf;
1091 uint64_t mask;
1092 int cnt = size / sizeof (src[0]);
1093
1094 ASSERT(pqr->p && pqr->q && pqr->r);
1095
1096 for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
1097 *pqr->p ^= *src;
1098 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
1099 *pqr->q ^= *src;
1100 VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
1101 *pqr->r ^= *src;
1102 }
1103
1104 return (0);
1105 }
1106
1107 static void
1108 vdev_raidz_generate_parity_p(raidz_row_t *rr)
1109 {
1110 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1111
1112 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1113 abd_t *src = rr->rr_col[c].rc_abd;
1114
1115 if (c == rr->rr_firstdatacol) {
1116 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1117 } else {
1118 struct pqr_struct pqr = { p, NULL, NULL };
1119 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1120 vdev_raidz_p_func, &pqr);
1121 }
1122 }
1123 }
1124
1125 static void
1126 vdev_raidz_generate_parity_pq(raidz_row_t *rr)
1127 {
1128 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1129 uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1130 uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
1131 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1132 rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1133
1134 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1135 abd_t *src = rr->rr_col[c].rc_abd;
1136
1137 uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
1138
1139 if (c == rr->rr_firstdatacol) {
1140 ASSERT(ccnt == pcnt || ccnt == 0);
1141 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1142 (void) memcpy(q, p, rr->rr_col[c].rc_size);
1143
1144 for (uint64_t i = ccnt; i < pcnt; i++) {
1145 p[i] = 0;
1146 q[i] = 0;
1147 }
1148 } else {
1149 struct pqr_struct pqr = { p, q, NULL };
1150
1151 ASSERT(ccnt <= pcnt);
1152 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1153 vdev_raidz_pq_func, &pqr);
1154
1155 /*
1156 * Treat short columns as though they are full of 0s.
1157 * Note that there's therefore nothing needed for P.
1158 */
1159 uint64_t mask;
1160 for (uint64_t i = ccnt; i < pcnt; i++) {
1161 VDEV_RAIDZ_64MUL_2(q[i], mask);
1162 }
1163 }
1164 }
1165 }
1166
1167 static void
1168 vdev_raidz_generate_parity_pqr(raidz_row_t *rr)
1169 {
1170 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1171 uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1172 uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd);
1173 uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
1174 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1175 rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1176 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1177 rr->rr_col[VDEV_RAIDZ_R].rc_size);
1178
1179 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1180 abd_t *src = rr->rr_col[c].rc_abd;
1181
1182 uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
1183
1184 if (c == rr->rr_firstdatacol) {
1185 ASSERT(ccnt == pcnt || ccnt == 0);
1186 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1187 (void) memcpy(q, p, rr->rr_col[c].rc_size);
1188 (void) memcpy(r, p, rr->rr_col[c].rc_size);
1189
1190 for (uint64_t i = ccnt; i < pcnt; i++) {
1191 p[i] = 0;
1192 q[i] = 0;
1193 r[i] = 0;
1194 }
1195 } else {
1196 struct pqr_struct pqr = { p, q, r };
1197
1198 ASSERT(ccnt <= pcnt);
1199 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1200 vdev_raidz_pqr_func, &pqr);
1201
1202 /*
1203 * Treat short columns as though they are full of 0s.
1204 * Note that there's therefore nothing needed for P.
1205 */
1206 uint64_t mask;
1207 for (uint64_t i = ccnt; i < pcnt; i++) {
1208 VDEV_RAIDZ_64MUL_2(q[i], mask);
1209 VDEV_RAIDZ_64MUL_4(r[i], mask);
1210 }
1211 }
1212 }
1213 }
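
/*
 * Byte-granularity sketch of what the three generators above compute
 * (illustration only; the real code works on 64-bit words inside ABDs).
 * For each byte position, with data bytes D_0 .. D_{n-1}:
 *
 *	P = P + D_i,  Q = Q * 2 + D_i,  R = R * 4 + D_i
 *
 * (all in GF(2^8), where '+' is XOR), which yields the P, Q and R
 * definitions given at the top of this file.
 */
static inline void
vdev_raidz_pqr_byte_sketch(const uint8_t *d, int ndata,
    uint8_t *p, uint8_t *q, uint8_t *r)
{
    *p = *q = *r = 0;
    for (int i = 0; i < ndata; i++) {
        *p ^= d[i];
        *q = (uint8_t)VDEV_RAIDZ_MUL_2(*q) ^ d[i];
        *r = (uint8_t)VDEV_RAIDZ_MUL_4(*r) ^ d[i];
    }
}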
1214
1215 /*
1216 * Generate RAID parity in the first virtual columns according to the number of
1217 * parity columns available.
1218 */
1219 void
1220 vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr)
1221 {
1222 if (rr->rr_cols == 0) {
1223 /*
1224 * We are handling this block one row at a time (because
1225 * this block has a different logical vs physical width,
1226 * due to RAIDZ expansion), and this is a pad-only row,
1227 * which has no parity.
1228 */
1229 return;
1230 }
1231
1232 /* Generate using the new math implementation */
1233 if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL)
1234 return;
1235
1236 switch (rr->rr_firstdatacol) {
1237 case 1:
1238 vdev_raidz_generate_parity_p(rr);
1239 break;
1240 case 2:
1241 vdev_raidz_generate_parity_pq(rr);
1242 break;
1243 case 3:
1244 vdev_raidz_generate_parity_pqr(rr);
1245 break;
1246 default:
1247 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
1248 }
1249 }
1250
1251 void
1252 vdev_raidz_generate_parity(raidz_map_t *rm)
1253 {
1254 for (int i = 0; i < rm->rm_nrows; i++) {
1255 raidz_row_t *rr = rm->rm_row[i];
1256 vdev_raidz_generate_parity_row(rm, rr);
1257 }
1258 }
1259
1260 static int
1261 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
1262 {
1263 (void) private;
1264 uint64_t *dst = dbuf;
1265 uint64_t *src = sbuf;
1266 int cnt = size / sizeof (src[0]);
1267
1268 for (int i = 0; i < cnt; i++) {
1269 dst[i] ^= src[i];
1270 }
1271
1272 return (0);
1273 }
1274
1275 static int
1276 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
1277 void *private)
1278 {
1279 (void) private;
1280 uint64_t *dst = dbuf;
1281 uint64_t *src = sbuf;
1282 uint64_t mask;
1283 int cnt = size / sizeof (dst[0]);
1284
1285 for (int i = 0; i < cnt; i++, dst++, src++) {
1286 VDEV_RAIDZ_64MUL_2(*dst, mask);
1287 *dst ^= *src;
1288 }
1289
1290 return (0);
1291 }
1292
1293 static int
1294 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
1295 {
1296 (void) private;
1297 uint64_t *dst = buf;
1298 uint64_t mask;
1299 int cnt = size / sizeof (dst[0]);
1300
1301 for (int i = 0; i < cnt; i++, dst++) {
1302 /* same operation as vdev_raidz_reconst_q_pre_func() on dst */
1303 VDEV_RAIDZ_64MUL_2(*dst, mask);
1304 }
1305
1306 return (0);
1307 }
1308
1309 struct reconst_q_struct {
1310 uint64_t *q;
1311 int exp;
1312 };
1313
1314 static int
1315 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
1316 {
1317 struct reconst_q_struct *rq = private;
1318 uint64_t *dst = buf;
1319 int cnt = size / sizeof (dst[0]);
1320
1321 for (int i = 0; i < cnt; i++, dst++, rq->q++) {
1322 int j;
1323 uint8_t *b;
1324
1325 *dst ^= *rq->q;
1326 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
1327 *b = vdev_raidz_exp2(*b, rq->exp);
1328 }
1329 }
1330
1331 return (0);
1332 }
1333
1334 struct reconst_pq_struct {
1335 uint8_t *p;
1336 uint8_t *q;
1337 uint8_t *pxy;
1338 uint8_t *qxy;
1339 int aexp;
1340 int bexp;
1341 };
1342
1343 static int
1344 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
1345 {
1346 struct reconst_pq_struct *rpq = private;
1347 uint8_t *xd = xbuf;
1348 uint8_t *yd = ybuf;
1349
1350 for (int i = 0; i < size;
1351 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
1352 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
1353 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
1354 *yd = *rpq->p ^ *rpq->pxy ^ *xd;
1355 }
1356
1357 return (0);
1358 }
1359
1360 static int
1361 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
1362 {
1363 struct reconst_pq_struct *rpq = private;
1364 uint8_t *xd = xbuf;
1365
1366 for (int i = 0; i < size;
1367 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
1368 /* same operation as vdev_raidz_reconst_pq_func() on xd */
1369 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
1370 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
1371 }
1372
1373 return (0);
1374 }
1375
1376 static void
1377 vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts)
1378 {
1379 int x = tgts[0];
1380 abd_t *dst, *src;
1381
1382 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1383 zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x);
1384
1385 ASSERT3U(ntgts, ==, 1);
1386 ASSERT3U(x, >=, rr->rr_firstdatacol);
1387 ASSERT3U(x, <, rr->rr_cols);
1388
1389 ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size);
1390
1391 src = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
1392 dst = rr->rr_col[x].rc_abd;
1393
1394 abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size);
1395
1396 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1397 uint64_t size = MIN(rr->rr_col[x].rc_size,
1398 rr->rr_col[c].rc_size);
1399
1400 src = rr->rr_col[c].rc_abd;
1401
1402 if (c == x)
1403 continue;
1404
1405 (void) abd_iterate_func2(dst, src, 0, 0, size,
1406 vdev_raidz_reconst_p_func, NULL);
1407 }
1408 }
1409
1410 static void
1411 vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
1412 {
1413 int x = tgts[0];
1414 int c, exp;
1415 abd_t *dst, *src;
1416
1417 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1418 zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x);
1419
1420 ASSERT(ntgts == 1);
1421
1422 ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1423
1424 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1425 uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size,
1426 rr->rr_col[c].rc_size);
1427
1428 src = rr->rr_col[c].rc_abd;
1429 dst = rr->rr_col[x].rc_abd;
1430
1431 if (c == rr->rr_firstdatacol) {
1432 abd_copy(dst, src, size);
1433 if (rr->rr_col[x].rc_size > size) {
1434 abd_zero_off(dst, size,
1435 rr->rr_col[x].rc_size - size);
1436 }
1437 } else {
1438 ASSERT3U(size, <=, rr->rr_col[x].rc_size);
1439 (void) abd_iterate_func2(dst, src, 0, 0, size,
1440 vdev_raidz_reconst_q_pre_func, NULL);
1441 (void) abd_iterate_func(dst,
1442 size, rr->rr_col[x].rc_size - size,
1443 vdev_raidz_reconst_q_pre_tail_func, NULL);
1444 }
1445 }
1446
1447 src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
1448 dst = rr->rr_col[x].rc_abd;
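/*
 * exp is the base-2 log of the inverse of column x's Q coefficient
 * 2^(rr_cols - 1 - x): since 2^255 == 1 in GF(2^8), the inverse is
 * 2^(255 - (rr_cols - 1 - x)).  XORing in Q below leaves
 * 2^(rr_cols - 1 - x) * D_x, so multiplying by 2^exp recovers D_x.
 */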
1449 exp = 255 - (rr->rr_cols - 1 - x);
1450
1451 struct reconst_q_struct rq = { abd_to_buf(src), exp };
1452 (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size,
1453 vdev_raidz_reconst_q_post_func, &rq);
1454 }
1455
1456 static void
1457 vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
1458 {
1459 uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
1460 abd_t *pdata, *qdata;
1461 uint64_t xsize, ysize;
1462 int x = tgts[0];
1463 int y = tgts[1];
1464 abd_t *xd, *yd;
1465
1466 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1467 zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y);
1468
1469 ASSERT(ntgts == 2);
1470 ASSERT(x < y);
1471 ASSERT(x >= rr->rr_firstdatacol);
1472 ASSERT(y < rr->rr_cols);
1473
1474 ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size);
1475
1476 /*
1477 * Move the parity data aside -- we're going to compute parity as
1478 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
1479 * reuse the parity generation mechanism without trashing the actual
1480 * parity so we make those columns appear to be full of zeros by
1481 * setting their lengths to zero.
1482 */
1483 pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
1484 qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
1485 xsize = rr->rr_col[x].rc_size;
1486 ysize = rr->rr_col[y].rc_size;
1487
1488 rr->rr_col[VDEV_RAIDZ_P].rc_abd =
1489 abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
1490 rr->rr_col[VDEV_RAIDZ_Q].rc_abd =
1491 abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
1492 rr->rr_col[x].rc_size = 0;
1493 rr->rr_col[y].rc_size = 0;
1494
1495 vdev_raidz_generate_parity_pq(rr);
1496
1497 rr->rr_col[x].rc_size = xsize;
1498 rr->rr_col[y].rc_size = ysize;
1499
1500 p = abd_to_buf(pdata);
1501 q = abd_to_buf(qdata);
1502 pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1503 qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1504 xd = rr->rr_col[x].rc_abd;
1505 yd = rr->rr_col[y].rc_abd;
1506
1507 /*
1508 * We now have:
1509 * Pxy = P + D_x + D_y
1510 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
1511 *
1512 * We can then solve for D_x:
1513 * D_x = A * (P + Pxy) + B * (Q + Qxy)
1514 * where
1515 * A = 2^(x - y) * (2^(x - y) + 1)^-1
1516 * B = (2^(ndevs - 1 - x))^-1 * (2^(x - y) + 1)^-1
1517 *
1518 * With D_x in hand, we can easily solve for D_y:
1519 * D_y = P + Pxy + D_x
1520 */
1521
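/*
 * Compute A and B (see above) in log form: a = 2^(x - y) (255 + x - y
 * wraps the negative exponent, since 2^255 == 1), b = 2^-(rr_cols - 1 - x),
 * and tmp = -log_2(2^(x - y) + 1).  aexp and bexp then end up as
 * log_2(A) and log_2(B), ready for vdev_raidz_exp2() below.
 */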
1522 a = vdev_raidz_pow2[255 + x - y];
1523 b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)];
1524 tmp = 255 - vdev_raidz_log2[a ^ 1];
1525
1526 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
1527 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
1528
1529 ASSERT3U(xsize, >=, ysize);
1530 struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
1531
1532 (void) abd_iterate_func2(xd, yd, 0, 0, ysize,
1533 vdev_raidz_reconst_pq_func, &rpq);
1534 (void) abd_iterate_func(xd, ysize, xsize - ysize,
1535 vdev_raidz_reconst_pq_tail_func, &rpq);
1536
1537 abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1538 abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1539
1540 /*
1541 * Restore the saved parity data.
1542 */
1543 rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata;
1544 rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;
1545 }
1546
1547 /*
1548 * In the general case of reconstruction, we must solve the system of linear
1549 * equations defined by the coefficients used to generate parity as well as
1550 * the contents of the data and parity disks. This can be expressed with
1551 * vectors for the original data (D) and the actual data (d) and parity (p)
1552 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
1553 *
1554 *            __   __                     __     __
1555 *            |     |     __     __       |  p_0  |
1556 *            |  V  |     |  D_0  |       | p_m-1 |
1557 *            |     |  x  |   :   |   =   |  d_0  |
1558 *            |  I  |     | D_n-1 |       |   :   |
1559 *            |     |     ~~     ~~       | d_n-1 |
1560 *            ~~   ~~                     ~~     ~~
1561 *
1562 * I is simply a square identity matrix of size n, and V is a Vandermonde
1563 * matrix defined by the coefficients we chose for the various parity columns
1564 * (1, 2, 4). Note that these values were chosen for simplicity and speedy
1565 * computation as well as linear separability.
1566 *
1567 *            __                __                 __     __
1568 *            |   1   ..  1 1 1  |                 |  p_0  |
1569 *            | 2^n-1 ..  4 2 1  |    __     __    |   :   |
1570 *            | 4^n-1 .. 16 4 1  |    |  D_0  |    | p_m-1 |
1571 *            |   1   ..  0 0 0  |    |  D_1  |    |  d_0  |
1572 *            |   0   ..  0 0 0  | x  |  D_2  | =  |  d_1  |
1573 *            |   :       : : :  |    |   :   |    |  d_2  |
1574 *            |   0   ..  1 0 0  |    | D_n-1 |    |   :   |
1575 *            |   0   ..  0 1 0  |    ~~     ~~    |   :   |
1576 *            |   0   ..  0 0 1  |                 | d_n-1 |
1577 *            ~~                ~~                 ~~     ~~
1578 *
1579 * Note that I, V, d, and p are known. To compute D, we must invert the
1580 * matrix and use the known data and parity values to reconstruct the unknown
1581 * data values. We begin by removing the rows in V|I and d|p that correspond
1582 * to failed or missing columns; we then make V|I square (n x n) and d|p
1583 * sized n by removing rows corresponding to unused parity from the bottom up
1584 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
1585 * using Gauss-Jordan elimination. In the example below we use m=3 parity
1586 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
1587 *            __                                 __
1588 *            |   1   1   1   1   1   1   1   1 |
1589 *            | 128  64  32  16   8   4   2   1 | <-----+-+-- missing disks
1590 *            |  19 205 116  29  64  16   4   1 |      / /
1591 *            |   1   0   0   0   0   0   0   0 |     / /
1592 *            |   0   1   0   0   0   0   0   0 | <--'  /
1593 *  (V|I)  =  |   0   0   1   0   0   0   0   0 | <---'
1594 *            |   0   0   0   1   0   0   0   0 |
1595 *            |   0   0   0   0   1   0   0   0 |
1596 *            |   0   0   0   0   0   1   0   0 |
1597 *            |   0   0   0   0   0   0   1   0 |
1598 *            |   0   0   0   0   0   0   0   1 |
1599 *            ~~                                 ~~
1600 *            __                                 __
1601 *            |   1   1   1   1   1   1   1   1 |
1602 *            |  19 205 116  29  64  16   4   1 |
1603 *            |   1   0   0   0   0   0   0   0 |
1604 *  (V|I)' =  |   0   0   0   1   0   0   0   0 |
1605 *            |   0   0   0   0   1   0   0   0 |
1606 *            |   0   0   0   0   0   1   0   0 |
1607 *            |   0   0   0   0   0   0   1   0 |
1608 *            |   0   0   0   0   0   0   0   1 |
1609 *            ~~                                 ~~
1613 *
1614 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1615 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1616 * matrix is not singular.
1617 * __ __
1618 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1619 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1620 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1621 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1622 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1623 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1624 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1625 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1626 * ~~ ~~
1627 * __ __
1628 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1629 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1630 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1631 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1632 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1633 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1634 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1635 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1636 * ~~ ~~
1637 * __ __
1638 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1639 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1640 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
1641 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1642 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1643 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1644 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1645 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1646 * ~~ ~~
1647 * __ __
1648 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1649 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1650 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
1651 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1652 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1653 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1654 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1655 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1656 * ~~ ~~
1657 * __ __
1658 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1659 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1660 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1661 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1662 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1663 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1664 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1665 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1666 * ~~ ~~
1667 * __ __
1668 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1669 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
1670 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1671 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1672 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1673 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1674 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1675 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1676 * ~~ ~~
1677 * __ __
1678 * | 0 0 1 0 0 0 0 0 |
1679 * | 167 100 5 41 159 169 217 208 |
1680 * | 166 100 4 40 158 168 216 209 |
1681 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
1682 * | 0 0 0 0 1 0 0 0 |
1683 * | 0 0 0 0 0 1 0 0 |
1684 * | 0 0 0 0 0 0 1 0 |
1685 * | 0 0 0 0 0 0 0 1 |
1686 * ~~ ~~
1687 *
1688 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1689 * of the missing data.
1690 *
1691 * As is apparent from the example above, the only non-trivial rows in the
1692 * inverse matrix correspond to the data disks that we're trying to
1693 * reconstruct. Indeed, those are the only rows we need as the others would
1694 * only be useful for reconstructing data known or assumed to be valid. For
1695 * that reason, we only build the coefficients in the rows that correspond to
1696 * targeted columns.
1697 */
1698
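/*
 * The three helpers below implement the steps described above:
 * vdev_raidz_matrix_init() fills in the Vandermonde rows for the parity
 * columns being used, vdev_raidz_matrix_invert() performs the Gauss-Jordan
 * elimination in GF(2^8), and vdev_raidz_matrix_reconstruct() applies the
 * resulting inverse rows to the surviving columns to recover the missing
 * data.
 */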
1699 static void
1700 vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map,
1701 uint8_t **rows)
1702 {
1703 int i, j;
1704 int pow;
1705
1706 ASSERT(n == rr->rr_cols - rr->rr_firstdatacol);
1707
1708 /*
1709 * Fill in the missing rows of interest.
1710 */
1711 for (i = 0; i < nmap; i++) {
1712 ASSERT3S(0, <=, map[i]);
1713 ASSERT3S(map[i], <=, 2);
1714
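		/*
		 * Row i corresponds to parity map[i] (0 = P, 1 = Q, 2 = R).
		 * Its entry for data column j is the Vandermonde coefficient
		 * 2^(map[i] * (n - 1 - j)), generated by walking the exponent
		 * down by map[i] per column, mod 255.
		 */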
1715 pow = map[i] * n;
1716 if (pow > 255)
1717 pow -= 255;
1718 ASSERT(pow <= 255);
1719
1720 for (j = 0; j < n; j++) {
1721 pow -= map[i];
1722 if (pow < 0)
1723 pow += 255;
1724 rows[i][j] = vdev_raidz_pow2[pow];
1725 }
1726 }
1727 }
1728
1729 static void
1730 vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing,
1731 uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1732 {
1733 int i, j, ii, jj;
1734 uint8_t log;
1735
1736 /*
1737 * Assert that the first nmissing entries from the array of used
1738 * columns correspond to parity columns and that subsequent entries
1739 * correspond to data columns.
1740 */
1741 for (i = 0; i < nmissing; i++) {
1742 ASSERT3S(used[i], <, rr->rr_firstdatacol);
1743 }
1744 for (; i < n; i++) {
1745 ASSERT3S(used[i], >=, rr->rr_firstdatacol);
1746 }
1747
1748 /*
1749 * First initialize the storage where we'll compute the inverse rows.
1750 */
1751 for (i = 0; i < nmissing; i++) {
1752 for (j = 0; j < n; j++) {
1753 invrows[i][j] = (i == j) ? 1 : 0;
1754 }
1755 }
1756
1757 /*
1758 * Subtract all trivial rows from the rows of consequence.
1759 */
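	/*
	 * "Trivial rows" are the identity rows of (V|I)' that correspond to
	 * surviving data columns.  Moving their coefficients from rows[]
	 * into invrows[] is equivalent to subtracting those rows, zeroing
	 * the known-data columns on the left-hand side.
	 */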
1760 for (i = 0; i < nmissing; i++) {
1761 for (j = nmissing; j < n; j++) {
1762 ASSERT3U(used[j], >=, rr->rr_firstdatacol);
1763 jj = used[j] - rr->rr_firstdatacol;
1764 ASSERT3S(jj, <, n);
1765 invrows[i][j] = rows[i][jj];
1766 rows[i][jj] = 0;
1767 }
1768 }
1769
1770 /*
1771 * For each of the rows of interest, we must normalize it and subtract
1772 * a multiple of it from the other rows.
1773 */
1774 for (i = 0; i < nmissing; i++) {
1775 for (j = 0; j < missing[i]; j++) {
1776 ASSERT0(rows[i][j]);
1777 }
1778 ASSERT3U(rows[i][missing[i]], !=, 0);
1779
1780 /*
1781 * Compute the inverse of the first element and multiply each
1782 * element in the row by that value.
1783 */
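		/*
		 * The nonzero elements of GF(2^8) form a cyclic group of
		 * order 255, so 255 - log2[pivot] is the log of the pivot's
		 * multiplicative inverse; vdev_raidz_exp2() below multiplies
		 * each element by 1/pivot via the log/exp tables.
		 */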
1784 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1785
1786 for (j = 0; j < n; j++) {
1787 rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1788 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1789 }
1790
1791 for (ii = 0; ii < nmissing; ii++) {
1792 if (i == ii)
1793 continue;
1794
1795 ASSERT3U(rows[ii][missing[i]], !=, 0);
1796
1797 log = vdev_raidz_log2[rows[ii][missing[i]]];
1798
1799 for (j = 0; j < n; j++) {
1800 rows[ii][j] ^=
1801 vdev_raidz_exp2(rows[i][j], log);
1802 invrows[ii][j] ^=
1803 vdev_raidz_exp2(invrows[i][j], log);
1804 }
1805 }
1806 }
1807
1808 /*
1809 	 * Verify that the data left in the rows properly forms part of
1810 	 * an identity matrix.
1811 */
1812 for (i = 0; i < nmissing; i++) {
1813 for (j = 0; j < n; j++) {
1814 if (j == missing[i]) {
1815 ASSERT3U(rows[i][j], ==, 1);
1816 } else {
1817 ASSERT0(rows[i][j]);
1818 }
1819 }
1820 }
1821 }
1822
1823 static void
1824 vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
1825 int *missing, uint8_t **invrows, const uint8_t *used)
1826 {
1827 int i, j, x, cc, c;
1828 uint8_t *src;
1829 uint64_t ccount;
1830 uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL };
1831 uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 };
1832 uint8_t log = 0;
1833 uint8_t val;
1834 int ll;
1835 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1836 uint8_t *p, *pp;
1837 size_t psize;
1838
1839 psize = sizeof (invlog[0][0]) * n * nmissing;
1840 p = kmem_alloc(psize, KM_SLEEP);
1841
1842 for (pp = p, i = 0; i < nmissing; i++) {
1843 invlog[i] = pp;
1844 pp += n;
1845 }
1846
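	/*
	 * Precompute the logs of the inverse-matrix entries so that each
	 * per-byte multiply in the main loop below is just an exponent
	 * addition (mod 255) and a table lookup.
	 */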
1847 for (i = 0; i < nmissing; i++) {
1848 for (j = 0; j < n; j++) {
1849 ASSERT3U(invrows[i][j], !=, 0);
1850 invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1851 }
1852 }
1853
1854 for (i = 0; i < n; i++) {
1855 c = used[i];
1856 ASSERT3U(c, <, rr->rr_cols);
1857
1858 ccount = rr->rr_col[c].rc_size;
1859 ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0);
1860 if (ccount == 0)
1861 continue;
1862 src = abd_to_buf(rr->rr_col[c].rc_abd);
1863 for (j = 0; j < nmissing; j++) {
1864 cc = missing[j] + rr->rr_firstdatacol;
1865 ASSERT3U(cc, >=, rr->rr_firstdatacol);
1866 ASSERT3U(cc, <, rr->rr_cols);
1867 ASSERT3U(cc, !=, c);
1868
1869 dcount[j] = rr->rr_col[cc].rc_size;
1870 if (dcount[j] != 0)
1871 dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd);
1872 }
1873
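		/*
		 * For each source byte, multiply by the corresponding
		 * inverse-matrix entry using the log/exp tables and
		 * accumulate (XOR) into each missing column.  log(0) is
		 * undefined, so a zero source byte contributes nothing.
		 */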
1874 for (x = 0; x < ccount; x++, src++) {
1875 if (*src != 0)
1876 log = vdev_raidz_log2[*src];
1877
1878 for (cc = 0; cc < nmissing; cc++) {
1879 if (x >= dcount[cc])
1880 continue;
1881
1882 if (*src == 0) {
1883 val = 0;
1884 } else {
1885 if ((ll = log + invlog[cc][i]) >= 255)
1886 ll -= 255;
1887 val = vdev_raidz_pow2[ll];
1888 }
1889
1890 if (i == 0)
1891 dst[cc][x] = val;
1892 else
1893 dst[cc][x] ^= val;
1894 }
1895 }
1896 }
1897
1898 kmem_free(p, psize);
1899 }
1900
1901 static void
1902 vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
1903 {
1904 int i, c, t, tt;
1905 unsigned int n;
1906 unsigned int nmissing_rows;
1907 int missing_rows[VDEV_RAIDZ_MAXPARITY];
1908 int parity_map[VDEV_RAIDZ_MAXPARITY];
1909 uint8_t *p, *pp;
1910 size_t psize;
1911 uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1912 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1913 uint8_t *used;
1914
1915 abd_t **bufs = NULL;
1916
1917 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1918 zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts);
1919 /*
1920 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
1921 * temporary linear ABDs if any non-linear ABDs are found.
1922 */
1923 for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) {
1924 ASSERT(rr->rr_col[i].rc_abd != NULL);
1925 if (!abd_is_linear(rr->rr_col[i].rc_abd)) {
1926 bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *),
1927 KM_PUSHPAGE);
1928
1929 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1930 raidz_col_t *col = &rr->rr_col[c];
1931
1932 bufs[c] = col->rc_abd;
1933 if (bufs[c] != NULL) {
1934 col->rc_abd = abd_alloc_linear(
1935 col->rc_size, B_TRUE);
1936 abd_copy(col->rc_abd, bufs[c],
1937 col->rc_size);
1938 }
1939 }
1940
1941 break;
1942 }
1943 }
1944
1945 n = rr->rr_cols - rr->rr_firstdatacol;
1946
1947 /*
1948 * Figure out which data columns are missing.
1949 */
1950 nmissing_rows = 0;
1951 for (t = 0; t < ntgts; t++) {
1952 if (tgts[t] >= rr->rr_firstdatacol) {
1953 missing_rows[nmissing_rows++] =
1954 tgts[t] - rr->rr_firstdatacol;
1955 }
1956 }
1957
1958 /*
1959 * Figure out which parity columns to use to help generate the missing
1960 * data columns.
1961 */
1962 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1963 ASSERT(tt < ntgts);
1964 ASSERT(c < rr->rr_firstdatacol);
1965
1966 /*
1967 * Skip any targeted parity columns.
1968 */
1969 if (c == tgts[tt]) {
1970 tt++;
1971 continue;
1972 }
1973
1974 parity_map[i] = c;
1975 i++;
1976 }
1977
1978 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1979 nmissing_rows * n + sizeof (used[0]) * n;
1980 p = kmem_alloc(psize, KM_SLEEP);
1981
1982 for (pp = p, i = 0; i < nmissing_rows; i++) {
1983 rows[i] = pp;
1984 pp += n;
1985 invrows[i] = pp;
1986 pp += n;
1987 }
1988 used = pp;
1989
1990 for (i = 0; i < nmissing_rows; i++) {
1991 used[i] = parity_map[i];
1992 }
1993
1994 for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1995 if (tt < nmissing_rows &&
1996 c == missing_rows[tt] + rr->rr_firstdatacol) {
1997 tt++;
1998 continue;
1999 }
2000
2001 ASSERT3S(i, <, n);
2002 used[i] = c;
2003 i++;
2004 }
2005
2006 /*
2007 * Initialize the interesting rows of the matrix.
2008 */
2009 vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows);
2010
2011 /*
2012 * Invert the matrix.
2013 */
2014 vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows,
2015 invrows, used);
2016
2017 /*
2018 * Reconstruct the missing data using the generated matrix.
2019 */
2020 vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows,
2021 invrows, used);
2022
2023 kmem_free(p, psize);
2024
2025 /*
2026 	 * Copy back from the temporary linear ABDs and free them.
2027 */
2028 if (bufs) {
2029 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
2030 raidz_col_t *col = &rr->rr_col[c];
2031
2032 if (bufs[c] != NULL) {
2033 abd_copy(bufs[c], col->rc_abd, col->rc_size);
2034 abd_free(col->rc_abd);
2035 }
2036 col->rc_abd = bufs[c];
2037 }
2038 kmem_free(bufs, rr->rr_cols * sizeof (abd_t *));
2039 }
2040 }
2041
2042 static void
2043 vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
2044 const int *t, int nt)
2045 {
2046 int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
2047 int ntgts;
2048 int i, c, ret;
2049 int nbadparity, nbaddata;
2050 int parity_valid[VDEV_RAIDZ_MAXPARITY];
2051
2052 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
2053 zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)",
2054 rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata,
2055 (int)rr->rr_missingparity);
2056 }
2057
2058 nbadparity = rr->rr_firstdatacol;
2059 nbaddata = rr->rr_cols - nbadparity;
2060 ntgts = 0;
2061 for (i = 0, c = 0; c < rr->rr_cols; c++) {
2062 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
2063 zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u "
2064 "offset=%llx error=%u)",
2065 rr, c, (int)rr->rr_col[c].rc_devidx,
2066 (long long)rr->rr_col[c].rc_offset,
2067 (int)rr->rr_col[c].rc_error);
2068 }
2069 if (c < rr->rr_firstdatacol)
2070 parity_valid[c] = B_FALSE;
2071
2072 if (i < nt && c == t[i]) {
2073 tgts[ntgts++] = c;
2074 i++;
2075 } else if (rr->rr_col[c].rc_error != 0) {
2076 tgts[ntgts++] = c;
2077 } else if (c >= rr->rr_firstdatacol) {
2078 nbaddata--;
2079 } else {
2080 parity_valid[c] = B_TRUE;
2081 nbadparity--;
2082 }
2083 }
2084
2085 ASSERT(ntgts >= nt);
2086 ASSERT(nbaddata >= 0);
2087 ASSERT(nbaddata + nbadparity == ntgts);
2088
2089 dt = &tgts[nbadparity];
2090
2091 /* Reconstruct using the new math implementation */
2092 ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata);
2093 if (ret != RAIDZ_ORIGINAL_IMPL)
2094 return;
2095
2096 /*
2097 * See if we can use any of our optimized reconstruction routines.
2098 */
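	/*
	 * One missing data column can be rebuilt from P alone (simple XOR)
	 * or from Q alone; two missing data columns need both P and Q.
	 * Anything else falls through to the general matrix-inversion path
	 * below.
	 */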
2099 switch (nbaddata) {
2100 case 1:
2101 if (parity_valid[VDEV_RAIDZ_P]) {
2102 vdev_raidz_reconstruct_p(rr, dt, 1);
2103 return;
2104 }
2105
2106 ASSERT(rr->rr_firstdatacol > 1);
2107
2108 if (parity_valid[VDEV_RAIDZ_Q]) {
2109 vdev_raidz_reconstruct_q(rr, dt, 1);
2110 return;
2111 }
2112
2113 ASSERT(rr->rr_firstdatacol > 2);
2114 break;
2115
2116 case 2:
2117 ASSERT(rr->rr_firstdatacol > 1);
2118
2119 if (parity_valid[VDEV_RAIDZ_P] &&
2120 parity_valid[VDEV_RAIDZ_Q]) {
2121 vdev_raidz_reconstruct_pq(rr, dt, 2);
2122 return;
2123 }
2124
2125 ASSERT(rr->rr_firstdatacol > 2);
2126
2127 break;
2128 }
2129
2130 vdev_raidz_reconstruct_general(rr, tgts, ntgts);
2131 }
2132
2133 static int
2134 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
2135 uint64_t *logical_ashift, uint64_t *physical_ashift)
2136 {
2137 vdev_raidz_t *vdrz = vd->vdev_tsd;
2138 uint64_t nparity = vdrz->vd_nparity;
2139 int c;
2140 int lasterror = 0;
2141 int numerrors = 0;
2142
2143 ASSERT(nparity > 0);
2144
2145 if (nparity > VDEV_RAIDZ_MAXPARITY ||
2146 vd->vdev_children < nparity + 1) {
2147 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
2148 return (SET_ERROR(EINVAL));
2149 }
2150
2151 vdev_open_children(vd);
2152
2153 for (c = 0; c < vd->vdev_children; c++) {
2154 vdev_t *cvd = vd->vdev_child[c];
2155
2156 if (cvd->vdev_open_error != 0) {
2157 lasterror = cvd->vdev_open_error;
2158 numerrors++;
2159 continue;
2160 }
2161
2162 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
2163 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
2164 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
2165 }
2166 for (c = 0; c < vd->vdev_children; c++) {
2167 vdev_t *cvd = vd->vdev_child[c];
2168
2169 if (cvd->vdev_open_error != 0)
2170 continue;
2171 *physical_ashift = vdev_best_ashift(*logical_ashift,
2172 *physical_ashift, cvd->vdev_physical_ashift);
2173 }
2174
2175 if (vd->vdev_rz_expanding) {
2176 *asize *= vd->vdev_children - 1;
2177 *max_asize *= vd->vdev_children - 1;
2178
2179 vd->vdev_min_asize = *asize;
2180 } else {
2181 *asize *= vd->vdev_children;
2182 *max_asize *= vd->vdev_children;
2183 }
2184
2185 if (numerrors > nparity) {
2186 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
2187 return (lasterror);
2188 }
2189
2190 return (0);
2191 }
2192
2193 static void
2194 vdev_raidz_close(vdev_t *vd)
2195 {
2196 for (int c = 0; c < vd->vdev_children; c++) {
2197 if (vd->vdev_child[c] != NULL)
2198 vdev_close(vd->vdev_child[c]);
2199 }
2200 }
2201
2202 /*
2203 * Return the logical width to use, given the txg in which the allocation
2204 * happened. Note that BP_GET_BIRTH() is usually the txg in which the
2205  * BP was allocated. Remapped BPs (those that were relocated due to device
2206  * removal, see remap_blkptr_cb()) will have a more recent physical birth
2207 * which reflects when the BP was relocated, but we can ignore these because
2208 * they can't be on RAIDZ (device removal doesn't support RAIDZ).
2209 */
2210 static uint64_t
2211 vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg)
2212 {
2213 reflow_node_t lookup = {
2214 .re_txg = txg,
2215 };
2216 avl_index_t where;
2217
2218 uint64_t width;
2219 mutex_enter(&vdrz->vd_expand_lock);
2220 reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where);
2221 if (re != NULL) {
2222 width = re->re_logical_width;
2223 } else {
2224 re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE);
2225 if (re != NULL)
2226 width = re->re_logical_width;
2227 else
2228 width = vdrz->vd_original_width;
2229 }
2230 mutex_exit(&vdrz->vd_expand_lock);
2231 return (width);
2232 }
2233
2234 /*
2235 * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated
2236 * more space due to the lower data-to-parity ratio. In this case it's
2237 * important to pass in the correct txg. Note that vdev_gang_header_asize()
2238 * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE,
2239 * regardless of txg. This is assured because for a single data sector, we
2240 * allocate P+1 sectors regardless of width ("cols", which is at least P+1).
2241 */
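/*
 * For example, with 512-byte sectors (ashift=9), psize=4K (8 sectors) on a
 * 6-wide RAIDZ2 (cols=6, nparity=2): the 8 data sectors need
 * ceil(8 / 4) = 2 rows, each adding 2 parity sectors, for 12 sectors total;
 * 12 is already a multiple of nparity+1 = 3, so asize = 12 sectors = 6K.
 */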
2242 static uint64_t
2243 vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
2244 {
2245 vdev_raidz_t *vdrz = vd->vdev_tsd;
2246 uint64_t asize;
2247 uint64_t ashift = vd->vdev_top->vdev_ashift;
2248 uint64_t cols = vdrz->vd_original_width;
2249 uint64_t nparity = vdrz->vd_nparity;
2250
2251 cols = vdev_raidz_get_logical_width(vdrz, txg);
2252
2253 asize = ((psize - 1) >> ashift) + 1;
2254 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
2255 asize = roundup(asize, nparity + 1) << ashift;
2256
2257 #ifdef ZFS_DEBUG
2258 uint64_t asize_new = ((psize - 1) >> ashift) + 1;
2259 uint64_t ncols_new = vdrz->vd_physical_width;
2260 asize_new += nparity * ((asize_new + ncols_new - nparity - 1) /
2261 (ncols_new - nparity));
2262 asize_new = roundup(asize_new, nparity + 1) << ashift;
2263 VERIFY3U(asize_new, <=, asize);
2264 #endif
2265
2266 return (asize);
2267 }
2268
2269 /*
2270 * The allocatable space for a raidz vdev is N * sizeof(smallest child)
2271 * so each child must provide at least 1/Nth of its asize.
2272 */
2273 static uint64_t
2274 vdev_raidz_min_asize(vdev_t *vd)
2275 {
2276 return ((vd->vdev_min_asize + vd->vdev_children - 1) /
2277 vd->vdev_children);
2278 }
2279
2280 void
2281 vdev_raidz_child_done(zio_t *zio)
2282 {
2283 raidz_col_t *rc = zio->io_private;
2284
2285 ASSERT3P(rc->rc_abd, !=, NULL);
2286 rc->rc_error = zio->io_error;
2287 rc->rc_tried = 1;
2288 rc->rc_skipped = 0;
2289 }
2290
2291 static void
2292 vdev_raidz_shadow_child_done(zio_t *zio)
2293 {
2294 raidz_col_t *rc = zio->io_private;
2295
2296 rc->rc_shadow_error = zio->io_error;
2297 }
2298
2299 static void
2300 vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col)
2301 {
2302 (void) rm;
2303 #ifdef ZFS_DEBUG
2304 range_seg64_t logical_rs, physical_rs, remain_rs;
2305 logical_rs.rs_start = rr->rr_offset;
2306 logical_rs.rs_end = logical_rs.rs_start +
2307 vdev_raidz_asize(zio->io_vd, rr->rr_size,
2308 BP_GET_BIRTH(zio->io_bp));
2309
2310 raidz_col_t *rc = &rr->rr_col[col];
2311 vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
2312
2313 vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
2314 ASSERT(vdev_xlate_is_empty(&remain_rs));
2315 if (vdev_xlate_is_empty(&physical_rs)) {
2316 /*
2317 * If we are in the middle of expansion, the
2318 * physical->logical mapping is changing so vdev_xlate()
2319 * can't give us a reliable answer.
2320 */
2321 return;
2322 }
2323 ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
2324 ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
2325 /*
2326 * It would be nice to assert that rs_end is equal
2327 * to rc_offset + rc_size but there might be an
2328 * optional I/O at the end that is not accounted in
2329 * rc_size.
2330 */
2331 if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
2332 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
2333 rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift));
2334 } else {
2335 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
2336 }
2337 #endif
2338 }
2339
2340 static void
2341 vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr)
2342 {
2343 vdev_t *vd = zio->io_vd;
2344 raidz_map_t *rm = zio->io_vsd;
2345
2346 vdev_raidz_generate_parity_row(rm, rr);
2347
2348 for (int c = 0; c < rr->rr_scols; c++) {
2349 raidz_col_t *rc = &rr->rr_col[c];
2350 vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2351
2352 /* Verify physical to logical translation */
2353 vdev_raidz_io_verify(zio, rm, rr, c);
2354
2355 if (rc->rc_size == 0)
2356 continue;
2357
2358 ASSERT3U(rc->rc_offset + rc->rc_size, <,
2359 cvd->vdev_psize - VDEV_LABEL_END_SIZE);
2360
2361 ASSERT3P(rc->rc_abd, !=, NULL);
2362 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2363 rc->rc_offset, rc->rc_abd,
2364 abd_get_size(rc->rc_abd), zio->io_type,
2365 zio->io_priority, 0, vdev_raidz_child_done, rc));
2366
2367 if (rc->rc_shadow_devidx != INT_MAX) {
2368 vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx];
2369
2370 ASSERT3U(
2371 rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <,
2372 cvd2->vdev_psize - VDEV_LABEL_END_SIZE);
2373
2374 zio_nowait(zio_vdev_child_io(zio, NULL, cvd2,
2375 rc->rc_shadow_offset, rc->rc_abd,
2376 abd_get_size(rc->rc_abd),
2377 zio->io_type, zio->io_priority, 0,
2378 vdev_raidz_shadow_child_done, rc));
2379 }
2380 }
2381 }
2382
2383 /*
2384 * Generate optional I/Os for skip sectors to improve aggregation contiguity.
2385 * This only works for vdev_raidz_map_alloc() (not _expanded()).
2386 */
2387 static void
2388 raidz_start_skip_writes(zio_t *zio)
2389 {
2390 vdev_t *vd = zio->io_vd;
2391 uint64_t ashift = vd->vdev_top->vdev_ashift;
2392 raidz_map_t *rm = zio->io_vsd;
2393 ASSERT3U(rm->rm_nrows, ==, 1);
2394 raidz_row_t *rr = rm->rm_row[0];
2395 for (int c = 0; c < rr->rr_scols; c++) {
2396 raidz_col_t *rc = &rr->rr_col[c];
2397 vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2398 if (rc->rc_size != 0)
2399 continue;
2400 ASSERT3P(rc->rc_abd, ==, NULL);
2401
2402 ASSERT3U(rc->rc_offset, <,
2403 cvd->vdev_psize - VDEV_LABEL_END_SIZE);
2404
2405 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset,
2406 NULL, 1ULL << ashift, zio->io_type, zio->io_priority,
2407 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
2408 }
2409 }
2410
2411 static void
2412 vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
2413 {
2414 vdev_t *vd = zio->io_vd;
2415
2416 /*
2417 * Iterate over the columns in reverse order so that we hit the parity
2418 * last -- any errors along the way will force us to read the parity.
2419 */
2420 for (int c = rr->rr_cols - 1; c >= 0; c--) {
2421 raidz_col_t *rc = &rr->rr_col[c];
2422 if (rc->rc_size == 0)
2423 continue;
2424 vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2425 if (!vdev_readable(cvd)) {
2426 if (c >= rr->rr_firstdatacol)
2427 rr->rr_missingdata++;
2428 else
2429 rr->rr_missingparity++;
2430 rc->rc_error = SET_ERROR(ENXIO);
2431 rc->rc_tried = 1; /* don't even try */
2432 rc->rc_skipped = 1;
2433 continue;
2434 }
2435 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2436 if (c >= rr->rr_firstdatacol)
2437 rr->rr_missingdata++;
2438 else
2439 rr->rr_missingparity++;
2440 rc->rc_error = SET_ERROR(ESTALE);
2441 rc->rc_skipped = 1;
2442 continue;
2443 }
2444 if (forceparity ||
2445 c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
2446 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
2447 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2448 rc->rc_offset, rc->rc_abd, rc->rc_size,
2449 zio->io_type, zio->io_priority, 0,
2450 vdev_raidz_child_done, rc));
2451 }
2452 }
2453 }
2454
2455 static void
2456 vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm)
2457 {
2458 vdev_t *vd = zio->io_vd;
2459
2460 for (int i = 0; i < rm->rm_nphys_cols; i++) {
2461 raidz_col_t *prc = &rm->rm_phys_col[i];
2462 if (prc->rc_size == 0)
2463 continue;
2464
2465 ASSERT3U(prc->rc_devidx, ==, i);
2466 vdev_t *cvd = vd->vdev_child[i];
2467 if (!vdev_readable(cvd)) {
2468 prc->rc_error = SET_ERROR(ENXIO);
2469 prc->rc_tried = 1; /* don't even try */
2470 prc->rc_skipped = 1;
2471 continue;
2472 }
2473 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2474 prc->rc_error = SET_ERROR(ESTALE);
2475 prc->rc_skipped = 1;
2476 continue;
2477 }
2478 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2479 prc->rc_offset, prc->rc_abd, prc->rc_size,
2480 zio->io_type, zio->io_priority, 0,
2481 vdev_raidz_child_done, prc));
2482 }
2483 }
2484
2485 static void
2486 vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm)
2487 {
2488 /*
2489 * If there are multiple rows, we will be hitting
2490 * all disks, so go ahead and read the parity so
2491 * that we are reading in decent size chunks.
2492 */
2493 boolean_t forceparity = rm->rm_nrows > 1;
2494
2495 if (rm->rm_phys_col) {
2496 vdev_raidz_io_start_read_phys_cols(zio, rm);
2497 } else {
2498 for (int i = 0; i < rm->rm_nrows; i++) {
2499 raidz_row_t *rr = rm->rm_row[i];
2500 vdev_raidz_io_start_read_row(zio, rr, forceparity);
2501 }
2502 }
2503 }
2504
2505 /*
2506 * Start an IO operation on a RAIDZ VDev
2507 *
2508 * Outline:
2509 * - For write operations:
2510 * 1. Generate the parity data
2511 * 2. Create child zio write operations to each column's vdev, for both
2512 * data and parity.
2513 * 3. If the column skips any sectors for padding, create optional dummy
2514  *   write zio children for those areas to improve aggregation contiguity.
2515 * - For read operations:
2516 * 1. Create child zio read operations to each data column's vdev to read
2517 * the range of data required for zio.
2518 * 2. If this is a scrub or resilver operation, or if any of the data
2519 * vdevs have had errors, then create zio read operations to the parity
2520 * columns' VDevs as well.
2521 */
2522 static void
2523 vdev_raidz_io_start(zio_t *zio)
2524 {
2525 vdev_t *vd = zio->io_vd;
2526 vdev_t *tvd = vd->vdev_top;
2527 vdev_raidz_t *vdrz = vd->vdev_tsd;
2528 raidz_map_t *rm;
2529
2530 uint64_t logical_width = vdev_raidz_get_logical_width(vdrz,
2531 BP_GET_BIRTH(zio->io_bp));
2532 if (logical_width != vdrz->vd_physical_width) {
2533 zfs_locked_range_t *lr = NULL;
2534 uint64_t synced_offset = UINT64_MAX;
2535 uint64_t next_offset = UINT64_MAX;
2536 boolean_t use_scratch = B_FALSE;
2537 /*
2538 * Note: when the expansion is completing, we set
2539 * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync())
2540 * in a later txg than when we last update spa_ubsync's state
2541 * (see the end of spa_raidz_expand_thread()). Therefore we
2542 * may see vre_state!=SCANNING before
2543 * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected
2544 * on disk, but the copying progress has been synced to disk
2545 * (and reflected in spa_ubsync). In this case it's fine to
2546 * treat the expansion as completed, since if we crash there's
2547 * no additional copying to do.
2548 */
2549 if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
2550 ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==,
2551 &vdrz->vn_vre);
2552 lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock,
2553 zio->io_offset, zio->io_size, RL_READER);
2554 use_scratch =
2555 (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) ==
2556 RRSS_SCRATCH_VALID);
2557 synced_offset =
2558 RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync);
2559 next_offset = vdrz->vn_vre.vre_offset;
2560 /*
2561 * If we haven't resumed expanding since importing the
2562 * pool, vre_offset won't have been set yet. In
2563 * this case the next offset to be copied is the same
2564 * as what was synced.
2565 */
2566 if (next_offset == UINT64_MAX) {
2567 next_offset = synced_offset;
2568 }
2569 }
2570 if (use_scratch) {
2571 zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced="
2572 "%lld next_offset=%lld use_scratch=%u",
2573 zio,
2574 zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ",
2575 (long long)zio->io_offset,
2576 (long long)synced_offset,
2577 (long long)next_offset,
2578 use_scratch);
2579 }
2580
2581 rm = vdev_raidz_map_alloc_expanded(zio,
2582 tvd->vdev_ashift, vdrz->vd_physical_width,
2583 logical_width, vdrz->vd_nparity,
2584 synced_offset, next_offset, use_scratch);
2585 rm->rm_lr = lr;
2586 } else {
2587 rm = vdev_raidz_map_alloc(zio,
2588 tvd->vdev_ashift, logical_width, vdrz->vd_nparity);
2589 }
2590 rm->rm_original_width = vdrz->vd_original_width;
2591
2592 zio->io_vsd = rm;
2593 zio->io_vsd_ops = &vdev_raidz_vsd_ops;
2594 if (zio->io_type == ZIO_TYPE_WRITE) {
2595 for (int i = 0; i < rm->rm_nrows; i++) {
2596 vdev_raidz_io_start_write(zio, rm->rm_row[i]);
2597 }
2598
2599 if (logical_width == vdrz->vd_physical_width) {
2600 raidz_start_skip_writes(zio);
2601 }
2602 } else {
2603 ASSERT(zio->io_type == ZIO_TYPE_READ);
2604 vdev_raidz_io_start_read(zio, rm);
2605 }
2606
2607 zio_execute(zio);
2608 }
2609
2610 /*
2611 * Report a checksum error for a child of a RAID-Z device.
2612 */
2613 void
2614 vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
2615 {
2616 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
2617
2618 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) &&
2619 zio->io_priority != ZIO_PRIORITY_REBUILD) {
2620 zio_bad_cksum_t zbc;
2621 raidz_map_t *rm = zio->io_vsd;
2622
2623 zbc.zbc_has_cksum = 0;
2624 zbc.zbc_injected = rm->rm_ecksuminjected;
2625
2626 mutex_enter(&vd->vdev_stat_lock);
2627 vd->vdev_stat.vs_checksum_errors++;
2628 mutex_exit(&vd->vdev_stat_lock);
2629 (void) zfs_ereport_post_checksum(zio->io_spa, vd,
2630 &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
2631 rc->rc_abd, bad_data, &zbc);
2632 }
2633 }
2634
2635 /*
2636 * We keep track of whether or not there were any injected errors, so that
2637 * any ereports we generate can note it.
2638 */
2639 static int
2640 raidz_checksum_verify(zio_t *zio)
2641 {
2642 zio_bad_cksum_t zbc = {0};
2643 raidz_map_t *rm = zio->io_vsd;
2644
2645 int ret = zio_checksum_error(zio, &zbc);
2646 /*
2647 * Any Direct I/O read that has a checksum error must be treated as
2648 * suspicious as the contents of the buffer could be getting
2649 * manipulated while the I/O is taking place. The checksum verify error
2650 * will be reported to the top-level RAIDZ VDEV.
2651 */
2652 if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) {
2653 zio->io_error = ret;
2654 zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
2655 zio_dio_chksum_verify_error_report(zio);
2656 zio_checksum_verified(zio);
2657 return (0);
2658 }
2659
2660 if (ret != 0 && zbc.zbc_injected != 0)
2661 rm->rm_ecksuminjected = 1;
2662
2663 return (ret);
2664 }
2665
2666 /*
2667 * Generate the parity from the data columns. If we tried and were able to
2668 * read the parity without error, verify that the generated parity matches the
2669 * data we read. If it doesn't, we fire off a checksum error. Return the
2670 * number of such failures.
2671 */
2672 static int
2673 raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
2674 {
2675 abd_t *orig[VDEV_RAIDZ_MAXPARITY];
2676 int c, ret = 0;
2677 raidz_map_t *rm = zio->io_vsd;
2678 raidz_col_t *rc;
2679
2680 blkptr_t *bp = zio->io_bp;
2681 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
2682 (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
2683
2684 if (checksum == ZIO_CHECKSUM_NOPARITY)
2685 return (ret);
2686
2687 for (c = 0; c < rr->rr_firstdatacol; c++) {
2688 rc = &rr->rr_col[c];
2689 if (!rc->rc_tried || rc->rc_error != 0)
2690 continue;
2691
2692 orig[c] = rc->rc_abd;
2693 ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size);
2694 rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
2695 }
2696
2697 /*
2698 * Verify any empty sectors are zero filled to ensure the parity
2699 * is calculated correctly even if these non-data sectors are damaged.
2700 */
2701 if (rr->rr_nempty && rr->rr_abd_empty != NULL)
2702 ret += vdev_draid_map_verify_empty(zio, rr);
2703
2704 /*
2705 	 * This regenerates parity even for !tried||rc_error!=0 columns.  It
2706 	 * isn't harmful, but it does have the side effect of fixing parity
2707 	 * we didn't realize needed fixing (i.e. even when we return 0).
2708 */
2709 vdev_raidz_generate_parity_row(rm, rr);
2710
2711 for (c = 0; c < rr->rr_firstdatacol; c++) {
2712 rc = &rr->rr_col[c];
2713
2714 if (!rc->rc_tried || rc->rc_error != 0)
2715 continue;
2716
2717 if (abd_cmp(orig[c], rc->rc_abd) != 0) {
2718 zfs_dbgmsg("found error on col=%u devidx=%u off %llx",
2719 c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset);
2720 vdev_raidz_checksum_error(zio, rc, orig[c]);
2721 rc->rc_error = SET_ERROR(ECKSUM);
2722 ret++;
2723 }
2724 abd_free(orig[c]);
2725 }
2726
2727 return (ret);
2728 }
2729
2730 static int
2731 vdev_raidz_worst_error(raidz_row_t *rr)
2732 {
2733 int error = 0;
2734
2735 for (int c = 0; c < rr->rr_cols; c++) {
2736 error = zio_worst_error(error, rr->rr_col[c].rc_error);
2737 error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error);
2738 }
2739
2740 return (error);
2741 }
2742
2743 static void
2744 vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
2745 {
2746 int unexpected_errors = 0;
2747 int parity_errors = 0;
2748 int parity_untried = 0;
2749 int data_errors = 0;
2750
2751 ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
2752
2753 for (int c = 0; c < rr->rr_cols; c++) {
2754 raidz_col_t *rc = &rr->rr_col[c];
2755
2756 if (rc->rc_error) {
2757 if (c < rr->rr_firstdatacol)
2758 parity_errors++;
2759 else
2760 data_errors++;
2761
2762 if (!rc->rc_skipped)
2763 unexpected_errors++;
2764 } else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
2765 parity_untried++;
2766 }
2767
2768 if (rc->rc_force_repair)
2769 unexpected_errors++;
2770 }
2771
2772 /*
2773 * If we read more parity disks than were used for
2774 * reconstruction, confirm that the other parity disks produced
2775 * correct data.
2776 *
2777 * Note that we also regenerate parity when resilvering so we
2778 * can write it out to failed devices later.
2779 */
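	/*
	 * data_errors parity columns were consumed by reconstruction (one
	 * per missing data column), so this condition holds when at least
	 * one additional parity column was read successfully and can be
	 * checked against regenerated parity.
	 */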
2780 if (parity_errors + parity_untried <
2781 rr->rr_firstdatacol - data_errors ||
2782 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2783 int n = raidz_parity_verify(zio, rr);
2784 unexpected_errors += n;
2785 }
2786
2787 if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2788 (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2789 /*
2790 * Use the good data we have in hand to repair damaged children.
2791 */
2792 for (int c = 0; c < rr->rr_cols; c++) {
2793 raidz_col_t *rc = &rr->rr_col[c];
2794 vdev_t *vd = zio->io_vd;
2795 vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2796
2797 if (!rc->rc_allow_repair) {
2798 continue;
2799 } else if (!rc->rc_force_repair &&
2800 (rc->rc_error == 0 || rc->rc_size == 0)) {
2801 continue;
2802 }
2803 /*
2804 * We do not allow self healing for Direct I/O reads.
2805 * See comment in vdev_raid_row_alloc().
2806 */
2807 ASSERT0(zio->io_flags & ZIO_FLAG_DIO_READ);
2808
2809 zfs_dbgmsg("zio=%px repairing c=%u devidx=%u "
2810 "offset=%llx",
2811 zio, c, rc->rc_devidx, (long long)rc->rc_offset);
2812
2813 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2814 rc->rc_offset, rc->rc_abd, rc->rc_size,
2815 ZIO_TYPE_WRITE,
2816 zio->io_priority == ZIO_PRIORITY_REBUILD ?
2817 ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
2818 ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2819 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2820 }
2821 }
2822
2823 /*
2824 * Scrub or resilver i/o's: overwrite any shadow locations with the
2825 * good data. This ensures that if we've already copied this sector,
2826 * it will be corrected if it was damaged. This writes more than is
2827 * necessary, but since expansion is paused during scrub/resilver, at
2828 * most a single row will have a shadow location.
2829 */
2830 if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2831 (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) {
2832 for (int c = 0; c < rr->rr_cols; c++) {
2833 raidz_col_t *rc = &rr->rr_col[c];
2834 vdev_t *vd = zio->io_vd;
2835
2836 if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0)
2837 continue;
2838 vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx];
2839
2840 /*
2841 * Note: We don't want to update the repair stats
2842 * because that would incorrectly indicate that there
2843 * was bad data to repair, which we aren't sure about.
2844 * By clearing the SCAN_THREAD flag, we prevent this
2845 * from happening, despite having the REPAIR flag set.
2846 * We need to set SELF_HEAL so that this i/o can't be
2847 * bypassed by zio_vdev_io_start().
2848 */
2849 zio_t *cio = zio_vdev_child_io(zio, NULL, cvd,
2850 rc->rc_shadow_offset, rc->rc_abd, rc->rc_size,
2851 ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2852 ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
2853 NULL, NULL);
2854 cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD;
2855 zio_nowait(cio);
2856 }
2857 }
2858 }
2859
2860 static void
2861 raidz_restore_orig_data(raidz_map_t *rm)
2862 {
2863 for (int i = 0; i < rm->rm_nrows; i++) {
2864 raidz_row_t *rr = rm->rm_row[i];
2865 for (int c = 0; c < rr->rr_cols; c++) {
2866 raidz_col_t *rc = &rr->rr_col[c];
2867 if (rc->rc_need_orig_restore) {
2868 abd_copy(rc->rc_abd,
2869 rc->rc_orig_data, rc->rc_size);
2870 rc->rc_need_orig_restore = B_FALSE;
2871 }
2872 }
2873 }
2874 }
2875
2876 /*
2877  * During raidz_reconstruct() for an expanded VDEV, failure simulation needs
2878  * special consideration. See the note in raidz_reconstruct() on simulating
2879  * the failure of a pre-expansion device.
2880 *
2881 * Treating logical child i as failed, return TRUE if the given column should
2882 * be treated as failed. The idea of logical children allows us to imagine
2883 * that a disk silently failed before a RAIDZ expansion (reads from this disk
2884 * succeed but return the wrong data). Since the expansion doesn't verify
2885 * checksums, the incorrect data will be moved to new locations spread among
2886 * the children (going diagonally across them).
2887 *
2888 * Higher "logical child failures" (values of `i`) indicate these
2889 * "pre-expansion failures". The first physical_width values imagine that a
2890 * current child failed; the next physical_width-1 values imagine that a
2891 * child failed before the most recent expansion; the next physical_width-2
2892 * values imagine a child failed in the expansion before that, etc.
2893 */
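/*
 * For example, with physical_width = 5 and original_width = 4, logical
 * children 0-4 simulate the failure of one of the five current disks
 * (sector_id % 5 == i), while logical children 5-8 simulate a disk that
 * failed before the expansion, whose old sectors now satisfy
 * sector_id % 4 == (i - 5).
 */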
2894 static boolean_t
2895 raidz_simulate_failure(int physical_width, int original_width, int ashift,
2896 int i, raidz_col_t *rc)
2897 {
2898 uint64_t sector_id =
2899 physical_width * (rc->rc_offset >> ashift) +
2900 rc->rc_devidx;
2901
2902 for (int w = physical_width; w >= original_width; w--) {
2903 if (i < w) {
2904 return (sector_id % w == i);
2905 } else {
2906 i -= w;
2907 }
2908 }
2909 ASSERT(!"invalid logical child id");
2910 return (B_FALSE);
2911 }
2912
2913 /*
2914 * returns EINVAL if reconstruction of the block will not be possible
2915 * returns ECKSUM if this specific reconstruction failed
2916 * returns 0 on successful reconstruction
2917 */
2918 static int
2919 raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
2920 {
2921 raidz_map_t *rm = zio->io_vsd;
2922 int physical_width = zio->io_vd->vdev_children;
2923 int original_width = (rm->rm_original_width != 0) ?
2924 rm->rm_original_width : physical_width;
2925 int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT;
2926
2927 if (dbgmsg) {
2928 zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u "
2929 "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts);
2930 }
2931
2932 /* Reconstruct each row */
2933 for (int r = 0; r < rm->rm_nrows; r++) {
2934 raidz_row_t *rr = rm->rm_row[r];
2935 int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */
2936 int t = 0;
2937 int dead = 0;
2938 int dead_data = 0;
2939
2940 if (dbgmsg)
2941 zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r);
2942
2943 for (int c = 0; c < rr->rr_cols; c++) {
2944 raidz_col_t *rc = &rr->rr_col[c];
2945 ASSERT0(rc->rc_need_orig_restore);
2946 if (rc->rc_error != 0) {
2947 dead++;
2948 if (c >= nparity)
2949 dead_data++;
2950 continue;
2951 }
2952 if (rc->rc_size == 0)
2953 continue;
2954 for (int lt = 0; lt < ntgts; lt++) {
2955 if (raidz_simulate_failure(physical_width,
2956 original_width,
2957 zio->io_vd->vdev_top->vdev_ashift,
2958 ltgts[lt], rc)) {
2959 if (rc->rc_orig_data == NULL) {
2960 rc->rc_orig_data =
2961 abd_alloc_linear(
2962 rc->rc_size, B_TRUE);
2963 abd_copy(rc->rc_orig_data,
2964 rc->rc_abd, rc->rc_size);
2965 }
2966 rc->rc_need_orig_restore = B_TRUE;
2967
2968 dead++;
2969 if (c >= nparity)
2970 dead_data++;
2971 /*
2972 * Note: simulating failure of a
2973 * pre-expansion device can hit more
2974 * than one column, in which case we
2975 * might try to simulate more failures
2976 * than can be reconstructed, which is
2977 * also more than the size of my_tgts.
2978 * This check prevents accessing past
2979 * the end of my_tgts. The "dead >
2980 * nparity" check below will fail this
2981 * reconstruction attempt.
2982 */
2983 if (t < VDEV_RAIDZ_MAXPARITY) {
2984 my_tgts[t++] = c;
2985 if (dbgmsg) {
2986 zfs_dbgmsg("simulating "
2987 "failure of col %u "
2988 "devidx %u", c,
2989 (int)rc->rc_devidx);
2990 }
2991 }
2992 break;
2993 }
2994 }
2995 }
2996 if (dead > nparity) {
2997 /* reconstruction not possible */
2998 if (dbgmsg) {
2999 zfs_dbgmsg("reconstruction not possible; "
3000 "too many failures");
3001 }
3002 raidz_restore_orig_data(rm);
3003 return (EINVAL);
3004 }
3005 if (dead_data > 0)
3006 vdev_raidz_reconstruct_row(rm, rr, my_tgts, t);
3007 }
3008
3009 /* Check for success */
3010 if (raidz_checksum_verify(zio) == 0) {
3011 if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
3012 return (0);
3013
3014 /* Reconstruction succeeded - report errors */
3015 for (int i = 0; i < rm->rm_nrows; i++) {
3016 raidz_row_t *rr = rm->rm_row[i];
3017
3018 for (int c = 0; c < rr->rr_cols; c++) {
3019 raidz_col_t *rc = &rr->rr_col[c];
3020 if (rc->rc_need_orig_restore) {
3021 /*
3022 * Note: if this is a parity column,
3023 * we don't really know if it's wrong.
3024 * We need to let
3025 * vdev_raidz_io_done_verified() check
3026 * it, and if we set rc_error, it will
3027 * think that it is a "known" error
3028 * that doesn't need to be checked
3029 * or corrected.
3030 */
3031 if (rc->rc_error == 0 &&
3032 c >= rr->rr_firstdatacol) {
3033 vdev_raidz_checksum_error(zio,
3034 rc, rc->rc_orig_data);
3035 rc->rc_error =
3036 SET_ERROR(ECKSUM);
3037 }
3038 rc->rc_need_orig_restore = B_FALSE;
3039 }
3040 }
3041
3042 vdev_raidz_io_done_verified(zio, rr);
3043 }
3044
3045 zio_checksum_verified(zio);
3046
3047 if (dbgmsg) {
3048 zfs_dbgmsg("reconstruction successful "
3049 "(checksum verified)");
3050 }
3051 return (0);
3052 }
3053
3054 /* Reconstruction failed - restore original data */
3055 raidz_restore_orig_data(rm);
3056 if (dbgmsg) {
3057 zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum "
3058 "failed", zio);
3059 }
3060 return (ECKSUM);
3061 }
3062
3063 /*
3064 * Iterate over all combinations of N bad vdevs and attempt a reconstruction.
3065 * Note that the algorithm below is non-optimal because it doesn't take into
3066 * account how reconstruction is actually performed. For example, with
3067 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
3068 * is targeted as invalid as if columns 1 and 4 are targeted since in both
3069 * cases we'd only use parity information in column 0.
3070 *
3071 * The order that we find the various possible combinations of failed
3072 * disks is dictated by these rules:
3073 * - Examine each "slot" (the "i" in tgts[i])
3074 * - Try to increment this slot (tgts[i] += 1)
3075 * - if we can't increment because it runs into the next slot,
3076 * reset our slot to the minimum, and examine the next slot
3077 *
3078 * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
3079 * 3 columns to reconstruct), we will generate the following sequence:
3080 *
3081 * STATE ACTION
3082 * 0 1 2 special case: skip since these are all parity
3083 * 0 1 3 first slot: reset to 0; middle slot: increment to 2
3084 * 0 2 3 first slot: increment to 1
3085 * 1 2 3 first: reset to 0; middle: reset to 1; last: increment to 4
3086 * 0 1 4 first: reset to 0; middle: increment to 2
3087 * 0 2 4 first: increment to 1
3088 * 1 2 4 first: reset to 0; middle: increment to 3
3089 * 0 3 4 first: increment to 1
3090 * 1 3 4 first: increment to 2
3091 * 2 3 4 first: reset to 0; middle: reset to 1; last: increment to 5
3092 * 0 1 5 first: reset to 0; middle: increment to 2
3093 * 0 2 5 first: increment to 1
3094 * 1 2 5 first: reset to 0; middle: increment to 3
3095 * 0 3 5 first: increment to 1
3096 * 1 3 5 first: increment to 2
3097 * 2 3 5 first: reset to 0; middle: increment to 4
3098 * 0 4 5 first: increment to 1
3099 * 1 4 5 first: increment to 2
3100 * 2 4 5 first: increment to 3
3101 * 3 4 5 done
3102 *
3103 * This strategy works for dRAID but is less efficient when there are a large
3104 * number of child vdevs and therefore permutations to check. Furthermore,
3105 * since the raidz_map_t rows likely do not overlap, reconstruction would be
3106 * possible as long as there are no more than nparity data errors per row.
3107 * These additional permutations are not currently checked but could be as
3108 * a future improvement.
3109 *
3110 * Returns 0 on success, ECKSUM on failure.
3111 */
3112 static int
3113 vdev_raidz_combrec(zio_t *zio)
3114 {
3115 int nparity = vdev_get_nparity(zio->io_vd);
3116 raidz_map_t *rm = zio->io_vsd;
3117 int physical_width = zio->io_vd->vdev_children;
3118 int original_width = (rm->rm_original_width != 0) ?
3119 rm->rm_original_width : physical_width;
3120
3121 for (int i = 0; i < rm->rm_nrows; i++) {
3122 raidz_row_t *rr = rm->rm_row[i];
3123 int total_errors = 0;
3124
3125 for (int c = 0; c < rr->rr_cols; c++) {
3126 if (rr->rr_col[c].rc_error)
3127 total_errors++;
3128 }
3129
3130 if (total_errors > nparity)
3131 return (vdev_raidz_worst_error(rr));
3132 }
3133
3134 for (int num_failures = 1; num_failures <= nparity; num_failures++) {
3135 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
3136 int *ltgts = &tstore[1]; /* value is logical child ID */
3137
3138
3139 /*
3140 * Determine number of logical children, n. See comment
3141 * above raidz_simulate_failure().
3142 */
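		/*
		 * E.g., physical_width = 5 and original_width = 4 gives
		 * n = 5 + 4 = 9 logical children.
		 */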
3143 int n = 0;
3144 for (int w = physical_width;
3145 w >= original_width; w--) {
3146 n += w;
3147 }
3148
3149 ASSERT3U(num_failures, <=, nparity);
3150 ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);
3151
3152 /* Handle corner cases in combrec logic */
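		/*
		 * ltgts[-1] = -1 lets the reset step below
		 * (ltgts[t] = ltgts[t-1] + 1) work when t == 0, and
		 * ltgts[num_failures] = n serves as a sentinel "next slot"
		 * so the last real slot can keep incrementing until it
		 * reaches n, at which point all combinations have been tried.
		 */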
3153 ltgts[-1] = -1;
3154 for (int i = 0; i < num_failures; i++) {
3155 ltgts[i] = i;
3156 }
3157 ltgts[num_failures] = n;
3158
3159 for (;;) {
3160 int err = raidz_reconstruct(zio, ltgts, num_failures,
3161 nparity);
3162 if (err == EINVAL) {
3163 /*
3164 * Reconstruction not possible with this #
3165 * failures; try more failures.
3166 */
3167 break;
3168 } else if (err == 0)
3169 return (0);
3170
3171 /* Compute next targets to try */
3172 for (int t = 0; ; t++) {
3173 ASSERT3U(t, <, num_failures);
3174 ltgts[t]++;
3175 if (ltgts[t] == n) {
3176 /* try more failures */
3177 ASSERT3U(t, ==, num_failures - 1);
3178 if (zfs_flags &
3179 ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
3180 zfs_dbgmsg("reconstruction "
3181 "failed for num_failures="
3182 "%u; tried all "
3183 "combinations",
3184 num_failures);
3185 }
3186 break;
3187 }
3188
3189 ASSERT3U(ltgts[t], <, n);
3190 ASSERT3U(ltgts[t], <=, ltgts[t + 1]);
3191
3192 /*
3193 * If that spot is available, we're done here.
3194 * Try the next combination.
3195 */
3196 if (ltgts[t] != ltgts[t + 1])
3197 break; // found next combination
3198
3199 /*
3200 * Otherwise, reset this tgt to the minimum,
3201 * and move on to the next tgt.
3202 */
3203 ltgts[t] = ltgts[t - 1] + 1;
3204 ASSERT3U(ltgts[t], ==, t);
3205 }
3206
3207 /* Increase the number of failures and keep trying. */
3208 if (ltgts[num_failures - 1] == n)
3209 break;
3210 }
3211 }
3212 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
3213 zfs_dbgmsg("reconstruction failed for all num_failures");
3214 return (ECKSUM);
3215 }
3216
3217 void
3218 vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
3219 {
3220 for (uint64_t row = 0; row < rm->rm_nrows; row++) {
3221 raidz_row_t *rr = rm->rm_row[row];
3222 vdev_raidz_reconstruct_row(rm, rr, t, nt);
3223 }
3224 }
3225
3226 /*
3227 * Complete a write IO operation on a RAIDZ VDev
3228 *
3229 * Outline:
3230 * 1. Check for errors on the child IOs.
3231 * 2. Return, setting an error code if too few child VDevs were written
3232 * to reconstruct the data later. Note that partial writes are
3233 * considered successful if they can be reconstructed at all.
3234 */
3235 static void
3236 vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
3237 {
3238 int normal_errors = 0;
3239 int shadow_errors = 0;
3240
3241 ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
3242 ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
3243 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
3244
3245 for (int c = 0; c < rr->rr_cols; c++) {
3246 raidz_col_t *rc = &rr->rr_col[c];
3247
3248 if (rc->rc_error != 0) {
3249 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
3250 normal_errors++;
3251 }
3252 if (rc->rc_shadow_error != 0) {
3253 ASSERT(rc->rc_shadow_error != ECKSUM);
3254 shadow_errors++;
3255 }
3256 }
3257
3258 /*
3259 * Treat partial writes as a success. If we couldn't write enough
3260 * columns to reconstruct the data, the I/O failed. Otherwise, good
3261 * enough. Note that in the case of a shadow write (during raidz
3262 	 * expansion), depending on whether and when we crash, either the
3263 	 * normal (old) or shadow (new) location may become the "real" version of the block,
3264 * so both locations must have sufficient redundancy.
3265 *
3266 * Now that we support write reallocation, it would be better
3267 * to treat partial failure as real failure unless there are
3268 * no non-degraded top-level vdevs left, and not update DTLs
3269 * if we intend to reallocate.
3270 */
3271 if (normal_errors > rr->rr_firstdatacol ||
3272 shadow_errors > rr->rr_firstdatacol) {
3273 zio->io_error = zio_worst_error(zio->io_error,
3274 vdev_raidz_worst_error(rr));
3275 }
3276 }
3277
3278 static void
3279 vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
3280 raidz_row_t *rr)
3281 {
3282 int parity_errors = 0;
3283 int parity_untried = 0;
3284 int data_errors = 0;
3285 int total_errors = 0;
3286
3287 ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
3288 ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
3289
3290 for (int c = 0; c < rr->rr_cols; c++) {
3291 raidz_col_t *rc = &rr->rr_col[c];
3292
3293 /*
3294 * If scrubbing and a replacing/sparing child vdev determined
3295 * that not all of its children have an identical copy of the
3296 * data, then clear the error so the column is treated like
3297 * any other read and force a repair to correct the damage.
3298 */
3299 if (rc->rc_error == ECKSUM) {
3300 ASSERT(zio->io_flags & ZIO_FLAG_SCRUB);
3301 vdev_raidz_checksum_error(zio, rc, rc->rc_abd);
3302 rc->rc_force_repair = 1;
3303 rc->rc_error = 0;
3304 }
3305
3306 if (rc->rc_error) {
3307 if (c < rr->rr_firstdatacol)
3308 parity_errors++;
3309 else
3310 data_errors++;
3311
3312 total_errors++;
3313 } else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
3314 parity_untried++;
3315 }
3316 }
3317
3318 /*
3319 * If there were data errors and the number of errors we saw was
3320 * correctable -- less than or equal to the number of parity disks read
3321 * -- reconstruct based on the missing data.
3322 */
3323 if (data_errors != 0 &&
3324 total_errors <= rr->rr_firstdatacol - parity_untried) {
3325 /*
3326 * We either attempt to read all the parity columns or
3327 * none of them. If we didn't try to read parity, we
3328 * wouldn't be here in the correctable case. There must
3329 * also have been fewer parity errors than parity
3330 * columns or, again, we wouldn't be in this code path.
3331 */
3332 ASSERT(parity_untried == 0);
3333 ASSERT(parity_errors < rr->rr_firstdatacol);
3334
3335 /*
3336 * Identify the data columns that reported an error.
3337 */
3338 int n = 0;
3339 int tgts[VDEV_RAIDZ_MAXPARITY];
3340 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
3341 raidz_col_t *rc = &rr->rr_col[c];
3342 if (rc->rc_error != 0) {
3343 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
3344 tgts[n++] = c;
3345 }
3346 }
3347
3348 ASSERT(rr->rr_firstdatacol >= n);
3349
3350 vdev_raidz_reconstruct_row(rm, rr, tgts, n);
3351 }
3352 }
3353
3354 /*
3355 * Return the number of reads issued.
3356 */
3357 static int
3358 vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
3359 {
3360 vdev_t *vd = zio->io_vd;
3361 int nread = 0;
3362
3363 rr->rr_missingdata = 0;
3364 rr->rr_missingparity = 0;
3365
3366 /*
3367 	 * If this row contains empty sectors which are not required
3368 * for a normal read then allocate an ABD for them now so they
3369 * may be read, verified, and any needed repairs performed.
3370 */
3371 if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL)
3372 vdev_draid_map_alloc_empty(zio, rr);
3373
3374 for (int c = 0; c < rr->rr_cols; c++) {
3375 raidz_col_t *rc = &rr->rr_col[c];
3376 if (rc->rc_tried || rc->rc_size == 0)
3377 continue;
3378
3379 zio_nowait(zio_vdev_child_io(zio, NULL,
3380 vd->vdev_child[rc->rc_devidx],
3381 rc->rc_offset, rc->rc_abd, rc->rc_size,
3382 zio->io_type, zio->io_priority, 0,
3383 vdev_raidz_child_done, rc));
3384 nread++;
3385 }
3386 return (nread);
3387 }
3388
3389 /*
3390 * We're here because either there were too many errors to even attempt
3391 * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec()
3392 * failed. In either case, there is enough bad data to prevent reconstruction.
3393 * Start checksum ereports for all children which haven't failed.
3394 */
3395 static void
3396 vdev_raidz_io_done_unrecoverable(zio_t *zio)
3397 {
3398 raidz_map_t *rm = zio->io_vsd;
3399
3400 for (int i = 0; i < rm->rm_nrows; i++) {
3401 raidz_row_t *rr = rm->rm_row[i];
3402
3403 for (int c = 0; c < rr->rr_cols; c++) {
3404 raidz_col_t *rc = &rr->rr_col[c];
3405 vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
3406
3407 if (rc->rc_error != 0)
3408 continue;
3409
3410 zio_bad_cksum_t zbc;
3411 zbc.zbc_has_cksum = 0;
3412 zbc.zbc_injected = rm->rm_ecksuminjected;
3413 mutex_enter(&cvd->vdev_stat_lock);
3414 cvd->vdev_stat.vs_checksum_errors++;
3415 mutex_exit(&cvd->vdev_stat_lock);
3416 (void) zfs_ereport_start_checksum(zio->io_spa,
3417 cvd, &zio->io_bookmark, zio, rc->rc_offset,
3418 rc->rc_size, &zbc);
3419 }
3420 }
3421 }
3422
3423 void
3424 vdev_raidz_io_done(zio_t *zio)
3425 {
3426 raidz_map_t *rm = zio->io_vsd;
3427
3428 ASSERT(zio->io_bp != NULL);
3429 if (zio->io_type == ZIO_TYPE_WRITE) {
3430 for (int i = 0; i < rm->rm_nrows; i++) {
3431 vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
3432 }
3433 } else {
3434 if (rm->rm_phys_col) {
3435 /*
3436 * This is an aggregated read. Copy the data and status
3437 * from the aggregate abd's to the individual rows.
3438 */
3439 for (int i = 0; i < rm->rm_nrows; i++) {
3440 raidz_row_t *rr = rm->rm_row[i];
3441
3442 for (int c = 0; c < rr->rr_cols; c++) {
3443 raidz_col_t *rc = &rr->rr_col[c];
3444 if (rc->rc_tried || rc->rc_size == 0)
3445 continue;
3446
3447 raidz_col_t *prc =
3448 &rm->rm_phys_col[rc->rc_devidx];
3449 rc->rc_error = prc->rc_error;
3450 rc->rc_tried = prc->rc_tried;
3451 rc->rc_skipped = prc->rc_skipped;
3452 if (c >= rr->rr_firstdatacol) {
3453 /*
3454 * Note: this is slightly faster
3455 * than using abd_copy_off().
3456 */
3457 char *physbuf = abd_to_buf(
3458 prc->rc_abd);
3459 void *physloc = physbuf +
3460 rc->rc_offset -
3461 prc->rc_offset;
3462
3463 abd_copy_from_buf(rc->rc_abd,
3464 physloc, rc->rc_size);
3465 }
3466 }
3467 }
3468 }
3469
3470 for (int i = 0; i < rm->rm_nrows; i++) {
3471 raidz_row_t *rr = rm->rm_row[i];
3472 vdev_raidz_io_done_reconstruct_known_missing(zio,
3473 rm, rr);
3474 }
3475
3476 if (raidz_checksum_verify(zio) == 0) {
3477 if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
3478 goto done;
3479
3480 for (int i = 0; i < rm->rm_nrows; i++) {
3481 raidz_row_t *rr = rm->rm_row[i];
3482 vdev_raidz_io_done_verified(zio, rr);
3483 }
3484 zio_checksum_verified(zio);
3485 } else {
3486 /*
3487 * A sequential resilver has no checksum, which makes
3488 * combinatorial reconstruction impossible. This code
3489 * path is unreachable since raidz_checksum_verify()
3490 * has no checksum to verify and must succeed.
3491 */
3492 ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD);
3493
3494 /*
3495 * This isn't a typical situation -- either we got a
3496 * read error or a child silently returned bad data.
3497 * Read every block so we can try again with as much
3498 * data and parity as we can track down. If we've
3499 * already been through once before, all children will
3500 * be marked as tried so we'll proceed to combinatorial
3501 * reconstruction.
3502 */
3503 int nread = 0;
3504 for (int i = 0; i < rm->rm_nrows; i++) {
3505 nread += vdev_raidz_read_all(zio,
3506 rm->rm_row[i]);
3507 }
3508 if (nread != 0) {
3509 /*
3510 * Normally our stage is VDEV_IO_DONE, but if
3511 * we've already called redone(), it will have
3512 * changed to VDEV_IO_START, in which case we
3513 * don't want to call redone() again.
3514 */
3515 if (zio->io_stage != ZIO_STAGE_VDEV_IO_START)
3516 zio_vdev_io_redone(zio);
3517 return;
3518 }
3519 /*
3520 * It would be too expensive to try every possible
3521 * combination of failed sectors in every row, so
3522 * instead we try every combination of failed current or
3523 * past physical disks. This means that if the incorrect
3524 * sectors were all on Nparity disks at any point in the
3525 * past, we will find the correct data. The only known
3526 * case where this is less durable than a non-expanded
3527 * RAIDZ is if we have a silent failure during
3528 * expansion. In that case, one block could be
3529 * partially in the old format and partially in the
3530 * new format, so we'd lose some sectors from the old
3531 * format and some from the new format.
3532 *
3533 * e.g. logical_width=4 physical_width=6
3534 * the 15 (6+5+4) possible failed disks are:
3535 * width=6 child=0
3536 * width=6 child=1
3537 * width=6 child=2
3538 * width=6 child=3
3539 * width=6 child=4
3540 * width=6 child=5
3541 * width=5 child=0
3542 * width=5 child=1
3543 * width=5 child=2
3544 * width=5 child=3
3545 * width=5 child=4
3546 * width=4 child=0
3547 * width=4 child=1
3548 * width=4 child=2
3549 * width=4 child=3
3550 * And we will try every combination of Nparity of these
3551 * failing.
3552 *
3553 * As a first pass, we can generate every combo,
3554 * and try reconstructing, ignoring any known
3555 * failures. If any row has too many known + simulated
3556 * failures, then we bail on reconstructing with this
3557 * number of simulated failures. As an improvement,
3558 * we could detect the number of whole known failures
3559 * (i.e. we have known failures on these disks for
3560 * every row; the disks never succeeded), and
3561 * subtract that from the max # failures to simulate.
3562 * We could go even further like the current
3563 * combrec code, but that doesn't seem like it
3564 * gains us very much. If we simulate a failure
3565 * that is also a known failure, that's fine.
3566 */
3567 zio->io_error = vdev_raidz_combrec(zio);
3568 if (zio->io_error == ECKSUM &&
3569 !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
3570 vdev_raidz_io_done_unrecoverable(zio);
3571 }
3572 }
3573 }
3574 done:
3575 if (rm->rm_lr != NULL) {
3576 zfs_rangelock_exit(rm->rm_lr);
3577 rm->rm_lr = NULL;
3578 }
3579 }
3580
3581 static void
3582 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
3583 {
3584 vdev_raidz_t *vdrz = vd->vdev_tsd;
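/*
 * More faulted children than parity devices means the data cannot be
 * reconstructed, so the vdev cannot open; any other combination of
 * faulted or degraded children leaves it degraded, otherwise healthy.
 */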
3585 if (faulted > vdrz->vd_nparity)
3586 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3587 VDEV_AUX_NO_REPLICAS);
3588 else if (degraded + faulted != 0)
3589 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
3590 else
3591 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
3592 }
3593
3594 /*
3595 * Determine if any portion of the provided block resides on a child vdev
3596 * with a dirty DTL and therefore needs to be resilvered. The function
3597 * assumes that at least one DTL is dirty which implies that full stripe
3598 * width blocks must be resilvered.
3599 */
3600 static boolean_t
3601 vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
3602 uint64_t phys_birth)
3603 {
3604 vdev_raidz_t *vdrz = vd->vdev_tsd;
3605
3606 /*
3607 * If we're in the middle of a RAIDZ expansion, this block may be in
3608 * the old and/or new location. For simplicity, always resilver it.
3609 */
3610 if (vdrz->vn_vre.vre_state == DSS_SCANNING)
3611 return (B_TRUE);
3612
3613 uint64_t dcols = vd->vdev_children;
3614 uint64_t nparity = vdrz->vd_nparity;
3615 uint64_t ashift = vd->vdev_top->vdev_ashift;
3616 /* The starting RAIDZ (parent) vdev sector of the block. */
3617 uint64_t b = DVA_GET_OFFSET(dva) >> ashift;
3618 /* The zio's size in units of the vdev's minimum sector size. */
3619 uint64_t s = ((psize - 1) >> ashift) + 1;
3620 /* The first column for this stripe. */
3621 uint64_t f = b % dcols;
3622
3623 /* Unreachable by sequential resilver. */
3624 ASSERT3U(phys_birth, !=, TXG_UNKNOWN);
3625
3626 if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
3627 return (B_FALSE);
3628
3629 if (s + nparity >= dcols)
3630 return (B_TRUE);
3631
3632 for (uint64_t c = 0; c < s + nparity; c++) {
3633 uint64_t devidx = (f + c) % dcols;
3634 vdev_t *cvd = vd->vdev_child[devidx];
3635
3636 /*
3637 * dsl_scan_need_resilver() already checked vd with
3638 * vdev_dtl_contains(). So here just check cvd with
3639 * vdev_dtl_empty(), cheaper and a good approximation.
3640 */
3641 if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
3642 return (B_TRUE);
3643 }
3644
3645 return (B_FALSE);
3646 }
3647
3648 static void
3649 vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs,
3650 range_seg64_t *physical_rs, range_seg64_t *remain_rs)
3651 {
3652 (void) remain_rs;
3653
3654 vdev_t *raidvd = cvd->vdev_parent;
3655 ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
3656
3657 vdev_raidz_t *vdrz = raidvd->vdev_tsd;
3658
3659 if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
3660 /*
3661 * We're in the middle of expansion, in which case the
3662 * translation is in flux. Any answer we give may be wrong
3663 * by the time we return, so it isn't safe for the caller to
3664 * act on it. Therefore we say that this range isn't present
3665 * on any children. The only consumers of this are "zpool
3666 * initialize" and trimming, both of which are "best effort"
3667 * anyway.
3668 */
3669 physical_rs->rs_start = physical_rs->rs_end = 0;
3670 remain_rs->rs_start = remain_rs->rs_end = 0;
3671 return;
3672 }
3673
3674 uint64_t width = vdrz->vd_physical_width;
3675 uint64_t tgt_col = cvd->vdev_id;
3676 uint64_t ashift = raidvd->vdev_top->vdev_ashift;
3677
3678 /* make sure the offsets are block-aligned */
3679 ASSERT0(logical_rs->rs_start % (1 << ashift));
3680 ASSERT0(logical_rs->rs_end % (1 << ashift));
3681 uint64_t b_start = logical_rs->rs_start >> ashift;
3682 uint64_t b_end = logical_rs->rs_end >> ashift;
3683
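/*
 * Each row of 'width' logical sectors places exactly one sector on
 * tgt_col, at logical sector (row * width + tgt_col). start_row and
 * end_row below are therefore the first rows whose tgt_col sector
 * falls at or after b_start and b_end respectively.
 */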
3684 uint64_t start_row = 0;
3685 if (b_start > tgt_col) /* avoid underflow */
3686 start_row = ((b_start - tgt_col - 1) / width) + 1;
3687
3688 uint64_t end_row = 0;
3689 if (b_end > tgt_col)
3690 end_row = ((b_end - tgt_col - 1) / width) + 1;
3691
3692 physical_rs->rs_start = start_row << ashift;
3693 physical_rs->rs_end = end_row << ashift;
3694
3695 ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start);
3696 ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
3697 logical_rs->rs_end - logical_rs->rs_start);
3698 }
3699
3700 static void
3701 raidz_reflow_sync(void *arg, dmu_tx_t *tx)
3702 {
3703 spa_t *spa = arg;
3704 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
3705 vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
3706
3707 /*
3708 * Ensure there are no i/os to the range that is being committed.
3709 */
3710 uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock);
3711 ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset);
3712
3713 mutex_enter(&vre->vre_lock);
3714 uint64_t new_offset =
3715 MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset);
3716 /*
3717 * We should not have committed anything that failed.
3718 */
3719 VERIFY3U(vre->vre_failed_offset, >=, old_offset);
3720 mutex_exit(&vre->vre_lock);
3721
3722 zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
3723 old_offset, new_offset - old_offset,
3724 RL_WRITER);
3725
3726 /*
3727 * Update the uberblock that will be written when this txg completes.
3728 */
3729 RAIDZ_REFLOW_SET(&spa->spa_uberblock,
3730 RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset);
3731 vre->vre_offset_pertxg[txgoff] = 0;
3732 zfs_rangelock_exit(lr);
3733
3734 mutex_enter(&vre->vre_lock);
3735 vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff];
3736 vre->vre_bytes_copied_pertxg[txgoff] = 0;
3737 mutex_exit(&vre->vre_lock);
3738
3739 vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
3740 VERIFY0(zap_update(spa->spa_meta_objset,
3741 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
3742 sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx));
3743 }
3744
3745 static void
3746 raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx)
3747 {
3748 spa_t *spa = arg;
3749 vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
3750 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
3751 vdev_raidz_t *vdrz = raidvd->vdev_tsd;
3752
3753 for (int i = 0; i < TXG_SIZE; i++)
3754 VERIFY0(vre->vre_offset_pertxg[i]);
3755
3756 reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
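/*
 * Record the new logical width as taking effect TXG_CONCURRENT_STATES
 * txgs from now; blocks written in txgs that are still in flight may
 * have been laid out using the old width.
 */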
3757 re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES;
3758 re->re_logical_width = vdrz->vd_physical_width;
3759 mutex_enter(&vdrz->vd_expand_lock);
3760 avl_add(&vdrz->vd_expand_txgs, re);
3761 mutex_exit(&vdrz->vd_expand_lock);
3762
3763 vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
3764
3765 /*
3766 * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS
3767 * will get written (based on vd_expand_txgs).
3768 */
3769 vdev_config_dirty(vd);
3770
3771 /*
3772 * Before we change vre_state, the on-disk state must reflect that we
3773 * have completed all copying, so that vdev_raidz_io_start() can use
3774 * vre_state to determine if the reflow is in progress. See also the
3775 * end of spa_raidz_expand_thread().
3776 */
3777 VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==,
3778 raidvd->vdev_ms_count << raidvd->vdev_ms_shift);
3779
3780 vre->vre_end_time = gethrestime_sec();
3781 vre->vre_state = DSS_FINISHED;
3782
3783 uint64_t state = vre->vre_state;
3784 VERIFY0(zap_update(spa->spa_meta_objset,
3785 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
3786 sizeof (state), 1, &state, tx));
3787
3788 uint64_t end_time = vre->vre_end_time;
3789 VERIFY0(zap_update(spa->spa_meta_objset,
3790 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
3791 sizeof (end_time), 1, &end_time, tx));
3792
3793 spa->spa_uberblock.ub_raidz_reflow_info = 0;
3794
3795 spa_history_log_internal(spa, "raidz vdev expansion completed", tx,
3796 "%s vdev %llu new width %llu", spa_name(spa),
3797 (unsigned long long)vd->vdev_id,
3798 (unsigned long long)vd->vdev_children);
3799
3800 spa->spa_raidz_expand = NULL;
3801 raidvd->vdev_rz_expanding = B_FALSE;
3802
3803 spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
3804 spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
3805 spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
3806
3807 spa_notify_waiters(spa);
3808
3809 /*
3810 * While we're in syncing context, take the opportunity to
3811 * set up a scrub. All the data has been successfully copied
3812 * but we have not validated any checksums.
3813 */
3814 pool_scan_func_t func = POOL_SCAN_SCRUB;
3815 if (zfs_scrub_after_expand && dsl_scan_setup_check(&func, tx) == 0)
3816 dsl_scan_setup_sync(&func, tx);
3817 }
3818
3819 /*
3820 * Struct for one copy zio.
3821 */
3822 typedef struct raidz_reflow_arg {
3823 vdev_raidz_expand_t *rra_vre;
3824 zfs_locked_range_t *rra_lr;
3825 uint64_t rra_txg;
3826 } raidz_reflow_arg_t;
3827
3828 /*
3829 * The write of the new location is done.
3830 */
3831 static void
3832 raidz_reflow_write_done(zio_t *zio)
3833 {
3834 raidz_reflow_arg_t *rra = zio->io_private;
3835 vdev_raidz_expand_t *vre = rra->rra_vre;
3836
3837 abd_free(zio->io_abd);
3838
3839 mutex_enter(&vre->vre_lock);
3840 if (zio->io_error != 0) {
3841 /* Force a reflow pause on errors */
3842 vre->vre_failed_offset =
3843 MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
3844 }
3845 ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size);
3846 vre->vre_outstanding_bytes -= zio->io_size;
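/*
 * Only count this write toward the copied-bytes accounting if its
 * range lies entirely below vre_failed_offset; anything at or past
 * the failure point will be re-copied after the pause.
 */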
3847 if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length <
3848 vre->vre_failed_offset) {
3849 vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] +=
3850 zio->io_size;
3851 }
3852 cv_signal(&vre->vre_cv);
3853 mutex_exit(&vre->vre_lock);
3854
3855 zfs_rangelock_exit(rra->rra_lr);
3856
3857 kmem_free(rra, sizeof (*rra));
3858 spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
3859 }
3860
3861 /*
3862 * The read of the old location is done. The parent zio is the write to
3863 * the new location. Allow it to start.
3864 */
3865 static void
3866 raidz_reflow_read_done(zio_t *zio)
3867 {
3868 raidz_reflow_arg_t *rra = zio->io_private;
3869 vdev_raidz_expand_t *vre = rra->rra_vre;
3870
3871 /*
3872 * If the read failed, or if it was done on a vdev that is not fully
3873 * healthy (e.g. a child that has a resilver in progress), we may not
3874 * have the correct data. Note that it's OK if the write proceeds.
3875 * It may write garbage but the location is otherwise unused and we
3876 * will retry later due to vre_failed_offset.
3877 */
3878 if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) {
3879 zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu "
3880 "err=%u partial_dtl_empty=%u missing_dtl_empty=%u",
3881 (long long)rra->rra_lr->lr_offset,
3882 (long long)rra->rra_lr->lr_length,
3883 (long long)rra->rra_txg,
3884 zio->io_error,
3885 vdev_dtl_empty(zio->io_vd, DTL_PARTIAL),
3886 vdev_dtl_empty(zio->io_vd, DTL_MISSING));
3887 mutex_enter(&vre->vre_lock);
3888 /* Force a reflow pause on errors */
3889 vre->vre_failed_offset =
3890 MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
3891 mutex_exit(&vre->vre_lock);
3892 }
3893
3894 zio_nowait(zio_unique_parent(zio));
3895 }
3896
3897 static void
3898 raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset,
3899 dmu_tx_t *tx)
3900 {
3901 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
3902 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
3903
3904 if (offset == 0)
3905 return;
3906
3907 mutex_enter(&vre->vre_lock);
3908 ASSERT3U(vre->vre_offset, <=, offset);
3909 vre->vre_offset = offset;
3910 mutex_exit(&vre->vre_lock);
3911
3912 if (vre->vre_offset_pertxg[txgoff] == 0) {
3913 dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync,
3914 spa, tx);
3915 }
3916 vre->vre_offset_pertxg[txgoff] = offset;
3917 }
3918
3919 static boolean_t
3920 vdev_raidz_expand_child_replacing(vdev_t *raidz_vd)
3921 {
3922 for (int i = 0; i < raidz_vd->vdev_children; i++) {
3923 /* Quick check if a child is being replaced */
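/* (replacing and spare vdevs are interior, i.e. non-leaf, vdevs) */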
3924 if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf)
3925 return (B_TRUE);
3926 }
3927 return (B_FALSE);
3928 }
3929
3930 static boolean_t
3931 raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt,
3932 dmu_tx_t *tx)
3933 {
3934 spa_t *spa = vd->vdev_spa;
3935 int ashift = vd->vdev_top->vdev_ashift;
3936 uint64_t offset, size;
3937
3938 if (!range_tree_find_in(rt, 0, vd->vdev_top->vdev_asize,
3939 &offset, &size)) {
3940 return (B_FALSE);
3941 }
3942 ASSERT(IS_P2ALIGNED(offset, 1 << ashift));
3943 ASSERT3U(size, >=, 1 << ashift);
3944 uint64_t length = 1 << ashift;
3945 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
3946
3947 uint64_t blkid = offset >> ashift;
3948
3949 int old_children = vd->vdev_children - 1;
3950
3951 /*
3952 * We can only progress to the point that writes will not overlap
3953 * with blocks whose progress has not yet been recorded on disk.
3954 * Since partially-copied rows are still read from the old location,
3955 * we need to stop one row before the sector-wise overlap, to prevent
3956 * row-wise overlap.
3957 *
3958 * Note that even if we are skipping over a large unallocated region,
3959 * we can't move the on-disk progress to `offset`, because concurrent
3960 * writes/allocations could still use the currently-unallocated
3961 * region.
3962 */
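/*
 * Concretely: the copy destination row for blkid on the widened
 * layout is blkid / (old_children + 1), while the first old-layout
 * row that may still hold uncopied data (per the synced uberblock)
 * is ubsync_blkid / old_children. next_overwrite_blkid is the first
 * blkid whose destination row would reach that region, backed off by
 * one row's worth of sectors as described above.
 */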
3963 uint64_t ubsync_blkid =
3964 RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift;
3965 uint64_t next_overwrite_blkid = ubsync_blkid +
3966 ubsync_blkid / old_children - old_children;
3967 VERIFY3U(next_overwrite_blkid, >, ubsync_blkid);
3968
3969 if (blkid >= next_overwrite_blkid) {
3970 raidz_reflow_record_progress(vre,
3971 next_overwrite_blkid << ashift, tx);
3972 return (B_TRUE);
3973 }
3974
3975 range_tree_remove(rt, offset, length);
3976
3977 raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra), KM_SLEEP);
3978 rra->rra_vre = vre;
3979 rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock,
3980 offset, length, RL_WRITER);
3981 rra->rra_txg = dmu_tx_get_txg(tx);
3982
3983 raidz_reflow_record_progress(vre, offset + length, tx);
3984
3985 mutex_enter(&vre->vre_lock);
3986 vre->vre_outstanding_bytes += length;
3987 mutex_exit(&vre->vre_lock);
3988
3989 /*
3990 * SCL_STATE will be released when the read and write are done,
3991 * by raidz_reflow_write_done().
3992 */
3993 spa_config_enter(spa, SCL_STATE, spa, RW_READER);
3994
3995 /* check if a replacing vdev was added, if so treat it as an error */
3996 if (vdev_raidz_expand_child_replacing(vd)) {
3997 zfs_dbgmsg("replacing vdev encountered, reflow paused at "
3998 "offset=%llu txg=%llu",
3999 (long long)rra->rra_lr->lr_offset,
4000 (long long)rra->rra_txg);
4001
4002 mutex_enter(&vre->vre_lock);
4003 vre->vre_failed_offset =
4004 MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
4005 cv_signal(&vre->vre_cv);
4006 mutex_exit(&vre->vre_lock);
4007
4008 /* drop everything we acquired */
4009 zfs_rangelock_exit(rra->rra_lr);
4010 kmem_free(rra, sizeof (*rra));
4011 spa_config_exit(spa, SCL_STATE, spa);
4012 return (B_TRUE);
4013 }
4014
4015 zio_t *pio = spa->spa_txg_zio[txgoff];
4016 abd_t *abd = abd_alloc_for_io(length, B_FALSE);
4017 zio_t *write_zio = zio_vdev_child_io(pio, NULL,
4018 vd->vdev_child[blkid % vd->vdev_children],
4019 (blkid / vd->vdev_children) << ashift,
4020 abd, length,
4021 ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
4022 ZIO_FLAG_CANFAIL,
4023 raidz_reflow_write_done, rra);
4024
4025 zio_nowait(zio_vdev_child_io(write_zio, NULL,
4026 vd->vdev_child[blkid % old_children],
4027 (blkid / old_children) << ashift,
4028 abd, length,
4029 ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
4030 ZIO_FLAG_CANFAIL,
4031 raidz_reflow_read_done, rra));
4032
4033 return (B_FALSE);
4034 }
4035
4036 /*
4037 * For testing (ztest specific)
4038 */
4039 static void
4040 raidz_expand_pause(uint_t pause_point)
4041 {
4042 while (raidz_expand_pause_point != 0 &&
4043 raidz_expand_pause_point <= pause_point)
4044 delay(hz);
4045 }
4046
4047 static void
4048 raidz_scratch_child_done(zio_t *zio)
4049 {
4050 zio_t *pio = zio->io_private;
4051
4052 mutex_enter(&pio->io_lock);
4053 pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
4054 mutex_exit(&pio->io_lock);
4055 }
4056
4057 /*
4058 * Reflow the beginning portion of the vdev into an intermediate scratch area
4059 * in memory and on disk. This operation must be persisted on disk before we
4060 * proceed to overwrite the beginning portion with the reflowed data.
4061 *
4062 * This multi-step task can fail to complete if disk errors are encountered,
4063 * in which case we may return here after a pause (waiting for the disks to become healthy).
4064 */
4065 static void
4066 raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
4067 {
4068 vdev_raidz_expand_t *vre = arg;
4069 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
4070 zio_t *pio;
4071 int error;
4072
4073 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4074 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4075 int ashift = raidvd->vdev_ashift;
4076 uint64_t write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << ashift,
4077 uint64_t);
4078 uint64_t logical_size = write_size * raidvd->vdev_children;
4079 uint64_t read_size =
4080 P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)),
4081 1 << ashift);
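/*
 * The logical_size bytes being reflowed currently occupy the first
 * (vdev_children - 1) data columns of the old layout, so read
 * logical_size / (vdev_children - 1) bytes from each of those
 * children, rounded up to a whole sector.
 */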
4082
4083 /*
4084 * The scratch space must be large enough to get us to the point
4085 * that one row does not overlap itself when moved. This is checked
4086 * by vdev_raidz_attach_check().
4087 */
4088 VERIFY3U(write_size, >=, raidvd->vdev_children << ashift);
4089 VERIFY3U(write_size, <=, VDEV_BOOT_SIZE);
4090 VERIFY3U(write_size, <=, read_size);
4091
4092 zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
4093 0, logical_size, RL_WRITER);
4094
4095 abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
4096 KM_SLEEP);
4097 for (int i = 0; i < raidvd->vdev_children; i++) {
4098 abds[i] = abd_alloc_linear(read_size, B_FALSE);
4099 }
4100
4101 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1);
4102
4103 /*
4104 * If we have already written the scratch area then we must read from
4105 * there, since new writes were redirected there while we were paused
4106 * or the original location may have been partially overwritten with
4107 * reflowed data.
4108 */
4109 if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) {
4110 VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size);
4111 /*
4112 * Read from scratch space.
4113 */
4114 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4115 for (int i = 0; i < raidvd->vdev_children; i++) {
4116 /*
4117 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE
4118 * to the offset to calculate the physical offset to
4119 * write to. Passing in a negative offset makes us
4120 * access the scratch area.
4121 */
4122 zio_nowait(zio_vdev_child_io(pio, NULL,
4123 raidvd->vdev_child[i],
4124 VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4125 write_size, ZIO_TYPE_READ, ZIO_PRIORITY_ASYNC_READ,
4126 ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
4127 }
4128 error = zio_wait(pio);
4129 if (error != 0) {
4130 zfs_dbgmsg("reflow: error %d reading scratch location",
4131 error);
4132 goto io_error_exit;
4133 }
4134 goto overwrite;
4135 }
4136
4137 /*
4138 * Read from original location.
4139 */
4140 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4141 for (int i = 0; i < raidvd->vdev_children - 1; i++) {
4142 ASSERT0(vdev_is_dead(raidvd->vdev_child[i]));
4143 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4144 0, abds[i], read_size, ZIO_TYPE_READ,
4145 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
4146 raidz_scratch_child_done, pio));
4147 }
4148 error = zio_wait(pio);
4149 if (error != 0) {
4150 zfs_dbgmsg("reflow: error %d reading original location", error);
4151 io_error_exit:
4152 for (int i = 0; i < raidvd->vdev_children; i++)
4153 abd_free(abds[i]);
4154 kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4155 zfs_rangelock_exit(lr);
4156 spa_config_exit(spa, SCL_STATE, FTAG);
4157 return;
4158 }
4159
4160 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2);
4161
4162 /*
4163 * Reflow in memory.
4164 */
4165 uint64_t logical_sectors = logical_size >> ashift;
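/*
 * The first (vdev_children - 1) logical sectors map to the same
 * (child, offset) in both the old and new layouts, so start at the
 * first sector that actually moves.
 */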
4166 for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) {
4167 int oldchild = i % (raidvd->vdev_children - 1);
4168 uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift;
4169
4170 int newchild = i % raidvd->vdev_children;
4171 uint64_t newoff = (i / raidvd->vdev_children) << ashift;
4172
4173 /* a single sector should not be copying over itself */
4174 ASSERT(!(newchild == oldchild && newoff == oldoff));
4175
4176 abd_copy_off(abds[newchild], abds[oldchild],
4177 newoff, oldoff, 1 << ashift);
4178 }
4179
4180 /*
4181 * Verify that we filled in everything we intended to (write_size on
4182 * each child).
4183 */
4184 VERIFY0(logical_sectors % raidvd->vdev_children);
4185 VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==,
4186 write_size);
4187
4188 /*
4189 * Write to scratch location (boot area).
4190 */
4191 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4192 for (int i = 0; i < raidvd->vdev_children; i++) {
4193 /*
4194 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
4195 * the offset to calculate the physical offset to write to.
4196 * Passing in a negative offset lets us access the boot area.
4197 */
4198 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4199 VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4200 write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
4201 ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
4202 }
4203 error = zio_wait(pio);
4204 if (error != 0) {
4205 zfs_dbgmsg("reflow: error %d writing scratch location", error);
4206 goto io_error_exit;
4207 }
4208 pio = zio_root(spa, NULL, NULL, 0);
4209 zio_flush(pio, raidvd);
4210 zio_wait(pio);
4211
4212 zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area",
4213 (long long)logical_size);
4214
4215 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3);
4216
4217 /*
4218 * Update uberblock to indicate that scratch space is valid. This is
4219 * needed because after this point, the real location may be
4220 * overwritten. If we crash, we need to get the data from the
4221 * scratch space, rather than the real location.
4222 *
4223 * Note: ub_timestamp is bumped so that vdev_uberblock_compare()
4224 * will prefer this uberblock.
4225 */
4226 RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size);
4227 spa->spa_ubsync.ub_timestamp++;
4228 ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4229 &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4230 if (spa_multihost(spa))
4231 mmp_update_uberblock(spa, &spa->spa_ubsync);
4232
4233 zfs_dbgmsg("reflow: uberblock updated "
4234 "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)",
4235 (long long)spa->spa_ubsync.ub_txg,
4236 (long long)logical_size,
4237 (long long)spa->spa_ubsync.ub_timestamp);
4238
4239 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID);
4240
4241 /*
4242 * Overwrite with reflow'ed data.
4243 */
4244 overwrite:
4245 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4246 for (int i = 0; i < raidvd->vdev_children; i++) {
4247 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4248 0, abds[i], write_size, ZIO_TYPE_WRITE,
4249 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL,
4250 raidz_scratch_child_done, pio));
4251 }
4252 error = zio_wait(pio);
4253 if (error != 0) {
4254 /*
4255 * When we exit early here and drop the range lock, new
4256 * writes will go into the scratch area so we'll need to
4257 * read from there when we return after pausing.
4258 */
4259 zfs_dbgmsg("reflow: error %d writing real location", error);
4260 /*
4261 * Update the uberblock that is written when this txg completes.
4262 */
4263 RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID,
4264 logical_size);
4265 goto io_error_exit;
4266 }
4267 pio = zio_root(spa, NULL, NULL, 0);
4268 zio_flush(pio, raidvd);
4269 zio_wait(pio);
4270
4271 zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location",
4272 (long long)logical_size);
4273 for (int i = 0; i < raidvd->vdev_children; i++)
4274 abd_free(abds[i]);
4275 kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4276
4277 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED);
4278
4279 /*
4280 * Update uberblock to indicate that the initial part has been
4281 * reflow'ed. This is needed because after this point (when we exit
4282 * the rangelock), we allow regular writes to this region, which will
4283 * be written to the new location only (because reflow_offset_next ==
4284 * reflow_offset_synced). If we crashed and re-copied from the
4285 * scratch space, we would lose the regular writes.
4286 */
4287 RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED,
4288 logical_size);
4289 spa->spa_ubsync.ub_timestamp++;
4290 ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4291 &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4292 if (spa_multihost(spa))
4293 mmp_update_uberblock(spa, &spa->spa_ubsync);
4294
4295 zfs_dbgmsg("reflow: uberblock updated "
4296 "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
4297 (long long)spa->spa_ubsync.ub_txg,
4298 (long long)logical_size,
4299 (long long)spa->spa_ubsync.ub_timestamp);
4300
4301 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1);
4302
4303 /*
4304 * Update progress.
4305 */
4306 vre->vre_offset = logical_size;
4307 zfs_rangelock_exit(lr);
4308 spa_config_exit(spa, SCL_STATE, FTAG);
4309
4310 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4311 vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
4312 vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
4313 /*
4314 * Note - raidz_reflow_sync() will update the uberblock state to
4315 * RRSS_SCRATCH_INVALID_SYNCED_REFLOW
4316 */
4317 raidz_reflow_sync(spa, tx);
4318
4319 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2);
4320 }
4321
4322 /*
4323 * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work
4324 * here. No other i/o can be in progress, so we don't need the vre_rangelock.
4325 */
4326 void
4327 vdev_raidz_reflow_copy_scratch(spa_t *spa)
4328 {
4329 vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4330 uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock);
4331 ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID);
4332
4333 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4334 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4335 ASSERT0(logical_size % raidvd->vdev_children);
4336 uint64_t write_size = logical_size / raidvd->vdev_children;
4337
4338 zio_t *pio;
4339
4340 /*
4341 * Read from scratch space.
4342 */
4343 abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
4344 KM_SLEEP);
4345 for (int i = 0; i < raidvd->vdev_children; i++) {
4346 abds[i] = abd_alloc_linear(write_size, B_FALSE);
4347 }
4348
4349 pio = zio_root(spa, NULL, NULL, 0);
4350 for (int i = 0; i < raidvd->vdev_children; i++) {
4351 /*
4352 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
4353 * the offset to calculate the physical offset to write to.
4354 * Passing in a negative offset lets us access the boot area.
4355 */
4356 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4357 VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4358 write_size, ZIO_TYPE_READ,
4359 ZIO_PRIORITY_ASYNC_READ, 0,
4360 raidz_scratch_child_done, pio));
4361 }
4362 zio_wait(pio);
4363
4364 /*
4365 * Overwrite real location with reflow'ed data.
4366 */
4367 pio = zio_root(spa, NULL, NULL, 0);
4368 for (int i = 0; i < raidvd->vdev_children; i++) {
4369 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4370 0, abds[i], write_size, ZIO_TYPE_WRITE,
4371 ZIO_PRIORITY_ASYNC_WRITE, 0,
4372 raidz_scratch_child_done, pio));
4373 }
4374 zio_wait(pio);
4375 pio = zio_root(spa, NULL, NULL, 0);
4376 zio_flush(pio, raidvd);
4377 zio_wait(pio);
4378
4379 zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) "
4380 "to real location", (long long)logical_size);
4381
4382 for (int i = 0; i < raidvd->vdev_children; i++)
4383 abd_free(abds[i]);
4384 kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4385
4386 /*
4387 * Update uberblock.
4388 */
4389 RAIDZ_REFLOW_SET(&spa->spa_ubsync,
4390 RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size);
4391 spa->spa_ubsync.ub_timestamp++;
4392 VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4393 &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4394 if (spa_multihost(spa))
4395 mmp_update_uberblock(spa, &spa->spa_ubsync);
4396
4397 zfs_dbgmsg("reflow recovery: uberblock updated "
4398 "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
4399 (long long)spa->spa_ubsync.ub_txg,
4400 (long long)logical_size,
4401 (long long)spa->spa_ubsync.ub_timestamp);
4402
4403 dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool,
4404 spa_first_txg(spa));
4405 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4406 vre->vre_offset = logical_size;
4407 vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
4408 vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
4409 /*
4410 * Note that raidz_reflow_sync() will update the uberblock once more
4411 */
4412 raidz_reflow_sync(spa, tx);
4413
4414 dmu_tx_commit(tx);
4415
4416 spa_config_exit(spa, SCL_STATE, FTAG);
4417 }
4418
4419 static boolean_t
4420 spa_raidz_expand_thread_check(void *arg, zthr_t *zthr)
4421 {
4422 (void) zthr;
4423 spa_t *spa = arg;
4424
4425 return (spa->spa_raidz_expand != NULL &&
4426 !spa->spa_raidz_expand->vre_waiting_for_resilver);
4427 }
4428
4429 /*
4430 * RAIDZ expansion background thread
4431 *
4432 * Can be called multiple times if the reflow is paused
4433 */
4434 static void
4435 spa_raidz_expand_thread(void *arg, zthr_t *zthr)
4436 {
4437 spa_t *spa = arg;
4438 vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4439
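/*
 * If the synced uberblock still marks the scratch copy as valid, the
 * initial region has not yet been committed to its real location, so
 * restart from offset 0 (raidz_reflow_scratch_sync() will read it
 * back from the scratch area). Otherwise resume from the reflow
 * offset recorded in the synced uberblock.
 */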
4440 if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID)
4441 vre->vre_offset = 0;
4442 else
4443 vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync);
4444
4445 /* Reflow the beginning portion using the scratch area */
4446 if (vre->vre_offset == 0) {
4447 VERIFY0(dsl_sync_task(spa_name(spa),
4448 NULL, raidz_reflow_scratch_sync,
4449 vre, 0, ZFS_SPACE_CHECK_NONE));
4450
4451 /* if we encountered errors then pause */
4452 if (vre->vre_offset == 0) {
4453 mutex_enter(&vre->vre_lock);
4454 vre->vre_waiting_for_resilver = B_TRUE;
4455 mutex_exit(&vre->vre_lock);
4456 return;
4457 }
4458 }
4459
4460 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4461 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4462
4463 uint64_t guid = raidvd->vdev_guid;
4464
4465 /* Iterate over all the remaining metaslabs */
4466 for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift;
4467 i < raidvd->vdev_ms_count &&
4468 !zthr_iscancelled(zthr) &&
4469 vre->vre_failed_offset == UINT64_MAX; i++) {
4470 metaslab_t *msp = raidvd->vdev_ms[i];
4471
4472 metaslab_disable(msp);
4473 mutex_enter(&msp->ms_lock);
4474
4475 /*
4476 * The metaslab may be newly created (for the expanded
4477 * space), in which case its trees won't exist yet,
4478 * so we need to bail out early.
4479 */
4480 if (msp->ms_new) {
4481 mutex_exit(&msp->ms_lock);
4482 metaslab_enable(msp, B_FALSE, B_FALSE);
4483 continue;
4484 }
4485
4486 VERIFY0(metaslab_load(msp));
4487
4488 /*
4489 * We want to copy everything except the free (allocatable)
4490 * space. Note that there may be a little bit more free
4491 * space (e.g. in ms_defer), and it's fine to copy that too.
4492 */
4493 range_tree_t *rt = range_tree_create(NULL, RANGE_SEG64,
4494 NULL, 0, 0);
4495 range_tree_add(rt, msp->ms_start, msp->ms_size);
4496 range_tree_walk(msp->ms_allocatable, range_tree_remove, rt);
4497 mutex_exit(&msp->ms_lock);
4498
4499 /*
4500 * Force the last sector of each metaslab to be copied. This
4501 * ensures that we advance the on-disk progress to the end of
4502 * this metaslab while the metaslab is disabled. Otherwise, we
4503 * could move past this metaslab without advancing the on-disk
4504 * progress, and then an allocation to this metaslab would not
4505 * be copied.
4506 */
4507 int sectorsz = 1 << raidvd->vdev_ashift;
4508 uint64_t ms_last_offset = msp->ms_start +
4509 msp->ms_size - sectorsz;
4510 if (!range_tree_contains(rt, ms_last_offset, sectorsz)) {
4511 range_tree_add(rt, ms_last_offset, sectorsz);
4512 }
4513
4514 /*
4515 * When we are resuming from a paused expansion (i.e.
4516 * when importing a pool with an expansion in progress),
4517 * discard any state that we have already processed.
4518 */
4519 range_tree_clear(rt, 0, vre->vre_offset);
4520
4521 while (!zthr_iscancelled(zthr) &&
4522 !range_tree_is_empty(rt) &&
4523 vre->vre_failed_offset == UINT64_MAX) {
4524
4525 /*
4526 * We need to periodically drop the config lock so that
4527 * writers can get in. Additionally, we can't wait
4528 * for a txg to sync while holding a config lock
4529 * (since a waiting writer could cause a 3-way deadlock
4530 * with the sync thread, which also gets a config
4531 * lock for reader). So we can't hold the config lock
4532 * while calling dmu_tx_assign().
4533 */
4534 spa_config_exit(spa, SCL_CONFIG, FTAG);
4535
4536 /*
4537 * If requested, pause the reflow when the amount
4538 * specified by raidz_expand_max_reflow_bytes is reached
4539 *
4540 * This pause is only used during testing or debugging.
4541 */
4542 while (raidz_expand_max_reflow_bytes != 0 &&
4543 raidz_expand_max_reflow_bytes <=
4544 vre->vre_bytes_copied && !zthr_iscancelled(zthr)) {
4545 delay(hz);
4546 }
4547
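/*
 * Throttle the copy so that no more than raidz_expand_max_copy_bytes
 * of reflow i/o is outstanding; raidz_reflow_write_done() signals
 * vre_cv as writes complete.
 */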
4548 mutex_enter(&vre->vre_lock);
4549 while (vre->vre_outstanding_bytes >
4550 raidz_expand_max_copy_bytes) {
4551 cv_wait(&vre->vre_cv, &vre->vre_lock);
4552 }
4553 mutex_exit(&vre->vre_lock);
4554
4555 dmu_tx_t *tx =
4556 dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
4557
4558 VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
4559 uint64_t txg = dmu_tx_get_txg(tx);
4560
4561 /*
4562 * Reacquire the vdev_config lock. Theoretically, the
4563 * vdev_t that we're expanding may have changed.
4564 */
4565 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4566 raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4567
4568 boolean_t needsync =
4569 raidz_reflow_impl(raidvd, vre, rt, tx);
4570
4571 dmu_tx_commit(tx);
4572
4573 if (needsync) {
4574 spa_config_exit(spa, SCL_CONFIG, FTAG);
4575 txg_wait_synced(spa->spa_dsl_pool, txg);
4576 spa_config_enter(spa, SCL_CONFIG, FTAG,
4577 RW_READER);
4578 }
4579 }
4580
4581 spa_config_exit(spa, SCL_CONFIG, FTAG);
4582
4583 metaslab_enable(msp, B_FALSE, B_FALSE);
4584 range_tree_vacate(rt, NULL, NULL);
4585 range_tree_destroy(rt);
4586
4587 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4588 raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4589 }
4590
4591 spa_config_exit(spa, SCL_CONFIG, FTAG);
4592
4593 /*
4594 * The txg_wait_synced() here ensures that all reflow zio's have
4595 * completed, and vre_failed_offset has been set if necessary. It
4596 * also ensures that the progress of the last raidz_reflow_sync() is
4597 * written to disk before raidz_reflow_complete_sync() changes the
4598 * in-memory vre_state. vdev_raidz_io_start() uses vre_state to
4599 * determine if a reflow is in progress, in which case we may need to
4600 * write to both old and new locations. Therefore we can only change
4601 * vre_state once this is not necessary, which is once the on-disk
4602 * progress (in spa_ubsync) has been set past any possible writes (to
4603 * the end of the last metaslab).
4604 */
4605 txg_wait_synced(spa->spa_dsl_pool, 0);
4606
4607 if (!zthr_iscancelled(zthr) &&
4608 vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) {
4609 /*
4610 * We are not being canceled or paused, so the reflow must be
4611 * complete. In that case also mark it as completed on disk.
4612 */
4613 ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX);
4614 VERIFY0(dsl_sync_task(spa_name(spa), NULL,
4615 raidz_reflow_complete_sync, spa,
4616 0, ZFS_SPACE_CHECK_NONE));
4617 (void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL);
4618 } else {
4619 /*
4620 * Wait for all copy zio's to complete and for all the
4621 * raidz_reflow_sync() synctasks to be run.
4622 */
4623 spa_history_log_internal(spa, "reflow pause",
4624 NULL, "offset=%llu failed_offset=%lld",
4625 (long long)vre->vre_offset,
4626 (long long)vre->vre_failed_offset);
4627 mutex_enter(&vre->vre_lock);
4628 if (vre->vre_failed_offset != UINT64_MAX) {
4629 /*
4630 * Reset progress so that we will retry everything
4631 * after the point that something failed.
4632 */
4633 vre->vre_offset = vre->vre_failed_offset;
4634 vre->vre_failed_offset = UINT64_MAX;
4635 vre->vre_waiting_for_resilver = B_TRUE;
4636 }
4637 mutex_exit(&vre->vre_lock);
4638 }
4639 }
4640
4641 void
4642 spa_start_raidz_expansion_thread(spa_t *spa)
4643 {
4644 ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL);
4645 spa->spa_raidz_expand_zthr = zthr_create("raidz_expand",
4646 spa_raidz_expand_thread_check, spa_raidz_expand_thread,
4647 spa, defclsyspri);
4648 }
4649
4650 void
4651 raidz_dtl_reassessed(vdev_t *vd)
4652 {
4653 spa_t *spa = vd->vdev_spa;
4654 if (spa->spa_raidz_expand != NULL) {
4655 vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4656 /*
4657 * we get called often from vdev_dtl_reassess() so make
4658 * sure it's our vdev and any replacing is complete
4659 */
4660 if (vd->vdev_top->vdev_id == vre->vre_vdev_id &&
4661 !vdev_raidz_expand_child_replacing(vd->vdev_top)) {
4662 mutex_enter(&vre->vre_lock);
4663 if (vre->vre_waiting_for_resilver) {
4664 vdev_dbgmsg(vd, "DTL reassessed, "
4665 "continuing raidz expansion");
4666 vre->vre_waiting_for_resilver = B_FALSE;
4667 zthr_wakeup(spa->spa_raidz_expand_zthr);
4668 }
4669 mutex_exit(&vre->vre_lock);
4670 }
4671 }
4672 }
4673
4674 int
4675 vdev_raidz_attach_check(vdev_t *new_child)
4676 {
4677 vdev_t *raidvd = new_child->vdev_parent;
4678 uint64_t new_children = raidvd->vdev_children;
4679
4680 /*
4681 * We use the "boot" space as scratch space to handle overwriting the
4682 * initial part of the vdev. If it is too small, then this expansion
4683 * is not allowed. This would be very unusual (e.g. ashift > 13 and
4684 * >200 children).
4685 */
4686 if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) {
4687 return (EINVAL);
4688 }
4689 return (0);
4690 }
4691
4692 void
4693 vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx)
4694 {
4695 vdev_t *new_child = arg;
4696 spa_t *spa = new_child->vdev_spa;
4697 vdev_t *raidvd = new_child->vdev_parent;
4698 vdev_raidz_t *vdrz = raidvd->vdev_tsd;
4699 ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops);
4700 ASSERT3P(raidvd->vdev_top, ==, raidvd);
4701 ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width);
4702 ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1);
4703 ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==,
4704 new_child);
4705
4706 spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx);
4707
4708 vdrz->vd_physical_width++;
4709
4710 VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info);
4711 vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id;
4712 vdrz->vn_vre.vre_offset = 0;
4713 vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
4714 spa->spa_raidz_expand = &vdrz->vn_vre;
4715 zthr_wakeup(spa->spa_raidz_expand_zthr);
4716
4717 /*
4718 * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get
4719 * written to the config.
4720 */
4721 vdev_config_dirty(raidvd);
4722
4723 vdrz->vn_vre.vre_start_time = gethrestime_sec();
4724 vdrz->vn_vre.vre_end_time = 0;
4725 vdrz->vn_vre.vre_state = DSS_SCANNING;
4726 vdrz->vn_vre.vre_bytes_copied = 0;
4727
4728 uint64_t state = vdrz->vn_vre.vre_state;
4729 VERIFY0(zap_update(spa->spa_meta_objset,
4730 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
4731 sizeof (state), 1, &state, tx));
4732
4733 uint64_t start_time = vdrz->vn_vre.vre_start_time;
4734 VERIFY0(zap_update(spa->spa_meta_objset,
4735 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
4736 sizeof (start_time), 1, &start_time, tx));
4737
4738 (void) zap_remove(spa->spa_meta_objset,
4739 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx);
4740 (void) zap_remove(spa->spa_meta_objset,
4741 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx);
4742
4743 spa_history_log_internal(spa, "raidz vdev expansion started", tx,
4744 "%s vdev %llu new width %llu", spa_name(spa),
4745 (unsigned long long)raidvd->vdev_id,
4746 (unsigned long long)raidvd->vdev_children);
4747 }
4748
4749 int
4750 vdev_raidz_load(vdev_t *vd)
4751 {
4752 vdev_raidz_t *vdrz = vd->vdev_tsd;
4753 int err;
4754
4755 uint64_t state = DSS_NONE;
4756 uint64_t start_time = 0;
4757 uint64_t end_time = 0;
4758 uint64_t bytes_copied = 0;
4759
4760 if (vd->vdev_top_zap != 0) {
4761 err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4762 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
4763 sizeof (state), 1, &state);
4764 if (err != 0 && err != ENOENT)
4765 return (err);
4766
4767 err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4768 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
4769 sizeof (start_time), 1, &start_time);
4770 if (err != 0 && err != ENOENT)
4771 return (err);
4772
4773 err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4774 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
4775 sizeof (end_time), 1, &end_time);
4776 if (err != 0 && err != ENOENT)
4777 return (err);
4778
4779 err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4780 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
4781 sizeof (bytes_copied), 1, &bytes_copied);
4782 if (err != 0 && err != ENOENT)
4783 return (err);
4784 }
4785
4786 /*
4787 * If we are in the middle of expansion, vre_state should have
4788 * already been set by vdev_raidz_init().
4789 */
4790 EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING);
4791 vdrz->vn_vre.vre_state = (dsl_scan_state_t)state;
4792 vdrz->vn_vre.vre_start_time = start_time;
4793 vdrz->vn_vre.vre_end_time = end_time;
4794 vdrz->vn_vre.vre_bytes_copied = bytes_copied;
4795
4796 return (0);
4797 }
4798
4799 int
4800 spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres)
4801 {
4802 vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4803
4804 if (vre == NULL) {
4805 /* no expansion in progress; find the most recently completed one */
4806 for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
4807 vdev_t *vd = spa->spa_root_vdev->vdev_child[c];
4808 if (vd->vdev_ops == &vdev_raidz_ops) {
4809 vdev_raidz_t *vdrz = vd->vdev_tsd;
4810
4811 if (vdrz->vn_vre.vre_end_time != 0 &&
4812 (vre == NULL ||
4813 vdrz->vn_vre.vre_end_time >
4814 vre->vre_end_time)) {
4815 vre = &vdrz->vn_vre;
4816 }
4817 }
4818 }
4819 }
4820
4821 if (vre == NULL) {
4822 return (SET_ERROR(ENOENT));
4823 }
4824
4825 pres->pres_state = vre->vre_state;
4826 pres->pres_expanding_vdev = vre->vre_vdev_id;
4827
4828 vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
4829 pres->pres_to_reflow = vd->vdev_stat.vs_alloc;
4830
4831 mutex_enter(&vre->vre_lock);
4832 pres->pres_reflowed = vre->vre_bytes_copied;
4833 for (int i = 0; i < TXG_SIZE; i++)
4834 pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i];
4835 mutex_exit(&vre->vre_lock);
4836
4837 pres->pres_start_time = vre->vre_start_time;
4838 pres->pres_end_time = vre->vre_end_time;
4839 pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver;
4840
4841 return (0);
4842 }
4843
4844 /*
4845 * Initialize private RAIDZ specific fields from the nvlist.
4846 */
4847 static int
4848 vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
4849 {
4850 uint_t children;
4851 nvlist_t **child;
4852 int error = nvlist_lookup_nvlist_array(nv,
4853 ZPOOL_CONFIG_CHILDREN, &child, &children);
4854 if (error != 0)
4855 return (SET_ERROR(EINVAL));
4856
4857 uint64_t nparity;
4858 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) {
4859 if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
4860 return (SET_ERROR(EINVAL));
4861
4862 /*
4863 * Previous versions could only support 1 or 2 parity
4864 * devices.
4865 */
4866 if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2)
4867 return (SET_ERROR(EINVAL));
4868 else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3)
4869 return (SET_ERROR(EINVAL));
4870 } else {
4871 /*
4872 * We require the parity to be specified for SPAs that
4873 * support multiple parity levels.
4874 */
4875 if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
4876 return (SET_ERROR(EINVAL));
4877
4878 /*
4879 * Otherwise, we default to 1 parity device for RAID-Z.
4880 */
4881 nparity = 1;
4882 }
4883
4884 vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
4885 vdrz->vn_vre.vre_vdev_id = -1;
4886 vdrz->vn_vre.vre_offset = UINT64_MAX;
4887 vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
4888 mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL);
4889 cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL);
4890 zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL);
4891 mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL);
4892 avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare,
4893 sizeof (reflow_node_t), offsetof(reflow_node_t, re_link));
4894
4895 vdrz->vd_physical_width = children;
4896 vdrz->vd_nparity = nparity;
4897
4898 /* note, the ID does not exist when creating a pool */
4899 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
4900 &vdrz->vn_vre.vre_vdev_id);
4901
4902 boolean_t reflow_in_progress =
4903 nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
4904 if (reflow_in_progress) {
4905 spa->spa_raidz_expand = &vdrz->vn_vre;
4906 vdrz->vn_vre.vre_state = DSS_SCANNING;
4907 }
4908
4909 vdrz->vd_original_width = children;
4910 uint64_t *txgs;
4911 unsigned int txgs_size = 0;
4912 error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
4913 &txgs, &txgs_size);
4914 if (error == 0) {
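/*
 * ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS is stored oldest-first; walk it
 * newest-first so the most recent expansion maps to the current
 * physical width and each earlier entry to one fewer column (one
 * fewer still while a reflow is in progress, since the newly
 * attached child doesn't count toward the logical width yet).
 */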
4915 for (int i = 0; i < txgs_size; i++) {
4916 reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
4917 re->re_txg = txgs[txgs_size - i - 1];
4918 re->re_logical_width = vdrz->vd_physical_width - i;
4919
4920 if (reflow_in_progress)
4921 re->re_logical_width--;
4922
4923 avl_add(&vdrz->vd_expand_txgs, re);
4924 }
4925
4926 vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size;
4927 }
4928 if (reflow_in_progress) {
4929 vdrz->vd_original_width--;
4930 zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions",
4931 children, txgs_size);
4932 }
4933
4934 *tsd = vdrz;
4935
4936 return (0);
4937 }
4938
4939 static void
4940 vdev_raidz_fini(vdev_t *vd)
4941 {
4942 vdev_raidz_t *vdrz = vd->vdev_tsd;
4943 if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre)
4944 vd->vdev_spa->spa_raidz_expand = NULL;
4945 reflow_node_t *re;
4946 void *cookie = NULL;
4947 avl_tree_t *tree = &vdrz->vd_expand_txgs;
4948 while ((re = avl_destroy_nodes(tree, &cookie)) != NULL)
4949 kmem_free(re, sizeof (*re));
4950 avl_destroy(&vdrz->vd_expand_txgs);
4951 mutex_destroy(&vdrz->vd_expand_lock);
4952 mutex_destroy(&vdrz->vn_vre.vre_lock);
4953 cv_destroy(&vdrz->vn_vre.vre_cv);
4954 zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock);
4955 kmem_free(vdrz, sizeof (*vdrz));
4956 }
4957
4958 /*
4959 * Add RAIDZ specific fields to the config nvlist.
4960 */
4961 static void
4962 vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
4963 {
4964 ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
4965 vdev_raidz_t *vdrz = vd->vdev_tsd;
4966
4967 /*
4968 * Make sure someone hasn't managed to sneak a fancy new vdev
4969 * into a crufty old storage pool.
4970 */
4971 ASSERT(vdrz->vd_nparity == 1 ||
4972 (vdrz->vd_nparity <= 2 &&
4973 spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) ||
4974 (vdrz->vd_nparity <= 3 &&
4975 spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3));
4976
4977 /*
4978 * Note that we'll add these even on storage pools where they
4979 * aren't strictly required -- older software will just ignore
4980 * it.
4981 */
4982 fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);
4983
4984 if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
4985 fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
4986 }
4987
4988 mutex_enter(&vdrz->vd_expand_lock);
4989 if (!avl_is_empty(&vdrz->vd_expand_txgs)) {
4990 uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs);
4991 uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count,
4992 KM_SLEEP);
4993 uint64_t i = 0;
4994
4995 for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs);
4996 re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) {
4997 txgs[i++] = re->re_txg;
4998 }
4999
5000 fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
5001 txgs, count);
5002
5003 kmem_free(txgs, sizeof (uint64_t) * count);
5004 }
5005 mutex_exit(&vdrz->vd_expand_lock);
5006 }
5007
5008 static uint64_t
5009 vdev_raidz_nparity(vdev_t *vd)
5010 {
5011 vdev_raidz_t *vdrz = vd->vdev_tsd;
5012 return (vdrz->vd_nparity);
5013 }
5014
5015 static uint64_t
5016 vdev_raidz_ndisks(vdev_t *vd)
5017 {
5018 return (vd->vdev_children);
5019 }
5020
5021 vdev_ops_t vdev_raidz_ops = {
5022 .vdev_op_init = vdev_raidz_init,
5023 .vdev_op_fini = vdev_raidz_fini,
5024 .vdev_op_open = vdev_raidz_open,
5025 .vdev_op_close = vdev_raidz_close,
5026 .vdev_op_asize = vdev_raidz_asize,
5027 .vdev_op_min_asize = vdev_raidz_min_asize,
5028 .vdev_op_min_alloc = NULL,
5029 .vdev_op_io_start = vdev_raidz_io_start,
5030 .vdev_op_io_done = vdev_raidz_io_done,
5031 .vdev_op_state_change = vdev_raidz_state_change,
5032 .vdev_op_need_resilver = vdev_raidz_need_resilver,
5033 .vdev_op_hold = NULL,
5034 .vdev_op_rele = NULL,
5035 .vdev_op_remap = NULL,
5036 .vdev_op_xlate = vdev_raidz_xlate,
5037 .vdev_op_rebuild_asize = NULL,
5038 .vdev_op_metaslab_init = NULL,
5039 .vdev_op_config_generate = vdev_raidz_config_generate,
5040 .vdev_op_nparity = vdev_raidz_nparity,
5041 .vdev_op_ndisks = vdev_raidz_ndisks,
5042 .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */
5043 .vdev_op_leaf = B_FALSE /* not a leaf vdev */
5044 };
5045
5046 /* BEGIN CSTYLED */
5047 ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW,
5048 "For testing, pause RAIDZ expansion after reflowing this many bytes");
5049 ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW,
5050 "Max amount of concurrent i/o for RAIDZ expansion");
5051 ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW,
5052 "For expanded RAIDZ, aggregate reads that have more rows than this");
5053 ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW,
5054 "For expanded RAIDZ, automatically start a pool scrub when expansion "
5055 "completes");
5056 /* END CSTYLED */
5057