1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3 * CDDL HEADER START
4 *
5 * The contents of this file are subject to the terms of the
6 * Common Development and Distribution License (the "License").
7 * You may not use this file except in compliance with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or https://opensource.org/licenses/CDDL-1.0.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22
23 /*
24 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
25 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
26 * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
27 * Copyright (c) 2025, Klara, Inc.
28 */
29
30 #include <sys/zfs_context.h>
31 #include <sys/spa.h>
32 #include <sys/spa_impl.h>
33 #include <sys/zap.h>
34 #include <sys/vdev_impl.h>
35 #include <sys/metaslab_impl.h>
36 #include <sys/zio.h>
37 #include <sys/zio_checksum.h>
38 #include <sys/dmu_tx.h>
39 #include <sys/abd.h>
40 #include <sys/zfs_rlock.h>
41 #include <sys/fs/zfs.h>
42 #include <sys/fm/fs/zfs.h>
43 #include <sys/vdev_raidz.h>
44 #include <sys/vdev_raidz_impl.h>
45 #include <sys/vdev_draid.h>
46 #include <sys/uberblock_impl.h>
47 #include <sys/dsl_scan.h>
48
49 #ifdef ZFS_DEBUG
50 #include <sys/vdev.h> /* For vdev_xlate() in vdev_raidz_io_verify() */
51 #endif
52
53 /*
54 * Virtual device vector for RAID-Z.
55 *
56 * This vdev supports single, double, and triple parity. For single parity,
57 * we use a simple XOR of all the data columns. For double or triple parity,
58 * we use a special case of Reed-Solomon coding. This extends the
59 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
60 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
61 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
62 * former is also based. The latter is designed to provide higher performance
63 * for writes.
64 *
65 * Note that the Plank paper claimed to support arbitrary N+M, but was then
66 * amended six years later identifying a critical flaw that invalidates its
67 * claims. Nevertheless, the technique can be adapted to work for up to
68 * triple parity. For additional parity, the amendment "Note: Correction to
69 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
70 * is viable, but the additional complexity means that write performance will
71 * suffer.
72 *
73 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
75 * can be expressed with a single byte. Briefly, the operations on the
76 * field are defined as follows:
77 *
78 * o addition (+) is represented by a bitwise XOR
79 * o subtraction (-) is therefore identical to addition: A + B = A - B
80 * o multiplication of A by 2 is defined by the following bitwise expression:
81 *
82 * (A * 2)_7 = A_6
83 * (A * 2)_6 = A_5
84 * (A * 2)_5 = A_4
85 * (A * 2)_4 = A_3 + A_7
86 * (A * 2)_3 = A_2 + A_7
87 * (A * 2)_2 = A_1 + A_7
88 * (A * 2)_1 = A_0
89 * (A * 2)_0 = A_7
90 *
91 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
92 * As an aside, this multiplication is derived from the error correcting
93 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
94 *
95 * Observe that any number in the field (except for 0) can be expressed as a
96 * power of 2 -- a generator for the field. We store a table of the powers of
97 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
98 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
99 * than field addition). The inverse of a field element A (A^-1) is therefore
100 * A ^ (255 - 1) = A^254.
101 *
102 * The up-to-three parity columns, P, Q, R over several data columns,
103 * D_0, ... D_n-1, can be expressed by field operations:
104 *
105 * P = D_0 + D_1 + ... + D_n-2 + D_n-1
106 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
107 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
108 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
109 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
110 *
111 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
112 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
113 * independent coefficients. (There are no additional coefficients that have
114 * this property which is why the uncorrected Plank method breaks down.)
115 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
118 */
119
/*
 * Names for the three parity columns described above (P, Q and R).
 * NOTE(review): presumably used as the indices of the parity columns within
 * a row -- confirm against the users of these constants.
 */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Multiply a single field element by 2, and by 4 (expressed as two
 * successive multiplications by 2). The shifted value is not truncated
 * here, so for an input with bit 7 set the result carries a bit above the
 * low byte.
 * NOTE(review): callers presumably narrow the result to a byte -- confirm.
 * The argument is expanded more than once, so it must not have side effects.
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
126
/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 *
 * Step by step: the first assignment isolates bit 7 of every byte. The
 * second turns each isolated 0x80 into 0xff within that same byte
 * ((0x80 << 1) - (0x80 >> 7) == 0x100 - 0x01 == 0xff) and leaves every
 * other byte of the mask zero. Finally each byte of x is shifted left by
 * one (the 0xfe... constant discards the bit shifted in from the
 * neighboring byte) and 0x1d is XORed into exactly those bytes whose top
 * bit was set.
 *
 * Both arguments must be modifiable lvalues: x is updated in place and mask
 * is overwritten as scratch space.
 * NOTE(review): the arithmetic assumes unsigned 64-bit operands -- confirm
 * at the call sites.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

/* Multiply by 4 as two successive multiplications by 2. */
#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
146
147
148 /*
149 * Big Theory Statement for how a RAIDZ VDEV is expanded
150 *
151 * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion
152 * works with all three RAIDZ parity choices, including RAIDZ1, 2, or 3. VDEVs
153 * that have been previously expanded can be expanded again.
154 *
155 * The RAIDZ VDEV must be healthy (must be able to write to all the drives in
156 * the VDEV) when an expansion starts. And the expansion will pause if any
157 * disk in the VDEV fails, and resume once the VDEV is healthy again. All other
 * operations on the pool can continue while an expansion is in progress (e.g.
 * read/write, snapshot, zpool add, etc), except for zpool checkpoint, zpool
 * trim, and zpool initialize, which can't be run during an expansion.
 * Following a reboot or export/import, the expansion resumes where it left
 * off.
162 *
163 * == Reflowing the Data ==
164 *
165 * The expansion involves reflowing (copying) the data from the current set
166 * of disks to spread it across the new set which now has one more disk. This
167 * reflow operation is similar to reflowing text when the column width of a
168 * text editor window is expanded. The text doesn’t change but the location of
169 * the text changes to accommodate the new width. An example reflow result for
170 * a 4-wide RAIDZ1 to a 5-wide is shown below.
171 *
172 * Reflow End State
173 * Each letter indicates a parity group (logical stripe)
174 *
175 * Before expansion After Expansion
176 * D1 D2 D3 D4 D1 D2 D3 D4 D5
177 * +------+------+------+------+ +------+------+------+------+------+
178 * | | | | | | | | | | |
179 * | A | A | A | A | | A | A | A | A | B |
180 * | 1| 2| 3| 4| | 1| 2| 3| 4| 5|
181 * +------+------+------+------+ +------+------+------+------+------+
182 * | | | | | | | | | | |
183 * | B | B | C | C | | B | C | C | C | C |
184 * | 5| 6| 7| 8| | 6| 7| 8| 9| 10|
185 * +------+------+------+------+ +------+------+------+------+------+
186 * | | | | | | | | | | |
187 * | C | C | D | D | | D | D | E | E | E |
188 * | 9| 10| 11| 12| | 11| 12| 13| 14| 15|
189 * +------+------+------+------+ +------+------+------+------+------+
190 * | | | | | | | | | | |
191 * | E | E | E | E | --> | E | F | F | G | G |
192 * | 13| 14| 15| 16| | 16| 17| 18|p 19| 20|
193 * +------+------+------+------+ +------+------+------+------+------+
194 * | | | | | | | | | | |
195 * | F | F | G | G | | G | G | H | H | H |
196 * | 17| 18| 19| 20| | 21| 22| 23| 24| 25|
197 * +------+------+------+------+ +------+------+------+------+------+
198 * | | | | | | | | | | |
199 * | G | G | H | H | | H | I | I | J | J |
200 * | 21| 22| 23| 24| | 26| 27| 28| 29| 30|
201 * +------+------+------+------+ +------+------+------+------+------+
202 * | | | | | | | | | | |
203 * | H | H | I | I | | J | J | | | K |
204 * | 25| 26| 27| 28| | 31| 32| 33| 34| 35|
205 * +------+------+------+------+ +------+------+------+------+------+
206 *
207 * This reflow approach has several advantages. There is no need to read or
208 * modify the block pointers or recompute any block checksums. The reflow
209 * doesn’t need to know where the parity sectors reside. We can read and write
210 * data sequentially and the copy can occur in a background thread in open
211 * context. The design also allows for fast discovery of what data to copy.
212 *
213 * The VDEV metaslabs are processed, one at a time, to copy the block data to
214 * have it flow across all the disks. The metaslab is disabled for allocations
215 * during the copy. As an optimization, we only copy the allocated data which
216 * can be determined by looking at the metaslab range tree. During the copy we
217 * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still
218 * need to be able to survive losing parity count disks). This means we
219 * cannot overwrite data during the reflow that would be needed if a disk is
220 * lost.
221 *
222 * After the reflow completes, all newly-written blocks will have the new
223 * layout, i.e., they will have the parity to data ratio implied by the new
224 * number of disks in the RAIDZ group. Even though the reflow copies all of
225 * the allocated space (data and parity), it is only rearranged, not changed.
226 *
227 * This act of reflowing the data has a few implications about blocks
228 * that were written before the reflow completes:
229 *
230 * - Old blocks will still use the same amount of space (i.e., they will have
231 * the parity to data ratio implied by the old number of disks in the RAIDZ
232 * group).
233 * - Reading old blocks will be slightly slower than before the reflow, for
234 * two reasons. First, we will have to read from all disks in the RAIDZ
235 * VDEV, rather than being able to skip the children that contain only
236 * parity of this block (because the data of a single block is now spread
237 * out across all the disks). Second, in most cases there will be an extra
238 * bcopy, needed to rearrange the data back to its original layout in memory.
239 *
240 * == Scratch Area ==
241 *
242 * As we copy the block data, we can only progress to the point that writes
243 * will not overlap with blocks whose progress has not yet been recorded on
244 * disk. Since partially-copied rows are always read from the old location,
245 * we need to stop one row before the sector-wise overlap, to prevent any
246 * row-wise overlap. For example, in the diagram above, when we reflow sector
 * B6 it will overwrite the original location for B5.
248 *
249 * To get around this, a scratch space is used so that we can start copying
250 * without risking data loss by overlapping the row. As an added benefit, it
251 * improves performance at the beginning of the reflow, but that small perf
252 * boost wouldn't be worth the complexity on its own.
253 *
254 * Ideally we want to copy at least 2 * (new_width)^2 so that we have a
255 * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max
256 * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice
 * the widths will likely be single digits so we can get a substantial chunk
258 * size using only a few MB of scratch per disk.
259 *
260 * The scratch area is persisted to disk which holds a large amount of reflowed
261 * state. We can always read the partially written stripes when a disk fails or
262 * the copy is interrupted (crash) during the initial copying phase and also
263 * get past a small chunk size restriction. At a minimum, the scratch space
264 * must be large enough to get us to the point that one row does not overlap
 * itself when moved (i.e., new_width^2). But going larger is even better. We
266 * use the 3.5 MiB reserved "boot" space that resides after the ZFS disk labels
267 * as our scratch space to handle overwriting the initial part of the VDEV.
268 *
269 * 0 256K 512K 4M
270 * +------+------+-----------------------+-----------------------------
271 * | VDEV | VDEV | Boot Block (3.5M) | Allocatable space ...
272 * | L0 | L1 | Reserved | (Metaslabs)
273 * +------+------+-----------------------+-------------------------------
274 * Scratch Area
275 *
276 * == Reflow Progress Updates ==
277 * After the initial scratch-based reflow, the expansion process works
278 * similarly to device removal. We create a new open context thread which
279 * reflows the data, and periodically kicks off sync tasks to update logical
280 * state. In this case, state is the committed progress (offset of next data
281 * to copy). We need to persist the completed offset on disk, so that if we
282 * crash we know which format each VDEV offset is in.
283 *
284 * == Time Dependent Geometry ==
285 *
286 * In non-expanded RAIDZ, blocks are read from disk in a column by column
287 * fashion. For a multi-row block, the second sector is in the first column
288 * not in the second column. This allows us to issue full reads for each
289 * column directly into the request buffer. The block data is thus laid out
290 * sequentially in a column-by-column fashion.
291 *
292 * For example, in the before expansion diagram above, one logical block might
293 * be sectors G19-H26. The parity is in G19,H23; and the data is in
294 * G20,H24,G21,H25,G22,H26.
295 *
296 * After a block is reflowed, the sectors that were all in the original column
297 * data can now reside in different columns. When reading from an expanded
298 * VDEV, we need to know the logical stripe width for each block so we can
299 * reconstitute the block’s data after the reads are completed. Likewise,
300 * when we perform the combinatorial reconstruction we need to know the
301 * original width so we can retry combinations from the past layouts.
302 *
303 * Time dependent geometry is what we call having blocks with different layouts
304 * (stripe widths) in the same VDEV. This time-dependent geometry uses the
305 * block’s birth time (+ the time expansion ended) to establish the correct
306 * width for a given block. After an expansion completes, we record the time
307 * for blocks written with a particular width (geometry).
308 *
309 * == On Disk Format Changes ==
310 *
311 * New pool feature flag, 'raidz_expansion' whose reference count is the number
312 * of RAIDZ VDEVs that have been expanded.
313 *
314 * The blocks on expanded RAIDZ VDEV can have different logical stripe widths.
315 *
 * Since the uberblock can point to arbitrary blocks, which might be on the
 * expanding RAIDZ, and might or might not have been expanded, we need to know
 * which way a block is laid out before reading it. This info is the next
 * offset that needs to be reflowed and we persist that in the uberblock, in
 * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev label.
 * After the expansion is complete, we then use the raidz_expand_txgs array
 * (see below) to determine how to read a block and the ub_raidz_reflow_info
 * field is no longer required.
324 *
325 * The uberblock's ub_raidz_reflow_info field also holds the scratch space
326 * state (i.e., active or not) which is also required before reading a block
327 * during the initial phase of reflowing the data.
328 *
329 * The top-level RAIDZ VDEV has two new entries in the nvlist:
330 *
331 * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here
332 * and used after the expansion is complete to
333 * determine how to read a raidz block
334 * 'raidz_expanding' boolean: present during reflow and removed after completion
335 * used during a spa import to resume an unfinished
336 * expansion
337 *
 * And finally the VDEV's top zap adds the following informational entries:
339 * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE
340 * VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME
341 * VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME
342 * VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED
343 */
344
/*
 * For testing only: pause the raidz expansion after reflowing this amount.
 * (accessed by ZTS and ztest)
 *
 * The variable is file-local only in kernel builds; in other builds it has
 * external linkage so that it can be reached from outside this file.
 */
#ifdef _KERNEL
static
#endif /* _KERNEL */
unsigned long raidz_expand_max_reflow_bytes = 0;

/*
 * For testing only: pause the raidz expansion at a certain point.
 * The default of 0 presumably means "do not pause" -- confirm against the
 * code that consumes this value.
 */
uint_t raidz_expand_pause_point = 0;

/*
 * This represents the duration for a slow drive read sit out (in seconds,
 * going by the variable name).
 */
static unsigned long vdev_read_sit_out_secs = 600;

/*
 * How often each RAID-Z and dRAID vdev will check for slow disk outliers.
 * Increasing this interval will reduce the sensitivity of detection (since all
 * I/Os since the last check are included in the statistics), but will slow the
 * response to a disk developing a problem.
 *
 * Defaults to once per second; setting extremely small values may cause
 * negative performance effects.
 */
static hrtime_t vdev_raidz_outlier_check_interval_ms = 1000;

/*
 * When performing slow outlier checks for RAID-Z and dRAID vdevs, this value is
 * used to determine how far out an outlier must be before it counts as an event
 * worth considering.
 *
 * Smaller values will result in more aggressive sitting out of disks that may
 * have problems, but may significantly increase the rate of spurious sit-outs.
 */
static uint32_t vdev_raidz_outlier_insensitivity = 50;

/*
 * Maximum amount of copy io's outstanding at once.
 * The default is one SPA_MAXBLOCKSIZE when _ILP32 is defined and ten times
 * that otherwise.
 */
#ifdef _ILP32
static unsigned long raidz_expand_max_copy_bytes = SPA_MAXBLOCKSIZE;
#else
static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE;
#endif

/*
 * Apply raidz map abds aggregation if the number of rows in the map is equal
 * or greater than the value below.
 */
static unsigned long raidz_io_aggregate_rows = 4;

/*
 * Automatically start a pool scrub when a RAIDZ expansion completes in
 * order to verify the checksums of all blocks which have been copied
 * during the expansion. Automatic scrubbing is enabled by default and
 * is strongly recommended.
 */
static int zfs_scrub_after_expand = 1;
407
408 static void
vdev_raidz_row_free(raidz_row_t * rr)409 vdev_raidz_row_free(raidz_row_t *rr)
410 {
411 for (int c = 0; c < rr->rr_cols; c++) {
412 raidz_col_t *rc = &rr->rr_col[c];
413
414 if (rc->rc_size != 0)
415 abd_free(rc->rc_abd);
416 if (rc->rc_orig_data != NULL)
417 abd_free(rc->rc_orig_data);
418 }
419
420 if (rr->rr_abd_empty != NULL)
421 abd_free(rr->rr_abd_empty);
422
423 kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols]));
424 }
425
426 void
vdev_raidz_map_free(raidz_map_t * rm)427 vdev_raidz_map_free(raidz_map_t *rm)
428 {
429 for (int i = 0; i < rm->rm_nrows; i++)
430 vdev_raidz_row_free(rm->rm_row[i]);
431
432 if (rm->rm_nphys_cols) {
433 for (int i = 0; i < rm->rm_nphys_cols; i++) {
434 if (rm->rm_phys_col[i].rc_abd != NULL)
435 abd_free(rm->rm_phys_col[i].rc_abd);
436 }
437
438 kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) *
439 rm->rm_nphys_cols);
440 }
441
442 ASSERT0P(rm->rm_lr);
443 kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
444 }
445
446 static void
vdev_raidz_map_free_vsd(zio_t * zio)447 vdev_raidz_map_free_vsd(zio_t *zio)
448 {
449 raidz_map_t *rm = zio->io_vsd;
450
451 vdev_raidz_map_free(rm);
452 }
453
454 static int
vdev_raidz_reflow_compare(const void * x1,const void * x2)455 vdev_raidz_reflow_compare(const void *x1, const void *x2)
456 {
457 const reflow_node_t *l = x1;
458 const reflow_node_t *r = x2;
459
460 return (TREE_CMP(l->re_txg, r->re_txg));
461 }
462
/*
 * zio vsd operations for raidz: the only hook set here is vsd_free, which
 * releases the raidz map through vdev_raidz_map_free_vsd().
 */
const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	.vsd_free = vdev_raidz_map_free_vsd,
};
466
467 raidz_row_t *
vdev_raidz_row_alloc(int cols,zio_t * zio)468 vdev_raidz_row_alloc(int cols, zio_t *zio)
469 {
470 raidz_row_t *rr =
471 kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP);
472
473 rr->rr_cols = cols;
474 rr->rr_scols = cols;
475
476 for (int c = 0; c < cols; c++) {
477 raidz_col_t *rc = &rr->rr_col[c];
478 rc->rc_shadow_devidx = INT_MAX;
479 rc->rc_shadow_offset = UINT64_MAX;
480 /*
481 * We can not allow self healing to take place for Direct I/O
482 * reads. There is nothing that stops the buffer contents from
483 * being manipulated while the I/O is in flight. It is possible
484 * that the checksum could be verified on the buffer and then
485 * the contents of that buffer are manipulated afterwards. This
486 * could lead to bad data being written out during self
487 * healing.
488 */
489 if (!(zio->io_flags & ZIO_FLAG_DIO_READ))
490 rc->rc_allow_repair = 1;
491 }
492 return (rr);
493 }
494
495 static void
vdev_raidz_map_alloc_write(zio_t * zio,raidz_map_t * rm,uint64_t ashift)496 vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift)
497 {
498 int c;
499 int nwrapped = 0;
500 uint64_t off = 0;
501 raidz_row_t *rr = rm->rm_row[0];
502
503 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
504 ASSERT3U(rm->rm_nrows, ==, 1);
505
506 /*
507 * Pad any parity columns with additional space to account for skip
508 * sectors.
509 */
510 if (rm->rm_skipstart < rr->rr_firstdatacol) {
511 ASSERT0(rm->rm_skipstart);
512 nwrapped = rm->rm_nskip;
513 } else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) {
514 nwrapped =
515 (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols;
516 }
517
518 /*
519 * Optional single skip sectors (rc_size == 0) will be handled in
520 * vdev_raidz_io_start_write().
521 */
522 int skipped = rr->rr_scols - rr->rr_cols;
523
524 /* Allocate buffers for the parity columns */
525 for (c = 0; c < rr->rr_firstdatacol; c++) {
526 raidz_col_t *rc = &rr->rr_col[c];
527
528 /*
529 * Parity columns will pad out a linear ABD to account for
530 * the skip sector. A linear ABD is used here because
531 * parity calculations use the ABD buffer directly to calculate
532 * parity. This avoids doing a memcpy back to the ABD after the
533 * parity has been calculated. By issuing the parity column
534 * with the skip sector we can reduce contention on the child
535 * VDEV queue locks (vq_lock).
536 */
537 if (c < nwrapped) {
538 rc->rc_abd = abd_alloc_linear(
539 rc->rc_size + (1ULL << ashift), B_FALSE);
540 abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift);
541 skipped++;
542 } else {
543 rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
544 }
545 }
546
547 for (off = 0; c < rr->rr_cols; c++) {
548 raidz_col_t *rc = &rr->rr_col[c];
549 abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct,
550 zio->io_abd, off, rc->rc_size);
551
552 /*
553 * Generate I/O for skip sectors to improve aggregation
554 * continuity. We will use gang ABD's to reduce contention
555 * on the child VDEV queue locks (vq_lock) by issuing
556 * a single I/O that contains the data and skip sector.
557 *
558 * It is important to make sure that rc_size is not updated
559 * even though we are adding a skip sector to the ABD. When
560 * calculating the parity in vdev_raidz_generate_parity_row()
561 * the rc_size is used to iterate through the ABD's. We can
562 * not have zero'd out skip sectors used for calculating
563 * parity for raidz, because those same sectors are not used
564 * during reconstruction.
565 */
566 if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) {
567 rc->rc_abd = abd_alloc_gang();
568 abd_gang_add(rc->rc_abd, abd, B_TRUE);
569 abd_gang_add(rc->rc_abd,
570 abd_get_zeros(1ULL << ashift), B_TRUE);
571 skipped++;
572 } else {
573 rc->rc_abd = abd;
574 }
575 off += rc->rc_size;
576 }
577
578 ASSERT3U(off, ==, zio->io_size);
579 ASSERT3S(skipped, ==, rm->rm_nskip);
580 }
581
582 static void
vdev_raidz_map_alloc_read(zio_t * zio,raidz_map_t * rm)583 vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm)
584 {
585 int c;
586 raidz_row_t *rr = rm->rm_row[0];
587
588 ASSERT3U(rm->rm_nrows, ==, 1);
589
590 /* Allocate buffers for the parity columns */
591 for (c = 0; c < rr->rr_firstdatacol; c++)
592 rr->rr_col[c].rc_abd =
593 abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);
594
595 for (uint64_t off = 0; c < rr->rr_cols; c++) {
596 raidz_col_t *rc = &rr->rr_col[c];
597 rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
598 zio->io_abd, off, rc->rc_size);
599 off += rc->rc_size;
600 }
601 }
602
603 /*
604 * Divides the IO evenly across all child vdevs; usually, dcols is
605 * the number of children in the target vdev.
606 *
607 * Avoid inlining the function to keep vdev_raidz_io_start(), which
608 * is this functions only caller, as small as possible on the stack.
609 */
610 noinline raidz_map_t *
vdev_raidz_map_alloc(zio_t * zio,uint64_t ashift,uint64_t dcols,uint64_t nparity)611 vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
612 uint64_t nparity)
613 {
614 raidz_row_t *rr;
615 /* The starting RAIDZ (parent) vdev sector of the block. */
616 uint64_t b = zio->io_offset >> ashift;
617 /* The zio's size in units of the vdev's minimum sector size. */
618 uint64_t s = zio->io_size >> ashift;
619 /* The first column for this stripe. */
620 uint64_t f = b % dcols;
621 /* The starting byte offset on each child vdev. */
622 uint64_t o = (b / dcols) << ashift;
623 uint64_t acols, scols;
624
625 raidz_map_t *rm =
626 kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
627 rm->rm_nrows = 1;
628
629 /*
630 * "Quotient": The number of data sectors for this stripe on all but
631 * the "big column" child vdevs that also contain "remainder" data.
632 */
633 uint64_t q = s / (dcols - nparity);
634
635 /*
636 * "Remainder": The number of partial stripe data sectors in this I/O.
637 * This will add a sector to some, but not all, child vdevs.
638 */
639 uint64_t r = s - q * (dcols - nparity);
640
641 /* The number of "big columns" - those which contain remainder data. */
642 uint64_t bc = (r == 0 ? 0 : r + nparity);
643
644 /*
645 * The total number of data and parity sectors associated with
646 * this I/O.
647 */
648 uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
649
650 /*
651 * acols: The columns that will be accessed.
652 * scols: The columns that will be accessed or skipped.
653 */
654 if (q == 0) {
655 /* Our I/O request doesn't span all child vdevs. */
656 acols = bc;
657 scols = MIN(dcols, roundup(bc, nparity + 1));
658 } else {
659 acols = dcols;
660 scols = dcols;
661 }
662
663 ASSERT3U(acols, <=, scols);
664 rr = vdev_raidz_row_alloc(scols, zio);
665 rm->rm_row[0] = rr;
666 rr->rr_cols = acols;
667 rr->rr_bigcols = bc;
668 rr->rr_firstdatacol = nparity;
669 #ifdef ZFS_DEBUG
670 rr->rr_offset = zio->io_offset;
671 rr->rr_size = zio->io_size;
672 #endif
673
674 uint64_t asize = 0;
675
676 for (uint64_t c = 0; c < scols; c++) {
677 raidz_col_t *rc = &rr->rr_col[c];
678 uint64_t col = f + c;
679 uint64_t coff = o;
680 if (col >= dcols) {
681 col -= dcols;
682 coff += 1ULL << ashift;
683 }
684 rc->rc_devidx = col;
685 rc->rc_offset = coff;
686
687 if (c >= acols)
688 rc->rc_size = 0;
689 else if (c < bc)
690 rc->rc_size = (q + 1) << ashift;
691 else
692 rc->rc_size = q << ashift;
693
694 asize += rc->rc_size;
695 }
696
697 ASSERT3U(asize, ==, tot << ashift);
698 rm->rm_nskip = roundup(tot, nparity + 1) - tot;
699 rm->rm_skipstart = bc;
700
701 /*
702 * If all data stored spans all columns, there's a danger that parity
703 * will always be on the same device and, since parity isn't read
704 * during normal operation, that device's I/O bandwidth won't be
705 * used effectively. We therefore switch the parity every 1MB.
706 *
707 * ... at least that was, ostensibly, the theory. As a practical
708 * matter unless we juggle the parity between all devices evenly, we
709 * won't see any benefit. Further, occasional writes that aren't a
710 * multiple of the LCM of the number of children and the minimum
711 * stripe width are sufficient to avoid pessimal behavior.
712 * Unfortunately, this decision created an implicit on-disk format
713 * requirement that we need to support for all eternity, but only
714 * for single-parity RAID-Z.
715 *
716 * If we intend to skip a sector in the zeroth column for padding
717 * we must make sure to note this swap. We will never intend to
718 * skip the first column since at least one data and one parity
719 * column must appear in each row.
720 */
721 ASSERT(rr->rr_cols >= 2);
722 ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
723
724 if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
725 uint64_t devidx = rr->rr_col[0].rc_devidx;
726 o = rr->rr_col[0].rc_offset;
727 rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
728 rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
729 rr->rr_col[1].rc_devidx = devidx;
730 rr->rr_col[1].rc_offset = o;
731 if (rm->rm_skipstart == 0)
732 rm->rm_skipstart = 1;
733 }
734
735 if (zio->io_type == ZIO_TYPE_WRITE) {
736 vdev_raidz_map_alloc_write(zio, rm, ashift);
737 } else {
738 vdev_raidz_map_alloc_read(zio, rm);
739 }
740 /* init RAIDZ parity ops */
741 rm->rm_ops = vdev_raidz_math_get_ops();
742
743 return (rm);
744 }
745
746 /*
747 * Everything before reflow_offset_synced should have been moved to the new
748 * location (read and write completed). However, this may not yet be reflected
749 * in the on-disk format (e.g. raidz_reflow_sync() has been called but the
750 * uberblock has not yet been written). If reflow is not in progress,
751 * reflow_offset_synced should be UINT64_MAX. For each row, if the row is
752 * entirely before reflow_offset_synced, it will come from the new location.
753 * Otherwise this row will come from the old location. Therefore, rows that
754 * straddle the reflow_offset_synced will come from the old location.
755 *
756 * For writes, reflow_offset_next is the next offset to copy. If a sector has
757 * been copied, but not yet reflected in the on-disk progress
758 * (reflow_offset_synced), it will also be written to the new (already copied)
759 * offset.
760 */
noinline raidz_map_t *
vdev_raidz_map_alloc_expanded(zio_t *zio,
    uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
    uint64_t nparity, uint64_t reflow_offset_synced,
    uint64_t reflow_offset_next, boolean_t use_scratch)
{
	/*
	 * Overview of the steps below:
	 *  1. Derive the block geometry (q, r, bc, tot, rows, cols) from the
	 *     zio size and the logical width.
	 *  2. Allocate one raidz_row_t per row and fill in, for every column,
	 *     the child index/offset, the slice of the zio's abd that holds
	 *     its data, and (when needed) the shadow location.
	 *  3. If the block has at least raidz_io_aggregate_rows rows and the
	 *     sectors landing on each child are contiguous, keep a per-child
	 *     aggregate column array (rm_phys_col).
	 *  4. Allocate the parity abd's, either carved out of the per-child
	 *     aggregate buffers or as standalone buffers.
	 */
	abd_t *abd = zio->io_abd;
	uint64_t offset = zio->io_offset;
	uint64_t size = zio->io_size;

	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> ashift;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 * AKA "full rows"
	 */
	uint64_t q = s / (logical_cols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	uint64_t r = s - q * (logical_cols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	uint64_t bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* How many rows contain data (not skip) */
	uint64_t rows = howmany(tot, logical_cols);
	/* Width of every row in the map; narrower than logical_cols if tot is */
	int cols = MIN(tot, logical_cols);

	/* The map is sized to hold exactly `rows` trailing row pointers. */
	raidz_map_t *rm =
	    kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
	    KM_SLEEP);
	rm->rm_nrows = rows;
	/* Skip sectors round the total up to a multiple of (nparity + 1). */
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	rm->rm_skipstart = bc;
	/* Running sum of column sizes, cross-checked against tot below. */
	uint64_t asize = 0;

	for (uint64_t row = 0; row < rows; row++) {
		boolean_t row_use_scratch = B_FALSE;
		raidz_row_t *rr = vdev_raidz_row_alloc(cols, zio);
		rm->rm_row[row] = rr;

		/* The starting RAIDZ (parent) vdev sector of the row. */
		uint64_t b = (offset >> ashift) + row * logical_cols;

		/*
		 * If we are in the middle of a reflow, and the copying has
		 * not yet completed for any part of this row, then use the
		 * old location of this row. Note that reflow_offset_synced
		 * reflects the i/o that's been completed, because it's
		 * updated by a synctask, after zio_wait(spa_txg_zio[]).
		 * This is sufficient for our check, even if that progress
		 * has not yet been recorded to disk (reflected in
		 * spa_ubsync). Also note that we consider the last row to
		 * be "full width" (`cols`-wide rather than `bc`-wide) for
		 * this calculation. This causes a tiny bit of unnecessary
		 * double-writes but is safe and simpler to calculate.
		 */
		int row_phys_cols = physical_cols;
		if (b + cols > reflow_offset_synced >> ashift)
			row_phys_cols--;
		else if (use_scratch)
			row_use_scratch = B_TRUE;

		/* starting child of this row */
		uint64_t child_id = b % row_phys_cols;
		/* The starting byte offset on each child vdev. */
		uint64_t child_offset = (b / row_phys_cols) << ashift;

		/*
		 * Note, rr_cols is the entire width of the block, even
		 * if this row is shorter. This is needed because parity
		 * generation (for Q and R) needs to know the entire width,
		 * because it treats the short row as though it was
		 * full-width (and the "phantom" sectors were zero-filled).
		 *
		 * Another approach to this would be to set cols shorter
		 * (to just the number of columns that we might do i/o to)
		 * and have another mechanism to tell the parity generation
		 * about the "entire width". Reconstruction (at least
		 * vdev_raidz_reconstruct_general()) would also need to
		 * know about the "entire width".
		 */
		rr->rr_firstdatacol = nparity;
#ifdef ZFS_DEBUG
		/*
		 * note: rr_size is PSIZE, not ASIZE
		 */
		rr->rr_offset = b << ashift;
		rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift;
#endif

		/*
		 * Walk the columns of this row, advancing to the next child
		 * each time and wrapping to child 0 (one sector further down)
		 * after the last child of the chosen layout.
		 */
		for (int c = 0; c < rr->rr_cols; c++, child_id++) {
			if (child_id >= row_phys_cols) {
				child_id -= row_phys_cols;
				child_offset += 1ULL << ashift;
			}
			raidz_col_t *rc = &rr->rr_col[c];
			rc->rc_devidx = child_id;
			rc->rc_offset = child_offset;

			/*
			 * Get this from the scratch space if appropriate.
			 * This only happens if we crashed in the middle of
			 * raidz_reflow_scratch_sync() (while it's running,
			 * the rangelock prevents us from doing concurrent
			 * io), and even then only during zpool import or
			 * when the pool is imported readonly.
			 */
			if (row_use_scratch)
				rc->rc_offset -= VDEV_BOOT_SIZE;

			/* Only meaningful for c >= rr_firstdatacol. */
			uint64_t dc = c - rr->rr_firstdatacol;
			if (c < rr->rr_firstdatacol) {
				rc->rc_size = 1ULL << ashift;

				/*
				 * Parity sectors' rc_abd's are set below
				 * after determining if this is an aggregation.
				 */
			} else if (row == rows - 1 && bc != 0 && c >= bc) {
				/*
				 * Past the end of the block (even including
				 * skip sectors). This sector is part of the
				 * map so that we have full rows for p/q parity
				 * generation.
				 */
				rc->rc_size = 0;
				rc->rc_abd = NULL;
			} else {
				/* "data column" (col excluding parity) */
				uint64_t off;

				/*
				 * `off` is this sector's index within the
				 * zio's abd. Big columns (and every column
				 * when r == 0) hold `rows` sectors each; the
				 * columns after them hold `rows - 1`.
				 */
				if (c < bc || r == 0) {
					off = dc * rows + row;
				} else {
					off = r * rows +
					    (dc - r) * (rows - 1) + row;
				}
				rc->rc_size = 1ULL << ashift;
				rc->rc_abd = abd_get_offset_struct(
				    &rc->rc_abdstruct, abd, off << ashift,
				    rc->rc_size);
			}

			/* Phantom columns get no shadow and add no asize. */
			if (rc->rc_size == 0)
				continue;

			/*
			 * If any part of this row is in both old and new
			 * locations, the primary location is the old
			 * location. If this sector was already copied to the
			 * new location, we need to also write to the new,
			 * "shadow" location.
			 *
			 * Note, `row_phys_cols != physical_cols` indicates
			 * that the primary location is the old location.
			 * `b+c < reflow_offset_next` indicates that the copy
			 * to the new location has been initiated. We know
			 * that the copy has completed because we have the
			 * rangelock, which is held exclusively while the
			 * copy is in progress.
			 */
			if (row_use_scratch ||
			    (row_phys_cols != physical_cols &&
			    b + c < reflow_offset_next >> ashift)) {
				rc->rc_shadow_devidx = (b + c) % physical_cols;
				rc->rc_shadow_offset =
				    ((b + c) / physical_cols) << ashift;
				if (row_use_scratch)
					rc->rc_shadow_offset -= VDEV_BOOT_SIZE;
			}

			asize += rc->rc_size;
		}

		/*
		 * See comment in vdev_raidz_map_alloc()
		 *
		 * For single parity, when bit 20 of the block offset is set,
		 * exchange the device index and offset (primary and shadow)
		 * of columns 0 and 1.
		 */
		if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
		    (offset & (1ULL << 20))) {
			ASSERT(rr->rr_cols >= 2);
			ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);

			int devidx0 = rr->rr_col[0].rc_devidx;
			uint64_t offset0 = rr->rr_col[0].rc_offset;
			int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx;
			uint64_t shadow_offset0 =
			    rr->rr_col[0].rc_shadow_offset;

			rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
			rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
			rr->rr_col[0].rc_shadow_devidx =
			    rr->rr_col[1].rc_shadow_devidx;
			rr->rr_col[0].rc_shadow_offset =
			    rr->rr_col[1].rc_shadow_offset;

			rr->rr_col[1].rc_devidx = devidx0;
			rr->rr_col[1].rc_offset = offset0;
			rr->rr_col[1].rc_shadow_devidx = shadow_devidx0;
			rr->rr_col[1].rc_shadow_offset = shadow_offset0;
		}
	}
	/* Every data and parity sector must have been accounted for. */
	ASSERT3U(asize, ==, tot << ashift);

	/*
	 * Determine if the block is contiguous, in which case we can use
	 * an aggregation.
	 */
	if (rows >= raidz_io_aggregate_rows) {
		rm->rm_nphys_cols = physical_cols;
		rm->rm_phys_col =
		    kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols,
		    KM_SLEEP);

		/*
		 * Determine the aggregate io's offset and size, and check
		 * that the io is contiguous.
		 *
		 * The outer loop also stops once rm_phys_col has been freed,
		 * since the inner `break` only leaves the column loop.
		 */
		for (int i = 0;
		    i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) {
			raidz_row_t *rr = rm->rm_row[i];
			for (int c = 0; c < rr->rr_cols; c++) {
				raidz_col_t *rc = &rr->rr_col[c];
				raidz_col_t *prc =
				    &rm->rm_phys_col[rc->rc_devidx];

				if (rc->rc_size == 0)
					continue;

				if (prc->rc_size == 0) {
					/* First sector seen for this child. */
					ASSERT0(prc->rc_offset);
					prc->rc_offset = rc->rc_offset;
				} else if (prc->rc_offset + prc->rc_size !=
				    rc->rc_offset) {
					/*
					 * This block is not contiguous and
					 * therefore can't be aggregated.
					 * This is expected to be rare, so
					 * the cost of allocating and then
					 * freeing rm_phys_col is not
					 * significant.
					 */
					kmem_free(rm->rm_phys_col,
					    sizeof (raidz_col_t) *
					    rm->rm_nphys_cols);
					rm->rm_phys_col = NULL;
					rm->rm_nphys_cols = 0;
					break;
				}
				prc->rc_size += rc->rc_size;
			}
		}
	}
	if (rm->rm_phys_col != NULL) {
		/*
		 * Allocate aggregate ABD's.
		 */
		for (int i = 0; i < rm->rm_nphys_cols; i++) {
			raidz_col_t *prc = &rm->rm_phys_col[i];

			prc->rc_devidx = i;

			/* No sector of this block lands on child i. */
			if (prc->rc_size == 0)
				continue;

			prc->rc_abd =
			    abd_alloc_linear(rm->rm_phys_col[i].rc_size,
			    B_FALSE);
		}

		/*
		 * Point the parity abd's into the aggregate abd's.
		 */
		for (int i = 0; i < rm->rm_nrows; i++) {
			raidz_row_t *rr = rm->rm_row[i];
			for (int c = 0; c < rr->rr_firstdatacol; c++) {
				raidz_col_t *rc = &rr->rr_col[c];
				raidz_col_t *prc =
				    &rm->rm_phys_col[rc->rc_devidx];
				rc->rc_abd =
				    abd_get_offset_struct(&rc->rc_abdstruct,
				    prc->rc_abd,
				    rc->rc_offset - prc->rc_offset,
				    rc->rc_size);
			}
		}
	} else {
		/*
		 * Allocate new abd's for the parity sectors.
		 */
		for (int i = 0; i < rm->rm_nrows; i++) {
			raidz_row_t *rr = rm->rm_row[i];
			for (int c = 0; c < rr->rr_firstdatacol; c++) {
				raidz_col_t *rc = &rr->rr_col[c];
				rc->rc_abd =
				    abd_alloc_linear(rc->rc_size,
				    B_TRUE);
			}
		}
	}
	/* init RAIDZ parity ops */
	rm->rm_ops = vdev_raidz_math_get_ops();

	return (rm);
}
1077
/*
 * Cursor state passed as the `private` argument to the vdev_raidz_p_func(),
 * vdev_raidz_pq_func() and vdev_raidz_pqr_func() callbacks. Each callback
 * advances the pointers it uses as it consumes data, so consecutive chunks
 * of one column continue where the previous chunk stopped. Pointers for
 * parity levels that are not being generated are NULL.
 */
struct pqr_struct {
	uint64_t *p;	/* next word of the P parity buffer */
	uint64_t *q;	/* next word of the Q parity buffer, or NULL */
	uint64_t *r;	/* next word of the R parity buffer, or NULL */
};
1083
1084 static int
vdev_raidz_p_func(void * buf,size_t size,void * private)1085 vdev_raidz_p_func(void *buf, size_t size, void *private)
1086 {
1087 struct pqr_struct *pqr = private;
1088 const uint64_t *src = buf;
1089 int cnt = size / sizeof (src[0]);
1090
1091 ASSERT(pqr->p && !pqr->q && !pqr->r);
1092
1093 for (int i = 0; i < cnt; i++, src++, pqr->p++)
1094 *pqr->p ^= *src;
1095
1096 return (0);
1097 }
1098
1099 static int
vdev_raidz_pq_func(void * buf,size_t size,void * private)1100 vdev_raidz_pq_func(void *buf, size_t size, void *private)
1101 {
1102 struct pqr_struct *pqr = private;
1103 const uint64_t *src = buf;
1104 uint64_t mask;
1105 int cnt = size / sizeof (src[0]);
1106
1107 ASSERT(pqr->p && pqr->q && !pqr->r);
1108
1109 for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
1110 *pqr->p ^= *src;
1111 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
1112 *pqr->q ^= *src;
1113 }
1114
1115 return (0);
1116 }
1117
1118 static int
vdev_raidz_pqr_func(void * buf,size_t size,void * private)1119 vdev_raidz_pqr_func(void *buf, size_t size, void *private)
1120 {
1121 struct pqr_struct *pqr = private;
1122 const uint64_t *src = buf;
1123 uint64_t mask;
1124 int cnt = size / sizeof (src[0]);
1125
1126 ASSERT(pqr->p && pqr->q && pqr->r);
1127
1128 for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
1129 *pqr->p ^= *src;
1130 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
1131 *pqr->q ^= *src;
1132 VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
1133 *pqr->r ^= *src;
1134 }
1135
1136 return (0);
1137 }
1138
1139 static void
vdev_raidz_generate_parity_p(raidz_row_t * rr)1140 vdev_raidz_generate_parity_p(raidz_row_t *rr)
1141 {
1142 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1143
1144 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1145 abd_t *src = rr->rr_col[c].rc_abd;
1146
1147 if (c == rr->rr_firstdatacol) {
1148 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1149 } else {
1150 struct pqr_struct pqr = { p, NULL, NULL };
1151 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1152 vdev_raidz_p_func, &pqr);
1153 }
1154 }
1155 }
1156
1157 static void
vdev_raidz_generate_parity_pq(raidz_row_t * rr)1158 vdev_raidz_generate_parity_pq(raidz_row_t *rr)
1159 {
1160 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1161 uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1162 uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
1163 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1164 rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1165
1166 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1167 abd_t *src = rr->rr_col[c].rc_abd;
1168
1169 uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
1170
1171 if (c == rr->rr_firstdatacol) {
1172 ASSERT(ccnt == pcnt || ccnt == 0);
1173 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1174 (void) memcpy(q, p, rr->rr_col[c].rc_size);
1175
1176 for (uint64_t i = ccnt; i < pcnt; i++) {
1177 p[i] = 0;
1178 q[i] = 0;
1179 }
1180 } else {
1181 struct pqr_struct pqr = { p, q, NULL };
1182
1183 ASSERT(ccnt <= pcnt);
1184 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1185 vdev_raidz_pq_func, &pqr);
1186
1187 /*
1188 * Treat short columns as though they are full of 0s.
1189 * Note that there's therefore nothing needed for P.
1190 */
1191 uint64_t mask;
1192 for (uint64_t i = ccnt; i < pcnt; i++) {
1193 VDEV_RAIDZ_64MUL_2(q[i], mask);
1194 }
1195 }
1196 }
1197 }
1198
1199 static void
vdev_raidz_generate_parity_pqr(raidz_row_t * rr)1200 vdev_raidz_generate_parity_pqr(raidz_row_t *rr)
1201 {
1202 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1203 uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1204 uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd);
1205 uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
1206 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1207 rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1208 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1209 rr->rr_col[VDEV_RAIDZ_R].rc_size);
1210
1211 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1212 abd_t *src = rr->rr_col[c].rc_abd;
1213
1214 uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
1215
1216 if (c == rr->rr_firstdatacol) {
1217 ASSERT(ccnt == pcnt || ccnt == 0);
1218 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1219 (void) memcpy(q, p, rr->rr_col[c].rc_size);
1220 (void) memcpy(r, p, rr->rr_col[c].rc_size);
1221
1222 for (uint64_t i = ccnt; i < pcnt; i++) {
1223 p[i] = 0;
1224 q[i] = 0;
1225 r[i] = 0;
1226 }
1227 } else {
1228 struct pqr_struct pqr = { p, q, r };
1229
1230 ASSERT(ccnt <= pcnt);
1231 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1232 vdev_raidz_pqr_func, &pqr);
1233
1234 /*
1235 * Treat short columns as though they are full of 0s.
1236 * Note that there's therefore nothing needed for P.
1237 */
1238 uint64_t mask;
1239 for (uint64_t i = ccnt; i < pcnt; i++) {
1240 VDEV_RAIDZ_64MUL_2(q[i], mask);
1241 VDEV_RAIDZ_64MUL_4(r[i], mask);
1242 }
1243 }
1244 }
1245 }
1246
1247 /*
1248 * Generate RAID parity in the first virtual columns according to the number of
1249 * parity columns available.
1250 */
1251 void
vdev_raidz_generate_parity_row(raidz_map_t * rm,raidz_row_t * rr)1252 vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr)
1253 {
1254 if (rr->rr_cols == 0) {
1255 /*
1256 * We are handling this block one row at a time (because
1257 * this block has a different logical vs physical width,
1258 * due to RAIDZ expansion), and this is a pad-only row,
1259 * which has no parity.
1260 */
1261 return;
1262 }
1263
1264 /* Generate using the new math implementation */
1265 if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL)
1266 return;
1267
1268 switch (rr->rr_firstdatacol) {
1269 case 1:
1270 vdev_raidz_generate_parity_p(rr);
1271 break;
1272 case 2:
1273 vdev_raidz_generate_parity_pq(rr);
1274 break;
1275 case 3:
1276 vdev_raidz_generate_parity_pqr(rr);
1277 break;
1278 default:
1279 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
1280 }
1281 }
1282
1283 void
vdev_raidz_generate_parity(raidz_map_t * rm)1284 vdev_raidz_generate_parity(raidz_map_t *rm)
1285 {
1286 for (int i = 0; i < rm->rm_nrows; i++) {
1287 raidz_row_t *rr = rm->rm_row[i];
1288 vdev_raidz_generate_parity_row(rm, rr);
1289 }
1290 }
1291
/*
 * Two-buffer callback used for P reconstruction: XOR each whole 64-bit word
 * of `sbuf` into the corresponding word of `dbuf`. `size` is in bytes;
 * `private` is unused. Always returns 0.
 */
static int
vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
{
	(void) private;
	uint64_t *out = dbuf;
	uint64_t *in = sbuf;
	int cnt = size / sizeof (in[0]);

	for (int left = cnt; left > 0; left--) {
		*out ^= *in;
		out++;
		in++;
	}

	return (0);
}
1306
/*
 * Two-buffer callback used while rebuilding a column from Q: for each whole
 * 64-bit word, multiply the destination word by 2 (VDEV_RAIDZ_64MUL_2) and
 * then XOR in the matching source word. `private` is unused. Always
 * returns 0.
 */
static int
vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
    void *private)
{
	(void) private;
	uint64_t *out = dbuf;
	uint64_t *in = sbuf;
	uint64_t mask;
	int cnt = size / sizeof (out[0]);

	for (int k = 0; k < cnt; k++) {
		VDEV_RAIDZ_64MUL_2(out[k], mask);
		out[k] ^= in[k];
	}

	return (0);
}
1324
/*
 * Single-buffer counterpart of vdev_raidz_reconst_q_pre_func() for the part
 * of the destination that has no matching source data: each whole 64-bit
 * word is only multiplied by 2. `private` is unused. Always returns 0.
 */
static int
vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
{
	(void) private;
	uint64_t *out = buf;
	uint64_t mask;
	int cnt = size / sizeof (out[0]);

	for (int k = 0; k < cnt; k++) {
		/* same operation as vdev_raidz_reconst_q_pre_func() on dst */
		VDEV_RAIDZ_64MUL_2(out[k], mask);
	}

	return (0);
}
1340
/*
 * State passed as the `private` argument to
 * vdev_raidz_reconst_q_post_func().
 */
struct reconst_q_struct {
	uint64_t *q;	/* next Q parity word; advanced by the callback */
	int exp;	/* exponent handed to vdev_raidz_exp2() for each byte */
};
1345
1346 static int
vdev_raidz_reconst_q_post_func(void * buf,size_t size,void * private)1347 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
1348 {
1349 struct reconst_q_struct *rq = private;
1350 uint64_t *dst = buf;
1351 int cnt = size / sizeof (dst[0]);
1352
1353 for (int i = 0; i < cnt; i++, dst++, rq->q++) {
1354 int j;
1355 uint8_t *b;
1356
1357 *dst ^= *rq->q;
1358 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
1359 *b = vdev_raidz_exp2(*b, rq->exp);
1360 }
1361 }
1362
1363 return (0);
1364 }
1365
/*
 * State passed as the `private` argument to vdev_raidz_reconst_pq_func()
 * and vdev_raidz_reconst_pq_tail_func(). All four byte cursors are advanced
 * by the callbacks, one byte per byte reconstructed.
 */
struct reconst_pq_struct {
	uint8_t *p;	/* the row's actual P parity */
	uint8_t *q;	/* the row's actual Q parity */
	uint8_t *pxy;	/* P regenerated with columns x and y sized 0 */
	uint8_t *qxy;	/* Q regenerated with columns x and y sized 0 */
	int aexp;	/* vdev_raidz_exp2() exponent applied to (p ^ pxy) */
	int bexp;	/* vdev_raidz_exp2() exponent applied to (q ^ qxy) */
};
1374
1375 static int
vdev_raidz_reconst_pq_func(void * xbuf,void * ybuf,size_t size,void * private)1376 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
1377 {
1378 struct reconst_pq_struct *rpq = private;
1379 uint8_t *xd = xbuf;
1380 uint8_t *yd = ybuf;
1381
1382 for (int i = 0; i < size;
1383 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
1384 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
1385 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
1386 *yd = *rpq->p ^ *rpq->pxy ^ *xd;
1387 }
1388
1389 return (0);
1390 }
1391
1392 static int
vdev_raidz_reconst_pq_tail_func(void * xbuf,size_t size,void * private)1393 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
1394 {
1395 struct reconst_pq_struct *rpq = private;
1396 uint8_t *xd = xbuf;
1397
1398 for (int i = 0; i < size;
1399 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
1400 /* same operation as vdev_raidz_reconst_pq_func() on xd */
1401 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
1402 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
1403 }
1404
1405 return (0);
1406 }
1407
1408 static void
vdev_raidz_reconstruct_p(raidz_row_t * rr,int * tgts,int ntgts)1409 vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts)
1410 {
1411 int x = tgts[0];
1412 abd_t *dst, *src;
1413
1414 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1415 zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x);
1416
1417 ASSERT3U(ntgts, ==, 1);
1418 ASSERT3U(x, >=, rr->rr_firstdatacol);
1419 ASSERT3U(x, <, rr->rr_cols);
1420
1421 ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size);
1422
1423 src = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
1424 dst = rr->rr_col[x].rc_abd;
1425
1426 abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size);
1427
1428 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1429 uint64_t size = MIN(rr->rr_col[x].rc_size,
1430 rr->rr_col[c].rc_size);
1431
1432 src = rr->rr_col[c].rc_abd;
1433
1434 if (c == x)
1435 continue;
1436
1437 (void) abd_iterate_func2(dst, src, 0, 0, size,
1438 vdev_raidz_reconst_p_func, NULL);
1439 }
1440 }
1441
1442 static void
vdev_raidz_reconstruct_q(raidz_row_t * rr,int * tgts,int ntgts)1443 vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
1444 {
1445 int x = tgts[0];
1446 int c, exp;
1447 abd_t *dst, *src;
1448
1449 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1450 zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x);
1451
1452 ASSERT(ntgts == 1);
1453
1454 ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1455
1456 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1457 uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size,
1458 rr->rr_col[c].rc_size);
1459
1460 src = rr->rr_col[c].rc_abd;
1461 dst = rr->rr_col[x].rc_abd;
1462
1463 if (c == rr->rr_firstdatacol) {
1464 abd_copy(dst, src, size);
1465 if (rr->rr_col[x].rc_size > size) {
1466 abd_zero_off(dst, size,
1467 rr->rr_col[x].rc_size - size);
1468 }
1469 } else {
1470 ASSERT3U(size, <=, rr->rr_col[x].rc_size);
1471 (void) abd_iterate_func2(dst, src, 0, 0, size,
1472 vdev_raidz_reconst_q_pre_func, NULL);
1473 (void) abd_iterate_func(dst,
1474 size, rr->rr_col[x].rc_size - size,
1475 vdev_raidz_reconst_q_pre_tail_func, NULL);
1476 }
1477 }
1478
1479 src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
1480 dst = rr->rr_col[x].rc_abd;
1481 exp = 255 - (rr->rr_cols - 1 - x);
1482
1483 struct reconst_q_struct rq = { abd_to_buf(src), exp };
1484 (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size,
1485 vdev_raidz_reconst_q_post_func, &rq);
1486 }
1487
/*
 * Rebuild two data columns, x = tgts[0] and y = tgts[1] (x < y), from the
 * row's P and Q parity.
 *
 * The row's P and Q abd's are temporarily replaced with scratch buffers,
 * parity is regenerated into them with columns x and y sized to zero, and
 * the difference between the real and regenerated parity is then used to
 * solve for the two missing columns. The original parity abd's and the
 * column sizes are restored before returning.
 */
static void
vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
	abd_t *pdata, *qdata;
	uint64_t xsize, ysize;
	int x = tgts[0];
	int y = tgts[1];
	abd_t *xd, *yd;

	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
		zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y);

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rr->rr_firstdatacol);
	ASSERT(y < rr->rr_cols);

	ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
	qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
	xsize = rr->rr_col[x].rc_size;
	ysize = rr->rr_col[y].rc_size;

	rr->rr_col[VDEV_RAIDZ_P].rc_abd =
	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
	rr->rr_col[VDEV_RAIDZ_Q].rc_abd =
	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
	rr->rr_col[x].rc_size = 0;
	rr->rr_col[y].rc_size = 0;

	/* Writes Pxy/Qxy into the scratch buffers installed above. */
	vdev_raidz_generate_parity_pq(rr);

	rr->rr_col[x].rc_size = xsize;
	rr->rr_col[y].rc_size = ysize;

	p = abd_to_buf(pdata);
	q = abd_to_buf(qdata);
	pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
	qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
	xd = rr->rr_col[x].rc_abd;
	yd = rr->rr_col[y].rc_abd;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^-(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 *
	 * Negative powers of 2 are looked up below as
	 * vdev_raidz_pow2[255 - k], and an inverse is taken as
	 * 255 - vdev_raidz_log2[v].
	 */

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	/* aexp and bexp are the logs of A and B from the comment above. */
	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	ASSERT3U(xsize, >=, ysize);
	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };

	/*
	 * The first ysize bytes yield both D_x and D_y; the remaining
	 * xsize - ysize bytes of column x are solved on their own. The
	 * cursors in rpq carry over from the first call to the second.
	 */
	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
	    vdev_raidz_reconst_pq_func, &rpq);
	(void) abd_iterate_func(xd, ysize, xsize - ysize,
	    vdev_raidz_reconst_pq_tail_func, &rpq);

	/* Release the scratch Pxy/Qxy buffers. */
	abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
	abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);

	/*
	 * Restore the saved parity data.
	 */
	rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata;
	rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;
}
1578
1579 /*
1580 * In the general case of reconstruction, we must solve the system of linear
1581 * equations defined by the coefficients used to generate parity as well as
1582 * the contents of the data and parity disks. This can be expressed with
1583 * vectors for the original data (D) and the actual data (d) and parity (p)
1584 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
1585 *
1586 * __ __ __ __
1587 * | | __ __ | p_0 |
1588 * | V | | D_0 | | p_m-1 |
1589 * | | x | : | = | d_0 |
1590 * | I | | D_n-1 | | : |
1591 * | | ~~ ~~ | d_n-1 |
1592 * ~~ ~~ ~~ ~~
1593 *
1594 * I is simply a square identity matrix of size n, and V is a vandermonde
1595 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity, speedy
 * computation, and linear separability.
1598 *
1599 * __ __ __ __
1600 * | 1 .. 1 1 1 | | p_0 |
1601 * | 2^n-1 .. 4 2 1 | __ __ | : |
1602 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
1603 * | 1 .. 0 0 0 | | D_1 | | d_0 |
1604 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
1605 * | : : : : | | : | | d_2 |
1606 * | 0 .. 1 0 0 | | D_n-1 | | : |
1607 * | 0 .. 0 1 0 | ~~ ~~ | : |
1608 * | 0 .. 0 0 1 | | d_n-1 |
1609 * ~~ ~~ ~~ ~~
1610 *
1611 * Note that I, V, d, and p are known. To compute D, we must invert the
1612 * matrix and use the known data and parity values to reconstruct the unknown
1613 * data values. We begin by removing the rows in V|I and d|p that correspond
1614 * to failed or missing columns; we then make V|I square (n x n) and d|p
1615 * sized n by removing rows corresponding to unused parity from the bottom up
1616 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
1617 * using Gauss-Jordan elimination. In the example below we use m=3 parity
1618 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
1619 * __ __
1620 * | 1 1 1 1 1 1 1 1 |
1621 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
1622 * | 19 205 116 29 64 16 4 1 | / /
1623 * | 1 0 0 0 0 0 0 0 | / /
1624 * | 0 1 0 0 0 0 0 0 | <--' /
1625 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
1626 * | 0 0 0 1 0 0 0 0 |
1627 * | 0 0 0 0 1 0 0 0 |
1628 * | 0 0 0 0 0 1 0 0 |
1629 * | 0 0 0 0 0 0 1 0 |
1630 * | 0 0 0 0 0 0 0 1 |
1631 * ~~ ~~
1632 * __ __
1633 * | 1 1 1 1 1 1 1 1 |
1634 * | 128 64 32 16 8 4 2 1 |
1635 * | 19 205 116 29 64 16 4 1 |
1636 * | 1 0 0 0 0 0 0 0 |
1637 * | 0 1 0 0 0 0 0 0 |
1638 * (V|I)' = | 0 0 1 0 0 0 0 0 |
1639 * | 0 0 0 1 0 0 0 0 |
1640 * | 0 0 0 0 1 0 0 0 |
1641 * | 0 0 0 0 0 1 0 0 |
1642 * | 0 0 0 0 0 0 1 0 |
1643 * | 0 0 0 0 0 0 0 1 |
1644 * ~~ ~~
1645 *
1646 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1647 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1648 * matrix is not singular.
1649 * __ __
1650 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1651 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1652 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1653 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1654 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1655 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1656 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1657 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1658 * ~~ ~~
1659 * __ __
1660 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1661 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1662 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1663 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1664 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1665 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1666 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1667 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1668 * ~~ ~~
1669 * __ __
1670 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1671 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1672 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
1673 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1674 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1675 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1676 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1677 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1678 * ~~ ~~
1679 * __ __
1680 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1681 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1682 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
1683 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1684 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1685 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1686 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1687 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1688 * ~~ ~~
1689 * __ __
1690 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1691 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1692 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1693 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1694 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1695 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1696 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1697 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1698 * ~~ ~~
1699 * __ __
1700 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1701 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
1702 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1703 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1704 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1705 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1706 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1707 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1708 * ~~ ~~
1709 * __ __
1710 * | 0 0 1 0 0 0 0 0 |
1711 * | 167 100 5 41 159 169 217 208 |
1712 * | 166 100 4 40 158 168 216 209 |
1713 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
1714 * | 0 0 0 0 1 0 0 0 |
1715 * | 0 0 0 0 0 1 0 0 |
1716 * | 0 0 0 0 0 0 1 0 |
1717 * | 0 0 0 0 0 0 0 1 |
1718 * ~~ ~~
1719 *
1720 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1721 * of the missing data.
1722 *
1723 * As is apparent from the example above, the only non-trivial rows in the
1724 * inverse matrix correspond to the data disks that we're trying to
1725 * reconstruct. Indeed, those are the only rows we need as the others would
1726 * only be useful for reconstructing data known or assumed to be valid. For
1727 * that reason, we only build the coefficients in the rows that correspond to
1728 * targeted columns.
1729 */
1730
1731 static void
vdev_raidz_matrix_init(raidz_row_t * rr,int n,int nmap,int * map,uint8_t ** rows)1732 vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map,
1733 uint8_t **rows)
1734 {
1735 int i, j;
1736 int pow;
1737
1738 ASSERT(n == rr->rr_cols - rr->rr_firstdatacol);
1739
1740 /*
1741 * Fill in the missing rows of interest.
1742 */
1743 for (i = 0; i < nmap; i++) {
1744 ASSERT3S(0, <=, map[i]);
1745 ASSERT3S(map[i], <=, 2);
1746
1747 pow = map[i] * n;
1748 if (pow > 255)
1749 pow -= 255;
1750 ASSERT(pow <= 255);
1751
1752 for (j = 0; j < n; j++) {
1753 pow -= map[i];
1754 if (pow < 0)
1755 pow += 255;
1756 rows[i][j] = vdev_raidz_pow2[pow];
1757 }
1758 }
1759 }
1760
/*
 * Compute the rows of the inverse matrix needed to reconstruct the missing
 * data columns, using Gauss-Jordan elimination.
 *
 * rr		- row being reconstructed (used for rr_firstdatacol)
 * n		- number of columns in each matrix row
 * nmissing	- number of rows in `rows`/`invrows` and entries in `missing`
 * missing	- for each row i, the column index that becomes that row's
 *		  pivot (rows[i][missing[i]])
 * rows		- in: the rows built by vdev_raidz_matrix_init(); modified in
 *		  place and reduced to rows of an identity matrix
 * invrows	- out: the corresponding rows of the inverse
 * used		- column list whose first nmissing entries are parity columns
 *		  and whose remaining entries are data columns (asserted)
 */
static void
vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rr->rr_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rr->rr_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 *
	 * For each used data column, its coefficient is moved out of `rows`
	 * (which is zeroed there) and into the matching slot of `invrows`.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rr->rr_firstdatacol);
			/* Convert the row column number to a data index. */
			jj = used[j] - rr->rr_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		/* Everything left of the pivot must already be eliminated. */
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		/*
		 * Eliminate this pivot column from every other row by adding
		 * (XOR) the normalized row scaled by that row's coefficient.
		 */
		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}
1854
1855 static void
vdev_raidz_matrix_reconstruct(raidz_row_t * rr,int n,int nmissing,int * missing,uint8_t ** invrows,const uint8_t * used)1856 vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
1857 int *missing, uint8_t **invrows, const uint8_t *used)
1858 {
1859 int i, j, x, cc, c;
1860 uint8_t *src;
1861 uint64_t ccount;
1862 uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL };
1863 uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 };
1864 uint8_t log = 0;
1865 uint8_t val;
1866 int ll;
1867 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1868 uint8_t *p, *pp;
1869 size_t psize;
1870
1871 psize = sizeof (invlog[0][0]) * n * nmissing;
1872 p = kmem_alloc(psize, KM_SLEEP);
1873
1874 for (pp = p, i = 0; i < nmissing; i++) {
1875 invlog[i] = pp;
1876 pp += n;
1877 }
1878
1879 for (i = 0; i < nmissing; i++) {
1880 for (j = 0; j < n; j++) {
1881 ASSERT3U(invrows[i][j], !=, 0);
1882 invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1883 }
1884 }
1885
1886 for (i = 0; i < n; i++) {
1887 c = used[i];
1888 ASSERT3U(c, <, rr->rr_cols);
1889
1890 ccount = rr->rr_col[c].rc_size;
1891 ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0);
1892 if (ccount == 0)
1893 continue;
1894 src = abd_to_buf(rr->rr_col[c].rc_abd);
1895 for (j = 0; j < nmissing; j++) {
1896 cc = missing[j] + rr->rr_firstdatacol;
1897 ASSERT3U(cc, >=, rr->rr_firstdatacol);
1898 ASSERT3U(cc, <, rr->rr_cols);
1899 ASSERT3U(cc, !=, c);
1900
1901 dcount[j] = rr->rr_col[cc].rc_size;
1902 if (dcount[j] != 0)
1903 dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd);
1904 }
1905
1906 for (x = 0; x < ccount; x++, src++) {
1907 if (*src != 0)
1908 log = vdev_raidz_log2[*src];
1909
1910 for (cc = 0; cc < nmissing; cc++) {
1911 if (x >= dcount[cc])
1912 continue;
1913
1914 if (*src == 0) {
1915 val = 0;
1916 } else {
1917 if ((ll = log + invlog[cc][i]) >= 255)
1918 ll -= 255;
1919 val = vdev_raidz_pow2[ll];
1920 }
1921
1922 if (i == 0)
1923 dst[cc][x] = val;
1924 else
1925 dst[cc][x] ^= val;
1926 }
1927 }
1928 }
1929
1930 kmem_free(p, psize);
1931 }
1932
1933 static void
vdev_raidz_reconstruct_general(raidz_row_t * rr,int * tgts,int ntgts)1934 vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
1935 {
1936 int i, c, t, tt;
1937 unsigned int n;
1938 unsigned int nmissing_rows;
1939 int missing_rows[VDEV_RAIDZ_MAXPARITY];
1940 int parity_map[VDEV_RAIDZ_MAXPARITY];
1941 uint8_t *p, *pp;
1942 size_t psize;
1943 uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1944 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1945 uint8_t *used;
1946
1947 abd_t **bufs = NULL;
1948
1949 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1950 zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts);
1951 /*
1952 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
1953 * temporary linear ABDs if any non-linear ABDs are found.
1954 */
1955 for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) {
1956 ASSERT(rr->rr_col[i].rc_abd != NULL);
1957 if (!abd_is_linear(rr->rr_col[i].rc_abd)) {
1958 bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *),
1959 KM_PUSHPAGE);
1960
1961 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1962 raidz_col_t *col = &rr->rr_col[c];
1963
1964 bufs[c] = col->rc_abd;
1965 if (bufs[c] != NULL) {
1966 col->rc_abd = abd_alloc_linear(
1967 col->rc_size, B_TRUE);
1968 abd_copy(col->rc_abd, bufs[c],
1969 col->rc_size);
1970 }
1971 }
1972
1973 break;
1974 }
1975 }
1976
1977 n = rr->rr_cols - rr->rr_firstdatacol;
1978
1979 /*
1980 * Figure out which data columns are missing.
1981 */
1982 nmissing_rows = 0;
1983 for (t = 0; t < ntgts; t++) {
1984 if (tgts[t] >= rr->rr_firstdatacol) {
1985 missing_rows[nmissing_rows++] =
1986 tgts[t] - rr->rr_firstdatacol;
1987 }
1988 }
1989
1990 /*
1991 * Figure out which parity columns to use to help generate the missing
1992 * data columns.
1993 */
1994 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1995 ASSERT(tt < ntgts);
1996 ASSERT(c < rr->rr_firstdatacol);
1997
1998 /*
1999 * Skip any targeted parity columns.
2000 */
2001 if (c == tgts[tt]) {
2002 tt++;
2003 continue;
2004 }
2005
2006 parity_map[i] = c;
2007 i++;
2008 }
2009
2010 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
2011 nmissing_rows * n + sizeof (used[0]) * n;
2012 p = kmem_alloc(psize, KM_SLEEP);
2013
2014 for (pp = p, i = 0; i < nmissing_rows; i++) {
2015 rows[i] = pp;
2016 pp += n;
2017 invrows[i] = pp;
2018 pp += n;
2019 }
2020 used = pp;
2021
2022 for (i = 0; i < nmissing_rows; i++) {
2023 used[i] = parity_map[i];
2024 }
2025
2026 for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
2027 if (tt < nmissing_rows &&
2028 c == missing_rows[tt] + rr->rr_firstdatacol) {
2029 tt++;
2030 continue;
2031 }
2032
2033 ASSERT3S(i, <, n);
2034 used[i] = c;
2035 i++;
2036 }
2037
2038 /*
2039 * Initialize the interesting rows of the matrix.
2040 */
2041 vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows);
2042
2043 /*
2044 * Invert the matrix.
2045 */
2046 vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows,
2047 invrows, used);
2048
2049 /*
2050 * Reconstruct the missing data using the generated matrix.
2051 */
2052 vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows,
2053 invrows, used);
2054
2055 kmem_free(p, psize);
2056
2057 /*
2058 * copy back from temporary linear abds and free them
2059 */
2060 if (bufs) {
2061 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
2062 raidz_col_t *col = &rr->rr_col[c];
2063
2064 if (bufs[c] != NULL) {
2065 abd_copy(bufs[c], col->rc_abd, col->rc_size);
2066 abd_free(col->rc_abd);
2067 }
2068 col->rc_abd = bufs[c];
2069 }
2070 kmem_free(bufs, rr->rr_cols * sizeof (abd_t *));
2071 }
2072 }
2073
2074 static void
vdev_raidz_reconstruct_row(raidz_map_t * rm,raidz_row_t * rr,const int * t,int nt)2075 vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
2076 const int *t, int nt)
2077 {
2078 int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
2079 int ntgts;
2080 int i, c, ret;
2081 int nbadparity, nbaddata;
2082 int parity_valid[VDEV_RAIDZ_MAXPARITY];
2083
2084 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
2085 zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)",
2086 rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata,
2087 (int)rr->rr_missingparity);
2088 }
2089
2090 nbadparity = rr->rr_firstdatacol;
2091 nbaddata = rr->rr_cols - nbadparity;
2092 ntgts = 0;
2093 for (i = 0, c = 0; c < rr->rr_cols; c++) {
2094 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
2095 zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u "
2096 "offset=%llx error=%u)",
2097 rr, c, (int)rr->rr_col[c].rc_devidx,
2098 (long long)rr->rr_col[c].rc_offset,
2099 (int)rr->rr_col[c].rc_error);
2100 }
2101 if (c < rr->rr_firstdatacol)
2102 parity_valid[c] = B_FALSE;
2103
2104 if (i < nt && c == t[i]) {
2105 tgts[ntgts++] = c;
2106 i++;
2107 } else if (rr->rr_col[c].rc_error != 0) {
2108 tgts[ntgts++] = c;
2109 } else if (c >= rr->rr_firstdatacol) {
2110 nbaddata--;
2111 } else {
2112 parity_valid[c] = B_TRUE;
2113 nbadparity--;
2114 }
2115 }
2116
2117 ASSERT(ntgts >= nt);
2118 ASSERT(nbaddata >= 0);
2119 ASSERT(nbaddata + nbadparity == ntgts);
2120
2121 dt = &tgts[nbadparity];
2122
2123 /* Reconstruct using the new math implementation */
2124 ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata);
2125 if (ret != RAIDZ_ORIGINAL_IMPL)
2126 return;
2127
2128 /*
2129 * See if we can use any of our optimized reconstruction routines.
2130 */
2131 switch (nbaddata) {
2132 case 1:
2133 if (parity_valid[VDEV_RAIDZ_P]) {
2134 vdev_raidz_reconstruct_p(rr, dt, 1);
2135 return;
2136 }
2137
2138 ASSERT(rr->rr_firstdatacol > 1);
2139
2140 if (parity_valid[VDEV_RAIDZ_Q]) {
2141 vdev_raidz_reconstruct_q(rr, dt, 1);
2142 return;
2143 }
2144
2145 ASSERT(rr->rr_firstdatacol > 2);
2146 break;
2147
2148 case 2:
2149 ASSERT(rr->rr_firstdatacol > 1);
2150
2151 if (parity_valid[VDEV_RAIDZ_P] &&
2152 parity_valid[VDEV_RAIDZ_Q]) {
2153 vdev_raidz_reconstruct_pq(rr, dt, 2);
2154 return;
2155 }
2156
2157 ASSERT(rr->rr_firstdatacol > 2);
2158
2159 break;
2160 }
2161
2162 vdev_raidz_reconstruct_general(rr, tgts, ntgts);
2163 }
2164
2165 static int
vdev_raidz_open(vdev_t * vd,uint64_t * asize,uint64_t * max_asize,uint64_t * logical_ashift,uint64_t * physical_ashift)2166 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
2167 uint64_t *logical_ashift, uint64_t *physical_ashift)
2168 {
2169 vdev_raidz_t *vdrz = vd->vdev_tsd;
2170 uint64_t nparity = vdrz->vd_nparity;
2171 int c;
2172 int lasterror = 0;
2173 int numerrors = 0;
2174
2175 ASSERT(nparity > 0);
2176
2177 if (nparity > VDEV_RAIDZ_MAXPARITY ||
2178 vd->vdev_children < nparity + 1) {
2179 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
2180 return (SET_ERROR(EINVAL));
2181 }
2182
2183 vdev_open_children(vd);
2184
2185 for (c = 0; c < vd->vdev_children; c++) {
2186 vdev_t *cvd = vd->vdev_child[c];
2187
2188 if (cvd->vdev_open_error != 0) {
2189 lasterror = cvd->vdev_open_error;
2190 numerrors++;
2191 continue;
2192 }
2193
2194 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
2195 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
2196 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
2197 }
2198 for (c = 0; c < vd->vdev_children; c++) {
2199 vdev_t *cvd = vd->vdev_child[c];
2200
2201 if (cvd->vdev_open_error != 0)
2202 continue;
2203 *physical_ashift = vdev_best_ashift(*logical_ashift,
2204 *physical_ashift, cvd->vdev_physical_ashift);
2205 }
2206
2207 if (vd->vdev_rz_expanding) {
2208 *asize *= vd->vdev_children - 1;
2209 *max_asize *= vd->vdev_children - 1;
2210
2211 vd->vdev_min_asize = *asize;
2212 } else {
2213 *asize *= vd->vdev_children;
2214 *max_asize *= vd->vdev_children;
2215 }
2216
2217 if (numerrors > nparity) {
2218 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
2219 return (lasterror);
2220 }
2221
2222 return (0);
2223 }
2224
2225 static void
vdev_raidz_close(vdev_t * vd)2226 vdev_raidz_close(vdev_t *vd)
2227 {
2228 for (int c = 0; c < vd->vdev_children; c++) {
2229 if (vd->vdev_child[c] != NULL)
2230 vdev_close(vd->vdev_child[c]);
2231 }
2232 }
2233
2234 /*
2235 * Return the logical width to use, given the txg in which the allocation
2236 * happened.
2237 */
2238 static uint64_t
vdev_raidz_get_logical_width(vdev_raidz_t * vdrz,uint64_t txg)2239 vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg)
2240 {
2241 reflow_node_t lookup = {
2242 .re_txg = txg,
2243 };
2244 avl_index_t where;
2245
2246 uint64_t width;
2247 mutex_enter(&vdrz->vd_expand_lock);
2248 reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where);
2249 if (re != NULL) {
2250 width = re->re_logical_width;
2251 } else {
2252 re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE);
2253 if (re != NULL)
2254 width = re->re_logical_width;
2255 else
2256 width = vdrz->vd_original_width;
2257 }
2258 mutex_exit(&vdrz->vd_expand_lock);
2259 return (width);
2260 }
2261 /*
2262 * This code converts an asize into the largest psize that can safely be written
2263 * to an allocation of that size for this vdev.
2264 *
2265 * Note that this function will not take into account the effect of gang
2266 * headers, which also modify the ASIZE of the DVAs. It is purely a reverse of
2267 * the psize_to_asize function.
2268 */
2269 static uint64_t
vdev_raidz_asize_to_psize(vdev_t * vd,uint64_t asize,uint64_t txg)2270 vdev_raidz_asize_to_psize(vdev_t *vd, uint64_t asize, uint64_t txg)
2271 {
2272 vdev_raidz_t *vdrz = vd->vdev_tsd;
2273 uint64_t psize;
2274 uint64_t ashift = vd->vdev_top->vdev_ashift;
2275 uint64_t nparity = vdrz->vd_nparity;
2276
2277 uint64_t cols = vdev_raidz_get_logical_width(vdrz, txg);
2278
2279 ASSERT0(asize % (1 << ashift));
2280
2281 psize = (asize >> ashift);
2282 /*
2283 * If the roundup to nparity + 1 caused us to spill into a new row, we
2284 * need to ignore that row entirely (since it can't store data or
2285 * parity).
2286 */
2287 uint64_t rows = psize / cols;
2288 psize = psize - (rows * cols) <= nparity ? rows * cols : psize;
2289 /* Subtract out parity sectors for each row storing data. */
2290 psize -= nparity * DIV_ROUND_UP(psize, cols);
2291 psize <<= ashift;
2292
2293 return (psize);
2294 }
2295
2296 /*
2297 * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated
2298 * more space due to the lower data-to-parity ratio. In this case it's
2299 * important to pass in the correct txg. Note that vdev_gang_header_asize()
2300 * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE,
2301 * regardless of txg. This is assured because for a single data sector, we
2302 * allocate P+1 sectors regardless of width ("cols", which is at least P+1).
2303 */
2304 static uint64_t
vdev_raidz_psize_to_asize(vdev_t * vd,uint64_t psize,uint64_t txg)2305 vdev_raidz_psize_to_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
2306 {
2307 vdev_raidz_t *vdrz = vd->vdev_tsd;
2308 uint64_t asize;
2309 uint64_t ashift = vd->vdev_top->vdev_ashift;
2310 uint64_t nparity = vdrz->vd_nparity;
2311
2312 uint64_t cols = vdev_raidz_get_logical_width(vdrz, txg);
2313
2314 asize = ((psize - 1) >> ashift) + 1;
2315 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
2316 asize = roundup(asize, nparity + 1) << ashift;
2317
2318 #ifdef ZFS_DEBUG
2319 uint64_t asize_new = ((psize - 1) >> ashift) + 1;
2320 uint64_t ncols_new = vdrz->vd_physical_width;
2321 asize_new += nparity * ((asize_new + ncols_new - nparity - 1) /
2322 (ncols_new - nparity));
2323 asize_new = roundup(asize_new, nparity + 1) << ashift;
2324 VERIFY3U(asize_new, <=, asize);
2325 #endif
2326
2327 return (asize);
2328 }
2329
2330 /*
2331 * The allocatable space for a raidz vdev is N * sizeof(smallest child)
2332 * so each child must provide at least 1/Nth of its asize.
2333 */
2334 static uint64_t
vdev_raidz_min_asize(vdev_t * vd)2335 vdev_raidz_min_asize(vdev_t *vd)
2336 {
2337 return ((vd->vdev_min_asize + vd->vdev_children - 1) /
2338 vd->vdev_children);
2339 }
2340
2341 /*
2342 * return B_TRUE if a read should be skipped due to being too slow.
2343 *
2344 * In vdev_child_slow_outlier() it looks for outliers based on disk
2345 * latency from the most recent child reads. Here we're checking if,
2346 * over time, a disk has has been an outlier too many times and is
2347 * now in a sit out period.
2348 */
2349 boolean_t
vdev_sit_out_reads(vdev_t * vd,zio_flag_t io_flags)2350 vdev_sit_out_reads(vdev_t *vd, zio_flag_t io_flags)
2351 {
2352 if (vdev_read_sit_out_secs == 0)
2353 return (B_FALSE);
2354
2355 /* Avoid skipping a data column read when scrubbing */
2356 if (io_flags & ZIO_FLAG_SCRUB)
2357 return (B_FALSE);
2358
2359 if (!vd->vdev_ops->vdev_op_leaf) {
2360 boolean_t sitting = B_FALSE;
2361 for (int c = 0; c < vd->vdev_children; c++) {
2362 sitting |= vdev_sit_out_reads(vd->vdev_child[c],
2363 io_flags);
2364 }
2365 return (sitting);
2366 }
2367
2368 if (vd->vdev_read_sit_out_expire >= gethrestime_sec())
2369 return (B_TRUE);
2370
2371 vd->vdev_read_sit_out_expire = 0;
2372
2373 return (B_FALSE);
2374 }
2375
2376 void
vdev_raidz_child_done(zio_t * zio)2377 vdev_raidz_child_done(zio_t *zio)
2378 {
2379 raidz_col_t *rc = zio->io_private;
2380
2381 ASSERT3P(rc->rc_abd, !=, NULL);
2382 rc->rc_error = zio->io_error;
2383 rc->rc_tried = 1;
2384 rc->rc_skipped = 0;
2385 }
2386
2387 static void
vdev_raidz_shadow_child_done(zio_t * zio)2388 vdev_raidz_shadow_child_done(zio_t *zio)
2389 {
2390 raidz_col_t *rc = zio->io_private;
2391
2392 rc->rc_shadow_error = zio->io_error;
2393 }
2394
2395 static void
vdev_raidz_io_verify(zio_t * zio,raidz_map_t * rm,raidz_row_t * rr,int col)2396 vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col)
2397 {
2398 (void) rm;
2399 #ifdef ZFS_DEBUG
2400 zfs_range_seg64_t logical_rs, physical_rs, remain_rs;
2401 logical_rs.rs_start = rr->rr_offset;
2402 logical_rs.rs_end = logical_rs.rs_start +
2403 vdev_raidz_psize_to_asize(zio->io_vd, rr->rr_size,
2404 BP_GET_PHYSICAL_BIRTH(zio->io_bp));
2405
2406 raidz_col_t *rc = &rr->rr_col[col];
2407 vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
2408
2409 vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
2410 ASSERT(vdev_xlate_is_empty(&remain_rs));
2411 if (vdev_xlate_is_empty(&physical_rs)) {
2412 /*
2413 * If we are in the middle of expansion, the
2414 * physical->logical mapping is changing so vdev_xlate()
2415 * can't give us a reliable answer.
2416 */
2417 return;
2418 }
2419 ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
2420 ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
2421 /*
2422 * It would be nice to assert that rs_end is equal
2423 * to rc_offset + rc_size but there might be an
2424 * optional I/O at the end that is not accounted in
2425 * rc_size.
2426 */
2427 if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
2428 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
2429 rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift));
2430 } else {
2431 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
2432 }
2433 #endif
2434 }
2435
2436 static void
vdev_raidz_io_start_write(zio_t * zio,raidz_row_t * rr)2437 vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr)
2438 {
2439 vdev_t *vd = zio->io_vd;
2440 raidz_map_t *rm = zio->io_vsd;
2441
2442 vdev_raidz_generate_parity_row(rm, rr);
2443
2444 for (int c = 0; c < rr->rr_scols; c++) {
2445 raidz_col_t *rc = &rr->rr_col[c];
2446 vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2447
2448 /* Verify physical to logical translation */
2449 vdev_raidz_io_verify(zio, rm, rr, c);
2450
2451 if (rc->rc_size == 0)
2452 continue;
2453
2454 ASSERT3U(rc->rc_offset + rc->rc_size, <,
2455 cvd->vdev_psize - VDEV_LABEL_END_SIZE);
2456
2457 ASSERT3P(rc->rc_abd, !=, NULL);
2458 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2459 rc->rc_offset, rc->rc_abd,
2460 abd_get_size(rc->rc_abd), zio->io_type,
2461 zio->io_priority, 0, vdev_raidz_child_done, rc));
2462
2463 if (rc->rc_shadow_devidx != INT_MAX) {
2464 vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx];
2465
2466 ASSERT3U(
2467 rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <,
2468 cvd2->vdev_psize - VDEV_LABEL_END_SIZE);
2469
2470 zio_nowait(zio_vdev_child_io(zio, NULL, cvd2,
2471 rc->rc_shadow_offset, rc->rc_abd,
2472 abd_get_size(rc->rc_abd),
2473 zio->io_type, zio->io_priority, 0,
2474 vdev_raidz_shadow_child_done, rc));
2475 }
2476 }
2477 }
2478
2479 /*
2480 * Generate optional I/Os for skip sectors to improve aggregation contiguity.
2481 * This only works for vdev_raidz_map_alloc() (not _expanded()).
2482 */
2483 static void
raidz_start_skip_writes(zio_t * zio)2484 raidz_start_skip_writes(zio_t *zio)
2485 {
2486 vdev_t *vd = zio->io_vd;
2487 uint64_t ashift = vd->vdev_top->vdev_ashift;
2488 raidz_map_t *rm = zio->io_vsd;
2489 ASSERT3U(rm->rm_nrows, ==, 1);
2490 raidz_row_t *rr = rm->rm_row[0];
2491 for (int c = 0; c < rr->rr_scols; c++) {
2492 raidz_col_t *rc = &rr->rr_col[c];
2493 vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2494 if (rc->rc_size != 0)
2495 continue;
2496 ASSERT0P(rc->rc_abd);
2497
2498 ASSERT3U(rc->rc_offset, <,
2499 cvd->vdev_psize - VDEV_LABEL_END_SIZE);
2500
2501 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset,
2502 NULL, 1ULL << ashift, zio->io_type, zio->io_priority,
2503 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
2504 }
2505 }
2506
2507 static void
vdev_raidz_io_start_read_row(zio_t * zio,raidz_row_t * rr,boolean_t forceparity)2508 vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
2509 {
2510 vdev_t *vd = zio->io_vd;
2511
2512 /*
2513 * Iterate over the columns in reverse order so that we hit the parity
2514 * last -- any errors along the way will force us to read the parity.
2515 */
2516 for (int c = rr->rr_cols - 1; c >= 0; c--) {
2517 raidz_col_t *rc = &rr->rr_col[c];
2518 if (rc->rc_size == 0)
2519 continue;
2520 vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2521 if (!vdev_readable(cvd)) {
2522 if (c >= rr->rr_firstdatacol)
2523 rr->rr_missingdata++;
2524 else
2525 rr->rr_missingparity++;
2526 rc->rc_error = SET_ERROR(ENXIO);
2527 rc->rc_tried = 1; /* don't even try */
2528 rc->rc_skipped = 1;
2529 continue;
2530 }
2531 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2532 if (c >= rr->rr_firstdatacol)
2533 rr->rr_missingdata++;
2534 else
2535 rr->rr_missingparity++;
2536 rc->rc_error = SET_ERROR(ESTALE);
2537 rc->rc_skipped = 1;
2538 continue;
2539 }
2540
2541 if (vdev_sit_out_reads(cvd, zio->io_flags)) {
2542 rr->rr_outlier_cnt++;
2543 ASSERT0(rc->rc_latency_outlier);
2544 rc->rc_latency_outlier = 1;
2545 }
2546 }
2547
2548 /*
2549 * When the row contains a latency outlier and sufficient parity
2550 * exists to reconstruct the column data, then skip reading the
2551 * known slow child vdev as a performance optimization.
2552 */
2553 if (rr->rr_outlier_cnt > 0 &&
2554 (rr->rr_firstdatacol - rr->rr_missingparity) >=
2555 (rr->rr_missingdata + 1)) {
2556
2557 for (int c = rr->rr_cols - 1; c >= 0; c--) {
2558 raidz_col_t *rc = &rr->rr_col[c];
2559
2560 if (rc->rc_error == 0 && rc->rc_latency_outlier) {
2561 if (c >= rr->rr_firstdatacol)
2562 rr->rr_missingdata++;
2563 else
2564 rr->rr_missingparity++;
2565 rc->rc_error = SET_ERROR(EAGAIN);
2566 rc->rc_skipped = 1;
2567 break;
2568 }
2569 }
2570 }
2571
2572 for (int c = rr->rr_cols - 1; c >= 0; c--) {
2573 raidz_col_t *rc = &rr->rr_col[c];
2574 vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2575
2576 if (rc->rc_error || rc->rc_size == 0)
2577 continue;
2578
2579 if (forceparity ||
2580 c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
2581 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
2582 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2583 rc->rc_offset, rc->rc_abd, rc->rc_size,
2584 zio->io_type, zio->io_priority, 0,
2585 vdev_raidz_child_done, rc));
2586 }
2587 }
2588 }
2589
2590 static void
vdev_raidz_io_start_read_phys_cols(zio_t * zio,raidz_map_t * rm)2591 vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm)
2592 {
2593 vdev_t *vd = zio->io_vd;
2594
2595 for (int i = 0; i < rm->rm_nphys_cols; i++) {
2596 raidz_col_t *prc = &rm->rm_phys_col[i];
2597 if (prc->rc_size == 0)
2598 continue;
2599
2600 ASSERT3U(prc->rc_devidx, ==, i);
2601 vdev_t *cvd = vd->vdev_child[i];
2602
2603 if (!vdev_readable(cvd)) {
2604 prc->rc_error = SET_ERROR(ENXIO);
2605 prc->rc_tried = 1; /* don't even try */
2606 prc->rc_skipped = 1;
2607 continue;
2608 }
2609 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2610 prc->rc_error = SET_ERROR(ESTALE);
2611 prc->rc_skipped = 1;
2612 continue;
2613 }
2614 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2615 prc->rc_offset, prc->rc_abd, prc->rc_size,
2616 zio->io_type, zio->io_priority, 0,
2617 vdev_raidz_child_done, prc));
2618 }
2619 }
2620
2621 static void
vdev_raidz_io_start_read(zio_t * zio,raidz_map_t * rm)2622 vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm)
2623 {
2624 /*
2625 * If there are multiple rows, we will be hitting
2626 * all disks, so go ahead and read the parity so
2627 * that we are reading in decent size chunks.
2628 */
2629 boolean_t forceparity = rm->rm_nrows > 1;
2630
2631 if (rm->rm_phys_col) {
2632 vdev_raidz_io_start_read_phys_cols(zio, rm);
2633 } else {
2634 for (int i = 0; i < rm->rm_nrows; i++) {
2635 raidz_row_t *rr = rm->rm_row[i];
2636 vdev_raidz_io_start_read_row(zio, rr, forceparity);
2637 }
2638 }
2639 }
2640
2641 /*
2642 * Start an IO operation on a RAIDZ VDev
2643 *
2644 * Outline:
2645 * - For write operations:
2646 * 1. Generate the parity data
2647 * 2. Create child zio write operations to each column's vdev, for both
2648 * data and parity.
2649 * 3. If the column skips any sectors for padding, create optional dummy
2650 * write zio children for those areas to improve aggregation continuity.
2651 * - For read operations:
2652 * 1. Create child zio read operations to each data column's vdev to read
2653 * the range of data required for zio.
2654 * 2. If this is a scrub or resilver operation, or if any of the data
2655 * vdevs have had errors, then create zio read operations to the parity
2656 * columns' VDevs as well.
2657 */
2658 static void
vdev_raidz_io_start(zio_t * zio)2659 vdev_raidz_io_start(zio_t *zio)
2660 {
2661 vdev_t *vd = zio->io_vd;
2662 vdev_t *tvd = vd->vdev_top;
2663 vdev_raidz_t *vdrz = vd->vdev_tsd;
2664 raidz_map_t *rm;
2665
2666 uint64_t logical_width = vdev_raidz_get_logical_width(vdrz,
2667 BP_GET_PHYSICAL_BIRTH(zio->io_bp));
2668 if (logical_width != vdrz->vd_physical_width) {
2669 zfs_locked_range_t *lr = NULL;
2670 uint64_t synced_offset = UINT64_MAX;
2671 uint64_t next_offset = UINT64_MAX;
2672 boolean_t use_scratch = B_FALSE;
2673 /*
2674 * Note: when the expansion is completing, we set
2675 * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync())
2676 * in a later txg than when we last update spa_ubsync's state
2677 * (see the end of spa_raidz_expand_thread()). Therefore we
2678 * may see vre_state!=SCANNING before
2679 * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected
2680 * on disk, but the copying progress has been synced to disk
2681 * (and reflected in spa_ubsync). In this case it's fine to
2682 * treat the expansion as completed, since if we crash there's
2683 * no additional copying to do.
2684 */
2685 if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
2686 ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==,
2687 &vdrz->vn_vre);
2688 lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock,
2689 zio->io_offset, zio->io_size, RL_READER);
2690 use_scratch =
2691 (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) ==
2692 RRSS_SCRATCH_VALID);
2693 synced_offset =
2694 RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync);
2695 next_offset = vdrz->vn_vre.vre_offset;
2696 /*
2697 * If we haven't resumed expanding since importing the
2698 * pool, vre_offset won't have been set yet. In
2699 * this case the next offset to be copied is the same
2700 * as what was synced.
2701 */
2702 if (next_offset == UINT64_MAX) {
2703 next_offset = synced_offset;
2704 }
2705 }
2706
2707 rm = vdev_raidz_map_alloc_expanded(zio,
2708 tvd->vdev_ashift, vdrz->vd_physical_width,
2709 logical_width, vdrz->vd_nparity,
2710 synced_offset, next_offset, use_scratch);
2711 rm->rm_lr = lr;
2712 } else {
2713 rm = vdev_raidz_map_alloc(zio,
2714 tvd->vdev_ashift, logical_width, vdrz->vd_nparity);
2715 }
2716 rm->rm_original_width = vdrz->vd_original_width;
2717
2718 zio->io_vsd = rm;
2719 zio->io_vsd_ops = &vdev_raidz_vsd_ops;
2720 if (zio->io_type == ZIO_TYPE_WRITE) {
2721 for (int i = 0; i < rm->rm_nrows; i++) {
2722 vdev_raidz_io_start_write(zio, rm->rm_row[i]);
2723 }
2724
2725 if (logical_width == vdrz->vd_physical_width) {
2726 raidz_start_skip_writes(zio);
2727 }
2728 } else {
2729 ASSERT(zio->io_type == ZIO_TYPE_READ);
2730 vdev_raidz_io_start_read(zio, rm);
2731 }
2732
2733 zio_execute(zio);
2734 }
2735
2736 /*
2737 * Report a checksum error for a child of a RAID-Z device.
2738 */
2739 void
vdev_raidz_checksum_error(zio_t * zio,raidz_col_t * rc,abd_t * bad_data)2740 vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
2741 {
2742 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
2743
2744 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) &&
2745 zio->io_priority != ZIO_PRIORITY_REBUILD) {
2746 zio_bad_cksum_t zbc;
2747 raidz_map_t *rm = zio->io_vsd;
2748
2749 zbc.zbc_has_cksum = 0;
2750 zbc.zbc_injected = rm->rm_ecksuminjected;
2751
2752 mutex_enter(&vd->vdev_stat_lock);
2753 vd->vdev_stat.vs_checksum_errors++;
2754 mutex_exit(&vd->vdev_stat_lock);
2755 (void) zfs_ereport_post_checksum(zio->io_spa, vd,
2756 &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
2757 rc->rc_abd, bad_data, &zbc);
2758 }
2759 }
2760
2761 /*
2762 * We keep track of whether or not there were any injected errors, so that
2763 * any ereports we generate can note it.
2764 */
2765 static int
raidz_checksum_verify(zio_t * zio)2766 raidz_checksum_verify(zio_t *zio)
2767 {
2768 zio_bad_cksum_t zbc = {0};
2769 raidz_map_t *rm = zio->io_vsd;
2770
2771 int ret = zio_checksum_error(zio, &zbc);
2772 /*
2773 * Any Direct I/O read that has a checksum error must be treated as
2774 * suspicious as the contents of the buffer could be getting
2775 * manipulated while the I/O is taking place. The checksum verify error
2776 * will be reported to the top-level RAIDZ VDEV.
2777 */
2778 if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) {
2779 zio->io_error = ret;
2780 zio->io_post |= ZIO_POST_DIO_CHKSUM_ERR;
2781 zio_dio_chksum_verify_error_report(zio);
2782 zio_checksum_verified(zio);
2783 return (0);
2784 }
2785
2786 if (ret != 0 && zbc.zbc_injected != 0)
2787 rm->rm_ecksuminjected = 1;
2788
2789 return (ret);
2790 }
2791
2792 /*
2793 * Generate the parity from the data columns. If we tried and were able to
2794 * read the parity without error, verify that the generated parity matches the
2795 * data we read. If it doesn't, we fire off a checksum error. Return the
2796 * number of such failures.
2797 */
2798 static int
raidz_parity_verify(zio_t * zio,raidz_row_t * rr)2799 raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
2800 {
2801 abd_t *orig[VDEV_RAIDZ_MAXPARITY];
2802 int c, ret = 0;
2803 raidz_map_t *rm = zio->io_vsd;
2804 raidz_col_t *rc;
2805
2806 blkptr_t *bp = zio->io_bp;
2807 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
2808 (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
2809
2810 if (checksum == ZIO_CHECKSUM_NOPARITY)
2811 return (ret);
2812
2813 for (c = 0; c < rr->rr_firstdatacol; c++) {
2814 rc = &rr->rr_col[c];
2815 if (!rc->rc_tried || rc->rc_error != 0)
2816 continue;
2817
2818 orig[c] = rc->rc_abd;
2819 ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size);
2820 rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
2821 }
2822
2823 /*
2824 * Verify any empty sectors are zero filled to ensure the parity
2825 * is calculated correctly even if these non-data sectors are damaged.
2826 */
2827 if (rr->rr_nempty && rr->rr_abd_empty != NULL)
2828 ret += vdev_draid_map_verify_empty(zio, rr);
2829
2830 /*
2831 * Regenerates parity even for !tried||rc_error!=0 columns. This
2832 * isn't harmful but it does have the side effect of fixing stuff
2833 * we didn't realize was necessary (i.e. even if we return 0).
2834 */
2835 vdev_raidz_generate_parity_row(rm, rr);
2836
2837 for (c = 0; c < rr->rr_firstdatacol; c++) {
2838 rc = &rr->rr_col[c];
2839
2840 if (!rc->rc_tried || rc->rc_error != 0)
2841 continue;
2842
2843 if (abd_cmp(orig[c], rc->rc_abd) != 0) {
2844 vdev_raidz_checksum_error(zio, rc, orig[c]);
2845 rc->rc_error = SET_ERROR(ECKSUM);
2846 ret++;
2847 }
2848 abd_free(orig[c]);
2849 }
2850
2851 return (ret);
2852 }
2853
2854 static int
vdev_raidz_worst_error(raidz_row_t * rr)2855 vdev_raidz_worst_error(raidz_row_t *rr)
2856 {
2857 int error = 0;
2858
2859 for (int c = 0; c < rr->rr_cols; c++) {
2860 error = zio_worst_error(error, rr->rr_col[c].rc_error);
2861 error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error);
2862 }
2863
2864 return (error);
2865 }
2866
/*
 * Find the median value from a set of n values.
 *
 * The middle element(s) are selected by index, so the result is only a true
 * median when data[] is already sorted.  For an even count the result is
 * the average of the two middle values, rounded down.
 *
 * data	- array of at least n values (not dereferenced when n is 0)
 * n	- number of values
 *
 * Returns the median, or 0 when n is 0.
 */
static uint64_t
latency_median_value(const uint64_t *data, size_t n)
{
	/*
	 * With no samples there is no middle element; bail out rather than
	 * indexing data[(size_t)-1].
	 */
	if (n == 0)
		return (0);

	size_t mid = n >> 1;

	if (n % 2 != 0)
		return (data[mid]);

	uint64_t lo = data[mid - 1];
	uint64_t hi = data[mid];

	/*
	 * floor((lo + hi) / 2) computed without forming lo + hi, which
	 * could wrap for very large values.
	 */
	return ((lo >> 1) + (hi >> 1) + (lo & hi & 1));
}
2882
2883 /*
2884 * Calculate the outlier fence from a set of n latency values
2885 *
2886 * fence = Q3 + vdev_raidz_outlier_insensitivity x (Q3 - Q1)
2887 */
2888 static uint64_t
latency_quartiles_fence(const uint64_t * data,size_t n,uint64_t * iqr)2889 latency_quartiles_fence(const uint64_t *data, size_t n, uint64_t *iqr)
2890 {
2891 uint64_t q1 = latency_median_value(&data[0], n >> 1);
2892 uint64_t q3 = latency_median_value(&data[(n + 1) >> 1], n >> 1);
2893
2894 /*
2895 * To avoid detecting false positive outliers when N is small and
2896 * and the latencies values are very close, make sure the IQR
2897 * is at least 25% larger than Q1.
2898 */
2899 *iqr = MAX(q3 - q1, q1 / 4);
2900
2901 return (q3 + (*iqr * vdev_raidz_outlier_insensitivity));
2902 }
/*
 * vdev_child_slow_outlier() does nothing for a vdev with fewer children
 * than this.
 */
#define LAT_CHILDREN_MIN 5
/*
 * A child begins a read sit out once its outlier count exceeds this limit;
 * a single detection adds at most a quarter of it to the count.
 */
#define LAT_OUTLIER_LIMIT 20
2905
/*
 * qsort() comparator ordering two uint64_t latency values ascending.
 * The result is whatever TREE_CMP() yields for the pair.
 */
static int
latency_compare(const void *arg1, const void *arg2)
{
	const uint64_t lhs = *(const uint64_t *)arg1;
	const uint64_t rhs = *(const uint64_t *)arg2;

	return (TREE_CMP(lhs, rhs));
}
2914
2915 void
vdev_raidz_sit_child(vdev_t * svd,uint64_t secs)2916 vdev_raidz_sit_child(vdev_t *svd, uint64_t secs)
2917 {
2918 for (int c = 0; c < svd->vdev_children; c++)
2919 vdev_raidz_sit_child(svd->vdev_child[c], secs);
2920
2921 if (!svd->vdev_ops->vdev_op_leaf)
2922 return;
2923
2924 /* Begin a sit out period for this slow drive */
2925 svd->vdev_read_sit_out_expire = gethrestime_sec() +
2926 secs;
2927
2928 /* Count each slow io period */
2929 mutex_enter(&svd->vdev_stat_lock);
2930 svd->vdev_stat.vs_slow_ios++;
2931 mutex_exit(&svd->vdev_stat_lock);
2932 }
2933
2934 void
vdev_raidz_unsit_child(vdev_t * vd)2935 vdev_raidz_unsit_child(vdev_t *vd)
2936 {
2937 for (int c = 0; c < vd->vdev_children; c++)
2938 vdev_raidz_unsit_child(vd->vdev_child[c]);
2939
2940 if (!vd->vdev_ops->vdev_op_leaf)
2941 return;
2942
2943 vd->vdev_read_sit_out_expire = 0;
2944 }
2945
2946 /*
2947 * Check for any latency outlier from latest set of child reads.
2948 *
2949 * Uses a Tukey's fence, with K = 50, for detecting extreme outliers. This
2950 * rule defines extreme outliers as data points outside the fence of the
2951 * third quartile plus fifty times the Interquartile Range (IQR). This range
2952 * is the distance between the first and third quartile.
2953 *
2954 * Fifty is an extremely large value for Tukey's fence, but the outliers we're
2955 * attempting to detect here are orders of magnitude times larger than the
2956 * median. This large value should capture any truly fault disk quickly,
2957 * without causing spurious sit-outs.
2958 *
2959 * To further avoid spurious sit-outs, vdevs must be detected multiple times
2960 * as an outlier before they are sat, and outlier counts will gradually decay.
2961 * Every nchildren times we have detected an outlier, we subtract 2 from the
2962 * outlier count of all children. If detected outliers are close to uniformly
2963 * distributed, this will result in the outlier count remaining close to 0
2964 * (in expectation; over long enough time-scales, spurious sit-outs are still
2965 * possible).
2966 */
2967 static void
vdev_child_slow_outlier(zio_t * zio)2968 vdev_child_slow_outlier(zio_t *zio)
2969 {
2970 vdev_t *vd = zio->io_vd;
2971 if (!vd->vdev_autosit || vdev_read_sit_out_secs == 0 ||
2972 vd->vdev_children < LAT_CHILDREN_MIN)
2973 return;
2974
2975 hrtime_t now = getlrtime();
2976 uint64_t last = atomic_load_64(&vd->vdev_last_latency_check);
2977
2978 if ((now - last) < MSEC2NSEC(vdev_raidz_outlier_check_interval_ms))
2979 return;
2980
2981 /* Allow a single winner when there are racing callers. */
2982 if (atomic_cas_64(&vd->vdev_last_latency_check, last, now) != last)
2983 return;
2984
2985 int children = vd->vdev_children;
2986 uint64_t *lat_data = kmem_alloc(sizeof (uint64_t) * children, KM_SLEEP);
2987
2988 for (int c = 0; c < children; c++) {
2989 vdev_t *cvd = vd->vdev_child[c];
2990 if (cvd->vdev_prev_histo == NULL) {
2991 mutex_enter(&cvd->vdev_stat_lock);
2992 size_t size =
2993 sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]);
2994 cvd->vdev_prev_histo = kmem_zalloc(size, KM_SLEEP);
2995 memcpy(cvd->vdev_prev_histo,
2996 cvd->vdev_stat_ex.vsx_disk_histo[ZIO_TYPE_READ],
2997 size);
2998 mutex_exit(&cvd->vdev_stat_lock);
2999 }
3000 }
3001 uint64_t max = 0;
3002 vdev_t *svd = NULL;
3003 uint_t sitouts = 0;
3004 boolean_t skip = B_FALSE, svd_sitting = B_FALSE;
3005 for (int c = 0; c < children; c++) {
3006 vdev_t *cvd = vd->vdev_child[c];
3007 boolean_t sitting = vdev_sit_out_reads(cvd, 0) ||
3008 cvd->vdev_state != VDEV_STATE_HEALTHY;
3009
3010 /* We can't sit out more disks than we have parity */
3011 if (sitting && ++sitouts >= vdev_get_nparity(vd))
3012 skip = B_TRUE;
3013
3014 mutex_enter(&cvd->vdev_stat_lock);
3015
3016 uint64_t *prev_histo = cvd->vdev_prev_histo;
3017 uint64_t *histo =
3018 cvd->vdev_stat_ex.vsx_disk_histo[ZIO_TYPE_READ];
3019 if (skip) {
3020 size_t size =
3021 sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]);
3022 memcpy(prev_histo, histo, size);
3023 mutex_exit(&cvd->vdev_stat_lock);
3024 continue;
3025 }
3026 uint64_t count = 0;
3027 lat_data[c] = 0;
3028 for (int i = 0; i < VDEV_L_HISTO_BUCKETS; i++) {
3029 uint64_t this_count = histo[i] - prev_histo[i];
3030 lat_data[c] += (1ULL << i) * this_count;
3031 count += this_count;
3032 }
3033 size_t size = sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]);
3034 memcpy(prev_histo, histo, size);
3035 mutex_exit(&cvd->vdev_stat_lock);
3036 lat_data[c] /= MAX(1, count);
3037
3038 /* Wait until all disks have been read from */
3039 if (lat_data[c] == 0 && !sitting) {
3040 skip = B_TRUE;
3041 continue;
3042 }
3043
3044 /* Keep track of the vdev with largest value */
3045 if (lat_data[c] > max) {
3046 max = lat_data[c];
3047 svd = cvd;
3048 svd_sitting = sitting;
3049 }
3050 }
3051
3052 if (skip) {
3053 kmem_free(lat_data, sizeof (uint64_t) * children);
3054 return;
3055 }
3056
3057 qsort((void *)lat_data, children, sizeof (uint64_t), latency_compare);
3058
3059 uint64_t iqr;
3060 uint64_t fence = latency_quartiles_fence(lat_data, children, &iqr);
3061
3062 ASSERT3U(lat_data[children - 1], ==, max);
3063 if (max > fence && !svd_sitting) {
3064 ASSERT3U(iqr, >, 0);
3065 uint64_t incr = MAX(1, MIN((max - fence) / iqr,
3066 LAT_OUTLIER_LIMIT / 4));
3067 vd->vdev_outlier_count += incr;
3068 if (vd->vdev_outlier_count >= children) {
3069 for (int c = 0; c < children; c++) {
3070 vdev_t *cvd = vd->vdev_child[c];
3071 cvd->vdev_outlier_count -= 2;
3072 cvd->vdev_outlier_count = MAX(0,
3073 cvd->vdev_outlier_count);
3074 }
3075 vd->vdev_outlier_count = 0;
3076 }
3077 /*
3078 * Keep track of how many times this child has had
3079 * an outlier read. A disk that persitently has a
3080 * higher than peers outlier count will be considered
3081 * a slow disk.
3082 */
3083 svd->vdev_outlier_count += incr;
3084 if (svd->vdev_outlier_count > LAT_OUTLIER_LIMIT) {
3085 ASSERT0(svd->vdev_read_sit_out_expire);
3086 vdev_raidz_sit_child(svd, vdev_read_sit_out_secs);
3087 (void) zfs_ereport_post(FM_EREPORT_ZFS_SITOUT,
3088 zio->io_spa, svd, NULL, NULL, 0);
3089 vdev_dbgmsg(svd, "begin read sit out for %d secs",
3090 (int)vdev_read_sit_out_secs);
3091
3092 for (int c = 0; c < vd->vdev_children; c++)
3093 vd->vdev_child[c]->vdev_outlier_count = 0;
3094 }
3095 }
3096
3097 kmem_free(lat_data, sizeof (uint64_t) * children);
3098 }
3099
3100 static void
vdev_raidz_io_done_verified(zio_t * zio,raidz_row_t * rr)3101 vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
3102 {
3103 int unexpected_errors = 0;
3104 int parity_errors = 0;
3105 int parity_untried = 0;
3106 int data_errors = 0;
3107
3108 ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
3109
3110 for (int c = 0; c < rr->rr_cols; c++) {
3111 raidz_col_t *rc = &rr->rr_col[c];
3112
3113 if (rc->rc_error) {
3114 if (c < rr->rr_firstdatacol)
3115 parity_errors++;
3116 else
3117 data_errors++;
3118
3119 if (!rc->rc_skipped)
3120 unexpected_errors++;
3121 } else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
3122 parity_untried++;
3123 }
3124
3125 if (rc->rc_force_repair)
3126 unexpected_errors++;
3127 }
3128
3129 /*
3130 * If we read more parity disks than were used for
3131 * reconstruction, confirm that the other parity disks produced
3132 * correct data.
3133 *
3134 * Note that we also regenerate parity when resilvering so we
3135 * can write it out to failed devices later.
3136 */
3137 if (parity_errors + parity_untried <
3138 rr->rr_firstdatacol - data_errors ||
3139 (zio->io_flags & ZIO_FLAG_RESILVER)) {
3140 int n = raidz_parity_verify(zio, rr);
3141 unexpected_errors += n;
3142 }
3143
3144 if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
3145 (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) {
3146 /*
3147 * Use the good data we have in hand to repair damaged children.
3148 */
3149 for (int c = 0; c < rr->rr_cols; c++) {
3150 raidz_col_t *rc = &rr->rr_col[c];
3151 vdev_t *vd = zio->io_vd;
3152 vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
3153
3154 if (!rc->rc_allow_repair) {
3155 continue;
3156 } else if (!rc->rc_force_repair &&
3157 (rc->rc_error == 0 || rc->rc_size == 0)) {
3158 continue;
3159 }
3160 /*
3161 * We do not allow self healing for Direct I/O reads.
3162 * See comment in vdev_raid_row_alloc().
3163 */
3164 ASSERT0(zio->io_flags & ZIO_FLAG_DIO_READ);
3165
3166 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
3167 rc->rc_offset, rc->rc_abd, rc->rc_size,
3168 ZIO_TYPE_WRITE,
3169 zio->io_priority == ZIO_PRIORITY_REBUILD ?
3170 ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
3171 ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
3172 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
3173 }
3174 }
3175
3176 /*
3177 * Scrub or resilver i/o's: overwrite any shadow locations with the
3178 * good data. This ensures that if we've already copied this sector,
3179 * it will be corrected if it was damaged. This writes more than is
3180 * necessary, but since expansion is paused during scrub/resilver, at
3181 * most a single row will have a shadow location.
3182 */
3183 if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
3184 (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) {
3185 for (int c = 0; c < rr->rr_cols; c++) {
3186 raidz_col_t *rc = &rr->rr_col[c];
3187 vdev_t *vd = zio->io_vd;
3188
3189 if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0)
3190 continue;
3191 vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx];
3192
3193 /*
3194 * Note: We don't want to update the repair stats
3195 * because that would incorrectly indicate that there
3196 * was bad data to repair, which we aren't sure about.
3197 * By clearing the SCAN_THREAD flag, we prevent this
3198 * from happening, despite having the REPAIR flag set.
3199 * We need to set SELF_HEAL so that this i/o can't be
3200 * bypassed by zio_vdev_io_start().
3201 */
3202 zio_t *cio = zio_vdev_child_io(zio, NULL, cvd,
3203 rc->rc_shadow_offset, rc->rc_abd, rc->rc_size,
3204 ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
3205 ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
3206 NULL, NULL);
3207 cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD;
3208 zio_nowait(cio);
3209 }
3210 }
3211 }
3212
3213 static void
raidz_restore_orig_data(raidz_map_t * rm)3214 raidz_restore_orig_data(raidz_map_t *rm)
3215 {
3216 for (int i = 0; i < rm->rm_nrows; i++) {
3217 raidz_row_t *rr = rm->rm_row[i];
3218 for (int c = 0; c < rr->rr_cols; c++) {
3219 raidz_col_t *rc = &rr->rr_col[c];
3220 if (rc->rc_need_orig_restore) {
3221 abd_copy(rc->rc_abd,
3222 rc->rc_orig_data, rc->rc_size);
3223 rc->rc_need_orig_restore = B_FALSE;
3224 }
3225 }
3226 }
3227 }
3228
3229 /*
3230 * During raidz_reconstruct() for expanded VDEV, we need special consideration
3231 * failure simulations. See note in raidz_reconstruct() on simulating failure
3232 * of a pre-expansion device.
3233 *
3234 * Treating logical child i as failed, return TRUE if the given column should
3235 * be treated as failed. The idea of logical children allows us to imagine
3236 * that a disk silently failed before a RAIDZ expansion (reads from this disk
3237 * succeed but return the wrong data). Since the expansion doesn't verify
3238 * checksums, the incorrect data will be moved to new locations spread among
3239 * the children (going diagonally across them).
3240 *
3241 * Higher "logical child failures" (values of `i`) indicate these
3242 * "pre-expansion failures". The first physical_width values imagine that a
3243 * current child failed; the next physical_width-1 values imagine that a
3244 * child failed before the most recent expansion; the next physical_width-2
3245 * values imagine a child failed in the expansion before that, etc.
3246 */
3247 static boolean_t
raidz_simulate_failure(int physical_width,int original_width,int ashift,int i,raidz_col_t * rc)3248 raidz_simulate_failure(int physical_width, int original_width, int ashift,
3249 int i, raidz_col_t *rc)
3250 {
3251 uint64_t sector_id =
3252 physical_width * (rc->rc_offset >> ashift) +
3253 rc->rc_devidx;
3254
3255 for (int w = physical_width; w >= original_width; w--) {
3256 if (i < w) {
3257 return (sector_id % w == i);
3258 } else {
3259 i -= w;
3260 }
3261 }
3262 ASSERT(!"invalid logical child id");
3263 return (B_FALSE);
3264 }
3265
3266 /*
3267 * returns EINVAL if reconstruction of the block will not be possible
3268 * returns ECKSUM if this specific reconstruction failed
3269 * returns 0 on successful reconstruction
3270 */
3271 static int
raidz_reconstruct(zio_t * zio,int * ltgts,int ntgts,int nparity)3272 raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
3273 {
3274 raidz_map_t *rm = zio->io_vsd;
3275 int physical_width = zio->io_vd->vdev_children;
3276 int original_width = (rm->rm_original_width != 0) ?
3277 rm->rm_original_width : physical_width;
3278 int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT;
3279
3280 if (dbgmsg) {
3281 zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u "
3282 "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts);
3283 }
3284
3285 /* Reconstruct each row */
3286 for (int r = 0; r < rm->rm_nrows; r++) {
3287 raidz_row_t *rr = rm->rm_row[r];
3288 int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */
3289 int t = 0;
3290 int dead = 0;
3291 int dead_data = 0;
3292
3293 if (dbgmsg)
3294 zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r);
3295
3296 for (int c = 0; c < rr->rr_cols; c++) {
3297 raidz_col_t *rc = &rr->rr_col[c];
3298 ASSERT0(rc->rc_need_orig_restore);
3299 if (rc->rc_error != 0) {
3300 dead++;
3301 if (c >= nparity)
3302 dead_data++;
3303 continue;
3304 }
3305 if (rc->rc_size == 0)
3306 continue;
3307 for (int lt = 0; lt < ntgts; lt++) {
3308 if (raidz_simulate_failure(physical_width,
3309 original_width,
3310 zio->io_vd->vdev_top->vdev_ashift,
3311 ltgts[lt], rc)) {
3312 if (rc->rc_orig_data == NULL) {
3313 rc->rc_orig_data =
3314 abd_alloc_linear(
3315 rc->rc_size, B_TRUE);
3316 abd_copy(rc->rc_orig_data,
3317 rc->rc_abd, rc->rc_size);
3318 }
3319 rc->rc_need_orig_restore = B_TRUE;
3320
3321 dead++;
3322 if (c >= nparity)
3323 dead_data++;
3324 /*
3325 * Note: simulating failure of a
3326 * pre-expansion device can hit more
3327 * than one column, in which case we
3328 * might try to simulate more failures
3329 * than can be reconstructed, which is
3330 * also more than the size of my_tgts.
3331 * This check prevents accessing past
3332 * the end of my_tgts. The "dead >
3333 * nparity" check below will fail this
3334 * reconstruction attempt.
3335 */
3336 if (t < VDEV_RAIDZ_MAXPARITY) {
3337 my_tgts[t++] = c;
3338 if (dbgmsg) {
3339 zfs_dbgmsg("simulating "
3340 "failure of col %u "
3341 "devidx %u", c,
3342 (int)rc->rc_devidx);
3343 }
3344 }
3345 break;
3346 }
3347 }
3348 }
3349 if (dead > nparity) {
3350 /* reconstruction not possible */
3351 if (dbgmsg) {
3352 zfs_dbgmsg("reconstruction not possible; "
3353 "too many failures");
3354 }
3355 raidz_restore_orig_data(rm);
3356 return (EINVAL);
3357 }
3358 if (dead_data > 0)
3359 vdev_raidz_reconstruct_row(rm, rr, my_tgts, t);
3360 }
3361
3362 /* Check for success */
3363 if (raidz_checksum_verify(zio) == 0) {
3364 if (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR)
3365 return (0);
3366
3367 /* Reconstruction succeeded - report errors */
3368 for (int i = 0; i < rm->rm_nrows; i++) {
3369 raidz_row_t *rr = rm->rm_row[i];
3370
3371 for (int c = 0; c < rr->rr_cols; c++) {
3372 raidz_col_t *rc = &rr->rr_col[c];
3373 if (rc->rc_need_orig_restore) {
3374 /*
3375 * Note: if this is a parity column,
3376 * we don't really know if it's wrong.
3377 * We need to let
3378 * vdev_raidz_io_done_verified() check
3379 * it, and if we set rc_error, it will
3380 * think that it is a "known" error
3381 * that doesn't need to be checked
3382 * or corrected.
3383 */
3384 if (rc->rc_error == 0 &&
3385 c >= rr->rr_firstdatacol) {
3386 vdev_raidz_checksum_error(zio,
3387 rc, rc->rc_orig_data);
3388 rc->rc_error =
3389 SET_ERROR(ECKSUM);
3390 }
3391 rc->rc_need_orig_restore = B_FALSE;
3392 }
3393 }
3394
3395 vdev_raidz_io_done_verified(zio, rr);
3396 }
3397
3398 zio_checksum_verified(zio);
3399
3400 if (dbgmsg) {
3401 zfs_dbgmsg("reconstruction successful "
3402 "(checksum verified)");
3403 }
3404 return (0);
3405 }
3406
3407 /* Reconstruction failed - restore original data */
3408 raidz_restore_orig_data(rm);
3409 if (dbgmsg) {
3410 zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum "
3411 "failed", zio);
3412 }
3413 return (ECKSUM);
3414 }
3415
3416 /*
3417 * Iterate over all combinations of N bad vdevs and attempt a reconstruction.
3418 * Note that the algorithm below is non-optimal because it doesn't take into
3419 * account how reconstruction is actually performed. For example, with
3420 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
3421 * is targeted as invalid as if columns 1 and 4 are targeted since in both
3422 * cases we'd only use parity information in column 0.
3423 *
3424 * The order that we find the various possible combinations of failed
3425 * disks is dictated by these rules:
3426 * - Examine each "slot" (the "i" in tgts[i])
3427 * - Try to increment this slot (tgts[i] += 1)
3428 * - if we can't increment because it runs into the next slot,
3429 * reset our slot to the minimum, and examine the next slot
3430 *
3431 * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
3432 * 3 columns to reconstruct), we will generate the following sequence:
3433 *
3434 * STATE ACTION
3435 * 0 1 2 special case: skip since these are all parity
3436 * 0 1 3 first slot: reset to 0; middle slot: increment to 2
3437 * 0 2 3 first slot: increment to 1
3438 * 1 2 3 first: reset to 0; middle: reset to 1; last: increment to 4
3439 * 0 1 4 first: reset to 0; middle: increment to 2
3440 * 0 2 4 first: increment to 1
3441 * 1 2 4 first: reset to 0; middle: increment to 3
3442 * 0 3 4 first: increment to 1
3443 * 1 3 4 first: increment to 2
3444 * 2 3 4 first: reset to 0; middle: reset to 1; last: increment to 5
3445 * 0 1 5 first: reset to 0; middle: increment to 2
3446 * 0 2 5 first: increment to 1
3447 * 1 2 5 first: reset to 0; middle: increment to 3
3448 * 0 3 5 first: increment to 1
3449 * 1 3 5 first: increment to 2
3450 * 2 3 5 first: reset to 0; middle: increment to 4
3451 * 0 4 5 first: increment to 1
3452 * 1 4 5 first: increment to 2
3453 * 2 4 5 first: increment to 3
3454 * 3 4 5 done
3455 *
3456 * This strategy works for dRAID but is less efficient when there are a large
3457 * number of child vdevs and therefore permutations to check. Furthermore,
3458 * since the raidz_map_t rows likely do not overlap, reconstruction would be
3459 * possible as long as there are no more than nparity data errors per row.
3460 * These additional permutations are not currently checked but could be as
3461 * a future improvement.
3462 *
3463 * Returns 0 on success, ECKSUM on failure.
3464 */
3465 static int
vdev_raidz_combrec(zio_t * zio)3466 vdev_raidz_combrec(zio_t *zio)
3467 {
3468 int nparity = vdev_get_nparity(zio->io_vd);
3469 raidz_map_t *rm = zio->io_vsd;
3470 int physical_width = zio->io_vd->vdev_children;
3471 int original_width = (rm->rm_original_width != 0) ?
3472 rm->rm_original_width : physical_width;
3473
3474 for (int i = 0; i < rm->rm_nrows; i++) {
3475 raidz_row_t *rr = rm->rm_row[i];
3476 int total_errors = 0;
3477
3478 for (int c = 0; c < rr->rr_cols; c++) {
3479 if (rr->rr_col[c].rc_error)
3480 total_errors++;
3481 }
3482
3483 if (total_errors > nparity)
3484 return (vdev_raidz_worst_error(rr));
3485 }
3486
3487 for (int num_failures = 1; num_failures <= nparity; num_failures++) {
3488 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
3489 int *ltgts = &tstore[1]; /* value is logical child ID */
3490
3491
3492 /*
3493 * Determine number of logical children, n. See comment
3494 * above raidz_simulate_failure().
3495 */
3496 int n = 0;
3497 for (int w = physical_width;
3498 w >= original_width; w--) {
3499 n += w;
3500 }
3501
3502 ASSERT3U(num_failures, <=, nparity);
3503 ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);
3504
3505 /* Handle corner cases in combrec logic */
3506 ltgts[-1] = -1;
3507 for (int i = 0; i < num_failures; i++) {
3508 ltgts[i] = i;
3509 }
3510 ltgts[num_failures] = n;
3511
3512 for (;;) {
3513 int err = raidz_reconstruct(zio, ltgts, num_failures,
3514 nparity);
3515 if (err == EINVAL) {
3516 /*
3517 * Reconstruction not possible with this #
3518 * failures; try more failures.
3519 */
3520 break;
3521 } else if (err == 0)
3522 return (0);
3523
3524 /* Compute next targets to try */
3525 for (int t = 0; ; t++) {
3526 ASSERT3U(t, <, num_failures);
3527 ltgts[t]++;
3528 if (ltgts[t] == n) {
3529 /* try more failures */
3530 ASSERT3U(t, ==, num_failures - 1);
3531 if (zfs_flags &
3532 ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
3533 zfs_dbgmsg("reconstruction "
3534 "failed for num_failures="
3535 "%u; tried all "
3536 "combinations",
3537 num_failures);
3538 }
3539 break;
3540 }
3541
3542 ASSERT3U(ltgts[t], <, n);
3543 ASSERT3U(ltgts[t], <=, ltgts[t + 1]);
3544
3545 /*
3546 * If that spot is available, we're done here.
3547 * Try the next combination.
3548 */
3549 if (ltgts[t] != ltgts[t + 1])
3550 break; // found next combination
3551
3552 /*
3553 * Otherwise, reset this tgt to the minimum,
3554 * and move on to the next tgt.
3555 */
3556 ltgts[t] = ltgts[t - 1] + 1;
3557 ASSERT3U(ltgts[t], ==, t);
3558 }
3559
3560 /* Increase the number of failures and keep trying. */
3561 if (ltgts[num_failures - 1] == n)
3562 break;
3563 }
3564 }
3565 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
3566 zfs_dbgmsg("reconstruction failed for all num_failures");
3567 return (ECKSUM);
3568 }
3569
3570 void
vdev_raidz_reconstruct(raidz_map_t * rm,const int * t,int nt)3571 vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
3572 {
3573 for (uint64_t row = 0; row < rm->rm_nrows; row++) {
3574 raidz_row_t *rr = rm->rm_row[row];
3575 vdev_raidz_reconstruct_row(rm, rr, t, nt);
3576 }
3577 }
3578
3579 /*
3580 * Complete a write IO operation on a RAIDZ VDev
3581 *
3582 * Outline:
3583 * 1. Check for errors on the child IOs.
3584 * 2. Return, setting an error code if too few child VDevs were written
3585 * to reconstruct the data later. Note that partial writes are
3586 * considered successful if they can be reconstructed at all.
3587 */
3588 static void
vdev_raidz_io_done_write_impl(zio_t * zio,raidz_row_t * rr)3589 vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
3590 {
3591 int normal_errors = 0;
3592 int shadow_errors = 0;
3593
3594 ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
3595 ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
3596 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
3597
3598 for (int c = 0; c < rr->rr_cols; c++) {
3599 raidz_col_t *rc = &rr->rr_col[c];
3600
3601 if (rc->rc_error != 0) {
3602 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
3603 normal_errors++;
3604 }
3605 if (rc->rc_shadow_error != 0) {
3606 ASSERT(rc->rc_shadow_error != ECKSUM);
3607 shadow_errors++;
3608 }
3609 }
3610
3611 /*
3612 * Treat partial writes as a success. If we couldn't write enough
3613 * columns to reconstruct the data, the I/O failed. Otherwise, good
3614 * enough. Note that in the case of a shadow write (during raidz
3615 * expansion), depending on if we crash, either the normal (old) or
3616 * shadow (new) location may become the "real" version of the block,
3617 * so both locations must have sufficient redundancy.
3618 *
3619 * Now that we support write reallocation, it would be better
3620 * to treat partial failure as real failure unless there are
3621 * no non-degraded top-level vdevs left, and not update DTLs
3622 * if we intend to reallocate.
3623 */
3624 if (normal_errors > rr->rr_firstdatacol ||
3625 shadow_errors > rr->rr_firstdatacol) {
3626 zio->io_error = zio_worst_error(zio->io_error,
3627 vdev_raidz_worst_error(rr));
3628 }
3629 }
3630
3631 static void
vdev_raidz_io_done_reconstruct_known_missing(zio_t * zio,raidz_map_t * rm,raidz_row_t * rr)3632 vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
3633 raidz_row_t *rr)
3634 {
3635 int parity_errors = 0;
3636 int parity_untried = 0;
3637 int data_errors = 0;
3638 int total_errors = 0;
3639
3640 ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
3641 ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
3642
3643 for (int c = 0; c < rr->rr_cols; c++) {
3644 raidz_col_t *rc = &rr->rr_col[c];
3645
3646 /*
3647 * If scrubbing and a replacing/sparing child vdev determined
3648 * that not all of its children have an identical copy of the
3649 * data, then clear the error so the column is treated like
3650 * any other read and force a repair to correct the damage.
3651 */
3652 if (rc->rc_error == ECKSUM) {
3653 ASSERT(zio->io_flags & ZIO_FLAG_SCRUB);
3654 vdev_raidz_checksum_error(zio, rc, rc->rc_abd);
3655 rc->rc_force_repair = 1;
3656 rc->rc_error = 0;
3657 }
3658
3659 if (rc->rc_error) {
3660 if (c < rr->rr_firstdatacol)
3661 parity_errors++;
3662 else
3663 data_errors++;
3664
3665 total_errors++;
3666 } else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
3667 parity_untried++;
3668 }
3669 }
3670
3671 /*
3672 * If there were data errors and the number of errors we saw was
3673 * correctable -- less than or equal to the number of parity disks read
3674 * -- reconstruct based on the missing data.
3675 */
3676 if (data_errors != 0 &&
3677 total_errors <= rr->rr_firstdatacol - parity_untried) {
3678 /*
3679 * We either attempt to read all the parity columns or
3680 * none of them. If we didn't try to read parity, we
3681 * wouldn't be here in the correctable case. There must
3682 * also have been fewer parity errors than parity
3683 * columns or, again, we wouldn't be in this code path.
3684 */
3685 ASSERT0(parity_untried);
3686 ASSERT(parity_errors < rr->rr_firstdatacol);
3687
3688 /*
3689 * Identify the data columns that reported an error.
3690 */
3691 int n = 0;
3692 int tgts[VDEV_RAIDZ_MAXPARITY];
3693 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
3694 raidz_col_t *rc = &rr->rr_col[c];
3695 if (rc->rc_error != 0) {
3696 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
3697 tgts[n++] = c;
3698 }
3699 }
3700
3701 ASSERT(rr->rr_firstdatacol >= n);
3702
3703 vdev_raidz_reconstruct_row(rm, rr, tgts, n);
3704 }
3705 }
3706
3707 /*
3708 * Return the number of reads issued.
3709 */
3710 static int
vdev_raidz_read_all(zio_t * zio,raidz_row_t * rr)3711 vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
3712 {
3713 vdev_t *vd = zio->io_vd;
3714 int nread = 0;
3715
3716 rr->rr_missingdata = 0;
3717 rr->rr_missingparity = 0;
3718
3719 /*
3720 * If this rows contains empty sectors which are not required
3721 * for a normal read then allocate an ABD for them now so they
3722 * may be read, verified, and any needed repairs performed.
3723 */
3724 if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL)
3725 vdev_draid_map_alloc_empty(zio, rr);
3726
3727 for (int c = 0; c < rr->rr_cols; c++) {
3728 raidz_col_t *rc = &rr->rr_col[c];
3729 if (rc->rc_tried || rc->rc_size == 0)
3730 continue;
3731
3732 zio_nowait(zio_vdev_child_io(zio, NULL,
3733 vd->vdev_child[rc->rc_devidx],
3734 rc->rc_offset, rc->rc_abd, rc->rc_size,
3735 zio->io_type, zio->io_priority, 0,
3736 vdev_raidz_child_done, rc));
3737 nread++;
3738 }
3739 return (nread);
3740 }
3741
3742 /*
3743 * We're here because either there were too many errors to even attempt
3744 * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec()
3745 * failed. In either case, there is enough bad data to prevent reconstruction.
3746 * Start checksum ereports for all children which haven't failed.
3747 */
3748 static void
vdev_raidz_io_done_unrecoverable(zio_t * zio)3749 vdev_raidz_io_done_unrecoverable(zio_t *zio)
3750 {
3751 raidz_map_t *rm = zio->io_vsd;
3752
3753 for (int i = 0; i < rm->rm_nrows; i++) {
3754 raidz_row_t *rr = rm->rm_row[i];
3755
3756 for (int c = 0; c < rr->rr_cols; c++) {
3757 raidz_col_t *rc = &rr->rr_col[c];
3758 vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
3759
3760 if (rc->rc_error != 0)
3761 continue;
3762
3763 zio_bad_cksum_t zbc;
3764 zbc.zbc_has_cksum = 0;
3765 zbc.zbc_injected = rm->rm_ecksuminjected;
3766 mutex_enter(&cvd->vdev_stat_lock);
3767 cvd->vdev_stat.vs_checksum_errors++;
3768 mutex_exit(&cvd->vdev_stat_lock);
3769 (void) zfs_ereport_start_checksum(zio->io_spa,
3770 cvd, &zio->io_bookmark, zio, rc->rc_offset,
3771 rc->rc_size, &zbc);
3772 }
3773 }
3774 }
3775
3776 void
vdev_raidz_io_done(zio_t * zio)3777 vdev_raidz_io_done(zio_t *zio)
3778 {
3779 raidz_map_t *rm = zio->io_vsd;
3780
3781 ASSERT(zio->io_bp != NULL);
3782 if (zio->io_type == ZIO_TYPE_WRITE) {
3783 for (int i = 0; i < rm->rm_nrows; i++) {
3784 vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
3785 }
3786 } else {
3787 if (rm->rm_phys_col) {
3788 /*
3789 * This is an aggregated read. Copy the data and status
3790 * from the aggregate abd's to the individual rows.
3791 */
3792 for (int i = 0; i < rm->rm_nrows; i++) {
3793 raidz_row_t *rr = rm->rm_row[i];
3794
3795 for (int c = 0; c < rr->rr_cols; c++) {
3796 raidz_col_t *rc = &rr->rr_col[c];
3797 if (rc->rc_tried || rc->rc_size == 0)
3798 continue;
3799
3800 raidz_col_t *prc =
3801 &rm->rm_phys_col[rc->rc_devidx];
3802 rc->rc_error = prc->rc_error;
3803 rc->rc_tried = prc->rc_tried;
3804 rc->rc_skipped = prc->rc_skipped;
3805 if (c >= rr->rr_firstdatacol) {
3806 /*
3807 * Note: this is slightly faster
3808 * than using abd_copy_off().
3809 */
3810 char *physbuf = abd_to_buf(
3811 prc->rc_abd);
3812 void *physloc = physbuf +
3813 rc->rc_offset -
3814 prc->rc_offset;
3815
3816 abd_copy_from_buf(rc->rc_abd,
3817 physloc, rc->rc_size);
3818 }
3819 }
3820 }
3821 }
3822
3823 for (int i = 0; i < rm->rm_nrows; i++) {
3824 raidz_row_t *rr = rm->rm_row[i];
3825 vdev_raidz_io_done_reconstruct_known_missing(zio,
3826 rm, rr);
3827 }
3828
3829 if (raidz_checksum_verify(zio) == 0) {
3830 if (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR)
3831 goto done;
3832
3833 for (int i = 0; i < rm->rm_nrows; i++) {
3834 raidz_row_t *rr = rm->rm_row[i];
3835 vdev_raidz_io_done_verified(zio, rr);
3836 }
3837 /* Periodically check for a read outlier */
3838 if (zio->io_type == ZIO_TYPE_READ)
3839 vdev_child_slow_outlier(zio);
3840 zio_checksum_verified(zio);
3841 } else {
3842 /*
3843 * A sequential resilver has no checksum which makes
3844 * combinatoral reconstruction impossible. This code
3845 * path is unreachable since raidz_checksum_verify()
3846 * has no checksum to verify and must succeed.
3847 */
3848 ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD);
3849
3850 /*
3851 * This isn't a typical situation -- either we got a
3852 * read error or a child silently returned bad data.
3853 * Read every block so we can try again with as much
3854 * data and parity as we can track down. If we've
3855 * already been through once before, all children will
3856 * be marked as tried so we'll proceed to combinatorial
3857 * reconstruction.
3858 */
3859 int nread = 0;
3860 for (int i = 0; i < rm->rm_nrows; i++) {
3861 nread += vdev_raidz_read_all(zio,
3862 rm->rm_row[i]);
3863 }
3864 if (nread != 0) {
3865 /*
3866 * Normally our stage is VDEV_IO_DONE, but if
3867 * we've already called redone(), it will have
3868 * changed to VDEV_IO_START, in which case we
3869 * don't want to call redone() again.
3870 */
3871 if (zio->io_stage != ZIO_STAGE_VDEV_IO_START)
3872 zio_vdev_io_redone(zio);
3873 return;
3874 }
3875 /*
3876 * It would be too expensive to try every possible
3877 * combination of failed sectors in every row, so
3878 * instead we try every combination of failed current or
3879 * past physical disk. This means that if the incorrect
3880 * sectors were all on Nparity disks at any point in the
3881 * past, we will find the correct data. The only known
3882 * case where this is less durable than a non-expanded
3883 * RAIDZ, is if we have a silent failure during
3884 * expansion. In that case, one block could be
3885 * partially in the old format and partially in the
3886 * new format, so we'd lost some sectors from the old
3887 * format and some from the new format.
3888 *
3889 * e.g. logical_width=4 physical_width=6
3890 * the 15 (6+5+4) possible failed disks are:
3891 * width=6 child=0
3892 * width=6 child=1
3893 * width=6 child=2
3894 * width=6 child=3
3895 * width=6 child=4
3896 * width=6 child=5
3897 * width=5 child=0
3898 * width=5 child=1
3899 * width=5 child=2
3900 * width=5 child=3
3901 * width=5 child=4
3902 * width=4 child=0
3903 * width=4 child=1
3904 * width=4 child=2
3905 * width=4 child=3
3906 * And we will try every combination of Nparity of these
3907 * failing.
3908 *
3909 * As a first pass, we can generate every combo,
3910 * and try reconstructing, ignoring any known
3911 * failures. If any row has too many known + simulated
3912 * failures, then we bail on reconstructing with this
3913 * number of simulated failures. As an improvement,
3914 * we could detect the number of whole known failures
3915 * (i.e. we have known failures on these disks for
3916 * every row; the disks never succeeded), and
3917 * subtract that from the max # failures to simulate.
3918 * We could go even further like the current
3919 * combrec code, but that doesn't seem like it
3920 * gains us very much. If we simulate a failure
3921 * that is also a known failure, that's fine.
3922 */
3923 zio->io_error = vdev_raidz_combrec(zio);
3924 if (zio->io_error == ECKSUM &&
3925 !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
3926 vdev_raidz_io_done_unrecoverable(zio);
3927 }
3928 }
3929 }
3930 done:
3931 if (rm->rm_lr != NULL) {
3932 zfs_rangelock_exit(rm->rm_lr);
3933 rm->rm_lr = NULL;
3934 }
3935 }
3936
3937 static void
vdev_raidz_state_change(vdev_t * vd,int faulted,int degraded)3938 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
3939 {
3940 vdev_raidz_t *vdrz = vd->vdev_tsd;
3941 if (faulted > vdrz->vd_nparity)
3942 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3943 VDEV_AUX_NO_REPLICAS);
3944 else if (degraded + faulted != 0)
3945 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
3946 else
3947 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
3948 }
3949
3950 /*
3951 * Determine if any portion of the provided block resides on a child vdev
3952 * with a dirty DTL and therefore needs to be resilvered. The function
3953 * assumes that at least one DTL is dirty which implies that full stripe
3954 * width blocks must be resilvered.
3955 */
3956 static boolean_t
vdev_raidz_need_resilver(vdev_t * vd,const dva_t * dva,size_t psize,uint64_t phys_birth)3957 vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
3958 uint64_t phys_birth)
3959 {
3960 vdev_raidz_t *vdrz = vd->vdev_tsd;
3961
3962 /*
3963 * If we're in the middle of a RAIDZ expansion, this block may be in
3964 * the old and/or new location. For simplicity, always resilver it.
3965 */
3966 if (vdrz->vn_vre.vre_state == DSS_SCANNING)
3967 return (B_TRUE);
3968
3969 uint64_t dcols = vd->vdev_children;
3970 uint64_t nparity = vdrz->vd_nparity;
3971 uint64_t ashift = vd->vdev_top->vdev_ashift;
3972 /* The starting RAIDZ (parent) vdev sector of the block. */
3973 uint64_t b = DVA_GET_OFFSET(dva) >> ashift;
3974 /* The zio's size in units of the vdev's minimum sector size. */
3975 uint64_t s = ((psize - 1) >> ashift) + 1;
3976 /* The first column for this stripe. */
3977 uint64_t f = b % dcols;
3978
3979 /* Unreachable by sequential resilver. */
3980 ASSERT3U(phys_birth, !=, TXG_UNKNOWN);
3981
3982 if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
3983 return (B_FALSE);
3984
3985 if (s + nparity >= dcols)
3986 return (B_TRUE);
3987
3988 for (uint64_t c = 0; c < s + nparity; c++) {
3989 uint64_t devidx = (f + c) % dcols;
3990 vdev_t *cvd = vd->vdev_child[devidx];
3991
3992 /*
3993 * dsl_scan_need_resilver() already checked vd with
3994 * vdev_dtl_contains(). So here just check cvd with
3995 * vdev_dtl_empty(), cheaper and a good approximation.
3996 */
3997 if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
3998 return (B_TRUE);
3999 }
4000
4001 return (B_FALSE);
4002 }
4003
4004 static void
vdev_raidz_xlate(vdev_t * cvd,const zfs_range_seg64_t * logical_rs,zfs_range_seg64_t * physical_rs,zfs_range_seg64_t * remain_rs)4005 vdev_raidz_xlate(vdev_t *cvd, const zfs_range_seg64_t *logical_rs,
4006 zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs)
4007 {
4008 (void) remain_rs;
4009
4010 vdev_t *raidvd = cvd->vdev_parent;
4011 ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
4012
4013 vdev_raidz_t *vdrz = raidvd->vdev_tsd;
4014
4015 if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
4016 /*
4017 * We're in the middle of expansion, in which case the
4018 * translation is in flux. Any answer we give may be wrong
4019 * by the time we return, so it isn't safe for the caller to
4020 * act on it. Therefore we say that this range isn't present
4021 * on any children. The only consumers of this are "zpool
4022 * initialize" and trimming, both of which are "best effort"
4023 * anyway.
4024 */
4025 physical_rs->rs_start = physical_rs->rs_end = 0;
4026 remain_rs->rs_start = remain_rs->rs_end = 0;
4027 return;
4028 }
4029
4030 uint64_t width = vdrz->vd_physical_width;
4031 uint64_t tgt_col = cvd->vdev_id;
4032 uint64_t ashift = raidvd->vdev_top->vdev_ashift;
4033
4034 /* make sure the offsets are block-aligned */
4035 ASSERT0(logical_rs->rs_start % (1 << ashift));
4036 ASSERT0(logical_rs->rs_end % (1 << ashift));
4037 uint64_t b_start = logical_rs->rs_start >> ashift;
4038 uint64_t b_end = logical_rs->rs_end >> ashift;
4039
4040 uint64_t start_row = 0;
4041 if (b_start > tgt_col) /* avoid underflow */
4042 start_row = ((b_start - tgt_col - 1) / width) + 1;
4043
4044 uint64_t end_row = 0;
4045 if (b_end > tgt_col)
4046 end_row = ((b_end - tgt_col - 1) / width) + 1;
4047
4048 physical_rs->rs_start = start_row << ashift;
4049 physical_rs->rs_end = end_row << ashift;
4050
4051 ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start);
4052 ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
4053 logical_rs->rs_end - logical_rs->rs_start);
4054 }
4055
/*
 * Commit the reflow progress recorded for this txg.  `arg` is the spa_t.
 *
 * The new reflow offset is the progress recorded for this txg in
 * vre_offset_pertxg[], capped by vre_failed_offset.  It is stored in
 * spa_uberblock with state RRSS_SCRATCH_INVALID_SYNCED_REFLOW.  This txg's
 * copied-bytes counter is then folded into vre_bytes_copied and the running
 * total is written to the top-level vdev's ZAP.
 *
 * Scheduled through dsl_sync_task_nowait() by raidz_reflow_record_progress()
 * and also called directly by raidz_reflow_scratch_sync().
 */
static void
raidz_reflow_sync(void *arg, dmu_tx_t *tx)
{
	spa_t *spa = arg;
	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;

	/*
	 * Ensure there are no i/os to the range that is being committed.
	 */
	uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock);
	ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset);

	/* vre_failed_offset is updated under vre_lock, so read it there. */
	mutex_enter(&vre->vre_lock);
	uint64_t new_offset =
	    MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset);
	/*
	 * We should not have committed anything that failed.
	 */
	VERIFY3U(vre->vre_failed_offset, >=, old_offset);
	mutex_exit(&vre->vre_lock);

	/*
	 * Hold [old_offset, new_offset) as a writer while the uberblock's
	 * reflow offset is advanced across it.
	 */
	zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
	    old_offset, new_offset - old_offset,
	    RL_WRITER);

	/*
	 * Update the uberblock that will be written when this txg completes.
	 */
	RAIDZ_REFLOW_SET(&spa->spa_uberblock,
	    RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset);
	/* Zero marks this txg slot as having no pending progress. */
	vre->vre_offset_pertxg[txgoff] = 0;
	zfs_rangelock_exit(lr);

	/* Fold this txg's byte count into the running total. */
	mutex_enter(&vre->vre_lock);
	vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff];
	vre->vre_bytes_copied_pertxg[txgoff] = 0;
	mutex_exit(&vre->vre_lock);

	/* Persist the running total in the top-level vdev's ZAP. */
	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
	VERIFY0(zap_update(spa->spa_meta_objset,
	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
	    sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx));
}
4100
/*
 * Finish a RAIDZ expansion.  `arg` is the spa_t.
 *
 * Adds a reflow_node_t (tx_txg + TXG_CONCURRENT_STATES, current physical
 * width) to vd_expand_txgs and dirties the vdev config, marks the expansion
 * DSS_FINISHED and persists its state and end time in the top-level vdev's
 * ZAP, clears the reflow info in spa_uberblock, logs the completion, detaches
 * the expansion state from the spa, requests a restart of initialize, trim
 * and autotrim, wakes waiters, and finally sets up a scrub if
 * zfs_scrub_after_expand is set and the scan setup check passes.
 */
static void
raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx)
{
	spa_t *spa = arg;
	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
	vdev_raidz_t *vdrz = raidvd->vdev_tsd;

	/* Every per-txg progress slot must already have been synced out. */
	for (int i = 0; i < TXG_SIZE; i++)
		VERIFY0(vre->vre_offset_pertxg[i]);

	/* Record the txg and width of this expansion in vd_expand_txgs. */
	reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
	re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES;
	re->re_logical_width = vdrz->vd_physical_width;
	mutex_enter(&vdrz->vd_expand_lock);
	avl_add(&vdrz->vd_expand_txgs, re);
	mutex_exit(&vdrz->vd_expand_lock);

	/* Same lookup as raidvd above. */
	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);

	/*
	 * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS
	 * will get written (based on vd_expand_txgs).
	 */
	vdev_config_dirty(vd);

	/*
	 * Before we change vre_state, the on-disk state must reflect that we
	 * have completed all copying, so that vdev_raidz_io_start() can use
	 * vre_state to determine if the reflow is in progress. See also the
	 * end of spa_raidz_expand_thread().
	 */
	VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==,
	    raidvd->vdev_ms_count << raidvd->vdev_ms_shift);

	vre->vre_end_time = gethrestime_sec();
	vre->vre_state = DSS_FINISHED;

	/* Copy into a uint64_t local so zap_update() gets an 8-byte value. */
	uint64_t state = vre->vre_state;
	VERIFY0(zap_update(spa->spa_meta_objset,
	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
	    sizeof (state), 1, &state, tx));

	uint64_t end_time = vre->vre_end_time;
	VERIFY0(zap_update(spa->spa_meta_objset,
	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
	    sizeof (end_time), 1, &end_time, tx));

	/* No reflow in progress any more. */
	spa->spa_uberblock.ub_raidz_reflow_info = 0;

	spa_history_log_internal(spa, "raidz vdev expansion completed", tx,
	    "%s vdev %llu new width %llu", spa_name(spa),
	    (unsigned long long)vd->vdev_id,
	    (unsigned long long)vd->vdev_children);

	spa->spa_raidz_expand = NULL;
	raidvd->vdev_rz_expanding = B_FALSE;

	spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
	spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
	spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);

	spa_notify_waiters(spa);

	/*
	 * While we're in syncing context take the opportunity to
	 * set up a scrub. All the data has been successfully copied
	 * but we have not validated any checksums.
	 */
	setup_sync_arg_t setup_sync_arg = {
		.func = POOL_SCAN_SCRUB,
		.txgstart = 0,
		.txgend = 0,
	};
	if (zfs_scrub_after_expand &&
	    dsl_scan_setup_check(&setup_sync_arg.func, tx) == 0) {
		dsl_scan_setup_sync(&setup_sync_arg, tx);
	}
}
4180
/*
 * State of one copy batch.  Allocated by raidz_reflow_impl() with room for
 * rra_writes entries in rra_zio[], and freed by raidz_reflow_write_done()
 * once the last write of the batch completes.
 */
typedef struct raidz_reflow_arg {
	vdev_raidz_expand_t *rra_vre;	/* Global expansion state. */
	zfs_locked_range_t *rra_lr;	/* Range lock of this batch. */
	uint64_t rra_txg;		/* TXG of this batch. */
	uint_t rra_ashift;		/* Ashift of the vdev. */
	/* In-flight ZIOs: first the pending reads, then the pending writes. */
	uint32_t rra_tbd;		/* Number of in-flight ZIOs. */
	uint32_t rra_writes;		/* Number of write ZIOs. */
	zio_t *rra_zio[];		/* Write ZIO pointers. */
} raidz_reflow_arg_t;
4193
/*
 * Write of the new location on one child is done. Once all of them are done
 * we can unlock and free everything.
 *
 * Done callback of the write zios created by raidz_reflow_impl();
 * zio->io_private is the batch's raidz_reflow_arg_t.
 */
static void
raidz_reflow_write_done(zio_t *zio)
{
	raidz_reflow_arg_t *rra = zio->io_private;
	vdev_raidz_expand_t *vre = rra->rra_vre;

	/* The write buffer was allocated per zio by raidz_reflow_impl(). */
	abd_free(zio->io_abd);

	mutex_enter(&vre->vre_lock);
	if (zio->io_error != 0) {
		/* Force a reflow pause on errors */
		vre->vre_failed_offset =
		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
	}
	ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size);
	vre->vre_outstanding_bytes -= zio->io_size;
	/*
	 * Count the bytes as copied only if the whole batch ends below the
	 * lowest failed offset.
	 */
	if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length <
	    vre->vre_failed_offset) {
		vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] +=
		    zio->io_size;
	}
	cv_signal(&vre->vre_cv);
	/* rra_tbd counts the batch's outstanding writes at this point. */
	boolean_t done = (--rra->rra_tbd == 0);
	mutex_exit(&vre->vre_lock);

	if (!done)
		return;
	/*
	 * Last write of the batch: drop what raidz_reflow_impl() acquired.
	 * SCL_STATE was entered there with the spa itself as the tag.
	 */
	spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
	zfs_rangelock_exit(rra->rra_lr);
	/* Size matches the allocation: header plus rra_writes zio pointers. */
	kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * rra->rra_writes);
}
4229
/*
 * Read of the old location on one child is done. Once all of them are done
 * writes should have all the data and we can issue them.
 *
 * Done callback of the read zios issued by raidz_reflow_impl();
 * zio->io_private is the batch's raidz_reflow_arg_t.
 */
static void
raidz_reflow_read_done(zio_t *zio)
{
	raidz_reflow_arg_t *rra = zio->io_private;
	vdev_raidz_expand_t *vre = rra->rra_vre;

	/*
	 * A read of a single block borrowed the ABD of a write zio, which
	 * raidz_reflow_write_done() frees.  A larger read used a gang ABD
	 * allocated just for the read, so free it here.
	 */
	if (zio->io_size > (1 << rra->rra_ashift))
		abd_free(zio->io_abd);

	/*
	 * If the read failed, or if it was done on a vdev that is not fully
	 * healthy (e.g. a child that has a resilver in progress), we may not
	 * have the correct data. Note that it's OK if the write proceeds.
	 * It may write garbage but the location is otherwise unused and we
	 * will retry later due to vre_failed_offset.
	 */
	if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) {
		zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu "
		    "err=%u partial_dtl_empty=%u missing_dtl_empty=%u",
		    (long long)rra->rra_lr->lr_offset,
		    (long long)rra->rra_lr->lr_length,
		    (long long)rra->rra_txg,
		    zio->io_error,
		    vdev_dtl_empty(zio->io_vd, DTL_PARTIAL),
		    vdev_dtl_empty(zio->io_vd, DTL_MISSING));
		mutex_enter(&vre->vre_lock);
		/* Force a reflow pause on errors */
		vre->vre_failed_offset =
		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
		mutex_exit(&vre->vre_lock);
	}

	/* Only the last read of the batch goes on to issue the writes. */
	if (atomic_dec_32_nv(&rra->rra_tbd) > 0)
		return;
	/*
	 * Re-arm rra_tbd with the number of writes.  The count is cached in a
	 * local because raidz_reflow_write_done() may free rra once the
	 * writes have been issued.
	 */
	uint32_t writes = rra->rra_tbd = rra->rra_writes;
	for (uint64_t i = 0; i < writes; i++)
		zio_nowait(rra->rra_zio[i]);
}
4273
4274 static void
raidz_reflow_record_progress(vdev_raidz_expand_t * vre,uint64_t offset,dmu_tx_t * tx)4275 raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset,
4276 dmu_tx_t *tx)
4277 {
4278 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4279 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
4280
4281 if (offset == 0)
4282 return;
4283
4284 mutex_enter(&vre->vre_lock);
4285 ASSERT3U(vre->vre_offset, <=, offset);
4286 vre->vre_offset = offset;
4287 mutex_exit(&vre->vre_lock);
4288
4289 if (vre->vre_offset_pertxg[txgoff] == 0) {
4290 dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync,
4291 spa, tx);
4292 }
4293 vre->vre_offset_pertxg[txgoff] = offset;
4294 }
4295
4296 static boolean_t
vdev_raidz_expand_child_replacing(vdev_t * raidz_vd)4297 vdev_raidz_expand_child_replacing(vdev_t *raidz_vd)
4298 {
4299 for (int i = 0; i < raidz_vd->vdev_children; i++) {
4300 /* Quick check if a child is being replaced */
4301 if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf)
4302 return (B_TRUE);
4303 }
4304 return (B_FALSE);
4305 }
4306
4307 static boolean_t
raidz_reflow_impl(vdev_t * vd,vdev_raidz_expand_t * vre,zfs_range_tree_t * rt,dmu_tx_t * tx)4308 raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, zfs_range_tree_t *rt,
4309 dmu_tx_t *tx)
4310 {
4311 spa_t *spa = vd->vdev_spa;
4312 uint_t ashift = vd->vdev_top->vdev_ashift;
4313
4314 zfs_range_seg_t *rs = zfs_range_tree_first(rt);
4315 if (rt == NULL)
4316 return (B_FALSE);
4317 uint64_t offset = zfs_rs_get_start(rs, rt);
4318 ASSERT(IS_P2ALIGNED(offset, 1 << ashift));
4319 uint64_t size = zfs_rs_get_end(rs, rt) - offset;
4320 ASSERT3U(size, >=, 1 << ashift);
4321 ASSERT(IS_P2ALIGNED(size, 1 << ashift));
4322
4323 uint64_t blkid = offset >> ashift;
4324 uint_t old_children = vd->vdev_children - 1;
4325
4326 /*
4327 * We can only progress to the point that writes will not overlap
4328 * with blocks whose progress has not yet been recorded on disk.
4329 * Since partially-copied rows are still read from the old location,
4330 * we need to stop one row before the sector-wise overlap, to prevent
4331 * row-wise overlap.
4332 *
4333 * Note that even if we are skipping over a large unallocated region,
4334 * we can't move the on-disk progress to `offset`, because concurrent
4335 * writes/allocations could still use the currently-unallocated
4336 * region.
4337 */
4338 uint64_t ubsync_blkid =
4339 RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift;
4340 uint64_t next_overwrite_blkid = ubsync_blkid +
4341 ubsync_blkid / old_children - old_children;
4342 VERIFY3U(next_overwrite_blkid, >, ubsync_blkid);
4343 if (blkid >= next_overwrite_blkid) {
4344 raidz_reflow_record_progress(vre,
4345 next_overwrite_blkid << ashift, tx);
4346 return (B_TRUE);
4347 }
4348
4349 size = MIN(size, raidz_expand_max_copy_bytes);
4350 size = MIN(size, (uint64_t)old_children *
4351 MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE));
4352 size = MAX(size, 1 << ashift);
4353 uint_t blocks = MIN(size >> ashift, next_overwrite_blkid - blkid);
4354 size = (uint64_t)blocks << ashift;
4355
4356 zfs_range_tree_remove(rt, offset, size);
4357
4358 uint_t reads = MIN(blocks, old_children);
4359 uint_t writes = MIN(blocks, vd->vdev_children);
4360 raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra) +
4361 sizeof (zio_t *) * writes, KM_SLEEP);
4362 rra->rra_vre = vre;
4363 rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock,
4364 offset, size, RL_WRITER);
4365 rra->rra_txg = dmu_tx_get_txg(tx);
4366 rra->rra_ashift = ashift;
4367 rra->rra_tbd = reads;
4368 rra->rra_writes = writes;
4369
4370 raidz_reflow_record_progress(vre, offset + size, tx);
4371
4372 /*
4373 * SCL_STATE will be released when the read and write are done,
4374 * by raidz_reflow_write_done().
4375 */
4376 spa_config_enter(spa, SCL_STATE, spa, RW_READER);
4377
4378 /* check if a replacing vdev was added, if so treat it as an error */
4379 if (vdev_raidz_expand_child_replacing(vd)) {
4380 zfs_dbgmsg("replacing vdev encountered, reflow paused at "
4381 "offset=%llu txg=%llu",
4382 (long long)rra->rra_lr->lr_offset,
4383 (long long)rra->rra_txg);
4384
4385 mutex_enter(&vre->vre_lock);
4386 vre->vre_failed_offset =
4387 MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
4388 cv_signal(&vre->vre_cv);
4389 mutex_exit(&vre->vre_lock);
4390
4391 /* drop everything we acquired */
4392 spa_config_exit(spa, SCL_STATE, spa);
4393 zfs_rangelock_exit(rra->rra_lr);
4394 kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * writes);
4395 return (B_TRUE);
4396 }
4397
4398 mutex_enter(&vre->vre_lock);
4399 vre->vre_outstanding_bytes += size;
4400 mutex_exit(&vre->vre_lock);
4401
4402 /* Allocate ABD and ZIO for each child we write. */
4403 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4404 zio_t *pio = spa->spa_txg_zio[txgoff];
4405 uint_t b = blocks / vd->vdev_children;
4406 uint_t bb = blocks % vd->vdev_children;
4407 for (uint_t i = 0; i < writes; i++) {
4408 uint_t n = b + (i < bb);
4409 abd_t *abd = abd_alloc_for_io(n << ashift, B_FALSE);
4410 rra->rra_zio[i] = zio_vdev_child_io(pio, NULL,
4411 vd->vdev_child[(blkid + i) % vd->vdev_children],
4412 ((blkid + i) / vd->vdev_children) << ashift,
4413 abd, n << ashift, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
4414 ZIO_FLAG_CANFAIL, raidz_reflow_write_done, rra);
4415 }
4416
4417 /*
4418 * Allocate and issue ZIO for each child we read. For reads of only
4419 * one block we can use respective writer ABDs, since they will also
4420 * have only one block. For bigger reads create gang ABDs and fill
4421 * them with respective blocks from writer ABDs.
4422 */
4423 b = blocks / old_children;
4424 bb = blocks % old_children;
4425 for (uint_t i = 0; i < reads; i++) {
4426 uint_t n = b + (i < bb);
4427 abd_t *abd;
4428 if (n > 1) {
4429 abd = abd_alloc_gang();
4430 for (uint_t j = 0; j < n; j++) {
4431 uint_t b = j * old_children + i;
4432 abd_t *cabd = abd_get_offset_size(
4433 rra->rra_zio[b % vd->vdev_children]->io_abd,
4434 (b / vd->vdev_children) << ashift,
4435 1 << ashift);
4436 abd_gang_add(abd, cabd, B_TRUE);
4437 }
4438 } else {
4439 abd = rra->rra_zio[i]->io_abd;
4440 }
4441 zio_nowait(zio_vdev_child_io(pio, NULL,
4442 vd->vdev_child[(blkid + i) % old_children],
4443 ((blkid + i) / old_children) << ashift, abd,
4444 n << ashift, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
4445 ZIO_FLAG_CANFAIL, raidz_reflow_read_done, rra));
4446 }
4447
4448 return (B_FALSE);
4449 }
4450
4451 /*
4452 * For testing (ztest specific)
4453 */
4454 static void
raidz_expand_pause(uint_t pause_point)4455 raidz_expand_pause(uint_t pause_point)
4456 {
4457 while (raidz_expand_pause_point != 0 &&
4458 raidz_expand_pause_point <= pause_point)
4459 delay(hz);
4460 }
4461
4462 static void
raidz_scratch_child_done(zio_t * zio)4463 raidz_scratch_child_done(zio_t *zio)
4464 {
4465 zio_t *pio = zio->io_private;
4466
4467 mutex_enter(&pio->io_lock);
4468 pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
4469 mutex_exit(&pio->io_lock);
4470 }
4471
4472 /*
4473 * Reflow the beginning portion of the vdev into an intermediate scratch area
4474 * in memory and on disk. This operation must be persisted on disk before we
4475 * proceed to overwrite the beginning portion with the reflowed data.
4476 *
4477 * This multi-step task can fail to complete if disk errors are encountered
4478 * and we can return here after a pause (waiting for disk to become healthy).
4479 */
4480 static void
raidz_reflow_scratch_sync(void * arg,dmu_tx_t * tx)4481 raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
4482 {
4483 vdev_raidz_expand_t *vre = arg;
4484 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
4485 zio_t *pio;
4486 int error;
4487
4488 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4489 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4490 int ashift = raidvd->vdev_ashift;
4491 uint64_t write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << ashift,
4492 uint64_t);
4493 uint64_t logical_size = write_size * raidvd->vdev_children;
4494 uint64_t read_size =
4495 P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)),
4496 1 << ashift);
4497
4498 /*
4499 * The scratch space must be large enough to get us to the point
4500 * that one row does not overlap itself when moved. This is checked
4501 * by vdev_raidz_attach_check().
4502 */
4503 VERIFY3U(write_size, >=, raidvd->vdev_children << ashift);
4504 VERIFY3U(write_size, <=, VDEV_BOOT_SIZE);
4505 VERIFY3U(write_size, <=, read_size);
4506
4507 zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
4508 0, logical_size, RL_WRITER);
4509
4510 abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
4511 KM_SLEEP);
4512 for (int i = 0; i < raidvd->vdev_children; i++) {
4513 abds[i] = abd_alloc_linear(read_size, B_FALSE);
4514 }
4515
4516 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1);
4517
4518 /*
4519 * If we have already written the scratch area then we must read from
4520 * there, since new writes were redirected there while we were paused
4521 * or the original location may have been partially overwritten with
4522 * reflowed data.
4523 */
4524 if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) {
4525 VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size);
4526 /*
4527 * Read from scratch space.
4528 */
4529 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4530 for (int i = 0; i < raidvd->vdev_children; i++) {
4531 /*
4532 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE
4533 * to the offset to calculate the physical offset to
4534 * write to. Passing in a negative offset makes us
4535 * access the scratch area.
4536 */
4537 zio_nowait(zio_vdev_child_io(pio, NULL,
4538 raidvd->vdev_child[i],
4539 VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4540 write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
4541 ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
4542 }
4543 error = zio_wait(pio);
4544 if (error != 0) {
4545 zfs_dbgmsg("reflow: error %d reading scratch location",
4546 error);
4547 goto io_error_exit;
4548 }
4549 goto overwrite;
4550 }
4551
4552 /*
4553 * Read from original location.
4554 */
4555 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4556 for (int i = 0; i < raidvd->vdev_children - 1; i++) {
4557 ASSERT0(vdev_is_dead(raidvd->vdev_child[i]));
4558 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4559 0, abds[i], read_size, ZIO_TYPE_READ,
4560 ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
4561 raidz_scratch_child_done, pio));
4562 }
4563 error = zio_wait(pio);
4564 if (error != 0) {
4565 zfs_dbgmsg("reflow: error %d reading original location", error);
4566 io_error_exit:
4567 for (int i = 0; i < raidvd->vdev_children; i++)
4568 abd_free(abds[i]);
4569 kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4570 zfs_rangelock_exit(lr);
4571 spa_config_exit(spa, SCL_STATE, FTAG);
4572 return;
4573 }
4574
4575 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2);
4576
4577 /*
4578 * Reflow in memory.
4579 */
4580 uint64_t logical_sectors = logical_size >> ashift;
4581 for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) {
4582 int oldchild = i % (raidvd->vdev_children - 1);
4583 uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift;
4584
4585 int newchild = i % raidvd->vdev_children;
4586 uint64_t newoff = (i / raidvd->vdev_children) << ashift;
4587
4588 /* a single sector should not be copying over itself */
4589 ASSERT(!(newchild == oldchild && newoff == oldoff));
4590
4591 abd_copy_off(abds[newchild], abds[oldchild],
4592 newoff, oldoff, 1 << ashift);
4593 }
4594
4595 /*
4596 * Verify that we filled in everything we intended to (write_size on
4597 * each child).
4598 */
4599 VERIFY0(logical_sectors % raidvd->vdev_children);
4600 VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==,
4601 write_size);
4602
4603 /*
4604 * Write to scratch location (boot area).
4605 */
4606 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4607 for (int i = 0; i < raidvd->vdev_children; i++) {
4608 /*
4609 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
4610 * the offset to calculate the physical offset to write to.
4611 * Passing in a negative offset lets us access the boot area.
4612 */
4613 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4614 VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4615 write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
4616 ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
4617 }
4618 error = zio_wait(pio);
4619 if (error != 0) {
4620 zfs_dbgmsg("reflow: error %d writing scratch location", error);
4621 goto io_error_exit;
4622 }
4623 pio = zio_root(spa, NULL, NULL, 0);
4624 zio_flush(pio, raidvd);
4625 zio_wait(pio);
4626
4627 zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area",
4628 (long long)logical_size);
4629
4630 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3);
4631
4632 /*
4633 * Update uberblock to indicate that scratch space is valid. This is
4634 * needed because after this point, the real location may be
4635 * overwritten. If we crash, we need to get the data from the
4636 * scratch space, rather than the real location.
4637 *
4638 * Note: ub_timestamp is bumped so that vdev_uberblock_compare()
4639 * will prefer this uberblock.
4640 */
4641 RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size);
4642 spa->spa_ubsync.ub_timestamp++;
4643 ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4644 &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4645 if (spa_multihost(spa))
4646 mmp_update_uberblock(spa, &spa->spa_ubsync);
4647
4648 zfs_dbgmsg("reflow: uberblock updated "
4649 "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)",
4650 (long long)spa->spa_ubsync.ub_txg,
4651 (long long)logical_size,
4652 (long long)spa->spa_ubsync.ub_timestamp);
4653
4654 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID);
4655
4656 /*
4657 * Overwrite with reflow'ed data.
4658 */
4659 overwrite:
4660 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4661 for (int i = 0; i < raidvd->vdev_children; i++) {
4662 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4663 0, abds[i], write_size, ZIO_TYPE_WRITE,
4664 ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
4665 raidz_scratch_child_done, pio));
4666 }
4667 error = zio_wait(pio);
4668 if (error != 0) {
4669 /*
4670 * When we exit early here and drop the range lock, new
4671 * writes will go into the scratch area so we'll need to
4672 * read from there when we return after pausing.
4673 */
4674 zfs_dbgmsg("reflow: error %d writing real location", error);
4675 /*
4676 * Update the uberblock that is written when this txg completes.
4677 */
4678 RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID,
4679 logical_size);
4680 goto io_error_exit;
4681 }
4682 pio = zio_root(spa, NULL, NULL, 0);
4683 zio_flush(pio, raidvd);
4684 zio_wait(pio);
4685
4686 zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location",
4687 (long long)logical_size);
4688 for (int i = 0; i < raidvd->vdev_children; i++)
4689 abd_free(abds[i]);
4690 kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4691
4692 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED);
4693
4694 /*
4695 * Update uberblock to indicate that the initial part has been
4696 * reflow'ed. This is needed because after this point (when we exit
4697 * the rangelock), we allow regular writes to this region, which will
4698 * be written to the new location only (because reflow_offset_next ==
4699 * reflow_offset_synced). If we crashed and re-copied from the
4700 * scratch space, we would lose the regular writes.
4701 */
4702 RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED,
4703 logical_size);
4704 spa->spa_ubsync.ub_timestamp++;
4705 ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4706 &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4707 if (spa_multihost(spa))
4708 mmp_update_uberblock(spa, &spa->spa_ubsync);
4709
4710 zfs_dbgmsg("reflow: uberblock updated "
4711 "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
4712 (long long)spa->spa_ubsync.ub_txg,
4713 (long long)logical_size,
4714 (long long)spa->spa_ubsync.ub_timestamp);
4715
4716 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1);
4717
4718 /*
4719 * Update progress.
4720 */
4721 vre->vre_offset = logical_size;
4722 zfs_rangelock_exit(lr);
4723 spa_config_exit(spa, SCL_STATE, FTAG);
4724
4725 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4726 vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
4727 vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
4728 /*
4729 * Note - raidz_reflow_sync() will update the uberblock state to
4730 * RRSS_SCRATCH_INVALID_SYNCED_REFLOW
4731 */
4732 raidz_reflow_sync(spa, tx);
4733
4734 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2);
4735 }
4736
4737 /*
4738 * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work
4739 * here. No other i/o can be in progress, so we don't need the vre_rangelock.
4740 */
4741 void
vdev_raidz_reflow_copy_scratch(spa_t * spa)4742 vdev_raidz_reflow_copy_scratch(spa_t *spa)
4743 {
4744 vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4745 uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock);
4746 ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID);
4747
4748 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4749 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4750 ASSERT0(logical_size % raidvd->vdev_children);
4751 uint64_t write_size = logical_size / raidvd->vdev_children;
4752
4753 zio_t *pio;
4754
4755 /*
4756 * Read from scratch space.
4757 */
4758 abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
4759 KM_SLEEP);
4760 for (int i = 0; i < raidvd->vdev_children; i++) {
4761 abds[i] = abd_alloc_linear(write_size, B_FALSE);
4762 }
4763
4764 pio = zio_root(spa, NULL, NULL, 0);
4765 for (int i = 0; i < raidvd->vdev_children; i++) {
4766 /*
4767 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
4768 * the offset to calculate the physical offset to write to.
4769 * Passing in a negative offset lets us access the boot area.
4770 */
4771 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4772 VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4773 write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 0,
4774 raidz_scratch_child_done, pio));
4775 }
4776 zio_wait(pio);
4777
4778 /*
4779 * Overwrite real location with reflow'ed data.
4780 */
4781 pio = zio_root(spa, NULL, NULL, 0);
4782 for (int i = 0; i < raidvd->vdev_children; i++) {
4783 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4784 0, abds[i], write_size, ZIO_TYPE_WRITE,
4785 ZIO_PRIORITY_REMOVAL, 0,
4786 raidz_scratch_child_done, pio));
4787 }
4788 zio_wait(pio);
4789 pio = zio_root(spa, NULL, NULL, 0);
4790 zio_flush(pio, raidvd);
4791 zio_wait(pio);
4792
4793 zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) "
4794 "to real location", (long long)logical_size);
4795
4796 for (int i = 0; i < raidvd->vdev_children; i++)
4797 abd_free(abds[i]);
4798 kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4799
4800 /*
4801 * Update uberblock.
4802 */
4803 RAIDZ_REFLOW_SET(&spa->spa_ubsync,
4804 RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size);
4805 spa->spa_ubsync.ub_timestamp++;
4806 VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4807 &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4808 if (spa_multihost(spa))
4809 mmp_update_uberblock(spa, &spa->spa_ubsync);
4810
4811 zfs_dbgmsg("reflow recovery: uberblock updated "
4812 "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
4813 (long long)spa->spa_ubsync.ub_txg,
4814 (long long)logical_size,
4815 (long long)spa->spa_ubsync.ub_timestamp);
4816
4817 dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool,
4818 spa_first_txg(spa));
4819 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4820 vre->vre_offset = logical_size;
4821 vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
4822 vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
4823 /*
4824 * Note that raidz_reflow_sync() will update the uberblock once more
4825 */
4826 raidz_reflow_sync(spa, tx);
4827
4828 dmu_tx_commit(tx);
4829
4830 spa_config_exit(spa, SCL_STATE, FTAG);
4831 }
4832
4833 static boolean_t
spa_raidz_expand_thread_check(void * arg,zthr_t * zthr)4834 spa_raidz_expand_thread_check(void *arg, zthr_t *zthr)
4835 {
4836 (void) zthr;
4837 spa_t *spa = arg;
4838
4839 return (spa->spa_raidz_expand != NULL &&
4840 !spa->spa_raidz_expand->vre_waiting_for_resilver);
4841 }
4842
4843 /*
4844 * RAIDZ expansion background thread
4845 *
4846 * Can be called multiple times if the reflow is paused
4847 */
4848 static void
spa_raidz_expand_thread(void * arg,zthr_t * zthr)4849 spa_raidz_expand_thread(void *arg, zthr_t *zthr)
4850 {
4851 spa_t *spa = arg;
4852 vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4853
4854 if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID)
4855 vre->vre_offset = 0;
4856 else
4857 vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync);
4858
4859 /* Reflow the beginning portion using the scratch area */
4860 if (vre->vre_offset == 0) {
4861 VERIFY0(dsl_sync_task(spa_name(spa),
4862 NULL, raidz_reflow_scratch_sync,
4863 vre, 0, ZFS_SPACE_CHECK_NONE));
4864
4865 /* if we encountered errors then pause */
4866 if (vre->vre_offset == 0) {
4867 mutex_enter(&vre->vre_lock);
4868 vre->vre_waiting_for_resilver = B_TRUE;
4869 mutex_exit(&vre->vre_lock);
4870 return;
4871 }
4872 }
4873
4874 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4875 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4876
4877 uint64_t guid = raidvd->vdev_guid;
4878
4879 /* Iterate over all the remaining metaslabs */
4880 for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift;
4881 i < raidvd->vdev_ms_count &&
4882 !zthr_iscancelled(zthr) &&
4883 vre->vre_failed_offset == UINT64_MAX; i++) {
4884 metaslab_t *msp = raidvd->vdev_ms[i];
4885
4886 metaslab_disable(msp);
4887 mutex_enter(&msp->ms_lock);
4888
4889 /*
4890 * The metaslab may be newly created (for the expanded
4891 * space), in which case its trees won't exist yet,
4892 * so we need to bail out early.
4893 */
4894 if (msp->ms_new) {
4895 mutex_exit(&msp->ms_lock);
4896 metaslab_enable(msp, B_FALSE, B_FALSE);
4897 continue;
4898 }
4899
4900 VERIFY0(metaslab_load(msp));
4901
4902 /*
4903 * We want to copy everything except the free (allocatable)
4904 * space. Note that there may be a little bit more free
4905 * space (e.g. in ms_defer), and it's fine to copy that too.
4906 */
4907 uint64_t shift, start;
4908 zfs_range_seg_type_t type = metaslab_calculate_range_tree_type(
4909 raidvd, msp, &start, &shift);
4910 zfs_range_tree_t *rt = zfs_range_tree_create_flags(
4911 NULL, type, NULL, start, shift, ZFS_RT_F_DYN_NAME,
4912 metaslab_rt_name(msp->ms_group, msp,
4913 "spa_raidz_expand_thread:rt"));
4914 zfs_range_tree_add(rt, msp->ms_start, msp->ms_size);
4915 zfs_range_tree_walk(msp->ms_allocatable, zfs_range_tree_remove,
4916 rt);
4917 mutex_exit(&msp->ms_lock);
4918
4919 /*
4920 * Force the last sector of each metaslab to be copied. This
4921 * ensures that we advance the on-disk progress to the end of
4922 * this metaslab while the metaslab is disabled. Otherwise, we
4923 * could move past this metaslab without advancing the on-disk
4924 * progress, and then an allocation to this metaslab would not
4925 * be copied.
4926 */
4927 int sectorsz = 1 << raidvd->vdev_ashift;
4928 uint64_t ms_last_offset = msp->ms_start +
4929 msp->ms_size - sectorsz;
4930 if (!zfs_range_tree_contains(rt, ms_last_offset, sectorsz)) {
4931 zfs_range_tree_add(rt, ms_last_offset, sectorsz);
4932 }
4933
4934 /*
4935 * When we are resuming from a paused expansion (i.e.
4936 * when importing a pool with a expansion in progress),
4937 * discard any state that we have already processed.
4938 */
4939 if (vre->vre_offset > msp->ms_start) {
4940 zfs_range_tree_clear(rt, msp->ms_start,
4941 vre->vre_offset - msp->ms_start);
4942 }
4943
4944 while (!zthr_iscancelled(zthr) &&
4945 !zfs_range_tree_is_empty(rt) &&
4946 vre->vre_failed_offset == UINT64_MAX) {
4947
4948 /*
4949 * We need to periodically drop the config lock so that
4950 * writers can get in. Additionally, we can't wait
4951 * for a txg to sync while holding a config lock
4952 * (since a waiting writer could cause a 3-way deadlock
4953 * with the sync thread, which also gets a config
4954 * lock for reader). So we can't hold the config lock
4955 * while calling dmu_tx_assign().
4956 */
4957 spa_config_exit(spa, SCL_CONFIG, FTAG);
4958
4959 /*
4960 * If requested, pause the reflow when the amount
4961 * specified by raidz_expand_max_reflow_bytes is reached
4962 *
4963 * This pause is only used during testing or debugging.
4964 */
4965 while (raidz_expand_max_reflow_bytes != 0 &&
4966 raidz_expand_max_reflow_bytes <=
4967 vre->vre_bytes_copied && !zthr_iscancelled(zthr)) {
4968 delay(hz);
4969 }
4970
4971 mutex_enter(&vre->vre_lock);
4972 while (vre->vre_outstanding_bytes >
4973 raidz_expand_max_copy_bytes) {
4974 cv_wait(&vre->vre_cv, &vre->vre_lock);
4975 }
4976 mutex_exit(&vre->vre_lock);
4977
4978 dmu_tx_t *tx =
4979 dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
4980
4981 VERIFY0(dmu_tx_assign(tx,
4982 DMU_TX_WAIT | DMU_TX_SUSPEND));
4983 uint64_t txg = dmu_tx_get_txg(tx);
4984
4985 /*
4986 * Reacquire the vdev_config lock. Theoretically, the
4987 * vdev_t that we're expanding may have changed.
4988 */
4989 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4990 raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4991
4992 boolean_t needsync =
4993 raidz_reflow_impl(raidvd, vre, rt, tx);
4994
4995 dmu_tx_commit(tx);
4996
4997 if (needsync) {
4998 spa_config_exit(spa, SCL_CONFIG, FTAG);
4999 txg_wait_synced(spa->spa_dsl_pool, txg);
5000 spa_config_enter(spa, SCL_CONFIG, FTAG,
5001 RW_READER);
5002 }
5003 }
5004
5005 spa_config_exit(spa, SCL_CONFIG, FTAG);
5006
5007 metaslab_enable(msp, B_FALSE, B_FALSE);
5008 zfs_range_tree_vacate(rt, NULL, NULL);
5009 zfs_range_tree_destroy(rt);
5010
5011 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
5012 raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
5013 }
5014
5015 spa_config_exit(spa, SCL_CONFIG, FTAG);
5016
5017 /*
5018 * The txg_wait_synced() here ensures that all reflow zio's have
5019 * completed, and vre_failed_offset has been set if necessary. It
5020 * also ensures that the progress of the last raidz_reflow_sync() is
5021 * written to disk before raidz_reflow_complete_sync() changes the
5022 * in-memory vre_state. vdev_raidz_io_start() uses vre_state to
5023 * determine if a reflow is in progress, in which case we may need to
5024 * write to both old and new locations. Therefore we can only change
5025 * vre_state once this is not necessary, which is once the on-disk
5026 * progress (in spa_ubsync) has been set past any possible writes (to
5027 * the end of the last metaslab).
5028 */
5029 txg_wait_synced(spa->spa_dsl_pool, 0);
5030
5031 if (!zthr_iscancelled(zthr) &&
5032 vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) {
5033 /*
5034 * We are not being canceled or paused, so the reflow must be
5035 * complete. In that case also mark it as completed on disk.
5036 */
5037 ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX);
5038 VERIFY0(dsl_sync_task(spa_name(spa), NULL,
5039 raidz_reflow_complete_sync, spa,
5040 0, ZFS_SPACE_CHECK_NONE));
5041 (void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL);
5042 } else {
5043 /*
5044 * Wait for all copy zio's to complete and for all the
5045 * raidz_reflow_sync() synctasks to be run.
5046 */
5047 spa_history_log_internal(spa, "reflow pause",
5048 NULL, "offset=%llu failed_offset=%lld",
5049 (long long)vre->vre_offset,
5050 (long long)vre->vre_failed_offset);
5051 mutex_enter(&vre->vre_lock);
5052 if (vre->vre_failed_offset != UINT64_MAX) {
5053 /*
5054 * Reset progress so that we will retry everything
5055 * after the point that something failed.
5056 */
5057 vre->vre_offset = vre->vre_failed_offset;
5058 vre->vre_failed_offset = UINT64_MAX;
5059 vre->vre_waiting_for_resilver = B_TRUE;
5060 }
5061 mutex_exit(&vre->vre_lock);
5062 }
5063 }
5064
5065 void
spa_start_raidz_expansion_thread(spa_t * spa)5066 spa_start_raidz_expansion_thread(spa_t *spa)
5067 {
5068 ASSERT0P(spa->spa_raidz_expand_zthr);
5069 spa->spa_raidz_expand_zthr = zthr_create("raidz_expand",
5070 spa_raidz_expand_thread_check, spa_raidz_expand_thread,
5071 spa, defclsyspri);
5072 }
5073
5074 void
raidz_dtl_reassessed(vdev_t * vd)5075 raidz_dtl_reassessed(vdev_t *vd)
5076 {
5077 spa_t *spa = vd->vdev_spa;
5078 if (spa->spa_raidz_expand != NULL) {
5079 vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
5080 /*
5081 * we get called often from vdev_dtl_reassess() so make
5082 * sure it's our vdev and any replacing is complete
5083 */
5084 if (vd->vdev_top->vdev_id == vre->vre_vdev_id &&
5085 !vdev_raidz_expand_child_replacing(vd->vdev_top)) {
5086 mutex_enter(&vre->vre_lock);
5087 if (vre->vre_waiting_for_resilver) {
5088 vdev_dbgmsg(vd, "DTL reassessed, "
5089 "continuing raidz expansion");
5090 vre->vre_waiting_for_resilver = B_FALSE;
5091 zthr_wakeup(spa->spa_raidz_expand_zthr);
5092 }
5093 mutex_exit(&vre->vre_lock);
5094 }
5095 }
5096 }
5097
5098 int
vdev_raidz_attach_check(vdev_t * new_child)5099 vdev_raidz_attach_check(vdev_t *new_child)
5100 {
5101 vdev_t *raidvd = new_child->vdev_parent;
5102 uint64_t new_children = raidvd->vdev_children;
5103
5104 /*
5105 * We use the "boot" space as scratch space to handle overwriting the
5106 * initial part of the vdev. If it is too small, then this expansion
5107 * is not allowed. This would be very unusual (e.g. ashift > 13 and
5108 * >200 children).
5109 */
5110 if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) {
5111 return (EINVAL);
5112 }
5113 return (0);
5114 }
5115
5116 void
vdev_raidz_attach_sync(void * arg,dmu_tx_t * tx)5117 vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx)
5118 {
5119 vdev_t *new_child = arg;
5120 spa_t *spa = new_child->vdev_spa;
5121 vdev_t *raidvd = new_child->vdev_parent;
5122 vdev_raidz_t *vdrz = raidvd->vdev_tsd;
5123 ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops);
5124 ASSERT3P(raidvd->vdev_top, ==, raidvd);
5125 ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width);
5126 ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1);
5127 ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==,
5128 new_child);
5129
5130 spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx);
5131
5132 vdrz->vd_physical_width++;
5133
5134 VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info);
5135 vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id;
5136 vdrz->vn_vre.vre_offset = 0;
5137 vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
5138 spa->spa_raidz_expand = &vdrz->vn_vre;
5139 zthr_wakeup(spa->spa_raidz_expand_zthr);
5140
5141 /*
5142 * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get
5143 * written to the config.
5144 */
5145 vdev_config_dirty(raidvd);
5146
5147 vdrz->vn_vre.vre_start_time = gethrestime_sec();
5148 vdrz->vn_vre.vre_end_time = 0;
5149 vdrz->vn_vre.vre_state = DSS_SCANNING;
5150 vdrz->vn_vre.vre_bytes_copied = 0;
5151
5152 uint64_t state = vdrz->vn_vre.vre_state;
5153 VERIFY0(zap_update(spa->spa_meta_objset,
5154 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
5155 sizeof (state), 1, &state, tx));
5156
5157 uint64_t start_time = vdrz->vn_vre.vre_start_time;
5158 VERIFY0(zap_update(spa->spa_meta_objset,
5159 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
5160 sizeof (start_time), 1, &start_time, tx));
5161
5162 (void) zap_remove(spa->spa_meta_objset,
5163 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx);
5164 (void) zap_remove(spa->spa_meta_objset,
5165 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx);
5166
5167 spa_history_log_internal(spa, "raidz vdev expansion started", tx,
5168 "%s vdev %llu new width %llu", spa_name(spa),
5169 (unsigned long long)raidvd->vdev_id,
5170 (unsigned long long)raidvd->vdev_children);
5171 }
5172
5173 int
vdev_raidz_load(vdev_t * vd)5174 vdev_raidz_load(vdev_t *vd)
5175 {
5176 vdev_raidz_t *vdrz = vd->vdev_tsd;
5177 int err;
5178
5179 uint64_t state = DSS_NONE;
5180 uint64_t start_time = 0;
5181 uint64_t end_time = 0;
5182 uint64_t bytes_copied = 0;
5183
5184 if (vd->vdev_top_zap != 0) {
5185 err = zap_lookup(vd->vdev_spa->spa_meta_objset,
5186 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
5187 sizeof (state), 1, &state);
5188 if (err != 0 && err != ENOENT)
5189 return (err);
5190
5191 err = zap_lookup(vd->vdev_spa->spa_meta_objset,
5192 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
5193 sizeof (start_time), 1, &start_time);
5194 if (err != 0 && err != ENOENT)
5195 return (err);
5196
5197 err = zap_lookup(vd->vdev_spa->spa_meta_objset,
5198 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
5199 sizeof (end_time), 1, &end_time);
5200 if (err != 0 && err != ENOENT)
5201 return (err);
5202
5203 err = zap_lookup(vd->vdev_spa->spa_meta_objset,
5204 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
5205 sizeof (bytes_copied), 1, &bytes_copied);
5206 if (err != 0 && err != ENOENT)
5207 return (err);
5208 }
5209
5210 /*
5211 * If we are in the middle of expansion, vre_state should have
5212 * already been set by vdev_raidz_init().
5213 */
5214 EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING);
5215 vdrz->vn_vre.vre_state = (dsl_scan_state_t)state;
5216 vdrz->vn_vre.vre_start_time = start_time;
5217 vdrz->vn_vre.vre_end_time = end_time;
5218 vdrz->vn_vre.vre_bytes_copied = bytes_copied;
5219
5220 return (0);
5221 }
5222
5223 int
spa_raidz_expand_get_stats(spa_t * spa,pool_raidz_expand_stat_t * pres)5224 spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres)
5225 {
5226 vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
5227
5228 if (vre == NULL) {
5229 /* no removal in progress; find most recent completed */
5230 for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
5231 vdev_t *vd = spa->spa_root_vdev->vdev_child[c];
5232 if (vd->vdev_ops == &vdev_raidz_ops) {
5233 vdev_raidz_t *vdrz = vd->vdev_tsd;
5234
5235 if (vdrz->vn_vre.vre_end_time != 0 &&
5236 (vre == NULL ||
5237 vdrz->vn_vre.vre_end_time >
5238 vre->vre_end_time)) {
5239 vre = &vdrz->vn_vre;
5240 }
5241 }
5242 }
5243 }
5244
5245 if (vre == NULL) {
5246 return (SET_ERROR(ENOENT));
5247 }
5248
5249 pres->pres_state = vre->vre_state;
5250 pres->pres_expanding_vdev = vre->vre_vdev_id;
5251
5252 vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
5253 pres->pres_to_reflow = vd->vdev_stat.vs_alloc;
5254
5255 mutex_enter(&vre->vre_lock);
5256 pres->pres_reflowed = vre->vre_bytes_copied;
5257 for (int i = 0; i < TXG_SIZE; i++)
5258 pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i];
5259 mutex_exit(&vre->vre_lock);
5260
5261 pres->pres_start_time = vre->vre_start_time;
5262 pres->pres_end_time = vre->vre_end_time;
5263 pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver;
5264
5265 return (0);
5266 }
5267
5268 /*
5269 * Initialize private RAIDZ specific fields from the nvlist.
5270 */
5271 static int
vdev_raidz_init(spa_t * spa,nvlist_t * nv,void ** tsd)5272 vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
5273 {
5274 uint_t children;
5275 nvlist_t **child;
5276 int error = nvlist_lookup_nvlist_array(nv,
5277 ZPOOL_CONFIG_CHILDREN, &child, &children);
5278 if (error != 0)
5279 return (SET_ERROR(EINVAL));
5280
5281 uint64_t nparity;
5282 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) {
5283 if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
5284 return (SET_ERROR(EINVAL));
5285
5286 /*
5287 * Previous versions could only support 1 or 2 parity
5288 * device.
5289 */
5290 if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2)
5291 return (SET_ERROR(EINVAL));
5292 else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3)
5293 return (SET_ERROR(EINVAL));
5294 } else {
5295 /*
5296 * We require the parity to be specified for SPAs that
5297 * support multiple parity levels.
5298 */
5299 if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
5300 return (SET_ERROR(EINVAL));
5301
5302 /*
5303 * Otherwise, we default to 1 parity device for RAID-Z.
5304 */
5305 nparity = 1;
5306 }
5307
5308 vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
5309 vdrz->vn_vre.vre_vdev_id = -1;
5310 vdrz->vn_vre.vre_offset = UINT64_MAX;
5311 vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
5312 mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL);
5313 cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL);
5314 zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL);
5315 mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL);
5316 avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare,
5317 sizeof (reflow_node_t), offsetof(reflow_node_t, re_link));
5318
5319 vdrz->vd_physical_width = children;
5320 vdrz->vd_nparity = nparity;
5321
5322 /* note, the ID does not exist when creating a pool */
5323 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
5324 &vdrz->vn_vre.vre_vdev_id);
5325
5326 boolean_t reflow_in_progress =
5327 nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
5328 if (reflow_in_progress) {
5329 spa->spa_raidz_expand = &vdrz->vn_vre;
5330 vdrz->vn_vre.vre_state = DSS_SCANNING;
5331 }
5332
5333 vdrz->vd_original_width = children;
5334 uint64_t *txgs;
5335 unsigned int txgs_size = 0;
5336 error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
5337 &txgs, &txgs_size);
5338 if (error == 0) {
5339 for (int i = 0; i < txgs_size; i++) {
5340 reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
5341 re->re_txg = txgs[txgs_size - i - 1];
5342 re->re_logical_width = vdrz->vd_physical_width - i;
5343
5344 if (reflow_in_progress)
5345 re->re_logical_width--;
5346
5347 avl_add(&vdrz->vd_expand_txgs, re);
5348 }
5349
5350 vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size;
5351 }
5352 if (reflow_in_progress) {
5353 vdrz->vd_original_width--;
5354 zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions",
5355 children, txgs_size);
5356 }
5357
5358 *tsd = vdrz;
5359
5360 return (0);
5361 }
5362
5363 static void
vdev_raidz_fini(vdev_t * vd)5364 vdev_raidz_fini(vdev_t *vd)
5365 {
5366 vdev_raidz_t *vdrz = vd->vdev_tsd;
5367 if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre)
5368 vd->vdev_spa->spa_raidz_expand = NULL;
5369 reflow_node_t *re;
5370 void *cookie = NULL;
5371 avl_tree_t *tree = &vdrz->vd_expand_txgs;
5372 while ((re = avl_destroy_nodes(tree, &cookie)) != NULL)
5373 kmem_free(re, sizeof (*re));
5374 avl_destroy(&vdrz->vd_expand_txgs);
5375 mutex_destroy(&vdrz->vd_expand_lock);
5376 mutex_destroy(&vdrz->vn_vre.vre_lock);
5377 cv_destroy(&vdrz->vn_vre.vre_cv);
5378 zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock);
5379 kmem_free(vdrz, sizeof (*vdrz));
5380 }
5381
5382 /*
5383 * Add RAIDZ specific fields to the config nvlist.
5384 */
5385 static void
vdev_raidz_config_generate(vdev_t * vd,nvlist_t * nv)5386 vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
5387 {
5388 ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
5389 vdev_raidz_t *vdrz = vd->vdev_tsd;
5390
5391 /*
5392 * Make sure someone hasn't managed to sneak a fancy new vdev
5393 * into a crufty old storage pool.
5394 */
5395 ASSERT(vdrz->vd_nparity == 1 ||
5396 (vdrz->vd_nparity <= 2 &&
5397 spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) ||
5398 (vdrz->vd_nparity <= 3 &&
5399 spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3));
5400
5401 /*
5402 * Note that we'll add these even on storage pools where they
5403 * aren't strictly required -- older software will just ignore
5404 * it.
5405 */
5406 fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);
5407
5408 if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
5409 fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
5410 }
5411
5412 mutex_enter(&vdrz->vd_expand_lock);
5413 if (!avl_is_empty(&vdrz->vd_expand_txgs)) {
5414 uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs);
5415 uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count,
5416 KM_SLEEP);
5417 uint64_t i = 0;
5418
5419 for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs);
5420 re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) {
5421 txgs[i++] = re->re_txg;
5422 }
5423
5424 fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
5425 txgs, count);
5426
5427 kmem_free(txgs, sizeof (uint64_t) * count);
5428 }
5429 mutex_exit(&vdrz->vd_expand_lock);
5430 }
5431
5432 static uint64_t
vdev_raidz_nparity(vdev_t * vd)5433 vdev_raidz_nparity(vdev_t *vd)
5434 {
5435 vdev_raidz_t *vdrz = vd->vdev_tsd;
5436 return (vdrz->vd_nparity);
5437 }
5438
5439 static uint64_t
vdev_raidz_ndisks(vdev_t * vd)5440 vdev_raidz_ndisks(vdev_t *vd)
5441 {
5442 return (vd->vdev_children);
5443 }
5444
5445 vdev_ops_t vdev_raidz_ops = {
5446 .vdev_op_init = vdev_raidz_init,
5447 .vdev_op_fini = vdev_raidz_fini,
5448 .vdev_op_open = vdev_raidz_open,
5449 .vdev_op_close = vdev_raidz_close,
5450 .vdev_op_psize_to_asize = vdev_raidz_psize_to_asize,
5451 .vdev_op_asize_to_psize = vdev_raidz_asize_to_psize,
5452 .vdev_op_min_asize = vdev_raidz_min_asize,
5453 .vdev_op_min_alloc = NULL,
5454 .vdev_op_io_start = vdev_raidz_io_start,
5455 .vdev_op_io_done = vdev_raidz_io_done,
5456 .vdev_op_state_change = vdev_raidz_state_change,
5457 .vdev_op_need_resilver = vdev_raidz_need_resilver,
5458 .vdev_op_hold = NULL,
5459 .vdev_op_rele = NULL,
5460 .vdev_op_remap = NULL,
5461 .vdev_op_xlate = vdev_raidz_xlate,
5462 .vdev_op_rebuild_asize = NULL,
5463 .vdev_op_metaslab_init = NULL,
5464 .vdev_op_config_generate = vdev_raidz_config_generate,
5465 .vdev_op_nparity = vdev_raidz_nparity,
5466 .vdev_op_ndisks = vdev_raidz_ndisks,
5467 .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */
5468 .vdev_op_leaf = B_FALSE /* not a leaf vdev */
5469 };
5470
5471 ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW,
5472 "For testing, pause RAIDZ expansion after reflowing this many bytes");
5473 ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW,
5474 "Max amount of concurrent i/o for RAIDZ expansion");
5475 ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW,
5476 "For expanded RAIDZ, aggregate reads that have more rows than this");
5477 ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW,
5478 "For expanded RAIDZ, automatically start a pool scrub when expansion "
5479 "completes");
5480 ZFS_MODULE_PARAM(zfs_vdev, vdev_, read_sit_out_secs, ULONG, ZMOD_RW,
5481 "Raidz/draid slow disk sit out time period in seconds");
5482 ZFS_MODULE_PARAM(zfs_vdev, vdev_, raidz_outlier_check_interval_ms, U64,
5483 ZMOD_RW, "Interval to check for slow raidz/draid children");
5484 ZFS_MODULE_PARAM(zfs_vdev, vdev_, raidz_outlier_insensitivity, UINT,
5485 ZMOD_RW, "How insensitive the slow raidz/draid child check should be");
5486 /* END CSTYLED */
5487