xref: /freebsd/sys/contrib/openzfs/cmd/raidz_test/raidz_test.c (revision d485c77f203fb0f4cdc08dea5ff81631b51d8809)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <sys/zio.h>
#include <umem.h>
#include <sys/vdev_raidz.h>
#include <sys/vdev_raidz_impl.h>
#include <assert.h>
#include <stdio.h>
#include "raidz_test.h"

static int *rand_data;
raidz_test_opts_t rto_opts;

static char gdb[256];
static const char gdb_tmpl[] = "gdb -ex \"set pagination 0\" -p %d";

static void sig_handler(int signo)
{
	struct sigaction action;
	/*
	 * Restore default action and re-raise signal so SIGSEGV and
	 * SIGABRT can trigger a core dump.
	 */
	action.sa_handler = SIG_DFL;
	sigemptyset(&action.sa_mask);
	action.sa_flags = 0;
	(void) sigaction(signo, &action, NULL);

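	/*
	 * Optionally attach gdb before re-raising the signal; the empty
	 * if body only discards system()'s return value.
	 */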
	if (rto_opts.rto_gdb)
		if (system(gdb)) { }

	raise(signo);
}

static void print_opts(raidz_test_opts_t *opts, boolean_t force)
{
	const char *verbose;
	switch (opts->rto_v) {
		case 0:
			verbose = "no";
			break;
		case 1:
			verbose = "info";
			break;
		default:
			verbose = "debug";
			break;
	}

	if (force || opts->rto_v >= D_INFO) {
		(void) fprintf(stdout, DBLSEP "Running with options:\n"
		    "  (-a) zio ashift                   : %zu\n"
		    "  (-o) zio offset                   : 1 << %zu\n"
		    "  (-e) expanded map                 : %s\n"
		    "  (-r) reflow offset                : %llx\n"
		    "  (-d) number of raidz data columns : %zu\n"
		    "  (-s) size of DATA                 : 1 << %zu\n"
		    "  (-S) sweep parameters             : %s\n"
		    "  (-v) verbose                      : %s\n\n",
		    opts->rto_ashift,				/* -a */
		    ilog2(opts->rto_offset),			/* -o */
		    opts->rto_expand ? "yes" : "no",		/* -e */
		    (u_longlong_t)opts->rto_expand_offset,	/* -r */
		    opts->rto_dcols,				/* -d */
		    ilog2(opts->rto_dsize),			/* -s */
		    opts->rto_sweep ? "yes" : "no",		/* -S */
		    verbose);					/* -v */
	}
}

static void usage(boolean_t requested)
{
	const raidz_test_opts_t *o = &rto_opts_defaults;

	FILE *fp = requested ? stdout : stderr;

	(void) fprintf(fp, "Usage:\n"
	    "\t[-a zio ashift (default: %zu)]\n"
	    "\t[-o zio offset, exponent radix 2 (default: %zu)]\n"
	    "\t[-d number of raidz data columns (default: %zu)]\n"
	    "\t[-s zio size, exponent radix 2 (default: %zu)]\n"
	    "\t[-S parameter sweep (default: %s)]\n"
	    "\t[-t timeout for parameter sweep test]\n"
	    "\t[-B benchmark all raidz implementations]\n"
	    "\t[-e use expanded raidz map (default: %s)]\n"
	    "\t[-r expanded raidz map reflow offset (default: %llx)]\n"
	    "\t[-v increase verbosity (default: %zu)]\n"
	    "\t[-h (print help)]\n"
	    "\t[-T test the test, see if failure would be detected]\n"
	    "\t[-D debug (attach gdb on SIGSEGV)]\n"
	    "",
	    o->rto_ashift,				/* -a */
	    ilog2(o->rto_offset),			/* -o */
	    o->rto_dcols,				/* -d */
	    ilog2(o->rto_dsize),			/* -s */
	    o->rto_sweep ? "yes" : "no",		/* -S */
	    o->rto_expand ? "yes" : "no",		/* -e */
	    (u_longlong_t)o->rto_expand_offset,	/* -r */
	    o->rto_v);					/* -v */

	exit(requested ? 0 : 1);
}

static void process_options(int argc, char **argv)
{
	size_t value;
	int opt;

	raidz_test_opts_t *o = &rto_opts;

	memcpy(o, &rto_opts_defaults, sizeof (*o));

	while ((opt = getopt(argc, argv, "TDBSvha:er:o:d:s:t:")) != -1) {
		value = 0;

		switch (opt) {
		case 'a':
			value = strtoull(optarg, NULL, 0);
			o->rto_ashift = MIN(13, MAX(9, value));
			break;
		case 'e':
			o->rto_expand = 1;
			break;
		case 'r':
			o->rto_expand_offset = strtoull(optarg, NULL, 0);
			break;
		case 'o':
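			/*
			 * The offset is given as a power-of-two exponent,
			 * capped at 2^12 and rounded down to a multiple
			 * of 512 bytes.
			 */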
			value = strtoull(optarg, NULL, 0);
			o->rto_offset = ((1ULL << MIN(12, value)) >> 9) << 9;
			break;
		case 'd':
			value = strtoull(optarg, NULL, 0);
			o->rto_dcols = MIN(255, MAX(1, value));
			break;
		case 's':
			value = strtoull(optarg, NULL, 0);
			o->rto_dsize = 1ULL << MIN(SPA_MAXBLOCKSHIFT,
			    MAX(SPA_MINBLOCKSHIFT, value));
			break;
		case 't':
			value = strtoull(optarg, NULL, 0);
			o->rto_sweep_timeout = value;
			break;
		case 'v':
			o->rto_v++;
			break;
		case 'S':
			o->rto_sweep = 1;
			break;
		case 'B':
			o->rto_benchmark = 1;
			break;
		case 'D':
			o->rto_gdb = 1;
			break;
		case 'T':
			o->rto_sanity = 1;
			break;
		case 'h':
			usage(B_TRUE);
			break;
		case '?':
		default:
			usage(B_FALSE);
			break;
		}
	}
}

#define	DATA_COL(rr, i) ((rr)->rr_col[(rr)->rr_firstdatacol + (i)].rc_abd)
#define	DATA_COL_SIZE(rr, i) ((rr)->rr_col[(rr)->rr_firstdatacol + (i)].rc_size)

#define	CODE_COL(rr, i) ((rr)->rr_col[(i)].rc_abd)
#define	CODE_COL_SIZE(rr, i) ((rr)->rr_col[(i)].rc_size)

static int
cmp_code(raidz_test_opts_t *opts, const raidz_map_t *rm, const int parity)
{
	int r, i, ret = 0;

	VERIFY(parity >= 1 && parity <= 3);

	for (r = 0; r < rm->rm_nrows; r++) {
		raidz_row_t * const rr = rm->rm_row[r];
		raidz_row_t * const rrg = opts->rm_golden->rm_row[r];
		for (i = 0; i < parity; i++) {
			if (CODE_COL_SIZE(rrg, i) == 0) {
				VERIFY0(CODE_COL_SIZE(rr, i));
				continue;
			}

			if (abd_cmp(CODE_COL(rr, i),
			    CODE_COL(rrg, i)) != 0) {
				ret++;
				LOG_OPT(D_DEBUG, opts,
				    "\nParity block [%d] different!\n", i);
			}
		}
	}
	return (ret);
}

static int
cmp_data(raidz_test_opts_t *opts, raidz_map_t *rm)
{
	int r, i, dcols, ret = 0;

	for (r = 0; r < rm->rm_nrows; r++) {
		raidz_row_t *rr = rm->rm_row[r];
		raidz_row_t *rrg = opts->rm_golden->rm_row[r];
		dcols = opts->rm_golden->rm_row[0]->rr_cols -
		    raidz_parity(opts->rm_golden);
		for (i = 0; i < dcols; i++) {
			if (DATA_COL_SIZE(rrg, i) == 0) {
				VERIFY0(DATA_COL_SIZE(rr, i));
				continue;
			}

			if (abd_cmp(DATA_COL(rrg, i),
			    DATA_COL(rr, i)) != 0) {
				ret++;

				LOG_OPT(D_DEBUG, opts,
				    "\nData block [%d] different!\n", i);
			}
		}
	}
	return (ret);
}

static int
init_rand(void *data, size_t size, void *private)
{
	size_t i;
	int *dst = (int *)data;

	for (i = 0; i < size / sizeof (int); i++)
		dst[i] = rand_data[i];

	return (0);
}

static void
corrupt_columns(raidz_map_t *rm, const int *tgts, const int cnt)
{
	for (int r = 0; r < rm->rm_nrows; r++) {
		raidz_row_t *rr = rm->rm_row[r];
		for (int i = 0; i < cnt; i++) {
			raidz_col_t *col = &rr->rr_col[tgts[i]];
			abd_iterate_func(col->rc_abd, 0, col->rc_size,
			    init_rand, NULL);
		}
	}
}

void
init_zio_abd(zio_t *zio)
{
	abd_iterate_func(zio->io_abd, 0, zio->io_size, init_rand, NULL);
}

static void
fini_raidz_map(zio_t **zio, raidz_map_t **rm)
{
	vdev_raidz_map_free(*rm);
	raidz_free((*zio)->io_abd, (*zio)->io_size);
	umem_free(*zio, sizeof (zio_t));

	*zio = NULL;
	*rm = NULL;
}

static int
init_raidz_golden_map(raidz_test_opts_t *opts, const int parity)
{
	int err = 0;
	zio_t *zio_test;
	raidz_map_t *rm_test;
	const size_t total_ncols = opts->rto_dcols + parity;

	if (opts->rm_golden) {
		fini_raidz_map(&opts->zio_golden, &opts->rm_golden);
	}

	opts->zio_golden = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL);
	zio_test = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL);

	opts->zio_golden->io_offset = zio_test->io_offset = opts->rto_offset;
	opts->zio_golden->io_size = zio_test->io_size = opts->rto_dsize;

	opts->zio_golden->io_abd = raidz_alloc(opts->rto_dsize);
	zio_test->io_abd = raidz_alloc(opts->rto_dsize);

	init_zio_abd(opts->zio_golden);
	init_zio_abd(zio_test);

	VERIFY0(vdev_raidz_impl_set("original"));

	if (opts->rto_expand) {
		opts->rm_golden =
		    vdev_raidz_map_alloc_expanded(opts->zio_golden->io_abd,
		    opts->zio_golden->io_size, opts->zio_golden->io_offset,
		    opts->rto_ashift, total_ncols+1, total_ncols,
		    parity, opts->rto_expand_offset);
		rm_test = vdev_raidz_map_alloc_expanded(zio_test->io_abd,
		    zio_test->io_size, zio_test->io_offset,
		    opts->rto_ashift, total_ncols+1, total_ncols,
		    parity, opts->rto_expand_offset);
	} else {
		opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden,
		    opts->rto_ashift, total_ncols, parity);
		rm_test = vdev_raidz_map_alloc(zio_test,
		    opts->rto_ashift, total_ncols, parity);
	}

	VERIFY(opts->zio_golden);
	VERIFY(opts->rm_golden);

	vdev_raidz_generate_parity(opts->rm_golden);
	vdev_raidz_generate_parity(rm_test);

	/* sanity check */
	err |= cmp_data(opts, rm_test);
	err |= cmp_code(opts, rm_test, parity);

	if (err)
		ERR("initializing the golden copy ... [FAIL]!\n");

	/* tear down raidz_map of test zio */
	fini_raidz_map(&zio_test, &rm_test);

	return (err);
}

/*
 * If reflow is not in progress, reflow_offset should be UINT64_MAX.
 * For each row, if the row is entirely before reflow_offset, it will
 * come from the new location.  Otherwise this row will come from the
 * old location.  Therefore, rows that straddle the reflow_offset will
 * come from the old location.
 *
 * NOTE: Until raidz expansion is implemented this function is only
 * needed by raidz_test.c to test the multi-row raidz_map_t
 * functionality.
 */
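/*
 * For example, reflow_offset == 0 places every row at its old location
 * (one fewer physical column), while reflow_offset == UINT64_MAX places
 * every row at its new, fully expanded location.
 */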
raidz_map_t *
vdev_raidz_map_alloc_expanded(abd_t *abd, uint64_t size, uint64_t offset,
    uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
    uint64_t nparity, uint64_t reflow_offset)
{
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> ashift;
	uint64_t q, r, bc, devidx, asize = 0, tot;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 * AKA "full rows"
	 */
	q = s / (logical_cols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (logical_cols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* How many rows contain data (not skip) */
	uint64_t rows = howmany(tot, logical_cols);
	int cols = MIN(tot, logical_cols);
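
	/*
	 * Worked example (illustrative values): s = 10, logical_cols = 5,
	 * nparity = 1 gives q = 2, r = 2, bc = 3 and tot = 13, so the map
	 * needs rows = 3 rows of cols = 5 columns.
	 */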
	raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
	    KM_SLEEP);
	rm->rm_nrows = rows;

	for (uint64_t row = 0; row < rows; row++) {
		raidz_row_t *rr = kmem_alloc(offsetof(raidz_row_t,
		    rr_col[cols]), KM_SLEEP);
		rm->rm_row[row] = rr;

		/* The starting RAIDZ (parent) vdev sector of the row. */
		uint64_t b = (offset >> ashift) + row * logical_cols;

		/*
		 * If we are in the middle of a reflow, and any part of this
		 * row has not been copied, then use the old location of
		 * this row.
		 */
		int row_phys_cols = physical_cols;
		if (b + (logical_cols - nparity) > reflow_offset >> ashift)
			row_phys_cols--;

		/* starting child of this row */
		uint64_t child_id = b % row_phys_cols;
		/* The starting byte offset on each child vdev. */
		uint64_t child_offset = (b / row_phys_cols) << ashift;
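		/*
		 * For example (illustrative values): b = 7 with
		 * row_phys_cols = 5 starts this row on child 2, with each
		 * child's starting byte offset at (7 / 5) << ashift.
		 */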

		/*
		 * We set cols to the entire width of the block, even
		 * if this row is shorter.  This is needed because parity
		 * generation (for Q and R) needs to know the entire width,
		 * because it treats the short row as though it was
		 * full-width (and the "phantom" sectors were zero-filled).
		 *
		 * Another approach to this would be to set cols shorter
		 * (to just the number of columns that we might do i/o to)
		 * and have another mechanism to tell the parity generation
		 * about the "entire width".  Reconstruction (at least
		 * vdev_raidz_reconstruct_general()) would also need to
		 * know about the "entire width".
		 */
		rr->rr_cols = cols;
		rr->rr_bigcols = bc;
		rr->rr_missingdata = 0;
		rr->rr_missingparity = 0;
		rr->rr_firstdatacol = nparity;
		rr->rr_abd_copy = NULL;
		rr->rr_abd_empty = NULL;
		rr->rr_nempty = 0;

		for (int c = 0; c < rr->rr_cols; c++, child_id++) {
			if (child_id >= row_phys_cols) {
				child_id -= row_phys_cols;
				child_offset += 1ULL << ashift;
			}
			rr->rr_col[c].rc_devidx = child_id;
			rr->rr_col[c].rc_offset = child_offset;
			rr->rr_col[c].rc_gdata = NULL;
			rr->rr_col[c].rc_orig_data = NULL;
			rr->rr_col[c].rc_error = 0;
			rr->rr_col[c].rc_tried = 0;
			rr->rr_col[c].rc_skipped = 0;
			rr->rr_col[c].rc_need_orig_restore = B_FALSE;

			uint64_t dc = c - rr->rr_firstdatacol;
			if (c < rr->rr_firstdatacol) {
				rr->rr_col[c].rc_size = 1ULL << ashift;
				rr->rr_col[c].rc_abd =
				    abd_alloc_linear(rr->rr_col[c].rc_size,
				    B_TRUE);
			} else if (row == rows - 1 && bc != 0 && c >= bc) {
				/*
				 * Past the end of the block; these columns
				 * exist only for parity generation.
				 */
				rr->rr_col[c].rc_size = 0;
				rr->rr_col[c].rc_abd = NULL;
			} else {
				/*
				 * A data column (anything past the parity
				 * columns); see the layout note below.
				 */
				uint64_t off;

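				/*
				 * Data is laid out column-major within the
				 * source abd: the first r data columns (the
				 * "big" ones) hold `rows` sectors each, the
				 * rest hold rows - 1. A short column dc
				 * therefore starts r * rows sectors in, plus
				 * (dc - r) short columns of rows - 1 sectors.
				 */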
				if (c < bc || r == 0) {
					off = dc * rows + row;
				} else {
					off = r * rows +
					    (dc - r) * (rows - 1) + row;
				}
				rr->rr_col[c].rc_size = 1ULL << ashift;
				rr->rr_col[c].rc_abd = abd_get_offset_struct(
				    &rr->rr_col[c].rc_abdstruct,
				    abd, off << ashift, 1 << ashift);
			}

			asize += rr->rr_col[c].rc_size;
		}
		/*
		 * If all data stored spans all columns, there's a danger that
		 * parity will always be on the same device and, since parity
		 * isn't read during normal operation, that that device's I/O
		 * bandwidth won't be used effectively. We therefore switch
		 * the parity every 1MB.
		 *
		 * ...at least that was, ostensibly, the theory. As a practical
		 * matter unless we juggle the parity between all devices
		 * evenly, we won't see any benefit. Further, occasional writes
		 * that aren't a multiple of the LCM of the number of children
		 * and the minimum stripe width are sufficient to avoid pessimal
		 * behavior. Unfortunately, this decision created an implicit
		 * on-disk format requirement that we need to support for all
		 * eternity, but only for single-parity RAID-Z.
		 *
		 * If we intend to skip a sector in the zeroth column for
		 * padding we must make sure to note this swap. We will never
		 * intend to skip the first column since at least one data and
		 * one parity column must appear in each row.
		 */
		if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
		    (offset & (1ULL << 20))) {
			ASSERT(rr->rr_cols >= 2);
			ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
			devidx = rr->rr_col[0].rc_devidx;
			uint64_t o = rr->rr_col[0].rc_offset;
			rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
			rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
			rr->rr_col[1].rc_devidx = devidx;
			rr->rr_col[1].rc_offset = o;
		}
	}
	ASSERT3U(asize, ==, tot << ashift);

	/* init RAIDZ parity ops */
	rm->rm_ops = vdev_raidz_math_get_ops();

	return (rm);
}

static raidz_map_t *
init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity)
{
	raidz_map_t *rm = NULL;
	const size_t alloc_dsize = opts->rto_dsize;
	const size_t total_ncols = opts->rto_dcols + parity;
	const int ccols[] = { 0, 1, 2 };

	VERIFY(zio);
	VERIFY(parity <= 3 && parity >= 1);

	*zio = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL);

	(*zio)->io_offset = 0;
	(*zio)->io_size = alloc_dsize;
	(*zio)->io_abd = raidz_alloc(alloc_dsize);
	init_zio_abd(*zio);

	if (opts->rto_expand) {
		rm = vdev_raidz_map_alloc_expanded((*zio)->io_abd,
		    (*zio)->io_size, (*zio)->io_offset,
		    opts->rto_ashift, total_ncols+1, total_ncols,
		    parity, opts->rto_expand_offset);
	} else {
		rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift,
		    total_ncols, parity);
	}
	VERIFY(rm);

	/* Make sure code columns are destroyed */
	corrupt_columns(rm, ccols, parity);

	return (rm);
}

static int
run_gen_check(raidz_test_opts_t *opts)
{
	char **impl_name;
	int fn, err = 0;
	zio_t *zio_test;
	raidz_map_t *rm_test;

	err = init_raidz_golden_map(opts, PARITY_PQR);
	if (0 != err)
		return (err);

	LOG(D_INFO, DBLSEP);
	LOG(D_INFO, "Testing parity generation...\n");

	for (impl_name = (char **)raidz_impl_names+1; *impl_name != NULL;
	    impl_name++) {

		LOG(D_INFO, SEP);
		LOG(D_INFO, "\tTesting [%s] implementation...", *impl_name);

		if (0 != vdev_raidz_impl_set(*impl_name)) {
			LOG(D_INFO, "[SKIP]\n");
			continue;
		} else {
			LOG(D_INFO, "[SUPPORTED]\n");
		}

		for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) {

			/* Check if we should stop */
			if (rto_opts.rto_should_stop)
				return (err);

			/* create suitable raidz_map */
			rm_test = init_raidz_map(opts, &zio_test, fn+1);
			VERIFY(rm_test);

			LOG(D_INFO, "\t\tTesting method [%s] ...",
			    raidz_gen_name[fn]);

			if (!opts->rto_sanity)
				vdev_raidz_generate_parity(rm_test);

			if (cmp_code(opts, rm_test, fn+1) != 0) {
				LOG(D_INFO, "[FAIL]\n");
				err++;
			} else
				LOG(D_INFO, "[PASS]\n");

			fini_raidz_map(&zio_test, &rm_test);
		}
	}

	fini_raidz_map(&opts->zio_golden, &opts->rm_golden);

	return (err);
}

static int
run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn)
{
	int x0, x1, x2;
	int tgtidx[3];
	int err = 0;
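	/*
	 * Each entry lists the three columns to corrupt; columns 0-2 are
	 * the P, Q and R parity columns of the PARITY_PQR map under test,
	 * so data columns start at index 3.
	 */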
	static const int rec_tgts[7][3] = {
		{1, 2, 3},	/* rec_p:   bad QR & D[0]	*/
		{0, 2, 3},	/* rec_q:   bad PR & D[0]	*/
		{0, 1, 3},	/* rec_r:   bad PQ & D[0]	*/
		{2, 3, 4},	/* rec_pq:  bad R  & D[0][1]	*/
		{1, 3, 4},	/* rec_pr:  bad Q  & D[0][1]	*/
		{0, 3, 4},	/* rec_qr:  bad P  & D[0][1]	*/
		{3, 4, 5}	/* rec_pqr: bad    & D[0][1][2] */
	};

	memcpy(tgtidx, rec_tgts[fn], sizeof (tgtidx));

	if (fn < RAIDZ_REC_PQ) {
		/* can reconstruct 1 failed data disk */
		for (x0 = 0; x0 < opts->rto_dcols; x0++) {
			if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm))
				continue;

			/* Check if we should stop */
			if (rto_opts.rto_should_stop)
				return (err);

			LOG(D_DEBUG, "[%d] ", x0);

			tgtidx[2] = x0 + raidz_parity(rm);

			corrupt_columns(rm, tgtidx+2, 1);

			if (!opts->rto_sanity)
				vdev_raidz_reconstruct(rm, tgtidx, 3);

			if (cmp_data(opts, rm) != 0) {
				err++;
				LOG(D_DEBUG, "\nREC D[%d]... [FAIL]\n", x0);
			}
		}

	} else if (fn < RAIDZ_REC_PQR) {
		/* can reconstruct 2 failed data disks */
		for (x0 = 0; x0 < opts->rto_dcols; x0++) {
			if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm))
				continue;
			for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) {
				if (x1 >= rm->rm_row[0]->rr_cols -
				    raidz_parity(rm))
					continue;

				/* Check if we should stop */
				if (rto_opts.rto_should_stop)
					return (err);

				LOG(D_DEBUG, "[%d %d] ", x0, x1);

				tgtidx[1] = x0 + raidz_parity(rm);
				tgtidx[2] = x1 + raidz_parity(rm);

				corrupt_columns(rm, tgtidx+1, 2);

				if (!opts->rto_sanity)
					vdev_raidz_reconstruct(rm, tgtidx, 3);

				if (cmp_data(opts, rm) != 0) {
					err++;
					LOG(D_DEBUG, "\nREC D[%d %d]... "
					    "[FAIL]\n", x0, x1);
				}
			}
		}
	} else {
		/* can reconstruct 3 failed data disks */
		for (x0 = 0; x0 < opts->rto_dcols; x0++) {
			if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm))
				continue;
			for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) {
				if (x1 >= rm->rm_row[0]->rr_cols -
				    raidz_parity(rm))
					continue;
				for (x2 = x1 + 1; x2 < opts->rto_dcols; x2++) {
					if (x2 >= rm->rm_row[0]->rr_cols -
					    raidz_parity(rm))
						continue;

					/* Check if we should stop */
					if (rto_opts.rto_should_stop)
						return (err);

					LOG(D_DEBUG, "[%d %d %d]", x0, x1, x2);

					tgtidx[0] = x0 + raidz_parity(rm);
					tgtidx[1] = x1 + raidz_parity(rm);
					tgtidx[2] = x2 + raidz_parity(rm);

					corrupt_columns(rm, tgtidx, 3);

					if (!opts->rto_sanity)
						vdev_raidz_reconstruct(rm,
						    tgtidx, 3);

					if (cmp_data(opts, rm) != 0) {
						err++;
						LOG(D_DEBUG,
						    "\nREC D[%d %d %d]... "
						    "[FAIL]\n", x0, x1, x2);
					}
				}
			}
		}
	}
	return (err);
}

static int
run_rec_check(raidz_test_opts_t *opts)
{
	char **impl_name;
	unsigned fn, err = 0;
	zio_t *zio_test;
	raidz_map_t *rm_test;

	err = init_raidz_golden_map(opts, PARITY_PQR);
	if (0 != err)
		return (err);

	LOG(D_INFO, DBLSEP);
	LOG(D_INFO, "Testing data reconstruction...\n");

	for (impl_name = (char **)raidz_impl_names+1; *impl_name != NULL;
	    impl_name++) {

		LOG(D_INFO, SEP);
		LOG(D_INFO, "\tTesting [%s] implementation...", *impl_name);

		if (vdev_raidz_impl_set(*impl_name) != 0) {
			LOG(D_INFO, "[SKIP]\n");
			continue;
		} else
			LOG(D_INFO, "[SUPPORTED]\n");

		/* create suitable raidz_map */
		rm_test = init_raidz_map(opts, &zio_test, PARITY_PQR);
		/* generate parity */
		vdev_raidz_generate_parity(rm_test);

		for (fn = 0; fn < RAIDZ_REC_NUM; fn++) {

			LOG(D_INFO, "\t\tTesting method [%s] ...",
			    raidz_rec_name[fn]);

			if (run_rec_check_impl(opts, rm_test, fn) != 0) {
				LOG(D_INFO, "[FAIL]\n");
				err++;
			} else
				LOG(D_INFO, "[PASS]\n");
		}
		/* tear down test raidz_map */
		fini_raidz_map(&zio_test, &rm_test);
	}

	fini_raidz_map(&opts->zio_golden, &opts->rm_golden);

	return (err);
}

static int
run_test(raidz_test_opts_t *opts)
{
	int err = 0;

	if (opts == NULL)
		opts = &rto_opts;

	print_opts(opts, B_FALSE);

	err |= run_gen_check(opts);
	err |= run_rec_check(opts);

	return (err);
}

#define	SWEEP_RUNNING	0
#define	SWEEP_FINISHED	1
#define	SWEEP_ERROR	2
#define	SWEEP_TIMEOUT	3

static int sweep_state = SWEEP_RUNNING;
static raidz_test_opts_t failed_opts;

static kmutex_t sem_mtx;
static kcondvar_t sem_cv;
static int max_free_slots;
static int free_slots;

static void
sweep_thread(void *arg)
{
	int err = 0;
	raidz_test_opts_t *opts = (raidz_test_opts_t *)arg;
	VERIFY(opts != NULL);

	err = run_test(opts);

	if (rto_opts.rto_sanity) {
		/* 25% chance that a sweep test fails */
		if (rand() < (RAND_MAX/4))
			err = 1;
	}

	if (0 != err) {
		mutex_enter(&sem_mtx);
		memcpy(&failed_opts, opts, sizeof (raidz_test_opts_t));
		sweep_state = SWEEP_ERROR;
		mutex_exit(&sem_mtx);
	}

	umem_free(opts, sizeof (raidz_test_opts_t));

	/* signal the next thread */
	mutex_enter(&sem_mtx);
	free_slots++;
	cv_signal(&sem_cv);
	mutex_exit(&sem_mtx);

	thread_exit();
}

static int
run_sweep(void)
{
	static const size_t dcols_v[] = { 1, 2, 3, 4, 5, 6, 7, 8, 12, 15, 16 };
	static const size_t ashift_v[] = { 9, 12, 14 };
	static const size_t size_v[] = { 1 << 9, 21 * (1 << 9), 13 * (1 << 12),
		1 << 17, (1 << 20) - (1 << 12), SPA_MAXBLOCKSIZE };

	(void) setvbuf(stdout, NULL, _IONBF, 0);

	ulong_t total_comb = ARRAY_SIZE(size_v) * ARRAY_SIZE(ashift_v) *
	    ARRAY_SIZE(dcols_v);
	ulong_t tried_comb = 0;
	hrtime_t time_diff, start_time = gethrtime();
	raidz_test_opts_t *opts;
	int a, d, s;

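	/*
	 * free_slots acts as a counting semaphore (guarded by sem_mtx and
	 * sem_cv), limiting concurrent sweep threads to one per CPU, with
	 * a floor of two.
	 */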
	max_free_slots = free_slots = MAX(2, boot_ncpus);

	mutex_init(&sem_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&sem_cv, NULL, CV_DEFAULT, NULL);

	for (s = 0; s < ARRAY_SIZE(size_v); s++)
	for (a = 0; a < ARRAY_SIZE(ashift_v); a++)
	for (d = 0; d < ARRAY_SIZE(dcols_v); d++) {

		if (size_v[s] < (1 << ashift_v[a])) {
			total_comb--;
			continue;
		}

		if (++tried_comb % 20 == 0)
			LOG(D_ALL, "%lu/%lu... ", tried_comb, total_comb);

		/* wait for a signal to start a new thread */
		mutex_enter(&sem_mtx);
		while (cv_timedwait_sig(&sem_cv, &sem_mtx,
		    ddi_get_lbolt() + hz)) {

			/* check if we should stop the test (timeout) */
			time_diff = (gethrtime() - start_time) / NANOSEC;
			if (rto_opts.rto_sweep_timeout > 0 &&
			    time_diff >= rto_opts.rto_sweep_timeout) {
				sweep_state = SWEEP_TIMEOUT;
				rto_opts.rto_should_stop = B_TRUE;
				mutex_exit(&sem_mtx);
				goto exit;
			}

			/* check if we should stop the test (error) */
			if (sweep_state != SWEEP_RUNNING) {
				mutex_exit(&sem_mtx);
				goto exit;
			}

			/* exit the loop if a slot is available */
			if (free_slots > 0) {
				break;
			}
		}

		free_slots--;
		mutex_exit(&sem_mtx);

		opts = umem_zalloc(sizeof (raidz_test_opts_t), UMEM_NOFAIL);
		opts->rto_ashift = ashift_v[a];
		opts->rto_dcols = dcols_v[d];
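		/* random zio offset, aligned to the sector size under test */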
		opts->rto_offset = (1 << ashift_v[a]) * rand();
		opts->rto_dsize = size_v[s];
		opts->rto_expand = rto_opts.rto_expand;
		opts->rto_expand_offset = rto_opts.rto_expand_offset;
		opts->rto_v = 0; /* be quiet */

		VERIFY3P(thread_create(NULL, 0, sweep_thread, (void *) opts,
		    0, NULL, TS_RUN, defclsyspri), !=, NULL);
	}

exit:
	LOG(D_ALL, "\nWaiting for test threads to finish...\n");
	mutex_enter(&sem_mtx);
	VERIFY(free_slots <= max_free_slots);
	while (free_slots < max_free_slots) {
		(void) cv_wait(&sem_cv, &sem_mtx);
	}
	mutex_exit(&sem_mtx);

	if (sweep_state == SWEEP_ERROR) {
		ERR("Sweep test failed! Failed option:\n");
		print_opts(&failed_opts, B_TRUE);
	} else {
		if (sweep_state == SWEEP_TIMEOUT)
			LOG(D_ALL, "Test timeout (%lus). Stopping...\n",
			    (ulong_t)rto_opts.rto_sweep_timeout);

		LOG(D_ALL, "Sweep test succeeded on %lu raidz maps!\n",
		    (ulong_t)tried_comb);
	}

	cv_destroy(&sem_cv);
	mutex_destroy(&sem_mtx);

	return (sweep_state == SWEEP_ERROR ? SWEEP_ERROR : 0);
}

int
main(int argc, char **argv)
{
	size_t i;
	struct sigaction action;
	int err = 0;

	/* init gdb string early */
	(void) snprintf(gdb, sizeof (gdb), gdb_tmpl, getpid());

	action.sa_handler = sig_handler;
	sigemptyset(&action.sa_mask);
	action.sa_flags = 0;

	if (sigaction(SIGSEGV, &action, NULL) < 0) {
		ERR("raidz_test: cannot catch SIGSEGV: %s.\n", strerror(errno));
		exit(EXIT_FAILURE);
	}

	(void) setvbuf(stdout, NULL, _IOLBF, 0);

	dprintf_setup(&argc, argv);

	process_options(argc, argv);

	kernel_init(SPA_MODE_READ);

	/* setup random data because rand() is not reentrant */
	rand_data = (int *)umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
	srand((unsigned)time(NULL) * getpid());
	for (i = 0; i < SPA_MAXBLOCKSIZE / sizeof (int); i++)
		rand_data[i] = rand();

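	/* make the shared random buffer read-only; stray writes now fault */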
	mprotect(rand_data, SPA_MAXBLOCKSIZE, PROT_READ);

	if (rto_opts.rto_benchmark) {
		run_raidz_benchmark();
	} else if (rto_opts.rto_sweep) {
		err = run_sweep();
	} else {
		err = run_test(NULL);
	}

	umem_free(rand_data, SPA_MAXBLOCKSIZE);
	kernel_fini();

	return (err);
}