xref: /freebsd/sys/contrib/openzfs/cmd/raidz_test/raidz_test.c (revision 9e8fbb95b36f14cdcba8aaace0523811a8eef3ec)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
24  */
25 
26 #include <sys/zfs_context.h>
27 #include <sys/time.h>
28 #include <sys/wait.h>
29 #include <sys/zio.h>
30 #include <umem.h>
31 #include <sys/vdev_raidz.h>
32 #include <sys/vdev_raidz_impl.h>
33 #include <assert.h>
34 #include <stdio.h>
35 #include "raidz_test.h"
36 
/* Pool of pre-generated random bytes shared by all threads (rand() is not
 * reentrant; filled once in main() and then made read-only). */
static int *rand_data;
/* Global test options, populated from the defaults by process_options(). */
raidz_test_opts_t rto_opts;

/* This process' pid, pre-formatted in main() for the gdb attach in
 * sig_handler() (formatting is not async-signal-safe). */
static char pid_s[16];
41 
/*
 * Signal handler installed for SIGSEGV (see main()).  Optionally attaches
 * gdb to the faulting process (-D), then re-raises the signal with the
 * default disposition so a core dump can still be produced.
 */
static void sig_handler(int signo)
{
	int old_errno = errno;
	struct sigaction action;
	/*
	 * Restore default action and re-raise signal so SIGSEGV and
	 * SIGABRT can trigger a core dump.
	 */
	action.sa_handler = SIG_DFL;
	sigemptyset(&action.sa_mask);
	action.sa_flags = 0;
	(void) sigaction(signo, &action, NULL);

	if (rto_opts.rto_gdb) {
		/* -D: fork a gdb attached to this pid and wait for it */
		pid_t pid = fork();
		if (pid == 0) {
			execlp("gdb", "gdb", "-ex", "set pagination 0",
			    "-p", pid_s, NULL);
			_exit(-1);	/* exec failed */
		} else if (pid > 0)
			while (waitpid(pid, NULL, 0) == -1 && errno == EINTR)
				;
	}

	raise(signo);
	errno = old_errno;
}
69 
/*
 * Print the effective test options to stdout.  Output is suppressed unless
 * `force` is set or the verbosity level is at least D_INFO.
 */
static void print_opts(raidz_test_opts_t *opts, boolean_t force)
{
	char *verbose;
	/* Map the numeric verbosity level to a human-readable label. */
	switch (opts->rto_v) {
		case D_ALL:
			verbose = "no";
			break;
		case D_INFO:
			verbose = "info";
			break;
		case D_DEBUG:
		default:
			verbose = "debug";
			break;
	}

	if (force || opts->rto_v >= D_INFO) {
		(void) fprintf(stdout, DBLSEP "Running with options:\n"
		    "  (-a) zio ashift                   : %zu\n"
		    "  (-o) zio offset                   : 1 << %zu\n"
		    "  (-e) expanded map                 : %s\n"
		    "  (-r) reflow offset                : %llx\n"
		    "  (-d) number of raidz data columns : %zu\n"
		    "  (-s) size of DATA                 : 1 << %zu\n"
		    "  (-S) sweep parameters             : %s \n"
		    "  (-v) verbose                      : %s \n\n",
		    opts->rto_ashift,				/* -a */
		    ilog2(opts->rto_offset),			/* -o */
		    opts->rto_expand ? "yes" : "no",		/* -e */
		    (u_longlong_t)opts->rto_expand_offset,	/* -r */
		    opts->rto_dcols,				/* -d */
		    ilog2(opts->rto_dsize),			/* -s */
		    opts->rto_sweep ? "yes" : "no",		/* -S */
		    verbose);					/* -v */
	}
}
106 
107 static void usage(boolean_t requested)
108 {
109 	const raidz_test_opts_t *o = &rto_opts_defaults;
110 
111 	FILE *fp = requested ? stdout : stderr;
112 
113 	(void) fprintf(fp, "Usage:\n"
114 	    "\t[-a zio ashift (default: %zu)]\n"
115 	    "\t[-o zio offset, exponent radix 2 (default: %zu)]\n"
116 	    "\t[-d number of raidz data columns (default: %zu)]\n"
117 	    "\t[-s zio size, exponent radix 2 (default: %zu)]\n"
118 	    "\t[-S parameter sweep (default: %s)]\n"
119 	    "\t[-t timeout for parameter sweep test]\n"
120 	    "\t[-B benchmark all raidz implementations]\n"
121 	    "\t[-e use expanded raidz map (default: %s)]\n"
122 	    "\t[-r expanded raidz map reflow offset (default: %llx)]\n"
123 	    "\t[-v increase verbosity (default: %d)]\n"
124 	    "\t[-h (print help)]\n"
125 	    "\t[-T test the test, see if failure would be detected]\n"
126 	    "\t[-D debug (attach gdb on SIGSEGV)]\n"
127 	    "",
128 	    o->rto_ashift,				/* -a */
129 	    ilog2(o->rto_offset),			/* -o */
130 	    o->rto_dcols,				/* -d */
131 	    ilog2(o->rto_dsize),			/* -s */
132 	    rto_opts.rto_sweep ? "yes" : "no",		/* -S */
133 	    rto_opts.rto_expand ? "yes" : "no",		/* -e */
134 	    (u_longlong_t)o->rto_expand_offset,		/* -r */
135 	    o->rto_v);					/* -v */
136 
137 	exit(requested ? 0 : 1);
138 }
139 
140 static void process_options(int argc, char **argv)
141 {
142 	size_t value;
143 	int opt;
144 
145 	raidz_test_opts_t *o = &rto_opts;
146 
147 	bcopy(&rto_opts_defaults, o, sizeof (*o));
148 
149 	while ((opt = getopt(argc, argv, "TDBSvha:er:o:d:s:t:")) != -1) {
150 		value = 0;
151 
152 		switch (opt) {
153 		case 'a':
154 			value = strtoull(optarg, NULL, 0);
155 			o->rto_ashift = MIN(13, MAX(9, value));
156 			break;
157 		case 'e':
158 			o->rto_expand = 1;
159 			break;
160 		case 'r':
161 			o->rto_expand_offset = strtoull(optarg, NULL, 0);
162 			break;
163 		case 'o':
164 			value = strtoull(optarg, NULL, 0);
165 			o->rto_offset = ((1ULL << MIN(12, value)) >> 9) << 9;
166 			break;
167 		case 'd':
168 			value = strtoull(optarg, NULL, 0);
169 			o->rto_dcols = MIN(255, MAX(1, value));
170 			break;
171 		case 's':
172 			value = strtoull(optarg, NULL, 0);
173 			o->rto_dsize = 1ULL <<  MIN(SPA_MAXBLOCKSHIFT,
174 			    MAX(SPA_MINBLOCKSHIFT, value));
175 			break;
176 		case 't':
177 			value = strtoull(optarg, NULL, 0);
178 			o->rto_sweep_timeout = value;
179 			break;
180 		case 'v':
181 			o->rto_v++;
182 			break;
183 		case 'S':
184 			o->rto_sweep = 1;
185 			break;
186 		case 'B':
187 			o->rto_benchmark = 1;
188 			break;
189 		case 'D':
190 			o->rto_gdb = 1;
191 			break;
192 		case 'T':
193 			o->rto_sanity = 1;
194 			break;
195 		case 'h':
196 			usage(B_TRUE);
197 			break;
198 		case '?':
199 		default:
200 			usage(B_FALSE);
201 			break;
202 		}
203 	}
204 }
205 
/* Accessors for data columns: the columns after the rr_firstdatacol parity
 * columns of a raidz row. */
#define	DATA_COL(rr, i) ((rr)->rr_col[rr->rr_firstdatacol + (i)].rc_abd)
#define	DATA_COL_SIZE(rr, i) ((rr)->rr_col[rr->rr_firstdatacol + (i)].rc_size)

/* Accessors for code (parity) columns: columns 0 .. nparity-1 of a row. */
#define	CODE_COL(rr, i) ((rr)->rr_col[(i)].rc_abd)
#define	CODE_COL_SIZE(rr, i) ((rr)->rr_col[(i)].rc_size)
211 
212 static int
213 cmp_code(raidz_test_opts_t *opts, const raidz_map_t *rm, const int parity)
214 {
215 	int r, i, ret = 0;
216 
217 	VERIFY(parity >= 1 && parity <= 3);
218 
219 	for (r = 0; r < rm->rm_nrows; r++) {
220 		raidz_row_t * const rr = rm->rm_row[r];
221 		raidz_row_t * const rrg = opts->rm_golden->rm_row[r];
222 		for (i = 0; i < parity; i++) {
223 			if (CODE_COL_SIZE(rrg, i) == 0) {
224 				VERIFY0(CODE_COL_SIZE(rr, i));
225 				continue;
226 			}
227 
228 			if (abd_cmp(CODE_COL(rr, i),
229 			    CODE_COL(rrg, i)) != 0) {
230 				ret++;
231 				LOG_OPT(D_DEBUG, opts,
232 				    "\nParity block [%d] different!\n", i);
233 			}
234 		}
235 	}
236 	return (ret);
237 }
238 
239 static int
240 cmp_data(raidz_test_opts_t *opts, raidz_map_t *rm)
241 {
242 	int r, i, dcols, ret = 0;
243 
244 	for (r = 0; r < rm->rm_nrows; r++) {
245 		raidz_row_t *rr = rm->rm_row[r];
246 		raidz_row_t *rrg = opts->rm_golden->rm_row[r];
247 		dcols = opts->rm_golden->rm_row[0]->rr_cols -
248 		    raidz_parity(opts->rm_golden);
249 		for (i = 0; i < dcols; i++) {
250 			if (DATA_COL_SIZE(rrg, i) == 0) {
251 				VERIFY0(DATA_COL_SIZE(rr, i));
252 				continue;
253 			}
254 
255 			if (abd_cmp(DATA_COL(rrg, i),
256 			    DATA_COL(rr, i)) != 0) {
257 				ret++;
258 
259 				LOG_OPT(D_DEBUG, opts,
260 				    "\nData block [%d] different!\n", i);
261 			}
262 		}
263 	}
264 	return (ret);
265 }
266 
/*
 * abd_iterate_func() callback: fill the buffer chunk with bytes from the
 * pre-generated random pool (rand() is not reentrant, so the pool is built
 * once in main()).  Always returns 0 to continue iteration.
 */
static int
init_rand(void *data, size_t size, void *private)
{
	(void) private;
	memcpy(data, rand_data, size);
	return (0);
}
274 
275 static void
276 corrupt_colums(raidz_map_t *rm, const int *tgts, const int cnt)
277 {
278 	for (int r = 0; r < rm->rm_nrows; r++) {
279 		raidz_row_t *rr = rm->rm_row[r];
280 		for (int i = 0; i < cnt; i++) {
281 			raidz_col_t *col = &rr->rr_col[tgts[i]];
282 			abd_iterate_func(col->rc_abd, 0, col->rc_size,
283 			    init_rand, NULL);
284 		}
285 	}
286 }
287 
/* Fill a zio's entire data buffer from the shared random pool. */
void
init_zio_abd(zio_t *zio)
{
	abd_iterate_func(zio->io_abd, 0, zio->io_size, init_rand, NULL);
}
293 
/*
 * Free a raidz map and its backing zio created by init_raidz_map() or
 * init_raidz_golden_map(); both pointers are NULLed to prevent reuse.
 */
static void
fini_raidz_map(zio_t **zio, raidz_map_t **rm)
{
	vdev_raidz_map_free(*rm);
	raidz_free((*zio)->io_abd, (*zio)->io_size);
	umem_free(*zio, sizeof (zio_t));

	*zio = NULL;
	*rm = NULL;
}
304 
/*
 * (Re)create the golden reference pair (opts->zio_golden / opts->rm_golden)
 * with the given parity using the "original" (scalar) raidz implementation,
 * and sanity-check it against an identically constructed second map.
 * Returns 0 on success, non-zero if the self-comparison fails.
 */
static int
init_raidz_golden_map(raidz_test_opts_t *opts, const int parity)
{
	int err = 0;
	zio_t *zio_test;
	raidz_map_t *rm_test;
	const size_t total_ncols = opts->rto_dcols + parity;

	/* Dispose of any golden map left over from a previous call. */
	if (opts->rm_golden) {
		fini_raidz_map(&opts->zio_golden, &opts->rm_golden);
	}

	opts->zio_golden = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL);
	zio_test = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL);

	opts->zio_golden->io_offset = zio_test->io_offset = opts->rto_offset;
	opts->zio_golden->io_size = zio_test->io_size = opts->rto_dsize;

	opts->zio_golden->io_abd = raidz_alloc(opts->rto_dsize);
	zio_test->io_abd = raidz_alloc(opts->rto_dsize);

	/* Both zios receive identical data from the shared random pool. */
	init_zio_abd(opts->zio_golden);
	init_zio_abd(zio_test);

	/* Golden parity is always computed by the scalar implementation. */
	VERIFY0(vdev_raidz_impl_set("original"));

	if (opts->rto_expand) {
		/*
		 * NOTE(review): physical_cols is one wider than logical_cols
		 * here — presumably modeling a mid-expansion layout; confirm
		 * against vdev_raidz_map_alloc_expanded().
		 */
		opts->rm_golden =
		    vdev_raidz_map_alloc_expanded(opts->zio_golden->io_abd,
		    opts->zio_golden->io_size, opts->zio_golden->io_offset,
		    opts->rto_ashift, total_ncols+1, total_ncols,
		    parity, opts->rto_expand_offset);
		rm_test = vdev_raidz_map_alloc_expanded(zio_test->io_abd,
		    zio_test->io_size, zio_test->io_offset,
		    opts->rto_ashift, total_ncols+1, total_ncols,
		    parity, opts->rto_expand_offset);
	} else {
		opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden,
		    opts->rto_ashift, total_ncols, parity);
		rm_test = vdev_raidz_map_alloc(zio_test,
		    opts->rto_ashift, total_ncols, parity);
	}

	VERIFY(opts->zio_golden);
	VERIFY(opts->rm_golden);

	vdev_raidz_generate_parity(opts->rm_golden);
	vdev_raidz_generate_parity(rm_test);

	/* sanity check: two identically built maps must agree exactly */
	err |= cmp_data(opts, rm_test);
	err |= cmp_code(opts, rm_test, parity);

	if (err)
		ERR("initializing the golden copy ... [FAIL]!\n");

	/* tear down raidz_map of test zio */
	fini_raidz_map(&zio_test, &rm_test);

	return (err);
}
366 
/*
 * If reflow is not in progress, reflow_offset should be UINT64_MAX.
 * For each row, if the row is entirely before reflow_offset, it will
 * come from the new location.  Otherwise this row will come from the
 * old location.  Therefore, rows that straddle the reflow_offset will
 * come from the old location.
 *
 * NOTE: Until raidz expansion is implemented this function is only
 * needed by raidz_test.c to the multi-row raid_map_t functionality.
 */
raidz_map_t *
vdev_raidz_map_alloc_expanded(abd_t *abd, uint64_t size, uint64_t offset,
    uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
    uint64_t nparity, uint64_t reflow_offset)
{
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> ashift;
	uint64_t q, r, bc, devidx, asize = 0, tot;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 * AKA "full rows"
	 */
	q = s / (logical_cols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (logical_cols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* How many rows contain data (not skip) */
	uint64_t rows = howmany(tot, logical_cols);
	int cols = MIN(tot, logical_cols);

	raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
	    KM_SLEEP);
	rm->rm_nrows = rows;

	for (uint64_t row = 0; row < rows; row++) {
		raidz_row_t *rr = kmem_alloc(offsetof(raidz_row_t,
		    rr_col[cols]), KM_SLEEP);
		rm->rm_row[row] = rr;

		/* The starting RAIDZ (parent) vdev sector of the row. */
		uint64_t b = (offset >> ashift) + row * logical_cols;

		/*
		 * If we are in the middle of a reflow, and any part of this
		 * row has not been copied, then use the old location of
		 * this row.
		 */
		int row_phys_cols = physical_cols;
		if (b + (logical_cols - nparity) > reflow_offset >> ashift)
			row_phys_cols--;

		/* starting child of this row */
		uint64_t child_id = b % row_phys_cols;
		/* The starting byte offset on each child vdev. */
		uint64_t child_offset = (b / row_phys_cols) << ashift;

		/*
		 * We set cols to the entire width of the block, even
		 * if this row is shorter.  This is needed because parity
		 * generation (for Q and R) needs to know the entire width,
		 * because it treats the short row as though it was
		 * full-width (and the "phantom" sectors were zero-filled).
		 *
		 * Another approach to this would be to set cols shorter
		 * (to just the number of columns that we might do i/o to)
		 * and have another mechanism to tell the parity generation
		 * about the "entire width".  Reconstruction (at least
		 * vdev_raidz_reconstruct_general()) would also need to
		 * know about the "entire width".
		 */
		rr->rr_cols = cols;
		rr->rr_bigcols = bc;
		rr->rr_missingdata = 0;
		rr->rr_missingparity = 0;
		rr->rr_firstdatacol = nparity;
		rr->rr_abd_empty = NULL;
		rr->rr_nempty = 0;

		for (int c = 0; c < rr->rr_cols; c++, child_id++) {
			/* Wrap to the next device/offset past the row's end. */
			if (child_id >= row_phys_cols) {
				child_id -= row_phys_cols;
				child_offset += 1ULL << ashift;
			}
			rr->rr_col[c].rc_devidx = child_id;
			rr->rr_col[c].rc_offset = child_offset;
			rr->rr_col[c].rc_orig_data = NULL;
			rr->rr_col[c].rc_error = 0;
			rr->rr_col[c].rc_tried = 0;
			rr->rr_col[c].rc_skipped = 0;
			rr->rr_col[c].rc_need_orig_restore = B_FALSE;

			uint64_t dc = c - rr->rr_firstdatacol;
			if (c < rr->rr_firstdatacol) {
				/* Parity column: gets its own sector buffer. */
				rr->rr_col[c].rc_size = 1ULL << ashift;
				rr->rr_col[c].rc_abd =
				    abd_alloc_linear(rr->rr_col[c].rc_size,
				    B_TRUE);
			} else if (row == rows - 1 && bc != 0 && c >= bc) {
				/*
				 * Past the end, this for parity generation.
				 */
				rr->rr_col[c].rc_size = 0;
				rr->rr_col[c].rc_abd = NULL;
			} else {
				/*
				 * "data column" (col excluding parity)
				 * Add an ASCII art diagram here
				 */
				uint64_t off;

				/*
				 * NOTE(review): `off` maps (data col, row) to a
				 * sector index within `abd`; big columns hold
				 * `rows` sectors, the rest hold `rows - 1` —
				 * confirm against the on-disk layout docs.
				 */
				if (c < bc || r == 0) {
					off = dc * rows + row;
				} else {
					off = r * rows +
					    (dc - r) * (rows - 1) + row;
				}
				rr->rr_col[c].rc_size = 1ULL << ashift;
				rr->rr_col[c].rc_abd = abd_get_offset_struct(
				    &rr->rr_col[c].rc_abdstruct,
				    abd, off << ashift, 1 << ashift);
			}

			asize += rr->rr_col[c].rc_size;
		}
		/*
		 * If all data stored spans all columns, there's a danger that
		 * parity will always be on the same device and, since parity
		 * isn't read during normal operation, that that device's I/O
		 * bandwidth won't be used effectively. We therefore switch
		 * the parity every 1MB.
		 *
		 * ...at least that was, ostensibly, the theory. As a practical
		 * matter unless we juggle the parity between all devices
		 * evenly, we won't see any benefit. Further, occasional writes
		 * that aren't a multiple of the LCM of the number of children
		 * and the minimum stripe width are sufficient to avoid pessimal
		 * behavior. Unfortunately, this decision created an implicit
		 * on-disk format requirement that we need to support for all
		 * eternity, but only for single-parity RAID-Z.
		 *
		 * If we intend to skip a sector in the zeroth column for
		 * padding we must make sure to note this swap. We will never
		 * intend to skip the first column since at least one data and
		 * one parity column must appear in each row.
		 */
		if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
		    (offset & (1ULL << 20))) {
			ASSERT(rr->rr_cols >= 2);
			ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
			devidx = rr->rr_col[0].rc_devidx;
			uint64_t o = rr->rr_col[0].rc_offset;
			rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
			rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
			rr->rr_col[1].rc_devidx = devidx;
			rr->rr_col[1].rc_offset = o;
		}

	}
	ASSERT3U(asize, ==, tot << ashift);

	/* init RAIDZ parity ops */
	rm->rm_ops = vdev_raidz_math_get_ops();

	return (rm);
}
547 
548 static raidz_map_t *
549 init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity)
550 {
551 	raidz_map_t *rm = NULL;
552 	const size_t alloc_dsize = opts->rto_dsize;
553 	const size_t total_ncols = opts->rto_dcols + parity;
554 	const int ccols[] = { 0, 1, 2 };
555 
556 	VERIFY(zio);
557 	VERIFY(parity <= 3 && parity >= 1);
558 
559 	*zio = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL);
560 
561 	(*zio)->io_offset = 0;
562 	(*zio)->io_size = alloc_dsize;
563 	(*zio)->io_abd = raidz_alloc(alloc_dsize);
564 	init_zio_abd(*zio);
565 
566 	if (opts->rto_expand) {
567 		rm = vdev_raidz_map_alloc_expanded((*zio)->io_abd,
568 		    (*zio)->io_size, (*zio)->io_offset,
569 		    opts->rto_ashift, total_ncols+1, total_ncols,
570 		    parity, opts->rto_expand_offset);
571 	} else {
572 		rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift,
573 		    total_ncols, parity);
574 	}
575 	VERIFY(rm);
576 
577 	/* Make sure code columns are destroyed */
578 	corrupt_colums(rm, ccols, parity);
579 
580 	return (rm);
581 }
582 
/*
 * For every supported raidz implementation and every generation method
 * (P, PQ, PQR), generate parity and compare it against the golden map
 * produced by the scalar "original" implementation.
 * Returns the number of failed comparisons.
 */
static int
run_gen_check(raidz_test_opts_t *opts)
{
	char **impl_name;
	int fn, err = 0;
	zio_t *zio_test;
	raidz_map_t *rm_test;

	err = init_raidz_golden_map(opts, PARITY_PQR);
	if (0 != err)
		return (err);

	LOG(D_INFO, DBLSEP);
	LOG(D_INFO, "Testing parity generation...\n");

	/*
	 * NOTE(review): raidz_impl_names[0] is skipped — presumably a meta
	 * entry (e.g. "fastest"/"original") not worth testing individually;
	 * confirm against vdev_raidz_impl definitions.
	 */
	for (impl_name = (char **)raidz_impl_names+1; *impl_name != NULL;
	    impl_name++) {

		LOG(D_INFO, SEP);
		LOG(D_INFO, "\tTesting [%s] implementation...", *impl_name);

		if (0 != vdev_raidz_impl_set(*impl_name)) {
			/* implementation not available on this platform */
			LOG(D_INFO, "[SKIP]\n");
			continue;
		} else {
			LOG(D_INFO, "[SUPPORTED]\n");
		}

		for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) {

			/* Check if should stop */
			if (rto_opts.rto_should_stop)
				return (err);

			/* create suitable raidz_map */
			rm_test = init_raidz_map(opts, &zio_test, fn+1);
			VERIFY(rm_test);

			LOG(D_INFO, "\t\tTesting method [%s] ...",
			    raidz_gen_name[fn]);

			/* -T sanity mode skips generation to force a miscompare */
			if (!opts->rto_sanity)
				vdev_raidz_generate_parity(rm_test);

			if (cmp_code(opts, rm_test, fn+1) != 0) {
				LOG(D_INFO, "[FAIL]\n");
				err++;
			} else
				LOG(D_INFO, "[PASS]\n");

			fini_raidz_map(&zio_test, &rm_test);
		}
	}

	fini_raidz_map(&opts->zio_golden, &opts->rm_golden);

	return (err);
}
641 
/*
 * Exercise one reconstruction method (fn, an index into RAIDZ_REC_*):
 * corrupt the fixed parity targets plus every combination of 1, 2 or 3
 * data columns (depending on the method's strength), reconstruct, and
 * compare against the golden data.  Returns the number of failures.
 */
static int
run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn)
{
	int x0, x1, x2;
	int tgtidx[3];
	int err = 0;
	/*
	 * Per-method fixed target columns; the trailing entries are
	 * placeholders that get overwritten with data-column indices below.
	 */
	static const int rec_tgts[7][3] = {
		{1, 2, 3},	/* rec_p:   bad QR & D[0]	*/
		{0, 2, 3},	/* rec_q:   bad PR & D[0]	*/
		{0, 1, 3},	/* rec_r:   bad PQ & D[0]	*/
		{2, 3, 4},	/* rec_pq:  bad R  & D[0][1]	*/
		{1, 3, 4},	/* rec_pr:  bad Q  & D[0][1]	*/
		{0, 3, 4},	/* rec_qr:  bad P  & D[0][1]	*/
		{3, 4, 5}	/* rec_pqr: bad    & D[0][1][2] */
	};

	memcpy(tgtidx, rec_tgts[fn], sizeof (tgtidx));

	if (fn < RAIDZ_REC_PQ) {
		/* can reconstruct 1 failed data disk */
		for (x0 = 0; x0 < opts->rto_dcols; x0++) {
			/* skip data columns beyond this map's actual width */
			if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm))
				continue;

			/* Check if should stop */
			if (rto_opts.rto_should_stop)
				return (err);

			LOG(D_DEBUG, "[%d] ", x0);

			/* absolute column index = parity count + data index */
			tgtidx[2] = x0 + raidz_parity(rm);

			corrupt_colums(rm, tgtidx+2, 1);

			/* -T sanity mode skips reconstruction to force failure */
			if (!opts->rto_sanity)
				vdev_raidz_reconstruct(rm, tgtidx, 3);

			if (cmp_data(opts, rm) != 0) {
				err++;
				LOG(D_DEBUG, "\nREC D[%d]... [FAIL]\n", x0);
			}
		}

	} else if (fn < RAIDZ_REC_PQR) {
		/* can reconstruct 2 failed data disk */
		for (x0 = 0; x0 < opts->rto_dcols; x0++) {
			if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm))
				continue;
			for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) {
				if (x1 >= rm->rm_row[0]->rr_cols -
				    raidz_parity(rm))
					continue;

				/* Check if should stop */
				if (rto_opts.rto_should_stop)
					return (err);

				LOG(D_DEBUG, "[%d %d] ", x0, x1);

				tgtidx[1] = x0 + raidz_parity(rm);
				tgtidx[2] = x1 + raidz_parity(rm);

				corrupt_colums(rm, tgtidx+1, 2);

				if (!opts->rto_sanity)
					vdev_raidz_reconstruct(rm, tgtidx, 3);

				if (cmp_data(opts, rm) != 0) {
					err++;
					LOG(D_DEBUG, "\nREC D[%d %d]... "
					    "[FAIL]\n", x0, x1);
				}
			}
		}
	} else {
		/* can reconstruct 3 failed data disk */
		for (x0 = 0; x0 < opts->rto_dcols; x0++) {
			if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm))
				continue;
			for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) {
				if (x1 >= rm->rm_row[0]->rr_cols -
				    raidz_parity(rm))
					continue;
				for (x2 = x1 + 1; x2 < opts->rto_dcols; x2++) {
					if (x2 >= rm->rm_row[0]->rr_cols -
					    raidz_parity(rm))
						continue;

					/* Check if should stop */
					if (rto_opts.rto_should_stop)
						return (err);

					LOG(D_DEBUG, "[%d %d %d]", x0, x1, x2);

					tgtidx[0] = x0 + raidz_parity(rm);
					tgtidx[1] = x1 + raidz_parity(rm);
					tgtidx[2] = x2 + raidz_parity(rm);

					corrupt_colums(rm, tgtidx, 3);

					if (!opts->rto_sanity)
						vdev_raidz_reconstruct(rm,
						    tgtidx, 3);

					if (cmp_data(opts, rm) != 0) {
						err++;
						LOG(D_DEBUG,
						    "\nREC D[%d %d %d]... "
						    "[FAIL]\n", x0, x1, x2);
					}
				}
			}
		}
	}
	return (err);
}
758 
/*
 * For every supported raidz implementation, run all reconstruction methods
 * against a single PQR map (the map's data is re-corrupted and reconstructed
 * repeatedly by run_rec_check_impl()).  Returns the number of failed methods.
 */
static int
run_rec_check(raidz_test_opts_t *opts)
{
	char **impl_name;
	unsigned fn, err = 0;
	zio_t *zio_test;
	raidz_map_t *rm_test;

	err = init_raidz_golden_map(opts, PARITY_PQR);
	if (0 != err)
		return (err);

	LOG(D_INFO, DBLSEP);
	LOG(D_INFO, "Testing data reconstruction...\n");

	/* Skip raidz_impl_names[0]; see the matching note in run_gen_check(). */
	for (impl_name = (char **)raidz_impl_names+1; *impl_name != NULL;
	    impl_name++) {

		LOG(D_INFO, SEP);
		LOG(D_INFO, "\tTesting [%s] implementation...", *impl_name);

		if (vdev_raidz_impl_set(*impl_name) != 0) {
			LOG(D_INFO, "[SKIP]\n");
			continue;
		} else
			LOG(D_INFO, "[SUPPORTED]\n");


		/* create suitable raidz_map */
		rm_test = init_raidz_map(opts, &zio_test, PARITY_PQR);
		/* generate parity */
		vdev_raidz_generate_parity(rm_test);

		for (fn = 0; fn < RAIDZ_REC_NUM; fn++) {

			LOG(D_INFO, "\t\tTesting method [%s] ...",
			    raidz_rec_name[fn]);

			if (run_rec_check_impl(opts, rm_test, fn) != 0) {
				LOG(D_INFO, "[FAIL]\n");
				err++;

			} else
				LOG(D_INFO, "[PASS]\n");

		}
		/* tear down test raidz_map */
		fini_raidz_map(&zio_test, &rm_test);
	}

	fini_raidz_map(&opts->zio_golden, &opts->rm_golden);

	return (err);
}
813 
814 static int
815 run_test(raidz_test_opts_t *opts)
816 {
817 	int err = 0;
818 
819 	if (opts == NULL)
820 		opts = &rto_opts;
821 
822 	print_opts(opts, B_FALSE);
823 
824 	err |= run_gen_check(opts);
825 	err |= run_rec_check(opts);
826 
827 	return (err);
828 }
829 
/* States of the parameter-sweep state machine (sweep_state). */
#define	SWEEP_RUNNING	0
#define	SWEEP_FINISHED	1
#define	SWEEP_ERROR	2
#define	SWEEP_TIMEOUT	3

static int sweep_state = 0;		/* SWEEP_*; guarded by sem_mtx */
static raidz_test_opts_t failed_opts;	/* copy of the first failing options */

/* Counting-semaphore emulation bounding concurrent sweep threads. */
static kmutex_t sem_mtx;
static kcondvar_t sem_cv;
static int max_free_slots;
static int free_slots;
842 
/*
 * Worker thread body for run_sweep(): run one parameter combination,
 * record a failure (if any) under sem_mtx, free the heap-allocated options,
 * release the concurrency slot, and exit.
 */
static _Noreturn void
sweep_thread(void *arg)
{
	int err = 0;
	raidz_test_opts_t *opts = (raidz_test_opts_t *)arg;
	VERIFY(opts != NULL);

	err = run_test(opts);

	if (rto_opts.rto_sanity) {
		/* 25% chance that a sweep test fails */
		if (rand() < (RAND_MAX/4))
			err = 1;
	}

	if (0 != err) {
		/* Record the failing options so run_sweep() can report them. */
		mutex_enter(&sem_mtx);
		memcpy(&failed_opts, opts, sizeof (raidz_test_opts_t));
		sweep_state = SWEEP_ERROR;
		mutex_exit(&sem_mtx);
	}

	umem_free(opts, sizeof (raidz_test_opts_t));

	/* signal the next thread */
	mutex_enter(&sem_mtx);
	free_slots++;
	cv_signal(&sem_cv);
	mutex_exit(&sem_mtx);

	thread_exit();
}
875 
/*
 * Sweep the cartesian product of data size, ashift, and data-column count,
 * running each combination in its own thread with at most
 * MAX(2, boot_ncpus) threads in flight.  Stops early on the first failure
 * or when the -t timeout expires.  Returns SWEEP_ERROR on failure, else 0.
 */
static int
run_sweep(void)
{
	static const size_t dcols_v[] = { 1, 2, 3, 4, 5, 6, 7, 8, 12, 15, 16 };
	static const size_t ashift_v[] = { 9, 12, 14 };
	static const size_t size_v[] = { 1 << 9, 21 * (1 << 9), 13 * (1 << 12),
		1 << 17, (1 << 20) - (1 << 12), SPA_MAXBLOCKSIZE };

	/* Unbuffered stdout so progress output interleaves sanely. */
	(void) setvbuf(stdout, NULL, _IONBF, 0);

	ulong_t total_comb = ARRAY_SIZE(size_v) * ARRAY_SIZE(ashift_v) *
	    ARRAY_SIZE(dcols_v);
	ulong_t tried_comb = 0;
	hrtime_t time_diff, start_time = gethrtime();
	raidz_test_opts_t *opts;
	int a, d, s;

	max_free_slots = free_slots = MAX(2, boot_ncpus);

	mutex_init(&sem_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&sem_cv, NULL, CV_DEFAULT, NULL);

	for (s = 0; s < ARRAY_SIZE(size_v); s++)
	for (a = 0; a < ARRAY_SIZE(ashift_v); a++)
	for (d = 0; d < ARRAY_SIZE(dcols_v); d++) {

		/* skip combinations where the data doesn't fill one sector */
		if (size_v[s] < (1 << ashift_v[a])) {
			total_comb--;
			continue;
		}

		if (++tried_comb % 20 == 0)
			LOG(D_ALL, "%lu/%lu... ", tried_comb, total_comb);

		/* wait for signal to start new thread */
		mutex_enter(&sem_mtx);
		while (cv_timedwait_sig(&sem_cv, &sem_mtx,
		    ddi_get_lbolt() + hz)) {

			/* check if should stop the test (timeout) */
			time_diff = (gethrtime() - start_time) / NANOSEC;
			if (rto_opts.rto_sweep_timeout > 0 &&
			    time_diff >= rto_opts.rto_sweep_timeout) {
				sweep_state = SWEEP_TIMEOUT;
				rto_opts.rto_should_stop = B_TRUE;
				mutex_exit(&sem_mtx);
				goto exit;
			}

			/* check if should stop the test (error) */
			if (sweep_state != SWEEP_RUNNING) {
				mutex_exit(&sem_mtx);
				goto exit;
			}

			/* exit loop if a slot is available */
			if (free_slots > 0) {
				break;
			}
		}

		free_slots--;
		mutex_exit(&sem_mtx);

		/* Per-thread options; freed by sweep_thread() on completion. */
		opts = umem_zalloc(sizeof (raidz_test_opts_t), UMEM_NOFAIL);
		opts->rto_ashift = ashift_v[a];
		opts->rto_dcols = dcols_v[d];
		opts->rto_offset = (1 << ashift_v[a]) * rand();
		opts->rto_dsize = size_v[s];
		opts->rto_expand = rto_opts.rto_expand;
		opts->rto_expand_offset = rto_opts.rto_expand_offset;
		opts->rto_v = 0; /* be quiet */

		VERIFY3P(thread_create(NULL, 0, sweep_thread, (void *) opts,
		    0, NULL, TS_RUN, defclsyspri), !=, NULL);
	}

exit:
	LOG(D_ALL, "\nWaiting for test threads to finish...\n");
	mutex_enter(&sem_mtx);
	VERIFY(free_slots <= max_free_slots);
	/* all slots returned means all worker threads have finished */
	while (free_slots < max_free_slots) {
		(void) cv_wait(&sem_cv, &sem_mtx);
	}
	mutex_exit(&sem_mtx);

	if (sweep_state == SWEEP_ERROR) {
		ERR("Sweep test failed! Failed option: \n");
		print_opts(&failed_opts, B_TRUE);
	} else {
		if (sweep_state == SWEEP_TIMEOUT)
			LOG(D_ALL, "Test timeout (%lus). Stopping...\n",
			    (ulong_t)rto_opts.rto_sweep_timeout);

		LOG(D_ALL, "Sweep test succeeded on %lu raidz maps!\n",
		    (ulong_t)tried_comb);
	}

	mutex_destroy(&sem_mtx);

	return (sweep_state == SWEEP_ERROR ? SWEEP_ERROR : 0);
}
978 
979 
980 int
981 main(int argc, char **argv)
982 {
983 	size_t i;
984 	struct sigaction action;
985 	int err = 0;
986 
987 	/* init gdb pid string early */
988 	(void) sprintf(pid_s, "%d", getpid());
989 
990 	action.sa_handler = sig_handler;
991 	sigemptyset(&action.sa_mask);
992 	action.sa_flags = 0;
993 
994 	if (sigaction(SIGSEGV, &action, NULL) < 0) {
995 		ERR("raidz_test: cannot catch SIGSEGV: %s.\n", strerror(errno));
996 		exit(EXIT_FAILURE);
997 	}
998 
999 	(void) setvbuf(stdout, NULL, _IOLBF, 0);
1000 
1001 	dprintf_setup(&argc, argv);
1002 
1003 	process_options(argc, argv);
1004 
1005 	kernel_init(SPA_MODE_READ);
1006 
1007 	/* setup random data because rand() is not reentrant */
1008 	rand_data = (int *)umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
1009 	srand((unsigned)time(NULL) * getpid());
1010 	for (i = 0; i < SPA_MAXBLOCKSIZE / sizeof (int); i++)
1011 		rand_data[i] = rand();
1012 
1013 	mprotect(rand_data, SPA_MAXBLOCKSIZE, PROT_READ);
1014 
1015 	if (rto_opts.rto_benchmark) {
1016 		run_raidz_benchmark();
1017 	} else if (rto_opts.rto_sweep) {
1018 		err = run_sweep();
1019 	} else {
1020 		err = run_test(NULL);
1021 	}
1022 
1023 	umem_free(rand_data, SPA_MAXBLOCKSIZE);
1024 	kernel_fini();
1025 
1026 	return (err);
1027 }
1028