xref: /illumos-gate/usr/src/cmd/zinject/zinject.c (revision 67d74cc3e7c9d9461311136a0b2069813a3fd927)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24  */
25 
26 /*
27  * ZFS Fault Injector
28  *
29  * This userland component takes a set of options and uses libzpool to translate
30  * from a user-visible object type and name to an internal representation.
31  * There are two basic types of faults: device faults and data faults.
32  *
33  *
34  * DEVICE FAULTS
35  *
36  * Errors can be injected into a particular vdev using the '-d' option.  This
37  * option takes a path or vdev GUID to uniquely identify the device within a
38  * pool.  There are two types of errors that can be injected, EIO and ENXIO,
39  * that can be controlled through the '-e' option.  The default is ENXIO.  For
 * EIO failures, any attempt to read data from the device will return EIO, but
 * subsequent attempts to reopen the device will succeed.  For ENXIO failures,
 * any attempt to read from the device will return EIO, and any attempt to
 * reopen the device will return ENXIO.
44  * For label faults, the -L option must be specified. This allows faults
45  * to be injected into either the nvlist, uberblock, pad1, or pad2 region
46  * of all the labels for the specified device.
47  *
48  * This form of the command looks like:
49  *
50  *	zinject -d device [-e errno] [-L <uber | nvlist | pad1 | pad2>] pool
51  *
52  *
53  * DATA FAULTS
54  *
55  * We begin with a tuple of the form:
56  *
57  *	<type,level,range,object>
58  *
59  *	type	A string describing the type of data to target.  Each type
60  *		implicitly describes how to interpret 'object'. Currently,
61  *		the following values are supported:
62  *
63  *		data		User data for a file
64  *		dnode		Dnode for a file or directory
65  *
66  *		The following MOS objects are special.  Instead of injecting
67  *		errors on a particular object or blkid, we inject errors across
68  *		all objects of the given type.
69  *
70  *		mos		Any data in the MOS
71  *		mosdir		object directory
72  *		config		pool configuration
73  *		bpobj		blkptr list
74  *		spacemap	spacemap
75  *		metaslab	metaslab
76  *		errlog		persistent error log
77  *
78  *	level	Object level.  Defaults to '0', not applicable to all types.  If
79  *		a range is given, this corresponds to the indirect block
80  *		corresponding to the specific range.
81  *
82  *	range	A numerical range [start,end) within the object.  Defaults to
83  *		the full size of the file.
84  *
85  *	object	A string describing the logical location of the object.  For
86  *		files and directories (currently the only supported types),
87  *		this is the path of the object on disk.
88  *
89  * This is translated, via libzpool, into the following internal representation:
90  *
91  *	<type,objset,object,level,range>
92  *
93  * These types should be self-explanatory.  This tuple is then passed to the
94  * kernel via a special ioctl() to initiate fault injection for the given
95  * object.  Note that 'type' is not strictly necessary for fault injection, but
96  * is used when translating existing faults into a human-readable string.
97  *
98  *
99  * The command itself takes one of the forms:
100  *
101  *	zinject
102  *	zinject <-a | -u pool>
103  *	zinject -c <id|all>
104  *	zinject [-q] <-t type> [-f freq] [-u] [-a] [-m] [-e errno] [-l level]
105  *	    [-r range] <object>
106  *	zinject [-f freq] [-a] [-m] [-u] -b objset:object:level:start:end pool
107  *
108  * With no arguments, the command prints all currently registered injection
109  * handlers, with their numeric identifiers.
110  *
111  * The '-c' option will clear the given handler, or all handlers if 'all' is
112  * specified.
113  *
114  * The '-e' option takes a string describing the errno to simulate.  This must
115  * be one of 'io', 'checksum', or 'decrypt'.  In most cases this will result
116  * in the same behavior, but RAID-Z will produce a different set of ereports
117  * for this situation.
118  *
119  * The '-a', '-u', and '-m' flags toggle internal flush behavior.  If '-a' is
120  * specified, then the ARC cache is flushed appropriately.  If '-u' is
121  * specified, then the underlying SPA is unloaded.  Either of these flags can be
122  * specified independently of any other handlers.  The '-m' flag automatically
123  * does an unmount and remount of the underlying dataset to aid in flushing the
124  * cache.
125  *
 * The '-f' flag controls the frequency of errors injected, expressed as an
 * integer percentage between 1 and 100.  The default is 100.
128  *
 * The fourth form is responsible for actually injecting the handler into the
130  * framework.  It takes the arguments described above, translates them to the
131  * internal tuple using libzpool, and then issues an ioctl() to register the
132  * handler.
133  *
134  * The final form can target a specific bookmark, regardless of whether a
135  * human-readable interface has been designed.  It allows developers to specify
136  * a particular block by number.
137  */
138 
139 #include <errno.h>
140 #include <fcntl.h>
141 #include <stdio.h>
142 #include <stdlib.h>
143 #include <strings.h>
144 #include <unistd.h>
145 
146 #include <sys/fs/zfs.h>
147 #include <sys/mount.h>
148 
149 #include <libzfs.h>
150 
151 #undef verify	/* both libzfs.h and zfs_context.h want to define this */
152 
153 #include "zinject.h"
154 
/* libzfs library handle; assigned from libzfs_init() at the top of main(). */
libzfs_handle_t *g_zfs;
/* Descriptor for ZFS_DEV, opened in main(); every ioctl() here goes to it. */
int zfs_fd;

/* Errno value injected when the user asks for '-e checksum'. */
#define	ECKSUM	EBADE
159 
/*
 * Type names accepted by the '-t' and '-L' options.  name_to_type() returns
 * the index of the matching entry as an err_type_t, so the order of this
 * table presumably has to mirror the err_type_t enumeration, which is
 * declared outside this file -- confirm before reordering or adding entries.
 */
static const char *errtable[TYPE_INVAL] = {
	"data",
	"dnode",
	"mos",
	"mosdir",
	"metaslab",
	"config",
	"bpobj",
	"spacemap",
	"errlog",
	"uber",
	"nvlist",
	"pad1",
	"pad2"
};
175 
176 static err_type_t
177 name_to_type(const char *arg)
178 {
179 	int i;
180 	for (i = 0; i < TYPE_INVAL; i++)
181 		if (strcmp(errtable[i], arg) == 0)
182 			return (i);
183 
184 	return (TYPE_INVAL);
185 }
186 
187 static const char *
188 type_to_name(uint64_t type)
189 {
190 	switch (type) {
191 	case DMU_OT_OBJECT_DIRECTORY:
192 		return ("mosdir");
193 	case DMU_OT_OBJECT_ARRAY:
194 		return ("metaslab");
195 	case DMU_OT_PACKED_NVLIST:
196 		return ("config");
197 	case DMU_OT_BPOBJ:
198 		return ("bpobj");
199 	case DMU_OT_SPACE_MAP:
200 		return ("spacemap");
201 	case DMU_OT_ERROR_LOG:
202 		return ("errlog");
203 	default:
204 		return ("-");
205 	}
206 }
207 
208 
/*
 * Print the usage message to stdout.
 *
 * The text is emitted with a single printf() call.  Callers decide the exit
 * status themselves; this function only prints.
 */
void
usage(void)
{
	(void) printf(
	    "usage:\n"
	    "\n"
	    "\tzinject\n"
	    "\n"
	    "\t\tList all active injection records.\n"
	    "\n"
	    "\tzinject -c <id|all>\n"
	    "\n"
	    "\t\tClear the particular record (if given a numeric ID), or\n"
	    "\t\tall records if 'all' is specified.\n"
	    "\n"
	    "\tzinject -p <function name> pool\n"
	    "\n"
	    "\t\tInject a panic fault at the specified function. Only\n"
	    "\t\tfunctions which call spa_vdev_config_exit(), or\n"
	    "\t\tspa_vdev_exit() will trigger a panic.\n"
	    "\n"
	    "\tzinject -d device [-e errno] [-L <nvlist|uber|pad1|pad2>] [-F]\n"
	    "\t    [-T <read|write|free|claim|all>] pool\n"
	    "\n"
	    "\t\tInject a fault into a particular device or the device's\n"
	    "\t\tlabel.  Label injection can either be 'nvlist', 'uber',\n"
	    "\t\t'pad1', or 'pad2'.\n"
	    "\t\t'errno' can be 'nxio' (the default), 'io', or 'dtl'.\n"
	    "\n"
	    "\tzinject -d device -A <degrade|fault> pool\n"
	    "\n"
	    "\t\tPerform a specific action on a particular device\n"
	    "\n"
	    "\tzinject -d device -D latency:lanes pool\n"
	    "\n"
	    "\t\tAdd an artificial delay to IO requests on a particular\n"
	    "\t\tdevice, such that the requests take a minimum of 'latency'\n"
	    "\t\tmilliseconds to complete. Each delay has an associated\n"
	    "\t\tnumber of 'lanes' which defines the number of concurrent\n"
	    "\t\tIO requests that can be processed.\n"
	    "\n"
	    "\t\tFor example, with a single lane delay of 10 ms (-D 10:1),\n"
	    "\t\tthe device will only be able to service a single IO request\n"
	    "\t\tat a time with each request taking 10 ms to complete. So,\n"
	    "\t\tif only a single request is submitted every 10 ms, the\n"
	    "\t\taverage latency will be 10 ms; but if more than one request\n"
	    "\t\tis submitted every 10 ms, the average latency will be more\n"
	    "\t\tthan 10 ms.\n"
	    "\n"
	    "\t\tSimilarly, if a delay of 10 ms is specified to have two\n"
	    "\t\tlanes (-D 10:2), then the device will be able to service\n"
	    "\t\ttwo requests at a time, each with a minimum latency of\n"
	    "\t\t10 ms. So, if two requests are submitted every 10 ms, then\n"
	    "\t\tthe average latency will be 10 ms; but if more than two\n"
	    "\t\trequests are submitted every 10 ms, the average latency\n"
	    "\t\twill be more than 10 ms.\n"
	    "\n"
	    "\t\tAlso note, these delays are additive. So two invocations\n"
	    "\t\tof '-D 10:1', is roughly equivalent to a single invocation\n"
	    "\t\tof '-D 10:2'. This also means, one can specify multiple\n"
	    "\t\tlanes with differing target latencies. For example, an\n"
	    "\t\tinvocation of '-D 10:1' followed by '-D 25:2' will\n"
	    "\t\tcreate 3 lanes on the device; one lane with a latency\n"
	    "\t\tof 10 ms and two lanes with a 25 ms latency.\n"
	    "\n"
	    "\tzinject -I [-s <seconds> | -g <txgs>] pool\n"
	    "\n"
	    "\t\tCause the pool to stop writing blocks yet not\n"
	    "\t\treport errors for a duration.  Simulates buggy hardware\n"
	    "\t\tthat fails to honor cache flush requests.\n"
	    "\t\tDefault duration is 30 seconds.  The machine is panicked\n"
	    "\t\tat the end of the duration.\n"
	    "\n"
	    "\tzinject -b objset:object:level:blkid pool\n"
	    "\n"
	    "\t\tInject an error into pool 'pool' with the numeric bookmark\n"
	    "\t\tspecified by the remaining tuple.  Each number is in\n"
	    "\t\thexadecimal, and only one block can be specified.\n"
	    "\n"
	    "\tzinject [-q] <-t type> [-C dvas] [-e errno] [-l level]\n"
	    "\t\t[-r range] [-a] [-m] [-u] [-f freq] <object>\n"
	    "\n"
	    "\t\tInject an error into the object specified by the '-t' option\n"
	    "\t\tand the object descriptor.  The 'object' parameter is\n"
	    "\t\tinterpreted depending on the '-t' option.\n"
	    "\n"
	    "\t\t-q\tQuiet mode.  Only print out the handler number added.\n"
	    "\t\t-e\tInject a specific error.  Must be one of 'io', "
	    "'checksum',\n"
	    "\t\t\tor 'decrypt'.  Default is 'io'.\n"
	    "\t\t-C\tInject the given error only into specific DVAs. The\n"
	    "\t\t\tDVAs should be specified as a list of 0-indexed DVAs\n"
	    "\t\t\tseparated by commas (ex. '0,2').\n"
	    "\t\t-l\tInject error at a particular block level. Default is "
	    "0.\n"
	    "\t\t-m\tAutomatically remount underlying filesystem.\n"
	    "\t\t-r\tInject error over a particular logical range of an\n"
	    "\t\t\tobject.  Will be translated to the appropriate blkid\n"
	    "\t\t\trange according to the object's properties.\n"
	    "\t\t-a\tFlush the ARC cache.  Can be specified without any\n"
	    "\t\t\tassociated object.\n"
	    "\t\t-u\tUnload the associated pool.  Can be specified with only\n"
	    "\t\t\ta pool object.\n"
	    "\t\t-f\tOnly inject errors a fraction of the time.  Expressed as\n"
	    "\t\t\ta percentage between 1 and 100.\n"
	    "\n"
	    "\t-t data\t\tInject an error into the plain file contents of a\n"
	    "\t\t\tfile.  The object must be specified as a complete path\n"
	    "\t\t\tto a file on a ZFS filesystem.\n"
	    "\n"
	    "\t-t dnode\tInject an error into the metadnode in the block\n"
	    "\t\t\tcorresponding to the dnode for a file or directory.  The\n"
	    "\t\t\t'-r' option is incompatible with this mode.  The object\n"
	    "\t\t\tis specified as a complete path to a file or directory\n"
	    "\t\t\ton a ZFS filesystem.\n"
	    "\n"
	    "\t-t <mos>\tInject errors into the MOS for objects of the given\n"
	    "\t\t\ttype.  Valid types are: mos, mosdir, config, bpobj,\n"
	    "\t\t\tspacemap, metaslab, errlog.  The only valid <object> is\n"
	    "\t\t\tthe poolname.\n");
}
333 
334 static int
335 iter_handlers(int (*func)(int, const char *, zinject_record_t *, void *),
336     void *data)
337 {
338 	zfs_cmd_t zc = { 0 };
339 	int ret;
340 
341 	while (ioctl(zfs_fd, ZFS_IOC_INJECT_LIST_NEXT, &zc) == 0)
342 		if ((ret = func((int)zc.zc_guid, zc.zc_name,
343 		    &zc.zc_inject_record, data)) != 0)
344 			return (ret);
345 
346 	if (errno != ENOENT) {
347 		(void) fprintf(stderr, "Unable to list handlers: %s\n",
348 		    strerror(errno));
349 		return (-1);
350 	}
351 
352 	return (0);
353 }
354 
355 static int
356 print_data_handler(int id, const char *pool, zinject_record_t *record,
357     void *data)
358 {
359 	int *count = data;
360 
361 	if (record->zi_guid != 0 || record->zi_func[0] != '\0')
362 		return (0);
363 
364 	if (*count == 0) {
365 		(void) printf("%3s  %-15s  %-6s  %-6s  %-8s  %3s  %-4s  ",
366 		    "%-15s\n", "ID", "POOL", "OBJSET", "OBJECT", "TYPE",
367 		    "LVL", "DVAs", "RANGE");
368 		(void) printf("---  ---------------  ------  "
369 		    "------  --------  ---  ---- ----------------\n");
370 	}
371 
372 	*count += 1;
373 
374 	(void) printf("%3d  %-15s  %-6llu  %-6llu  %-8s  %-3d  0x%02x  ",
375 	    id, pool, (u_longlong_t)record->zi_objset,
376 	    (u_longlong_t)record->zi_object, type_to_name(record->zi_type),
377 	    record->zi_level, record->zi_dvas);
378 
379 	if (record->zi_start == 0 &&
380 	    record->zi_end == -1ULL)
381 		(void) printf("all\n");
382 	else
383 		(void) printf("[%llu, %llu]\n", (u_longlong_t)record->zi_start,
384 		    (u_longlong_t)record->zi_end);
385 
386 	return (0);
387 }
388 
389 static int
390 print_device_handler(int id, const char *pool, zinject_record_t *record,
391     void *data)
392 {
393 	int *count = data;
394 
395 	if (record->zi_guid == 0 || record->zi_func[0] != '\0')
396 		return (0);
397 
398 	if (record->zi_cmd == ZINJECT_DELAY_IO)
399 		return (0);
400 
401 	if (*count == 0) {
402 		(void) printf("%3s  %-15s  %s\n", "ID", "POOL", "GUID");
403 		(void) printf("---  ---------------  ----------------\n");
404 	}
405 
406 	*count += 1;
407 
408 	(void) printf("%3d  %-15s  %llx\n", id, pool,
409 	    (u_longlong_t)record->zi_guid);
410 
411 	return (0);
412 }
413 
414 static int
415 print_delay_handler(int id, const char *pool, zinject_record_t *record,
416     void *data)
417 {
418 	int *count = data;
419 
420 	if (record->zi_guid == 0 || record->zi_func[0] != '\0')
421 		return (0);
422 
423 	if (record->zi_cmd != ZINJECT_DELAY_IO)
424 		return (0);
425 
426 	if (*count == 0) {
427 		(void) printf("%3s  %-15s  %-15s  %-15s  %s\n",
428 		    "ID", "POOL", "DELAY (ms)", "LANES", "GUID");
429 		(void) printf("---  ---------------  ---------------  "
430 		    "---------------  ----------------\n");
431 	}
432 
433 	*count += 1;
434 
435 	(void) printf("%3d  %-15s  %-15llu  %-15llu  %llx\n", id, pool,
436 	    (u_longlong_t)NSEC2MSEC(record->zi_timer),
437 	    (u_longlong_t)record->zi_nlanes,
438 	    (u_longlong_t)record->zi_guid);
439 
440 	return (0);
441 }
442 
443 static int
444 print_panic_handler(int id, const char *pool, zinject_record_t *record,
445     void *data)
446 {
447 	int *count = data;
448 
449 	if (record->zi_func[0] == '\0')
450 		return (0);
451 
452 	if (*count == 0) {
453 		(void) printf("%3s  %-15s  %s\n", "ID", "POOL", "FUNCTION");
454 		(void) printf("---  ---------------  ----------------\n");
455 	}
456 
457 	*count += 1;
458 
459 	(void) printf("%3d  %-15s  %s\n", id, pool, record->zi_func);
460 
461 	return (0);
462 }
463 
464 /*
465  * Print all registered error handlers.  Returns the number of handlers
466  * registered.
467  */
468 static int
469 print_all_handlers(void)
470 {
471 	int count = 0, total = 0;
472 
473 	(void) iter_handlers(print_device_handler, &count);
474 	if (count > 0) {
475 		total += count;
476 		(void) printf("\n");
477 		count = 0;
478 	}
479 
480 	(void) iter_handlers(print_delay_handler, &count);
481 	if (count > 0) {
482 		total += count;
483 		(void) printf("\n");
484 		count = 0;
485 	}
486 
487 	(void) iter_handlers(print_data_handler, &count);
488 	if (count > 0) {
489 		total += count;
490 		(void) printf("\n");
491 		count = 0;
492 	}
493 
494 	(void) iter_handlers(print_panic_handler, &count);
495 
496 	return (count + total);
497 }
498 
499 /* ARGSUSED */
500 static int
501 cancel_one_handler(int id, const char *pool, zinject_record_t *record,
502     void *data)
503 {
504 	zfs_cmd_t zc = { 0 };
505 
506 	zc.zc_guid = (uint64_t)id;
507 
508 	if (ioctl(zfs_fd, ZFS_IOC_CLEAR_FAULT, &zc) != 0) {
509 		(void) fprintf(stderr, "failed to remove handler %d: %s\n",
510 		    id, strerror(errno));
511 		return (1);
512 	}
513 
514 	return (0);
515 }
516 
517 /*
518  * Remove all fault injection handlers.
519  */
520 static int
521 cancel_all_handlers(void)
522 {
523 	int ret = iter_handlers(cancel_one_handler, NULL);
524 
525 	if (ret == 0)
526 		(void) printf("removed all registered handlers\n");
527 
528 	return (ret);
529 }
530 
531 /*
532  * Remove a specific fault injection handler.
533  */
534 static int
535 cancel_handler(int id)
536 {
537 	zfs_cmd_t zc = { 0 };
538 
539 	zc.zc_guid = (uint64_t)id;
540 
541 	if (ioctl(zfs_fd, ZFS_IOC_CLEAR_FAULT, &zc) != 0) {
542 		(void) fprintf(stderr, "failed to remove handler %d: %s\n",
543 		    id, strerror(errno));
544 		return (1);
545 	}
546 
547 	(void) printf("removed handler %d\n", id);
548 
549 	return (0);
550 }
551 
552 /*
553  * Register a new fault injection handler.
554  */
555 static int
556 register_handler(const char *pool, int flags, zinject_record_t *record,
557     int quiet)
558 {
559 	zfs_cmd_t zc = { 0 };
560 
561 	(void) strcpy(zc.zc_name, pool);
562 	zc.zc_inject_record = *record;
563 	zc.zc_guid = flags;
564 
565 	if (ioctl(zfs_fd, ZFS_IOC_INJECT_FAULT, &zc) != 0) {
566 		(void) fprintf(stderr, "failed to add handler: %s\n",
567 		    strerror(errno));
568 		return (1);
569 	}
570 
571 	if (flags & ZINJECT_NULL)
572 		return (0);
573 
574 	if (quiet) {
575 		(void) printf("%llu\n", (u_longlong_t)zc.zc_guid);
576 	} else {
577 		(void) printf("Added handler %llu with the following "
578 		    "properties:\n", (u_longlong_t)zc.zc_guid);
579 		(void) printf("  pool: %s\n", pool);
580 		if (record->zi_guid) {
581 			(void) printf("  vdev: %llx\n",
582 			    (u_longlong_t)record->zi_guid);
583 		} else if (record->zi_func[0] != '\0') {
584 			(void) printf("  panic function: %s\n",
585 			    record->zi_func);
586 		} else if (record->zi_duration > 0) {
587 			(void) printf(" time: %lld seconds\n",
588 			    (u_longlong_t)record->zi_duration);
589 		} else if (record->zi_duration < 0) {
590 			(void) printf(" txgs: %lld \n",
591 			    (u_longlong_t)-record->zi_duration);
592 		} else {
593 			(void) printf("objset: %llu\n",
594 			    (u_longlong_t)record->zi_objset);
595 			(void) printf("object: %llu\n",
596 			    (u_longlong_t)record->zi_object);
597 			(void) printf("  type: %llu\n",
598 			    (u_longlong_t)record->zi_type);
599 			(void) printf(" level: %d\n", record->zi_level);
600 			if (record->zi_start == 0 &&
601 			    record->zi_end == -1ULL)
602 				(void) printf(" range: all\n");
603 			else
604 				(void) printf(" range: [%llu, %llu)\n",
605 				    (u_longlong_t)record->zi_start,
606 				    (u_longlong_t)record->zi_end);
607 			(void) printf("  dvas: 0x%x\n", record->zi_dvas);
608 		}
609 	}
610 
611 	return (0);
612 }
613 
614 int
615 perform_action(const char *pool, zinject_record_t *record, int cmd)
616 {
617 	zfs_cmd_t zc = { 0 };
618 
619 	ASSERT(cmd == VDEV_STATE_DEGRADED || cmd == VDEV_STATE_FAULTED);
620 	(void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name));
621 	zc.zc_guid = record->zi_guid;
622 	zc.zc_cookie = cmd;
623 
624 	if (ioctl(zfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
625 		return (0);
626 
627 	return (1);
628 }
629 
/*
 * Parse a '-D' argument of the form "latency:lanes".
 *
 *	str	the option argument; latency is in milliseconds
 *	delay	out: the latency converted to nanoseconds
 *	nlanes	out: the number of lanes
 *
 * Returns 0 on success and 1 if the string is malformed or the latency is
 * zero.  The outputs are written only on success.
 */
static int
parse_delay(char *str, uint64_t *delay, uint64_t *nlanes)
{
	unsigned long scan_delay;
	unsigned long scan_nlanes;
	char trailing;

	/*
	 * The %lu conversion accepts a leading '-' and silently wraps the
	 * value, so refuse any minus sign up front.
	 */
	if (strchr(str, '-') != NULL)
		return (1);

	/*
	 * The extra %c only converts if something follows the second
	 * number; requiring exactly two conversions therefore rejects
	 * trailing garbage such as "10:2xyz", in line with the '*end'
	 * checks applied to the other numeric options.
	 */
	if (sscanf(str, "%lu:%lu%c", &scan_delay, &scan_nlanes,
	    &trailing) != 2)
		return (1);

	/*
	 * We explicitly disallow a delay of zero here, because we key
	 * off this value being non-zero in translate_device(), to
	 * determine if the fault is a ZINJECT_DELAY_IO fault or not.
	 */
	if (scan_delay == 0)
		return (1);

	/*
	 * The units for the CLI delay parameter is milliseconds, but
	 * the data passed to the kernel is interpreted as nanoseconds.
	 * Thus we scale the milliseconds to nanoseconds here, and this
	 * nanosecond value is used to pass the delay to the kernel.
	 */
	*delay = MSEC2NSEC(scan_delay);
	*nlanes = scan_nlanes;

	return (0);
}
658 
/*
 * This function converts a string specifier for DVAs into a bit mask.
 * The dva's provided by the user should be 0 indexed and separated by
 * a comma. For example:
 *     "1"     -> 0b0010  (0x2)
 *     "0,1"   -> 0b0011  (0x3)
 *     "0,1,2" -> 0b0111  (0x7)
 *
 * Only the DVA indices 0, 1 and 2 are accepted, each at most once.  The
 * string must start and end with an index, and exactly one comma must sit
 * between consecutive indices.
 *
 * Returns 0 and stores the mask in *dvas_out on success.  Returns EINVAL
 * for any malformed string, in which case *dvas_out is left untouched.
 */
static int
parse_dvas(const char *str, uint32_t *dvas_out)
{
	const char *c;
	size_t len = strlen(str);
	uint32_t mask = 0;
	uint32_t bit;
	int need_delim = 0;	/* non-zero right after a DVA index */

	/* max string length is 5 ("0,1,2") */
	if (len == 0 || len > 5)
		return (EINVAL);

	for (c = str; *c != '\0'; c++) {
		switch (*c) {
		case '0':
		case '1':
		case '2':
			/* check for comma between DVAs */
			if (need_delim)
				return (EINVAL);

			/* check if this DVA has been set already */
			bit = 1U << (*c - '0');
			if (mask & bit)
				return (EINVAL);

			mask |= bit;
			need_delim = 1;
			break;
		case ',':
			/* a comma is only valid directly after an index */
			if (!need_delim)
				return (EINVAL);
			need_delim = 0;
			break;
		default:
			/* check for invalid character */
			return (EINVAL);
		}
	}

	/* check for dangling delimiter */
	if (!need_delim)
		return (EINVAL);

	*dvas_out = mask;
	return (0);
}
711 
712 int
713 main(int argc, char **argv)
714 {
715 	int c;
716 	char *range = NULL;
717 	char *cancel = NULL;
718 	char *end;
719 	char *raw = NULL;
720 	char *device = NULL;
721 	int level = 0;
722 	int quiet = 0;
723 	int error = 0;
724 	int domount = 0;
725 	int io_type = ZIO_TYPES;
726 	int action = VDEV_STATE_UNKNOWN;
727 	err_type_t type = TYPE_INVAL;
728 	err_type_t label = TYPE_INVAL;
729 	zinject_record_t record = { 0 };
730 	char pool[MAXNAMELEN];
731 	char dataset[MAXNAMELEN];
732 	zfs_handle_t *zhp;
733 	int nowrites = 0;
734 	int dur_txg = 0;
735 	int dur_secs = 0;
736 	int ret;
737 	int flags = 0;
738 	uint32_t dvas = 0;
739 
740 	if ((g_zfs = libzfs_init()) == NULL) {
741 		(void) fprintf(stderr, "internal error: failed to "
742 		    "initialize ZFS library\n");
743 		return (1);
744 	}
745 
746 	libzfs_print_on_error(g_zfs, B_TRUE);
747 
748 	if ((zfs_fd = open(ZFS_DEV, O_RDWR)) < 0) {
749 		(void) fprintf(stderr, "failed to open ZFS device\n");
750 		return (1);
751 	}
752 
753 	if (argc == 1) {
754 		/*
755 		 * No arguments.  Print the available handlers.  If there are no
756 		 * available handlers, direct the user to '-h' for help
757 		 * information.
758 		 */
759 		if (print_all_handlers() == 0) {
760 			(void) printf("No handlers registered.\n");
761 			(void) printf("Run 'zinject -h' for usage "
762 			    "information.\n");
763 		}
764 
765 		return (0);
766 	}
767 
768 	while ((c = getopt(argc, argv,
769 	    ":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
770 		switch (c) {
771 		case 'a':
772 			flags |= ZINJECT_FLUSH_ARC;
773 			break;
774 		case 'A':
775 			if (strcasecmp(optarg, "degrade") == 0) {
776 				action = VDEV_STATE_DEGRADED;
777 			} else if (strcasecmp(optarg, "fault") == 0) {
778 				action = VDEV_STATE_FAULTED;
779 			} else {
780 				(void) fprintf(stderr, "invalid action '%s': "
781 				    "must be 'degrade' or 'fault'\n", optarg);
782 				usage();
783 				return (1);
784 			}
785 			break;
786 		case 'b':
787 			raw = optarg;
788 			break;
789 		case 'c':
790 			cancel = optarg;
791 			break;
792 		case 'C':
793 			ret = parse_dvas(optarg, &dvas);
794 			if (ret != 0) {
795 				(void) fprintf(stderr, "invalid DVA list '%s': "
796 				    "DVAs should be 0 indexed and separated by "
797 				    "commas.\n", optarg);
798 				usage();
799 				libzfs_fini(g_zfs);
800 				return (1);
801 			}
802 			break;
803 		case 'd':
804 			device = optarg;
805 			break;
806 		case 'D':
807 			ret = parse_delay(optarg, &record.zi_timer,
808 			    &record.zi_nlanes);
809 			if (ret != 0) {
810 				(void) fprintf(stderr, "invalid i/o delay "
811 				    "value: '%s'\n", optarg);
812 				usage();
813 				return (1);
814 			}
815 			break;
816 		case 'e':
817 			if (strcasecmp(optarg, "io") == 0) {
818 				error = EIO;
819 			} else if (strcasecmp(optarg, "checksum") == 0) {
820 				error = ECKSUM;
821 			} else if (strcasecmp(optarg, "decrypt") == 0) {
822 				error = EACCES;
823 			} else if (strcasecmp(optarg, "nxio") == 0) {
824 				error = ENXIO;
825 			} else if (strcasecmp(optarg, "dtl") == 0) {
826 				error = ECHILD;
827 			} else {
828 				(void) fprintf(stderr, "invalid error type "
829 				    "'%s': must be 'io', 'checksum' or "
830 				    "'nxio'\n", optarg);
831 				usage();
832 				return (1);
833 			}
834 			break;
835 		case 'f':
836 			record.zi_freq = atoi(optarg);
837 			if (record.zi_freq < 1 || record.zi_freq > 100) {
838 				(void) fprintf(stderr, "frequency range must "
839 				    "be in the range (0, 100]\n");
840 				return (1);
841 			}
842 			break;
843 		case 'F':
844 			record.zi_failfast = B_TRUE;
845 			break;
846 		case 'g':
847 			dur_txg = 1;
848 			record.zi_duration = (int)strtol(optarg, &end, 10);
849 			if (record.zi_duration <= 0 || *end != '\0') {
850 				(void) fprintf(stderr, "invalid duration '%s': "
851 				    "must be a positive integer\n", optarg);
852 				usage();
853 				return (1);
854 			}
855 			/* store duration of txgs as its negative */
856 			record.zi_duration *= -1;
857 			break;
858 		case 'h':
859 			usage();
860 			return (0);
861 		case 'I':
862 			/* default duration, if one hasn't yet been defined */
863 			nowrites = 1;
864 			if (dur_secs == 0 && dur_txg == 0)
865 				record.zi_duration = 30;
866 			break;
867 		case 'l':
868 			level = (int)strtol(optarg, &end, 10);
869 			if (*end != '\0') {
870 				(void) fprintf(stderr, "invalid level '%s': "
871 				    "must be an integer\n", optarg);
872 				usage();
873 				return (1);
874 			}
875 			break;
876 		case 'm':
877 			domount = 1;
878 			break;
879 		case 'p':
880 			(void) strlcpy(record.zi_func, optarg,
881 			    sizeof (record.zi_func));
882 			record.zi_cmd = ZINJECT_PANIC;
883 			break;
884 		case 'q':
885 			quiet = 1;
886 			break;
887 		case 'r':
888 			range = optarg;
889 			break;
890 		case 's':
891 			dur_secs = 1;
892 			record.zi_duration = (int)strtol(optarg, &end, 10);
893 			if (record.zi_duration <= 0 || *end != '\0') {
894 				(void) fprintf(stderr, "invalid duration '%s': "
895 				    "must be a positive integer\n", optarg);
896 				usage();
897 				return (1);
898 			}
899 			break;
900 		case 'T':
901 			if (strcasecmp(optarg, "read") == 0) {
902 				io_type = ZIO_TYPE_READ;
903 			} else if (strcasecmp(optarg, "write") == 0) {
904 				io_type = ZIO_TYPE_WRITE;
905 			} else if (strcasecmp(optarg, "free") == 0) {
906 				io_type = ZIO_TYPE_FREE;
907 			} else if (strcasecmp(optarg, "claim") == 0) {
908 				io_type = ZIO_TYPE_CLAIM;
909 			} else if (strcasecmp(optarg, "all") == 0) {
910 				io_type = ZIO_TYPES;
911 			} else {
912 				(void) fprintf(stderr, "invalid I/O type "
913 				    "'%s': must be 'read', 'write', 'free', "
914 				    "'claim' or 'all'\n", optarg);
915 				usage();
916 				return (1);
917 			}
918 			break;
919 		case 't':
920 			if ((type = name_to_type(optarg)) == TYPE_INVAL &&
921 			    !MOS_TYPE(type)) {
922 				(void) fprintf(stderr, "invalid type '%s'\n",
923 				    optarg);
924 				usage();
925 				return (1);
926 			}
927 			break;
928 		case 'u':
929 			flags |= ZINJECT_UNLOAD_SPA;
930 			break;
931 		case 'L':
932 			if ((label = name_to_type(optarg)) == TYPE_INVAL &&
933 			    !LABEL_TYPE(type)) {
934 				(void) fprintf(stderr, "invalid label type "
935 				    "'%s'\n", optarg);
936 				usage();
937 				return (1);
938 			}
939 			break;
940 		case ':':
941 			(void) fprintf(stderr, "option -%c requires an "
942 			    "operand\n", optopt);
943 			usage();
944 			return (1);
945 		case '?':
946 			(void) fprintf(stderr, "invalid option '%c'\n",
947 			    optopt);
948 			usage();
949 			return (2);
950 		}
951 	}
952 
953 	argc -= optind;
954 	argv += optind;
955 
956 	if (record.zi_duration != 0)
957 		record.zi_cmd = ZINJECT_IGNORED_WRITES;
958 
959 	if (cancel != NULL) {
960 		/*
961 		 * '-c' is invalid with any other options.
962 		 */
963 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
964 		    level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED ||
965 		    record.zi_freq > 0 || dvas != 0) {
966 			(void) fprintf(stderr, "cancel (-c) incompatible with "
967 			    "any other options\n");
968 			usage();
969 			return (2);
970 		}
971 		if (argc != 0) {
972 			(void) fprintf(stderr, "extraneous argument to '-c'\n");
973 			usage();
974 			return (2);
975 		}
976 
977 		if (strcmp(cancel, "all") == 0) {
978 			return (cancel_all_handlers());
979 		} else {
980 			int id = (int)strtol(cancel, &end, 10);
981 			if (*end != '\0') {
982 				(void) fprintf(stderr, "invalid handle id '%s':"
983 				    " must be an integer or 'all'\n", cancel);
984 				usage();
985 				return (1);
986 			}
987 			return (cancel_handler(id));
988 		}
989 	}
990 
991 	if (device != NULL) {
992 		/*
993 		 * Device (-d) injection uses a completely different mechanism
994 		 * for doing injection, so handle it separately here.
995 		 */
996 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
997 		    level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED ||
998 		    dvas != 0) {
999 			(void) fprintf(stderr, "device (-d) incompatible with "
1000 			    "data error injection\n");
1001 			usage();
1002 			return (2);
1003 		}
1004 
1005 		if (argc != 1) {
1006 			(void) fprintf(stderr, "device (-d) injection requires "
1007 			    "a single pool name\n");
1008 			usage();
1009 			return (2);
1010 		}
1011 
1012 		(void) strcpy(pool, argv[0]);
1013 		dataset[0] = '\0';
1014 
1015 		if (error == ECKSUM) {
1016 			(void) fprintf(stderr, "device error type must be "
1017 			    "'io' or 'nxio'\n");
1018 			return (1);
1019 		}
1020 
1021 		record.zi_iotype = io_type;
1022 		if (translate_device(pool, device, label, &record) != 0)
1023 			return (1);
1024 		if (!error)
1025 			error = ENXIO;
1026 
1027 		if (action != VDEV_STATE_UNKNOWN)
1028 			return (perform_action(pool, &record, action));
1029 
1030 	} else if (raw != NULL) {
1031 		if (range != NULL || type != TYPE_INVAL || level != 0 ||
1032 		    record.zi_cmd != ZINJECT_UNINITIALIZED ||
1033 		    record.zi_freq > 0 || dvas != 0) {
1034 			(void) fprintf(stderr, "raw (-b) format with "
1035 			    "any other options\n");
1036 			usage();
1037 			return (2);
1038 		}
1039 
1040 		if (argc != 1) {
1041 			(void) fprintf(stderr, "raw (-b) format expects a "
1042 			    "single pool name\n");
1043 			usage();
1044 			return (2);
1045 		}
1046 
1047 		(void) strcpy(pool, argv[0]);
1048 		dataset[0] = '\0';
1049 
1050 		if (error == ENXIO) {
1051 			(void) fprintf(stderr, "data error type must be "
1052 			    "'checksum' or 'io'\n");
1053 			return (1);
1054 		}
1055 
1056 		record.zi_cmd = ZINJECT_DATA_FAULT;
1057 		if (translate_raw(raw, &record) != 0)
1058 			return (1);
1059 		if (!error)
1060 			error = EIO;
1061 	} else if (record.zi_cmd == ZINJECT_PANIC) {
1062 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
1063 		    level != 0 || device != NULL || record.zi_freq > 0 ||
1064 		    dvas != 0) {
1065 			(void) fprintf(stderr, "panic (-p) incompatible with "
1066 			    "other options\n");
1067 			usage();
1068 			return (2);
1069 		}
1070 
1071 		if (argc < 1 || argc > 2) {
1072 			(void) fprintf(stderr, "panic (-p) injection requires "
1073 			    "a single pool name and an optional id\n");
1074 			usage();
1075 			return (2);
1076 		}
1077 
1078 		(void) strcpy(pool, argv[0]);
1079 		if (argv[1] != NULL)
1080 			record.zi_type = atoi(argv[1]);
1081 		dataset[0] = '\0';
1082 	} else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) {
1083 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
1084 		    level != 0 || record.zi_freq > 0 || dvas != 0) {
1085 			(void) fprintf(stderr, "hardware failure (-I) "
1086 			    "incompatible with other options\n");
1087 			usage();
1088 			libzfs_fini(g_zfs);
1089 			return (2);
1090 		}
1091 
1092 		if (nowrites == 0) {
1093 			(void) fprintf(stderr, "-s or -g meaningless "
1094 			    "without -I (ignore writes)\n");
1095 			usage();
1096 			return (2);
1097 		} else if (dur_secs && dur_txg) {
1098 			(void) fprintf(stderr, "choose a duration either "
1099 			    "in seconds (-s) or a number of txgs (-g) "
1100 			    "but not both\n");
1101 			usage();
1102 			return (2);
1103 		} else if (argc != 1) {
1104 			(void) fprintf(stderr, "ignore writes (-I) "
1105 			    "injection requires a single pool name\n");
1106 			usage();
1107 			return (2);
1108 		}
1109 
1110 		(void) strcpy(pool, argv[0]);
1111 		dataset[0] = '\0';
1112 	} else if (type == TYPE_INVAL) {
1113 		if (flags == 0) {
1114 			(void) fprintf(stderr, "at least one of '-b', '-d', "
1115 			    "'-t', '-a', '-p', '-I' or '-u' "
1116 			    "must be specified\n");
1117 			usage();
1118 			return (2);
1119 		}
1120 
1121 		if (argc == 1 && (flags & ZINJECT_UNLOAD_SPA)) {
1122 			(void) strcpy(pool, argv[0]);
1123 			dataset[0] = '\0';
1124 		} else if (argc != 0) {
1125 			(void) fprintf(stderr, "extraneous argument for "
1126 			    "'-f'\n");
1127 			usage();
1128 			return (2);
1129 		}
1130 
1131 		flags |= ZINJECT_NULL;
1132 	} else {
1133 		if (argc != 1) {
1134 			(void) fprintf(stderr, "missing object\n");
1135 			usage();
1136 			return (2);
1137 		}
1138 
1139 		if (error == ENXIO) {
1140 			(void) fprintf(stderr, "data error type must be "
1141 			    "'checksum' or 'io'\n");
1142 			return (1);
1143 		}
1144 
1145 		if (dvas != 0) {
1146 			if (error == EACCES || error == EINVAL) {
1147 				(void) fprintf(stderr, "the '-C' option may "
1148 				    "not be used with logical data errors "
1149 				    "'decrypt' and 'decompress'\n");
1150 				record.zi_dvas = dvas;
1151 			}
1152 		}
1153 
1154 		record.zi_cmd = ZINJECT_DATA_FAULT;
1155 
1156 		if (error == EACCES) {
1157 			if (type != TYPE_DATA) {
1158 				(void) fprintf(stderr, "decryption errors "
1159 				    "may only be injected for 'data' types\n");
1160 				libzfs_fini(g_zfs);
1161 				return (1);
1162 			}
1163 
1164 			record.zi_cmd = ZINJECT_DECRYPT_FAULT;
1165 			/*
1166 			 * Internally, ZFS actually uses ECKSUM for decryption
1167 			 * errors since EACCES is used to indicate the key was
1168 			 * not found.
1169 			 */
1170 			error = ECKSUM;
1171 		}
1172 
1173 		if (translate_record(type, argv[0], range, level, &record, pool,
1174 		    dataset) != 0)
1175 			return (1);
1176 		if (!error)
1177 			error = EIO;
1178 	}
1179 
1180 	/*
1181 	 * If this is pool-wide metadata, unmount everything.  The ioctl() will
1182 	 * unload the pool, so that we trigger spa-wide reopen of metadata next
1183 	 * time we access the pool.
1184 	 */
1185 	if (dataset[0] != '\0' && domount) {
1186 		if ((zhp = zfs_open(g_zfs, dataset, ZFS_TYPE_DATASET)) == NULL)
1187 			return (1);
1188 
1189 		if (zfs_unmount(zhp, NULL, 0) != 0)
1190 			return (1);
1191 	}
1192 
1193 	record.zi_error = error;
1194 
1195 	ret = register_handler(pool, flags, &record, quiet);
1196 
1197 	if (dataset[0] != '\0' && domount)
1198 		ret = (zfs_mount(zhp, NULL, 0) != 0);
1199 
1200 	libzfs_fini(g_zfs);
1201 
1202 	return (ret);
1203 }
1204