xref: /illumos-gate/usr/src/cmd/zinject/zinject.c (revision 0bc0887e1cf0f912077b83256f295ad0ed1c715c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24  */
25 
26 /*
27  * ZFS Fault Injector
28  *
29  * This userland component takes a set of options and uses libzpool to translate
30  * from a user-visible object type and name to an internal representation.
31  * There are two basic types of faults: device faults and data faults.
32  *
33  *
34  * DEVICE FAULTS
35  *
36  * Errors can be injected into a particular vdev using the '-d' option.  This
37  * option takes a path or vdev GUID to uniquely identify the device within a
38  * pool.  There are two types of errors that can be injected, EIO and ENXIO,
39  * that can be controlled through the '-e' option.  The default is ENXIO.  For
40  * EIO failures, any attempt to read data from the device will return EIO, but
41  * subsequent attempts to reopen the device will succeed.  For ENXIO failures,
42  * any attempt to read from the device will return EIO, and any attempt to
43  * reopen the device will return ENXIO.
44  * For label faults, the -L option must be specified. This allows faults
45  * to be injected into either the nvlist, uberblock, pad1, or pad2 region
46  * of all the labels for the specified device.
47  *
48  * This form of the command looks like:
49  *
50  *	zinject -d device [-e errno] [-L <uber | nvlist | pad1 | pad2>] pool
51  *
52  *
53  * DATA FAULTS
54  *
55  * We begin with a tuple of the form:
56  *
57  *	<type,level,range,object>
58  *
59  *	type	A string describing the type of data to target.  Each type
60  *		implicitly describes how to interpret 'object'. Currently,
61  *		the following values are supported:
62  *
63  *		data		User data for a file
64  *		dnode		Dnode for a file or directory
65  *
66  *		The following MOS objects are special.  Instead of injecting
67  *		errors on a particular object or blkid, we inject errors across
68  *		all objects of the given type.
69  *
70  *		mos		Any data in the MOS
71  *		mosdir		object directory
72  *		config		pool configuration
73  *		bpobj		blkptr list
74  *		spacemap	spacemap
75  *		metaslab	metaslab
76  *		errlog		persistent error log
77  *
78  *	level	Object level.  Defaults to '0', not applicable to all types.  If
79  *		a range is given, this corresponds to the indirect block
80  *		corresponding to the specific range.
81  *
82  *	range	A numerical range [start,end) within the object.  Defaults to
83  *		the full size of the file.
84  *
85  *	object	A string describing the logical location of the object.  For
86  *		files and directories (currently the only supported types),
87  *		this is the path of the object on disk.
88  *
89  * This is translated, via libzpool, into the following internal representation:
90  *
91  *	<type,objset,object,level,range>
92  *
93  * These types should be self-explanatory.  This tuple is then passed to the
94  * kernel via a special ioctl() to initiate fault injection for the given
95  * object.  Note that 'type' is not strictly necessary for fault injection, but
96  * is used when translating existing faults into a human-readable string.
97  *
98  *
99  * The command itself takes one of the forms:
100  *
101  *	zinject
102  *	zinject <-a | -u pool>
103  *	zinject -c <id|all>
104  *	zinject [-q] <-t type> [-f freq] [-u] [-a] [-m] [-e errno] [-l level]
105  *	    [-r range] <object>
106  *	zinject [-f freq] [-a] [-m] [-u] -b objset:object:level:start:end pool
107  *
108  * With no arguments, the command prints all currently registered injection
109  * handlers, with their numeric identifiers.
110  *
111  * The '-c' option will clear the given handler, or all handlers if 'all' is
112  * specified.
113  *
114  * The '-e' option takes a string describing the errno to simulate.  This must
115  * be either 'io' or 'checksum'.  In most cases this will result in the same
116  * behavior, but RAID-Z will produce a different set of ereports for this
117  * situation.
118  *
119  * The '-a', '-u', and '-m' flags toggle internal flush behavior.  If '-a' is
120  * specified, then the ARC cache is flushed appropriately.  If '-u' is
121  * specified, then the underlying SPA is unloaded.  Either of these flags can be
122  * specified independently of any other handlers.  The '-m' flag automatically
123  * does an unmount and remount of the underlying dataset to aid in flushing the
124  * cache.
125  *
126  * The '-f' flag controls the frequency of errors injected, expressed as an
127  * integer percentage between 1 and 100.  The default is 100.
128  *
129  * The '-t' form is responsible for actually injecting the handler into the
130  * framework.  It takes the arguments described above, translates them to the
131  * internal tuple using libzpool, and then issues an ioctl() to register the
132  * handler.
133  *
134  * The final form can target a specific bookmark, regardless of whether a
135  * human-readable interface has been designed.  It allows developers to specify
136  * a particular block by number.
137  */
138 
139 #include <errno.h>
140 #include <fcntl.h>
141 #include <stdio.h>
142 #include <stdlib.h>
143 #include <strings.h>
144 #include <unistd.h>
145 
146 #include <sys/fs/zfs.h>
147 #include <sys/mount.h>
148 
149 #include <libzfs.h>
150 
151 #undef verify	/* both libzfs.h and zfs_context.h want to define this */
152 
153 #include "zinject.h"
154 
155 libzfs_handle_t *g_zfs;
156 int zfs_fd;
157 
158 #define	ECKSUM	EBADE
159 
160 static const char *errtable[TYPE_INVAL] = {
161 	"data",
162 	"dnode",
163 	"mos",
164 	"mosdir",
165 	"metaslab",
166 	"config",
167 	"bpobj",
168 	"spacemap",
169 	"errlog",
170 	"uber",
171 	"nvlist",
172 	"pad1",
173 	"pad2"
174 };
175 
176 static err_type_t
177 name_to_type(const char *arg)
178 {
179 	int i;
180 	for (i = 0; i < TYPE_INVAL; i++)
181 		if (strcmp(errtable[i], arg) == 0)
182 			return (i);
183 
184 	return (TYPE_INVAL);
185 }
186 
187 static const char *
188 type_to_name(uint64_t type)
189 {
190 	switch (type) {
191 	case DMU_OT_OBJECT_DIRECTORY:
192 		return ("mosdir");
193 	case DMU_OT_OBJECT_ARRAY:
194 		return ("metaslab");
195 	case DMU_OT_PACKED_NVLIST:
196 		return ("config");
197 	case DMU_OT_BPOBJ:
198 		return ("bpobj");
199 	case DMU_OT_SPACE_MAP:
200 		return ("spacemap");
201 	case DMU_OT_ERROR_LOG:
202 		return ("errlog");
203 	default:
204 		return ("-");
205 	}
206 }
207 
208 
/*
 * Print usage message.
 *
 * The whole help text is written to stdout with a single printf() call.
 * The text is kept in sync with the option parsing in main(): the '-e'
 * values listed for data faults are the ones main() accepts.
 */
void
usage(void)
{
	(void) printf(
	    "usage:\n"
	    "\n"
	    "\tzinject\n"
	    "\n"
	    "\t\tList all active injection records.\n"
	    "\n"
	    "\tzinject -c <id|all>\n"
	    "\n"
	    "\t\tClear the particular record (if given a numeric ID), or\n"
	    "\t\tall records if 'all' is specified.\n"
	    "\n"
	    "\tzinject -p <function name> pool\n"
	    "\n"
	    "\t\tInject a panic fault at the specified function. Only\n"
	    "\t\tfunctions which call spa_vdev_config_exit(), or\n"
	    "\t\tspa_vdev_exit() will trigger a panic.\n"
	    "\n"
	    "\tzinject -d device [-e errno] [-L <nvlist|uber|pad1|pad2>] [-F]\n"
	    "\t    [-T <read|write|free|claim|all>] pool\n"
	    "\n"
	    "\t\tInject a fault into a particular device or the device's\n"
	    "\t\tlabel.  Label injection can either be 'nvlist', 'uber',\n"
	    "\t\t'pad1', or 'pad2'.\n"
	    "\t\t'errno' can be 'nxio' (the default), 'io', or 'dtl'.\n"
	    "\n"
	    "\tzinject -d device -A <degrade|fault> pool\n"
	    "\n"
	    "\t\tPerform a specific action on a particular device\n"
	    "\n"
	    "\tzinject -d device -D latency:lanes pool\n"
	    "\n"
	    "\t\tAdd an artificial delay to IO requests on a particular\n"
	    "\t\tdevice, such that the requests take a minimum of 'latency'\n"
	    "\t\tmilliseconds to complete. Each delay has an associated\n"
	    "\t\tnumber of 'lanes' which defines the number of concurrent\n"
	    "\t\tIO requests that can be processed.\n"
	    "\n"
	    "\t\tFor example, with a single lane delay of 10 ms (-D 10:1),\n"
	    "\t\tthe device will only be able to service a single IO request\n"
	    "\t\tat a time with each request taking 10 ms to complete. So,\n"
	    "\t\tif only a single request is submitted every 10 ms, the\n"
	    "\t\taverage latency will be 10 ms; but if more than one request\n"
	    "\t\tis submitted every 10 ms, the average latency will be more\n"
	    "\t\tthan 10 ms.\n"
	    "\n"
	    "\t\tSimilarly, if a delay of 10 ms is specified to have two\n"
	    "\t\tlanes (-D 10:2), then the device will be able to service\n"
	    "\t\ttwo requests at a time, each with a minimum latency of\n"
	    "\t\t10 ms. So, if two requests are submitted every 10 ms, then\n"
	    "\t\tthe average latency will be 10 ms; but if more than two\n"
	    "\t\trequests are submitted every 10 ms, the average latency\n"
	    "\t\twill be more than 10 ms.\n"
	    "\n"
	    "\t\tAlso note, these delays are additive. So two invocations\n"
	    "\t\tof '-D 10:1', is roughly equivalent to a single invocation\n"
	    "\t\tof '-D 10:2'. This also means, one can specify multiple\n"
	    "\t\tlanes with differing target latencies. For example, an\n"
	    "\t\tinvocation of '-D 10:1' followed by '-D 25:2' will\n"
	    "\t\tcreate 3 lanes on the device; one lane with a latency\n"
	    "\t\tof 10 ms and two lanes with a 25 ms latency.\n"
	    "\n"
	    "\tzinject -I [-s <seconds> | -g <txgs>] pool\n"
	    "\n"
	    "\t\tCause the pool to stop writing blocks yet not\n"
	    "\t\treport errors for a duration.  Simulates buggy hardware\n"
	    "\t\tthat fails to honor cache flush requests.\n"
	    "\t\tDefault duration is 30 seconds.  The machine is panicked\n"
	    "\t\tat the end of the duration.\n"
	    "\n"
	    "\tzinject -b objset:object:level:blkid pool\n"
	    "\n"
	    "\t\tInject an error into pool 'pool' with the numeric bookmark\n"
	    "\t\tspecified by the remaining tuple.  Each number is in\n"
	    "\t\thexadecimal, and only one block can be specified.\n"
	    "\n"
	    "\tzinject [-q] <-t type> [-C dvas] [-e errno] [-l level]\n"
	    "\t\t[-r range] [-a] [-m] [-u] [-f freq] <object>\n"
	    "\n"
	    "\t\tInject an error into the object specified by the '-t' option\n"
	    "\t\tand the object descriptor.  The 'object' parameter is\n"
	    "\t\tinterpreted depending on the '-t' option.\n"
	    "\n"
	    "\t\t-q\tQuiet mode.  Only print out the handler number added.\n"
	    "\t\t-e\tInject a specific error.  Must be either 'io' or\n"
	    "\t\t\t'checksum'.  Default is 'io'.\n"
	    "\t\t-C\tInject the given error only into specific DVAs. The\n"
	    "\t\t\tDVAs should be specified as a list of 0-indexed DVAs\n"
	    "\t\t\tseparated by commas (ex. '0,2').\n"
	    "\t\t-l\tInject error at a particular block level. Default is "
	    "0.\n"
	    "\t\t-m\tAutomatically remount underlying filesystem.\n"
	    "\t\t-r\tInject error over a particular logical range of an\n"
	    "\t\t\tobject.  Will be translated to the appropriate blkid\n"
	    "\t\t\trange according to the object's properties.\n"
	    "\t\t-a\tFlush the ARC cache.  Can be specified without any\n"
	    "\t\t\tassociated object.\n"
	    "\t\t-u\tUnload the associated pool.  Can be specified with only\n"
	    "\t\t\ta pool object.\n"
	    "\t\t-f\tOnly inject errors a fraction of the time.  Expressed as\n"
	    "\t\t\ta percentage between 1 and 100.\n"
	    "\n"
	    "\t-t data\t\tInject an error into the plain file contents of a\n"
	    "\t\t\tfile.  The object must be specified as a complete path\n"
	    "\t\t\tto a file on a ZFS filesystem.\n"
	    "\n"
	    "\t-t dnode\tInject an error into the metadnode in the block\n"
	    "\t\t\tcorresponding to the dnode for a file or directory.  The\n"
	    "\t\t\t'-r' option is incompatible with this mode.  The object\n"
	    "\t\t\tis specified as a complete path to a file or directory\n"
	    "\t\t\ton a ZFS filesystem.\n"
	    "\n"
	    "\t-t <mos>\tInject errors into the MOS for objects of the given\n"
	    "\t\t\ttype.  Valid types are: mos, mosdir, config, bpobj,\n"
	    "\t\t\tspacemap, metaslab, errlog.  The only valid <object> is\n"
	    "\t\t\tthe poolname.\n");
}
332 
333 static int
334 iter_handlers(int (*func)(int, const char *, zinject_record_t *, void *),
335     void *data)
336 {
337 	zfs_cmd_t zc = { 0 };
338 	int ret;
339 
340 	while (ioctl(zfs_fd, ZFS_IOC_INJECT_LIST_NEXT, &zc) == 0)
341 		if ((ret = func((int)zc.zc_guid, zc.zc_name,
342 		    &zc.zc_inject_record, data)) != 0)
343 			return (ret);
344 
345 	if (errno != ENOENT) {
346 		(void) fprintf(stderr, "Unable to list handlers: %s\n",
347 		    strerror(errno));
348 		return (-1);
349 	}
350 
351 	return (0);
352 }
353 
354 static int
355 print_data_handler(int id, const char *pool, zinject_record_t *record,
356     void *data)
357 {
358 	int *count = data;
359 
360 	if (record->zi_guid != 0 || record->zi_func[0] != '\0')
361 		return (0);
362 
363 	if (*count == 0) {
364 		(void) printf("%3s  %-15s  %-6s  %-6s  %-8s  %3s  %-4s  ",
365 		    "%-15s\n", "ID", "POOL", "OBJSET", "OBJECT", "TYPE",
366 		    "LVL", "DVAs", "RANGE");
367 		(void) printf("---  ---------------  ------  "
368 		    "------  --------  ---  ---- ----------------\n");
369 	}
370 
371 	*count += 1;
372 
373 	(void) printf("%3d  %-15s  %-6llu  %-6llu  %-8s  %-3d  0x%02x  ",
374 	    id, pool, (u_longlong_t)record->zi_objset,
375 	    (u_longlong_t)record->zi_object, type_to_name(record->zi_type),
376 	    record->zi_level, record->zi_dvas);
377 
378 	if (record->zi_start == 0 &&
379 	    record->zi_end == -1ULL)
380 		(void) printf("all\n");
381 	else
382 		(void) printf("[%llu, %llu]\n", (u_longlong_t)record->zi_start,
383 		    (u_longlong_t)record->zi_end);
384 
385 	return (0);
386 }
387 
388 static int
389 print_device_handler(int id, const char *pool, zinject_record_t *record,
390     void *data)
391 {
392 	int *count = data;
393 
394 	if (record->zi_guid == 0 || record->zi_func[0] != '\0')
395 		return (0);
396 
397 	if (record->zi_cmd == ZINJECT_DELAY_IO)
398 		return (0);
399 
400 	if (*count == 0) {
401 		(void) printf("%3s  %-15s  %s\n", "ID", "POOL", "GUID");
402 		(void) printf("---  ---------------  ----------------\n");
403 	}
404 
405 	*count += 1;
406 
407 	(void) printf("%3d  %-15s  %llx\n", id, pool,
408 	    (u_longlong_t)record->zi_guid);
409 
410 	return (0);
411 }
412 
413 static int
414 print_delay_handler(int id, const char *pool, zinject_record_t *record,
415     void *data)
416 {
417 	int *count = data;
418 
419 	if (record->zi_guid == 0 || record->zi_func[0] != '\0')
420 		return (0);
421 
422 	if (record->zi_cmd != ZINJECT_DELAY_IO)
423 		return (0);
424 
425 	if (*count == 0) {
426 		(void) printf("%3s  %-15s  %-15s  %-15s  %s\n",
427 		    "ID", "POOL", "DELAY (ms)", "LANES", "GUID");
428 		(void) printf("---  ---------------  ---------------  "
429 		    "---------------  ----------------\n");
430 	}
431 
432 	*count += 1;
433 
434 	(void) printf("%3d  %-15s  %-15llu  %-15llu  %llx\n", id, pool,
435 	    (u_longlong_t)NSEC2MSEC(record->zi_timer),
436 	    (u_longlong_t)record->zi_nlanes,
437 	    (u_longlong_t)record->zi_guid);
438 
439 	return (0);
440 }
441 
442 static int
443 print_panic_handler(int id, const char *pool, zinject_record_t *record,
444     void *data)
445 {
446 	int *count = data;
447 
448 	if (record->zi_func[0] == '\0')
449 		return (0);
450 
451 	if (*count == 0) {
452 		(void) printf("%3s  %-15s  %s\n", "ID", "POOL", "FUNCTION");
453 		(void) printf("---  ---------------  ----------------\n");
454 	}
455 
456 	*count += 1;
457 
458 	(void) printf("%3d  %-15s  %s\n", id, pool, record->zi_func);
459 
460 	return (0);
461 }
462 
463 /*
464  * Print all registered error handlers.  Returns the number of handlers
465  * registered.
466  */
467 static int
468 print_all_handlers(void)
469 {
470 	int count = 0, total = 0;
471 
472 	(void) iter_handlers(print_device_handler, &count);
473 	if (count > 0) {
474 		total += count;
475 		(void) printf("\n");
476 		count = 0;
477 	}
478 
479 	(void) iter_handlers(print_delay_handler, &count);
480 	if (count > 0) {
481 		total += count;
482 		(void) printf("\n");
483 		count = 0;
484 	}
485 
486 	(void) iter_handlers(print_data_handler, &count);
487 	if (count > 0) {
488 		total += count;
489 		(void) printf("\n");
490 		count = 0;
491 	}
492 
493 	(void) iter_handlers(print_panic_handler, &count);
494 
495 	return (count + total);
496 }
497 
498 /* ARGSUSED */
499 static int
500 cancel_one_handler(int id, const char *pool, zinject_record_t *record,
501     void *data)
502 {
503 	zfs_cmd_t zc = { 0 };
504 
505 	zc.zc_guid = (uint64_t)id;
506 
507 	if (ioctl(zfs_fd, ZFS_IOC_CLEAR_FAULT, &zc) != 0) {
508 		(void) fprintf(stderr, "failed to remove handler %d: %s\n",
509 		    id, strerror(errno));
510 		return (1);
511 	}
512 
513 	return (0);
514 }
515 
516 /*
517  * Remove all fault injection handlers.
518  */
519 static int
520 cancel_all_handlers(void)
521 {
522 	int ret = iter_handlers(cancel_one_handler, NULL);
523 
524 	if (ret == 0)
525 		(void) printf("removed all registered handlers\n");
526 
527 	return (ret);
528 }
529 
530 /*
531  * Remove a specific fault injection handler.
532  */
533 static int
534 cancel_handler(int id)
535 {
536 	zfs_cmd_t zc = { 0 };
537 
538 	zc.zc_guid = (uint64_t)id;
539 
540 	if (ioctl(zfs_fd, ZFS_IOC_CLEAR_FAULT, &zc) != 0) {
541 		(void) fprintf(stderr, "failed to remove handler %d: %s\n",
542 		    id, strerror(errno));
543 		return (1);
544 	}
545 
546 	(void) printf("removed handler %d\n", id);
547 
548 	return (0);
549 }
550 
551 /*
552  * Register a new fault injection handler.
553  */
554 static int
555 register_handler(const char *pool, int flags, zinject_record_t *record,
556     int quiet)
557 {
558 	zfs_cmd_t zc = { 0 };
559 
560 	(void) strcpy(zc.zc_name, pool);
561 	zc.zc_inject_record = *record;
562 	zc.zc_guid = flags;
563 
564 	if (ioctl(zfs_fd, ZFS_IOC_INJECT_FAULT, &zc) != 0) {
565 		(void) fprintf(stderr, "failed to add handler: %s\n",
566 		    strerror(errno));
567 		return (1);
568 	}
569 
570 	if (flags & ZINJECT_NULL)
571 		return (0);
572 
573 	if (quiet) {
574 		(void) printf("%llu\n", (u_longlong_t)zc.zc_guid);
575 	} else {
576 		(void) printf("Added handler %llu with the following "
577 		    "properties:\n", (u_longlong_t)zc.zc_guid);
578 		(void) printf("  pool: %s\n", pool);
579 		if (record->zi_guid) {
580 			(void) printf("  vdev: %llx\n",
581 			    (u_longlong_t)record->zi_guid);
582 		} else if (record->zi_func[0] != '\0') {
583 			(void) printf("  panic function: %s\n",
584 			    record->zi_func);
585 		} else if (record->zi_duration > 0) {
586 			(void) printf(" time: %lld seconds\n",
587 			    (u_longlong_t)record->zi_duration);
588 		} else if (record->zi_duration < 0) {
589 			(void) printf(" txgs: %lld \n",
590 			    (u_longlong_t)-record->zi_duration);
591 		} else {
592 			(void) printf("objset: %llu\n",
593 			    (u_longlong_t)record->zi_objset);
594 			(void) printf("object: %llu\n",
595 			    (u_longlong_t)record->zi_object);
596 			(void) printf("  type: %llu\n",
597 			    (u_longlong_t)record->zi_type);
598 			(void) printf(" level: %d\n", record->zi_level);
599 			if (record->zi_start == 0 &&
600 			    record->zi_end == -1ULL)
601 				(void) printf(" range: all\n");
602 			else
603 				(void) printf(" range: [%llu, %llu)\n",
604 				    (u_longlong_t)record->zi_start,
605 				    (u_longlong_t)record->zi_end);
606 			(void) printf("  dvas: 0x%x\n", record->zi_dvas);
607 		}
608 	}
609 
610 	return (0);
611 }
612 
613 int
614 perform_action(const char *pool, zinject_record_t *record, int cmd)
615 {
616 	zfs_cmd_t zc = { 0 };
617 
618 	ASSERT(cmd == VDEV_STATE_DEGRADED || cmd == VDEV_STATE_FAULTED);
619 	(void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name));
620 	zc.zc_guid = record->zi_guid;
621 	zc.zc_cookie = cmd;
622 
623 	if (ioctl(zfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
624 		return (0);
625 
626 	return (1);
627 }
628 
/*
 * Parse a "latency:lanes" delay specification.
 *
 *	str	the specification, e.g. "10:2"; both fields are unsigned
 *		decimal numbers and nothing may follow the second one
 *	delay	out: the latency, converted from milliseconds to nanoseconds
 *	nlanes	out: the number of lanes
 *
 * Returns 0 on success.  Returns 1, leaving the outputs untouched, if the
 * string is malformed, a number is out of range, or the latency is zero.
 */
static int
parse_delay(char *str, uint64_t *delay, uint64_t *nlanes)
{
	unsigned long long scan_delay;
	unsigned long long scan_nlanes;
	unsigned long long max_delay;
	char *end;

	/*
	 * strtoull() accepts leading whitespace and a sign (a negative
	 * number silently wraps), so insist that each field starts with
	 * a digit.
	 */
	if (str[0] < '0' || str[0] > '9')
		return (1);

	errno = 0;
	scan_delay = strtoull(str, &end, 10);
	if (errno != 0 || *end != ':')
		return (1);

	end++;
	if (end[0] < '0' || end[0] > '9')
		return (1);

	scan_nlanes = strtoull(end, &end, 10);
	if (errno != 0 || *end != '\0')
		return (1);

	/*
	 * We explicitly disallow a delay of zero here, because we key
	 * off this value being non-zero in translate_device(), to
	 * determine if the fault is a ZINJECT_DELAY_IO fault or not.
	 */
	if (scan_delay == 0)
		return (1);

	/*
	 * Reject latencies whose nanosecond value would not fit in a
	 * signed 64-bit quantity.  NOTE(review): this assumes MSEC2NSEC()
	 * may compute in a signed 64-bit type; the limit is merely
	 * conservative if it does not.
	 */
	max_delay = (~0ULL >> 1) / (unsigned long long)MSEC2NSEC(1);
	if (scan_delay > max_delay)
		return (1);

	/*
	 * The units for the CLI delay parameter is milliseconds, but
	 * the data passed to the kernel is interpreted as nanoseconds.
	 * Thus we scale the milliseconds to nanoseconds here, and this
	 * nanosecond value is used to pass the delay to the kernel.
	 */
	*delay = MSEC2NSEC(scan_delay);
	*nlanes = scan_nlanes;

	return (0);
}
657 
/*
 * This function converts a string specifier for DVAs into a bit mask.
 * The dva's provided by the user should be 0 indexed and separated by
 * a comma. For example:
 *     "1"     -> 0b0010  (0x2)
 *     "0,1"   -> 0b0011  (0x3)
 *     "0,1,2" -> 0b0111  (0x7)
 *
 * Only the indices 0, 1 and 2 are accepted, each at most once.  The list
 * must start and end with an index and contain exactly one comma between
 * consecutive indices.
 *
 * Returns 0 and stores the mask in *dvas_out on success.  Returns EINVAL,
 * leaving *dvas_out untouched, for any malformed list.
 */
static int
parse_dvas(const char *str, uint32_t *dvas_out)
{
	const char *c;
	uint32_t mask = 0;
	uint32_t bit;
	int need_delim = 0;	/* non-zero right after a DVA index */
	size_t len = strlen(str);

	/* max string length is 5 ("0,1,2") */
	if (len == 0 || len > 5)
		return (EINVAL);

	for (c = str; *c != '\0'; c++) {
		if (*c == ',') {
			/*
			 * A comma is only valid directly after an index;
			 * this rejects a leading comma and empty fields.
			 */
			if (!need_delim)
				return (EINVAL);
			need_delim = 0;
			continue;
		}

		/* check for invalid character */
		if (*c < '0' || *c > '2')
			return (EINVAL);

		/* check for a missing comma between DVAs */
		if (need_delim)
			return (EINVAL);

		/* check if this DVA has been set already */
		bit = 1U << (*c - '0');
		if (mask & bit)
			return (EINVAL);

		mask |= bit;
		need_delim = 1;
	}

	/* check for dangling delimiter */
	if (!need_delim)
		return (EINVAL);

	*dvas_out = mask;
	return (0);
}
710 
711 int
712 main(int argc, char **argv)
713 {
714 	int c;
715 	char *range = NULL;
716 	char *cancel = NULL;
717 	char *end;
718 	char *raw = NULL;
719 	char *device = NULL;
720 	int level = 0;
721 	int quiet = 0;
722 	int error = 0;
723 	int domount = 0;
724 	int io_type = ZIO_TYPES;
725 	int action = VDEV_STATE_UNKNOWN;
726 	err_type_t type = TYPE_INVAL;
727 	err_type_t label = TYPE_INVAL;
728 	zinject_record_t record = { 0 };
729 	char pool[MAXNAMELEN];
730 	char dataset[MAXNAMELEN];
731 	zfs_handle_t *zhp;
732 	int nowrites = 0;
733 	int dur_txg = 0;
734 	int dur_secs = 0;
735 	int ret;
736 	int flags = 0;
737 	uint32_t dvas = 0;
738 
739 	if ((g_zfs = libzfs_init()) == NULL) {
740 		(void) fprintf(stderr, "internal error: failed to "
741 		    "initialize ZFS library\n");
742 		return (1);
743 	}
744 
745 	libzfs_print_on_error(g_zfs, B_TRUE);
746 
747 	if ((zfs_fd = open(ZFS_DEV, O_RDWR)) < 0) {
748 		(void) fprintf(stderr, "failed to open ZFS device\n");
749 		return (1);
750 	}
751 
752 	if (argc == 1) {
753 		/*
754 		 * No arguments.  Print the available handlers.  If there are no
755 		 * available handlers, direct the user to '-h' for help
756 		 * information.
757 		 */
758 		if (print_all_handlers() == 0) {
759 			(void) printf("No handlers registered.\n");
760 			(void) printf("Run 'zinject -h' for usage "
761 			    "information.\n");
762 		}
763 
764 		return (0);
765 	}
766 
767 	while ((c = getopt(argc, argv,
768 	    ":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
769 		switch (c) {
770 		case 'a':
771 			flags |= ZINJECT_FLUSH_ARC;
772 			break;
773 		case 'A':
774 			if (strcasecmp(optarg, "degrade") == 0) {
775 				action = VDEV_STATE_DEGRADED;
776 			} else if (strcasecmp(optarg, "fault") == 0) {
777 				action = VDEV_STATE_FAULTED;
778 			} else {
779 				(void) fprintf(stderr, "invalid action '%s': "
780 				    "must be 'degrade' or 'fault'\n", optarg);
781 				usage();
782 				return (1);
783 			}
784 			break;
785 		case 'b':
786 			raw = optarg;
787 			break;
788 		case 'c':
789 			cancel = optarg;
790 			break;
791 		case 'C':
792 			ret = parse_dvas(optarg, &dvas);
793 			if (ret != 0) {
794 				(void) fprintf(stderr, "invalid DVA list '%s': "
795 				    "DVAs should be 0 indexed and separated by "
796 				    "commas.\n", optarg);
797 				usage();
798 				libzfs_fini(g_zfs);
799 				return (1);
800 			}
801 			break;
802 		case 'd':
803 			device = optarg;
804 			break;
805 		case 'D':
806 			ret = parse_delay(optarg, &record.zi_timer,
807 			    &record.zi_nlanes);
808 			if (ret != 0) {
809 				(void) fprintf(stderr, "invalid i/o delay "
810 				    "value: '%s'\n", optarg);
811 				usage();
812 				return (1);
813 			}
814 			break;
815 		case 'e':
816 			if (strcasecmp(optarg, "io") == 0) {
817 				error = EIO;
818 			} else if (strcasecmp(optarg, "checksum") == 0) {
819 				error = ECKSUM;
820 			} else if (strcasecmp(optarg, "nxio") == 0) {
821 				error = ENXIO;
822 			} else if (strcasecmp(optarg, "dtl") == 0) {
823 				error = ECHILD;
824 			} else {
825 				(void) fprintf(stderr, "invalid error type "
826 				    "'%s': must be 'io', 'checksum' or "
827 				    "'nxio'\n", optarg);
828 				usage();
829 				return (1);
830 			}
831 			break;
832 		case 'f':
833 			record.zi_freq = atoi(optarg);
834 			if (record.zi_freq < 1 || record.zi_freq > 100) {
835 				(void) fprintf(stderr, "frequency range must "
836 				    "be in the range (0, 100]\n");
837 				return (1);
838 			}
839 			break;
840 		case 'F':
841 			record.zi_failfast = B_TRUE;
842 			break;
843 		case 'g':
844 			dur_txg = 1;
845 			record.zi_duration = (int)strtol(optarg, &end, 10);
846 			if (record.zi_duration <= 0 || *end != '\0') {
847 				(void) fprintf(stderr, "invalid duration '%s': "
848 				    "must be a positive integer\n", optarg);
849 				usage();
850 				return (1);
851 			}
852 			/* store duration of txgs as its negative */
853 			record.zi_duration *= -1;
854 			break;
855 		case 'h':
856 			usage();
857 			return (0);
858 		case 'I':
859 			/* default duration, if one hasn't yet been defined */
860 			nowrites = 1;
861 			if (dur_secs == 0 && dur_txg == 0)
862 				record.zi_duration = 30;
863 			break;
864 		case 'l':
865 			level = (int)strtol(optarg, &end, 10);
866 			if (*end != '\0') {
867 				(void) fprintf(stderr, "invalid level '%s': "
868 				    "must be an integer\n", optarg);
869 				usage();
870 				return (1);
871 			}
872 			break;
873 		case 'm':
874 			domount = 1;
875 			break;
876 		case 'p':
877 			(void) strlcpy(record.zi_func, optarg,
878 			    sizeof (record.zi_func));
879 			record.zi_cmd = ZINJECT_PANIC;
880 			break;
881 		case 'q':
882 			quiet = 1;
883 			break;
884 		case 'r':
885 			range = optarg;
886 			break;
887 		case 's':
888 			dur_secs = 1;
889 			record.zi_duration = (int)strtol(optarg, &end, 10);
890 			if (record.zi_duration <= 0 || *end != '\0') {
891 				(void) fprintf(stderr, "invalid duration '%s': "
892 				    "must be a positive integer\n", optarg);
893 				usage();
894 				return (1);
895 			}
896 			break;
897 		case 'T':
898 			if (strcasecmp(optarg, "read") == 0) {
899 				io_type = ZIO_TYPE_READ;
900 			} else if (strcasecmp(optarg, "write") == 0) {
901 				io_type = ZIO_TYPE_WRITE;
902 			} else if (strcasecmp(optarg, "free") == 0) {
903 				io_type = ZIO_TYPE_FREE;
904 			} else if (strcasecmp(optarg, "claim") == 0) {
905 				io_type = ZIO_TYPE_CLAIM;
906 			} else if (strcasecmp(optarg, "all") == 0) {
907 				io_type = ZIO_TYPES;
908 			} else {
909 				(void) fprintf(stderr, "invalid I/O type "
910 				    "'%s': must be 'read', 'write', 'free', "
911 				    "'claim' or 'all'\n", optarg);
912 				usage();
913 				return (1);
914 			}
915 			break;
916 		case 't':
917 			if ((type = name_to_type(optarg)) == TYPE_INVAL &&
918 			    !MOS_TYPE(type)) {
919 				(void) fprintf(stderr, "invalid type '%s'\n",
920 				    optarg);
921 				usage();
922 				return (1);
923 			}
924 			break;
925 		case 'u':
926 			flags |= ZINJECT_UNLOAD_SPA;
927 			break;
928 		case 'L':
929 			if ((label = name_to_type(optarg)) == TYPE_INVAL &&
930 			    !LABEL_TYPE(type)) {
931 				(void) fprintf(stderr, "invalid label type "
932 				    "'%s'\n", optarg);
933 				usage();
934 				return (1);
935 			}
936 			break;
937 		case ':':
938 			(void) fprintf(stderr, "option -%c requires an "
939 			    "operand\n", optopt);
940 			usage();
941 			return (1);
942 		case '?':
943 			(void) fprintf(stderr, "invalid option '%c'\n",
944 			    optopt);
945 			usage();
946 			return (2);
947 		}
948 	}
949 
950 	argc -= optind;
951 	argv += optind;
952 
953 	if (record.zi_duration != 0)
954 		record.zi_cmd = ZINJECT_IGNORED_WRITES;
955 
956 	if (cancel != NULL) {
957 		/*
958 		 * '-c' is invalid with any other options.
959 		 */
960 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
961 		    level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED ||
962 		    record.zi_freq > 0 || dvas != 0) {
963 			(void) fprintf(stderr, "cancel (-c) incompatible with "
964 			    "any other options\n");
965 			usage();
966 			return (2);
967 		}
968 		if (argc != 0) {
969 			(void) fprintf(stderr, "extraneous argument to '-c'\n");
970 			usage();
971 			return (2);
972 		}
973 
974 		if (strcmp(cancel, "all") == 0) {
975 			return (cancel_all_handlers());
976 		} else {
977 			int id = (int)strtol(cancel, &end, 10);
978 			if (*end != '\0') {
979 				(void) fprintf(stderr, "invalid handle id '%s':"
980 				    " must be an integer or 'all'\n", cancel);
981 				usage();
982 				return (1);
983 			}
984 			return (cancel_handler(id));
985 		}
986 	}
987 
988 	if (device != NULL) {
989 		/*
990 		 * Device (-d) injection uses a completely different mechanism
991 		 * for doing injection, so handle it separately here.
992 		 */
993 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
994 		    level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED ||
995 		    dvas != 0) {
996 			(void) fprintf(stderr, "device (-d) incompatible with "
997 			    "data error injection\n");
998 			usage();
999 			return (2);
1000 		}
1001 
1002 		if (argc != 1) {
1003 			(void) fprintf(stderr, "device (-d) injection requires "
1004 			    "a single pool name\n");
1005 			usage();
1006 			return (2);
1007 		}
1008 
1009 		(void) strcpy(pool, argv[0]);
1010 		dataset[0] = '\0';
1011 
1012 		if (error == ECKSUM) {
1013 			(void) fprintf(stderr, "device error type must be "
1014 			    "'io' or 'nxio'\n");
1015 			return (1);
1016 		}
1017 
1018 		record.zi_iotype = io_type;
1019 		if (translate_device(pool, device, label, &record) != 0)
1020 			return (1);
1021 		if (!error)
1022 			error = ENXIO;
1023 
1024 		if (action != VDEV_STATE_UNKNOWN)
1025 			return (perform_action(pool, &record, action));
1026 
1027 	} else if (raw != NULL) {
1028 		if (range != NULL || type != TYPE_INVAL || level != 0 ||
1029 		    record.zi_cmd != ZINJECT_UNINITIALIZED ||
1030 		    record.zi_freq > 0 || dvas != 0) {
1031 			(void) fprintf(stderr, "raw (-b) format with "
1032 			    "any other options\n");
1033 			usage();
1034 			return (2);
1035 		}
1036 
1037 		if (argc != 1) {
1038 			(void) fprintf(stderr, "raw (-b) format expects a "
1039 			    "single pool name\n");
1040 			usage();
1041 			return (2);
1042 		}
1043 
1044 		(void) strcpy(pool, argv[0]);
1045 		dataset[0] = '\0';
1046 
1047 		if (error == ENXIO) {
1048 			(void) fprintf(stderr, "data error type must be "
1049 			    "'checksum' or 'io'\n");
1050 			return (1);
1051 		}
1052 
1053 		record.zi_cmd = ZINJECT_DATA_FAULT;
1054 		if (translate_raw(raw, &record) != 0)
1055 			return (1);
1056 		if (!error)
1057 			error = EIO;
1058 	} else if (record.zi_cmd == ZINJECT_PANIC) {
1059 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
1060 		    level != 0 || device != NULL || record.zi_freq > 0 ||
1061 		    dvas != 0) {
1062 			(void) fprintf(stderr, "panic (-p) incompatible with "
1063 			    "other options\n");
1064 			usage();
1065 			return (2);
1066 		}
1067 
1068 		if (argc < 1 || argc > 2) {
1069 			(void) fprintf(stderr, "panic (-p) injection requires "
1070 			    "a single pool name and an optional id\n");
1071 			usage();
1072 			return (2);
1073 		}
1074 
1075 		(void) strcpy(pool, argv[0]);
1076 		if (argv[1] != NULL)
1077 			record.zi_type = atoi(argv[1]);
1078 		dataset[0] = '\0';
1079 	} else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) {
1080 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
1081 		    level != 0 || record.zi_freq > 0 || dvas != 0) {
1082 			(void) fprintf(stderr, "hardware failure (-I) "
1083 			    "incompatible with other options\n");
1084 			usage();
1085 			libzfs_fini(g_zfs);
1086 			return (2);
1087 		}
1088 
1089 		if (nowrites == 0) {
1090 			(void) fprintf(stderr, "-s or -g meaningless "
1091 			    "without -I (ignore writes)\n");
1092 			usage();
1093 			return (2);
1094 		} else if (dur_secs && dur_txg) {
1095 			(void) fprintf(stderr, "choose a duration either "
1096 			    "in seconds (-s) or a number of txgs (-g) "
1097 			    "but not both\n");
1098 			usage();
1099 			return (2);
1100 		} else if (argc != 1) {
1101 			(void) fprintf(stderr, "ignore writes (-I) "
1102 			    "injection requires a single pool name\n");
1103 			usage();
1104 			return (2);
1105 		}
1106 
1107 		(void) strcpy(pool, argv[0]);
1108 		dataset[0] = '\0';
1109 	} else if (type == TYPE_INVAL) {
1110 		if (flags == 0) {
1111 			(void) fprintf(stderr, "at least one of '-b', '-d', "
1112 			    "'-t', '-a', '-p', '-I' or '-u' "
1113 			    "must be specified\n");
1114 			usage();
1115 			return (2);
1116 		}
1117 
1118 		if (argc == 1 && (flags & ZINJECT_UNLOAD_SPA)) {
1119 			(void) strcpy(pool, argv[0]);
1120 			dataset[0] = '\0';
1121 		} else if (argc != 0) {
1122 			(void) fprintf(stderr, "extraneous argument for "
1123 			    "'-f'\n");
1124 			usage();
1125 			return (2);
1126 		}
1127 
1128 		flags |= ZINJECT_NULL;
1129 	} else {
1130 		if (argc != 1) {
1131 			(void) fprintf(stderr, "missing object\n");
1132 			usage();
1133 			return (2);
1134 		}
1135 
1136 		if (error == ENXIO) {
1137 			(void) fprintf(stderr, "data error type must be "
1138 			    "'checksum' or 'io'\n");
1139 			return (1);
1140 		}
1141 
1142 		if (dvas != 0) {
1143 			if (error == EACCES || error == EINVAL) {
1144 				(void) fprintf(stderr, "the '-C' option may "
1145 				    "not be used with logical data errors "
1146 				    "'decrypt' and 'decompress'\n");
1147 				libzfs_fini(g_zfs);
1148 				return (1);
1149 			}
1150 
1151 			record.zi_dvas = dvas;
1152 		}
1153 
1154 		record.zi_cmd = ZINJECT_DATA_FAULT;
1155 		if (translate_record(type, argv[0], range, level, &record, pool,
1156 		    dataset) != 0)
1157 			return (1);
1158 		if (!error)
1159 			error = EIO;
1160 	}
1161 
1162 	/*
1163 	 * If this is pool-wide metadata, unmount everything.  The ioctl() will
1164 	 * unload the pool, so that we trigger spa-wide reopen of metadata next
1165 	 * time we access the pool.
1166 	 */
1167 	if (dataset[0] != '\0' && domount) {
1168 		if ((zhp = zfs_open(g_zfs, dataset, ZFS_TYPE_DATASET)) == NULL)
1169 			return (1);
1170 
1171 		if (zfs_unmount(zhp, NULL, 0) != 0)
1172 			return (1);
1173 	}
1174 
1175 	record.zi_error = error;
1176 
1177 	ret = register_handler(pool, flags, &record, quiet);
1178 
1179 	if (dataset[0] != '\0' && domount)
1180 		ret = (zfs_mount(zhp, NULL, 0) != 0);
1181 
1182 	libzfs_fini(g_zfs);
1183 
1184 	return (ret);
1185 }
1186