xref: /freebsd/sys/contrib/openzfs/cmd/zinject/zinject.c (revision 7be9a3b45356747f9fcb6d69a722c1c95f8060bf)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24  * Copyright (c) 2017, Intel Corporation.
25  */
26 
27 /*
28  * ZFS Fault Injector
29  *
30  * This userland component takes a set of options and uses libzpool to translate
31  * from a user-visible object type and name to an internal representation.
32  * There are two basic types of faults: device faults and data faults.
33  *
34  *
35  * DEVICE FAULTS
36  *
37  * Errors can be injected into a particular vdev using the '-d' option.  This
38  * option takes a path or vdev GUID to uniquely identify the device within a
39  * pool.  There are four types of errors that can be injected: EIO, ENXIO,
40  * ECHILD, and EILSEQ.  These can be controlled through the '-e' option and the
41  * default is ENXIO.  For EIO failures, any attempt to read data from the device
42  * will return EIO, but a subsequent attempt to reopen the device will succeed.
43  * For ENXIO failures, any attempt to read from the device will return EIO, but
44  * any attempt to reopen the device will also return ENXIO.  The EILSEQ failures
45  * only apply to read operations (-T read) and will flip a bit after the device
46  * has read the original data.
47  *
48  * For label faults, the -L option must be specified. This allows faults
49  * to be injected into either the nvlist, uberblock, pad1, or pad2 region
50  * of all the labels for the specified device.
51  *
52  * This form of the command looks like:
53  *
54  * 	zinject -d device [-e errno] [-L <uber | nvlist | pad1 | pad2>] pool
55  *
56  *
57  * DATA FAULTS
58  *
59  * We begin with a tuple of the form:
60  *
61  * 	<type,level,range,object>
62  *
63  * 	type	A string describing the type of data to target.  Each type
64  * 		implicitly describes how to interpret 'object'. Currently,
65  * 		the following values are supported:
66  *
67  * 		data		User data for a file
68  * 		dnode		Dnode for a file or directory
69  *
70  *		The following MOS objects are special.  Instead of injecting
71  *		errors on a particular object or blkid, we inject errors across
72  *		all objects of the given type.
73  *
74  * 		mos		Any data in the MOS
75  * 		mosdir		object directory
76  * 		config		pool configuration
77  * 		bpobj		blkptr list
78  * 		spacemap	spacemap
79  * 		metaslab	metaslab
80  * 		errlog		persistent error log
81  *
82  * 	level	Object level.  Defaults to '0', not applicable to all types.  If
83  * 		a range is given, this corresponds to the indirect block
84  * 		corresponding to the specific range.
85  *
86  *	range	A numerical range [start,end) within the object.  Defaults to
87  *		the full size of the file.
88  *
89  * 	object	A string describing the logical location of the object.  For
90  * 		files and directories (currently the only supported types),
91  * 		this is the path of the object on disk.
92  *
93  * This is translated, via libzpool, into the following internal representation:
94  *
95  * 	<type,objset,object,level,range>
96  *
97  * These types should be self-explanatory.  This tuple is then passed to the
98  * kernel via a special ioctl() to initiate fault injection for the given
99  * object.  Note that 'type' is not strictly necessary for fault injection, but
100  * is used when translating existing faults into a human-readable string.
101  *
102  *
103  * The command itself takes one of the forms:
104  *
105  * 	zinject
106  * 	zinject <-a | -u pool>
107  * 	zinject -c <id|all>
108  * 	zinject [-q] <-t type> [-f freq] [-u] [-a] [-m] [-e errno] [-l level]
109  *	    [-r range] <object>
110  * 	zinject [-f freq] [-a] [-m] [-u] -b objset:object:level:start:end pool
111  *
112  * With no arguments, the command prints all currently registered injection
113  * handlers, with their numeric identifiers.
114  *
115  * The '-c' option will clear the given handler, or all handlers if 'all' is
116  * specified.
117  *
118  * The '-e' option takes a string describing the errno to simulate.  This must
119  * be one of 'io', 'checksum', 'decompress', or 'decrypt'.  In most cases this
120  * will result in the same behavior, but RAID-Z will produce a different set of
121  * ereports for this situation.
122  *
123  * The '-a', '-u', and '-m' flags toggle internal flush behavior.  If '-a' is
124  * specified, then the ARC cache is flushed appropriately.  If '-u' is
125  * specified, then the underlying SPA is unloaded.  Either of these flags can be
126  * specified independently of any other handlers.  The '-m' flag automatically
127  * does an unmount and remount of the underlying dataset to aid in flushing the
128  * cache.
129  *
130  * The '-f' flag controls the frequency of errors injected, expressed as a
131  * real number percentage between 0.0001 and 100.  The default is 100.
132  *
133  * The fourth form is responsible for actually injecting the handler into the
134  * framework.  It takes the arguments described above, translates them to the
135  * internal tuple using libzpool, and then issues an ioctl() to register the
136  * handler.
137  *
138  * The final form can target a specific bookmark, regardless of whether a
139  * human-readable interface has been designed.  It allows developers to specify
140  * a particular block by number.
141  */
142 
143 #include <errno.h>
144 #include <fcntl.h>
145 #include <stdio.h>
146 #include <stdlib.h>
147 #include <strings.h>
148 #include <unistd.h>
149 
150 #include <sys/fs/zfs.h>
151 #include <sys/mount.h>
152 
153 #include <libzfs.h>
154 
155 #undef verify	/* both libzfs.h and zfs_context.h want to define this */
156 
157 #include "zinject.h"
158 
/* libzfs handle; set from libzfs_init() in main() and passed to zfs_ioctl(). */
libzfs_handle_t *g_zfs;
/* File descriptor for ZFS_DEV, opened O_RDWR in main(). */
int zfs_fd;
161 
/*
 * User-visible names of the error and label types.  name_to_type() returns
 * the index of the matching entry as an err_type_t, so the order of the
 * entries presumably has to mirror the err_type_t enum (declared outside
 * this file) -- confirm before reordering or adding entries.
 */
static const char *errtable[TYPE_INVAL] = {
	"data",
	"dnode",
	"mos",
	"mosdir",
	"metaslab",
	"config",
	"bpobj",
	"spacemap",
	"errlog",
	"uber",
	"nvlist",
	"pad1",
	"pad2"
};
177 
178 static err_type_t
179 name_to_type(const char *arg)
180 {
181 	int i;
182 	for (i = 0; i < TYPE_INVAL; i++)
183 		if (strcmp(errtable[i], arg) == 0)
184 			return (i);
185 
186 	return (TYPE_INVAL);
187 }
188 
189 static const char *
190 type_to_name(uint64_t type)
191 {
192 	switch (type) {
193 	case DMU_OT_OBJECT_DIRECTORY:
194 		return ("mosdir");
195 	case DMU_OT_OBJECT_ARRAY:
196 		return ("metaslab");
197 	case DMU_OT_PACKED_NVLIST:
198 		return ("config");
199 	case DMU_OT_BPOBJ:
200 		return ("bpobj");
201 	case DMU_OT_SPACE_MAP:
202 		return ("spacemap");
203 	case DMU_OT_ERROR_LOG:
204 		return ("errlog");
205 	default:
206 		return ("-");
207 	}
208 }
209 
210 
/*
 * Print the full usage message to stdout.
 *
 * The text is emitted with a single printf() call built from adjacent
 * string literals; it takes no arguments and returns nothing.
 */
void
usage(void)
{
	(void) printf(
	    "usage:\n"
	    "\n"
	    "\tzinject\n"
	    "\n"
	    "\t\tList all active injection records.\n"
	    "\n"
	    "\tzinject -c <id|all>\n"
	    "\n"
	    "\t\tClear the particular record (if given a numeric ID), or\n"
	    "\t\tall records if 'all' is specified.\n"
	    "\n"
	    "\tzinject -p <function name> pool\n"
	    "\t\tInject a panic fault at the specified function. Only \n"
	    "\t\tfunctions which call spa_vdev_config_exit(), or \n"
	    "\t\tspa_vdev_exit() will trigger a panic.\n"
	    "\n"
	    "\tzinject -d device [-e errno] [-L <nvlist|uber|pad1|pad2>] [-F]\n"
	    "\t\t[-T <read|write|free|claim|all>] [-f frequency] pool\n\n"
	    "\t\tInject a fault into a particular device or the device's\n"
	    /* no blank after the newline: it would misalign the next line */
	    "\t\tlabel.  Label injection can either be 'nvlist', 'uber',\n"
	    "\t\t'pad1', or 'pad2'.\n"
	    "\t\t'errno' can be 'nxio' (the default), 'io', 'dtl', or\n"
	    "\t\t'corrupt' (bit flip).\n"
	    "\t\t'frequency' is a value between 0.0001 and 100.0 that limits\n"
	    "\t\tdevice error injection to a percentage of the IOs.\n"
	    "\n"
	    "\tzinject -d device -A <degrade|fault> -D <delay secs> pool\n"
	    "\t\tPerform a specific action on a particular device.\n"
	    "\n"
	    "\tzinject -d device -D latency:lanes pool\n"
	    "\n"
	    "\t\tAdd an artificial delay to IO requests on a particular\n"
	    "\t\tdevice, such that the requests take a minimum of 'latency'\n"
	    "\t\tmilliseconds to complete. Each delay has an associated\n"
	    "\t\tnumber of 'lanes' which defines the number of concurrent\n"
	    "\t\tIO requests that can be processed.\n"
	    "\n"
	    "\t\tFor example, with a single lane delay of 10 ms (-D 10:1),\n"
	    "\t\tthe device will only be able to service a single IO request\n"
	    "\t\tat a time with each request taking 10 ms to complete. So,\n"
	    "\t\tif only a single request is submitted every 10 ms, the\n"
	    "\t\taverage latency will be 10 ms; but if more than one request\n"
	    "\t\tis submitted every 10 ms, the average latency will be more\n"
	    "\t\tthan 10 ms.\n"
	    "\n"
	    "\t\tSimilarly, if a delay of 10 ms is specified to have two\n"
	    "\t\tlanes (-D 10:2), then the device will be able to service\n"
	    "\t\ttwo requests at a time, each with a minimum latency of\n"
	    "\t\t10 ms. So, if two requests are submitted every 10 ms, then\n"
	    "\t\tthe average latency will be 10 ms; but if more than two\n"
	    "\t\trequests are submitted every 10 ms, the average latency\n"
	    "\t\twill be more than 10 ms.\n"
	    "\n"
	    "\t\tAlso note, these delays are additive. So two invocations\n"
	    "\t\tof '-D 10:1', is roughly equivalent to a single invocation\n"
	    "\t\tof '-D 10:2'. This also means, one can specify multiple\n"
	    "\t\tlanes with differing target latencies. For example, an\n"
	    "\t\tinvocation of '-D 10:1' followed by '-D 25:2' will\n"
	    "\t\tcreate 3 lanes on the device; one lane with a latency\n"
	    "\t\tof 10 ms and two lanes with a 25 ms latency.\n"
	    "\n"
	    "\tzinject -I [-s <seconds> | -g <txgs>] pool\n"
	    "\t\tCause the pool to stop writing blocks yet not\n"
	    "\t\treport errors for a duration.  Simulates buggy hardware\n"
	    "\t\tthat fails to honor cache flush requests.\n"
	    "\t\tDefault duration is 30 seconds.  The machine is panicked\n"
	    "\t\tat the end of the duration.\n"
	    "\n"
	    "\tzinject -b objset:object:level:blkid pool\n"
	    "\n"
	    "\t\tInject an error into pool 'pool' with the numeric bookmark\n"
	    "\t\tspecified by the remaining tuple.  Each number is in\n"
	    "\t\thexadecimal, and only one block can be specified.\n"
	    "\n"
	    "\tzinject [-q] <-t type> [-C dvas] [-e errno] [-l level]\n"
	    "\t\t[-r range] [-a] [-m] [-u] [-f freq] <object>\n"
	    "\n"
	    "\t\tInject an error into the object specified by the '-t' option\n"
	    "\t\tand the object descriptor.  The 'object' parameter is\n"
	    "\t\tinterpreted depending on the '-t' option.\n"
	    "\n"
	    "\t\t-q\tQuiet mode.  Only print out the handler number added.\n"
	    "\t\t-e\tInject a specific error.  Must be one of 'io',\n"
	    "\t\t\t'checksum', 'decompress', or 'decrypt'.  Default is 'io'.\n"
	    "\t\t-C\tInject the given error only into specific DVAs. The\n"
	    "\t\t\tDVAs should be specified as a list of 0-indexed DVAs\n"
	    "\t\t\tseparated by commas (ex. '0,2').\n"
	    "\t\t-l\tInject error at a particular block level. Default is "
	    "0.\n"
	    "\t\t-m\tAutomatically remount underlying filesystem.\n"
	    "\t\t-r\tInject error over a particular logical range of an\n"
	    "\t\t\tobject.  Will be translated to the appropriate blkid\n"
	    "\t\t\trange according to the object's properties.\n"
	    "\t\t-a\tFlush the ARC cache.  Can be specified without any\n"
	    "\t\t\tassociated object.\n"
	    "\t\t-u\tUnload the associated pool.  Can be specified with only\n"
	    "\t\t\ta pool object.\n"
	    "\t\t-f\tOnly inject errors a fraction of the time.  Expressed as\n"
	    "\t\t\ta percentage between 0.0001 and 100.\n"
	    "\n"
	    "\t-t data\t\tInject an error into the plain file contents of a\n"
	    "\t\t\tfile.  The object must be specified as a complete path\n"
	    "\t\t\tto a file on a ZFS filesystem.\n"
	    "\n"
	    "\t-t dnode\tInject an error into the metadnode in the block\n"
	    "\t\t\tcorresponding to the dnode for a file or directory.  The\n"
	    "\t\t\t'-r' option is incompatible with this mode.  The object\n"
	    "\t\t\tis specified as a complete path to a file or directory\n"
	    "\t\t\ton a ZFS filesystem.\n"
	    "\n"
	    "\t-t <mos>\tInject errors into the MOS for objects of the given\n"
	    "\t\t\ttype.  Valid types are: mos, mosdir, config, bpobj,\n"
	    "\t\t\tspacemap, metaslab, errlog.  The only valid <object> is\n"
	    "\t\t\tthe poolname.\n");
}
333 
334 static int
335 iter_handlers(int (*func)(int, const char *, zinject_record_t *, void *),
336     void *data)
337 {
338 	zfs_cmd_t zc = {"\0"};
339 	int ret;
340 
341 	while (zfs_ioctl(g_zfs, ZFS_IOC_INJECT_LIST_NEXT, &zc) == 0)
342 		if ((ret = func((int)zc.zc_guid, zc.zc_name,
343 		    &zc.zc_inject_record, data)) != 0)
344 			return (ret);
345 
346 	if (errno != ENOENT) {
347 		(void) fprintf(stderr, "Unable to list handlers: %s\n",
348 		    strerror(errno));
349 		return (-1);
350 	}
351 
352 	return (0);
353 }
354 
355 static int
356 print_data_handler(int id, const char *pool, zinject_record_t *record,
357     void *data)
358 {
359 	int *count = data;
360 
361 	if (record->zi_guid != 0 || record->zi_func[0] != '\0')
362 		return (0);
363 
364 	if (*count == 0) {
365 		(void) printf("%3s  %-15s  %-6s  %-6s  %-8s  %3s  %-4s  "
366 		    "%-15s\n", "ID", "POOL", "OBJSET", "OBJECT", "TYPE",
367 		    "LVL", "DVAs", "RANGE");
368 		(void) printf("---  ---------------  ------  "
369 		    "------  --------  ---  ----  ---------------\n");
370 	}
371 
372 	*count += 1;
373 
374 	(void) printf("%3d  %-15s  %-6llu  %-6llu  %-8s  %-3d  0x%02x  ",
375 	    id, pool, (u_longlong_t)record->zi_objset,
376 	    (u_longlong_t)record->zi_object, type_to_name(record->zi_type),
377 	    record->zi_level, record->zi_dvas);
378 
379 
380 	if (record->zi_start == 0 &&
381 	    record->zi_end == -1ULL)
382 		(void) printf("all\n");
383 	else
384 		(void) printf("[%llu, %llu]\n", (u_longlong_t)record->zi_start,
385 		    (u_longlong_t)record->zi_end);
386 
387 	return (0);
388 }
389 
390 static int
391 print_device_handler(int id, const char *pool, zinject_record_t *record,
392     void *data)
393 {
394 	int *count = data;
395 
396 	if (record->zi_guid == 0 || record->zi_func[0] != '\0')
397 		return (0);
398 
399 	if (record->zi_cmd == ZINJECT_DELAY_IO)
400 		return (0);
401 
402 	if (*count == 0) {
403 		(void) printf("%3s  %-15s  %s\n", "ID", "POOL", "GUID");
404 		(void) printf("---  ---------------  ----------------\n");
405 	}
406 
407 	*count += 1;
408 
409 	(void) printf("%3d  %-15s  %llx\n", id, pool,
410 	    (u_longlong_t)record->zi_guid);
411 
412 	return (0);
413 }
414 
415 static int
416 print_delay_handler(int id, const char *pool, zinject_record_t *record,
417     void *data)
418 {
419 	int *count = data;
420 
421 	if (record->zi_guid == 0 || record->zi_func[0] != '\0')
422 		return (0);
423 
424 	if (record->zi_cmd != ZINJECT_DELAY_IO)
425 		return (0);
426 
427 	if (*count == 0) {
428 		(void) printf("%3s  %-15s  %-15s  %-15s  %s\n",
429 		    "ID", "POOL", "DELAY (ms)", "LANES", "GUID");
430 		(void) printf("---  ---------------  ---------------  "
431 		    "---------------  ----------------\n");
432 	}
433 
434 	*count += 1;
435 
436 	(void) printf("%3d  %-15s  %-15llu  %-15llu  %llx\n", id, pool,
437 	    (u_longlong_t)NSEC2MSEC(record->zi_timer),
438 	    (u_longlong_t)record->zi_nlanes,
439 	    (u_longlong_t)record->zi_guid);
440 
441 	return (0);
442 }
443 
444 static int
445 print_panic_handler(int id, const char *pool, zinject_record_t *record,
446     void *data)
447 {
448 	int *count = data;
449 
450 	if (record->zi_func[0] == '\0')
451 		return (0);
452 
453 	if (*count == 0) {
454 		(void) printf("%3s  %-15s  %s\n", "ID", "POOL", "FUNCTION");
455 		(void) printf("---  ---------------  ----------------\n");
456 	}
457 
458 	*count += 1;
459 
460 	(void) printf("%3d  %-15s  %s\n", id, pool, record->zi_func);
461 
462 	return (0);
463 }
464 
465 /*
466  * Print all registered error handlers.  Returns the number of handlers
467  * registered.
468  */
469 static int
470 print_all_handlers(void)
471 {
472 	int count = 0, total = 0;
473 
474 	(void) iter_handlers(print_device_handler, &count);
475 	if (count > 0) {
476 		total += count;
477 		(void) printf("\n");
478 		count = 0;
479 	}
480 
481 	(void) iter_handlers(print_delay_handler, &count);
482 	if (count > 0) {
483 		total += count;
484 		(void) printf("\n");
485 		count = 0;
486 	}
487 
488 	(void) iter_handlers(print_data_handler, &count);
489 	if (count > 0) {
490 		total += count;
491 		(void) printf("\n");
492 		count = 0;
493 	}
494 
495 	(void) iter_handlers(print_panic_handler, &count);
496 
497 	return (count + total);
498 }
499 
500 static int
501 cancel_one_handler(int id, const char *pool, zinject_record_t *record,
502     void *data)
503 {
504 	(void) pool, (void) record, (void) data;
505 	zfs_cmd_t zc = {"\0"};
506 
507 	zc.zc_guid = (uint64_t)id;
508 
509 	if (zfs_ioctl(g_zfs, ZFS_IOC_CLEAR_FAULT, &zc) != 0) {
510 		(void) fprintf(stderr, "failed to remove handler %d: %s\n",
511 		    id, strerror(errno));
512 		return (1);
513 	}
514 
515 	return (0);
516 }
517 
518 /*
519  * Remove all fault injection handlers.
520  */
521 static int
522 cancel_all_handlers(void)
523 {
524 	int ret = iter_handlers(cancel_one_handler, NULL);
525 
526 	if (ret == 0)
527 		(void) printf("removed all registered handlers\n");
528 
529 	return (ret);
530 }
531 
532 /*
533  * Remove a specific fault injection handler.
534  */
535 static int
536 cancel_handler(int id)
537 {
538 	zfs_cmd_t zc = {"\0"};
539 
540 	zc.zc_guid = (uint64_t)id;
541 
542 	if (zfs_ioctl(g_zfs, ZFS_IOC_CLEAR_FAULT, &zc) != 0) {
543 		(void) fprintf(stderr, "failed to remove handler %d: %s\n",
544 		    id, strerror(errno));
545 		return (1);
546 	}
547 
548 	(void) printf("removed handler %d\n", id);
549 
550 	return (0);
551 }
552 
553 /*
554  * Register a new fault injection handler.
555  */
556 static int
557 register_handler(const char *pool, int flags, zinject_record_t *record,
558     int quiet)
559 {
560 	zfs_cmd_t zc = {"\0"};
561 
562 	(void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name));
563 	zc.zc_inject_record = *record;
564 	zc.zc_guid = flags;
565 
566 	if (zfs_ioctl(g_zfs, ZFS_IOC_INJECT_FAULT, &zc) != 0) {
567 		(void) fprintf(stderr, "failed to add handler: %s\n",
568 		    errno == EDOM ? "block level exceeds max level of object" :
569 		    strerror(errno));
570 		return (1);
571 	}
572 
573 	if (flags & ZINJECT_NULL)
574 		return (0);
575 
576 	if (quiet) {
577 		(void) printf("%llu\n", (u_longlong_t)zc.zc_guid);
578 	} else {
579 		(void) printf("Added handler %llu with the following "
580 		    "properties:\n", (u_longlong_t)zc.zc_guid);
581 		(void) printf("  pool: %s\n", pool);
582 		if (record->zi_guid) {
583 			(void) printf("  vdev: %llx\n",
584 			    (u_longlong_t)record->zi_guid);
585 		} else if (record->zi_func[0] != '\0') {
586 			(void) printf("  panic function: %s\n",
587 			    record->zi_func);
588 		} else if (record->zi_duration > 0) {
589 			(void) printf(" time: %lld seconds\n",
590 			    (u_longlong_t)record->zi_duration);
591 		} else if (record->zi_duration < 0) {
592 			(void) printf(" txgs: %lld \n",
593 			    (u_longlong_t)-record->zi_duration);
594 		} else {
595 			(void) printf("objset: %llu\n",
596 			    (u_longlong_t)record->zi_objset);
597 			(void) printf("object: %llu\n",
598 			    (u_longlong_t)record->zi_object);
599 			(void) printf("  type: %llu\n",
600 			    (u_longlong_t)record->zi_type);
601 			(void) printf(" level: %d\n", record->zi_level);
602 			if (record->zi_start == 0 &&
603 			    record->zi_end == -1ULL)
604 				(void) printf(" range: all\n");
605 			else
606 				(void) printf(" range: [%llu, %llu)\n",
607 				    (u_longlong_t)record->zi_start,
608 				    (u_longlong_t)record->zi_end);
609 			(void) printf("  dvas: 0x%x\n", record->zi_dvas);
610 		}
611 	}
612 
613 	return (0);
614 }
615 
616 static int
617 perform_action(const char *pool, zinject_record_t *record, int cmd)
618 {
619 	zfs_cmd_t zc = {"\0"};
620 
621 	ASSERT(cmd == VDEV_STATE_DEGRADED || cmd == VDEV_STATE_FAULTED);
622 	(void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name));
623 	zc.zc_guid = record->zi_guid;
624 	zc.zc_cookie = cmd;
625 
626 	if (zfs_ioctl(g_zfs, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
627 		return (0);
628 
629 	return (1);
630 }
631 
/*
 * Parse a "latency:lanes" delay specification.
 *
 *	str	the string to parse, e.g. "10:2"
 *	delay	on success, receives the latency converted from
 *		milliseconds to nanoseconds
 *	nlanes	on success, receives the number of lanes
 *
 * Returns 0 on success and 1 if the string is malformed, contains a
 * minus sign, has trailing characters after the lane count, or specifies
 * a latency of zero.  The outputs are left untouched on failure.
 */
static int
parse_delay(char *str, uint64_t *delay, uint64_t *nlanes)
{
	unsigned long scan_delay;
	unsigned long scan_nlanes;
	char trailing;

	/*
	 * The %lu conversion accepts a leading '-' and silently wraps the
	 * value to a huge unsigned number, so refuse any minus sign.
	 */
	if (strchr(str, '-') != NULL)
		return (1);

	/*
	 * The trailing %c only converts when something follows the lane
	 * count; a well-formed string therefore yields exactly two
	 * conversions, while e.g. "10:2x" yields three and is rejected.
	 */
	if (sscanf(str, "%lu:%lu%c", &scan_delay, &scan_nlanes,
	    &trailing) != 2)
		return (1);

	/*
	 * We explicitly disallow a delay of zero here, because we key
	 * off this value being non-zero in translate_device(), to
	 * determine if the fault is a ZINJECT_DELAY_IO fault or not.
	 */
	if (scan_delay == 0)
		return (1);

	/*
	 * The units for the CLI delay parameter is milliseconds, but
	 * the data passed to the kernel is interpreted as nanoseconds.
	 * Thus we scale the milliseconds to nanoseconds here, and this
	 * nanosecond value is used to pass the delay to the kernel.
	 */
	*delay = MSEC2NSEC(scan_delay);
	*nlanes = scan_nlanes;

	return (0);
}
660 
661 static int
662 parse_frequency(const char *str, uint32_t *percent)
663 {
664 	double val;
665 	char *post;
666 
667 	val = strtod(str, &post);
668 	if (post == NULL || *post != '\0')
669 		return (EINVAL);
670 
671 	/* valid range is [0.0001, 100.0] */
672 	val /= 100.0f;
673 	if (val < 0.000001f || val > 1.0f)
674 		return (ERANGE);
675 
676 	/* convert to an integer for use by kernel */
677 	*percent = ((uint32_t)(val * ZI_PERCENTAGE_MAX));
678 
679 	return (0);
680 }
681 
/*
 * This function converts a string specifier for DVAs into a bit mask.
 * The dva's provided by the user should be 0 indexed and separated by
 * a comma. For example:
 *	"1"	-> 0b0010  (0x2)
 *	"0,1"	-> 0b0011  (0x3)
 *	"0,1,2"	-> 0b0111  (0x7)
 *
 * Only the DVA indices 0, 1 and 2 are accepted, each at most once, and
 * every comma must sit between two indices: empty strings, leading,
 * doubled or trailing commas, and adjacent digits are all rejected.
 *
 * Returns 0 and stores the mask in '*dvas_out' on success.  Returns
 * EINVAL on any malformed input, leaving '*dvas_out' untouched.
 */
static int
parse_dvas(const char *str, uint32_t *dvas_out)
{
	const char *c;
	uint32_t mask = 0;
	int expect_digit = 1;	/* next character must be a DVA index */
	size_t len = strlen(str);

	/* max string length is 5 ("0,1,2") */
	if (len == 0 || len > 5)
		return (EINVAL);

	for (c = str; *c != '\0'; c++) {
		switch (*c) {
		case '0':
		case '1':
		case '2': {
			uint32_t bit = 1U << (*c - '0');

			/* two indices in a row with no comma between them */
			if (!expect_digit)
				return (EINVAL);

			/* check if this DVA has been set already */
			if (mask & bit)
				return (EINVAL);

			mask |= bit;
			expect_digit = 0;
			break;
		}
		case ',':
			/* a comma is only valid directly after an index */
			if (expect_digit)
				return (EINVAL);
			expect_digit = 1;
			break;
		default:
			/* check for invalid character */
			return (EINVAL);
		}
	}

	/* check for dangling delimiter */
	if (expect_digit)
		return (EINVAL);

	*dvas_out = mask;
	return (0);
}
734 
735 int
736 main(int argc, char **argv)
737 {
738 	int c;
739 	char *range = NULL;
740 	char *cancel = NULL;
741 	char *end;
742 	char *raw = NULL;
743 	char *device = NULL;
744 	int level = 0;
745 	int quiet = 0;
746 	int error = 0;
747 	int domount = 0;
748 	int io_type = ZIO_TYPES;
749 	int action = VDEV_STATE_UNKNOWN;
750 	err_type_t type = TYPE_INVAL;
751 	err_type_t label = TYPE_INVAL;
752 	zinject_record_t record = { 0 };
753 	char pool[MAXNAMELEN] = "";
754 	char dataset[MAXNAMELEN] = "";
755 	zfs_handle_t *zhp = NULL;
756 	int nowrites = 0;
757 	int dur_txg = 0;
758 	int dur_secs = 0;
759 	int ret;
760 	int flags = 0;
761 	uint32_t dvas = 0;
762 
763 	if ((g_zfs = libzfs_init()) == NULL) {
764 		(void) fprintf(stderr, "%s\n", libzfs_error_init(errno));
765 		return (1);
766 	}
767 
768 	libzfs_print_on_error(g_zfs, B_TRUE);
769 
770 	if ((zfs_fd = open(ZFS_DEV, O_RDWR)) < 0) {
771 		(void) fprintf(stderr, "failed to open ZFS device\n");
772 		libzfs_fini(g_zfs);
773 		return (1);
774 	}
775 
776 	if (argc == 1) {
777 		/*
778 		 * No arguments.  Print the available handlers.  If there are no
779 		 * available handlers, direct the user to '-h' for help
780 		 * information.
781 		 */
782 		if (print_all_handlers() == 0) {
783 			(void) printf("No handlers registered.\n");
784 			(void) printf("Run 'zinject -h' for usage "
785 			    "information.\n");
786 		}
787 		libzfs_fini(g_zfs);
788 		return (0);
789 	}
790 
791 	while ((c = getopt(argc, argv,
792 	    ":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
793 		switch (c) {
794 		case 'a':
795 			flags |= ZINJECT_FLUSH_ARC;
796 			break;
797 		case 'A':
798 			if (strcasecmp(optarg, "degrade") == 0) {
799 				action = VDEV_STATE_DEGRADED;
800 			} else if (strcasecmp(optarg, "fault") == 0) {
801 				action = VDEV_STATE_FAULTED;
802 			} else {
803 				(void) fprintf(stderr, "invalid action '%s': "
804 				    "must be 'degrade' or 'fault'\n", optarg);
805 				usage();
806 				libzfs_fini(g_zfs);
807 				return (1);
808 			}
809 			break;
810 		case 'b':
811 			raw = optarg;
812 			break;
813 		case 'c':
814 			cancel = optarg;
815 			break;
816 		case 'C':
817 			ret = parse_dvas(optarg, &dvas);
818 			if (ret != 0) {
819 				(void) fprintf(stderr, "invalid DVA list '%s': "
820 				    "DVAs should be 0 indexed and separated by "
821 				    "commas.\n", optarg);
822 				usage();
823 				libzfs_fini(g_zfs);
824 				return (1);
825 			}
826 			break;
827 		case 'd':
828 			device = optarg;
829 			break;
830 		case 'D':
831 			errno = 0;
832 			ret = parse_delay(optarg, &record.zi_timer,
833 			    &record.zi_nlanes);
834 			if (ret != 0) {
835 
836 				(void) fprintf(stderr, "invalid i/o delay "
837 				    "value: '%s'\n", optarg);
838 				usage();
839 				libzfs_fini(g_zfs);
840 				return (1);
841 			}
842 			break;
843 		case 'e':
844 			if (strcasecmp(optarg, "io") == 0) {
845 				error = EIO;
846 			} else if (strcasecmp(optarg, "checksum") == 0) {
847 				error = ECKSUM;
848 			} else if (strcasecmp(optarg, "decompress") == 0) {
849 				error = EINVAL;
850 			} else if (strcasecmp(optarg, "decrypt") == 0) {
851 				error = EACCES;
852 			} else if (strcasecmp(optarg, "nxio") == 0) {
853 				error = ENXIO;
854 			} else if (strcasecmp(optarg, "dtl") == 0) {
855 				error = ECHILD;
856 			} else if (strcasecmp(optarg, "corrupt") == 0) {
857 				error = EILSEQ;
858 			} else {
859 				(void) fprintf(stderr, "invalid error type "
860 				    "'%s': must be 'io', 'checksum' or "
861 				    "'nxio'\n", optarg);
862 				usage();
863 				libzfs_fini(g_zfs);
864 				return (1);
865 			}
866 			break;
867 		case 'f':
868 			ret = parse_frequency(optarg, &record.zi_freq);
869 			if (ret != 0) {
870 				(void) fprintf(stderr, "%sfrequency value must "
871 				    "be in the range [0.0001, 100.0]\n",
872 				    ret == EINVAL ? "invalid value: " :
873 				    ret == ERANGE ? "out of range: " : "");
874 				libzfs_fini(g_zfs);
875 				return (1);
876 			}
877 			break;
878 		case 'F':
879 			record.zi_failfast = B_TRUE;
880 			break;
881 		case 'g':
882 			dur_txg = 1;
883 			record.zi_duration = (int)strtol(optarg, &end, 10);
884 			if (record.zi_duration <= 0 || *end != '\0') {
885 				(void) fprintf(stderr, "invalid duration '%s': "
886 				    "must be a positive integer\n", optarg);
887 				usage();
888 				libzfs_fini(g_zfs);
889 				return (1);
890 			}
891 			/* store duration of txgs as its negative */
892 			record.zi_duration *= -1;
893 			break;
894 		case 'h':
895 			usage();
896 			libzfs_fini(g_zfs);
897 			return (0);
898 		case 'I':
899 			/* default duration, if one hasn't yet been defined */
900 			nowrites = 1;
901 			if (dur_secs == 0 && dur_txg == 0)
902 				record.zi_duration = 30;
903 			break;
904 		case 'l':
905 			level = (int)strtol(optarg, &end, 10);
906 			if (*end != '\0') {
907 				(void) fprintf(stderr, "invalid level '%s': "
908 				    "must be an integer\n", optarg);
909 				usage();
910 				libzfs_fini(g_zfs);
911 				return (1);
912 			}
913 			break;
914 		case 'm':
915 			domount = 1;
916 			break;
917 		case 'p':
918 			(void) strlcpy(record.zi_func, optarg,
919 			    sizeof (record.zi_func));
920 			record.zi_cmd = ZINJECT_PANIC;
921 			break;
922 		case 'q':
923 			quiet = 1;
924 			break;
925 		case 'r':
926 			range = optarg;
927 			flags |= ZINJECT_CALC_RANGE;
928 			break;
929 		case 's':
930 			dur_secs = 1;
931 			record.zi_duration = (int)strtol(optarg, &end, 10);
932 			if (record.zi_duration <= 0 || *end != '\0') {
933 				(void) fprintf(stderr, "invalid duration '%s': "
934 				    "must be a positive integer\n", optarg);
935 				usage();
936 				libzfs_fini(g_zfs);
937 				return (1);
938 			}
939 			break;
940 		case 'T':
941 			if (strcasecmp(optarg, "read") == 0) {
942 				io_type = ZIO_TYPE_READ;
943 			} else if (strcasecmp(optarg, "write") == 0) {
944 				io_type = ZIO_TYPE_WRITE;
945 			} else if (strcasecmp(optarg, "free") == 0) {
946 				io_type = ZIO_TYPE_FREE;
947 			} else if (strcasecmp(optarg, "claim") == 0) {
948 				io_type = ZIO_TYPE_CLAIM;
949 			} else if (strcasecmp(optarg, "all") == 0) {
950 				io_type = ZIO_TYPES;
951 			} else {
952 				(void) fprintf(stderr, "invalid I/O type "
953 				    "'%s': must be 'read', 'write', 'free', "
954 				    "'claim' or 'all'\n", optarg);
955 				usage();
956 				libzfs_fini(g_zfs);
957 				return (1);
958 			}
959 			break;
960 		case 't':
961 			if ((type = name_to_type(optarg)) == TYPE_INVAL &&
962 			    !MOS_TYPE(type)) {
963 				(void) fprintf(stderr, "invalid type '%s'\n",
964 				    optarg);
965 				usage();
966 				libzfs_fini(g_zfs);
967 				return (1);
968 			}
969 			break;
970 		case 'u':
971 			flags |= ZINJECT_UNLOAD_SPA;
972 			break;
973 		case 'L':
974 			if ((label = name_to_type(optarg)) == TYPE_INVAL &&
975 			    !LABEL_TYPE(type)) {
976 				(void) fprintf(stderr, "invalid label type "
977 				    "'%s'\n", optarg);
978 				usage();
979 				libzfs_fini(g_zfs);
980 				return (1);
981 			}
982 			break;
983 		case ':':
984 			(void) fprintf(stderr, "option -%c requires an "
985 			    "operand\n", optopt);
986 			usage();
987 			libzfs_fini(g_zfs);
988 			return (1);
989 		case '?':
990 			(void) fprintf(stderr, "invalid option '%c'\n",
991 			    optopt);
992 			usage();
993 			libzfs_fini(g_zfs);
994 			return (2);
995 		}
996 	}
997 
998 	argc -= optind;
999 	argv += optind;
1000 
1001 	if (record.zi_duration != 0)
1002 		record.zi_cmd = ZINJECT_IGNORED_WRITES;
1003 
1004 	if (cancel != NULL) {
1005 		/*
1006 		 * '-c' is invalid with any other options.
1007 		 */
1008 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
1009 		    level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED ||
1010 		    record.zi_freq > 0 || dvas != 0) {
1011 			(void) fprintf(stderr, "cancel (-c) incompatible with "
1012 			    "any other options\n");
1013 			usage();
1014 			libzfs_fini(g_zfs);
1015 			return (2);
1016 		}
1017 		if (argc != 0) {
1018 			(void) fprintf(stderr, "extraneous argument to '-c'\n");
1019 			usage();
1020 			libzfs_fini(g_zfs);
1021 			return (2);
1022 		}
1023 
1024 		if (strcmp(cancel, "all") == 0) {
1025 			return (cancel_all_handlers());
1026 		} else {
1027 			int id = (int)strtol(cancel, &end, 10);
1028 			if (*end != '\0') {
1029 				(void) fprintf(stderr, "invalid handle id '%s':"
1030 				    " must be an integer or 'all'\n", cancel);
1031 				usage();
1032 				libzfs_fini(g_zfs);
1033 				return (1);
1034 			}
1035 			return (cancel_handler(id));
1036 		}
1037 	}
1038 
1039 	if (device != NULL) {
1040 		/*
1041 		 * Device (-d) injection uses a completely different mechanism
1042 		 * for doing injection, so handle it separately here.
1043 		 */
1044 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
1045 		    level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED ||
1046 		    dvas != 0) {
1047 			(void) fprintf(stderr, "device (-d) incompatible with "
1048 			    "data error injection\n");
1049 			usage();
1050 			libzfs_fini(g_zfs);
1051 			return (2);
1052 		}
1053 
1054 		if (argc != 1) {
1055 			(void) fprintf(stderr, "device (-d) injection requires "
1056 			    "a single pool name\n");
1057 			usage();
1058 			libzfs_fini(g_zfs);
1059 			return (2);
1060 		}
1061 
1062 		(void) strlcpy(pool, argv[0], sizeof (pool));
1063 		dataset[0] = '\0';
1064 
1065 		if (error == ECKSUM) {
1066 			(void) fprintf(stderr, "device error type must be "
1067 			    "'io', 'nxio' or 'corrupt'\n");
1068 			libzfs_fini(g_zfs);
1069 			return (1);
1070 		}
1071 
1072 		if (error == EILSEQ &&
1073 		    (record.zi_freq == 0 || io_type != ZIO_TYPE_READ)) {
1074 			(void) fprintf(stderr, "device corrupt errors require "
1075 			    "io type read and a frequency value\n");
1076 			libzfs_fini(g_zfs);
1077 			return (1);
1078 		}
1079 
1080 		record.zi_iotype = io_type;
1081 		if (translate_device(pool, device, label, &record) != 0) {
1082 			libzfs_fini(g_zfs);
1083 			return (1);
1084 		}
1085 		if (!error)
1086 			error = ENXIO;
1087 
1088 		if (action != VDEV_STATE_UNKNOWN)
1089 			return (perform_action(pool, &record, action));
1090 
1091 	} else if (raw != NULL) {
1092 		if (range != NULL || type != TYPE_INVAL || level != 0 ||
1093 		    record.zi_cmd != ZINJECT_UNINITIALIZED ||
1094 		    record.zi_freq > 0 || dvas != 0) {
1095 			(void) fprintf(stderr, "raw (-b) format with "
1096 			    "any other options\n");
1097 			usage();
1098 			libzfs_fini(g_zfs);
1099 			return (2);
1100 		}
1101 
1102 		if (argc != 1) {
1103 			(void) fprintf(stderr, "raw (-b) format expects a "
1104 			    "single pool name\n");
1105 			usage();
1106 			libzfs_fini(g_zfs);
1107 			return (2);
1108 		}
1109 
1110 		(void) strlcpy(pool, argv[0], sizeof (pool));
1111 		dataset[0] = '\0';
1112 
1113 		if (error == ENXIO) {
1114 			(void) fprintf(stderr, "data error type must be "
1115 			    "'checksum' or 'io'\n");
1116 			libzfs_fini(g_zfs);
1117 			return (1);
1118 		}
1119 
1120 		record.zi_cmd = ZINJECT_DATA_FAULT;
1121 		if (translate_raw(raw, &record) != 0) {
1122 			libzfs_fini(g_zfs);
1123 			return (1);
1124 		}
1125 		if (!error)
1126 			error = EIO;
1127 	} else if (record.zi_cmd == ZINJECT_PANIC) {
1128 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
1129 		    level != 0 || device != NULL || record.zi_freq > 0 ||
1130 		    dvas != 0) {
1131 			(void) fprintf(stderr, "panic (-p) incompatible with "
1132 			    "other options\n");
1133 			usage();
1134 			libzfs_fini(g_zfs);
1135 			return (2);
1136 		}
1137 
1138 		if (argc < 1 || argc > 2) {
1139 			(void) fprintf(stderr, "panic (-p) injection requires "
1140 			    "a single pool name and an optional id\n");
1141 			usage();
1142 			libzfs_fini(g_zfs);
1143 			return (2);
1144 		}
1145 
1146 		(void) strlcpy(pool, argv[0], sizeof (pool));
1147 		if (argv[1] != NULL)
1148 			record.zi_type = atoi(argv[1]);
1149 		dataset[0] = '\0';
1150 	} else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) {
1151 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
1152 		    level != 0 || record.zi_freq > 0 || dvas != 0) {
1153 			(void) fprintf(stderr, "hardware failure (-I) "
1154 			    "incompatible with other options\n");
1155 			usage();
1156 			libzfs_fini(g_zfs);
1157 			return (2);
1158 		}
1159 
1160 		if (nowrites == 0) {
1161 			(void) fprintf(stderr, "-s or -g meaningless "
1162 			    "without -I (ignore writes)\n");
1163 			usage();
1164 			libzfs_fini(g_zfs);
1165 			return (2);
1166 		} else if (dur_secs && dur_txg) {
1167 			(void) fprintf(stderr, "choose a duration either "
1168 			    "in seconds (-s) or a number of txgs (-g) "
1169 			    "but not both\n");
1170 			usage();
1171 			libzfs_fini(g_zfs);
1172 			return (2);
1173 		} else if (argc != 1) {
1174 			(void) fprintf(stderr, "ignore writes (-I) "
1175 			    "injection requires a single pool name\n");
1176 			usage();
1177 			libzfs_fini(g_zfs);
1178 			return (2);
1179 		}
1180 
1181 		(void) strlcpy(pool, argv[0], sizeof (pool));
1182 		dataset[0] = '\0';
1183 	} else if (type == TYPE_INVAL) {
1184 		if (flags == 0) {
1185 			(void) fprintf(stderr, "at least one of '-b', '-d', "
1186 			    "'-t', '-a', '-p', '-I' or '-u' "
1187 			    "must be specified\n");
1188 			usage();
1189 			libzfs_fini(g_zfs);
1190 			return (2);
1191 		}
1192 
1193 		if (argc == 1 && (flags & ZINJECT_UNLOAD_SPA)) {
1194 			(void) strlcpy(pool, argv[0], sizeof (pool));
1195 			dataset[0] = '\0';
1196 		} else if (argc != 0) {
1197 			(void) fprintf(stderr, "extraneous argument for "
1198 			    "'-f'\n");
1199 			usage();
1200 			libzfs_fini(g_zfs);
1201 			return (2);
1202 		}
1203 
1204 		flags |= ZINJECT_NULL;
1205 	} else {
1206 		if (argc != 1) {
1207 			(void) fprintf(stderr, "missing object\n");
1208 			usage();
1209 			libzfs_fini(g_zfs);
1210 			return (2);
1211 		}
1212 
1213 		if (error == ENXIO || error == EILSEQ) {
1214 			(void) fprintf(stderr, "data error type must be "
1215 			    "'checksum' or 'io'\n");
1216 			libzfs_fini(g_zfs);
1217 			return (1);
1218 		}
1219 
1220 		if (dvas != 0) {
1221 			if (error == EACCES || error == EINVAL) {
1222 				(void) fprintf(stderr, "the '-C' option may "
1223 				    "not be used with logical data errors "
1224 				    "'decrypt' and 'decompress'\n");
1225 				libzfs_fini(g_zfs);
1226 				return (1);
1227 			}
1228 
1229 			record.zi_dvas = dvas;
1230 		}
1231 
1232 		if (error == EACCES) {
1233 			if (type != TYPE_DATA) {
1234 				(void) fprintf(stderr, "decryption errors "
1235 				    "may only be injected for 'data' types\n");
1236 				libzfs_fini(g_zfs);
1237 				return (1);
1238 			}
1239 
1240 			record.zi_cmd = ZINJECT_DECRYPT_FAULT;
1241 			/*
1242 			 * Internally, ZFS actually uses ECKSUM for decryption
1243 			 * errors since EACCES is used to indicate the key was
1244 			 * not found.
1245 			 */
1246 			error = ECKSUM;
1247 		} else {
1248 			record.zi_cmd = ZINJECT_DATA_FAULT;
1249 		}
1250 
1251 		if (translate_record(type, argv[0], range, level, &record, pool,
1252 		    dataset) != 0) {
1253 			libzfs_fini(g_zfs);
1254 			return (1);
1255 		}
1256 		if (!error)
1257 			error = EIO;
1258 	}
1259 
1260 	/*
1261 	 * If this is pool-wide metadata, unmount everything.  The ioctl() will
1262 	 * unload the pool, so that we trigger spa-wide reopen of metadata next
1263 	 * time we access the pool.
1264 	 */
1265 	if (dataset[0] != '\0' && domount) {
1266 		if ((zhp = zfs_open(g_zfs, dataset,
1267 		    ZFS_TYPE_DATASET)) == NULL) {
1268 			libzfs_fini(g_zfs);
1269 			return (1);
1270 		}
1271 		if (zfs_unmount(zhp, NULL, 0) != 0) {
1272 			libzfs_fini(g_zfs);
1273 			return (1);
1274 		}
1275 	}
1276 
1277 	record.zi_error = error;
1278 
1279 	ret = register_handler(pool, flags, &record, quiet);
1280 
1281 	if (dataset[0] != '\0' && domount)
1282 		ret = (zfs_mount(zhp, NULL, 0) != 0);
1283 
1284 	libzfs_fini(g_zfs);
1285 
1286 	return (ret);
1287 }
1288