1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 24 * Copyright (c) 2017, Intel Corporation. 25 * Copyright (c) 2024, Klara Inc. 26 */ 27 28 /* 29 * ZFS Fault Injector 30 * 31 * This userland component takes a set of options and uses libzpool to translate 32 * from a user-visible object type and name to an internal representation. 33 * There are two basic types of faults: device faults and data faults. 34 * 35 * 36 * DEVICE FAULTS 37 * 38 * Errors can be injected into a particular vdev using the '-d' option. This 39 * option takes a path or vdev GUID to uniquely identify the device within a 40 * pool. There are four types of errors that can be injected, IO, ENXIO, 41 * ECHILD, and EILSEQ. These can be controlled through the '-e' option and the 42 * default is ENXIO. For EIO failures, any attempt to read data from the device 43 * will return EIO, but a subsequent attempt to reopen the device will succeed. 
 * For ENXIO failures, any attempt to read from the device will return EIO, and
 * any attempt to reopen the device will fail with ENXIO. The EILSEQ failures
 * only apply to read operations (-T read) and will flip a bit after the device
 * has read the original data.
 *
 * For label faults, the -L option must be specified. This allows faults
 * to be injected into either the nvlist, uberblock, pad1, or pad2 region
 * of all the labels for the specified device.
 *
 * This form of the command looks like:
 *
 *	zinject -d device [-e errno] [-L <uber | nvlist | pad1 | pad2>] pool
 *
 *
 * DATA FAULTS
 *
 * We begin with a tuple of the form:
 *
 *	<type,level,range,object>
 *
 *	type	A string describing the type of data to target. Each type
 *		implicitly describes how to interpret 'object'. Currently,
 *		the following values are supported:
 *
 *		data		User data for a file
 *		dnode		Dnode for a file or directory
 *
 *		The following MOS objects are special. Instead of injecting
 *		errors on a particular object or blkid, we inject errors
 *		across all objects of the given type.
 *
 *		mos		Any data in the MOS
 *		mosdir		object directory
 *		config		pool configuration
 *		bpobj		blkptr list
 *		spacemap	spacemap
 *		metaslab	metaslab
 *		errlog		persistent error log
 *
 *	level	Object level. Defaults to '0', not applicable to all types.
 *		If a range is given, this selects the indirect block that
 *		covers the specified range.
 *
 *	range	A numerical range [start,end) within the object. Defaults to
 *		the full size of the file.
 *
 *	object	A string describing the logical location of the object. For
 *		files and directories (currently the only supported types),
 *		this is the path of the object on disk.
 *
 * This is translated, via libzpool, into the following internal representation:
 *
 *	<type,objset,object,level,range>
 *
 * These types should be self-explanatory. This tuple is then passed to the
 * kernel via a special ioctl() to initiate fault injection for the given
 * object. Note that 'type' is not strictly necessary for fault injection, but
 * is used when translating existing faults into a human-readable string.
 *
 *
 * The command itself takes one of the forms:
 *
 *	zinject
 *	zinject <-a | -u pool>
 *	zinject -c <id|all>
 *	zinject [-q] <-t type> [-f freq] [-u] [-a] [-m] [-e errno] [-l level]
 *	    [-r range] <object>
 *	zinject [-f freq] [-a] [-m] [-u] -b objset:object:level:start:end pool
 *
 * With no arguments, the command prints all currently registered injection
 * handlers, with their numeric identifiers.
 *
 * The '-c' option will clear the given handler, or all handlers if 'all' is
 * specified.
 *
 * The '-e' option takes a string describing the errno to simulate. This must
 * be one of 'io', 'checksum', 'decompress', or 'decrypt'. In most cases this
 * will result in the same behavior, but RAID-Z will produce a different set of
 * ereports for this situation.
 *
 * The '-a', '-u', and '-m' flags toggle internal flush behavior. If '-a' is
 * specified, then the ARC cache is flushed appropriately. If '-u' is
 * specified, then the underlying SPA is unloaded. Either of these flags can be
 * specified independently of any other handlers. The '-m' flag automatically
 * does an unmount and remount of the underlying dataset to aid in flushing the
 * cache.
 *
 * The '-f' flag controls the frequency of errors injected, expressed as a
 * real number percentage between 0.0001 and 100. The default is 100.
 *
 * The '-t' form is responsible for actually injecting the handler into the
 * framework.
It takes the arguments described above, translates them to the 136 * internal tuple using libzpool, and then issues an ioctl() to register the 137 * handler. 138 * 139 * The final form can target a specific bookmark, regardless of whether a 140 * human-readable interface has been designed. It allows developers to specify 141 * a particular block by number. 142 */ 143 144 #include <errno.h> 145 #include <fcntl.h> 146 #include <stdio.h> 147 #include <stdlib.h> 148 #include <string.h> 149 #include <strings.h> 150 #include <unistd.h> 151 152 #include <sys/fs/zfs.h> 153 #include <sys/mount.h> 154 155 #include <libzfs.h> 156 157 #undef verify /* both libzfs.h and zfs_context.h want to define this */ 158 159 #include "zinject.h" 160 161 libzfs_handle_t *g_zfs; 162 int zfs_fd; 163 164 static const char *const errtable[TYPE_INVAL] = { 165 "data", 166 "dnode", 167 "mos", 168 "mosdir", 169 "metaslab", 170 "config", 171 "bpobj", 172 "spacemap", 173 "errlog", 174 "uber", 175 "nvlist", 176 "pad1", 177 "pad2" 178 }; 179 180 static err_type_t 181 name_to_type(const char *arg) 182 { 183 int i; 184 for (i = 0; i < TYPE_INVAL; i++) 185 if (strcmp(errtable[i], arg) == 0) 186 return (i); 187 188 return (TYPE_INVAL); 189 } 190 191 static const char * 192 type_to_name(uint64_t type) 193 { 194 switch (type) { 195 case DMU_OT_OBJECT_DIRECTORY: 196 return ("mosdir"); 197 case DMU_OT_OBJECT_ARRAY: 198 return ("metaslab"); 199 case DMU_OT_PACKED_NVLIST: 200 return ("config"); 201 case DMU_OT_BPOBJ: 202 return ("bpobj"); 203 case DMU_OT_SPACE_MAP: 204 return ("spacemap"); 205 case DMU_OT_ERROR_LOG: 206 return ("errlog"); 207 default: 208 return ("-"); 209 } 210 } 211 212 struct errstr { 213 int err; 214 const char *str; 215 }; 216 static const struct errstr errstrtable[] = { 217 { EIO, "io" }, 218 { ECKSUM, "checksum" }, 219 { EINVAL, "decompress" }, 220 { EACCES, "decrypt" }, 221 { ENXIO, "nxio" }, 222 { ECHILD, "dtl" }, 223 { EILSEQ, "corrupt" }, 224 { ENOSYS, "noop" }, 225 { 0, NULL }, 226 
}; 227 228 static int 229 str_to_err(const char *str) 230 { 231 for (int i = 0; errstrtable[i].str != NULL; i++) 232 if (strcasecmp(errstrtable[i].str, str) == 0) 233 return (errstrtable[i].err); 234 return (-1); 235 } 236 static const char * 237 err_to_str(int err) 238 { 239 for (int i = 0; errstrtable[i].str != NULL; i++) 240 if (errstrtable[i].err == err) 241 return (errstrtable[i].str); 242 return ("[unknown]"); 243 } 244 245 /* 246 * Print usage message. 247 */ 248 void 249 usage(void) 250 { 251 (void) printf( 252 "usage:\n" 253 "\n" 254 "\tzinject\n" 255 "\n" 256 "\t\tList all active injection records.\n" 257 "\n" 258 "\tzinject -c <id|all>\n" 259 "\n" 260 "\t\tClear the particular record (if given a numeric ID), or\n" 261 "\t\tall records if 'all' is specified.\n" 262 "\n" 263 "\tzinject -p <function name> pool\n" 264 "\t\tInject a panic fault at the specified function. Only \n" 265 "\t\tfunctions which call spa_vdev_config_exit(), or \n" 266 "\t\tspa_vdev_exit() will trigger a panic.\n" 267 "\n" 268 "\tzinject -d device [-e errno] [-L <nvlist|uber|pad1|pad2>] [-F]\n" 269 "\t\t[-T <read|write|free|claim|flush|all>] [-f frequency] pool\n\n" 270 "\t\tInject a fault into a particular device or the device's\n" 271 "\t\tlabel. Label injection can either be 'nvlist', 'uber',\n " 272 "\t\t'pad1', or 'pad2'.\n" 273 "\t\t'errno' can be 'nxio' (the default), 'io', 'dtl',\n" 274 "\t\t'corrupt' (bit flip), or 'noop' (successfully do nothing).\n" 275 "\t\t'frequency' is a value between 0.0001 and 100.0 that limits\n" 276 "\t\tdevice error injection to a percentage of the IOs.\n" 277 "\n" 278 "\tzinject -d device -A <degrade|fault> -D <delay secs> pool\n" 279 "\t\tPerform a specific action on a particular device.\n" 280 "\n" 281 "\tzinject -d device -D latency:lanes pool\n" 282 "\n" 283 "\t\tAdd an artificial delay to IO requests on a particular\n" 284 "\t\tdevice, such that the requests take a minimum of 'latency'\n" 285 "\t\tmilliseconds to complete. 
Each delay has an associated\n" 286 "\t\tnumber of 'lanes' which defines the number of concurrent\n" 287 "\t\tIO requests that can be processed.\n" 288 "\n" 289 "\t\tFor example, with a single lane delay of 10 ms (-D 10:1),\n" 290 "\t\tthe device will only be able to service a single IO request\n" 291 "\t\tat a time with each request taking 10 ms to complete. So,\n" 292 "\t\tif only a single request is submitted every 10 ms, the\n" 293 "\t\taverage latency will be 10 ms; but if more than one request\n" 294 "\t\tis submitted every 10 ms, the average latency will be more\n" 295 "\t\tthan 10 ms.\n" 296 "\n" 297 "\t\tSimilarly, if a delay of 10 ms is specified to have two\n" 298 "\t\tlanes (-D 10:2), then the device will be able to service\n" 299 "\t\ttwo requests at a time, each with a minimum latency of\n" 300 "\t\t10 ms. So, if two requests are submitted every 10 ms, then\n" 301 "\t\tthe average latency will be 10 ms; but if more than two\n" 302 "\t\trequests are submitted every 10 ms, the average latency\n" 303 "\t\twill be more than 10 ms.\n" 304 "\n" 305 "\t\tAlso note, these delays are additive. So two invocations\n" 306 "\t\tof '-D 10:1', is roughly equivalent to a single invocation\n" 307 "\t\tof '-D 10:2'. This also means, one can specify multiple\n" 308 "\t\tlanes with differing target latencies. For example, an\n" 309 "\t\tinvocation of '-D 10:1' followed by '-D 25:2' will\n" 310 "\t\tcreate 3 lanes on the device; one lane with a latency\n" 311 "\t\tof 10 ms and two lanes with a 25 ms latency.\n" 312 "\n" 313 "\tzinject -I [-s <seconds> | -g <txgs>] pool\n" 314 "\t\tCause the pool to stop writing blocks yet not\n" 315 "\t\treport errors for a duration. Simulates buggy hardware\n" 316 "\t\tthat fails to honor cache flush requests.\n" 317 "\t\tDefault duration is 30 seconds. 
The machine is panicked\n" 318 "\t\tat the end of the duration.\n" 319 "\n" 320 "\tzinject -b objset:object:level:blkid pool\n" 321 "\n" 322 "\t\tInject an error into pool 'pool' with the numeric bookmark\n" 323 "\t\tspecified by the remaining tuple. Each number is in\n" 324 "\t\thexadecimal, and only one block can be specified.\n" 325 "\n" 326 "\tzinject [-q] <-t type> [-C dvas] [-e errno] [-l level]\n" 327 "\t\t[-r range] [-a] [-m] [-u] [-f freq] <object>\n" 328 "\n" 329 "\t\tInject an error into the object specified by the '-t' option\n" 330 "\t\tand the object descriptor. The 'object' parameter is\n" 331 "\t\tinterpreted depending on the '-t' option.\n" 332 "\n" 333 "\t\t-q\tQuiet mode. Only print out the handler number added.\n" 334 "\t\t-e\tInject a specific error. Must be one of 'io',\n" 335 "\t\t\t'checksum', 'decompress', or 'decrypt'. Default is 'io'.\n" 336 "\t\t-C\tInject the given error only into specific DVAs. The\n" 337 "\t\t\tDVAs should be specified as a list of 0-indexed DVAs\n" 338 "\t\t\tseparated by commas (ex. '0,2').\n" 339 "\t\t-l\tInject error at a particular block level. Default is " 340 "0.\n" 341 "\t\t-m\tAutomatically remount underlying filesystem.\n" 342 "\t\t-r\tInject error over a particular logical range of an\n" 343 "\t\t\tobject. Will be translated to the appropriate blkid\n" 344 "\t\t\trange according to the object's properties.\n" 345 "\t\t-a\tFlush the ARC cache. Can be specified without any\n" 346 "\t\t\tassociated object.\n" 347 "\t\t-u\tUnload the associated pool. Can be specified with only\n" 348 "\t\t\ta pool object.\n" 349 "\t\t-f\tOnly inject errors a fraction of the time. Expressed as\n" 350 "\t\t\ta percentage between 0.0001 and 100.\n" 351 "\n" 352 "\t-t data\t\tInject an error into the plain file contents of a\n" 353 "\t\t\tfile. 
The object must be specified as a complete path\n" 354 "\t\t\tto a file on a ZFS filesystem.\n" 355 "\n" 356 "\t-t dnode\tInject an error into the metadnode in the block\n" 357 "\t\t\tcorresponding to the dnode for a file or directory. The\n" 358 "\t\t\t'-r' option is incompatible with this mode. The object\n" 359 "\t\t\tis specified as a complete path to a file or directory\n" 360 "\t\t\ton a ZFS filesystem.\n" 361 "\n" 362 "\t-t <mos>\tInject errors into the MOS for objects of the given\n" 363 "\t\t\ttype. Valid types are: mos, mosdir, config, bpobj,\n" 364 "\t\t\tspacemap, metaslab, errlog. The only valid <object> is\n" 365 "\t\t\tthe poolname.\n"); 366 } 367 368 static int 369 iter_handlers(int (*func)(int, const char *, zinject_record_t *, void *), 370 void *data) 371 { 372 zfs_cmd_t zc = {"\0"}; 373 int ret; 374 375 while (zfs_ioctl(g_zfs, ZFS_IOC_INJECT_LIST_NEXT, &zc) == 0) 376 if ((ret = func((int)zc.zc_guid, zc.zc_name, 377 &zc.zc_inject_record, data)) != 0) 378 return (ret); 379 380 if (errno != ENOENT) { 381 (void) fprintf(stderr, "Unable to list handlers: %s\n", 382 strerror(errno)); 383 return (-1); 384 } 385 386 return (0); 387 } 388 389 static int 390 print_data_handler(int id, const char *pool, zinject_record_t *record, 391 void *data) 392 { 393 int *count = data; 394 395 if (record->zi_guid != 0 || record->zi_func[0] != '\0') 396 return (0); 397 398 if (*count == 0) { 399 (void) printf("%3s %-15s %-6s %-6s %-8s %3s %-4s " 400 "%-15s\n", "ID", "POOL", "OBJSET", "OBJECT", "TYPE", 401 "LVL", "DVAs", "RANGE"); 402 (void) printf("--- --------------- ------ " 403 "------ -------- --- ---- ---------------\n"); 404 } 405 406 *count += 1; 407 408 (void) printf("%3d %-15s %-6llu %-6llu %-8s %-3d 0x%02x ", 409 id, pool, (u_longlong_t)record->zi_objset, 410 (u_longlong_t)record->zi_object, type_to_name(record->zi_type), 411 record->zi_level, record->zi_dvas); 412 413 414 if (record->zi_start == 0 && 415 record->zi_end == -1ULL) 416 (void) printf("all\n"); 417 
else 418 (void) printf("[%llu, %llu]\n", (u_longlong_t)record->zi_start, 419 (u_longlong_t)record->zi_end); 420 421 return (0); 422 } 423 424 static int 425 print_device_handler(int id, const char *pool, zinject_record_t *record, 426 void *data) 427 { 428 static const char *iotypestr[] = { 429 "null", "read", "write", "free", "claim", "flush", "trim", "all", 430 }; 431 432 int *count = data; 433 434 if (record->zi_guid == 0 || record->zi_func[0] != '\0') 435 return (0); 436 437 if (record->zi_cmd == ZINJECT_DELAY_IO) 438 return (0); 439 440 if (*count == 0) { 441 (void) printf("%3s %-15s %-16s %-5s %-10s %-9s\n", 442 "ID", "POOL", "GUID", "TYPE", "ERROR", "FREQ"); 443 (void) printf( 444 "--- --------------- ---------------- " 445 "----- ---------- ---------\n"); 446 } 447 448 *count += 1; 449 450 double freq = record->zi_freq == 0 ? 100.0f : 451 (((double)record->zi_freq) / ZI_PERCENTAGE_MAX) * 100.0f; 452 453 (void) printf("%3d %-15s %llx %-5s %-10s %8.4f%%\n", id, pool, 454 (u_longlong_t)record->zi_guid, iotypestr[record->zi_iotype], 455 err_to_str(record->zi_error), freq); 456 457 return (0); 458 } 459 460 static int 461 print_delay_handler(int id, const char *pool, zinject_record_t *record, 462 void *data) 463 { 464 int *count = data; 465 466 if (record->zi_guid == 0 || record->zi_func[0] != '\0') 467 return (0); 468 469 if (record->zi_cmd != ZINJECT_DELAY_IO) 470 return (0); 471 472 if (*count == 0) { 473 (void) printf("%3s %-15s %-15s %-15s %s\n", 474 "ID", "POOL", "DELAY (ms)", "LANES", "GUID"); 475 (void) printf("--- --------------- --------------- " 476 "--------------- ----------------\n"); 477 } 478 479 *count += 1; 480 481 (void) printf("%3d %-15s %-15llu %-15llu %llx\n", id, pool, 482 (u_longlong_t)NSEC2MSEC(record->zi_timer), 483 (u_longlong_t)record->zi_nlanes, 484 (u_longlong_t)record->zi_guid); 485 486 return (0); 487 } 488 489 static int 490 print_panic_handler(int id, const char *pool, zinject_record_t *record, 491 void *data) 492 { 493 int 
*count = data; 494 495 if (record->zi_func[0] == '\0') 496 return (0); 497 498 if (*count == 0) { 499 (void) printf("%3s %-15s %s\n", "ID", "POOL", "FUNCTION"); 500 (void) printf("--- --------------- ----------------\n"); 501 } 502 503 *count += 1; 504 505 (void) printf("%3d %-15s %s\n", id, pool, record->zi_func); 506 507 return (0); 508 } 509 510 /* 511 * Print all registered error handlers. Returns the number of handlers 512 * registered. 513 */ 514 static int 515 print_all_handlers(void) 516 { 517 int count = 0, total = 0; 518 519 (void) iter_handlers(print_device_handler, &count); 520 if (count > 0) { 521 total += count; 522 (void) printf("\n"); 523 count = 0; 524 } 525 526 (void) iter_handlers(print_delay_handler, &count); 527 if (count > 0) { 528 total += count; 529 (void) printf("\n"); 530 count = 0; 531 } 532 533 (void) iter_handlers(print_data_handler, &count); 534 if (count > 0) { 535 total += count; 536 (void) printf("\n"); 537 count = 0; 538 } 539 540 (void) iter_handlers(print_panic_handler, &count); 541 542 return (count + total); 543 } 544 545 static int 546 cancel_one_handler(int id, const char *pool, zinject_record_t *record, 547 void *data) 548 { 549 (void) pool, (void) record, (void) data; 550 zfs_cmd_t zc = {"\0"}; 551 552 zc.zc_guid = (uint64_t)id; 553 554 if (zfs_ioctl(g_zfs, ZFS_IOC_CLEAR_FAULT, &zc) != 0) { 555 (void) fprintf(stderr, "failed to remove handler %d: %s\n", 556 id, strerror(errno)); 557 return (1); 558 } 559 560 return (0); 561 } 562 563 /* 564 * Remove all fault injection handlers. 565 */ 566 static int 567 cancel_all_handlers(void) 568 { 569 int ret = iter_handlers(cancel_one_handler, NULL); 570 571 if (ret == 0) 572 (void) printf("removed all registered handlers\n"); 573 574 return (ret); 575 } 576 577 /* 578 * Remove a specific fault injection handler. 
579 */ 580 static int 581 cancel_handler(int id) 582 { 583 zfs_cmd_t zc = {"\0"}; 584 585 zc.zc_guid = (uint64_t)id; 586 587 if (zfs_ioctl(g_zfs, ZFS_IOC_CLEAR_FAULT, &zc) != 0) { 588 (void) fprintf(stderr, "failed to remove handler %d: %s\n", 589 id, strerror(errno)); 590 return (1); 591 } 592 593 (void) printf("removed handler %d\n", id); 594 595 return (0); 596 } 597 598 /* 599 * Register a new fault injection handler. 600 */ 601 static int 602 register_handler(const char *pool, int flags, zinject_record_t *record, 603 int quiet) 604 { 605 zfs_cmd_t zc = {"\0"}; 606 607 (void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name)); 608 zc.zc_inject_record = *record; 609 zc.zc_guid = flags; 610 611 if (zfs_ioctl(g_zfs, ZFS_IOC_INJECT_FAULT, &zc) != 0) { 612 (void) fprintf(stderr, "failed to add handler: %s\n", 613 errno == EDOM ? "block level exceeds max level of object" : 614 strerror(errno)); 615 return (1); 616 } 617 618 if (flags & ZINJECT_NULL) 619 return (0); 620 621 if (quiet) { 622 (void) printf("%llu\n", (u_longlong_t)zc.zc_guid); 623 } else { 624 (void) printf("Added handler %llu with the following " 625 "properties:\n", (u_longlong_t)zc.zc_guid); 626 (void) printf(" pool: %s\n", pool); 627 if (record->zi_guid) { 628 (void) printf(" vdev: %llx\n", 629 (u_longlong_t)record->zi_guid); 630 } else if (record->zi_func[0] != '\0') { 631 (void) printf(" panic function: %s\n", 632 record->zi_func); 633 } else if (record->zi_duration > 0) { 634 (void) printf(" time: %lld seconds\n", 635 (u_longlong_t)record->zi_duration); 636 } else if (record->zi_duration < 0) { 637 (void) printf(" txgs: %lld \n", 638 (u_longlong_t)-record->zi_duration); 639 } else { 640 (void) printf("objset: %llu\n", 641 (u_longlong_t)record->zi_objset); 642 (void) printf("object: %llu\n", 643 (u_longlong_t)record->zi_object); 644 (void) printf(" type: %llu\n", 645 (u_longlong_t)record->zi_type); 646 (void) printf(" level: %d\n", record->zi_level); 647 if (record->zi_start == 0 && 648 record->zi_end 
== -1ULL) 649 (void) printf(" range: all\n"); 650 else 651 (void) printf(" range: [%llu, %llu)\n", 652 (u_longlong_t)record->zi_start, 653 (u_longlong_t)record->zi_end); 654 (void) printf(" dvas: 0x%x\n", record->zi_dvas); 655 } 656 } 657 658 return (0); 659 } 660 661 static int 662 perform_action(const char *pool, zinject_record_t *record, int cmd) 663 { 664 zfs_cmd_t zc = {"\0"}; 665 666 ASSERT(cmd == VDEV_STATE_DEGRADED || cmd == VDEV_STATE_FAULTED); 667 (void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name)); 668 zc.zc_guid = record->zi_guid; 669 zc.zc_cookie = cmd; 670 671 if (zfs_ioctl(g_zfs, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) 672 return (0); 673 674 return (1); 675 } 676 677 static int 678 parse_delay(char *str, uint64_t *delay, uint64_t *nlanes) 679 { 680 unsigned long scan_delay; 681 unsigned long scan_nlanes; 682 683 if (sscanf(str, "%lu:%lu", &scan_delay, &scan_nlanes) != 2) 684 return (1); 685 686 /* 687 * We explicitly disallow a delay of zero here, because we key 688 * off this value being non-zero in translate_device(), to 689 * determine if the fault is a ZINJECT_DELAY_IO fault or not. 690 */ 691 if (scan_delay == 0) 692 return (1); 693 694 /* 695 * The units for the CLI delay parameter is milliseconds, but 696 * the data passed to the kernel is interpreted as nanoseconds. 697 * Thus we scale the milliseconds to nanoseconds here, and this 698 * nanosecond value is used to pass the delay to the kernel. 
699 */ 700 *delay = MSEC2NSEC(scan_delay); 701 *nlanes = scan_nlanes; 702 703 return (0); 704 } 705 706 static int 707 parse_frequency(const char *str, uint32_t *percent) 708 { 709 double val; 710 char *post; 711 712 val = strtod(str, &post); 713 if (post == NULL || *post != '\0') 714 return (EINVAL); 715 716 /* valid range is [0.0001, 100.0] */ 717 val /= 100.0f; 718 if (val < 0.000001f || val > 1.0f) 719 return (ERANGE); 720 721 /* convert to an integer for use by kernel */ 722 *percent = ((uint32_t)(val * ZI_PERCENTAGE_MAX)); 723 724 return (0); 725 } 726 727 /* 728 * This function converts a string specifier for DVAs into a bit mask. 729 * The dva's provided by the user should be 0 indexed and separated by 730 * a comma. For example: 731 * "1" -> 0b0010 (0x2) 732 * "0,1" -> 0b0011 (0x3) 733 * "0,1,2" -> 0b0111 (0x7) 734 */ 735 static int 736 parse_dvas(const char *str, uint32_t *dvas_out) 737 { 738 const char *c = str; 739 uint32_t mask = 0; 740 boolean_t need_delim = B_FALSE; 741 742 /* max string length is 5 ("0,1,2") */ 743 if (strlen(str) > 5 || strlen(str) == 0) 744 return (EINVAL); 745 746 while (*c != '\0') { 747 switch (*c) { 748 case '0': 749 case '1': 750 case '2': 751 /* check for pipe between DVAs */ 752 if (need_delim) 753 return (EINVAL); 754 755 /* check if this DVA has been set already */ 756 if (mask & (1 << ((*c) - '0'))) 757 return (EINVAL); 758 759 mask |= (1 << ((*c) - '0')); 760 need_delim = B_TRUE; 761 break; 762 case ',': 763 need_delim = B_FALSE; 764 break; 765 default: 766 /* check for invalid character */ 767 return (EINVAL); 768 } 769 c++; 770 } 771 772 /* check for dangling delimiter */ 773 if (!need_delim) 774 return (EINVAL); 775 776 *dvas_out = mask; 777 return (0); 778 } 779 780 int 781 main(int argc, char **argv) 782 { 783 int c; 784 char *range = NULL; 785 char *cancel = NULL; 786 char *end; 787 char *raw = NULL; 788 char *device = NULL; 789 int level = 0; 790 int quiet = 0; 791 int error = 0; 792 int domount = 0; 793 int 
io_type = ZIO_TYPES; 794 int action = VDEV_STATE_UNKNOWN; 795 err_type_t type = TYPE_INVAL; 796 err_type_t label = TYPE_INVAL; 797 zinject_record_t record = { 0 }; 798 char pool[MAXNAMELEN] = ""; 799 char dataset[MAXNAMELEN] = ""; 800 zfs_handle_t *zhp = NULL; 801 int nowrites = 0; 802 int dur_txg = 0; 803 int dur_secs = 0; 804 int ret; 805 int flags = 0; 806 uint32_t dvas = 0; 807 808 if ((g_zfs = libzfs_init()) == NULL) { 809 (void) fprintf(stderr, "%s\n", libzfs_error_init(errno)); 810 return (1); 811 } 812 813 libzfs_print_on_error(g_zfs, B_TRUE); 814 815 if ((zfs_fd = open(ZFS_DEV, O_RDWR)) < 0) { 816 (void) fprintf(stderr, "failed to open ZFS device\n"); 817 libzfs_fini(g_zfs); 818 return (1); 819 } 820 821 if (argc == 1) { 822 /* 823 * No arguments. Print the available handlers. If there are no 824 * available handlers, direct the user to '-h' for help 825 * information. 826 */ 827 if (print_all_handlers() == 0) { 828 (void) printf("No handlers registered.\n"); 829 (void) printf("Run 'zinject -h' for usage " 830 "information.\n"); 831 } 832 libzfs_fini(g_zfs); 833 return (0); 834 } 835 836 while ((c = getopt(argc, argv, 837 ":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) { 838 switch (c) { 839 case 'a': 840 flags |= ZINJECT_FLUSH_ARC; 841 break; 842 case 'A': 843 if (strcasecmp(optarg, "degrade") == 0) { 844 action = VDEV_STATE_DEGRADED; 845 } else if (strcasecmp(optarg, "fault") == 0) { 846 action = VDEV_STATE_FAULTED; 847 } else { 848 (void) fprintf(stderr, "invalid action '%s': " 849 "must be 'degrade' or 'fault'\n", optarg); 850 usage(); 851 libzfs_fini(g_zfs); 852 return (1); 853 } 854 break; 855 case 'b': 856 raw = optarg; 857 break; 858 case 'c': 859 cancel = optarg; 860 break; 861 case 'C': 862 ret = parse_dvas(optarg, &dvas); 863 if (ret != 0) { 864 (void) fprintf(stderr, "invalid DVA list '%s': " 865 "DVAs should be 0 indexed and separated by " 866 "commas.\n", optarg); 867 usage(); 868 libzfs_fini(g_zfs); 869 return (1); 870 } 871 break; 872 
case 'd': 873 device = optarg; 874 break; 875 case 'D': 876 errno = 0; 877 ret = parse_delay(optarg, &record.zi_timer, 878 &record.zi_nlanes); 879 if (ret != 0) { 880 881 (void) fprintf(stderr, "invalid i/o delay " 882 "value: '%s'\n", optarg); 883 usage(); 884 libzfs_fini(g_zfs); 885 return (1); 886 } 887 break; 888 case 'e': 889 error = str_to_err(optarg); 890 if (error < 0) { 891 (void) fprintf(stderr, "invalid error type " 892 "'%s': must be one of: io decompress " 893 "decrypt nxio dtl corrupt noop\n", 894 optarg); 895 usage(); 896 libzfs_fini(g_zfs); 897 return (1); 898 } 899 break; 900 case 'f': 901 ret = parse_frequency(optarg, &record.zi_freq); 902 if (ret != 0) { 903 (void) fprintf(stderr, "%sfrequency value must " 904 "be in the range [0.0001, 100.0]\n", 905 ret == EINVAL ? "invalid value: " : 906 ret == ERANGE ? "out of range: " : ""); 907 libzfs_fini(g_zfs); 908 return (1); 909 } 910 break; 911 case 'F': 912 record.zi_failfast = B_TRUE; 913 break; 914 case 'g': 915 dur_txg = 1; 916 record.zi_duration = (int)strtol(optarg, &end, 10); 917 if (record.zi_duration <= 0 || *end != '\0') { 918 (void) fprintf(stderr, "invalid duration '%s': " 919 "must be a positive integer\n", optarg); 920 usage(); 921 libzfs_fini(g_zfs); 922 return (1); 923 } 924 /* store duration of txgs as its negative */ 925 record.zi_duration *= -1; 926 break; 927 case 'h': 928 usage(); 929 libzfs_fini(g_zfs); 930 return (0); 931 case 'I': 932 /* default duration, if one hasn't yet been defined */ 933 nowrites = 1; 934 if (dur_secs == 0 && dur_txg == 0) 935 record.zi_duration = 30; 936 break; 937 case 'l': 938 level = (int)strtol(optarg, &end, 10); 939 if (*end != '\0') { 940 (void) fprintf(stderr, "invalid level '%s': " 941 "must be an integer\n", optarg); 942 usage(); 943 libzfs_fini(g_zfs); 944 return (1); 945 } 946 break; 947 case 'm': 948 domount = 1; 949 break; 950 case 'p': 951 (void) strlcpy(record.zi_func, optarg, 952 sizeof (record.zi_func)); 953 record.zi_cmd = ZINJECT_PANIC; 
954 break; 955 case 'q': 956 quiet = 1; 957 break; 958 case 'r': 959 range = optarg; 960 flags |= ZINJECT_CALC_RANGE; 961 break; 962 case 's': 963 dur_secs = 1; 964 record.zi_duration = (int)strtol(optarg, &end, 10); 965 if (record.zi_duration <= 0 || *end != '\0') { 966 (void) fprintf(stderr, "invalid duration '%s': " 967 "must be a positive integer\n", optarg); 968 usage(); 969 libzfs_fini(g_zfs); 970 return (1); 971 } 972 break; 973 case 'T': 974 if (strcasecmp(optarg, "read") == 0) { 975 io_type = ZIO_TYPE_READ; 976 } else if (strcasecmp(optarg, "write") == 0) { 977 io_type = ZIO_TYPE_WRITE; 978 } else if (strcasecmp(optarg, "free") == 0) { 979 io_type = ZIO_TYPE_FREE; 980 } else if (strcasecmp(optarg, "claim") == 0) { 981 io_type = ZIO_TYPE_CLAIM; 982 } else if (strcasecmp(optarg, "flush") == 0) { 983 io_type = ZIO_TYPE_FLUSH; 984 } else if (strcasecmp(optarg, "all") == 0) { 985 io_type = ZIO_TYPES; 986 } else { 987 (void) fprintf(stderr, "invalid I/O type " 988 "'%s': must be 'read', 'write', 'free', " 989 "'claim', 'flush' or 'all'\n", optarg); 990 usage(); 991 libzfs_fini(g_zfs); 992 return (1); 993 } 994 break; 995 case 't': 996 if ((type = name_to_type(optarg)) == TYPE_INVAL && 997 !MOS_TYPE(type)) { 998 (void) fprintf(stderr, "invalid type '%s'\n", 999 optarg); 1000 usage(); 1001 libzfs_fini(g_zfs); 1002 return (1); 1003 } 1004 break; 1005 case 'u': 1006 flags |= ZINJECT_UNLOAD_SPA; 1007 break; 1008 case 'L': 1009 if ((label = name_to_type(optarg)) == TYPE_INVAL && 1010 !LABEL_TYPE(type)) { 1011 (void) fprintf(stderr, "invalid label type " 1012 "'%s'\n", optarg); 1013 usage(); 1014 libzfs_fini(g_zfs); 1015 return (1); 1016 } 1017 break; 1018 case ':': 1019 (void) fprintf(stderr, "option -%c requires an " 1020 "operand\n", optopt); 1021 usage(); 1022 libzfs_fini(g_zfs); 1023 return (1); 1024 case '?': 1025 (void) fprintf(stderr, "invalid option '%c'\n", 1026 optopt); 1027 usage(); 1028 libzfs_fini(g_zfs); 1029 return (2); 1030 } 1031 } 1032 1033 argc -= 
optind; 1034 argv += optind; 1035 1036 if (record.zi_duration != 0) 1037 record.zi_cmd = ZINJECT_IGNORED_WRITES; 1038 1039 if (cancel != NULL) { 1040 /* 1041 * '-c' is invalid with any other options. 1042 */ 1043 if (raw != NULL || range != NULL || type != TYPE_INVAL || 1044 level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED || 1045 record.zi_freq > 0 || dvas != 0) { 1046 (void) fprintf(stderr, "cancel (-c) incompatible with " 1047 "any other options\n"); 1048 usage(); 1049 libzfs_fini(g_zfs); 1050 return (2); 1051 } 1052 if (argc != 0) { 1053 (void) fprintf(stderr, "extraneous argument to '-c'\n"); 1054 usage(); 1055 libzfs_fini(g_zfs); 1056 return (2); 1057 } 1058 1059 if (strcmp(cancel, "all") == 0) { 1060 return (cancel_all_handlers()); 1061 } else { 1062 int id = (int)strtol(cancel, &end, 10); 1063 if (*end != '\0') { 1064 (void) fprintf(stderr, "invalid handle id '%s':" 1065 " must be an integer or 'all'\n", cancel); 1066 usage(); 1067 libzfs_fini(g_zfs); 1068 return (1); 1069 } 1070 return (cancel_handler(id)); 1071 } 1072 } 1073 1074 if (device != NULL) { 1075 /* 1076 * Device (-d) injection uses a completely different mechanism 1077 * for doing injection, so handle it separately here. 
1078 */ 1079 if (raw != NULL || range != NULL || type != TYPE_INVAL || 1080 level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED || 1081 dvas != 0) { 1082 (void) fprintf(stderr, "device (-d) incompatible with " 1083 "data error injection\n"); 1084 usage(); 1085 libzfs_fini(g_zfs); 1086 return (2); 1087 } 1088 1089 if (argc != 1) { 1090 (void) fprintf(stderr, "device (-d) injection requires " 1091 "a single pool name\n"); 1092 usage(); 1093 libzfs_fini(g_zfs); 1094 return (2); 1095 } 1096 1097 (void) strlcpy(pool, argv[0], sizeof (pool)); 1098 dataset[0] = '\0'; 1099 1100 if (error == ECKSUM) { 1101 (void) fprintf(stderr, "device error type must be " 1102 "'io', 'nxio' or 'corrupt'\n"); 1103 libzfs_fini(g_zfs); 1104 return (1); 1105 } 1106 1107 if (error == EILSEQ && 1108 (record.zi_freq == 0 || io_type != ZIO_TYPE_READ)) { 1109 (void) fprintf(stderr, "device corrupt errors require " 1110 "io type read and a frequency value\n"); 1111 libzfs_fini(g_zfs); 1112 return (1); 1113 } 1114 1115 record.zi_iotype = io_type; 1116 if (translate_device(pool, device, label, &record) != 0) { 1117 libzfs_fini(g_zfs); 1118 return (1); 1119 } 1120 1121 if (record.zi_nlanes) { 1122 switch (io_type) { 1123 case ZIO_TYPE_READ: 1124 case ZIO_TYPE_WRITE: 1125 case ZIO_TYPES: 1126 break; 1127 default: 1128 (void) fprintf(stderr, "I/O type for a delay " 1129 "must be 'read' or 'write'\n"); 1130 usage(); 1131 libzfs_fini(g_zfs); 1132 return (1); 1133 } 1134 } 1135 1136 if (!error) 1137 error = ENXIO; 1138 1139 if (action != VDEV_STATE_UNKNOWN) 1140 return (perform_action(pool, &record, action)); 1141 1142 } else if (raw != NULL) { 1143 if (range != NULL || type != TYPE_INVAL || level != 0 || 1144 record.zi_cmd != ZINJECT_UNINITIALIZED || 1145 record.zi_freq > 0 || dvas != 0) { 1146 (void) fprintf(stderr, "raw (-b) format with " 1147 "any other options\n"); 1148 usage(); 1149 libzfs_fini(g_zfs); 1150 return (2); 1151 } 1152 1153 if (argc != 1) { 1154 (void) fprintf(stderr, "raw (-b) format 
expects a " 1155 "single pool name\n"); 1156 usage(); 1157 libzfs_fini(g_zfs); 1158 return (2); 1159 } 1160 1161 (void) strlcpy(pool, argv[0], sizeof (pool)); 1162 dataset[0] = '\0'; 1163 1164 if (error == ENXIO) { 1165 (void) fprintf(stderr, "data error type must be " 1166 "'checksum' or 'io'\n"); 1167 libzfs_fini(g_zfs); 1168 return (1); 1169 } 1170 1171 record.zi_cmd = ZINJECT_DATA_FAULT; 1172 if (translate_raw(raw, &record) != 0) { 1173 libzfs_fini(g_zfs); 1174 return (1); 1175 } 1176 if (!error) 1177 error = EIO; 1178 } else if (record.zi_cmd == ZINJECT_PANIC) { 1179 if (raw != NULL || range != NULL || type != TYPE_INVAL || 1180 level != 0 || device != NULL || record.zi_freq > 0 || 1181 dvas != 0) { 1182 (void) fprintf(stderr, "panic (-p) incompatible with " 1183 "other options\n"); 1184 usage(); 1185 libzfs_fini(g_zfs); 1186 return (2); 1187 } 1188 1189 if (argc < 1 || argc > 2) { 1190 (void) fprintf(stderr, "panic (-p) injection requires " 1191 "a single pool name and an optional id\n"); 1192 usage(); 1193 libzfs_fini(g_zfs); 1194 return (2); 1195 } 1196 1197 (void) strlcpy(pool, argv[0], sizeof (pool)); 1198 if (argv[1] != NULL) 1199 record.zi_type = atoi(argv[1]); 1200 dataset[0] = '\0'; 1201 } else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) { 1202 if (raw != NULL || range != NULL || type != TYPE_INVAL || 1203 level != 0 || record.zi_freq > 0 || dvas != 0) { 1204 (void) fprintf(stderr, "hardware failure (-I) " 1205 "incompatible with other options\n"); 1206 usage(); 1207 libzfs_fini(g_zfs); 1208 return (2); 1209 } 1210 1211 if (nowrites == 0) { 1212 (void) fprintf(stderr, "-s or -g meaningless " 1213 "without -I (ignore writes)\n"); 1214 usage(); 1215 libzfs_fini(g_zfs); 1216 return (2); 1217 } else if (dur_secs && dur_txg) { 1218 (void) fprintf(stderr, "choose a duration either " 1219 "in seconds (-s) or a number of txgs (-g) " 1220 "but not both\n"); 1221 usage(); 1222 libzfs_fini(g_zfs); 1223 return (2); 1224 } else if (argc != 1) { 1225 (void) 
fprintf(stderr, "ignore writes (-I) " 1226 "injection requires a single pool name\n"); 1227 usage(); 1228 libzfs_fini(g_zfs); 1229 return (2); 1230 } 1231 1232 (void) strlcpy(pool, argv[0], sizeof (pool)); 1233 dataset[0] = '\0'; 1234 } else if (type == TYPE_INVAL) { 1235 if (flags == 0) { 1236 (void) fprintf(stderr, "at least one of '-b', '-d', " 1237 "'-t', '-a', '-p', '-I' or '-u' " 1238 "must be specified\n"); 1239 usage(); 1240 libzfs_fini(g_zfs); 1241 return (2); 1242 } 1243 1244 if (argc == 1 && (flags & ZINJECT_UNLOAD_SPA)) { 1245 (void) strlcpy(pool, argv[0], sizeof (pool)); 1246 dataset[0] = '\0'; 1247 } else if (argc != 0) { 1248 (void) fprintf(stderr, "extraneous argument for " 1249 "'-f'\n"); 1250 usage(); 1251 libzfs_fini(g_zfs); 1252 return (2); 1253 } 1254 1255 flags |= ZINJECT_NULL; 1256 } else { 1257 if (argc != 1) { 1258 (void) fprintf(stderr, "missing object\n"); 1259 usage(); 1260 libzfs_fini(g_zfs); 1261 return (2); 1262 } 1263 1264 if (error == ENXIO || error == EILSEQ) { 1265 (void) fprintf(stderr, "data error type must be " 1266 "'checksum' or 'io'\n"); 1267 libzfs_fini(g_zfs); 1268 return (1); 1269 } 1270 1271 if (dvas != 0) { 1272 if (error == EACCES || error == EINVAL) { 1273 (void) fprintf(stderr, "the '-C' option may " 1274 "not be used with logical data errors " 1275 "'decrypt' and 'decompress'\n"); 1276 libzfs_fini(g_zfs); 1277 return (1); 1278 } 1279 1280 record.zi_dvas = dvas; 1281 } 1282 1283 if (error == EACCES) { 1284 if (type != TYPE_DATA) { 1285 (void) fprintf(stderr, "decryption errors " 1286 "may only be injected for 'data' types\n"); 1287 libzfs_fini(g_zfs); 1288 return (1); 1289 } 1290 1291 record.zi_cmd = ZINJECT_DECRYPT_FAULT; 1292 /* 1293 * Internally, ZFS actually uses ECKSUM for decryption 1294 * errors since EACCES is used to indicate the key was 1295 * not found. 
1296 */ 1297 error = ECKSUM; 1298 } else { 1299 record.zi_cmd = ZINJECT_DATA_FAULT; 1300 } 1301 1302 if (translate_record(type, argv[0], range, level, &record, pool, 1303 dataset) != 0) { 1304 libzfs_fini(g_zfs); 1305 return (1); 1306 } 1307 if (!error) 1308 error = EIO; 1309 } 1310 1311 /* 1312 * If this is pool-wide metadata, unmount everything. The ioctl() will 1313 * unload the pool, so that we trigger spa-wide reopen of metadata next 1314 * time we access the pool. 1315 */ 1316 if (dataset[0] != '\0' && domount) { 1317 if ((zhp = zfs_open(g_zfs, dataset, 1318 ZFS_TYPE_DATASET)) == NULL) { 1319 libzfs_fini(g_zfs); 1320 return (1); 1321 } 1322 if (zfs_unmount(zhp, NULL, 0) != 0) { 1323 libzfs_fini(g_zfs); 1324 return (1); 1325 } 1326 } 1327 1328 record.zi_error = error; 1329 1330 ret = register_handler(pool, flags, &record, quiet); 1331 1332 if (dataset[0] != '\0' && domount) 1333 ret = (zfs_mount(zhp, NULL, 0) != 0); 1334 1335 libzfs_fini(g_zfs); 1336 1337 return (ret); 1338 } 1339