1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2003-2004 Sean M. Kelly <smkelly@FreeBSD.org> 5 * Copyright (c) 2013 iXsystems.com, 6 * author: Alfred Perlstein <alfred@freebsd.org> 7 * 8 * All rights reserved. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32 /* 33 * Software watchdog daemon. 34 */ 35 36 #include <sys/types.h> 37 #include <sys/mman.h> 38 #include <sys/param.h> 39 #include <sys/rtprio.h> 40 #include <sys/stat.h> 41 #include <sys/time.h> 42 #include <sys/sysctl.h> 43 #include <sys/watchdog.h> 44 45 #include <err.h> 46 #include <errno.h> 47 #include <fcntl.h> 48 #include <libutil.h> 49 #include <math.h> 50 #include <paths.h> 51 #include <signal.h> 52 #include <stdio.h> 53 #include <stdint.h> 54 #include <stdlib.h> 55 #include <string.h> 56 #include <strings.h> 57 #include <sysexits.h> 58 #include <syslog.h> 59 #include <unistd.h> 60 61 #include <getopt.h> 62 63 static long fetchtimeout(int opt, 64 const char *longopt, const char *myoptarg, int zero_ok); 65 static void parseargs(int, char *[]); 66 static void sighandler(int); 67 static void watchdog_loop(void); 68 static int watchdog_init(void); 69 static int watchdog_onoff(int onoff); 70 static int watchdog_patpat(sbintime_t); 71 static void usage(void); 72 static int tvtohz(struct timeval *tv); 73 74 static int debugging = 0; 75 static int end_program = 0; 76 static const char *pidfile = _PATH_VARRUN "watchdogd.pid"; 77 static sbintime_t timeout = 128 * SBT_1S; 78 static u_int exit_timeout = WD_TO_NEVER; 79 static u_int pretimeout = 0; 80 static u_int timeout_sec; 81 static u_int nap = 10; 82 #ifdef notyet 83 static int passive = 0; 84 #endif 85 static int is_daemon = 0; 86 static int is_dry_run = 0; /* do not arm the watchdog, only 87 report on timing of the watch 88 program */ 89 static int do_timedog = 0; 90 static int do_syslog = 1; 91 static int fd = -1; 92 static int carp_thresh_seconds = -1; 93 static char *test_cmd = NULL; 94 95 static const char *getopt_shortopts; 96 97 static int pretimeout_set; 98 static int pretimeout_act; 99 static int pretimeout_act_set; 100 101 static int softtimeout_set; 102 static int softtimeout_act; 103 static int softtimeout_act_set; 104 105 static struct option longopts[] = { 106 { "debug", no_argument, &debugging, 1 }, 107 { "pretimeout", required_argument, &pretimeout_set, 1 }, 108 { "pretimeout-action", required_argument, &pretimeout_act_set, 1 }, 109 { "softtimeout", no_argument, &softtimeout_set, 1 }, 110 { "softtimeout-action", required_argument, &softtimeout_act_set, 1 }, 111 { NULL, 0, NULL, 0} 112 }; 113 114 /* 115 * Periodically pat the watchdog, preventing it from firing. 116 */ 117 int 118 main(int argc, char *argv[]) 119 { 120 struct rtprio rtp; 121 struct pidfh *pfh; 122 pid_t otherpid; 123 124 if (getuid() != 0) 125 errx(EX_SOFTWARE, "not super user"); 126 127 parseargs(argc, argv); 128 129 if (do_syslog) 130 openlog("watchdogd", LOG_CONS|LOG_NDELAY|LOG_PERROR, 131 LOG_DAEMON); 132 133 rtp.type = RTP_PRIO_REALTIME; 134 rtp.prio = 0; 135 if (rtprio(RTP_SET, 0, &rtp) == -1) 136 err(EX_OSERR, "rtprio"); 137 138 if (!is_dry_run && watchdog_init() == -1) 139 errx(EX_SOFTWARE, "unable to initialize watchdog"); 140 141 if (is_daemon) { 142 if (watchdog_onoff(1) == -1) 143 err(EX_OSERR, "patting the dog"); 144 145 pfh = pidfile_open(pidfile, 0600, &otherpid); 146 if (pfh == NULL) { 147 if (errno == EEXIST) { 148 watchdog_onoff(0); 149 errx(EX_SOFTWARE, "%s already running, pid: %d", 150 getprogname(), otherpid); 151 } 152 warn("Cannot open or create pidfile"); 153 } 154 155 if (debugging == 0 && daemon(0, 0) == -1) { 156 watchdog_onoff(0); 157 pidfile_remove(pfh); 158 err(EX_OSERR, "daemon"); 159 } 160 161 signal(SIGHUP, SIG_IGN); 162 signal(SIGINT, sighandler); 163 signal(SIGTERM, sighandler); 164 165 pidfile_write(pfh); 166 if (madvise(0, 0, MADV_PROTECT) != 0) 167 warn("madvise failed"); 168 if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) 169 warn("mlockall failed"); 170 171 watchdog_loop(); 172 173 /* exiting */ 174 pidfile_remove(pfh); 175 return (EX_OK); 176 } else { 177 if (watchdog_patpat(timeout) < 0) 178 err(EX_OSERR, "patting the dog"); 179 return (EX_OK); 180 } 181 } 182 183 /* 184 * Convert a timeout in seconds to N where 2^N nanoseconds is close to 185 * "seconds". 186 * 187 * The kernel expects the timeouts for watchdogs in "2^N nanosecond format". 188 */ 189 static sbintime_t 190 parse_timeout_to_sbt(char opt, const char *longopt, const char *myoptarg) 191 { 192 long a; 193 sbintime_t rv; 194 struct timeval tv; 195 int ticks; 196 char shortopt[] = "- "; 197 198 if (!longopt) 199 shortopt[1] = opt; 200 201 a = fetchtimeout(opt, longopt, myoptarg, 1); 202 203 if (a == 0) 204 rv = 0; 205 else 206 rv = a * SBT_1S; 207 tv = sbttotv(rv); 208 ticks = tvtohz(&tv); 209 if (debugging) { 210 printf("Timeout for %s%s " 211 "is " 212 "(in: %s sec -> out: %jd sec %ld us -> %d ticks)\n", 213 longopt ? "-" : "", longopt ? longopt : shortopt, 214 myoptarg, (intmax_t)tv.tv_sec, tv.tv_usec, ticks); 215 } 216 if (ticks <= 0) { 217 errx(1, "Timeout for %s%s is too small, please choose a higher timeout.", longopt ? "-" : "", longopt ? longopt : shortopt); 218 } 219 220 return (rv); 221 } 222 223 /* 224 * Catch signals and begin shutdown process. 225 */ 226 static void 227 sighandler(int signum) 228 { 229 230 if (signum == SIGINT || signum == SIGTERM) 231 end_program = 1; 232 } 233 234 /* 235 * Open the watchdog device. 236 */ 237 static int 238 watchdog_init(void) 239 { 240 241 if (is_dry_run) 242 return 0; 243 244 fd = open("/dev/" _PATH_WATCHDOG, O_RDWR); 245 if (fd >= 0) 246 return (0); 247 warn("Could not open watchdog device"); 248 return (-1); 249 } 250 251 /* 252 * If we are doing timing, then get the time. 253 */ 254 static int 255 watchdog_getuptime(struct timespec *tp) 256 { 257 int error; 258 259 if (!do_timedog) 260 return 0; 261 262 error = clock_gettime(CLOCK_UPTIME_FAST, tp); 263 if (error) 264 warn("clock_gettime"); 265 return (error); 266 } 267 268 static long 269 watchdog_check_dogfunction_time(struct timespec *tp_start, 270 struct timespec *tp_end) 271 { 272 struct timeval tv_start, tv_end, tv_now, tv; 273 const char *cmd_prefix, *cmd; 274 struct timespec tp_now; 275 int sec; 276 277 if (!do_timedog) 278 return (0); 279 280 TIMESPEC_TO_TIMEVAL(&tv_start, tp_start); 281 TIMESPEC_TO_TIMEVAL(&tv_end, tp_end); 282 timersub(&tv_end, &tv_start, &tv); 283 sec = tv.tv_sec; 284 if (sec < carp_thresh_seconds) 285 return (sec); 286 287 if (test_cmd) { 288 cmd_prefix = "Watchdog program"; 289 cmd = test_cmd; 290 } else { 291 cmd_prefix = "Watchdog operation"; 292 cmd = "stat(\"/etc\", &sb)"; 293 } 294 if (do_syslog) 295 syslog(LOG_CRIT, "%s: '%s' took too long: " 296 "%d.%06ld seconds >= %d seconds threshold", 297 cmd_prefix, cmd, sec, (long)tv.tv_usec, 298 carp_thresh_seconds); 299 else 300 warnx("%s: '%s' took too long: " 301 "%d.%06ld seconds >= %d seconds threshold", 302 cmd_prefix, cmd, sec, (long)tv.tv_usec, 303 carp_thresh_seconds); 304 305 /* 306 * Adjust the sleep interval again in case syslog(3) took a non-trivial 307 * amount of time to run. 308 */ 309 if (watchdog_getuptime(&tp_now)) 310 return (sec); 311 TIMESPEC_TO_TIMEVAL(&tv_now, &tp_now); 312 timersub(&tv_now, &tv_start, &tv); 313 sec = tv.tv_sec; 314 315 return (sec); 316 } 317 318 /* 319 * Main program loop which is iterated every second. 320 */ 321 static void 322 watchdog_loop(void) 323 { 324 struct timespec ts_start, ts_end; 325 struct stat sb; 326 long waited; 327 int error, failed; 328 329 while (end_program != 2) { 330 failed = 0; 331 332 error = watchdog_getuptime(&ts_start); 333 if (error) { 334 end_program = 1; 335 goto try_end; 336 } 337 338 if (test_cmd != NULL) 339 failed = system(test_cmd); 340 else 341 failed = stat("/etc", &sb); 342 343 error = watchdog_getuptime(&ts_end); 344 if (error) { 345 end_program = 1; 346 goto try_end; 347 } 348 349 if (failed == 0) 350 watchdog_patpat(timeout); 351 352 waited = watchdog_check_dogfunction_time(&ts_start, &ts_end); 353 if (nap - waited > 0) 354 sleep(nap - waited); 355 356 try_end: 357 if (end_program != 0) { 358 if (watchdog_onoff(0) == 0) { 359 end_program = 2; 360 } else { 361 warnx("Could not stop the watchdog, not exiting"); 362 end_program = 0; 363 } 364 } 365 } 366 } 367 368 /* 369 * Reset the watchdog timer. This function must be called periodically 370 * to keep the watchdog from firing. 371 */ 372 static int 373 watchdog_patpat(sbintime_t sbt) 374 { 375 376 if (is_dry_run) 377 return 0; 378 379 return ioctl(fd, WDIOC_SETTIMEOUT, &sbt); 380 } 381 382 static int 383 watchdog_control(u_int control) 384 { 385 if (is_dry_run) 386 return (0); 387 388 return ioctl(fd, WDIOC_CONTROL, &control); 389 } 390 391 /* 392 * Toggle the kernel's watchdog. This routine is used to enable and 393 * disable the watchdog. 394 */ 395 static int 396 watchdog_onoff(int onoff) 397 { 398 int error; 399 400 /* fake successful watchdog op if a dry run */ 401 if (is_dry_run) 402 return 0; 403 404 if (onoff) { 405 /* 406 * Call the WDIOC_SETSOFT regardless of softtimeout_set 407 * because we'll need to turn it off if someone had turned 408 * it on. 409 */ 410 error = ioctl(fd, WDIOC_SETSOFT, &softtimeout_set); 411 if (error) { 412 warn("setting WDIOC_SETSOFT %d", softtimeout_set); 413 return (error); 414 } 415 error = watchdog_patpat(timeout); 416 if (error) { 417 warn("watchdog_patpat failed"); 418 goto failsafe; 419 } 420 if (softtimeout_act_set) { 421 error = ioctl(fd, WDIOC_SETSOFTTIMEOUTACT, 422 &softtimeout_act); 423 if (error) { 424 warn("setting WDIOC_SETSOFTTIMEOUTACT %d", 425 softtimeout_act); 426 goto failsafe; 427 } 428 } 429 if (pretimeout_set) { 430 error = ioctl(fd, WDIOC_SETPRETIMEOUT, &pretimeout); 431 if (error) { 432 warn("setting WDIOC_SETPRETIMEOUT %d", 433 pretimeout); 434 goto failsafe; 435 } 436 } 437 if (pretimeout_act_set) { 438 error = ioctl(fd, WDIOC_SETPRETIMEOUTACT, 439 &pretimeout_act); 440 if (error) { 441 warn("setting WDIOC_SETPRETIMEOUTACT %d", 442 pretimeout_act); 443 goto failsafe; 444 } 445 } 446 /* pat one more time for good measure */ 447 return watchdog_patpat(timeout); 448 } else { 449 return watchdog_control(WD_CTRL_DISABLE); 450 } 451 failsafe: 452 watchdog_control(WD_CTRL_DISABLE); 453 return (error); 454 } 455 456 /* 457 * Tell user how to use the program. 458 */ 459 static void 460 usage(void) 461 { 462 if (is_daemon) 463 fprintf(stderr, "usage:\n" 464 " watchdogd [-dnSw] [-e cmd] [-I pidfile] [-s sleep] [-t timeout]\n" 465 " [-T script_timeout] [-x exit_timeout]\n" 466 " [--debug]\n" 467 " [--pretimeout seconds] [-pretimeout-action action]\n" 468 " [--softtimeout] [-softtimeout-action action]\n" 469 ); 470 else 471 fprintf(stderr, "usage: watchdog [-d] [-t timeout]\n"); 472 exit(EX_USAGE); 473 } 474 475 static long 476 fetchtimeout(int opt, const char *longopt, const char *myoptarg, int zero_ok) 477 { 478 const char *errstr; 479 char *p; 480 long rv; 481 482 errstr = NULL; 483 p = NULL; 484 errno = 0; 485 rv = strtol(myoptarg, &p, 0); 486 if ((p != NULL && *p != '\0') || errno != 0) 487 errstr = "is not a number"; 488 if (rv < 0 || (!zero_ok && rv == 0)) 489 errstr = "must be greater than zero"; 490 if (errstr) { 491 if (longopt) 492 errx(EX_USAGE, "--%s argument %s", longopt, errstr); 493 else 494 errx(EX_USAGE, "-%c argument %s", opt, errstr); 495 } 496 return (rv); 497 } 498 499 struct act_tbl { 500 const char *at_act; 501 int at_value; 502 }; 503 504 static const struct act_tbl act_tbl[] = { 505 { "panic", WD_SOFT_PANIC }, 506 { "ddb", WD_SOFT_DDB }, 507 { "log", WD_SOFT_LOG }, 508 { "printf", WD_SOFT_PRINTF }, 509 { NULL, 0 } 510 }; 511 512 static void 513 timeout_act_error(const char *lopt, const char *badact) 514 { 515 char *opts, *oldopts; 516 int i; 517 518 opts = NULL; 519 for (i = 0; act_tbl[i].at_act != NULL; i++) { 520 oldopts = opts; 521 if (asprintf(&opts, "%s%s%s", 522 oldopts == NULL ? "" : oldopts, 523 oldopts == NULL ? "" : ", ", 524 act_tbl[i].at_act) == -1) 525 err(EX_OSERR, "malloc"); 526 free(oldopts); 527 } 528 warnx("bad --%s argument '%s' must be one of (%s).", 529 lopt, badact, opts); 530 usage(); 531 } 532 533 /* 534 * Take a comma separated list of actions and or the flags 535 * together for the ioctl. 536 */ 537 static int 538 timeout_act_str2int(const char *lopt, const char *acts) 539 { 540 int i; 541 char *dupacts, *tofree; 542 char *o; 543 int rv = 0; 544 545 tofree = dupacts = strdup(acts); 546 if (!tofree) 547 err(EX_OSERR, "malloc"); 548 while ((o = strsep(&dupacts, ",")) != NULL) { 549 for (i = 0; act_tbl[i].at_act != NULL; i++) { 550 if (!strcmp(o, act_tbl[i].at_act)) { 551 rv |= act_tbl[i].at_value; 552 break; 553 } 554 } 555 if (act_tbl[i].at_act == NULL) 556 timeout_act_error(lopt, o); 557 } 558 free(tofree); 559 return rv; 560 } 561 562 /* 563 * Convert a timeval to a number of ticks. 564 * Mostly copied from the kernel. 565 */ 566 int 567 tvtohz(struct timeval *tv) 568 { 569 register unsigned long ticks; 570 register long sec, usec; 571 int hz; 572 size_t hzsize; 573 int error; 574 int tick; 575 576 hzsize = sizeof(hz); 577 578 error = sysctlbyname("kern.hz", &hz, &hzsize, NULL, 0); 579 if (error) 580 err(1, "sysctlbyname kern.hz"); 581 582 tick = 1000000 / hz; 583 584 /* 585 * If the number of usecs in the whole seconds part of the time 586 * difference fits in a long, then the total number of usecs will 587 * fit in an unsigned long. Compute the total and convert it to 588 * ticks, rounding up and adding 1 to allow for the current tick 589 * to expire. Rounding also depends on unsigned long arithmetic 590 * to avoid overflow. 591 * 592 * Otherwise, if the number of ticks in the whole seconds part of 593 * the time difference fits in a long, then convert the parts to 594 * ticks separately and add, using similar rounding methods and 595 * overflow avoidance. This method would work in the previous 596 * case but it is slightly slower and assumes that hz is integral. 597 * 598 * Otherwise, round the time difference down to the maximum 599 * representable value. 600 * 601 * If ints have 32 bits, then the maximum value for any timeout in 602 * 10ms ticks is 248 days. 603 */ 604 sec = tv->tv_sec; 605 usec = tv->tv_usec; 606 if (usec < 0) { 607 sec--; 608 usec += 1000000; 609 } 610 if (sec < 0) { 611 #ifdef DIAGNOSTIC 612 if (usec > 0) { 613 sec++; 614 usec -= 1000000; 615 } 616 printf("tvotohz: negative time difference %ld sec %ld usec\n", 617 sec, usec); 618 #endif 619 ticks = 1; 620 } else if (sec <= LONG_MAX / 1000000) 621 ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1)) 622 / tick + 1; 623 else if (sec <= LONG_MAX / hz) 624 ticks = sec * hz 625 + ((unsigned long)usec + (tick - 1)) / tick + 1; 626 else 627 ticks = LONG_MAX; 628 if (ticks > INT_MAX) 629 ticks = INT_MAX; 630 return ((int)ticks); 631 } 632 633 /* 634 * Handle the few command line arguments supported. 635 */ 636 static void 637 parseargs(int argc, char *argv[]) 638 { 639 struct timespec ts; 640 int longindex; 641 int c; 642 const char *lopt; 643 644 /* Get the default value of timeout_sec from the default timeout. */ 645 timeout_sec = sbintime_getsec(timeout); 646 647 /* 648 * if we end with a 'd' aka 'watchdogd' then we are the daemon program, 649 * otherwise run as a command line utility. 650 */ 651 c = strlen(argv[0]); 652 if (argv[0][c - 1] == 'd') 653 is_daemon = 1; 654 655 if (is_daemon) 656 getopt_shortopts = "I:de:ns:t:ST:wx:?"; 657 else 658 getopt_shortopts = "dt:?"; 659 660 while ((c = getopt_long(argc, argv, getopt_shortopts, longopts, 661 &longindex)) != -1) { 662 switch (c) { 663 case 'I': 664 pidfile = optarg; 665 break; 666 case 'd': 667 debugging = 1; 668 break; 669 case 'e': 670 test_cmd = strdup(optarg); 671 break; 672 case 'n': 673 is_dry_run = 1; 674 break; 675 #ifdef notyet 676 case 'p': 677 passive = 1; 678 break; 679 #endif 680 case 's': 681 nap = fetchtimeout(c, NULL, optarg, 0); 682 break; 683 case 'S': 684 do_syslog = 0; 685 break; 686 case 't': 687 timeout_sec = atoi(optarg); 688 timeout = parse_timeout_to_sbt(c, NULL, optarg); 689 if (debugging) 690 printf("Timeout is %d\n", 691 (int)(timeout / SBT_1S)); 692 break; 693 case 'T': 694 carp_thresh_seconds = 695 fetchtimeout(c, "NULL", optarg, 0); 696 break; 697 case 'w': 698 do_timedog = 1; 699 break; 700 case 'x': 701 exit_timeout = parse_timeout_to_sbt(c, NULL, optarg); 702 if (exit_timeout != 0) 703 exit_timeout |= WD_ACTIVE; 704 break; 705 case 0: 706 lopt = longopts[longindex].name; 707 if (!strcmp(lopt, "pretimeout")) { 708 pretimeout = fetchtimeout(0, lopt, optarg, 0); 709 } else if (!strcmp(lopt, "pretimeout-action")) { 710 pretimeout_act = timeout_act_str2int(lopt, 711 optarg); 712 } else if (!strcmp(lopt, "softtimeout-action")) { 713 softtimeout_act = timeout_act_str2int(lopt, 714 optarg); 715 } else { 716 /* warnx("bad option at index %d: %s", optind, 717 argv[optind]); 718 usage(); 719 */ 720 } 721 break; 722 case '?': 723 default: 724 usage(); 725 /* NOTREACHED */ 726 } 727 } 728 729 if (nap > timeout_sec / 2) 730 nap = timeout_sec / 2; 731 732 if (carp_thresh_seconds == -1) 733 carp_thresh_seconds = nap; 734 735 if (argc != optind) 736 errx(EX_USAGE, "extra arguments."); 737 if (is_daemon && timeout < WD_TO_1SEC) 738 errx(EX_USAGE, "-t argument is less than one second."); 739 if (pretimeout_set) { 740 if (pretimeout >= timeout_sec) { 741 errx(EX_USAGE, 742 "pretimeout (%d) >= timeout (%d -> %ld)\n" 743 "see manual section TIMEOUT RESOLUTION", 744 pretimeout, timeout_sec, (long)ts.tv_sec); 745 } 746 } 747 } 748