1 /*- 2 * Copyright (c) 2003-2004 Sean M. Kelly <smkelly@FreeBSD.org> 3 * Copyright (c) 2013 iXsystems.com, 4 * author: Alfred Perlstein <alfred@freebsd.org> 5 * 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 /* 31 * Software watchdog daemon. 32 */ 33 34 #include <sys/types.h> 35 __FBSDID("$FreeBSD$"); 36 37 #include <sys/mman.h> 38 #include <sys/param.h> 39 #include <sys/rtprio.h> 40 #include <sys/stat.h> 41 #include <sys/time.h> 42 #include <sys/watchdog.h> 43 44 #include <err.h> 45 #include <errno.h> 46 #include <fcntl.h> 47 #include <libutil.h> 48 #include <math.h> 49 #include <paths.h> 50 #include <signal.h> 51 #include <stdio.h> 52 #include <stdlib.h> 53 #include <string.h> 54 #include <strings.h> 55 #include <sysexits.h> 56 #include <syslog.h> 57 #include <unistd.h> 58 59 #include <getopt.h> 60 61 static void parseargs(int, char *[]); 62 static void sighandler(int); 63 static void watchdog_loop(void); 64 static int watchdog_init(void); 65 static int watchdog_onoff(int onoff); 66 static int watchdog_patpat(u_int timeout); 67 static void usage(void); 68 69 static int debugging = 0; 70 static int end_program = 0; 71 static const char *pidfile = _PATH_VARRUN "watchdogd.pid"; 72 static u_int timeout = WD_TO_128SEC; 73 static u_int pretimeout = 0; 74 static u_int passive = 0; 75 static int is_daemon = 0; 76 static int is_dry_run = 0; /* do not arm the watchdog, only 77 report on timing of the watch 78 program */ 79 static int do_timedog = 0; 80 static int do_syslog = 1; 81 static int fd = -1; 82 static int nap = 1; 83 static int carp_thresh_seconds = -1; 84 static char *test_cmd = NULL; 85 86 static const char *getopt_shortopts; 87 88 static int pretimeout_set; 89 static int pretimeout_act; 90 static int pretimeout_act_set; 91 92 static int softtimeout_set; 93 static int softtimeout_act; 94 static int softtimeout_act_set; 95 96 static struct option longopts[] = { 97 { "debug", no_argument, &debugging, 1 }, 98 { "pretimeout", required_argument, &pretimeout_set, 1 }, 99 { "pretimeout-action", required_argument, &pretimeout_act_set, 1 }, 100 { "softtimeout", no_argument, &softtimeout_set, 1 }, 101 { "softtimeout-action", required_argument, &softtimeout_act_set, 1 }, 102 { NULL, 0, NULL, 0} 103 }; 104 105 /* 106 * Ask malloc() to map minimum-sized chunks of virtual address space at a time, 107 * so that mlockall() won't needlessly wire megabytes of unused memory into the 108 * process. This must be done using the malloc_conf string so that it gets set 109 * up before the first allocation, which happens before entry to main(). 110 */ 111 const char * malloc_conf = "lg_chunk:0"; 112 113 /* 114 * Periodically pat the watchdog, preventing it from firing. 115 */ 116 int 117 main(int argc, char *argv[]) 118 { 119 struct rtprio rtp; 120 struct pidfh *pfh; 121 pid_t otherpid; 122 123 if (getuid() != 0) 124 errx(EX_SOFTWARE, "not super user"); 125 126 parseargs(argc, argv); 127 128 if (do_syslog) 129 openlog("watchdogd", LOG_CONS|LOG_NDELAY|LOG_PERROR, 130 LOG_DAEMON); 131 132 rtp.type = RTP_PRIO_REALTIME; 133 rtp.prio = 0; 134 if (rtprio(RTP_SET, 0, &rtp) == -1) 135 err(EX_OSERR, "rtprio"); 136 137 if (!is_dry_run && watchdog_init() == -1) 138 errx(EX_SOFTWARE, "unable to initialize watchdog"); 139 140 if (is_daemon) { 141 if (watchdog_onoff(1) == -1) 142 err(EX_OSERR, "patting the dog"); 143 144 pfh = pidfile_open(pidfile, 0600, &otherpid); 145 if (pfh == NULL) { 146 if (errno == EEXIST) { 147 watchdog_onoff(0); 148 errx(EX_SOFTWARE, "%s already running, pid: %d", 149 getprogname(), otherpid); 150 } 151 warn("Cannot open or create pidfile"); 152 } 153 154 if (debugging == 0 && daemon(0, 0) == -1) { 155 watchdog_onoff(0); 156 pidfile_remove(pfh); 157 err(EX_OSERR, "daemon"); 158 } 159 160 signal(SIGHUP, SIG_IGN); 161 signal(SIGINT, sighandler); 162 signal(SIGTERM, sighandler); 163 164 pidfile_write(pfh); 165 if (madvise(0, 0, MADV_PROTECT) != 0) 166 warn("madvise failed"); 167 if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) 168 warn("mlockall failed"); 169 170 watchdog_loop(); 171 172 /* exiting */ 173 pidfile_remove(pfh); 174 return (EX_OK); 175 } else { 176 if (passive) 177 timeout |= WD_PASSIVE; 178 else 179 timeout |= WD_ACTIVE; 180 if (watchdog_patpat(timeout) < 0) 181 err(EX_OSERR, "patting the dog"); 182 return (EX_OK); 183 } 184 } 185 186 /* 187 * Catch signals and begin shutdown process. 188 */ 189 static void 190 sighandler(int signum) 191 { 192 193 if (signum == SIGINT || signum == SIGTERM) 194 end_program = 1; 195 } 196 197 /* 198 * Open the watchdog device. 199 */ 200 static int 201 watchdog_init(void) 202 { 203 204 if (is_dry_run) 205 return 0; 206 207 fd = open("/dev/" _PATH_WATCHDOG, O_RDWR); 208 if (fd >= 0) 209 return (0); 210 warn("Could not open watchdog device"); 211 return (-1); 212 } 213 214 /* 215 * If we are doing timing, then get the time. 216 */ 217 static int 218 watchdog_getuptime(struct timespec *tp) 219 { 220 int error; 221 222 if (!do_timedog) 223 return 0; 224 225 error = clock_gettime(CLOCK_UPTIME_FAST, tp); 226 if (error) 227 warn("clock_gettime"); 228 return (error); 229 } 230 231 static long 232 watchdog_check_dogfunction_time(struct timespec *tp_start, 233 struct timespec *tp_end) 234 { 235 struct timeval tv_start, tv_end, tv_now, tv; 236 const char *cmd_prefix, *cmd; 237 struct timespec tp_now; 238 int sec; 239 240 if (!do_timedog) 241 return (0); 242 243 TIMESPEC_TO_TIMEVAL(&tv_start, tp_start); 244 TIMESPEC_TO_TIMEVAL(&tv_end, tp_end); 245 timersub(&tv_end, &tv_start, &tv); 246 sec = tv.tv_sec; 247 if (sec < carp_thresh_seconds) 248 return (sec); 249 250 if (test_cmd) { 251 cmd_prefix = "Watchdog program"; 252 cmd = test_cmd; 253 } else { 254 cmd_prefix = "Watchdog operation"; 255 cmd = "stat(\"/etc\", &sb)"; 256 } 257 if (do_syslog) 258 syslog(LOG_CRIT, "%s: '%s' took too long: " 259 "%d.%06ld seconds >= %d seconds threshold", 260 cmd_prefix, cmd, sec, (long)tv.tv_usec, 261 carp_thresh_seconds); 262 else 263 warnx("%s: '%s' took too long: " 264 "%d.%06ld seconds >= %d seconds threshold", 265 cmd_prefix, cmd, sec, (long)tv.tv_usec, 266 carp_thresh_seconds); 267 268 /* 269 * Adjust the sleep interval again in case syslog(3) took a non-trivial 270 * amount of time to run. 271 */ 272 if (watchdog_getuptime(&tp_now)) 273 return (sec); 274 TIMESPEC_TO_TIMEVAL(&tv_now, &tp_now); 275 timersub(&tv_now, &tv_start, &tv); 276 sec = tv.tv_sec; 277 278 return (sec); 279 } 280 281 /* 282 * Main program loop which is iterated every second. 283 */ 284 static void 285 watchdog_loop(void) 286 { 287 struct timespec ts_start, ts_end; 288 struct stat sb; 289 long waited; 290 int error, failed; 291 292 while (end_program != 2) { 293 failed = 0; 294 295 error = watchdog_getuptime(&ts_start); 296 if (error) { 297 end_program = 1; 298 goto try_end; 299 } 300 301 if (test_cmd != NULL) 302 failed = system(test_cmd); 303 else 304 failed = stat("/etc", &sb); 305 306 error = watchdog_getuptime(&ts_end); 307 if (error) { 308 end_program = 1; 309 goto try_end; 310 } 311 312 if (failed == 0) 313 watchdog_patpat(timeout|WD_ACTIVE); 314 315 waited = watchdog_check_dogfunction_time(&ts_start, &ts_end); 316 if (nap - waited > 0) 317 sleep(nap - waited); 318 319 try_end: 320 if (end_program != 0) { 321 if (watchdog_onoff(0) == 0) { 322 end_program = 2; 323 } else { 324 warnx("Could not stop the watchdog, not exiting"); 325 end_program = 0; 326 } 327 } 328 } 329 } 330 331 /* 332 * Reset the watchdog timer. This function must be called periodically 333 * to keep the watchdog from firing. 334 */ 335 static int 336 watchdog_patpat(u_int t) 337 { 338 339 if (is_dry_run) 340 return 0; 341 342 return ioctl(fd, WDIOCPATPAT, &t); 343 } 344 345 /* 346 * Toggle the kernel's watchdog. This routine is used to enable and 347 * disable the watchdog. 348 */ 349 static int 350 watchdog_onoff(int onoff) 351 { 352 int error; 353 354 /* fake successful watchdog op if a dry run */ 355 if (is_dry_run) 356 return 0; 357 358 if (onoff) { 359 /* 360 * Call the WDIOC_SETSOFT regardless of softtimeout_set 361 * because we'll need to turn it off if someone had turned 362 * it on. 363 */ 364 error = ioctl(fd, WDIOC_SETSOFT, &softtimeout_set); 365 if (error) { 366 warn("setting WDIOC_SETSOFT %d", softtimeout_set); 367 return (error); 368 } 369 error = watchdog_patpat((timeout|WD_ACTIVE)); 370 if (error) { 371 warn("watchdog_patpat failed"); 372 goto failsafe; 373 } 374 if (softtimeout_act_set) { 375 error = ioctl(fd, WDIOC_SETSOFTTIMEOUTACT, 376 &softtimeout_act); 377 if (error) { 378 warn("setting WDIOC_SETSOFTTIMEOUTACT %d", 379 softtimeout_act); 380 goto failsafe; 381 } 382 } 383 if (pretimeout_set) { 384 error = ioctl(fd, WDIOC_SETPRETIMEOUT, &pretimeout); 385 if (error) { 386 warn("setting WDIOC_SETPRETIMEOUT %d", 387 pretimeout); 388 goto failsafe; 389 } 390 } 391 if (pretimeout_act_set) { 392 error = ioctl(fd, WDIOC_SETPRETIMEOUTACT, 393 &pretimeout_act); 394 if (error) { 395 warn("setting WDIOC_SETPRETIMEOUTACT %d", 396 pretimeout_act); 397 goto failsafe; 398 } 399 } 400 /* pat one more time for good measure */ 401 return watchdog_patpat((timeout|WD_ACTIVE)); 402 } else { 403 return watchdog_patpat(0); 404 } 405 failsafe: 406 watchdog_patpat(0); 407 return (error); 408 } 409 410 /* 411 * Tell user how to use the program. 412 */ 413 static void 414 usage(void) 415 { 416 if (is_daemon) 417 fprintf(stderr, "usage:\n" 418 " watchdogd [-dnSw] [-e cmd] [-I file] [-s sleep] [-t timeout]\n" 419 " [-T script_timeout]\n" 420 " [--debug]\n" 421 " [--pretimeout seconds] [-pretimeout-action action]\n" 422 " [--softtimeout] [-softtimeout-action action]\n" 423 ); 424 else 425 fprintf(stderr, "usage: watchdog [-d] [-t timeout]\n"); 426 exit(EX_USAGE); 427 } 428 429 static long 430 fetchtimeout(int opt, const char *longopt, const char *myoptarg) 431 { 432 const char *errstr; 433 char *p; 434 long rv; 435 436 errstr = NULL; 437 p = NULL; 438 errno = 0; 439 rv = strtol(myoptarg, &p, 0); 440 if ((p != NULL && *p != '\0') || errno != 0) 441 errstr = "is not a number"; 442 if (rv <= 0) 443 errstr = "must be greater than zero"; 444 if (errstr) { 445 if (longopt) 446 errx(EX_USAGE, "--%s argument %s", longopt, errstr); 447 else 448 errx(EX_USAGE, "-%c argument %s", opt, errstr); 449 } 450 return (rv); 451 } 452 453 struct act_tbl { 454 const char *at_act; 455 int at_value; 456 }; 457 458 static const struct act_tbl act_tbl[] = { 459 { "panic", WD_SOFT_PANIC }, 460 { "ddb", WD_SOFT_DDB }, 461 { "log", WD_SOFT_LOG }, 462 { "printf", WD_SOFT_PRINTF }, 463 { NULL, 0 } 464 }; 465 466 static void 467 timeout_act_error(const char *lopt, const char *badact) 468 { 469 char *opts, *oldopts; 470 int i; 471 472 opts = NULL; 473 for (i = 0; act_tbl[i].at_act != NULL; i++) { 474 oldopts = opts; 475 if (asprintf(&opts, "%s%s%s", 476 oldopts == NULL ? "" : oldopts, 477 oldopts == NULL ? "" : ", ", 478 act_tbl[i].at_act) == -1) 479 err(EX_OSERR, "malloc"); 480 free(oldopts); 481 } 482 warnx("bad --%s argument '%s' must be one of (%s).", 483 lopt, badact, opts); 484 usage(); 485 } 486 487 /* 488 * Take a comma separated list of actions and or the flags 489 * together for the ioctl. 490 */ 491 static int 492 timeout_act_str2int(const char *lopt, const char *acts) 493 { 494 int i; 495 char *dupacts, *tofree; 496 char *o; 497 int rv = 0; 498 499 tofree = dupacts = strdup(acts); 500 if (!tofree) 501 err(EX_OSERR, "malloc"); 502 while ((o = strsep(&dupacts, ",")) != NULL) { 503 for (i = 0; act_tbl[i].at_act != NULL; i++) { 504 if (!strcmp(o, act_tbl[i].at_act)) { 505 rv |= act_tbl[i].at_value; 506 break; 507 } 508 } 509 if (act_tbl[i].at_act == NULL) 510 timeout_act_error(lopt, o); 511 } 512 free(tofree); 513 return rv; 514 } 515 516 /* 517 * Handle the few command line arguments supported. 518 */ 519 static void 520 parseargs(int argc, char *argv[]) 521 { 522 int longindex; 523 int c; 524 char *p; 525 const char *lopt; 526 double a; 527 528 /* 529 * if we end with a 'd' aka 'watchdogd' then we are the daemon program, 530 * otherwise run as a command line utility. 531 */ 532 c = strlen(argv[0]); 533 if (argv[0][c - 1] == 'd') 534 is_daemon = 1; 535 536 if (is_daemon) 537 getopt_shortopts = "I:de:ns:t:ST:w?"; 538 else 539 getopt_shortopts = "dt:?"; 540 541 while ((c = getopt_long(argc, argv, getopt_shortopts, longopts, 542 &longindex)) != -1) { 543 switch (c) { 544 case 'I': 545 pidfile = optarg; 546 break; 547 case 'd': 548 debugging = 1; 549 break; 550 case 'e': 551 test_cmd = strdup(optarg); 552 break; 553 case 'n': 554 is_dry_run = 1; 555 break; 556 #ifdef notyet 557 case 'p': 558 passive = 1; 559 break; 560 #endif 561 case 's': 562 nap = fetchtimeout(c, NULL, optarg); 563 break; 564 case 'S': 565 do_syslog = 0; 566 break; 567 case 't': 568 p = NULL; 569 errno = 0; 570 a = strtod(optarg, &p); 571 if ((p != NULL && *p != '\0') || errno != 0) 572 errx(EX_USAGE, "-t argument is not a number"); 573 if (a < 0) 574 errx(EX_USAGE, "-t argument must be positive"); 575 576 if (a == 0) 577 timeout = WD_TO_NEVER; 578 else 579 timeout = flsll(a * 1e9); 580 if (debugging) 581 printf("Timeout is 2^%d nanoseconds\n", 582 timeout); 583 break; 584 case 'T': 585 carp_thresh_seconds = fetchtimeout(c, "NULL", optarg); 586 break; 587 case 'w': 588 do_timedog = 1; 589 break; 590 case 0: 591 lopt = longopts[longindex].name; 592 if (!strcmp(lopt, "pretimeout")) { 593 pretimeout = fetchtimeout(0, lopt, optarg); 594 } else if (!strcmp(lopt, "pretimeout-action")) { 595 pretimeout_act = timeout_act_str2int(lopt, 596 optarg); 597 } else if (!strcmp(lopt, "softtimeout-action")) { 598 softtimeout_act = timeout_act_str2int(lopt, 599 optarg); 600 } else { 601 /* warnx("bad option at index %d: %s", optind, 602 argv[optind]); 603 usage(); 604 */ 605 } 606 break; 607 case '?': 608 default: 609 usage(); 610 /* NOTREACHED */ 611 } 612 } 613 614 if (carp_thresh_seconds == -1) 615 carp_thresh_seconds = nap; 616 617 if (argc != optind) 618 errx(EX_USAGE, "extra arguments."); 619 if (is_daemon && timeout < WD_TO_1SEC) 620 errx(EX_USAGE, "-t argument is less than one second."); 621 } 622