1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2003-2004 Sean M. Kelly <smkelly@FreeBSD.org>
5 * Copyright (c) 2013 iXsystems.com,
6 * author: Alfred Perlstein <alfred@freebsd.org>
7 *
8 * All rights reserved.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32 /*
33 * Software watchdog daemon.
34 */
35
36 #include <sys/types.h>
37 #include <sys/mman.h>
38 #include <sys/param.h>
39 #include <sys/rtprio.h>
40 #include <sys/stat.h>
41 #include <sys/time.h>
42 #include <sys/sysctl.h>
43 #include <sys/watchdog.h>
44
45 #include <err.h>
46 #include <errno.h>
47 #include <fcntl.h>
48 #include <libutil.h>
49 #include <math.h>
50 #include <paths.h>
51 #include <signal.h>
52 #include <stdio.h>
53 #include <stdint.h>
54 #include <stdlib.h>
55 #include <string.h>
56 #include <strings.h>
57 #include <sysexits.h>
58 #include <syslog.h>
59 #include <unistd.h>
60
61 #include <getopt.h>
62
/*
 * Forward declarations for file-local helpers that are referenced before
 * their definitions appear further down in this file.
 */
static long fetchtimeout(int opt,
    const char *longopt, const char *myoptarg, int zero_ok);
static void parseargs(int, char *[]);
static void sighandler(int);
static void watchdog_loop(void);
static int watchdog_init(void);
static int watchdog_onoff(int onoff);
static int watchdog_patpat(sbintime_t);
static void usage(void);
static int tvtohz(struct timeval *tv);
73
74 static int debugging = 0;
75 static int end_program = 0;
76 static const char *pidfile = _PATH_VARRUN "watchdogd.pid";
77 static sbintime_t timeout = 128 * SBT_1S;
78 static u_int exit_timeout = WD_TO_NEVER;
79 static u_int pretimeout = 0;
80 static u_int timeout_sec;
81 static u_int nap = 10;
82 #ifdef notyet
83 static int passive = 0;
84 #endif
85 static int is_daemon = 0;
86 static int is_dry_run = 0; /* do not arm the watchdog, only
87 report on timing of the watch
88 program */
89 static int do_timedog = 0;
90 static int do_syslog = 1;
91 static int fd = -1;
92 static int carp_thresh_seconds = -1;
93 static char *test_cmd = NULL;
94
95 static const char *getopt_shortopts;
96
97 static int pretimeout_set;
98 static int pretimeout_act;
99 static int pretimeout_act_set;
100
101 static int softtimeout_set;
102 static int softtimeout_act;
103 static int softtimeout_act_set;
104
/*
 * Long options understood by getopt_long(3).  Every entry has a non-NULL
 * flag pointer, so getopt_long() stores 1 into the named variable and
 * returns 0; parseargs() then looks the option up by name to consume its
 * argument, if any.
 */
static struct option longopts[] = {
	{ "debug", no_argument, &debugging, 1 },
	{ "pretimeout", required_argument, &pretimeout_set, 1 },
	{ "pretimeout-action", required_argument, &pretimeout_act_set, 1 },
	{ "softtimeout", no_argument, &softtimeout_set, 1 },
	{ "softtimeout-action", required_argument, &softtimeout_act_set, 1 },
	{ NULL, 0, NULL, 0}
};
113
114 /*
115 * Periodically pat the watchdog, preventing it from firing.
116 */
117 int
main(int argc,char * argv[])118 main(int argc, char *argv[])
119 {
120 struct rtprio rtp;
121 struct pidfh *pfh;
122 pid_t otherpid;
123
124 if (getuid() != 0)
125 errx(EX_SOFTWARE, "not super user");
126
127 parseargs(argc, argv);
128
129 if (do_syslog)
130 openlog("watchdogd", LOG_CONS|LOG_NDELAY|LOG_PERROR,
131 LOG_DAEMON);
132
133 rtp.type = RTP_PRIO_REALTIME;
134 rtp.prio = 0;
135 if (rtprio(RTP_SET, 0, &rtp) == -1)
136 err(EX_OSERR, "rtprio");
137
138 if (!is_dry_run && watchdog_init() == -1)
139 errx(EX_SOFTWARE, "unable to initialize watchdog");
140
141 if (is_daemon) {
142 if (watchdog_onoff(1) == -1)
143 err(EX_OSERR, "patting the dog");
144
145 pfh = pidfile_open(pidfile, 0600, &otherpid);
146 if (pfh == NULL) {
147 if (errno == EEXIST) {
148 watchdog_onoff(0);
149 errx(EX_SOFTWARE, "%s already running, pid: %d",
150 getprogname(), otherpid);
151 }
152 warn("Cannot open or create pidfile");
153 }
154
155 if (debugging == 0 && daemon(0, 0) == -1) {
156 watchdog_onoff(0);
157 pidfile_remove(pfh);
158 err(EX_OSERR, "daemon");
159 }
160
161 signal(SIGHUP, SIG_IGN);
162 signal(SIGINT, sighandler);
163 signal(SIGTERM, sighandler);
164
165 pidfile_write(pfh);
166 if (madvise(0, 0, MADV_PROTECT) != 0)
167 warn("madvise failed");
168 if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0)
169 warn("mlockall failed");
170
171 watchdog_loop();
172
173 /* exiting */
174 pidfile_remove(pfh);
175 return (EX_OK);
176 } else {
177 if (watchdog_patpat(timeout) < 0)
178 err(EX_OSERR, "patting the dog");
179 return (EX_OK);
180 }
181 }
182
183 /*
184 * Convert a timeout in seconds to N where 2^N nanoseconds is close to
185 * "seconds".
186 *
187 * The kernel expects the timeouts for watchdogs in "2^N nanosecond format".
188 */
189 static sbintime_t
parse_timeout_to_sbt(char opt,const char * longopt,const char * myoptarg)190 parse_timeout_to_sbt(char opt, const char *longopt, const char *myoptarg)
191 {
192 long a;
193 sbintime_t rv;
194 struct timeval tv;
195 int ticks;
196 char shortopt[] = "- ";
197
198 if (!longopt)
199 shortopt[1] = opt;
200
201 a = fetchtimeout(opt, longopt, myoptarg, 1);
202
203 if (a == 0)
204 rv = 0;
205 else
206 rv = a * SBT_1S;
207 tv = sbttotv(rv);
208 ticks = tvtohz(&tv);
209 if (debugging) {
210 printf("Timeout for %s%s "
211 "is "
212 "(in: %s sec -> out: %jd sec %ld us -> %d ticks)\n",
213 longopt ? "-" : "", longopt ? longopt : shortopt,
214 myoptarg, (intmax_t)tv.tv_sec, tv.tv_usec, ticks);
215 }
216 if (ticks <= 0) {
217 errx(1, "Timeout for %s%s is too small, please choose a higher timeout.", longopt ? "-" : "", longopt ? longopt : shortopt);
218 }
219
220 return (rv);
221 }
222
223 /*
224 * Catch signals and begin shutdown process.
225 */
226 static void
sighandler(int signum)227 sighandler(int signum)
228 {
229
230 if (signum == SIGINT || signum == SIGTERM)
231 end_program = 1;
232 }
233
234 /*
235 * Open the watchdog device.
236 */
237 static int
watchdog_init(void)238 watchdog_init(void)
239 {
240
241 if (is_dry_run)
242 return 0;
243
244 fd = open("/dev/" _PATH_WATCHDOG, O_RDWR);
245 if (fd >= 0)
246 return (0);
247 warn("Could not open watchdog device");
248 return (-1);
249 }
250
251 /*
252 * If we are doing timing, then get the time.
253 */
254 static int
watchdog_getuptime(struct timespec * tp)255 watchdog_getuptime(struct timespec *tp)
256 {
257 int error;
258
259 if (!do_timedog)
260 return 0;
261
262 error = clock_gettime(CLOCK_UPTIME_FAST, tp);
263 if (error)
264 warn("clock_gettime");
265 return (error);
266 }
267
268 static long
watchdog_check_dogfunction_time(struct timespec * tp_start,struct timespec * tp_end)269 watchdog_check_dogfunction_time(struct timespec *tp_start,
270 struct timespec *tp_end)
271 {
272 struct timeval tv_start, tv_end, tv_now, tv;
273 const char *cmd_prefix, *cmd;
274 struct timespec tp_now;
275 int sec;
276
277 if (!do_timedog)
278 return (0);
279
280 TIMESPEC_TO_TIMEVAL(&tv_start, tp_start);
281 TIMESPEC_TO_TIMEVAL(&tv_end, tp_end);
282 timersub(&tv_end, &tv_start, &tv);
283 sec = tv.tv_sec;
284 if (sec < carp_thresh_seconds)
285 return (sec);
286
287 if (test_cmd) {
288 cmd_prefix = "Watchdog program";
289 cmd = test_cmd;
290 } else {
291 cmd_prefix = "Watchdog operation";
292 cmd = "stat(\"/etc\", &sb)";
293 }
294 if (do_syslog)
295 syslog(LOG_CRIT, "%s: '%s' took too long: "
296 "%d.%06ld seconds >= %d seconds threshold",
297 cmd_prefix, cmd, sec, (long)tv.tv_usec,
298 carp_thresh_seconds);
299 else
300 warnx("%s: '%s' took too long: "
301 "%d.%06ld seconds >= %d seconds threshold",
302 cmd_prefix, cmd, sec, (long)tv.tv_usec,
303 carp_thresh_seconds);
304
305 /*
306 * Adjust the sleep interval again in case syslog(3) took a non-trivial
307 * amount of time to run.
308 */
309 if (watchdog_getuptime(&tp_now))
310 return (sec);
311 TIMESPEC_TO_TIMEVAL(&tv_now, &tp_now);
312 timersub(&tv_now, &tv_start, &tv);
313 sec = tv.tv_sec;
314
315 return (sec);
316 }
317
318 /*
319 * Main program loop which is iterated every second.
320 */
321 static void
watchdog_loop(void)322 watchdog_loop(void)
323 {
324 struct timespec ts_start, ts_end;
325 struct stat sb;
326 long waited;
327 int error, failed;
328
329 while (end_program != 2) {
330 failed = 0;
331
332 error = watchdog_getuptime(&ts_start);
333 if (error) {
334 end_program = 1;
335 goto try_end;
336 }
337
338 if (test_cmd != NULL)
339 failed = system(test_cmd);
340 else
341 failed = stat("/etc", &sb);
342
343 error = watchdog_getuptime(&ts_end);
344 if (error) {
345 end_program = 1;
346 goto try_end;
347 }
348
349 if (failed == 0)
350 watchdog_patpat(timeout);
351
352 waited = watchdog_check_dogfunction_time(&ts_start, &ts_end);
353 if (nap - waited > 0)
354 sleep(nap - waited);
355
356 try_end:
357 if (end_program != 0) {
358 if (watchdog_onoff(0) == 0) {
359 end_program = 2;
360 } else {
361 warnx("Could not stop the watchdog, not exiting");
362 end_program = 0;
363 }
364 }
365 }
366 }
367
368 /*
369 * Reset the watchdog timer. This function must be called periodically
370 * to keep the watchdog from firing.
371 */
372 static int
watchdog_patpat(sbintime_t sbt)373 watchdog_patpat(sbintime_t sbt)
374 {
375
376 if (is_dry_run)
377 return 0;
378
379 return ioctl(fd, WDIOC_SETTIMEOUT, &sbt);
380 }
381
382 static int
watchdog_control(u_int control)383 watchdog_control(u_int control)
384 {
385 if (is_dry_run)
386 return (0);
387
388 return ioctl(fd, WDIOC_CONTROL, &control);
389 }
390
391 /*
392 * Toggle the kernel's watchdog. This routine is used to enable and
393 * disable the watchdog.
394 */
395 static int
watchdog_onoff(int onoff)396 watchdog_onoff(int onoff)
397 {
398 int error;
399
400 /* fake successful watchdog op if a dry run */
401 if (is_dry_run)
402 return 0;
403
404 if (onoff) {
405 /*
406 * Call the WDIOC_SETSOFT regardless of softtimeout_set
407 * because we'll need to turn it off if someone had turned
408 * it on.
409 */
410 error = ioctl(fd, WDIOC_SETSOFT, &softtimeout_set);
411 if (error) {
412 warn("setting WDIOC_SETSOFT %d", softtimeout_set);
413 return (error);
414 }
415 error = watchdog_patpat(timeout);
416 if (error) {
417 warn("watchdog_patpat failed");
418 goto failsafe;
419 }
420 if (softtimeout_act_set) {
421 error = ioctl(fd, WDIOC_SETSOFTTIMEOUTACT,
422 &softtimeout_act);
423 if (error) {
424 warn("setting WDIOC_SETSOFTTIMEOUTACT %d",
425 softtimeout_act);
426 goto failsafe;
427 }
428 }
429 if (pretimeout_set) {
430 error = ioctl(fd, WDIOC_SETPRETIMEOUT, &pretimeout);
431 if (error) {
432 warn("setting WDIOC_SETPRETIMEOUT %d",
433 pretimeout);
434 goto failsafe;
435 }
436 }
437 if (pretimeout_act_set) {
438 error = ioctl(fd, WDIOC_SETPRETIMEOUTACT,
439 &pretimeout_act);
440 if (error) {
441 warn("setting WDIOC_SETPRETIMEOUTACT %d",
442 pretimeout_act);
443 goto failsafe;
444 }
445 }
446 /* pat one more time for good measure */
447 return watchdog_patpat(timeout);
448 } else {
449 return watchdog_control(WD_CTRL_DISABLE);
450 }
451 failsafe:
452 watchdog_control(WD_CTRL_DISABLE);
453 return (error);
454 }
455
456 /*
457 * Tell user how to use the program.
458 */
459 static void
usage(void)460 usage(void)
461 {
462 if (is_daemon)
463 fprintf(stderr, "usage:\n"
464 " watchdogd [-dnSw] [-e cmd] [-I pidfile] [-s sleep] [-t timeout]\n"
465 " [-T script_timeout] [-x exit_timeout]\n"
466 " [--debug]\n"
467 " [--pretimeout seconds] [-pretimeout-action action]\n"
468 " [--softtimeout] [-softtimeout-action action]\n"
469 );
470 else
471 fprintf(stderr, "usage: watchdog [-d] [-t timeout]\n");
472 exit(EX_USAGE);
473 }
474
/*
 * Parse a non-negative number of seconds from an option argument.
 *
 * opt      - short option character, used in the error message when
 *            longopt is NULL
 * longopt  - long option name for the error message, or NULL
 * myoptarg - the text to parse; any base understood by strtol(3) with
 *            base 0 is accepted (decimal, 0x hex, leading-0 octal)
 * zero_ok  - non-zero when a value of 0 is acceptable
 *
 * Returns the parsed value.  Exits with EX_USAGE via errx(3) when the
 * text is empty, has trailing garbage, is out of range for strtol(3), is
 * negative, is zero while !zero_ok, or exceeds INT_MAX (callers store the
 * result in int-sized variables).
 */
static long
fetchtimeout(int opt, const char *longopt, const char *myoptarg, int zero_ok)
{
	const char *errstr;
	char *end;
	long rv;

	errstr = NULL;
	end = NULL;
	errno = 0;
	rv = strtol(myoptarg, &end, 0);
	/* end == myoptarg means no digits were consumed at all. */
	if (end == myoptarg || *end != '\0' || errno != 0)
		errstr = "is not a number";
	else if (rv < 0 || (!zero_ok && rv == 0))
		errstr = "must be greater than zero";
	else if (rv > INT_MAX)
		errstr = "is too large";
	if (errstr) {
		if (longopt)
			errx(EX_USAGE, "--%s argument %s", longopt, errstr);
		else
			errx(EX_USAGE, "-%c argument %s", opt, errstr);
	}
	return (rv);
}
498
/* Maps an action name accepted on the command line to its flag value. */
struct act_tbl {
	const char *at_act;	/* action name as typed by the user */
	int at_value;		/* corresponding WD_SOFT_* flag */
};

/*
 * Actions accepted by --pretimeout-action and --softtimeout-action.
 * The table is terminated by an entry whose at_act is NULL.
 */
static const struct act_tbl act_tbl[] = {
	{ "panic", WD_SOFT_PANIC },
	{ "ddb", WD_SOFT_DDB },
	{ "log", WD_SOFT_LOG },
	{ "printf", WD_SOFT_PRINTF },
	{ NULL, 0 }
};
511
512 static void
timeout_act_error(const char * lopt,const char * badact)513 timeout_act_error(const char *lopt, const char *badact)
514 {
515 char *opts, *oldopts;
516 int i;
517
518 opts = NULL;
519 for (i = 0; act_tbl[i].at_act != NULL; i++) {
520 oldopts = opts;
521 if (asprintf(&opts, "%s%s%s",
522 oldopts == NULL ? "" : oldopts,
523 oldopts == NULL ? "" : ", ",
524 act_tbl[i].at_act) == -1)
525 err(EX_OSERR, "malloc");
526 free(oldopts);
527 }
528 warnx("bad --%s argument '%s' must be one of (%s).",
529 lopt, badact, opts);
530 usage();
531 }
532
533 /*
534 * Take a comma separated list of actions and or the flags
535 * together for the ioctl.
536 */
537 static int
timeout_act_str2int(const char * lopt,const char * acts)538 timeout_act_str2int(const char *lopt, const char *acts)
539 {
540 int i;
541 char *dupacts, *tofree;
542 char *o;
543 int rv = 0;
544
545 tofree = dupacts = strdup(acts);
546 if (!tofree)
547 err(EX_OSERR, "malloc");
548 while ((o = strsep(&dupacts, ",")) != NULL) {
549 for (i = 0; act_tbl[i].at_act != NULL; i++) {
550 if (!strcmp(o, act_tbl[i].at_act)) {
551 rv |= act_tbl[i].at_value;
552 break;
553 }
554 }
555 if (act_tbl[i].at_act == NULL)
556 timeout_act_error(lopt, o);
557 }
558 free(tofree);
559 return rv;
560 }
561
/*
 * Convert a timeval to a number of ticks.
 * Mostly copied from the kernel.
 *
 * The tick rate is read from the kern.hz sysctl on every call.  The
 * result is at least 1 and is clamped to INT_MAX.  Exits via err(3) /
 * errx(3) when kern.hz cannot be read or is unusable.
 */
static int
tvtohz(struct timeval *tv)
{
	unsigned long ticks;
	long sec, usec;
	int hz;
	size_t hzsize;
	int error;
	int tick;

	hzsize = sizeof(hz);

	error = sysctlbyname("kern.hz", &hz, &hzsize, NULL, 0);
	if (error)
		err(1, "sysctlbyname kern.hz");

	/*
	 * Both "1000000 / hz" and the later divisions by "tick" must not
	 * divide by zero, so insist on 1 <= hz <= 1000000.
	 */
	if (hz <= 0 || hz > 1000000)
		errx(1, "kern.hz value %d is out of range", hz);

	/* Length of one tick in microseconds. */
	tick = 1000000 / hz;

	/*
	 * If the number of usecs in the whole seconds part of the time
	 * difference fits in a long, then the total number of usecs will
	 * fit in an unsigned long.  Compute the total and convert it to
	 * ticks, rounding up and adding 1 to allow for the current tick
	 * to expire.  Rounding also depends on unsigned long arithmetic
	 * to avoid overflow.
	 *
	 * Otherwise, if the number of ticks in the whole seconds part of
	 * the time difference fits in a long, then convert the parts to
	 * ticks separately and add, using similar rounding methods and
	 * overflow avoidance.  This method would work in the previous
	 * case but it is slightly slower and assumes that hz is integral.
	 *
	 * Otherwise, round the time difference down to the maximum
	 * representable value.
	 *
	 * If ints have 32 bits, then the maximum value for any timeout in
	 * 10ms ticks is 248 days.
	 */
	sec = tv->tv_sec;
	usec = tv->tv_usec;
	if (usec < 0) {
		sec--;
		usec += 1000000;
	}
	if (sec < 0) {
#ifdef DIAGNOSTIC
		if (usec > 0) {
			sec++;
			usec -= 1000000;
		}
		printf("tvtohz: negative time difference %ld sec %ld usec\n",
		    sec, usec);
#endif
		ticks = 1;
	} else if (sec <= LONG_MAX / 1000000)
		ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1))
		    / tick + 1;
	else if (sec <= LONG_MAX / hz)
		ticks = sec * hz
		    + ((unsigned long)usec + (tick - 1)) / tick + 1;
	else
		ticks = LONG_MAX;
	if (ticks > INT_MAX)
		ticks = INT_MAX;
	return ((int)ticks);
}
632
633 /*
634 * Handle the few command line arguments supported.
635 */
636 static void
parseargs(int argc,char * argv[])637 parseargs(int argc, char *argv[])
638 {
639 struct timespec ts;
640 int longindex;
641 int c;
642 const char *lopt;
643
644 /* Get the default value of timeout_sec from the default timeout. */
645 timeout_sec = sbintime_getsec(timeout);
646
647 /*
648 * if we end with a 'd' aka 'watchdogd' then we are the daemon program,
649 * otherwise run as a command line utility.
650 */
651 c = strlen(argv[0]);
652 if (argv[0][c - 1] == 'd')
653 is_daemon = 1;
654
655 if (is_daemon)
656 getopt_shortopts = "I:de:ns:t:ST:wx:?";
657 else
658 getopt_shortopts = "dt:?";
659
660 while ((c = getopt_long(argc, argv, getopt_shortopts, longopts,
661 &longindex)) != -1) {
662 switch (c) {
663 case 'I':
664 pidfile = optarg;
665 break;
666 case 'd':
667 debugging = 1;
668 break;
669 case 'e':
670 test_cmd = strdup(optarg);
671 break;
672 case 'n':
673 is_dry_run = 1;
674 break;
675 #ifdef notyet
676 case 'p':
677 passive = 1;
678 break;
679 #endif
680 case 's':
681 nap = fetchtimeout(c, NULL, optarg, 0);
682 break;
683 case 'S':
684 do_syslog = 0;
685 break;
686 case 't':
687 timeout_sec = atoi(optarg);
688 timeout = parse_timeout_to_sbt(c, NULL, optarg);
689 if (debugging)
690 printf("Timeout is %d\n",
691 (int)(timeout / SBT_1S));
692 break;
693 case 'T':
694 carp_thresh_seconds =
695 fetchtimeout(c, "NULL", optarg, 0);
696 break;
697 case 'w':
698 do_timedog = 1;
699 break;
700 case 'x':
701 exit_timeout = parse_timeout_to_sbt(c, NULL, optarg);
702 if (exit_timeout != 0)
703 exit_timeout |= WD_ACTIVE;
704 break;
705 case 0:
706 lopt = longopts[longindex].name;
707 if (!strcmp(lopt, "pretimeout")) {
708 pretimeout = fetchtimeout(0, lopt, optarg, 0);
709 } else if (!strcmp(lopt, "pretimeout-action")) {
710 pretimeout_act = timeout_act_str2int(lopt,
711 optarg);
712 } else if (!strcmp(lopt, "softtimeout-action")) {
713 softtimeout_act = timeout_act_str2int(lopt,
714 optarg);
715 } else {
716 /* warnx("bad option at index %d: %s", optind,
717 argv[optind]);
718 usage();
719 */
720 }
721 break;
722 case '?':
723 default:
724 usage();
725 /* NOTREACHED */
726 }
727 }
728
729 if (nap > timeout_sec / 2)
730 nap = timeout_sec / 2;
731
732 if (carp_thresh_seconds == -1)
733 carp_thresh_seconds = nap;
734
735 if (argc != optind)
736 errx(EX_USAGE, "extra arguments.");
737 if (is_daemon && timeout < WD_TO_1SEC)
738 errx(EX_USAGE, "-t argument is less than one second.");
739 if (pretimeout_set) {
740 if (pretimeout >= timeout_sec) {
741 errx(EX_USAGE,
742 "pretimeout (%d) >= timeout (%d -> %ld)\n"
743 "see manual section TIMEOUT RESOLUTION",
744 pretimeout, timeout_sec, (long)ts.tv_sec);
745 }
746 }
747 }
748