xref: /freebsd/sys/kern/kern_boottrace.c (revision 1da7f3f6f72b2245e458fc7195733268ae4a1136)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2008-2022 NetApp, Inc.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 #include <sys/param.h>
28 #include <sys/systm.h>
29 #include <sys/boottrace.h>
30 #include <sys/kernel.h>
31 #include <sys/malloc.h>
32 #include <sys/pcpu.h>
33 #include <sys/proc.h>
34 #include <sys/resourcevar.h>
35 #include <sys/sbuf.h>
36 #include <sys/syscallsubr.h>
37 #include <sys/sysctl.h>
38 #include <sys/time.h>
39 
40 #include <machine/stdarg.h>
41 
42 #define	dprintf(fmt, ...)						\
43 	do {								\
44 		if (dotrace_debugging)					\
45 			printf(fmt, ##__VA_ARGS__);			\
46 	} while (0);
47 
48 static MALLOC_DEFINE(M_BOOTTRACE, "boottrace", "memory for boot tracing");
49 
50 #define	BT_TABLE_DEFSIZE	3000
51 #define	BT_TABLE_RUNSIZE	2000
52 #define	BT_TABLE_SHTSIZE	1000
53 #define	BT_TABLE_MINSIZE	500
54 
55 /*
56  * Boot-time & shutdown-time event.
57  */
58 struct bt_event {
59 	uint64_t tsc;			/* Timestamp */
60 	uint64_t tick;			/* Kernel tick */
61 	uint64_t cputime;		/* Microseconds of process CPU time */
62 	uint32_t cpuid;			/* CPU on which the event ran */
63 	uint32_t inblock;		/* # of blocks in */
64 	uint32_t oublock;		/* # of blocks out */
65 	pid_t pid;			/* Current PID */
66 	char name[BT_EVENT_NAMELEN];	/* Event name */
67 	char tdname[BT_EVENT_TDNAMELEN]; /* Thread name */
68 };
69 
70 struct bt_table {
71 	uint32_t size;			/* Trace table size */
72 	uint32_t curr;			/* Trace entry to use */
73 	uint32_t wrap;			/* Wrap-around, instead of dropping */
74 	uint32_t drops_early;		/* Trace entries dropped before init */
75 	uint32_t drops_full;		/* Trace entries dropped after full */
76 	struct bt_event *table;
77 };
78 
79 /* Boot-time tracing */
80 static struct bt_table bt;
81 
82 /* Run-time tracing */
83 static struct bt_table rt;
84 
85 /* Shutdown-time tracing */
86 static struct bt_table st;
87 
88 /* Set when system boot is complete, and we've switched to runtime tracing. */
89 static bool bootdone;
90 
91 /* Set when system shutdown has started. */
92 static bool shutdown;
93 
94 /* Turn on dotrace() debug logging to console. */
95 static bool dotrace_debugging;
96 TUNABLE_BOOL("debug.boottrace.dotrace_debugging", &dotrace_debugging);
97 
98 /* Trace kernel events */
99 static bool dotrace_kernel = true;
100 TUNABLE_BOOL("kern.boottrace.dotrace_kernel", &dotrace_kernel);
101 
102 /* Trace userspace events */
103 static bool dotrace_user = true;
104 TUNABLE_BOOL("kern.boottrace.dotrace_user", &dotrace_user);
105 
106 static int sysctl_log(SYSCTL_HANDLER_ARGS);
107 static int sysctl_boottrace(SYSCTL_HANDLER_ARGS);
108 static int sysctl_runtrace(SYSCTL_HANDLER_ARGS);
109 static int sysctl_shuttrace(SYSCTL_HANDLER_ARGS);
110 static int sysctl_boottrace_reset(SYSCTL_HANDLER_ARGS);
111 
112 SYSCTL_NODE(_kern, OID_AUTO, boottrace, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
113     "boottrace statistics");
114 
115 SYSCTL_PROC(_kern_boottrace, OID_AUTO, log,
116     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
117     0, 0, sysctl_log, "A",
118     "Print a log of the boottrace trace data");
119 SYSCTL_PROC(_kern_boottrace, OID_AUTO, boottrace,
120     CTLTYPE_STRING | CTLFLAG_WR | CTLFLAG_MPSAFE,
121     0, 0, sysctl_boottrace, "A",
122     "Capture a boot-time trace event");
123 SYSCTL_PROC(_kern_boottrace, OID_AUTO, runtrace,
124     CTLTYPE_STRING | CTLFLAG_WR | CTLFLAG_MPSAFE,
125     0, 0, sysctl_runtrace, "A",
126     "Capture a run-time trace event");
127 SYSCTL_PROC(_kern_boottrace, OID_AUTO, shuttrace,
128     CTLTYPE_STRING | CTLFLAG_WR | CTLFLAG_MPSAFE,
129     0, 0, sysctl_shuttrace, "A",
130     "Capture a shutdown-time trace event");
131 SYSCTL_PROC(_kern_boottrace, OID_AUTO, reset,
132     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_MPSAFE,
133     0, 0, sysctl_boottrace_reset, "A",
134     "Reset run-time tracing table");
135 
136 /*
137  * Global enable.
138  */
139 bool __read_mostly boottrace_enabled = false;
140 SYSCTL_BOOL(_kern_boottrace, OID_AUTO, enabled, CTLFLAG_RDTUN,
141     &boottrace_enabled, 0,
142     "Boot-time and shutdown-time tracing enabled");
143 
144 /*
145  * Enable dumping of the shutdown trace entries to console.
146  */
147 bool shutdown_trace = false;
148 SYSCTL_BOOL(_kern_boottrace, OID_AUTO, shutdown_trace, CTLFLAG_RWTUN,
149    &shutdown_trace, 0,
150    "Enable kernel shutdown tracing to the console");
151 
152 /*
153  * Set the delta threshold (ms) below which events are ignored, used in
154  * determining what to dump to the console.
155  */
156 static int shutdown_trace_threshold;
157 SYSCTL_INT(_kern_boottrace, OID_AUTO, shutdown_trace_threshold, CTLFLAG_RWTUN,
158    &shutdown_trace_threshold, 0,
159    "Tracing threshold (ms) below which tracing is ignored");
160 
161 SYSCTL_UINT(_kern_boottrace, OID_AUTO, table_size,
162     CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &bt.size, 0,
163     "Boot-time tracing table size");
164 
165 /*
166  * Dump a trace to buffer or if buffer is NULL to console.
167  *
168  * Non-zero delta_threshold selectively prints entries based on delta
169  * with current and previous entry. Otherwise, delta_threshold of 0
170  * prints every trace entry and delta.
171  *
172  * Output something like this:
173  *
174  * CPU      msecs      delta process                  event
175  *  11 1228262715          0 init                     shutdown pre sync begin
176  *   3 1228265622       2907 init                     shutdown pre sync complete
177  *   3 1228265623          0 init                     shutdown turned swap off
178  *  18 1228266466        843 init                     shutdown unmounted all filesystems
179  *
180  * How to interpret:
181  *
182  * delta column represents the time in milliseconds between this event and the previous.
183  * Usually that means you can take the previous event, current event, match them
184  * up in the code, and whatever lies between is the culprit taking time.
185  *
186  * For example, above: Pre-Sync is taking 2907ms, and something between swap and unmount
187  * filesystems is taking 843 milliseconds.
188  *
189  * An event with a delta of 0 are 'landmark' events that simply exist in the output
190  * for the developer to know where the time measurement begins. The 0 is an arbitrary
191  * number that can effectively be ignored.
192  */
193 #define BTC_DELTA_PRINT(bte, msecs, delta) do {				\
194 	if (sbp) {							\
195 		sbuf_printf(sbp, fmt, (bte)->cpuid, msecs, delta,	\
196 		    (bte)->tdname, (bte)->name, (bte)->pid,		\
197 		    (bte)->cputime / 1000000,				\
198 		    ((bte)->cputime % 1000000) / 10000,			\
199 		    (bte)->inblock, (bte)->oublock);			\
200 	} else {							\
201 		printf(fmt, (bte)->cpuid, msecs, delta,			\
202 		    (bte)->tdname, (bte)->name, (bte)->pid,		\
203 		    (bte)->cputime / 1000000,				\
204 		    ((bte)->cputime % 1000000) / 10000,			\
205 		    (bte)->inblock, (bte)->oublock);			\
206 	}								\
207 } while (0)
208 
209 /*
210  * Print the trace entries to the message buffer, or to an sbuf, if provided.
211  *
212  * Entries with a difference less than dthres will not be printed.
213  */
214 static void
215 boottrace_display(struct sbuf *sbp, struct bt_table *btp, uint64_t dthres)
216 {
217 	struct bt_event *evtp;
218 	struct bt_event *last_evtp;
219 	uint64_t msecs;
220 	uint64_t first_msecs;
221 	uint64_t last_msecs;
222 	uint64_t dmsecs;
223 	uint64_t last_dmsecs;
224 	uint64_t total_dmsecs;
225 	uint32_t i;
226 	uint32_t curr;
227 	const char *fmt     = "%3u %10llu %10llu %-24s %-40s %5d %4d.%02d %5u %5u\n";
228 	const char *hdr_fmt = "\n\n%3s %10s %10s %-24s %-40s %5s %6s %5s %5s\n";
229 	bool printed;
230 	bool last_printed;
231 
232 	/* Print the header */
233 	if (sbp != NULL)
234 		sbuf_printf(sbp, hdr_fmt,
235 		    "CPU", "msecs", "delta", "process",
236 		    "event", "PID", "CPUtime", "IBlks", "OBlks");
237 	else
238 		printf(hdr_fmt,
239 		    "CPU", "msecs", "delta", "process",
240 		    "event", "PID", "CPUtime", "IBlks", "OBlks");
241 
242 	first_msecs = 0;
243 	last_evtp = NULL;
244 	last_msecs = 0;
245 	last_dmsecs = 0;
246 	last_printed = false;
247 	i = curr = btp->curr;
248 
249 	do {
250 		evtp = &btp->table[i];
251 		if (evtp->tsc == 0)
252 			goto next;
253 
254 		msecs = cputick2usec(evtp->tick) / 1000;
255 		dmsecs = (last_msecs != 0 && msecs > last_msecs) ?
256 		    msecs - last_msecs : 0;
257 		printed = false;
258 
259 		/*
260 		 * If a threshold is defined, start filtering events by the
261 		 * delta of msecs.
262 		 */
263 		if (dthres != 0 && (dmsecs > dthres)) {
264 			/*
265 			 * Print the previous entry as a landmark, even if it
266 			 * falls below the threshold.
267 			 */
268 			if (last_evtp != NULL && !last_printed)
269 				BTC_DELTA_PRINT(last_evtp, last_msecs,
270 				    last_dmsecs);
271 			BTC_DELTA_PRINT(evtp, msecs, dmsecs);
272 			printed = true;
273 		} else if (dthres == 0) {
274 			BTC_DELTA_PRINT(evtp, msecs, dmsecs);
275 			printed = true;
276 		}
277 		if (first_msecs == 0 || msecs < first_msecs)
278 			first_msecs = msecs;
279 		last_evtp = evtp;
280 		last_msecs = msecs;
281 		last_dmsecs = dmsecs;
282 		last_printed = printed;
283 		maybe_yield();
284 next:
285 		i = (i + 1) % btp->size;
286 	} while (i != curr);
287 
288 	total_dmsecs = last_msecs > first_msecs ?
289 	    (last_msecs - first_msecs) : 0;
290 	if (sbp != NULL)
291 		sbuf_printf(sbp, "Total measured time: %ju msecs\n",
292 		    (uintmax_t)total_dmsecs);
293 	else
294 		printf("Total measured time: %ju msecs\n",
295 		    (uintmax_t)total_dmsecs);
296 }
297 
298 /*
299  * Dump trace table entries to the console, given a delta threshold.
300  */
301 void
302 boottrace_dump_console(void)
303 {
304 	if (!boottrace_enabled)
305 		return;
306 
307 	if (shutdown || rebooting || KERNEL_PANICKED()) {
308 		boottrace_display(NULL, &st, shutdown_trace_threshold);
309 	} else {
310 		boottrace_display(NULL, &bt, 0);
311 		boottrace_display(NULL, &rt, 0);
312 	}
313 }
314 
315 /*
316  * Records a new tracing event to the specified table.
317  *
318  * We don't use a lock because we want this to be callable from interrupt
319  * context.
320  */
321 static int
322 dotrace(struct bt_table *btp, const char *eventname, const char *tdname)
323 {
324 	uint32_t idx, nxt;
325 	struct rusage usage;
326 
327 	MPASS(boottrace_enabled);
328 	if (tdname == NULL)
329 		tdname = (curproc->p_flag & P_SYSTEM) ?
330 		    curthread->td_name : curproc->p_comm;
331 
332 	dprintf("dotrace[");
333 	dprintf("cpu=%u, pid=%d, tsc=%ju, tick=%d, td='%s', event='%s'",
334 	    PCPU_GET(cpuid), curthread->td_proc->p_pid,
335 	    (uintmax_t)get_cyclecount(), ticks, tdname, eventname);
336 	if (btp->table == NULL) {
337 		btp->drops_early++;
338 		dprintf(", return=ENOSPC_1]\n");
339 		return (ENOSPC);
340 	}
341 
342 	/* Claim a slot in the table. */
343 	do {
344 		idx = btp->curr;
345 		nxt = (idx + 1) % btp->size;
346 		if (nxt == 0 && btp->wrap == 0) {
347 			btp->drops_full++;
348 			dprintf(", return=ENOSPC_2]\n");
349 			return (ENOSPC);
350 		}
351 	} while (!atomic_cmpset_int(&btp->curr, idx, nxt));
352 
353 	btp->table[idx].cpuid = PCPU_GET(cpuid);
354 	btp->table[idx].tsc = get_cyclecount(),
355 	btp->table[idx].tick = cpu_ticks();
356 	btp->table[idx].pid = curthread->td_proc->p_pid;
357 
358 	/*
359 	 * Get the resource usage for the process. Don't attempt this for the
360 	 * kernel proc0 or for critical section activities.
361 	 */
362 	if ((curthread->td_proc == &proc0) || (curthread->td_critnest != 0)) {
363 		btp->table[idx].cputime = 0;
364 		btp->table[idx].inblock = 0;
365 		btp->table[idx].oublock = 0;
366 	} else {
367 		kern_getrusage(curthread, RUSAGE_CHILDREN, &usage);
368 		btp->table[idx].cputime =
369 		    (uint64_t)(usage.ru_utime.tv_sec * 1000000 +
370 		    usage.ru_utime.tv_usec + usage.ru_stime.tv_sec * 1000000 +
371 		    usage.ru_stime.tv_usec);
372 		btp->table[idx].inblock = (uint32_t)usage.ru_inblock;
373 		btp->table[idx].oublock = (uint32_t)usage.ru_oublock;
374 	}
375 	strlcpy(btp->table[idx].name, eventname, BT_EVENT_NAMELEN);
376 	strlcpy(btp->table[idx].tdname, tdname, BT_EVENT_TDNAMELEN);
377 
378 	dprintf(", return=0]\n");
379 	return (0);
380 }
381 
382 /*
383  * Log various boot-time events, with the trace message encoded using
384  * printf-like arguments.
385  */
386 int
387 boottrace(const char *tdname, const char *fmt, ...)
388 {
389 	char eventname[BT_EVENT_NAMELEN];
390 	struct bt_table *btp;
391 	va_list ap;
392 
393 	if (!dotrace_kernel)
394 		 return (ENXIO);
395 
396 	va_start(ap, fmt);
397 	vsnprintf(eventname, sizeof(eventname), fmt, ap);
398 	va_end(ap);
399 
400 	btp = &bt;
401 	if (bootdone)
402 		btp = &rt;
403 	if (shutdown || rebooting || KERNEL_PANICKED())
404 		btp = &st;
405 
406 	return (dotrace(btp, eventname, tdname));
407 }
408 
409 /*
410  * Log a run-time event & switch over to run-time tracing mode.
411  */
412 static int
413 runtrace(const char *eventname, const char *tdname)
414 {
415 
416 	bootdone = true;
417 	return (dotrace(&rt, eventname, tdname));
418 }
419 
420 /*
421  * Parse a boottrace message from userspace.
422  *
423  * The input from may contain a ':' to denote tdname. If not, tdname is
424  * inferred from the process' name.
425  *
426  * e.g. reboot(8):SIGINT to init(8)
427  */
428 static void
429 boottrace_parse_message(char *message, char **eventname, char **tdname)
430 {
431 	char *delim;
432 
433 	delim = strchr(message, ':');
434 	if (delim != NULL) {
435 		*delim = '\0';
436 		*tdname = message;
437 		*eventname = ++delim;
438 	} else {
439 		*tdname = curproc->p_comm;
440 		*eventname = message;
441 	}
442 }
443 
444 static int
445 _boottrace_sysctl(struct bt_table *btp, struct sysctl_oid *oidp,
446     struct sysctl_req *req)
447 {
448 	char message[BT_MSGLEN];
449 	char *eventname, *tdname;
450 	int error;
451 
452 	if (!dotrace_user)
453 	       return (ENXIO);
454 
455 	message[0] = '\0';
456 	error = sysctl_handle_string(oidp, message, sizeof(message), req);
457 	if (error)
458 		return (error);
459 
460 	boottrace_parse_message(message, &eventname, &tdname);
461 	error = dotrace(btp, eventname, tdname);
462 	if (error == ENOSPC) {
463 		/* Ignore table full error. */
464 		error = 0;
465 	}
466 	return (error);
467 }
468 
469 static int
470 sysctl_log(SYSCTL_HANDLER_ARGS)
471 {
472 	struct sbuf *sbuf;
473 	int error;
474 
475 	if (!boottrace_enabled || req->newptr != NULL)
476 		return (0);
477 
478 	sbuf = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND);
479 	boottrace_display(sbuf, &bt, 0);
480 	boottrace_display(sbuf, &rt, 0);
481 	sbuf_finish(sbuf);
482 	error = SYSCTL_OUT(req, sbuf_data(sbuf), sbuf_len(sbuf));
483 	sbuf_delete(sbuf);
484 
485 	return (error);
486 }
487 
488 static int
489 sysctl_boottrace(SYSCTL_HANDLER_ARGS)
490 {
491 
492 	if (!boottrace_enabled)
493 		return (0);
494 
495 	/* No output */
496 	if (req->newptr == NULL)
497 		return (0);
498 
499 	/* Trace to rt if we have reached multi-user. */
500 	return (_boottrace_sysctl(bootdone ? &rt : &bt, oidp, req));
501 }
502 
503 /*
504  * Log a run-time event and switch over to run-time tracing mode.
505  */
506 static int
507 sysctl_runtrace(SYSCTL_HANDLER_ARGS)
508 {
509 
510 	if (!boottrace_enabled)
511 		return (0);
512 
513 	/* No output */
514 	if (req->newptr == NULL)
515 		return (0);
516 
517 	bootdone = true;
518 	return (_boottrace_sysctl(&rt, oidp, req));
519 }
520 
521 /*
522  * Log a shutdown-time event and switch over to shutdown tracing mode.
523  */
524 static int
525 sysctl_shuttrace(SYSCTL_HANDLER_ARGS)
526 {
527 
528 	if (!boottrace_enabled)
529 		return (0);
530 
531 	/* No output */
532 	if (req->newptr == NULL)
533 		return (0);
534 
535 	shutdown = true;
536 	return (_boottrace_sysctl(&st, oidp, req));
537 }
538 
539 /*
540  * Reset the runtime tracing buffer.
541  */
542 void
543 boottrace_reset(const char *actor)
544 {
545 	char tmpbuf[64];
546 
547 	snprintf(tmpbuf, sizeof(tmpbuf), "reset: %s", actor);
548 	runtrace(tmpbuf, NULL);
549 }
550 
551 /*
552  * Note that a resize implies a reset, i.e., the index is reset to 0.
553  * We never shrink the array; we can only increase its size.
554  */
555 int
556 boottrace_resize(u_int newsize)
557 {
558 
559 	if (newsize <= rt.size) {
560 		return (EINVAL);
561 	}
562 	rt.table = realloc(rt.table, newsize * sizeof(struct bt_event),
563 	    M_BOOTTRACE, M_WAITOK | M_ZERO);
564 	if (rt.table == NULL)
565 		return (ENOMEM);
566 
567 	rt.size = newsize;
568 	boottrace_reset("boottrace_resize");
569 	return (0);
570 }
571 
572 static int
573 sysctl_boottrace_reset(SYSCTL_HANDLER_ARGS)
574 {
575 
576 	if (!boottrace_enabled)
577 		return (0);
578 
579 	if (req->newptr != NULL)
580 		boottrace_reset("sysctl_boottrace_reset");
581 
582 	return (0);
583 }
584 
585 static void
586 boottrace_init(void)
587 {
588 
589 	if (!boottrace_enabled)
590 		return;
591 
592 	/* Boottime trace table */
593 	bt.size = BT_TABLE_DEFSIZE;
594 	TUNABLE_INT_FETCH("kern.boottrace.table_size", &bt.size);
595 	bt.size = max(bt.size, BT_TABLE_MINSIZE);
596 	bt.table = malloc(bt.size * sizeof(struct bt_event), M_BOOTTRACE,
597 	    M_WAITOK | M_ZERO);
598 
599 	/* Stick in an initial entry. */
600 	bt.table[0].cpuid = PCPU_GET(cpuid);
601 	strlcpy(bt.table[0].tdname, "boottime", BT_EVENT_TDNAMELEN);
602 	strlcpy(bt.table[0].name, "initial event", BT_EVENT_NAMELEN);
603 	bt.curr = 1;
604 
605 	/* Run-time trace table (may wrap-around). */
606 	rt.wrap = 1;
607 	rt.size = BT_TABLE_RUNSIZE;
608 	rt.table = malloc(rt.size * sizeof(struct bt_event), M_BOOTTRACE,
609 	    M_WAITOK | M_ZERO);
610 
611 	/* Shutdown trace table */
612 	st.size = BT_TABLE_SHTSIZE;
613 	st.table = malloc(st.size * sizeof(struct bt_event), M_BOOTTRACE,
614 	    M_WAITOK | M_ZERO);
615 }
616 SYSINIT(boottrace, SI_SUB_CPU, SI_ORDER_ANY, boottrace_init, 0);
617