xref: /freebsd/sys/kern/kern_boottrace.c (revision da5b7e90e7409a11a04f80ef0f19e7936811678d)
1*da5b7e90SMitchell Horne /*-
2*da5b7e90SMitchell Horne  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3*da5b7e90SMitchell Horne  *
4*da5b7e90SMitchell Horne  * Copyright (c) 2008-2022 NetApp, Inc.
5*da5b7e90SMitchell Horne  *
6*da5b7e90SMitchell Horne  * Redistribution and use in source and binary forms, with or without
7*da5b7e90SMitchell Horne  * modification, are permitted provided that the following conditions
8*da5b7e90SMitchell Horne  * are met:
9*da5b7e90SMitchell Horne  * 1. Redistributions of source code must retain the above copyright
10*da5b7e90SMitchell Horne  *    notice, this list of conditions and the following disclaimer.
11*da5b7e90SMitchell Horne  * 2. Redistributions in binary form must reproduce the above copyright
12*da5b7e90SMitchell Horne  *    notice, this list of conditions and the following disclaimer in the
13*da5b7e90SMitchell Horne  *    documentation and/or other materials provided with the distribution.
14*da5b7e90SMitchell Horne  *
15*da5b7e90SMitchell Horne  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16*da5b7e90SMitchell Horne  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17*da5b7e90SMitchell Horne  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18*da5b7e90SMitchell Horne  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19*da5b7e90SMitchell Horne  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20*da5b7e90SMitchell Horne  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21*da5b7e90SMitchell Horne  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22*da5b7e90SMitchell Horne  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23*da5b7e90SMitchell Horne  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24*da5b7e90SMitchell Horne  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25*da5b7e90SMitchell Horne  */
26*da5b7e90SMitchell Horne 
27*da5b7e90SMitchell Horne #include <sys/param.h>
28*da5b7e90SMitchell Horne #include <sys/systm.h>
29*da5b7e90SMitchell Horne #include <sys/boottrace.h>
30*da5b7e90SMitchell Horne #include <sys/kernel.h>
31*da5b7e90SMitchell Horne #include <sys/malloc.h>
32*da5b7e90SMitchell Horne #include <sys/pcpu.h>
33*da5b7e90SMitchell Horne #include <sys/proc.h>
34*da5b7e90SMitchell Horne #include <sys/resourcevar.h>
35*da5b7e90SMitchell Horne #include <sys/sbuf.h>
36*da5b7e90SMitchell Horne #include <sys/syscallsubr.h>
37*da5b7e90SMitchell Horne #include <sys/sysctl.h>
38*da5b7e90SMitchell Horne #include <sys/time.h>
39*da5b7e90SMitchell Horne 
40*da5b7e90SMitchell Horne #include <machine/stdarg.h>
41*da5b7e90SMitchell Horne 
42*da5b7e90SMitchell Horne #define	dprintf(fmt, ...)						\
43*da5b7e90SMitchell Horne 	do {								\
44*da5b7e90SMitchell Horne 		if (dotrace_debugging)					\
45*da5b7e90SMitchell Horne 			printf(fmt, ##__VA_ARGS__);			\
46*da5b7e90SMitchell Horne 	} while (0);
47*da5b7e90SMitchell Horne 
48*da5b7e90SMitchell Horne static MALLOC_DEFINE(M_BOOTTRACE, "boottrace", "memory for boot tracing");
49*da5b7e90SMitchell Horne 
50*da5b7e90SMitchell Horne #define	BT_TABLE_DEFSIZE	3000
51*da5b7e90SMitchell Horne #define	BT_TABLE_RUNSIZE	2000
52*da5b7e90SMitchell Horne #define	BT_TABLE_SHTSIZE	1000
53*da5b7e90SMitchell Horne #define	BT_TABLE_MINSIZE	500
54*da5b7e90SMitchell Horne 
55*da5b7e90SMitchell Horne /*
56*da5b7e90SMitchell Horne  * Boot-time & shutdown-time event.
57*da5b7e90SMitchell Horne  */
58*da5b7e90SMitchell Horne struct bt_event {
59*da5b7e90SMitchell Horne 	uint64_t tsc;			/* Timestamp */
60*da5b7e90SMitchell Horne 	uint64_t tick;			/* Kernel tick */
61*da5b7e90SMitchell Horne 	uint64_t cputime;		/* Microseconds of process CPU time */
62*da5b7e90SMitchell Horne 	uint32_t cpuid;			/* CPU on which the event ran */
63*da5b7e90SMitchell Horne 	uint32_t inblock;		/* # of blocks in */
64*da5b7e90SMitchell Horne 	uint32_t oublock;		/* # of blocks out */
65*da5b7e90SMitchell Horne 	pid_t pid;			/* Current PID */
66*da5b7e90SMitchell Horne 	char name[BT_EVENT_NAMELEN];	/* Event name */
67*da5b7e90SMitchell Horne 	char tdname[BT_EVENT_TDNAMELEN]; /* Thread name */
68*da5b7e90SMitchell Horne };
69*da5b7e90SMitchell Horne 
70*da5b7e90SMitchell Horne struct bt_table {
71*da5b7e90SMitchell Horne 	uint32_t size;			/* Trace table size */
72*da5b7e90SMitchell Horne 	uint32_t curr;			/* Trace entry to use */
73*da5b7e90SMitchell Horne 	uint32_t wrap;			/* Wrap-around, instead of dropping */
74*da5b7e90SMitchell Horne 	uint32_t drops_early;		/* Trace entries dropped before init */
75*da5b7e90SMitchell Horne 	uint32_t drops_full;		/* Trace entries dropped after full */
76*da5b7e90SMitchell Horne 	struct bt_event *table;
77*da5b7e90SMitchell Horne };
78*da5b7e90SMitchell Horne 
79*da5b7e90SMitchell Horne /* Boot-time tracing */
80*da5b7e90SMitchell Horne static struct bt_table bt;
81*da5b7e90SMitchell Horne 
82*da5b7e90SMitchell Horne /* Run-time tracing */
83*da5b7e90SMitchell Horne static struct bt_table rt;
84*da5b7e90SMitchell Horne 
85*da5b7e90SMitchell Horne /* Shutdown-time tracing */
86*da5b7e90SMitchell Horne static struct bt_table st;
87*da5b7e90SMitchell Horne 
88*da5b7e90SMitchell Horne /* Set when system boot is complete, and we've switched to runtime tracing. */
89*da5b7e90SMitchell Horne static bool bootdone;
90*da5b7e90SMitchell Horne 
91*da5b7e90SMitchell Horne /* Set when system shutdown has started. */
92*da5b7e90SMitchell Horne static bool shutdown;
93*da5b7e90SMitchell Horne 
94*da5b7e90SMitchell Horne /* Turn on dotrace() debug logging to console. */
95*da5b7e90SMitchell Horne static bool dotrace_debugging;
96*da5b7e90SMitchell Horne TUNABLE_BOOL("debug.boottrace.dotrace_debugging", &dotrace_debugging);
97*da5b7e90SMitchell Horne 
98*da5b7e90SMitchell Horne /* Trace kernel events */
99*da5b7e90SMitchell Horne static bool dotrace_kernel = true;
100*da5b7e90SMitchell Horne TUNABLE_BOOL("kern.boottrace.dotrace_kernel", &dotrace_kernel);
101*da5b7e90SMitchell Horne 
102*da5b7e90SMitchell Horne /* Trace userspace events */
103*da5b7e90SMitchell Horne static bool dotrace_user = true;
104*da5b7e90SMitchell Horne TUNABLE_BOOL("kern.boottrace.dotrace_user", &dotrace_user);
105*da5b7e90SMitchell Horne 
106*da5b7e90SMitchell Horne static int sysctl_log(SYSCTL_HANDLER_ARGS);
107*da5b7e90SMitchell Horne static int sysctl_boottrace(SYSCTL_HANDLER_ARGS);
108*da5b7e90SMitchell Horne static int sysctl_runtrace(SYSCTL_HANDLER_ARGS);
109*da5b7e90SMitchell Horne static int sysctl_shuttrace(SYSCTL_HANDLER_ARGS);
110*da5b7e90SMitchell Horne static int sysctl_boottrace_reset(SYSCTL_HANDLER_ARGS);
111*da5b7e90SMitchell Horne 
112*da5b7e90SMitchell Horne SYSCTL_NODE(_kern, OID_AUTO, boottrace, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
113*da5b7e90SMitchell Horne     "boottrace statistics");
114*da5b7e90SMitchell Horne 
115*da5b7e90SMitchell Horne SYSCTL_PROC(_kern_boottrace, OID_AUTO, log,
116*da5b7e90SMitchell Horne     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
117*da5b7e90SMitchell Horne     0, 0, sysctl_log, "A",
118*da5b7e90SMitchell Horne     "Print a log of the boottrace trace data");
119*da5b7e90SMitchell Horne SYSCTL_PROC(_kern_boottrace, OID_AUTO, boottrace,
120*da5b7e90SMitchell Horne     CTLTYPE_STRING | CTLFLAG_WR | CTLFLAG_MPSAFE,
121*da5b7e90SMitchell Horne     0, 0, sysctl_boottrace, "A",
122*da5b7e90SMitchell Horne     "Capture a boot-time trace event");
123*da5b7e90SMitchell Horne SYSCTL_PROC(_kern_boottrace, OID_AUTO, runtrace,
124*da5b7e90SMitchell Horne     CTLTYPE_STRING | CTLFLAG_WR | CTLFLAG_MPSAFE,
125*da5b7e90SMitchell Horne     0, 0, sysctl_runtrace, "A",
126*da5b7e90SMitchell Horne     "Capture a run-time trace event");
127*da5b7e90SMitchell Horne SYSCTL_PROC(_kern_boottrace, OID_AUTO, shuttrace,
128*da5b7e90SMitchell Horne     CTLTYPE_STRING | CTLFLAG_WR | CTLFLAG_MPSAFE,
129*da5b7e90SMitchell Horne     0, 0, sysctl_shuttrace, "A",
130*da5b7e90SMitchell Horne     "Capture a shutdown-time trace event");
131*da5b7e90SMitchell Horne SYSCTL_PROC(_kern_boottrace, OID_AUTO, reset,
132*da5b7e90SMitchell Horne     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_MPSAFE,
133*da5b7e90SMitchell Horne     0, 0, sysctl_boottrace_reset, "A",
134*da5b7e90SMitchell Horne     "Reset run-time tracing table");
135*da5b7e90SMitchell Horne 
136*da5b7e90SMitchell Horne /*
137*da5b7e90SMitchell Horne  * Global enable.
138*da5b7e90SMitchell Horne  */
139*da5b7e90SMitchell Horne bool __read_mostly boottrace_enabled = false;
140*da5b7e90SMitchell Horne SYSCTL_BOOL(_kern_boottrace, OID_AUTO, enabled, CTLFLAG_RDTUN,
141*da5b7e90SMitchell Horne     &boottrace_enabled, 0,
142*da5b7e90SMitchell Horne     "Boot-time and shutdown-time tracing enabled");
143*da5b7e90SMitchell Horne 
144*da5b7e90SMitchell Horne /*
145*da5b7e90SMitchell Horne  * Enable dumping of the shutdown trace entries to console.
146*da5b7e90SMitchell Horne  */
147*da5b7e90SMitchell Horne bool shutdown_trace = false;
148*da5b7e90SMitchell Horne SYSCTL_BOOL(_kern_boottrace, OID_AUTO, shutdown_trace, CTLFLAG_RWTUN,
149*da5b7e90SMitchell Horne    &shutdown_trace, 0,
150*da5b7e90SMitchell Horne    "Enable kernel shutdown tracing to the console");
151*da5b7e90SMitchell Horne 
152*da5b7e90SMitchell Horne /*
153*da5b7e90SMitchell Horne  * Set the delta threshold (ms) below which events are ignored, used in
154*da5b7e90SMitchell Horne  * determining what to dump to the console.
155*da5b7e90SMitchell Horne  */
156*da5b7e90SMitchell Horne static int shutdown_trace_threshold;
157*da5b7e90SMitchell Horne SYSCTL_INT(_kern_boottrace, OID_AUTO, shutdown_trace_threshold, CTLFLAG_RWTUN,
158*da5b7e90SMitchell Horne    &shutdown_trace_threshold, 0,
159*da5b7e90SMitchell Horne    "Tracing threshold (ms) below which tracing is ignored");
160*da5b7e90SMitchell Horne 
161*da5b7e90SMitchell Horne /*
162*da5b7e90SMitchell Horne  * Dump a trace to buffer or if buffer is NULL to console.
163*da5b7e90SMitchell Horne  *
164*da5b7e90SMitchell Horne  * Non-zero delta_threshold selectively prints entries based on delta
165*da5b7e90SMitchell Horne  * with current and previous entry. Otherwise, delta_threshold of 0
166*da5b7e90SMitchell Horne  * prints every trace entry and delta.
167*da5b7e90SMitchell Horne  *
168*da5b7e90SMitchell Horne  * Output something like this:
169*da5b7e90SMitchell Horne  *
170*da5b7e90SMitchell Horne  * CPU      msecs      delta process                  event
171*da5b7e90SMitchell Horne  *  11 1228262715          0 init                     shutdown pre sync begin
172*da5b7e90SMitchell Horne  *   3 1228265622       2907 init                     shutdown pre sync complete
173*da5b7e90SMitchell Horne  *   3 1228265623          0 init                     shutdown turned swap off
174*da5b7e90SMitchell Horne  *  18 1228266466        843 init                     shutdown unmounted all filesystems
175*da5b7e90SMitchell Horne  *
176*da5b7e90SMitchell Horne  * How to interpret:
177*da5b7e90SMitchell Horne  *
178*da5b7e90SMitchell Horne  * delta column represents the time in milliseconds between this event and the previous.
179*da5b7e90SMitchell Horne  * Usually that means you can take the previous event, current event, match them
180*da5b7e90SMitchell Horne  * up in the code, and whatever lies between is the culprit taking time.
181*da5b7e90SMitchell Horne  *
182*da5b7e90SMitchell Horne  * For example, above: Pre-Sync is taking 2907ms, and something between swap and unmount
183*da5b7e90SMitchell Horne  * filesystems is taking 843 milliseconds.
184*da5b7e90SMitchell Horne  *
185*da5b7e90SMitchell Horne  * An event with a delta of 0 are 'landmark' events that simply exist in the output
186*da5b7e90SMitchell Horne  * for the developer to know where the time measurement begins. The 0 is an arbitrary
187*da5b7e90SMitchell Horne  * number that can effectively be ignored.
188*da5b7e90SMitchell Horne  */
189*da5b7e90SMitchell Horne #define BTC_DELTA_PRINT(bte, msecs, delta) do {				\
190*da5b7e90SMitchell Horne 	if (sbp) {							\
191*da5b7e90SMitchell Horne 		sbuf_printf(sbp, fmt, (bte)->cpuid, msecs, delta,	\
192*da5b7e90SMitchell Horne 		    (bte)->tdname, (bte)->name, (bte)->pid,		\
193*da5b7e90SMitchell Horne 		    (bte)->cputime / 1000000,				\
194*da5b7e90SMitchell Horne 		    ((bte)->cputime % 1000000) / 10000,			\
195*da5b7e90SMitchell Horne 		    (bte)->inblock, (bte)->oublock);			\
196*da5b7e90SMitchell Horne 	} else {							\
197*da5b7e90SMitchell Horne 		printf(fmt, (bte)->cpuid, msecs, delta,			\
198*da5b7e90SMitchell Horne 		    (bte)->tdname, (bte)->name, (bte)->pid,		\
199*da5b7e90SMitchell Horne 		    (bte)->cputime / 1000000,				\
200*da5b7e90SMitchell Horne 		    ((bte)->cputime % 1000000) / 10000,			\
201*da5b7e90SMitchell Horne 		    (bte)->inblock, (bte)->oublock);			\
202*da5b7e90SMitchell Horne 	}								\
203*da5b7e90SMitchell Horne } while (0)
204*da5b7e90SMitchell Horne 
205*da5b7e90SMitchell Horne /*
206*da5b7e90SMitchell Horne  * Print the trace entries to the message buffer, or to an sbuf, if provided.
207*da5b7e90SMitchell Horne  *
208*da5b7e90SMitchell Horne  * Entries with a difference less than dthres will not be printed.
209*da5b7e90SMitchell Horne  */
210*da5b7e90SMitchell Horne static void
211*da5b7e90SMitchell Horne boottrace_display(struct sbuf *sbp, struct bt_table *btp, uint64_t dthres)
212*da5b7e90SMitchell Horne {
213*da5b7e90SMitchell Horne 	struct bt_event *evtp;
214*da5b7e90SMitchell Horne 	struct bt_event *last_evtp;
215*da5b7e90SMitchell Horne 	uint64_t msecs;
216*da5b7e90SMitchell Horne 	uint64_t first_msecs;
217*da5b7e90SMitchell Horne 	uint64_t last_msecs;
218*da5b7e90SMitchell Horne 	uint64_t dmsecs;
219*da5b7e90SMitchell Horne 	uint64_t last_dmsecs;
220*da5b7e90SMitchell Horne 	uint64_t total_dmsecs;
221*da5b7e90SMitchell Horne 	uint32_t i;
222*da5b7e90SMitchell Horne 	uint32_t curr;
223*da5b7e90SMitchell Horne 	const char *fmt     = "%3u %10llu %10llu %-24s %-40s %5d %4d.%02d %5u %5u\n";
224*da5b7e90SMitchell Horne 	const char *hdr_fmt = "\n\n%3s %10s %10s %-24s %-40s %5s %6s %5s %5s\n";
225*da5b7e90SMitchell Horne 	bool printed;
226*da5b7e90SMitchell Horne 	bool last_printed;
227*da5b7e90SMitchell Horne 
228*da5b7e90SMitchell Horne 	/* Print the header */
229*da5b7e90SMitchell Horne 	if (sbp != NULL)
230*da5b7e90SMitchell Horne 		sbuf_printf(sbp, hdr_fmt,
231*da5b7e90SMitchell Horne 		    "CPU", "msecs", "delta", "process",
232*da5b7e90SMitchell Horne 		    "event", "PID", "CPUtime", "IBlks", "OBlks");
233*da5b7e90SMitchell Horne 	else
234*da5b7e90SMitchell Horne 		printf(hdr_fmt,
235*da5b7e90SMitchell Horne 		    "CPU", "msecs", "delta", "process",
236*da5b7e90SMitchell Horne 		    "event", "PID", "CPUtime", "IBlks", "OBlks");
237*da5b7e90SMitchell Horne 
238*da5b7e90SMitchell Horne 	first_msecs = 0;
239*da5b7e90SMitchell Horne 	last_evtp = NULL;
240*da5b7e90SMitchell Horne 	last_msecs = 0;
241*da5b7e90SMitchell Horne 	last_dmsecs = 0;
242*da5b7e90SMitchell Horne 	last_printed = false;
243*da5b7e90SMitchell Horne 	i = curr = btp->curr;
244*da5b7e90SMitchell Horne 
245*da5b7e90SMitchell Horne 	do {
246*da5b7e90SMitchell Horne 		evtp = &btp->table[i];
247*da5b7e90SMitchell Horne 		if (evtp->tsc == 0)
248*da5b7e90SMitchell Horne 			goto next;
249*da5b7e90SMitchell Horne 
250*da5b7e90SMitchell Horne 		msecs = cputick2usec(evtp->tick) / 1000;
251*da5b7e90SMitchell Horne 		dmsecs = (last_msecs != 0 && msecs > last_msecs) ?
252*da5b7e90SMitchell Horne 		    msecs - last_msecs : 0;
253*da5b7e90SMitchell Horne 		printed = false;
254*da5b7e90SMitchell Horne 
255*da5b7e90SMitchell Horne 		/*
256*da5b7e90SMitchell Horne 		 * If a threshold is defined, start filtering events by the
257*da5b7e90SMitchell Horne 		 * delta of msecs.
258*da5b7e90SMitchell Horne 		 */
259*da5b7e90SMitchell Horne 		if (dthres != 0 && (dmsecs > dthres)) {
260*da5b7e90SMitchell Horne 			/*
261*da5b7e90SMitchell Horne 			 * Print the previous entry as a landmark, even if it
262*da5b7e90SMitchell Horne 			 * falls below the threshold.
263*da5b7e90SMitchell Horne 			 */
264*da5b7e90SMitchell Horne 			if (last_evtp != NULL && !last_printed)
265*da5b7e90SMitchell Horne 				BTC_DELTA_PRINT(last_evtp, last_msecs,
266*da5b7e90SMitchell Horne 				    last_dmsecs);
267*da5b7e90SMitchell Horne 			BTC_DELTA_PRINT(evtp, msecs, dmsecs);
268*da5b7e90SMitchell Horne 			printed = true;
269*da5b7e90SMitchell Horne 		} else if (dthres == 0) {
270*da5b7e90SMitchell Horne 			BTC_DELTA_PRINT(evtp, msecs, dmsecs);
271*da5b7e90SMitchell Horne 			printed = true;
272*da5b7e90SMitchell Horne 		}
273*da5b7e90SMitchell Horne 		if (first_msecs == 0 || msecs < first_msecs)
274*da5b7e90SMitchell Horne 			first_msecs = msecs;
275*da5b7e90SMitchell Horne 		last_evtp = evtp;
276*da5b7e90SMitchell Horne 		last_msecs = msecs;
277*da5b7e90SMitchell Horne 		last_dmsecs = dmsecs;
278*da5b7e90SMitchell Horne 		last_printed = printed;
279*da5b7e90SMitchell Horne 		maybe_yield();
280*da5b7e90SMitchell Horne next:
281*da5b7e90SMitchell Horne 		i = (i + 1) % btp->size;
282*da5b7e90SMitchell Horne 	} while (i != curr);
283*da5b7e90SMitchell Horne 
284*da5b7e90SMitchell Horne 	total_dmsecs = last_msecs > first_msecs ?
285*da5b7e90SMitchell Horne 	    (last_msecs - first_msecs) : 0;
286*da5b7e90SMitchell Horne 	if (sbp != NULL)
287*da5b7e90SMitchell Horne 		sbuf_printf(sbp, "Total measured time: %ju msecs\n",
288*da5b7e90SMitchell Horne 		    (uintmax_t)total_dmsecs);
289*da5b7e90SMitchell Horne 	else
290*da5b7e90SMitchell Horne 		printf("Total measured time: %ju msecs\n",
291*da5b7e90SMitchell Horne 		    (uintmax_t)total_dmsecs);
292*da5b7e90SMitchell Horne }
293*da5b7e90SMitchell Horne 
294*da5b7e90SMitchell Horne /*
295*da5b7e90SMitchell Horne  * Dump trace table entries to the console, given a delta threshold.
296*da5b7e90SMitchell Horne  */
297*da5b7e90SMitchell Horne void
298*da5b7e90SMitchell Horne boottrace_dump_console(void)
299*da5b7e90SMitchell Horne {
300*da5b7e90SMitchell Horne 	if (!boottrace_enabled)
301*da5b7e90SMitchell Horne 		return;
302*da5b7e90SMitchell Horne 
303*da5b7e90SMitchell Horne 	if (shutdown || rebooting || KERNEL_PANICKED()) {
304*da5b7e90SMitchell Horne 		boottrace_display(NULL, &st, shutdown_trace_threshold);
305*da5b7e90SMitchell Horne 	} else {
306*da5b7e90SMitchell Horne 		boottrace_display(NULL, &bt, 0);
307*da5b7e90SMitchell Horne 		boottrace_display(NULL, &rt, 0);
308*da5b7e90SMitchell Horne 	}
309*da5b7e90SMitchell Horne }
310*da5b7e90SMitchell Horne 
311*da5b7e90SMitchell Horne /*
312*da5b7e90SMitchell Horne  * Records a new tracing event to the specified table.
313*da5b7e90SMitchell Horne  *
314*da5b7e90SMitchell Horne  * We don't use a lock because we want this to be callable from interrupt
315*da5b7e90SMitchell Horne  * context.
316*da5b7e90SMitchell Horne  */
317*da5b7e90SMitchell Horne static int
318*da5b7e90SMitchell Horne dotrace(struct bt_table *btp, const char *eventname, const char *tdname)
319*da5b7e90SMitchell Horne {
320*da5b7e90SMitchell Horne 	uint32_t idx, nxt;
321*da5b7e90SMitchell Horne 	struct rusage usage;
322*da5b7e90SMitchell Horne 
323*da5b7e90SMitchell Horne 	MPASS(boottrace_enabled);
324*da5b7e90SMitchell Horne 	if (tdname == NULL)
325*da5b7e90SMitchell Horne 		tdname = (curproc->p_flag & P_SYSTEM) ?
326*da5b7e90SMitchell Horne 		    curthread->td_name : curproc->p_comm;
327*da5b7e90SMitchell Horne 
328*da5b7e90SMitchell Horne 	dprintf("dotrace[");
329*da5b7e90SMitchell Horne 	dprintf("cpu=%u, pid=%d, tsc=%ju, tick=%d, td='%s', event='%s'",
330*da5b7e90SMitchell Horne 	    PCPU_GET(cpuid), curthread->td_proc->p_pid,
331*da5b7e90SMitchell Horne 	    (uintmax_t)get_cyclecount(), ticks, tdname, eventname);
332*da5b7e90SMitchell Horne 	if (btp->table == NULL) {
333*da5b7e90SMitchell Horne 		btp->drops_early++;
334*da5b7e90SMitchell Horne 		dprintf(", return=ENOSPC_1]\n");
335*da5b7e90SMitchell Horne 		return (ENOSPC);
336*da5b7e90SMitchell Horne 	}
337*da5b7e90SMitchell Horne 
338*da5b7e90SMitchell Horne 	/* Claim a slot in the table. */
339*da5b7e90SMitchell Horne 	do {
340*da5b7e90SMitchell Horne 		idx = btp->curr;
341*da5b7e90SMitchell Horne 		nxt = (idx + 1) % btp->size;
342*da5b7e90SMitchell Horne 		if (nxt == 0 && btp->wrap == 0) {
343*da5b7e90SMitchell Horne 			btp->drops_full++;
344*da5b7e90SMitchell Horne 			dprintf(", return=ENOSPC_2]\n");
345*da5b7e90SMitchell Horne 			return (ENOSPC);
346*da5b7e90SMitchell Horne 		}
347*da5b7e90SMitchell Horne 	} while (!atomic_cmpset_int(&btp->curr, idx, nxt));
348*da5b7e90SMitchell Horne 
349*da5b7e90SMitchell Horne 	btp->table[idx].cpuid = PCPU_GET(cpuid);
350*da5b7e90SMitchell Horne 	btp->table[idx].tsc = get_cyclecount(),
351*da5b7e90SMitchell Horne 	btp->table[idx].tick = cpu_ticks();
352*da5b7e90SMitchell Horne 	btp->table[idx].pid = curthread->td_proc->p_pid;
353*da5b7e90SMitchell Horne 
354*da5b7e90SMitchell Horne 	/*
355*da5b7e90SMitchell Horne 	 * Get the resource usage for the process. Don't attempt this for the
356*da5b7e90SMitchell Horne 	 * kernel proc0 or for critical section activities.
357*da5b7e90SMitchell Horne 	 */
358*da5b7e90SMitchell Horne 	if ((curthread->td_proc == &proc0) || (curthread->td_critnest != 0)) {
359*da5b7e90SMitchell Horne 		btp->table[idx].cputime = 0;
360*da5b7e90SMitchell Horne 		btp->table[idx].inblock = 0;
361*da5b7e90SMitchell Horne 		btp->table[idx].oublock = 0;
362*da5b7e90SMitchell Horne 	} else {
363*da5b7e90SMitchell Horne 		kern_getrusage(curthread, RUSAGE_CHILDREN, &usage);
364*da5b7e90SMitchell Horne 		btp->table[idx].cputime =
365*da5b7e90SMitchell Horne 		    (uint64_t)(usage.ru_utime.tv_sec * 1000000 +
366*da5b7e90SMitchell Horne 		    usage.ru_utime.tv_usec + usage.ru_stime.tv_sec * 1000000 +
367*da5b7e90SMitchell Horne 		    usage.ru_stime.tv_usec);
368*da5b7e90SMitchell Horne 		btp->table[idx].inblock = (uint32_t)usage.ru_inblock;
369*da5b7e90SMitchell Horne 		btp->table[idx].oublock = (uint32_t)usage.ru_oublock;
370*da5b7e90SMitchell Horne 	}
371*da5b7e90SMitchell Horne 	strlcpy(btp->table[idx].name, eventname, BT_EVENT_NAMELEN);
372*da5b7e90SMitchell Horne 	strlcpy(btp->table[idx].tdname, tdname, BT_EVENT_TDNAMELEN);
373*da5b7e90SMitchell Horne 
374*da5b7e90SMitchell Horne 	dprintf(", return=0]\n");
375*da5b7e90SMitchell Horne 	return (0);
376*da5b7e90SMitchell Horne }
377*da5b7e90SMitchell Horne 
378*da5b7e90SMitchell Horne /*
379*da5b7e90SMitchell Horne  * Log various boot-time events, with the trace message encoded using
380*da5b7e90SMitchell Horne  * printf-like arguments.
381*da5b7e90SMitchell Horne  */
382*da5b7e90SMitchell Horne int
383*da5b7e90SMitchell Horne boottrace(const char *tdname, const char *fmt, ...)
384*da5b7e90SMitchell Horne {
385*da5b7e90SMitchell Horne 	char eventname[BT_EVENT_NAMELEN];
386*da5b7e90SMitchell Horne 	struct bt_table *btp;
387*da5b7e90SMitchell Horne 	va_list ap;
388*da5b7e90SMitchell Horne 
389*da5b7e90SMitchell Horne 	if (!dotrace_kernel)
390*da5b7e90SMitchell Horne 		 return (ENXIO);
391*da5b7e90SMitchell Horne 
392*da5b7e90SMitchell Horne 	va_start(ap, fmt);
393*da5b7e90SMitchell Horne 	vsnprintf(eventname, sizeof(eventname), fmt, ap);
394*da5b7e90SMitchell Horne 	va_end(ap);
395*da5b7e90SMitchell Horne 
396*da5b7e90SMitchell Horne 	btp = &bt;
397*da5b7e90SMitchell Horne 	if (bootdone)
398*da5b7e90SMitchell Horne 		btp = &rt;
399*da5b7e90SMitchell Horne 	if (shutdown || rebooting || KERNEL_PANICKED())
400*da5b7e90SMitchell Horne 		btp = &st;
401*da5b7e90SMitchell Horne 
402*da5b7e90SMitchell Horne 	return (dotrace(btp, eventname, tdname));
403*da5b7e90SMitchell Horne }
404*da5b7e90SMitchell Horne 
405*da5b7e90SMitchell Horne /*
406*da5b7e90SMitchell Horne  * Log a run-time event & switch over to run-time tracing mode.
407*da5b7e90SMitchell Horne  */
408*da5b7e90SMitchell Horne static int
409*da5b7e90SMitchell Horne runtrace(const char *eventname, const char *tdname)
410*da5b7e90SMitchell Horne {
411*da5b7e90SMitchell Horne 
412*da5b7e90SMitchell Horne 	bootdone = true;
413*da5b7e90SMitchell Horne 	return (dotrace(&rt, eventname, tdname));
414*da5b7e90SMitchell Horne }
415*da5b7e90SMitchell Horne 
416*da5b7e90SMitchell Horne /*
417*da5b7e90SMitchell Horne  * Parse a boottrace message from userspace.
418*da5b7e90SMitchell Horne  *
419*da5b7e90SMitchell Horne  * The input from may contain a ':' to denote tdname. If not, tdname is
420*da5b7e90SMitchell Horne  * inferred from the process' name.
421*da5b7e90SMitchell Horne  *
422*da5b7e90SMitchell Horne  * e.g. reboot(8):SIGINT to init(8)
423*da5b7e90SMitchell Horne  */
424*da5b7e90SMitchell Horne static void
425*da5b7e90SMitchell Horne boottrace_parse_message(char *message, char **eventname, char **tdname)
426*da5b7e90SMitchell Horne {
427*da5b7e90SMitchell Horne 	char *delim;
428*da5b7e90SMitchell Horne 
429*da5b7e90SMitchell Horne 	delim = strchr(message, ':');
430*da5b7e90SMitchell Horne 	if (delim != NULL) {
431*da5b7e90SMitchell Horne 		*delim = '\0';
432*da5b7e90SMitchell Horne 		*tdname = message;
433*da5b7e90SMitchell Horne 		*eventname = ++delim;
434*da5b7e90SMitchell Horne 	} else {
435*da5b7e90SMitchell Horne 		*tdname = curproc->p_comm;
436*da5b7e90SMitchell Horne 		*eventname = message;
437*da5b7e90SMitchell Horne 	}
438*da5b7e90SMitchell Horne }
439*da5b7e90SMitchell Horne 
440*da5b7e90SMitchell Horne static int
441*da5b7e90SMitchell Horne _boottrace_sysctl(struct bt_table *btp, struct sysctl_oid *oidp,
442*da5b7e90SMitchell Horne     struct sysctl_req *req)
443*da5b7e90SMitchell Horne {
444*da5b7e90SMitchell Horne 	char message[BT_MSGLEN];
445*da5b7e90SMitchell Horne 	char *eventname, *tdname;
446*da5b7e90SMitchell Horne 	int error;
447*da5b7e90SMitchell Horne 
448*da5b7e90SMitchell Horne 	if (!dotrace_user)
449*da5b7e90SMitchell Horne 	       return (ENXIO);
450*da5b7e90SMitchell Horne 
451*da5b7e90SMitchell Horne 	message[0] = '\0';
452*da5b7e90SMitchell Horne 	error = sysctl_handle_string(oidp, message, sizeof(message), req);
453*da5b7e90SMitchell Horne 	if (error)
454*da5b7e90SMitchell Horne 		return (error);
455*da5b7e90SMitchell Horne 
456*da5b7e90SMitchell Horne 	boottrace_parse_message(message, &eventname, &tdname);
457*da5b7e90SMitchell Horne 	error = dotrace(btp, eventname, tdname);
458*da5b7e90SMitchell Horne 	if (error == ENOSPC) {
459*da5b7e90SMitchell Horne 		/* Ignore table full error. */
460*da5b7e90SMitchell Horne 		error = 0;
461*da5b7e90SMitchell Horne 	}
462*da5b7e90SMitchell Horne 	return (error);
463*da5b7e90SMitchell Horne }
464*da5b7e90SMitchell Horne 
465*da5b7e90SMitchell Horne static int
466*da5b7e90SMitchell Horne sysctl_log(SYSCTL_HANDLER_ARGS)
467*da5b7e90SMitchell Horne {
468*da5b7e90SMitchell Horne 	struct sbuf *sbuf;
469*da5b7e90SMitchell Horne 	int error;
470*da5b7e90SMitchell Horne 
471*da5b7e90SMitchell Horne 	if (!boottrace_enabled || req->newptr != NULL)
472*da5b7e90SMitchell Horne 		return (0);
473*da5b7e90SMitchell Horne 
474*da5b7e90SMitchell Horne 	sbuf = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND);
475*da5b7e90SMitchell Horne 	boottrace_display(sbuf, &bt, 0);
476*da5b7e90SMitchell Horne 	boottrace_display(sbuf, &rt, 0);
477*da5b7e90SMitchell Horne 	sbuf_finish(sbuf);
478*da5b7e90SMitchell Horne 	error = SYSCTL_OUT(req, sbuf_data(sbuf), sbuf_len(sbuf));
479*da5b7e90SMitchell Horne 	sbuf_delete(sbuf);
480*da5b7e90SMitchell Horne 
481*da5b7e90SMitchell Horne 	return (error);
482*da5b7e90SMitchell Horne }
483*da5b7e90SMitchell Horne 
484*da5b7e90SMitchell Horne static int
485*da5b7e90SMitchell Horne sysctl_boottrace(SYSCTL_HANDLER_ARGS)
486*da5b7e90SMitchell Horne {
487*da5b7e90SMitchell Horne 
488*da5b7e90SMitchell Horne 	if (!boottrace_enabled)
489*da5b7e90SMitchell Horne 		return (0);
490*da5b7e90SMitchell Horne 
491*da5b7e90SMitchell Horne 	/* No output */
492*da5b7e90SMitchell Horne 	if (req->newptr == NULL)
493*da5b7e90SMitchell Horne 		return (0);
494*da5b7e90SMitchell Horne 
495*da5b7e90SMitchell Horne 	/* Trace to rt if we have reached multi-user. */
496*da5b7e90SMitchell Horne 	return (_boottrace_sysctl(bootdone ? &rt : &bt, oidp, req));
497*da5b7e90SMitchell Horne }
498*da5b7e90SMitchell Horne 
499*da5b7e90SMitchell Horne /*
500*da5b7e90SMitchell Horne  * Log a run-time event and switch over to run-time tracing mode.
501*da5b7e90SMitchell Horne  */
502*da5b7e90SMitchell Horne static int
503*da5b7e90SMitchell Horne sysctl_runtrace(SYSCTL_HANDLER_ARGS)
504*da5b7e90SMitchell Horne {
505*da5b7e90SMitchell Horne 
506*da5b7e90SMitchell Horne 	if (!boottrace_enabled)
507*da5b7e90SMitchell Horne 		return (0);
508*da5b7e90SMitchell Horne 
509*da5b7e90SMitchell Horne 	/* No output */
510*da5b7e90SMitchell Horne 	if (req->newptr == NULL)
511*da5b7e90SMitchell Horne 		return (0);
512*da5b7e90SMitchell Horne 
513*da5b7e90SMitchell Horne 	bootdone = true;
514*da5b7e90SMitchell Horne 	return (_boottrace_sysctl(&rt, oidp, req));
515*da5b7e90SMitchell Horne }
516*da5b7e90SMitchell Horne 
517*da5b7e90SMitchell Horne /*
518*da5b7e90SMitchell Horne  * Log a shutdown-time event and switch over to shutdown tracing mode.
519*da5b7e90SMitchell Horne  */
520*da5b7e90SMitchell Horne static int
521*da5b7e90SMitchell Horne sysctl_shuttrace(SYSCTL_HANDLER_ARGS)
522*da5b7e90SMitchell Horne {
523*da5b7e90SMitchell Horne 
524*da5b7e90SMitchell Horne 	if (!boottrace_enabled)
525*da5b7e90SMitchell Horne 		return (0);
526*da5b7e90SMitchell Horne 
527*da5b7e90SMitchell Horne 	/* No output */
528*da5b7e90SMitchell Horne 	if (req->newptr == NULL)
529*da5b7e90SMitchell Horne 		return (0);
530*da5b7e90SMitchell Horne 
531*da5b7e90SMitchell Horne 	shutdown = true;
532*da5b7e90SMitchell Horne 	return (_boottrace_sysctl(&st, oidp, req));
533*da5b7e90SMitchell Horne }
534*da5b7e90SMitchell Horne 
535*da5b7e90SMitchell Horne /*
536*da5b7e90SMitchell Horne  * Reset the runtime tracing buffer.
537*da5b7e90SMitchell Horne  */
538*da5b7e90SMitchell Horne void
539*da5b7e90SMitchell Horne boottrace_reset(const char *actor)
540*da5b7e90SMitchell Horne {
541*da5b7e90SMitchell Horne 	char tmpbuf[64];
542*da5b7e90SMitchell Horne 
543*da5b7e90SMitchell Horne 	snprintf(tmpbuf, sizeof(tmpbuf), "reset: %s", actor);
544*da5b7e90SMitchell Horne 	runtrace(tmpbuf, NULL);
545*da5b7e90SMitchell Horne }
546*da5b7e90SMitchell Horne 
547*da5b7e90SMitchell Horne /*
548*da5b7e90SMitchell Horne  * Note that a resize implies a reset, i.e., the index is reset to 0.
549*da5b7e90SMitchell Horne  * We never shrink the array; we can only increase its size.
550*da5b7e90SMitchell Horne  */
551*da5b7e90SMitchell Horne int
552*da5b7e90SMitchell Horne boottrace_resize(u_int newsize)
553*da5b7e90SMitchell Horne {
554*da5b7e90SMitchell Horne 
555*da5b7e90SMitchell Horne 	if (newsize <= rt.size) {
556*da5b7e90SMitchell Horne 		return (EINVAL);
557*da5b7e90SMitchell Horne 	}
558*da5b7e90SMitchell Horne 	rt.table = realloc(rt.table, newsize * sizeof(struct bt_event),
559*da5b7e90SMitchell Horne 	    M_BOOTTRACE, M_WAITOK | M_ZERO);
560*da5b7e90SMitchell Horne 	if (rt.table == NULL)
561*da5b7e90SMitchell Horne 		return (ENOMEM);
562*da5b7e90SMitchell Horne 
563*da5b7e90SMitchell Horne 	rt.size = newsize;
564*da5b7e90SMitchell Horne 	boottrace_reset("boottrace_resize");
565*da5b7e90SMitchell Horne 	return (0);
566*da5b7e90SMitchell Horne }
567*da5b7e90SMitchell Horne 
568*da5b7e90SMitchell Horne static int
569*da5b7e90SMitchell Horne sysctl_boottrace_reset(SYSCTL_HANDLER_ARGS)
570*da5b7e90SMitchell Horne {
571*da5b7e90SMitchell Horne 
572*da5b7e90SMitchell Horne 	if (!boottrace_enabled)
573*da5b7e90SMitchell Horne 		return (0);
574*da5b7e90SMitchell Horne 
575*da5b7e90SMitchell Horne 	if (req->newptr != NULL)
576*da5b7e90SMitchell Horne 		boottrace_reset("sysctl_boottrace_reset");
577*da5b7e90SMitchell Horne 
578*da5b7e90SMitchell Horne 	return (0);
579*da5b7e90SMitchell Horne }
580*da5b7e90SMitchell Horne 
581*da5b7e90SMitchell Horne static void
582*da5b7e90SMitchell Horne boottrace_init(void)
583*da5b7e90SMitchell Horne {
584*da5b7e90SMitchell Horne 
585*da5b7e90SMitchell Horne 	if (!boottrace_enabled)
586*da5b7e90SMitchell Horne 		return;
587*da5b7e90SMitchell Horne 
588*da5b7e90SMitchell Horne 	/* Boottime trace table */
589*da5b7e90SMitchell Horne 	bt.size = BT_TABLE_DEFSIZE;
590*da5b7e90SMitchell Horne 	TUNABLE_INT_FETCH("kern.boottrace.table_size", &bt.size);
591*da5b7e90SMitchell Horne 	bt.size = max(bt.size, BT_TABLE_MINSIZE);
592*da5b7e90SMitchell Horne 	bt.table = malloc(bt.size * sizeof(struct bt_event), M_BOOTTRACE,
593*da5b7e90SMitchell Horne 	    M_WAITOK | M_ZERO);
594*da5b7e90SMitchell Horne 
595*da5b7e90SMitchell Horne 	/* Stick in an initial entry. */
596*da5b7e90SMitchell Horne 	bt.table[0].cpuid = PCPU_GET(cpuid);
597*da5b7e90SMitchell Horne 	strlcpy(bt.table[0].tdname, "boottime", BT_EVENT_TDNAMELEN);
598*da5b7e90SMitchell Horne 	strlcpy(bt.table[0].name, "initial event", BT_EVENT_NAMELEN);
599*da5b7e90SMitchell Horne 	bt.curr = 1;
600*da5b7e90SMitchell Horne 
601*da5b7e90SMitchell Horne 	/* Run-time trace table (may wrap-around). */
602*da5b7e90SMitchell Horne 	rt.wrap = 1;
603*da5b7e90SMitchell Horne 	rt.size = BT_TABLE_RUNSIZE;
604*da5b7e90SMitchell Horne 	rt.table = malloc(rt.size * sizeof(struct bt_event), M_BOOTTRACE,
605*da5b7e90SMitchell Horne 	    M_WAITOK | M_ZERO);
606*da5b7e90SMitchell Horne 
607*da5b7e90SMitchell Horne 	/* Shutdown trace table */
608*da5b7e90SMitchell Horne 	st.size = BT_TABLE_SHTSIZE;
609*da5b7e90SMitchell Horne 	st.table = malloc(st.size * sizeof(struct bt_event), M_BOOTTRACE,
610*da5b7e90SMitchell Horne 	    M_WAITOK | M_ZERO);
611*da5b7e90SMitchell Horne }
612*da5b7e90SMitchell Horne SYSINIT(boottrace, SI_SUB_CPU, SI_ORDER_ANY, boottrace_init, 0);
613