1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2008-2022 NetApp, Inc.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27 #include <sys/param.h>
28 #include <sys/systm.h>
29 #include <sys/boottrace.h>
30 #include <sys/kernel.h>
31 #include <sys/malloc.h>
32 #include <sys/pcpu.h>
33 #include <sys/proc.h>
34 #include <sys/resourcevar.h>
35 #include <sys/sbuf.h>
36 #include <sys/stdarg.h>
37 #include <sys/syscallsubr.h>
38 #include <sys/sysctl.h>
39 #include <sys/time.h>
40
41 #define dprintf(fmt, ...) \
42 do { \
43 if (dotrace_debugging) \
44 printf(fmt, ##__VA_ARGS__); \
45 } while (0);
46
47 static MALLOC_DEFINE(M_BOOTTRACE, "boottrace", "memory for boot tracing");
48
49 #define BT_TABLE_DEFSIZE 3000
50 #define BT_TABLE_RUNSIZE 2000
51 #define BT_TABLE_SHTSIZE 1000
52 #define BT_TABLE_MINSIZE 500
53
54 /*
55 * Boot-time & shutdown-time event.
56 */
57 struct bt_event {
58 uint64_t tsc; /* Timestamp */
59 uint64_t tick; /* Kernel tick */
60 uint64_t cputime; /* Microseconds of process CPU time */
61 uint32_t cpuid; /* CPU on which the event ran */
62 uint32_t inblock; /* # of blocks in */
63 uint32_t oublock; /* # of blocks out */
64 pid_t pid; /* Current PID */
65 char name[BT_EVENT_NAMELEN]; /* Event name */
66 char tdname[BT_EVENT_TDNAMELEN]; /* Thread name */
67 };
68
69 struct bt_table {
70 uint32_t size; /* Trace table size */
71 uint32_t curr; /* Trace entry to use */
72 uint32_t wrap; /* Wrap-around, instead of dropping */
73 uint32_t drops_early; /* Trace entries dropped before init */
74 uint32_t drops_full; /* Trace entries dropped after full */
75 struct bt_event *table;
76 };
77
78 /* Boot-time tracing */
79 static struct bt_table bt;
80
81 /* Run-time tracing */
82 static struct bt_table rt;
83
84 /* Shutdown-time tracing */
85 static struct bt_table st;
86
87 /* Set when system boot is complete, and we've switched to runtime tracing. */
88 static bool bootdone;
89
90 /* Set when system shutdown has started. */
91 static bool shutdown;
92
93 /* Turn on dotrace() debug logging to console. */
94 static bool dotrace_debugging;
95 TUNABLE_BOOL("debug.boottrace.dotrace_debugging", &dotrace_debugging);
96
97 /* Trace kernel events */
98 static bool dotrace_kernel = true;
99 TUNABLE_BOOL("kern.boottrace.dotrace_kernel", &dotrace_kernel);
100
101 /* Trace userspace events */
102 static bool dotrace_user = true;
103 TUNABLE_BOOL("kern.boottrace.dotrace_user", &dotrace_user);
104
105 static int sysctl_log(SYSCTL_HANDLER_ARGS);
106 static int sysctl_boottrace(SYSCTL_HANDLER_ARGS);
107 static int sysctl_runtrace(SYSCTL_HANDLER_ARGS);
108 static int sysctl_shuttrace(SYSCTL_HANDLER_ARGS);
109 static int sysctl_boottrace_reset(SYSCTL_HANDLER_ARGS);
110
111 SYSCTL_NODE(_kern, OID_AUTO, boottrace, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
112 "boottrace statistics");
113
114 SYSCTL_PROC(_kern_boottrace, OID_AUTO, log,
115 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
116 0, 0, sysctl_log, "A",
117 "Print a log of the boottrace trace data");
118 SYSCTL_PROC(_kern_boottrace, OID_AUTO, boottrace,
119 CTLTYPE_STRING | CTLFLAG_WR | CTLFLAG_MPSAFE,
120 0, 0, sysctl_boottrace, "A",
121 "Capture a boot-time trace event");
122 SYSCTL_PROC(_kern_boottrace, OID_AUTO, runtrace,
123 CTLTYPE_STRING | CTLFLAG_WR | CTLFLAG_MPSAFE,
124 0, 0, sysctl_runtrace, "A",
125 "Capture a run-time trace event");
126 SYSCTL_PROC(_kern_boottrace, OID_AUTO, shuttrace,
127 CTLTYPE_STRING | CTLFLAG_WR | CTLFLAG_MPSAFE,
128 0, 0, sysctl_shuttrace, "A",
129 "Capture a shutdown-time trace event");
130 SYSCTL_PROC(_kern_boottrace, OID_AUTO, reset,
131 CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_MPSAFE,
132 0, 0, sysctl_boottrace_reset, "A",
133 "Reset run-time tracing table");
134
135 /*
136 * Global enable.
137 */
138 bool __read_mostly boottrace_enabled = false;
139 SYSCTL_BOOL(_kern_boottrace, OID_AUTO, enabled, CTLFLAG_RDTUN,
140 &boottrace_enabled, 0,
141 "Boot-time and shutdown-time tracing enabled");
142
143 /*
144 * Enable dumping of the shutdown trace entries to console.
145 */
146 bool shutdown_trace = false;
147 SYSCTL_BOOL(_kern_boottrace, OID_AUTO, shutdown_trace, CTLFLAG_RWTUN,
148 &shutdown_trace, 0,
149 "Enable kernel shutdown tracing to the console");
150
151 /*
152 * Set the delta threshold (ms) below which events are ignored, used in
153 * determining what to dump to the console.
154 */
155 static int shutdown_trace_threshold;
156 SYSCTL_INT(_kern_boottrace, OID_AUTO, shutdown_trace_threshold, CTLFLAG_RWTUN,
157 &shutdown_trace_threshold, 0,
158 "Tracing threshold (ms) below which tracing is ignored");
159
160 SYSCTL_UINT(_kern_boottrace, OID_AUTO, table_size,
161 CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &bt.size, 0,
162 "Boot-time tracing table size");
163
164 /*
165 * Dump a trace to buffer or if buffer is NULL to console.
166 *
167 * Non-zero delta_threshold selectively prints entries based on delta
168 * with current and previous entry. Otherwise, delta_threshold of 0
169 * prints every trace entry and delta.
170 *
171 * Output something like this:
172 *
173 * CPU msecs delta process event
174 * 11 1228262715 0 init shutdown pre sync begin
175 * 3 1228265622 2907 init shutdown pre sync complete
176 * 3 1228265623 0 init shutdown turned swap off
177 * 18 1228266466 843 init shutdown unmounted all filesystems
178 *
179 * How to interpret:
180 *
181 * delta column represents the time in milliseconds between this event and the previous.
182 * Usually that means you can take the previous event, current event, match them
183 * up in the code, and whatever lies between is the culprit taking time.
184 *
185 * For example, above: Pre-Sync is taking 2907ms, and something between swap and unmount
186 * filesystems is taking 843 milliseconds.
187 *
188 * An event with a delta of 0 are 'landmark' events that simply exist in the output
189 * for the developer to know where the time measurement begins. The 0 is an arbitrary
190 * number that can effectively be ignored.
191 */
192 #define BTC_DELTA_PRINT(bte, msecs, delta) do { \
193 if (sbp) { \
194 sbuf_printf(sbp, fmt, (bte)->cpuid, msecs, delta, \
195 (bte)->tdname, (bte)->name, (bte)->pid, \
196 (bte)->cputime / 1000000, \
197 ((bte)->cputime % 1000000) / 10000, \
198 (bte)->inblock, (bte)->oublock); \
199 } else { \
200 printf(fmt, (bte)->cpuid, msecs, delta, \
201 (bte)->tdname, (bte)->name, (bte)->pid, \
202 (bte)->cputime / 1000000, \
203 ((bte)->cputime % 1000000) / 10000, \
204 (bte)->inblock, (bte)->oublock); \
205 } \
206 } while (0)
207
208 /*
209 * Print the trace entries to the message buffer, or to an sbuf, if provided.
210 *
211 * Entries with a difference less than dthres will not be printed.
212 */
213 static void
boottrace_display(struct sbuf * sbp,struct bt_table * btp,uint64_t dthres)214 boottrace_display(struct sbuf *sbp, struct bt_table *btp, uint64_t dthres)
215 {
216 struct bt_event *evtp;
217 struct bt_event *last_evtp;
218 uint64_t msecs;
219 uint64_t first_msecs;
220 uint64_t last_msecs;
221 uint64_t dmsecs;
222 uint64_t last_dmsecs;
223 uint64_t total_dmsecs;
224 uint32_t i;
225 uint32_t curr;
226 const char *fmt = "%3u %10llu %10llu %-24s %-40s %5d %4d.%02d %5u %5u\n";
227 const char *hdr_fmt = "\n\n%3s %10s %10s %-24s %-40s %5s %6s %5s %5s\n";
228 bool printed;
229 bool last_printed;
230
231 /* Print the header */
232 if (sbp != NULL)
233 sbuf_printf(sbp, hdr_fmt,
234 "CPU", "msecs", "delta", "process",
235 "event", "PID", "CPUtime", "IBlks", "OBlks");
236 else
237 printf(hdr_fmt,
238 "CPU", "msecs", "delta", "process",
239 "event", "PID", "CPUtime", "IBlks", "OBlks");
240
241 first_msecs = 0;
242 last_evtp = NULL;
243 last_msecs = 0;
244 last_dmsecs = 0;
245 last_printed = false;
246 i = curr = btp->curr;
247
248 do {
249 evtp = &btp->table[i];
250 if (evtp->tsc == 0)
251 goto next;
252
253 msecs = cputick2usec(evtp->tick) / 1000;
254 dmsecs = (last_msecs != 0 && msecs > last_msecs) ?
255 msecs - last_msecs : 0;
256 printed = false;
257
258 /*
259 * If a threshold is defined, start filtering events by the
260 * delta of msecs.
261 */
262 if (dthres != 0 && (dmsecs > dthres)) {
263 /*
264 * Print the previous entry as a landmark, even if it
265 * falls below the threshold.
266 */
267 if (last_evtp != NULL && !last_printed)
268 BTC_DELTA_PRINT(last_evtp, last_msecs,
269 last_dmsecs);
270 BTC_DELTA_PRINT(evtp, msecs, dmsecs);
271 printed = true;
272 } else if (dthres == 0) {
273 BTC_DELTA_PRINT(evtp, msecs, dmsecs);
274 printed = true;
275 }
276 if (first_msecs == 0 || msecs < first_msecs)
277 first_msecs = msecs;
278 last_evtp = evtp;
279 last_msecs = msecs;
280 last_dmsecs = dmsecs;
281 last_printed = printed;
282 maybe_yield();
283 next:
284 i = (i + 1) % btp->size;
285 } while (i != curr);
286
287 total_dmsecs = last_msecs > first_msecs ?
288 (last_msecs - first_msecs) : 0;
289 if (sbp != NULL)
290 sbuf_printf(sbp, "Total measured time: %ju msecs\n",
291 (uintmax_t)total_dmsecs);
292 else
293 printf("Total measured time: %ju msecs\n",
294 (uintmax_t)total_dmsecs);
295 }
296
297 /*
298 * Dump trace table entries to the console, given a delta threshold.
299 */
300 void
boottrace_dump_console(void)301 boottrace_dump_console(void)
302 {
303 if (!boottrace_enabled)
304 return;
305
306 if (shutdown || rebooting || KERNEL_PANICKED()) {
307 boottrace_display(NULL, &st, shutdown_trace_threshold);
308 } else {
309 boottrace_display(NULL, &bt, 0);
310 boottrace_display(NULL, &rt, 0);
311 }
312 }
313
314 /*
315 * Records a new tracing event to the specified table.
316 *
317 * We don't use a lock because we want this to be callable from interrupt
318 * context.
319 */
320 static int
dotrace(struct bt_table * btp,const char * eventname,const char * tdname)321 dotrace(struct bt_table *btp, const char *eventname, const char *tdname)
322 {
323 uint32_t idx, nxt;
324 struct rusage usage;
325
326 MPASS(boottrace_enabled);
327 if (tdname == NULL)
328 tdname = (curproc->p_flag & P_SYSTEM) ?
329 curthread->td_name : curproc->p_comm;
330
331 dprintf("dotrace[");
332 dprintf("cpu=%u, pid=%d, tsc=%ju, tick=%d, td='%s', event='%s'",
333 PCPU_GET(cpuid), curthread->td_proc->p_pid,
334 (uintmax_t)get_cyclecount(), ticks, tdname, eventname);
335 if (btp->table == NULL) {
336 btp->drops_early++;
337 dprintf(", return=ENOSPC_1]\n");
338 return (ENOSPC);
339 }
340
341 /* Claim a slot in the table. */
342 do {
343 idx = btp->curr;
344 nxt = (idx + 1) % btp->size;
345 if (nxt == 0 && btp->wrap == 0) {
346 btp->drops_full++;
347 dprintf(", return=ENOSPC_2]\n");
348 return (ENOSPC);
349 }
350 } while (!atomic_cmpset_int(&btp->curr, idx, nxt));
351
352 btp->table[idx].cpuid = PCPU_GET(cpuid);
353 btp->table[idx].tsc = get_cyclecount(),
354 btp->table[idx].tick = cpu_ticks();
355 btp->table[idx].pid = curthread->td_proc->p_pid;
356
357 /*
358 * Get the resource usage for the process. Don't attempt this for the
359 * kernel proc0 or for critical section activities.
360 */
361 if ((curthread->td_proc == &proc0) || (curthread->td_critnest != 0)) {
362 btp->table[idx].cputime = 0;
363 btp->table[idx].inblock = 0;
364 btp->table[idx].oublock = 0;
365 } else {
366 kern_getrusage(curthread, RUSAGE_CHILDREN, &usage);
367 btp->table[idx].cputime =
368 (uint64_t)(usage.ru_utime.tv_sec * 1000000 +
369 usage.ru_utime.tv_usec + usage.ru_stime.tv_sec * 1000000 +
370 usage.ru_stime.tv_usec);
371 btp->table[idx].inblock = (uint32_t)usage.ru_inblock;
372 btp->table[idx].oublock = (uint32_t)usage.ru_oublock;
373 }
374 strlcpy(btp->table[idx].name, eventname, BT_EVENT_NAMELEN);
375 strlcpy(btp->table[idx].tdname, tdname, BT_EVENT_TDNAMELEN);
376
377 dprintf(", return=0]\n");
378 return (0);
379 }
380
381 /*
382 * Log various boot-time events, with the trace message encoded using
383 * printf-like arguments.
384 */
385 int
boottrace(const char * tdname,const char * fmt,...)386 boottrace(const char *tdname, const char *fmt, ...)
387 {
388 char eventname[BT_EVENT_NAMELEN];
389 struct bt_table *btp;
390 va_list ap;
391
392 if (!dotrace_kernel)
393 return (ENXIO);
394
395 va_start(ap, fmt);
396 vsnprintf(eventname, sizeof(eventname), fmt, ap);
397 va_end(ap);
398
399 btp = &bt;
400 if (bootdone)
401 btp = &rt;
402 if (shutdown || rebooting || KERNEL_PANICKED())
403 btp = &st;
404
405 return (dotrace(btp, eventname, tdname));
406 }
407
408 /*
409 * Log a run-time event & switch over to run-time tracing mode.
410 */
411 static int
runtrace(const char * eventname,const char * tdname)412 runtrace(const char *eventname, const char *tdname)
413 {
414
415 bootdone = true;
416 return (dotrace(&rt, eventname, tdname));
417 }
418
419 /*
420 * Parse a boottrace message from userspace.
421 *
422 * The input from may contain a ':' to denote tdname. If not, tdname is
423 * inferred from the process' name.
424 *
425 * e.g. reboot(8):SIGINT to init(8)
426 */
427 static void
boottrace_parse_message(char * message,char ** eventname,char ** tdname)428 boottrace_parse_message(char *message, char **eventname, char **tdname)
429 {
430 char *delim;
431
432 delim = strchr(message, ':');
433 if (delim != NULL) {
434 *delim = '\0';
435 *tdname = message;
436 *eventname = ++delim;
437 } else {
438 *tdname = curproc->p_comm;
439 *eventname = message;
440 }
441 }
442
443 static int
_boottrace_sysctl(struct bt_table * btp,struct sysctl_oid * oidp,struct sysctl_req * req)444 _boottrace_sysctl(struct bt_table *btp, struct sysctl_oid *oidp,
445 struct sysctl_req *req)
446 {
447 char message[BT_MSGLEN];
448 char *eventname, *tdname;
449 int error;
450
451 if (!dotrace_user)
452 return (ENXIO);
453
454 message[0] = '\0';
455 error = sysctl_handle_string(oidp, message, sizeof(message), req);
456 if (error)
457 return (error);
458
459 boottrace_parse_message(message, &eventname, &tdname);
460 error = dotrace(btp, eventname, tdname);
461 if (error == ENOSPC) {
462 /* Ignore table full error. */
463 error = 0;
464 }
465 return (error);
466 }
467
468 static int
sysctl_log(SYSCTL_HANDLER_ARGS)469 sysctl_log(SYSCTL_HANDLER_ARGS)
470 {
471 struct sbuf *sbuf;
472 int error;
473
474 if (!boottrace_enabled || req->newptr != NULL)
475 return (0);
476
477 sbuf = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND);
478 boottrace_display(sbuf, &bt, 0);
479 boottrace_display(sbuf, &rt, 0);
480 sbuf_finish(sbuf);
481 error = SYSCTL_OUT(req, sbuf_data(sbuf), sbuf_len(sbuf));
482 sbuf_delete(sbuf);
483
484 return (error);
485 }
486
487 static int
sysctl_boottrace(SYSCTL_HANDLER_ARGS)488 sysctl_boottrace(SYSCTL_HANDLER_ARGS)
489 {
490
491 if (!boottrace_enabled)
492 return (0);
493
494 /* No output */
495 if (req->newptr == NULL)
496 return (0);
497
498 /* Trace to rt if we have reached multi-user. */
499 return (_boottrace_sysctl(bootdone ? &rt : &bt, oidp, req));
500 }
501
502 /*
503 * Log a run-time event and switch over to run-time tracing mode.
504 */
505 static int
sysctl_runtrace(SYSCTL_HANDLER_ARGS)506 sysctl_runtrace(SYSCTL_HANDLER_ARGS)
507 {
508
509 if (!boottrace_enabled)
510 return (0);
511
512 /* No output */
513 if (req->newptr == NULL)
514 return (0);
515
516 bootdone = true;
517 return (_boottrace_sysctl(&rt, oidp, req));
518 }
519
520 /*
521 * Log a shutdown-time event and switch over to shutdown tracing mode.
522 */
523 static int
sysctl_shuttrace(SYSCTL_HANDLER_ARGS)524 sysctl_shuttrace(SYSCTL_HANDLER_ARGS)
525 {
526
527 if (!boottrace_enabled)
528 return (0);
529
530 /* No output */
531 if (req->newptr == NULL)
532 return (0);
533
534 shutdown = true;
535 return (_boottrace_sysctl(&st, oidp, req));
536 }
537
538 /*
539 * Reset the runtime tracing buffer.
540 */
541 void
boottrace_reset(const char * actor)542 boottrace_reset(const char *actor)
543 {
544 char tmpbuf[64];
545
546 snprintf(tmpbuf, sizeof(tmpbuf), "reset: %s", actor);
547 runtrace(tmpbuf, NULL);
548 }
549
550 /*
551 * Note that a resize implies a reset, i.e., the index is reset to 0.
552 * We never shrink the array; we can only increase its size.
553 */
554 int
boottrace_resize(u_int newsize)555 boottrace_resize(u_int newsize)
556 {
557
558 if (newsize <= rt.size) {
559 return (EINVAL);
560 }
561 rt.table = realloc(rt.table, newsize * sizeof(struct bt_event),
562 M_BOOTTRACE, M_WAITOK | M_ZERO);
563 rt.size = newsize;
564 boottrace_reset("boottrace_resize");
565 return (0);
566 }
567
568 static int
sysctl_boottrace_reset(SYSCTL_HANDLER_ARGS)569 sysctl_boottrace_reset(SYSCTL_HANDLER_ARGS)
570 {
571
572 if (!boottrace_enabled)
573 return (0);
574
575 if (req->newptr != NULL)
576 boottrace_reset("sysctl_boottrace_reset");
577
578 return (0);
579 }
580
581 static void
boottrace_init(void)582 boottrace_init(void)
583 {
584
585 if (!boottrace_enabled)
586 return;
587
588 /* Boottime trace table */
589 bt.size = BT_TABLE_DEFSIZE;
590 TUNABLE_INT_FETCH("kern.boottrace.table_size", &bt.size);
591 bt.size = max(bt.size, BT_TABLE_MINSIZE);
592 bt.table = malloc(bt.size * sizeof(struct bt_event), M_BOOTTRACE,
593 M_WAITOK | M_ZERO);
594
595 /* Stick in an initial entry. */
596 bt.table[0].cpuid = PCPU_GET(cpuid);
597 strlcpy(bt.table[0].tdname, "boottime", BT_EVENT_TDNAMELEN);
598 strlcpy(bt.table[0].name, "initial event", BT_EVENT_NAMELEN);
599 bt.curr = 1;
600
601 /* Run-time trace table (may wrap-around). */
602 rt.wrap = 1;
603 rt.size = BT_TABLE_RUNSIZE;
604 rt.table = malloc(rt.size * sizeof(struct bt_event), M_BOOTTRACE,
605 M_WAITOK | M_ZERO);
606
607 /* Shutdown trace table */
608 st.size = BT_TABLE_SHTSIZE;
609 st.table = malloc(st.size * sizeof(struct bt_event), M_BOOTTRACE,
610 M_WAITOK | M_ZERO);
611 }
612 SYSINIT(boottrace, SI_SUB_CPU, SI_ORDER_ANY, boottrace_init, NULL);
613