1*da5b7e90SMitchell Horne /*- 2*da5b7e90SMitchell Horne * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3*da5b7e90SMitchell Horne * 4*da5b7e90SMitchell Horne * Copyright (c) 2008-2022 NetApp, Inc. 5*da5b7e90SMitchell Horne * 6*da5b7e90SMitchell Horne * Redistribution and use in source and binary forms, with or without 7*da5b7e90SMitchell Horne * modification, are permitted provided that the following conditions 8*da5b7e90SMitchell Horne * are met: 9*da5b7e90SMitchell Horne * 1. Redistributions of source code must retain the above copyright 10*da5b7e90SMitchell Horne * notice, this list of conditions and the following disclaimer. 11*da5b7e90SMitchell Horne * 2. Redistributions in binary form must reproduce the above copyright 12*da5b7e90SMitchell Horne * notice, this list of conditions and the following disclaimer in the 13*da5b7e90SMitchell Horne * documentation and/or other materials provided with the distribution. 14*da5b7e90SMitchell Horne * 15*da5b7e90SMitchell Horne * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 16*da5b7e90SMitchell Horne * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 17*da5b7e90SMitchell Horne * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18*da5b7e90SMitchell Horne * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 19*da5b7e90SMitchell Horne * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 20*da5b7e90SMitchell Horne * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 21*da5b7e90SMitchell Horne * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 22*da5b7e90SMitchell Horne * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23*da5b7e90SMitchell Horne * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 24*da5b7e90SMitchell Horne * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25*da5b7e90SMitchell Horne */ 26*da5b7e90SMitchell Horne 27*da5b7e90SMitchell Horne #include <sys/param.h> 28*da5b7e90SMitchell Horne #include <sys/systm.h> 29*da5b7e90SMitchell Horne #include <sys/boottrace.h> 30*da5b7e90SMitchell Horne #include <sys/kernel.h> 31*da5b7e90SMitchell Horne #include <sys/malloc.h> 32*da5b7e90SMitchell Horne #include <sys/pcpu.h> 33*da5b7e90SMitchell Horne #include <sys/proc.h> 34*da5b7e90SMitchell Horne #include <sys/resourcevar.h> 35*da5b7e90SMitchell Horne #include <sys/sbuf.h> 36*da5b7e90SMitchell Horne #include <sys/syscallsubr.h> 37*da5b7e90SMitchell Horne #include <sys/sysctl.h> 38*da5b7e90SMitchell Horne #include <sys/time.h> 39*da5b7e90SMitchell Horne 40*da5b7e90SMitchell Horne #include <machine/stdarg.h> 41*da5b7e90SMitchell Horne 42*da5b7e90SMitchell Horne #define dprintf(fmt, ...) \ 43*da5b7e90SMitchell Horne do { \ 44*da5b7e90SMitchell Horne if (dotrace_debugging) \ 45*da5b7e90SMitchell Horne printf(fmt, ##__VA_ARGS__); \ 46*da5b7e90SMitchell Horne } while (0); 47*da5b7e90SMitchell Horne 48*da5b7e90SMitchell Horne static MALLOC_DEFINE(M_BOOTTRACE, "boottrace", "memory for boot tracing"); 49*da5b7e90SMitchell Horne 50*da5b7e90SMitchell Horne #define BT_TABLE_DEFSIZE 3000 51*da5b7e90SMitchell Horne #define BT_TABLE_RUNSIZE 2000 52*da5b7e90SMitchell Horne #define BT_TABLE_SHTSIZE 1000 53*da5b7e90SMitchell Horne #define BT_TABLE_MINSIZE 500 54*da5b7e90SMitchell Horne 55*da5b7e90SMitchell Horne /* 56*da5b7e90SMitchell Horne * Boot-time & shutdown-time event. 57*da5b7e90SMitchell Horne */ 58*da5b7e90SMitchell Horne struct bt_event { 59*da5b7e90SMitchell Horne uint64_t tsc; /* Timestamp */ 60*da5b7e90SMitchell Horne uint64_t tick; /* Kernel tick */ 61*da5b7e90SMitchell Horne uint64_t cputime; /* Microseconds of process CPU time */ 62*da5b7e90SMitchell Horne uint32_t cpuid; /* CPU on which the event ran */ 63*da5b7e90SMitchell Horne uint32_t inblock; /* # of blocks in */ 64*da5b7e90SMitchell Horne uint32_t oublock; /* # of blocks out */ 65*da5b7e90SMitchell Horne pid_t pid; /* Current PID */ 66*da5b7e90SMitchell Horne char name[BT_EVENT_NAMELEN]; /* Event name */ 67*da5b7e90SMitchell Horne char tdname[BT_EVENT_TDNAMELEN]; /* Thread name */ 68*da5b7e90SMitchell Horne }; 69*da5b7e90SMitchell Horne 70*da5b7e90SMitchell Horne struct bt_table { 71*da5b7e90SMitchell Horne uint32_t size; /* Trace table size */ 72*da5b7e90SMitchell Horne uint32_t curr; /* Trace entry to use */ 73*da5b7e90SMitchell Horne uint32_t wrap; /* Wrap-around, instead of dropping */ 74*da5b7e90SMitchell Horne uint32_t drops_early; /* Trace entries dropped before init */ 75*da5b7e90SMitchell Horne uint32_t drops_full; /* Trace entries dropped after full */ 76*da5b7e90SMitchell Horne struct bt_event *table; 77*da5b7e90SMitchell Horne }; 78*da5b7e90SMitchell Horne 79*da5b7e90SMitchell Horne /* Boot-time tracing */ 80*da5b7e90SMitchell Horne static struct bt_table bt; 81*da5b7e90SMitchell Horne 82*da5b7e90SMitchell Horne /* Run-time tracing */ 83*da5b7e90SMitchell Horne static struct bt_table rt; 84*da5b7e90SMitchell Horne 85*da5b7e90SMitchell Horne /* Shutdown-time tracing */ 86*da5b7e90SMitchell Horne static struct bt_table st; 87*da5b7e90SMitchell Horne 88*da5b7e90SMitchell Horne /* Set when system boot is complete, and we've switched to runtime tracing. */ 89*da5b7e90SMitchell Horne static bool bootdone; 90*da5b7e90SMitchell Horne 91*da5b7e90SMitchell Horne /* Set when system shutdown has started. */ 92*da5b7e90SMitchell Horne static bool shutdown; 93*da5b7e90SMitchell Horne 94*da5b7e90SMitchell Horne /* Turn on dotrace() debug logging to console. */ 95*da5b7e90SMitchell Horne static bool dotrace_debugging; 96*da5b7e90SMitchell Horne TUNABLE_BOOL("debug.boottrace.dotrace_debugging", &dotrace_debugging); 97*da5b7e90SMitchell Horne 98*da5b7e90SMitchell Horne /* Trace kernel events */ 99*da5b7e90SMitchell Horne static bool dotrace_kernel = true; 100*da5b7e90SMitchell Horne TUNABLE_BOOL("kern.boottrace.dotrace_kernel", &dotrace_kernel); 101*da5b7e90SMitchell Horne 102*da5b7e90SMitchell Horne /* Trace userspace events */ 103*da5b7e90SMitchell Horne static bool dotrace_user = true; 104*da5b7e90SMitchell Horne TUNABLE_BOOL("kern.boottrace.dotrace_user", &dotrace_user); 105*da5b7e90SMitchell Horne 106*da5b7e90SMitchell Horne static int sysctl_log(SYSCTL_HANDLER_ARGS); 107*da5b7e90SMitchell Horne static int sysctl_boottrace(SYSCTL_HANDLER_ARGS); 108*da5b7e90SMitchell Horne static int sysctl_runtrace(SYSCTL_HANDLER_ARGS); 109*da5b7e90SMitchell Horne static int sysctl_shuttrace(SYSCTL_HANDLER_ARGS); 110*da5b7e90SMitchell Horne static int sysctl_boottrace_reset(SYSCTL_HANDLER_ARGS); 111*da5b7e90SMitchell Horne 112*da5b7e90SMitchell Horne SYSCTL_NODE(_kern, OID_AUTO, boottrace, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 113*da5b7e90SMitchell Horne "boottrace statistics"); 114*da5b7e90SMitchell Horne 115*da5b7e90SMitchell Horne SYSCTL_PROC(_kern_boottrace, OID_AUTO, log, 116*da5b7e90SMitchell Horne CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP, 117*da5b7e90SMitchell Horne 0, 0, sysctl_log, "A", 118*da5b7e90SMitchell Horne "Print a log of the boottrace trace data"); 119*da5b7e90SMitchell Horne SYSCTL_PROC(_kern_boottrace, OID_AUTO, boottrace, 120*da5b7e90SMitchell Horne CTLTYPE_STRING | CTLFLAG_WR | CTLFLAG_MPSAFE, 121*da5b7e90SMitchell Horne 0, 0, sysctl_boottrace, "A", 122*da5b7e90SMitchell Horne "Capture a boot-time trace event"); 123*da5b7e90SMitchell Horne SYSCTL_PROC(_kern_boottrace, OID_AUTO, runtrace, 124*da5b7e90SMitchell Horne CTLTYPE_STRING | CTLFLAG_WR | CTLFLAG_MPSAFE, 125*da5b7e90SMitchell Horne 0, 0, sysctl_runtrace, "A", 126*da5b7e90SMitchell Horne "Capture a run-time trace event"); 127*da5b7e90SMitchell Horne SYSCTL_PROC(_kern_boottrace, OID_AUTO, shuttrace, 128*da5b7e90SMitchell Horne CTLTYPE_STRING | CTLFLAG_WR | CTLFLAG_MPSAFE, 129*da5b7e90SMitchell Horne 0, 0, sysctl_shuttrace, "A", 130*da5b7e90SMitchell Horne "Capture a shutdown-time trace event"); 131*da5b7e90SMitchell Horne SYSCTL_PROC(_kern_boottrace, OID_AUTO, reset, 132*da5b7e90SMitchell Horne CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_MPSAFE, 133*da5b7e90SMitchell Horne 0, 0, sysctl_boottrace_reset, "A", 134*da5b7e90SMitchell Horne "Reset run-time tracing table"); 135*da5b7e90SMitchell Horne 136*da5b7e90SMitchell Horne /* 137*da5b7e90SMitchell Horne * Global enable. 138*da5b7e90SMitchell Horne */ 139*da5b7e90SMitchell Horne bool __read_mostly boottrace_enabled = false; 140*da5b7e90SMitchell Horne SYSCTL_BOOL(_kern_boottrace, OID_AUTO, enabled, CTLFLAG_RDTUN, 141*da5b7e90SMitchell Horne &boottrace_enabled, 0, 142*da5b7e90SMitchell Horne "Boot-time and shutdown-time tracing enabled"); 143*da5b7e90SMitchell Horne 144*da5b7e90SMitchell Horne /* 145*da5b7e90SMitchell Horne * Enable dumping of the shutdown trace entries to console. 146*da5b7e90SMitchell Horne */ 147*da5b7e90SMitchell Horne bool shutdown_trace = false; 148*da5b7e90SMitchell Horne SYSCTL_BOOL(_kern_boottrace, OID_AUTO, shutdown_trace, CTLFLAG_RWTUN, 149*da5b7e90SMitchell Horne &shutdown_trace, 0, 150*da5b7e90SMitchell Horne "Enable kernel shutdown tracing to the console"); 151*da5b7e90SMitchell Horne 152*da5b7e90SMitchell Horne /* 153*da5b7e90SMitchell Horne * Set the delta threshold (ms) below which events are ignored, used in 154*da5b7e90SMitchell Horne * determining what to dump to the console. 155*da5b7e90SMitchell Horne */ 156*da5b7e90SMitchell Horne static int shutdown_trace_threshold; 157*da5b7e90SMitchell Horne SYSCTL_INT(_kern_boottrace, OID_AUTO, shutdown_trace_threshold, CTLFLAG_RWTUN, 158*da5b7e90SMitchell Horne &shutdown_trace_threshold, 0, 159*da5b7e90SMitchell Horne "Tracing threshold (ms) below which tracing is ignored"); 160*da5b7e90SMitchell Horne 161*da5b7e90SMitchell Horne /* 162*da5b7e90SMitchell Horne * Dump a trace to buffer or if buffer is NULL to console. 163*da5b7e90SMitchell Horne * 164*da5b7e90SMitchell Horne * Non-zero delta_threshold selectively prints entries based on delta 165*da5b7e90SMitchell Horne * with current and previous entry. Otherwise, delta_threshold of 0 166*da5b7e90SMitchell Horne * prints every trace entry and delta. 167*da5b7e90SMitchell Horne * 168*da5b7e90SMitchell Horne * Output something like this: 169*da5b7e90SMitchell Horne * 170*da5b7e90SMitchell Horne * CPU msecs delta process event 171*da5b7e90SMitchell Horne * 11 1228262715 0 init shutdown pre sync begin 172*da5b7e90SMitchell Horne * 3 1228265622 2907 init shutdown pre sync complete 173*da5b7e90SMitchell Horne * 3 1228265623 0 init shutdown turned swap off 174*da5b7e90SMitchell Horne * 18 1228266466 843 init shutdown unmounted all filesystems 175*da5b7e90SMitchell Horne * 176*da5b7e90SMitchell Horne * How to interpret: 177*da5b7e90SMitchell Horne * 178*da5b7e90SMitchell Horne * delta column represents the time in milliseconds between this event and the previous. 179*da5b7e90SMitchell Horne * Usually that means you can take the previous event, current event, match them 180*da5b7e90SMitchell Horne * up in the code, and whatever lies between is the culprit taking time. 181*da5b7e90SMitchell Horne * 182*da5b7e90SMitchell Horne * For example, above: Pre-Sync is taking 2907ms, and something between swap and unmount 183*da5b7e90SMitchell Horne * filesystems is taking 843 milliseconds. 184*da5b7e90SMitchell Horne * 185*da5b7e90SMitchell Horne * An event with a delta of 0 are 'landmark' events that simply exist in the output 186*da5b7e90SMitchell Horne * for the developer to know where the time measurement begins. The 0 is an arbitrary 187*da5b7e90SMitchell Horne * number that can effectively be ignored. 188*da5b7e90SMitchell Horne */ 189*da5b7e90SMitchell Horne #define BTC_DELTA_PRINT(bte, msecs, delta) do { \ 190*da5b7e90SMitchell Horne if (sbp) { \ 191*da5b7e90SMitchell Horne sbuf_printf(sbp, fmt, (bte)->cpuid, msecs, delta, \ 192*da5b7e90SMitchell Horne (bte)->tdname, (bte)->name, (bte)->pid, \ 193*da5b7e90SMitchell Horne (bte)->cputime / 1000000, \ 194*da5b7e90SMitchell Horne ((bte)->cputime % 1000000) / 10000, \ 195*da5b7e90SMitchell Horne (bte)->inblock, (bte)->oublock); \ 196*da5b7e90SMitchell Horne } else { \ 197*da5b7e90SMitchell Horne printf(fmt, (bte)->cpuid, msecs, delta, \ 198*da5b7e90SMitchell Horne (bte)->tdname, (bte)->name, (bte)->pid, \ 199*da5b7e90SMitchell Horne (bte)->cputime / 1000000, \ 200*da5b7e90SMitchell Horne ((bte)->cputime % 1000000) / 10000, \ 201*da5b7e90SMitchell Horne (bte)->inblock, (bte)->oublock); \ 202*da5b7e90SMitchell Horne } \ 203*da5b7e90SMitchell Horne } while (0) 204*da5b7e90SMitchell Horne 205*da5b7e90SMitchell Horne /* 206*da5b7e90SMitchell Horne * Print the trace entries to the message buffer, or to an sbuf, if provided. 207*da5b7e90SMitchell Horne * 208*da5b7e90SMitchell Horne * Entries with a difference less than dthres will not be printed. 209*da5b7e90SMitchell Horne */ 210*da5b7e90SMitchell Horne static void 211*da5b7e90SMitchell Horne boottrace_display(struct sbuf *sbp, struct bt_table *btp, uint64_t dthres) 212*da5b7e90SMitchell Horne { 213*da5b7e90SMitchell Horne struct bt_event *evtp; 214*da5b7e90SMitchell Horne struct bt_event *last_evtp; 215*da5b7e90SMitchell Horne uint64_t msecs; 216*da5b7e90SMitchell Horne uint64_t first_msecs; 217*da5b7e90SMitchell Horne uint64_t last_msecs; 218*da5b7e90SMitchell Horne uint64_t dmsecs; 219*da5b7e90SMitchell Horne uint64_t last_dmsecs; 220*da5b7e90SMitchell Horne uint64_t total_dmsecs; 221*da5b7e90SMitchell Horne uint32_t i; 222*da5b7e90SMitchell Horne uint32_t curr; 223*da5b7e90SMitchell Horne const char *fmt = "%3u %10llu %10llu %-24s %-40s %5d %4d.%02d %5u %5u\n"; 224*da5b7e90SMitchell Horne const char *hdr_fmt = "\n\n%3s %10s %10s %-24s %-40s %5s %6s %5s %5s\n"; 225*da5b7e90SMitchell Horne bool printed; 226*da5b7e90SMitchell Horne bool last_printed; 227*da5b7e90SMitchell Horne 228*da5b7e90SMitchell Horne /* Print the header */ 229*da5b7e90SMitchell Horne if (sbp != NULL) 230*da5b7e90SMitchell Horne sbuf_printf(sbp, hdr_fmt, 231*da5b7e90SMitchell Horne "CPU", "msecs", "delta", "process", 232*da5b7e90SMitchell Horne "event", "PID", "CPUtime", "IBlks", "OBlks"); 233*da5b7e90SMitchell Horne else 234*da5b7e90SMitchell Horne printf(hdr_fmt, 235*da5b7e90SMitchell Horne "CPU", "msecs", "delta", "process", 236*da5b7e90SMitchell Horne "event", "PID", "CPUtime", "IBlks", "OBlks"); 237*da5b7e90SMitchell Horne 238*da5b7e90SMitchell Horne first_msecs = 0; 239*da5b7e90SMitchell Horne last_evtp = NULL; 240*da5b7e90SMitchell Horne last_msecs = 0; 241*da5b7e90SMitchell Horne last_dmsecs = 0; 242*da5b7e90SMitchell Horne last_printed = false; 243*da5b7e90SMitchell Horne i = curr = btp->curr; 244*da5b7e90SMitchell Horne 245*da5b7e90SMitchell Horne do { 246*da5b7e90SMitchell Horne evtp = &btp->table[i]; 247*da5b7e90SMitchell Horne if (evtp->tsc == 0) 248*da5b7e90SMitchell Horne goto next; 249*da5b7e90SMitchell Horne 250*da5b7e90SMitchell Horne msecs = cputick2usec(evtp->tick) / 1000; 251*da5b7e90SMitchell Horne dmsecs = (last_msecs != 0 && msecs > last_msecs) ? 252*da5b7e90SMitchell Horne msecs - last_msecs : 0; 253*da5b7e90SMitchell Horne printed = false; 254*da5b7e90SMitchell Horne 255*da5b7e90SMitchell Horne /* 256*da5b7e90SMitchell Horne * If a threshold is defined, start filtering events by the 257*da5b7e90SMitchell Horne * delta of msecs. 258*da5b7e90SMitchell Horne */ 259*da5b7e90SMitchell Horne if (dthres != 0 && (dmsecs > dthres)) { 260*da5b7e90SMitchell Horne /* 261*da5b7e90SMitchell Horne * Print the previous entry as a landmark, even if it 262*da5b7e90SMitchell Horne * falls below the threshold. 263*da5b7e90SMitchell Horne */ 264*da5b7e90SMitchell Horne if (last_evtp != NULL && !last_printed) 265*da5b7e90SMitchell Horne BTC_DELTA_PRINT(last_evtp, last_msecs, 266*da5b7e90SMitchell Horne last_dmsecs); 267*da5b7e90SMitchell Horne BTC_DELTA_PRINT(evtp, msecs, dmsecs); 268*da5b7e90SMitchell Horne printed = true; 269*da5b7e90SMitchell Horne } else if (dthres == 0) { 270*da5b7e90SMitchell Horne BTC_DELTA_PRINT(evtp, msecs, dmsecs); 271*da5b7e90SMitchell Horne printed = true; 272*da5b7e90SMitchell Horne } 273*da5b7e90SMitchell Horne if (first_msecs == 0 || msecs < first_msecs) 274*da5b7e90SMitchell Horne first_msecs = msecs; 275*da5b7e90SMitchell Horne last_evtp = evtp; 276*da5b7e90SMitchell Horne last_msecs = msecs; 277*da5b7e90SMitchell Horne last_dmsecs = dmsecs; 278*da5b7e90SMitchell Horne last_printed = printed; 279*da5b7e90SMitchell Horne maybe_yield(); 280*da5b7e90SMitchell Horne next: 281*da5b7e90SMitchell Horne i = (i + 1) % btp->size; 282*da5b7e90SMitchell Horne } while (i != curr); 283*da5b7e90SMitchell Horne 284*da5b7e90SMitchell Horne total_dmsecs = last_msecs > first_msecs ? 285*da5b7e90SMitchell Horne (last_msecs - first_msecs) : 0; 286*da5b7e90SMitchell Horne if (sbp != NULL) 287*da5b7e90SMitchell Horne sbuf_printf(sbp, "Total measured time: %ju msecs\n", 288*da5b7e90SMitchell Horne (uintmax_t)total_dmsecs); 289*da5b7e90SMitchell Horne else 290*da5b7e90SMitchell Horne printf("Total measured time: %ju msecs\n", 291*da5b7e90SMitchell Horne (uintmax_t)total_dmsecs); 292*da5b7e90SMitchell Horne } 293*da5b7e90SMitchell Horne 294*da5b7e90SMitchell Horne /* 295*da5b7e90SMitchell Horne * Dump trace table entries to the console, given a delta threshold. 296*da5b7e90SMitchell Horne */ 297*da5b7e90SMitchell Horne void 298*da5b7e90SMitchell Horne boottrace_dump_console(void) 299*da5b7e90SMitchell Horne { 300*da5b7e90SMitchell Horne if (!boottrace_enabled) 301*da5b7e90SMitchell Horne return; 302*da5b7e90SMitchell Horne 303*da5b7e90SMitchell Horne if (shutdown || rebooting || KERNEL_PANICKED()) { 304*da5b7e90SMitchell Horne boottrace_display(NULL, &st, shutdown_trace_threshold); 305*da5b7e90SMitchell Horne } else { 306*da5b7e90SMitchell Horne boottrace_display(NULL, &bt, 0); 307*da5b7e90SMitchell Horne boottrace_display(NULL, &rt, 0); 308*da5b7e90SMitchell Horne } 309*da5b7e90SMitchell Horne } 310*da5b7e90SMitchell Horne 311*da5b7e90SMitchell Horne /* 312*da5b7e90SMitchell Horne * Records a new tracing event to the specified table. 313*da5b7e90SMitchell Horne * 314*da5b7e90SMitchell Horne * We don't use a lock because we want this to be callable from interrupt 315*da5b7e90SMitchell Horne * context. 316*da5b7e90SMitchell Horne */ 317*da5b7e90SMitchell Horne static int 318*da5b7e90SMitchell Horne dotrace(struct bt_table *btp, const char *eventname, const char *tdname) 319*da5b7e90SMitchell Horne { 320*da5b7e90SMitchell Horne uint32_t idx, nxt; 321*da5b7e90SMitchell Horne struct rusage usage; 322*da5b7e90SMitchell Horne 323*da5b7e90SMitchell Horne MPASS(boottrace_enabled); 324*da5b7e90SMitchell Horne if (tdname == NULL) 325*da5b7e90SMitchell Horne tdname = (curproc->p_flag & P_SYSTEM) ? 326*da5b7e90SMitchell Horne curthread->td_name : curproc->p_comm; 327*da5b7e90SMitchell Horne 328*da5b7e90SMitchell Horne dprintf("dotrace["); 329*da5b7e90SMitchell Horne dprintf("cpu=%u, pid=%d, tsc=%ju, tick=%d, td='%s', event='%s'", 330*da5b7e90SMitchell Horne PCPU_GET(cpuid), curthread->td_proc->p_pid, 331*da5b7e90SMitchell Horne (uintmax_t)get_cyclecount(), ticks, tdname, eventname); 332*da5b7e90SMitchell Horne if (btp->table == NULL) { 333*da5b7e90SMitchell Horne btp->drops_early++; 334*da5b7e90SMitchell Horne dprintf(", return=ENOSPC_1]\n"); 335*da5b7e90SMitchell Horne return (ENOSPC); 336*da5b7e90SMitchell Horne } 337*da5b7e90SMitchell Horne 338*da5b7e90SMitchell Horne /* Claim a slot in the table. */ 339*da5b7e90SMitchell Horne do { 340*da5b7e90SMitchell Horne idx = btp->curr; 341*da5b7e90SMitchell Horne nxt = (idx + 1) % btp->size; 342*da5b7e90SMitchell Horne if (nxt == 0 && btp->wrap == 0) { 343*da5b7e90SMitchell Horne btp->drops_full++; 344*da5b7e90SMitchell Horne dprintf(", return=ENOSPC_2]\n"); 345*da5b7e90SMitchell Horne return (ENOSPC); 346*da5b7e90SMitchell Horne } 347*da5b7e90SMitchell Horne } while (!atomic_cmpset_int(&btp->curr, idx, nxt)); 348*da5b7e90SMitchell Horne 349*da5b7e90SMitchell Horne btp->table[idx].cpuid = PCPU_GET(cpuid); 350*da5b7e90SMitchell Horne btp->table[idx].tsc = get_cyclecount(), 351*da5b7e90SMitchell Horne btp->table[idx].tick = cpu_ticks(); 352*da5b7e90SMitchell Horne btp->table[idx].pid = curthread->td_proc->p_pid; 353*da5b7e90SMitchell Horne 354*da5b7e90SMitchell Horne /* 355*da5b7e90SMitchell Horne * Get the resource usage for the process. Don't attempt this for the 356*da5b7e90SMitchell Horne * kernel proc0 or for critical section activities. 357*da5b7e90SMitchell Horne */ 358*da5b7e90SMitchell Horne if ((curthread->td_proc == &proc0) || (curthread->td_critnest != 0)) { 359*da5b7e90SMitchell Horne btp->table[idx].cputime = 0; 360*da5b7e90SMitchell Horne btp->table[idx].inblock = 0; 361*da5b7e90SMitchell Horne btp->table[idx].oublock = 0; 362*da5b7e90SMitchell Horne } else { 363*da5b7e90SMitchell Horne kern_getrusage(curthread, RUSAGE_CHILDREN, &usage); 364*da5b7e90SMitchell Horne btp->table[idx].cputime = 365*da5b7e90SMitchell Horne (uint64_t)(usage.ru_utime.tv_sec * 1000000 + 366*da5b7e90SMitchell Horne usage.ru_utime.tv_usec + usage.ru_stime.tv_sec * 1000000 + 367*da5b7e90SMitchell Horne usage.ru_stime.tv_usec); 368*da5b7e90SMitchell Horne btp->table[idx].inblock = (uint32_t)usage.ru_inblock; 369*da5b7e90SMitchell Horne btp->table[idx].oublock = (uint32_t)usage.ru_oublock; 370*da5b7e90SMitchell Horne } 371*da5b7e90SMitchell Horne strlcpy(btp->table[idx].name, eventname, BT_EVENT_NAMELEN); 372*da5b7e90SMitchell Horne strlcpy(btp->table[idx].tdname, tdname, BT_EVENT_TDNAMELEN); 373*da5b7e90SMitchell Horne 374*da5b7e90SMitchell Horne dprintf(", return=0]\n"); 375*da5b7e90SMitchell Horne return (0); 376*da5b7e90SMitchell Horne } 377*da5b7e90SMitchell Horne 378*da5b7e90SMitchell Horne /* 379*da5b7e90SMitchell Horne * Log various boot-time events, with the trace message encoded using 380*da5b7e90SMitchell Horne * printf-like arguments. 381*da5b7e90SMitchell Horne */ 382*da5b7e90SMitchell Horne int 383*da5b7e90SMitchell Horne boottrace(const char *tdname, const char *fmt, ...) 384*da5b7e90SMitchell Horne { 385*da5b7e90SMitchell Horne char eventname[BT_EVENT_NAMELEN]; 386*da5b7e90SMitchell Horne struct bt_table *btp; 387*da5b7e90SMitchell Horne va_list ap; 388*da5b7e90SMitchell Horne 389*da5b7e90SMitchell Horne if (!dotrace_kernel) 390*da5b7e90SMitchell Horne return (ENXIO); 391*da5b7e90SMitchell Horne 392*da5b7e90SMitchell Horne va_start(ap, fmt); 393*da5b7e90SMitchell Horne vsnprintf(eventname, sizeof(eventname), fmt, ap); 394*da5b7e90SMitchell Horne va_end(ap); 395*da5b7e90SMitchell Horne 396*da5b7e90SMitchell Horne btp = &bt; 397*da5b7e90SMitchell Horne if (bootdone) 398*da5b7e90SMitchell Horne btp = &rt; 399*da5b7e90SMitchell Horne if (shutdown || rebooting || KERNEL_PANICKED()) 400*da5b7e90SMitchell Horne btp = &st; 401*da5b7e90SMitchell Horne 402*da5b7e90SMitchell Horne return (dotrace(btp, eventname, tdname)); 403*da5b7e90SMitchell Horne } 404*da5b7e90SMitchell Horne 405*da5b7e90SMitchell Horne /* 406*da5b7e90SMitchell Horne * Log a run-time event & switch over to run-time tracing mode. 407*da5b7e90SMitchell Horne */ 408*da5b7e90SMitchell Horne static int 409*da5b7e90SMitchell Horne runtrace(const char *eventname, const char *tdname) 410*da5b7e90SMitchell Horne { 411*da5b7e90SMitchell Horne 412*da5b7e90SMitchell Horne bootdone = true; 413*da5b7e90SMitchell Horne return (dotrace(&rt, eventname, tdname)); 414*da5b7e90SMitchell Horne } 415*da5b7e90SMitchell Horne 416*da5b7e90SMitchell Horne /* 417*da5b7e90SMitchell Horne * Parse a boottrace message from userspace. 418*da5b7e90SMitchell Horne * 419*da5b7e90SMitchell Horne * The input from may contain a ':' to denote tdname. If not, tdname is 420*da5b7e90SMitchell Horne * inferred from the process' name. 421*da5b7e90SMitchell Horne * 422*da5b7e90SMitchell Horne * e.g. reboot(8):SIGINT to init(8) 423*da5b7e90SMitchell Horne */ 424*da5b7e90SMitchell Horne static void 425*da5b7e90SMitchell Horne boottrace_parse_message(char *message, char **eventname, char **tdname) 426*da5b7e90SMitchell Horne { 427*da5b7e90SMitchell Horne char *delim; 428*da5b7e90SMitchell Horne 429*da5b7e90SMitchell Horne delim = strchr(message, ':'); 430*da5b7e90SMitchell Horne if (delim != NULL) { 431*da5b7e90SMitchell Horne *delim = '\0'; 432*da5b7e90SMitchell Horne *tdname = message; 433*da5b7e90SMitchell Horne *eventname = ++delim; 434*da5b7e90SMitchell Horne } else { 435*da5b7e90SMitchell Horne *tdname = curproc->p_comm; 436*da5b7e90SMitchell Horne *eventname = message; 437*da5b7e90SMitchell Horne } 438*da5b7e90SMitchell Horne } 439*da5b7e90SMitchell Horne 440*da5b7e90SMitchell Horne static int 441*da5b7e90SMitchell Horne _boottrace_sysctl(struct bt_table *btp, struct sysctl_oid *oidp, 442*da5b7e90SMitchell Horne struct sysctl_req *req) 443*da5b7e90SMitchell Horne { 444*da5b7e90SMitchell Horne char message[BT_MSGLEN]; 445*da5b7e90SMitchell Horne char *eventname, *tdname; 446*da5b7e90SMitchell Horne int error; 447*da5b7e90SMitchell Horne 448*da5b7e90SMitchell Horne if (!dotrace_user) 449*da5b7e90SMitchell Horne return (ENXIO); 450*da5b7e90SMitchell Horne 451*da5b7e90SMitchell Horne message[0] = '\0'; 452*da5b7e90SMitchell Horne error = sysctl_handle_string(oidp, message, sizeof(message), req); 453*da5b7e90SMitchell Horne if (error) 454*da5b7e90SMitchell Horne return (error); 455*da5b7e90SMitchell Horne 456*da5b7e90SMitchell Horne boottrace_parse_message(message, &eventname, &tdname); 457*da5b7e90SMitchell Horne error = dotrace(btp, eventname, tdname); 458*da5b7e90SMitchell Horne if (error == ENOSPC) { 459*da5b7e90SMitchell Horne /* Ignore table full error. */ 460*da5b7e90SMitchell Horne error = 0; 461*da5b7e90SMitchell Horne } 462*da5b7e90SMitchell Horne return (error); 463*da5b7e90SMitchell Horne } 464*da5b7e90SMitchell Horne 465*da5b7e90SMitchell Horne static int 466*da5b7e90SMitchell Horne sysctl_log(SYSCTL_HANDLER_ARGS) 467*da5b7e90SMitchell Horne { 468*da5b7e90SMitchell Horne struct sbuf *sbuf; 469*da5b7e90SMitchell Horne int error; 470*da5b7e90SMitchell Horne 471*da5b7e90SMitchell Horne if (!boottrace_enabled || req->newptr != NULL) 472*da5b7e90SMitchell Horne return (0); 473*da5b7e90SMitchell Horne 474*da5b7e90SMitchell Horne sbuf = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND); 475*da5b7e90SMitchell Horne boottrace_display(sbuf, &bt, 0); 476*da5b7e90SMitchell Horne boottrace_display(sbuf, &rt, 0); 477*da5b7e90SMitchell Horne sbuf_finish(sbuf); 478*da5b7e90SMitchell Horne error = SYSCTL_OUT(req, sbuf_data(sbuf), sbuf_len(sbuf)); 479*da5b7e90SMitchell Horne sbuf_delete(sbuf); 480*da5b7e90SMitchell Horne 481*da5b7e90SMitchell Horne return (error); 482*da5b7e90SMitchell Horne } 483*da5b7e90SMitchell Horne 484*da5b7e90SMitchell Horne static int 485*da5b7e90SMitchell Horne sysctl_boottrace(SYSCTL_HANDLER_ARGS) 486*da5b7e90SMitchell Horne { 487*da5b7e90SMitchell Horne 488*da5b7e90SMitchell Horne if (!boottrace_enabled) 489*da5b7e90SMitchell Horne return (0); 490*da5b7e90SMitchell Horne 491*da5b7e90SMitchell Horne /* No output */ 492*da5b7e90SMitchell Horne if (req->newptr == NULL) 493*da5b7e90SMitchell Horne return (0); 494*da5b7e90SMitchell Horne 495*da5b7e90SMitchell Horne /* Trace to rt if we have reached multi-user. */ 496*da5b7e90SMitchell Horne return (_boottrace_sysctl(bootdone ? &rt : &bt, oidp, req)); 497*da5b7e90SMitchell Horne } 498*da5b7e90SMitchell Horne 499*da5b7e90SMitchell Horne /* 500*da5b7e90SMitchell Horne * Log a run-time event and switch over to run-time tracing mode. 501*da5b7e90SMitchell Horne */ 502*da5b7e90SMitchell Horne static int 503*da5b7e90SMitchell Horne sysctl_runtrace(SYSCTL_HANDLER_ARGS) 504*da5b7e90SMitchell Horne { 505*da5b7e90SMitchell Horne 506*da5b7e90SMitchell Horne if (!boottrace_enabled) 507*da5b7e90SMitchell Horne return (0); 508*da5b7e90SMitchell Horne 509*da5b7e90SMitchell Horne /* No output */ 510*da5b7e90SMitchell Horne if (req->newptr == NULL) 511*da5b7e90SMitchell Horne return (0); 512*da5b7e90SMitchell Horne 513*da5b7e90SMitchell Horne bootdone = true; 514*da5b7e90SMitchell Horne return (_boottrace_sysctl(&rt, oidp, req)); 515*da5b7e90SMitchell Horne } 516*da5b7e90SMitchell Horne 517*da5b7e90SMitchell Horne /* 518*da5b7e90SMitchell Horne * Log a shutdown-time event and switch over to shutdown tracing mode. 519*da5b7e90SMitchell Horne */ 520*da5b7e90SMitchell Horne static int 521*da5b7e90SMitchell Horne sysctl_shuttrace(SYSCTL_HANDLER_ARGS) 522*da5b7e90SMitchell Horne { 523*da5b7e90SMitchell Horne 524*da5b7e90SMitchell Horne if (!boottrace_enabled) 525*da5b7e90SMitchell Horne return (0); 526*da5b7e90SMitchell Horne 527*da5b7e90SMitchell Horne /* No output */ 528*da5b7e90SMitchell Horne if (req->newptr == NULL) 529*da5b7e90SMitchell Horne return (0); 530*da5b7e90SMitchell Horne 531*da5b7e90SMitchell Horne shutdown = true; 532*da5b7e90SMitchell Horne return (_boottrace_sysctl(&st, oidp, req)); 533*da5b7e90SMitchell Horne } 534*da5b7e90SMitchell Horne 535*da5b7e90SMitchell Horne /* 536*da5b7e90SMitchell Horne * Reset the runtime tracing buffer. 537*da5b7e90SMitchell Horne */ 538*da5b7e90SMitchell Horne void 539*da5b7e90SMitchell Horne boottrace_reset(const char *actor) 540*da5b7e90SMitchell Horne { 541*da5b7e90SMitchell Horne char tmpbuf[64]; 542*da5b7e90SMitchell Horne 543*da5b7e90SMitchell Horne snprintf(tmpbuf, sizeof(tmpbuf), "reset: %s", actor); 544*da5b7e90SMitchell Horne runtrace(tmpbuf, NULL); 545*da5b7e90SMitchell Horne } 546*da5b7e90SMitchell Horne 547*da5b7e90SMitchell Horne /* 548*da5b7e90SMitchell Horne * Note that a resize implies a reset, i.e., the index is reset to 0. 549*da5b7e90SMitchell Horne * We never shrink the array; we can only increase its size. 550*da5b7e90SMitchell Horne */ 551*da5b7e90SMitchell Horne int 552*da5b7e90SMitchell Horne boottrace_resize(u_int newsize) 553*da5b7e90SMitchell Horne { 554*da5b7e90SMitchell Horne 555*da5b7e90SMitchell Horne if (newsize <= rt.size) { 556*da5b7e90SMitchell Horne return (EINVAL); 557*da5b7e90SMitchell Horne } 558*da5b7e90SMitchell Horne rt.table = realloc(rt.table, newsize * sizeof(struct bt_event), 559*da5b7e90SMitchell Horne M_BOOTTRACE, M_WAITOK | M_ZERO); 560*da5b7e90SMitchell Horne if (rt.table == NULL) 561*da5b7e90SMitchell Horne return (ENOMEM); 562*da5b7e90SMitchell Horne 563*da5b7e90SMitchell Horne rt.size = newsize; 564*da5b7e90SMitchell Horne boottrace_reset("boottrace_resize"); 565*da5b7e90SMitchell Horne return (0); 566*da5b7e90SMitchell Horne } 567*da5b7e90SMitchell Horne 568*da5b7e90SMitchell Horne static int 569*da5b7e90SMitchell Horne sysctl_boottrace_reset(SYSCTL_HANDLER_ARGS) 570*da5b7e90SMitchell Horne { 571*da5b7e90SMitchell Horne 572*da5b7e90SMitchell Horne if (!boottrace_enabled) 573*da5b7e90SMitchell Horne return (0); 574*da5b7e90SMitchell Horne 575*da5b7e90SMitchell Horne if (req->newptr != NULL) 576*da5b7e90SMitchell Horne boottrace_reset("sysctl_boottrace_reset"); 577*da5b7e90SMitchell Horne 578*da5b7e90SMitchell Horne return (0); 579*da5b7e90SMitchell Horne } 580*da5b7e90SMitchell Horne 581*da5b7e90SMitchell Horne static void 582*da5b7e90SMitchell Horne boottrace_init(void) 583*da5b7e90SMitchell Horne { 584*da5b7e90SMitchell Horne 585*da5b7e90SMitchell Horne if (!boottrace_enabled) 586*da5b7e90SMitchell Horne return; 587*da5b7e90SMitchell Horne 588*da5b7e90SMitchell Horne /* Boottime trace table */ 589*da5b7e90SMitchell Horne bt.size = BT_TABLE_DEFSIZE; 590*da5b7e90SMitchell Horne TUNABLE_INT_FETCH("kern.boottrace.table_size", &bt.size); 591*da5b7e90SMitchell Horne bt.size = max(bt.size, BT_TABLE_MINSIZE); 592*da5b7e90SMitchell Horne bt.table = malloc(bt.size * sizeof(struct bt_event), M_BOOTTRACE, 593*da5b7e90SMitchell Horne M_WAITOK | M_ZERO); 594*da5b7e90SMitchell Horne 595*da5b7e90SMitchell Horne /* Stick in an initial entry. */ 596*da5b7e90SMitchell Horne bt.table[0].cpuid = PCPU_GET(cpuid); 597*da5b7e90SMitchell Horne strlcpy(bt.table[0].tdname, "boottime", BT_EVENT_TDNAMELEN); 598*da5b7e90SMitchell Horne strlcpy(bt.table[0].name, "initial event", BT_EVENT_NAMELEN); 599*da5b7e90SMitchell Horne bt.curr = 1; 600*da5b7e90SMitchell Horne 601*da5b7e90SMitchell Horne /* Run-time trace table (may wrap-around). */ 602*da5b7e90SMitchell Horne rt.wrap = 1; 603*da5b7e90SMitchell Horne rt.size = BT_TABLE_RUNSIZE; 604*da5b7e90SMitchell Horne rt.table = malloc(rt.size * sizeof(struct bt_event), M_BOOTTRACE, 605*da5b7e90SMitchell Horne M_WAITOK | M_ZERO); 606*da5b7e90SMitchell Horne 607*da5b7e90SMitchell Horne /* Shutdown trace table */ 608*da5b7e90SMitchell Horne st.size = BT_TABLE_SHTSIZE; 609*da5b7e90SMitchell Horne st.table = malloc(st.size * sizeof(struct bt_event), M_BOOTTRACE, 610*da5b7e90SMitchell Horne M_WAITOK | M_ZERO); 611*da5b7e90SMitchell Horne } 612*da5b7e90SMitchell Horne SYSINIT(boottrace, SI_SUB_CPU, SI_ORDER_ANY, boottrace_init, 0); 613