1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2008-2022 NetApp, Inc. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27 #include <sys/param.h> 28 #include <sys/systm.h> 29 #include <sys/boottrace.h> 30 #include <sys/kernel.h> 31 #include <sys/malloc.h> 32 #include <sys/pcpu.h> 33 #include <sys/proc.h> 34 #include <sys/resourcevar.h> 35 #include <sys/sbuf.h> 36 #include <sys/stdarg.h> 37 #include <sys/syscallsubr.h> 38 #include <sys/sysctl.h> 39 #include <sys/time.h> 40 41 #define dprintf(fmt, ...) \ 42 do { \ 43 if (dotrace_debugging) \ 44 printf(fmt, ##__VA_ARGS__); \ 45 } while (0); 46 47 static MALLOC_DEFINE(M_BOOTTRACE, "boottrace", "memory for boot tracing"); 48 49 #define BT_TABLE_DEFSIZE 3000 50 #define BT_TABLE_RUNSIZE 2000 51 #define BT_TABLE_SHTSIZE 1000 52 #define BT_TABLE_MINSIZE 500 53 54 /* 55 * Boot-time & shutdown-time event. 56 */ 57 struct bt_event { 58 uint64_t tsc; /* Timestamp */ 59 uint64_t tick; /* Kernel tick */ 60 uint64_t cputime; /* Microseconds of process CPU time */ 61 uint32_t cpuid; /* CPU on which the event ran */ 62 uint32_t inblock; /* # of blocks in */ 63 uint32_t oublock; /* # of blocks out */ 64 pid_t pid; /* Current PID */ 65 char name[BT_EVENT_NAMELEN]; /* Event name */ 66 char tdname[BT_EVENT_TDNAMELEN]; /* Thread name */ 67 }; 68 69 struct bt_table { 70 uint32_t size; /* Trace table size */ 71 uint32_t curr; /* Trace entry to use */ 72 uint32_t wrap; /* Wrap-around, instead of dropping */ 73 uint32_t drops_early; /* Trace entries dropped before init */ 74 uint32_t drops_full; /* Trace entries dropped after full */ 75 struct bt_event *table; 76 }; 77 78 /* Boot-time tracing */ 79 static struct bt_table bt; 80 81 /* Run-time tracing */ 82 static struct bt_table rt; 83 84 /* Shutdown-time tracing */ 85 static struct bt_table st; 86 87 /* Set when system boot is complete, and we've switched to runtime tracing. */ 88 static bool bootdone; 89 90 /* Set when system shutdown has started. */ 91 static bool shutdown; 92 93 /* Turn on dotrace() debug logging to console. */ 94 static bool dotrace_debugging; 95 TUNABLE_BOOL("debug.boottrace.dotrace_debugging", &dotrace_debugging); 96 97 /* Trace kernel events */ 98 static bool dotrace_kernel = true; 99 TUNABLE_BOOL("kern.boottrace.dotrace_kernel", &dotrace_kernel); 100 101 /* Trace userspace events */ 102 static bool dotrace_user = true; 103 TUNABLE_BOOL("kern.boottrace.dotrace_user", &dotrace_user); 104 105 static int sysctl_log(SYSCTL_HANDLER_ARGS); 106 static int sysctl_boottrace(SYSCTL_HANDLER_ARGS); 107 static int sysctl_runtrace(SYSCTL_HANDLER_ARGS); 108 static int sysctl_shuttrace(SYSCTL_HANDLER_ARGS); 109 static int sysctl_boottrace_reset(SYSCTL_HANDLER_ARGS); 110 111 SYSCTL_NODE(_kern, OID_AUTO, boottrace, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 112 "boottrace statistics"); 113 114 SYSCTL_PROC(_kern_boottrace, OID_AUTO, log, 115 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP, 116 0, 0, sysctl_log, "A", 117 "Print a log of the boottrace trace data"); 118 SYSCTL_PROC(_kern_boottrace, OID_AUTO, boottrace, 119 CTLTYPE_STRING | CTLFLAG_WR | CTLFLAG_MPSAFE, 120 0, 0, sysctl_boottrace, "A", 121 "Capture a boot-time trace event"); 122 SYSCTL_PROC(_kern_boottrace, OID_AUTO, runtrace, 123 CTLTYPE_STRING | CTLFLAG_WR | CTLFLAG_MPSAFE, 124 0, 0, sysctl_runtrace, "A", 125 "Capture a run-time trace event"); 126 SYSCTL_PROC(_kern_boottrace, OID_AUTO, shuttrace, 127 CTLTYPE_STRING | CTLFLAG_WR | CTLFLAG_MPSAFE, 128 0, 0, sysctl_shuttrace, "A", 129 "Capture a shutdown-time trace event"); 130 SYSCTL_PROC(_kern_boottrace, OID_AUTO, reset, 131 CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_MPSAFE, 132 0, 0, sysctl_boottrace_reset, "A", 133 "Reset run-time tracing table"); 134 135 /* 136 * Global enable. 137 */ 138 bool __read_mostly boottrace_enabled = false; 139 SYSCTL_BOOL(_kern_boottrace, OID_AUTO, enabled, CTLFLAG_RDTUN, 140 &boottrace_enabled, 0, 141 "Boot-time and shutdown-time tracing enabled"); 142 143 /* 144 * Enable dumping of the shutdown trace entries to console. 145 */ 146 bool shutdown_trace = false; 147 SYSCTL_BOOL(_kern_boottrace, OID_AUTO, shutdown_trace, CTLFLAG_RWTUN, 148 &shutdown_trace, 0, 149 "Enable kernel shutdown tracing to the console"); 150 151 /* 152 * Set the delta threshold (ms) below which events are ignored, used in 153 * determining what to dump to the console. 154 */ 155 static int shutdown_trace_threshold; 156 SYSCTL_INT(_kern_boottrace, OID_AUTO, shutdown_trace_threshold, CTLFLAG_RWTUN, 157 &shutdown_trace_threshold, 0, 158 "Tracing threshold (ms) below which tracing is ignored"); 159 160 SYSCTL_UINT(_kern_boottrace, OID_AUTO, table_size, 161 CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &bt.size, 0, 162 "Boot-time tracing table size"); 163 164 /* 165 * Dump a trace to buffer or if buffer is NULL to console. 166 * 167 * Non-zero delta_threshold selectively prints entries based on delta 168 * with current and previous entry. Otherwise, delta_threshold of 0 169 * prints every trace entry and delta. 170 * 171 * Output something like this: 172 * 173 * CPU msecs delta process event 174 * 11 1228262715 0 init shutdown pre sync begin 175 * 3 1228265622 2907 init shutdown pre sync complete 176 * 3 1228265623 0 init shutdown turned swap off 177 * 18 1228266466 843 init shutdown unmounted all filesystems 178 * 179 * How to interpret: 180 * 181 * delta column represents the time in milliseconds between this event and the previous. 182 * Usually that means you can take the previous event, current event, match them 183 * up in the code, and whatever lies between is the culprit taking time. 184 * 185 * For example, above: Pre-Sync is taking 2907ms, and something between swap and unmount 186 * filesystems is taking 843 milliseconds. 187 * 188 * An event with a delta of 0 are 'landmark' events that simply exist in the output 189 * for the developer to know where the time measurement begins. The 0 is an arbitrary 190 * number that can effectively be ignored. 191 */ 192 #define BTC_DELTA_PRINT(bte, msecs, delta) do { \ 193 if (sbp) { \ 194 sbuf_printf(sbp, fmt, (bte)->cpuid, msecs, delta, \ 195 (bte)->tdname, (bte)->name, (bte)->pid, \ 196 (bte)->cputime / 1000000, \ 197 ((bte)->cputime % 1000000) / 10000, \ 198 (bte)->inblock, (bte)->oublock); \ 199 } else { \ 200 printf(fmt, (bte)->cpuid, msecs, delta, \ 201 (bte)->tdname, (bte)->name, (bte)->pid, \ 202 (bte)->cputime / 1000000, \ 203 ((bte)->cputime % 1000000) / 10000, \ 204 (bte)->inblock, (bte)->oublock); \ 205 } \ 206 } while (0) 207 208 /* 209 * Print the trace entries to the message buffer, or to an sbuf, if provided. 210 * 211 * Entries with a difference less than dthres will not be printed. 212 */ 213 static void 214 boottrace_display(struct sbuf *sbp, struct bt_table *btp, uint64_t dthres) 215 { 216 struct bt_event *evtp; 217 struct bt_event *last_evtp; 218 uint64_t msecs; 219 uint64_t first_msecs; 220 uint64_t last_msecs; 221 uint64_t dmsecs; 222 uint64_t last_dmsecs; 223 uint64_t total_dmsecs; 224 uint32_t i; 225 uint32_t curr; 226 const char *fmt = "%3u %10llu %10llu %-24s %-40s %5d %4d.%02d %5u %5u\n"; 227 const char *hdr_fmt = "\n\n%3s %10s %10s %-24s %-40s %5s %6s %5s %5s\n"; 228 bool printed; 229 bool last_printed; 230 231 /* Print the header */ 232 if (sbp != NULL) 233 sbuf_printf(sbp, hdr_fmt, 234 "CPU", "msecs", "delta", "process", 235 "event", "PID", "CPUtime", "IBlks", "OBlks"); 236 else 237 printf(hdr_fmt, 238 "CPU", "msecs", "delta", "process", 239 "event", "PID", "CPUtime", "IBlks", "OBlks"); 240 241 first_msecs = 0; 242 last_evtp = NULL; 243 last_msecs = 0; 244 last_dmsecs = 0; 245 last_printed = false; 246 i = curr = btp->curr; 247 248 do { 249 evtp = &btp->table[i]; 250 if (evtp->tsc == 0) 251 goto next; 252 253 msecs = cputick2usec(evtp->tick) / 1000; 254 dmsecs = (last_msecs != 0 && msecs > last_msecs) ? 255 msecs - last_msecs : 0; 256 printed = false; 257 258 /* 259 * If a threshold is defined, start filtering events by the 260 * delta of msecs. 261 */ 262 if (dthres != 0 && (dmsecs > dthres)) { 263 /* 264 * Print the previous entry as a landmark, even if it 265 * falls below the threshold. 266 */ 267 if (last_evtp != NULL && !last_printed) 268 BTC_DELTA_PRINT(last_evtp, last_msecs, 269 last_dmsecs); 270 BTC_DELTA_PRINT(evtp, msecs, dmsecs); 271 printed = true; 272 } else if (dthres == 0) { 273 BTC_DELTA_PRINT(evtp, msecs, dmsecs); 274 printed = true; 275 } 276 if (first_msecs == 0 || msecs < first_msecs) 277 first_msecs = msecs; 278 last_evtp = evtp; 279 last_msecs = msecs; 280 last_dmsecs = dmsecs; 281 last_printed = printed; 282 maybe_yield(); 283 next: 284 i = (i + 1) % btp->size; 285 } while (i != curr); 286 287 total_dmsecs = last_msecs > first_msecs ? 288 (last_msecs - first_msecs) : 0; 289 if (sbp != NULL) 290 sbuf_printf(sbp, "Total measured time: %ju msecs\n", 291 (uintmax_t)total_dmsecs); 292 else 293 printf("Total measured time: %ju msecs\n", 294 (uintmax_t)total_dmsecs); 295 } 296 297 /* 298 * Dump trace table entries to the console, given a delta threshold. 299 */ 300 void 301 boottrace_dump_console(void) 302 { 303 if (!boottrace_enabled) 304 return; 305 306 if (shutdown || rebooting || KERNEL_PANICKED()) { 307 boottrace_display(NULL, &st, shutdown_trace_threshold); 308 } else { 309 boottrace_display(NULL, &bt, 0); 310 boottrace_display(NULL, &rt, 0); 311 } 312 } 313 314 /* 315 * Records a new tracing event to the specified table. 316 * 317 * We don't use a lock because we want this to be callable from interrupt 318 * context. 319 */ 320 static int 321 dotrace(struct bt_table *btp, const char *eventname, const char *tdname) 322 { 323 uint32_t idx, nxt; 324 struct rusage usage; 325 326 MPASS(boottrace_enabled); 327 if (tdname == NULL) 328 tdname = (curproc->p_flag & P_SYSTEM) ? 329 curthread->td_name : curproc->p_comm; 330 331 dprintf("dotrace["); 332 dprintf("cpu=%u, pid=%d, tsc=%ju, tick=%d, td='%s', event='%s'", 333 PCPU_GET(cpuid), curthread->td_proc->p_pid, 334 (uintmax_t)get_cyclecount(), ticks, tdname, eventname); 335 if (btp->table == NULL) { 336 btp->drops_early++; 337 dprintf(", return=ENOSPC_1]\n"); 338 return (ENOSPC); 339 } 340 341 /* Claim a slot in the table. */ 342 do { 343 idx = btp->curr; 344 nxt = (idx + 1) % btp->size; 345 if (nxt == 0 && btp->wrap == 0) { 346 btp->drops_full++; 347 dprintf(", return=ENOSPC_2]\n"); 348 return (ENOSPC); 349 } 350 } while (!atomic_cmpset_int(&btp->curr, idx, nxt)); 351 352 btp->table[idx].cpuid = PCPU_GET(cpuid); 353 btp->table[idx].tsc = get_cyclecount(), 354 btp->table[idx].tick = cpu_ticks(); 355 btp->table[idx].pid = curthread->td_proc->p_pid; 356 357 /* 358 * Get the resource usage for the process. Don't attempt this for the 359 * kernel proc0 or for critical section activities. 360 */ 361 if ((curthread->td_proc == &proc0) || (curthread->td_critnest != 0)) { 362 btp->table[idx].cputime = 0; 363 btp->table[idx].inblock = 0; 364 btp->table[idx].oublock = 0; 365 } else { 366 kern_getrusage(curthread, RUSAGE_CHILDREN, &usage); 367 btp->table[idx].cputime = 368 (uint64_t)(usage.ru_utime.tv_sec * 1000000 + 369 usage.ru_utime.tv_usec + usage.ru_stime.tv_sec * 1000000 + 370 usage.ru_stime.tv_usec); 371 btp->table[idx].inblock = (uint32_t)usage.ru_inblock; 372 btp->table[idx].oublock = (uint32_t)usage.ru_oublock; 373 } 374 strlcpy(btp->table[idx].name, eventname, BT_EVENT_NAMELEN); 375 strlcpy(btp->table[idx].tdname, tdname, BT_EVENT_TDNAMELEN); 376 377 dprintf(", return=0]\n"); 378 return (0); 379 } 380 381 /* 382 * Log various boot-time events, with the trace message encoded using 383 * printf-like arguments. 384 */ 385 int 386 boottrace(const char *tdname, const char *fmt, ...) 387 { 388 char eventname[BT_EVENT_NAMELEN]; 389 struct bt_table *btp; 390 va_list ap; 391 392 if (!dotrace_kernel) 393 return (ENXIO); 394 395 va_start(ap, fmt); 396 vsnprintf(eventname, sizeof(eventname), fmt, ap); 397 va_end(ap); 398 399 btp = &bt; 400 if (bootdone) 401 btp = &rt; 402 if (shutdown || rebooting || KERNEL_PANICKED()) 403 btp = &st; 404 405 return (dotrace(btp, eventname, tdname)); 406 } 407 408 /* 409 * Log a run-time event & switch over to run-time tracing mode. 410 */ 411 static int 412 runtrace(const char *eventname, const char *tdname) 413 { 414 415 bootdone = true; 416 return (dotrace(&rt, eventname, tdname)); 417 } 418 419 /* 420 * Parse a boottrace message from userspace. 421 * 422 * The input from may contain a ':' to denote tdname. If not, tdname is 423 * inferred from the process' name. 424 * 425 * e.g. reboot(8):SIGINT to init(8) 426 */ 427 static void 428 boottrace_parse_message(char *message, char **eventname, char **tdname) 429 { 430 char *delim; 431 432 delim = strchr(message, ':'); 433 if (delim != NULL) { 434 *delim = '\0'; 435 *tdname = message; 436 *eventname = ++delim; 437 } else { 438 *tdname = curproc->p_comm; 439 *eventname = message; 440 } 441 } 442 443 static int 444 _boottrace_sysctl(struct bt_table *btp, struct sysctl_oid *oidp, 445 struct sysctl_req *req) 446 { 447 char message[BT_MSGLEN]; 448 char *eventname, *tdname; 449 int error; 450 451 if (!dotrace_user) 452 return (ENXIO); 453 454 message[0] = '\0'; 455 error = sysctl_handle_string(oidp, message, sizeof(message), req); 456 if (error) 457 return (error); 458 459 boottrace_parse_message(message, &eventname, &tdname); 460 error = dotrace(btp, eventname, tdname); 461 if (error == ENOSPC) { 462 /* Ignore table full error. */ 463 error = 0; 464 } 465 return (error); 466 } 467 468 static int 469 sysctl_log(SYSCTL_HANDLER_ARGS) 470 { 471 struct sbuf *sbuf; 472 int error; 473 474 if (!boottrace_enabled || req->newptr != NULL) 475 return (0); 476 477 sbuf = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND); 478 boottrace_display(sbuf, &bt, 0); 479 boottrace_display(sbuf, &rt, 0); 480 sbuf_finish(sbuf); 481 error = SYSCTL_OUT(req, sbuf_data(sbuf), sbuf_len(sbuf)); 482 sbuf_delete(sbuf); 483 484 return (error); 485 } 486 487 static int 488 sysctl_boottrace(SYSCTL_HANDLER_ARGS) 489 { 490 491 if (!boottrace_enabled) 492 return (0); 493 494 /* No output */ 495 if (req->newptr == NULL) 496 return (0); 497 498 /* Trace to rt if we have reached multi-user. */ 499 return (_boottrace_sysctl(bootdone ? &rt : &bt, oidp, req)); 500 } 501 502 /* 503 * Log a run-time event and switch over to run-time tracing mode. 504 */ 505 static int 506 sysctl_runtrace(SYSCTL_HANDLER_ARGS) 507 { 508 509 if (!boottrace_enabled) 510 return (0); 511 512 /* No output */ 513 if (req->newptr == NULL) 514 return (0); 515 516 bootdone = true; 517 return (_boottrace_sysctl(&rt, oidp, req)); 518 } 519 520 /* 521 * Log a shutdown-time event and switch over to shutdown tracing mode. 522 */ 523 static int 524 sysctl_shuttrace(SYSCTL_HANDLER_ARGS) 525 { 526 527 if (!boottrace_enabled) 528 return (0); 529 530 /* No output */ 531 if (req->newptr == NULL) 532 return (0); 533 534 shutdown = true; 535 return (_boottrace_sysctl(&st, oidp, req)); 536 } 537 538 /* 539 * Reset the runtime tracing buffer. 540 */ 541 void 542 boottrace_reset(const char *actor) 543 { 544 char tmpbuf[64]; 545 546 snprintf(tmpbuf, sizeof(tmpbuf), "reset: %s", actor); 547 runtrace(tmpbuf, NULL); 548 } 549 550 /* 551 * Note that a resize implies a reset, i.e., the index is reset to 0. 552 * We never shrink the array; we can only increase its size. 553 */ 554 int 555 boottrace_resize(u_int newsize) 556 { 557 558 if (newsize <= rt.size) { 559 return (EINVAL); 560 } 561 rt.table = realloc(rt.table, newsize * sizeof(struct bt_event), 562 M_BOOTTRACE, M_WAITOK | M_ZERO); 563 rt.size = newsize; 564 boottrace_reset("boottrace_resize"); 565 return (0); 566 } 567 568 static int 569 sysctl_boottrace_reset(SYSCTL_HANDLER_ARGS) 570 { 571 572 if (!boottrace_enabled) 573 return (0); 574 575 if (req->newptr != NULL) 576 boottrace_reset("sysctl_boottrace_reset"); 577 578 return (0); 579 } 580 581 static void 582 boottrace_init(void) 583 { 584 585 if (!boottrace_enabled) 586 return; 587 588 /* Boottime trace table */ 589 bt.size = BT_TABLE_DEFSIZE; 590 TUNABLE_INT_FETCH("kern.boottrace.table_size", &bt.size); 591 bt.size = max(bt.size, BT_TABLE_MINSIZE); 592 bt.table = malloc(bt.size * sizeof(struct bt_event), M_BOOTTRACE, 593 M_WAITOK | M_ZERO); 594 595 /* Stick in an initial entry. */ 596 bt.table[0].cpuid = PCPU_GET(cpuid); 597 strlcpy(bt.table[0].tdname, "boottime", BT_EVENT_TDNAMELEN); 598 strlcpy(bt.table[0].name, "initial event", BT_EVENT_NAMELEN); 599 bt.curr = 1; 600 601 /* Run-time trace table (may wrap-around). */ 602 rt.wrap = 1; 603 rt.size = BT_TABLE_RUNSIZE; 604 rt.table = malloc(rt.size * sizeof(struct bt_event), M_BOOTTRACE, 605 M_WAITOK | M_ZERO); 606 607 /* Shutdown trace table */ 608 st.size = BT_TABLE_SHTSIZE; 609 st.table = malloc(st.size * sizeof(struct bt_event), M_BOOTTRACE, 610 M_WAITOK | M_ZERO); 611 } 612 SYSINIT(boottrace, SI_SUB_CPU, SI_ORDER_ANY, boottrace_init, NULL); 613