1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2008-2022 NetApp, Inc. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27 #include <sys/param.h> 28 #include <sys/systm.h> 29 #include <sys/boottrace.h> 30 #include <sys/kernel.h> 31 #include <sys/malloc.h> 32 #include <sys/pcpu.h> 33 #include <sys/proc.h> 34 #include <sys/resourcevar.h> 35 #include <sys/sbuf.h> 36 #include <sys/syscallsubr.h> 37 #include <sys/sysctl.h> 38 #include <sys/time.h> 39 40 #include <machine/stdarg.h> 41 42 #define dprintf(fmt, ...) \ 43 do { \ 44 if (dotrace_debugging) \ 45 printf(fmt, ##__VA_ARGS__); \ 46 } while (0); 47 48 static MALLOC_DEFINE(M_BOOTTRACE, "boottrace", "memory for boot tracing"); 49 50 #define BT_TABLE_DEFSIZE 3000 51 #define BT_TABLE_RUNSIZE 2000 52 #define BT_TABLE_SHTSIZE 1000 53 #define BT_TABLE_MINSIZE 500 54 55 /* 56 * Boot-time & shutdown-time event. 57 */ 58 struct bt_event { 59 uint64_t tsc; /* Timestamp */ 60 uint64_t tick; /* Kernel tick */ 61 uint64_t cputime; /* Microseconds of process CPU time */ 62 uint32_t cpuid; /* CPU on which the event ran */ 63 uint32_t inblock; /* # of blocks in */ 64 uint32_t oublock; /* # of blocks out */ 65 pid_t pid; /* Current PID */ 66 char name[BT_EVENT_NAMELEN]; /* Event name */ 67 char tdname[BT_EVENT_TDNAMELEN]; /* Thread name */ 68 }; 69 70 struct bt_table { 71 uint32_t size; /* Trace table size */ 72 uint32_t curr; /* Trace entry to use */ 73 uint32_t wrap; /* Wrap-around, instead of dropping */ 74 uint32_t drops_early; /* Trace entries dropped before init */ 75 uint32_t drops_full; /* Trace entries dropped after full */ 76 struct bt_event *table; 77 }; 78 79 /* Boot-time tracing */ 80 static struct bt_table bt; 81 82 /* Run-time tracing */ 83 static struct bt_table rt; 84 85 /* Shutdown-time tracing */ 86 static struct bt_table st; 87 88 /* Set when system boot is complete, and we've switched to runtime tracing. */ 89 static bool bootdone; 90 91 /* Set when system shutdown has started. */ 92 static bool shutdown; 93 94 /* Turn on dotrace() debug logging to console. */ 95 static bool dotrace_debugging; 96 TUNABLE_BOOL("debug.boottrace.dotrace_debugging", &dotrace_debugging); 97 98 /* Trace kernel events */ 99 static bool dotrace_kernel = true; 100 TUNABLE_BOOL("kern.boottrace.dotrace_kernel", &dotrace_kernel); 101 102 /* Trace userspace events */ 103 static bool dotrace_user = true; 104 TUNABLE_BOOL("kern.boottrace.dotrace_user", &dotrace_user); 105 106 static int sysctl_log(SYSCTL_HANDLER_ARGS); 107 static int sysctl_boottrace(SYSCTL_HANDLER_ARGS); 108 static int sysctl_runtrace(SYSCTL_HANDLER_ARGS); 109 static int sysctl_shuttrace(SYSCTL_HANDLER_ARGS); 110 static int sysctl_boottrace_reset(SYSCTL_HANDLER_ARGS); 111 112 SYSCTL_NODE(_kern, OID_AUTO, boottrace, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 113 "boottrace statistics"); 114 115 SYSCTL_PROC(_kern_boottrace, OID_AUTO, log, 116 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP, 117 0, 0, sysctl_log, "A", 118 "Print a log of the boottrace trace data"); 119 SYSCTL_PROC(_kern_boottrace, OID_AUTO, boottrace, 120 CTLTYPE_STRING | CTLFLAG_WR | CTLFLAG_MPSAFE, 121 0, 0, sysctl_boottrace, "A", 122 "Capture a boot-time trace event"); 123 SYSCTL_PROC(_kern_boottrace, OID_AUTO, runtrace, 124 CTLTYPE_STRING | CTLFLAG_WR | CTLFLAG_MPSAFE, 125 0, 0, sysctl_runtrace, "A", 126 "Capture a run-time trace event"); 127 SYSCTL_PROC(_kern_boottrace, OID_AUTO, shuttrace, 128 CTLTYPE_STRING | CTLFLAG_WR | CTLFLAG_MPSAFE, 129 0, 0, sysctl_shuttrace, "A", 130 "Capture a shutdown-time trace event"); 131 SYSCTL_PROC(_kern_boottrace, OID_AUTO, reset, 132 CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_MPSAFE, 133 0, 0, sysctl_boottrace_reset, "A", 134 "Reset run-time tracing table"); 135 136 /* 137 * Global enable. 138 */ 139 bool __read_mostly boottrace_enabled = false; 140 SYSCTL_BOOL(_kern_boottrace, OID_AUTO, enabled, CTLFLAG_RDTUN, 141 &boottrace_enabled, 0, 142 "Boot-time and shutdown-time tracing enabled"); 143 144 /* 145 * Enable dumping of the shutdown trace entries to console. 146 */ 147 bool shutdown_trace = false; 148 SYSCTL_BOOL(_kern_boottrace, OID_AUTO, shutdown_trace, CTLFLAG_RWTUN, 149 &shutdown_trace, 0, 150 "Enable kernel shutdown tracing to the console"); 151 152 /* 153 * Set the delta threshold (ms) below which events are ignored, used in 154 * determining what to dump to the console. 155 */ 156 static int shutdown_trace_threshold; 157 SYSCTL_INT(_kern_boottrace, OID_AUTO, shutdown_trace_threshold, CTLFLAG_RWTUN, 158 &shutdown_trace_threshold, 0, 159 "Tracing threshold (ms) below which tracing is ignored"); 160 161 SYSCTL_UINT(_kern_boottrace, OID_AUTO, table_size, 162 CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &bt.size, 0, 163 "Boot-time tracing table size"); 164 165 /* 166 * Dump a trace to buffer or if buffer is NULL to console. 167 * 168 * Non-zero delta_threshold selectively prints entries based on delta 169 * with current and previous entry. Otherwise, delta_threshold of 0 170 * prints every trace entry and delta. 171 * 172 * Output something like this: 173 * 174 * CPU msecs delta process event 175 * 11 1228262715 0 init shutdown pre sync begin 176 * 3 1228265622 2907 init shutdown pre sync complete 177 * 3 1228265623 0 init shutdown turned swap off 178 * 18 1228266466 843 init shutdown unmounted all filesystems 179 * 180 * How to interpret: 181 * 182 * delta column represents the time in milliseconds between this event and the previous. 183 * Usually that means you can take the previous event, current event, match them 184 * up in the code, and whatever lies between is the culprit taking time. 185 * 186 * For example, above: Pre-Sync is taking 2907ms, and something between swap and unmount 187 * filesystems is taking 843 milliseconds. 188 * 189 * An event with a delta of 0 are 'landmark' events that simply exist in the output 190 * for the developer to know where the time measurement begins. The 0 is an arbitrary 191 * number that can effectively be ignored. 192 */ 193 #define BTC_DELTA_PRINT(bte, msecs, delta) do { \ 194 if (sbp) { \ 195 sbuf_printf(sbp, fmt, (bte)->cpuid, msecs, delta, \ 196 (bte)->tdname, (bte)->name, (bte)->pid, \ 197 (bte)->cputime / 1000000, \ 198 ((bte)->cputime % 1000000) / 10000, \ 199 (bte)->inblock, (bte)->oublock); \ 200 } else { \ 201 printf(fmt, (bte)->cpuid, msecs, delta, \ 202 (bte)->tdname, (bte)->name, (bte)->pid, \ 203 (bte)->cputime / 1000000, \ 204 ((bte)->cputime % 1000000) / 10000, \ 205 (bte)->inblock, (bte)->oublock); \ 206 } \ 207 } while (0) 208 209 /* 210 * Print the trace entries to the message buffer, or to an sbuf, if provided. 211 * 212 * Entries with a difference less than dthres will not be printed. 213 */ 214 static void 215 boottrace_display(struct sbuf *sbp, struct bt_table *btp, uint64_t dthres) 216 { 217 struct bt_event *evtp; 218 struct bt_event *last_evtp; 219 uint64_t msecs; 220 uint64_t first_msecs; 221 uint64_t last_msecs; 222 uint64_t dmsecs; 223 uint64_t last_dmsecs; 224 uint64_t total_dmsecs; 225 uint32_t i; 226 uint32_t curr; 227 const char *fmt = "%3u %10llu %10llu %-24s %-40s %5d %4d.%02d %5u %5u\n"; 228 const char *hdr_fmt = "\n\n%3s %10s %10s %-24s %-40s %5s %6s %5s %5s\n"; 229 bool printed; 230 bool last_printed; 231 232 /* Print the header */ 233 if (sbp != NULL) 234 sbuf_printf(sbp, hdr_fmt, 235 "CPU", "msecs", "delta", "process", 236 "event", "PID", "CPUtime", "IBlks", "OBlks"); 237 else 238 printf(hdr_fmt, 239 "CPU", "msecs", "delta", "process", 240 "event", "PID", "CPUtime", "IBlks", "OBlks"); 241 242 first_msecs = 0; 243 last_evtp = NULL; 244 last_msecs = 0; 245 last_dmsecs = 0; 246 last_printed = false; 247 i = curr = btp->curr; 248 249 do { 250 evtp = &btp->table[i]; 251 if (evtp->tsc == 0) 252 goto next; 253 254 msecs = cputick2usec(evtp->tick) / 1000; 255 dmsecs = (last_msecs != 0 && msecs > last_msecs) ? 256 msecs - last_msecs : 0; 257 printed = false; 258 259 /* 260 * If a threshold is defined, start filtering events by the 261 * delta of msecs. 262 */ 263 if (dthres != 0 && (dmsecs > dthres)) { 264 /* 265 * Print the previous entry as a landmark, even if it 266 * falls below the threshold. 267 */ 268 if (last_evtp != NULL && !last_printed) 269 BTC_DELTA_PRINT(last_evtp, last_msecs, 270 last_dmsecs); 271 BTC_DELTA_PRINT(evtp, msecs, dmsecs); 272 printed = true; 273 } else if (dthres == 0) { 274 BTC_DELTA_PRINT(evtp, msecs, dmsecs); 275 printed = true; 276 } 277 if (first_msecs == 0 || msecs < first_msecs) 278 first_msecs = msecs; 279 last_evtp = evtp; 280 last_msecs = msecs; 281 last_dmsecs = dmsecs; 282 last_printed = printed; 283 maybe_yield(); 284 next: 285 i = (i + 1) % btp->size; 286 } while (i != curr); 287 288 total_dmsecs = last_msecs > first_msecs ? 289 (last_msecs - first_msecs) : 0; 290 if (sbp != NULL) 291 sbuf_printf(sbp, "Total measured time: %ju msecs\n", 292 (uintmax_t)total_dmsecs); 293 else 294 printf("Total measured time: %ju msecs\n", 295 (uintmax_t)total_dmsecs); 296 } 297 298 /* 299 * Dump trace table entries to the console, given a delta threshold. 300 */ 301 void 302 boottrace_dump_console(void) 303 { 304 if (!boottrace_enabled) 305 return; 306 307 if (shutdown || rebooting || KERNEL_PANICKED()) { 308 boottrace_display(NULL, &st, shutdown_trace_threshold); 309 } else { 310 boottrace_display(NULL, &bt, 0); 311 boottrace_display(NULL, &rt, 0); 312 } 313 } 314 315 /* 316 * Records a new tracing event to the specified table. 317 * 318 * We don't use a lock because we want this to be callable from interrupt 319 * context. 320 */ 321 static int 322 dotrace(struct bt_table *btp, const char *eventname, const char *tdname) 323 { 324 uint32_t idx, nxt; 325 struct rusage usage; 326 327 MPASS(boottrace_enabled); 328 if (tdname == NULL) 329 tdname = (curproc->p_flag & P_SYSTEM) ? 330 curthread->td_name : curproc->p_comm; 331 332 dprintf("dotrace["); 333 dprintf("cpu=%u, pid=%d, tsc=%ju, tick=%d, td='%s', event='%s'", 334 PCPU_GET(cpuid), curthread->td_proc->p_pid, 335 (uintmax_t)get_cyclecount(), ticks, tdname, eventname); 336 if (btp->table == NULL) { 337 btp->drops_early++; 338 dprintf(", return=ENOSPC_1]\n"); 339 return (ENOSPC); 340 } 341 342 /* Claim a slot in the table. */ 343 do { 344 idx = btp->curr; 345 nxt = (idx + 1) % btp->size; 346 if (nxt == 0 && btp->wrap == 0) { 347 btp->drops_full++; 348 dprintf(", return=ENOSPC_2]\n"); 349 return (ENOSPC); 350 } 351 } while (!atomic_cmpset_int(&btp->curr, idx, nxt)); 352 353 btp->table[idx].cpuid = PCPU_GET(cpuid); 354 btp->table[idx].tsc = get_cyclecount(), 355 btp->table[idx].tick = cpu_ticks(); 356 btp->table[idx].pid = curthread->td_proc->p_pid; 357 358 /* 359 * Get the resource usage for the process. Don't attempt this for the 360 * kernel proc0 or for critical section activities. 361 */ 362 if ((curthread->td_proc == &proc0) || (curthread->td_critnest != 0)) { 363 btp->table[idx].cputime = 0; 364 btp->table[idx].inblock = 0; 365 btp->table[idx].oublock = 0; 366 } else { 367 kern_getrusage(curthread, RUSAGE_CHILDREN, &usage); 368 btp->table[idx].cputime = 369 (uint64_t)(usage.ru_utime.tv_sec * 1000000 + 370 usage.ru_utime.tv_usec + usage.ru_stime.tv_sec * 1000000 + 371 usage.ru_stime.tv_usec); 372 btp->table[idx].inblock = (uint32_t)usage.ru_inblock; 373 btp->table[idx].oublock = (uint32_t)usage.ru_oublock; 374 } 375 strlcpy(btp->table[idx].name, eventname, BT_EVENT_NAMELEN); 376 strlcpy(btp->table[idx].tdname, tdname, BT_EVENT_TDNAMELEN); 377 378 dprintf(", return=0]\n"); 379 return (0); 380 } 381 382 /* 383 * Log various boot-time events, with the trace message encoded using 384 * printf-like arguments. 385 */ 386 int 387 boottrace(const char *tdname, const char *fmt, ...) 388 { 389 char eventname[BT_EVENT_NAMELEN]; 390 struct bt_table *btp; 391 va_list ap; 392 393 if (!dotrace_kernel) 394 return (ENXIO); 395 396 va_start(ap, fmt); 397 vsnprintf(eventname, sizeof(eventname), fmt, ap); 398 va_end(ap); 399 400 btp = &bt; 401 if (bootdone) 402 btp = &rt; 403 if (shutdown || rebooting || KERNEL_PANICKED()) 404 btp = &st; 405 406 return (dotrace(btp, eventname, tdname)); 407 } 408 409 /* 410 * Log a run-time event & switch over to run-time tracing mode. 411 */ 412 static int 413 runtrace(const char *eventname, const char *tdname) 414 { 415 416 bootdone = true; 417 return (dotrace(&rt, eventname, tdname)); 418 } 419 420 /* 421 * Parse a boottrace message from userspace. 422 * 423 * The input from may contain a ':' to denote tdname. If not, tdname is 424 * inferred from the process' name. 425 * 426 * e.g. reboot(8):SIGINT to init(8) 427 */ 428 static void 429 boottrace_parse_message(char *message, char **eventname, char **tdname) 430 { 431 char *delim; 432 433 delim = strchr(message, ':'); 434 if (delim != NULL) { 435 *delim = '\0'; 436 *tdname = message; 437 *eventname = ++delim; 438 } else { 439 *tdname = curproc->p_comm; 440 *eventname = message; 441 } 442 } 443 444 static int 445 _boottrace_sysctl(struct bt_table *btp, struct sysctl_oid *oidp, 446 struct sysctl_req *req) 447 { 448 char message[BT_MSGLEN]; 449 char *eventname, *tdname; 450 int error; 451 452 if (!dotrace_user) 453 return (ENXIO); 454 455 message[0] = '\0'; 456 error = sysctl_handle_string(oidp, message, sizeof(message), req); 457 if (error) 458 return (error); 459 460 boottrace_parse_message(message, &eventname, &tdname); 461 error = dotrace(btp, eventname, tdname); 462 if (error == ENOSPC) { 463 /* Ignore table full error. */ 464 error = 0; 465 } 466 return (error); 467 } 468 469 static int 470 sysctl_log(SYSCTL_HANDLER_ARGS) 471 { 472 struct sbuf *sbuf; 473 int error; 474 475 if (!boottrace_enabled || req->newptr != NULL) 476 return (0); 477 478 sbuf = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND); 479 boottrace_display(sbuf, &bt, 0); 480 boottrace_display(sbuf, &rt, 0); 481 sbuf_finish(sbuf); 482 error = SYSCTL_OUT(req, sbuf_data(sbuf), sbuf_len(sbuf)); 483 sbuf_delete(sbuf); 484 485 return (error); 486 } 487 488 static int 489 sysctl_boottrace(SYSCTL_HANDLER_ARGS) 490 { 491 492 if (!boottrace_enabled) 493 return (0); 494 495 /* No output */ 496 if (req->newptr == NULL) 497 return (0); 498 499 /* Trace to rt if we have reached multi-user. */ 500 return (_boottrace_sysctl(bootdone ? &rt : &bt, oidp, req)); 501 } 502 503 /* 504 * Log a run-time event and switch over to run-time tracing mode. 505 */ 506 static int 507 sysctl_runtrace(SYSCTL_HANDLER_ARGS) 508 { 509 510 if (!boottrace_enabled) 511 return (0); 512 513 /* No output */ 514 if (req->newptr == NULL) 515 return (0); 516 517 bootdone = true; 518 return (_boottrace_sysctl(&rt, oidp, req)); 519 } 520 521 /* 522 * Log a shutdown-time event and switch over to shutdown tracing mode. 523 */ 524 static int 525 sysctl_shuttrace(SYSCTL_HANDLER_ARGS) 526 { 527 528 if (!boottrace_enabled) 529 return (0); 530 531 /* No output */ 532 if (req->newptr == NULL) 533 return (0); 534 535 shutdown = true; 536 return (_boottrace_sysctl(&st, oidp, req)); 537 } 538 539 /* 540 * Reset the runtime tracing buffer. 541 */ 542 void 543 boottrace_reset(const char *actor) 544 { 545 char tmpbuf[64]; 546 547 snprintf(tmpbuf, sizeof(tmpbuf), "reset: %s", actor); 548 runtrace(tmpbuf, NULL); 549 } 550 551 /* 552 * Note that a resize implies a reset, i.e., the index is reset to 0. 553 * We never shrink the array; we can only increase its size. 554 */ 555 int 556 boottrace_resize(u_int newsize) 557 { 558 559 if (newsize <= rt.size) { 560 return (EINVAL); 561 } 562 rt.table = realloc(rt.table, newsize * sizeof(struct bt_event), 563 M_BOOTTRACE, M_WAITOK | M_ZERO); 564 if (rt.table == NULL) 565 return (ENOMEM); 566 567 rt.size = newsize; 568 boottrace_reset("boottrace_resize"); 569 return (0); 570 } 571 572 static int 573 sysctl_boottrace_reset(SYSCTL_HANDLER_ARGS) 574 { 575 576 if (!boottrace_enabled) 577 return (0); 578 579 if (req->newptr != NULL) 580 boottrace_reset("sysctl_boottrace_reset"); 581 582 return (0); 583 } 584 585 static void 586 boottrace_init(void) 587 { 588 589 if (!boottrace_enabled) 590 return; 591 592 /* Boottime trace table */ 593 bt.size = BT_TABLE_DEFSIZE; 594 TUNABLE_INT_FETCH("kern.boottrace.table_size", &bt.size); 595 bt.size = max(bt.size, BT_TABLE_MINSIZE); 596 bt.table = malloc(bt.size * sizeof(struct bt_event), M_BOOTTRACE, 597 M_WAITOK | M_ZERO); 598 599 /* Stick in an initial entry. */ 600 bt.table[0].cpuid = PCPU_GET(cpuid); 601 strlcpy(bt.table[0].tdname, "boottime", BT_EVENT_TDNAMELEN); 602 strlcpy(bt.table[0].name, "initial event", BT_EVENT_NAMELEN); 603 bt.curr = 1; 604 605 /* Run-time trace table (may wrap-around). */ 606 rt.wrap = 1; 607 rt.size = BT_TABLE_RUNSIZE; 608 rt.table = malloc(rt.size * sizeof(struct bt_event), M_BOOTTRACE, 609 M_WAITOK | M_ZERO); 610 611 /* Shutdown trace table */ 612 st.size = BT_TABLE_SHTSIZE; 613 st.table = malloc(st.size * sizeof(struct bt_event), M_BOOTTRACE, 614 M_WAITOK | M_ZERO); 615 } 616 SYSINIT(boottrace, SI_SUB_CPU, SI_ORDER_ANY, boottrace_init, 0); 617