1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <stdio.h> 29 #include <stddef.h> 30 #include <stdlib.h> 31 #include <stdarg.h> 32 #include <string.h> 33 #include <strings.h> 34 #include <ctype.h> 35 #include <fcntl.h> 36 #include <unistd.h> 37 #include <errno.h> 38 #include <limits.h> 39 #include <sys/types.h> 40 #include <sys/modctl.h> 41 #include <sys/stat.h> 42 #include <sys/wait.h> 43 #include <dtrace.h> 44 #include <sys/lockstat.h> 45 #include <alloca.h> 46 #include <signal.h> 47 #include <assert.h> 48 49 #define LOCKSTAT_OPTSTR "x:bths:n:d:i:l:f:e:ckwWgCHEATID:RpPo:V" 50 51 #define LS_MAX_STACK_DEPTH 50 52 #define LS_MAX_EVENTS 64 53 54 typedef struct lsrec { 55 struct lsrec *ls_next; /* next in hash chain */ 56 uintptr_t ls_lock; /* lock address */ 57 uintptr_t ls_caller; /* caller address */ 58 uint32_t ls_count; /* cumulative event count */ 59 uint32_t ls_event; /* type of event */ 60 uintptr_t ls_refcnt; /* cumulative reference count */ 61 uint64_t ls_time; /* cumulative event duration */ 62 uint32_t ls_hist[64]; /* log2(duration) histogram */ 63 uintptr_t ls_stack[LS_MAX_STACK_DEPTH]; 64 } lsrec_t; 65 66 typedef struct lsdata { 67 struct lsrec *lsd_next; /* next available */ 68 int lsd_count; /* number of records */ 69 } lsdata_t; 70 71 /* 72 * Definitions for the types of experiments which can be run. They are 73 * listed in increasing order of memory cost and processing time cost. 74 * The numerical value of each type is the number of bytes needed per record. 75 */ 76 #define LS_BASIC offsetof(lsrec_t, ls_time) 77 #define LS_TIME offsetof(lsrec_t, ls_hist[0]) 78 #define LS_HIST offsetof(lsrec_t, ls_stack[0]) 79 #define LS_STACK(depth) offsetof(lsrec_t, ls_stack[depth]) 80 81 static void report_stats(FILE *, lsrec_t **, size_t, uint64_t, uint64_t); 82 static void report_trace(FILE *, lsrec_t **); 83 84 extern int symtab_init(void); 85 extern char *addr_to_sym(uintptr_t, uintptr_t *, size_t *); 86 extern uintptr_t sym_to_addr(char *name); 87 extern size_t sym_size(char *name); 88 extern char *strtok_r(char *, const char *, char **); 89 90 #define DEFAULT_NRECS 10000 91 #define DEFAULT_HZ 97 92 #define MAX_HZ 1000 93 #define MIN_AGGSIZE (16 * 1024) 94 #define MAX_AGGSIZE (32 * 1024 * 1024) 95 96 static int g_stkdepth; 97 static int g_topn = INT_MAX; 98 static hrtime_t g_elapsed; 99 static int g_rates = 0; 100 static int g_pflag = 0; 101 static int g_Pflag = 0; 102 static int g_wflag = 0; 103 static int g_Wflag = 0; 104 static int g_cflag = 0; 105 static int g_kflag = 0; 106 static int g_gflag = 0; 107 static int g_Vflag = 0; 108 static int g_tracing = 0; 109 static size_t g_recsize; 110 static size_t g_nrecs; 111 static int g_nrecs_used; 112 static uchar_t g_enabled[LS_MAX_EVENTS]; 113 static hrtime_t g_min_duration[LS_MAX_EVENTS]; 114 static dtrace_hdl_t *g_dtp; 115 static char *g_predicate; 116 static char *g_ipredicate; 117 static char *g_prog; 118 static int g_proglen; 119 static int g_dropped; 120 121 typedef struct ls_event_info { 122 char ev_type; 123 char ev_lhdr[20]; 124 char ev_desc[80]; 125 char ev_units[10]; 126 char ev_name[DTRACE_NAMELEN]; 127 char *ev_predicate; 128 char *ev_acquire; 129 } ls_event_info_t; 130 131 static ls_event_info_t g_event_info[LS_MAX_EVENTS] = { 132 { 'C', "Lock", "Adaptive mutex spin", "nsec", 133 "lockstat:::adaptive-spin" }, 134 { 'C', "Lock", "Adaptive mutex block", "nsec", 135 "lockstat:::adaptive-block" }, 136 { 'C', "Lock", "Spin lock spin", "nsec", 137 "lockstat:::spin-spin" }, 138 { 'C', "Lock", "Thread lock spin", "nsec", 139 "lockstat:::thread-spin" }, 140 { 'C', "Lock", "R/W writer blocked by writer", "nsec", 141 "lockstat:::rw-block", "arg2 == 0 && arg3 == 1" }, 142 { 'C', "Lock", "R/W writer blocked by readers", "nsec", 143 "lockstat:::rw-block", "arg2 == 0 && arg3 == 0 && arg4" }, 144 { 'C', "Lock", "R/W reader blocked by writer", "nsec", 145 "lockstat:::rw-block", "arg2 != 0 && arg3 == 1" }, 146 { 'C', "Lock", "R/W reader blocked by write wanted", "nsec", 147 "lockstat:::rw-block", "arg2 != 0 && arg3 == 0 && arg4" }, 148 { 'C', "Lock", "Unknown event (type 8)", "units" }, 149 { 'C', "Lock", "Unknown event (type 9)", "units" }, 150 { 'C', "Lock", "Unknown event (type 10)", "units" }, 151 { 'C', "Lock", "Unknown event (type 11)", "units" }, 152 { 'C', "Lock", "Unknown event (type 12)", "units" }, 153 { 'C', "Lock", "Unknown event (type 13)", "units" }, 154 { 'C', "Lock", "Unknown event (type 14)", "units" }, 155 { 'C', "Lock", "Unknown event (type 15)", "units" }, 156 { 'C', "Lock", "Unknown event (type 16)", "units" }, 157 { 'C', "Lock", "Unknown event (type 17)", "units" }, 158 { 'C', "Lock", "Unknown event (type 18)", "units" }, 159 { 'C', "Lock", "Unknown event (type 19)", "units" }, 160 { 'C', "Lock", "Unknown event (type 20)", "units" }, 161 { 'C', "Lock", "Unknown event (type 21)", "units" }, 162 { 'C', "Lock", "Unknown event (type 22)", "units" }, 163 { 'C', "Lock", "Unknown event (type 23)", "units" }, 164 { 'C', "Lock", "Unknown event (type 24)", "units" }, 165 { 'C', "Lock", "Unknown event (type 25)", "units" }, 166 { 'C', "Lock", "Unknown event (type 26)", "units" }, 167 { 'C', "Lock", "Unknown event (type 27)", "units" }, 168 { 'C', "Lock", "Unknown event (type 28)", "units" }, 169 { 'C', "Lock", "Unknown event (type 29)", "units" }, 170 { 'C', "Lock", "Unknown event (type 30)", "units" }, 171 { 'C', "Lock", "Unknown event (type 31)", "units" }, 172 { 'H', "Lock", "Adaptive mutex hold", "nsec", 173 "lockstat:::adaptive-release", NULL, 174 "lockstat:::adaptive-acquire" }, 175 { 'H', "Lock", "Spin lock hold", "nsec", 176 "lockstat:::spin-release", NULL, 177 "lockstat:::spin-acquire" }, 178 { 'H', "Lock", "R/W writer hold", "nsec", 179 "lockstat:::rw-release", "arg1 == 0", 180 "lockstat:::rw-acquire" }, 181 { 'H', "Lock", "R/W reader hold", "nsec", 182 "lockstat:::rw-release", "arg1 != 0", 183 "lockstat:::rw-acquire" }, 184 { 'H', "Lock", "Unknown event (type 36)", "units" }, 185 { 'H', "Lock", "Unknown event (type 37)", "units" }, 186 { 'H', "Lock", "Unknown event (type 38)", "units" }, 187 { 'H', "Lock", "Unknown event (type 39)", "units" }, 188 { 'H', "Lock", "Unknown event (type 40)", "units" }, 189 { 'H', "Lock", "Unknown event (type 41)", "units" }, 190 { 'H', "Lock", "Unknown event (type 42)", "units" }, 191 { 'H', "Lock", "Unknown event (type 43)", "units" }, 192 { 'H', "Lock", "Unknown event (type 44)", "units" }, 193 { 'H', "Lock", "Unknown event (type 45)", "units" }, 194 { 'H', "Lock", "Unknown event (type 46)", "units" }, 195 { 'H', "Lock", "Unknown event (type 47)", "units" }, 196 { 'H', "Lock", "Unknown event (type 48)", "units" }, 197 { 'H', "Lock", "Unknown event (type 49)", "units" }, 198 { 'H', "Lock", "Unknown event (type 50)", "units" }, 199 { 'H', "Lock", "Unknown event (type 51)", "units" }, 200 { 'H', "Lock", "Unknown event (type 52)", "units" }, 201 { 'H', "Lock", "Unknown event (type 53)", "units" }, 202 { 'H', "Lock", "Unknown event (type 54)", "units" }, 203 { 'H', "Lock", "Unknown event (type 55)", "units" }, 204 { 'I', "CPU+PIL", "Profiling interrupt", "nsec", 205 "profile:::profile-97", NULL }, 206 { 'I', "Lock", "Unknown event (type 57)", "units" }, 207 { 'I', "Lock", "Unknown event (type 58)", "units" }, 208 { 'I', "Lock", "Unknown event (type 59)", "units" }, 209 { 'E', "Lock", "Recursive lock entry detected", "(N/A)", 210 "lockstat:::rw-release", NULL, "lockstat:::rw-acquire" }, 211 { 'E', "Lock", "Lockstat enter failure", "(N/A)" }, 212 { 'E', "Lock", "Lockstat exit failure", "nsec" }, 213 { 'E', "Lock", "Lockstat record failure", "(N/A)" }, 214 }; 215 216 static void 217 fail(int do_perror, const char *message, ...) 218 { 219 va_list args; 220 int save_errno = errno; 221 222 va_start(args, message); 223 (void) fprintf(stderr, "lockstat: "); 224 (void) vfprintf(stderr, message, args); 225 va_end(args); 226 if (do_perror) 227 (void) fprintf(stderr, ": %s", strerror(save_errno)); 228 (void) fprintf(stderr, "\n"); 229 exit(2); 230 } 231 232 static void 233 dfail(const char *message, ...) 234 { 235 va_list args; 236 237 va_start(args, message); 238 (void) fprintf(stderr, "lockstat: "); 239 (void) vfprintf(stderr, message, args); 240 va_end(args); 241 (void) fprintf(stderr, ": %s\n", 242 dtrace_errmsg(g_dtp, dtrace_errno(g_dtp))); 243 244 exit(2); 245 } 246 247 static void 248 show_events(char event_type, char *desc) 249 { 250 int i, first = -1, last; 251 252 for (i = 0; i < LS_MAX_EVENTS; i++) { 253 ls_event_info_t *evp = &g_event_info[i]; 254 if (evp->ev_type != event_type || 255 strncmp(evp->ev_desc, "Unknown event", 13) == 0) 256 continue; 257 if (first == -1) 258 first = i; 259 last = i; 260 } 261 262 (void) fprintf(stderr, 263 "\n%s events (lockstat -%c or lockstat -e %d-%d):\n\n", 264 desc, event_type, first, last); 265 266 for (i = first; i <= last; i++) 267 (void) fprintf(stderr, 268 "%4d = %s\n", i, g_event_info[i].ev_desc); 269 } 270 271 static void 272 usage(void) 273 { 274 (void) fprintf(stderr, 275 "Usage: lockstat [options] command [args]\n" 276 "\nEvent selection options:\n\n" 277 " -C watch contention events [on by default]\n" 278 " -E watch error events [off by default]\n" 279 " -H watch hold events [off by default]\n" 280 " -I watch interrupt events [off by default]\n" 281 " -A watch all lock events [equivalent to -CH]\n" 282 " -e event_list only watch the specified events (shown below);\n" 283 " <event_list> is a comma-separated list of\n" 284 " events or ranges of events, e.g. 1,4-7,35\n" 285 " -i rate interrupt rate for -I [default: %d Hz]\n" 286 "\nData gathering options:\n\n" 287 " -b basic statistics (lock, caller, event count)\n" 288 " -t timing for all events [default]\n" 289 " -h histograms for event times\n" 290 " -s depth stack traces <depth> deep\n" 291 " -x opt[=val] enable or modify DTrace options\n" 292 "\nData filtering options:\n\n" 293 " -n nrecords maximum number of data records [default: %d]\n" 294 " -l lock[,size] only watch <lock>, which can be specified as a\n" 295 " symbolic name or hex address; <size> defaults\n" 296 " to the ELF symbol size if available, 1 if not\n" 297 " -f func[,size] only watch events generated by <func>\n" 298 " -d duration only watch events longer than <duration>\n" 299 " -T trace (rather than sample) events\n" 300 "\nData reporting options:\n\n" 301 " -c coalesce lock data for arrays like pse_mutex[]\n" 302 " -k coalesce PCs within functions\n" 303 " -g show total events generated by function\n" 304 " -w wherever: don't distinguish events by caller\n" 305 " -W whichever: don't distinguish events by lock\n" 306 " -R display rates rather than counts\n" 307 " -p parsable output format (awk(1)-friendly)\n" 308 " -P sort lock data by (count * avg_time) product\n" 309 " -D n only display top <n> events of each type\n" 310 " -o filename send output to <filename>\n", 311 DEFAULT_HZ, DEFAULT_NRECS); 312 313 show_events('C', "Contention"); 314 show_events('H', "Hold-time"); 315 show_events('I', "Interrupt"); 316 show_events('E', "Error"); 317 (void) fprintf(stderr, "\n"); 318 319 exit(1); 320 } 321 322 static int 323 lockcmp(lsrec_t *a, lsrec_t *b) 324 { 325 int i; 326 327 if (a->ls_event < b->ls_event) 328 return (-1); 329 if (a->ls_event > b->ls_event) 330 return (1); 331 332 for (i = g_stkdepth - 1; i >= 0; i--) { 333 if (a->ls_stack[i] < b->ls_stack[i]) 334 return (-1); 335 if (a->ls_stack[i] > b->ls_stack[i]) 336 return (1); 337 } 338 339 if (a->ls_caller < b->ls_caller) 340 return (-1); 341 if (a->ls_caller > b->ls_caller) 342 return (1); 343 344 if (a->ls_lock < b->ls_lock) 345 return (-1); 346 if (a->ls_lock > b->ls_lock) 347 return (1); 348 349 return (0); 350 } 351 352 static int 353 countcmp(lsrec_t *a, lsrec_t *b) 354 { 355 if (a->ls_event < b->ls_event) 356 return (-1); 357 if (a->ls_event > b->ls_event) 358 return (1); 359 360 return (b->ls_count - a->ls_count); 361 } 362 363 static int 364 timecmp(lsrec_t *a, lsrec_t *b) 365 { 366 if (a->ls_event < b->ls_event) 367 return (-1); 368 if (a->ls_event > b->ls_event) 369 return (1); 370 371 if (a->ls_time < b->ls_time) 372 return (1); 373 if (a->ls_time > b->ls_time) 374 return (-1); 375 376 return (0); 377 } 378 379 static int 380 lockcmp_anywhere(lsrec_t *a, lsrec_t *b) 381 { 382 if (a->ls_event < b->ls_event) 383 return (-1); 384 if (a->ls_event > b->ls_event) 385 return (1); 386 387 if (a->ls_lock < b->ls_lock) 388 return (-1); 389 if (a->ls_lock > b->ls_lock) 390 return (1); 391 392 return (0); 393 } 394 395 static int 396 lock_and_count_cmp_anywhere(lsrec_t *a, lsrec_t *b) 397 { 398 if (a->ls_event < b->ls_event) 399 return (-1); 400 if (a->ls_event > b->ls_event) 401 return (1); 402 403 if (a->ls_lock < b->ls_lock) 404 return (-1); 405 if (a->ls_lock > b->ls_lock) 406 return (1); 407 408 return (b->ls_count - a->ls_count); 409 } 410 411 static int 412 sitecmp_anylock(lsrec_t *a, lsrec_t *b) 413 { 414 int i; 415 416 if (a->ls_event < b->ls_event) 417 return (-1); 418 if (a->ls_event > b->ls_event) 419 return (1); 420 421 for (i = g_stkdepth - 1; i >= 0; i--) { 422 if (a->ls_stack[i] < b->ls_stack[i]) 423 return (-1); 424 if (a->ls_stack[i] > b->ls_stack[i]) 425 return (1); 426 } 427 428 if (a->ls_caller < b->ls_caller) 429 return (-1); 430 if (a->ls_caller > b->ls_caller) 431 return (1); 432 433 return (0); 434 } 435 436 static int 437 site_and_count_cmp_anylock(lsrec_t *a, lsrec_t *b) 438 { 439 int i; 440 441 if (a->ls_event < b->ls_event) 442 return (-1); 443 if (a->ls_event > b->ls_event) 444 return (1); 445 446 for (i = g_stkdepth - 1; i >= 0; i--) { 447 if (a->ls_stack[i] < b->ls_stack[i]) 448 return (-1); 449 if (a->ls_stack[i] > b->ls_stack[i]) 450 return (1); 451 } 452 453 if (a->ls_caller < b->ls_caller) 454 return (-1); 455 if (a->ls_caller > b->ls_caller) 456 return (1); 457 458 return (b->ls_count - a->ls_count); 459 } 460 461 static void 462 mergesort(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **a, lsrec_t **b, int n) 463 { 464 int m = n / 2; 465 int i, j; 466 467 if (m > 1) 468 mergesort(cmp, a, b, m); 469 if (n - m > 1) 470 mergesort(cmp, a + m, b + m, n - m); 471 for (i = m; i > 0; i--) 472 b[i - 1] = a[i - 1]; 473 for (j = m - 1; j < n - 1; j++) 474 b[n + m - j - 2] = a[j + 1]; 475 while (i < j) 476 *a++ = cmp(b[i], b[j]) < 0 ? b[i++] : b[j--]; 477 *a = b[i]; 478 } 479 480 static void 481 coalesce(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **lock, int n) 482 { 483 int i, j; 484 lsrec_t *target, *current; 485 486 target = lock[0]; 487 488 for (i = 1; i < n; i++) { 489 current = lock[i]; 490 if (cmp(current, target) != 0) { 491 target = current; 492 continue; 493 } 494 current->ls_event = LS_MAX_EVENTS; 495 target->ls_count += current->ls_count; 496 target->ls_refcnt += current->ls_refcnt; 497 if (g_recsize < LS_TIME) 498 continue; 499 target->ls_time += current->ls_time; 500 if (g_recsize < LS_HIST) 501 continue; 502 for (j = 0; j < 64; j++) 503 target->ls_hist[j] += current->ls_hist[j]; 504 } 505 } 506 507 static void 508 coalesce_symbol(uintptr_t *addrp) 509 { 510 uintptr_t symoff; 511 size_t symsize; 512 513 if (addr_to_sym(*addrp, &symoff, &symsize) != NULL && symoff < symsize) 514 *addrp -= symoff; 515 } 516 517 static void 518 predicate_add(char **pred, char *what, char *cmp, uintptr_t value) 519 { 520 char *new; 521 int len, newlen; 522 523 if (what == NULL) 524 return; 525 526 if (*pred == NULL) { 527 *pred = malloc(1); 528 *pred[0] = '\0'; 529 } 530 531 len = strlen(*pred); 532 newlen = len + strlen(what) + 32 + strlen("( && )"); 533 new = malloc(newlen); 534 535 if (*pred[0] != '\0') { 536 if (cmp != NULL) { 537 (void) sprintf(new, "(%s) && (%s %s 0x%p)", 538 *pred, what, cmp, (void *)value); 539 } else { 540 (void) sprintf(new, "(%s) && (%s)", *pred, what); 541 } 542 } else { 543 if (cmp != NULL) { 544 (void) sprintf(new, "%s %s 0x%p", 545 what, cmp, (void *)value); 546 } else { 547 (void) sprintf(new, "%s", what); 548 } 549 } 550 551 free(*pred); 552 *pred = new; 553 } 554 555 static void 556 predicate_destroy(char **pred) 557 { 558 free(*pred); 559 *pred = NULL; 560 } 561 562 static void 563 filter_add(char **filt, char *what, uintptr_t base, uintptr_t size) 564 { 565 char buf[256], *c = buf, *new; 566 int len, newlen; 567 568 if (*filt == NULL) { 569 *filt = malloc(1); 570 *filt[0] = '\0'; 571 } 572 573 (void) sprintf(c, "%s(%s >= 0x%p && %s < 0x%p)", *filt[0] != '\0' ? 574 " || " : "", what, (void *)base, what, (void *)(base + size)); 575 576 newlen = (len = strlen(*filt) + 1) + strlen(c); 577 new = malloc(newlen); 578 bcopy(*filt, new, len); 579 (void) strcat(new, c); 580 free(*filt); 581 *filt = new; 582 } 583 584 static void 585 filter_destroy(char **filt) 586 { 587 free(*filt); 588 *filt = NULL; 589 } 590 591 static void 592 dprog_add(const char *fmt, ...) 593 { 594 va_list args; 595 int size, offs; 596 char c; 597 598 va_start(args, fmt); 599 size = vsnprintf(&c, 1, fmt, args) + 1; 600 601 if (g_proglen == 0) { 602 offs = 0; 603 } else { 604 offs = g_proglen - 1; 605 } 606 607 g_proglen = offs + size; 608 609 if ((g_prog = realloc(g_prog, g_proglen)) == NULL) 610 fail(1, "failed to reallocate program text"); 611 612 (void) vsnprintf(&g_prog[offs], size, fmt, args); 613 } 614 615 /* 616 * This function may read like an open sewer, but keep in mind that programs 617 * that generate other programs are rarely pretty. If one has the unenviable 618 * task of maintaining or -- worse -- extending this code, use the -V option 619 * to examine the D program as generated by this function. 620 */ 621 static void 622 dprog_addevent(int event) 623 { 624 ls_event_info_t *info = &g_event_info[event]; 625 char *pred = NULL; 626 char stack[20]; 627 const char *arg0, *caller; 628 char *arg1 = "arg1"; 629 char buf[80]; 630 hrtime_t dur; 631 int depth; 632 633 if (info->ev_name[0] == '\0') 634 return; 635 636 if (info->ev_type == 'I') { 637 /* 638 * For interrupt events, arg0 (normally the lock pointer) is 639 * the CPU address plus the current pil, and arg1 (normally 640 * the number of nanoseconds) is the number of nanoseconds 641 * late -- and it's stored in arg2. 642 */ 643 arg0 = "(uintptr_t)curthread->t_cpu + \n" 644 "\t curthread->t_cpu->cpu_profile_pil"; 645 caller = "(uintptr_t)arg0"; 646 arg1 = "arg2"; 647 } else { 648 arg0 = "(uintptr_t)arg0"; 649 caller = "caller"; 650 } 651 652 if (g_recsize > LS_HIST) { 653 for (depth = 0; g_recsize > LS_STACK(depth); depth++) 654 continue; 655 656 if (g_tracing) { 657 (void) sprintf(stack, "\tstack(%d);\n", depth); 658 } else { 659 (void) sprintf(stack, ", stack(%d)", depth); 660 } 661 } else { 662 (void) sprintf(stack, ""); 663 } 664 665 if (info->ev_acquire != NULL) { 666 /* 667 * If this is a hold event, we need to generate an additional 668 * clause for the acquire; the clause for the release will be 669 * generated with the aggregating statement, below. 670 */ 671 dprog_add("%s\n", info->ev_acquire); 672 predicate_add(&pred, info->ev_predicate, NULL, 0); 673 predicate_add(&pred, g_predicate, NULL, 0); 674 if (pred != NULL) 675 dprog_add("/%s/\n", pred); 676 677 dprog_add("{\n"); 678 (void) sprintf(buf, "self->ev%d[(uintptr_t)arg0]", event); 679 680 if (info->ev_type == 'H') { 681 dprog_add("\t%s = timestamp;\n", buf); 682 } else { 683 /* 684 * If this isn't a hold event, it's the recursive 685 * error event. For this, we simply bump the 686 * thread-local, per-lock count. 687 */ 688 dprog_add("\t%s++;\n", buf); 689 } 690 691 dprog_add("}\n\n"); 692 predicate_destroy(&pred); 693 pred = NULL; 694 695 if (info->ev_type == 'E') { 696 /* 697 * If this is the recursive lock error event, we need 698 * to generate an additional clause to decrement the 699 * thread-local, per-lock count. This assures that we 700 * only execute the aggregating clause if we have 701 * recursive entry. 702 */ 703 dprog_add("%s\n", info->ev_name); 704 dprog_add("/%s/\n{\n\t%s--;\n}\n\n", buf, buf); 705 } 706 707 predicate_add(&pred, buf, NULL, 0); 708 709 if (info->ev_type == 'H') { 710 (void) sprintf(buf, "timestamp -\n\t " 711 "self->ev%d[(uintptr_t)arg0]", event); 712 } 713 714 arg1 = buf; 715 } else { 716 predicate_add(&pred, info->ev_predicate, NULL, 0); 717 if (info->ev_type != 'I') 718 predicate_add(&pred, g_predicate, NULL, 0); 719 else 720 predicate_add(&pred, g_ipredicate, NULL, 0); 721 } 722 723 if ((dur = g_min_duration[event]) != 0) 724 predicate_add(&pred, arg1, ">=", dur); 725 726 dprog_add("%s\n", info->ev_name); 727 728 if (pred != NULL) 729 dprog_add("/%s/\n", pred); 730 predicate_destroy(&pred); 731 732 dprog_add("{\n"); 733 734 if (g_tracing) { 735 dprog_add("\ttrace(%dULL);\n", event); 736 dprog_add("\ttrace(%s);\n", arg0); 737 dprog_add("\ttrace(%s);\n", caller); 738 dprog_add(stack); 739 } else { 740 /* 741 * The ordering here is important: when we process the 742 * aggregate, we count on the fact that @avg appears before 743 * @hist in program order to assure that @avg is assigned the 744 * first aggregation variable ID and @hist assigned the 745 * second; see the comment in process_aggregate() for details. 746 */ 747 dprog_add("\t@avg[%dULL, %s, %s%s] = avg(%s);\n", 748 event, arg0, caller, stack, arg1); 749 750 if (g_recsize >= LS_HIST) { 751 dprog_add("\t@hist[%dULL, %s, %s%s] = quantize" 752 "(%s);\n", event, arg0, caller, stack, arg1); 753 } 754 } 755 756 if (info->ev_acquire != NULL) 757 dprog_add("\tself->ev%d[arg0] = 0;\n", event); 758 759 dprog_add("}\n\n"); 760 } 761 762 static void 763 dprog_compile() 764 { 765 dtrace_prog_t *prog; 766 dtrace_proginfo_t info; 767 768 if (g_Vflag) { 769 (void) fprintf(stderr, "lockstat: vvvv D program vvvv\n"); 770 (void) fputs(g_prog, stderr); 771 (void) fprintf(stderr, "lockstat: ^^^^ D program ^^^^\n"); 772 } 773 774 if ((prog = dtrace_program_strcompile(g_dtp, g_prog, 775 DTRACE_PROBESPEC_NAME, 0, 0, NULL)) == NULL) 776 dfail("failed to compile program"); 777 778 if (dtrace_program_exec(g_dtp, prog, &info) == -1) 779 dfail("failed to enable probes"); 780 781 if (dtrace_go(g_dtp) != 0) 782 dfail("couldn't start tracing"); 783 } 784 785 static void 786 status_fire(void) 787 {} 788 789 static void 790 status_init(void) 791 { 792 dtrace_optval_t val, status, agg; 793 struct sigaction act; 794 struct itimerspec ts; 795 struct sigevent ev; 796 timer_t tid; 797 798 if (dtrace_getopt(g_dtp, "statusrate", &status) == -1) 799 dfail("failed to get 'statusrate'"); 800 801 if (dtrace_getopt(g_dtp, "aggrate", &agg) == -1) 802 dfail("failed to get 'statusrate'"); 803 804 /* 805 * We would want to awaken at a rate that is the GCD of the statusrate 806 * and the aggrate -- but that seems a bit absurd. Instead, we'll 807 * simply awaken at a rate that is the more frequent of the two, which 808 * assures that we're never later than the interval implied by the 809 * more frequent rate. 810 */ 811 val = status < agg ? status : agg; 812 813 (void) sigemptyset(&act.sa_mask); 814 act.sa_flags = 0; 815 act.sa_handler = status_fire; 816 (void) sigaction(SIGUSR1, &act, NULL); 817 818 ev.sigev_notify = SIGEV_SIGNAL; 819 ev.sigev_signo = SIGUSR1; 820 821 if (timer_create(CLOCK_REALTIME, &ev, &tid) == -1) 822 dfail("cannot create CLOCK_REALTIME timer"); 823 824 ts.it_value.tv_sec = val / NANOSEC; 825 ts.it_value.tv_nsec = val % NANOSEC; 826 ts.it_interval = ts.it_value; 827 828 if (timer_settime(tid, TIMER_RELTIME, &ts, NULL) == -1) 829 dfail("cannot set time on CLOCK_REALTIME timer"); 830 } 831 832 static void 833 status_check(void) 834 { 835 if (!g_tracing && dtrace_aggregate_snap(g_dtp) != 0) 836 dfail("failed to snap aggregate"); 837 838 if (dtrace_status(g_dtp) == -1) 839 dfail("dtrace_status()"); 840 } 841 842 static void 843 lsrec_fill(lsrec_t *lsrec, const dtrace_recdesc_t *rec, int nrecs, caddr_t data) 844 { 845 bzero(lsrec, g_recsize); 846 lsrec->ls_count = 1; 847 848 if ((g_recsize > LS_HIST && nrecs < 4) || (nrecs < 3)) 849 fail(0, "truncated DTrace record"); 850 851 if (rec->dtrd_size != sizeof (uint64_t)) 852 fail(0, "bad event size in first record"); 853 854 /* LINTED - alignment */ 855 lsrec->ls_event = (uint32_t)*((uint64_t *)(data + rec->dtrd_offset)); 856 rec++; 857 858 if (rec->dtrd_size != sizeof (uintptr_t)) 859 fail(0, "bad lock address size in second record"); 860 861 /* LINTED - alignment */ 862 lsrec->ls_lock = *((uintptr_t *)(data + rec->dtrd_offset)); 863 rec++; 864 865 if (rec->dtrd_size != sizeof (uintptr_t)) 866 fail(0, "bad caller size in third record"); 867 868 /* LINTED - alignment */ 869 lsrec->ls_caller = *((uintptr_t *)(data + rec->dtrd_offset)); 870 rec++; 871 872 if (g_recsize > LS_HIST) { 873 int frames, i; 874 pc_t *stack; 875 876 frames = rec->dtrd_size / sizeof (pc_t); 877 /* LINTED - alignment */ 878 stack = (pc_t *)(data + rec->dtrd_offset); 879 880 for (i = 1; i < frames; i++) 881 lsrec->ls_stack[i - 1] = stack[i]; 882 } 883 } 884 885 /*ARGSUSED*/ 886 static int 887 count_aggregate(const dtrace_aggdata_t *agg, void *arg) 888 { 889 *((size_t *)arg) += 1; 890 891 return (DTRACE_AGGWALK_NEXT); 892 } 893 894 static int 895 process_aggregate(const dtrace_aggdata_t *agg, void *arg) 896 { 897 const dtrace_aggdesc_t *aggdesc = agg->dtada_desc; 898 caddr_t data = agg->dtada_data; 899 lsdata_t *lsdata = arg; 900 lsrec_t *lsrec = lsdata->lsd_next; 901 const dtrace_recdesc_t *rec; 902 uint64_t *avg, *quantized; 903 int i, j; 904 905 assert(lsdata->lsd_count < g_nrecs); 906 907 /* 908 * Aggregation variable IDs are guaranteed to be generated in program 909 * order, and they are guaranteed to start from DTRACE_AGGVARIDNONE 910 * plus one. As "avg" appears before "hist" in program order, we know 911 * that "avg" will be allocated the first aggregation variable ID, and 912 * "hist" will be allocated the second aggregation variable ID -- and 913 * we therefore use the aggregation variable ID to differentiate the 914 * cases. 915 */ 916 if (aggdesc->dtagd_varid > DTRACE_AGGVARIDNONE + 1) { 917 /* 918 * If this is the histogram entry. We'll copy the quantized 919 * data into lc_hist, and jump over the rest. 920 */ 921 rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1]; 922 923 if (aggdesc->dtagd_varid != DTRACE_AGGVARIDNONE + 2) 924 fail(0, "bad variable ID in aggregation record"); 925 926 if (rec->dtrd_size != 927 DTRACE_QUANTIZE_NBUCKETS * sizeof (uint64_t)) 928 fail(0, "bad quantize size in aggregation record"); 929 930 /* LINTED - alignment */ 931 quantized = (uint64_t *)(data + rec->dtrd_offset); 932 933 for (i = DTRACE_QUANTIZE_ZEROBUCKET, j = 0; 934 i < DTRACE_QUANTIZE_NBUCKETS; i++, j++) 935 lsrec->ls_hist[j] = quantized[i]; 936 937 goto out; 938 } 939 940 lsrec_fill(lsrec, &aggdesc->dtagd_rec[1], 941 aggdesc->dtagd_nrecs - 1, data); 942 943 rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1]; 944 945 if (rec->dtrd_size != 2 * sizeof (uint64_t)) 946 fail(0, "bad avg size in aggregation record"); 947 948 /* LINTED - alignment */ 949 avg = (uint64_t *)(data + rec->dtrd_offset); 950 lsrec->ls_count = (uint32_t)avg[0]; 951 lsrec->ls_time = (uintptr_t)avg[1]; 952 953 if (g_recsize >= LS_HIST) 954 return (DTRACE_AGGWALK_NEXT); 955 956 out: 957 lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize); 958 lsdata->lsd_count++; 959 960 return (DTRACE_AGGWALK_NEXT); 961 } 962 963 static int 964 process_trace(const dtrace_probedata_t *pdata, void *arg) 965 { 966 lsdata_t *lsdata = arg; 967 lsrec_t *lsrec = lsdata->lsd_next; 968 dtrace_eprobedesc_t *edesc = pdata->dtpda_edesc; 969 caddr_t data = pdata->dtpda_data; 970 971 if (lsdata->lsd_count >= g_nrecs) 972 return (DTRACE_CONSUME_NEXT); 973 974 lsrec_fill(lsrec, edesc->dtepd_rec, edesc->dtepd_nrecs, data); 975 976 lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize); 977 lsdata->lsd_count++; 978 979 return (DTRACE_CONSUME_NEXT); 980 } 981 982 static int 983 process_data(FILE *out, char *data) 984 { 985 lsdata_t lsdata; 986 987 /* LINTED - alignment */ 988 lsdata.lsd_next = (lsrec_t *)data; 989 lsdata.lsd_count = 0; 990 991 if (g_tracing) { 992 if (dtrace_consume(g_dtp, out, 993 process_trace, NULL, &lsdata) != 0) 994 dfail("failed to consume buffer"); 995 996 return (lsdata.lsd_count); 997 } 998 999 if (dtrace_aggregate_walk_keyvarsorted(g_dtp, 1000 process_aggregate, &lsdata) != 0) 1001 dfail("failed to walk aggregate"); 1002 1003 return (lsdata.lsd_count); 1004 } 1005 1006 /*ARGSUSED*/ 1007 static int 1008 drophandler(const dtrace_dropdata_t *data, void *arg) 1009 { 1010 g_dropped++; 1011 (void) fprintf(stderr, "lockstat: warning: %s", data->dtdda_msg); 1012 return (DTRACE_HANDLE_OK); 1013 } 1014 1015 int 1016 main(int argc, char **argv) 1017 { 1018 char *data_buf; 1019 lsrec_t *lsp, **current, **first, **sort_buf, **merge_buf; 1020 FILE *out = stdout; 1021 char c; 1022 pid_t child; 1023 int status; 1024 int i, j; 1025 hrtime_t duration; 1026 char *addrp, *offp, *sizep, *evp, *lastp, *p; 1027 uintptr_t addr; 1028 size_t size, off; 1029 int events_specified = 0; 1030 int exec_errno = 0; 1031 uint32_t event; 1032 char *filt = NULL, *ifilt = NULL; 1033 static uint64_t ev_count[LS_MAX_EVENTS + 1]; 1034 static uint64_t ev_time[LS_MAX_EVENTS + 1]; 1035 dtrace_optval_t aggsize; 1036 char aggstr[10]; 1037 long ncpus; 1038 int dynvar = 0; 1039 int err; 1040 1041 if ((g_dtp = dtrace_open(DTRACE_VERSION, 0, &err)) == NULL) { 1042 fail(0, "cannot open dtrace library: %s", 1043 dtrace_errmsg(NULL, err)); 1044 } 1045 1046 if (dtrace_handle_drop(g_dtp, &drophandler, NULL) == -1) 1047 dfail("couldn't establish drop handler"); 1048 1049 if (symtab_init() == -1) 1050 fail(1, "can't load kernel symbols"); 1051 1052 g_nrecs = DEFAULT_NRECS; 1053 1054 while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != EOF) { 1055 switch (c) { 1056 case 'b': 1057 g_recsize = LS_BASIC; 1058 break; 1059 1060 case 't': 1061 g_recsize = LS_TIME; 1062 break; 1063 1064 case 'h': 1065 g_recsize = LS_HIST; 1066 break; 1067 1068 case 's': 1069 if (!isdigit(optarg[0])) 1070 usage(); 1071 g_stkdepth = atoi(optarg); 1072 if (g_stkdepth > LS_MAX_STACK_DEPTH) 1073 fail(0, "max stack depth is %d", 1074 LS_MAX_STACK_DEPTH); 1075 g_recsize = LS_STACK(g_stkdepth); 1076 break; 1077 1078 case 'n': 1079 if (!isdigit(optarg[0])) 1080 usage(); 1081 g_nrecs = atoi(optarg); 1082 break; 1083 1084 case 'd': 1085 if (!isdigit(optarg[0])) 1086 usage(); 1087 duration = atoll(optarg); 1088 1089 /* 1090 * XXX -- durations really should be per event 1091 * since the units are different, but it's hard 1092 * to express this nicely in the interface. 1093 * Not clear yet what the cleanest solution is. 1094 */ 1095 for (i = 0; i < LS_MAX_EVENTS; i++) 1096 if (g_event_info[i].ev_type != 'E') 1097 g_min_duration[i] = duration; 1098 1099 break; 1100 1101 case 'i': 1102 if (!isdigit(optarg[0])) 1103 usage(); 1104 i = atoi(optarg); 1105 if (i <= 0) 1106 usage(); 1107 if (i > MAX_HZ) 1108 fail(0, "max interrupt rate is %d Hz", MAX_HZ); 1109 1110 for (j = 0; j < LS_MAX_EVENTS; j++) 1111 if (strcmp(g_event_info[j].ev_desc, 1112 "Profiling interrupt") == 0) 1113 break; 1114 1115 (void) sprintf(g_event_info[j].ev_name, 1116 "profile:::profile-%d", i); 1117 break; 1118 1119 case 'l': 1120 case 'f': 1121 addrp = strtok(optarg, ","); 1122 sizep = strtok(NULL, ","); 1123 addrp = strtok(optarg, ",+"); 1124 offp = strtok(NULL, ","); 1125 1126 size = sizep ? strtoul(sizep, NULL, 0) : 1; 1127 off = offp ? strtoul(offp, NULL, 0) : 0; 1128 1129 if (addrp[0] == '0') { 1130 addr = strtoul(addrp, NULL, 16) + off; 1131 } else { 1132 addr = sym_to_addr(addrp) + off; 1133 if (sizep == NULL) 1134 size = sym_size(addrp) - off; 1135 if (addr - off == 0) 1136 fail(0, "symbol '%s' not found", addrp); 1137 if (size == 0) 1138 size = 1; 1139 } 1140 1141 1142 if (c == 'l') { 1143 filter_add(&filt, "arg0", addr, size); 1144 } else { 1145 filter_add(&filt, "caller", addr, size); 1146 filter_add(&ifilt, "arg0", addr, size); 1147 } 1148 break; 1149 1150 case 'e': 1151 evp = strtok_r(optarg, ",", &lastp); 1152 while (evp) { 1153 int ev1, ev2; 1154 char *evp2; 1155 1156 (void) strtok(evp, "-"); 1157 evp2 = strtok(NULL, "-"); 1158 ev1 = atoi(evp); 1159 ev2 = evp2 ? atoi(evp2) : ev1; 1160 if ((uint_t)ev1 >= LS_MAX_EVENTS || 1161 (uint_t)ev2 >= LS_MAX_EVENTS || ev1 > ev2) 1162 fail(0, "-e events out of range"); 1163 for (i = ev1; i <= ev2; i++) 1164 g_enabled[i] = 1; 1165 evp = strtok_r(NULL, ",", &lastp); 1166 } 1167 events_specified = 1; 1168 break; 1169 1170 case 'c': 1171 g_cflag = 1; 1172 break; 1173 1174 case 'k': 1175 g_kflag = 1; 1176 break; 1177 1178 case 'w': 1179 g_wflag = 1; 1180 break; 1181 1182 case 'W': 1183 g_Wflag = 1; 1184 break; 1185 1186 case 'g': 1187 g_gflag = 1; 1188 break; 1189 1190 case 'C': 1191 case 'E': 1192 case 'H': 1193 case 'I': 1194 for (i = 0; i < LS_MAX_EVENTS; i++) 1195 if (g_event_info[i].ev_type == c) 1196 g_enabled[i] = 1; 1197 events_specified = 1; 1198 break; 1199 1200 case 'A': 1201 for (i = 0; i < LS_MAX_EVENTS; i++) 1202 if (strchr("CH", g_event_info[i].ev_type)) 1203 g_enabled[i] = 1; 1204 events_specified = 1; 1205 break; 1206 1207 case 'T': 1208 g_tracing = 1; 1209 break; 1210 1211 case 'D': 1212 if (!isdigit(optarg[0])) 1213 usage(); 1214 g_topn = atoi(optarg); 1215 break; 1216 1217 case 'R': 1218 g_rates = 1; 1219 break; 1220 1221 case 'p': 1222 g_pflag = 1; 1223 break; 1224 1225 case 'P': 1226 g_Pflag = 1; 1227 break; 1228 1229 case 'o': 1230 if ((out = fopen(optarg, "w")) == NULL) 1231 fail(1, "error opening file"); 1232 break; 1233 1234 case 'V': 1235 g_Vflag = 1; 1236 break; 1237 1238 default: 1239 if (strchr(LOCKSTAT_OPTSTR, c) == NULL) 1240 usage(); 1241 } 1242 } 1243 1244 if (filt != NULL) { 1245 predicate_add(&g_predicate, filt, NULL, 0); 1246 filter_destroy(&filt); 1247 } 1248 1249 if (ifilt != NULL) { 1250 predicate_add(&g_ipredicate, ifilt, NULL, 0); 1251 filter_destroy(&ifilt); 1252 } 1253 1254 if (g_recsize == 0) { 1255 if (g_gflag) { 1256 g_stkdepth = LS_MAX_STACK_DEPTH; 1257 g_recsize = LS_STACK(g_stkdepth); 1258 } else { 1259 g_recsize = LS_TIME; 1260 } 1261 } 1262 1263 if (g_gflag && g_recsize <= LS_STACK(0)) 1264 fail(0, "'-g' requires at least '-s 1' data gathering"); 1265 1266 /* 1267 * Make sure the alignment is reasonable 1268 */ 1269 g_recsize = -(-g_recsize & -sizeof (uint64_t)); 1270 1271 for (i = 0; i < LS_MAX_EVENTS; i++) { 1272 /* 1273 * If no events were specified, enable -C. 1274 */ 1275 if (!events_specified && g_event_info[i].ev_type == 'C') 1276 g_enabled[i] = 1; 1277 } 1278 1279 for (i = 0; i < LS_MAX_EVENTS; i++) { 1280 if (!g_enabled[i]) 1281 continue; 1282 1283 if (g_event_info[i].ev_acquire != NULL) { 1284 /* 1285 * If we've enabled a hold event, we must explicitly 1286 * allocate dynamic variable space. 1287 */ 1288 dynvar = 1; 1289 } 1290 1291 dprog_addevent(i); 1292 } 1293 1294 /* 1295 * Make sure there are remaining arguments to specify a child command 1296 * to execute. 1297 */ 1298 if (argc <= optind) 1299 usage(); 1300 1301 if ((ncpus = sysconf(_SC_NPROCESSORS_ONLN)) == -1) 1302 dfail("couldn't determine number of online CPUs"); 1303 1304 /* 1305 * By default, we set our data buffer size to be the number of records 1306 * multiplied by the size of the record, doubled to account for some 1307 * DTrace slop and divided by the number of CPUs. We silently clamp 1308 * the aggregation size at both a minimum and a maximum to prevent 1309 * absurdly low or high values. 1310 */ 1311 if ((aggsize = (g_nrecs * g_recsize * 2) / ncpus) < MIN_AGGSIZE) 1312 aggsize = MIN_AGGSIZE; 1313 1314 if (aggsize > MAX_AGGSIZE) 1315 aggsize = MAX_AGGSIZE; 1316 1317 (void) sprintf(aggstr, "%lld", (long long)aggsize); 1318 1319 if (!g_tracing) { 1320 if (dtrace_setopt(g_dtp, "bufsize", "4k") == -1) 1321 dfail("failed to set 'bufsize'"); 1322 1323 if (dtrace_setopt(g_dtp, "aggsize", aggstr) == -1) 1324 dfail("failed to set 'aggsize'"); 1325 1326 if (dynvar) { 1327 /* 1328 * If we're using dynamic variables, we set our 1329 * dynamic variable size to be one megabyte per CPU, 1330 * with a hard-limit of 32 megabytes. This may still 1331 * be too small in some cases, but it can be tuned 1332 * manually via -x if need be. 1333 */ 1334 (void) sprintf(aggstr, "%ldm", ncpus < 32 ? ncpus : 32); 1335 1336 if (dtrace_setopt(g_dtp, "dynvarsize", aggstr) == -1) 1337 dfail("failed to set 'dynvarsize'"); 1338 } 1339 } else { 1340 if (dtrace_setopt(g_dtp, "bufsize", aggstr) == -1) 1341 dfail("failed to set 'bufsize'"); 1342 } 1343 1344 if (dtrace_setopt(g_dtp, "statusrate", "10sec") == -1) 1345 dfail("failed to set 'statusrate'"); 1346 1347 optind = 1; 1348 while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != EOF) { 1349 switch (c) { 1350 case 'x': 1351 if ((p = strchr(optarg, '=')) != NULL) 1352 *p++ = '\0'; 1353 1354 if (dtrace_setopt(g_dtp, optarg, p) != 0) 1355 dfail("failed to set -x %s", optarg); 1356 break; 1357 } 1358 } 1359 1360 argc -= optind; 1361 argv += optind; 1362 1363 dprog_compile(); 1364 status_init(); 1365 1366 g_elapsed = -gethrtime(); 1367 1368 /* 1369 * Spawn the specified command and wait for it to complete. 1370 */ 1371 child = fork(); 1372 if (child == -1) 1373 fail(1, "cannot fork"); 1374 if (child == 0) { 1375 (void) dtrace_close(g_dtp); 1376 (void) execvp(argv[0], &argv[0]); 1377 exec_errno = errno; 1378 exit(127); 1379 } 1380 1381 while (waitpid(child, &status, WEXITED) != child) 1382 status_check(); 1383 1384 g_elapsed += gethrtime(); 1385 1386 if (WIFEXITED(status)) { 1387 if (WEXITSTATUS(status) != 0) { 1388 if (exec_errno != 0) { 1389 errno = exec_errno; 1390 fail(1, "could not execute %s", argv[0]); 1391 } 1392 (void) fprintf(stderr, 1393 "lockstat: warning: %s exited with code %d\n", 1394 argv[0], WEXITSTATUS(status)); 1395 } 1396 } else { 1397 (void) fprintf(stderr, 1398 "lockstat: warning: %s died on signal %d\n", 1399 argv[0], WTERMSIG(status)); 1400 } 1401 1402 if (dtrace_stop(g_dtp) == -1) 1403 dfail("failed to stop dtrace"); 1404 1405 /* 1406 * Before we read out the results, we need to allocate our buffer. 1407 * If we're tracing, then we'll just use the precalculated size. If 1408 * we're not, then we'll take a snapshot of the aggregate, and walk 1409 * it to count the number of records. 1410 */ 1411 if (!g_tracing) { 1412 if (dtrace_aggregate_snap(g_dtp) != 0) 1413 dfail("failed to snap aggregate"); 1414 1415 g_nrecs = 0; 1416 1417 if (dtrace_aggregate_walk(g_dtp, 1418 count_aggregate, &g_nrecs) != 0) 1419 dfail("failed to walk aggregate"); 1420 } 1421 1422 if ((data_buf = memalign(sizeof (uint64_t), 1423 (g_nrecs + 1) * g_recsize)) == NULL) 1424 fail(1, "Memory allocation failed"); 1425 1426 /* 1427 * Read out the DTrace data. 1428 */ 1429 g_nrecs_used = process_data(out, data_buf); 1430 1431 if (g_nrecs_used > g_nrecs || g_dropped) 1432 (void) fprintf(stderr, "lockstat: warning: " 1433 "ran out of data records (use -n for more)\n"); 1434 1435 /* LINTED - alignment */ 1436 for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++, 1437 /* LINTED - alignment */ 1438 lsp = (lsrec_t *)((char *)lsp + g_recsize)) { 1439 ev_count[lsp->ls_event] += lsp->ls_count; 1440 ev_time[lsp->ls_event] += lsp->ls_time; 1441 } 1442 1443 /* 1444 * If -g was specified, convert stacks into individual records. 1445 */ 1446 if (g_gflag) { 1447 lsrec_t *newlsp, *oldlsp; 1448 1449 newlsp = memalign(sizeof (uint64_t), 1450 g_nrecs_used * LS_TIME * (g_stkdepth + 1)); 1451 if (newlsp == NULL) 1452 fail(1, "Cannot allocate space for -g processing"); 1453 lsp = newlsp; 1454 /* LINTED - alignment */ 1455 for (i = 0, oldlsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++, 1456 /* LINTED - alignment */ 1457 oldlsp = (lsrec_t *)((char *)oldlsp + g_recsize)) { 1458 int fr; 1459 int caller_in_stack = 0; 1460 1461 if (oldlsp->ls_count == 0) 1462 continue; 1463 1464 for (fr = 0; fr < g_stkdepth; fr++) { 1465 if (oldlsp->ls_stack[fr] == 0) 1466 break; 1467 if (oldlsp->ls_stack[fr] == oldlsp->ls_caller) 1468 caller_in_stack = 1; 1469 bcopy(oldlsp, lsp, LS_TIME); 1470 lsp->ls_caller = oldlsp->ls_stack[fr]; 1471 /* LINTED - alignment */ 1472 lsp = (lsrec_t *)((char *)lsp + LS_TIME); 1473 } 1474 if (!caller_in_stack) { 1475 bcopy(oldlsp, lsp, LS_TIME); 1476 /* LINTED - alignment */ 1477 lsp = (lsrec_t *)((char *)lsp + LS_TIME); 1478 } 1479 } 1480 g_nrecs = g_nrecs_used = 1481 ((uintptr_t)lsp - (uintptr_t)newlsp) / LS_TIME; 1482 g_recsize = LS_TIME; 1483 g_stkdepth = 0; 1484 free(data_buf); 1485 data_buf = (char *)newlsp; 1486 } 1487 1488 if ((sort_buf = calloc(2 * (g_nrecs + 1), 1489 sizeof (void *))) == NULL) 1490 fail(1, "Sort buffer allocation failed"); 1491 merge_buf = sort_buf + (g_nrecs + 1); 1492 1493 /* 1494 * Build the sort buffer, discarding zero-count records along the way. 1495 */ 1496 /* LINTED - alignment */ 1497 for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++, 1498 /* LINTED - alignment */ 1499 lsp = (lsrec_t *)((char *)lsp + g_recsize)) { 1500 if (lsp->ls_count == 0) 1501 lsp->ls_event = LS_MAX_EVENTS; 1502 sort_buf[i] = lsp; 1503 } 1504 1505 if (g_nrecs_used == 0) 1506 exit(0); 1507 1508 /* 1509 * Add a sentinel after the last record 1510 */ 1511 sort_buf[i] = lsp; 1512 lsp->ls_event = LS_MAX_EVENTS; 1513 1514 if (g_tracing) { 1515 report_trace(out, sort_buf); 1516 return (0); 1517 } 1518 1519 /* 1520 * Application of -g may have resulted in multiple records 1521 * with the same signature; coalesce them. 1522 */ 1523 if (g_gflag) { 1524 mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used); 1525 coalesce(lockcmp, sort_buf, g_nrecs_used); 1526 } 1527 1528 /* 1529 * Coalesce locks within the same symbol if -c option specified. 1530 * Coalesce PCs within the same function if -k option specified. 1531 */ 1532 if (g_cflag || g_kflag) { 1533 for (i = 0; i < g_nrecs_used; i++) { 1534 int fr; 1535 lsp = sort_buf[i]; 1536 if (g_cflag) 1537 coalesce_symbol(&lsp->ls_lock); 1538 if (g_kflag) { 1539 for (fr = 0; fr < g_stkdepth; fr++) 1540 coalesce_symbol(&lsp->ls_stack[fr]); 1541 coalesce_symbol(&lsp->ls_caller); 1542 } 1543 } 1544 mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used); 1545 coalesce(lockcmp, sort_buf, g_nrecs_used); 1546 } 1547 1548 /* 1549 * Coalesce callers if -w option specified 1550 */ 1551 if (g_wflag) { 1552 mergesort(lock_and_count_cmp_anywhere, 1553 sort_buf, merge_buf, g_nrecs_used); 1554 coalesce(lockcmp_anywhere, sort_buf, g_nrecs_used); 1555 } 1556 1557 /* 1558 * Coalesce locks if -W option specified 1559 */ 1560 if (g_Wflag) { 1561 mergesort(site_and_count_cmp_anylock, 1562 sort_buf, merge_buf, g_nrecs_used); 1563 coalesce(sitecmp_anylock, sort_buf, g_nrecs_used); 1564 } 1565 1566 /* 1567 * Sort data by contention count (ls_count) or total time (ls_time), 1568 * depending on g_Pflag. Override g_Pflag if time wasn't measured. 1569 */ 1570 if (g_recsize < LS_TIME) 1571 g_Pflag = 0; 1572 1573 if (g_Pflag) 1574 mergesort(timecmp, sort_buf, merge_buf, g_nrecs_used); 1575 else 1576 mergesort(countcmp, sort_buf, merge_buf, g_nrecs_used); 1577 1578 /* 1579 * Display data by event type 1580 */ 1581 first = &sort_buf[0]; 1582 while ((event = (*first)->ls_event) < LS_MAX_EVENTS) { 1583 current = first; 1584 while ((lsp = *current)->ls_event == event) 1585 current++; 1586 report_stats(out, first, current - first, ev_count[event], 1587 ev_time[event]); 1588 first = current; 1589 } 1590 1591 return (0); 1592 } 1593 1594 static char * 1595 format_symbol(char *buf, uintptr_t addr, int show_size) 1596 { 1597 uintptr_t symoff; 1598 char *symname; 1599 size_t symsize; 1600 1601 symname = addr_to_sym(addr, &symoff, &symsize); 1602 1603 if (show_size && symoff == 0) 1604 (void) sprintf(buf, "%s[%ld]", symname, (long)symsize); 1605 else if (symoff == 0) 1606 (void) sprintf(buf, "%s", symname); 1607 else if (symoff < 16 && bcmp(symname, "cpu[", 4) == 0) /* CPU+PIL */ 1608 (void) sprintf(buf, "%s+%ld", symname, (long)symoff); 1609 else if (symoff <= symsize || (symoff < 256 && addr != symoff)) 1610 (void) sprintf(buf, "%s+0x%llx", symname, 1611 (unsigned long long)symoff); 1612 else 1613 (void) sprintf(buf, "0x%llx", (unsigned long long)addr); 1614 return (buf); 1615 } 1616 1617 static void 1618 report_stats(FILE *out, lsrec_t **sort_buf, size_t nrecs, uint64_t total_count, 1619 uint64_t total_time) 1620 { 1621 uint32_t event = sort_buf[0]->ls_event; 1622 lsrec_t *lsp; 1623 double ptotal = 0.0; 1624 double percent; 1625 int i, j, fr; 1626 int displayed; 1627 int first_bin, last_bin, max_bin_count, total_bin_count; 1628 int rectype; 1629 char buf[256]; 1630 char lhdr[80], chdr[80]; 1631 1632 rectype = g_recsize; 1633 1634 if (g_topn == 0) { 1635 (void) fprintf(out, "%20llu %s\n", 1636 g_rates == 0 ? total_count : 1637 ((unsigned long long)total_count * NANOSEC) / g_elapsed, 1638 g_event_info[event].ev_desc); 1639 return; 1640 } 1641 1642 (void) sprintf(lhdr, "%s%s", 1643 g_Wflag ? "Hottest " : "", g_event_info[event].ev_lhdr); 1644 (void) sprintf(chdr, "%s%s", 1645 g_wflag ? "Hottest " : "", "Caller"); 1646 1647 if (!g_pflag) 1648 (void) fprintf(out, 1649 "\n%s: %.0f events in %.3f seconds (%.0f events/sec)\n\n", 1650 g_event_info[event].ev_desc, (double)total_count, 1651 (double)g_elapsed / NANOSEC, 1652 (double)total_count * NANOSEC / g_elapsed); 1653 1654 if (!g_pflag && rectype < LS_HIST) { 1655 (void) sprintf(buf, "%s", g_event_info[event].ev_units); 1656 (void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n", 1657 g_rates ? "ops/s" : "Count", 1658 g_gflag ? "genr" : "indv", 1659 "cuml", "rcnt", rectype >= LS_TIME ? buf : "", lhdr, chdr); 1660 (void) fprintf(out, "---------------------------------" 1661 "----------------------------------------------\n"); 1662 } 1663 1664 displayed = 0; 1665 for (i = 0; i < nrecs; i++) { 1666 lsp = sort_buf[i]; 1667 1668 if (displayed++ >= g_topn) 1669 break; 1670 1671 if (g_pflag) { 1672 int j; 1673 1674 (void) fprintf(out, "%u %u", 1675 lsp->ls_event, lsp->ls_count); 1676 (void) fprintf(out, " %s", 1677 format_symbol(buf, lsp->ls_lock, g_cflag)); 1678 (void) fprintf(out, " %s", 1679 format_symbol(buf, lsp->ls_caller, 0)); 1680 (void) fprintf(out, " %f", 1681 (double)lsp->ls_refcnt / lsp->ls_count); 1682 if (rectype >= LS_TIME) 1683 (void) fprintf(out, " %llu", 1684 (unsigned long long)lsp->ls_time); 1685 if (rectype >= LS_HIST) { 1686 for (j = 0; j < 64; j++) 1687 (void) fprintf(out, " %u", 1688 lsp->ls_hist[j]); 1689 } 1690 for (j = 0; j < LS_MAX_STACK_DEPTH; j++) { 1691 if (rectype <= LS_STACK(j) || 1692 lsp->ls_stack[j] == 0) 1693 break; 1694 (void) fprintf(out, " %s", 1695 format_symbol(buf, lsp->ls_stack[j], 0)); 1696 } 1697 (void) fprintf(out, "\n"); 1698 continue; 1699 } 1700 1701 if (rectype >= LS_HIST) { 1702 (void) fprintf(out, "---------------------------------" 1703 "----------------------------------------------\n"); 1704 (void) sprintf(buf, "%s", 1705 g_event_info[event].ev_units); 1706 (void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n", 1707 g_rates ? "ops/s" : "Count", 1708 g_gflag ? "genr" : "indv", 1709 "cuml", "rcnt", buf, lhdr, chdr); 1710 } 1711 1712 if (g_Pflag && total_time != 0) 1713 percent = (lsp->ls_time * 100.00) / total_time; 1714 else 1715 percent = (lsp->ls_count * 100.00) / total_count; 1716 1717 ptotal += percent; 1718 1719 if (rectype >= LS_TIME) 1720 (void) sprintf(buf, "%llu", 1721 (unsigned long long)(lsp->ls_time / lsp->ls_count)); 1722 else 1723 buf[0] = '\0'; 1724 1725 (void) fprintf(out, "%5llu ", 1726 g_rates == 0 ? lsp->ls_count : 1727 ((uint64_t)lsp->ls_count * NANOSEC) / g_elapsed); 1728 1729 (void) fprintf(out, "%3.0f%% ", percent); 1730 1731 if (g_gflag) 1732 (void) fprintf(out, "---- "); 1733 else 1734 (void) fprintf(out, "%3.0f%% ", ptotal); 1735 1736 (void) fprintf(out, "%4.2f %8s ", 1737 (double)lsp->ls_refcnt / lsp->ls_count, buf); 1738 1739 (void) fprintf(out, "%-22s ", 1740 format_symbol(buf, lsp->ls_lock, g_cflag)); 1741 1742 (void) fprintf(out, "%-24s\n", 1743 format_symbol(buf, lsp->ls_caller, 0)); 1744 1745 if (rectype < LS_HIST) 1746 continue; 1747 1748 (void) fprintf(out, "\n"); 1749 (void) fprintf(out, "%10s %31s %-9s %-24s\n", 1750 g_event_info[event].ev_units, 1751 "------ Time Distribution ------", 1752 g_rates ? "ops/s" : "count", 1753 rectype > LS_STACK(0) ? "Stack" : ""); 1754 1755 first_bin = 0; 1756 while (lsp->ls_hist[first_bin] == 0) 1757 first_bin++; 1758 1759 last_bin = 63; 1760 while (lsp->ls_hist[last_bin] == 0) 1761 last_bin--; 1762 1763 max_bin_count = 0; 1764 total_bin_count = 0; 1765 for (j = first_bin; j <= last_bin; j++) { 1766 total_bin_count += lsp->ls_hist[j]; 1767 if (lsp->ls_hist[j] > max_bin_count) 1768 max_bin_count = lsp->ls_hist[j]; 1769 } 1770 1771 /* 1772 * If we went a few frames below the caller, ignore them 1773 */ 1774 for (fr = 3; fr > 0; fr--) 1775 if (lsp->ls_stack[fr] == lsp->ls_caller) 1776 break; 1777 1778 for (j = first_bin; j <= last_bin; j++) { 1779 uint_t depth = (lsp->ls_hist[j] * 30) / total_bin_count; 1780 (void) fprintf(out, "%10llu |%s%s %-9u ", 1781 1ULL << j, 1782 "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + 30 - depth, 1783 " " + depth, 1784 g_rates == 0 ? lsp->ls_hist[j] : 1785 (uint_t)(((uint64_t)lsp->ls_hist[j] * NANOSEC) / 1786 g_elapsed)); 1787 if (rectype <= LS_STACK(fr) || lsp->ls_stack[fr] == 0) { 1788 (void) fprintf(out, "\n"); 1789 continue; 1790 } 1791 (void) fprintf(out, "%-24s\n", 1792 format_symbol(buf, lsp->ls_stack[fr], 0)); 1793 fr++; 1794 } 1795 while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) { 1796 (void) fprintf(out, "%15s %-36s %-24s\n", "", "", 1797 format_symbol(buf, lsp->ls_stack[fr], 0)); 1798 fr++; 1799 } 1800 } 1801 1802 if (!g_pflag) 1803 (void) fprintf(out, "---------------------------------" 1804 "----------------------------------------------\n"); 1805 1806 (void) fflush(out); 1807 } 1808 1809 static void 1810 report_trace(FILE *out, lsrec_t **sort_buf) 1811 { 1812 lsrec_t *lsp; 1813 int i, fr; 1814 int rectype; 1815 char buf[256], buf2[256]; 1816 1817 rectype = g_recsize; 1818 1819 if (!g_pflag) { 1820 (void) fprintf(out, "%5s %7s %11s %-24s %-24s\n", 1821 "Event", "Time", "Owner", "Lock", "Caller"); 1822 (void) fprintf(out, "---------------------------------" 1823 "----------------------------------------------\n"); 1824 } 1825 1826 for (i = 0; i < g_nrecs_used; i++) { 1827 1828 lsp = sort_buf[i]; 1829 1830 if (lsp->ls_event >= LS_MAX_EVENTS || lsp->ls_count == 0) 1831 continue; 1832 1833 (void) fprintf(out, "%2d %10llu %11p %-24s %-24s\n", 1834 lsp->ls_event, (unsigned long long)lsp->ls_time, 1835 (void *)lsp->ls_next, 1836 format_symbol(buf, lsp->ls_lock, 0), 1837 format_symbol(buf2, lsp->ls_caller, 0)); 1838 1839 if (rectype <= LS_STACK(0)) 1840 continue; 1841 1842 /* 1843 * If we went a few frames below the caller, ignore them 1844 */ 1845 for (fr = 3; fr > 0; fr--) 1846 if (lsp->ls_stack[fr] == lsp->ls_caller) 1847 break; 1848 1849 while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) { 1850 (void) fprintf(out, "%53s %-24s\n", "", 1851 format_symbol(buf, lsp->ls_stack[fr], 0)); 1852 fr++; 1853 } 1854 (void) fprintf(out, "\n"); 1855 } 1856 1857 (void) fflush(out); 1858 } 1859