1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <stdio.h> 30 #include <stddef.h> 31 #include <stdlib.h> 32 #include <stdarg.h> 33 #include <string.h> 34 #include <strings.h> 35 #include <ctype.h> 36 #include <fcntl.h> 37 #include <unistd.h> 38 #include <errno.h> 39 #include <limits.h> 40 #include <sys/types.h> 41 #include <sys/modctl.h> 42 #include <sys/stat.h> 43 #include <sys/wait.h> 44 #include <dtrace.h> 45 #include <sys/lockstat.h> 46 #include <alloca.h> 47 #include <signal.h> 48 #include <assert.h> 49 50 #define LOCKSTAT_OPTSTR "x:bths:n:d:i:l:f:e:ckwWgCHEATID:RpPo:V" 51 52 #define LS_MAX_STACK_DEPTH 50 53 #define LS_MAX_EVENTS 64 54 55 typedef struct lsrec { 56 struct lsrec *ls_next; /* next in hash chain */ 57 uintptr_t ls_lock; /* lock address */ 58 uintptr_t ls_caller; /* caller address */ 59 uint32_t ls_count; /* cumulative event count */ 60 uint32_t ls_event; /* type of event */ 61 uintptr_t ls_refcnt; /* cumulative reference count */ 62 uint64_t ls_time; /* cumulative event duration */ 63 uint32_t ls_hist[64]; /* log2(duration) histogram */ 64 uintptr_t ls_stack[LS_MAX_STACK_DEPTH]; 65 } lsrec_t; 66 67 typedef struct lsdata { 68 struct lsrec *lsd_next; /* next available */ 69 int lsd_count; /* number of records */ 70 } lsdata_t; 71 72 /* 73 * Definitions for the types of experiments which can be run. They are 74 * listed in increasing order of memory cost and processing time cost. 75 * The numerical value of each type is the number of bytes needed per record. 76 */ 77 #define LS_BASIC offsetof(lsrec_t, ls_time) 78 #define LS_TIME offsetof(lsrec_t, ls_hist[0]) 79 #define LS_HIST offsetof(lsrec_t, ls_stack[0]) 80 #define LS_STACK(depth) offsetof(lsrec_t, ls_stack[depth]) 81 82 static void report_stats(FILE *, lsrec_t **, size_t, uint64_t, uint64_t); 83 static void report_trace(FILE *, lsrec_t **); 84 85 extern int symtab_init(void); 86 extern char *addr_to_sym(uintptr_t, uintptr_t *, size_t *); 87 extern uintptr_t sym_to_addr(char *name); 88 extern size_t sym_size(char *name); 89 extern char *strtok_r(char *, const char *, char **); 90 91 #define DEFAULT_NRECS 10000 92 #define DEFAULT_HZ 97 93 #define MAX_HZ 1000 94 #define MIN_AGGSIZE (16 * 1024) 95 #define MAX_AGGSIZE (32 * 1024 * 1024) 96 97 static int g_stkdepth; 98 static int g_topn = INT_MAX; 99 static hrtime_t g_elapsed; 100 static int g_rates = 0; 101 static int g_pflag = 0; 102 static int g_Pflag = 0; 103 static int g_wflag = 0; 104 static int g_Wflag = 0; 105 static int g_cflag = 0; 106 static int g_kflag = 0; 107 static int g_gflag = 0; 108 static int g_Vflag = 0; 109 static int g_tracing = 0; 110 static size_t g_recsize; 111 static size_t g_nrecs; 112 static int g_nrecs_used; 113 static uchar_t g_enabled[LS_MAX_EVENTS]; 114 static hrtime_t g_min_duration[LS_MAX_EVENTS]; 115 static dtrace_hdl_t *g_dtp; 116 static char *g_predicate; 117 static char *g_ipredicate; 118 static char *g_prog; 119 static int g_proglen; 120 static int g_dropped; 121 122 typedef struct ls_event_info { 123 char ev_type; 124 char ev_lhdr[20]; 125 char ev_desc[80]; 126 char ev_units[10]; 127 char ev_name[DTRACE_NAMELEN]; 128 char *ev_predicate; 129 char *ev_acquire; 130 } ls_event_info_t; 131 132 static ls_event_info_t g_event_info[LS_MAX_EVENTS] = { 133 { 'C', "Lock", "Adaptive mutex spin", "spin", 134 "lockstat:::adaptive-spin" }, 135 { 'C', "Lock", "Adaptive mutex block", "nsec", 136 "lockstat:::adaptive-block" }, 137 { 'C', "Lock", "Spin lock spin", "spin", 138 "lockstat:::spin-spin" }, 139 { 'C', "Lock", "Thread lock spin", "spin", 140 "lockstat:::thread-spin" }, 141 { 'C', "Lock", "R/W writer blocked by writer", "nsec", 142 "lockstat:::rw-block", "arg2 == 0 && arg3 == 1" }, 143 { 'C', "Lock", "R/W writer blocked by readers", "nsec", 144 "lockstat:::rw-block", "arg2 == 0 && arg3 == 0 && arg4" }, 145 { 'C', "Lock", "R/W reader blocked by writer", "nsec", 146 "lockstat:::rw-block", "arg2 != 0 && arg3 == 1" }, 147 { 'C', "Lock", "R/W reader blocked by write wanted", "nsec", 148 "lockstat:::rw-block", "arg2 != 0 && arg3 == 0 && arg4" }, 149 { 'C', "Lock", "Unknown event (type 8)", "units" }, 150 { 'C', "Lock", "Unknown event (type 9)", "units" }, 151 { 'C', "Lock", "Unknown event (type 10)", "units" }, 152 { 'C', "Lock", "Unknown event (type 11)", "units" }, 153 { 'C', "Lock", "Unknown event (type 12)", "units" }, 154 { 'C', "Lock", "Unknown event (type 13)", "units" }, 155 { 'C', "Lock", "Unknown event (type 14)", "units" }, 156 { 'C', "Lock", "Unknown event (type 15)", "units" }, 157 { 'C', "Lock", "Unknown event (type 16)", "units" }, 158 { 'C', "Lock", "Unknown event (type 17)", "units" }, 159 { 'C', "Lock", "Unknown event (type 18)", "units" }, 160 { 'C', "Lock", "Unknown event (type 19)", "units" }, 161 { 'C', "Lock", "Unknown event (type 20)", "units" }, 162 { 'C', "Lock", "Unknown event (type 21)", "units" }, 163 { 'C', "Lock", "Unknown event (type 22)", "units" }, 164 { 'C', "Lock", "Unknown event (type 23)", "units" }, 165 { 'C', "Lock", "Unknown event (type 24)", "units" }, 166 { 'C', "Lock", "Unknown event (type 25)", "units" }, 167 { 'C', "Lock", "Unknown event (type 26)", "units" }, 168 { 'C', "Lock", "Unknown event (type 27)", "units" }, 169 { 'C', "Lock", "Unknown event (type 28)", "units" }, 170 { 'C', "Lock", "Unknown event (type 29)", "units" }, 171 { 'C', "Lock", "Unknown event (type 30)", "units" }, 172 { 'C', "Lock", "Unknown event (type 31)", "units" }, 173 { 'H', "Lock", "Adaptive mutex hold", "nsec", 174 "lockstat:::adaptive-release", NULL, 175 "lockstat:::adaptive-acquire" }, 176 { 'H', "Lock", "Spin lock hold", "nsec", 177 "lockstat:::spin-release", NULL, 178 "lockstat:::spin-acquire" }, 179 { 'H', "Lock", "R/W writer hold", "nsec", 180 "lockstat:::rw-release", "arg1 == 0", 181 "lockstat:::rw-acquire" }, 182 { 'H', "Lock", "R/W reader hold", "nsec", 183 "lockstat:::rw-release", "arg1 != 0", 184 "lockstat:::rw-acquire" }, 185 { 'H', "Lock", "Unknown event (type 36)", "units" }, 186 { 'H', "Lock", "Unknown event (type 37)", "units" }, 187 { 'H', "Lock", "Unknown event (type 38)", "units" }, 188 { 'H', "Lock", "Unknown event (type 39)", "units" }, 189 { 'H', "Lock", "Unknown event (type 40)", "units" }, 190 { 'H', "Lock", "Unknown event (type 41)", "units" }, 191 { 'H', "Lock", "Unknown event (type 42)", "units" }, 192 { 'H', "Lock", "Unknown event (type 43)", "units" }, 193 { 'H', "Lock", "Unknown event (type 44)", "units" }, 194 { 'H', "Lock", "Unknown event (type 45)", "units" }, 195 { 'H', "Lock", "Unknown event (type 46)", "units" }, 196 { 'H', "Lock", "Unknown event (type 47)", "units" }, 197 { 'H', "Lock", "Unknown event (type 48)", "units" }, 198 { 'H', "Lock", "Unknown event (type 49)", "units" }, 199 { 'H', "Lock", "Unknown event (type 50)", "units" }, 200 { 'H', "Lock", "Unknown event (type 51)", "units" }, 201 { 'H', "Lock", "Unknown event (type 52)", "units" }, 202 { 'H', "Lock", "Unknown event (type 53)", "units" }, 203 { 'H', "Lock", "Unknown event (type 54)", "units" }, 204 { 'H', "Lock", "Unknown event (type 55)", "units" }, 205 { 'I', "CPU+PIL", "Profiling interrupt", "nsec", 206 "profile:::profile-97", NULL }, 207 { 'I', "Lock", "Unknown event (type 57)", "units" }, 208 { 'I', "Lock", "Unknown event (type 58)", "units" }, 209 { 'I', "Lock", "Unknown event (type 59)", "units" }, 210 { 'E', "Lock", "Recursive lock entry detected", "(N/A)", 211 "lockstat:::rw-release", NULL, "lockstat:::rw-acquire" }, 212 { 'E', "Lock", "Lockstat enter failure", "(N/A)" }, 213 { 'E', "Lock", "Lockstat exit failure", "nsec" }, 214 { 'E', "Lock", "Lockstat record failure", "(N/A)" }, 215 }; 216 217 static void 218 fail(int do_perror, const char *message, ...) 219 { 220 va_list args; 221 int save_errno = errno; 222 223 va_start(args, message); 224 (void) fprintf(stderr, "lockstat: "); 225 (void) vfprintf(stderr, message, args); 226 va_end(args); 227 if (do_perror) 228 (void) fprintf(stderr, ": %s", strerror(save_errno)); 229 (void) fprintf(stderr, "\n"); 230 exit(2); 231 } 232 233 static void 234 dfail(const char *message, ...) 235 { 236 va_list args; 237 238 va_start(args, message); 239 (void) fprintf(stderr, "lockstat: "); 240 (void) vfprintf(stderr, message, args); 241 va_end(args); 242 (void) fprintf(stderr, ": %s\n", 243 dtrace_errmsg(g_dtp, dtrace_errno(g_dtp))); 244 245 exit(2); 246 } 247 248 static void 249 show_events(char event_type, char *desc) 250 { 251 int i, first = -1, last; 252 253 for (i = 0; i < LS_MAX_EVENTS; i++) { 254 ls_event_info_t *evp = &g_event_info[i]; 255 if (evp->ev_type != event_type || 256 strncmp(evp->ev_desc, "Unknown event", 13) == 0) 257 continue; 258 if (first == -1) 259 first = i; 260 last = i; 261 } 262 263 (void) fprintf(stderr, 264 "\n%s events (lockstat -%c or lockstat -e %d-%d):\n\n", 265 desc, event_type, first, last); 266 267 for (i = first; i <= last; i++) 268 (void) fprintf(stderr, 269 "%4d = %s\n", i, g_event_info[i].ev_desc); 270 } 271 272 static void 273 usage(void) 274 { 275 (void) fprintf(stderr, 276 "Usage: lockstat [options] command [args]\n" 277 "\nEvent selection options:\n\n" 278 " -C watch contention events [on by default]\n" 279 " -E watch error events [off by default]\n" 280 " -H watch hold events [off by default]\n" 281 " -I watch interrupt events [off by default]\n" 282 " -A watch all lock events [equivalent to -CH]\n" 283 " -e event_list only watch the specified events (shown below);\n" 284 " <event_list> is a comma-separated list of\n" 285 " events or ranges of events, e.g. 1,4-7,35\n" 286 " -i rate interrupt rate for -I [default: %d Hz]\n" 287 "\nData gathering options:\n\n" 288 " -b basic statistics (lock, caller, event count)\n" 289 " -t timing for all events [default]\n" 290 " -h histograms for event times\n" 291 " -s depth stack traces <depth> deep\n" 292 " -x opt[=val] enable or modify DTrace options\n" 293 "\nData filtering options:\n\n" 294 " -n nrecords maximum number of data records [default: %d]\n" 295 " -l lock[,size] only watch <lock>, which can be specified as a\n" 296 " symbolic name or hex address; <size> defaults\n" 297 " to the ELF symbol size if available, 1 if not\n" 298 " -f func[,size] only watch events generated by <func>\n" 299 " -d duration only watch events longer than <duration>\n" 300 " -T trace (rather than sample) events\n" 301 "\nData reporting options:\n\n" 302 " -c coalesce lock data for arrays like pse_mutex[]\n" 303 " -k coalesce PCs within functions\n" 304 " -g show total events generated by function\n" 305 " -w wherever: don't distinguish events by caller\n" 306 " -W whichever: don't distinguish events by lock\n" 307 " -R display rates rather than counts\n" 308 " -p parsable output format (awk(1)-friendly)\n" 309 " -P sort lock data by (count * avg_time) product\n" 310 " -D n only display top <n> events of each type\n" 311 " -o filename send output to <filename>\n", 312 DEFAULT_HZ, DEFAULT_NRECS); 313 314 show_events('C', "Contention"); 315 show_events('H', "Hold-time"); 316 show_events('I', "Interrupt"); 317 show_events('E', "Error"); 318 (void) fprintf(stderr, "\n"); 319 320 exit(1); 321 } 322 323 static int 324 lockcmp(lsrec_t *a, lsrec_t *b) 325 { 326 int i; 327 328 if (a->ls_event < b->ls_event) 329 return (-1); 330 if (a->ls_event > b->ls_event) 331 return (1); 332 333 for (i = g_stkdepth - 1; i >= 0; i--) { 334 if (a->ls_stack[i] < b->ls_stack[i]) 335 return (-1); 336 if (a->ls_stack[i] > b->ls_stack[i]) 337 return (1); 338 } 339 340 if (a->ls_caller < b->ls_caller) 341 return (-1); 342 if (a->ls_caller > b->ls_caller) 343 return (1); 344 345 if (a->ls_lock < b->ls_lock) 346 return (-1); 347 if (a->ls_lock > b->ls_lock) 348 return (1); 349 350 return (0); 351 } 352 353 static int 354 countcmp(lsrec_t *a, lsrec_t *b) 355 { 356 if (a->ls_event < b->ls_event) 357 return (-1); 358 if (a->ls_event > b->ls_event) 359 return (1); 360 361 return (b->ls_count - a->ls_count); 362 } 363 364 static int 365 timecmp(lsrec_t *a, lsrec_t *b) 366 { 367 if (a->ls_event < b->ls_event) 368 return (-1); 369 if (a->ls_event > b->ls_event) 370 return (1); 371 372 if (a->ls_time < b->ls_time) 373 return (1); 374 if (a->ls_time > b->ls_time) 375 return (-1); 376 377 return (0); 378 } 379 380 static int 381 lockcmp_anywhere(lsrec_t *a, lsrec_t *b) 382 { 383 if (a->ls_event < b->ls_event) 384 return (-1); 385 if (a->ls_event > b->ls_event) 386 return (1); 387 388 if (a->ls_lock < b->ls_lock) 389 return (-1); 390 if (a->ls_lock > b->ls_lock) 391 return (1); 392 393 return (0); 394 } 395 396 static int 397 lock_and_count_cmp_anywhere(lsrec_t *a, lsrec_t *b) 398 { 399 if (a->ls_event < b->ls_event) 400 return (-1); 401 if (a->ls_event > b->ls_event) 402 return (1); 403 404 if (a->ls_lock < b->ls_lock) 405 return (-1); 406 if (a->ls_lock > b->ls_lock) 407 return (1); 408 409 return (b->ls_count - a->ls_count); 410 } 411 412 static int 413 sitecmp_anylock(lsrec_t *a, lsrec_t *b) 414 { 415 int i; 416 417 if (a->ls_event < b->ls_event) 418 return (-1); 419 if (a->ls_event > b->ls_event) 420 return (1); 421 422 for (i = g_stkdepth - 1; i >= 0; i--) { 423 if (a->ls_stack[i] < b->ls_stack[i]) 424 return (-1); 425 if (a->ls_stack[i] > b->ls_stack[i]) 426 return (1); 427 } 428 429 if (a->ls_caller < b->ls_caller) 430 return (-1); 431 if (a->ls_caller > b->ls_caller) 432 return (1); 433 434 return (0); 435 } 436 437 static int 438 site_and_count_cmp_anylock(lsrec_t *a, lsrec_t *b) 439 { 440 int i; 441 442 if (a->ls_event < b->ls_event) 443 return (-1); 444 if (a->ls_event > b->ls_event) 445 return (1); 446 447 for (i = g_stkdepth - 1; i >= 0; i--) { 448 if (a->ls_stack[i] < b->ls_stack[i]) 449 return (-1); 450 if (a->ls_stack[i] > b->ls_stack[i]) 451 return (1); 452 } 453 454 if (a->ls_caller < b->ls_caller) 455 return (-1); 456 if (a->ls_caller > b->ls_caller) 457 return (1); 458 459 return (b->ls_count - a->ls_count); 460 } 461 462 static void 463 mergesort(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **a, lsrec_t **b, int n) 464 { 465 int m = n / 2; 466 int i, j; 467 468 if (m > 1) 469 mergesort(cmp, a, b, m); 470 if (n - m > 1) 471 mergesort(cmp, a + m, b + m, n - m); 472 for (i = m; i > 0; i--) 473 b[i - 1] = a[i - 1]; 474 for (j = m - 1; j < n - 1; j++) 475 b[n + m - j - 2] = a[j + 1]; 476 while (i < j) 477 *a++ = cmp(b[i], b[j]) < 0 ? b[i++] : b[j--]; 478 *a = b[i]; 479 } 480 481 static void 482 coalesce(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **lock, int n) 483 { 484 int i, j; 485 lsrec_t *target, *current; 486 487 target = lock[0]; 488 489 for (i = 1; i < n; i++) { 490 current = lock[i]; 491 if (cmp(current, target) != 0) { 492 target = current; 493 continue; 494 } 495 current->ls_event = LS_MAX_EVENTS; 496 target->ls_count += current->ls_count; 497 target->ls_refcnt += current->ls_refcnt; 498 if (g_recsize < LS_TIME) 499 continue; 500 target->ls_time += current->ls_time; 501 if (g_recsize < LS_HIST) 502 continue; 503 for (j = 0; j < 64; j++) 504 target->ls_hist[j] += current->ls_hist[j]; 505 } 506 } 507 508 static void 509 coalesce_symbol(uintptr_t *addrp) 510 { 511 uintptr_t symoff; 512 size_t symsize; 513 514 if (addr_to_sym(*addrp, &symoff, &symsize) != NULL && symoff < symsize) 515 *addrp -= symoff; 516 } 517 518 static void 519 predicate_add(char **pred, char *what, char *cmp, uintptr_t value) 520 { 521 char *new; 522 int len, newlen; 523 524 if (what == NULL) 525 return; 526 527 if (*pred == NULL) { 528 *pred = malloc(1); 529 *pred[0] = '\0'; 530 } 531 532 len = strlen(*pred); 533 newlen = len + strlen(what) + 32 + strlen("( && )"); 534 new = malloc(newlen); 535 536 if (*pred[0] != '\0') { 537 if (cmp != NULL) { 538 (void) sprintf(new, "(%s) && (%s %s 0x%p)", 539 *pred, what, cmp, (void *)value); 540 } else { 541 (void) sprintf(new, "(%s) && (%s)", *pred, what); 542 } 543 } else { 544 if (cmp != NULL) { 545 (void) sprintf(new, "%s %s 0x%p", 546 what, cmp, (void *)value); 547 } else { 548 (void) sprintf(new, "%s", what); 549 } 550 } 551 552 free(*pred); 553 *pred = new; 554 } 555 556 static void 557 predicate_destroy(char **pred) 558 { 559 free(*pred); 560 *pred = NULL; 561 } 562 563 static void 564 filter_add(char **filt, char *what, uintptr_t base, uintptr_t size) 565 { 566 char buf[256], *c = buf, *new; 567 int len, newlen; 568 569 if (*filt == NULL) { 570 *filt = malloc(1); 571 *filt[0] = '\0'; 572 } 573 574 (void) sprintf(c, "%s(%s >= 0x%p && %s < 0x%p)", *filt[0] != '\0' ? 575 " || " : "", what, (void *)base, what, (void *)(base + size)); 576 577 newlen = (len = strlen(*filt) + 1) + strlen(c); 578 new = malloc(newlen); 579 bcopy(*filt, new, len); 580 (void) strcat(new, c); 581 free(*filt); 582 *filt = new; 583 } 584 585 static void 586 filter_destroy(char **filt) 587 { 588 free(*filt); 589 *filt = NULL; 590 } 591 592 static void 593 dprog_add(const char *fmt, ...) 594 { 595 va_list args; 596 int size, offs; 597 char c; 598 599 va_start(args, fmt); 600 size = vsnprintf(&c, 1, fmt, args) + 1; 601 602 if (g_proglen == 0) { 603 offs = 0; 604 } else { 605 offs = g_proglen - 1; 606 } 607 608 g_proglen = offs + size; 609 610 if ((g_prog = realloc(g_prog, g_proglen)) == NULL) 611 fail(1, "failed to reallocate program text"); 612 613 (void) vsnprintf(&g_prog[offs], size, fmt, args); 614 } 615 616 /* 617 * This function may read like an open sewer, but keep in mind that programs 618 * that generate other programs are rarely pretty. If one has the unenviable 619 * task of maintaining or -- worse -- extending this code, use the -V option 620 * to examine the D program as generated by this function. 621 */ 622 static void 623 dprog_addevent(int event) 624 { 625 ls_event_info_t *info = &g_event_info[event]; 626 char *pred = NULL; 627 char stack[20]; 628 const char *arg0, *caller; 629 char *arg1 = "arg1"; 630 char buf[80]; 631 hrtime_t dur; 632 int depth; 633 634 if (info->ev_name[0] == '\0') 635 return; 636 637 if (info->ev_type == 'I') { 638 /* 639 * For interrupt events, arg0 (normally the lock pointer) is 640 * the CPU address plus the current pil, and arg1 (normally 641 * the number of nanoseconds) is the number of nanoseconds 642 * late -- and it's stored in arg2. 643 */ 644 arg0 = "(uintptr_t)curthread->t_cpu + \n" 645 "\t curthread->t_cpu->cpu_profile_pil"; 646 caller = "(uintptr_t)arg0"; 647 arg1 = "arg2"; 648 } else { 649 arg0 = "(uintptr_t)arg0"; 650 caller = "caller"; 651 } 652 653 if (g_recsize > LS_HIST) { 654 for (depth = 0; g_recsize > LS_STACK(depth); depth++) 655 continue; 656 657 if (g_tracing) { 658 (void) sprintf(stack, "\tstack(%d);\n", depth); 659 } else { 660 (void) sprintf(stack, ", stack(%d)", depth); 661 } 662 } else { 663 (void) sprintf(stack, ""); 664 } 665 666 if (info->ev_acquire != NULL) { 667 /* 668 * If this is a hold event, we need to generate an additional 669 * clause for the acquire; the clause for the release will be 670 * generated with the aggregating statement, below. 671 */ 672 dprog_add("%s\n", info->ev_acquire); 673 predicate_add(&pred, info->ev_predicate, NULL, 0); 674 predicate_add(&pred, g_predicate, NULL, 0); 675 if (pred != NULL) 676 dprog_add("/%s/\n", pred); 677 678 dprog_add("{\n"); 679 (void) sprintf(buf, "self->ev%d[(uintptr_t)arg0]", event); 680 681 if (info->ev_type == 'H') { 682 dprog_add("\t%s = timestamp;\n", buf); 683 } else { 684 /* 685 * If this isn't a hold event, it's the recursive 686 * error event. For this, we simply bump the 687 * thread-local, per-lock count. 688 */ 689 dprog_add("\t%s++;\n", buf); 690 } 691 692 dprog_add("}\n\n"); 693 predicate_destroy(&pred); 694 pred = NULL; 695 696 if (info->ev_type == 'E') { 697 /* 698 * If this is the recursive lock error event, we need 699 * to generate an additional clause to decrement the 700 * thread-local, per-lock count. This assures that we 701 * only execute the aggregating clause if we have 702 * recursive entry. 703 */ 704 dprog_add("%s\n", info->ev_name); 705 dprog_add("/%s/\n{\n\t%s--;\n}\n\n", buf, buf); 706 } 707 708 predicate_add(&pred, buf, NULL, 0); 709 710 if (info->ev_type == 'H') { 711 (void) sprintf(buf, "timestamp -\n\t " 712 "self->ev%d[(uintptr_t)arg0]", event); 713 } 714 715 arg1 = buf; 716 } else { 717 predicate_add(&pred, info->ev_predicate, NULL, 0); 718 if (info->ev_type != 'I') 719 predicate_add(&pred, g_predicate, NULL, 0); 720 else 721 predicate_add(&pred, g_ipredicate, NULL, 0); 722 } 723 724 if ((dur = g_min_duration[event]) != 0) 725 predicate_add(&pred, arg1, ">=", dur); 726 727 dprog_add("%s\n", info->ev_name); 728 729 if (pred != NULL) 730 dprog_add("/%s/\n", pred); 731 predicate_destroy(&pred); 732 733 dprog_add("{\n"); 734 735 if (g_tracing) { 736 dprog_add("\ttrace(%dULL);\n", event); 737 dprog_add("\ttrace(%s);\n", arg0); 738 dprog_add("\ttrace(%s);\n", caller); 739 dprog_add(stack); 740 } else { 741 /* 742 * The ordering here is important: when we process the 743 * aggregate, we count on the fact that @avg appears before 744 * @hist in program order to assure that @avg is assigned the 745 * first aggregation variable ID and @hist assigned the 746 * second; see the comment in process_aggregate() for details. 747 */ 748 dprog_add("\t@avg[%dULL, %s, %s%s] = avg(%s);\n", 749 event, arg0, caller, stack, arg1); 750 751 if (g_recsize >= LS_HIST) { 752 dprog_add("\t@hist[%dULL, %s, %s%s] = quantize" 753 "(%s);\n", event, arg0, caller, stack, arg1); 754 } 755 } 756 757 if (info->ev_acquire != NULL) 758 dprog_add("\tself->ev%d[arg0] = 0;\n", event); 759 760 dprog_add("}\n\n"); 761 } 762 763 static void 764 dprog_compile() 765 { 766 dtrace_prog_t *prog; 767 dtrace_proginfo_t info; 768 769 if (g_Vflag) { 770 (void) fprintf(stderr, "lockstat: vvvv D program vvvv\n"); 771 (void) fputs(g_prog, stderr); 772 (void) fprintf(stderr, "lockstat: ^^^^ D program ^^^^\n"); 773 } 774 775 if ((prog = dtrace_program_strcompile(g_dtp, g_prog, 776 DTRACE_PROBESPEC_NAME, 0, 0, NULL)) == NULL) 777 dfail("failed to compile program"); 778 779 if (dtrace_program_exec(g_dtp, prog, &info) == -1) 780 dfail("failed to enable probes"); 781 782 if (dtrace_go(g_dtp) != 0) 783 dfail("couldn't start tracing"); 784 } 785 786 static void 787 status_fire(void) 788 {} 789 790 static void 791 status_init(void) 792 { 793 dtrace_optval_t val, status, agg; 794 struct sigaction act; 795 struct itimerspec ts; 796 struct sigevent ev; 797 timer_t tid; 798 799 if (dtrace_getopt(g_dtp, "statusrate", &status) == -1) 800 dfail("failed to get 'statusrate'"); 801 802 if (dtrace_getopt(g_dtp, "aggrate", &agg) == -1) 803 dfail("failed to get 'statusrate'"); 804 805 /* 806 * We would want to awaken at a rate that is the GCD of the statusrate 807 * and the aggrate -- but that seems a bit absurd. Instead, we'll 808 * simply awaken at a rate that is the more frequent of the two, which 809 * assures that we're never later than the interval implied by the 810 * more frequent rate. 811 */ 812 val = status < agg ? status : agg; 813 814 (void) sigemptyset(&act.sa_mask); 815 act.sa_flags = 0; 816 act.sa_handler = status_fire; 817 (void) sigaction(SIGUSR1, &act, NULL); 818 819 ev.sigev_notify = SIGEV_SIGNAL; 820 ev.sigev_signo = SIGUSR1; 821 822 if (timer_create(CLOCK_REALTIME, &ev, &tid) == -1) 823 dfail("cannot create CLOCK_REALTIME timer"); 824 825 ts.it_value.tv_sec = val / NANOSEC; 826 ts.it_value.tv_nsec = val % NANOSEC; 827 ts.it_interval = ts.it_value; 828 829 if (timer_settime(tid, TIMER_RELTIME, &ts, NULL) == -1) 830 dfail("cannot set time on CLOCK_REALTIME timer"); 831 } 832 833 static void 834 status_check(void) 835 { 836 if (!g_tracing && dtrace_aggregate_snap(g_dtp) != 0) 837 dfail("failed to snap aggregate"); 838 839 if (dtrace_status(g_dtp) == -1) 840 dfail("dtrace_status()"); 841 } 842 843 static void 844 lsrec_fill(lsrec_t *lsrec, const dtrace_recdesc_t *rec, int nrecs, caddr_t data) 845 { 846 bzero(lsrec, g_recsize); 847 lsrec->ls_count = 1; 848 849 if ((g_recsize > LS_HIST && nrecs < 4) || (nrecs < 3)) 850 fail(0, "truncated DTrace record"); 851 852 if (rec->dtrd_size != sizeof (uint64_t)) 853 fail(0, "bad event size in first record"); 854 855 /* LINTED - alignment */ 856 lsrec->ls_event = (uint32_t)*((uint64_t *)(data + rec->dtrd_offset)); 857 rec++; 858 859 if (rec->dtrd_size != sizeof (uintptr_t)) 860 fail(0, "bad lock address size in second record"); 861 862 /* LINTED - alignment */ 863 lsrec->ls_lock = *((uintptr_t *)(data + rec->dtrd_offset)); 864 rec++; 865 866 if (rec->dtrd_size != sizeof (uintptr_t)) 867 fail(0, "bad caller size in third record"); 868 869 /* LINTED - alignment */ 870 lsrec->ls_caller = *((uintptr_t *)(data + rec->dtrd_offset)); 871 rec++; 872 873 if (g_recsize > LS_HIST) { 874 int frames, i; 875 pc_t *stack; 876 877 frames = rec->dtrd_size / sizeof (pc_t); 878 /* LINTED - alignment */ 879 stack = (pc_t *)(data + rec->dtrd_offset); 880 881 for (i = 1; i < frames; i++) 882 lsrec->ls_stack[i - 1] = stack[i]; 883 } 884 } 885 886 /*ARGSUSED*/ 887 static int 888 count_aggregate(const dtrace_aggdata_t *agg, void *arg) 889 { 890 *((size_t *)arg) += 1; 891 892 return (DTRACE_AGGWALK_NEXT); 893 } 894 895 static int 896 process_aggregate(const dtrace_aggdata_t *agg, void *arg) 897 { 898 const dtrace_aggdesc_t *aggdesc = agg->dtada_desc; 899 caddr_t data = agg->dtada_data; 900 lsdata_t *lsdata = arg; 901 lsrec_t *lsrec = lsdata->lsd_next; 902 const dtrace_recdesc_t *rec; 903 uint64_t *avg, *quantized; 904 int i, j; 905 906 assert(lsdata->lsd_count < g_nrecs); 907 908 /* 909 * Aggregation variable IDs are guaranteed to be generated in program 910 * order, and they are guaranteed to start from DTRACE_AGGVARIDNONE 911 * plus one. As "avg" appears before "hist" in program order, we know 912 * that "avg" will be allocated the first aggregation variable ID, and 913 * "hist" will be allocated the second aggregation variable ID -- and 914 * we therefore use the aggregation variable ID to differentiate the 915 * cases. 916 */ 917 if (aggdesc->dtagd_varid > DTRACE_AGGVARIDNONE + 1) { 918 /* 919 * If this is the histogram entry. We'll copy the quantized 920 * data into lc_hist, and jump over the rest. 921 */ 922 rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1]; 923 924 if (aggdesc->dtagd_varid != DTRACE_AGGVARIDNONE + 2) 925 fail(0, "bad variable ID in aggregation record"); 926 927 if (rec->dtrd_size != 928 DTRACE_QUANTIZE_NBUCKETS * sizeof (uint64_t)) 929 fail(0, "bad quantize size in aggregation record"); 930 931 /* LINTED - alignment */ 932 quantized = (uint64_t *)(data + rec->dtrd_offset); 933 934 for (i = DTRACE_QUANTIZE_ZEROBUCKET, j = 0; 935 i < DTRACE_QUANTIZE_NBUCKETS; i++, j++) 936 lsrec->ls_hist[j] = quantized[i]; 937 938 goto out; 939 } 940 941 lsrec_fill(lsrec, &aggdesc->dtagd_rec[1], 942 aggdesc->dtagd_nrecs - 1, data); 943 944 rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1]; 945 946 if (rec->dtrd_size != 2 * sizeof (uint64_t)) 947 fail(0, "bad avg size in aggregation record"); 948 949 /* LINTED - alignment */ 950 avg = (uint64_t *)(data + rec->dtrd_offset); 951 lsrec->ls_count = (uint32_t)avg[0]; 952 lsrec->ls_time = (uintptr_t)avg[1]; 953 954 if (g_recsize >= LS_HIST) 955 return (DTRACE_AGGWALK_NEXT); 956 957 out: 958 lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize); 959 lsdata->lsd_count++; 960 961 return (DTRACE_AGGWALK_NEXT); 962 } 963 964 static int 965 process_trace(const dtrace_probedata_t *pdata, void *arg) 966 { 967 lsdata_t *lsdata = arg; 968 lsrec_t *lsrec = lsdata->lsd_next; 969 dtrace_eprobedesc_t *edesc = pdata->dtpda_edesc; 970 caddr_t data = pdata->dtpda_data; 971 972 if (lsdata->lsd_count >= g_nrecs) 973 return (DTRACE_CONSUME_NEXT); 974 975 lsrec_fill(lsrec, edesc->dtepd_rec, edesc->dtepd_nrecs, data); 976 977 lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize); 978 lsdata->lsd_count++; 979 980 return (DTRACE_CONSUME_NEXT); 981 } 982 983 static int 984 process_data(FILE *out, char *data) 985 { 986 lsdata_t lsdata; 987 988 /* LINTED - alignment */ 989 lsdata.lsd_next = (lsrec_t *)data; 990 lsdata.lsd_count = 0; 991 992 if (g_tracing) { 993 if (dtrace_consume(g_dtp, out, 994 process_trace, NULL, &lsdata) != 0) 995 dfail("failed to consume buffer"); 996 997 return (lsdata.lsd_count); 998 } 999 1000 if (dtrace_aggregate_walk_keyvarsorted(g_dtp, 1001 process_aggregate, &lsdata) != 0) 1002 dfail("failed to walk aggregate"); 1003 1004 return (lsdata.lsd_count); 1005 } 1006 1007 /*ARGSUSED*/ 1008 static int 1009 drophandler(const dtrace_dropdata_t *data, void *arg) 1010 { 1011 g_dropped++; 1012 (void) fprintf(stderr, "lockstat: warning: %s", data->dtdda_msg); 1013 return (DTRACE_HANDLE_OK); 1014 } 1015 1016 int 1017 main(int argc, char **argv) 1018 { 1019 char *data_buf; 1020 lsrec_t *lsp, **current, **first, **sort_buf, **merge_buf; 1021 FILE *out = stdout; 1022 char c; 1023 pid_t child; 1024 int status; 1025 int i, j; 1026 hrtime_t duration; 1027 char *addrp, *offp, *sizep, *evp, *lastp, *p; 1028 uintptr_t addr; 1029 size_t size, off; 1030 int events_specified = 0; 1031 int exec_errno = 0; 1032 uint32_t event; 1033 char *filt = NULL, *ifilt = NULL; 1034 static uint64_t ev_count[LS_MAX_EVENTS + 1]; 1035 static uint64_t ev_time[LS_MAX_EVENTS + 1]; 1036 dtrace_optval_t aggsize; 1037 char aggstr[10]; 1038 long ncpus; 1039 int dynvar = 0; 1040 int err; 1041 1042 if ((g_dtp = dtrace_open(DTRACE_VERSION, 0, &err)) == NULL) { 1043 fail(0, "cannot open dtrace library: %s", 1044 dtrace_errmsg(NULL, err)); 1045 } 1046 1047 if (dtrace_handle_drop(g_dtp, &drophandler, NULL) == -1) 1048 dfail("couldn't establish drop handler"); 1049 1050 if (symtab_init() == -1) 1051 fail(1, "can't load kernel symbols"); 1052 1053 g_nrecs = DEFAULT_NRECS; 1054 1055 while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != EOF) { 1056 switch (c) { 1057 case 'b': 1058 g_recsize = LS_BASIC; 1059 break; 1060 1061 case 't': 1062 g_recsize = LS_TIME; 1063 break; 1064 1065 case 'h': 1066 g_recsize = LS_HIST; 1067 break; 1068 1069 case 's': 1070 if (!isdigit(optarg[0])) 1071 usage(); 1072 g_stkdepth = atoi(optarg); 1073 if (g_stkdepth > LS_MAX_STACK_DEPTH) 1074 fail(0, "max stack depth is %d", 1075 LS_MAX_STACK_DEPTH); 1076 g_recsize = LS_STACK(g_stkdepth); 1077 break; 1078 1079 case 'n': 1080 if (!isdigit(optarg[0])) 1081 usage(); 1082 g_nrecs = atoi(optarg); 1083 break; 1084 1085 case 'd': 1086 if (!isdigit(optarg[0])) 1087 usage(); 1088 duration = atoll(optarg); 1089 1090 /* 1091 * XXX -- durations really should be per event 1092 * since the units are different, but it's hard 1093 * to express this nicely in the interface. 1094 * Not clear yet what the cleanest solution is. 1095 */ 1096 for (i = 0; i < LS_MAX_EVENTS; i++) 1097 if (g_event_info[i].ev_type != 'E') 1098 g_min_duration[i] = duration; 1099 1100 break; 1101 1102 case 'i': 1103 if (!isdigit(optarg[0])) 1104 usage(); 1105 i = atoi(optarg); 1106 if (i <= 0) 1107 usage(); 1108 if (i > MAX_HZ) 1109 fail(0, "max interrupt rate is %d Hz", MAX_HZ); 1110 1111 for (j = 0; j < LS_MAX_EVENTS; j++) 1112 if (strcmp(g_event_info[j].ev_desc, 1113 "Profiling interrupt") == 0) 1114 break; 1115 1116 (void) sprintf(g_event_info[j].ev_name, 1117 "profile:::profile-%d", i); 1118 break; 1119 1120 case 'l': 1121 case 'f': 1122 addrp = strtok(optarg, ","); 1123 sizep = strtok(NULL, ","); 1124 addrp = strtok(optarg, ",+"); 1125 offp = strtok(NULL, ","); 1126 1127 size = sizep ? strtoul(sizep, NULL, 0) : 1; 1128 off = offp ? strtoul(offp, NULL, 0) : 0; 1129 1130 if (addrp[0] == '0') { 1131 addr = strtoul(addrp, NULL, 16) + off; 1132 } else { 1133 addr = sym_to_addr(addrp) + off; 1134 if (sizep == NULL) 1135 size = sym_size(addrp) - off; 1136 if (addr - off == 0) 1137 fail(0, "symbol '%s' not found", addrp); 1138 if (size == 0) 1139 size = 1; 1140 } 1141 1142 1143 if (c == 'l') { 1144 filter_add(&filt, "arg0", addr, size); 1145 } else { 1146 filter_add(&filt, "caller", addr, size); 1147 filter_add(&ifilt, "arg0", addr, size); 1148 } 1149 break; 1150 1151 case 'e': 1152 evp = strtok_r(optarg, ",", &lastp); 1153 while (evp) { 1154 int ev1, ev2; 1155 char *evp2; 1156 1157 (void) strtok(evp, "-"); 1158 evp2 = strtok(NULL, "-"); 1159 ev1 = atoi(evp); 1160 ev2 = evp2 ? atoi(evp2) : ev1; 1161 if ((uint_t)ev1 >= LS_MAX_EVENTS || 1162 (uint_t)ev2 >= LS_MAX_EVENTS || ev1 > ev2) 1163 fail(0, "-e events out of range"); 1164 for (i = ev1; i <= ev2; i++) 1165 g_enabled[i] = 1; 1166 evp = strtok_r(NULL, ",", &lastp); 1167 } 1168 events_specified = 1; 1169 break; 1170 1171 case 'c': 1172 g_cflag = 1; 1173 break; 1174 1175 case 'k': 1176 g_kflag = 1; 1177 break; 1178 1179 case 'w': 1180 g_wflag = 1; 1181 break; 1182 1183 case 'W': 1184 g_Wflag = 1; 1185 break; 1186 1187 case 'g': 1188 g_gflag = 1; 1189 break; 1190 1191 case 'C': 1192 case 'E': 1193 case 'H': 1194 case 'I': 1195 for (i = 0; i < LS_MAX_EVENTS; i++) 1196 if (g_event_info[i].ev_type == c) 1197 g_enabled[i] = 1; 1198 events_specified = 1; 1199 break; 1200 1201 case 'A': 1202 for (i = 0; i < LS_MAX_EVENTS; i++) 1203 if (strchr("CH", g_event_info[i].ev_type)) 1204 g_enabled[i] = 1; 1205 events_specified = 1; 1206 break; 1207 1208 case 'T': 1209 g_tracing = 1; 1210 break; 1211 1212 case 'D': 1213 if (!isdigit(optarg[0])) 1214 usage(); 1215 g_topn = atoi(optarg); 1216 break; 1217 1218 case 'R': 1219 g_rates = 1; 1220 break; 1221 1222 case 'p': 1223 g_pflag = 1; 1224 break; 1225 1226 case 'P': 1227 g_Pflag = 1; 1228 break; 1229 1230 case 'o': 1231 if ((out = fopen(optarg, "w")) == NULL) 1232 fail(1, "error opening file"); 1233 break; 1234 1235 case 'V': 1236 g_Vflag = 1; 1237 break; 1238 1239 default: 1240 if (strchr(LOCKSTAT_OPTSTR, c) == NULL) 1241 usage(); 1242 } 1243 } 1244 1245 if (filt != NULL) { 1246 predicate_add(&g_predicate, filt, NULL, 0); 1247 filter_destroy(&filt); 1248 } 1249 1250 if (ifilt != NULL) { 1251 predicate_add(&g_ipredicate, ifilt, NULL, 0); 1252 filter_destroy(&ifilt); 1253 } 1254 1255 if (g_recsize == 0) { 1256 if (g_gflag) { 1257 g_stkdepth = LS_MAX_STACK_DEPTH; 1258 g_recsize = LS_STACK(g_stkdepth); 1259 } else { 1260 g_recsize = LS_TIME; 1261 } 1262 } 1263 1264 if (g_gflag && g_recsize <= LS_STACK(0)) 1265 fail(0, "'-g' requires at least '-s 1' data gathering"); 1266 1267 /* 1268 * Make sure the alignment is reasonable 1269 */ 1270 g_recsize = -(-g_recsize & -sizeof (uint64_t)); 1271 1272 for (i = 0; i < LS_MAX_EVENTS; i++) { 1273 /* 1274 * If no events were specified, enable -C. 1275 */ 1276 if (!events_specified && g_event_info[i].ev_type == 'C') 1277 g_enabled[i] = 1; 1278 } 1279 1280 for (i = 0; i < LS_MAX_EVENTS; i++) { 1281 if (!g_enabled[i]) 1282 continue; 1283 1284 if (g_event_info[i].ev_acquire != NULL) { 1285 /* 1286 * If we've enabled a hold event, we must explicitly 1287 * allocate dynamic variable space. 1288 */ 1289 dynvar = 1; 1290 } 1291 1292 dprog_addevent(i); 1293 } 1294 1295 /* 1296 * Make sure there are remaining arguments to specify a child command 1297 * to execute. 1298 */ 1299 if (argc <= optind) 1300 usage(); 1301 1302 if ((ncpus = sysconf(_SC_NPROCESSORS_ONLN)) == -1) 1303 dfail("couldn't determine number of online CPUs"); 1304 1305 /* 1306 * By default, we set our data buffer size to be the number of records 1307 * multiplied by the size of the record, doubled to account for some 1308 * DTrace slop and divided by the number of CPUs. We silently clamp 1309 * the aggregation size at both a minimum and a maximum to prevent 1310 * absurdly low or high values. 1311 */ 1312 if ((aggsize = (g_nrecs * g_recsize * 2) / ncpus) < MIN_AGGSIZE) 1313 aggsize = MIN_AGGSIZE; 1314 1315 if (aggsize > MAX_AGGSIZE) 1316 aggsize = MAX_AGGSIZE; 1317 1318 (void) sprintf(aggstr, "%lld", (long long)aggsize); 1319 1320 if (!g_tracing) { 1321 if (dtrace_setopt(g_dtp, "bufsize", "4k") == -1) 1322 dfail("failed to set 'bufsize'"); 1323 1324 if (dtrace_setopt(g_dtp, "aggsize", aggstr) == -1) 1325 dfail("failed to set 'aggsize'"); 1326 1327 if (dynvar) { 1328 /* 1329 * If we're using dynamic variables, we set our 1330 * dynamic variable size to be one megabyte per CPU, 1331 * with a hard-limit of 32 megabytes. This may still 1332 * be too small in some cases, but it can be tuned 1333 * manually via -x if need be. 1334 */ 1335 (void) sprintf(aggstr, "%ldm", ncpus < 32 ? ncpus : 32); 1336 1337 if (dtrace_setopt(g_dtp, "dynvarsize", aggstr) == -1) 1338 dfail("failed to set 'dynvarsize'"); 1339 } 1340 } else { 1341 if (dtrace_setopt(g_dtp, "bufsize", aggstr) == -1) 1342 dfail("failed to set 'bufsize'"); 1343 } 1344 1345 if (dtrace_setopt(g_dtp, "statusrate", "10sec") == -1) 1346 dfail("failed to set 'statusrate'"); 1347 1348 optind = 1; 1349 while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != EOF) { 1350 switch (c) { 1351 case 'x': 1352 if ((p = strchr(optarg, '=')) != NULL) 1353 *p++ = '\0'; 1354 1355 if (dtrace_setopt(g_dtp, optarg, p) != 0) 1356 dfail("failed to set -x %s", optarg); 1357 break; 1358 } 1359 } 1360 1361 argc -= optind; 1362 argv += optind; 1363 1364 dprog_compile(); 1365 status_init(); 1366 1367 g_elapsed = -gethrtime(); 1368 1369 /* 1370 * Spawn the specified command and wait for it to complete. 1371 */ 1372 child = fork(); 1373 if (child == -1) 1374 fail(1, "cannot fork"); 1375 if (child == 0) { 1376 (void) dtrace_close(g_dtp); 1377 (void) execvp(argv[0], &argv[0]); 1378 exec_errno = errno; 1379 exit(127); 1380 } 1381 1382 while (waitpid(child, &status, WEXITED) != child) 1383 status_check(); 1384 1385 g_elapsed += gethrtime(); 1386 1387 if (WIFEXITED(status)) { 1388 if (WEXITSTATUS(status) != 0) { 1389 if (exec_errno != 0) { 1390 errno = exec_errno; 1391 fail(1, "could not execute %s", argv[0]); 1392 } 1393 (void) fprintf(stderr, 1394 "lockstat: warning: %s exited with code %d\n", 1395 argv[0], WEXITSTATUS(status)); 1396 } 1397 } else { 1398 (void) fprintf(stderr, 1399 "lockstat: warning: %s died on signal %d\n", 1400 argv[0], WTERMSIG(status)); 1401 } 1402 1403 if (dtrace_stop(g_dtp) == -1) 1404 dfail("failed to stop dtrace"); 1405 1406 /* 1407 * Before we read out the results, we need to allocate our buffer. 1408 * If we're tracing, then we'll just use the precalculated size. If 1409 * we're not, then we'll take a snapshot of the aggregate, and walk 1410 * it to count the number of records. 1411 */ 1412 if (!g_tracing) { 1413 if (dtrace_aggregate_snap(g_dtp) != 0) 1414 dfail("failed to snap aggregate"); 1415 1416 g_nrecs = 0; 1417 1418 if (dtrace_aggregate_walk(g_dtp, 1419 count_aggregate, &g_nrecs) != 0) 1420 dfail("failed to walk aggregate"); 1421 } 1422 1423 if ((data_buf = memalign(sizeof (uint64_t), 1424 (g_nrecs + 1) * g_recsize)) == NULL) 1425 fail(1, "Memory allocation failed"); 1426 1427 /* 1428 * Read out the DTrace data. 1429 */ 1430 g_nrecs_used = process_data(out, data_buf); 1431 1432 if (g_nrecs_used > g_nrecs || g_dropped) 1433 (void) fprintf(stderr, "lockstat: warning: " 1434 "ran out of data records (use -n for more)\n"); 1435 1436 /* LINTED - alignment */ 1437 for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++, 1438 /* LINTED - alignment */ 1439 lsp = (lsrec_t *)((char *)lsp + g_recsize)) { 1440 ev_count[lsp->ls_event] += lsp->ls_count; 1441 ev_time[lsp->ls_event] += lsp->ls_time; 1442 } 1443 1444 /* 1445 * If -g was specified, convert stacks into individual records. 1446 */ 1447 if (g_gflag) { 1448 lsrec_t *newlsp, *oldlsp; 1449 1450 newlsp = memalign(sizeof (uint64_t), 1451 g_nrecs_used * LS_TIME * (g_stkdepth + 1)); 1452 if (newlsp == NULL) 1453 fail(1, "Cannot allocate space for -g processing"); 1454 lsp = newlsp; 1455 /* LINTED - alignment */ 1456 for (i = 0, oldlsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++, 1457 /* LINTED - alignment */ 1458 oldlsp = (lsrec_t *)((char *)oldlsp + g_recsize)) { 1459 int fr; 1460 int caller_in_stack = 0; 1461 1462 if (oldlsp->ls_count == 0) 1463 continue; 1464 1465 for (fr = 0; fr < g_stkdepth; fr++) { 1466 if (oldlsp->ls_stack[fr] == 0) 1467 break; 1468 if (oldlsp->ls_stack[fr] == oldlsp->ls_caller) 1469 caller_in_stack = 1; 1470 bcopy(oldlsp, lsp, LS_TIME); 1471 lsp->ls_caller = oldlsp->ls_stack[fr]; 1472 /* LINTED - alignment */ 1473 lsp = (lsrec_t *)((char *)lsp + LS_TIME); 1474 } 1475 if (!caller_in_stack) { 1476 bcopy(oldlsp, lsp, LS_TIME); 1477 /* LINTED - alignment */ 1478 lsp = (lsrec_t *)((char *)lsp + LS_TIME); 1479 } 1480 } 1481 g_nrecs = g_nrecs_used = 1482 ((uintptr_t)lsp - (uintptr_t)newlsp) / LS_TIME; 1483 g_recsize = LS_TIME; 1484 g_stkdepth = 0; 1485 free(data_buf); 1486 data_buf = (char *)newlsp; 1487 } 1488 1489 if ((sort_buf = calloc(2 * (g_nrecs + 1), 1490 sizeof (void *))) == NULL) 1491 fail(1, "Sort buffer allocation failed"); 1492 merge_buf = sort_buf + (g_nrecs + 1); 1493 1494 /* 1495 * Build the sort buffer, discarding zero-count records along the way. 1496 */ 1497 /* LINTED - alignment */ 1498 for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++, 1499 /* LINTED - alignment */ 1500 lsp = (lsrec_t *)((char *)lsp + g_recsize)) { 1501 if (lsp->ls_count == 0) 1502 lsp->ls_event = LS_MAX_EVENTS; 1503 sort_buf[i] = lsp; 1504 } 1505 1506 if (g_nrecs_used == 0) 1507 exit(0); 1508 1509 /* 1510 * Add a sentinel after the last record 1511 */ 1512 sort_buf[i] = lsp; 1513 lsp->ls_event = LS_MAX_EVENTS; 1514 1515 if (g_tracing) { 1516 report_trace(out, sort_buf); 1517 return (0); 1518 } 1519 1520 /* 1521 * Application of -g may have resulted in multiple records 1522 * with the same signature; coalesce them. 1523 */ 1524 if (g_gflag) { 1525 mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used); 1526 coalesce(lockcmp, sort_buf, g_nrecs_used); 1527 } 1528 1529 /* 1530 * Coalesce locks within the same symbol if -c option specified. 1531 * Coalesce PCs within the same function if -k option specified. 1532 */ 1533 if (g_cflag || g_kflag) { 1534 for (i = 0; i < g_nrecs_used; i++) { 1535 int fr; 1536 lsp = sort_buf[i]; 1537 if (g_cflag) 1538 coalesce_symbol(&lsp->ls_lock); 1539 if (g_kflag) { 1540 for (fr = 0; fr < g_stkdepth; fr++) 1541 coalesce_symbol(&lsp->ls_stack[fr]); 1542 coalesce_symbol(&lsp->ls_caller); 1543 } 1544 } 1545 mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used); 1546 coalesce(lockcmp, sort_buf, g_nrecs_used); 1547 } 1548 1549 /* 1550 * Coalesce callers if -w option specified 1551 */ 1552 if (g_wflag) { 1553 mergesort(lock_and_count_cmp_anywhere, 1554 sort_buf, merge_buf, g_nrecs_used); 1555 coalesce(lockcmp_anywhere, sort_buf, g_nrecs_used); 1556 } 1557 1558 /* 1559 * Coalesce locks if -W option specified 1560 */ 1561 if (g_Wflag) { 1562 mergesort(site_and_count_cmp_anylock, 1563 sort_buf, merge_buf, g_nrecs_used); 1564 coalesce(sitecmp_anylock, sort_buf, g_nrecs_used); 1565 } 1566 1567 /* 1568 * Sort data by contention count (ls_count) or total time (ls_time), 1569 * depending on g_Pflag. Override g_Pflag if time wasn't measured. 1570 */ 1571 if (g_recsize < LS_TIME) 1572 g_Pflag = 0; 1573 1574 if (g_Pflag) 1575 mergesort(timecmp, sort_buf, merge_buf, g_nrecs_used); 1576 else 1577 mergesort(countcmp, sort_buf, merge_buf, g_nrecs_used); 1578 1579 /* 1580 * Display data by event type 1581 */ 1582 first = &sort_buf[0]; 1583 while ((event = (*first)->ls_event) < LS_MAX_EVENTS) { 1584 current = first; 1585 while ((lsp = *current)->ls_event == event) 1586 current++; 1587 report_stats(out, first, current - first, ev_count[event], 1588 ev_time[event]); 1589 first = current; 1590 } 1591 1592 return (0); 1593 } 1594 1595 static char * 1596 format_symbol(char *buf, uintptr_t addr, int show_size) 1597 { 1598 uintptr_t symoff; 1599 char *symname; 1600 size_t symsize; 1601 1602 symname = addr_to_sym(addr, &symoff, &symsize); 1603 1604 if (show_size && symoff == 0) 1605 (void) sprintf(buf, "%s[%ld]", symname, (long)symsize); 1606 else if (symoff == 0) 1607 (void) sprintf(buf, "%s", symname); 1608 else if (symoff < 16 && bcmp(symname, "cpu[", 4) == 0) /* CPU+PIL */ 1609 (void) sprintf(buf, "%s+%ld", symname, (long)symoff); 1610 else if (symoff <= symsize || (symoff < 256 && addr != symoff)) 1611 (void) sprintf(buf, "%s+0x%llx", symname, 1612 (unsigned long long)symoff); 1613 else 1614 (void) sprintf(buf, "0x%llx", (unsigned long long)addr); 1615 return (buf); 1616 } 1617 1618 static void 1619 report_stats(FILE *out, lsrec_t **sort_buf, size_t nrecs, uint64_t total_count, 1620 uint64_t total_time) 1621 { 1622 uint32_t event = sort_buf[0]->ls_event; 1623 lsrec_t *lsp; 1624 double ptotal = 0.0; 1625 double percent; 1626 int i, j, fr; 1627 int displayed; 1628 int first_bin, last_bin, max_bin_count, total_bin_count; 1629 int rectype; 1630 char buf[256]; 1631 char lhdr[80], chdr[80]; 1632 1633 rectype = g_recsize; 1634 1635 if (g_topn == 0) { 1636 (void) fprintf(out, "%20llu %s\n", 1637 g_rates == 0 ? total_count : 1638 ((unsigned long long)total_count * NANOSEC) / g_elapsed, 1639 g_event_info[event].ev_desc); 1640 return; 1641 } 1642 1643 (void) sprintf(lhdr, "%s%s", 1644 g_Wflag ? "Hottest " : "", g_event_info[event].ev_lhdr); 1645 (void) sprintf(chdr, "%s%s", 1646 g_wflag ? "Hottest " : "", "Caller"); 1647 1648 if (!g_pflag) 1649 (void) fprintf(out, 1650 "\n%s: %.0f events in %.3f seconds (%.0f events/sec)\n\n", 1651 g_event_info[event].ev_desc, (double)total_count, 1652 (double)g_elapsed / NANOSEC, 1653 (double)total_count * NANOSEC / g_elapsed); 1654 1655 if (!g_pflag && rectype < LS_HIST) { 1656 (void) sprintf(buf, "%s", g_event_info[event].ev_units); 1657 (void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n", 1658 g_rates ? "ops/s" : "Count", 1659 g_gflag ? "genr" : "indv", 1660 "cuml", "rcnt", rectype >= LS_TIME ? buf : "", lhdr, chdr); 1661 (void) fprintf(out, "---------------------------------" 1662 "----------------------------------------------\n"); 1663 } 1664 1665 displayed = 0; 1666 for (i = 0; i < nrecs; i++) { 1667 lsp = sort_buf[i]; 1668 1669 if (displayed++ >= g_topn) 1670 break; 1671 1672 if (g_pflag) { 1673 int j; 1674 1675 (void) fprintf(out, "%u %u", 1676 lsp->ls_event, lsp->ls_count); 1677 (void) fprintf(out, " %s", 1678 format_symbol(buf, lsp->ls_lock, g_cflag)); 1679 (void) fprintf(out, " %s", 1680 format_symbol(buf, lsp->ls_caller, 0)); 1681 (void) fprintf(out, " %f", 1682 (double)lsp->ls_refcnt / lsp->ls_count); 1683 if (rectype >= LS_TIME) 1684 (void) fprintf(out, " %llu", 1685 (unsigned long long)lsp->ls_time); 1686 if (rectype >= LS_HIST) { 1687 for (j = 0; j < 64; j++) 1688 (void) fprintf(out, " %u", 1689 lsp->ls_hist[j]); 1690 } 1691 for (j = 0; j < LS_MAX_STACK_DEPTH; j++) { 1692 if (rectype <= LS_STACK(j) || 1693 lsp->ls_stack[j] == 0) 1694 break; 1695 (void) fprintf(out, " %s", 1696 format_symbol(buf, lsp->ls_stack[j], 0)); 1697 } 1698 (void) fprintf(out, "\n"); 1699 continue; 1700 } 1701 1702 if (rectype >= LS_HIST) { 1703 (void) fprintf(out, "---------------------------------" 1704 "----------------------------------------------\n"); 1705 (void) sprintf(buf, "%s", 1706 g_event_info[event].ev_units); 1707 (void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n", 1708 g_rates ? "ops/s" : "Count", 1709 g_gflag ? "genr" : "indv", 1710 "cuml", "rcnt", buf, lhdr, chdr); 1711 } 1712 1713 if (g_Pflag && total_time != 0) 1714 percent = (lsp->ls_time * 100.00) / total_time; 1715 else 1716 percent = (lsp->ls_count * 100.00) / total_count; 1717 1718 ptotal += percent; 1719 1720 if (rectype >= LS_TIME) 1721 (void) sprintf(buf, "%llu", 1722 (unsigned long long)(lsp->ls_time / lsp->ls_count)); 1723 else 1724 buf[0] = '\0'; 1725 1726 (void) fprintf(out, "%5llu ", 1727 g_rates == 0 ? lsp->ls_count : 1728 ((uint64_t)lsp->ls_count * NANOSEC) / g_elapsed); 1729 1730 (void) fprintf(out, "%3.0f%% ", percent); 1731 1732 if (g_gflag) 1733 (void) fprintf(out, "---- "); 1734 else 1735 (void) fprintf(out, "%3.0f%% ", ptotal); 1736 1737 (void) fprintf(out, "%4.2f %8s ", 1738 (double)lsp->ls_refcnt / lsp->ls_count, buf); 1739 1740 (void) fprintf(out, "%-22s ", 1741 format_symbol(buf, lsp->ls_lock, g_cflag)); 1742 1743 (void) fprintf(out, "%-24s\n", 1744 format_symbol(buf, lsp->ls_caller, 0)); 1745 1746 if (rectype < LS_HIST) 1747 continue; 1748 1749 (void) fprintf(out, "\n"); 1750 (void) fprintf(out, "%10s %31s %-9s %-24s\n", 1751 g_event_info[event].ev_units, 1752 "------ Time Distribution ------", 1753 g_rates ? "ops/s" : "count", 1754 rectype > LS_STACK(0) ? "Stack" : ""); 1755 1756 first_bin = 0; 1757 while (lsp->ls_hist[first_bin] == 0) 1758 first_bin++; 1759 1760 last_bin = 63; 1761 while (lsp->ls_hist[last_bin] == 0) 1762 last_bin--; 1763 1764 max_bin_count = 0; 1765 total_bin_count = 0; 1766 for (j = first_bin; j <= last_bin; j++) { 1767 total_bin_count += lsp->ls_hist[j]; 1768 if (lsp->ls_hist[j] > max_bin_count) 1769 max_bin_count = lsp->ls_hist[j]; 1770 } 1771 1772 /* 1773 * If we went a few frames below the caller, ignore them 1774 */ 1775 for (fr = 3; fr > 0; fr--) 1776 if (lsp->ls_stack[fr] == lsp->ls_caller) 1777 break; 1778 1779 for (j = first_bin; j <= last_bin; j++) { 1780 uint_t depth = (lsp->ls_hist[j] * 30) / total_bin_count; 1781 (void) fprintf(out, "%10llu |%s%s %-9u ", 1782 1ULL << j, 1783 "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + 30 - depth, 1784 " " + depth, 1785 g_rates == 0 ? lsp->ls_hist[j] : 1786 (uint_t)(((uint64_t)lsp->ls_hist[j] * NANOSEC) / 1787 g_elapsed)); 1788 if (rectype <= LS_STACK(fr) || lsp->ls_stack[fr] == 0) { 1789 (void) fprintf(out, "\n"); 1790 continue; 1791 } 1792 (void) fprintf(out, "%-24s\n", 1793 format_symbol(buf, lsp->ls_stack[fr], 0)); 1794 fr++; 1795 } 1796 while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) { 1797 (void) fprintf(out, "%15s %-36s %-24s\n", "", "", 1798 format_symbol(buf, lsp->ls_stack[fr], 0)); 1799 fr++; 1800 } 1801 } 1802 1803 if (!g_pflag) 1804 (void) fprintf(out, "---------------------------------" 1805 "----------------------------------------------\n"); 1806 1807 (void) fflush(out); 1808 } 1809 1810 static void 1811 report_trace(FILE *out, lsrec_t **sort_buf) 1812 { 1813 lsrec_t *lsp; 1814 int i, fr; 1815 int rectype; 1816 char buf[256], buf2[256]; 1817 1818 rectype = g_recsize; 1819 1820 if (!g_pflag) { 1821 (void) fprintf(out, "%5s %7s %11s %-24s %-24s\n", 1822 "Event", "Time", "Owner", "Lock", "Caller"); 1823 (void) fprintf(out, "---------------------------------" 1824 "----------------------------------------------\n"); 1825 } 1826 1827 for (i = 0; i < g_nrecs_used; i++) { 1828 1829 lsp = sort_buf[i]; 1830 1831 if (lsp->ls_event >= LS_MAX_EVENTS || lsp->ls_count == 0) 1832 continue; 1833 1834 (void) fprintf(out, "%2d %10llu %11p %-24s %-24s\n", 1835 lsp->ls_event, (unsigned long long)lsp->ls_time, 1836 (void *)lsp->ls_next, 1837 format_symbol(buf, lsp->ls_lock, 0), 1838 format_symbol(buf2, lsp->ls_caller, 0)); 1839 1840 if (rectype <= LS_STACK(0)) 1841 continue; 1842 1843 /* 1844 * If we went a few frames below the caller, ignore them 1845 */ 1846 for (fr = 3; fr > 0; fr--) 1847 if (lsp->ls_stack[fr] == lsp->ls_caller) 1848 break; 1849 1850 while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) { 1851 (void) fprintf(out, "%53s %-24s\n", "", 1852 format_symbol(buf, lsp->ls_stack[fr], 0)); 1853 fr++; 1854 } 1855 (void) fprintf(out, "\n"); 1856 } 1857 1858 (void) fflush(out); 1859 } 1860