1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <stdio.h> 27 #include <stddef.h> 28 #include <stdlib.h> 29 #include <stdarg.h> 30 #include <string.h> 31 #include <strings.h> 32 #include <ctype.h> 33 #include <fcntl.h> 34 #include <unistd.h> 35 #include <errno.h> 36 #include <limits.h> 37 #include <sys/types.h> 38 #include <sys/modctl.h> 39 #include <sys/stat.h> 40 #include <sys/wait.h> 41 #include <dtrace.h> 42 #include <sys/lockstat.h> 43 #include <alloca.h> 44 #include <signal.h> 45 #include <assert.h> 46 47 #define LOCKSTAT_OPTSTR "x:bths:n:d:i:l:f:e:ckwWgCHEATID:RpPo:V" 48 49 #define LS_MAX_STACK_DEPTH 50 50 #define LS_MAX_EVENTS 64 51 52 typedef struct lsrec { 53 struct lsrec *ls_next; /* next in hash chain */ 54 uintptr_t ls_lock; /* lock address */ 55 uintptr_t ls_caller; /* caller address */ 56 uint32_t ls_count; /* cumulative event count */ 57 uint32_t ls_event; /* type of event */ 58 uintptr_t ls_refcnt; /* cumulative reference count */ 59 uint64_t ls_time; /* cumulative event duration */ 60 uint32_t ls_hist[64]; /* log2(duration) histogram */ 61 uintptr_t ls_stack[LS_MAX_STACK_DEPTH]; 62 } lsrec_t; 63 64 typedef struct lsdata { 65 struct lsrec *lsd_next; /* next available */ 66 int lsd_count; /* number of records */ 67 } lsdata_t; 68 69 /* 70 * Definitions for the types of experiments which can be run. They are 71 * listed in increasing order of memory cost and processing time cost. 72 * The numerical value of each type is the number of bytes needed per record. 73 */ 74 #define LS_BASIC offsetof(lsrec_t, ls_time) 75 #define LS_TIME offsetof(lsrec_t, ls_hist[0]) 76 #define LS_HIST offsetof(lsrec_t, ls_stack[0]) 77 #define LS_STACK(depth) offsetof(lsrec_t, ls_stack[depth]) 78 79 static void report_stats(FILE *, lsrec_t **, size_t, uint64_t, uint64_t); 80 static void report_trace(FILE *, lsrec_t **); 81 82 extern int symtab_init(void); 83 extern char *addr_to_sym(uintptr_t, uintptr_t *, size_t *); 84 extern uintptr_t sym_to_addr(char *name); 85 extern size_t sym_size(char *name); 86 extern char *strtok_r(char *, const char *, char **); 87 88 #define DEFAULT_NRECS 10000 89 #define DEFAULT_HZ 97 90 #define MAX_HZ 1000 91 #define MIN_AGGSIZE (16 * 1024) 92 #define MAX_AGGSIZE (32 * 1024 * 1024) 93 94 static int g_stkdepth; 95 static int g_topn = INT_MAX; 96 static hrtime_t g_elapsed; 97 static int g_rates = 0; 98 static int g_pflag = 0; 99 static int g_Pflag = 0; 100 static int g_wflag = 0; 101 static int g_Wflag = 0; 102 static int g_cflag = 0; 103 static int g_kflag = 0; 104 static int g_gflag = 0; 105 static int g_Vflag = 0; 106 static int g_tracing = 0; 107 static size_t g_recsize; 108 static size_t g_nrecs; 109 static int g_nrecs_used; 110 static uchar_t g_enabled[LS_MAX_EVENTS]; 111 static hrtime_t g_min_duration[LS_MAX_EVENTS]; 112 static dtrace_hdl_t *g_dtp; 113 static char *g_predicate; 114 static char *g_ipredicate; 115 static char *g_prog; 116 static int g_proglen; 117 static int g_dropped; 118 119 typedef struct ls_event_info { 120 char ev_type; 121 char ev_lhdr[20]; 122 char ev_desc[80]; 123 char ev_units[10]; 124 char ev_name[DTRACE_NAMELEN]; 125 char *ev_predicate; 126 char *ev_acquire; 127 } ls_event_info_t; 128 129 static ls_event_info_t g_event_info[LS_MAX_EVENTS] = { 130 { 'C', "Lock", "Adaptive mutex spin", "nsec", 131 "lockstat:::adaptive-spin" }, 132 { 'C', "Lock", "Adaptive mutex block", "nsec", 133 "lockstat:::adaptive-block" }, 134 { 'C', "Lock", "Spin lock spin", "nsec", 135 "lockstat:::spin-spin" }, 136 { 'C', "Lock", "Thread lock spin", "nsec", 137 "lockstat:::thread-spin" }, 138 { 'C', "Lock", "R/W writer blocked by writer", "nsec", 139 "lockstat:::rw-block", "arg2 == 0 && arg3 == 1" }, 140 { 'C', "Lock", "R/W writer blocked by readers", "nsec", 141 "lockstat:::rw-block", "arg2 == 0 && arg3 == 0 && arg4" }, 142 { 'C', "Lock", "R/W reader blocked by writer", "nsec", 143 "lockstat:::rw-block", "arg2 != 0 && arg3 == 1" }, 144 { 'C', "Lock", "R/W reader blocked by write wanted", "nsec", 145 "lockstat:::rw-block", "arg2 != 0 && arg3 == 0 && arg4" }, 146 { 'C', "Lock", "Unknown event (type 8)", "units" }, 147 { 'C', "Lock", "Unknown event (type 9)", "units" }, 148 { 'C', "Lock", "Unknown event (type 10)", "units" }, 149 { 'C', "Lock", "Unknown event (type 11)", "units" }, 150 { 'C', "Lock", "Unknown event (type 12)", "units" }, 151 { 'C', "Lock", "Unknown event (type 13)", "units" }, 152 { 'C', "Lock", "Unknown event (type 14)", "units" }, 153 { 'C', "Lock", "Unknown event (type 15)", "units" }, 154 { 'C', "Lock", "Unknown event (type 16)", "units" }, 155 { 'C', "Lock", "Unknown event (type 17)", "units" }, 156 { 'C', "Lock", "Unknown event (type 18)", "units" }, 157 { 'C', "Lock", "Unknown event (type 19)", "units" }, 158 { 'C', "Lock", "Unknown event (type 20)", "units" }, 159 { 'C', "Lock", "Unknown event (type 21)", "units" }, 160 { 'C', "Lock", "Unknown event (type 22)", "units" }, 161 { 'C', "Lock", "Unknown event (type 23)", "units" }, 162 { 'C', "Lock", "Unknown event (type 24)", "units" }, 163 { 'C', "Lock", "Unknown event (type 25)", "units" }, 164 { 'C', "Lock", "Unknown event (type 26)", "units" }, 165 { 'C', "Lock", "Unknown event (type 27)", "units" }, 166 { 'C', "Lock", "Unknown event (type 28)", "units" }, 167 { 'C', "Lock", "Unknown event (type 29)", "units" }, 168 { 'C', "Lock", "Unknown event (type 30)", "units" }, 169 { 'C', "Lock", "Unknown event (type 31)", "units" }, 170 { 'H', "Lock", "Adaptive mutex hold", "nsec", 171 "lockstat:::adaptive-release", NULL, 172 "lockstat:::adaptive-acquire" }, 173 { 'H', "Lock", "Spin lock hold", "nsec", 174 "lockstat:::spin-release", NULL, 175 "lockstat:::spin-acquire" }, 176 { 'H', "Lock", "R/W writer hold", "nsec", 177 "lockstat:::rw-release", "arg1 == 0", 178 "lockstat:::rw-acquire" }, 179 { 'H', "Lock", "R/W reader hold", "nsec", 180 "lockstat:::rw-release", "arg1 != 0", 181 "lockstat:::rw-acquire" }, 182 { 'H', "Lock", "Unknown event (type 36)", "units" }, 183 { 'H', "Lock", "Unknown event (type 37)", "units" }, 184 { 'H', "Lock", "Unknown event (type 38)", "units" }, 185 { 'H', "Lock", "Unknown event (type 39)", "units" }, 186 { 'H', "Lock", "Unknown event (type 40)", "units" }, 187 { 'H', "Lock", "Unknown event (type 41)", "units" }, 188 { 'H', "Lock", "Unknown event (type 42)", "units" }, 189 { 'H', "Lock", "Unknown event (type 43)", "units" }, 190 { 'H', "Lock", "Unknown event (type 44)", "units" }, 191 { 'H', "Lock", "Unknown event (type 45)", "units" }, 192 { 'H', "Lock", "Unknown event (type 46)", "units" }, 193 { 'H', "Lock", "Unknown event (type 47)", "units" }, 194 { 'H', "Lock", "Unknown event (type 48)", "units" }, 195 { 'H', "Lock", "Unknown event (type 49)", "units" }, 196 { 'H', "Lock", "Unknown event (type 50)", "units" }, 197 { 'H', "Lock", "Unknown event (type 51)", "units" }, 198 { 'H', "Lock", "Unknown event (type 52)", "units" }, 199 { 'H', "Lock", "Unknown event (type 53)", "units" }, 200 { 'H', "Lock", "Unknown event (type 54)", "units" }, 201 { 'H', "Lock", "Unknown event (type 55)", "units" }, 202 { 'I', "CPU+PIL", "Profiling interrupt", "nsec", 203 "profile:::profile-97", NULL }, 204 { 'I', "Lock", "Unknown event (type 57)", "units" }, 205 { 'I', "Lock", "Unknown event (type 58)", "units" }, 206 { 'I', "Lock", "Unknown event (type 59)", "units" }, 207 { 'E', "Lock", "Recursive lock entry detected", "(N/A)", 208 "lockstat:::rw-release", NULL, "lockstat:::rw-acquire" }, 209 { 'E', "Lock", "Lockstat enter failure", "(N/A)" }, 210 { 'E', "Lock", "Lockstat exit failure", "nsec" }, 211 { 'E', "Lock", "Lockstat record failure", "(N/A)" }, 212 }; 213 214 static void 215 fail(int do_perror, const char *message, ...) 216 { 217 va_list args; 218 int save_errno = errno; 219 220 va_start(args, message); 221 (void) fprintf(stderr, "lockstat: "); 222 (void) vfprintf(stderr, message, args); 223 va_end(args); 224 if (do_perror) 225 (void) fprintf(stderr, ": %s", strerror(save_errno)); 226 (void) fprintf(stderr, "\n"); 227 exit(2); 228 } 229 230 static void 231 dfail(const char *message, ...) 232 { 233 va_list args; 234 235 va_start(args, message); 236 (void) fprintf(stderr, "lockstat: "); 237 (void) vfprintf(stderr, message, args); 238 va_end(args); 239 (void) fprintf(stderr, ": %s\n", 240 dtrace_errmsg(g_dtp, dtrace_errno(g_dtp))); 241 242 exit(2); 243 } 244 245 static void 246 show_events(char event_type, char *desc) 247 { 248 int i, first = -1, last; 249 250 for (i = 0; i < LS_MAX_EVENTS; i++) { 251 ls_event_info_t *evp = &g_event_info[i]; 252 if (evp->ev_type != event_type || 253 strncmp(evp->ev_desc, "Unknown event", 13) == 0) 254 continue; 255 if (first == -1) 256 first = i; 257 last = i; 258 } 259 260 (void) fprintf(stderr, 261 "\n%s events (lockstat -%c or lockstat -e %d-%d):\n\n", 262 desc, event_type, first, last); 263 264 for (i = first; i <= last; i++) 265 (void) fprintf(stderr, 266 "%4d = %s\n", i, g_event_info[i].ev_desc); 267 } 268 269 static void 270 usage(void) 271 { 272 (void) fprintf(stderr, 273 "Usage: lockstat [options] command [args]\n" 274 "\nEvent selection options:\n\n" 275 " -C watch contention events [on by default]\n" 276 " -E watch error events [off by default]\n" 277 " -H watch hold events [off by default]\n" 278 " -I watch interrupt events [off by default]\n" 279 " -A watch all lock events [equivalent to -CH]\n" 280 " -e event_list only watch the specified events (shown below);\n" 281 " <event_list> is a comma-separated list of\n" 282 " events or ranges of events, e.g. 1,4-7,35\n" 283 " -i rate interrupt rate for -I [default: %d Hz]\n" 284 "\nData gathering options:\n\n" 285 " -b basic statistics (lock, caller, event count)\n" 286 " -t timing for all events [default]\n" 287 " -h histograms for event times\n" 288 " -s depth stack traces <depth> deep\n" 289 " -x opt[=val] enable or modify DTrace options\n" 290 "\nData filtering options:\n\n" 291 " -n nrecords maximum number of data records [default: %d]\n" 292 " -l lock[,size] only watch <lock>, which can be specified as a\n" 293 " symbolic name or hex address; <size> defaults\n" 294 " to the ELF symbol size if available, 1 if not\n" 295 " -f func[,size] only watch events generated by <func>\n" 296 " -d duration only watch events longer than <duration>\n" 297 " -T trace (rather than sample) events\n" 298 "\nData reporting options:\n\n" 299 " -c coalesce lock data for arrays like pse_mutex[]\n" 300 " -k coalesce PCs within functions\n" 301 " -g show total events generated by function\n" 302 " -w wherever: don't distinguish events by caller\n" 303 " -W whichever: don't distinguish events by lock\n" 304 " -R display rates rather than counts\n" 305 " -p parsable output format (awk(1)-friendly)\n" 306 " -P sort lock data by (count * avg_time) product\n" 307 " -D n only display top <n> events of each type\n" 308 " -o filename send output to <filename>\n", 309 DEFAULT_HZ, DEFAULT_NRECS); 310 311 show_events('C', "Contention"); 312 show_events('H', "Hold-time"); 313 show_events('I', "Interrupt"); 314 show_events('E', "Error"); 315 (void) fprintf(stderr, "\n"); 316 317 exit(1); 318 } 319 320 static int 321 lockcmp(lsrec_t *a, lsrec_t *b) 322 { 323 int i; 324 325 if (a->ls_event < b->ls_event) 326 return (-1); 327 if (a->ls_event > b->ls_event) 328 return (1); 329 330 for (i = g_stkdepth - 1; i >= 0; i--) { 331 if (a->ls_stack[i] < b->ls_stack[i]) 332 return (-1); 333 if (a->ls_stack[i] > b->ls_stack[i]) 334 return (1); 335 } 336 337 if (a->ls_caller < b->ls_caller) 338 return (-1); 339 if (a->ls_caller > b->ls_caller) 340 return (1); 341 342 if (a->ls_lock < b->ls_lock) 343 return (-1); 344 if (a->ls_lock > b->ls_lock) 345 return (1); 346 347 return (0); 348 } 349 350 static int 351 countcmp(lsrec_t *a, lsrec_t *b) 352 { 353 if (a->ls_event < b->ls_event) 354 return (-1); 355 if (a->ls_event > b->ls_event) 356 return (1); 357 358 return (b->ls_count - a->ls_count); 359 } 360 361 static int 362 timecmp(lsrec_t *a, lsrec_t *b) 363 { 364 if (a->ls_event < b->ls_event) 365 return (-1); 366 if (a->ls_event > b->ls_event) 367 return (1); 368 369 if (a->ls_time < b->ls_time) 370 return (1); 371 if (a->ls_time > b->ls_time) 372 return (-1); 373 374 return (0); 375 } 376 377 static int 378 lockcmp_anywhere(lsrec_t *a, lsrec_t *b) 379 { 380 if (a->ls_event < b->ls_event) 381 return (-1); 382 if (a->ls_event > b->ls_event) 383 return (1); 384 385 if (a->ls_lock < b->ls_lock) 386 return (-1); 387 if (a->ls_lock > b->ls_lock) 388 return (1); 389 390 return (0); 391 } 392 393 static int 394 lock_and_count_cmp_anywhere(lsrec_t *a, lsrec_t *b) 395 { 396 if (a->ls_event < b->ls_event) 397 return (-1); 398 if (a->ls_event > b->ls_event) 399 return (1); 400 401 if (a->ls_lock < b->ls_lock) 402 return (-1); 403 if (a->ls_lock > b->ls_lock) 404 return (1); 405 406 return (b->ls_count - a->ls_count); 407 } 408 409 static int 410 sitecmp_anylock(lsrec_t *a, lsrec_t *b) 411 { 412 int i; 413 414 if (a->ls_event < b->ls_event) 415 return (-1); 416 if (a->ls_event > b->ls_event) 417 return (1); 418 419 for (i = g_stkdepth - 1; i >= 0; i--) { 420 if (a->ls_stack[i] < b->ls_stack[i]) 421 return (-1); 422 if (a->ls_stack[i] > b->ls_stack[i]) 423 return (1); 424 } 425 426 if (a->ls_caller < b->ls_caller) 427 return (-1); 428 if (a->ls_caller > b->ls_caller) 429 return (1); 430 431 return (0); 432 } 433 434 static int 435 site_and_count_cmp_anylock(lsrec_t *a, lsrec_t *b) 436 { 437 int i; 438 439 if (a->ls_event < b->ls_event) 440 return (-1); 441 if (a->ls_event > b->ls_event) 442 return (1); 443 444 for (i = g_stkdepth - 1; i >= 0; i--) { 445 if (a->ls_stack[i] < b->ls_stack[i]) 446 return (-1); 447 if (a->ls_stack[i] > b->ls_stack[i]) 448 return (1); 449 } 450 451 if (a->ls_caller < b->ls_caller) 452 return (-1); 453 if (a->ls_caller > b->ls_caller) 454 return (1); 455 456 return (b->ls_count - a->ls_count); 457 } 458 459 static void 460 mergesort(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **a, lsrec_t **b, int n) 461 { 462 int m = n / 2; 463 int i, j; 464 465 if (m > 1) 466 mergesort(cmp, a, b, m); 467 if (n - m > 1) 468 mergesort(cmp, a + m, b + m, n - m); 469 for (i = m; i > 0; i--) 470 b[i - 1] = a[i - 1]; 471 for (j = m - 1; j < n - 1; j++) 472 b[n + m - j - 2] = a[j + 1]; 473 while (i < j) 474 *a++ = cmp(b[i], b[j]) < 0 ? b[i++] : b[j--]; 475 *a = b[i]; 476 } 477 478 static void 479 coalesce(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **lock, int n) 480 { 481 int i, j; 482 lsrec_t *target, *current; 483 484 target = lock[0]; 485 486 for (i = 1; i < n; i++) { 487 current = lock[i]; 488 if (cmp(current, target) != 0) { 489 target = current; 490 continue; 491 } 492 current->ls_event = LS_MAX_EVENTS; 493 target->ls_count += current->ls_count; 494 target->ls_refcnt += current->ls_refcnt; 495 if (g_recsize < LS_TIME) 496 continue; 497 target->ls_time += current->ls_time; 498 if (g_recsize < LS_HIST) 499 continue; 500 for (j = 0; j < 64; j++) 501 target->ls_hist[j] += current->ls_hist[j]; 502 } 503 } 504 505 static void 506 coalesce_symbol(uintptr_t *addrp) 507 { 508 uintptr_t symoff; 509 size_t symsize; 510 511 if (addr_to_sym(*addrp, &symoff, &symsize) != NULL && symoff < symsize) 512 *addrp -= symoff; 513 } 514 515 static void 516 predicate_add(char **pred, char *what, char *cmp, uintptr_t value) 517 { 518 char *new; 519 int len, newlen; 520 521 if (what == NULL) 522 return; 523 524 if (*pred == NULL) { 525 *pred = malloc(1); 526 *pred[0] = '\0'; 527 } 528 529 len = strlen(*pred); 530 newlen = len + strlen(what) + 32 + strlen("( && )"); 531 new = malloc(newlen); 532 533 if (*pred[0] != '\0') { 534 if (cmp != NULL) { 535 (void) sprintf(new, "(%s) && (%s %s 0x%p)", 536 *pred, what, cmp, (void *)value); 537 } else { 538 (void) sprintf(new, "(%s) && (%s)", *pred, what); 539 } 540 } else { 541 if (cmp != NULL) { 542 (void) sprintf(new, "%s %s 0x%p", 543 what, cmp, (void *)value); 544 } else { 545 (void) sprintf(new, "%s", what); 546 } 547 } 548 549 free(*pred); 550 *pred = new; 551 } 552 553 static void 554 predicate_destroy(char **pred) 555 { 556 free(*pred); 557 *pred = NULL; 558 } 559 560 static void 561 filter_add(char **filt, char *what, uintptr_t base, uintptr_t size) 562 { 563 char buf[256], *c = buf, *new; 564 int len, newlen; 565 566 if (*filt == NULL) { 567 *filt = malloc(1); 568 *filt[0] = '\0'; 569 } 570 571 (void) sprintf(c, "%s(%s >= 0x%p && %s < 0x%p)", *filt[0] != '\0' ? 572 " || " : "", what, (void *)base, what, (void *)(base + size)); 573 574 newlen = (len = strlen(*filt) + 1) + strlen(c); 575 new = malloc(newlen); 576 bcopy(*filt, new, len); 577 (void) strcat(new, c); 578 free(*filt); 579 *filt = new; 580 } 581 582 static void 583 filter_destroy(char **filt) 584 { 585 free(*filt); 586 *filt = NULL; 587 } 588 589 static void 590 dprog_add(const char *fmt, ...) 591 { 592 va_list args; 593 int size, offs; 594 char c; 595 596 va_start(args, fmt); 597 size = vsnprintf(&c, 1, fmt, args) + 1; 598 599 if (g_proglen == 0) { 600 offs = 0; 601 } else { 602 offs = g_proglen - 1; 603 } 604 605 g_proglen = offs + size; 606 607 if ((g_prog = realloc(g_prog, g_proglen)) == NULL) 608 fail(1, "failed to reallocate program text"); 609 610 (void) vsnprintf(&g_prog[offs], size, fmt, args); 611 } 612 613 /* 614 * This function may read like an open sewer, but keep in mind that programs 615 * that generate other programs are rarely pretty. If one has the unenviable 616 * task of maintaining or -- worse -- extending this code, use the -V option 617 * to examine the D program as generated by this function. 618 */ 619 static void 620 dprog_addevent(int event) 621 { 622 ls_event_info_t *info = &g_event_info[event]; 623 char *pred = NULL; 624 char stack[20]; 625 const char *arg0, *caller; 626 char *arg1 = "arg1"; 627 char buf[80]; 628 hrtime_t dur; 629 int depth; 630 631 if (info->ev_name[0] == '\0') 632 return; 633 634 if (info->ev_type == 'I') { 635 /* 636 * For interrupt events, arg0 (normally the lock pointer) is 637 * the CPU address plus the current pil, and arg1 (normally 638 * the number of nanoseconds) is the number of nanoseconds 639 * late -- and it's stored in arg2. 640 */ 641 arg0 = "(uintptr_t)curthread->t_cpu + \n" 642 "\t curthread->t_cpu->cpu_profile_pil"; 643 caller = "(uintptr_t)arg0"; 644 arg1 = "arg2"; 645 } else { 646 arg0 = "(uintptr_t)arg0"; 647 caller = "caller"; 648 } 649 650 if (g_recsize > LS_HIST) { 651 for (depth = 0; g_recsize > LS_STACK(depth); depth++) 652 continue; 653 654 if (g_tracing) { 655 (void) sprintf(stack, "\tstack(%d);\n", depth); 656 } else { 657 (void) sprintf(stack, ", stack(%d)", depth); 658 } 659 } else { 660 (void) sprintf(stack, ""); 661 } 662 663 if (info->ev_acquire != NULL) { 664 /* 665 * If this is a hold event, we need to generate an additional 666 * clause for the acquire; the clause for the release will be 667 * generated with the aggregating statement, below. 668 */ 669 dprog_add("%s\n", info->ev_acquire); 670 predicate_add(&pred, info->ev_predicate, NULL, 0); 671 predicate_add(&pred, g_predicate, NULL, 0); 672 if (pred != NULL) 673 dprog_add("/%s/\n", pred); 674 675 dprog_add("{\n"); 676 (void) sprintf(buf, "self->ev%d[(uintptr_t)arg0]", event); 677 678 if (info->ev_type == 'H') { 679 dprog_add("\t%s = timestamp;\n", buf); 680 } else { 681 /* 682 * If this isn't a hold event, it's the recursive 683 * error event. For this, we simply bump the 684 * thread-local, per-lock count. 685 */ 686 dprog_add("\t%s++;\n", buf); 687 } 688 689 dprog_add("}\n\n"); 690 predicate_destroy(&pred); 691 pred = NULL; 692 693 if (info->ev_type == 'E') { 694 /* 695 * If this is the recursive lock error event, we need 696 * to generate an additional clause to decrement the 697 * thread-local, per-lock count. This assures that we 698 * only execute the aggregating clause if we have 699 * recursive entry. 700 */ 701 dprog_add("%s\n", info->ev_name); 702 dprog_add("/%s/\n{\n\t%s--;\n}\n\n", buf, buf); 703 } 704 705 predicate_add(&pred, buf, NULL, 0); 706 707 if (info->ev_type == 'H') { 708 (void) sprintf(buf, "timestamp -\n\t " 709 "self->ev%d[(uintptr_t)arg0]", event); 710 } 711 712 arg1 = buf; 713 } else { 714 predicate_add(&pred, info->ev_predicate, NULL, 0); 715 if (info->ev_type != 'I') 716 predicate_add(&pred, g_predicate, NULL, 0); 717 else 718 predicate_add(&pred, g_ipredicate, NULL, 0); 719 } 720 721 if ((dur = g_min_duration[event]) != 0) 722 predicate_add(&pred, arg1, ">=", dur); 723 724 dprog_add("%s\n", info->ev_name); 725 726 if (pred != NULL) 727 dprog_add("/%s/\n", pred); 728 predicate_destroy(&pred); 729 730 dprog_add("{\n"); 731 732 if (g_tracing) { 733 dprog_add("\ttrace(%dULL);\n", event); 734 dprog_add("\ttrace(%s);\n", arg0); 735 dprog_add("\ttrace(%s);\n", caller); 736 dprog_add(stack); 737 } else { 738 /* 739 * The ordering here is important: when we process the 740 * aggregate, we count on the fact that @avg appears before 741 * @hist in program order to assure that @avg is assigned the 742 * first aggregation variable ID and @hist assigned the 743 * second; see the comment in process_aggregate() for details. 744 */ 745 dprog_add("\t@avg[%dULL, %s, %s%s] = avg(%s);\n", 746 event, arg0, caller, stack, arg1); 747 748 if (g_recsize >= LS_HIST) { 749 dprog_add("\t@hist[%dULL, %s, %s%s] = quantize" 750 "(%s);\n", event, arg0, caller, stack, arg1); 751 } 752 } 753 754 if (info->ev_acquire != NULL) 755 dprog_add("\tself->ev%d[arg0] = 0;\n", event); 756 757 dprog_add("}\n\n"); 758 } 759 760 static void 761 dprog_compile() 762 { 763 dtrace_prog_t *prog; 764 dtrace_proginfo_t info; 765 766 if (g_Vflag) { 767 (void) fprintf(stderr, "lockstat: vvvv D program vvvv\n"); 768 (void) fputs(g_prog, stderr); 769 (void) fprintf(stderr, "lockstat: ^^^^ D program ^^^^\n"); 770 } 771 772 if ((prog = dtrace_program_strcompile(g_dtp, g_prog, 773 DTRACE_PROBESPEC_NAME, 0, 0, NULL)) == NULL) 774 dfail("failed to compile program"); 775 776 if (dtrace_program_exec(g_dtp, prog, &info) == -1) 777 dfail("failed to enable probes"); 778 779 if (dtrace_go(g_dtp) != 0) 780 dfail("couldn't start tracing"); 781 } 782 783 static void 784 status_fire(int signal __unused) 785 { 786 } 787 788 static void 789 status_init(void) 790 { 791 dtrace_optval_t val, status, agg; 792 struct sigaction act; 793 struct itimerspec ts; 794 struct sigevent ev; 795 timer_t tid; 796 797 if (dtrace_getopt(g_dtp, "statusrate", &status) == -1) 798 dfail("failed to get 'statusrate'"); 799 800 if (dtrace_getopt(g_dtp, "aggrate", &agg) == -1) 801 dfail("failed to get 'statusrate'"); 802 803 /* 804 * We would want to awaken at a rate that is the GCD of the statusrate 805 * and the aggrate -- but that seems a bit absurd. Instead, we'll 806 * simply awaken at a rate that is the more frequent of the two, which 807 * assures that we're never later than the interval implied by the 808 * more frequent rate. 809 */ 810 val = status < agg ? status : agg; 811 812 (void) sigemptyset(&act.sa_mask); 813 act.sa_flags = 0; 814 act.sa_handler = status_fire; 815 (void) sigaction(SIGUSR1, &act, NULL); 816 817 ev.sigev_notify = SIGEV_SIGNAL; 818 ev.sigev_signo = SIGUSR1; 819 820 if (timer_create(CLOCK_REALTIME, &ev, &tid) == -1) 821 dfail("cannot create CLOCK_REALTIME timer"); 822 823 ts.it_value.tv_sec = val / NANOSEC; 824 ts.it_value.tv_nsec = val % NANOSEC; 825 ts.it_interval = ts.it_value; 826 827 if (timer_settime(tid, TIMER_RELTIME, &ts, NULL) == -1) 828 dfail("cannot set time on CLOCK_REALTIME timer"); 829 } 830 831 static void 832 status_check(void) 833 { 834 if (!g_tracing && dtrace_aggregate_snap(g_dtp) != 0) 835 dfail("failed to snap aggregate"); 836 837 if (dtrace_status(g_dtp) == -1) 838 dfail("dtrace_status()"); 839 } 840 841 static void 842 lsrec_fill(lsrec_t *lsrec, const dtrace_recdesc_t *rec, int nrecs, caddr_t data) 843 { 844 bzero(lsrec, g_recsize); 845 lsrec->ls_count = 1; 846 847 if ((g_recsize > LS_HIST && nrecs < 4) || (nrecs < 3)) 848 fail(0, "truncated DTrace record"); 849 850 if (rec->dtrd_size != sizeof (uint64_t)) 851 fail(0, "bad event size in first record"); 852 853 /* LINTED - alignment */ 854 lsrec->ls_event = (uint32_t)*((uint64_t *)(data + rec->dtrd_offset)); 855 rec++; 856 857 if (rec->dtrd_size != sizeof (uintptr_t)) 858 fail(0, "bad lock address size in second record"); 859 860 /* LINTED - alignment */ 861 lsrec->ls_lock = *((uintptr_t *)(data + rec->dtrd_offset)); 862 rec++; 863 864 if (rec->dtrd_size != sizeof (uintptr_t)) 865 fail(0, "bad caller size in third record"); 866 867 /* LINTED - alignment */ 868 lsrec->ls_caller = *((uintptr_t *)(data + rec->dtrd_offset)); 869 rec++; 870 871 if (g_recsize > LS_HIST) { 872 int frames, i; 873 pc_t *stack; 874 875 frames = rec->dtrd_size / sizeof (pc_t); 876 /* LINTED - alignment */ 877 stack = (pc_t *)(data + rec->dtrd_offset); 878 879 for (i = 1; i < frames; i++) 880 lsrec->ls_stack[i - 1] = stack[i]; 881 } 882 } 883 884 /*ARGSUSED*/ 885 static int 886 count_aggregate(const dtrace_aggdata_t *agg, void *arg) 887 { 888 *((size_t *)arg) += 1; 889 890 return (DTRACE_AGGWALK_NEXT); 891 } 892 893 static int 894 process_aggregate(const dtrace_aggdata_t *agg, void *arg) 895 { 896 const dtrace_aggdesc_t *aggdesc = agg->dtada_desc; 897 caddr_t data = agg->dtada_data; 898 lsdata_t *lsdata = arg; 899 lsrec_t *lsrec = lsdata->lsd_next; 900 const dtrace_recdesc_t *rec; 901 uint64_t *avg, *quantized; 902 int i, j; 903 904 assert(lsdata->lsd_count < g_nrecs); 905 906 /* 907 * Aggregation variable IDs are guaranteed to be generated in program 908 * order, and they are guaranteed to start from DTRACE_AGGVARIDNONE 909 * plus one. As "avg" appears before "hist" in program order, we know 910 * that "avg" will be allocated the first aggregation variable ID, and 911 * "hist" will be allocated the second aggregation variable ID -- and 912 * we therefore use the aggregation variable ID to differentiate the 913 * cases. 914 */ 915 if (aggdesc->dtagd_varid > DTRACE_AGGVARIDNONE + 1) { 916 /* 917 * If this is the histogram entry. We'll copy the quantized 918 * data into lc_hist, and jump over the rest. 919 */ 920 rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1]; 921 922 if (aggdesc->dtagd_varid != DTRACE_AGGVARIDNONE + 2) 923 fail(0, "bad variable ID in aggregation record"); 924 925 if (rec->dtrd_size != 926 DTRACE_QUANTIZE_NBUCKETS * sizeof (uint64_t)) 927 fail(0, "bad quantize size in aggregation record"); 928 929 /* LINTED - alignment */ 930 quantized = (uint64_t *)(data + rec->dtrd_offset); 931 932 for (i = DTRACE_QUANTIZE_ZEROBUCKET, j = 0; 933 i < DTRACE_QUANTIZE_NBUCKETS; i++, j++) 934 lsrec->ls_hist[j] = quantized[i]; 935 936 goto out; 937 } 938 939 lsrec_fill(lsrec, &aggdesc->dtagd_rec[1], 940 aggdesc->dtagd_nrecs - 1, data); 941 942 rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1]; 943 944 if (rec->dtrd_size != 2 * sizeof (uint64_t)) 945 fail(0, "bad avg size in aggregation record"); 946 947 /* LINTED - alignment */ 948 avg = (uint64_t *)(data + rec->dtrd_offset); 949 lsrec->ls_count = (uint32_t)avg[0]; 950 lsrec->ls_time = (uintptr_t)avg[1]; 951 952 if (g_recsize >= LS_HIST) 953 return (DTRACE_AGGWALK_NEXT); 954 955 out: 956 lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize); 957 lsdata->lsd_count++; 958 959 return (DTRACE_AGGWALK_NEXT); 960 } 961 962 static int 963 process_trace(const dtrace_probedata_t *pdata, void *arg) 964 { 965 lsdata_t *lsdata = arg; 966 lsrec_t *lsrec = lsdata->lsd_next; 967 dtrace_eprobedesc_t *edesc = pdata->dtpda_edesc; 968 caddr_t data = pdata->dtpda_data; 969 970 if (lsdata->lsd_count >= g_nrecs) 971 return (DTRACE_CONSUME_NEXT); 972 973 lsrec_fill(lsrec, edesc->dtepd_rec, edesc->dtepd_nrecs, data); 974 975 lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize); 976 lsdata->lsd_count++; 977 978 return (DTRACE_CONSUME_NEXT); 979 } 980 981 static int 982 process_data(FILE *out, char *data) 983 { 984 lsdata_t lsdata; 985 986 /* LINTED - alignment */ 987 lsdata.lsd_next = (lsrec_t *)data; 988 lsdata.lsd_count = 0; 989 990 if (g_tracing) { 991 if (dtrace_consume(g_dtp, out, 992 process_trace, NULL, &lsdata) != 0) 993 dfail("failed to consume buffer"); 994 995 return (lsdata.lsd_count); 996 } 997 998 if (dtrace_aggregate_walk_keyvarsorted(g_dtp, 999 process_aggregate, &lsdata) != 0) 1000 dfail("failed to walk aggregate"); 1001 1002 return (lsdata.lsd_count); 1003 } 1004 1005 /*ARGSUSED*/ 1006 static int 1007 drophandler(const dtrace_dropdata_t *data, void *arg) 1008 { 1009 g_dropped++; 1010 (void) fprintf(stderr, "lockstat: warning: %s", data->dtdda_msg); 1011 return (DTRACE_HANDLE_OK); 1012 } 1013 1014 int 1015 main(int argc, char **argv) 1016 { 1017 char *data_buf; 1018 lsrec_t *lsp, **current, **first, **sort_buf, **merge_buf; 1019 FILE *out = stdout; 1020 int c; 1021 pid_t child; 1022 int status; 1023 int i, j; 1024 hrtime_t duration; 1025 char *addrp, *offp, *sizep, *evp, *lastp, *p; 1026 uintptr_t addr; 1027 size_t size, off; 1028 int events_specified = 0; 1029 int exec_errno = 0; 1030 uint32_t event; 1031 char *filt = NULL, *ifilt = NULL; 1032 static uint64_t ev_count[LS_MAX_EVENTS + 1]; 1033 static uint64_t ev_time[LS_MAX_EVENTS + 1]; 1034 dtrace_optval_t aggsize; 1035 char aggstr[10]; 1036 long ncpus; 1037 int dynvar = 0; 1038 int err; 1039 1040 if ((g_dtp = dtrace_open(DTRACE_VERSION, 0, &err)) == NULL) { 1041 fail(0, "cannot open dtrace library: %s", 1042 dtrace_errmsg(NULL, err)); 1043 } 1044 1045 if (dtrace_handle_drop(g_dtp, &drophandler, NULL) == -1) 1046 dfail("couldn't establish drop handler"); 1047 1048 if (symtab_init() == -1) 1049 fail(1, "can't load kernel symbols"); 1050 1051 g_nrecs = DEFAULT_NRECS; 1052 1053 while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != EOF) { 1054 switch (c) { 1055 case 'b': 1056 g_recsize = LS_BASIC; 1057 break; 1058 1059 case 't': 1060 g_recsize = LS_TIME; 1061 break; 1062 1063 case 'h': 1064 g_recsize = LS_HIST; 1065 break; 1066 1067 case 's': 1068 if (!isdigit(optarg[0])) 1069 usage(); 1070 g_stkdepth = atoi(optarg); 1071 if (g_stkdepth > LS_MAX_STACK_DEPTH) 1072 fail(0, "max stack depth is %d", 1073 LS_MAX_STACK_DEPTH); 1074 g_recsize = LS_STACK(g_stkdepth); 1075 break; 1076 1077 case 'n': 1078 if (!isdigit(optarg[0])) 1079 usage(); 1080 g_nrecs = atoi(optarg); 1081 break; 1082 1083 case 'd': 1084 if (!isdigit(optarg[0])) 1085 usage(); 1086 duration = atoll(optarg); 1087 1088 /* 1089 * XXX -- durations really should be per event 1090 * since the units are different, but it's hard 1091 * to express this nicely in the interface. 1092 * Not clear yet what the cleanest solution is. 1093 */ 1094 for (i = 0; i < LS_MAX_EVENTS; i++) 1095 if (g_event_info[i].ev_type != 'E') 1096 g_min_duration[i] = duration; 1097 1098 break; 1099 1100 case 'i': 1101 if (!isdigit(optarg[0])) 1102 usage(); 1103 i = atoi(optarg); 1104 if (i <= 0) 1105 usage(); 1106 if (i > MAX_HZ) 1107 fail(0, "max interrupt rate is %d Hz", MAX_HZ); 1108 1109 for (j = 0; j < LS_MAX_EVENTS; j++) 1110 if (strcmp(g_event_info[j].ev_desc, 1111 "Profiling interrupt") == 0) 1112 break; 1113 1114 (void) sprintf(g_event_info[j].ev_name, 1115 "profile:::profile-%d", i); 1116 break; 1117 1118 case 'l': 1119 case 'f': 1120 addrp = strtok(optarg, ","); 1121 sizep = strtok(NULL, ","); 1122 addrp = strtok(optarg, ",+"); 1123 offp = strtok(NULL, ","); 1124 1125 size = sizep ? strtoul(sizep, NULL, 0) : 1; 1126 off = offp ? strtoul(offp, NULL, 0) : 0; 1127 1128 if (addrp[0] == '0') { 1129 addr = strtoul(addrp, NULL, 16) + off; 1130 } else { 1131 addr = sym_to_addr(addrp) + off; 1132 if (sizep == NULL) 1133 size = sym_size(addrp) - off; 1134 if (addr - off == 0) 1135 fail(0, "symbol '%s' not found", addrp); 1136 if (size == 0) 1137 size = 1; 1138 } 1139 1140 1141 if (c == 'l') { 1142 filter_add(&filt, "arg0", addr, size); 1143 } else { 1144 filter_add(&filt, "caller", addr, size); 1145 filter_add(&ifilt, "arg0", addr, size); 1146 } 1147 break; 1148 1149 case 'e': 1150 evp = strtok_r(optarg, ",", &lastp); 1151 while (evp) { 1152 int ev1, ev2; 1153 char *evp2; 1154 1155 (void) strtok(evp, "-"); 1156 evp2 = strtok(NULL, "-"); 1157 ev1 = atoi(evp); 1158 ev2 = evp2 ? atoi(evp2) : ev1; 1159 if ((uint_t)ev1 >= LS_MAX_EVENTS || 1160 (uint_t)ev2 >= LS_MAX_EVENTS || ev1 > ev2) 1161 fail(0, "-e events out of range"); 1162 for (i = ev1; i <= ev2; i++) 1163 g_enabled[i] = 1; 1164 evp = strtok_r(NULL, ",", &lastp); 1165 } 1166 events_specified = 1; 1167 break; 1168 1169 case 'c': 1170 g_cflag = 1; 1171 break; 1172 1173 case 'k': 1174 g_kflag = 1; 1175 break; 1176 1177 case 'w': 1178 g_wflag = 1; 1179 break; 1180 1181 case 'W': 1182 g_Wflag = 1; 1183 break; 1184 1185 case 'g': 1186 g_gflag = 1; 1187 break; 1188 1189 case 'C': 1190 case 'E': 1191 case 'H': 1192 case 'I': 1193 for (i = 0; i < LS_MAX_EVENTS; i++) 1194 if (g_event_info[i].ev_type == c) 1195 g_enabled[i] = 1; 1196 events_specified = 1; 1197 break; 1198 1199 case 'A': 1200 for (i = 0; i < LS_MAX_EVENTS; i++) 1201 if (strchr("CH", g_event_info[i].ev_type)) 1202 g_enabled[i] = 1; 1203 events_specified = 1; 1204 break; 1205 1206 case 'T': 1207 g_tracing = 1; 1208 break; 1209 1210 case 'D': 1211 if (!isdigit(optarg[0])) 1212 usage(); 1213 g_topn = atoi(optarg); 1214 break; 1215 1216 case 'R': 1217 g_rates = 1; 1218 break; 1219 1220 case 'p': 1221 g_pflag = 1; 1222 break; 1223 1224 case 'P': 1225 g_Pflag = 1; 1226 break; 1227 1228 case 'o': 1229 if ((out = fopen(optarg, "w")) == NULL) 1230 fail(1, "error opening file"); 1231 break; 1232 1233 case 'V': 1234 g_Vflag = 1; 1235 break; 1236 1237 default: 1238 if (strchr(LOCKSTAT_OPTSTR, c) == NULL) 1239 usage(); 1240 } 1241 } 1242 1243 if (filt != NULL) { 1244 predicate_add(&g_predicate, filt, NULL, 0); 1245 filter_destroy(&filt); 1246 } 1247 1248 if (ifilt != NULL) { 1249 predicate_add(&g_ipredicate, ifilt, NULL, 0); 1250 filter_destroy(&ifilt); 1251 } 1252 1253 if (g_recsize == 0) { 1254 if (g_gflag) { 1255 g_stkdepth = LS_MAX_STACK_DEPTH; 1256 g_recsize = LS_STACK(g_stkdepth); 1257 } else { 1258 g_recsize = LS_TIME; 1259 } 1260 } 1261 1262 if (g_gflag && g_recsize <= LS_STACK(0)) 1263 fail(0, "'-g' requires at least '-s 1' data gathering"); 1264 1265 /* 1266 * Make sure the alignment is reasonable 1267 */ 1268 g_recsize = -(-g_recsize & -sizeof (uint64_t)); 1269 1270 for (i = 0; i < LS_MAX_EVENTS; i++) { 1271 /* 1272 * If no events were specified, enable -C. 1273 */ 1274 if (!events_specified && g_event_info[i].ev_type == 'C') 1275 g_enabled[i] = 1; 1276 } 1277 1278 for (i = 0; i < LS_MAX_EVENTS; i++) { 1279 if (!g_enabled[i]) 1280 continue; 1281 1282 if (g_event_info[i].ev_acquire != NULL) { 1283 /* 1284 * If we've enabled a hold event, we must explicitly 1285 * allocate dynamic variable space. 1286 */ 1287 dynvar = 1; 1288 } 1289 1290 dprog_addevent(i); 1291 } 1292 1293 /* 1294 * Make sure there are remaining arguments to specify a child command 1295 * to execute. 1296 */ 1297 if (argc <= optind) 1298 usage(); 1299 1300 if ((ncpus = sysconf(_SC_NPROCESSORS_ONLN)) == -1) 1301 dfail("couldn't determine number of online CPUs"); 1302 1303 /* 1304 * By default, we set our data buffer size to be the number of records 1305 * multiplied by the size of the record, doubled to account for some 1306 * DTrace slop and divided by the number of CPUs. We silently clamp 1307 * the aggregation size at both a minimum and a maximum to prevent 1308 * absurdly low or high values. 1309 */ 1310 if ((aggsize = (g_nrecs * g_recsize * 2) / ncpus) < MIN_AGGSIZE) 1311 aggsize = MIN_AGGSIZE; 1312 1313 if (aggsize > MAX_AGGSIZE) 1314 aggsize = MAX_AGGSIZE; 1315 1316 (void) sprintf(aggstr, "%lld", (long long)aggsize); 1317 1318 if (!g_tracing) { 1319 if (dtrace_setopt(g_dtp, "bufsize", "4k") == -1) 1320 dfail("failed to set 'bufsize'"); 1321 1322 if (dtrace_setopt(g_dtp, "aggsize", aggstr) == -1) 1323 dfail("failed to set 'aggsize'"); 1324 1325 if (dynvar) { 1326 /* 1327 * If we're using dynamic variables, we set our 1328 * dynamic variable size to be one megabyte per CPU, 1329 * with a hard-limit of 32 megabytes. This may still 1330 * be too small in some cases, but it can be tuned 1331 * manually via -x if need be. 1332 */ 1333 (void) sprintf(aggstr, "%ldm", ncpus < 32 ? ncpus : 32); 1334 1335 if (dtrace_setopt(g_dtp, "dynvarsize", aggstr) == -1) 1336 dfail("failed to set 'dynvarsize'"); 1337 } 1338 } else { 1339 if (dtrace_setopt(g_dtp, "bufsize", aggstr) == -1) 1340 dfail("failed to set 'bufsize'"); 1341 } 1342 1343 if (dtrace_setopt(g_dtp, "statusrate", "10sec") == -1) 1344 dfail("failed to set 'statusrate'"); 1345 1346 optind = 1; 1347 while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != EOF) { 1348 switch (c) { 1349 case 'x': 1350 if ((p = strchr(optarg, '=')) != NULL) 1351 *p++ = '\0'; 1352 1353 if (dtrace_setopt(g_dtp, optarg, p) != 0) 1354 dfail("failed to set -x %s", optarg); 1355 break; 1356 } 1357 } 1358 1359 argc -= optind; 1360 argv += optind; 1361 1362 dprog_compile(); 1363 status_init(); 1364 1365 g_elapsed = -gethrtime(); 1366 1367 /* 1368 * Spawn the specified command and wait for it to complete. 1369 */ 1370 child = fork(); 1371 if (child == -1) 1372 fail(1, "cannot fork"); 1373 if (child == 0) { 1374 (void) dtrace_close(g_dtp); 1375 (void) execvp(argv[0], &argv[0]); 1376 exec_errno = errno; 1377 exit(127); 1378 } 1379 1380 while (waitpid(child, &status, WEXITED) != child) 1381 status_check(); 1382 1383 g_elapsed += gethrtime(); 1384 1385 if (WIFEXITED(status)) { 1386 if (WEXITSTATUS(status) != 0) { 1387 if (exec_errno != 0) { 1388 errno = exec_errno; 1389 fail(1, "could not execute %s", argv[0]); 1390 } 1391 (void) fprintf(stderr, 1392 "lockstat: warning: %s exited with code %d\n", 1393 argv[0], WEXITSTATUS(status)); 1394 } 1395 } else { 1396 (void) fprintf(stderr, 1397 "lockstat: warning: %s died on signal %d\n", 1398 argv[0], WTERMSIG(status)); 1399 } 1400 1401 if (dtrace_stop(g_dtp) == -1) 1402 dfail("failed to stop dtrace"); 1403 1404 /* 1405 * Before we read out the results, we need to allocate our buffer. 1406 * If we're tracing, then we'll just use the precalculated size. If 1407 * we're not, then we'll take a snapshot of the aggregate, and walk 1408 * it to count the number of records. 1409 */ 1410 if (!g_tracing) { 1411 if (dtrace_aggregate_snap(g_dtp) != 0) 1412 dfail("failed to snap aggregate"); 1413 1414 g_nrecs = 0; 1415 1416 if (dtrace_aggregate_walk(g_dtp, 1417 count_aggregate, &g_nrecs) != 0) 1418 dfail("failed to walk aggregate"); 1419 } 1420 1421 if ((data_buf = memalign(sizeof (uint64_t), 1422 (g_nrecs + 1) * g_recsize)) == NULL) 1423 fail(1, "Memory allocation failed"); 1424 1425 /* 1426 * Read out the DTrace data. 1427 */ 1428 g_nrecs_used = process_data(out, data_buf); 1429 1430 if (g_nrecs_used > g_nrecs || g_dropped) 1431 (void) fprintf(stderr, "lockstat: warning: " 1432 "ran out of data records (use -n for more)\n"); 1433 1434 /* LINTED - alignment */ 1435 for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++, 1436 /* LINTED - alignment */ 1437 lsp = (lsrec_t *)((char *)lsp + g_recsize)) { 1438 ev_count[lsp->ls_event] += lsp->ls_count; 1439 ev_time[lsp->ls_event] += lsp->ls_time; 1440 } 1441 1442 /* 1443 * If -g was specified, convert stacks into individual records. 1444 */ 1445 if (g_gflag) { 1446 lsrec_t *newlsp, *oldlsp; 1447 1448 newlsp = memalign(sizeof (uint64_t), 1449 g_nrecs_used * LS_TIME * (g_stkdepth + 1)); 1450 if (newlsp == NULL) 1451 fail(1, "Cannot allocate space for -g processing"); 1452 lsp = newlsp; 1453 /* LINTED - alignment */ 1454 for (i = 0, oldlsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++, 1455 /* LINTED - alignment */ 1456 oldlsp = (lsrec_t *)((char *)oldlsp + g_recsize)) { 1457 int fr; 1458 int caller_in_stack = 0; 1459 1460 if (oldlsp->ls_count == 0) 1461 continue; 1462 1463 for (fr = 0; fr < g_stkdepth; fr++) { 1464 if (oldlsp->ls_stack[fr] == 0) 1465 break; 1466 if (oldlsp->ls_stack[fr] == oldlsp->ls_caller) 1467 caller_in_stack = 1; 1468 bcopy(oldlsp, lsp, LS_TIME); 1469 lsp->ls_caller = oldlsp->ls_stack[fr]; 1470 /* LINTED - alignment */ 1471 lsp = (lsrec_t *)((char *)lsp + LS_TIME); 1472 } 1473 if (!caller_in_stack) { 1474 bcopy(oldlsp, lsp, LS_TIME); 1475 /* LINTED - alignment */ 1476 lsp = (lsrec_t *)((char *)lsp + LS_TIME); 1477 } 1478 } 1479 g_nrecs = g_nrecs_used = 1480 ((uintptr_t)lsp - (uintptr_t)newlsp) / LS_TIME; 1481 g_recsize = LS_TIME; 1482 g_stkdepth = 0; 1483 free(data_buf); 1484 data_buf = (char *)newlsp; 1485 } 1486 1487 if ((sort_buf = calloc(2 * (g_nrecs + 1), 1488 sizeof (void *))) == NULL) 1489 fail(1, "Sort buffer allocation failed"); 1490 merge_buf = sort_buf + (g_nrecs + 1); 1491 1492 /* 1493 * Build the sort buffer, discarding zero-count records along the way. 1494 */ 1495 /* LINTED - alignment */ 1496 for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++, 1497 /* LINTED - alignment */ 1498 lsp = (lsrec_t *)((char *)lsp + g_recsize)) { 1499 if (lsp->ls_count == 0) 1500 lsp->ls_event = LS_MAX_EVENTS; 1501 sort_buf[i] = lsp; 1502 } 1503 1504 if (g_nrecs_used == 0) 1505 exit(0); 1506 1507 /* 1508 * Add a sentinel after the last record 1509 */ 1510 sort_buf[i] = lsp; 1511 lsp->ls_event = LS_MAX_EVENTS; 1512 1513 if (g_tracing) { 1514 report_trace(out, sort_buf); 1515 return (0); 1516 } 1517 1518 /* 1519 * Application of -g may have resulted in multiple records 1520 * with the same signature; coalesce them. 1521 */ 1522 if (g_gflag) { 1523 mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used); 1524 coalesce(lockcmp, sort_buf, g_nrecs_used); 1525 } 1526 1527 /* 1528 * Coalesce locks within the same symbol if -c option specified. 1529 * Coalesce PCs within the same function if -k option specified. 1530 */ 1531 if (g_cflag || g_kflag) { 1532 for (i = 0; i < g_nrecs_used; i++) { 1533 int fr; 1534 lsp = sort_buf[i]; 1535 if (g_cflag) 1536 coalesce_symbol(&lsp->ls_lock); 1537 if (g_kflag) { 1538 for (fr = 0; fr < g_stkdepth; fr++) 1539 coalesce_symbol(&lsp->ls_stack[fr]); 1540 coalesce_symbol(&lsp->ls_caller); 1541 } 1542 } 1543 mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used); 1544 coalesce(lockcmp, sort_buf, g_nrecs_used); 1545 } 1546 1547 /* 1548 * Coalesce callers if -w option specified 1549 */ 1550 if (g_wflag) { 1551 mergesort(lock_and_count_cmp_anywhere, 1552 sort_buf, merge_buf, g_nrecs_used); 1553 coalesce(lockcmp_anywhere, sort_buf, g_nrecs_used); 1554 } 1555 1556 /* 1557 * Coalesce locks if -W option specified 1558 */ 1559 if (g_Wflag) { 1560 mergesort(site_and_count_cmp_anylock, 1561 sort_buf, merge_buf, g_nrecs_used); 1562 coalesce(sitecmp_anylock, sort_buf, g_nrecs_used); 1563 } 1564 1565 /* 1566 * Sort data by contention count (ls_count) or total time (ls_time), 1567 * depending on g_Pflag. Override g_Pflag if time wasn't measured. 1568 */ 1569 if (g_recsize < LS_TIME) 1570 g_Pflag = 0; 1571 1572 if (g_Pflag) 1573 mergesort(timecmp, sort_buf, merge_buf, g_nrecs_used); 1574 else 1575 mergesort(countcmp, sort_buf, merge_buf, g_nrecs_used); 1576 1577 /* 1578 * Display data by event type 1579 */ 1580 first = &sort_buf[0]; 1581 while ((event = (*first)->ls_event) < LS_MAX_EVENTS) { 1582 current = first; 1583 while ((lsp = *current)->ls_event == event) 1584 current++; 1585 report_stats(out, first, current - first, ev_count[event], 1586 ev_time[event]); 1587 first = current; 1588 } 1589 1590 return (0); 1591 } 1592 1593 static char * 1594 format_symbol(char *buf, uintptr_t addr, int show_size) 1595 { 1596 uintptr_t symoff; 1597 char *symname; 1598 size_t symsize; 1599 1600 symname = addr_to_sym(addr, &symoff, &symsize); 1601 1602 if (show_size && symoff == 0) 1603 (void) sprintf(buf, "%s[%ld]", symname, (long)symsize); 1604 else if (symoff == 0) 1605 (void) sprintf(buf, "%s", symname); 1606 else if (symoff < 16 && bcmp(symname, "cpu[", 4) == 0) /* CPU+PIL */ 1607 (void) sprintf(buf, "%s+%ld", symname, (long)symoff); 1608 else if (symoff <= symsize || (symoff < 256 && addr != symoff)) 1609 (void) sprintf(buf, "%s+0x%llx", symname, 1610 (unsigned long long)symoff); 1611 else 1612 (void) sprintf(buf, "0x%llx", (unsigned long long)addr); 1613 return (buf); 1614 } 1615 1616 static void 1617 report_stats(FILE *out, lsrec_t **sort_buf, size_t nrecs, uint64_t total_count, 1618 uint64_t total_time) 1619 { 1620 uint32_t event = sort_buf[0]->ls_event; 1621 lsrec_t *lsp; 1622 double ptotal = 0.0; 1623 double percent; 1624 int i, j, fr; 1625 int displayed; 1626 int first_bin, last_bin, max_bin_count, total_bin_count; 1627 int rectype; 1628 char buf[256]; 1629 char lhdr[80], chdr[80]; 1630 1631 rectype = g_recsize; 1632 1633 if (g_topn == 0) { 1634 (void) fprintf(out, "%20llu %s\n", 1635 g_rates == 0 ? total_count : 1636 ((unsigned long long)total_count * NANOSEC) / g_elapsed, 1637 g_event_info[event].ev_desc); 1638 return; 1639 } 1640 1641 (void) sprintf(lhdr, "%s%s", 1642 g_Wflag ? "Hottest " : "", g_event_info[event].ev_lhdr); 1643 (void) sprintf(chdr, "%s%s", 1644 g_wflag ? "Hottest " : "", "Caller"); 1645 1646 if (!g_pflag) 1647 (void) fprintf(out, 1648 "\n%s: %.0f events in %.3f seconds (%.0f events/sec)\n\n", 1649 g_event_info[event].ev_desc, (double)total_count, 1650 (double)g_elapsed / NANOSEC, 1651 (double)total_count * NANOSEC / g_elapsed); 1652 1653 if (!g_pflag && rectype < LS_HIST) { 1654 (void) sprintf(buf, "%s", g_event_info[event].ev_units); 1655 (void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n", 1656 g_rates ? "ops/s" : "Count", 1657 g_gflag ? "genr" : "indv", 1658 "cuml", "rcnt", rectype >= LS_TIME ? buf : "", lhdr, chdr); 1659 (void) fprintf(out, "---------------------------------" 1660 "----------------------------------------------\n"); 1661 } 1662 1663 displayed = 0; 1664 for (i = 0; i < nrecs; i++) { 1665 lsp = sort_buf[i]; 1666 1667 if (displayed++ >= g_topn) 1668 break; 1669 1670 if (g_pflag) { 1671 int j; 1672 1673 (void) fprintf(out, "%u %u", 1674 lsp->ls_event, lsp->ls_count); 1675 (void) fprintf(out, " %s", 1676 format_symbol(buf, lsp->ls_lock, g_cflag)); 1677 (void) fprintf(out, " %s", 1678 format_symbol(buf, lsp->ls_caller, 0)); 1679 (void) fprintf(out, " %f", 1680 (double)lsp->ls_refcnt / lsp->ls_count); 1681 if (rectype >= LS_TIME) 1682 (void) fprintf(out, " %llu", 1683 (unsigned long long)lsp->ls_time); 1684 if (rectype >= LS_HIST) { 1685 for (j = 0; j < 64; j++) 1686 (void) fprintf(out, " %u", 1687 lsp->ls_hist[j]); 1688 } 1689 for (j = 0; j < LS_MAX_STACK_DEPTH; j++) { 1690 if (rectype <= LS_STACK(j) || 1691 lsp->ls_stack[j] == 0) 1692 break; 1693 (void) fprintf(out, " %s", 1694 format_symbol(buf, lsp->ls_stack[j], 0)); 1695 } 1696 (void) fprintf(out, "\n"); 1697 continue; 1698 } 1699 1700 if (rectype >= LS_HIST) { 1701 (void) fprintf(out, "---------------------------------" 1702 "----------------------------------------------\n"); 1703 (void) sprintf(buf, "%s", 1704 g_event_info[event].ev_units); 1705 (void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n", 1706 g_rates ? "ops/s" : "Count", 1707 g_gflag ? "genr" : "indv", 1708 "cuml", "rcnt", buf, lhdr, chdr); 1709 } 1710 1711 if (g_Pflag && total_time != 0) 1712 percent = (lsp->ls_time * 100.00) / total_time; 1713 else 1714 percent = (lsp->ls_count * 100.00) / total_count; 1715 1716 ptotal += percent; 1717 1718 if (rectype >= LS_TIME) 1719 (void) sprintf(buf, "%llu", 1720 (unsigned long long)(lsp->ls_time / lsp->ls_count)); 1721 else 1722 buf[0] = '\0'; 1723 1724 (void) fprintf(out, "%5llu ", 1725 g_rates == 0 ? lsp->ls_count : 1726 ((uint64_t)lsp->ls_count * NANOSEC) / g_elapsed); 1727 1728 (void) fprintf(out, "%3.0f%% ", percent); 1729 1730 if (g_gflag) 1731 (void) fprintf(out, "---- "); 1732 else 1733 (void) fprintf(out, "%3.0f%% ", ptotal); 1734 1735 (void) fprintf(out, "%4.2f %8s ", 1736 (double)lsp->ls_refcnt / lsp->ls_count, buf); 1737 1738 (void) fprintf(out, "%-22s ", 1739 format_symbol(buf, lsp->ls_lock, g_cflag)); 1740 1741 (void) fprintf(out, "%-24s\n", 1742 format_symbol(buf, lsp->ls_caller, 0)); 1743 1744 if (rectype < LS_HIST) 1745 continue; 1746 1747 (void) fprintf(out, "\n"); 1748 (void) fprintf(out, "%10s %31s %-9s %-24s\n", 1749 g_event_info[event].ev_units, 1750 "------ Time Distribution ------", 1751 g_rates ? "ops/s" : "count", 1752 rectype > LS_STACK(0) ? "Stack" : ""); 1753 1754 first_bin = 0; 1755 while (lsp->ls_hist[first_bin] == 0) 1756 first_bin++; 1757 1758 last_bin = 63; 1759 while (lsp->ls_hist[last_bin] == 0) 1760 last_bin--; 1761 1762 max_bin_count = 0; 1763 total_bin_count = 0; 1764 for (j = first_bin; j <= last_bin; j++) { 1765 total_bin_count += lsp->ls_hist[j]; 1766 if (lsp->ls_hist[j] > max_bin_count) 1767 max_bin_count = lsp->ls_hist[j]; 1768 } 1769 1770 /* 1771 * If we went a few frames below the caller, ignore them 1772 */ 1773 for (fr = 3; fr > 0; fr--) 1774 if (lsp->ls_stack[fr] == lsp->ls_caller) 1775 break; 1776 1777 for (j = first_bin; j <= last_bin; j++) { 1778 uint_t depth = (lsp->ls_hist[j] * 30) / total_bin_count; 1779 (void) fprintf(out, "%10llu |%s%s %-9u ", 1780 1ULL << j, 1781 "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + 30 - depth, 1782 " " + depth, 1783 g_rates == 0 ? lsp->ls_hist[j] : 1784 (uint_t)(((uint64_t)lsp->ls_hist[j] * NANOSEC) / 1785 g_elapsed)); 1786 if (rectype <= LS_STACK(fr) || lsp->ls_stack[fr] == 0) { 1787 (void) fprintf(out, "\n"); 1788 continue; 1789 } 1790 (void) fprintf(out, "%-24s\n", 1791 format_symbol(buf, lsp->ls_stack[fr], 0)); 1792 fr++; 1793 } 1794 while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) { 1795 (void) fprintf(out, "%15s %-36s %-24s\n", "", "", 1796 format_symbol(buf, lsp->ls_stack[fr], 0)); 1797 fr++; 1798 } 1799 } 1800 1801 if (!g_pflag) 1802 (void) fprintf(out, "---------------------------------" 1803 "----------------------------------------------\n"); 1804 1805 (void) fflush(out); 1806 } 1807 1808 static void 1809 report_trace(FILE *out, lsrec_t **sort_buf) 1810 { 1811 lsrec_t *lsp; 1812 int i, fr; 1813 int rectype; 1814 char buf[256], buf2[256]; 1815 1816 rectype = g_recsize; 1817 1818 if (!g_pflag) { 1819 (void) fprintf(out, "%5s %7s %11s %-24s %-24s\n", 1820 "Event", "Time", "Owner", "Lock", "Caller"); 1821 (void) fprintf(out, "---------------------------------" 1822 "----------------------------------------------\n"); 1823 } 1824 1825 for (i = 0; i < g_nrecs_used; i++) { 1826 1827 lsp = sort_buf[i]; 1828 1829 if (lsp->ls_event >= LS_MAX_EVENTS || lsp->ls_count == 0) 1830 continue; 1831 1832 (void) fprintf(out, "%2d %10llu %11p %-24s %-24s\n", 1833 lsp->ls_event, (unsigned long long)lsp->ls_time, 1834 (void *)lsp->ls_next, 1835 format_symbol(buf, lsp->ls_lock, 0), 1836 format_symbol(buf2, lsp->ls_caller, 0)); 1837 1838 if (rectype <= LS_STACK(0)) 1839 continue; 1840 1841 /* 1842 * If we went a few frames below the caller, ignore them 1843 */ 1844 for (fr = 3; fr > 0; fr--) 1845 if (lsp->ls_stack[fr] == lsp->ls_caller) 1846 break; 1847 1848 while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) { 1849 (void) fprintf(out, "%53s %-24s\n", "", 1850 format_symbol(buf, lsp->ls_stack[fr], 0)); 1851 fr++; 1852 } 1853 (void) fprintf(out, "\n"); 1854 } 1855 1856 (void) fflush(out); 1857 } 1858