1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <stdio.h> 29 #include <stddef.h> 30 #include <stdlib.h> 31 #include <stdarg.h> 32 #include <string.h> 33 #include <strings.h> 34 #include <ctype.h> 35 #include <fcntl.h> 36 #include <unistd.h> 37 #include <errno.h> 38 #include <limits.h> 39 #include <sys/types.h> 40 #include <sys/modctl.h> 41 #include <sys/stat.h> 42 #include <sys/wait.h> 43 #include <dtrace.h> 44 #include <sys/lockstat.h> 45 #include <alloca.h> 46 #include <signal.h> 47 #include <assert.h> 48 49 #if defined(sun) 50 #define GETOPT_EOF EOF 51 #else 52 /* FreeBSD */ 53 #include <sys/time.h> 54 #include <sys/resource.h> 55 56 #define mergesort(a, b, c, d) lsmergesort(a, b, c, d) 57 #define GETOPT_EOF (-1) 58 59 typedef uintptr_t pc_t; 60 #endif /* defined(sun) */ 61 62 #define LOCKSTAT_OPTSTR "x:bths:n:d:i:l:f:e:ckwWgCHEATID:RpPo:V" 63 64 #define LS_MAX_STACK_DEPTH 50 65 #define LS_MAX_EVENTS 64 66 67 typedef struct lsrec { 68 struct lsrec *ls_next; /* next in hash chain */ 69 uintptr_t ls_lock; /* lock address */ 70 uintptr_t ls_caller; /* caller address */ 71 uint32_t ls_count; /* cumulative event count */ 72 uint32_t ls_event; /* type of event */ 73 uintptr_t ls_refcnt; /* cumulative reference count */ 74 uint64_t ls_time; /* cumulative event duration */ 75 uint32_t ls_hist[64]; /* log2(duration) histogram */ 76 uintptr_t ls_stack[LS_MAX_STACK_DEPTH]; 77 } lsrec_t; 78 79 typedef struct lsdata { 80 struct lsrec *lsd_next; /* next available */ 81 int lsd_count; /* number of records */ 82 } lsdata_t; 83 84 /* 85 * Definitions for the types of experiments which can be run. They are 86 * listed in increasing order of memory cost and processing time cost. 87 * The numerical value of each type is the number of bytes needed per record. 88 */ 89 #define LS_BASIC offsetof(lsrec_t, ls_time) 90 #define LS_TIME offsetof(lsrec_t, ls_hist[0]) 91 #define LS_HIST offsetof(lsrec_t, ls_stack[0]) 92 #define LS_STACK(depth) offsetof(lsrec_t, ls_stack[depth]) 93 94 static void report_stats(FILE *, lsrec_t **, size_t, uint64_t, uint64_t); 95 static void report_trace(FILE *, lsrec_t **); 96 97 extern int symtab_init(void); 98 extern char *addr_to_sym(uintptr_t, uintptr_t *, size_t *); 99 extern uintptr_t sym_to_addr(char *name); 100 extern size_t sym_size(char *name); 101 extern char *strtok_r(char *, const char *, char **); 102 103 #define DEFAULT_NRECS 10000 104 #define DEFAULT_HZ 97 105 #define MAX_HZ 1000 106 #define MIN_AGGSIZE (16 * 1024) 107 #define MAX_AGGSIZE (32 * 1024 * 1024) 108 109 static int g_stkdepth; 110 static int g_topn = INT_MAX; 111 static hrtime_t g_elapsed; 112 static int g_rates = 0; 113 static int g_pflag = 0; 114 static int g_Pflag = 0; 115 static int g_wflag = 0; 116 static int g_Wflag = 0; 117 static int g_cflag = 0; 118 static int g_kflag = 0; 119 static int g_gflag = 0; 120 static int g_Vflag = 0; 121 static int g_tracing = 0; 122 static size_t g_recsize; 123 static size_t g_nrecs; 124 static int g_nrecs_used; 125 static uchar_t g_enabled[LS_MAX_EVENTS]; 126 static hrtime_t g_min_duration[LS_MAX_EVENTS]; 127 static dtrace_hdl_t *g_dtp; 128 static char *g_predicate; 129 static char *g_ipredicate; 130 static char *g_prog; 131 static int g_proglen; 132 static int g_dropped; 133 134 typedef struct ls_event_info { 135 char ev_type; 136 char ev_lhdr[20]; 137 char ev_desc[80]; 138 char ev_units[10]; 139 char ev_name[DTRACE_NAMELEN]; 140 char *ev_predicate; 141 char *ev_acquire; 142 } ls_event_info_t; 143 144 static ls_event_info_t g_event_info[LS_MAX_EVENTS] = { 145 { 'C', "Lock", "Adaptive mutex spin", "nsec", 146 "lockstat:::adaptive-spin" }, 147 { 'C', "Lock", "Adaptive mutex block", "nsec", 148 "lockstat:::adaptive-block" }, 149 { 'C', "Lock", "Spin lock spin", "nsec", 150 "lockstat:::spin-spin" }, 151 { 'C', "Lock", "Thread lock spin", "nsec", 152 "lockstat:::thread-spin" }, 153 { 'C', "Lock", "R/W writer blocked by writer", "nsec", 154 "lockstat:::rw-block", "arg2 == 0 && arg3 == 1" }, 155 { 'C', "Lock", "R/W writer blocked by readers", "nsec", 156 "lockstat:::rw-block", "arg2 == 0 && arg3 == 0 && arg4" }, 157 { 'C', "Lock", "R/W reader blocked by writer", "nsec", 158 "lockstat:::rw-block", "arg2 != 0 && arg3 == 1" }, 159 { 'C', "Lock", "R/W reader blocked by write wanted", "nsec", 160 "lockstat:::rw-block", "arg2 != 0 && arg3 == 0 && arg4" }, 161 { 'C', "Lock", "Unknown event (type 8)", "units" }, 162 { 'C', "Lock", "Unknown event (type 9)", "units" }, 163 { 'C', "Lock", "Unknown event (type 10)", "units" }, 164 { 'C', "Lock", "Unknown event (type 11)", "units" }, 165 { 'C', "Lock", "Unknown event (type 12)", "units" }, 166 { 'C', "Lock", "Unknown event (type 13)", "units" }, 167 { 'C', "Lock", "Unknown event (type 14)", "units" }, 168 { 'C', "Lock", "Unknown event (type 15)", "units" }, 169 { 'C', "Lock", "Unknown event (type 16)", "units" }, 170 { 'C', "Lock", "Unknown event (type 17)", "units" }, 171 { 'C', "Lock", "Unknown event (type 18)", "units" }, 172 { 'C', "Lock", "Unknown event (type 19)", "units" }, 173 { 'C', "Lock", "Unknown event (type 20)", "units" }, 174 { 'C', "Lock", "Unknown event (type 21)", "units" }, 175 { 'C', "Lock", "Unknown event (type 22)", "units" }, 176 { 'C', "Lock", "Unknown event (type 23)", "units" }, 177 { 'C', "Lock", "Unknown event (type 24)", "units" }, 178 { 'C', "Lock", "Unknown event (type 25)", "units" }, 179 { 'C', "Lock", "Unknown event (type 26)", "units" }, 180 { 'C', "Lock", "Unknown event (type 27)", "units" }, 181 { 'C', "Lock", "Unknown event (type 28)", "units" }, 182 { 'C', "Lock", "Unknown event (type 29)", "units" }, 183 { 'C', "Lock", "Unknown event (type 30)", "units" }, 184 { 'C', "Lock", "Unknown event (type 31)", "units" }, 185 { 'H', "Lock", "Adaptive mutex hold", "nsec", 186 "lockstat:::adaptive-release", NULL, 187 "lockstat:::adaptive-acquire" }, 188 { 'H', "Lock", "Spin lock hold", "nsec", 189 "lockstat:::spin-release", NULL, 190 "lockstat:::spin-acquire" }, 191 { 'H', "Lock", "R/W writer hold", "nsec", 192 "lockstat:::rw-release", "arg1 == 0", 193 "lockstat:::rw-acquire" }, 194 { 'H', "Lock", "R/W reader hold", "nsec", 195 "lockstat:::rw-release", "arg1 != 0", 196 "lockstat:::rw-acquire" }, 197 { 'H', "Lock", "Unknown event (type 36)", "units" }, 198 { 'H', "Lock", "Unknown event (type 37)", "units" }, 199 { 'H', "Lock", "Unknown event (type 38)", "units" }, 200 { 'H', "Lock", "Unknown event (type 39)", "units" }, 201 { 'H', "Lock", "Unknown event (type 40)", "units" }, 202 { 'H', "Lock", "Unknown event (type 41)", "units" }, 203 { 'H', "Lock", "Unknown event (type 42)", "units" }, 204 { 'H', "Lock", "Unknown event (type 43)", "units" }, 205 { 'H', "Lock", "Unknown event (type 44)", "units" }, 206 { 'H', "Lock", "Unknown event (type 45)", "units" }, 207 { 'H', "Lock", "Unknown event (type 46)", "units" }, 208 { 'H', "Lock", "Unknown event (type 47)", "units" }, 209 { 'H', "Lock", "Unknown event (type 48)", "units" }, 210 { 'H', "Lock", "Unknown event (type 49)", "units" }, 211 { 'H', "Lock", "Unknown event (type 50)", "units" }, 212 { 'H', "Lock", "Unknown event (type 51)", "units" }, 213 { 'H', "Lock", "Unknown event (type 52)", "units" }, 214 { 'H', "Lock", "Unknown event (type 53)", "units" }, 215 { 'H', "Lock", "Unknown event (type 54)", "units" }, 216 { 'H', "Lock", "Unknown event (type 55)", "units" }, 217 #if defined(sun) 218 { 'I', "CPU+PIL", "Profiling interrupt", "nsec", 219 #else 220 /* FreeBSD */ 221 { 'I', "CPU+Pri_Class", "Profiling interrupt", "nsec", 222 #endif 223 "profile:::profile-97", NULL }, 224 { 'I', "Lock", "Unknown event (type 57)", "units" }, 225 { 'I', "Lock", "Unknown event (type 58)", "units" }, 226 { 'I', "Lock", "Unknown event (type 59)", "units" }, 227 { 'E', "Lock", "Recursive lock entry detected", "(N/A)", 228 "lockstat:::rw-release", NULL, "lockstat:::rw-acquire" }, 229 { 'E', "Lock", "Lockstat enter failure", "(N/A)" }, 230 { 'E', "Lock", "Lockstat exit failure", "nsec" }, 231 { 'E', "Lock", "Lockstat record failure", "(N/A)" }, 232 }; 233 234 #if !defined(sun) 235 static char *g_pri_class[] = { 236 "", 237 "Intr", 238 "RealT", 239 "TShar", 240 "Idle" 241 }; 242 #endif 243 244 static void 245 fail(int do_perror, const char *message, ...) 246 { 247 va_list args; 248 int save_errno = errno; 249 250 va_start(args, message); 251 (void) fprintf(stderr, "lockstat: "); 252 (void) vfprintf(stderr, message, args); 253 va_end(args); 254 if (do_perror) 255 (void) fprintf(stderr, ": %s", strerror(save_errno)); 256 (void) fprintf(stderr, "\n"); 257 exit(2); 258 } 259 260 static void 261 dfail(const char *message, ...) 262 { 263 va_list args; 264 265 va_start(args, message); 266 (void) fprintf(stderr, "lockstat: "); 267 (void) vfprintf(stderr, message, args); 268 va_end(args); 269 (void) fprintf(stderr, ": %s\n", 270 dtrace_errmsg(g_dtp, dtrace_errno(g_dtp))); 271 272 exit(2); 273 } 274 275 static void 276 show_events(char event_type, char *desc) 277 { 278 int i, first = -1, last; 279 280 for (i = 0; i < LS_MAX_EVENTS; i++) { 281 ls_event_info_t *evp = &g_event_info[i]; 282 if (evp->ev_type != event_type || 283 strncmp(evp->ev_desc, "Unknown event", 13) == 0) 284 continue; 285 if (first == -1) 286 first = i; 287 last = i; 288 } 289 290 (void) fprintf(stderr, 291 "\n%s events (lockstat -%c or lockstat -e %d-%d):\n\n", 292 desc, event_type, first, last); 293 294 for (i = first; i <= last; i++) 295 (void) fprintf(stderr, 296 "%4d = %s\n", i, g_event_info[i].ev_desc); 297 } 298 299 static void 300 usage(void) 301 { 302 (void) fprintf(stderr, 303 "Usage: lockstat [options] command [args]\n" 304 "\nEvent selection options:\n\n" 305 " -C watch contention events [on by default]\n" 306 " -E watch error events [off by default]\n" 307 " -H watch hold events [off by default]\n" 308 " -I watch interrupt events [off by default]\n" 309 " -A watch all lock events [equivalent to -CH]\n" 310 " -e event_list only watch the specified events (shown below);\n" 311 " <event_list> is a comma-separated list of\n" 312 " events or ranges of events, e.g. 1,4-7,35\n" 313 " -i rate interrupt rate for -I [default: %d Hz]\n" 314 "\nData gathering options:\n\n" 315 " -b basic statistics (lock, caller, event count)\n" 316 " -t timing for all events [default]\n" 317 " -h histograms for event times\n" 318 " -s depth stack traces <depth> deep\n" 319 " -x opt[=val] enable or modify DTrace options\n" 320 "\nData filtering options:\n\n" 321 " -n nrecords maximum number of data records [default: %d]\n" 322 " -l lock[,size] only watch <lock>, which can be specified as a\n" 323 " symbolic name or hex address; <size> defaults\n" 324 " to the ELF symbol size if available, 1 if not\n" 325 " -f func[,size] only watch events generated by <func>\n" 326 " -d duration only watch events longer than <duration>\n" 327 " -T trace (rather than sample) events\n" 328 "\nData reporting options:\n\n" 329 " -c coalesce lock data for arrays like pse_mutex[]\n" 330 " -k coalesce PCs within functions\n" 331 " -g show total events generated by function\n" 332 " -w wherever: don't distinguish events by caller\n" 333 " -W whichever: don't distinguish events by lock\n" 334 " -R display rates rather than counts\n" 335 " -p parsable output format (awk(1)-friendly)\n" 336 " -P sort lock data by (count * avg_time) product\n" 337 " -D n only display top <n> events of each type\n" 338 " -o filename send output to <filename>\n", 339 DEFAULT_HZ, DEFAULT_NRECS); 340 341 show_events('C', "Contention"); 342 show_events('H', "Hold-time"); 343 show_events('I', "Interrupt"); 344 show_events('E', "Error"); 345 (void) fprintf(stderr, "\n"); 346 347 exit(1); 348 } 349 350 static int 351 lockcmp(lsrec_t *a, lsrec_t *b) 352 { 353 int i; 354 355 if (a->ls_event < b->ls_event) 356 return (-1); 357 if (a->ls_event > b->ls_event) 358 return (1); 359 360 for (i = g_stkdepth - 1; i >= 0; i--) { 361 if (a->ls_stack[i] < b->ls_stack[i]) 362 return (-1); 363 if (a->ls_stack[i] > b->ls_stack[i]) 364 return (1); 365 } 366 367 if (a->ls_caller < b->ls_caller) 368 return (-1); 369 if (a->ls_caller > b->ls_caller) 370 return (1); 371 372 if (a->ls_lock < b->ls_lock) 373 return (-1); 374 if (a->ls_lock > b->ls_lock) 375 return (1); 376 377 return (0); 378 } 379 380 static int 381 countcmp(lsrec_t *a, lsrec_t *b) 382 { 383 if (a->ls_event < b->ls_event) 384 return (-1); 385 if (a->ls_event > b->ls_event) 386 return (1); 387 388 return (b->ls_count - a->ls_count); 389 } 390 391 static int 392 timecmp(lsrec_t *a, lsrec_t *b) 393 { 394 if (a->ls_event < b->ls_event) 395 return (-1); 396 if (a->ls_event > b->ls_event) 397 return (1); 398 399 if (a->ls_time < b->ls_time) 400 return (1); 401 if (a->ls_time > b->ls_time) 402 return (-1); 403 404 return (0); 405 } 406 407 static int 408 lockcmp_anywhere(lsrec_t *a, lsrec_t *b) 409 { 410 if (a->ls_event < b->ls_event) 411 return (-1); 412 if (a->ls_event > b->ls_event) 413 return (1); 414 415 if (a->ls_lock < b->ls_lock) 416 return (-1); 417 if (a->ls_lock > b->ls_lock) 418 return (1); 419 420 return (0); 421 } 422 423 static int 424 lock_and_count_cmp_anywhere(lsrec_t *a, lsrec_t *b) 425 { 426 if (a->ls_event < b->ls_event) 427 return (-1); 428 if (a->ls_event > b->ls_event) 429 return (1); 430 431 if (a->ls_lock < b->ls_lock) 432 return (-1); 433 if (a->ls_lock > b->ls_lock) 434 return (1); 435 436 return (b->ls_count - a->ls_count); 437 } 438 439 static int 440 sitecmp_anylock(lsrec_t *a, lsrec_t *b) 441 { 442 int i; 443 444 if (a->ls_event < b->ls_event) 445 return (-1); 446 if (a->ls_event > b->ls_event) 447 return (1); 448 449 for (i = g_stkdepth - 1; i >= 0; i--) { 450 if (a->ls_stack[i] < b->ls_stack[i]) 451 return (-1); 452 if (a->ls_stack[i] > b->ls_stack[i]) 453 return (1); 454 } 455 456 if (a->ls_caller < b->ls_caller) 457 return (-1); 458 if (a->ls_caller > b->ls_caller) 459 return (1); 460 461 return (0); 462 } 463 464 static int 465 site_and_count_cmp_anylock(lsrec_t *a, lsrec_t *b) 466 { 467 int i; 468 469 if (a->ls_event < b->ls_event) 470 return (-1); 471 if (a->ls_event > b->ls_event) 472 return (1); 473 474 for (i = g_stkdepth - 1; i >= 0; i--) { 475 if (a->ls_stack[i] < b->ls_stack[i]) 476 return (-1); 477 if (a->ls_stack[i] > b->ls_stack[i]) 478 return (1); 479 } 480 481 if (a->ls_caller < b->ls_caller) 482 return (-1); 483 if (a->ls_caller > b->ls_caller) 484 return (1); 485 486 return (b->ls_count - a->ls_count); 487 } 488 489 static void 490 lsmergesort(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **a, lsrec_t **b, int n) 491 { 492 int m = n / 2; 493 int i, j; 494 495 if (m > 1) 496 lsmergesort(cmp, a, b, m); 497 if (n - m > 1) 498 lsmergesort(cmp, a + m, b + m, n - m); 499 for (i = m; i > 0; i--) 500 b[i - 1] = a[i - 1]; 501 for (j = m - 1; j < n - 1; j++) 502 b[n + m - j - 2] = a[j + 1]; 503 while (i < j) 504 *a++ = cmp(b[i], b[j]) < 0 ? b[i++] : b[j--]; 505 *a = b[i]; 506 } 507 508 static void 509 coalesce(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **lock, int n) 510 { 511 int i, j; 512 lsrec_t *target, *current; 513 514 target = lock[0]; 515 516 for (i = 1; i < n; i++) { 517 current = lock[i]; 518 if (cmp(current, target) != 0) { 519 target = current; 520 continue; 521 } 522 current->ls_event = LS_MAX_EVENTS; 523 target->ls_count += current->ls_count; 524 target->ls_refcnt += current->ls_refcnt; 525 if (g_recsize < LS_TIME) 526 continue; 527 target->ls_time += current->ls_time; 528 if (g_recsize < LS_HIST) 529 continue; 530 for (j = 0; j < 64; j++) 531 target->ls_hist[j] += current->ls_hist[j]; 532 } 533 } 534 535 static void 536 coalesce_symbol(uintptr_t *addrp) 537 { 538 uintptr_t symoff; 539 size_t symsize; 540 541 if (addr_to_sym(*addrp, &symoff, &symsize) != NULL && symoff < symsize) 542 *addrp -= symoff; 543 } 544 545 static void 546 predicate_add(char **pred, char *what, char *cmp, uintptr_t value) 547 { 548 char *new; 549 int len, newlen; 550 551 if (what == NULL) 552 return; 553 554 if (*pred == NULL) { 555 *pred = malloc(1); 556 *pred[0] = '\0'; 557 } 558 559 len = strlen(*pred); 560 newlen = len + strlen(what) + 32 + strlen("( && )"); 561 new = malloc(newlen); 562 563 if (*pred[0] != '\0') { 564 if (cmp != NULL) { 565 (void) sprintf(new, "(%s) && (%s %s 0x%p)", 566 *pred, what, cmp, (void *)value); 567 } else { 568 (void) sprintf(new, "(%s) && (%s)", *pred, what); 569 } 570 } else { 571 if (cmp != NULL) { 572 (void) sprintf(new, "%s %s 0x%p", 573 what, cmp, (void *)value); 574 } else { 575 (void) sprintf(new, "%s", what); 576 } 577 } 578 579 free(*pred); 580 *pred = new; 581 } 582 583 static void 584 predicate_destroy(char **pred) 585 { 586 free(*pred); 587 *pred = NULL; 588 } 589 590 static void 591 filter_add(char **filt, char *what, uintptr_t base, uintptr_t size) 592 { 593 char buf[256], *c = buf, *new; 594 int len, newlen; 595 596 if (*filt == NULL) { 597 *filt = malloc(1); 598 *filt[0] = '\0'; 599 } 600 601 #if defined(sun) 602 (void) sprintf(c, "%s(%s >= 0x%p && %s < 0x%p)", *filt[0] != '\0' ? 603 " || " : "", what, (void *)base, what, (void *)(base + size)); 604 #else 605 (void) sprintf(c, "%s(%s >= %p && %s < %p)", *filt[0] != '\0' ? 606 " || " : "", what, (void *)base, what, (void *)(base + size)); 607 #endif 608 609 newlen = (len = strlen(*filt) + 1) + strlen(c); 610 new = malloc(newlen); 611 bcopy(*filt, new, len); 612 (void) strcat(new, c); 613 free(*filt); 614 *filt = new; 615 } 616 617 static void 618 filter_destroy(char **filt) 619 { 620 free(*filt); 621 *filt = NULL; 622 } 623 624 static void 625 dprog_add(const char *fmt, ...) 626 { 627 va_list args; 628 int size, offs; 629 char c; 630 631 va_start(args, fmt); 632 size = vsnprintf(&c, 1, fmt, args) + 1; 633 va_end(args); 634 635 if (g_proglen == 0) { 636 offs = 0; 637 } else { 638 offs = g_proglen - 1; 639 } 640 641 g_proglen = offs + size; 642 643 if ((g_prog = realloc(g_prog, g_proglen)) == NULL) 644 fail(1, "failed to reallocate program text"); 645 646 va_start(args, fmt); 647 (void) vsnprintf(&g_prog[offs], size, fmt, args); 648 va_end(args); 649 } 650 651 /* 652 * This function may read like an open sewer, but keep in mind that programs 653 * that generate other programs are rarely pretty. If one has the unenviable 654 * task of maintaining or -- worse -- extending this code, use the -V option 655 * to examine the D program as generated by this function. 656 */ 657 static void 658 dprog_addevent(int event) 659 { 660 ls_event_info_t *info = &g_event_info[event]; 661 char *pred = NULL; 662 char stack[20]; 663 const char *arg0, *caller; 664 char *arg1 = "arg1"; 665 char buf[80]; 666 hrtime_t dur; 667 int depth; 668 669 if (info->ev_name[0] == '\0') 670 return; 671 672 if (info->ev_type == 'I') { 673 /* 674 * For interrupt events, arg0 (normally the lock pointer) is 675 * the CPU address plus the current pil, and arg1 (normally 676 * the number of nanoseconds) is the number of nanoseconds 677 * late -- and it's stored in arg2. 678 */ 679 #if defined(sun) 680 arg0 = "(uintptr_t)curthread->t_cpu + \n" 681 "\t curthread->t_cpu->cpu_profile_pil"; 682 #else 683 arg0 = "(uintptr_t)(curthread->td_oncpu << 16) + \n" 684 "\t 0x01000000 + curthread->td_pri_class"; 685 #endif 686 caller = "(uintptr_t)arg0"; 687 arg1 = "arg2"; 688 } else { 689 arg0 = "(uintptr_t)arg0"; 690 caller = "caller"; 691 } 692 693 if (g_recsize > LS_HIST) { 694 for (depth = 0; g_recsize > LS_STACK(depth); depth++) 695 continue; 696 697 if (g_tracing) { 698 (void) sprintf(stack, "\tstack(%d);\n", depth); 699 } else { 700 (void) sprintf(stack, ", stack(%d)", depth); 701 } 702 } else { 703 (void) sprintf(stack, ""); 704 } 705 706 if (info->ev_acquire != NULL) { 707 /* 708 * If this is a hold event, we need to generate an additional 709 * clause for the acquire; the clause for the release will be 710 * generated with the aggregating statement, below. 711 */ 712 dprog_add("%s\n", info->ev_acquire); 713 predicate_add(&pred, info->ev_predicate, NULL, 0); 714 predicate_add(&pred, g_predicate, NULL, 0); 715 if (pred != NULL) 716 dprog_add("/%s/\n", pred); 717 718 dprog_add("{\n"); 719 (void) sprintf(buf, "self->ev%d[(uintptr_t)arg0]", event); 720 721 if (info->ev_type == 'H') { 722 dprog_add("\t%s = timestamp;\n", buf); 723 } else { 724 /* 725 * If this isn't a hold event, it's the recursive 726 * error event. For this, we simply bump the 727 * thread-local, per-lock count. 728 */ 729 dprog_add("\t%s++;\n", buf); 730 } 731 732 dprog_add("}\n\n"); 733 predicate_destroy(&pred); 734 pred = NULL; 735 736 if (info->ev_type == 'E') { 737 /* 738 * If this is the recursive lock error event, we need 739 * to generate an additional clause to decrement the 740 * thread-local, per-lock count. This assures that we 741 * only execute the aggregating clause if we have 742 * recursive entry. 743 */ 744 dprog_add("%s\n", info->ev_name); 745 dprog_add("/%s/\n{\n\t%s--;\n}\n\n", buf, buf); 746 } 747 748 predicate_add(&pred, buf, NULL, 0); 749 750 if (info->ev_type == 'H') { 751 (void) sprintf(buf, "timestamp -\n\t " 752 "self->ev%d[(uintptr_t)arg0]", event); 753 } 754 755 arg1 = buf; 756 } else { 757 predicate_add(&pred, info->ev_predicate, NULL, 0); 758 if (info->ev_type != 'I') 759 predicate_add(&pred, g_predicate, NULL, 0); 760 else 761 predicate_add(&pred, g_ipredicate, NULL, 0); 762 } 763 764 if ((dur = g_min_duration[event]) != 0) 765 predicate_add(&pred, arg1, ">=", dur); 766 767 dprog_add("%s\n", info->ev_name); 768 769 if (pred != NULL) 770 dprog_add("/%s/\n", pred); 771 predicate_destroy(&pred); 772 773 dprog_add("{\n"); 774 775 if (g_tracing) { 776 dprog_add("\ttrace(%dULL);\n", event); 777 dprog_add("\ttrace(%s);\n", arg0); 778 dprog_add("\ttrace(%s);\n", caller); 779 dprog_add(stack); 780 } else { 781 /* 782 * The ordering here is important: when we process the 783 * aggregate, we count on the fact that @avg appears before 784 * @hist in program order to assure that @avg is assigned the 785 * first aggregation variable ID and @hist assigned the 786 * second; see the comment in process_aggregate() for details. 787 */ 788 dprog_add("\t@avg[%dULL, %s, %s%s] = avg(%s);\n", 789 event, arg0, caller, stack, arg1); 790 791 if (g_recsize >= LS_HIST) { 792 dprog_add("\t@hist[%dULL, %s, %s%s] = quantize" 793 "(%s);\n", event, arg0, caller, stack, arg1); 794 } 795 } 796 797 if (info->ev_acquire != NULL) 798 dprog_add("\tself->ev%d[arg0] = 0;\n", event); 799 800 dprog_add("}\n\n"); 801 } 802 803 static void 804 dprog_compile() 805 { 806 dtrace_prog_t *prog; 807 dtrace_proginfo_t info; 808 809 if (g_Vflag) { 810 (void) fprintf(stderr, "lockstat: vvvv D program vvvv\n"); 811 (void) fputs(g_prog, stderr); 812 (void) fprintf(stderr, "lockstat: ^^^^ D program ^^^^\n"); 813 } 814 815 if ((prog = dtrace_program_strcompile(g_dtp, g_prog, 816 DTRACE_PROBESPEC_NAME, 0, 0, NULL)) == NULL) 817 dfail("failed to compile program"); 818 819 if (dtrace_program_exec(g_dtp, prog, &info) == -1) 820 dfail("failed to enable probes"); 821 822 if (dtrace_go(g_dtp) != 0) 823 dfail("couldn't start tracing"); 824 } 825 826 static void 827 #if defined(sun) 828 status_fire(void) 829 #else 830 status_fire(int i) 831 #endif 832 {} 833 834 static void 835 status_init(void) 836 { 837 dtrace_optval_t val, status, agg; 838 struct sigaction act; 839 struct itimerspec ts; 840 struct sigevent ev; 841 timer_t tid; 842 843 if (dtrace_getopt(g_dtp, "statusrate", &status) == -1) 844 dfail("failed to get 'statusrate'"); 845 846 if (dtrace_getopt(g_dtp, "aggrate", &agg) == -1) 847 dfail("failed to get 'statusrate'"); 848 849 /* 850 * We would want to awaken at a rate that is the GCD of the statusrate 851 * and the aggrate -- but that seems a bit absurd. Instead, we'll 852 * simply awaken at a rate that is the more frequent of the two, which 853 * assures that we're never later than the interval implied by the 854 * more frequent rate. 855 */ 856 val = status < agg ? status : agg; 857 858 (void) sigemptyset(&act.sa_mask); 859 act.sa_flags = 0; 860 act.sa_handler = status_fire; 861 (void) sigaction(SIGUSR1, &act, NULL); 862 863 ev.sigev_notify = SIGEV_SIGNAL; 864 ev.sigev_signo = SIGUSR1; 865 866 if (timer_create(CLOCK_REALTIME, &ev, &tid) == -1) 867 dfail("cannot create CLOCK_REALTIME timer"); 868 869 ts.it_value.tv_sec = val / NANOSEC; 870 ts.it_value.tv_nsec = val % NANOSEC; 871 ts.it_interval = ts.it_value; 872 873 if (timer_settime(tid, TIMER_RELTIME, &ts, NULL) == -1) 874 dfail("cannot set time on CLOCK_REALTIME timer"); 875 } 876 877 static void 878 status_check(void) 879 { 880 if (!g_tracing && dtrace_aggregate_snap(g_dtp) != 0) 881 dfail("failed to snap aggregate"); 882 883 if (dtrace_status(g_dtp) == -1) 884 dfail("dtrace_status()"); 885 } 886 887 static void 888 lsrec_fill(lsrec_t *lsrec, const dtrace_recdesc_t *rec, int nrecs, caddr_t data) 889 { 890 bzero(lsrec, g_recsize); 891 lsrec->ls_count = 1; 892 893 if ((g_recsize > LS_HIST && nrecs < 4) || (nrecs < 3)) 894 fail(0, "truncated DTrace record"); 895 896 if (rec->dtrd_size != sizeof (uint64_t)) 897 fail(0, "bad event size in first record"); 898 899 /* LINTED - alignment */ 900 lsrec->ls_event = (uint32_t)*((uint64_t *)(data + rec->dtrd_offset)); 901 rec++; 902 903 if (rec->dtrd_size != sizeof (uintptr_t)) 904 fail(0, "bad lock address size in second record"); 905 906 /* LINTED - alignment */ 907 lsrec->ls_lock = *((uintptr_t *)(data + rec->dtrd_offset)); 908 rec++; 909 910 if (rec->dtrd_size != sizeof (uintptr_t)) 911 fail(0, "bad caller size in third record"); 912 913 /* LINTED - alignment */ 914 lsrec->ls_caller = *((uintptr_t *)(data + rec->dtrd_offset)); 915 rec++; 916 917 if (g_recsize > LS_HIST) { 918 int frames, i; 919 pc_t *stack; 920 921 frames = rec->dtrd_size / sizeof (pc_t); 922 /* LINTED - alignment */ 923 stack = (pc_t *)(data + rec->dtrd_offset); 924 925 for (i = 1; i < frames; i++) 926 lsrec->ls_stack[i - 1] = stack[i]; 927 } 928 } 929 930 /*ARGSUSED*/ 931 static int 932 count_aggregate(const dtrace_aggdata_t *agg, void *arg) 933 { 934 *((size_t *)arg) += 1; 935 936 return (DTRACE_AGGWALK_NEXT); 937 } 938 939 static int 940 process_aggregate(const dtrace_aggdata_t *agg, void *arg) 941 { 942 const dtrace_aggdesc_t *aggdesc = agg->dtada_desc; 943 caddr_t data = agg->dtada_data; 944 lsdata_t *lsdata = arg; 945 lsrec_t *lsrec = lsdata->lsd_next; 946 const dtrace_recdesc_t *rec; 947 uint64_t *avg, *quantized; 948 int i, j; 949 950 assert(lsdata->lsd_count < g_nrecs); 951 952 /* 953 * Aggregation variable IDs are guaranteed to be generated in program 954 * order, and they are guaranteed to start from DTRACE_AGGVARIDNONE 955 * plus one. As "avg" appears before "hist" in program order, we know 956 * that "avg" will be allocated the first aggregation variable ID, and 957 * "hist" will be allocated the second aggregation variable ID -- and 958 * we therefore use the aggregation variable ID to differentiate the 959 * cases. 960 */ 961 if (aggdesc->dtagd_varid > DTRACE_AGGVARIDNONE + 1) { 962 /* 963 * If this is the histogram entry. We'll copy the quantized 964 * data into lc_hist, and jump over the rest. 965 */ 966 rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1]; 967 968 if (aggdesc->dtagd_varid != DTRACE_AGGVARIDNONE + 2) 969 fail(0, "bad variable ID in aggregation record"); 970 971 if (rec->dtrd_size != 972 DTRACE_QUANTIZE_NBUCKETS * sizeof (uint64_t)) 973 fail(0, "bad quantize size in aggregation record"); 974 975 /* LINTED - alignment */ 976 quantized = (uint64_t *)(data + rec->dtrd_offset); 977 978 for (i = DTRACE_QUANTIZE_ZEROBUCKET, j = 0; 979 i < DTRACE_QUANTIZE_NBUCKETS; i++, j++) 980 lsrec->ls_hist[j] = quantized[i]; 981 982 goto out; 983 } 984 985 lsrec_fill(lsrec, &aggdesc->dtagd_rec[1], 986 aggdesc->dtagd_nrecs - 1, data); 987 988 rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1]; 989 990 if (rec->dtrd_size != 2 * sizeof (uint64_t)) 991 fail(0, "bad avg size in aggregation record"); 992 993 /* LINTED - alignment */ 994 avg = (uint64_t *)(data + rec->dtrd_offset); 995 lsrec->ls_count = (uint32_t)avg[0]; 996 lsrec->ls_time = (uintptr_t)avg[1]; 997 998 if (g_recsize >= LS_HIST) 999 return (DTRACE_AGGWALK_NEXT); 1000 1001 out: 1002 lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize); 1003 lsdata->lsd_count++; 1004 1005 return (DTRACE_AGGWALK_NEXT); 1006 } 1007 1008 static int 1009 process_trace(const dtrace_probedata_t *pdata, void *arg) 1010 { 1011 lsdata_t *lsdata = arg; 1012 lsrec_t *lsrec = lsdata->lsd_next; 1013 dtrace_eprobedesc_t *edesc = pdata->dtpda_edesc; 1014 caddr_t data = pdata->dtpda_data; 1015 1016 if (lsdata->lsd_count >= g_nrecs) 1017 return (DTRACE_CONSUME_NEXT); 1018 1019 lsrec_fill(lsrec, edesc->dtepd_rec, edesc->dtepd_nrecs, data); 1020 1021 lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize); 1022 lsdata->lsd_count++; 1023 1024 return (DTRACE_CONSUME_NEXT); 1025 } 1026 1027 static int 1028 process_data(FILE *out, char *data) 1029 { 1030 lsdata_t lsdata; 1031 1032 /* LINTED - alignment */ 1033 lsdata.lsd_next = (lsrec_t *)data; 1034 lsdata.lsd_count = 0; 1035 1036 if (g_tracing) { 1037 if (dtrace_consume(g_dtp, out, 1038 process_trace, NULL, &lsdata) != 0) 1039 dfail("failed to consume buffer"); 1040 1041 return (lsdata.lsd_count); 1042 } 1043 1044 if (dtrace_aggregate_walk_keyvarsorted(g_dtp, 1045 process_aggregate, &lsdata) != 0) 1046 dfail("failed to walk aggregate"); 1047 1048 return (lsdata.lsd_count); 1049 } 1050 1051 /*ARGSUSED*/ 1052 static int 1053 drophandler(const dtrace_dropdata_t *data, void *arg) 1054 { 1055 g_dropped++; 1056 (void) fprintf(stderr, "lockstat: warning: %s", data->dtdda_msg); 1057 return (DTRACE_HANDLE_OK); 1058 } 1059 1060 int 1061 main(int argc, char **argv) 1062 { 1063 char *data_buf; 1064 lsrec_t *lsp, **current, **first, **sort_buf, **merge_buf; 1065 FILE *out = stdout; 1066 int c; 1067 pid_t child; 1068 int status; 1069 int i, j; 1070 hrtime_t duration; 1071 char *addrp, *offp, *sizep, *evp, *lastp, *p; 1072 uintptr_t addr; 1073 size_t size, off; 1074 int events_specified = 0; 1075 int exec_errno = 0; 1076 uint32_t event; 1077 char *filt = NULL, *ifilt = NULL; 1078 static uint64_t ev_count[LS_MAX_EVENTS + 1]; 1079 static uint64_t ev_time[LS_MAX_EVENTS + 1]; 1080 dtrace_optval_t aggsize; 1081 char aggstr[10]; 1082 long ncpus; 1083 int dynvar = 0; 1084 int err; 1085 1086 if ((g_dtp = dtrace_open(DTRACE_VERSION, 0, &err)) == NULL) { 1087 fail(0, "cannot open dtrace library: %s", 1088 dtrace_errmsg(NULL, err)); 1089 } 1090 1091 if (dtrace_handle_drop(g_dtp, &drophandler, NULL) == -1) 1092 dfail("couldn't establish drop handler"); 1093 1094 if (symtab_init() == -1) 1095 fail(1, "can't load kernel symbols"); 1096 1097 g_nrecs = DEFAULT_NRECS; 1098 1099 while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != GETOPT_EOF) { 1100 switch (c) { 1101 case 'b': 1102 g_recsize = LS_BASIC; 1103 break; 1104 1105 case 't': 1106 g_recsize = LS_TIME; 1107 break; 1108 1109 case 'h': 1110 g_recsize = LS_HIST; 1111 break; 1112 1113 case 's': 1114 if (!isdigit(optarg[0])) 1115 usage(); 1116 g_stkdepth = atoi(optarg); 1117 if (g_stkdepth > LS_MAX_STACK_DEPTH) 1118 fail(0, "max stack depth is %d", 1119 LS_MAX_STACK_DEPTH); 1120 g_recsize = LS_STACK(g_stkdepth); 1121 break; 1122 1123 case 'n': 1124 if (!isdigit(optarg[0])) 1125 usage(); 1126 g_nrecs = atoi(optarg); 1127 break; 1128 1129 case 'd': 1130 if (!isdigit(optarg[0])) 1131 usage(); 1132 duration = atoll(optarg); 1133 1134 /* 1135 * XXX -- durations really should be per event 1136 * since the units are different, but it's hard 1137 * to express this nicely in the interface. 1138 * Not clear yet what the cleanest solution is. 1139 */ 1140 for (i = 0; i < LS_MAX_EVENTS; i++) 1141 if (g_event_info[i].ev_type != 'E') 1142 g_min_duration[i] = duration; 1143 1144 break; 1145 1146 case 'i': 1147 if (!isdigit(optarg[0])) 1148 usage(); 1149 i = atoi(optarg); 1150 if (i <= 0) 1151 usage(); 1152 if (i > MAX_HZ) 1153 fail(0, "max interrupt rate is %d Hz", MAX_HZ); 1154 1155 for (j = 0; j < LS_MAX_EVENTS; j++) 1156 if (strcmp(g_event_info[j].ev_desc, 1157 "Profiling interrupt") == 0) 1158 break; 1159 1160 (void) sprintf(g_event_info[j].ev_name, 1161 "profile:::profile-%d", i); 1162 break; 1163 1164 case 'l': 1165 case 'f': 1166 addrp = strtok(optarg, ","); 1167 sizep = strtok(NULL, ","); 1168 addrp = strtok(optarg, ",+"); 1169 offp = strtok(NULL, ","); 1170 1171 size = sizep ? strtoul(sizep, NULL, 0) : 1; 1172 off = offp ? strtoul(offp, NULL, 0) : 0; 1173 1174 if (addrp[0] == '0') { 1175 addr = strtoul(addrp, NULL, 16) + off; 1176 } else { 1177 addr = sym_to_addr(addrp) + off; 1178 if (sizep == NULL) 1179 size = sym_size(addrp) - off; 1180 if (addr - off == 0) 1181 fail(0, "symbol '%s' not found", addrp); 1182 if (size == 0) 1183 size = 1; 1184 } 1185 1186 1187 if (c == 'l') { 1188 filter_add(&filt, "arg0", addr, size); 1189 } else { 1190 filter_add(&filt, "caller", addr, size); 1191 filter_add(&ifilt, "arg0", addr, size); 1192 } 1193 break; 1194 1195 case 'e': 1196 evp = strtok_r(optarg, ",", &lastp); 1197 while (evp) { 1198 int ev1, ev2; 1199 char *evp2; 1200 1201 (void) strtok(evp, "-"); 1202 evp2 = strtok(NULL, "-"); 1203 ev1 = atoi(evp); 1204 ev2 = evp2 ? atoi(evp2) : ev1; 1205 if ((uint_t)ev1 >= LS_MAX_EVENTS || 1206 (uint_t)ev2 >= LS_MAX_EVENTS || ev1 > ev2) 1207 fail(0, "-e events out of range"); 1208 for (i = ev1; i <= ev2; i++) 1209 g_enabled[i] = 1; 1210 evp = strtok_r(NULL, ",", &lastp); 1211 } 1212 events_specified = 1; 1213 break; 1214 1215 case 'c': 1216 g_cflag = 1; 1217 break; 1218 1219 case 'k': 1220 g_kflag = 1; 1221 break; 1222 1223 case 'w': 1224 g_wflag = 1; 1225 break; 1226 1227 case 'W': 1228 g_Wflag = 1; 1229 break; 1230 1231 case 'g': 1232 g_gflag = 1; 1233 break; 1234 1235 case 'C': 1236 case 'E': 1237 case 'H': 1238 case 'I': 1239 for (i = 0; i < LS_MAX_EVENTS; i++) 1240 if (g_event_info[i].ev_type == c) 1241 g_enabled[i] = 1; 1242 events_specified = 1; 1243 break; 1244 1245 case 'A': 1246 for (i = 0; i < LS_MAX_EVENTS; i++) 1247 if (strchr("CH", g_event_info[i].ev_type)) 1248 g_enabled[i] = 1; 1249 events_specified = 1; 1250 break; 1251 1252 case 'T': 1253 g_tracing = 1; 1254 break; 1255 1256 case 'D': 1257 if (!isdigit(optarg[0])) 1258 usage(); 1259 g_topn = atoi(optarg); 1260 break; 1261 1262 case 'R': 1263 g_rates = 1; 1264 break; 1265 1266 case 'p': 1267 g_pflag = 1; 1268 break; 1269 1270 case 'P': 1271 g_Pflag = 1; 1272 break; 1273 1274 case 'o': 1275 if ((out = fopen(optarg, "w")) == NULL) 1276 fail(1, "error opening file"); 1277 break; 1278 1279 case 'V': 1280 g_Vflag = 1; 1281 break; 1282 1283 default: 1284 if (strchr(LOCKSTAT_OPTSTR, c) == NULL) 1285 usage(); 1286 } 1287 } 1288 1289 if (filt != NULL) { 1290 predicate_add(&g_predicate, filt, NULL, 0); 1291 filter_destroy(&filt); 1292 } 1293 1294 if (ifilt != NULL) { 1295 predicate_add(&g_ipredicate, ifilt, NULL, 0); 1296 filter_destroy(&ifilt); 1297 } 1298 1299 if (g_recsize == 0) { 1300 if (g_gflag) { 1301 g_stkdepth = LS_MAX_STACK_DEPTH; 1302 g_recsize = LS_STACK(g_stkdepth); 1303 } else { 1304 g_recsize = LS_TIME; 1305 } 1306 } 1307 1308 if (g_gflag && g_recsize <= LS_STACK(0)) 1309 fail(0, "'-g' requires at least '-s 1' data gathering"); 1310 1311 /* 1312 * Make sure the alignment is reasonable 1313 */ 1314 g_recsize = -(-g_recsize & -sizeof (uint64_t)); 1315 1316 for (i = 0; i < LS_MAX_EVENTS; i++) { 1317 /* 1318 * If no events were specified, enable -C. 1319 */ 1320 if (!events_specified && g_event_info[i].ev_type == 'C') 1321 g_enabled[i] = 1; 1322 } 1323 1324 for (i = 0; i < LS_MAX_EVENTS; i++) { 1325 if (!g_enabled[i]) 1326 continue; 1327 1328 if (g_event_info[i].ev_acquire != NULL) { 1329 /* 1330 * If we've enabled a hold event, we must explicitly 1331 * allocate dynamic variable space. 1332 */ 1333 dynvar = 1; 1334 } 1335 1336 dprog_addevent(i); 1337 } 1338 1339 /* 1340 * Make sure there are remaining arguments to specify a child command 1341 * to execute. 1342 */ 1343 if (argc <= optind) 1344 usage(); 1345 1346 if ((ncpus = sysconf(_SC_NPROCESSORS_ONLN)) == -1) 1347 dfail("couldn't determine number of online CPUs"); 1348 1349 /* 1350 * By default, we set our data buffer size to be the number of records 1351 * multiplied by the size of the record, doubled to account for some 1352 * DTrace slop and divided by the number of CPUs. We silently clamp 1353 * the aggregation size at both a minimum and a maximum to prevent 1354 * absurdly low or high values. 1355 */ 1356 if ((aggsize = (g_nrecs * g_recsize * 2) / ncpus) < MIN_AGGSIZE) 1357 aggsize = MIN_AGGSIZE; 1358 1359 if (aggsize > MAX_AGGSIZE) 1360 aggsize = MAX_AGGSIZE; 1361 1362 (void) sprintf(aggstr, "%lld", (long long)aggsize); 1363 1364 if (!g_tracing) { 1365 if (dtrace_setopt(g_dtp, "bufsize", "4k") == -1) 1366 dfail("failed to set 'bufsize'"); 1367 1368 if (dtrace_setopt(g_dtp, "aggsize", aggstr) == -1) 1369 dfail("failed to set 'aggsize'"); 1370 1371 if (dynvar) { 1372 /* 1373 * If we're using dynamic variables, we set our 1374 * dynamic variable size to be one megabyte per CPU, 1375 * with a hard-limit of 32 megabytes. This may still 1376 * be too small in some cases, but it can be tuned 1377 * manually via -x if need be. 1378 */ 1379 (void) sprintf(aggstr, "%ldm", ncpus < 32 ? ncpus : 32); 1380 1381 if (dtrace_setopt(g_dtp, "dynvarsize", aggstr) == -1) 1382 dfail("failed to set 'dynvarsize'"); 1383 } 1384 } else { 1385 if (dtrace_setopt(g_dtp, "bufsize", aggstr) == -1) 1386 dfail("failed to set 'bufsize'"); 1387 } 1388 1389 if (dtrace_setopt(g_dtp, "statusrate", "10sec") == -1) 1390 dfail("failed to set 'statusrate'"); 1391 1392 optind = 1; 1393 while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != GETOPT_EOF) { 1394 switch (c) { 1395 case 'x': 1396 if ((p = strchr(optarg, '=')) != NULL) 1397 *p++ = '\0'; 1398 1399 if (dtrace_setopt(g_dtp, optarg, p) != 0) 1400 dfail("failed to set -x %s", optarg); 1401 break; 1402 } 1403 } 1404 1405 argc -= optind; 1406 argv += optind; 1407 1408 dprog_compile(); 1409 status_init(); 1410 1411 g_elapsed = -gethrtime(); 1412 1413 /* 1414 * Spawn the specified command and wait for it to complete. 1415 */ 1416 child = fork(); 1417 if (child == -1) 1418 fail(1, "cannot fork"); 1419 if (child == 0) { 1420 (void) dtrace_close(g_dtp); 1421 (void) execvp(argv[0], &argv[0]); 1422 exec_errno = errno; 1423 exit(127); 1424 } 1425 1426 #if defined(sun) 1427 while (waitpid(child, &status, WEXITED) != child) 1428 #else 1429 while (waitpid(child, &status, 0) != child) 1430 #endif 1431 status_check(); 1432 1433 g_elapsed += gethrtime(); 1434 1435 if (WIFEXITED(status)) { 1436 if (WEXITSTATUS(status) != 0) { 1437 if (exec_errno != 0) { 1438 errno = exec_errno; 1439 fail(1, "could not execute %s", argv[0]); 1440 } 1441 (void) fprintf(stderr, 1442 "lockstat: warning: %s exited with code %d\n", 1443 argv[0], WEXITSTATUS(status)); 1444 } 1445 } else { 1446 (void) fprintf(stderr, 1447 "lockstat: warning: %s died on signal %d\n", 1448 argv[0], WTERMSIG(status)); 1449 } 1450 1451 if (dtrace_stop(g_dtp) == -1) 1452 dfail("failed to stop dtrace"); 1453 1454 /* 1455 * Before we read out the results, we need to allocate our buffer. 1456 * If we're tracing, then we'll just use the precalculated size. If 1457 * we're not, then we'll take a snapshot of the aggregate, and walk 1458 * it to count the number of records. 1459 */ 1460 if (!g_tracing) { 1461 if (dtrace_aggregate_snap(g_dtp) != 0) 1462 dfail("failed to snap aggregate"); 1463 1464 g_nrecs = 0; 1465 1466 if (dtrace_aggregate_walk(g_dtp, 1467 count_aggregate, &g_nrecs) != 0) 1468 dfail("failed to walk aggregate"); 1469 } 1470 1471 #if defined(sun) 1472 if ((data_buf = memalign(sizeof (uint64_t), 1473 (g_nrecs + 1) * g_recsize)) == NULL) 1474 #else 1475 if (posix_memalign((void **)&data_buf, sizeof (uint64_t), 1476 (g_nrecs + 1) * g_recsize) ) 1477 #endif 1478 fail(1, "Memory allocation failed"); 1479 1480 /* 1481 * Read out the DTrace data. 1482 */ 1483 g_nrecs_used = process_data(out, data_buf); 1484 1485 if (g_nrecs_used > g_nrecs || g_dropped) 1486 (void) fprintf(stderr, "lockstat: warning: " 1487 "ran out of data records (use -n for more)\n"); 1488 1489 /* LINTED - alignment */ 1490 for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++, 1491 /* LINTED - alignment */ 1492 lsp = (lsrec_t *)((char *)lsp + g_recsize)) { 1493 ev_count[lsp->ls_event] += lsp->ls_count; 1494 ev_time[lsp->ls_event] += lsp->ls_time; 1495 } 1496 1497 /* 1498 * If -g was specified, convert stacks into individual records. 1499 */ 1500 if (g_gflag) { 1501 lsrec_t *newlsp, *oldlsp; 1502 1503 #if defined(sun) 1504 newlsp = memalign(sizeof (uint64_t), 1505 g_nrecs_used * LS_TIME * (g_stkdepth + 1)); 1506 #else 1507 posix_memalign((void **)&newlsp, sizeof (uint64_t), 1508 g_nrecs_used * LS_TIME * (g_stkdepth + 1)); 1509 #endif 1510 if (newlsp == NULL) 1511 fail(1, "Cannot allocate space for -g processing"); 1512 lsp = newlsp; 1513 /* LINTED - alignment */ 1514 for (i = 0, oldlsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++, 1515 /* LINTED - alignment */ 1516 oldlsp = (lsrec_t *)((char *)oldlsp + g_recsize)) { 1517 int fr; 1518 int caller_in_stack = 0; 1519 1520 if (oldlsp->ls_count == 0) 1521 continue; 1522 1523 for (fr = 0; fr < g_stkdepth; fr++) { 1524 if (oldlsp->ls_stack[fr] == 0) 1525 break; 1526 if (oldlsp->ls_stack[fr] == oldlsp->ls_caller) 1527 caller_in_stack = 1; 1528 bcopy(oldlsp, lsp, LS_TIME); 1529 lsp->ls_caller = oldlsp->ls_stack[fr]; 1530 /* LINTED - alignment */ 1531 lsp = (lsrec_t *)((char *)lsp + LS_TIME); 1532 } 1533 if (!caller_in_stack) { 1534 bcopy(oldlsp, lsp, LS_TIME); 1535 /* LINTED - alignment */ 1536 lsp = (lsrec_t *)((char *)lsp + LS_TIME); 1537 } 1538 } 1539 g_nrecs = g_nrecs_used = 1540 ((uintptr_t)lsp - (uintptr_t)newlsp) / LS_TIME; 1541 g_recsize = LS_TIME; 1542 g_stkdepth = 0; 1543 free(data_buf); 1544 data_buf = (char *)newlsp; 1545 } 1546 1547 if ((sort_buf = calloc(2 * (g_nrecs + 1), 1548 sizeof (void *))) == NULL) 1549 fail(1, "Sort buffer allocation failed"); 1550 merge_buf = sort_buf + (g_nrecs + 1); 1551 1552 /* 1553 * Build the sort buffer, discarding zero-count records along the way. 1554 */ 1555 /* LINTED - alignment */ 1556 for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++, 1557 /* LINTED - alignment */ 1558 lsp = (lsrec_t *)((char *)lsp + g_recsize)) { 1559 if (lsp->ls_count == 0) 1560 lsp->ls_event = LS_MAX_EVENTS; 1561 sort_buf[i] = lsp; 1562 } 1563 1564 if (g_nrecs_used == 0) 1565 exit(0); 1566 1567 /* 1568 * Add a sentinel after the last record 1569 */ 1570 sort_buf[i] = lsp; 1571 lsp->ls_event = LS_MAX_EVENTS; 1572 1573 if (g_tracing) { 1574 report_trace(out, sort_buf); 1575 return (0); 1576 } 1577 1578 /* 1579 * Application of -g may have resulted in multiple records 1580 * with the same signature; coalesce them. 1581 */ 1582 if (g_gflag) { 1583 mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used); 1584 coalesce(lockcmp, sort_buf, g_nrecs_used); 1585 } 1586 1587 /* 1588 * Coalesce locks within the same symbol if -c option specified. 1589 * Coalesce PCs within the same function if -k option specified. 1590 */ 1591 if (g_cflag || g_kflag) { 1592 for (i = 0; i < g_nrecs_used; i++) { 1593 int fr; 1594 lsp = sort_buf[i]; 1595 if (g_cflag) 1596 coalesce_symbol(&lsp->ls_lock); 1597 if (g_kflag) { 1598 for (fr = 0; fr < g_stkdepth; fr++) 1599 coalesce_symbol(&lsp->ls_stack[fr]); 1600 coalesce_symbol(&lsp->ls_caller); 1601 } 1602 } 1603 mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used); 1604 coalesce(lockcmp, sort_buf, g_nrecs_used); 1605 } 1606 1607 /* 1608 * Coalesce callers if -w option specified 1609 */ 1610 if (g_wflag) { 1611 mergesort(lock_and_count_cmp_anywhere, 1612 sort_buf, merge_buf, g_nrecs_used); 1613 coalesce(lockcmp_anywhere, sort_buf, g_nrecs_used); 1614 } 1615 1616 /* 1617 * Coalesce locks if -W option specified 1618 */ 1619 if (g_Wflag) { 1620 mergesort(site_and_count_cmp_anylock, 1621 sort_buf, merge_buf, g_nrecs_used); 1622 coalesce(sitecmp_anylock, sort_buf, g_nrecs_used); 1623 } 1624 1625 /* 1626 * Sort data by contention count (ls_count) or total time (ls_time), 1627 * depending on g_Pflag. Override g_Pflag if time wasn't measured. 1628 */ 1629 if (g_recsize < LS_TIME) 1630 g_Pflag = 0; 1631 1632 if (g_Pflag) 1633 mergesort(timecmp, sort_buf, merge_buf, g_nrecs_used); 1634 else 1635 mergesort(countcmp, sort_buf, merge_buf, g_nrecs_used); 1636 1637 /* 1638 * Display data by event type 1639 */ 1640 first = &sort_buf[0]; 1641 while ((event = (*first)->ls_event) < LS_MAX_EVENTS) { 1642 current = first; 1643 while ((lsp = *current)->ls_event == event) 1644 current++; 1645 report_stats(out, first, current - first, ev_count[event], 1646 ev_time[event]); 1647 first = current; 1648 } 1649 1650 return (0); 1651 } 1652 1653 static char * 1654 format_symbol(char *buf, uintptr_t addr, int show_size) 1655 { 1656 uintptr_t symoff; 1657 char *symname; 1658 size_t symsize; 1659 1660 symname = addr_to_sym(addr, &symoff, &symsize); 1661 1662 if (show_size && symoff == 0) 1663 (void) sprintf(buf, "%s[%ld]", symname, (long)symsize); 1664 else if (symoff == 0) 1665 (void) sprintf(buf, "%s", symname); 1666 else if (symoff < 16 && bcmp(symname, "cpu[", 4) == 0) /* CPU+PIL */ 1667 #if defined(sun) 1668 (void) sprintf(buf, "%s+%ld", symname, (long)symoff); 1669 #else 1670 (void) sprintf(buf, "%s+%s", symname, g_pri_class[(int)symoff]); 1671 #endif 1672 else if (symoff <= symsize || (symoff < 256 && addr != symoff)) 1673 (void) sprintf(buf, "%s+0x%llx", symname, 1674 (unsigned long long)symoff); 1675 else 1676 (void) sprintf(buf, "0x%llx", (unsigned long long)addr); 1677 return (buf); 1678 } 1679 1680 static void 1681 report_stats(FILE *out, lsrec_t **sort_buf, size_t nrecs, uint64_t total_count, 1682 uint64_t total_time) 1683 { 1684 uint32_t event = sort_buf[0]->ls_event; 1685 lsrec_t *lsp; 1686 double ptotal = 0.0; 1687 double percent; 1688 int i, j, fr; 1689 int displayed; 1690 int first_bin, last_bin, max_bin_count, total_bin_count; 1691 int rectype; 1692 char buf[256]; 1693 char lhdr[80], chdr[80]; 1694 1695 rectype = g_recsize; 1696 1697 if (g_topn == 0) { 1698 (void) fprintf(out, "%20llu %s\n", 1699 g_rates == 0 ? total_count : 1700 ((unsigned long long)total_count * NANOSEC) / g_elapsed, 1701 g_event_info[event].ev_desc); 1702 return; 1703 } 1704 1705 (void) sprintf(lhdr, "%s%s", 1706 g_Wflag ? "Hottest " : "", g_event_info[event].ev_lhdr); 1707 (void) sprintf(chdr, "%s%s", 1708 g_wflag ? "Hottest " : "", "Caller"); 1709 1710 if (!g_pflag) 1711 (void) fprintf(out, 1712 "\n%s: %.0f events in %.3f seconds (%.0f events/sec)\n\n", 1713 g_event_info[event].ev_desc, (double)total_count, 1714 (double)g_elapsed / NANOSEC, 1715 (double)total_count * NANOSEC / g_elapsed); 1716 1717 if (!g_pflag && rectype < LS_HIST) { 1718 (void) sprintf(buf, "%s", g_event_info[event].ev_units); 1719 (void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n", 1720 g_rates ? "ops/s" : "Count", 1721 g_gflag ? "genr" : "indv", 1722 "cuml", "rcnt", rectype >= LS_TIME ? buf : "", lhdr, chdr); 1723 (void) fprintf(out, "---------------------------------" 1724 "----------------------------------------------\n"); 1725 } 1726 1727 displayed = 0; 1728 for (i = 0; i < nrecs; i++) { 1729 lsp = sort_buf[i]; 1730 1731 if (displayed++ >= g_topn) 1732 break; 1733 1734 if (g_pflag) { 1735 int j; 1736 1737 (void) fprintf(out, "%u %u", 1738 lsp->ls_event, lsp->ls_count); 1739 (void) fprintf(out, " %s", 1740 format_symbol(buf, lsp->ls_lock, g_cflag)); 1741 (void) fprintf(out, " %s", 1742 format_symbol(buf, lsp->ls_caller, 0)); 1743 (void) fprintf(out, " %f", 1744 (double)lsp->ls_refcnt / lsp->ls_count); 1745 if (rectype >= LS_TIME) 1746 (void) fprintf(out, " %llu", 1747 (unsigned long long)lsp->ls_time); 1748 if (rectype >= LS_HIST) { 1749 for (j = 0; j < 64; j++) 1750 (void) fprintf(out, " %u", 1751 lsp->ls_hist[j]); 1752 } 1753 for (j = 0; j < LS_MAX_STACK_DEPTH; j++) { 1754 if (rectype <= LS_STACK(j) || 1755 lsp->ls_stack[j] == 0) 1756 break; 1757 (void) fprintf(out, " %s", 1758 format_symbol(buf, lsp->ls_stack[j], 0)); 1759 } 1760 (void) fprintf(out, "\n"); 1761 continue; 1762 } 1763 1764 if (rectype >= LS_HIST) { 1765 (void) fprintf(out, "---------------------------------" 1766 "----------------------------------------------\n"); 1767 (void) sprintf(buf, "%s", 1768 g_event_info[event].ev_units); 1769 (void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n", 1770 g_rates ? "ops/s" : "Count", 1771 g_gflag ? "genr" : "indv", 1772 "cuml", "rcnt", buf, lhdr, chdr); 1773 } 1774 1775 if (g_Pflag && total_time != 0) 1776 percent = (lsp->ls_time * 100.00) / total_time; 1777 else 1778 percent = (lsp->ls_count * 100.00) / total_count; 1779 1780 ptotal += percent; 1781 1782 if (rectype >= LS_TIME) 1783 (void) sprintf(buf, "%llu", 1784 (unsigned long long)(lsp->ls_time / lsp->ls_count)); 1785 else 1786 buf[0] = '\0'; 1787 1788 (void) fprintf(out, "%5llu ", 1789 g_rates == 0 ? lsp->ls_count : 1790 ((uint64_t)lsp->ls_count * NANOSEC) / g_elapsed); 1791 1792 (void) fprintf(out, "%3.0f%% ", percent); 1793 1794 if (g_gflag) 1795 (void) fprintf(out, "---- "); 1796 else 1797 (void) fprintf(out, "%3.0f%% ", ptotal); 1798 1799 (void) fprintf(out, "%4.2f %8s ", 1800 (double)lsp->ls_refcnt / lsp->ls_count, buf); 1801 1802 (void) fprintf(out, "%-22s ", 1803 format_symbol(buf, lsp->ls_lock, g_cflag)); 1804 1805 (void) fprintf(out, "%-24s\n", 1806 format_symbol(buf, lsp->ls_caller, 0)); 1807 1808 if (rectype < LS_HIST) 1809 continue; 1810 1811 (void) fprintf(out, "\n"); 1812 (void) fprintf(out, "%10s %31s %-9s %-24s\n", 1813 g_event_info[event].ev_units, 1814 "------ Time Distribution ------", 1815 g_rates ? "ops/s" : "count", 1816 rectype > LS_STACK(0) ? "Stack" : ""); 1817 1818 first_bin = 0; 1819 while (lsp->ls_hist[first_bin] == 0) 1820 first_bin++; 1821 1822 last_bin = 63; 1823 while (lsp->ls_hist[last_bin] == 0) 1824 last_bin--; 1825 1826 max_bin_count = 0; 1827 total_bin_count = 0; 1828 for (j = first_bin; j <= last_bin; j++) { 1829 total_bin_count += lsp->ls_hist[j]; 1830 if (lsp->ls_hist[j] > max_bin_count) 1831 max_bin_count = lsp->ls_hist[j]; 1832 } 1833 1834 /* 1835 * If we went a few frames below the caller, ignore them 1836 */ 1837 for (fr = 3; fr > 0; fr--) 1838 if (lsp->ls_stack[fr] == lsp->ls_caller) 1839 break; 1840 1841 for (j = first_bin; j <= last_bin; j++) { 1842 uint_t depth = (lsp->ls_hist[j] * 30) / total_bin_count; 1843 (void) fprintf(out, "%10llu |%s%s %-9u ", 1844 1ULL << j, 1845 "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + 30 - depth, 1846 " " + depth, 1847 g_rates == 0 ? lsp->ls_hist[j] : 1848 (uint_t)(((uint64_t)lsp->ls_hist[j] * NANOSEC) / 1849 g_elapsed)); 1850 if (rectype <= LS_STACK(fr) || lsp->ls_stack[fr] == 0) { 1851 (void) fprintf(out, "\n"); 1852 continue; 1853 } 1854 (void) fprintf(out, "%-24s\n", 1855 format_symbol(buf, lsp->ls_stack[fr], 0)); 1856 fr++; 1857 } 1858 while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) { 1859 (void) fprintf(out, "%15s %-36s %-24s\n", "", "", 1860 format_symbol(buf, lsp->ls_stack[fr], 0)); 1861 fr++; 1862 } 1863 } 1864 1865 if (!g_pflag) 1866 (void) fprintf(out, "---------------------------------" 1867 "----------------------------------------------\n"); 1868 1869 (void) fflush(out); 1870 } 1871 1872 static void 1873 report_trace(FILE *out, lsrec_t **sort_buf) 1874 { 1875 lsrec_t *lsp; 1876 int i, fr; 1877 int rectype; 1878 char buf[256], buf2[256]; 1879 1880 rectype = g_recsize; 1881 1882 if (!g_pflag) { 1883 (void) fprintf(out, "%5s %7s %11s %-24s %-24s\n", 1884 "Event", "Time", "Owner", "Lock", "Caller"); 1885 (void) fprintf(out, "---------------------------------" 1886 "----------------------------------------------\n"); 1887 } 1888 1889 for (i = 0; i < g_nrecs_used; i++) { 1890 1891 lsp = sort_buf[i]; 1892 1893 if (lsp->ls_event >= LS_MAX_EVENTS || lsp->ls_count == 0) 1894 continue; 1895 1896 (void) fprintf(out, "%2d %10llu %11p %-24s %-24s\n", 1897 lsp->ls_event, (unsigned long long)lsp->ls_time, 1898 (void *)lsp->ls_next, 1899 format_symbol(buf, lsp->ls_lock, 0), 1900 format_symbol(buf2, lsp->ls_caller, 0)); 1901 1902 if (rectype <= LS_STACK(0)) 1903 continue; 1904 1905 /* 1906 * If we went a few frames below the caller, ignore them 1907 */ 1908 for (fr = 3; fr > 0; fr--) 1909 if (lsp->ls_stack[fr] == lsp->ls_caller) 1910 break; 1911 1912 while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) { 1913 (void) fprintf(out, "%53s %-24s\n", "", 1914 format_symbol(buf, lsp->ls_stack[fr], 0)); 1915 fr++; 1916 } 1917 (void) fprintf(out, "\n"); 1918 } 1919 1920 (void) fflush(out); 1921 } 1922