1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <stdio.h> 29 #include <stddef.h> 30 #include <stdlib.h> 31 #include <stdarg.h> 32 #include <string.h> 33 #include <strings.h> 34 #include <ctype.h> 35 #include <fcntl.h> 36 #include <unistd.h> 37 #include <errno.h> 38 #include <limits.h> 39 #include <sys/types.h> 40 #include <sys/modctl.h> 41 #include <sys/stat.h> 42 #include <sys/wait.h> 43 #include <dtrace.h> 44 #include <sys/lockstat.h> 45 #include <alloca.h> 46 #include <signal.h> 47 #include <assert.h> 48 49 #ifdef illumos 50 #define GETOPT_EOF EOF 51 #else 52 #include <sys/time.h> 53 #include <sys/resource.h> 54 55 #define mergesort(a, b, c, d) lsmergesort(a, b, c, d) 56 #define GETOPT_EOF (-1) 57 58 typedef uintptr_t pc_t; 59 #endif 60 61 #define LOCKSTAT_OPTSTR "x:bths:n:d:i:l:f:e:ckwWgCHEATID:RpPo:V" 62 63 #define LS_MAX_STACK_DEPTH 50 64 #define LS_MAX_EVENTS 64 65 66 typedef struct lsrec { 67 struct lsrec *ls_next; /* next in hash chain */ 68 #ifdef illumos 69 uintptr_t ls_lock; /* lock address */ 70 #else 71 char *ls_lock; /* lock name */ 72 #endif 73 uintptr_t ls_caller; /* caller address */ 74 uint32_t ls_count; /* cumulative event count */ 75 uint32_t ls_event; /* type of event */ 76 uintptr_t ls_refcnt; /* cumulative reference count */ 77 uint64_t ls_time; /* cumulative event duration */ 78 uint32_t ls_hist[64]; /* log2(duration) histogram */ 79 uintptr_t ls_stack[LS_MAX_STACK_DEPTH]; 80 } lsrec_t; 81 82 typedef struct lsdata { 83 struct lsrec *lsd_next; /* next available */ 84 int lsd_count; /* number of records */ 85 } lsdata_t; 86 87 /* 88 * Definitions for the types of experiments which can be run. They are 89 * listed in increasing order of memory cost and processing time cost. 90 * The numerical value of each type is the number of bytes needed per record. 91 */ 92 #define LS_BASIC offsetof(lsrec_t, ls_time) 93 #define LS_TIME offsetof(lsrec_t, ls_hist[0]) 94 #define LS_HIST offsetof(lsrec_t, ls_stack[0]) 95 #define LS_STACK(depth) offsetof(lsrec_t, ls_stack[depth]) 96 97 static void report_stats(FILE *, lsrec_t **, size_t, uint64_t, uint64_t); 98 static void report_trace(FILE *, lsrec_t **); 99 100 extern int symtab_init(void); 101 extern char *addr_to_sym(uintptr_t, uintptr_t *, size_t *); 102 extern uintptr_t sym_to_addr(char *name); 103 extern size_t sym_size(char *name); 104 extern char *strtok_r(char *, const char *, char **); 105 106 #define DEFAULT_NRECS 10000 107 #define DEFAULT_HZ 97 108 #define MAX_HZ 1000 109 #define MIN_AGGSIZE (16 * 1024) 110 #define MAX_AGGSIZE (32 * 1024 * 1024) 111 112 static int g_stkdepth; 113 static int g_topn = INT_MAX; 114 static hrtime_t g_elapsed; 115 static int g_rates = 0; 116 static int g_pflag = 0; 117 static int g_Pflag = 0; 118 static int g_wflag = 0; 119 static int g_Wflag = 0; 120 static int g_cflag = 0; 121 static int g_kflag = 0; 122 static int g_gflag = 0; 123 static int g_Vflag = 0; 124 static int g_tracing = 0; 125 static size_t g_recsize; 126 static size_t g_nrecs; 127 static int g_nrecs_used; 128 static uchar_t g_enabled[LS_MAX_EVENTS]; 129 static hrtime_t g_min_duration[LS_MAX_EVENTS]; 130 static dtrace_hdl_t *g_dtp; 131 static char *g_predicate; 132 static char *g_ipredicate; 133 static char *g_prog; 134 static int g_proglen; 135 static int g_dropped; 136 137 typedef struct ls_event_info { 138 char ev_type; 139 char ev_lhdr[20]; 140 char ev_desc[80]; 141 char ev_units[10]; 142 char ev_name[DTRACE_NAMELEN]; 143 char *ev_predicate; 144 char *ev_acquire; 145 } ls_event_info_t; 146 147 static ls_event_info_t g_event_info[LS_MAX_EVENTS] = { 148 { 'C', "Lock", "Adaptive mutex spin", "nsec", 149 "lockstat:::adaptive-spin" }, 150 { 'C', "Lock", "Adaptive mutex block", "nsec", 151 "lockstat:::adaptive-block" }, 152 { 'C', "Lock", "Spin lock spin", "nsec", 153 "lockstat:::spin-spin" }, 154 { 'C', "Lock", "Thread lock spin", "nsec", 155 "lockstat:::thread-spin" }, 156 { 'C', "Lock", "R/W writer blocked by writer", "nsec", 157 "lockstat:::rw-block", "arg2 == 0 && arg3 == 1" }, 158 { 'C', "Lock", "R/W writer blocked by readers", "nsec", 159 "lockstat:::rw-block", "arg2 == 0 && arg3 == 0 && arg4" }, 160 { 'C', "Lock", "R/W reader blocked by writer", "nsec", 161 "lockstat:::rw-block", "arg2 != 0 && arg3 == 1" }, 162 { 'C', "Lock", "R/W reader blocked by write wanted", "nsec", 163 "lockstat:::rw-block", "arg2 != 0 && arg3 == 0 && arg4" }, 164 { 'C', "Lock", "R/W writer spin on writer", "nsec", 165 "lockstat:::rw-spin", "arg2 == 0 && arg3 == 1" }, 166 { 'C', "Lock", "R/W writer spin on readers", "nsec", 167 "lockstat:::rw-spin", "arg2 == 0 && arg3 == 0 && arg4" }, 168 { 'C', "Lock", "R/W reader spin on writer", "nsec", 169 "lockstat:::rw-spin", "arg2 != 0 && arg3 == 1" }, 170 { 'C', "Lock", "R/W reader spin on write wanted", "nsec", 171 "lockstat:::rw-spin", "arg2 != 0 && arg3 == 0 && arg4" }, 172 { 'C', "Lock", "SX exclusive block", "nsec", 173 "lockstat:::sx-block", "arg2 == 0" }, 174 { 'C', "Lock", "SX shared block", "nsec", 175 "lockstat:::sx-block", "arg2 != 0" }, 176 { 'C', "Lock", "SX exclusive spin", "nsec", 177 "lockstat:::sx-spin", "arg2 == 0" }, 178 { 'C', "Lock", "SX shared spin", "nsec", 179 "lockstat:::sx-spin", "arg2 != 0" }, 180 { 'C', "Lock", "Unknown event (type 16)", "units" }, 181 { 'C', "Lock", "Unknown event (type 17)", "units" }, 182 { 'C', "Lock", "Unknown event (type 18)", "units" }, 183 { 'C', "Lock", "Unknown event (type 19)", "units" }, 184 { 'C', "Lock", "Unknown event (type 20)", "units" }, 185 { 'C', "Lock", "Unknown event (type 21)", "units" }, 186 { 'C', "Lock", "Unknown event (type 22)", "units" }, 187 { 'C', "Lock", "Unknown event (type 23)", "units" }, 188 { 'C', "Lock", "Unknown event (type 24)", "units" }, 189 { 'C', "Lock", "Unknown event (type 25)", "units" }, 190 { 'C', "Lock", "Unknown event (type 26)", "units" }, 191 { 'C', "Lock", "Unknown event (type 27)", "units" }, 192 { 'C', "Lock", "Unknown event (type 28)", "units" }, 193 { 'C', "Lock", "Unknown event (type 29)", "units" }, 194 { 'C', "Lock", "Unknown event (type 30)", "units" }, 195 { 'C', "Lock", "Unknown event (type 31)", "units" }, 196 { 'H', "Lock", "Adaptive mutex hold", "nsec", 197 "lockstat:::adaptive-release", NULL, 198 "lockstat:::adaptive-acquire" }, 199 { 'H', "Lock", "Spin lock hold", "nsec", 200 "lockstat:::spin-release", NULL, 201 "lockstat:::spin-acquire" }, 202 { 'H', "Lock", "R/W writer hold", "nsec", 203 "lockstat:::rw-release", "arg1 == 0", 204 "lockstat:::rw-acquire" }, 205 { 'H', "Lock", "R/W reader hold", "nsec", 206 "lockstat:::rw-release", "arg1 == 1", 207 "lockstat:::rw-acquire" }, 208 { 'H', "Lock", "SX shared hold", "nsec", 209 "lockstat:::sx-release", "arg1 == 0", 210 "lockstat:::sx-acquire" }, 211 { 'H', "Lock", "SX exclusive hold", "nsec", 212 "lockstat:::sx-release", "arg1 == 1", 213 "lockstat:::sx-acquire" }, 214 { 'H', "Lock", "Unknown event (type 38)", "units" }, 215 { 'H', "Lock", "Unknown event (type 39)", "units" }, 216 { 'H', "Lock", "Unknown event (type 40)", "units" }, 217 { 'H', "Lock", "Unknown event (type 41)", "units" }, 218 { 'H', "Lock", "Unknown event (type 42)", "units" }, 219 { 'H', "Lock", "Unknown event (type 43)", "units" }, 220 { 'H', "Lock", "Unknown event (type 44)", "units" }, 221 { 'H', "Lock", "Unknown event (type 45)", "units" }, 222 { 'H', "Lock", "Unknown event (type 46)", "units" }, 223 { 'H', "Lock", "Unknown event (type 47)", "units" }, 224 { 'H', "Lock", "Unknown event (type 48)", "units" }, 225 { 'H', "Lock", "Unknown event (type 49)", "units" }, 226 { 'H', "Lock", "Unknown event (type 50)", "units" }, 227 { 'H', "Lock", "Unknown event (type 51)", "units" }, 228 { 'H', "Lock", "Unknown event (type 52)", "units" }, 229 { 'H', "Lock", "Unknown event (type 53)", "units" }, 230 { 'H', "Lock", "Unknown event (type 54)", "units" }, 231 { 'H', "Lock", "Unknown event (type 55)", "units" }, 232 #ifdef illumos 233 { 'I', "CPU+PIL", "Profiling interrupt", "nsec", 234 #else 235 { 'I', "CPU+Pri_Class", "Profiling interrupt", "nsec", 236 #endif 237 "profile:::profile-97", NULL }, 238 { 'I', "Lock", "Unknown event (type 57)", "units" }, 239 { 'I', "Lock", "Unknown event (type 58)", "units" }, 240 { 'I', "Lock", "Unknown event (type 59)", "units" }, 241 { 'E', "Lock", "Recursive lock entry detected", "(N/A)", 242 "lockstat:::rw-release", NULL, "lockstat:::rw-acquire" }, 243 { 'E', "Lock", "Lockstat enter failure", "(N/A)" }, 244 { 'E', "Lock", "Lockstat exit failure", "nsec" }, 245 { 'E', "Lock", "Lockstat record failure", "(N/A)" }, 246 }; 247 248 #ifndef illumos 249 static char *g_pri_class[] = { 250 "", 251 "Intr", 252 "RealT", 253 "TShar", 254 "Idle" 255 }; 256 #endif 257 258 static void 259 fail(int do_perror, const char *message, ...) 260 { 261 va_list args; 262 int save_errno = errno; 263 264 va_start(args, message); 265 (void) fprintf(stderr, "lockstat: "); 266 (void) vfprintf(stderr, message, args); 267 va_end(args); 268 if (do_perror) 269 (void) fprintf(stderr, ": %s", strerror(save_errno)); 270 (void) fprintf(stderr, "\n"); 271 exit(2); 272 } 273 274 static void 275 dfail(const char *message, ...) 276 { 277 va_list args; 278 279 va_start(args, message); 280 (void) fprintf(stderr, "lockstat: "); 281 (void) vfprintf(stderr, message, args); 282 va_end(args); 283 (void) fprintf(stderr, ": %s\n", 284 dtrace_errmsg(g_dtp, dtrace_errno(g_dtp))); 285 286 exit(2); 287 } 288 289 static void 290 show_events(char event_type, char *desc) 291 { 292 int i, first = -1, last; 293 294 for (i = 0; i < LS_MAX_EVENTS; i++) { 295 ls_event_info_t *evp = &g_event_info[i]; 296 if (evp->ev_type != event_type || 297 strncmp(evp->ev_desc, "Unknown event", 13) == 0) 298 continue; 299 if (first == -1) 300 first = i; 301 last = i; 302 } 303 304 (void) fprintf(stderr, 305 "\n%s events (lockstat -%c or lockstat -e %d-%d):\n\n", 306 desc, event_type, first, last); 307 308 for (i = first; i <= last; i++) 309 (void) fprintf(stderr, 310 "%4d = %s\n", i, g_event_info[i].ev_desc); 311 } 312 313 static void 314 usage(void) 315 { 316 (void) fprintf(stderr, 317 "Usage: lockstat [options] command [args]\n" 318 "\nGeneral options:\n\n" 319 " -V print the corresponding D program\n" 320 "\nEvent selection options:\n\n" 321 " -C watch contention events [on by default]\n" 322 " -E watch error events [off by default]\n" 323 " -H watch hold events [off by default]\n" 324 " -I watch interrupt events [off by default]\n" 325 " -A watch all lock events [equivalent to -CH]\n" 326 " -e event_list only watch the specified events (shown below);\n" 327 " <event_list> is a comma-separated list of\n" 328 " events or ranges of events, e.g. 1,4-7,35\n" 329 " -i rate interrupt rate for -I [default: %d Hz]\n" 330 "\nData gathering options:\n\n" 331 " -b basic statistics (lock, caller, event count)\n" 332 " -t timing for all events [default]\n" 333 " -h histograms for event times\n" 334 " -s depth stack traces <depth> deep\n" 335 " -x opt[=val] enable or modify DTrace options\n" 336 "\nData filtering options:\n\n" 337 " -n nrecords maximum number of data records [default: %d]\n" 338 " -l lock[,size] only watch <lock>, which can be specified as a\n" 339 " symbolic name or hex address; <size> defaults\n" 340 " to the ELF symbol size if available, 1 if not\n" 341 " -f func[,size] only watch events generated by <func>\n" 342 " -d duration only watch events longer than <duration>\n" 343 " -T trace (rather than sample) events\n" 344 "\nData reporting options:\n\n" 345 #ifdef illumos 346 " -c coalesce lock data for arrays like pse_mutex[]\n" 347 #endif 348 " -k coalesce PCs within functions\n" 349 " -g show total events generated by function\n" 350 " -w wherever: don't distinguish events by caller\n" 351 " -W whichever: don't distinguish events by lock\n" 352 " -R display rates rather than counts\n" 353 " -p parsable output format (awk(1)-friendly)\n" 354 " -P sort lock data by (count * avg_time) product\n" 355 " -D n only display top <n> events of each type\n" 356 " -o filename send output to <filename>\n", 357 DEFAULT_HZ, DEFAULT_NRECS); 358 359 show_events('C', "Contention"); 360 show_events('H', "Hold-time"); 361 show_events('I', "Interrupt"); 362 show_events('E', "Error"); 363 (void) fprintf(stderr, "\n"); 364 365 exit(1); 366 } 367 368 static int 369 lockcmp(lsrec_t *a, lsrec_t *b) 370 { 371 int i; 372 373 if (a->ls_event < b->ls_event) 374 return (-1); 375 if (a->ls_event > b->ls_event) 376 return (1); 377 378 for (i = g_stkdepth - 1; i >= 0; i--) { 379 if (a->ls_stack[i] < b->ls_stack[i]) 380 return (-1); 381 if (a->ls_stack[i] > b->ls_stack[i]) 382 return (1); 383 } 384 385 if (a->ls_caller < b->ls_caller) 386 return (-1); 387 if (a->ls_caller > b->ls_caller) 388 return (1); 389 390 #ifdef illumos 391 if (a->ls_lock < b->ls_lock) 392 return (-1); 393 if (a->ls_lock > b->ls_lock) 394 return (1); 395 396 return (0); 397 #else 398 return (strcmp(a->ls_lock, b->ls_lock)); 399 #endif 400 } 401 402 static int 403 countcmp(lsrec_t *a, lsrec_t *b) 404 { 405 if (a->ls_event < b->ls_event) 406 return (-1); 407 if (a->ls_event > b->ls_event) 408 return (1); 409 410 return (b->ls_count - a->ls_count); 411 } 412 413 static int 414 timecmp(lsrec_t *a, lsrec_t *b) 415 { 416 if (a->ls_event < b->ls_event) 417 return (-1); 418 if (a->ls_event > b->ls_event) 419 return (1); 420 421 if (a->ls_time < b->ls_time) 422 return (1); 423 if (a->ls_time > b->ls_time) 424 return (-1); 425 426 return (0); 427 } 428 429 static int 430 lockcmp_anywhere(lsrec_t *a, lsrec_t *b) 431 { 432 if (a->ls_event < b->ls_event) 433 return (-1); 434 if (a->ls_event > b->ls_event) 435 return (1); 436 437 #ifdef illumos 438 if (a->ls_lock < b->ls_lock) 439 return (-1); 440 if (a->ls_lock > b->ls_lock) 441 return (1); 442 443 return (0); 444 #else 445 return (strcmp(a->ls_lock, b->ls_lock)); 446 #endif 447 } 448 449 static int 450 lock_and_count_cmp_anywhere(lsrec_t *a, lsrec_t *b) 451 { 452 #ifndef illumos 453 int cmp; 454 #endif 455 456 if (a->ls_event < b->ls_event) 457 return (-1); 458 if (a->ls_event > b->ls_event) 459 return (1); 460 461 #ifdef illumos 462 if (a->ls_lock < b->ls_lock) 463 return (-1); 464 if (a->ls_lock > b->ls_lock) 465 return (1); 466 #else 467 cmp = strcmp(a->ls_lock, b->ls_lock); 468 if (cmp != 0) 469 return (cmp); 470 #endif 471 472 return (b->ls_count - a->ls_count); 473 } 474 475 static int 476 sitecmp_anylock(lsrec_t *a, lsrec_t *b) 477 { 478 int i; 479 480 if (a->ls_event < b->ls_event) 481 return (-1); 482 if (a->ls_event > b->ls_event) 483 return (1); 484 485 for (i = g_stkdepth - 1; i >= 0; i--) { 486 if (a->ls_stack[i] < b->ls_stack[i]) 487 return (-1); 488 if (a->ls_stack[i] > b->ls_stack[i]) 489 return (1); 490 } 491 492 if (a->ls_caller < b->ls_caller) 493 return (-1); 494 if (a->ls_caller > b->ls_caller) 495 return (1); 496 497 return (0); 498 } 499 500 static int 501 site_and_count_cmp_anylock(lsrec_t *a, lsrec_t *b) 502 { 503 int i; 504 505 if (a->ls_event < b->ls_event) 506 return (-1); 507 if (a->ls_event > b->ls_event) 508 return (1); 509 510 for (i = g_stkdepth - 1; i >= 0; i--) { 511 if (a->ls_stack[i] < b->ls_stack[i]) 512 return (-1); 513 if (a->ls_stack[i] > b->ls_stack[i]) 514 return (1); 515 } 516 517 if (a->ls_caller < b->ls_caller) 518 return (-1); 519 if (a->ls_caller > b->ls_caller) 520 return (1); 521 522 return (b->ls_count - a->ls_count); 523 } 524 525 static void 526 lsmergesort(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **a, lsrec_t **b, int n) 527 { 528 int m = n / 2; 529 int i, j; 530 531 if (m > 1) 532 lsmergesort(cmp, a, b, m); 533 if (n - m > 1) 534 lsmergesort(cmp, a + m, b + m, n - m); 535 for (i = m; i > 0; i--) 536 b[i - 1] = a[i - 1]; 537 for (j = m - 1; j < n - 1; j++) 538 b[n + m - j - 2] = a[j + 1]; 539 while (i < j) 540 *a++ = cmp(b[i], b[j]) < 0 ? b[i++] : b[j--]; 541 *a = b[i]; 542 } 543 544 static void 545 coalesce(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **lock, int n) 546 { 547 int i, j; 548 lsrec_t *target, *current; 549 550 target = lock[0]; 551 552 for (i = 1; i < n; i++) { 553 current = lock[i]; 554 if (cmp(current, target) != 0) { 555 target = current; 556 continue; 557 } 558 current->ls_event = LS_MAX_EVENTS; 559 target->ls_count += current->ls_count; 560 target->ls_refcnt += current->ls_refcnt; 561 if (g_recsize < LS_TIME) 562 continue; 563 target->ls_time += current->ls_time; 564 if (g_recsize < LS_HIST) 565 continue; 566 for (j = 0; j < 64; j++) 567 target->ls_hist[j] += current->ls_hist[j]; 568 } 569 } 570 571 static void 572 coalesce_symbol(uintptr_t *addrp) 573 { 574 uintptr_t symoff; 575 size_t symsize; 576 577 if (addr_to_sym(*addrp, &symoff, &symsize) != NULL && symoff < symsize) 578 *addrp -= symoff; 579 } 580 581 static void 582 predicate_add(char **pred, char *what, char *cmp, uintptr_t value) 583 { 584 char *new; 585 int len, newlen; 586 587 if (what == NULL) 588 return; 589 590 if (*pred == NULL) { 591 *pred = malloc(1); 592 *pred[0] = '\0'; 593 } 594 595 len = strlen(*pred); 596 newlen = len + strlen(what) + 32 + strlen("( && )"); 597 new = malloc(newlen); 598 599 if (*pred[0] != '\0') { 600 if (cmp != NULL) { 601 (void) sprintf(new, "(%s) && (%s %s 0x%p)", 602 *pred, what, cmp, (void *)value); 603 } else { 604 (void) sprintf(new, "(%s) && (%s)", *pred, what); 605 } 606 } else { 607 if (cmp != NULL) { 608 (void) sprintf(new, "%s %s 0x%p", 609 what, cmp, (void *)value); 610 } else { 611 (void) sprintf(new, "%s", what); 612 } 613 } 614 615 free(*pred); 616 *pred = new; 617 } 618 619 static void 620 predicate_destroy(char **pred) 621 { 622 free(*pred); 623 *pred = NULL; 624 } 625 626 static void 627 filter_add(char **filt, char *what, uintptr_t base, uintptr_t size) 628 { 629 char buf[256], *c = buf, *new; 630 int len, newlen; 631 632 if (*filt == NULL) { 633 *filt = malloc(1); 634 *filt[0] = '\0'; 635 } 636 637 #ifdef illumos 638 (void) sprintf(c, "%s(%s >= 0x%p && %s < 0x%p)", *filt[0] != '\0' ? 639 " || " : "", what, (void *)base, what, (void *)(base + size)); 640 #else 641 (void) sprintf(c, "%s(%s >= %p && %s < %p)", *filt[0] != '\0' ? 642 " || " : "", what, (void *)base, what, (void *)(base + size)); 643 #endif 644 645 newlen = (len = strlen(*filt) + 1) + strlen(c); 646 new = malloc(newlen); 647 bcopy(*filt, new, len); 648 (void) strcat(new, c); 649 free(*filt); 650 *filt = new; 651 } 652 653 static void 654 filter_destroy(char **filt) 655 { 656 free(*filt); 657 *filt = NULL; 658 } 659 660 static void 661 dprog_add(const char *fmt, ...) 662 { 663 va_list args; 664 int size, offs; 665 char c; 666 667 va_start(args, fmt); 668 size = vsnprintf(&c, 1, fmt, args) + 1; 669 va_end(args); 670 671 if (g_proglen == 0) { 672 offs = 0; 673 } else { 674 offs = g_proglen - 1; 675 } 676 677 g_proglen = offs + size; 678 679 if ((g_prog = realloc(g_prog, g_proglen)) == NULL) 680 fail(1, "failed to reallocate program text"); 681 682 va_start(args, fmt); 683 (void) vsnprintf(&g_prog[offs], size, fmt, args); 684 va_end(args); 685 } 686 687 /* 688 * This function may read like an open sewer, but keep in mind that programs 689 * that generate other programs are rarely pretty. If one has the unenviable 690 * task of maintaining or -- worse -- extending this code, use the -V option 691 * to examine the D program as generated by this function. 692 */ 693 static void 694 dprog_addevent(int event) 695 { 696 ls_event_info_t *info = &g_event_info[event]; 697 char *pred = NULL; 698 char stack[20]; 699 const char *arg0, *caller; 700 char *arg1 = "arg1"; 701 char buf[80]; 702 hrtime_t dur; 703 int depth; 704 705 if (info->ev_name[0] == '\0') 706 return; 707 708 if (info->ev_type == 'I') { 709 /* 710 * For interrupt events, arg0 (normally the lock pointer) is 711 * the CPU address plus the current pil, and arg1 (normally 712 * the number of nanoseconds) is the number of nanoseconds 713 * late -- and it's stored in arg2. 714 */ 715 #ifdef illumos 716 arg0 = "(uintptr_t)curthread->t_cpu + \n" 717 "\t curthread->t_cpu->cpu_profile_pil"; 718 #else 719 arg0 = "(uintptr_t)(curthread->td_oncpu << 16) + \n" 720 "\t 0x01000000 + curthread->td_pri_class"; 721 #endif 722 caller = "(uintptr_t)arg0"; 723 arg1 = "arg2"; 724 } else { 725 #ifdef illumos 726 arg0 = "(uintptr_t)arg0"; 727 #else 728 arg0 = "stringof(args[0]->lock_object.lo_name)"; 729 #endif 730 caller = "caller"; 731 } 732 733 if (g_recsize > LS_HIST) { 734 for (depth = 0; g_recsize > LS_STACK(depth); depth++) 735 continue; 736 737 if (g_tracing) { 738 (void) sprintf(stack, "\tstack(%d);\n", depth); 739 } else { 740 (void) sprintf(stack, ", stack(%d)", depth); 741 } 742 } else { 743 (void) sprintf(stack, ""); 744 } 745 746 if (info->ev_acquire != NULL) { 747 /* 748 * If this is a hold event, we need to generate an additional 749 * clause for the acquire; the clause for the release will be 750 * generated with the aggregating statement, below. 751 */ 752 dprog_add("%s\n", info->ev_acquire); 753 predicate_add(&pred, info->ev_predicate, NULL, 0); 754 predicate_add(&pred, g_predicate, NULL, 0); 755 if (pred != NULL) 756 dprog_add("/%s/\n", pred); 757 758 dprog_add("{\n"); 759 (void) sprintf(buf, "self->ev%d[(uintptr_t)arg0]", event); 760 761 if (info->ev_type == 'H') { 762 dprog_add("\t%s = timestamp;\n", buf); 763 } else { 764 /* 765 * If this isn't a hold event, it's the recursive 766 * error event. For this, we simply bump the 767 * thread-local, per-lock count. 768 */ 769 dprog_add("\t%s++;\n", buf); 770 } 771 772 dprog_add("}\n\n"); 773 predicate_destroy(&pred); 774 pred = NULL; 775 776 if (info->ev_type == 'E') { 777 /* 778 * If this is the recursive lock error event, we need 779 * to generate an additional clause to decrement the 780 * thread-local, per-lock count. This assures that we 781 * only execute the aggregating clause if we have 782 * recursive entry. 783 */ 784 dprog_add("%s\n", info->ev_name); 785 dprog_add("/%s/\n{\n\t%s--;\n}\n\n", buf, buf); 786 } 787 788 predicate_add(&pred, buf, NULL, 0); 789 790 if (info->ev_type == 'H') { 791 (void) sprintf(buf, "timestamp -\n\t " 792 "self->ev%d[(uintptr_t)arg0]", event); 793 } 794 795 arg1 = buf; 796 } else { 797 predicate_add(&pred, info->ev_predicate, NULL, 0); 798 if (info->ev_type != 'I') 799 predicate_add(&pred, g_predicate, NULL, 0); 800 else 801 predicate_add(&pred, g_ipredicate, NULL, 0); 802 } 803 804 if ((dur = g_min_duration[event]) != 0) 805 predicate_add(&pred, arg1, ">=", dur); 806 807 dprog_add("%s\n", info->ev_name); 808 809 if (pred != NULL) 810 dprog_add("/%s/\n", pred); 811 predicate_destroy(&pred); 812 813 dprog_add("{\n"); 814 815 if (g_tracing) { 816 dprog_add("\ttrace(%dULL);\n", event); 817 dprog_add("\ttrace(%s);\n", arg0); 818 dprog_add("\ttrace(%s);\n", caller); 819 dprog_add(stack); 820 } else { 821 /* 822 * The ordering here is important: when we process the 823 * aggregate, we count on the fact that @avg appears before 824 * @hist in program order to assure that @avg is assigned the 825 * first aggregation variable ID and @hist assigned the 826 * second; see the comment in process_aggregate() for details. 827 */ 828 dprog_add("\t@avg[%dULL, %s, %s%s] = avg(%s);\n", 829 event, arg0, caller, stack, arg1); 830 831 if (g_recsize >= LS_HIST) { 832 dprog_add("\t@hist[%dULL, %s, %s%s] = quantize" 833 "(%s);\n", event, arg0, caller, stack, arg1); 834 } 835 } 836 837 if (info->ev_acquire != NULL) 838 dprog_add("\tself->ev%d[arg0] = 0;\n", event); 839 840 dprog_add("}\n\n"); 841 } 842 843 static void 844 dprog_compile() 845 { 846 dtrace_prog_t *prog; 847 dtrace_proginfo_t info; 848 849 if (g_Vflag) { 850 (void) fprintf(stderr, "lockstat: vvvv D program vvvv\n"); 851 (void) fputs(g_prog, stderr); 852 (void) fprintf(stderr, "lockstat: ^^^^ D program ^^^^\n"); 853 } 854 855 if ((prog = dtrace_program_strcompile(g_dtp, g_prog, 856 DTRACE_PROBESPEC_NAME, 0, 0, NULL)) == NULL) 857 dfail("failed to compile program"); 858 859 if (dtrace_program_exec(g_dtp, prog, &info) == -1) 860 dfail("failed to enable probes"); 861 862 if (dtrace_go(g_dtp) != 0) 863 dfail("couldn't start tracing"); 864 } 865 866 static void 867 #ifdef illumos 868 status_fire(void) 869 #else 870 status_fire(int i) 871 #endif 872 {} 873 874 static void 875 status_init(void) 876 { 877 dtrace_optval_t val, status, agg; 878 struct sigaction act; 879 struct itimerspec ts; 880 struct sigevent ev; 881 timer_t tid; 882 883 if (dtrace_getopt(g_dtp, "statusrate", &status) == -1) 884 dfail("failed to get 'statusrate'"); 885 886 if (dtrace_getopt(g_dtp, "aggrate", &agg) == -1) 887 dfail("failed to get 'statusrate'"); 888 889 /* 890 * We would want to awaken at a rate that is the GCD of the statusrate 891 * and the aggrate -- but that seems a bit absurd. Instead, we'll 892 * simply awaken at a rate that is the more frequent of the two, which 893 * assures that we're never later than the interval implied by the 894 * more frequent rate. 895 */ 896 val = status < agg ? status : agg; 897 898 (void) sigemptyset(&act.sa_mask); 899 act.sa_flags = 0; 900 act.sa_handler = status_fire; 901 (void) sigaction(SIGUSR1, &act, NULL); 902 903 ev.sigev_notify = SIGEV_SIGNAL; 904 ev.sigev_signo = SIGUSR1; 905 906 if (timer_create(CLOCK_REALTIME, &ev, &tid) == -1) 907 dfail("cannot create CLOCK_REALTIME timer"); 908 909 ts.it_value.tv_sec = val / NANOSEC; 910 ts.it_value.tv_nsec = val % NANOSEC; 911 ts.it_interval = ts.it_value; 912 913 if (timer_settime(tid, TIMER_RELTIME, &ts, NULL) == -1) 914 dfail("cannot set time on CLOCK_REALTIME timer"); 915 } 916 917 static void 918 status_check(void) 919 { 920 if (!g_tracing && dtrace_aggregate_snap(g_dtp) != 0) 921 dfail("failed to snap aggregate"); 922 923 if (dtrace_status(g_dtp) == -1) 924 dfail("dtrace_status()"); 925 } 926 927 static void 928 lsrec_fill(lsrec_t *lsrec, const dtrace_recdesc_t *rec, int nrecs, caddr_t data) 929 { 930 bzero(lsrec, g_recsize); 931 lsrec->ls_count = 1; 932 933 if ((g_recsize > LS_HIST && nrecs < 4) || (nrecs < 3)) 934 fail(0, "truncated DTrace record"); 935 936 if (rec->dtrd_size != sizeof (uint64_t)) 937 fail(0, "bad event size in first record"); 938 939 /* LINTED - alignment */ 940 lsrec->ls_event = (uint32_t)*((uint64_t *)(data + rec->dtrd_offset)); 941 rec++; 942 943 #ifdef illumos 944 if (rec->dtrd_size != sizeof (uintptr_t)) 945 fail(0, "bad lock address size in second record"); 946 947 /* LINTED - alignment */ 948 lsrec->ls_lock = *((uintptr_t *)(data + rec->dtrd_offset)); 949 rec++; 950 #else 951 lsrec->ls_lock = strdup((const char *)(data + rec->dtrd_offset)); 952 rec++; 953 #endif 954 955 if (rec->dtrd_size != sizeof (uintptr_t)) 956 fail(0, "bad caller size in third record"); 957 958 /* LINTED - alignment */ 959 lsrec->ls_caller = *((uintptr_t *)(data + rec->dtrd_offset)); 960 rec++; 961 962 if (g_recsize > LS_HIST) { 963 int frames, i; 964 pc_t *stack; 965 966 frames = rec->dtrd_size / sizeof (pc_t); 967 /* LINTED - alignment */ 968 stack = (pc_t *)(data + rec->dtrd_offset); 969 970 for (i = 1; i < frames; i++) 971 lsrec->ls_stack[i - 1] = stack[i]; 972 } 973 } 974 975 /*ARGSUSED*/ 976 static int 977 count_aggregate(const dtrace_aggdata_t *agg, void *arg) 978 { 979 *((size_t *)arg) += 1; 980 981 return (DTRACE_AGGWALK_NEXT); 982 } 983 984 static int 985 process_aggregate(const dtrace_aggdata_t *agg, void *arg) 986 { 987 const dtrace_aggdesc_t *aggdesc = agg->dtada_desc; 988 caddr_t data = agg->dtada_data; 989 lsdata_t *lsdata = arg; 990 lsrec_t *lsrec = lsdata->lsd_next; 991 const dtrace_recdesc_t *rec; 992 uint64_t *avg, *quantized; 993 int i, j; 994 995 assert(lsdata->lsd_count < g_nrecs); 996 997 /* 998 * Aggregation variable IDs are guaranteed to be generated in program 999 * order, and they are guaranteed to start from DTRACE_AGGVARIDNONE 1000 * plus one. As "avg" appears before "hist" in program order, we know 1001 * that "avg" will be allocated the first aggregation variable ID, and 1002 * "hist" will be allocated the second aggregation variable ID -- and 1003 * we therefore use the aggregation variable ID to differentiate the 1004 * cases. 1005 */ 1006 if (aggdesc->dtagd_varid > DTRACE_AGGVARIDNONE + 1) { 1007 /* 1008 * If this is the histogram entry. We'll copy the quantized 1009 * data into lc_hist, and jump over the rest. 1010 */ 1011 rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1]; 1012 1013 if (aggdesc->dtagd_varid != DTRACE_AGGVARIDNONE + 2) 1014 fail(0, "bad variable ID in aggregation record"); 1015 1016 if (rec->dtrd_size != 1017 DTRACE_QUANTIZE_NBUCKETS * sizeof (uint64_t)) 1018 fail(0, "bad quantize size in aggregation record"); 1019 1020 /* LINTED - alignment */ 1021 quantized = (uint64_t *)(data + rec->dtrd_offset); 1022 1023 for (i = DTRACE_QUANTIZE_ZEROBUCKET, j = 0; 1024 i < DTRACE_QUANTIZE_NBUCKETS; i++, j++) 1025 lsrec->ls_hist[j] = quantized[i]; 1026 1027 goto out; 1028 } 1029 1030 lsrec_fill(lsrec, &aggdesc->dtagd_rec[1], 1031 aggdesc->dtagd_nrecs - 1, data); 1032 1033 rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1]; 1034 1035 if (rec->dtrd_size != 2 * sizeof (uint64_t)) 1036 fail(0, "bad avg size in aggregation record"); 1037 1038 /* LINTED - alignment */ 1039 avg = (uint64_t *)(data + rec->dtrd_offset); 1040 lsrec->ls_count = (uint32_t)avg[0]; 1041 lsrec->ls_time = (uintptr_t)avg[1]; 1042 1043 if (g_recsize >= LS_HIST) 1044 return (DTRACE_AGGWALK_NEXT); 1045 1046 out: 1047 lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize); 1048 lsdata->lsd_count++; 1049 1050 return (DTRACE_AGGWALK_NEXT); 1051 } 1052 1053 static int 1054 process_trace(const dtrace_probedata_t *pdata, void *arg) 1055 { 1056 lsdata_t *lsdata = arg; 1057 lsrec_t *lsrec = lsdata->lsd_next; 1058 dtrace_eprobedesc_t *edesc = pdata->dtpda_edesc; 1059 caddr_t data = pdata->dtpda_data; 1060 1061 if (lsdata->lsd_count >= g_nrecs) 1062 return (DTRACE_CONSUME_NEXT); 1063 1064 lsrec_fill(lsrec, edesc->dtepd_rec, edesc->dtepd_nrecs, data); 1065 1066 lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize); 1067 lsdata->lsd_count++; 1068 1069 return (DTRACE_CONSUME_NEXT); 1070 } 1071 1072 static int 1073 process_data(FILE *out, char *data) 1074 { 1075 lsdata_t lsdata; 1076 1077 /* LINTED - alignment */ 1078 lsdata.lsd_next = (lsrec_t *)data; 1079 lsdata.lsd_count = 0; 1080 1081 if (g_tracing) { 1082 if (dtrace_consume(g_dtp, out, 1083 process_trace, NULL, &lsdata) != 0) 1084 dfail("failed to consume buffer"); 1085 1086 return (lsdata.lsd_count); 1087 } 1088 1089 if (dtrace_aggregate_walk_keyvarsorted(g_dtp, 1090 process_aggregate, &lsdata) != 0) 1091 dfail("failed to walk aggregate"); 1092 1093 return (lsdata.lsd_count); 1094 } 1095 1096 /*ARGSUSED*/ 1097 static int 1098 drophandler(const dtrace_dropdata_t *data, void *arg) 1099 { 1100 g_dropped++; 1101 (void) fprintf(stderr, "lockstat: warning: %s", data->dtdda_msg); 1102 return (DTRACE_HANDLE_OK); 1103 } 1104 1105 int 1106 main(int argc, char **argv) 1107 { 1108 char *data_buf; 1109 lsrec_t *lsp, **current, **first, **sort_buf, **merge_buf; 1110 FILE *out = stdout; 1111 int c; 1112 pid_t child; 1113 int status; 1114 int i, j; 1115 hrtime_t duration; 1116 char *addrp, *offp, *sizep, *evp, *lastp, *p; 1117 uintptr_t addr; 1118 size_t size, off; 1119 int events_specified = 0; 1120 int exec_errno = 0; 1121 uint32_t event; 1122 char *filt = NULL, *ifilt = NULL; 1123 static uint64_t ev_count[LS_MAX_EVENTS + 1]; 1124 static uint64_t ev_time[LS_MAX_EVENTS + 1]; 1125 dtrace_optval_t aggsize; 1126 char aggstr[10]; 1127 long ncpus; 1128 int dynvar = 0; 1129 int err; 1130 1131 if ((g_dtp = dtrace_open(DTRACE_VERSION, 0, &err)) == NULL) { 1132 fail(0, "cannot open dtrace library: %s", 1133 dtrace_errmsg(NULL, err)); 1134 } 1135 1136 if (dtrace_handle_drop(g_dtp, &drophandler, NULL) == -1) 1137 dfail("couldn't establish drop handler"); 1138 1139 if (symtab_init() == -1) 1140 fail(1, "can't load kernel symbols"); 1141 1142 g_nrecs = DEFAULT_NRECS; 1143 1144 while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != GETOPT_EOF) { 1145 switch (c) { 1146 case 'b': 1147 g_recsize = LS_BASIC; 1148 break; 1149 1150 case 't': 1151 g_recsize = LS_TIME; 1152 break; 1153 1154 case 'h': 1155 g_recsize = LS_HIST; 1156 break; 1157 1158 case 's': 1159 if (!isdigit(optarg[0])) 1160 usage(); 1161 g_stkdepth = atoi(optarg); 1162 if (g_stkdepth > LS_MAX_STACK_DEPTH) 1163 fail(0, "max stack depth is %d", 1164 LS_MAX_STACK_DEPTH); 1165 g_recsize = LS_STACK(g_stkdepth); 1166 break; 1167 1168 case 'n': 1169 if (!isdigit(optarg[0])) 1170 usage(); 1171 g_nrecs = atoi(optarg); 1172 break; 1173 1174 case 'd': 1175 if (!isdigit(optarg[0])) 1176 usage(); 1177 duration = atoll(optarg); 1178 1179 /* 1180 * XXX -- durations really should be per event 1181 * since the units are different, but it's hard 1182 * to express this nicely in the interface. 1183 * Not clear yet what the cleanest solution is. 1184 */ 1185 for (i = 0; i < LS_MAX_EVENTS; i++) 1186 if (g_event_info[i].ev_type != 'E') 1187 g_min_duration[i] = duration; 1188 1189 break; 1190 1191 case 'i': 1192 if (!isdigit(optarg[0])) 1193 usage(); 1194 i = atoi(optarg); 1195 if (i <= 0) 1196 usage(); 1197 if (i > MAX_HZ) 1198 fail(0, "max interrupt rate is %d Hz", MAX_HZ); 1199 1200 for (j = 0; j < LS_MAX_EVENTS; j++) 1201 if (strcmp(g_event_info[j].ev_desc, 1202 "Profiling interrupt") == 0) 1203 break; 1204 1205 (void) sprintf(g_event_info[j].ev_name, 1206 "profile:::profile-%d", i); 1207 break; 1208 1209 case 'l': 1210 case 'f': 1211 addrp = strtok(optarg, ","); 1212 sizep = strtok(NULL, ","); 1213 addrp = strtok(optarg, ",+"); 1214 offp = strtok(NULL, ","); 1215 1216 size = sizep ? strtoul(sizep, NULL, 0) : 1; 1217 off = offp ? strtoul(offp, NULL, 0) : 0; 1218 1219 if (addrp[0] == '0') { 1220 addr = strtoul(addrp, NULL, 16) + off; 1221 } else { 1222 addr = sym_to_addr(addrp) + off; 1223 if (sizep == NULL) 1224 size = sym_size(addrp) - off; 1225 if (addr - off == 0) 1226 fail(0, "symbol '%s' not found", addrp); 1227 if (size == 0) 1228 size = 1; 1229 } 1230 1231 1232 if (c == 'l') { 1233 filter_add(&filt, "arg0", addr, size); 1234 } else { 1235 filter_add(&filt, "caller", addr, size); 1236 filter_add(&ifilt, "arg0", addr, size); 1237 } 1238 break; 1239 1240 case 'e': 1241 evp = strtok_r(optarg, ",", &lastp); 1242 while (evp) { 1243 int ev1, ev2; 1244 char *evp2; 1245 1246 (void) strtok(evp, "-"); 1247 evp2 = strtok(NULL, "-"); 1248 ev1 = atoi(evp); 1249 ev2 = evp2 ? atoi(evp2) : ev1; 1250 if ((uint_t)ev1 >= LS_MAX_EVENTS || 1251 (uint_t)ev2 >= LS_MAX_EVENTS || ev1 > ev2) 1252 fail(0, "-e events out of range"); 1253 for (i = ev1; i <= ev2; i++) 1254 g_enabled[i] = 1; 1255 evp = strtok_r(NULL, ",", &lastp); 1256 } 1257 events_specified = 1; 1258 break; 1259 1260 #ifdef illumos 1261 case 'c': 1262 g_cflag = 1; 1263 break; 1264 #endif 1265 1266 case 'k': 1267 g_kflag = 1; 1268 break; 1269 1270 case 'w': 1271 g_wflag = 1; 1272 break; 1273 1274 case 'W': 1275 g_Wflag = 1; 1276 break; 1277 1278 case 'g': 1279 g_gflag = 1; 1280 break; 1281 1282 case 'C': 1283 case 'E': 1284 case 'H': 1285 case 'I': 1286 for (i = 0; i < LS_MAX_EVENTS; i++) 1287 if (g_event_info[i].ev_type == c) 1288 g_enabled[i] = 1; 1289 events_specified = 1; 1290 break; 1291 1292 case 'A': 1293 for (i = 0; i < LS_MAX_EVENTS; i++) 1294 if (strchr("CH", g_event_info[i].ev_type)) 1295 g_enabled[i] = 1; 1296 events_specified = 1; 1297 break; 1298 1299 case 'T': 1300 g_tracing = 1; 1301 break; 1302 1303 case 'D': 1304 if (!isdigit(optarg[0])) 1305 usage(); 1306 g_topn = atoi(optarg); 1307 break; 1308 1309 case 'R': 1310 g_rates = 1; 1311 break; 1312 1313 case 'p': 1314 g_pflag = 1; 1315 break; 1316 1317 case 'P': 1318 g_Pflag = 1; 1319 break; 1320 1321 case 'o': 1322 if ((out = fopen(optarg, "w")) == NULL) 1323 fail(1, "error opening file"); 1324 break; 1325 1326 case 'V': 1327 g_Vflag = 1; 1328 break; 1329 1330 default: 1331 if (strchr(LOCKSTAT_OPTSTR, c) == NULL) 1332 usage(); 1333 } 1334 } 1335 1336 if (filt != NULL) { 1337 predicate_add(&g_predicate, filt, NULL, 0); 1338 filter_destroy(&filt); 1339 } 1340 1341 if (ifilt != NULL) { 1342 predicate_add(&g_ipredicate, ifilt, NULL, 0); 1343 filter_destroy(&ifilt); 1344 } 1345 1346 if (g_recsize == 0) { 1347 if (g_gflag) { 1348 g_stkdepth = LS_MAX_STACK_DEPTH; 1349 g_recsize = LS_STACK(g_stkdepth); 1350 } else { 1351 g_recsize = LS_TIME; 1352 } 1353 } 1354 1355 if (g_gflag && g_recsize <= LS_STACK(0)) 1356 fail(0, "'-g' requires at least '-s 1' data gathering"); 1357 1358 /* 1359 * Make sure the alignment is reasonable 1360 */ 1361 g_recsize = -(-g_recsize & -sizeof (uint64_t)); 1362 1363 for (i = 0; i < LS_MAX_EVENTS; i++) { 1364 /* 1365 * If no events were specified, enable -C. 1366 */ 1367 if (!events_specified && g_event_info[i].ev_type == 'C') 1368 g_enabled[i] = 1; 1369 } 1370 1371 for (i = 0; i < LS_MAX_EVENTS; i++) { 1372 if (!g_enabled[i]) 1373 continue; 1374 1375 if (g_event_info[i].ev_acquire != NULL) { 1376 /* 1377 * If we've enabled a hold event, we must explicitly 1378 * allocate dynamic variable space. 1379 */ 1380 dynvar = 1; 1381 } 1382 1383 dprog_addevent(i); 1384 } 1385 1386 /* 1387 * Make sure there are remaining arguments to specify a child command 1388 * to execute. 1389 */ 1390 if (argc <= optind) 1391 usage(); 1392 1393 if ((ncpus = sysconf(_SC_NPROCESSORS_ONLN)) == -1) 1394 dfail("couldn't determine number of online CPUs"); 1395 1396 /* 1397 * By default, we set our data buffer size to be the number of records 1398 * multiplied by the size of the record, doubled to account for some 1399 * DTrace slop and divided by the number of CPUs. We silently clamp 1400 * the aggregation size at both a minimum and a maximum to prevent 1401 * absurdly low or high values. 1402 */ 1403 if ((aggsize = (g_nrecs * g_recsize * 2) / ncpus) < MIN_AGGSIZE) 1404 aggsize = MIN_AGGSIZE; 1405 1406 if (aggsize > MAX_AGGSIZE) 1407 aggsize = MAX_AGGSIZE; 1408 1409 (void) sprintf(aggstr, "%lld", (long long)aggsize); 1410 1411 if (!g_tracing) { 1412 if (dtrace_setopt(g_dtp, "bufsize", "4k") == -1) 1413 dfail("failed to set 'bufsize'"); 1414 1415 if (dtrace_setopt(g_dtp, "aggsize", aggstr) == -1) 1416 dfail("failed to set 'aggsize'"); 1417 1418 if (dynvar) { 1419 /* 1420 * If we're using dynamic variables, we set our 1421 * dynamic variable size to be one megabyte per CPU, 1422 * with a hard-limit of 32 megabytes. This may still 1423 * be too small in some cases, but it can be tuned 1424 * manually via -x if need be. 1425 */ 1426 (void) sprintf(aggstr, "%ldm", ncpus < 32 ? ncpus : 32); 1427 1428 if (dtrace_setopt(g_dtp, "dynvarsize", aggstr) == -1) 1429 dfail("failed to set 'dynvarsize'"); 1430 } 1431 } else { 1432 if (dtrace_setopt(g_dtp, "bufsize", aggstr) == -1) 1433 dfail("failed to set 'bufsize'"); 1434 } 1435 1436 if (dtrace_setopt(g_dtp, "statusrate", "10sec") == -1) 1437 dfail("failed to set 'statusrate'"); 1438 1439 optind = 1; 1440 while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != GETOPT_EOF) { 1441 switch (c) { 1442 case 'x': 1443 if ((p = strchr(optarg, '=')) != NULL) 1444 *p++ = '\0'; 1445 1446 if (dtrace_setopt(g_dtp, optarg, p) != 0) 1447 dfail("failed to set -x %s", optarg); 1448 break; 1449 } 1450 } 1451 1452 argc -= optind; 1453 argv += optind; 1454 1455 dprog_compile(); 1456 status_init(); 1457 1458 g_elapsed = -gethrtime(); 1459 1460 /* 1461 * Spawn the specified command and wait for it to complete. 1462 */ 1463 child = fork(); 1464 if (child == -1) 1465 fail(1, "cannot fork"); 1466 if (child == 0) { 1467 (void) dtrace_close(g_dtp); 1468 (void) execvp(argv[0], &argv[0]); 1469 exec_errno = errno; 1470 exit(127); 1471 } 1472 1473 #ifdef illumos 1474 while (waitpid(child, &status, WEXITED) != child) 1475 #else 1476 while (waitpid(child, &status, 0) != child) 1477 #endif 1478 status_check(); 1479 1480 g_elapsed += gethrtime(); 1481 1482 if (WIFEXITED(status)) { 1483 if (WEXITSTATUS(status) != 0) { 1484 if (exec_errno != 0) { 1485 errno = exec_errno; 1486 fail(1, "could not execute %s", argv[0]); 1487 } 1488 (void) fprintf(stderr, 1489 "lockstat: warning: %s exited with code %d\n", 1490 argv[0], WEXITSTATUS(status)); 1491 } 1492 } else { 1493 (void) fprintf(stderr, 1494 "lockstat: warning: %s died on signal %d\n", 1495 argv[0], WTERMSIG(status)); 1496 } 1497 1498 if (dtrace_stop(g_dtp) == -1) 1499 dfail("failed to stop dtrace"); 1500 1501 /* 1502 * Before we read out the results, we need to allocate our buffer. 1503 * If we're tracing, then we'll just use the precalculated size. If 1504 * we're not, then we'll take a snapshot of the aggregate, and walk 1505 * it to count the number of records. 1506 */ 1507 if (!g_tracing) { 1508 if (dtrace_aggregate_snap(g_dtp) != 0) 1509 dfail("failed to snap aggregate"); 1510 1511 g_nrecs = 0; 1512 1513 if (dtrace_aggregate_walk(g_dtp, 1514 count_aggregate, &g_nrecs) != 0) 1515 dfail("failed to walk aggregate"); 1516 } 1517 1518 #ifdef illumos 1519 if ((data_buf = memalign(sizeof (uint64_t), 1520 (g_nrecs + 1) * g_recsize)) == NULL) 1521 #else 1522 if (posix_memalign((void **)&data_buf, sizeof (uint64_t), 1523 (g_nrecs + 1) * g_recsize) ) 1524 #endif 1525 fail(1, "Memory allocation failed"); 1526 1527 /* 1528 * Read out the DTrace data. 1529 */ 1530 g_nrecs_used = process_data(out, data_buf); 1531 1532 if (g_nrecs_used > g_nrecs || g_dropped) 1533 (void) fprintf(stderr, "lockstat: warning: " 1534 "ran out of data records (use -n for more)\n"); 1535 1536 /* LINTED - alignment */ 1537 for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++, 1538 /* LINTED - alignment */ 1539 lsp = (lsrec_t *)((char *)lsp + g_recsize)) { 1540 ev_count[lsp->ls_event] += lsp->ls_count; 1541 ev_time[lsp->ls_event] += lsp->ls_time; 1542 } 1543 1544 /* 1545 * If -g was specified, convert stacks into individual records. 1546 */ 1547 if (g_gflag) { 1548 lsrec_t *newlsp, *oldlsp; 1549 1550 #ifdef illumos 1551 newlsp = memalign(sizeof (uint64_t), 1552 g_nrecs_used * LS_TIME * (g_stkdepth + 1)); 1553 #else 1554 posix_memalign((void **)&newlsp, sizeof (uint64_t), 1555 g_nrecs_used * LS_TIME * (g_stkdepth + 1)); 1556 #endif 1557 if (newlsp == NULL) 1558 fail(1, "Cannot allocate space for -g processing"); 1559 lsp = newlsp; 1560 /* LINTED - alignment */ 1561 for (i = 0, oldlsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++, 1562 /* LINTED - alignment */ 1563 oldlsp = (lsrec_t *)((char *)oldlsp + g_recsize)) { 1564 int fr; 1565 int caller_in_stack = 0; 1566 1567 if (oldlsp->ls_count == 0) 1568 continue; 1569 1570 for (fr = 0; fr < g_stkdepth; fr++) { 1571 if (oldlsp->ls_stack[fr] == 0) 1572 break; 1573 if (oldlsp->ls_stack[fr] == oldlsp->ls_caller) 1574 caller_in_stack = 1; 1575 bcopy(oldlsp, lsp, LS_TIME); 1576 lsp->ls_caller = oldlsp->ls_stack[fr]; 1577 #ifndef illumos 1578 lsp->ls_lock = strdup(oldlsp->ls_lock); 1579 #endif 1580 /* LINTED - alignment */ 1581 lsp = (lsrec_t *)((char *)lsp + LS_TIME); 1582 } 1583 if (!caller_in_stack) { 1584 bcopy(oldlsp, lsp, LS_TIME); 1585 /* LINTED - alignment */ 1586 lsp = (lsrec_t *)((char *)lsp + LS_TIME); 1587 } 1588 #ifndef illumos 1589 free(oldlsp->ls_lock); 1590 #endif 1591 } 1592 g_nrecs = g_nrecs_used = 1593 ((uintptr_t)lsp - (uintptr_t)newlsp) / LS_TIME; 1594 g_recsize = LS_TIME; 1595 g_stkdepth = 0; 1596 free(data_buf); 1597 data_buf = (char *)newlsp; 1598 } 1599 1600 if ((sort_buf = calloc(2 * (g_nrecs + 1), 1601 sizeof (void *))) == NULL) 1602 fail(1, "Sort buffer allocation failed"); 1603 merge_buf = sort_buf + (g_nrecs + 1); 1604 1605 /* 1606 * Build the sort buffer, discarding zero-count records along the way. 1607 */ 1608 /* LINTED - alignment */ 1609 for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++, 1610 /* LINTED - alignment */ 1611 lsp = (lsrec_t *)((char *)lsp + g_recsize)) { 1612 if (lsp->ls_count == 0) 1613 lsp->ls_event = LS_MAX_EVENTS; 1614 sort_buf[i] = lsp; 1615 } 1616 1617 if (g_nrecs_used == 0) 1618 exit(0); 1619 1620 /* 1621 * Add a sentinel after the last record 1622 */ 1623 sort_buf[i] = lsp; 1624 lsp->ls_event = LS_MAX_EVENTS; 1625 1626 if (g_tracing) { 1627 report_trace(out, sort_buf); 1628 return (0); 1629 } 1630 1631 /* 1632 * Application of -g may have resulted in multiple records 1633 * with the same signature; coalesce them. 1634 */ 1635 if (g_gflag) { 1636 mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used); 1637 coalesce(lockcmp, sort_buf, g_nrecs_used); 1638 } 1639 1640 /* 1641 * Coalesce locks within the same symbol if -c option specified. 1642 * Coalesce PCs within the same function if -k option specified. 1643 */ 1644 if (g_cflag || g_kflag) { 1645 for (i = 0; i < g_nrecs_used; i++) { 1646 int fr; 1647 lsp = sort_buf[i]; 1648 #ifdef illumos 1649 if (g_cflag) 1650 coalesce_symbol(&lsp->ls_lock); 1651 #endif 1652 if (g_kflag) { 1653 for (fr = 0; fr < g_stkdepth; fr++) 1654 coalesce_symbol(&lsp->ls_stack[fr]); 1655 coalesce_symbol(&lsp->ls_caller); 1656 } 1657 } 1658 mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used); 1659 coalesce(lockcmp, sort_buf, g_nrecs_used); 1660 } 1661 1662 /* 1663 * Coalesce callers if -w option specified 1664 */ 1665 if (g_wflag) { 1666 mergesort(lock_and_count_cmp_anywhere, 1667 sort_buf, merge_buf, g_nrecs_used); 1668 coalesce(lockcmp_anywhere, sort_buf, g_nrecs_used); 1669 } 1670 1671 /* 1672 * Coalesce locks if -W option specified 1673 */ 1674 if (g_Wflag) { 1675 mergesort(site_and_count_cmp_anylock, 1676 sort_buf, merge_buf, g_nrecs_used); 1677 coalesce(sitecmp_anylock, sort_buf, g_nrecs_used); 1678 } 1679 1680 /* 1681 * Sort data by contention count (ls_count) or total time (ls_time), 1682 * depending on g_Pflag. Override g_Pflag if time wasn't measured. 1683 */ 1684 if (g_recsize < LS_TIME) 1685 g_Pflag = 0; 1686 1687 if (g_Pflag) 1688 mergesort(timecmp, sort_buf, merge_buf, g_nrecs_used); 1689 else 1690 mergesort(countcmp, sort_buf, merge_buf, g_nrecs_used); 1691 1692 /* 1693 * Display data by event type 1694 */ 1695 first = &sort_buf[0]; 1696 while ((event = (*first)->ls_event) < LS_MAX_EVENTS) { 1697 current = first; 1698 while ((lsp = *current)->ls_event == event) 1699 current++; 1700 report_stats(out, first, current - first, ev_count[event], 1701 ev_time[event]); 1702 first = current; 1703 } 1704 1705 #ifndef illumos 1706 /* 1707 * Free lock name buffers 1708 */ 1709 for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++, 1710 lsp = (lsrec_t *)((char *)lsp + g_recsize)) 1711 free(lsp->ls_lock); 1712 #endif 1713 1714 return (0); 1715 } 1716 1717 static char * 1718 format_symbol(char *buf, uintptr_t addr, int show_size) 1719 { 1720 uintptr_t symoff; 1721 char *symname; 1722 size_t symsize; 1723 1724 symname = addr_to_sym(addr, &symoff, &symsize); 1725 1726 if (show_size && symoff == 0) 1727 (void) sprintf(buf, "%s[%ld]", symname, (long)symsize); 1728 else if (symoff == 0) 1729 (void) sprintf(buf, "%s", symname); 1730 else if (symoff < 16 && bcmp(symname, "cpu[", 4) == 0) /* CPU+PIL */ 1731 #ifdef illumos 1732 (void) sprintf(buf, "%s+%ld", symname, (long)symoff); 1733 #else 1734 (void) sprintf(buf, "%s+%s", symname, g_pri_class[(int)symoff]); 1735 #endif 1736 else if (symoff <= symsize || (symoff < 256 && addr != symoff)) 1737 (void) sprintf(buf, "%s+0x%llx", symname, 1738 (unsigned long long)symoff); 1739 else 1740 (void) sprintf(buf, "0x%llx", (unsigned long long)addr); 1741 return (buf); 1742 } 1743 1744 static void 1745 report_stats(FILE *out, lsrec_t **sort_buf, size_t nrecs, uint64_t total_count, 1746 uint64_t total_time) 1747 { 1748 uint32_t event = sort_buf[0]->ls_event; 1749 lsrec_t *lsp; 1750 double ptotal = 0.0; 1751 double percent; 1752 int i, j, fr; 1753 int displayed; 1754 int first_bin, last_bin, max_bin_count, total_bin_count; 1755 int rectype; 1756 char buf[256]; 1757 char lhdr[80], chdr[80]; 1758 1759 rectype = g_recsize; 1760 1761 if (g_topn == 0) { 1762 (void) fprintf(out, "%20llu %s\n", 1763 g_rates == 0 ? total_count : 1764 ((unsigned long long)total_count * NANOSEC) / g_elapsed, 1765 g_event_info[event].ev_desc); 1766 return; 1767 } 1768 1769 (void) sprintf(lhdr, "%s%s", 1770 g_Wflag ? "Hottest " : "", g_event_info[event].ev_lhdr); 1771 (void) sprintf(chdr, "%s%s", 1772 g_wflag ? "Hottest " : "", "Caller"); 1773 1774 if (!g_pflag) 1775 (void) fprintf(out, 1776 "\n%s: %.0f events in %.3f seconds (%.0f events/sec)\n\n", 1777 g_event_info[event].ev_desc, (double)total_count, 1778 (double)g_elapsed / NANOSEC, 1779 (double)total_count * NANOSEC / g_elapsed); 1780 1781 if (!g_pflag && rectype < LS_HIST) { 1782 (void) sprintf(buf, "%s", g_event_info[event].ev_units); 1783 (void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n", 1784 g_rates ? "ops/s" : "Count", 1785 g_gflag ? "genr" : "indv", 1786 "cuml", "rcnt", rectype >= LS_TIME ? buf : "", lhdr, chdr); 1787 (void) fprintf(out, "---------------------------------" 1788 "----------------------------------------------\n"); 1789 } 1790 1791 displayed = 0; 1792 for (i = 0; i < nrecs; i++) { 1793 lsp = sort_buf[i]; 1794 1795 if (displayed++ >= g_topn) 1796 break; 1797 1798 if (g_pflag) { 1799 int j; 1800 1801 (void) fprintf(out, "%u %u", 1802 lsp->ls_event, lsp->ls_count); 1803 #ifdef illumos 1804 (void) fprintf(out, " %s", 1805 format_symbol(buf, lsp->ls_lock, g_cflag)); 1806 #else 1807 (void) fprintf(out, " %s", lsp->ls_lock); 1808 #endif 1809 (void) fprintf(out, " %s", 1810 format_symbol(buf, lsp->ls_caller, 0)); 1811 (void) fprintf(out, " %f", 1812 (double)lsp->ls_refcnt / lsp->ls_count); 1813 if (rectype >= LS_TIME) 1814 (void) fprintf(out, " %llu", 1815 (unsigned long long)lsp->ls_time); 1816 if (rectype >= LS_HIST) { 1817 for (j = 0; j < 64; j++) 1818 (void) fprintf(out, " %u", 1819 lsp->ls_hist[j]); 1820 } 1821 for (j = 0; j < LS_MAX_STACK_DEPTH; j++) { 1822 if (rectype <= LS_STACK(j) || 1823 lsp->ls_stack[j] == 0) 1824 break; 1825 (void) fprintf(out, " %s", 1826 format_symbol(buf, lsp->ls_stack[j], 0)); 1827 } 1828 (void) fprintf(out, "\n"); 1829 continue; 1830 } 1831 1832 if (rectype >= LS_HIST) { 1833 (void) fprintf(out, "---------------------------------" 1834 "----------------------------------------------\n"); 1835 (void) sprintf(buf, "%s", 1836 g_event_info[event].ev_units); 1837 (void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n", 1838 g_rates ? "ops/s" : "Count", 1839 g_gflag ? "genr" : "indv", 1840 "cuml", "rcnt", buf, lhdr, chdr); 1841 } 1842 1843 if (g_Pflag && total_time != 0) 1844 percent = (lsp->ls_time * 100.00) / total_time; 1845 else 1846 percent = (lsp->ls_count * 100.00) / total_count; 1847 1848 ptotal += percent; 1849 1850 if (rectype >= LS_TIME) 1851 (void) sprintf(buf, "%llu", 1852 (unsigned long long)(lsp->ls_time / lsp->ls_count)); 1853 else 1854 buf[0] = '\0'; 1855 1856 (void) fprintf(out, "%5llu ", 1857 g_rates == 0 ? lsp->ls_count : 1858 ((uint64_t)lsp->ls_count * NANOSEC) / g_elapsed); 1859 1860 (void) fprintf(out, "%3.0f%% ", percent); 1861 1862 if (g_gflag) 1863 (void) fprintf(out, "---- "); 1864 else 1865 (void) fprintf(out, "%3.0f%% ", ptotal); 1866 1867 (void) fprintf(out, "%4.2f %8s ", 1868 (double)lsp->ls_refcnt / lsp->ls_count, buf); 1869 1870 #ifdef illumos 1871 (void) fprintf(out, "%-22s ", 1872 format_symbol(buf, lsp->ls_lock, g_cflag)); 1873 #else 1874 (void) fprintf(out, "%-22s ", lsp->ls_lock); 1875 #endif 1876 1877 (void) fprintf(out, "%-24s\n", 1878 format_symbol(buf, lsp->ls_caller, 0)); 1879 1880 if (rectype < LS_HIST) 1881 continue; 1882 1883 (void) fprintf(out, "\n"); 1884 (void) fprintf(out, "%10s %31s %-9s %-24s\n", 1885 g_event_info[event].ev_units, 1886 "------ Time Distribution ------", 1887 g_rates ? "ops/s" : "count", 1888 rectype > LS_STACK(0) ? "Stack" : ""); 1889 1890 first_bin = 0; 1891 while (lsp->ls_hist[first_bin] == 0) 1892 first_bin++; 1893 1894 last_bin = 63; 1895 while (lsp->ls_hist[last_bin] == 0) 1896 last_bin--; 1897 1898 max_bin_count = 0; 1899 total_bin_count = 0; 1900 for (j = first_bin; j <= last_bin; j++) { 1901 total_bin_count += lsp->ls_hist[j]; 1902 if (lsp->ls_hist[j] > max_bin_count) 1903 max_bin_count = lsp->ls_hist[j]; 1904 } 1905 1906 /* 1907 * If we went a few frames below the caller, ignore them 1908 */ 1909 for (fr = 3; fr > 0; fr--) 1910 if (lsp->ls_stack[fr] == lsp->ls_caller) 1911 break; 1912 1913 for (j = first_bin; j <= last_bin; j++) { 1914 uint_t depth = (lsp->ls_hist[j] * 30) / total_bin_count; 1915 (void) fprintf(out, "%10llu |%s%s %-9u ", 1916 1ULL << j, 1917 "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + 30 - depth, 1918 " " + depth, 1919 g_rates == 0 ? lsp->ls_hist[j] : 1920 (uint_t)(((uint64_t)lsp->ls_hist[j] * NANOSEC) / 1921 g_elapsed)); 1922 if (rectype <= LS_STACK(fr) || lsp->ls_stack[fr] == 0) { 1923 (void) fprintf(out, "\n"); 1924 continue; 1925 } 1926 (void) fprintf(out, "%-24s\n", 1927 format_symbol(buf, lsp->ls_stack[fr], 0)); 1928 fr++; 1929 } 1930 while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) { 1931 (void) fprintf(out, "%15s %-36s %-24s\n", "", "", 1932 format_symbol(buf, lsp->ls_stack[fr], 0)); 1933 fr++; 1934 } 1935 } 1936 1937 if (!g_pflag) 1938 (void) fprintf(out, "---------------------------------" 1939 "----------------------------------------------\n"); 1940 1941 (void) fflush(out); 1942 } 1943 1944 static void 1945 report_trace(FILE *out, lsrec_t **sort_buf) 1946 { 1947 lsrec_t *lsp; 1948 int i, fr; 1949 int rectype; 1950 char buf[256], buf2[256]; 1951 1952 rectype = g_recsize; 1953 1954 if (!g_pflag) { 1955 (void) fprintf(out, "%5s %7s %11s %-24s %-24s\n", 1956 "Event", "Time", "Owner", "Lock", "Caller"); 1957 (void) fprintf(out, "---------------------------------" 1958 "----------------------------------------------\n"); 1959 } 1960 1961 for (i = 0; i < g_nrecs_used; i++) { 1962 1963 lsp = sort_buf[i]; 1964 1965 if (lsp->ls_event >= LS_MAX_EVENTS || lsp->ls_count == 0) 1966 continue; 1967 1968 (void) fprintf(out, "%2d %10llu %11p %-24s %-24s\n", 1969 lsp->ls_event, (unsigned long long)lsp->ls_time, 1970 (void *)lsp->ls_next, 1971 #ifdef illumos 1972 format_symbol(buf, lsp->ls_lock, 0), 1973 #else 1974 lsp->ls_lock, 1975 #endif 1976 format_symbol(buf2, lsp->ls_caller, 0)); 1977 1978 if (rectype <= LS_STACK(0)) 1979 continue; 1980 1981 /* 1982 * If we went a few frames below the caller, ignore them 1983 */ 1984 for (fr = 3; fr > 0; fr--) 1985 if (lsp->ls_stack[fr] == lsp->ls_caller) 1986 break; 1987 1988 while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) { 1989 (void) fprintf(out, "%53s %-24s\n", "", 1990 format_symbol(buf, lsp->ls_stack[fr], 0)); 1991 fr++; 1992 } 1993 (void) fprintf(out, "\n"); 1994 } 1995 1996 (void) fflush(out); 1997 } 1998