1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <stdio.h> 29 #include <stddef.h> 30 #include <stdlib.h> 31 #include <stdarg.h> 32 #include <string.h> 33 #include <strings.h> 34 #include <ctype.h> 35 #include <fcntl.h> 36 #include <unistd.h> 37 #include <errno.h> 38 #include <limits.h> 39 #include <sys/types.h> 40 #include <sys/modctl.h> 41 #include <sys/stat.h> 42 #include <sys/wait.h> 43 #include <dtrace.h> 44 #include <sys/lockstat.h> 45 #include <alloca.h> 46 #include <signal.h> 47 #include <assert.h> 48 49 #ifdef illumos 50 #define GETOPT_EOF EOF 51 #else 52 #include <sys/time.h> 53 #include <sys/resource.h> 54 55 #define mergesort(a, b, c, d) lsmergesort(a, b, c, d) 56 #define GETOPT_EOF (-1) 57 58 typedef uintptr_t pc_t; 59 #endif 60 61 #define LOCKSTAT_OPTSTR "x:bths:n:d:i:l:f:e:ckwWgCHEATID:RpPo:V" 62 63 #define LS_MAX_STACK_DEPTH 50 64 #define LS_MAX_EVENTS 64 65 66 typedef struct lsrec { 67 struct lsrec *ls_next; /* next in hash chain */ 68 uintptr_t ls_lock; /* lock address */ 69 uintptr_t ls_caller; /* caller address */ 70 uint32_t ls_count; /* cumulative event count */ 71 uint32_t ls_event; /* type of event */ 72 uintptr_t ls_refcnt; /* cumulative reference count */ 73 uint64_t ls_time; /* cumulative event duration */ 74 uint32_t ls_hist[64]; /* log2(duration) histogram */ 75 uintptr_t ls_stack[LS_MAX_STACK_DEPTH]; 76 } lsrec_t; 77 78 typedef struct lsdata { 79 struct lsrec *lsd_next; /* next available */ 80 int lsd_count; /* number of records */ 81 } lsdata_t; 82 83 /* 84 * Definitions for the types of experiments which can be run. They are 85 * listed in increasing order of memory cost and processing time cost. 86 * The numerical value of each type is the number of bytes needed per record. 87 */ 88 #define LS_BASIC offsetof(lsrec_t, ls_time) 89 #define LS_TIME offsetof(lsrec_t, ls_hist[0]) 90 #define LS_HIST offsetof(lsrec_t, ls_stack[0]) 91 #define LS_STACK(depth) offsetof(lsrec_t, ls_stack[depth]) 92 93 static void report_stats(FILE *, lsrec_t **, size_t, uint64_t, uint64_t); 94 static void report_trace(FILE *, lsrec_t **); 95 96 extern int symtab_init(void); 97 extern char *addr_to_sym(uintptr_t, uintptr_t *, size_t *); 98 extern uintptr_t sym_to_addr(char *name); 99 extern size_t sym_size(char *name); 100 extern char *strtok_r(char *, const char *, char **); 101 102 #define DEFAULT_NRECS 10000 103 #define DEFAULT_HZ 97 104 #define MAX_HZ 1000 105 #define MIN_AGGSIZE (16 * 1024) 106 #define MAX_AGGSIZE (32 * 1024 * 1024) 107 108 static int g_stkdepth; 109 static int g_topn = INT_MAX; 110 static hrtime_t g_elapsed; 111 static int g_rates = 0; 112 static int g_pflag = 0; 113 static int g_Pflag = 0; 114 static int g_wflag = 0; 115 static int g_Wflag = 0; 116 static int g_cflag = 0; 117 static int g_kflag = 0; 118 static int g_gflag = 0; 119 static int g_Vflag = 0; 120 static int g_tracing = 0; 121 static size_t g_recsize; 122 static size_t g_nrecs; 123 static int g_nrecs_used; 124 static uchar_t g_enabled[LS_MAX_EVENTS]; 125 static hrtime_t g_min_duration[LS_MAX_EVENTS]; 126 static dtrace_hdl_t *g_dtp; 127 static char *g_predicate; 128 static char *g_ipredicate; 129 static char *g_prog; 130 static int g_proglen; 131 static int g_dropped; 132 133 typedef struct ls_event_info { 134 char ev_type; 135 char ev_lhdr[20]; 136 char ev_desc[80]; 137 char ev_units[10]; 138 char ev_name[DTRACE_NAMELEN]; 139 char *ev_predicate; 140 char *ev_acquire; 141 } ls_event_info_t; 142 143 static ls_event_info_t g_event_info[LS_MAX_EVENTS] = { 144 { 'C', "Lock", "Adaptive mutex spin", "nsec", 145 "lockstat:::adaptive-spin" }, 146 { 'C', "Lock", "Adaptive mutex block", "nsec", 147 "lockstat:::adaptive-block" }, 148 { 'C', "Lock", "Spin lock spin", "nsec", 149 "lockstat:::spin-spin" }, 150 { 'C', "Lock", "Thread lock spin", "nsec", 151 "lockstat:::thread-spin" }, 152 { 'C', "Lock", "R/W writer blocked by writer", "nsec", 153 "lockstat:::rw-block", "arg2 == 0 && arg3 == 1" }, 154 { 'C', "Lock", "R/W writer blocked by readers", "nsec", 155 "lockstat:::rw-block", "arg2 == 0 && arg3 == 0 && arg4" }, 156 { 'C', "Lock", "R/W reader blocked by writer", "nsec", 157 "lockstat:::rw-block", "arg2 != 0 && arg3 == 1" }, 158 { 'C', "Lock", "R/W reader blocked by write wanted", "nsec", 159 "lockstat:::rw-block", "arg2 != 0 && arg3 == 0 && arg4" }, 160 { 'C', "Lock", "Unknown event (type 8)", "units" }, 161 { 'C', "Lock", "Unknown event (type 9)", "units" }, 162 { 'C', "Lock", "Unknown event (type 10)", "units" }, 163 { 'C', "Lock", "Unknown event (type 11)", "units" }, 164 { 'C', "Lock", "Unknown event (type 12)", "units" }, 165 { 'C', "Lock", "Unknown event (type 13)", "units" }, 166 { 'C', "Lock", "Unknown event (type 14)", "units" }, 167 { 'C', "Lock", "Unknown event (type 15)", "units" }, 168 { 'C', "Lock", "Unknown event (type 16)", "units" }, 169 { 'C', "Lock", "Unknown event (type 17)", "units" }, 170 { 'C', "Lock", "Unknown event (type 18)", "units" }, 171 { 'C', "Lock", "Unknown event (type 19)", "units" }, 172 { 'C', "Lock", "Unknown event (type 20)", "units" }, 173 { 'C', "Lock", "Unknown event (type 21)", "units" }, 174 { 'C', "Lock", "Unknown event (type 22)", "units" }, 175 { 'C', "Lock", "Unknown event (type 23)", "units" }, 176 { 'C', "Lock", "Unknown event (type 24)", "units" }, 177 { 'C', "Lock", "Unknown event (type 25)", "units" }, 178 { 'C', "Lock", "Unknown event (type 26)", "units" }, 179 { 'C', "Lock", "Unknown event (type 27)", "units" }, 180 { 'C', "Lock", "Unknown event (type 28)", "units" }, 181 { 'C', "Lock", "Unknown event (type 29)", "units" }, 182 { 'C', "Lock", "Unknown event (type 30)", "units" }, 183 { 'C', "Lock", "Unknown event (type 31)", "units" }, 184 { 'H', "Lock", "Adaptive mutex hold", "nsec", 185 "lockstat:::adaptive-release", NULL, 186 "lockstat:::adaptive-acquire" }, 187 { 'H', "Lock", "Spin lock hold", "nsec", 188 "lockstat:::spin-release", NULL, 189 "lockstat:::spin-acquire" }, 190 { 'H', "Lock", "R/W writer hold", "nsec", 191 "lockstat:::rw-release", "arg1 == 0", 192 "lockstat:::rw-acquire" }, 193 { 'H', "Lock", "R/W reader hold", "nsec", 194 "lockstat:::rw-release", "arg1 != 0", 195 "lockstat:::rw-acquire" }, 196 { 'H', "Lock", "Unknown event (type 36)", "units" }, 197 { 'H', "Lock", "Unknown event (type 37)", "units" }, 198 { 'H', "Lock", "Unknown event (type 38)", "units" }, 199 { 'H', "Lock", "Unknown event (type 39)", "units" }, 200 { 'H', "Lock", "Unknown event (type 40)", "units" }, 201 { 'H', "Lock", "Unknown event (type 41)", "units" }, 202 { 'H', "Lock", "Unknown event (type 42)", "units" }, 203 { 'H', "Lock", "Unknown event (type 43)", "units" }, 204 { 'H', "Lock", "Unknown event (type 44)", "units" }, 205 { 'H', "Lock", "Unknown event (type 45)", "units" }, 206 { 'H', "Lock", "Unknown event (type 46)", "units" }, 207 { 'H', "Lock", "Unknown event (type 47)", "units" }, 208 { 'H', "Lock", "Unknown event (type 48)", "units" }, 209 { 'H', "Lock", "Unknown event (type 49)", "units" }, 210 { 'H', "Lock", "Unknown event (type 50)", "units" }, 211 { 'H', "Lock", "Unknown event (type 51)", "units" }, 212 { 'H', "Lock", "Unknown event (type 52)", "units" }, 213 { 'H', "Lock", "Unknown event (type 53)", "units" }, 214 { 'H', "Lock", "Unknown event (type 54)", "units" }, 215 { 'H', "Lock", "Unknown event (type 55)", "units" }, 216 #ifdef illumos 217 { 'I', "CPU+PIL", "Profiling interrupt", "nsec", 218 #else 219 { 'I', "CPU+Pri_Class", "Profiling interrupt", "nsec", 220 #endif 221 "profile:::profile-97", NULL }, 222 { 'I', "Lock", "Unknown event (type 57)", "units" }, 223 { 'I', "Lock", "Unknown event (type 58)", "units" }, 224 { 'I', "Lock", "Unknown event (type 59)", "units" }, 225 { 'E', "Lock", "Recursive lock entry detected", "(N/A)", 226 "lockstat:::rw-release", NULL, "lockstat:::rw-acquire" }, 227 { 'E', "Lock", "Lockstat enter failure", "(N/A)" }, 228 { 'E', "Lock", "Lockstat exit failure", "nsec" }, 229 { 'E', "Lock", "Lockstat record failure", "(N/A)" }, 230 }; 231 232 #ifndef illumos 233 static char *g_pri_class[] = { 234 "", 235 "Intr", 236 "RealT", 237 "TShar", 238 "Idle" 239 }; 240 #endif 241 242 static void 243 fail(int do_perror, const char *message, ...) 244 { 245 va_list args; 246 int save_errno = errno; 247 248 va_start(args, message); 249 (void) fprintf(stderr, "lockstat: "); 250 (void) vfprintf(stderr, message, args); 251 va_end(args); 252 if (do_perror) 253 (void) fprintf(stderr, ": %s", strerror(save_errno)); 254 (void) fprintf(stderr, "\n"); 255 exit(2); 256 } 257 258 static void 259 dfail(const char *message, ...) 260 { 261 va_list args; 262 263 va_start(args, message); 264 (void) fprintf(stderr, "lockstat: "); 265 (void) vfprintf(stderr, message, args); 266 va_end(args); 267 (void) fprintf(stderr, ": %s\n", 268 dtrace_errmsg(g_dtp, dtrace_errno(g_dtp))); 269 270 exit(2); 271 } 272 273 static void 274 show_events(char event_type, char *desc) 275 { 276 int i, first = -1, last; 277 278 for (i = 0; i < LS_MAX_EVENTS; i++) { 279 ls_event_info_t *evp = &g_event_info[i]; 280 if (evp->ev_type != event_type || 281 strncmp(evp->ev_desc, "Unknown event", 13) == 0) 282 continue; 283 if (first == -1) 284 first = i; 285 last = i; 286 } 287 288 (void) fprintf(stderr, 289 "\n%s events (lockstat -%c or lockstat -e %d-%d):\n\n", 290 desc, event_type, first, last); 291 292 for (i = first; i <= last; i++) 293 (void) fprintf(stderr, 294 "%4d = %s\n", i, g_event_info[i].ev_desc); 295 } 296 297 static void 298 usage(void) 299 { 300 (void) fprintf(stderr, 301 "Usage: lockstat [options] command [args]\n" 302 "\nEvent selection options:\n\n" 303 " -C watch contention events [on by default]\n" 304 " -E watch error events [off by default]\n" 305 " -H watch hold events [off by default]\n" 306 " -I watch interrupt events [off by default]\n" 307 " -A watch all lock events [equivalent to -CH]\n" 308 " -e event_list only watch the specified events (shown below);\n" 309 " <event_list> is a comma-separated list of\n" 310 " events or ranges of events, e.g. 1,4-7,35\n" 311 " -i rate interrupt rate for -I [default: %d Hz]\n" 312 "\nData gathering options:\n\n" 313 " -b basic statistics (lock, caller, event count)\n" 314 " -t timing for all events [default]\n" 315 " -h histograms for event times\n" 316 " -s depth stack traces <depth> deep\n" 317 " -x opt[=val] enable or modify DTrace options\n" 318 "\nData filtering options:\n\n" 319 " -n nrecords maximum number of data records [default: %d]\n" 320 " -l lock[,size] only watch <lock>, which can be specified as a\n" 321 " symbolic name or hex address; <size> defaults\n" 322 " to the ELF symbol size if available, 1 if not\n" 323 " -f func[,size] only watch events generated by <func>\n" 324 " -d duration only watch events longer than <duration>\n" 325 " -T trace (rather than sample) events\n" 326 "\nData reporting options:\n\n" 327 " -c coalesce lock data for arrays like pse_mutex[]\n" 328 " -k coalesce PCs within functions\n" 329 " -g show total events generated by function\n" 330 " -w wherever: don't distinguish events by caller\n" 331 " -W whichever: don't distinguish events by lock\n" 332 " -R display rates rather than counts\n" 333 " -p parsable output format (awk(1)-friendly)\n" 334 " -P sort lock data by (count * avg_time) product\n" 335 " -D n only display top <n> events of each type\n" 336 " -o filename send output to <filename>\n", 337 DEFAULT_HZ, DEFAULT_NRECS); 338 339 show_events('C', "Contention"); 340 show_events('H', "Hold-time"); 341 show_events('I', "Interrupt"); 342 show_events('E', "Error"); 343 (void) fprintf(stderr, "\n"); 344 345 exit(1); 346 } 347 348 static int 349 lockcmp(lsrec_t *a, lsrec_t *b) 350 { 351 int i; 352 353 if (a->ls_event < b->ls_event) 354 return (-1); 355 if (a->ls_event > b->ls_event) 356 return (1); 357 358 for (i = g_stkdepth - 1; i >= 0; i--) { 359 if (a->ls_stack[i] < b->ls_stack[i]) 360 return (-1); 361 if (a->ls_stack[i] > b->ls_stack[i]) 362 return (1); 363 } 364 365 if (a->ls_caller < b->ls_caller) 366 return (-1); 367 if (a->ls_caller > b->ls_caller) 368 return (1); 369 370 if (a->ls_lock < b->ls_lock) 371 return (-1); 372 if (a->ls_lock > b->ls_lock) 373 return (1); 374 375 return (0); 376 } 377 378 static int 379 countcmp(lsrec_t *a, lsrec_t *b) 380 { 381 if (a->ls_event < b->ls_event) 382 return (-1); 383 if (a->ls_event > b->ls_event) 384 return (1); 385 386 return (b->ls_count - a->ls_count); 387 } 388 389 static int 390 timecmp(lsrec_t *a, lsrec_t *b) 391 { 392 if (a->ls_event < b->ls_event) 393 return (-1); 394 if (a->ls_event > b->ls_event) 395 return (1); 396 397 if (a->ls_time < b->ls_time) 398 return (1); 399 if (a->ls_time > b->ls_time) 400 return (-1); 401 402 return (0); 403 } 404 405 static int 406 lockcmp_anywhere(lsrec_t *a, lsrec_t *b) 407 { 408 if (a->ls_event < b->ls_event) 409 return (-1); 410 if (a->ls_event > b->ls_event) 411 return (1); 412 413 if (a->ls_lock < b->ls_lock) 414 return (-1); 415 if (a->ls_lock > b->ls_lock) 416 return (1); 417 418 return (0); 419 } 420 421 static int 422 lock_and_count_cmp_anywhere(lsrec_t *a, lsrec_t *b) 423 { 424 if (a->ls_event < b->ls_event) 425 return (-1); 426 if (a->ls_event > b->ls_event) 427 return (1); 428 429 if (a->ls_lock < b->ls_lock) 430 return (-1); 431 if (a->ls_lock > b->ls_lock) 432 return (1); 433 434 return (b->ls_count - a->ls_count); 435 } 436 437 static int 438 sitecmp_anylock(lsrec_t *a, lsrec_t *b) 439 { 440 int i; 441 442 if (a->ls_event < b->ls_event) 443 return (-1); 444 if (a->ls_event > b->ls_event) 445 return (1); 446 447 for (i = g_stkdepth - 1; i >= 0; i--) { 448 if (a->ls_stack[i] < b->ls_stack[i]) 449 return (-1); 450 if (a->ls_stack[i] > b->ls_stack[i]) 451 return (1); 452 } 453 454 if (a->ls_caller < b->ls_caller) 455 return (-1); 456 if (a->ls_caller > b->ls_caller) 457 return (1); 458 459 return (0); 460 } 461 462 static int 463 site_and_count_cmp_anylock(lsrec_t *a, lsrec_t *b) 464 { 465 int i; 466 467 if (a->ls_event < b->ls_event) 468 return (-1); 469 if (a->ls_event > b->ls_event) 470 return (1); 471 472 for (i = g_stkdepth - 1; i >= 0; i--) { 473 if (a->ls_stack[i] < b->ls_stack[i]) 474 return (-1); 475 if (a->ls_stack[i] > b->ls_stack[i]) 476 return (1); 477 } 478 479 if (a->ls_caller < b->ls_caller) 480 return (-1); 481 if (a->ls_caller > b->ls_caller) 482 return (1); 483 484 return (b->ls_count - a->ls_count); 485 } 486 487 static void 488 lsmergesort(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **a, lsrec_t **b, int n) 489 { 490 int m = n / 2; 491 int i, j; 492 493 if (m > 1) 494 lsmergesort(cmp, a, b, m); 495 if (n - m > 1) 496 lsmergesort(cmp, a + m, b + m, n - m); 497 for (i = m; i > 0; i--) 498 b[i - 1] = a[i - 1]; 499 for (j = m - 1; j < n - 1; j++) 500 b[n + m - j - 2] = a[j + 1]; 501 while (i < j) 502 *a++ = cmp(b[i], b[j]) < 0 ? b[i++] : b[j--]; 503 *a = b[i]; 504 } 505 506 static void 507 coalesce(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **lock, int n) 508 { 509 int i, j; 510 lsrec_t *target, *current; 511 512 target = lock[0]; 513 514 for (i = 1; i < n; i++) { 515 current = lock[i]; 516 if (cmp(current, target) != 0) { 517 target = current; 518 continue; 519 } 520 current->ls_event = LS_MAX_EVENTS; 521 target->ls_count += current->ls_count; 522 target->ls_refcnt += current->ls_refcnt; 523 if (g_recsize < LS_TIME) 524 continue; 525 target->ls_time += current->ls_time; 526 if (g_recsize < LS_HIST) 527 continue; 528 for (j = 0; j < 64; j++) 529 target->ls_hist[j] += current->ls_hist[j]; 530 } 531 } 532 533 static void 534 coalesce_symbol(uintptr_t *addrp) 535 { 536 uintptr_t symoff; 537 size_t symsize; 538 539 if (addr_to_sym(*addrp, &symoff, &symsize) != NULL && symoff < symsize) 540 *addrp -= symoff; 541 } 542 543 static void 544 predicate_add(char **pred, char *what, char *cmp, uintptr_t value) 545 { 546 char *new; 547 int len, newlen; 548 549 if (what == NULL) 550 return; 551 552 if (*pred == NULL) { 553 *pred = malloc(1); 554 *pred[0] = '\0'; 555 } 556 557 len = strlen(*pred); 558 newlen = len + strlen(what) + 32 + strlen("( && )"); 559 new = malloc(newlen); 560 561 if (*pred[0] != '\0') { 562 if (cmp != NULL) { 563 (void) sprintf(new, "(%s) && (%s %s 0x%p)", 564 *pred, what, cmp, (void *)value); 565 } else { 566 (void) sprintf(new, "(%s) && (%s)", *pred, what); 567 } 568 } else { 569 if (cmp != NULL) { 570 (void) sprintf(new, "%s %s 0x%p", 571 what, cmp, (void *)value); 572 } else { 573 (void) sprintf(new, "%s", what); 574 } 575 } 576 577 free(*pred); 578 *pred = new; 579 } 580 581 static void 582 predicate_destroy(char **pred) 583 { 584 free(*pred); 585 *pred = NULL; 586 } 587 588 static void 589 filter_add(char **filt, char *what, uintptr_t base, uintptr_t size) 590 { 591 char buf[256], *c = buf, *new; 592 int len, newlen; 593 594 if (*filt == NULL) { 595 *filt = malloc(1); 596 *filt[0] = '\0'; 597 } 598 599 #ifdef illumos 600 (void) sprintf(c, "%s(%s >= 0x%p && %s < 0x%p)", *filt[0] != '\0' ? 601 " || " : "", what, (void *)base, what, (void *)(base + size)); 602 #else 603 (void) sprintf(c, "%s(%s >= %p && %s < %p)", *filt[0] != '\0' ? 604 " || " : "", what, (void *)base, what, (void *)(base + size)); 605 #endif 606 607 newlen = (len = strlen(*filt) + 1) + strlen(c); 608 new = malloc(newlen); 609 bcopy(*filt, new, len); 610 (void) strcat(new, c); 611 free(*filt); 612 *filt = new; 613 } 614 615 static void 616 filter_destroy(char **filt) 617 { 618 free(*filt); 619 *filt = NULL; 620 } 621 622 static void 623 dprog_add(const char *fmt, ...) 624 { 625 va_list args; 626 int size, offs; 627 char c; 628 629 va_start(args, fmt); 630 size = vsnprintf(&c, 1, fmt, args) + 1; 631 va_end(args); 632 633 if (g_proglen == 0) { 634 offs = 0; 635 } else { 636 offs = g_proglen - 1; 637 } 638 639 g_proglen = offs + size; 640 641 if ((g_prog = realloc(g_prog, g_proglen)) == NULL) 642 fail(1, "failed to reallocate program text"); 643 644 va_start(args, fmt); 645 (void) vsnprintf(&g_prog[offs], size, fmt, args); 646 va_end(args); 647 } 648 649 /* 650 * This function may read like an open sewer, but keep in mind that programs 651 * that generate other programs are rarely pretty. If one has the unenviable 652 * task of maintaining or -- worse -- extending this code, use the -V option 653 * to examine the D program as generated by this function. 654 */ 655 static void 656 dprog_addevent(int event) 657 { 658 ls_event_info_t *info = &g_event_info[event]; 659 char *pred = NULL; 660 char stack[20]; 661 const char *arg0, *caller; 662 char *arg1 = "arg1"; 663 char buf[80]; 664 hrtime_t dur; 665 int depth; 666 667 if (info->ev_name[0] == '\0') 668 return; 669 670 if (info->ev_type == 'I') { 671 /* 672 * For interrupt events, arg0 (normally the lock pointer) is 673 * the CPU address plus the current pil, and arg1 (normally 674 * the number of nanoseconds) is the number of nanoseconds 675 * late -- and it's stored in arg2. 676 */ 677 #ifdef illumos 678 arg0 = "(uintptr_t)curthread->t_cpu + \n" 679 "\t curthread->t_cpu->cpu_profile_pil"; 680 #else 681 arg0 = "(uintptr_t)(curthread->td_oncpu << 16) + \n" 682 "\t 0x01000000 + curthread->td_pri_class"; 683 #endif 684 caller = "(uintptr_t)arg0"; 685 arg1 = "arg2"; 686 } else { 687 arg0 = "(uintptr_t)arg0"; 688 caller = "caller"; 689 } 690 691 if (g_recsize > LS_HIST) { 692 for (depth = 0; g_recsize > LS_STACK(depth); depth++) 693 continue; 694 695 if (g_tracing) { 696 (void) sprintf(stack, "\tstack(%d);\n", depth); 697 } else { 698 (void) sprintf(stack, ", stack(%d)", depth); 699 } 700 } else { 701 (void) sprintf(stack, ""); 702 } 703 704 if (info->ev_acquire != NULL) { 705 /* 706 * If this is a hold event, we need to generate an additional 707 * clause for the acquire; the clause for the release will be 708 * generated with the aggregating statement, below. 709 */ 710 dprog_add("%s\n", info->ev_acquire); 711 predicate_add(&pred, info->ev_predicate, NULL, 0); 712 predicate_add(&pred, g_predicate, NULL, 0); 713 if (pred != NULL) 714 dprog_add("/%s/\n", pred); 715 716 dprog_add("{\n"); 717 (void) sprintf(buf, "self->ev%d[(uintptr_t)arg0]", event); 718 719 if (info->ev_type == 'H') { 720 dprog_add("\t%s = timestamp;\n", buf); 721 } else { 722 /* 723 * If this isn't a hold event, it's the recursive 724 * error event. For this, we simply bump the 725 * thread-local, per-lock count. 726 */ 727 dprog_add("\t%s++;\n", buf); 728 } 729 730 dprog_add("}\n\n"); 731 predicate_destroy(&pred); 732 pred = NULL; 733 734 if (info->ev_type == 'E') { 735 /* 736 * If this is the recursive lock error event, we need 737 * to generate an additional clause to decrement the 738 * thread-local, per-lock count. This assures that we 739 * only execute the aggregating clause if we have 740 * recursive entry. 741 */ 742 dprog_add("%s\n", info->ev_name); 743 dprog_add("/%s/\n{\n\t%s--;\n}\n\n", buf, buf); 744 } 745 746 predicate_add(&pred, buf, NULL, 0); 747 748 if (info->ev_type == 'H') { 749 (void) sprintf(buf, "timestamp -\n\t " 750 "self->ev%d[(uintptr_t)arg0]", event); 751 } 752 753 arg1 = buf; 754 } else { 755 predicate_add(&pred, info->ev_predicate, NULL, 0); 756 if (info->ev_type != 'I') 757 predicate_add(&pred, g_predicate, NULL, 0); 758 else 759 predicate_add(&pred, g_ipredicate, NULL, 0); 760 } 761 762 if ((dur = g_min_duration[event]) != 0) 763 predicate_add(&pred, arg1, ">=", dur); 764 765 dprog_add("%s\n", info->ev_name); 766 767 if (pred != NULL) 768 dprog_add("/%s/\n", pred); 769 predicate_destroy(&pred); 770 771 dprog_add("{\n"); 772 773 if (g_tracing) { 774 dprog_add("\ttrace(%dULL);\n", event); 775 dprog_add("\ttrace(%s);\n", arg0); 776 dprog_add("\ttrace(%s);\n", caller); 777 dprog_add(stack); 778 } else { 779 /* 780 * The ordering here is important: when we process the 781 * aggregate, we count on the fact that @avg appears before 782 * @hist in program order to assure that @avg is assigned the 783 * first aggregation variable ID and @hist assigned the 784 * second; see the comment in process_aggregate() for details. 785 */ 786 dprog_add("\t@avg[%dULL, %s, %s%s] = avg(%s);\n", 787 event, arg0, caller, stack, arg1); 788 789 if (g_recsize >= LS_HIST) { 790 dprog_add("\t@hist[%dULL, %s, %s%s] = quantize" 791 "(%s);\n", event, arg0, caller, stack, arg1); 792 } 793 } 794 795 if (info->ev_acquire != NULL) 796 dprog_add("\tself->ev%d[arg0] = 0;\n", event); 797 798 dprog_add("}\n\n"); 799 } 800 801 static void 802 dprog_compile() 803 { 804 dtrace_prog_t *prog; 805 dtrace_proginfo_t info; 806 807 if (g_Vflag) { 808 (void) fprintf(stderr, "lockstat: vvvv D program vvvv\n"); 809 (void) fputs(g_prog, stderr); 810 (void) fprintf(stderr, "lockstat: ^^^^ D program ^^^^\n"); 811 } 812 813 if ((prog = dtrace_program_strcompile(g_dtp, g_prog, 814 DTRACE_PROBESPEC_NAME, 0, 0, NULL)) == NULL) 815 dfail("failed to compile program"); 816 817 if (dtrace_program_exec(g_dtp, prog, &info) == -1) 818 dfail("failed to enable probes"); 819 820 if (dtrace_go(g_dtp) != 0) 821 dfail("couldn't start tracing"); 822 } 823 824 static void 825 #ifdef illumos 826 status_fire(void) 827 #else 828 status_fire(int i) 829 #endif 830 {} 831 832 static void 833 status_init(void) 834 { 835 dtrace_optval_t val, status, agg; 836 struct sigaction act; 837 struct itimerspec ts; 838 struct sigevent ev; 839 timer_t tid; 840 841 if (dtrace_getopt(g_dtp, "statusrate", &status) == -1) 842 dfail("failed to get 'statusrate'"); 843 844 if (dtrace_getopt(g_dtp, "aggrate", &agg) == -1) 845 dfail("failed to get 'statusrate'"); 846 847 /* 848 * We would want to awaken at a rate that is the GCD of the statusrate 849 * and the aggrate -- but that seems a bit absurd. Instead, we'll 850 * simply awaken at a rate that is the more frequent of the two, which 851 * assures that we're never later than the interval implied by the 852 * more frequent rate. 853 */ 854 val = status < agg ? status : agg; 855 856 (void) sigemptyset(&act.sa_mask); 857 act.sa_flags = 0; 858 act.sa_handler = status_fire; 859 (void) sigaction(SIGUSR1, &act, NULL); 860 861 ev.sigev_notify = SIGEV_SIGNAL; 862 ev.sigev_signo = SIGUSR1; 863 864 if (timer_create(CLOCK_REALTIME, &ev, &tid) == -1) 865 dfail("cannot create CLOCK_REALTIME timer"); 866 867 ts.it_value.tv_sec = val / NANOSEC; 868 ts.it_value.tv_nsec = val % NANOSEC; 869 ts.it_interval = ts.it_value; 870 871 if (timer_settime(tid, TIMER_RELTIME, &ts, NULL) == -1) 872 dfail("cannot set time on CLOCK_REALTIME timer"); 873 } 874 875 static void 876 status_check(void) 877 { 878 if (!g_tracing && dtrace_aggregate_snap(g_dtp) != 0) 879 dfail("failed to snap aggregate"); 880 881 if (dtrace_status(g_dtp) == -1) 882 dfail("dtrace_status()"); 883 } 884 885 static void 886 lsrec_fill(lsrec_t *lsrec, const dtrace_recdesc_t *rec, int nrecs, caddr_t data) 887 { 888 bzero(lsrec, g_recsize); 889 lsrec->ls_count = 1; 890 891 if ((g_recsize > LS_HIST && nrecs < 4) || (nrecs < 3)) 892 fail(0, "truncated DTrace record"); 893 894 if (rec->dtrd_size != sizeof (uint64_t)) 895 fail(0, "bad event size in first record"); 896 897 /* LINTED - alignment */ 898 lsrec->ls_event = (uint32_t)*((uint64_t *)(data + rec->dtrd_offset)); 899 rec++; 900 901 if (rec->dtrd_size != sizeof (uintptr_t)) 902 fail(0, "bad lock address size in second record"); 903 904 /* LINTED - alignment */ 905 lsrec->ls_lock = *((uintptr_t *)(data + rec->dtrd_offset)); 906 rec++; 907 908 if (rec->dtrd_size != sizeof (uintptr_t)) 909 fail(0, "bad caller size in third record"); 910 911 /* LINTED - alignment */ 912 lsrec->ls_caller = *((uintptr_t *)(data + rec->dtrd_offset)); 913 rec++; 914 915 if (g_recsize > LS_HIST) { 916 int frames, i; 917 pc_t *stack; 918 919 frames = rec->dtrd_size / sizeof (pc_t); 920 /* LINTED - alignment */ 921 stack = (pc_t *)(data + rec->dtrd_offset); 922 923 for (i = 1; i < frames; i++) 924 lsrec->ls_stack[i - 1] = stack[i]; 925 } 926 } 927 928 /*ARGSUSED*/ 929 static int 930 count_aggregate(const dtrace_aggdata_t *agg, void *arg) 931 { 932 *((size_t *)arg) += 1; 933 934 return (DTRACE_AGGWALK_NEXT); 935 } 936 937 static int 938 process_aggregate(const dtrace_aggdata_t *agg, void *arg) 939 { 940 const dtrace_aggdesc_t *aggdesc = agg->dtada_desc; 941 caddr_t data = agg->dtada_data; 942 lsdata_t *lsdata = arg; 943 lsrec_t *lsrec = lsdata->lsd_next; 944 const dtrace_recdesc_t *rec; 945 uint64_t *avg, *quantized; 946 int i, j; 947 948 assert(lsdata->lsd_count < g_nrecs); 949 950 /* 951 * Aggregation variable IDs are guaranteed to be generated in program 952 * order, and they are guaranteed to start from DTRACE_AGGVARIDNONE 953 * plus one. As "avg" appears before "hist" in program order, we know 954 * that "avg" will be allocated the first aggregation variable ID, and 955 * "hist" will be allocated the second aggregation variable ID -- and 956 * we therefore use the aggregation variable ID to differentiate the 957 * cases. 958 */ 959 if (aggdesc->dtagd_varid > DTRACE_AGGVARIDNONE + 1) { 960 /* 961 * If this is the histogram entry. We'll copy the quantized 962 * data into lc_hist, and jump over the rest. 963 */ 964 rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1]; 965 966 if (aggdesc->dtagd_varid != DTRACE_AGGVARIDNONE + 2) 967 fail(0, "bad variable ID in aggregation record"); 968 969 if (rec->dtrd_size != 970 DTRACE_QUANTIZE_NBUCKETS * sizeof (uint64_t)) 971 fail(0, "bad quantize size in aggregation record"); 972 973 /* LINTED - alignment */ 974 quantized = (uint64_t *)(data + rec->dtrd_offset); 975 976 for (i = DTRACE_QUANTIZE_ZEROBUCKET, j = 0; 977 i < DTRACE_QUANTIZE_NBUCKETS; i++, j++) 978 lsrec->ls_hist[j] = quantized[i]; 979 980 goto out; 981 } 982 983 lsrec_fill(lsrec, &aggdesc->dtagd_rec[1], 984 aggdesc->dtagd_nrecs - 1, data); 985 986 rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1]; 987 988 if (rec->dtrd_size != 2 * sizeof (uint64_t)) 989 fail(0, "bad avg size in aggregation record"); 990 991 /* LINTED - alignment */ 992 avg = (uint64_t *)(data + rec->dtrd_offset); 993 lsrec->ls_count = (uint32_t)avg[0]; 994 lsrec->ls_time = (uintptr_t)avg[1]; 995 996 if (g_recsize >= LS_HIST) 997 return (DTRACE_AGGWALK_NEXT); 998 999 out: 1000 lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize); 1001 lsdata->lsd_count++; 1002 1003 return (DTRACE_AGGWALK_NEXT); 1004 } 1005 1006 static int 1007 process_trace(const dtrace_probedata_t *pdata, void *arg) 1008 { 1009 lsdata_t *lsdata = arg; 1010 lsrec_t *lsrec = lsdata->lsd_next; 1011 dtrace_eprobedesc_t *edesc = pdata->dtpda_edesc; 1012 caddr_t data = pdata->dtpda_data; 1013 1014 if (lsdata->lsd_count >= g_nrecs) 1015 return (DTRACE_CONSUME_NEXT); 1016 1017 lsrec_fill(lsrec, edesc->dtepd_rec, edesc->dtepd_nrecs, data); 1018 1019 lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize); 1020 lsdata->lsd_count++; 1021 1022 return (DTRACE_CONSUME_NEXT); 1023 } 1024 1025 static int 1026 process_data(FILE *out, char *data) 1027 { 1028 lsdata_t lsdata; 1029 1030 /* LINTED - alignment */ 1031 lsdata.lsd_next = (lsrec_t *)data; 1032 lsdata.lsd_count = 0; 1033 1034 if (g_tracing) { 1035 if (dtrace_consume(g_dtp, out, 1036 process_trace, NULL, &lsdata) != 0) 1037 dfail("failed to consume buffer"); 1038 1039 return (lsdata.lsd_count); 1040 } 1041 1042 if (dtrace_aggregate_walk_keyvarsorted(g_dtp, 1043 process_aggregate, &lsdata) != 0) 1044 dfail("failed to walk aggregate"); 1045 1046 return (lsdata.lsd_count); 1047 } 1048 1049 /*ARGSUSED*/ 1050 static int 1051 drophandler(const dtrace_dropdata_t *data, void *arg) 1052 { 1053 g_dropped++; 1054 (void) fprintf(stderr, "lockstat: warning: %s", data->dtdda_msg); 1055 return (DTRACE_HANDLE_OK); 1056 } 1057 1058 int 1059 main(int argc, char **argv) 1060 { 1061 char *data_buf; 1062 lsrec_t *lsp, **current, **first, **sort_buf, **merge_buf; 1063 FILE *out = stdout; 1064 int c; 1065 pid_t child; 1066 int status; 1067 int i, j; 1068 hrtime_t duration; 1069 char *addrp, *offp, *sizep, *evp, *lastp, *p; 1070 uintptr_t addr; 1071 size_t size, off; 1072 int events_specified = 0; 1073 int exec_errno = 0; 1074 uint32_t event; 1075 char *filt = NULL, *ifilt = NULL; 1076 static uint64_t ev_count[LS_MAX_EVENTS + 1]; 1077 static uint64_t ev_time[LS_MAX_EVENTS + 1]; 1078 dtrace_optval_t aggsize; 1079 char aggstr[10]; 1080 long ncpus; 1081 int dynvar = 0; 1082 int err; 1083 1084 if ((g_dtp = dtrace_open(DTRACE_VERSION, 0, &err)) == NULL) { 1085 fail(0, "cannot open dtrace library: %s", 1086 dtrace_errmsg(NULL, err)); 1087 } 1088 1089 if (dtrace_handle_drop(g_dtp, &drophandler, NULL) == -1) 1090 dfail("couldn't establish drop handler"); 1091 1092 if (symtab_init() == -1) 1093 fail(1, "can't load kernel symbols"); 1094 1095 g_nrecs = DEFAULT_NRECS; 1096 1097 while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != GETOPT_EOF) { 1098 switch (c) { 1099 case 'b': 1100 g_recsize = LS_BASIC; 1101 break; 1102 1103 case 't': 1104 g_recsize = LS_TIME; 1105 break; 1106 1107 case 'h': 1108 g_recsize = LS_HIST; 1109 break; 1110 1111 case 's': 1112 if (!isdigit(optarg[0])) 1113 usage(); 1114 g_stkdepth = atoi(optarg); 1115 if (g_stkdepth > LS_MAX_STACK_DEPTH) 1116 fail(0, "max stack depth is %d", 1117 LS_MAX_STACK_DEPTH); 1118 g_recsize = LS_STACK(g_stkdepth); 1119 break; 1120 1121 case 'n': 1122 if (!isdigit(optarg[0])) 1123 usage(); 1124 g_nrecs = atoi(optarg); 1125 break; 1126 1127 case 'd': 1128 if (!isdigit(optarg[0])) 1129 usage(); 1130 duration = atoll(optarg); 1131 1132 /* 1133 * XXX -- durations really should be per event 1134 * since the units are different, but it's hard 1135 * to express this nicely in the interface. 1136 * Not clear yet what the cleanest solution is. 1137 */ 1138 for (i = 0; i < LS_MAX_EVENTS; i++) 1139 if (g_event_info[i].ev_type != 'E') 1140 g_min_duration[i] = duration; 1141 1142 break; 1143 1144 case 'i': 1145 if (!isdigit(optarg[0])) 1146 usage(); 1147 i = atoi(optarg); 1148 if (i <= 0) 1149 usage(); 1150 if (i > MAX_HZ) 1151 fail(0, "max interrupt rate is %d Hz", MAX_HZ); 1152 1153 for (j = 0; j < LS_MAX_EVENTS; j++) 1154 if (strcmp(g_event_info[j].ev_desc, 1155 "Profiling interrupt") == 0) 1156 break; 1157 1158 (void) sprintf(g_event_info[j].ev_name, 1159 "profile:::profile-%d", i); 1160 break; 1161 1162 case 'l': 1163 case 'f': 1164 addrp = strtok(optarg, ","); 1165 sizep = strtok(NULL, ","); 1166 addrp = strtok(optarg, ",+"); 1167 offp = strtok(NULL, ","); 1168 1169 size = sizep ? strtoul(sizep, NULL, 0) : 1; 1170 off = offp ? strtoul(offp, NULL, 0) : 0; 1171 1172 if (addrp[0] == '0') { 1173 addr = strtoul(addrp, NULL, 16) + off; 1174 } else { 1175 addr = sym_to_addr(addrp) + off; 1176 if (sizep == NULL) 1177 size = sym_size(addrp) - off; 1178 if (addr - off == 0) 1179 fail(0, "symbol '%s' not found", addrp); 1180 if (size == 0) 1181 size = 1; 1182 } 1183 1184 1185 if (c == 'l') { 1186 filter_add(&filt, "arg0", addr, size); 1187 } else { 1188 filter_add(&filt, "caller", addr, size); 1189 filter_add(&ifilt, "arg0", addr, size); 1190 } 1191 break; 1192 1193 case 'e': 1194 evp = strtok_r(optarg, ",", &lastp); 1195 while (evp) { 1196 int ev1, ev2; 1197 char *evp2; 1198 1199 (void) strtok(evp, "-"); 1200 evp2 = strtok(NULL, "-"); 1201 ev1 = atoi(evp); 1202 ev2 = evp2 ? atoi(evp2) : ev1; 1203 if ((uint_t)ev1 >= LS_MAX_EVENTS || 1204 (uint_t)ev2 >= LS_MAX_EVENTS || ev1 > ev2) 1205 fail(0, "-e events out of range"); 1206 for (i = ev1; i <= ev2; i++) 1207 g_enabled[i] = 1; 1208 evp = strtok_r(NULL, ",", &lastp); 1209 } 1210 events_specified = 1; 1211 break; 1212 1213 case 'c': 1214 g_cflag = 1; 1215 break; 1216 1217 case 'k': 1218 g_kflag = 1; 1219 break; 1220 1221 case 'w': 1222 g_wflag = 1; 1223 break; 1224 1225 case 'W': 1226 g_Wflag = 1; 1227 break; 1228 1229 case 'g': 1230 g_gflag = 1; 1231 break; 1232 1233 case 'C': 1234 case 'E': 1235 case 'H': 1236 case 'I': 1237 for (i = 0; i < LS_MAX_EVENTS; i++) 1238 if (g_event_info[i].ev_type == c) 1239 g_enabled[i] = 1; 1240 events_specified = 1; 1241 break; 1242 1243 case 'A': 1244 for (i = 0; i < LS_MAX_EVENTS; i++) 1245 if (strchr("CH", g_event_info[i].ev_type)) 1246 g_enabled[i] = 1; 1247 events_specified = 1; 1248 break; 1249 1250 case 'T': 1251 g_tracing = 1; 1252 break; 1253 1254 case 'D': 1255 if (!isdigit(optarg[0])) 1256 usage(); 1257 g_topn = atoi(optarg); 1258 break; 1259 1260 case 'R': 1261 g_rates = 1; 1262 break; 1263 1264 case 'p': 1265 g_pflag = 1; 1266 break; 1267 1268 case 'P': 1269 g_Pflag = 1; 1270 break; 1271 1272 case 'o': 1273 if ((out = fopen(optarg, "w")) == NULL) 1274 fail(1, "error opening file"); 1275 break; 1276 1277 case 'V': 1278 g_Vflag = 1; 1279 break; 1280 1281 default: 1282 if (strchr(LOCKSTAT_OPTSTR, c) == NULL) 1283 usage(); 1284 } 1285 } 1286 1287 if (filt != NULL) { 1288 predicate_add(&g_predicate, filt, NULL, 0); 1289 filter_destroy(&filt); 1290 } 1291 1292 if (ifilt != NULL) { 1293 predicate_add(&g_ipredicate, ifilt, NULL, 0); 1294 filter_destroy(&ifilt); 1295 } 1296 1297 if (g_recsize == 0) { 1298 if (g_gflag) { 1299 g_stkdepth = LS_MAX_STACK_DEPTH; 1300 g_recsize = LS_STACK(g_stkdepth); 1301 } else { 1302 g_recsize = LS_TIME; 1303 } 1304 } 1305 1306 if (g_gflag && g_recsize <= LS_STACK(0)) 1307 fail(0, "'-g' requires at least '-s 1' data gathering"); 1308 1309 /* 1310 * Make sure the alignment is reasonable 1311 */ 1312 g_recsize = -(-g_recsize & -sizeof (uint64_t)); 1313 1314 for (i = 0; i < LS_MAX_EVENTS; i++) { 1315 /* 1316 * If no events were specified, enable -C. 1317 */ 1318 if (!events_specified && g_event_info[i].ev_type == 'C') 1319 g_enabled[i] = 1; 1320 } 1321 1322 for (i = 0; i < LS_MAX_EVENTS; i++) { 1323 if (!g_enabled[i]) 1324 continue; 1325 1326 if (g_event_info[i].ev_acquire != NULL) { 1327 /* 1328 * If we've enabled a hold event, we must explicitly 1329 * allocate dynamic variable space. 1330 */ 1331 dynvar = 1; 1332 } 1333 1334 dprog_addevent(i); 1335 } 1336 1337 /* 1338 * Make sure there are remaining arguments to specify a child command 1339 * to execute. 1340 */ 1341 if (argc <= optind) 1342 usage(); 1343 1344 if ((ncpus = sysconf(_SC_NPROCESSORS_ONLN)) == -1) 1345 dfail("couldn't determine number of online CPUs"); 1346 1347 /* 1348 * By default, we set our data buffer size to be the number of records 1349 * multiplied by the size of the record, doubled to account for some 1350 * DTrace slop and divided by the number of CPUs. We silently clamp 1351 * the aggregation size at both a minimum and a maximum to prevent 1352 * absurdly low or high values. 1353 */ 1354 if ((aggsize = (g_nrecs * g_recsize * 2) / ncpus) < MIN_AGGSIZE) 1355 aggsize = MIN_AGGSIZE; 1356 1357 if (aggsize > MAX_AGGSIZE) 1358 aggsize = MAX_AGGSIZE; 1359 1360 (void) sprintf(aggstr, "%lld", (long long)aggsize); 1361 1362 if (!g_tracing) { 1363 if (dtrace_setopt(g_dtp, "bufsize", "4k") == -1) 1364 dfail("failed to set 'bufsize'"); 1365 1366 if (dtrace_setopt(g_dtp, "aggsize", aggstr) == -1) 1367 dfail("failed to set 'aggsize'"); 1368 1369 if (dynvar) { 1370 /* 1371 * If we're using dynamic variables, we set our 1372 * dynamic variable size to be one megabyte per CPU, 1373 * with a hard-limit of 32 megabytes. This may still 1374 * be too small in some cases, but it can be tuned 1375 * manually via -x if need be. 1376 */ 1377 (void) sprintf(aggstr, "%ldm", ncpus < 32 ? ncpus : 32); 1378 1379 if (dtrace_setopt(g_dtp, "dynvarsize", aggstr) == -1) 1380 dfail("failed to set 'dynvarsize'"); 1381 } 1382 } else { 1383 if (dtrace_setopt(g_dtp, "bufsize", aggstr) == -1) 1384 dfail("failed to set 'bufsize'"); 1385 } 1386 1387 if (dtrace_setopt(g_dtp, "statusrate", "10sec") == -1) 1388 dfail("failed to set 'statusrate'"); 1389 1390 optind = 1; 1391 while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != GETOPT_EOF) { 1392 switch (c) { 1393 case 'x': 1394 if ((p = strchr(optarg, '=')) != NULL) 1395 *p++ = '\0'; 1396 1397 if (dtrace_setopt(g_dtp, optarg, p) != 0) 1398 dfail("failed to set -x %s", optarg); 1399 break; 1400 } 1401 } 1402 1403 argc -= optind; 1404 argv += optind; 1405 1406 dprog_compile(); 1407 status_init(); 1408 1409 g_elapsed = -gethrtime(); 1410 1411 /* 1412 * Spawn the specified command and wait for it to complete. 1413 */ 1414 child = fork(); 1415 if (child == -1) 1416 fail(1, "cannot fork"); 1417 if (child == 0) { 1418 (void) dtrace_close(g_dtp); 1419 (void) execvp(argv[0], &argv[0]); 1420 exec_errno = errno; 1421 exit(127); 1422 } 1423 1424 #ifdef illumos 1425 while (waitpid(child, &status, WEXITED) != child) 1426 #else 1427 while (waitpid(child, &status, 0) != child) 1428 #endif 1429 status_check(); 1430 1431 g_elapsed += gethrtime(); 1432 1433 if (WIFEXITED(status)) { 1434 if (WEXITSTATUS(status) != 0) { 1435 if (exec_errno != 0) { 1436 errno = exec_errno; 1437 fail(1, "could not execute %s", argv[0]); 1438 } 1439 (void) fprintf(stderr, 1440 "lockstat: warning: %s exited with code %d\n", 1441 argv[0], WEXITSTATUS(status)); 1442 } 1443 } else { 1444 (void) fprintf(stderr, 1445 "lockstat: warning: %s died on signal %d\n", 1446 argv[0], WTERMSIG(status)); 1447 } 1448 1449 if (dtrace_stop(g_dtp) == -1) 1450 dfail("failed to stop dtrace"); 1451 1452 /* 1453 * Before we read out the results, we need to allocate our buffer. 1454 * If we're tracing, then we'll just use the precalculated size. If 1455 * we're not, then we'll take a snapshot of the aggregate, and walk 1456 * it to count the number of records. 1457 */ 1458 if (!g_tracing) { 1459 if (dtrace_aggregate_snap(g_dtp) != 0) 1460 dfail("failed to snap aggregate"); 1461 1462 g_nrecs = 0; 1463 1464 if (dtrace_aggregate_walk(g_dtp, 1465 count_aggregate, &g_nrecs) != 0) 1466 dfail("failed to walk aggregate"); 1467 } 1468 1469 #ifdef illumos 1470 if ((data_buf = memalign(sizeof (uint64_t), 1471 (g_nrecs + 1) * g_recsize)) == NULL) 1472 #else 1473 if (posix_memalign((void **)&data_buf, sizeof (uint64_t), 1474 (g_nrecs + 1) * g_recsize) ) 1475 #endif 1476 fail(1, "Memory allocation failed"); 1477 1478 /* 1479 * Read out the DTrace data. 1480 */ 1481 g_nrecs_used = process_data(out, data_buf); 1482 1483 if (g_nrecs_used > g_nrecs || g_dropped) 1484 (void) fprintf(stderr, "lockstat: warning: " 1485 "ran out of data records (use -n for more)\n"); 1486 1487 /* LINTED - alignment */ 1488 for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++, 1489 /* LINTED - alignment */ 1490 lsp = (lsrec_t *)((char *)lsp + g_recsize)) { 1491 ev_count[lsp->ls_event] += lsp->ls_count; 1492 ev_time[lsp->ls_event] += lsp->ls_time; 1493 } 1494 1495 /* 1496 * If -g was specified, convert stacks into individual records. 1497 */ 1498 if (g_gflag) { 1499 lsrec_t *newlsp, *oldlsp; 1500 1501 #ifdef illumos 1502 newlsp = memalign(sizeof (uint64_t), 1503 g_nrecs_used * LS_TIME * (g_stkdepth + 1)); 1504 #else 1505 posix_memalign((void **)&newlsp, sizeof (uint64_t), 1506 g_nrecs_used * LS_TIME * (g_stkdepth + 1)); 1507 #endif 1508 if (newlsp == NULL) 1509 fail(1, "Cannot allocate space for -g processing"); 1510 lsp = newlsp; 1511 /* LINTED - alignment */ 1512 for (i = 0, oldlsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++, 1513 /* LINTED - alignment */ 1514 oldlsp = (lsrec_t *)((char *)oldlsp + g_recsize)) { 1515 int fr; 1516 int caller_in_stack = 0; 1517 1518 if (oldlsp->ls_count == 0) 1519 continue; 1520 1521 for (fr = 0; fr < g_stkdepth; fr++) { 1522 if (oldlsp->ls_stack[fr] == 0) 1523 break; 1524 if (oldlsp->ls_stack[fr] == oldlsp->ls_caller) 1525 caller_in_stack = 1; 1526 bcopy(oldlsp, lsp, LS_TIME); 1527 lsp->ls_caller = oldlsp->ls_stack[fr]; 1528 /* LINTED - alignment */ 1529 lsp = (lsrec_t *)((char *)lsp + LS_TIME); 1530 } 1531 if (!caller_in_stack) { 1532 bcopy(oldlsp, lsp, LS_TIME); 1533 /* LINTED - alignment */ 1534 lsp = (lsrec_t *)((char *)lsp + LS_TIME); 1535 } 1536 } 1537 g_nrecs = g_nrecs_used = 1538 ((uintptr_t)lsp - (uintptr_t)newlsp) / LS_TIME; 1539 g_recsize = LS_TIME; 1540 g_stkdepth = 0; 1541 free(data_buf); 1542 data_buf = (char *)newlsp; 1543 } 1544 1545 if ((sort_buf = calloc(2 * (g_nrecs + 1), 1546 sizeof (void *))) == NULL) 1547 fail(1, "Sort buffer allocation failed"); 1548 merge_buf = sort_buf + (g_nrecs + 1); 1549 1550 /* 1551 * Build the sort buffer, discarding zero-count records along the way. 1552 */ 1553 /* LINTED - alignment */ 1554 for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++, 1555 /* LINTED - alignment */ 1556 lsp = (lsrec_t *)((char *)lsp + g_recsize)) { 1557 if (lsp->ls_count == 0) 1558 lsp->ls_event = LS_MAX_EVENTS; 1559 sort_buf[i] = lsp; 1560 } 1561 1562 if (g_nrecs_used == 0) 1563 exit(0); 1564 1565 /* 1566 * Add a sentinel after the last record 1567 */ 1568 sort_buf[i] = lsp; 1569 lsp->ls_event = LS_MAX_EVENTS; 1570 1571 if (g_tracing) { 1572 report_trace(out, sort_buf); 1573 return (0); 1574 } 1575 1576 /* 1577 * Application of -g may have resulted in multiple records 1578 * with the same signature; coalesce them. 1579 */ 1580 if (g_gflag) { 1581 mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used); 1582 coalesce(lockcmp, sort_buf, g_nrecs_used); 1583 } 1584 1585 /* 1586 * Coalesce locks within the same symbol if -c option specified. 1587 * Coalesce PCs within the same function if -k option specified. 1588 */ 1589 if (g_cflag || g_kflag) { 1590 for (i = 0; i < g_nrecs_used; i++) { 1591 int fr; 1592 lsp = sort_buf[i]; 1593 if (g_cflag) 1594 coalesce_symbol(&lsp->ls_lock); 1595 if (g_kflag) { 1596 for (fr = 0; fr < g_stkdepth; fr++) 1597 coalesce_symbol(&lsp->ls_stack[fr]); 1598 coalesce_symbol(&lsp->ls_caller); 1599 } 1600 } 1601 mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used); 1602 coalesce(lockcmp, sort_buf, g_nrecs_used); 1603 } 1604 1605 /* 1606 * Coalesce callers if -w option specified 1607 */ 1608 if (g_wflag) { 1609 mergesort(lock_and_count_cmp_anywhere, 1610 sort_buf, merge_buf, g_nrecs_used); 1611 coalesce(lockcmp_anywhere, sort_buf, g_nrecs_used); 1612 } 1613 1614 /* 1615 * Coalesce locks if -W option specified 1616 */ 1617 if (g_Wflag) { 1618 mergesort(site_and_count_cmp_anylock, 1619 sort_buf, merge_buf, g_nrecs_used); 1620 coalesce(sitecmp_anylock, sort_buf, g_nrecs_used); 1621 } 1622 1623 /* 1624 * Sort data by contention count (ls_count) or total time (ls_time), 1625 * depending on g_Pflag. Override g_Pflag if time wasn't measured. 1626 */ 1627 if (g_recsize < LS_TIME) 1628 g_Pflag = 0; 1629 1630 if (g_Pflag) 1631 mergesort(timecmp, sort_buf, merge_buf, g_nrecs_used); 1632 else 1633 mergesort(countcmp, sort_buf, merge_buf, g_nrecs_used); 1634 1635 /* 1636 * Display data by event type 1637 */ 1638 first = &sort_buf[0]; 1639 while ((event = (*first)->ls_event) < LS_MAX_EVENTS) { 1640 current = first; 1641 while ((lsp = *current)->ls_event == event) 1642 current++; 1643 report_stats(out, first, current - first, ev_count[event], 1644 ev_time[event]); 1645 first = current; 1646 } 1647 1648 return (0); 1649 } 1650 1651 static char * 1652 format_symbol(char *buf, uintptr_t addr, int show_size) 1653 { 1654 uintptr_t symoff; 1655 char *symname; 1656 size_t symsize; 1657 1658 symname = addr_to_sym(addr, &symoff, &symsize); 1659 1660 if (show_size && symoff == 0) 1661 (void) sprintf(buf, "%s[%ld]", symname, (long)symsize); 1662 else if (symoff == 0) 1663 (void) sprintf(buf, "%s", symname); 1664 else if (symoff < 16 && bcmp(symname, "cpu[", 4) == 0) /* CPU+PIL */ 1665 #ifdef illumos 1666 (void) sprintf(buf, "%s+%ld", symname, (long)symoff); 1667 #else 1668 (void) sprintf(buf, "%s+%s", symname, g_pri_class[(int)symoff]); 1669 #endif 1670 else if (symoff <= symsize || (symoff < 256 && addr != symoff)) 1671 (void) sprintf(buf, "%s+0x%llx", symname, 1672 (unsigned long long)symoff); 1673 else 1674 (void) sprintf(buf, "0x%llx", (unsigned long long)addr); 1675 return (buf); 1676 } 1677 1678 static void 1679 report_stats(FILE *out, lsrec_t **sort_buf, size_t nrecs, uint64_t total_count, 1680 uint64_t total_time) 1681 { 1682 uint32_t event = sort_buf[0]->ls_event; 1683 lsrec_t *lsp; 1684 double ptotal = 0.0; 1685 double percent; 1686 int i, j, fr; 1687 int displayed; 1688 int first_bin, last_bin, max_bin_count, total_bin_count; 1689 int rectype; 1690 char buf[256]; 1691 char lhdr[80], chdr[80]; 1692 1693 rectype = g_recsize; 1694 1695 if (g_topn == 0) { 1696 (void) fprintf(out, "%20llu %s\n", 1697 g_rates == 0 ? total_count : 1698 ((unsigned long long)total_count * NANOSEC) / g_elapsed, 1699 g_event_info[event].ev_desc); 1700 return; 1701 } 1702 1703 (void) sprintf(lhdr, "%s%s", 1704 g_Wflag ? "Hottest " : "", g_event_info[event].ev_lhdr); 1705 (void) sprintf(chdr, "%s%s", 1706 g_wflag ? "Hottest " : "", "Caller"); 1707 1708 if (!g_pflag) 1709 (void) fprintf(out, 1710 "\n%s: %.0f events in %.3f seconds (%.0f events/sec)\n\n", 1711 g_event_info[event].ev_desc, (double)total_count, 1712 (double)g_elapsed / NANOSEC, 1713 (double)total_count * NANOSEC / g_elapsed); 1714 1715 if (!g_pflag && rectype < LS_HIST) { 1716 (void) sprintf(buf, "%s", g_event_info[event].ev_units); 1717 (void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n", 1718 g_rates ? "ops/s" : "Count", 1719 g_gflag ? "genr" : "indv", 1720 "cuml", "rcnt", rectype >= LS_TIME ? buf : "", lhdr, chdr); 1721 (void) fprintf(out, "---------------------------------" 1722 "----------------------------------------------\n"); 1723 } 1724 1725 displayed = 0; 1726 for (i = 0; i < nrecs; i++) { 1727 lsp = sort_buf[i]; 1728 1729 if (displayed++ >= g_topn) 1730 break; 1731 1732 if (g_pflag) { 1733 int j; 1734 1735 (void) fprintf(out, "%u %u", 1736 lsp->ls_event, lsp->ls_count); 1737 (void) fprintf(out, " %s", 1738 format_symbol(buf, lsp->ls_lock, g_cflag)); 1739 (void) fprintf(out, " %s", 1740 format_symbol(buf, lsp->ls_caller, 0)); 1741 (void) fprintf(out, " %f", 1742 (double)lsp->ls_refcnt / lsp->ls_count); 1743 if (rectype >= LS_TIME) 1744 (void) fprintf(out, " %llu", 1745 (unsigned long long)lsp->ls_time); 1746 if (rectype >= LS_HIST) { 1747 for (j = 0; j < 64; j++) 1748 (void) fprintf(out, " %u", 1749 lsp->ls_hist[j]); 1750 } 1751 for (j = 0; j < LS_MAX_STACK_DEPTH; j++) { 1752 if (rectype <= LS_STACK(j) || 1753 lsp->ls_stack[j] == 0) 1754 break; 1755 (void) fprintf(out, " %s", 1756 format_symbol(buf, lsp->ls_stack[j], 0)); 1757 } 1758 (void) fprintf(out, "\n"); 1759 continue; 1760 } 1761 1762 if (rectype >= LS_HIST) { 1763 (void) fprintf(out, "---------------------------------" 1764 "----------------------------------------------\n"); 1765 (void) sprintf(buf, "%s", 1766 g_event_info[event].ev_units); 1767 (void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n", 1768 g_rates ? "ops/s" : "Count", 1769 g_gflag ? "genr" : "indv", 1770 "cuml", "rcnt", buf, lhdr, chdr); 1771 } 1772 1773 if (g_Pflag && total_time != 0) 1774 percent = (lsp->ls_time * 100.00) / total_time; 1775 else 1776 percent = (lsp->ls_count * 100.00) / total_count; 1777 1778 ptotal += percent; 1779 1780 if (rectype >= LS_TIME) 1781 (void) sprintf(buf, "%llu", 1782 (unsigned long long)(lsp->ls_time / lsp->ls_count)); 1783 else 1784 buf[0] = '\0'; 1785 1786 (void) fprintf(out, "%5llu ", 1787 g_rates == 0 ? lsp->ls_count : 1788 ((uint64_t)lsp->ls_count * NANOSEC) / g_elapsed); 1789 1790 (void) fprintf(out, "%3.0f%% ", percent); 1791 1792 if (g_gflag) 1793 (void) fprintf(out, "---- "); 1794 else 1795 (void) fprintf(out, "%3.0f%% ", ptotal); 1796 1797 (void) fprintf(out, "%4.2f %8s ", 1798 (double)lsp->ls_refcnt / lsp->ls_count, buf); 1799 1800 (void) fprintf(out, "%-22s ", 1801 format_symbol(buf, lsp->ls_lock, g_cflag)); 1802 1803 (void) fprintf(out, "%-24s\n", 1804 format_symbol(buf, lsp->ls_caller, 0)); 1805 1806 if (rectype < LS_HIST) 1807 continue; 1808 1809 (void) fprintf(out, "\n"); 1810 (void) fprintf(out, "%10s %31s %-9s %-24s\n", 1811 g_event_info[event].ev_units, 1812 "------ Time Distribution ------", 1813 g_rates ? "ops/s" : "count", 1814 rectype > LS_STACK(0) ? "Stack" : ""); 1815 1816 first_bin = 0; 1817 while (lsp->ls_hist[first_bin] == 0) 1818 first_bin++; 1819 1820 last_bin = 63; 1821 while (lsp->ls_hist[last_bin] == 0) 1822 last_bin--; 1823 1824 max_bin_count = 0; 1825 total_bin_count = 0; 1826 for (j = first_bin; j <= last_bin; j++) { 1827 total_bin_count += lsp->ls_hist[j]; 1828 if (lsp->ls_hist[j] > max_bin_count) 1829 max_bin_count = lsp->ls_hist[j]; 1830 } 1831 1832 /* 1833 * If we went a few frames below the caller, ignore them 1834 */ 1835 for (fr = 3; fr > 0; fr--) 1836 if (lsp->ls_stack[fr] == lsp->ls_caller) 1837 break; 1838 1839 for (j = first_bin; j <= last_bin; j++) { 1840 uint_t depth = (lsp->ls_hist[j] * 30) / total_bin_count; 1841 (void) fprintf(out, "%10llu |%s%s %-9u ", 1842 1ULL << j, 1843 "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + 30 - depth, 1844 " " + depth, 1845 g_rates == 0 ? lsp->ls_hist[j] : 1846 (uint_t)(((uint64_t)lsp->ls_hist[j] * NANOSEC) / 1847 g_elapsed)); 1848 if (rectype <= LS_STACK(fr) || lsp->ls_stack[fr] == 0) { 1849 (void) fprintf(out, "\n"); 1850 continue; 1851 } 1852 (void) fprintf(out, "%-24s\n", 1853 format_symbol(buf, lsp->ls_stack[fr], 0)); 1854 fr++; 1855 } 1856 while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) { 1857 (void) fprintf(out, "%15s %-36s %-24s\n", "", "", 1858 format_symbol(buf, lsp->ls_stack[fr], 0)); 1859 fr++; 1860 } 1861 } 1862 1863 if (!g_pflag) 1864 (void) fprintf(out, "---------------------------------" 1865 "----------------------------------------------\n"); 1866 1867 (void) fflush(out); 1868 } 1869 1870 static void 1871 report_trace(FILE *out, lsrec_t **sort_buf) 1872 { 1873 lsrec_t *lsp; 1874 int i, fr; 1875 int rectype; 1876 char buf[256], buf2[256]; 1877 1878 rectype = g_recsize; 1879 1880 if (!g_pflag) { 1881 (void) fprintf(out, "%5s %7s %11s %-24s %-24s\n", 1882 "Event", "Time", "Owner", "Lock", "Caller"); 1883 (void) fprintf(out, "---------------------------------" 1884 "----------------------------------------------\n"); 1885 } 1886 1887 for (i = 0; i < g_nrecs_used; i++) { 1888 1889 lsp = sort_buf[i]; 1890 1891 if (lsp->ls_event >= LS_MAX_EVENTS || lsp->ls_count == 0) 1892 continue; 1893 1894 (void) fprintf(out, "%2d %10llu %11p %-24s %-24s\n", 1895 lsp->ls_event, (unsigned long long)lsp->ls_time, 1896 (void *)lsp->ls_next, 1897 format_symbol(buf, lsp->ls_lock, 0), 1898 format_symbol(buf2, lsp->ls_caller, 0)); 1899 1900 if (rectype <= LS_STACK(0)) 1901 continue; 1902 1903 /* 1904 * If we went a few frames below the caller, ignore them 1905 */ 1906 for (fr = 3; fr > 0; fr--) 1907 if (lsp->ls_stack[fr] == lsp->ls_caller) 1908 break; 1909 1910 while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) { 1911 (void) fprintf(out, "%53s %-24s\n", "", 1912 format_symbol(buf, lsp->ls_stack[fr], 0)); 1913 fr++; 1914 } 1915 (void) fprintf(out, "\n"); 1916 } 1917 1918 (void) fflush(out); 1919 } 1920