1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <stdio.h> 29 #include <stddef.h> 30 #include <stdlib.h> 31 #include <stdarg.h> 32 #include <string.h> 33 #include <strings.h> 34 #include <ctype.h> 35 #include <fcntl.h> 36 #include <unistd.h> 37 #include <errno.h> 38 #include <limits.h> 39 #include <sys/types.h> 40 #include <sys/modctl.h> 41 #include <sys/stat.h> 42 #include <sys/wait.h> 43 #include <dtrace.h> 44 #include <sys/lockstat.h> 45 #include <alloca.h> 46 #include <signal.h> 47 #include <assert.h> 48 49 #ifdef illumos 50 #define GETOPT_EOF EOF 51 #else 52 #include <sys/time.h> 53 #include <sys/resource.h> 54 55 #define mergesort(a, b, c, d) lsmergesort(a, b, c, d) 56 #define GETOPT_EOF (-1) 57 58 typedef uintptr_t pc_t; 59 #endif 60 61 #define LOCKSTAT_OPTSTR "x:bths:n:d:i:l:f:e:ckwWgCHEATID:RpPo:V" 62 63 #define LS_MAX_STACK_DEPTH 50 64 #define LS_MAX_EVENTS 64 65 66 typedef struct lsrec { 67 struct lsrec *ls_next; /* next in hash chain */ 68 #ifdef illumos 69 uintptr_t ls_lock; /* lock address */ 70 #else 71 char *ls_lock; /* lock name */ 72 #endif 73 uintptr_t ls_caller; /* caller address */ 74 uint32_t ls_count; /* cumulative event count */ 75 uint32_t ls_event; /* type of event */ 76 uintptr_t ls_refcnt; /* cumulative reference count */ 77 uint64_t ls_time; /* cumulative event duration */ 78 uint32_t ls_hist[64]; /* log2(duration) histogram */ 79 uintptr_t ls_stack[LS_MAX_STACK_DEPTH]; 80 } lsrec_t; 81 82 typedef struct lsdata { 83 struct lsrec *lsd_next; /* next available */ 84 int lsd_count; /* number of records */ 85 } lsdata_t; 86 87 /* 88 * Definitions for the types of experiments which can be run. They are 89 * listed in increasing order of memory cost and processing time cost. 90 * The numerical value of each type is the number of bytes needed per record. 91 */ 92 #define LS_BASIC offsetof(lsrec_t, ls_time) 93 #define LS_TIME offsetof(lsrec_t, ls_hist[0]) 94 #define LS_HIST offsetof(lsrec_t, ls_stack[0]) 95 #define LS_STACK(depth) offsetof(lsrec_t, ls_stack[depth]) 96 97 static void report_stats(FILE *, lsrec_t **, size_t, uint64_t, uint64_t); 98 static void report_trace(FILE *, lsrec_t **); 99 100 extern int symtab_init(void); 101 extern char *addr_to_sym(uintptr_t, uintptr_t *, size_t *); 102 extern uintptr_t sym_to_addr(char *name); 103 extern size_t sym_size(char *name); 104 extern char *strtok_r(char *, const char *, char **); 105 106 #define DEFAULT_NRECS 10000 107 #define DEFAULT_HZ 97 108 #define MAX_HZ 1000 109 #define MIN_AGGSIZE (16 * 1024) 110 #define MAX_AGGSIZE (32 * 1024 * 1024) 111 112 static int g_stkdepth; 113 static int g_topn = INT_MAX; 114 static hrtime_t g_elapsed; 115 static int g_rates = 0; 116 static int g_pflag = 0; 117 static int g_Pflag = 0; 118 static int g_wflag = 0; 119 static int g_Wflag = 0; 120 static int g_cflag = 0; 121 static int g_kflag = 0; 122 static int g_gflag = 0; 123 static int g_Vflag = 0; 124 static int g_tracing = 0; 125 static size_t g_recsize; 126 static size_t g_nrecs; 127 static int g_nrecs_used; 128 static uchar_t g_enabled[LS_MAX_EVENTS]; 129 static hrtime_t g_min_duration[LS_MAX_EVENTS]; 130 static dtrace_hdl_t *g_dtp; 131 static char *g_predicate; 132 static char *g_ipredicate; 133 static char *g_prog; 134 static int g_proglen; 135 static int g_dropped; 136 137 typedef struct ls_event_info { 138 char ev_type; 139 char ev_lhdr[20]; 140 char ev_desc[80]; 141 char ev_units[10]; 142 char ev_name[DTRACE_NAMELEN]; 143 char *ev_predicate; 144 char *ev_acquire; 145 } ls_event_info_t; 146 147 static ls_event_info_t g_event_info[LS_MAX_EVENTS] = { 148 { 'C', "Lock", "Adaptive mutex spin", "nsec", 149 "lockstat:::adaptive-spin" }, 150 { 'C', "Lock", "Adaptive mutex block", "nsec", 151 "lockstat:::adaptive-block" }, 152 { 'C', "Lock", "Spin lock spin", "nsec", 153 "lockstat:::spin-spin" }, 154 { 'C', "Lock", "Thread lock spin", "nsec", 155 "lockstat:::thread-spin" }, 156 { 'C', "Lock", "R/W writer blocked by writer", "nsec", 157 "lockstat:::rw-block", "arg2 == 0 && arg3 == 1" }, 158 { 'C', "Lock", "R/W writer blocked by readers", "nsec", 159 "lockstat:::rw-block", "arg2 == 0 && arg3 == 0 && arg4" }, 160 { 'C', "Lock", "R/W reader blocked by writer", "nsec", 161 "lockstat:::rw-block", "arg2 == 1 && arg3 == 1" }, 162 { 'C', "Lock", "R/W reader blocked by write wanted", "nsec", 163 "lockstat:::rw-block", "arg2 == 1 && arg3 == 0 && arg4" }, 164 { 'C', "Lock", "R/W writer spin on writer", "nsec", 165 "lockstat:::rw-spin", "arg2 == 0 && arg3 == 1" }, 166 { 'C', "Lock", "R/W writer spin on readers", "nsec", 167 "lockstat:::rw-spin", "arg2 == 0 && arg3 == 0 && arg4" }, 168 { 'C', "Lock", "R/W reader spin on writer", "nsec", 169 "lockstat:::rw-spin", "arg2 == 1 && arg3 == 1" }, 170 { 'C', "Lock", "R/W reader spin on write wanted", "nsec", 171 "lockstat:::rw-spin", "arg2 == 1 && arg3 == 0 && arg4" }, 172 { 'C', "Lock", "SX exclusive block", "nsec", 173 "lockstat:::sx-block", "arg2 == 0" }, 174 { 'C', "Lock", "SX shared block", "nsec", 175 "lockstat:::sx-block", "arg2 == 1" }, 176 { 'C', "Lock", "SX exclusive spin", "nsec", 177 "lockstat:::sx-spin", "arg2 == 0" }, 178 { 'C', "Lock", "SX shared spin", "nsec", 179 "lockstat:::sx-spin", "arg2 == 1" }, 180 { 'C', "Lock", "lockmgr writer blocked by writer", "nsec", 181 "lockstat:::lockmgr-block", "arg2 == 0 && arg3 == 1" }, 182 { 'C', "Lock", "lockmgr writer blocked by readers", "nsec", 183 "lockstat:::lockmgr-block", "arg2 == 0 && arg3 == 0 && arg4" }, 184 { 'C', "Lock", "lockmgr reader blocked by writer", "nsec", 185 "lockstat:::lockmgr-block", "arg2 == 1 && arg3 == 1" }, 186 { 'C', "Lock", "lockmgr reader blocked by write wanted", "nsec", 187 "lockstat:::lockmgr-block", "arg2 == 1 && arg3 == 0 && arg4" }, 188 { 'C', "Lock", "Unknown event (type 20)", "units" }, 189 { 'C', "Lock", "Unknown event (type 21)", "units" }, 190 { 'C', "Lock", "Unknown event (type 22)", "units" }, 191 { 'C', "Lock", "Unknown event (type 23)", "units" }, 192 { 'C', "Lock", "Unknown event (type 24)", "units" }, 193 { 'C', "Lock", "Unknown event (type 25)", "units" }, 194 { 'C', "Lock", "Unknown event (type 26)", "units" }, 195 { 'C', "Lock", "Unknown event (type 27)", "units" }, 196 { 'C', "Lock", "Unknown event (type 28)", "units" }, 197 { 'C', "Lock", "Unknown event (type 29)", "units" }, 198 { 'C', "Lock", "Unknown event (type 30)", "units" }, 199 { 'C', "Lock", "Unknown event (type 31)", "units" }, 200 { 'H', "Lock", "Adaptive mutex hold", "nsec", 201 "lockstat:::adaptive-release", NULL, 202 "lockstat:::adaptive-acquire" }, 203 { 'H', "Lock", "Spin lock hold", "nsec", 204 "lockstat:::spin-release", NULL, 205 "lockstat:::spin-acquire" }, 206 { 'H', "Lock", "R/W writer hold", "nsec", 207 "lockstat:::rw-release", "arg1 == 0", 208 "lockstat:::rw-acquire" }, 209 { 'H', "Lock", "R/W reader hold", "nsec", 210 "lockstat:::rw-release", "arg1 == 1", 211 "lockstat:::rw-acquire" }, 212 { 'H', "Lock", "SX shared hold", "nsec", 213 "lockstat:::sx-release", "arg1 == 1", 214 "lockstat:::sx-acquire" }, 215 { 'H', "Lock", "SX exclusive hold", "nsec", 216 "lockstat:::sx-release", "arg1 == 0", 217 "lockstat:::sx-acquire" }, 218 { 'H', "Lock", "lockmgr shared hold", "nsec", 219 "lockstat:::lockmgr-release", "arg1 == 1", 220 "lockstat:::lockmgr-acquire" }, 221 { 'H', "Lock", "lockmgr exclusive hold", "nsec", 222 "lockstat:::lockmgr-release,lockstat:::lockmgr-disown", "arg1 == 0", 223 "lockstat:::lockmgr-acquire" }, 224 { 'H', "Lock", "Unknown event (type 40)", "units" }, 225 { 'H', "Lock", "Unknown event (type 41)", "units" }, 226 { 'H', "Lock", "Unknown event (type 42)", "units" }, 227 { 'H', "Lock", "Unknown event (type 43)", "units" }, 228 { 'H', "Lock", "Unknown event (type 44)", "units" }, 229 { 'H', "Lock", "Unknown event (type 45)", "units" }, 230 { 'H', "Lock", "Unknown event (type 46)", "units" }, 231 { 'H', "Lock", "Unknown event (type 47)", "units" }, 232 { 'H', "Lock", "Unknown event (type 48)", "units" }, 233 { 'H', "Lock", "Unknown event (type 49)", "units" }, 234 { 'H', "Lock", "Unknown event (type 50)", "units" }, 235 { 'H', "Lock", "Unknown event (type 51)", "units" }, 236 { 'H', "Lock", "Unknown event (type 52)", "units" }, 237 { 'H', "Lock", "Unknown event (type 53)", "units" }, 238 { 'H', "Lock", "Unknown event (type 54)", "units" }, 239 { 'H', "Lock", "Unknown event (type 55)", "units" }, 240 #ifdef illumos 241 { 'I', "CPU+PIL", "Profiling interrupt", "nsec", 242 #else 243 { 'I', "CPU+Pri_Class", "Profiling interrupt", "nsec", 244 #endif 245 "profile:::profile-97", NULL }, 246 { 'I', "Lock", "Unknown event (type 57)", "units" }, 247 { 'I', "Lock", "Unknown event (type 58)", "units" }, 248 { 'I', "Lock", "Unknown event (type 59)", "units" }, 249 { 'E', "Lock", "Recursive lock entry detected", "(N/A)", 250 "lockstat:::rw-release", NULL, "lockstat:::rw-acquire" }, 251 { 'E', "Lock", "Lockstat enter failure", "(N/A)" }, 252 { 'E', "Lock", "Lockstat exit failure", "nsec" }, 253 { 'E', "Lock", "Lockstat record failure", "(N/A)" }, 254 }; 255 256 #ifndef illumos 257 static char *g_pri_class[] = { 258 "", 259 "Intr", 260 "RealT", 261 "TShar", 262 "Idle" 263 }; 264 #endif 265 266 static void 267 fail(int do_perror, const char *message, ...) 268 { 269 va_list args; 270 int save_errno = errno; 271 272 va_start(args, message); 273 (void) fprintf(stderr, "lockstat: "); 274 (void) vfprintf(stderr, message, args); 275 va_end(args); 276 if (do_perror) 277 (void) fprintf(stderr, ": %s", strerror(save_errno)); 278 (void) fprintf(stderr, "\n"); 279 exit(2); 280 } 281 282 static void 283 dfail(const char *message, ...) 284 { 285 va_list args; 286 287 va_start(args, message); 288 (void) fprintf(stderr, "lockstat: "); 289 (void) vfprintf(stderr, message, args); 290 va_end(args); 291 (void) fprintf(stderr, ": %s\n", 292 dtrace_errmsg(g_dtp, dtrace_errno(g_dtp))); 293 294 exit(2); 295 } 296 297 static void 298 show_events(char event_type, char *desc) 299 { 300 int i, first = -1, last; 301 302 for (i = 0; i < LS_MAX_EVENTS; i++) { 303 ls_event_info_t *evp = &g_event_info[i]; 304 if (evp->ev_type != event_type || 305 strncmp(evp->ev_desc, "Unknown event", 13) == 0) 306 continue; 307 if (first == -1) 308 first = i; 309 last = i; 310 } 311 312 (void) fprintf(stderr, 313 "\n%s events (lockstat -%c or lockstat -e %d-%d):\n\n", 314 desc, event_type, first, last); 315 316 for (i = first; i <= last; i++) 317 (void) fprintf(stderr, 318 "%4d = %s\n", i, g_event_info[i].ev_desc); 319 } 320 321 static void 322 usage(void) 323 { 324 (void) fprintf(stderr, 325 "Usage: lockstat [options] command [args]\n" 326 "\nGeneral options:\n\n" 327 " -V print the corresponding D program\n" 328 "\nEvent selection options:\n\n" 329 " -C watch contention events [on by default]\n" 330 " -E watch error events [off by default]\n" 331 " -H watch hold events [off by default]\n" 332 " -I watch interrupt events [off by default]\n" 333 " -A watch all lock events [equivalent to -CH]\n" 334 " -e event_list only watch the specified events (shown below);\n" 335 " <event_list> is a comma-separated list of\n" 336 " events or ranges of events, e.g. 1,4-7,35\n" 337 " -i rate interrupt rate for -I [default: %d Hz]\n" 338 "\nData gathering options:\n\n" 339 " -b basic statistics (lock, caller, event count)\n" 340 " -t timing for all events [default]\n" 341 " -h histograms for event times\n" 342 " -s depth stack traces <depth> deep\n" 343 " -x opt[=val] enable or modify DTrace options\n" 344 "\nData filtering options:\n\n" 345 " -n nrecords maximum number of data records [default: %d]\n" 346 " -l lock[,size] only watch <lock>, which can be specified as a\n" 347 " symbolic name or hex address; <size> defaults\n" 348 " to the ELF symbol size if available, 1 if not\n" 349 " -f func[,size] only watch events generated by <func>\n" 350 " -d duration only watch events longer than <duration>\n" 351 " -T trace (rather than sample) events\n" 352 "\nData reporting options:\n\n" 353 #ifdef illumos 354 " -c coalesce lock data for arrays like pse_mutex[]\n" 355 #endif 356 " -k coalesce PCs within functions\n" 357 " -g show total events generated by function\n" 358 " -w wherever: don't distinguish events by caller\n" 359 " -W whichever: don't distinguish events by lock\n" 360 " -R display rates rather than counts\n" 361 " -p parsable output format (awk(1)-friendly)\n" 362 " -P sort lock data by (count * avg_time) product\n" 363 " -D n only display top <n> events of each type\n" 364 " -o filename send output to <filename>\n", 365 DEFAULT_HZ, DEFAULT_NRECS); 366 367 show_events('C', "Contention"); 368 show_events('H', "Hold-time"); 369 show_events('I', "Interrupt"); 370 show_events('E', "Error"); 371 (void) fprintf(stderr, "\n"); 372 373 exit(1); 374 } 375 376 static int 377 lockcmp(lsrec_t *a, lsrec_t *b) 378 { 379 int i; 380 381 if (a->ls_event < b->ls_event) 382 return (-1); 383 if (a->ls_event > b->ls_event) 384 return (1); 385 386 for (i = g_stkdepth - 1; i >= 0; i--) { 387 if (a->ls_stack[i] < b->ls_stack[i]) 388 return (-1); 389 if (a->ls_stack[i] > b->ls_stack[i]) 390 return (1); 391 } 392 393 if (a->ls_caller < b->ls_caller) 394 return (-1); 395 if (a->ls_caller > b->ls_caller) 396 return (1); 397 398 #ifdef illumos 399 if (a->ls_lock < b->ls_lock) 400 return (-1); 401 if (a->ls_lock > b->ls_lock) 402 return (1); 403 404 return (0); 405 #else 406 return (strcmp(a->ls_lock, b->ls_lock)); 407 #endif 408 } 409 410 static int 411 countcmp(lsrec_t *a, lsrec_t *b) 412 { 413 if (a->ls_event < b->ls_event) 414 return (-1); 415 if (a->ls_event > b->ls_event) 416 return (1); 417 418 return (b->ls_count - a->ls_count); 419 } 420 421 static int 422 timecmp(lsrec_t *a, lsrec_t *b) 423 { 424 if (a->ls_event < b->ls_event) 425 return (-1); 426 if (a->ls_event > b->ls_event) 427 return (1); 428 429 if (a->ls_time < b->ls_time) 430 return (1); 431 if (a->ls_time > b->ls_time) 432 return (-1); 433 434 return (0); 435 } 436 437 static int 438 lockcmp_anywhere(lsrec_t *a, lsrec_t *b) 439 { 440 if (a->ls_event < b->ls_event) 441 return (-1); 442 if (a->ls_event > b->ls_event) 443 return (1); 444 445 #ifdef illumos 446 if (a->ls_lock < b->ls_lock) 447 return (-1); 448 if (a->ls_lock > b->ls_lock) 449 return (1); 450 451 return (0); 452 #else 453 return (strcmp(a->ls_lock, b->ls_lock)); 454 #endif 455 } 456 457 static int 458 lock_and_count_cmp_anywhere(lsrec_t *a, lsrec_t *b) 459 { 460 #ifndef illumos 461 int cmp; 462 #endif 463 464 if (a->ls_event < b->ls_event) 465 return (-1); 466 if (a->ls_event > b->ls_event) 467 return (1); 468 469 #ifdef illumos 470 if (a->ls_lock < b->ls_lock) 471 return (-1); 472 if (a->ls_lock > b->ls_lock) 473 return (1); 474 #else 475 cmp = strcmp(a->ls_lock, b->ls_lock); 476 if (cmp != 0) 477 return (cmp); 478 #endif 479 480 return (b->ls_count - a->ls_count); 481 } 482 483 static int 484 sitecmp_anylock(lsrec_t *a, lsrec_t *b) 485 { 486 int i; 487 488 if (a->ls_event < b->ls_event) 489 return (-1); 490 if (a->ls_event > b->ls_event) 491 return (1); 492 493 for (i = g_stkdepth - 1; i >= 0; i--) { 494 if (a->ls_stack[i] < b->ls_stack[i]) 495 return (-1); 496 if (a->ls_stack[i] > b->ls_stack[i]) 497 return (1); 498 } 499 500 if (a->ls_caller < b->ls_caller) 501 return (-1); 502 if (a->ls_caller > b->ls_caller) 503 return (1); 504 505 return (0); 506 } 507 508 static int 509 site_and_count_cmp_anylock(lsrec_t *a, lsrec_t *b) 510 { 511 int i; 512 513 if (a->ls_event < b->ls_event) 514 return (-1); 515 if (a->ls_event > b->ls_event) 516 return (1); 517 518 for (i = g_stkdepth - 1; i >= 0; i--) { 519 if (a->ls_stack[i] < b->ls_stack[i]) 520 return (-1); 521 if (a->ls_stack[i] > b->ls_stack[i]) 522 return (1); 523 } 524 525 if (a->ls_caller < b->ls_caller) 526 return (-1); 527 if (a->ls_caller > b->ls_caller) 528 return (1); 529 530 return (b->ls_count - a->ls_count); 531 } 532 533 static void 534 lsmergesort(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **a, lsrec_t **b, int n) 535 { 536 int m = n / 2; 537 int i, j; 538 539 if (m > 1) 540 lsmergesort(cmp, a, b, m); 541 if (n - m > 1) 542 lsmergesort(cmp, a + m, b + m, n - m); 543 for (i = m; i > 0; i--) 544 b[i - 1] = a[i - 1]; 545 for (j = m - 1; j < n - 1; j++) 546 b[n + m - j - 2] = a[j + 1]; 547 while (i < j) 548 *a++ = cmp(b[i], b[j]) < 0 ? b[i++] : b[j--]; 549 *a = b[i]; 550 } 551 552 static void 553 coalesce(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **lock, int n) 554 { 555 int i, j; 556 lsrec_t *target, *current; 557 558 target = lock[0]; 559 560 for (i = 1; i < n; i++) { 561 current = lock[i]; 562 if (cmp(current, target) != 0) { 563 target = current; 564 continue; 565 } 566 current->ls_event = LS_MAX_EVENTS; 567 target->ls_count += current->ls_count; 568 target->ls_refcnt += current->ls_refcnt; 569 if (g_recsize < LS_TIME) 570 continue; 571 target->ls_time += current->ls_time; 572 if (g_recsize < LS_HIST) 573 continue; 574 for (j = 0; j < 64; j++) 575 target->ls_hist[j] += current->ls_hist[j]; 576 } 577 } 578 579 static void 580 coalesce_symbol(uintptr_t *addrp) 581 { 582 uintptr_t symoff; 583 size_t symsize; 584 585 if (addr_to_sym(*addrp, &symoff, &symsize) != NULL && symoff < symsize) 586 *addrp -= symoff; 587 } 588 589 static void 590 predicate_add(char **pred, char *what, char *cmp, uintptr_t value) 591 { 592 char *new; 593 int len, newlen; 594 595 if (what == NULL) 596 return; 597 598 if (*pred == NULL) { 599 *pred = malloc(1); 600 *pred[0] = '\0'; 601 } 602 603 len = strlen(*pred); 604 newlen = len + strlen(what) + 32 + strlen("( && )"); 605 new = malloc(newlen); 606 607 if (*pred[0] != '\0') { 608 if (cmp != NULL) { 609 (void) sprintf(new, "(%s) && (%s %s %p)", 610 *pred, what, cmp, (void *)value); 611 } else { 612 (void) sprintf(new, "(%s) && (%s)", *pred, what); 613 } 614 } else { 615 if (cmp != NULL) { 616 (void) sprintf(new, "%s %s %p", 617 what, cmp, (void *)value); 618 } else { 619 (void) sprintf(new, "%s", what); 620 } 621 } 622 623 free(*pred); 624 *pred = new; 625 } 626 627 static void 628 predicate_destroy(char **pred) 629 { 630 free(*pred); 631 *pred = NULL; 632 } 633 634 static void 635 filter_add(char **filt, char *what, uintptr_t base, size_t size) 636 { 637 char buf[256], *c = buf, *new; 638 int len, newlen; 639 640 if (*filt == NULL) { 641 *filt = malloc(1); 642 *filt[0] = '\0'; 643 } 644 645 #ifdef illumos 646 (void) sprintf(c, "%s(%s >= 0x%p && %s < 0x%p)", *filt[0] != '\0' ? 647 " || " : "", what, (void *)base, what, (void *)(base + size)); 648 #else 649 (void) sprintf(c, "%s(%s >= %p && %s < %p)", *filt[0] != '\0' ? 650 " || " : "", what, (void *)base, what, (void *)(base + size)); 651 #endif 652 653 newlen = (len = strlen(*filt) + 1) + strlen(c); 654 new = malloc(newlen); 655 bcopy(*filt, new, len); 656 (void) strcat(new, c); 657 free(*filt); 658 *filt = new; 659 } 660 661 static void 662 filter_destroy(char **filt) 663 { 664 free(*filt); 665 *filt = NULL; 666 } 667 668 static void 669 dprog_add(const char *fmt, ...) 670 { 671 va_list args; 672 int size, offs; 673 char c; 674 675 va_start(args, fmt); 676 size = vsnprintf(&c, 1, fmt, args) + 1; 677 va_end(args); 678 679 if (g_proglen == 0) { 680 offs = 0; 681 } else { 682 offs = g_proglen - 1; 683 } 684 685 g_proglen = offs + size; 686 687 if ((g_prog = realloc(g_prog, g_proglen)) == NULL) 688 fail(1, "failed to reallocate program text"); 689 690 va_start(args, fmt); 691 (void) vsnprintf(&g_prog[offs], size, fmt, args); 692 va_end(args); 693 } 694 695 /* 696 * This function may read like an open sewer, but keep in mind that programs 697 * that generate other programs are rarely pretty. If one has the unenviable 698 * task of maintaining or -- worse -- extending this code, use the -V option 699 * to examine the D program as generated by this function. 700 */ 701 static void 702 dprog_addevent(int event) 703 { 704 ls_event_info_t *info = &g_event_info[event]; 705 char *pred = NULL; 706 char stack[20]; 707 const char *arg0, *caller; 708 char *arg1 = "arg1"; 709 char buf[80]; 710 hrtime_t dur; 711 int depth; 712 713 if (info->ev_name[0] == '\0') 714 return; 715 716 if (info->ev_type == 'I') { 717 /* 718 * For interrupt events, arg0 (normally the lock pointer) is 719 * the CPU address plus the current pil, and arg1 (normally 720 * the number of nanoseconds) is the number of nanoseconds 721 * late -- and it's stored in arg2. 722 */ 723 #ifdef illumos 724 arg0 = "(uintptr_t)curthread->t_cpu + \n" 725 "\t curthread->t_cpu->cpu_profile_pil"; 726 #else 727 arg0 = "(uintptr_t)(curthread->td_oncpu << 16) + \n" 728 "\t 0x01000000 + curthread->td_pri_class"; 729 #endif 730 caller = "(uintptr_t)arg0"; 731 arg1 = "arg2"; 732 } else { 733 #ifdef illumos 734 arg0 = "(uintptr_t)arg0"; 735 #else 736 arg0 = "stringof(args[0]->lock_object.lo_name)"; 737 #endif 738 caller = "caller"; 739 } 740 741 if (g_recsize > LS_HIST) { 742 for (depth = 0; g_recsize > LS_STACK(depth); depth++) 743 continue; 744 745 if (g_tracing) { 746 (void) sprintf(stack, "\tstack(%d);\n", depth); 747 } else { 748 (void) sprintf(stack, ", stack(%d)", depth); 749 } 750 } else { 751 (void) sprintf(stack, ""); 752 } 753 754 if (info->ev_acquire != NULL) { 755 /* 756 * If this is a hold event, we need to generate an additional 757 * clause for the acquire; the clause for the release will be 758 * generated with the aggregating statement, below. 759 */ 760 dprog_add("%s\n", info->ev_acquire); 761 predicate_add(&pred, info->ev_predicate, NULL, 0); 762 predicate_add(&pred, g_predicate, NULL, 0); 763 if (pred != NULL) 764 dprog_add("/%s/\n", pred); 765 766 dprog_add("{\n"); 767 (void) sprintf(buf, "self->ev%d[(uintptr_t)arg0]", event); 768 769 if (info->ev_type == 'H') { 770 dprog_add("\t%s = timestamp;\n", buf); 771 } else { 772 /* 773 * If this isn't a hold event, it's the recursive 774 * error event. For this, we simply bump the 775 * thread-local, per-lock count. 776 */ 777 dprog_add("\t%s++;\n", buf); 778 } 779 780 dprog_add("}\n\n"); 781 predicate_destroy(&pred); 782 pred = NULL; 783 784 if (info->ev_type == 'E') { 785 /* 786 * If this is the recursive lock error event, we need 787 * to generate an additional clause to decrement the 788 * thread-local, per-lock count. This assures that we 789 * only execute the aggregating clause if we have 790 * recursive entry. 791 */ 792 dprog_add("%s\n", info->ev_name); 793 dprog_add("/%s/\n{\n\t%s--;\n}\n\n", buf, buf); 794 } 795 796 predicate_add(&pred, buf, NULL, 0); 797 798 if (info->ev_type == 'H') { 799 (void) sprintf(buf, "timestamp -\n\t " 800 "self->ev%d[(uintptr_t)arg0]", event); 801 } 802 803 arg1 = buf; 804 } else { 805 predicate_add(&pred, info->ev_predicate, NULL, 0); 806 if (info->ev_type != 'I') 807 predicate_add(&pred, g_predicate, NULL, 0); 808 else 809 predicate_add(&pred, g_ipredicate, NULL, 0); 810 } 811 812 if ((dur = g_min_duration[event]) != 0) 813 predicate_add(&pred, arg1, ">=", dur); 814 815 dprog_add("%s\n", info->ev_name); 816 817 if (pred != NULL) 818 dprog_add("/%s/\n", pred); 819 predicate_destroy(&pred); 820 821 dprog_add("{\n"); 822 823 if (g_tracing) { 824 dprog_add("\ttrace(%dULL);\n", event); 825 dprog_add("\ttrace(%s);\n", arg0); 826 dprog_add("\ttrace(%s);\n", caller); 827 dprog_add(stack); 828 } else { 829 /* 830 * The ordering here is important: when we process the 831 * aggregate, we count on the fact that @avg appears before 832 * @hist in program order to assure that @avg is assigned the 833 * first aggregation variable ID and @hist assigned the 834 * second; see the comment in process_aggregate() for details. 835 */ 836 dprog_add("\t@avg[%dULL, %s, %s%s] = avg(%s);\n", 837 event, arg0, caller, stack, arg1); 838 839 if (g_recsize >= LS_HIST) { 840 dprog_add("\t@hist[%dULL, %s, %s%s] = quantize" 841 "(%s);\n", event, arg0, caller, stack, arg1); 842 } 843 } 844 845 if (info->ev_acquire != NULL) 846 dprog_add("\tself->ev%d[arg0] = 0;\n", event); 847 848 dprog_add("}\n\n"); 849 } 850 851 static void 852 dprog_compile() 853 { 854 dtrace_prog_t *prog; 855 dtrace_proginfo_t info; 856 857 if (g_Vflag) { 858 (void) fprintf(stderr, "lockstat: vvvv D program vvvv\n"); 859 (void) fputs(g_prog, stderr); 860 (void) fprintf(stderr, "lockstat: ^^^^ D program ^^^^\n"); 861 } 862 863 if ((prog = dtrace_program_strcompile(g_dtp, g_prog, 864 DTRACE_PROBESPEC_NAME, 0, 0, NULL)) == NULL) 865 dfail("failed to compile program"); 866 867 if (dtrace_program_exec(g_dtp, prog, &info) == -1) 868 dfail("failed to enable probes"); 869 870 if (dtrace_go(g_dtp) != 0) 871 dfail("couldn't start tracing"); 872 } 873 874 static void 875 #ifdef illumos 876 status_fire(void) 877 #else 878 status_fire(int i) 879 #endif 880 {} 881 882 static void 883 status_init(void) 884 { 885 dtrace_optval_t val, status, agg; 886 struct sigaction act; 887 struct itimerspec ts; 888 struct sigevent ev; 889 timer_t tid; 890 891 if (dtrace_getopt(g_dtp, "statusrate", &status) == -1) 892 dfail("failed to get 'statusrate'"); 893 894 if (dtrace_getopt(g_dtp, "aggrate", &agg) == -1) 895 dfail("failed to get 'statusrate'"); 896 897 /* 898 * We would want to awaken at a rate that is the GCD of the statusrate 899 * and the aggrate -- but that seems a bit absurd. Instead, we'll 900 * simply awaken at a rate that is the more frequent of the two, which 901 * assures that we're never later than the interval implied by the 902 * more frequent rate. 903 */ 904 val = status < agg ? status : agg; 905 906 (void) sigemptyset(&act.sa_mask); 907 act.sa_flags = 0; 908 act.sa_handler = status_fire; 909 (void) sigaction(SIGUSR1, &act, NULL); 910 911 ev.sigev_notify = SIGEV_SIGNAL; 912 ev.sigev_signo = SIGUSR1; 913 914 if (timer_create(CLOCK_REALTIME, &ev, &tid) == -1) 915 dfail("cannot create CLOCK_REALTIME timer"); 916 917 ts.it_value.tv_sec = val / NANOSEC; 918 ts.it_value.tv_nsec = val % NANOSEC; 919 ts.it_interval = ts.it_value; 920 921 if (timer_settime(tid, TIMER_RELTIME, &ts, NULL) == -1) 922 dfail("cannot set time on CLOCK_REALTIME timer"); 923 } 924 925 static void 926 status_check(void) 927 { 928 if (!g_tracing && dtrace_aggregate_snap(g_dtp) != 0) 929 dfail("failed to snap aggregate"); 930 931 if (dtrace_status(g_dtp) == -1) 932 dfail("dtrace_status()"); 933 } 934 935 static void 936 lsrec_fill(lsrec_t *lsrec, const dtrace_recdesc_t *rec, int nrecs, caddr_t data) 937 { 938 bzero(lsrec, g_recsize); 939 lsrec->ls_count = 1; 940 941 if ((g_recsize > LS_HIST && nrecs < 4) || (nrecs < 3)) 942 fail(0, "truncated DTrace record"); 943 944 if (rec->dtrd_size != sizeof (uint64_t)) 945 fail(0, "bad event size in first record"); 946 947 /* LINTED - alignment */ 948 lsrec->ls_event = (uint32_t)*((uint64_t *)(data + rec->dtrd_offset)); 949 rec++; 950 951 #ifdef illumos 952 if (rec->dtrd_size != sizeof (uintptr_t)) 953 fail(0, "bad lock address size in second record"); 954 955 /* LINTED - alignment */ 956 lsrec->ls_lock = *((uintptr_t *)(data + rec->dtrd_offset)); 957 rec++; 958 #else 959 lsrec->ls_lock = strdup((const char *)(data + rec->dtrd_offset)); 960 rec++; 961 #endif 962 963 if (rec->dtrd_size != sizeof (uintptr_t)) 964 fail(0, "bad caller size in third record"); 965 966 /* LINTED - alignment */ 967 lsrec->ls_caller = *((uintptr_t *)(data + rec->dtrd_offset)); 968 rec++; 969 970 if (g_recsize > LS_HIST) { 971 int frames, i; 972 pc_t *stack; 973 974 frames = rec->dtrd_size / sizeof (pc_t); 975 /* LINTED - alignment */ 976 stack = (pc_t *)(data + rec->dtrd_offset); 977 978 for (i = 1; i < frames; i++) 979 lsrec->ls_stack[i - 1] = stack[i]; 980 } 981 } 982 983 /*ARGSUSED*/ 984 static int 985 count_aggregate(const dtrace_aggdata_t *agg, void *arg) 986 { 987 *((size_t *)arg) += 1; 988 989 return (DTRACE_AGGWALK_NEXT); 990 } 991 992 static int 993 process_aggregate(const dtrace_aggdata_t *agg, void *arg) 994 { 995 const dtrace_aggdesc_t *aggdesc = agg->dtada_desc; 996 caddr_t data = agg->dtada_data; 997 lsdata_t *lsdata = arg; 998 lsrec_t *lsrec = lsdata->lsd_next; 999 const dtrace_recdesc_t *rec; 1000 uint64_t *avg, *quantized; 1001 int i, j; 1002 1003 assert(lsdata->lsd_count < g_nrecs); 1004 1005 /* 1006 * Aggregation variable IDs are guaranteed to be generated in program 1007 * order, and they are guaranteed to start from DTRACE_AGGVARIDNONE 1008 * plus one. As "avg" appears before "hist" in program order, we know 1009 * that "avg" will be allocated the first aggregation variable ID, and 1010 * "hist" will be allocated the second aggregation variable ID -- and 1011 * we therefore use the aggregation variable ID to differentiate the 1012 * cases. 1013 */ 1014 if (aggdesc->dtagd_varid > DTRACE_AGGVARIDNONE + 1) { 1015 /* 1016 * If this is the histogram entry. We'll copy the quantized 1017 * data into lc_hist, and jump over the rest. 1018 */ 1019 rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1]; 1020 1021 if (aggdesc->dtagd_varid != DTRACE_AGGVARIDNONE + 2) 1022 fail(0, "bad variable ID in aggregation record"); 1023 1024 if (rec->dtrd_size != 1025 DTRACE_QUANTIZE_NBUCKETS * sizeof (uint64_t)) 1026 fail(0, "bad quantize size in aggregation record"); 1027 1028 /* LINTED - alignment */ 1029 quantized = (uint64_t *)(data + rec->dtrd_offset); 1030 1031 for (i = DTRACE_QUANTIZE_ZEROBUCKET, j = 0; 1032 i < DTRACE_QUANTIZE_NBUCKETS; i++, j++) 1033 lsrec->ls_hist[j] = quantized[i]; 1034 1035 goto out; 1036 } 1037 1038 lsrec_fill(lsrec, &aggdesc->dtagd_rec[1], 1039 aggdesc->dtagd_nrecs - 1, data); 1040 1041 rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1]; 1042 1043 if (rec->dtrd_size != 2 * sizeof (uint64_t)) 1044 fail(0, "bad avg size in aggregation record"); 1045 1046 /* LINTED - alignment */ 1047 avg = (uint64_t *)(data + rec->dtrd_offset); 1048 lsrec->ls_count = (uint32_t)avg[0]; 1049 lsrec->ls_time = (uintptr_t)avg[1]; 1050 1051 if (g_recsize >= LS_HIST) 1052 return (DTRACE_AGGWALK_NEXT); 1053 1054 out: 1055 lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize); 1056 lsdata->lsd_count++; 1057 1058 return (DTRACE_AGGWALK_NEXT); 1059 } 1060 1061 static int 1062 process_trace(const dtrace_probedata_t *pdata, void *arg) 1063 { 1064 lsdata_t *lsdata = arg; 1065 lsrec_t *lsrec = lsdata->lsd_next; 1066 dtrace_eprobedesc_t *edesc = pdata->dtpda_edesc; 1067 caddr_t data = pdata->dtpda_data; 1068 1069 if (lsdata->lsd_count >= g_nrecs) 1070 return (DTRACE_CONSUME_NEXT); 1071 1072 lsrec_fill(lsrec, edesc->dtepd_rec, edesc->dtepd_nrecs, data); 1073 1074 lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize); 1075 lsdata->lsd_count++; 1076 1077 return (DTRACE_CONSUME_NEXT); 1078 } 1079 1080 static int 1081 process_data(FILE *out, char *data) 1082 { 1083 lsdata_t lsdata; 1084 1085 /* LINTED - alignment */ 1086 lsdata.lsd_next = (lsrec_t *)data; 1087 lsdata.lsd_count = 0; 1088 1089 if (g_tracing) { 1090 if (dtrace_consume(g_dtp, out, 1091 process_trace, NULL, &lsdata) != 0) 1092 dfail("failed to consume buffer"); 1093 1094 return (lsdata.lsd_count); 1095 } 1096 1097 if (dtrace_aggregate_walk_keyvarsorted(g_dtp, 1098 process_aggregate, &lsdata) != 0) 1099 dfail("failed to walk aggregate"); 1100 1101 return (lsdata.lsd_count); 1102 } 1103 1104 /*ARGSUSED*/ 1105 static int 1106 drophandler(const dtrace_dropdata_t *data, void *arg) 1107 { 1108 g_dropped++; 1109 (void) fprintf(stderr, "lockstat: warning: %s", data->dtdda_msg); 1110 return (DTRACE_HANDLE_OK); 1111 } 1112 1113 int 1114 main(int argc, char **argv) 1115 { 1116 char *data_buf; 1117 lsrec_t *lsp, **current, **first, **sort_buf, **merge_buf; 1118 FILE *out = stdout; 1119 int c; 1120 pid_t child; 1121 int status; 1122 int i, j; 1123 hrtime_t duration; 1124 char *addrp, *offp, *sizep, *evp, *lastp, *p; 1125 uintptr_t addr; 1126 size_t size, off; 1127 int events_specified = 0; 1128 int exec_errno = 0; 1129 uint32_t event; 1130 char *filt = NULL, *ifilt = NULL; 1131 static uint64_t ev_count[LS_MAX_EVENTS + 1]; 1132 static uint64_t ev_time[LS_MAX_EVENTS + 1]; 1133 dtrace_optval_t aggsize; 1134 char aggstr[10]; 1135 long ncpus; 1136 int dynvar = 0; 1137 int err; 1138 1139 if ((g_dtp = dtrace_open(DTRACE_VERSION, 0, &err)) == NULL) { 1140 fail(0, "cannot open dtrace library: %s", 1141 dtrace_errmsg(NULL, err)); 1142 } 1143 1144 if (dtrace_handle_drop(g_dtp, &drophandler, NULL) == -1) 1145 dfail("couldn't establish drop handler"); 1146 1147 if (symtab_init() == -1) 1148 fail(1, "can't load kernel symbols"); 1149 1150 g_nrecs = DEFAULT_NRECS; 1151 1152 while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != GETOPT_EOF) { 1153 switch (c) { 1154 case 'b': 1155 g_recsize = LS_BASIC; 1156 break; 1157 1158 case 't': 1159 g_recsize = LS_TIME; 1160 break; 1161 1162 case 'h': 1163 g_recsize = LS_HIST; 1164 break; 1165 1166 case 's': 1167 if (!isdigit(optarg[0])) 1168 usage(); 1169 g_stkdepth = atoi(optarg); 1170 if (g_stkdepth > LS_MAX_STACK_DEPTH) 1171 fail(0, "max stack depth is %d", 1172 LS_MAX_STACK_DEPTH); 1173 g_recsize = LS_STACK(g_stkdepth); 1174 break; 1175 1176 case 'n': 1177 if (!isdigit(optarg[0])) 1178 usage(); 1179 g_nrecs = atoi(optarg); 1180 break; 1181 1182 case 'd': 1183 if (!isdigit(optarg[0])) 1184 usage(); 1185 duration = atoll(optarg); 1186 1187 /* 1188 * XXX -- durations really should be per event 1189 * since the units are different, but it's hard 1190 * to express this nicely in the interface. 1191 * Not clear yet what the cleanest solution is. 1192 */ 1193 for (i = 0; i < LS_MAX_EVENTS; i++) 1194 if (g_event_info[i].ev_type != 'E') 1195 g_min_duration[i] = duration; 1196 1197 break; 1198 1199 case 'i': 1200 if (!isdigit(optarg[0])) 1201 usage(); 1202 i = atoi(optarg); 1203 if (i <= 0) 1204 usage(); 1205 if (i > MAX_HZ) 1206 fail(0, "max interrupt rate is %d Hz", MAX_HZ); 1207 1208 for (j = 0; j < LS_MAX_EVENTS; j++) 1209 if (strcmp(g_event_info[j].ev_desc, 1210 "Profiling interrupt") == 0) 1211 break; 1212 1213 (void) sprintf(g_event_info[j].ev_name, 1214 "profile:::profile-%d", i); 1215 break; 1216 1217 case 'l': 1218 case 'f': 1219 addrp = strtok(optarg, ","); 1220 sizep = strtok(NULL, ","); 1221 addrp = strtok(optarg, ",+"); 1222 offp = strtok(NULL, ","); 1223 1224 size = sizep ? strtoul(sizep, NULL, 0) : 1; 1225 off = offp ? strtoul(offp, NULL, 0) : 0; 1226 1227 if (addrp[0] == '0') { 1228 addr = strtoul(addrp, NULL, 16) + off; 1229 } else { 1230 addr = sym_to_addr(addrp) + off; 1231 if (sizep == NULL) 1232 size = sym_size(addrp) - off; 1233 if (addr - off == 0) 1234 fail(0, "symbol '%s' not found", addrp); 1235 if (size == 0) 1236 size = 1; 1237 } 1238 1239 1240 if (c == 'l') { 1241 filter_add(&filt, "arg0", addr, size); 1242 } else { 1243 filter_add(&filt, "caller", addr, size); 1244 filter_add(&ifilt, "arg0", addr, size); 1245 } 1246 break; 1247 1248 case 'e': 1249 evp = strtok_r(optarg, ",", &lastp); 1250 while (evp) { 1251 int ev1, ev2; 1252 char *evp2; 1253 1254 (void) strtok(evp, "-"); 1255 evp2 = strtok(NULL, "-"); 1256 ev1 = atoi(evp); 1257 ev2 = evp2 ? atoi(evp2) : ev1; 1258 if ((uint_t)ev1 >= LS_MAX_EVENTS || 1259 (uint_t)ev2 >= LS_MAX_EVENTS || ev1 > ev2) 1260 fail(0, "-e events out of range"); 1261 for (i = ev1; i <= ev2; i++) 1262 g_enabled[i] = 1; 1263 evp = strtok_r(NULL, ",", &lastp); 1264 } 1265 events_specified = 1; 1266 break; 1267 1268 #ifdef illumos 1269 case 'c': 1270 g_cflag = 1; 1271 break; 1272 #endif 1273 1274 case 'k': 1275 g_kflag = 1; 1276 break; 1277 1278 case 'w': 1279 g_wflag = 1; 1280 break; 1281 1282 case 'W': 1283 g_Wflag = 1; 1284 break; 1285 1286 case 'g': 1287 g_gflag = 1; 1288 break; 1289 1290 case 'C': 1291 case 'E': 1292 case 'H': 1293 case 'I': 1294 for (i = 0; i < LS_MAX_EVENTS; i++) 1295 if (g_event_info[i].ev_type == c) 1296 g_enabled[i] = 1; 1297 events_specified = 1; 1298 break; 1299 1300 case 'A': 1301 for (i = 0; i < LS_MAX_EVENTS; i++) 1302 if (strchr("CH", g_event_info[i].ev_type)) 1303 g_enabled[i] = 1; 1304 events_specified = 1; 1305 break; 1306 1307 case 'T': 1308 g_tracing = 1; 1309 break; 1310 1311 case 'D': 1312 if (!isdigit(optarg[0])) 1313 usage(); 1314 g_topn = atoi(optarg); 1315 break; 1316 1317 case 'R': 1318 g_rates = 1; 1319 break; 1320 1321 case 'p': 1322 g_pflag = 1; 1323 break; 1324 1325 case 'P': 1326 g_Pflag = 1; 1327 break; 1328 1329 case 'o': 1330 if ((out = fopen(optarg, "w")) == NULL) 1331 fail(1, "error opening file"); 1332 break; 1333 1334 case 'V': 1335 g_Vflag = 1; 1336 break; 1337 1338 default: 1339 if (strchr(LOCKSTAT_OPTSTR, c) == NULL) 1340 usage(); 1341 } 1342 } 1343 1344 if (filt != NULL) { 1345 predicate_add(&g_predicate, filt, NULL, 0); 1346 filter_destroy(&filt); 1347 } 1348 1349 if (ifilt != NULL) { 1350 predicate_add(&g_ipredicate, ifilt, NULL, 0); 1351 filter_destroy(&ifilt); 1352 } 1353 1354 if (g_recsize == 0) { 1355 if (g_gflag) { 1356 g_stkdepth = LS_MAX_STACK_DEPTH; 1357 g_recsize = LS_STACK(g_stkdepth); 1358 } else { 1359 g_recsize = LS_TIME; 1360 } 1361 } 1362 1363 if (g_gflag && g_recsize <= LS_STACK(0)) 1364 fail(0, "'-g' requires at least '-s 1' data gathering"); 1365 1366 /* 1367 * Make sure the alignment is reasonable 1368 */ 1369 g_recsize = -(-g_recsize & -sizeof (uint64_t)); 1370 1371 for (i = 0; i < LS_MAX_EVENTS; i++) { 1372 /* 1373 * If no events were specified, enable -C. 1374 */ 1375 if (!events_specified && g_event_info[i].ev_type == 'C') 1376 g_enabled[i] = 1; 1377 } 1378 1379 for (i = 0; i < LS_MAX_EVENTS; i++) { 1380 if (!g_enabled[i]) 1381 continue; 1382 1383 if (g_event_info[i].ev_acquire != NULL) { 1384 /* 1385 * If we've enabled a hold event, we must explicitly 1386 * allocate dynamic variable space. 1387 */ 1388 dynvar = 1; 1389 } 1390 1391 dprog_addevent(i); 1392 } 1393 1394 /* 1395 * Make sure there are remaining arguments to specify a child command 1396 * to execute. 1397 */ 1398 if (argc <= optind) 1399 usage(); 1400 1401 if ((ncpus = sysconf(_SC_NPROCESSORS_ONLN)) == -1) 1402 dfail("couldn't determine number of online CPUs"); 1403 1404 /* 1405 * By default, we set our data buffer size to be the number of records 1406 * multiplied by the size of the record, doubled to account for some 1407 * DTrace slop and divided by the number of CPUs. We silently clamp 1408 * the aggregation size at both a minimum and a maximum to prevent 1409 * absurdly low or high values. 1410 */ 1411 if ((aggsize = (g_nrecs * g_recsize * 2) / ncpus) < MIN_AGGSIZE) 1412 aggsize = MIN_AGGSIZE; 1413 1414 if (aggsize > MAX_AGGSIZE) 1415 aggsize = MAX_AGGSIZE; 1416 1417 (void) sprintf(aggstr, "%lld", (long long)aggsize); 1418 1419 if (!g_tracing) { 1420 if (dtrace_setopt(g_dtp, "bufsize", "4k") == -1) 1421 dfail("failed to set 'bufsize'"); 1422 1423 if (dtrace_setopt(g_dtp, "aggsize", aggstr) == -1) 1424 dfail("failed to set 'aggsize'"); 1425 1426 if (dynvar) { 1427 /* 1428 * If we're using dynamic variables, we set our 1429 * dynamic variable size to be one megabyte per CPU, 1430 * with a hard-limit of 32 megabytes. This may still 1431 * be too small in some cases, but it can be tuned 1432 * manually via -x if need be. 1433 */ 1434 (void) sprintf(aggstr, "%ldm", ncpus < 32 ? ncpus : 32); 1435 1436 if (dtrace_setopt(g_dtp, "dynvarsize", aggstr) == -1) 1437 dfail("failed to set 'dynvarsize'"); 1438 } 1439 } else { 1440 if (dtrace_setopt(g_dtp, "bufsize", aggstr) == -1) 1441 dfail("failed to set 'bufsize'"); 1442 } 1443 1444 if (dtrace_setopt(g_dtp, "statusrate", "10sec") == -1) 1445 dfail("failed to set 'statusrate'"); 1446 1447 optind = 1; 1448 while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != GETOPT_EOF) { 1449 switch (c) { 1450 case 'x': 1451 if ((p = strchr(optarg, '=')) != NULL) 1452 *p++ = '\0'; 1453 1454 if (dtrace_setopt(g_dtp, optarg, p) != 0) 1455 dfail("failed to set -x %s", optarg); 1456 break; 1457 } 1458 } 1459 1460 argc -= optind; 1461 argv += optind; 1462 1463 dprog_compile(); 1464 status_init(); 1465 1466 g_elapsed = -gethrtime(); 1467 1468 /* 1469 * Spawn the specified command and wait for it to complete. 1470 */ 1471 child = fork(); 1472 if (child == -1) 1473 fail(1, "cannot fork"); 1474 if (child == 0) { 1475 (void) dtrace_close(g_dtp); 1476 (void) execvp(argv[0], &argv[0]); 1477 exec_errno = errno; 1478 exit(127); 1479 } 1480 1481 #ifdef illumos 1482 while (waitpid(child, &status, WEXITED) != child) 1483 #else 1484 while (waitpid(child, &status, 0) != child) 1485 #endif 1486 status_check(); 1487 1488 g_elapsed += gethrtime(); 1489 1490 if (WIFEXITED(status)) { 1491 if (WEXITSTATUS(status) != 0) { 1492 if (exec_errno != 0) { 1493 errno = exec_errno; 1494 fail(1, "could not execute %s", argv[0]); 1495 } 1496 (void) fprintf(stderr, 1497 "lockstat: warning: %s exited with code %d\n", 1498 argv[0], WEXITSTATUS(status)); 1499 } 1500 } else { 1501 (void) fprintf(stderr, 1502 "lockstat: warning: %s died on signal %d\n", 1503 argv[0], WTERMSIG(status)); 1504 } 1505 1506 if (dtrace_stop(g_dtp) == -1) 1507 dfail("failed to stop dtrace"); 1508 1509 /* 1510 * Before we read out the results, we need to allocate our buffer. 1511 * If we're tracing, then we'll just use the precalculated size. If 1512 * we're not, then we'll take a snapshot of the aggregate, and walk 1513 * it to count the number of records. 1514 */ 1515 if (!g_tracing) { 1516 if (dtrace_aggregate_snap(g_dtp) != 0) 1517 dfail("failed to snap aggregate"); 1518 1519 g_nrecs = 0; 1520 1521 if (dtrace_aggregate_walk(g_dtp, 1522 count_aggregate, &g_nrecs) != 0) 1523 dfail("failed to walk aggregate"); 1524 } 1525 1526 #ifdef illumos 1527 if ((data_buf = memalign(sizeof (uint64_t), 1528 (g_nrecs + 1) * g_recsize)) == NULL) 1529 #else 1530 if (posix_memalign((void **)&data_buf, sizeof (uint64_t), 1531 (g_nrecs + 1) * g_recsize) ) 1532 #endif 1533 fail(1, "Memory allocation failed"); 1534 1535 /* 1536 * Read out the DTrace data. 1537 */ 1538 g_nrecs_used = process_data(out, data_buf); 1539 1540 if (g_nrecs_used > g_nrecs || g_dropped) 1541 (void) fprintf(stderr, "lockstat: warning: " 1542 "ran out of data records (use -n for more)\n"); 1543 1544 /* LINTED - alignment */ 1545 for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++, 1546 /* LINTED - alignment */ 1547 lsp = (lsrec_t *)((char *)lsp + g_recsize)) { 1548 ev_count[lsp->ls_event] += lsp->ls_count; 1549 ev_time[lsp->ls_event] += lsp->ls_time; 1550 } 1551 1552 /* 1553 * If -g was specified, convert stacks into individual records. 1554 */ 1555 if (g_gflag) { 1556 lsrec_t *newlsp, *oldlsp; 1557 1558 #ifdef illumos 1559 newlsp = memalign(sizeof (uint64_t), 1560 g_nrecs_used * LS_TIME * (g_stkdepth + 1)); 1561 #else 1562 posix_memalign((void **)&newlsp, sizeof (uint64_t), 1563 g_nrecs_used * LS_TIME * (g_stkdepth + 1)); 1564 #endif 1565 if (newlsp == NULL) 1566 fail(1, "Cannot allocate space for -g processing"); 1567 lsp = newlsp; 1568 /* LINTED - alignment */ 1569 for (i = 0, oldlsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++, 1570 /* LINTED - alignment */ 1571 oldlsp = (lsrec_t *)((char *)oldlsp + g_recsize)) { 1572 int fr; 1573 int caller_in_stack = 0; 1574 1575 if (oldlsp->ls_count == 0) 1576 continue; 1577 1578 for (fr = 0; fr < g_stkdepth; fr++) { 1579 if (oldlsp->ls_stack[fr] == 0) 1580 break; 1581 if (oldlsp->ls_stack[fr] == oldlsp->ls_caller) 1582 caller_in_stack = 1; 1583 bcopy(oldlsp, lsp, LS_TIME); 1584 lsp->ls_caller = oldlsp->ls_stack[fr]; 1585 #ifndef illumos 1586 lsp->ls_lock = strdup(oldlsp->ls_lock); 1587 #endif 1588 /* LINTED - alignment */ 1589 lsp = (lsrec_t *)((char *)lsp + LS_TIME); 1590 } 1591 if (!caller_in_stack) { 1592 bcopy(oldlsp, lsp, LS_TIME); 1593 /* LINTED - alignment */ 1594 lsp = (lsrec_t *)((char *)lsp + LS_TIME); 1595 } 1596 #ifndef illumos 1597 free(oldlsp->ls_lock); 1598 #endif 1599 } 1600 g_nrecs = g_nrecs_used = 1601 ((uintptr_t)lsp - (uintptr_t)newlsp) / LS_TIME; 1602 g_recsize = LS_TIME; 1603 g_stkdepth = 0; 1604 free(data_buf); 1605 data_buf = (char *)newlsp; 1606 } 1607 1608 if ((sort_buf = calloc(2 * (g_nrecs + 1), 1609 sizeof (void *))) == NULL) 1610 fail(1, "Sort buffer allocation failed"); 1611 merge_buf = sort_buf + (g_nrecs + 1); 1612 1613 /* 1614 * Build the sort buffer, discarding zero-count records along the way. 1615 */ 1616 /* LINTED - alignment */ 1617 for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++, 1618 /* LINTED - alignment */ 1619 lsp = (lsrec_t *)((char *)lsp + g_recsize)) { 1620 if (lsp->ls_count == 0) 1621 lsp->ls_event = LS_MAX_EVENTS; 1622 sort_buf[i] = lsp; 1623 } 1624 1625 if (g_nrecs_used == 0) 1626 exit(0); 1627 1628 /* 1629 * Add a sentinel after the last record 1630 */ 1631 sort_buf[i] = lsp; 1632 lsp->ls_event = LS_MAX_EVENTS; 1633 1634 if (g_tracing) { 1635 report_trace(out, sort_buf); 1636 return (0); 1637 } 1638 1639 /* 1640 * Application of -g may have resulted in multiple records 1641 * with the same signature; coalesce them. 1642 */ 1643 if (g_gflag) { 1644 mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used); 1645 coalesce(lockcmp, sort_buf, g_nrecs_used); 1646 } 1647 1648 /* 1649 * Coalesce locks within the same symbol if -c option specified. 1650 * Coalesce PCs within the same function if -k option specified. 1651 */ 1652 if (g_cflag || g_kflag) { 1653 for (i = 0; i < g_nrecs_used; i++) { 1654 int fr; 1655 lsp = sort_buf[i]; 1656 #ifdef illumos 1657 if (g_cflag) 1658 coalesce_symbol(&lsp->ls_lock); 1659 #endif 1660 if (g_kflag) { 1661 for (fr = 0; fr < g_stkdepth; fr++) 1662 coalesce_symbol(&lsp->ls_stack[fr]); 1663 coalesce_symbol(&lsp->ls_caller); 1664 } 1665 } 1666 mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used); 1667 coalesce(lockcmp, sort_buf, g_nrecs_used); 1668 } 1669 1670 /* 1671 * Coalesce callers if -w option specified 1672 */ 1673 if (g_wflag) { 1674 mergesort(lock_and_count_cmp_anywhere, 1675 sort_buf, merge_buf, g_nrecs_used); 1676 coalesce(lockcmp_anywhere, sort_buf, g_nrecs_used); 1677 } 1678 1679 /* 1680 * Coalesce locks if -W option specified 1681 */ 1682 if (g_Wflag) { 1683 mergesort(site_and_count_cmp_anylock, 1684 sort_buf, merge_buf, g_nrecs_used); 1685 coalesce(sitecmp_anylock, sort_buf, g_nrecs_used); 1686 } 1687 1688 /* 1689 * Sort data by contention count (ls_count) or total time (ls_time), 1690 * depending on g_Pflag. Override g_Pflag if time wasn't measured. 1691 */ 1692 if (g_recsize < LS_TIME) 1693 g_Pflag = 0; 1694 1695 if (g_Pflag) 1696 mergesort(timecmp, sort_buf, merge_buf, g_nrecs_used); 1697 else 1698 mergesort(countcmp, sort_buf, merge_buf, g_nrecs_used); 1699 1700 /* 1701 * Display data by event type 1702 */ 1703 first = &sort_buf[0]; 1704 while ((event = (*first)->ls_event) < LS_MAX_EVENTS) { 1705 current = first; 1706 while ((lsp = *current)->ls_event == event) 1707 current++; 1708 report_stats(out, first, current - first, ev_count[event], 1709 ev_time[event]); 1710 first = current; 1711 } 1712 1713 #ifndef illumos 1714 /* 1715 * Free lock name buffers 1716 */ 1717 for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++, 1718 lsp = (lsrec_t *)((char *)lsp + g_recsize)) 1719 free(lsp->ls_lock); 1720 #endif 1721 1722 return (0); 1723 } 1724 1725 static char * 1726 format_symbol(char *buf, uintptr_t addr, int show_size) 1727 { 1728 uintptr_t symoff; 1729 char *symname; 1730 size_t symsize; 1731 1732 symname = addr_to_sym(addr, &symoff, &symsize); 1733 1734 if (show_size && symoff == 0) 1735 (void) sprintf(buf, "%s[%ld]", symname, (long)symsize); 1736 else if (symoff == 0) 1737 (void) sprintf(buf, "%s", symname); 1738 else if (symoff < 16 && bcmp(symname, "cpu[", 4) == 0) /* CPU+PIL */ 1739 #ifdef illumos 1740 (void) sprintf(buf, "%s+%ld", symname, (long)symoff); 1741 #else 1742 (void) sprintf(buf, "%s+%s", symname, g_pri_class[(int)symoff]); 1743 #endif 1744 else if (symoff <= symsize || (symoff < 256 && addr != symoff)) 1745 (void) sprintf(buf, "%s+0x%llx", symname, 1746 (unsigned long long)symoff); 1747 else 1748 (void) sprintf(buf, "0x%llx", (unsigned long long)addr); 1749 return (buf); 1750 } 1751 1752 static void 1753 report_stats(FILE *out, lsrec_t **sort_buf, size_t nrecs, uint64_t total_count, 1754 uint64_t total_time) 1755 { 1756 uint32_t event = sort_buf[0]->ls_event; 1757 lsrec_t *lsp; 1758 double ptotal = 0.0; 1759 double percent; 1760 int i, j, fr; 1761 int displayed; 1762 int first_bin, last_bin, max_bin_count, total_bin_count; 1763 int rectype; 1764 char buf[256]; 1765 char lhdr[80], chdr[80]; 1766 1767 rectype = g_recsize; 1768 1769 if (g_topn == 0) { 1770 (void) fprintf(out, "%20llu %s\n", 1771 g_rates == 0 ? total_count : 1772 ((unsigned long long)total_count * NANOSEC) / g_elapsed, 1773 g_event_info[event].ev_desc); 1774 return; 1775 } 1776 1777 (void) sprintf(lhdr, "%s%s", 1778 g_Wflag ? "Hottest " : "", g_event_info[event].ev_lhdr); 1779 (void) sprintf(chdr, "%s%s", 1780 g_wflag ? "Hottest " : "", "Caller"); 1781 1782 if (!g_pflag) 1783 (void) fprintf(out, 1784 "\n%s: %.0f events in %.3f seconds (%.0f events/sec)\n\n", 1785 g_event_info[event].ev_desc, (double)total_count, 1786 (double)g_elapsed / NANOSEC, 1787 (double)total_count * NANOSEC / g_elapsed); 1788 1789 if (!g_pflag && rectype < LS_HIST) { 1790 (void) sprintf(buf, "%s", g_event_info[event].ev_units); 1791 (void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n", 1792 g_rates ? "ops/s" : "Count", 1793 g_gflag ? "genr" : "indv", 1794 "cuml", "rcnt", rectype >= LS_TIME ? buf : "", lhdr, chdr); 1795 (void) fprintf(out, "---------------------------------" 1796 "----------------------------------------------\n"); 1797 } 1798 1799 displayed = 0; 1800 for (i = 0; i < nrecs; i++) { 1801 lsp = sort_buf[i]; 1802 1803 if (displayed++ >= g_topn) 1804 break; 1805 1806 if (g_pflag) { 1807 int j; 1808 1809 (void) fprintf(out, "%u %u", 1810 lsp->ls_event, lsp->ls_count); 1811 #ifdef illumos 1812 (void) fprintf(out, " %s", 1813 format_symbol(buf, lsp->ls_lock, g_cflag)); 1814 #else 1815 (void) fprintf(out, " %s", lsp->ls_lock); 1816 #endif 1817 (void) fprintf(out, " %s", 1818 format_symbol(buf, lsp->ls_caller, 0)); 1819 (void) fprintf(out, " %f", 1820 (double)lsp->ls_refcnt / lsp->ls_count); 1821 if (rectype >= LS_TIME) 1822 (void) fprintf(out, " %llu", 1823 (unsigned long long)lsp->ls_time); 1824 if (rectype >= LS_HIST) { 1825 for (j = 0; j < 64; j++) 1826 (void) fprintf(out, " %u", 1827 lsp->ls_hist[j]); 1828 } 1829 for (j = 0; j < LS_MAX_STACK_DEPTH; j++) { 1830 if (rectype <= LS_STACK(j) || 1831 lsp->ls_stack[j] == 0) 1832 break; 1833 (void) fprintf(out, " %s", 1834 format_symbol(buf, lsp->ls_stack[j], 0)); 1835 } 1836 (void) fprintf(out, "\n"); 1837 continue; 1838 } 1839 1840 if (rectype >= LS_HIST) { 1841 (void) fprintf(out, "---------------------------------" 1842 "----------------------------------------------\n"); 1843 (void) sprintf(buf, "%s", 1844 g_event_info[event].ev_units); 1845 (void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n", 1846 g_rates ? "ops/s" : "Count", 1847 g_gflag ? "genr" : "indv", 1848 "cuml", "rcnt", buf, lhdr, chdr); 1849 } 1850 1851 if (g_Pflag && total_time != 0) 1852 percent = (lsp->ls_time * 100.00) / total_time; 1853 else 1854 percent = (lsp->ls_count * 100.00) / total_count; 1855 1856 ptotal += percent; 1857 1858 if (rectype >= LS_TIME) 1859 (void) sprintf(buf, "%llu", 1860 (unsigned long long)(lsp->ls_time / lsp->ls_count)); 1861 else 1862 buf[0] = '\0'; 1863 1864 (void) fprintf(out, "%5llu ", 1865 g_rates == 0 ? lsp->ls_count : 1866 ((uint64_t)lsp->ls_count * NANOSEC) / g_elapsed); 1867 1868 (void) fprintf(out, "%3.0f%% ", percent); 1869 1870 if (g_gflag) 1871 (void) fprintf(out, "---- "); 1872 else 1873 (void) fprintf(out, "%3.0f%% ", ptotal); 1874 1875 (void) fprintf(out, "%4.2f %8s ", 1876 (double)lsp->ls_refcnt / lsp->ls_count, buf); 1877 1878 #ifdef illumos 1879 (void) fprintf(out, "%-22s ", 1880 format_symbol(buf, lsp->ls_lock, g_cflag)); 1881 #else 1882 (void) fprintf(out, "%-22s ", lsp->ls_lock); 1883 #endif 1884 1885 (void) fprintf(out, "%-24s\n", 1886 format_symbol(buf, lsp->ls_caller, 0)); 1887 1888 if (rectype < LS_HIST) 1889 continue; 1890 1891 (void) fprintf(out, "\n"); 1892 (void) fprintf(out, "%10s %31s %-9s %-24s\n", 1893 g_event_info[event].ev_units, 1894 "------ Time Distribution ------", 1895 g_rates ? "ops/s" : "count", 1896 rectype > LS_STACK(0) ? "Stack" : ""); 1897 1898 first_bin = 0; 1899 while (lsp->ls_hist[first_bin] == 0) 1900 first_bin++; 1901 1902 last_bin = 63; 1903 while (lsp->ls_hist[last_bin] == 0) 1904 last_bin--; 1905 1906 max_bin_count = 0; 1907 total_bin_count = 0; 1908 for (j = first_bin; j <= last_bin; j++) { 1909 total_bin_count += lsp->ls_hist[j]; 1910 if (lsp->ls_hist[j] > max_bin_count) 1911 max_bin_count = lsp->ls_hist[j]; 1912 } 1913 1914 /* 1915 * If we went a few frames below the caller, ignore them 1916 */ 1917 for (fr = 3; fr > 0; fr--) 1918 if (lsp->ls_stack[fr] == lsp->ls_caller) 1919 break; 1920 1921 for (j = first_bin; j <= last_bin; j++) { 1922 uint_t depth = (lsp->ls_hist[j] * 30) / total_bin_count; 1923 (void) fprintf(out, "%10llu |%s%s %-9u ", 1924 1ULL << j, 1925 "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + 30 - depth, 1926 " " + depth, 1927 g_rates == 0 ? lsp->ls_hist[j] : 1928 (uint_t)(((uint64_t)lsp->ls_hist[j] * NANOSEC) / 1929 g_elapsed)); 1930 if (rectype <= LS_STACK(fr) || lsp->ls_stack[fr] == 0) { 1931 (void) fprintf(out, "\n"); 1932 continue; 1933 } 1934 (void) fprintf(out, "%-24s\n", 1935 format_symbol(buf, lsp->ls_stack[fr], 0)); 1936 fr++; 1937 } 1938 while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) { 1939 (void) fprintf(out, "%15s %-36s %-24s\n", "", "", 1940 format_symbol(buf, lsp->ls_stack[fr], 0)); 1941 fr++; 1942 } 1943 } 1944 1945 if (!g_pflag) 1946 (void) fprintf(out, "---------------------------------" 1947 "----------------------------------------------\n"); 1948 1949 (void) fflush(out); 1950 } 1951 1952 static void 1953 report_trace(FILE *out, lsrec_t **sort_buf) 1954 { 1955 lsrec_t *lsp; 1956 int i, fr; 1957 int rectype; 1958 char buf[256], buf2[256]; 1959 1960 rectype = g_recsize; 1961 1962 if (!g_pflag) { 1963 (void) fprintf(out, "%5s %7s %11s %-24s %-24s\n", 1964 "Event", "Time", "Owner", "Lock", "Caller"); 1965 (void) fprintf(out, "---------------------------------" 1966 "----------------------------------------------\n"); 1967 } 1968 1969 for (i = 0; i < g_nrecs_used; i++) { 1970 1971 lsp = sort_buf[i]; 1972 1973 if (lsp->ls_event >= LS_MAX_EVENTS || lsp->ls_count == 0) 1974 continue; 1975 1976 (void) fprintf(out, "%2d %10llu %11p %-24s %-24s\n", 1977 lsp->ls_event, (unsigned long long)lsp->ls_time, 1978 (void *)lsp->ls_next, 1979 #ifdef illumos 1980 format_symbol(buf, lsp->ls_lock, 0), 1981 #else 1982 lsp->ls_lock, 1983 #endif 1984 format_symbol(buf2, lsp->ls_caller, 0)); 1985 1986 if (rectype <= LS_STACK(0)) 1987 continue; 1988 1989 /* 1990 * If we went a few frames below the caller, ignore them 1991 */ 1992 for (fr = 3; fr > 0; fr--) 1993 if (lsp->ls_stack[fr] == lsp->ls_caller) 1994 break; 1995 1996 while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) { 1997 (void) fprintf(out, "%53s %-24s\n", "", 1998 format_symbol(buf, lsp->ls_stack[fr], 0)); 1999 fr++; 2000 } 2001 (void) fprintf(out, "\n"); 2002 } 2003 2004 (void) fflush(out); 2005 } 2006