1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 #pragma ident "%Z%%M% %I% %E% SMI"
27
28 #include <stdio.h>
29 #include <stddef.h>
30 #include <stdlib.h>
31 #include <stdarg.h>
32 #include <string.h>
33 #include <strings.h>
34 #include <ctype.h>
35 #include <fcntl.h>
36 #include <unistd.h>
37 #include <errno.h>
38 #include <limits.h>
39 #include <sys/types.h>
40 #include <sys/modctl.h>
41 #include <sys/stat.h>
42 #include <sys/wait.h>
43 #include <dtrace.h>
44 #include <sys/lockstat.h>
45 #include <alloca.h>
46 #include <signal.h>
47 #include <assert.h>
48
49 #define LOCKSTAT_OPTSTR "x:bths:n:d:i:l:f:e:ckwWgCHEATID:RpPo:V"
50
51 #define LS_MAX_STACK_DEPTH 50
52 #define LS_MAX_EVENTS 64
53
54 typedef struct lsrec {
55 struct lsrec *ls_next; /* next in hash chain */
56 uintptr_t ls_lock; /* lock address */
57 uintptr_t ls_caller; /* caller address */
58 uint32_t ls_count; /* cumulative event count */
59 uint32_t ls_event; /* type of event */
60 uintptr_t ls_refcnt; /* cumulative reference count */
61 uint64_t ls_time; /* cumulative event duration */
62 uint32_t ls_hist[64]; /* log2(duration) histogram */
63 uintptr_t ls_stack[LS_MAX_STACK_DEPTH];
64 } lsrec_t;
65
66 typedef struct lsdata {
67 struct lsrec *lsd_next; /* next available */
68 int lsd_count; /* number of records */
69 } lsdata_t;
70
71 /*
72 * Definitions for the types of experiments which can be run. They are
73 * listed in increasing order of memory cost and processing time cost.
74 * The numerical value of each type is the number of bytes needed per record.
75 */
76 #define LS_BASIC offsetof(lsrec_t, ls_time)
77 #define LS_TIME offsetof(lsrec_t, ls_hist[0])
78 #define LS_HIST offsetof(lsrec_t, ls_stack[0])
79 #define LS_STACK(depth) offsetof(lsrec_t, ls_stack[depth])
80
81 static void report_stats(FILE *, lsrec_t **, size_t, uint64_t, uint64_t);
82 static void report_trace(FILE *, lsrec_t **);
83
84 extern int symtab_init(void);
85 extern char *addr_to_sym(uintptr_t, uintptr_t *, size_t *);
86 extern uintptr_t sym_to_addr(char *name);
87 extern size_t sym_size(char *name);
88 extern char *strtok_r(char *, const char *, char **);
89
90 #define DEFAULT_NRECS 10000
91 #define DEFAULT_HZ 97
92 #define MAX_HZ 1000
93 #define MIN_AGGSIZE (16 * 1024)
94 #define MAX_AGGSIZE (32 * 1024 * 1024)
95
96 static int g_stkdepth;
97 static int g_topn = INT_MAX;
98 static hrtime_t g_elapsed;
99 static int g_rates = 0;
100 static int g_pflag = 0;
101 static int g_Pflag = 0;
102 static int g_wflag = 0;
103 static int g_Wflag = 0;
104 static int g_cflag = 0;
105 static int g_kflag = 0;
106 static int g_gflag = 0;
107 static int g_Vflag = 0;
108 static int g_tracing = 0;
109 static size_t g_recsize;
110 static size_t g_nrecs;
111 static int g_nrecs_used;
112 static uchar_t g_enabled[LS_MAX_EVENTS];
113 static hrtime_t g_min_duration[LS_MAX_EVENTS];
114 static dtrace_hdl_t *g_dtp;
115 static char *g_predicate;
116 static char *g_ipredicate;
117 static char *g_prog;
118 static int g_proglen;
119 static int g_dropped;
120
121 typedef struct ls_event_info {
122 char ev_type;
123 char ev_lhdr[20];
124 char ev_desc[80];
125 char ev_units[10];
126 char ev_name[DTRACE_NAMELEN];
127 char *ev_predicate;
128 char *ev_acquire;
129 } ls_event_info_t;
130
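/*
 * Table of lockstat events, indexed by event number.  Each entry names the
 * DTrace probe that implements the event, an optional predicate that
 * refines it (e.g. to separate reader from writer blocking on rw-block),
 * and an optional acquire probe (used to start hold-event timing or to
 * track recursive entry).
 */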
131 static ls_event_info_t g_event_info[LS_MAX_EVENTS] = {
132 { 'C', "Lock", "Adaptive mutex spin", "nsec",
133 "lockstat:::adaptive-spin" },
134 { 'C', "Lock", "Adaptive mutex block", "nsec",
135 "lockstat:::adaptive-block" },
136 { 'C', "Lock", "Spin lock spin", "nsec",
137 "lockstat:::spin-spin" },
138 { 'C', "Lock", "Thread lock spin", "nsec",
139 "lockstat:::thread-spin" },
140 { 'C', "Lock", "R/W writer blocked by writer", "nsec",
141 "lockstat:::rw-block", "arg2 == 0 && arg3 == 1" },
142 { 'C', "Lock", "R/W writer blocked by readers", "nsec",
143 "lockstat:::rw-block", "arg2 == 0 && arg3 == 0 && arg4" },
144 { 'C', "Lock", "R/W reader blocked by writer", "nsec",
145 "lockstat:::rw-block", "arg2 != 0 && arg3 == 1" },
146 { 'C', "Lock", "R/W reader blocked by write wanted", "nsec",
147 "lockstat:::rw-block", "arg2 != 0 && arg3 == 0 && arg4" },
148 { 'C', "Lock", "Unknown event (type 8)", "units" },
149 { 'C', "Lock", "Unknown event (type 9)", "units" },
150 { 'C', "Lock", "Unknown event (type 10)", "units" },
151 { 'C', "Lock", "Unknown event (type 11)", "units" },
152 { 'C', "Lock", "Unknown event (type 12)", "units" },
153 { 'C', "Lock", "Unknown event (type 13)", "units" },
154 { 'C', "Lock", "Unknown event (type 14)", "units" },
155 { 'C', "Lock", "Unknown event (type 15)", "units" },
156 { 'C', "Lock", "Unknown event (type 16)", "units" },
157 { 'C', "Lock", "Unknown event (type 17)", "units" },
158 { 'C', "Lock", "Unknown event (type 18)", "units" },
159 { 'C', "Lock", "Unknown event (type 19)", "units" },
160 { 'C', "Lock", "Unknown event (type 20)", "units" },
161 { 'C', "Lock", "Unknown event (type 21)", "units" },
162 { 'C', "Lock", "Unknown event (type 22)", "units" },
163 { 'C', "Lock", "Unknown event (type 23)", "units" },
164 { 'C', "Lock", "Unknown event (type 24)", "units" },
165 { 'C', "Lock", "Unknown event (type 25)", "units" },
166 { 'C', "Lock", "Unknown event (type 26)", "units" },
167 { 'C', "Lock", "Unknown event (type 27)", "units" },
168 { 'C', "Lock", "Unknown event (type 28)", "units" },
169 { 'C', "Lock", "Unknown event (type 29)", "units" },
170 { 'C', "Lock", "Unknown event (type 30)", "units" },
171 { 'C', "Lock", "Unknown event (type 31)", "units" },
172 { 'H', "Lock", "Adaptive mutex hold", "nsec",
173 "lockstat:::adaptive-release", NULL,
174 "lockstat:::adaptive-acquire" },
175 { 'H', "Lock", "Spin lock hold", "nsec",
176 "lockstat:::spin-release", NULL,
177 "lockstat:::spin-acquire" },
178 { 'H', "Lock", "R/W writer hold", "nsec",
179 "lockstat:::rw-release", "arg1 == 0",
180 "lockstat:::rw-acquire" },
181 { 'H', "Lock", "R/W reader hold", "nsec",
182 "lockstat:::rw-release", "arg1 != 0",
183 "lockstat:::rw-acquire" },
184 { 'H', "Lock", "Unknown event (type 36)", "units" },
185 { 'H', "Lock", "Unknown event (type 37)", "units" },
186 { 'H', "Lock", "Unknown event (type 38)", "units" },
187 { 'H', "Lock", "Unknown event (type 39)", "units" },
188 { 'H', "Lock", "Unknown event (type 40)", "units" },
189 { 'H', "Lock", "Unknown event (type 41)", "units" },
190 { 'H', "Lock", "Unknown event (type 42)", "units" },
191 { 'H', "Lock", "Unknown event (type 43)", "units" },
192 { 'H', "Lock", "Unknown event (type 44)", "units" },
193 { 'H', "Lock", "Unknown event (type 45)", "units" },
194 { 'H', "Lock", "Unknown event (type 46)", "units" },
195 { 'H', "Lock", "Unknown event (type 47)", "units" },
196 { 'H', "Lock", "Unknown event (type 48)", "units" },
197 { 'H', "Lock", "Unknown event (type 49)", "units" },
198 { 'H', "Lock", "Unknown event (type 50)", "units" },
199 { 'H', "Lock", "Unknown event (type 51)", "units" },
200 { 'H', "Lock", "Unknown event (type 52)", "units" },
201 { 'H', "Lock", "Unknown event (type 53)", "units" },
202 { 'H', "Lock", "Unknown event (type 54)", "units" },
203 { 'H', "Lock", "Unknown event (type 55)", "units" },
204 { 'I', "CPU+PIL", "Profiling interrupt", "nsec",
205 "profile:::profile-97", NULL },
206 { 'I', "Lock", "Unknown event (type 57)", "units" },
207 { 'I', "Lock", "Unknown event (type 58)", "units" },
208 { 'I', "Lock", "Unknown event (type 59)", "units" },
209 { 'E', "Lock", "Recursive lock entry detected", "(N/A)",
210 "lockstat:::rw-release", NULL, "lockstat:::rw-acquire" },
211 { 'E', "Lock", "Lockstat enter failure", "(N/A)" },
212 { 'E', "Lock", "Lockstat exit failure", "nsec" },
213 { 'E', "Lock", "Lockstat record failure", "(N/A)" },
214 };
215
216 static void
217 fail(int do_perror, const char *message, ...)
218 {
219 va_list args;
220 int save_errno = errno;
221
222 va_start(args, message);
223 (void) fprintf(stderr, "lockstat: ");
224 (void) vfprintf(stderr, message, args);
225 va_end(args);
226 if (do_perror)
227 (void) fprintf(stderr, ": %s", strerror(save_errno));
228 (void) fprintf(stderr, "\n");
229 exit(2);
230 }
231
232 static void
233 dfail(const char *message, ...)
234 {
235 va_list args;
236
237 va_start(args, message);
238 (void) fprintf(stderr, "lockstat: ");
239 (void) vfprintf(stderr, message, args);
240 va_end(args);
241 (void) fprintf(stderr, ": %s\n",
242 dtrace_errmsg(g_dtp, dtrace_errno(g_dtp)));
243
244 exit(2);
245 }
246
247 static void
248 show_events(char event_type, char *desc)
249 {
250 int i, first = -1, last;
251
252 for (i = 0; i < LS_MAX_EVENTS; i++) {
253 ls_event_info_t *evp = &g_event_info[i];
254 if (evp->ev_type != event_type ||
255 strncmp(evp->ev_desc, "Unknown event", 13) == 0)
256 continue;
257 if (first == -1)
258 first = i;
259 last = i;
260 }
261
262 (void) fprintf(stderr,
263 "\n%s events (lockstat -%c or lockstat -e %d-%d):\n\n",
264 desc, event_type, first, last);
265
266 for (i = first; i <= last; i++)
267 (void) fprintf(stderr,
268 "%4d = %s\n", i, g_event_info[i].ev_desc);
269 }
270
271 static void
272 usage(void)
273 {
274 (void) fprintf(stderr,
275 "Usage: lockstat [options] command [args]\n"
276 "\nEvent selection options:\n\n"
277 " -C watch contention events [on by default]\n"
278 " -E watch error events [off by default]\n"
279 " -H watch hold events [off by default]\n"
280 " -I watch interrupt events [off by default]\n"
281 " -A watch all lock events [equivalent to -CH]\n"
282 " -e event_list only watch the specified events (shown below);\n"
283 " <event_list> is a comma-separated list of\n"
284 " events or ranges of events, e.g. 1,4-7,35\n"
285 " -i rate interrupt rate for -I [default: %d Hz]\n"
286 "\nData gathering options:\n\n"
287 " -b basic statistics (lock, caller, event count)\n"
288 " -t timing for all events [default]\n"
289 " -h histograms for event times\n"
290 " -s depth stack traces <depth> deep\n"
291 " -x opt[=val] enable or modify DTrace options\n"
292 "\nData filtering options:\n\n"
293 " -n nrecords maximum number of data records [default: %d]\n"
294 " -l lock[,size] only watch <lock>, which can be specified as a\n"
295 " symbolic name or hex address; <size> defaults\n"
296 " to the ELF symbol size if available, 1 if not\n"
297 " -f func[,size] only watch events generated by <func>\n"
298 " -d duration only watch events longer than <duration>\n"
299 " -T trace (rather than sample) events\n"
300 "\nData reporting options:\n\n"
301 " -c coalesce lock data for arrays like pse_mutex[]\n"
302 " -k coalesce PCs within functions\n"
303 " -g show total events generated by function\n"
304 " -w wherever: don't distinguish events by caller\n"
305 " -W whichever: don't distinguish events by lock\n"
306 " -R display rates rather than counts\n"
307 " -p parsable output format (awk(1)-friendly)\n"
308 " -P sort lock data by (count * avg_time) product\n"
309 " -D n only display top <n> events of each type\n"
310 " -o filename send output to <filename>\n",
311 DEFAULT_HZ, DEFAULT_NRECS);
312
313 show_events('C', "Contention");
314 show_events('H', "Hold-time");
315 show_events('I', "Interrupt");
316 show_events('E', "Error");
317 (void) fprintf(stderr, "\n");
318
319 exit(1);
320 }
321
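/*
 * Comparison routines passed to mergesort().  Each orders records by event
 * type first; lockcmp() then compares the stack, caller, and lock address,
 * while the "anywhere"/"anylock" variants ignore the caller or the lock so
 * that coalesce() can later merge records that differ only in that field.
 */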
322 static int
323 lockcmp(lsrec_t *a, lsrec_t *b)
324 {
325 int i;
326
327 if (a->ls_event < b->ls_event)
328 return (-1);
329 if (a->ls_event > b->ls_event)
330 return (1);
331
332 for (i = g_stkdepth - 1; i >= 0; i--) {
333 if (a->ls_stack[i] < b->ls_stack[i])
334 return (-1);
335 if (a->ls_stack[i] > b->ls_stack[i])
336 return (1);
337 }
338
339 if (a->ls_caller < b->ls_caller)
340 return (-1);
341 if (a->ls_caller > b->ls_caller)
342 return (1);
343
344 if (a->ls_lock < b->ls_lock)
345 return (-1);
346 if (a->ls_lock > b->ls_lock)
347 return (1);
348
349 return (0);
350 }
351
352 static int
353 countcmp(lsrec_t *a, lsrec_t *b)
354 {
355 if (a->ls_event < b->ls_event)
356 return (-1);
357 if (a->ls_event > b->ls_event)
358 return (1);
359
360 return (b->ls_count - a->ls_count);
361 }
362
363 static int
364 timecmp(lsrec_t *a, lsrec_t *b)
365 {
366 if (a->ls_event < b->ls_event)
367 return (-1);
368 if (a->ls_event > b->ls_event)
369 return (1);
370
371 if (a->ls_time < b->ls_time)
372 return (1);
373 if (a->ls_time > b->ls_time)
374 return (-1);
375
376 return (0);
377 }
378
379 static int
380 lockcmp_anywhere(lsrec_t *a, lsrec_t *b)
381 {
382 if (a->ls_event < b->ls_event)
383 return (-1);
384 if (a->ls_event > b->ls_event)
385 return (1);
386
387 if (a->ls_lock < b->ls_lock)
388 return (-1);
389 if (a->ls_lock > b->ls_lock)
390 return (1);
391
392 return (0);
393 }
394
395 static int
396 lock_and_count_cmp_anywhere(lsrec_t *a, lsrec_t *b)
397 {
398 if (a->ls_event < b->ls_event)
399 return (-1);
400 if (a->ls_event > b->ls_event)
401 return (1);
402
403 if (a->ls_lock < b->ls_lock)
404 return (-1);
405 if (a->ls_lock > b->ls_lock)
406 return (1);
407
408 return (b->ls_count - a->ls_count);
409 }
410
411 static int
412 sitecmp_anylock(lsrec_t *a, lsrec_t *b)
413 {
414 int i;
415
416 if (a->ls_event < b->ls_event)
417 return (-1);
418 if (a->ls_event > b->ls_event)
419 return (1);
420
421 for (i = g_stkdepth - 1; i >= 0; i--) {
422 if (a->ls_stack[i] < b->ls_stack[i])
423 return (-1);
424 if (a->ls_stack[i] > b->ls_stack[i])
425 return (1);
426 }
427
428 if (a->ls_caller < b->ls_caller)
429 return (-1);
430 if (a->ls_caller > b->ls_caller)
431 return (1);
432
433 return (0);
434 }
435
436 static int
437 site_and_count_cmp_anylock(lsrec_t *a, lsrec_t *b)
438 {
439 int i;
440
441 if (a->ls_event < b->ls_event)
442 return (-1);
443 if (a->ls_event > b->ls_event)
444 return (1);
445
446 for (i = g_stkdepth - 1; i >= 0; i--) {
447 if (a->ls_stack[i] < b->ls_stack[i])
448 return (-1);
449 if (a->ls_stack[i] > b->ls_stack[i])
450 return (1);
451 }
452
453 if (a->ls_caller < b->ls_caller)
454 return (-1);
455 if (a->ls_caller > b->ls_caller)
456 return (1);
457
458 return (b->ls_count - a->ls_count);
459 }
460
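/*
 * Recursive merge sort of n lsrec_t pointers in a[], using b[] as scratch
 * space.  Both halves are sorted in place, the first half is copied into
 * the scratch buffer in order and the second half in reverse order, and
 * the two ends are then merged back into a[] from the outside in.
 */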
461 static void
462 mergesort(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **a, lsrec_t **b, int n)
463 {
464 int m = n / 2;
465 int i, j;
466
467 if (m > 1)
468 mergesort(cmp, a, b, m);
469 if (n - m > 1)
470 mergesort(cmp, a + m, b + m, n - m);
471 for (i = m; i > 0; i--)
472 b[i - 1] = a[i - 1];
473 for (j = m - 1; j < n - 1; j++)
474 b[n + m - j - 2] = a[j + 1];
475 while (i < j)
476 *a++ = cmp(b[i], b[j]) < 0 ? b[i++] : b[j--];
477 *a = b[i];
478 }
479
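/*
 * Given an array already sorted by cmp, fold each run of records that
 * compare equal into the first record of the run: counts, reference
 * counts, times, and histogram buckets are accumulated, and the absorbed
 * records have their event set to the LS_MAX_EVENTS sentinel so they sort
 * past the real data and are ignored by the reporting code.
 */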
480 static void
481 coalesce(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **lock, int n)
482 {
483 int i, j;
484 lsrec_t *target, *current;
485
486 target = lock[0];
487
488 for (i = 1; i < n; i++) {
489 current = lock[i];
490 if (cmp(current, target) != 0) {
491 target = current;
492 continue;
493 }
494 current->ls_event = LS_MAX_EVENTS;
495 target->ls_count += current->ls_count;
496 target->ls_refcnt += current->ls_refcnt;
497 if (g_recsize < LS_TIME)
498 continue;
499 target->ls_time += current->ls_time;
500 if (g_recsize < LS_HIST)
501 continue;
502 for (j = 0; j < 64; j++)
503 target->ls_hist[j] += current->ls_hist[j];
504 }
505 }
506
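/*
 * If the address lies within a known symbol, round it down to the start of
 * that symbol so that all addresses inside one function (or one statically
 * allocated lock) coalesce into a single record.
 */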
507 static void
508 coalesce_symbol(uintptr_t *addrp)
509 {
510 uintptr_t symoff;
511 size_t symsize;
512
513 if (addr_to_sym(*addrp, &symoff, &symsize) != NULL && symoff < symsize)
514 *addrp -= symoff;
515 }
516
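/*
 * Append a clause to the D predicate string *pred, ANDing it with whatever
 * is already there.  When cmp is non-NULL the clause is "what cmp 0xvalue";
 * otherwise "what" is used verbatim.  For example (illustrative values),
 * starting from "(arg0 >= 0x100 && arg0 < 0x200)" and adding ("arg1", ">=",
 * 1000) yields "((arg0 >= 0x100 && arg0 < 0x200)) && (arg1 >= 0x3e8)".
 */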
517 static void
518 predicate_add(char **pred, char *what, char *cmp, uintptr_t value)
519 {
520 char *new;
521 int len, newlen;
522
523 if (what == NULL)
524 return;
525
526 if (*pred == NULL) {
527 *pred = malloc(1);
528 *pred[0] = '\0';
529 }
530
531 len = strlen(*pred);
532 newlen = len + strlen(what) + 32 + strlen("( && )");
533 new = malloc(newlen);
534
535 if (*pred[0] != '\0') {
536 if (cmp != NULL) {
537 (void) sprintf(new, "(%s) && (%s %s 0x%p)",
538 *pred, what, cmp, (void *)value);
539 } else {
540 (void) sprintf(new, "(%s) && (%s)", *pred, what);
541 }
542 } else {
543 if (cmp != NULL) {
544 (void) sprintf(new, "%s %s 0x%p",
545 what, cmp, (void *)value);
546 } else {
547 (void) sprintf(new, "%s", what);
548 }
549 }
550
551 free(*pred);
552 *pred = new;
553 }
554
555 static void
556 predicate_destroy(char **pred)
557 {
558 free(*pred);
559 *pred = NULL;
560 }
561
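/*
 * Append an address-range test of the form "(what >= base && what < base +
 * size)" to the filter string *filt, ORing it with any ranges added
 * earlier.  The -l and -f options use this to restrict events to a given
 * lock or caller.
 */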
562 static void
563 filter_add(char **filt, char *what, uintptr_t base, uintptr_t size)
564 {
565 char buf[256], *c = buf, *new;
566 int len, newlen;
567
568 if (*filt == NULL) {
569 *filt = malloc(1);
570 *filt[0] = '\0';
571 }
572
573 (void) sprintf(c, "%s(%s >= 0x%p && %s < 0x%p)", *filt[0] != '\0' ?
574 " || " : "", what, (void *)base, what, (void *)(base + size));
575
576 newlen = (len = strlen(*filt) + 1) + strlen(c);
577 new = malloc(newlen);
578 bcopy(*filt, new, len);
579 (void) strcat(new, c);
580 free(*filt);
581 *filt = new;
582 }
583
584 static void
585 filter_destroy(char **filt)
586 {
587 free(*filt);
588 *filt = NULL;
589 }
590
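/*
 * printf()-style append to the D program text in g_prog: the formatted
 * length is measured with a one-byte vsnprintf(), the buffer is grown with
 * realloc(), and the new text is written over the previous terminating NUL.
 */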
591 static void
592 dprog_add(const char *fmt, ...)
593 {
594 va_list args;
595 int size, offs;
596 char c;
597
598 va_start(args, fmt);
599 size = vsnprintf(&c, 1, fmt, args) + 1;
600
601 if (g_proglen == 0) {
602 offs = 0;
603 } else {
604 offs = g_proglen - 1;
605 }
606
607 g_proglen = offs + size;
608
609 if ((g_prog = realloc(g_prog, g_proglen)) == NULL)
610 fail(1, "failed to reallocate program text");
611
612 (void) vsnprintf(&g_prog[offs], size, fmt, args);
613 }
614
615 /*
616 * This function may read like an open sewer, but keep in mind that programs
617 * that generate other programs are rarely pretty. If one has the unenviable
618 * task of maintaining or -- worse -- extending this code, use the -V option
619 * to examine the D program as generated by this function.
620 */
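/*
 * For illustration only: with the default options (contention events,
 * timing, no stack traces) the aggregating clause generated below for an
 * event looks roughly like
 *
 *	lockstat:::adaptive-spin
 *	{
 *		@avg[0ULL, (uintptr_t)arg0, caller] = avg(arg1);
 *	}
 *
 * with any predicate implied by -l, -f, or -d inserted between the probe
 * name and the body.
 */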
621 static void
622 dprog_addevent(int event)
623 {
624 ls_event_info_t *info = &g_event_info[event];
625 char *pred = NULL;
626 char stack[20];
627 const char *arg0, *caller;
628 char *arg1 = "arg1";
629 char buf[80];
630 hrtime_t dur;
631 int depth;
632
633 if (info->ev_name[0] == '\0')
634 return;
635
636 if (info->ev_type == 'I') {
637 /*
638 * For interrupt events, arg0 (normally the lock pointer) is
639 * the CPU address plus the current pil, and arg1 (normally
640 * the number of nanoseconds) is the number of nanoseconds
641 * late -- and it's stored in arg2.
642 */
643 arg0 = "(uintptr_t)curthread->t_cpu + \n"
644 "\t curthread->t_cpu->cpu_profile_pil";
645 caller = "(uintptr_t)arg0";
646 arg1 = "arg2";
647 } else {
648 arg0 = "(uintptr_t)arg0";
649 caller = "caller";
650 }
651
652 if (g_recsize > LS_HIST) {
653 for (depth = 0; g_recsize > LS_STACK(depth); depth++)
654 continue;
655
656 if (g_tracing) {
657 (void) sprintf(stack, "\tstack(%d);\n", depth);
658 } else {
659 (void) sprintf(stack, ", stack(%d)", depth);
660 }
661 } else {
662 (void) sprintf(stack, "");
663 }
664
665 if (info->ev_acquire != NULL) {
666 /*
667 * If this is a hold event, we need to generate an additional
668 * clause for the acquire; the clause for the release will be
669 * generated with the aggregating statement, below.
670 */
671 dprog_add("%s\n", info->ev_acquire);
672 predicate_add(&pred, info->ev_predicate, NULL, 0);
673 predicate_add(&pred, g_predicate, NULL, 0);
674 if (pred != NULL)
675 dprog_add("/%s/\n", pred);
676
677 dprog_add("{\n");
678 (void) sprintf(buf, "self->ev%d[(uintptr_t)arg0]", event);
679
680 if (info->ev_type == 'H') {
681 dprog_add("\t%s = timestamp;\n", buf);
682 } else {
683 /*
684 * If this isn't a hold event, it's the recursive
685 * error event. For this, we simply bump the
686 * thread-local, per-lock count.
687 */
688 dprog_add("\t%s++;\n", buf);
689 }
690
691 dprog_add("}\n\n");
692 predicate_destroy(&pred);
693 pred = NULL;
694
695 if (info->ev_type == 'E') {
696 /*
697 * If this is the recursive lock error event, we need
698 * to generate an additional clause to decrement the
699 * thread-local, per-lock count. This assures that we
700 * only execute the aggregating clause if we have
701 * recursive entry.
702 */
703 dprog_add("%s\n", info->ev_name);
704 dprog_add("/%s/\n{\n\t%s--;\n}\n\n", buf, buf);
705 }
706
707 predicate_add(&pred, buf, NULL, 0);
708
709 if (info->ev_type == 'H') {
710 (void) sprintf(buf, "timestamp -\n\t "
711 "self->ev%d[(uintptr_t)arg0]", event);
712 }
713
714 arg1 = buf;
715 } else {
716 predicate_add(&pred, info->ev_predicate, NULL, 0);
717 if (info->ev_type != 'I')
718 predicate_add(&pred, g_predicate, NULL, 0);
719 else
720 predicate_add(&pred, g_ipredicate, NULL, 0);
721 }
722
723 if ((dur = g_min_duration[event]) != 0)
724 predicate_add(&pred, arg1, ">=", dur);
725
726 dprog_add("%s\n", info->ev_name);
727
728 if (pred != NULL)
729 dprog_add("/%s/\n", pred);
730 predicate_destroy(&pred);
731
732 dprog_add("{\n");
733
734 if (g_tracing) {
735 dprog_add("\ttrace(%dULL);\n", event);
736 dprog_add("\ttrace(%s);\n", arg0);
737 dprog_add("\ttrace(%s);\n", caller);
738 dprog_add(stack);
739 } else {
740 /*
741 * The ordering here is important: when we process the
742 * aggregate, we count on the fact that @avg appears before
743 * @hist in program order to assure that @avg is assigned the
744 * first aggregation variable ID and @hist assigned the
745 * second; see the comment in process_aggregate() for details.
746 */
747 dprog_add("\t@avg[%dULL, %s, %s%s] = avg(%s);\n",
748 event, arg0, caller, stack, arg1);
749
750 if (g_recsize >= LS_HIST) {
751 dprog_add("\t@hist[%dULL, %s, %s%s] = quantize"
752 "(%s);\n", event, arg0, caller, stack, arg1);
753 }
754 }
755
756 if (info->ev_acquire != NULL)
757 dprog_add("\tself->ev%d[arg0] = 0;\n", event);
758
759 dprog_add("}\n\n");
760 }
761
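/*
 * Compile the accumulated D program, enable the probes it names, and start
 * tracing; -V dumps the program text to stderr first.
 */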
762 static void
763 dprog_compile()
764 {
765 dtrace_prog_t *prog;
766 dtrace_proginfo_t info;
767
768 if (g_Vflag) {
769 (void) fprintf(stderr, "lockstat: vvvv D program vvvv\n");
770 (void) fputs(g_prog, stderr);
771 (void) fprintf(stderr, "lockstat: ^^^^ D program ^^^^\n");
772 }
773
774 if ((prog = dtrace_program_strcompile(g_dtp, g_prog,
775 DTRACE_PROBESPEC_NAME, 0, 0, NULL)) == NULL)
776 dfail("failed to compile program");
777
778 if (dtrace_program_exec(g_dtp, prog, &info) == -1)
779 dfail("failed to enable probes");
780
781 if (dtrace_go(g_dtp) != 0)
782 dfail("couldn't start tracing");
783 }
784
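/*
 * Empty SIGUSR1 handler: the periodic timer set up in status_init() exists
 * only to interrupt waitpid() in main() so that status_check() runs at the
 * status and aggregation rates.
 */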
785 static void
786 status_fire(void)
787 {}
788
789 static void
790 status_init(void)
791 {
792 dtrace_optval_t val, status, agg;
793 struct sigaction act;
794 struct itimerspec ts;
795 struct sigevent ev;
796 timer_t tid;
797
798 if (dtrace_getopt(g_dtp, "statusrate", &status) == -1)
799 dfail("failed to get 'statusrate'");
800
801 if (dtrace_getopt(g_dtp, "aggrate", &agg) == -1)
802 dfail("failed to get 'aggrate'");
803
804 /*
805 * We would want to awaken at a rate that is the GCD of the statusrate
806 * and the aggrate -- but that seems a bit absurd. Instead, we'll
807 * simply awaken at a rate that is the more frequent of the two, which
808 * assures that we're never later than the interval implied by the
809 * more frequent rate.
810 */
811 val = status < agg ? status : agg;
812
813 (void) sigemptyset(&act.sa_mask);
814 act.sa_flags = 0;
815 act.sa_handler = status_fire;
816 (void) sigaction(SIGUSR1, &act, NULL);
817
818 ev.sigev_notify = SIGEV_SIGNAL;
819 ev.sigev_signo = SIGUSR1;
820
821 if (timer_create(CLOCK_REALTIME, &ev, &tid) == -1)
822 dfail("cannot create CLOCK_REALTIME timer");
823
824 ts.it_value.tv_sec = val / NANOSEC;
825 ts.it_value.tv_nsec = val % NANOSEC;
826 ts.it_interval = ts.it_value;
827
828 if (timer_settime(tid, TIMER_RELTIME, &ts, NULL) == -1)
829 dfail("cannot set time on CLOCK_REALTIME timer");
830 }
831
832 static void
833 status_check(void)
834 {
835 if (!g_tracing && dtrace_aggregate_snap(g_dtp) != 0)
836 dfail("failed to snap aggregate");
837
838 if (dtrace_status(g_dtp) == -1)
839 dfail("dtrace_status()");
840 }
841
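/*
 * Decode one group of DTrace records into an lsrec_t: the first three
 * records carry the event number, the lock address, and the caller, and,
 * when stack traces are being gathered, a fourth stack() record follows
 * whose frames (beyond the topmost, which duplicates the caller) are
 * copied into ls_stack.
 */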
842 static void
843 lsrec_fill(lsrec_t *lsrec, const dtrace_recdesc_t *rec, int nrecs, caddr_t data)
844 {
845 bzero(lsrec, g_recsize);
846 lsrec->ls_count = 1;
847
848 if ((g_recsize > LS_HIST && nrecs < 4) || (nrecs < 3))
849 fail(0, "truncated DTrace record");
850
851 if (rec->dtrd_size != sizeof (uint64_t))
852 fail(0, "bad event size in first record");
853
854 /* LINTED - alignment */
855 lsrec->ls_event = (uint32_t)*((uint64_t *)(data + rec->dtrd_offset));
856 rec++;
857
858 if (rec->dtrd_size != sizeof (uintptr_t))
859 fail(0, "bad lock address size in second record");
860
861 /* LINTED - alignment */
862 lsrec->ls_lock = *((uintptr_t *)(data + rec->dtrd_offset));
863 rec++;
864
865 if (rec->dtrd_size != sizeof (uintptr_t))
866 fail(0, "bad caller size in third record");
867
868 /* LINTED - alignment */
869 lsrec->ls_caller = *((uintptr_t *)(data + rec->dtrd_offset));
870 rec++;
871
872 if (g_recsize > LS_HIST) {
873 int frames, i;
874 pc_t *stack;
875
876 frames = rec->dtrd_size / sizeof (pc_t);
877 /* LINTED - alignment */
878 stack = (pc_t *)(data + rec->dtrd_offset);
879
880 for (i = 1; i < frames; i++)
881 lsrec->ls_stack[i - 1] = stack[i];
882 }
883 }
884
885 /*ARGSUSED*/
886 static int
887 count_aggregate(const dtrace_aggdata_t *agg, void *arg)
888 {
889 *((size_t *)arg) += 1;
890
891 return (DTRACE_AGGWALK_NEXT);
892 }
893
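/*
 * Aggregation walk callback.  An @avg entry fills in the next lsrec_t
 * (keys, count, and total time); when histograms are enabled the matching
 * @hist entry then supplies the quantize() buckets, and lsd_next is only
 * advanced once that second entry has been copied.
 */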
894 static int
895 process_aggregate(const dtrace_aggdata_t *agg, void *arg)
896 {
897 const dtrace_aggdesc_t *aggdesc = agg->dtada_desc;
898 caddr_t data = agg->dtada_data;
899 lsdata_t *lsdata = arg;
900 lsrec_t *lsrec = lsdata->lsd_next;
901 const dtrace_recdesc_t *rec;
902 uint64_t *avg, *quantized;
903 int i, j;
904
905 assert(lsdata->lsd_count < g_nrecs);
906
907 /*
908 * Aggregation variable IDs are guaranteed to be generated in program
909 * order, and they are guaranteed to start from DTRACE_AGGVARIDNONE
910 * plus one. As "avg" appears before "hist" in program order, we know
911 * that "avg" will be allocated the first aggregation variable ID, and
912 * "hist" will be allocated the second aggregation variable ID -- and
913 * we therefore use the aggregation variable ID to differentiate the
914 * cases.
915 */
916 if (aggdesc->dtagd_varid > DTRACE_AGGVARIDNONE + 1) {
917 /*
918 * This is the histogram entry. We'll copy the quantized
919 * data into ls_hist, and jump over the rest.
920 */
921 rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
922
923 if (aggdesc->dtagd_varid != DTRACE_AGGVARIDNONE + 2)
924 fail(0, "bad variable ID in aggregation record");
925
926 if (rec->dtrd_size !=
927 DTRACE_QUANTIZE_NBUCKETS * sizeof (uint64_t))
928 fail(0, "bad quantize size in aggregation record");
929
930 /* LINTED - alignment */
931 quantized = (uint64_t *)(data + rec->dtrd_offset);
932
933 for (i = DTRACE_QUANTIZE_ZEROBUCKET, j = 0;
934 i < DTRACE_QUANTIZE_NBUCKETS; i++, j++)
935 lsrec->ls_hist[j] = quantized[i];
936
937 goto out;
938 }
939
940 lsrec_fill(lsrec, &aggdesc->dtagd_rec[1],
941 aggdesc->dtagd_nrecs - 1, data);
942
943 rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
944
945 if (rec->dtrd_size != 2 * sizeof (uint64_t))
946 fail(0, "bad avg size in aggregation record");
947
948 /* LINTED - alignment */
949 avg = (uint64_t *)(data + rec->dtrd_offset);
950 lsrec->ls_count = (uint32_t)avg[0];
951 lsrec->ls_time = (uintptr_t)avg[1];
952
953 if (g_recsize >= LS_HIST)
954 return (DTRACE_AGGWALK_NEXT);
955
956 out:
957 lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize);
958 lsdata->lsd_count++;
959
960 return (DTRACE_AGGWALK_NEXT);
961 }
962
963 static int
964 process_trace(const dtrace_probedata_t *pdata, void *arg)
965 {
966 lsdata_t *lsdata = arg;
967 lsrec_t *lsrec = lsdata->lsd_next;
968 dtrace_eprobedesc_t *edesc = pdata->dtpda_edesc;
969 caddr_t data = pdata->dtpda_data;
970
971 if (lsdata->lsd_count >= g_nrecs)
972 return (DTRACE_CONSUME_NEXT);
973
974 lsrec_fill(lsrec, edesc->dtepd_rec, edesc->dtepd_nrecs, data);
975
976 lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize);
977 lsdata->lsd_count++;
978
979 return (DTRACE_CONSUME_NEXT);
980 }
981
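/*
 * Read the DTrace data into the preallocated record buffer: via
 * dtrace_consume() when tracing (-T), otherwise by walking the snapped
 * aggregate in key-sorted order.  Returns the number of records filled in.
 */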
982 static int
983 process_data(FILE *out, char *data)
984 {
985 lsdata_t lsdata;
986
987 /* LINTED - alignment */
988 lsdata.lsd_next = (lsrec_t *)data;
989 lsdata.lsd_count = 0;
990
991 if (g_tracing) {
992 if (dtrace_consume(g_dtp, out,
993 process_trace, NULL, &lsdata) != 0)
994 dfail("failed to consume buffer");
995
996 return (lsdata.lsd_count);
997 }
998
999 if (dtrace_aggregate_walk_keyvarsorted(g_dtp,
1000 process_aggregate, &lsdata) != 0)
1001 dfail("failed to walk aggregate");
1002
1003 return (lsdata.lsd_count);
1004 }
1005
1006 /*ARGSUSED*/
1007 static int
1008 drophandler(const dtrace_dropdata_t *data, void *arg)
1009 {
1010 g_dropped++;
1011 (void) fprintf(stderr, "lockstat: warning: %s", data->dtdda_msg);
1012 return (DTRACE_HANDLE_OK);
1013 }
1014
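/*
 * Parse options, generate and compile the D program, run the child command
 * under tracing, and then gather and report the results.  Typical
 * invocations look like, for example, "lockstat sleep 5" for basic
 * contention statistics or "lockstat -kIW -D 20 ./program" for a profiling
 * interrupt breakdown.
 */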
1015 int
1016 main(int argc, char **argv)
1017 {
1018 char *data_buf;
1019 lsrec_t *lsp, **current, **first, **sort_buf, **merge_buf;
1020 FILE *out = stdout;
1021 char c;
1022 pid_t child;
1023 int status;
1024 int i, j;
1025 hrtime_t duration;
1026 char *addrp, *offp, *sizep, *evp, *lastp, *p;
1027 uintptr_t addr;
1028 size_t size, off;
1029 int events_specified = 0;
1030 int exec_errno = 0;
1031 uint32_t event;
1032 char *filt = NULL, *ifilt = NULL;
1033 static uint64_t ev_count[LS_MAX_EVENTS + 1];
1034 static uint64_t ev_time[LS_MAX_EVENTS + 1];
1035 dtrace_optval_t aggsize;
1036 char aggstr[10];
1037 long ncpus;
1038 int dynvar = 0;
1039 int err;
1040
1041 if ((g_dtp = dtrace_open(DTRACE_VERSION, 0, &err)) == NULL) {
1042 fail(0, "cannot open dtrace library: %s",
1043 dtrace_errmsg(NULL, err));
1044 }
1045
1046 if (dtrace_handle_drop(g_dtp, &drophandler, NULL) == -1)
1047 dfail("couldn't establish drop handler");
1048
1049 if (symtab_init() == -1)
1050 fail(1, "can't load kernel symbols");
1051
1052 g_nrecs = DEFAULT_NRECS;
1053
1054 while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != EOF) {
1055 switch (c) {
1056 case 'b':
1057 g_recsize = LS_BASIC;
1058 break;
1059
1060 case 't':
1061 g_recsize = LS_TIME;
1062 break;
1063
1064 case 'h':
1065 g_recsize = LS_HIST;
1066 break;
1067
1068 case 's':
1069 if (!isdigit(optarg[0]))
1070 usage();
1071 g_stkdepth = atoi(optarg);
1072 if (g_stkdepth > LS_MAX_STACK_DEPTH)
1073 fail(0, "max stack depth is %d",
1074 LS_MAX_STACK_DEPTH);
1075 g_recsize = LS_STACK(g_stkdepth);
1076 break;
1077
1078 case 'n':
1079 if (!isdigit(optarg[0]))
1080 usage();
1081 g_nrecs = atoi(optarg);
1082 break;
1083
1084 case 'd':
1085 if (!isdigit(optarg[0]))
1086 usage();
1087 duration = atoll(optarg);
1088
1089 /*
1090 * XXX -- durations really should be per event
1091 * since the units are different, but it's hard
1092 * to express this nicely in the interface.
1093 * Not clear yet what the cleanest solution is.
1094 */
1095 for (i = 0; i < LS_MAX_EVENTS; i++)
1096 if (g_event_info[i].ev_type != 'E')
1097 g_min_duration[i] = duration;
1098
1099 break;
1100
1101 case 'i':
1102 if (!isdigit(optarg[0]))
1103 usage();
1104 i = atoi(optarg);
1105 if (i <= 0)
1106 usage();
1107 if (i > MAX_HZ)
1108 fail(0, "max interrupt rate is %d Hz", MAX_HZ);
1109
1110 for (j = 0; j < LS_MAX_EVENTS; j++)
1111 if (strcmp(g_event_info[j].ev_desc,
1112 "Profiling interrupt") == 0)
1113 break;
1114
1115 (void) sprintf(g_event_info[j].ev_name,
1116 "profile:::profile-%d", i);
1117 break;
1118
1119 case 'l':
1120 case 'f':
1121 addrp = strtok(optarg, ",");
1122 sizep = strtok(NULL, ",");
1123 addrp = strtok(optarg, ",+");
1124 offp = strtok(NULL, ",");
1125
1126 size = sizep ? strtoul(sizep, NULL, 0) : 1;
1127 off = offp ? strtoul(offp, NULL, 0) : 0;
1128
1129 if (addrp[0] == '0') {
1130 addr = strtoul(addrp, NULL, 16) + off;
1131 } else {
1132 addr = sym_to_addr(addrp) + off;
1133 if (sizep == NULL)
1134 size = sym_size(addrp) - off;
1135 if (addr - off == 0)
1136 fail(0, "symbol '%s' not found", addrp);
1137 if (size == 0)
1138 size = 1;
1139 }
1140
1141
1142 if (c == 'l') {
1143 filter_add(&filt, "arg0", addr, size);
1144 } else {
1145 filter_add(&filt, "caller", addr, size);
1146 filter_add(&ifilt, "arg0", addr, size);
1147 }
1148 break;
1149
1150 case 'e':
1151 evp = strtok_r(optarg, ",", &lastp);
1152 while (evp) {
1153 int ev1, ev2;
1154 char *evp2;
1155
1156 (void) strtok(evp, "-");
1157 evp2 = strtok(NULL, "-");
1158 ev1 = atoi(evp);
1159 ev2 = evp2 ? atoi(evp2) : ev1;
1160 if ((uint_t)ev1 >= LS_MAX_EVENTS ||
1161 (uint_t)ev2 >= LS_MAX_EVENTS || ev1 > ev2)
1162 fail(0, "-e events out of range");
1163 for (i = ev1; i <= ev2; i++)
1164 g_enabled[i] = 1;
1165 evp = strtok_r(NULL, ",", &lastp);
1166 }
1167 events_specified = 1;
1168 break;
1169
1170 case 'c':
1171 g_cflag = 1;
1172 break;
1173
1174 case 'k':
1175 g_kflag = 1;
1176 break;
1177
1178 case 'w':
1179 g_wflag = 1;
1180 break;
1181
1182 case 'W':
1183 g_Wflag = 1;
1184 break;
1185
1186 case 'g':
1187 g_gflag = 1;
1188 break;
1189
1190 case 'C':
1191 case 'E':
1192 case 'H':
1193 case 'I':
1194 for (i = 0; i < LS_MAX_EVENTS; i++)
1195 if (g_event_info[i].ev_type == c)
1196 g_enabled[i] = 1;
1197 events_specified = 1;
1198 break;
1199
1200 case 'A':
1201 for (i = 0; i < LS_MAX_EVENTS; i++)
1202 if (strchr("CH", g_event_info[i].ev_type))
1203 g_enabled[i] = 1;
1204 events_specified = 1;
1205 break;
1206
1207 case 'T':
1208 g_tracing = 1;
1209 break;
1210
1211 case 'D':
1212 if (!isdigit(optarg[0]))
1213 usage();
1214 g_topn = atoi(optarg);
1215 break;
1216
1217 case 'R':
1218 g_rates = 1;
1219 break;
1220
1221 case 'p':
1222 g_pflag = 1;
1223 break;
1224
1225 case 'P':
1226 g_Pflag = 1;
1227 break;
1228
1229 case 'o':
1230 if ((out = fopen(optarg, "w")) == NULL)
1231 fail(1, "error opening file");
1232 break;
1233
1234 case 'V':
1235 g_Vflag = 1;
1236 break;
1237
1238 default:
1239 if (strchr(LOCKSTAT_OPTSTR, c) == NULL)
1240 usage();
1241 }
1242 }
1243
1244 if (filt != NULL) {
1245 predicate_add(&g_predicate, filt, NULL, 0);
1246 filter_destroy(&filt);
1247 }
1248
1249 if (ifilt != NULL) {
1250 predicate_add(&g_ipredicate, ifilt, NULL, 0);
1251 filter_destroy(&ifilt);
1252 }
1253
1254 if (g_recsize == 0) {
1255 if (g_gflag) {
1256 g_stkdepth = LS_MAX_STACK_DEPTH;
1257 g_recsize = LS_STACK(g_stkdepth);
1258 } else {
1259 g_recsize = LS_TIME;
1260 }
1261 }
1262
1263 if (g_gflag && g_recsize <= LS_STACK(0))
1264 fail(0, "'-g' requires at least '-s 1' data gathering");
1265
1266 /*
1267 * Round the record size up to a multiple of sizeof (uint64_t) so records stay aligned
1268 */
1269 g_recsize = -(-g_recsize & -sizeof (uint64_t));
1270
1271 for (i = 0; i < LS_MAX_EVENTS; i++) {
1272 /*
1273 * If no events were specified, enable -C.
1274 */
1275 if (!events_specified && g_event_info[i].ev_type == 'C')
1276 g_enabled[i] = 1;
1277 }
1278
1279 for (i = 0; i < LS_MAX_EVENTS; i++) {
1280 if (!g_enabled[i])
1281 continue;
1282
1283 if (g_event_info[i].ev_acquire != NULL) {
1284 /*
1285 * If we've enabled a hold event, we must explicitly
1286 * allocate dynamic variable space.
1287 */
1288 dynvar = 1;
1289 }
1290
1291 dprog_addevent(i);
1292 }
1293
1294 /*
1295 * Make sure there are remaining arguments to specify a child command
1296 * to execute.
1297 */
1298 if (argc <= optind)
1299 usage();
1300
1301 if ((ncpus = sysconf(_SC_NPROCESSORS_ONLN)) == -1)
1302 dfail("couldn't determine number of online CPUs");
1303
1304 /*
1305 * By default, we set our data buffer size to be the number of records
1306 * multiplied by the size of the record, doubled to account for some
1307 * DTrace slop and divided by the number of CPUs. We silently clamp
1308 * the aggregation size at both a minimum and a maximum to prevent
1309 * absurdly low or high values.
1310 */
1311 if ((aggsize = (g_nrecs * g_recsize * 2) / ncpus) < MIN_AGGSIZE)
1312 aggsize = MIN_AGGSIZE;
1313
1314 if (aggsize > MAX_AGGSIZE)
1315 aggsize = MAX_AGGSIZE;
1316
1317 (void) sprintf(aggstr, "%lld", (long long)aggsize);
1318
1319 if (!g_tracing) {
1320 if (dtrace_setopt(g_dtp, "bufsize", "4k") == -1)
1321 dfail("failed to set 'bufsize'");
1322
1323 if (dtrace_setopt(g_dtp, "aggsize", aggstr) == -1)
1324 dfail("failed to set 'aggsize'");
1325
1326 if (dynvar) {
1327 /*
1328 * If we're using dynamic variables, we set our
1329 * dynamic variable size to be one megabyte per CPU,
1330 * with a hard-limit of 32 megabytes. This may still
1331 * be too small in some cases, but it can be tuned
1332 * manually via -x if need be.
1333 */
1334 (void) sprintf(aggstr, "%ldm", ncpus < 32 ? ncpus : 32);
1335
1336 if (dtrace_setopt(g_dtp, "dynvarsize", aggstr) == -1)
1337 dfail("failed to set 'dynvarsize'");
1338 }
1339 } else {
1340 if (dtrace_setopt(g_dtp, "bufsize", aggstr) == -1)
1341 dfail("failed to set 'bufsize'");
1342 }
1343
1344 if (dtrace_setopt(g_dtp, "statusrate", "10sec") == -1)
1345 dfail("failed to set 'statusrate'");
1346
1347 optind = 1;
1348 while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != EOF) {
1349 switch (c) {
1350 case 'x':
1351 if ((p = strchr(optarg, '=')) != NULL)
1352 *p++ = '\0';
1353
1354 if (dtrace_setopt(g_dtp, optarg, p) != 0)
1355 dfail("failed to set -x %s", optarg);
1356 break;
1357 }
1358 }
1359
1360 argc -= optind;
1361 argv += optind;
1362
1363 dprog_compile();
1364 status_init();
1365
1366 g_elapsed = -gethrtime();
1367
1368 /*
1369 * Spawn the specified command and wait for it to complete.
1370 */
1371 child = fork();
1372 if (child == -1)
1373 fail(1, "cannot fork");
1374 if (child == 0) {
1375 (void) dtrace_close(g_dtp);
1376 (void) execvp(argv[0], &argv[0]);
1377 exec_errno = errno;
1378 exit(127);
1379 }
1380
1381 while (waitpid(child, &status, WEXITED) != child)
1382 status_check();
1383
1384 g_elapsed += gethrtime();
1385
1386 if (WIFEXITED(status)) {
1387 if (WEXITSTATUS(status) != 0) {
1388 if (exec_errno != 0) {
1389 errno = exec_errno;
1390 fail(1, "could not execute %s", argv[0]);
1391 }
1392 (void) fprintf(stderr,
1393 "lockstat: warning: %s exited with code %d\n",
1394 argv[0], WEXITSTATUS(status));
1395 }
1396 } else {
1397 (void) fprintf(stderr,
1398 "lockstat: warning: %s died on signal %d\n",
1399 argv[0], WTERMSIG(status));
1400 }
1401
1402 if (dtrace_stop(g_dtp) == -1)
1403 dfail("failed to stop dtrace");
1404
1405 /*
1406 * Before we read out the results, we need to allocate our buffer.
1407 * If we're tracing, then we'll just use the precalculated size. If
1408 * we're not, then we'll take a snapshot of the aggregate, and walk
1409 * it to count the number of records.
1410 */
1411 if (!g_tracing) {
1412 if (dtrace_aggregate_snap(g_dtp) != 0)
1413 dfail("failed to snap aggregate");
1414
1415 g_nrecs = 0;
1416
1417 if (dtrace_aggregate_walk(g_dtp,
1418 count_aggregate, &g_nrecs) != 0)
1419 dfail("failed to walk aggregate");
1420 }
1421
1422 if ((data_buf = memalign(sizeof (uint64_t),
1423 (g_nrecs + 1) * g_recsize)) == NULL)
1424 fail(1, "Memory allocation failed");
1425
1426 /*
1427 * Read out the DTrace data.
1428 */
1429 g_nrecs_used = process_data(out, data_buf);
1430
1431 if (g_nrecs_used > g_nrecs || g_dropped)
1432 (void) fprintf(stderr, "lockstat: warning: "
1433 "ran out of data records (use -n for more)\n");
1434
1435 /* LINTED - alignment */
1436 for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1437 /* LINTED - alignment */
1438 lsp = (lsrec_t *)((char *)lsp + g_recsize)) {
1439 ev_count[lsp->ls_event] += lsp->ls_count;
1440 ev_time[lsp->ls_event] += lsp->ls_time;
1441 }
1442
1443 /*
1444 * If -g was specified, convert stacks into individual records.
1445 */
1446 if (g_gflag) {
1447 lsrec_t *newlsp, *oldlsp;
1448
1449 newlsp = memalign(sizeof (uint64_t),
1450 g_nrecs_used * LS_TIME * (g_stkdepth + 1));
1451 if (newlsp == NULL)
1452 fail(1, "Cannot allocate space for -g processing");
1453 lsp = newlsp;
1454 /* LINTED - alignment */
1455 for (i = 0, oldlsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1456 /* LINTED - alignment */
1457 oldlsp = (lsrec_t *)((char *)oldlsp + g_recsize)) {
1458 int fr;
1459 int caller_in_stack = 0;
1460
1461 if (oldlsp->ls_count == 0)
1462 continue;
1463
1464 for (fr = 0; fr < g_stkdepth; fr++) {
1465 if (oldlsp->ls_stack[fr] == 0)
1466 break;
1467 if (oldlsp->ls_stack[fr] == oldlsp->ls_caller)
1468 caller_in_stack = 1;
1469 bcopy(oldlsp, lsp, LS_TIME);
1470 lsp->ls_caller = oldlsp->ls_stack[fr];
1471 /* LINTED - alignment */
1472 lsp = (lsrec_t *)((char *)lsp + LS_TIME);
1473 }
1474 if (!caller_in_stack) {
1475 bcopy(oldlsp, lsp, LS_TIME);
1476 /* LINTED - alignment */
1477 lsp = (lsrec_t *)((char *)lsp + LS_TIME);
1478 }
1479 }
1480 g_nrecs = g_nrecs_used =
1481 ((uintptr_t)lsp - (uintptr_t)newlsp) / LS_TIME;
1482 g_recsize = LS_TIME;
1483 g_stkdepth = 0;
1484 free(data_buf);
1485 data_buf = (char *)newlsp;
1486 }
1487
1488 if ((sort_buf = calloc(2 * (g_nrecs + 1),
1489 sizeof (void *))) == NULL)
1490 fail(1, "Sort buffer allocation failed");
1491 merge_buf = sort_buf + (g_nrecs + 1);
1492
1493 /*
1494 * Build the sort buffer, discarding zero-count records along the way.
1495 */
1496 /* LINTED - alignment */
1497 for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1498 /* LINTED - alignment */
1499 lsp = (lsrec_t *)((char *)lsp + g_recsize)) {
1500 if (lsp->ls_count == 0)
1501 lsp->ls_event = LS_MAX_EVENTS;
1502 sort_buf[i] = lsp;
1503 }
1504
1505 if (g_nrecs_used == 0)
1506 exit(0);
1507
1508 /*
1509 * Add a sentinel after the last record
1510 */
1511 sort_buf[i] = lsp;
1512 lsp->ls_event = LS_MAX_EVENTS;
1513
1514 if (g_tracing) {
1515 report_trace(out, sort_buf);
1516 return (0);
1517 }
1518
1519 /*
1520 * Application of -g may have resulted in multiple records
1521 * with the same signature; coalesce them.
1522 */
1523 if (g_gflag) {
1524 mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used);
1525 coalesce(lockcmp, sort_buf, g_nrecs_used);
1526 }
1527
1528 /*
1529 * Coalesce locks within the same symbol if -c option specified.
1530 * Coalesce PCs within the same function if -k option specified.
1531 */
1532 if (g_cflag || g_kflag) {
1533 for (i = 0; i < g_nrecs_used; i++) {
1534 int fr;
1535 lsp = sort_buf[i];
1536 if (g_cflag)
1537 coalesce_symbol(&lsp->ls_lock);
1538 if (g_kflag) {
1539 for (fr = 0; fr < g_stkdepth; fr++)
1540 coalesce_symbol(&lsp->ls_stack[fr]);
1541 coalesce_symbol(&lsp->ls_caller);
1542 }
1543 }
1544 mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used);
1545 coalesce(lockcmp, sort_buf, g_nrecs_used);
1546 }
1547
1548 /*
1549 * Coalesce callers if -w option specified
1550 */
1551 if (g_wflag) {
1552 mergesort(lock_and_count_cmp_anywhere,
1553 sort_buf, merge_buf, g_nrecs_used);
1554 coalesce(lockcmp_anywhere, sort_buf, g_nrecs_used);
1555 }
1556
1557 /*
1558 * Coalesce locks if -W option specified
1559 */
1560 if (g_Wflag) {
1561 mergesort(site_and_count_cmp_anylock,
1562 sort_buf, merge_buf, g_nrecs_used);
1563 coalesce(sitecmp_anylock, sort_buf, g_nrecs_used);
1564 }
1565
1566 /*
1567 * Sort data by contention count (ls_count) or total time (ls_time),
1568 * depending on g_Pflag. Override g_Pflag if time wasn't measured.
1569 */
1570 if (g_recsize < LS_TIME)
1571 g_Pflag = 0;
1572
1573 if (g_Pflag)
1574 mergesort(timecmp, sort_buf, merge_buf, g_nrecs_used);
1575 else
1576 mergesort(countcmp, sort_buf, merge_buf, g_nrecs_used);
1577
1578 /*
1579 * Display data by event type
1580 */
1581 first = &sort_buf[0];
1582 while ((event = (*first)->ls_event) < LS_MAX_EVENTS) {
1583 current = first;
1584 while ((lsp = *current)->ls_event == event)
1585 current++;
1586 report_stats(out, first, current - first, ev_count[event],
1587 ev_time[event]);
1588 first = current;
1589 }
1590
1591 return (0);
1592 }
1593
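/*
 * Render an address for display: an exact symbol match prints as the bare
 * name (or "name[size]" when show_size is set, so coalesced -c records are
 * recognizable), an offset within a symbol prints as "name+0xoffset", and
 * anything unresolvable falls back to the raw hex address.
 */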
1594 static char *
1595 format_symbol(char *buf, uintptr_t addr, int show_size)
1596 {
1597 uintptr_t symoff;
1598 char *symname;
1599 size_t symsize;
1600
1601 symname = addr_to_sym(addr, &symoff, &symsize);
1602
1603 if (show_size && symoff == 0)
1604 (void) sprintf(buf, "%s[%ld]", symname, (long)symsize);
1605 else if (symoff == 0)
1606 (void) sprintf(buf, "%s", symname);
1607 else if (symoff < 16 && bcmp(symname, "cpu[", 4) == 0) /* CPU+PIL */
1608 (void) sprintf(buf, "%s+%ld", symname, (long)symoff);
1609 else if (symoff <= symsize || (symoff < 256 && addr != symoff))
1610 (void) sprintf(buf, "%s+0x%llx", symname,
1611 (unsigned long long)symoff);
1612 else
1613 (void) sprintf(buf, "0x%llx", (unsigned long long)addr);
1614 return (buf);
1615 }
1616
1617 static void
1618 report_stats(FILE *out, lsrec_t **sort_buf, size_t nrecs, uint64_t total_count,
1619 uint64_t total_time)
1620 {
1621 uint32_t event = sort_buf[0]->ls_event;
1622 lsrec_t *lsp;
1623 double ptotal = 0.0;
1624 double percent;
1625 int i, j, fr;
1626 int displayed;
1627 int first_bin, last_bin, max_bin_count, total_bin_count;
1628 int rectype;
1629 char buf[256];
1630 char lhdr[80], chdr[80];
1631
1632 rectype = g_recsize;
1633
1634 if (g_topn == 0) {
1635 (void) fprintf(out, "%20llu %s\n",
1636 g_rates == 0 ? total_count :
1637 ((unsigned long long)total_count * NANOSEC) / g_elapsed,
1638 g_event_info[event].ev_desc);
1639 return;
1640 }
1641
1642 (void) sprintf(lhdr, "%s%s",
1643 g_Wflag ? "Hottest " : "", g_event_info[event].ev_lhdr);
1644 (void) sprintf(chdr, "%s%s",
1645 g_wflag ? "Hottest " : "", "Caller");
1646
1647 if (!g_pflag)
1648 (void) fprintf(out,
1649 "\n%s: %.0f events in %.3f seconds (%.0f events/sec)\n\n",
1650 g_event_info[event].ev_desc, (double)total_count,
1651 (double)g_elapsed / NANOSEC,
1652 (double)total_count * NANOSEC / g_elapsed);
1653
1654 if (!g_pflag && rectype < LS_HIST) {
1655 (void) sprintf(buf, "%s", g_event_info[event].ev_units);
1656 (void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n",
1657 g_rates ? "ops/s" : "Count",
1658 g_gflag ? "genr" : "indv",
1659 "cuml", "rcnt", rectype >= LS_TIME ? buf : "", lhdr, chdr);
1660 (void) fprintf(out, "---------------------------------"
1661 "----------------------------------------------\n");
1662 }
1663
1664 displayed = 0;
1665 for (i = 0; i < nrecs; i++) {
1666 lsp = sort_buf[i];
1667
1668 if (displayed++ >= g_topn)
1669 break;
1670
1671 if (g_pflag) {
1672 int j;
1673
1674 (void) fprintf(out, "%u %u",
1675 lsp->ls_event, lsp->ls_count);
1676 (void) fprintf(out, " %s",
1677 format_symbol(buf, lsp->ls_lock, g_cflag));
1678 (void) fprintf(out, " %s",
1679 format_symbol(buf, lsp->ls_caller, 0));
1680 (void) fprintf(out, " %f",
1681 (double)lsp->ls_refcnt / lsp->ls_count);
1682 if (rectype >= LS_TIME)
1683 (void) fprintf(out, " %llu",
1684 (unsigned long long)lsp->ls_time);
1685 if (rectype >= LS_HIST) {
1686 for (j = 0; j < 64; j++)
1687 (void) fprintf(out, " %u",
1688 lsp->ls_hist[j]);
1689 }
1690 for (j = 0; j < LS_MAX_STACK_DEPTH; j++) {
1691 if (rectype <= LS_STACK(j) ||
1692 lsp->ls_stack[j] == 0)
1693 break;
1694 (void) fprintf(out, " %s",
1695 format_symbol(buf, lsp->ls_stack[j], 0));
1696 }
1697 (void) fprintf(out, "\n");
1698 continue;
1699 }
1700
1701 if (rectype >= LS_HIST) {
1702 (void) fprintf(out, "---------------------------------"
1703 "----------------------------------------------\n");
1704 (void) sprintf(buf, "%s",
1705 g_event_info[event].ev_units);
1706 (void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n",
1707 g_rates ? "ops/s" : "Count",
1708 g_gflag ? "genr" : "indv",
1709 "cuml", "rcnt", buf, lhdr, chdr);
1710 }
1711
1712 if (g_Pflag && total_time != 0)
1713 percent = (lsp->ls_time * 100.00) / total_time;
1714 else
1715 percent = (lsp->ls_count * 100.00) / total_count;
1716
1717 ptotal += percent;
1718
1719 if (rectype >= LS_TIME)
1720 (void) sprintf(buf, "%llu",
1721 (unsigned long long)(lsp->ls_time / lsp->ls_count));
1722 else
1723 buf[0] = '\0';
1724
1725 (void) fprintf(out, "%5llu ",
1726 g_rates == 0 ? lsp->ls_count :
1727 ((uint64_t)lsp->ls_count * NANOSEC) / g_elapsed);
1728
1729 (void) fprintf(out, "%3.0f%% ", percent);
1730
1731 if (g_gflag)
1732 (void) fprintf(out, "---- ");
1733 else
1734 (void) fprintf(out, "%3.0f%% ", ptotal);
1735
1736 (void) fprintf(out, "%4.2f %8s ",
1737 (double)lsp->ls_refcnt / lsp->ls_count, buf);
1738
1739 (void) fprintf(out, "%-22s ",
1740 format_symbol(buf, lsp->ls_lock, g_cflag));
1741
1742 (void) fprintf(out, "%-24s\n",
1743 format_symbol(buf, lsp->ls_caller, 0));
1744
1745 if (rectype < LS_HIST)
1746 continue;
1747
1748 (void) fprintf(out, "\n");
1749 (void) fprintf(out, "%10s %31s %-9s %-24s\n",
1750 g_event_info[event].ev_units,
1751 "------ Time Distribution ------",
1752 g_rates ? "ops/s" : "count",
1753 rectype > LS_STACK(0) ? "Stack" : "");
1754
1755 first_bin = 0;
1756 while (lsp->ls_hist[first_bin] == 0)
1757 first_bin++;
1758
1759 last_bin = 63;
1760 while (lsp->ls_hist[last_bin] == 0)
1761 last_bin--;
1762
1763 max_bin_count = 0;
1764 total_bin_count = 0;
1765 for (j = first_bin; j <= last_bin; j++) {
1766 total_bin_count += lsp->ls_hist[j];
1767 if (lsp->ls_hist[j] > max_bin_count)
1768 max_bin_count = lsp->ls_hist[j];
1769 }
1770
1771 /*
1772 * If we went a few frames below the caller, ignore them
1773 */
1774 for (fr = 3; fr > 0; fr--)
1775 if (lsp->ls_stack[fr] == lsp->ls_caller)
1776 break;
1777
1778 for (j = first_bin; j <= last_bin; j++) {
1779 uint_t depth = (lsp->ls_hist[j] * 30) / total_bin_count;
1780 (void) fprintf(out, "%10llu |%s%s %-9u ",
1781 1ULL << j,
1782 "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + 30 - depth,
1783 "                              " + depth,
1784 g_rates == 0 ? lsp->ls_hist[j] :
1785 (uint_t)(((uint64_t)lsp->ls_hist[j] * NANOSEC) /
1786 g_elapsed));
1787 if (rectype <= LS_STACK(fr) || lsp->ls_stack[fr] == 0) {
1788 (void) fprintf(out, "\n");
1789 continue;
1790 }
1791 (void) fprintf(out, "%-24s\n",
1792 format_symbol(buf, lsp->ls_stack[fr], 0));
1793 fr++;
1794 }
1795 while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) {
1796 (void) fprintf(out, "%15s %-36s %-24s\n", "", "",
1797 format_symbol(buf, lsp->ls_stack[fr], 0));
1798 fr++;
1799 }
1800 }
1801
1802 if (!g_pflag)
1803 (void) fprintf(out, "---------------------------------"
1804 "----------------------------------------------\n");
1805
1806 (void) fflush(out);
1807 }
1808
1809 static void
1810 report_trace(FILE *out, lsrec_t **sort_buf)
1811 {
1812 lsrec_t *lsp;
1813 int i, fr;
1814 int rectype;
1815 char buf[256], buf2[256];
1816
1817 rectype = g_recsize;
1818
1819 if (!g_pflag) {
1820 (void) fprintf(out, "%5s %7s %11s %-24s %-24s\n",
1821 "Event", "Time", "Owner", "Lock", "Caller");
1822 (void) fprintf(out, "---------------------------------"
1823 "----------------------------------------------\n");
1824 }
1825
1826 for (i = 0; i < g_nrecs_used; i++) {
1827
1828 lsp = sort_buf[i];
1829
1830 if (lsp->ls_event >= LS_MAX_EVENTS || lsp->ls_count == 0)
1831 continue;
1832
1833 (void) fprintf(out, "%2d %10llu %11p %-24s %-24s\n",
1834 lsp->ls_event, (unsigned long long)lsp->ls_time,
1835 (void *)lsp->ls_next,
1836 format_symbol(buf, lsp->ls_lock, 0),
1837 format_symbol(buf2, lsp->ls_caller, 0));
1838
1839 if (rectype <= LS_STACK(0))
1840 continue;
1841
1842 /*
1843 * If we went a few frames below the caller, ignore them
1844 */
1845 for (fr = 3; fr > 0; fr--)
1846 if (lsp->ls_stack[fr] == lsp->ls_caller)
1847 break;
1848
1849 while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) {
1850 (void) fprintf(out, "%53s %-24s\n", "",
1851 format_symbol(buf, lsp->ls_stack[fr], 0));
1852 fr++;
1853 }
1854 (void) fprintf(out, "\n");
1855 }
1856
1857 (void) fflush(out);
1858 }
1859