1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 #include <stdio.h>
27 #include <stddef.h>
28 #include <stdlib.h>
29 #include <stdarg.h>
30 #include <string.h>
31 #include <strings.h>
32 #include <ctype.h>
33 #include <fcntl.h>
34 #include <unistd.h>
35 #include <errno.h>
36 #include <limits.h>
37 #include <sys/types.h>
38 #include <sys/modctl.h>
39 #include <sys/stat.h>
40 #include <sys/wait.h>
41 #include <dtrace.h>
42 #include <sys/lockstat.h>
43 #include <alloca.h>
44 #include <signal.h>
45 #include <assert.h>
46
47 #define LOCKSTAT_OPTSTR "x:bths:n:d:i:l:f:e:ckwWgCHEATID:RpPo:V"
48
49 #define LS_MAX_STACK_DEPTH 50
50 #define LS_MAX_EVENTS 64
51
52 typedef struct lsrec {
53 struct lsrec *ls_next; /* next in hash chain */
54 uintptr_t ls_lock; /* lock address */
55 uintptr_t ls_caller; /* caller address */
56 uint32_t ls_count; /* cumulative event count */
57 uint32_t ls_event; /* type of event */
58 uintptr_t ls_refcnt; /* cumulative reference count */
59 uint64_t ls_time; /* cumulative event duration */
60 uint32_t ls_hist[64]; /* log2(duration) histogram */
61 uintptr_t ls_stack[LS_MAX_STACK_DEPTH];
62 } lsrec_t;
63
64 typedef struct lsdata {
65 struct lsrec *lsd_next; /* next available */
66 int lsd_count; /* number of records */
67 } lsdata_t;
68
69 /*
70 * Definitions for the types of experiments which can be run. They are
71 * listed in increasing order of memory cost and processing time cost.
72 * The numerical value of each type is the number of bytes needed per record.
73 */
74 #define LS_BASIC offsetof(lsrec_t, ls_time)
75 #define LS_TIME offsetof(lsrec_t, ls_hist[0])
76 #define LS_HIST offsetof(lsrec_t, ls_stack[0])
77 #define LS_STACK(depth) offsetof(lsrec_t, ls_stack[depth])
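
/*
 * As a rough illustration only (assuming an LP64 kernel with natural
 * alignment), these work out to about 40 bytes for LS_BASIC, 48 for
 * LS_TIME, 304 for LS_HIST, and 304 + 8 * depth for LS_STACK(depth).
 */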
78
79 static void report_stats(FILE *, lsrec_t **, size_t, uint64_t, uint64_t);
80 static void report_trace(FILE *, lsrec_t **);
81
82 extern int symtab_init(void);
83 extern char *addr_to_sym(uintptr_t, uintptr_t *, size_t *);
84 extern uintptr_t sym_to_addr(char *name);
85 extern size_t sym_size(char *name);
86 extern char *strtok_r(char *, const char *, char **);
87
88 #define DEFAULT_NRECS 10000
89 #define DEFAULT_HZ 97
90 #define MAX_HZ 1000
91 #define MIN_AGGSIZE (16 * 1024)
92 #define MAX_AGGSIZE (32 * 1024 * 1024)
93
94 static int g_stkdepth;
95 static int g_topn = INT_MAX;
96 static hrtime_t g_elapsed;
97 static int g_rates = 0;
98 static int g_pflag = 0;
99 static int g_Pflag = 0;
100 static int g_wflag = 0;
101 static int g_Wflag = 0;
102 static int g_cflag = 0;
103 static int g_kflag = 0;
104 static int g_gflag = 0;
105 static int g_Vflag = 0;
106 static int g_tracing = 0;
107 static size_t g_recsize;
108 static size_t g_nrecs;
109 static int g_nrecs_used;
110 static uchar_t g_enabled[LS_MAX_EVENTS];
111 static hrtime_t g_min_duration[LS_MAX_EVENTS];
112 static dtrace_hdl_t *g_dtp;
113 static char *g_predicate;
114 static char *g_ipredicate;
115 static char *g_prog;
116 static int g_proglen;
117 static int g_dropped;
118
119 typedef struct ls_event_info {
120 char ev_type;
121 char ev_lhdr[20];
122 char ev_desc[80];
123 char ev_units[10];
124 char ev_name[DTRACE_NAMELEN];
125 char *ev_predicate;
126 char *ev_acquire;
127 } ls_event_info_t;
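
/*
 * ev_name is the underlying DTrace probe; ev_predicate is an optional D
 * predicate that narrows the probe to this event (e.g. to split rw-block
 * by reader/writer); ev_acquire, when set, names the matching acquire
 * probe used to timestamp hold events.
 */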
128
129 static ls_event_info_t g_event_info[LS_MAX_EVENTS] = {
130 { 'C', "Lock", "Adaptive mutex spin", "nsec",
131 "lockstat:::adaptive-spin" },
132 { 'C', "Lock", "Adaptive mutex block", "nsec",
133 "lockstat:::adaptive-block" },
134 { 'C', "Lock", "Spin lock spin", "nsec",
135 "lockstat:::spin-spin" },
136 { 'C', "Lock", "Thread lock spin", "nsec",
137 "lockstat:::thread-spin" },
138 { 'C', "Lock", "R/W writer blocked by writer", "nsec",
139 "lockstat:::rw-block", "arg2 == 0 && arg3 == 1" },
140 { 'C', "Lock", "R/W writer blocked by readers", "nsec",
141 "lockstat:::rw-block", "arg2 == 0 && arg3 == 0 && arg4" },
142 { 'C', "Lock", "R/W reader blocked by writer", "nsec",
143 "lockstat:::rw-block", "arg2 != 0 && arg3 == 1" },
144 { 'C', "Lock", "R/W reader blocked by write wanted", "nsec",
145 "lockstat:::rw-block", "arg2 != 0 && arg3 == 0 && arg4" },
146 { 'C', "Lock", "Unknown event (type 8)", "units" },
147 { 'C', "Lock", "Unknown event (type 9)", "units" },
148 { 'C', "Lock", "Unknown event (type 10)", "units" },
149 { 'C', "Lock", "Unknown event (type 11)", "units" },
150 { 'C', "Lock", "Unknown event (type 12)", "units" },
151 { 'C', "Lock", "Unknown event (type 13)", "units" },
152 { 'C', "Lock", "Unknown event (type 14)", "units" },
153 { 'C', "Lock", "Unknown event (type 15)", "units" },
154 { 'C', "Lock", "Unknown event (type 16)", "units" },
155 { 'C', "Lock", "Unknown event (type 17)", "units" },
156 { 'C', "Lock", "Unknown event (type 18)", "units" },
157 { 'C', "Lock", "Unknown event (type 19)", "units" },
158 { 'C', "Lock", "Unknown event (type 20)", "units" },
159 { 'C', "Lock", "Unknown event (type 21)", "units" },
160 { 'C', "Lock", "Unknown event (type 22)", "units" },
161 { 'C', "Lock", "Unknown event (type 23)", "units" },
162 { 'C', "Lock", "Unknown event (type 24)", "units" },
163 { 'C', "Lock", "Unknown event (type 25)", "units" },
164 { 'C', "Lock", "Unknown event (type 26)", "units" },
165 { 'C', "Lock", "Unknown event (type 27)", "units" },
166 { 'C', "Lock", "Unknown event (type 28)", "units" },
167 { 'C', "Lock", "Unknown event (type 29)", "units" },
168 { 'C', "Lock", "Unknown event (type 30)", "units" },
169 { 'C', "Lock", "Unknown event (type 31)", "units" },
170 { 'H', "Lock", "Adaptive mutex hold", "nsec",
171 "lockstat:::adaptive-release", NULL,
172 "lockstat:::adaptive-acquire" },
173 { 'H', "Lock", "Spin lock hold", "nsec",
174 "lockstat:::spin-release", NULL,
175 "lockstat:::spin-acquire" },
176 { 'H', "Lock", "R/W writer hold", "nsec",
177 "lockstat:::rw-release", "arg1 == 0",
178 "lockstat:::rw-acquire" },
179 { 'H', "Lock", "R/W reader hold", "nsec",
180 "lockstat:::rw-release", "arg1 != 0",
181 "lockstat:::rw-acquire" },
182 { 'H', "Lock", "Unknown event (type 36)", "units" },
183 { 'H', "Lock", "Unknown event (type 37)", "units" },
184 { 'H', "Lock", "Unknown event (type 38)", "units" },
185 { 'H', "Lock", "Unknown event (type 39)", "units" },
186 { 'H', "Lock", "Unknown event (type 40)", "units" },
187 { 'H', "Lock", "Unknown event (type 41)", "units" },
188 { 'H', "Lock", "Unknown event (type 42)", "units" },
189 { 'H', "Lock", "Unknown event (type 43)", "units" },
190 { 'H', "Lock", "Unknown event (type 44)", "units" },
191 { 'H', "Lock", "Unknown event (type 45)", "units" },
192 { 'H', "Lock", "Unknown event (type 46)", "units" },
193 { 'H', "Lock", "Unknown event (type 47)", "units" },
194 { 'H', "Lock", "Unknown event (type 48)", "units" },
195 { 'H', "Lock", "Unknown event (type 49)", "units" },
196 { 'H', "Lock", "Unknown event (type 50)", "units" },
197 { 'H', "Lock", "Unknown event (type 51)", "units" },
198 { 'H', "Lock", "Unknown event (type 52)", "units" },
199 { 'H', "Lock", "Unknown event (type 53)", "units" },
200 { 'H', "Lock", "Unknown event (type 54)", "units" },
201 { 'H', "Lock", "Unknown event (type 55)", "units" },
202 { 'I', "CPU+PIL", "Profiling interrupt", "nsec",
203 "profile:::profile-97", NULL },
204 { 'I', "Lock", "Unknown event (type 57)", "units" },
205 { 'I', "Lock", "Unknown event (type 58)", "units" },
206 { 'I', "Lock", "Unknown event (type 59)", "units" },
207 { 'E', "Lock", "Recursive lock entry detected", "(N/A)",
208 "lockstat:::rw-release", NULL, "lockstat:::rw-acquire" },
209 { 'E', "Lock", "Lockstat enter failure", "(N/A)" },
210 { 'E', "Lock", "Lockstat exit failure", "nsec" },
211 { 'E', "Lock", "Lockstat record failure", "(N/A)" },
212 };
213
214 static void
215 fail(int do_perror, const char *message, ...)
216 {
217 va_list args;
218 int save_errno = errno;
219
220 va_start(args, message);
221 (void) fprintf(stderr, "lockstat: ");
222 (void) vfprintf(stderr, message, args);
223 va_end(args);
224 if (do_perror)
225 (void) fprintf(stderr, ": %s", strerror(save_errno));
226 (void) fprintf(stderr, "\n");
227 exit(2);
228 }
229
230 static void
231 dfail(const char *message, ...)
232 {
233 va_list args;
234
235 va_start(args, message);
236 (void) fprintf(stderr, "lockstat: ");
237 (void) vfprintf(stderr, message, args);
238 va_end(args);
239 (void) fprintf(stderr, ": %s\n",
240 dtrace_errmsg(g_dtp, dtrace_errno(g_dtp)));
241
242 exit(2);
243 }
244
245 static void
246 show_events(char event_type, char *desc)
247 {
248 int i, first = -1, last;
249
250 for (i = 0; i < LS_MAX_EVENTS; i++) {
251 ls_event_info_t *evp = &g_event_info[i];
252 if (evp->ev_type != event_type ||
253 strncmp(evp->ev_desc, "Unknown event", 13) == 0)
254 continue;
255 if (first == -1)
256 first = i;
257 last = i;
258 }
259
260 (void) fprintf(stderr,
261 "\n%s events (lockstat -%c or lockstat -e %d-%d):\n\n",
262 desc, event_type, first, last);
263
264 for (i = first; i <= last; i++)
265 (void) fprintf(stderr,
266 "%4d = %s\n", i, g_event_info[i].ev_desc);
267 }
268
269 static void
270 usage(void)
271 {
272 (void) fprintf(stderr,
273 "Usage: lockstat [options] command [args]\n"
274 "\nEvent selection options:\n\n"
275 " -C watch contention events [on by default]\n"
276 " -E watch error events [off by default]\n"
277 " -H watch hold events [off by default]\n"
278 " -I watch interrupt events [off by default]\n"
279 " -A watch all lock events [equivalent to -CH]\n"
280 " -e event_list only watch the specified events (shown below);\n"
281 " <event_list> is a comma-separated list of\n"
282 " events or ranges of events, e.g. 1,4-7,35\n"
283 " -i rate interrupt rate for -I [default: %d Hz]\n"
284 "\nData gathering options:\n\n"
285 " -b basic statistics (lock, caller, event count)\n"
286 " -t timing for all events [default]\n"
287 " -h histograms for event times\n"
288 " -s depth stack traces <depth> deep\n"
289 " -x opt[=val] enable or modify DTrace options\n"
290 "\nData filtering options:\n\n"
291 " -n nrecords maximum number of data records [default: %d]\n"
292 " -l lock[,size] only watch <lock>, which can be specified as a\n"
293 " symbolic name or hex address; <size> defaults\n"
294 " to the ELF symbol size if available, 1 if not\n"
295 " -f func[,size] only watch events generated by <func>\n"
296 " -d duration only watch events longer than <duration>\n"
297 " -T trace (rather than sample) events\n"
298 "\nData reporting options:\n\n"
299 " -c coalesce lock data for arrays like pse_mutex[]\n"
300 " -k coalesce PCs within functions\n"
301 " -g show total events generated by function\n"
302 " -w wherever: don't distinguish events by caller\n"
303 " -W whichever: don't distinguish events by lock\n"
304 " -R display rates rather than counts\n"
305 " -p parsable output format (awk(1)-friendly)\n"
306 " -P sort lock data by (count * avg_time) product\n"
307 " -D n only display top <n> events of each type\n"
308 " -o filename send output to <filename>\n",
309 DEFAULT_HZ, DEFAULT_NRECS);
310
311 show_events('C', "Contention");
312 show_events('H', "Hold-time");
313 show_events('I', "Interrupt");
314 show_events('E', "Error");
315 (void) fprintf(stderr, "\n");
316
317 exit(1);
318 }
319
320 static int
321 lockcmp(lsrec_t *a, lsrec_t *b)
322 {
323 int i;
324
325 if (a->ls_event < b->ls_event)
326 return (-1);
327 if (a->ls_event > b->ls_event)
328 return (1);
329
330 for (i = g_stkdepth - 1; i >= 0; i--) {
331 if (a->ls_stack[i] < b->ls_stack[i])
332 return (-1);
333 if (a->ls_stack[i] > b->ls_stack[i])
334 return (1);
335 }
336
337 if (a->ls_caller < b->ls_caller)
338 return (-1);
339 if (a->ls_caller > b->ls_caller)
340 return (1);
341
342 if (a->ls_lock < b->ls_lock)
343 return (-1);
344 if (a->ls_lock > b->ls_lock)
345 return (1);
346
347 return (0);
348 }
349
350 static int
351 countcmp(lsrec_t *a, lsrec_t *b)
352 {
353 if (a->ls_event < b->ls_event)
354 return (-1);
355 if (a->ls_event > b->ls_event)
356 return (1);
357
358 return (b->ls_count - a->ls_count);
359 }
360
361 static int
362 timecmp(lsrec_t *a, lsrec_t *b)
363 {
364 if (a->ls_event < b->ls_event)
365 return (-1);
366 if (a->ls_event > b->ls_event)
367 return (1);
368
369 if (a->ls_time < b->ls_time)
370 return (1);
371 if (a->ls_time > b->ls_time)
372 return (-1);
373
374 return (0);
375 }
376
377 static int
378 lockcmp_anywhere(lsrec_t *a, lsrec_t *b)
379 {
380 if (a->ls_event < b->ls_event)
381 return (-1);
382 if (a->ls_event > b->ls_event)
383 return (1);
384
385 if (a->ls_lock < b->ls_lock)
386 return (-1);
387 if (a->ls_lock > b->ls_lock)
388 return (1);
389
390 return (0);
391 }
392
393 static int
394 lock_and_count_cmp_anywhere(lsrec_t *a, lsrec_t *b)
395 {
396 if (a->ls_event < b->ls_event)
397 return (-1);
398 if (a->ls_event > b->ls_event)
399 return (1);
400
401 if (a->ls_lock < b->ls_lock)
402 return (-1);
403 if (a->ls_lock > b->ls_lock)
404 return (1);
405
406 return (b->ls_count - a->ls_count);
407 }
408
409 static int
410 sitecmp_anylock(lsrec_t *a, lsrec_t *b)
411 {
412 int i;
413
414 if (a->ls_event < b->ls_event)
415 return (-1);
416 if (a->ls_event > b->ls_event)
417 return (1);
418
419 for (i = g_stkdepth - 1; i >= 0; i--) {
420 if (a->ls_stack[i] < b->ls_stack[i])
421 return (-1);
422 if (a->ls_stack[i] > b->ls_stack[i])
423 return (1);
424 }
425
426 if (a->ls_caller < b->ls_caller)
427 return (-1);
428 if (a->ls_caller > b->ls_caller)
429 return (1);
430
431 return (0);
432 }
433
434 static int
435 site_and_count_cmp_anylock(lsrec_t *a, lsrec_t *b)
436 {
437 int i;
438
439 if (a->ls_event < b->ls_event)
440 return (-1);
441 if (a->ls_event > b->ls_event)
442 return (1);
443
444 for (i = g_stkdepth - 1; i >= 0; i--) {
445 if (a->ls_stack[i] < b->ls_stack[i])
446 return (-1);
447 if (a->ls_stack[i] > b->ls_stack[i])
448 return (1);
449 }
450
451 if (a->ls_caller < b->ls_caller)
452 return (-1);
453 if (a->ls_caller > b->ls_caller)
454 return (1);
455
456 return (b->ls_count - a->ls_count);
457 }
458
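/*
 * Simple top-down merge sort of the record pointer array; 'b' is scratch
 * space of equal size, and the comparison function is chosen by the caller.
 */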
459 static void
460 mergesort(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **a, lsrec_t **b, int n)
461 {
462 int m = n / 2;
463 int i, j;
464
465 if (m > 1)
466 mergesort(cmp, a, b, m);
467 if (n - m > 1)
468 mergesort(cmp, a + m, b + m, n - m);
469 for (i = m; i > 0; i--)
470 b[i - 1] = a[i - 1];
471 for (j = m - 1; j < n - 1; j++)
472 b[n + m - j - 2] = a[j + 1];
473 while (i < j)
474 *a++ = cmp(b[i], b[j]) < 0 ? b[i++] : b[j--];
475 *a = b[i];
476 }
477
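/*
 * Walk the sorted record array and fold records that compare equal into the
 * first one seen, summing counts, times and histograms; absorbed records
 * have ls_event set to LS_MAX_EVENTS so later passes ignore them.
 */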
478 static void
479 coalesce(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **lock, int n)
480 {
481 int i, j;
482 lsrec_t *target, *current;
483
484 target = lock[0];
485
486 for (i = 1; i < n; i++) {
487 current = lock[i];
488 if (cmp(current, target) != 0) {
489 target = current;
490 continue;
491 }
492 current->ls_event = LS_MAX_EVENTS;
493 target->ls_count += current->ls_count;
494 target->ls_refcnt += current->ls_refcnt;
495 if (g_recsize < LS_TIME)
496 continue;
497 target->ls_time += current->ls_time;
498 if (g_recsize < LS_HIST)
499 continue;
500 for (j = 0; j < 64; j++)
501 target->ls_hist[j] += current->ls_hist[j];
502 }
503 }
504
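/*
 * If the address falls within a known symbol, replace it with the symbol's
 * base address so that all hits within one function (or one lock structure)
 * coalesce into a single record.
 */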
505 static void
506 coalesce_symbol(uintptr_t *addrp)
507 {
508 uintptr_t symoff;
509 size_t symsize;
510
511 if (addr_to_sym(*addrp, &symoff, &symsize) != NULL && symoff < symsize)
512 *addrp -= symoff;
513 }
514
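/*
 * Append a clause to the predicate string, ANDing it with whatever is
 * already there; when 'cmp' is non-NULL the clause is "what cmp value",
 * otherwise 'what' is used verbatim.
 */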
515 static void
516 predicate_add(char **pred, char *what, char *cmp, uintptr_t value)
517 {
518 char *new;
519 int len, newlen;
520
521 if (what == NULL)
522 return;
523
524 if (*pred == NULL) {
525 *pred = malloc(1);
526 *pred[0] = '\0';
527 }
528
529 len = strlen(*pred);
530 newlen = len + strlen(what) + 32 + strlen("( && )");
531 new = malloc(newlen);
532
533 if (*pred[0] != '\0') {
534 if (cmp != NULL) {
535 (void) sprintf(new, "(%s) && (%s %s 0x%p)",
536 *pred, what, cmp, (void *)value);
537 } else {
538 (void) sprintf(new, "(%s) && (%s)", *pred, what);
539 }
540 } else {
541 if (cmp != NULL) {
542 (void) sprintf(new, "%s %s 0x%p",
543 what, cmp, (void *)value);
544 } else {
545 (void) sprintf(new, "%s", what);
546 }
547 }
548
549 free(*pred);
550 *pred = new;
551 }
552
553 static void
554 predicate_destroy(char **pred)
555 {
556 free(*pred);
557 *pred = NULL;
558 }
559
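/*
 * Append an address-range test of the form "(what >= base && what < base +
 * size)" to the filter, ORing it with any existing terms.
 */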
560 static void
561 filter_add(char **filt, char *what, uintptr_t base, uintptr_t size)
562 {
563 char buf[256], *c = buf, *new;
564 int len, newlen;
565
566 if (*filt == NULL) {
567 *filt = malloc(1);
568 *filt[0] = '\0';
569 }
570
571 (void) sprintf(c, "%s(%s >= 0x%p && %s < 0x%p)", *filt[0] != '\0' ?
572 " || " : "", what, (void *)base, what, (void *)(base + size));
573
574 newlen = (len = strlen(*filt) + 1) + strlen(c);
575 new = malloc(newlen);
576 bcopy(*filt, new, len);
577 (void) strcat(new, c);
578 free(*filt);
579 *filt = new;
580 }
581
582 static void
583 filter_destroy(char **filt)
584 {
585 free(*filt);
586 *filt = NULL;
587 }
588
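/*
 * Append formatted text to the D program, growing the buffer as needed and
 * overwriting the previous terminating NUL so that clauses concatenate
 * cleanly.
 */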
589 static void
590 dprog_add(const char *fmt, ...)
591 {
592 va_list args;
593 int size, offs;
594 char c;
595
596 va_start(args, fmt);
597 size = vsnprintf(&c, 1, fmt, args) + 1;
598
599 if (g_proglen == 0) {
600 offs = 0;
601 } else {
602 offs = g_proglen - 1;
603 }
604
605 g_proglen = offs + size;
606
607 if ((g_prog = realloc(g_prog, g_proglen)) == NULL)
608 fail(1, "failed to reallocate program text");
609
610 (void) vsnprintf(&g_prog[offs], size, fmt, args);
611 }
612
613 /*
614 * This function may read like an open sewer, but keep in mind that programs
615 * that generate other programs are rarely pretty. If one has the unenviable
616 * task of maintaining or -- worse -- extending this code, use the -V option
617 * to examine the D program as generated by this function.
618 */
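/*
 * As a rough sketch, a default run enabling the adaptive-spin event with
 * timing data generates a clause of roughly this form:
 *
 *	lockstat:::adaptive-spin
 *	{
 *		@avg[0ULL, (uintptr_t)arg0, caller] = avg(arg1);
 *	}
 *
 * Hold events are bracketed by an additional acquire clause that stores
 * `timestamp' in a thread-local variable keyed by the lock address, which
 * the release clause then subtracts from the current timestamp before
 * aggregating.
 */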
619 static void
620 dprog_addevent(int event)
621 {
622 ls_event_info_t *info = &g_event_info[event];
623 char *pred = NULL;
624 char stack[20];
625 const char *arg0, *caller;
626 char *arg1 = "arg1";
627 char buf[80];
628 hrtime_t dur;
629 int depth;
630
631 if (info->ev_name[0] == '\0')
632 return;
633
634 if (info->ev_type == 'I') {
635 /*
636 * For interrupt events, arg0 (normally the lock pointer) is
637 * the CPU address plus the current pil, and arg1 (normally
638 * the number of nanoseconds) is the number of nanoseconds
639 * late -- and it's stored in arg2.
640 */
641 arg0 = "(uintptr_t)curthread->t_cpu + \n"
642 "\t curthread->t_cpu->cpu_profile_pil";
643 caller = "(uintptr_t)arg0";
644 arg1 = "arg2";
645 } else {
646 arg0 = "(uintptr_t)arg0";
647 caller = "caller";
648 }
649
650 if (g_recsize > LS_HIST) {
651 for (depth = 0; g_recsize > LS_STACK(depth); depth++)
652 continue;
653
654 if (g_tracing) {
655 (void) sprintf(stack, "\tstack(%d);\n", depth);
656 } else {
657 (void) sprintf(stack, ", stack(%d)", depth);
658 }
659 } else {
660 (void) sprintf(stack, "");
661 }
662
663 if (info->ev_acquire != NULL) {
664 /*
665 * If this is a hold event, we need to generate an additional
666 * clause for the acquire; the clause for the release will be
667 * generated with the aggregating statement, below.
668 */
669 dprog_add("%s\n", info->ev_acquire);
670 predicate_add(&pred, info->ev_predicate, NULL, 0);
671 predicate_add(&pred, g_predicate, NULL, 0);
672 if (pred != NULL)
673 dprog_add("/%s/\n", pred);
674
675 dprog_add("{\n");
676 (void) sprintf(buf, "self->ev%d[(uintptr_t)arg0]", event);
677
678 if (info->ev_type == 'H') {
679 dprog_add("\t%s = timestamp;\n", buf);
680 } else {
681 /*
682 * If this isn't a hold event, it's the recursive
683 * error event. For this, we simply bump the
684 * thread-local, per-lock count.
685 */
686 dprog_add("\t%s++;\n", buf);
687 }
688
689 dprog_add("}\n\n");
690 predicate_destroy(&pred);
691 pred = NULL;
692
693 if (info->ev_type == 'E') {
694 /*
695 * If this is the recursive lock error event, we need
696 * to generate an additional clause to decrement the
697 * thread-local, per-lock count. This assures that we
698 * only execute the aggregating clause if we have
699 * recursive entry.
700 */
701 dprog_add("%s\n", info->ev_name);
702 dprog_add("/%s/\n{\n\t%s--;\n}\n\n", buf, buf);
703 }
704
705 predicate_add(&pred, buf, NULL, 0);
706
707 if (info->ev_type == 'H') {
708 (void) sprintf(buf, "timestamp -\n\t "
709 "self->ev%d[(uintptr_t)arg0]", event);
710 }
711
712 arg1 = buf;
713 } else {
714 predicate_add(&pred, info->ev_predicate, NULL, 0);
715 if (info->ev_type != 'I')
716 predicate_add(&pred, g_predicate, NULL, 0);
717 else
718 predicate_add(&pred, g_ipredicate, NULL, 0);
719 }
720
721 if ((dur = g_min_duration[event]) != 0)
722 predicate_add(&pred, arg1, ">=", dur);
723
724 dprog_add("%s\n", info->ev_name);
725
726 if (pred != NULL)
727 dprog_add("/%s/\n", pred);
728 predicate_destroy(&pred);
729
730 dprog_add("{\n");
731
732 if (g_tracing) {
733 dprog_add("\ttrace(%dULL);\n", event);
734 dprog_add("\ttrace(%s);\n", arg0);
735 dprog_add("\ttrace(%s);\n", caller);
736 dprog_add(stack);
737 } else {
738 /*
739 * The ordering here is important: when we process the
740 * aggregate, we count on the fact that @avg appears before
741 * @hist in program order to assure that @avg is assigned the
742 * first aggregation variable ID and @hist assigned the
743 * second; see the comment in process_aggregate() for details.
744 */
745 dprog_add("\t@avg[%dULL, %s, %s%s] = avg(%s);\n",
746 event, arg0, caller, stack, arg1);
747
748 if (g_recsize >= LS_HIST) {
749 dprog_add("\t@hist[%dULL, %s, %s%s] = quantize"
750 "(%s);\n", event, arg0, caller, stack, arg1);
751 }
752 }
753
754 if (info->ev_acquire != NULL)
755 dprog_add("\tself->ev%d[arg0] = 0;\n", event);
756
757 dprog_add("}\n\n");
758 }
759
760 static void
761 dprog_compile()
762 {
763 dtrace_prog_t *prog;
764 dtrace_proginfo_t info;
765
766 if (g_Vflag) {
767 (void) fprintf(stderr, "lockstat: vvvv D program vvvv\n");
768 (void) fputs(g_prog, stderr);
769 (void) fprintf(stderr, "lockstat: ^^^^ D program ^^^^\n");
770 }
771
772 if ((prog = dtrace_program_strcompile(g_dtp, g_prog,
773 DTRACE_PROBESPEC_NAME, 0, 0, NULL)) == NULL)
774 dfail("failed to compile program");
775
776 if (dtrace_program_exec(g_dtp, prog, &info) == -1)
777 dfail("failed to enable probes");
778
779 if (dtrace_go(g_dtp) != 0)
780 dfail("couldn't start tracing");
781 }
782
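/*
 * Empty SIGUSR1 handler: the periodic timer set up in status_init() exists
 * only to interrupt waitpid() in main() so that status_check() runs at
 * roughly the status/aggregation rate.
 */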
783 static void
784 status_fire(int signal __unused)
785 {
786 }
787
788 static void
789 status_init(void)
790 {
791 dtrace_optval_t val, status, agg;
792 struct sigaction act;
793 struct itimerspec ts;
794 struct sigevent ev;
795 timer_t tid;
796
797 if (dtrace_getopt(g_dtp, "statusrate", &status) == -1)
798 dfail("failed to get 'statusrate'");
799
800 if (dtrace_getopt(g_dtp, "aggrate", &agg) == -1)
801 dfail("failed to get 'statusrate'");
802
803 /*
804 * We would want to awaken at a rate that is the GCD of the statusrate
805 * and the aggrate -- but that seems a bit absurd. Instead, we'll
806 * simply awaken at a rate that is the more frequent of the two, which
807 * assures that we're never later than the interval implied by the
808 * more frequent rate.
809 */
810 val = status < agg ? status : agg;
811
812 (void) sigemptyset(&act.sa_mask);
813 act.sa_flags = 0;
814 act.sa_handler = status_fire;
815 (void) sigaction(SIGUSR1, &act, NULL);
816
817 ev.sigev_notify = SIGEV_SIGNAL;
818 ev.sigev_signo = SIGUSR1;
819
820 if (timer_create(CLOCK_REALTIME, &ev, &tid) == -1)
821 dfail("cannot create CLOCK_REALTIME timer");
822
823 ts.it_value.tv_sec = val / NANOSEC;
824 ts.it_value.tv_nsec = val % NANOSEC;
825 ts.it_interval = ts.it_value;
826
827 if (timer_settime(tid, TIMER_RELTIME, &ts, NULL) == -1)
828 dfail("cannot set time on CLOCK_REALTIME timer");
829 }
830
831 static void
832 status_check(void)
833 {
834 if (!g_tracing && dtrace_aggregate_snap(g_dtp) != 0)
835 dfail("failed to snap aggregate");
836
837 if (dtrace_status(g_dtp) == -1)
838 dfail("dtrace_status()");
839 }
840
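/*
 * Unpack one group of DTrace records into an lsrec_t: the first three values
 * are the event ID, lock address, and caller, followed by an optional
 * stack() record (whose first frame is skipped) when stack traces were
 * requested.
 */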
841 static void
842 lsrec_fill(lsrec_t *lsrec, const dtrace_recdesc_t *rec, int nrecs, caddr_t data)
843 {
844 bzero(lsrec, g_recsize);
845 lsrec->ls_count = 1;
846
847 if ((g_recsize > LS_HIST && nrecs < 4) || (nrecs < 3))
848 fail(0, "truncated DTrace record");
849
850 if (rec->dtrd_size != sizeof (uint64_t))
851 fail(0, "bad event size in first record");
852
853 /* LINTED - alignment */
854 lsrec->ls_event = (uint32_t)*((uint64_t *)(data + rec->dtrd_offset));
855 rec++;
856
857 if (rec->dtrd_size != sizeof (uintptr_t))
858 fail(0, "bad lock address size in second record");
859
860 /* LINTED - alignment */
861 lsrec->ls_lock = *((uintptr_t *)(data + rec->dtrd_offset));
862 rec++;
863
864 if (rec->dtrd_size != sizeof (uintptr_t))
865 fail(0, "bad caller size in third record");
866
867 /* LINTED - alignment */
868 lsrec->ls_caller = *((uintptr_t *)(data + rec->dtrd_offset));
869 rec++;
870
871 if (g_recsize > LS_HIST) {
872 int frames, i;
873 pc_t *stack;
874
875 frames = rec->dtrd_size / sizeof (pc_t);
876 /* LINTED - alignment */
877 stack = (pc_t *)(data + rec->dtrd_offset);
878
879 for (i = 1; i < frames; i++)
880 lsrec->ls_stack[i - 1] = stack[i];
881 }
882 }
883
884 /*ARGSUSED*/
885 static int
886 count_aggregate(const dtrace_aggdata_t *agg, void *arg)
887 {
888 *((size_t *)arg) += 1;
889
890 return (DTRACE_AGGWALK_NEXT);
891 }
892
893 static int
894 process_aggregate(const dtrace_aggdata_t *agg, void *arg)
895 {
896 const dtrace_aggdesc_t *aggdesc = agg->dtada_desc;
897 caddr_t data = agg->dtada_data;
898 lsdata_t *lsdata = arg;
899 lsrec_t *lsrec = lsdata->lsd_next;
900 const dtrace_recdesc_t *rec;
901 uint64_t *avg, *quantized;
902 int i, j;
903
904 assert(lsdata->lsd_count < g_nrecs);
905
906 /*
907 * Aggregation variable IDs are guaranteed to be generated in program
908 * order, and they are guaranteed to start from DTRACE_AGGVARIDNONE
909 * plus one. As "avg" appears before "hist" in program order, we know
910 * that "avg" will be allocated the first aggregation variable ID, and
911 * "hist" will be allocated the second aggregation variable ID -- and
912 * we therefore use the aggregation variable ID to differentiate the
913 * cases.
914 */
915 if (aggdesc->dtagd_varid > DTRACE_AGGVARIDNONE + 1) {
916 /*
917 * If this is the histogram entry, we'll copy the quantized
918 * data into ls_hist and jump over the rest.
919 */
920 rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
921
922 if (aggdesc->dtagd_varid != DTRACE_AGGVARIDNONE + 2)
923 fail(0, "bad variable ID in aggregation record");
924
925 if (rec->dtrd_size !=
926 DTRACE_QUANTIZE_NBUCKETS * sizeof (uint64_t))
927 fail(0, "bad quantize size in aggregation record");
928
929 /* LINTED - alignment */
930 quantized = (uint64_t *)(data + rec->dtrd_offset);
931
932 for (i = DTRACE_QUANTIZE_ZEROBUCKET, j = 0;
933 i < DTRACE_QUANTIZE_NBUCKETS; i++, j++)
934 lsrec->ls_hist[j] = quantized[i];
935
936 goto out;
937 }
938
939 lsrec_fill(lsrec, &aggdesc->dtagd_rec[1],
940 aggdesc->dtagd_nrecs - 1, data);
941
942 rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
943
944 if (rec->dtrd_size != 2 * sizeof (uint64_t))
945 fail(0, "bad avg size in aggregation record");
946
947 /* LINTED - alignment */
948 avg = (uint64_t *)(data + rec->dtrd_offset);
949 lsrec->ls_count = (uint32_t)avg[0];
950 lsrec->ls_time = (uintptr_t)avg[1];
951
952 if (g_recsize >= LS_HIST)
953 return (DTRACE_AGGWALK_NEXT);
954
955 out:
956 lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize);
957 lsdata->lsd_count++;
958
959 return (DTRACE_AGGWALK_NEXT);
960 }
961
962 static int
963 process_trace(const dtrace_probedata_t *pdata, void *arg)
964 {
965 lsdata_t *lsdata = arg;
966 lsrec_t *lsrec = lsdata->lsd_next;
967 dtrace_eprobedesc_t *edesc = pdata->dtpda_edesc;
968 caddr_t data = pdata->dtpda_data;
969
970 if (lsdata->lsd_count >= g_nrecs)
971 return (DTRACE_CONSUME_NEXT);
972
973 lsrec_fill(lsrec, edesc->dtepd_rec, edesc->dtepd_nrecs, data);
974
975 lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize);
976 lsdata->lsd_count++;
977
978 return (DTRACE_CONSUME_NEXT);
979 }
980
981 static int
982 process_data(FILE *out, char *data)
983 {
984 lsdata_t lsdata;
985
986 /* LINTED - alignment */
987 lsdata.lsd_next = (lsrec_t *)data;
988 lsdata.lsd_count = 0;
989
990 if (g_tracing) {
991 if (dtrace_consume(g_dtp, out,
992 process_trace, NULL, &lsdata) != 0)
993 dfail("failed to consume buffer");
994
995 return (lsdata.lsd_count);
996 }
997
998 if (dtrace_aggregate_walk_keyvarsorted(g_dtp,
999 process_aggregate, &lsdata) != 0)
1000 dfail("failed to walk aggregate");
1001
1002 return (lsdata.lsd_count);
1003 }
1004
1005 /*ARGSUSED*/
1006 static int
1007 drophandler(const dtrace_dropdata_t *data, void *arg)
1008 {
1009 g_dropped++;
1010 (void) fprintf(stderr, "lockstat: warning: %s", data->dtdda_msg);
1011 return (DTRACE_HANDLE_OK);
1012 }
1013
1014 int
1015 main(int argc, char **argv)
1016 {
1017 char *data_buf;
1018 lsrec_t *lsp, **current, **first, **sort_buf, **merge_buf;
1019 FILE *out = stdout;
1020 int c;
1021 pid_t child;
1022 int status;
1023 int i, j;
1024 hrtime_t duration;
1025 char *addrp, *offp, *sizep, *evp, *lastp, *p;
1026 uintptr_t addr;
1027 size_t size, off;
1028 int events_specified = 0;
1029 int exec_errno = 0;
1030 uint32_t event;
1031 char *filt = NULL, *ifilt = NULL;
1032 static uint64_t ev_count[LS_MAX_EVENTS + 1];
1033 static uint64_t ev_time[LS_MAX_EVENTS + 1];
1034 dtrace_optval_t aggsize;
1035 char aggstr[10];
1036 long ncpus;
1037 int dynvar = 0;
1038 int err;
1039
1040 if ((g_dtp = dtrace_open(DTRACE_VERSION, 0, &err)) == NULL) {
1041 fail(0, "cannot open dtrace library: %s",
1042 dtrace_errmsg(NULL, err));
1043 }
1044
1045 if (dtrace_handle_drop(g_dtp, &drophandler, NULL) == -1)
1046 dfail("couldn't establish drop handler");
1047
1048 if (symtab_init() == -1)
1049 fail(1, "can't load kernel symbols");
1050
1051 g_nrecs = DEFAULT_NRECS;
1052
1053 while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != EOF) {
1054 switch (c) {
1055 case 'b':
1056 g_recsize = LS_BASIC;
1057 break;
1058
1059 case 't':
1060 g_recsize = LS_TIME;
1061 break;
1062
1063 case 'h':
1064 g_recsize = LS_HIST;
1065 break;
1066
1067 case 's':
1068 if (!isdigit(optarg[0]))
1069 usage();
1070 g_stkdepth = atoi(optarg);
1071 if (g_stkdepth > LS_MAX_STACK_DEPTH)
1072 fail(0, "max stack depth is %d",
1073 LS_MAX_STACK_DEPTH);
1074 g_recsize = LS_STACK(g_stkdepth);
1075 break;
1076
1077 case 'n':
1078 if (!isdigit(optarg[0]))
1079 usage();
1080 g_nrecs = atoi(optarg);
1081 break;
1082
1083 case 'd':
1084 if (!isdigit(optarg[0]))
1085 usage();
1086 duration = atoll(optarg);
1087
1088 /*
1089 * XXX -- durations really should be per event
1090 * since the units are different, but it's hard
1091 * to express this nicely in the interface.
1092 * Not clear yet what the cleanest solution is.
1093 */
1094 for (i = 0; i < LS_MAX_EVENTS; i++)
1095 if (g_event_info[i].ev_type != 'E')
1096 g_min_duration[i] = duration;
1097
1098 break;
1099
1100 case 'i':
1101 if (!isdigit(optarg[0]))
1102 usage();
1103 i = atoi(optarg);
1104 if (i <= 0)
1105 usage();
1106 if (i > MAX_HZ)
1107 fail(0, "max interrupt rate is %d Hz", MAX_HZ);
1108
1109 for (j = 0; j < LS_MAX_EVENTS; j++)
1110 if (strcmp(g_event_info[j].ev_desc,
1111 "Profiling interrupt") == 0)
1112 break;
1113
1114 (void) sprintf(g_event_info[j].ev_name,
1115 "profile:::profile-%d", i);
1116 break;
1117
1118 case 'l':
1119 case 'f':
1120 addrp = strtok(optarg, ",");
1121 sizep = strtok(NULL, ",");
1122 addrp = strtok(optarg, ",+");
1123 offp = strtok(NULL, ",");
1124
1125 size = sizep ? strtoul(sizep, NULL, 0) : 1;
1126 off = offp ? strtoul(offp, NULL, 0) : 0;
1127
1128 if (addrp[0] == '0') {
1129 addr = strtoul(addrp, NULL, 16) + off;
1130 } else {
1131 addr = sym_to_addr(addrp) + off;
1132 if (sizep == NULL)
1133 size = sym_size(addrp) - off;
1134 if (addr - off == 0)
1135 fail(0, "symbol '%s' not found", addrp);
1136 if (size == 0)
1137 size = 1;
1138 }
1139
1140
1141 if (c == 'l') {
1142 filter_add(&filt, "arg0", addr, size);
1143 } else {
1144 filter_add(&filt, "caller", addr, size);
1145 filter_add(&ifilt, "arg0", addr, size);
1146 }
1147 break;
1148
1149 case 'e':
1150 evp = strtok_r(optarg, ",", &lastp);
1151 while (evp) {
1152 int ev1, ev2;
1153 char *evp2;
1154
1155 (void) strtok(evp, "-");
1156 evp2 = strtok(NULL, "-");
1157 ev1 = atoi(evp);
1158 ev2 = evp2 ? atoi(evp2) : ev1;
1159 if ((uint_t)ev1 >= LS_MAX_EVENTS ||
1160 (uint_t)ev2 >= LS_MAX_EVENTS || ev1 > ev2)
1161 fail(0, "-e events out of range");
1162 for (i = ev1; i <= ev2; i++)
1163 g_enabled[i] = 1;
1164 evp = strtok_r(NULL, ",", &lastp);
1165 }
1166 events_specified = 1;
1167 break;
1168
1169 case 'c':
1170 g_cflag = 1;
1171 break;
1172
1173 case 'k':
1174 g_kflag = 1;
1175 break;
1176
1177 case 'w':
1178 g_wflag = 1;
1179 break;
1180
1181 case 'W':
1182 g_Wflag = 1;
1183 break;
1184
1185 case 'g':
1186 g_gflag = 1;
1187 break;
1188
1189 case 'C':
1190 case 'E':
1191 case 'H':
1192 case 'I':
1193 for (i = 0; i < LS_MAX_EVENTS; i++)
1194 if (g_event_info[i].ev_type == c)
1195 g_enabled[i] = 1;
1196 events_specified = 1;
1197 break;
1198
1199 case 'A':
1200 for (i = 0; i < LS_MAX_EVENTS; i++)
1201 if (strchr("CH", g_event_info[i].ev_type))
1202 g_enabled[i] = 1;
1203 events_specified = 1;
1204 break;
1205
1206 case 'T':
1207 g_tracing = 1;
1208 break;
1209
1210 case 'D':
1211 if (!isdigit(optarg[0]))
1212 usage();
1213 g_topn = atoi(optarg);
1214 break;
1215
1216 case 'R':
1217 g_rates = 1;
1218 break;
1219
1220 case 'p':
1221 g_pflag = 1;
1222 break;
1223
1224 case 'P':
1225 g_Pflag = 1;
1226 break;
1227
1228 case 'o':
1229 if ((out = fopen(optarg, "w")) == NULL)
1230 fail(1, "error opening file");
1231 break;
1232
1233 case 'V':
1234 g_Vflag = 1;
1235 break;
1236
1237 default:
1238 if (strchr(LOCKSTAT_OPTSTR, c) == NULL)
1239 usage();
1240 }
1241 }
1242
1243 if (filt != NULL) {
1244 predicate_add(&g_predicate, filt, NULL, 0);
1245 filter_destroy(&filt);
1246 }
1247
1248 if (ifilt != NULL) {
1249 predicate_add(&g_ipredicate, ifilt, NULL, 0);
1250 filter_destroy(&ifilt);
1251 }
1252
1253 if (g_recsize == 0) {
1254 if (g_gflag) {
1255 g_stkdepth = LS_MAX_STACK_DEPTH;
1256 g_recsize = LS_STACK(g_stkdepth);
1257 } else {
1258 g_recsize = LS_TIME;
1259 }
1260 }
1261
1262 if (g_gflag && g_recsize <= LS_STACK(0))
1263 fail(0, "'-g' requires at least '-s 1' data gathering");
1264
1265 /*
1266 * Round the record size up to a multiple of sizeof (uint64_t) for alignment.
1267 */
1268 g_recsize = -(-g_recsize & -sizeof (uint64_t));
1269
1270 for (i = 0; i < LS_MAX_EVENTS; i++) {
1271 /*
1272 * If no events were specified, enable -C.
1273 */
1274 if (!events_specified && g_event_info[i].ev_type == 'C')
1275 g_enabled[i] = 1;
1276 }
1277
1278 for (i = 0; i < LS_MAX_EVENTS; i++) {
1279 if (!g_enabled[i])
1280 continue;
1281
1282 if (g_event_info[i].ev_acquire != NULL) {
1283 /*
1284 * If we've enabled a hold event, we must explicitly
1285 * allocate dynamic variable space.
1286 */
1287 dynvar = 1;
1288 }
1289
1290 dprog_addevent(i);
1291 }
1292
1293 /*
1294 * Make sure there are remaining arguments to specify a child command
1295 * to execute.
1296 */
1297 if (argc <= optind)
1298 usage();
1299
1300 if ((ncpus = sysconf(_SC_NPROCESSORS_ONLN)) == -1)
1301 dfail("couldn't determine number of online CPUs");
1302
1303 /*
1304 * By default, we set our data buffer size to be the number of records
1305 * multiplied by the size of the record, doubled to account for some
1306 * DTrace slop and divided by the number of CPUs. We silently clamp
1307 * the aggregation size at both a minimum and a maximum to prevent
1308 * absurdly low or high values.
1309 */
1310 if ((aggsize = (g_nrecs * g_recsize * 2) / ncpus) < MIN_AGGSIZE)
1311 aggsize = MIN_AGGSIZE;
1312
1313 if (aggsize > MAX_AGGSIZE)
1314 aggsize = MAX_AGGSIZE;
1315
1316 (void) sprintf(aggstr, "%lld", (long long)aggsize);
1317
1318 if (!g_tracing) {
1319 if (dtrace_setopt(g_dtp, "bufsize", "4k") == -1)
1320 dfail("failed to set 'bufsize'");
1321
1322 if (dtrace_setopt(g_dtp, "aggsize", aggstr) == -1)
1323 dfail("failed to set 'aggsize'");
1324
1325 if (dynvar) {
1326 /*
1327 * If we're using dynamic variables, we set our
1328 * dynamic variable size to be one megabyte per CPU,
1329 * with a hard-limit of 32 megabytes. This may still
1330 * be too small in some cases, but it can be tuned
1331 * manually via -x if need be.
1332 */
1333 (void) sprintf(aggstr, "%ldm", ncpus < 32 ? ncpus : 32);
1334
1335 if (dtrace_setopt(g_dtp, "dynvarsize", aggstr) == -1)
1336 dfail("failed to set 'dynvarsize'");
1337 }
1338 } else {
1339 if (dtrace_setopt(g_dtp, "bufsize", aggstr) == -1)
1340 dfail("failed to set 'bufsize'");
1341 }
1342
1343 if (dtrace_setopt(g_dtp, "statusrate", "10sec") == -1)
1344 dfail("failed to set 'statusrate'");
1345
1346 optind = 1;
1347 while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != EOF) {
1348 switch (c) {
1349 case 'x':
1350 if ((p = strchr(optarg, '=')) != NULL)
1351 *p++ = '\0';
1352
1353 if (dtrace_setopt(g_dtp, optarg, p) != 0)
1354 dfail("failed to set -x %s", optarg);
1355 break;
1356 }
1357 }
1358
1359 argc -= optind;
1360 argv += optind;
1361
1362 dprog_compile();
1363 status_init();
1364
1365 g_elapsed = -gethrtime();
1366
1367 /*
1368 * Spawn the specified command and wait for it to complete.
1369 */
1370 child = fork();
1371 if (child == -1)
1372 fail(1, "cannot fork");
1373 if (child == 0) {
1374 (void) dtrace_close(g_dtp);
1375 (void) execvp(argv[0], &argv[0]);
1376 exec_errno = errno;
1377 exit(127);
1378 }
1379
1380 while (waitpid(child, &status, WEXITED) != child)
1381 status_check();
1382
1383 g_elapsed += gethrtime();
1384
1385 if (WIFEXITED(status)) {
1386 if (WEXITSTATUS(status) != 0) {
1387 if (exec_errno != 0) {
1388 errno = exec_errno;
1389 fail(1, "could not execute %s", argv[0]);
1390 }
1391 (void) fprintf(stderr,
1392 "lockstat: warning: %s exited with code %d\n",
1393 argv[0], WEXITSTATUS(status));
1394 }
1395 } else {
1396 (void) fprintf(stderr,
1397 "lockstat: warning: %s died on signal %d\n",
1398 argv[0], WTERMSIG(status));
1399 }
1400
1401 if (dtrace_stop(g_dtp) == -1)
1402 dfail("failed to stop dtrace");
1403
1404 /*
1405 * Before we read out the results, we need to allocate our buffer.
1406 * If we're tracing, then we'll just use the precalculated size. If
1407 * we're not, then we'll take a snapshot of the aggregate, and walk
1408 * it to count the number of records.
1409 */
1410 if (!g_tracing) {
1411 if (dtrace_aggregate_snap(g_dtp) != 0)
1412 dfail("failed to snap aggregate");
1413
1414 g_nrecs = 0;
1415
1416 if (dtrace_aggregate_walk(g_dtp,
1417 count_aggregate, &g_nrecs) != 0)
1418 dfail("failed to walk aggregate");
1419 }
1420
1421 if ((data_buf = memalign(sizeof (uint64_t),
1422 (g_nrecs + 1) * g_recsize)) == NULL)
1423 fail(1, "Memory allocation failed");
1424
1425 /*
1426 * Read out the DTrace data.
1427 */
1428 g_nrecs_used = process_data(out, data_buf);
1429
1430 if (g_nrecs_used > g_nrecs || g_dropped)
1431 (void) fprintf(stderr, "lockstat: warning: "
1432 "ran out of data records (use -n for more)\n");
1433
1434 /* LINTED - alignment */
1435 for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1436 /* LINTED - alignment */
1437 lsp = (lsrec_t *)((char *)lsp + g_recsize)) {
1438 ev_count[lsp->ls_event] += lsp->ls_count;
1439 ev_time[lsp->ls_event] += lsp->ls_time;
1440 }
1441
1442 /*
1443 * If -g was specified, convert stacks into individual records.
1444 */
1445 if (g_gflag) {
1446 lsrec_t *newlsp, *oldlsp;
1447
1448 newlsp = memalign(sizeof (uint64_t),
1449 g_nrecs_used * LS_TIME * (g_stkdepth + 1));
1450 if (newlsp == NULL)
1451 fail(1, "Cannot allocate space for -g processing");
1452 lsp = newlsp;
1453 /* LINTED - alignment */
1454 for (i = 0, oldlsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1455 /* LINTED - alignment */
1456 oldlsp = (lsrec_t *)((char *)oldlsp + g_recsize)) {
1457 int fr;
1458 int caller_in_stack = 0;
1459
1460 if (oldlsp->ls_count == 0)
1461 continue;
1462
1463 for (fr = 0; fr < g_stkdepth; fr++) {
1464 if (oldlsp->ls_stack[fr] == 0)
1465 break;
1466 if (oldlsp->ls_stack[fr] == oldlsp->ls_caller)
1467 caller_in_stack = 1;
1468 bcopy(oldlsp, lsp, LS_TIME);
1469 lsp->ls_caller = oldlsp->ls_stack[fr];
1470 /* LINTED - alignment */
1471 lsp = (lsrec_t *)((char *)lsp + LS_TIME);
1472 }
1473 if (!caller_in_stack) {
1474 bcopy(oldlsp, lsp, LS_TIME);
1475 /* LINTED - alignment */
1476 lsp = (lsrec_t *)((char *)lsp + LS_TIME);
1477 }
1478 }
1479 g_nrecs = g_nrecs_used =
1480 ((uintptr_t)lsp - (uintptr_t)newlsp) / LS_TIME;
1481 g_recsize = LS_TIME;
1482 g_stkdepth = 0;
1483 free(data_buf);
1484 data_buf = (char *)newlsp;
1485 }
1486
1487 if ((sort_buf = calloc(2 * (g_nrecs + 1),
1488 sizeof (void *))) == NULL)
1489 fail(1, "Sort buffer allocation failed");
1490 merge_buf = sort_buf + (g_nrecs + 1);
1491
1492 /*
1493 * Build the sort buffer, discarding zero-count records along the way.
1494 */
1495 /* LINTED - alignment */
1496 for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1497 /* LINTED - alignment */
1498 lsp = (lsrec_t *)((char *)lsp + g_recsize)) {
1499 if (lsp->ls_count == 0)
1500 lsp->ls_event = LS_MAX_EVENTS;
1501 sort_buf[i] = lsp;
1502 }
1503
1504 if (g_nrecs_used == 0)
1505 exit(0);
1506
1507 /*
1508 * Add a sentinel after the last record
1509 */
1510 sort_buf[i] = lsp;
1511 lsp->ls_event = LS_MAX_EVENTS;
1512
1513 if (g_tracing) {
1514 report_trace(out, sort_buf);
1515 return (0);
1516 }
1517
1518 /*
1519 * Application of -g may have resulted in multiple records
1520 * with the same signature; coalesce them.
1521 */
1522 if (g_gflag) {
1523 mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used);
1524 coalesce(lockcmp, sort_buf, g_nrecs_used);
1525 }
1526
1527 /*
1528 * Coalesce locks within the same symbol if -c option specified.
1529 * Coalesce PCs within the same function if -k option specified.
1530 */
1531 if (g_cflag || g_kflag) {
1532 for (i = 0; i < g_nrecs_used; i++) {
1533 int fr;
1534 lsp = sort_buf[i];
1535 if (g_cflag)
1536 coalesce_symbol(&lsp->ls_lock);
1537 if (g_kflag) {
1538 for (fr = 0; fr < g_stkdepth; fr++)
1539 coalesce_symbol(&lsp->ls_stack[fr]);
1540 coalesce_symbol(&lsp->ls_caller);
1541 }
1542 }
1543 mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used);
1544 coalesce(lockcmp, sort_buf, g_nrecs_used);
1545 }
1546
1547 /*
1548 * Coalesce callers if -w option specified
1549 */
1550 if (g_wflag) {
1551 mergesort(lock_and_count_cmp_anywhere,
1552 sort_buf, merge_buf, g_nrecs_used);
1553 coalesce(lockcmp_anywhere, sort_buf, g_nrecs_used);
1554 }
1555
1556 /*
1557 * Coalesce locks if -W option specified
1558 */
1559 if (g_Wflag) {
1560 mergesort(site_and_count_cmp_anylock,
1561 sort_buf, merge_buf, g_nrecs_used);
1562 coalesce(sitecmp_anylock, sort_buf, g_nrecs_used);
1563 }
1564
1565 /*
1566 * Sort data by contention count (ls_count) or total time (ls_time),
1567 * depending on g_Pflag. Override g_Pflag if time wasn't measured.
1568 */
1569 if (g_recsize < LS_TIME)
1570 g_Pflag = 0;
1571
1572 if (g_Pflag)
1573 mergesort(timecmp, sort_buf, merge_buf, g_nrecs_used);
1574 else
1575 mergesort(countcmp, sort_buf, merge_buf, g_nrecs_used);
1576
1577 /*
1578 * Display data by event type
1579 */
1580 first = &sort_buf[0];
1581 while ((event = (*first)->ls_event) < LS_MAX_EVENTS) {
1582 current = first;
1583 while ((lsp = *current)->ls_event == event)
1584 current++;
1585 report_stats(out, first, current - first, ev_count[event],
1586 ev_time[event]);
1587 first = current;
1588 }
1589
1590 return (0);
1591 }
1592
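/*
 * Render an address as "symbol", "symbol+offset", "symbol[size]" (when
 * show_size is set and the address is at the symbol start), or a raw hex
 * value when no reasonable symbol is found.
 */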
1593 static char *
1594 format_symbol(char *buf, uintptr_t addr, int show_size)
1595 {
1596 uintptr_t symoff;
1597 char *symname;
1598 size_t symsize;
1599
1600 symname = addr_to_sym(addr, &symoff, &symsize);
1601
1602 if (show_size && symoff == 0)
1603 (void) sprintf(buf, "%s[%ld]", symname, (long)symsize);
1604 else if (symoff == 0)
1605 (void) sprintf(buf, "%s", symname);
1606 else if (symoff < 16 && bcmp(symname, "cpu[", 4) == 0) /* CPU+PIL */
1607 (void) sprintf(buf, "%s+%ld", symname, (long)symoff);
1608 else if (symoff <= symsize || (symoff < 256 && addr != symoff))
1609 (void) sprintf(buf, "%s+0x%llx", symname,
1610 (unsigned long long)symoff);
1611 else
1612 (void) sprintf(buf, "0x%llx", (unsigned long long)addr);
1613 return (buf);
1614 }
1615
1616 static void
1617 report_stats(FILE *out, lsrec_t **sort_buf, size_t nrecs, uint64_t total_count,
1618 uint64_t total_time)
1619 {
1620 uint32_t event = sort_buf[0]->ls_event;
1621 lsrec_t *lsp;
1622 double ptotal = 0.0;
1623 double percent;
1624 int i, j, fr;
1625 int displayed;
1626 int first_bin, last_bin, max_bin_count, total_bin_count;
1627 int rectype;
1628 char buf[256];
1629 char lhdr[80], chdr[80];
1630
1631 rectype = g_recsize;
1632
1633 if (g_topn == 0) {
1634 (void) fprintf(out, "%20llu %s\n",
1635 g_rates == 0 ? total_count :
1636 ((unsigned long long)total_count * NANOSEC) / g_elapsed,
1637 g_event_info[event].ev_desc);
1638 return;
1639 }
1640
1641 (void) sprintf(lhdr, "%s%s",
1642 g_Wflag ? "Hottest " : "", g_event_info[event].ev_lhdr);
1643 (void) sprintf(chdr, "%s%s",
1644 g_wflag ? "Hottest " : "", "Caller");
1645
1646 if (!g_pflag)
1647 (void) fprintf(out,
1648 "\n%s: %.0f events in %.3f seconds (%.0f events/sec)\n\n",
1649 g_event_info[event].ev_desc, (double)total_count,
1650 (double)g_elapsed / NANOSEC,
1651 (double)total_count * NANOSEC / g_elapsed);
1652
1653 if (!g_pflag && rectype < LS_HIST) {
1654 (void) sprintf(buf, "%s", g_event_info[event].ev_units);
1655 (void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n",
1656 g_rates ? "ops/s" : "Count",
1657 g_gflag ? "genr" : "indv",
1658 "cuml", "rcnt", rectype >= LS_TIME ? buf : "", lhdr, chdr);
1659 (void) fprintf(out, "---------------------------------"
1660 "----------------------------------------------\n");
1661 }
1662
1663 displayed = 0;
1664 for (i = 0; i < nrecs; i++) {
1665 lsp = sort_buf[i];
1666
1667 if (displayed++ >= g_topn)
1668 break;
1669
1670 if (g_pflag) {
1671 int j;
1672
1673 (void) fprintf(out, "%u %u",
1674 lsp->ls_event, lsp->ls_count);
1675 (void) fprintf(out, " %s",
1676 format_symbol(buf, lsp->ls_lock, g_cflag));
1677 (void) fprintf(out, " %s",
1678 format_symbol(buf, lsp->ls_caller, 0));
1679 (void) fprintf(out, " %f",
1680 (double)lsp->ls_refcnt / lsp->ls_count);
1681 if (rectype >= LS_TIME)
1682 (void) fprintf(out, " %llu",
1683 (unsigned long long)lsp->ls_time);
1684 if (rectype >= LS_HIST) {
1685 for (j = 0; j < 64; j++)
1686 (void) fprintf(out, " %u",
1687 lsp->ls_hist[j]);
1688 }
1689 for (j = 0; j < LS_MAX_STACK_DEPTH; j++) {
1690 if (rectype <= LS_STACK(j) ||
1691 lsp->ls_stack[j] == 0)
1692 break;
1693 (void) fprintf(out, " %s",
1694 format_symbol(buf, lsp->ls_stack[j], 0));
1695 }
1696 (void) fprintf(out, "\n");
1697 continue;
1698 }
1699
1700 if (rectype >= LS_HIST) {
1701 (void) fprintf(out, "---------------------------------"
1702 "----------------------------------------------\n");
1703 (void) sprintf(buf, "%s",
1704 g_event_info[event].ev_units);
1705 (void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n",
1706 g_rates ? "ops/s" : "Count",
1707 g_gflag ? "genr" : "indv",
1708 "cuml", "rcnt", buf, lhdr, chdr);
1709 }
1710
1711 if (g_Pflag && total_time != 0)
1712 percent = (lsp->ls_time * 100.00) / total_time;
1713 else
1714 percent = (lsp->ls_count * 100.00) / total_count;
1715
1716 ptotal += percent;
1717
1718 if (rectype >= LS_TIME)
1719 (void) sprintf(buf, "%llu",
1720 (unsigned long long)(lsp->ls_time / lsp->ls_count));
1721 else
1722 buf[0] = '\0';
1723
1724 (void) fprintf(out, "%5llu ",
1725 g_rates == 0 ? lsp->ls_count :
1726 ((uint64_t)lsp->ls_count * NANOSEC) / g_elapsed);
1727
1728 (void) fprintf(out, "%3.0f%% ", percent);
1729
1730 if (g_gflag)
1731 (void) fprintf(out, "---- ");
1732 else
1733 (void) fprintf(out, "%3.0f%% ", ptotal);
1734
1735 (void) fprintf(out, "%4.2f %8s ",
1736 (double)lsp->ls_refcnt / lsp->ls_count, buf);
1737
1738 (void) fprintf(out, "%-22s ",
1739 format_symbol(buf, lsp->ls_lock, g_cflag));
1740
1741 (void) fprintf(out, "%-24s\n",
1742 format_symbol(buf, lsp->ls_caller, 0));
1743
1744 if (rectype < LS_HIST)
1745 continue;
1746
1747 (void) fprintf(out, "\n");
1748 (void) fprintf(out, "%10s %31s %-9s %-24s\n",
1749 g_event_info[event].ev_units,
1750 "------ Time Distribution ------",
1751 g_rates ? "ops/s" : "count",
1752 rectype > LS_STACK(0) ? "Stack" : "");
1753
1754 first_bin = 0;
1755 while (lsp->ls_hist[first_bin] == 0)
1756 first_bin++;
1757
1758 last_bin = 63;
1759 while (lsp->ls_hist[last_bin] == 0)
1760 last_bin--;
1761
1762 max_bin_count = 0;
1763 total_bin_count = 0;
1764 for (j = first_bin; j <= last_bin; j++) {
1765 total_bin_count += lsp->ls_hist[j];
1766 if (lsp->ls_hist[j] > max_bin_count)
1767 max_bin_count = lsp->ls_hist[j];
1768 }
1769
1770 /*
1771 * If we went a few frames below the caller, ignore them
1772 */
1773 for (fr = 3; fr > 0; fr--)
1774 if (lsp->ls_stack[fr] == lsp->ls_caller)
1775 break;
1776
1777 for (j = first_bin; j <= last_bin; j++) {
1778 uint_t depth = (lsp->ls_hist[j] * 30) / total_bin_count;
1779 (void) fprintf(out, "%10llu |%s%s %-9u ",
1780 1ULL << j,
1781 "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + 30 - depth,
1782 " " + depth,
1783 g_rates == 0 ? lsp->ls_hist[j] :
1784 (uint_t)(((uint64_t)lsp->ls_hist[j] * NANOSEC) /
1785 g_elapsed));
1786 if (rectype <= LS_STACK(fr) || lsp->ls_stack[fr] == 0) {
1787 (void) fprintf(out, "\n");
1788 continue;
1789 }
1790 (void) fprintf(out, "%-24s\n",
1791 format_symbol(buf, lsp->ls_stack[fr], 0));
1792 fr++;
1793 }
1794 while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) {
1795 (void) fprintf(out, "%15s %-36s %-24s\n", "", "",
1796 format_symbol(buf, lsp->ls_stack[fr], 0));
1797 fr++;
1798 }
1799 }
1800
1801 if (!g_pflag)
1802 (void) fprintf(out, "---------------------------------"
1803 "----------------------------------------------\n");
1804
1805 (void) fflush(out);
1806 }
1807
1808 static void
1809 report_trace(FILE *out, lsrec_t **sort_buf)
1810 {
1811 lsrec_t *lsp;
1812 int i, fr;
1813 int rectype;
1814 char buf[256], buf2[256];
1815
1816 rectype = g_recsize;
1817
1818 if (!g_pflag) {
1819 (void) fprintf(out, "%5s %7s %11s %-24s %-24s\n",
1820 "Event", "Time", "Owner", "Lock", "Caller");
1821 (void) fprintf(out, "---------------------------------"
1822 "----------------------------------------------\n");
1823 }
1824
1825 for (i = 0; i < g_nrecs_used; i++) {
1826
1827 lsp = sort_buf[i];
1828
1829 if (lsp->ls_event >= LS_MAX_EVENTS || lsp->ls_count == 0)
1830 continue;
1831
1832 (void) fprintf(out, "%2d %10llu %11p %-24s %-24s\n",
1833 lsp->ls_event, (unsigned long long)lsp->ls_time,
1834 (void *)lsp->ls_next,
1835 format_symbol(buf, lsp->ls_lock, 0),
1836 format_symbol(buf2, lsp->ls_caller, 0));
1837
1838 if (rectype <= LS_STACK(0))
1839 continue;
1840
1841 /*
1842 * If we went a few frames below the caller, ignore them
1843 */
1844 for (fr = 3; fr > 0; fr--)
1845 if (lsp->ls_stack[fr] == lsp->ls_caller)
1846 break;
1847
1848 while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) {
1849 (void) fprintf(out, "%53s %-24s\n", "",
1850 format_symbol(buf, lsp->ls_stack[fr], 0));
1851 fr++;
1852 }
1853 (void) fprintf(out, "\n");
1854 }
1855
1856 (void) fflush(out);
1857 }
1858