xref: /illumos-gate/usr/src/cmd/lockstat/lockstat.c (revision 13b136d3061155363c62c9f6568d25b8b27da8f6)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <stdio.h>
29 #include <stddef.h>
30 #include <stdlib.h>
31 #include <stdarg.h>
32 #include <string.h>
33 #include <strings.h>
34 #include <ctype.h>
35 #include <fcntl.h>
36 #include <unistd.h>
37 #include <errno.h>
38 #include <limits.h>
39 #include <sys/types.h>
40 #include <sys/modctl.h>
41 #include <sys/stat.h>
42 #include <sys/wait.h>
43 #include <dtrace.h>
44 #include <sys/lockstat.h>
45 #include <alloca.h>
46 #include <signal.h>
47 #include <assert.h>
48 
/* Option string handed to getopt() by main(). */
#define	LOCKSTAT_OPTSTR	"x:bths:n:d:i:l:f:e:ckwWgCHEATID:RpPo:V"

/* Capacity of lsrec_t.ls_stack[]; also the largest depth accepted by -s. */
#define	LS_MAX_STACK_DEPTH	50

/* Number of slots in the per-event tables (g_event_info[], g_enabled[]...). */
#define	LS_MAX_EVENTS		64
53 
54 typedef struct lsrec {
55 	struct lsrec	*ls_next;	/* next in hash chain */
56 	uintptr_t	ls_lock;	/* lock address */
57 	uintptr_t	ls_caller;	/* caller address */
58 	uint32_t	ls_count;	/* cumulative event count */
59 	uint32_t	ls_event;	/* type of event */
60 	uintptr_t	ls_refcnt;	/* cumulative reference count */
61 	uint64_t	ls_time;	/* cumulative event duration */
62 	uint32_t	ls_hist[64];	/* log2(duration) histogram */
63 	uintptr_t	ls_stack[LS_MAX_STACK_DEPTH];
64 } lsrec_t;
65 
/*
 * Cursor threaded through the DTrace consume/walk callbacks: it tracks the
 * slot that the next record will be written to and how many have been
 * written so far.
 */
typedef struct lsdata {
	struct lsrec	*lsd_next;	/* slot for the next record */
	int		lsd_count;	/* records written so far */
} lsdata_t;
70 
/*
 * The kinds of experiment that can be run, cheapest first (in both memory
 * and processing time).  Each value is the number of bytes of an lsrec_t
 * that a record of that kind occupies, i.e. the offset of the first field
 * the experiment does NOT need.
 */
#define	LS_BASIC	offsetof(lsrec_t, ls_time)	/* no timing */
#define	LS_TIME		offsetof(lsrec_t, ls_hist[0])	/* plus ls_time */
#define	LS_HIST		offsetof(lsrec_t, ls_stack[0])	/* plus ls_hist[] */
#define	LS_STACK(depth)	offsetof(lsrec_t, ls_stack[depth]) /* plus frames */
80 
81 static void report_stats(FILE *, lsrec_t **, size_t, uint64_t, uint64_t);
82 static void report_trace(FILE *, lsrec_t **);
83 
84 extern int symtab_init(void);
85 extern char *addr_to_sym(uintptr_t, uintptr_t *, size_t *);
86 extern uintptr_t sym_to_addr(char *name);
87 extern size_t sym_size(char *name);
88 extern char *strtok_r(char *, const char *, char **);
89 
/* Default for -n (maximum number of data records). */
#define	DEFAULT_NRECS	10000

/* Default and largest permitted profiling interrupt rate (-i), in Hz. */
#define	DEFAULT_HZ	97
#define	MAX_HZ		1000

/*
 * Smallest and largest aggregation buffer size, in bytes.
 * NOTE(review): the code that applies these bounds is not in view here;
 * presumably main() clamps the DTrace "aggsize" option to this range.
 */
#define	MIN_AGGSIZE	(16 * 1024)
#define	MAX_AGGSIZE	(32 * 1024 * 1024)
95 
96 static int g_stkdepth;
97 static int g_topn = INT_MAX;
98 static hrtime_t g_elapsed;
99 static int g_rates = 0;
100 static int g_pflag = 0;
101 static int g_Pflag = 0;
102 static int g_wflag = 0;
103 static int g_Wflag = 0;
104 static int g_cflag = 0;
105 static int g_kflag = 0;
106 static int g_gflag = 0;
107 static int g_Vflag = 0;
108 static int g_tracing = 0;
109 static size_t g_recsize;
110 static size_t g_nrecs;
111 static int g_nrecs_used;
112 static uchar_t g_enabled[LS_MAX_EVENTS];
113 static hrtime_t g_min_duration[LS_MAX_EVENTS];
114 static dtrace_hdl_t *g_dtp;
115 static char *g_predicate;
116 static char *g_ipredicate;
117 static char *g_prog;
118 static int g_proglen;
119 static int g_dropped;
120 
121 typedef struct ls_event_info {
122 	char	ev_type;
123 	char	ev_lhdr[20];
124 	char	ev_desc[80];
125 	char	ev_units[10];
126 	char	ev_name[DTRACE_NAMELEN];
127 	char	*ev_predicate;
128 	char	*ev_acquire;
129 } ls_event_info_t;
130 
131 static ls_event_info_t g_event_info[LS_MAX_EVENTS] = {
132 	{ 'C',	"Lock",	"Adaptive mutex spin",			"nsec",
133 	    "lockstat:::adaptive-spin" },
134 	{ 'C',	"Lock",	"Adaptive mutex block",			"nsec",
135 	    "lockstat:::adaptive-block" },
136 	{ 'C',	"Lock",	"Spin lock spin",			"nsec",
137 	    "lockstat:::spin-spin" },
138 	{ 'C',	"Lock",	"Thread lock spin",			"nsec",
139 	    "lockstat:::thread-spin" },
140 	{ 'C',	"Lock",	"R/W writer blocked by writer",		"nsec",
141 	    "lockstat:::rw-block", "arg2 == 0 && arg3 == 1" },
142 	{ 'C',	"Lock",	"R/W writer blocked by readers",	"nsec",
143 	    "lockstat:::rw-block", "arg2 == 0 && arg3 == 0 && arg4" },
144 	{ 'C',	"Lock",	"R/W reader blocked by writer",		"nsec",
145 	    "lockstat:::rw-block", "arg2 != 0 && arg3 == 1" },
146 	{ 'C',	"Lock",	"R/W reader blocked by write wanted",	"nsec",
147 	    "lockstat:::rw-block", "arg2 != 0 && arg3 == 0 && arg4" },
148 	{ 'C',	"Lock",	"Unknown event (type 8)",		"units"	},
149 	{ 'C',	"Lock",	"Unknown event (type 9)",		"units"	},
150 	{ 'C',	"Lock",	"Unknown event (type 10)",		"units"	},
151 	{ 'C',	"Lock",	"Unknown event (type 11)",		"units"	},
152 	{ 'C',	"Lock",	"Unknown event (type 12)",		"units"	},
153 	{ 'C',	"Lock",	"Unknown event (type 13)",		"units"	},
154 	{ 'C',	"Lock",	"Unknown event (type 14)",		"units"	},
155 	{ 'C',	"Lock",	"Unknown event (type 15)",		"units"	},
156 	{ 'C',	"Lock",	"Unknown event (type 16)",		"units"	},
157 	{ 'C',	"Lock",	"Unknown event (type 17)",		"units"	},
158 	{ 'C',	"Lock",	"Unknown event (type 18)",		"units"	},
159 	{ 'C',	"Lock",	"Unknown event (type 19)",		"units"	},
160 	{ 'C',	"Lock",	"Unknown event (type 20)",		"units"	},
161 	{ 'C',	"Lock",	"Unknown event (type 21)",		"units"	},
162 	{ 'C',	"Lock",	"Unknown event (type 22)",		"units"	},
163 	{ 'C',	"Lock",	"Unknown event (type 23)",		"units"	},
164 	{ 'C',	"Lock",	"Unknown event (type 24)",		"units"	},
165 	{ 'C',	"Lock",	"Unknown event (type 25)",		"units"	},
166 	{ 'C',	"Lock",	"Unknown event (type 26)",		"units"	},
167 	{ 'C',	"Lock",	"Unknown event (type 27)",		"units"	},
168 	{ 'C',	"Lock",	"Unknown event (type 28)",		"units"	},
169 	{ 'C',	"Lock",	"Unknown event (type 29)",		"units"	},
170 	{ 'C',	"Lock",	"Unknown event (type 30)",		"units"	},
171 	{ 'C',	"Lock",	"Unknown event (type 31)",		"units"	},
172 	{ 'H',	"Lock",	"Adaptive mutex hold",			"nsec",
173 	    "lockstat:::adaptive-release", NULL,
174 	    "lockstat:::adaptive-acquire" },
175 	{ 'H',	"Lock",	"Spin lock hold",			"nsec",
176 	    "lockstat:::spin-release", NULL,
177 	    "lockstat:::spin-acquire" },
178 	{ 'H',	"Lock",	"R/W writer hold",			"nsec",
179 	    "lockstat:::rw-release", "arg1 == 0",
180 	    "lockstat:::rw-acquire" },
181 	{ 'H',	"Lock",	"R/W reader hold",			"nsec",
182 	    "lockstat:::rw-release", "arg1 != 0",
183 	    "lockstat:::rw-acquire" },
184 	{ 'H',	"Lock",	"Unknown event (type 36)",		"units"	},
185 	{ 'H',	"Lock",	"Unknown event (type 37)",		"units"	},
186 	{ 'H',	"Lock",	"Unknown event (type 38)",		"units"	},
187 	{ 'H',	"Lock",	"Unknown event (type 39)",		"units"	},
188 	{ 'H',	"Lock",	"Unknown event (type 40)",		"units"	},
189 	{ 'H',	"Lock",	"Unknown event (type 41)",		"units"	},
190 	{ 'H',	"Lock",	"Unknown event (type 42)",		"units"	},
191 	{ 'H',	"Lock",	"Unknown event (type 43)",		"units"	},
192 	{ 'H',	"Lock",	"Unknown event (type 44)",		"units"	},
193 	{ 'H',	"Lock",	"Unknown event (type 45)",		"units"	},
194 	{ 'H',	"Lock",	"Unknown event (type 46)",		"units"	},
195 	{ 'H',	"Lock",	"Unknown event (type 47)",		"units"	},
196 	{ 'H',	"Lock",	"Unknown event (type 48)",		"units"	},
197 	{ 'H',	"Lock",	"Unknown event (type 49)",		"units"	},
198 	{ 'H',	"Lock",	"Unknown event (type 50)",		"units"	},
199 	{ 'H',	"Lock",	"Unknown event (type 51)",		"units"	},
200 	{ 'H',	"Lock",	"Unknown event (type 52)",		"units"	},
201 	{ 'H',	"Lock",	"Unknown event (type 53)",		"units"	},
202 	{ 'H',	"Lock",	"Unknown event (type 54)",		"units"	},
203 	{ 'H',	"Lock",	"Unknown event (type 55)",		"units"	},
204 	{ 'I',	"CPU+PIL", "Profiling interrupt",		"nsec",
205 	    "profile:::profile-97", NULL },
206 	{ 'I',	"Lock",	"Unknown event (type 57)",		"units"	},
207 	{ 'I',	"Lock",	"Unknown event (type 58)",		"units"	},
208 	{ 'I',	"Lock",	"Unknown event (type 59)",		"units"	},
209 	{ 'E',	"Lock",	"Recursive lock entry detected",	"(N/A)",
210 	    "lockstat:::rw-release", NULL, "lockstat:::rw-acquire" },
211 	{ 'E',	"Lock",	"Lockstat enter failure",		"(N/A)"	},
212 	{ 'E',	"Lock",	"Lockstat exit failure",		"nsec"	},
213 	{ 'E',	"Lock",	"Lockstat record failure",		"(N/A)"	},
214 };
215 
/*
 * Report a fatal error and terminate.
 *
 * Writes "lockstat: " plus the printf-style message to stderr; if do_perror
 * is nonzero, ": " and the strerror() text for the errno value seen on entry
 * are appended.  A newline follows, and the process exits with status 2.
 * errno is captured first so that the stdio calls cannot disturb it.
 */
static void
fail(int do_perror, const char *message, ...)
{
	const int saved = errno;
	va_list ap;

	(void) fputs("lockstat: ", stderr);

	va_start(ap, message);
	(void) vfprintf(stderr, message, ap);
	va_end(ap);

	if (do_perror != 0)
		(void) fprintf(stderr, ": %s", strerror(saved));

	(void) fputc('\n', stderr);
	exit(2);
}
231 
232 static void
233 dfail(const char *message, ...)
234 {
235 	va_list args;
236 
237 	va_start(args, message);
238 	(void) fprintf(stderr, "lockstat: ");
239 	(void) vfprintf(stderr, message, args);
240 	va_end(args);
241 	(void) fprintf(stderr, ": %s\n",
242 	    dtrace_errmsg(g_dtp, dtrace_errno(g_dtp)));
243 
244 	exit(2);
245 }
246 
247 static void
248 show_events(char event_type, char *desc)
249 {
250 	int i, first = -1, last;
251 
252 	for (i = 0; i < LS_MAX_EVENTS; i++) {
253 		ls_event_info_t *evp = &g_event_info[i];
254 		if (evp->ev_type != event_type ||
255 		    strncmp(evp->ev_desc, "Unknown event", 13) == 0)
256 			continue;
257 		if (first == -1)
258 			first = i;
259 		last = i;
260 	}
261 
262 	(void) fprintf(stderr,
263 	    "\n%s events (lockstat -%c or lockstat -e %d-%d):\n\n",
264 	    desc, event_type, first, last);
265 
266 	for (i = first; i <= last; i++)
267 		(void) fprintf(stderr,
268 		    "%4d = %s\n", i, g_event_info[i].ev_desc);
269 }
270 
271 static void
272 usage(void)
273 {
274 	(void) fprintf(stderr,
275 	    "Usage: lockstat [options] command [args]\n"
276 	    "\nEvent selection options:\n\n"
277 	    "  -C              watch contention events [on by default]\n"
278 	    "  -E              watch error events [off by default]\n"
279 	    "  -H              watch hold events [off by default]\n"
280 	    "  -I              watch interrupt events [off by default]\n"
281 	    "  -A              watch all lock events [equivalent to -CH]\n"
282 	    "  -e event_list   only watch the specified events (shown below);\n"
283 	    "                  <event_list> is a comma-separated list of\n"
284 	    "                  events or ranges of events, e.g. 1,4-7,35\n"
285 	    "  -i rate         interrupt rate for -I [default: %d Hz]\n"
286 	    "\nData gathering options:\n\n"
287 	    "  -b              basic statistics (lock, caller, event count)\n"
288 	    "  -t              timing for all events [default]\n"
289 	    "  -h              histograms for event times\n"
290 	    "  -s depth        stack traces <depth> deep\n"
291 	    "  -x opt[=val]    enable or modify DTrace options\n"
292 	    "\nData filtering options:\n\n"
293 	    "  -n nrecords     maximum number of data records [default: %d]\n"
294 	    "  -l lock[,size]  only watch <lock>, which can be specified as a\n"
295 	    "                  symbolic name or hex address; <size> defaults\n"
296 	    "                  to the ELF symbol size if available, 1 if not\n"
297 	    "  -f func[,size]  only watch events generated by <func>\n"
298 	    "  -d duration     only watch events longer than <duration>\n"
299 	    "  -T              trace (rather than sample) events\n"
300 	    "\nData reporting options:\n\n"
301 	    "  -c              coalesce lock data for arrays like pse_mutex[]\n"
302 	    "  -k              coalesce PCs within functions\n"
303 	    "  -g              show total events generated by function\n"
304 	    "  -w              wherever: don't distinguish events by caller\n"
305 	    "  -W              whichever: don't distinguish events by lock\n"
306 	    "  -R              display rates rather than counts\n"
307 	    "  -p              parsable output format (awk(1)-friendly)\n"
308 	    "  -P              sort lock data by (count * avg_time) product\n"
309 	    "  -D n            only display top <n> events of each type\n"
310 	    "  -o filename     send output to <filename>\n",
311 	    DEFAULT_HZ, DEFAULT_NRECS);
312 
313 	show_events('C', "Contention");
314 	show_events('H', "Hold-time");
315 	show_events('I', "Interrupt");
316 	show_events('E', "Error");
317 	(void) fprintf(stderr, "\n");
318 
319 	exit(1);
320 }
321 
322 static int
323 lockcmp(lsrec_t *a, lsrec_t *b)
324 {
325 	int i;
326 
327 	if (a->ls_event < b->ls_event)
328 		return (-1);
329 	if (a->ls_event > b->ls_event)
330 		return (1);
331 
332 	for (i = g_stkdepth - 1; i >= 0; i--) {
333 		if (a->ls_stack[i] < b->ls_stack[i])
334 			return (-1);
335 		if (a->ls_stack[i] > b->ls_stack[i])
336 			return (1);
337 	}
338 
339 	if (a->ls_caller < b->ls_caller)
340 		return (-1);
341 	if (a->ls_caller > b->ls_caller)
342 		return (1);
343 
344 	if (a->ls_lock < b->ls_lock)
345 		return (-1);
346 	if (a->ls_lock > b->ls_lock)
347 		return (1);
348 
349 	return (0);
350 }
351 
352 static int
353 countcmp(lsrec_t *a, lsrec_t *b)
354 {
355 	if (a->ls_event < b->ls_event)
356 		return (-1);
357 	if (a->ls_event > b->ls_event)
358 		return (1);
359 
360 	return (b->ls_count - a->ls_count);
361 }
362 
363 static int
364 timecmp(lsrec_t *a, lsrec_t *b)
365 {
366 	if (a->ls_event < b->ls_event)
367 		return (-1);
368 	if (a->ls_event > b->ls_event)
369 		return (1);
370 
371 	if (a->ls_time < b->ls_time)
372 		return (1);
373 	if (a->ls_time > b->ls_time)
374 		return (-1);
375 
376 	return (0);
377 }
378 
379 static int
380 lockcmp_anywhere(lsrec_t *a, lsrec_t *b)
381 {
382 	if (a->ls_event < b->ls_event)
383 		return (-1);
384 	if (a->ls_event > b->ls_event)
385 		return (1);
386 
387 	if (a->ls_lock < b->ls_lock)
388 		return (-1);
389 	if (a->ls_lock > b->ls_lock)
390 		return (1);
391 
392 	return (0);
393 }
394 
395 static int
396 lock_and_count_cmp_anywhere(lsrec_t *a, lsrec_t *b)
397 {
398 	if (a->ls_event < b->ls_event)
399 		return (-1);
400 	if (a->ls_event > b->ls_event)
401 		return (1);
402 
403 	if (a->ls_lock < b->ls_lock)
404 		return (-1);
405 	if (a->ls_lock > b->ls_lock)
406 		return (1);
407 
408 	return (b->ls_count - a->ls_count);
409 }
410 
411 static int
412 sitecmp_anylock(lsrec_t *a, lsrec_t *b)
413 {
414 	int i;
415 
416 	if (a->ls_event < b->ls_event)
417 		return (-1);
418 	if (a->ls_event > b->ls_event)
419 		return (1);
420 
421 	for (i = g_stkdepth - 1; i >= 0; i--) {
422 		if (a->ls_stack[i] < b->ls_stack[i])
423 			return (-1);
424 		if (a->ls_stack[i] > b->ls_stack[i])
425 			return (1);
426 	}
427 
428 	if (a->ls_caller < b->ls_caller)
429 		return (-1);
430 	if (a->ls_caller > b->ls_caller)
431 		return (1);
432 
433 	return (0);
434 }
435 
436 static int
437 site_and_count_cmp_anylock(lsrec_t *a, lsrec_t *b)
438 {
439 	int i;
440 
441 	if (a->ls_event < b->ls_event)
442 		return (-1);
443 	if (a->ls_event > b->ls_event)
444 		return (1);
445 
446 	for (i = g_stkdepth - 1; i >= 0; i--) {
447 		if (a->ls_stack[i] < b->ls_stack[i])
448 			return (-1);
449 		if (a->ls_stack[i] > b->ls_stack[i])
450 			return (1);
451 	}
452 
453 	if (a->ls_caller < b->ls_caller)
454 		return (-1);
455 	if (a->ls_caller > b->ls_caller)
456 		return (1);
457 
458 	return (b->ls_count - a->ls_count);
459 }
460 
461 static void
462 mergesort(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **a, lsrec_t **b, int n)
463 {
464 	int m = n / 2;
465 	int i, j;
466 
467 	if (m > 1)
468 		mergesort(cmp, a, b, m);
469 	if (n - m > 1)
470 		mergesort(cmp, a + m, b + m, n - m);
471 	for (i = m; i > 0; i--)
472 		b[i - 1] = a[i - 1];
473 	for (j = m - 1; j < n - 1; j++)
474 		b[n + m - j - 2] = a[j + 1];
475 	while (i < j)
476 		*a++ = cmp(b[i], b[j]) < 0 ? b[i++] : b[j--];
477 	*a = b[i];
478 }
479 
480 static void
481 coalesce(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **lock, int n)
482 {
483 	int i, j;
484 	lsrec_t *target, *current;
485 
486 	target = lock[0];
487 
488 	for (i = 1; i < n; i++) {
489 		current = lock[i];
490 		if (cmp(current, target) != 0) {
491 			target = current;
492 			continue;
493 		}
494 		current->ls_event = LS_MAX_EVENTS;
495 		target->ls_count += current->ls_count;
496 		target->ls_refcnt += current->ls_refcnt;
497 		if (g_recsize < LS_TIME)
498 			continue;
499 		target->ls_time += current->ls_time;
500 		if (g_recsize < LS_HIST)
501 			continue;
502 		for (j = 0; j < 64; j++)
503 			target->ls_hist[j] += current->ls_hist[j];
504 	}
505 }
506 
/*
 * Round *addrp down to the start of the symbol that contains it.  The
 * address is left alone if addr_to_sym() finds no symbol for it or if the
 * offset it reports is not smaller than the symbol's size.
 */
static void
coalesce_symbol(uintptr_t *addrp)
{
	uintptr_t offset;
	size_t length;

	if (addr_to_sym(*addrp, &offset, &length) == NULL)
		return;

	if (offset >= length)
		return;

	*addrp -= offset;
}
516 
/*
 * AND another term onto the D predicate string *pred.
 *
 * pred   in/out: heap-allocated predicate, or NULL for "no predicate yet".
 *        It is replaced by a newly allocated string and the old one freed.
 * what   left-hand expression of the new term; NULL makes the call a no-op.
 * cmp    comparison operator, or NULL if `what' is a complete term.
 * value  right-hand operand, formatted with "0x%p"; used only with cmp.
 *
 * The result is "(old) && (term)", or just "term" when there was no
 * previous predicate.  Allocation failure is fatal.
 */
static void
predicate_add(char **pred, char *what, char *cmp, uintptr_t value)
{
	const char *cur;
	char *new;
	size_t newlen;

	if (what == NULL)
		return;

	/* a NULL predicate behaves exactly like an empty one */
	cur = (*pred != NULL) ? *pred : "";

	/*
	 * The 32 bytes of slack cover the formatted pointer, the separators
	 * around the operator and the terminating NUL; the operator itself
	 * is counted separately.
	 */
	newlen = strlen(cur) + strlen(what) +
	    (cmp != NULL ? strlen(cmp) : 0) + 32 + strlen("( && )");

	if ((new = malloc(newlen)) == NULL)
		fail(1, "failed to allocate predicate");

	if (cur[0] != '\0') {
		if (cmp != NULL) {
			(void) snprintf(new, newlen, "(%s) && (%s %s 0x%p)",
			    cur, what, cmp, (void *)value);
		} else {
			(void) snprintf(new, newlen, "(%s) && (%s)",
			    cur, what);
		}
	} else {
		if (cmp != NULL) {
			(void) snprintf(new, newlen, "%s %s 0x%p",
			    what, cmp, (void *)value);
		} else {
			(void) snprintf(new, newlen, "%s", what);
		}
	}

	free(*pred);
	*pred = new;
}
554 
/*
 * Free the predicate string *pred and reset it to NULL, so that it can be
 * built up again with predicate_add().  Safe to call when *pred is already
 * NULL.
 */
static void
predicate_destroy(char **pred)
{
	char *old = *pred;

	*pred = NULL;
	free(old);
}
561 
/*
 * OR another address-range test onto the D filter expression *filt.
 *
 * filt  in/out: heap-allocated filter, or NULL for "no filter yet".  It is
 *       replaced by a newly allocated string and the old one freed.
 * what  D expression being range-checked.
 * base  first address of the range (inclusive).
 * size  length of the range; base + size is the exclusive upper bound.
 *
 * The term appended is "(what >= 0x<base> && what < 0x<base+size>)",
 * preceded by " || " unless it is the first term.  A term that does not fit
 * the local buffer, or an allocation failure, is fatal.
 */
static void
filter_add(char **filt, char *what, uintptr_t base, uintptr_t size)
{
	char buf[256], *new;
	const char *cur;
	size_t curlen;
	int n;

	/* a NULL filter behaves exactly like an empty one */
	cur = (*filt != NULL) ? *filt : "";

	n = snprintf(buf, sizeof (buf), "%s(%s >= 0x%p && %s < 0x%p)",
	    cur[0] != '\0' ? " || " : "", what, (void *)base, what,
	    (void *)(base + size));

	if (n < 0 || (size_t)n >= sizeof (buf))
		fail(0, "filter term for '%s' is too long", what);

	curlen = strlen(cur);

	if ((new = malloc(curlen + (size_t)n + 1)) == NULL)
		fail(1, "failed to allocate filter");

	(void) memcpy(new, cur, curlen);
	(void) memcpy(new + curlen, buf, (size_t)n + 1);	/* with NUL */

	free(*filt);
	*filt = new;
}
583 
/*
 * Free the filter string *filt and reset it to NULL.  Safe to call when
 * *filt is already NULL.
 */
static void
filter_destroy(char **filt)
{
	char *old = *filt;

	*filt = NULL;
	free(old);
}
590 
591 static void
592 dprog_add(const char *fmt, ...)
593 {
594 	va_list args;
595 	int size, offs;
596 	char c;
597 
598 	va_start(args, fmt);
599 	size = vsnprintf(&c, 1, fmt, args) + 1;
600 
601 	if (g_proglen == 0) {
602 		offs = 0;
603 	} else {
604 		offs = g_proglen - 1;
605 	}
606 
607 	g_proglen = offs + size;
608 
609 	if ((g_prog = realloc(g_prog, g_proglen)) == NULL)
610 		fail(1, "failed to reallocate program text");
611 
612 	(void) vsnprintf(&g_prog[offs], size, fmt, args);
613 }
614 
615 /*
616  * This function may read like an open sewer, but keep in mind that programs
617  * that generate other programs are rarely pretty.  If one has the unenviable
618  * task of maintaining or -- worse -- extending this code, use the -V option
619  * to examine the D program as generated by this function.
620  */
621 static void
622 dprog_addevent(int event)
623 {
624 	ls_event_info_t *info = &g_event_info[event];
625 	char *pred = NULL;
626 	char stack[20];
627 	const char *arg0, *caller;
628 	char *arg1 = "arg1";
629 	char buf[80];
630 	hrtime_t dur;
631 	int depth;
632 
633 	if (info->ev_name[0] == '\0')
634 		return;
635 
636 	if (info->ev_type == 'I') {
637 		/*
638 		 * For interrupt events, arg0 (normally the lock pointer) is
639 		 * the CPU address plus the current pil, and arg1 (normally
640 		 * the number of nanoseconds) is the number of nanoseconds
641 		 * late -- and it's stored in arg2.
642 		 */
643 		arg0 = "(uintptr_t)curthread->t_cpu + \n"
644 		    "\t    curthread->t_cpu->cpu_profile_pil";
645 		caller = "(uintptr_t)arg0";
646 		arg1 = "arg2";
647 	} else {
648 		arg0 = "(uintptr_t)arg0";
649 		caller = "caller";
650 	}
651 
652 	if (g_recsize > LS_HIST) {
653 		for (depth = 0; g_recsize > LS_STACK(depth); depth++)
654 			continue;
655 
656 		if (g_tracing) {
657 			(void) sprintf(stack, "\tstack(%d);\n", depth);
658 		} else {
659 			(void) sprintf(stack, ", stack(%d)", depth);
660 		}
661 	} else {
662 		(void) sprintf(stack, "");
663 	}
664 
665 	if (info->ev_acquire != NULL) {
666 		/*
667 		 * If this is a hold event, we need to generate an additional
668 		 * clause for the acquire; the clause for the release will be
669 		 * generated with the aggregating statement, below.
670 		 */
671 		dprog_add("%s\n", info->ev_acquire);
672 		predicate_add(&pred, info->ev_predicate, NULL, 0);
673 		predicate_add(&pred, g_predicate, NULL, 0);
674 		if (pred != NULL)
675 			dprog_add("/%s/\n", pred);
676 
677 		dprog_add("{\n");
678 		(void) sprintf(buf, "self->ev%d[(uintptr_t)arg0]", event);
679 
680 		if (info->ev_type == 'H') {
681 			dprog_add("\t%s = timestamp;\n", buf);
682 		} else {
683 			/*
684 			 * If this isn't a hold event, it's the recursive
685 			 * error event.  For this, we simply bump the
686 			 * thread-local, per-lock count.
687 			 */
688 			dprog_add("\t%s++;\n", buf);
689 		}
690 
691 		dprog_add("}\n\n");
692 		predicate_destroy(&pred);
693 		pred = NULL;
694 
695 		if (info->ev_type == 'E') {
696 			/*
697 			 * If this is the recursive lock error event, we need
698 			 * to generate an additional clause to decrement the
699 			 * thread-local, per-lock count.  This assures that we
700 			 * only execute the aggregating clause if we have
701 			 * recursive entry.
702 			 */
703 			dprog_add("%s\n", info->ev_name);
704 			dprog_add("/%s/\n{\n\t%s--;\n}\n\n", buf, buf);
705 		}
706 
707 		predicate_add(&pred, buf, NULL, 0);
708 
709 		if (info->ev_type == 'H') {
710 			(void) sprintf(buf, "timestamp -\n\t    "
711 			    "self->ev%d[(uintptr_t)arg0]", event);
712 		}
713 
714 		arg1 = buf;
715 	} else {
716 		predicate_add(&pred, info->ev_predicate, NULL, 0);
717 		if (info->ev_type != 'I')
718 			predicate_add(&pred, g_predicate, NULL, 0);
719 		else
720 			predicate_add(&pred, g_ipredicate, NULL, 0);
721 	}
722 
723 	if ((dur = g_min_duration[event]) != 0)
724 		predicate_add(&pred, arg1, ">=", dur);
725 
726 	dprog_add("%s\n", info->ev_name);
727 
728 	if (pred != NULL)
729 		dprog_add("/%s/\n", pred);
730 	predicate_destroy(&pred);
731 
732 	dprog_add("{\n");
733 
734 	if (g_tracing) {
735 		dprog_add("\ttrace(%dULL);\n", event);
736 		dprog_add("\ttrace(%s);\n", arg0);
737 		dprog_add("\ttrace(%s);\n", caller);
738 		dprog_add(stack);
739 	} else {
740 		/*
741 		 * The ordering here is important:  when we process the
742 		 * aggregate, we count on the fact that @avg appears before
743 		 * @hist in program order to assure that @avg is assigned the
744 		 * first aggregation variable ID and @hist assigned the
745 		 * second; see the comment in process_aggregate() for details.
746 		 */
747 		dprog_add("\t@avg[%dULL, %s, %s%s] = avg(%s);\n",
748 		    event, arg0, caller, stack, arg1);
749 
750 		if (g_recsize >= LS_HIST) {
751 			dprog_add("\t@hist[%dULL, %s, %s%s] = quantize"
752 			    "(%s);\n", event, arg0, caller, stack, arg1);
753 		}
754 	}
755 
756 	if (info->ev_acquire != NULL)
757 		dprog_add("\tself->ev%d[arg0] = 0;\n", event);
758 
759 	dprog_add("}\n\n");
760 }
761 
762 static void
763 dprog_compile()
764 {
765 	dtrace_prog_t *prog;
766 	dtrace_proginfo_t info;
767 
768 	if (g_Vflag) {
769 		(void) fprintf(stderr, "lockstat: vvvv D program vvvv\n");
770 		(void) fputs(g_prog, stderr);
771 		(void) fprintf(stderr, "lockstat: ^^^^ D program ^^^^\n");
772 	}
773 
774 	if ((prog = dtrace_program_strcompile(g_dtp, g_prog,
775 	    DTRACE_PROBESPEC_NAME, 0, 0, NULL)) == NULL)
776 		dfail("failed to compile program");
777 
778 	if (dtrace_program_exec(g_dtp, prog, &info) == -1)
779 		dfail("failed to enable probes");
780 
781 	if (dtrace_go(g_dtp) != 0)
782 		dfail("couldn't start tracing");
783 }
784 
/*
 * Handler that status_init() installs for SIGUSR1, the signal raised by its
 * periodic timer.  It deliberately does nothing.
 * NOTE(review): presumably the point is only that delivering the signal
 * interrupts a blocking call in main() -- confirm against the caller.
 *
 * The signature matches what sigaction's sa_handler requires; a handler
 * declared without the int parameter has an incompatible function type.
 */
/*ARGSUSED*/
static void
status_fire(int sig)
{}
788 
789 static void
790 status_init(void)
791 {
792 	dtrace_optval_t val, status, agg;
793 	struct sigaction act;
794 	struct itimerspec ts;
795 	struct sigevent ev;
796 	timer_t tid;
797 
798 	if (dtrace_getopt(g_dtp, "statusrate", &status) == -1)
799 		dfail("failed to get 'statusrate'");
800 
801 	if (dtrace_getopt(g_dtp, "aggrate", &agg) == -1)
802 		dfail("failed to get 'statusrate'");
803 
804 	/*
805 	 * We would want to awaken at a rate that is the GCD of the statusrate
806 	 * and the aggrate -- but that seems a bit absurd.  Instead, we'll
807 	 * simply awaken at a rate that is the more frequent of the two, which
808 	 * assures that we're never later than the interval implied by the
809 	 * more frequent rate.
810 	 */
811 	val = status < agg ? status : agg;
812 
813 	(void) sigemptyset(&act.sa_mask);
814 	act.sa_flags = 0;
815 	act.sa_handler = status_fire;
816 	(void) sigaction(SIGUSR1, &act, NULL);
817 
818 	ev.sigev_notify = SIGEV_SIGNAL;
819 	ev.sigev_signo = SIGUSR1;
820 
821 	if (timer_create(CLOCK_REALTIME, &ev, &tid) == -1)
822 		dfail("cannot create CLOCK_REALTIME timer");
823 
824 	ts.it_value.tv_sec = val / NANOSEC;
825 	ts.it_value.tv_nsec = val % NANOSEC;
826 	ts.it_interval = ts.it_value;
827 
828 	if (timer_settime(tid, TIMER_RELTIME, &ts, NULL) == -1)
829 		dfail("cannot set time on CLOCK_REALTIME timer");
830 }
831 
832 static void
833 status_check(void)
834 {
835 	if (!g_tracing && dtrace_aggregate_snap(g_dtp) != 0)
836 		dfail("failed to snap aggregate");
837 
838 	if (dtrace_status(g_dtp) == -1)
839 		dfail("dtrace_status()");
840 }
841 
842 static void
843 lsrec_fill(lsrec_t *lsrec, const dtrace_recdesc_t *rec, int nrecs, caddr_t data)
844 {
845 	bzero(lsrec, g_recsize);
846 	lsrec->ls_count = 1;
847 
848 	if ((g_recsize > LS_HIST && nrecs < 4) || (nrecs < 3))
849 		fail(0, "truncated DTrace record");
850 
851 	if (rec->dtrd_size != sizeof (uint64_t))
852 		fail(0, "bad event size in first record");
853 
854 	/* LINTED - alignment */
855 	lsrec->ls_event = (uint32_t)*((uint64_t *)(data + rec->dtrd_offset));
856 	rec++;
857 
858 	if (rec->dtrd_size != sizeof (uintptr_t))
859 		fail(0, "bad lock address size in second record");
860 
861 	/* LINTED - alignment */
862 	lsrec->ls_lock = *((uintptr_t *)(data + rec->dtrd_offset));
863 	rec++;
864 
865 	if (rec->dtrd_size != sizeof (uintptr_t))
866 		fail(0, "bad caller size in third record");
867 
868 	/* LINTED - alignment */
869 	lsrec->ls_caller = *((uintptr_t *)(data + rec->dtrd_offset));
870 	rec++;
871 
872 	if (g_recsize > LS_HIST) {
873 		int frames, i;
874 		pc_t *stack;
875 
876 		frames = rec->dtrd_size / sizeof (pc_t);
877 		/* LINTED - alignment */
878 		stack = (pc_t *)(data + rec->dtrd_offset);
879 
880 		for (i = 1; i < frames; i++)
881 			lsrec->ls_stack[i - 1] = stack[i];
882 	}
883 }
884 
885 /*ARGSUSED*/
886 static int
887 count_aggregate(const dtrace_aggdata_t *agg, void *arg)
888 {
889 	*((size_t *)arg) += 1;
890 
891 	return (DTRACE_AGGWALK_NEXT);
892 }
893 
894 static int
895 process_aggregate(const dtrace_aggdata_t *agg, void *arg)
896 {
897 	const dtrace_aggdesc_t *aggdesc = agg->dtada_desc;
898 	caddr_t data = agg->dtada_data;
899 	lsdata_t *lsdata = arg;
900 	lsrec_t *lsrec = lsdata->lsd_next;
901 	const dtrace_recdesc_t *rec;
902 	uint64_t *avg, *quantized;
903 	int i, j;
904 
905 	assert(lsdata->lsd_count < g_nrecs);
906 
907 	/*
908 	 * Aggregation variable IDs are guaranteed to be generated in program
909 	 * order, and they are guaranteed to start from DTRACE_AGGVARIDNONE
910 	 * plus one.  As "avg" appears before "hist" in program order, we know
911 	 * that "avg" will be allocated the first aggregation variable ID, and
912 	 * "hist" will be allocated the second aggregation variable ID -- and
913 	 * we therefore use the aggregation variable ID to differentiate the
914 	 * cases.
915 	 */
916 	if (aggdesc->dtagd_varid > DTRACE_AGGVARIDNONE + 1) {
917 		/*
918 		 * If this is the histogram entry.  We'll copy the quantized
919 		 * data into lc_hist, and jump over the rest.
920 		 */
921 		rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
922 
923 		if (aggdesc->dtagd_varid != DTRACE_AGGVARIDNONE + 2)
924 			fail(0, "bad variable ID in aggregation record");
925 
926 		if (rec->dtrd_size !=
927 		    DTRACE_QUANTIZE_NBUCKETS * sizeof (uint64_t))
928 			fail(0, "bad quantize size in aggregation record");
929 
930 		/* LINTED - alignment */
931 		quantized = (uint64_t *)(data + rec->dtrd_offset);
932 
933 		for (i = DTRACE_QUANTIZE_ZEROBUCKET, j = 0;
934 		    i < DTRACE_QUANTIZE_NBUCKETS; i++, j++)
935 			lsrec->ls_hist[j] = quantized[i];
936 
937 		goto out;
938 	}
939 
940 	lsrec_fill(lsrec, &aggdesc->dtagd_rec[1],
941 	    aggdesc->dtagd_nrecs - 1, data);
942 
943 	rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
944 
945 	if (rec->dtrd_size != 2 * sizeof (uint64_t))
946 		fail(0, "bad avg size in aggregation record");
947 
948 	/* LINTED - alignment */
949 	avg = (uint64_t *)(data + rec->dtrd_offset);
950 	lsrec->ls_count = (uint32_t)avg[0];
951 	lsrec->ls_time = (uintptr_t)avg[1];
952 
953 	if (g_recsize >= LS_HIST)
954 		return (DTRACE_AGGWALK_NEXT);
955 
956 out:
957 	lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize);
958 	lsdata->lsd_count++;
959 
960 	return (DTRACE_AGGWALK_NEXT);
961 }
962 
963 static int
964 process_trace(const dtrace_probedata_t *pdata, void *arg)
965 {
966 	lsdata_t *lsdata = arg;
967 	lsrec_t *lsrec = lsdata->lsd_next;
968 	dtrace_eprobedesc_t *edesc = pdata->dtpda_edesc;
969 	caddr_t data = pdata->dtpda_data;
970 
971 	if (lsdata->lsd_count >= g_nrecs)
972 		return (DTRACE_CONSUME_NEXT);
973 
974 	lsrec_fill(lsrec, edesc->dtepd_rec, edesc->dtepd_nrecs, data);
975 
976 	lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize);
977 	lsdata->lsd_count++;
978 
979 	return (DTRACE_CONSUME_NEXT);
980 }
981 
982 static int
983 process_data(FILE *out, char *data)
984 {
985 	lsdata_t lsdata;
986 
987 	/* LINTED - alignment */
988 	lsdata.lsd_next = (lsrec_t *)data;
989 	lsdata.lsd_count = 0;
990 
991 	if (g_tracing) {
992 		if (dtrace_consume(g_dtp, out,
993 		    process_trace, NULL, &lsdata) != 0)
994 			dfail("failed to consume buffer");
995 
996 		return (lsdata.lsd_count);
997 	}
998 
999 	if (dtrace_aggregate_walk_keyvarsorted(g_dtp,
1000 	    process_aggregate, &lsdata) != 0)
1001 		dfail("failed to walk aggregate");
1002 
1003 	return (lsdata.lsd_count);
1004 }
1005 
1006 /*ARGSUSED*/
1007 static int
1008 drophandler(const dtrace_dropdata_t *data, void *arg)
1009 {
1010 	g_dropped++;
1011 	(void) fprintf(stderr, "lockstat: warning: %s", data->dtdda_msg);
1012 	return (DTRACE_HANDLE_OK);
1013 }
1014 
1015 int
1016 main(int argc, char **argv)
1017 {
1018 	char *data_buf;
1019 	lsrec_t *lsp, **current, **first, **sort_buf, **merge_buf;
1020 	FILE *out = stdout;
1021 	char c;
1022 	pid_t child;
1023 	int status;
1024 	int i, j;
1025 	hrtime_t duration;
1026 	char *addrp, *offp, *sizep, *evp, *lastp, *p;
1027 	uintptr_t addr;
1028 	size_t size, off;
1029 	int events_specified = 0;
1030 	int exec_errno = 0;
1031 	uint32_t event;
1032 	char *filt = NULL, *ifilt = NULL;
1033 	static uint64_t ev_count[LS_MAX_EVENTS + 1];
1034 	static uint64_t ev_time[LS_MAX_EVENTS + 1];
1035 	dtrace_optval_t aggsize;
1036 	char aggstr[10];
1037 	long ncpus;
1038 	int dynvar = 0;
1039 	int err;
1040 
1041 	if ((g_dtp = dtrace_open(DTRACE_VERSION, 0, &err)) == NULL) {
1042 		fail(0, "cannot open dtrace library: %s",
1043 		    dtrace_errmsg(NULL, err));
1044 	}
1045 
1046 	if (dtrace_handle_drop(g_dtp, &drophandler, NULL) == -1)
1047 		dfail("couldn't establish drop handler");
1048 
1049 	if (symtab_init() == -1)
1050 		fail(1, "can't load kernel symbols");
1051 
1052 	g_nrecs = DEFAULT_NRECS;
1053 
1054 	while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != EOF) {
1055 		switch (c) {
1056 		case 'b':
1057 			g_recsize = LS_BASIC;
1058 			break;
1059 
1060 		case 't':
1061 			g_recsize = LS_TIME;
1062 			break;
1063 
1064 		case 'h':
1065 			g_recsize = LS_HIST;
1066 			break;
1067 
1068 		case 's':
1069 			if (!isdigit(optarg[0]))
1070 				usage();
1071 			g_stkdepth = atoi(optarg);
1072 			if (g_stkdepth > LS_MAX_STACK_DEPTH)
1073 				fail(0, "max stack depth is %d",
1074 				    LS_MAX_STACK_DEPTH);
1075 			g_recsize = LS_STACK(g_stkdepth);
1076 			break;
1077 
1078 		case 'n':
1079 			if (!isdigit(optarg[0]))
1080 				usage();
1081 			g_nrecs = atoi(optarg);
1082 			break;
1083 
1084 		case 'd':
1085 			if (!isdigit(optarg[0]))
1086 				usage();
1087 			duration = atoll(optarg);
1088 
1089 			/*
1090 			 * XXX -- durations really should be per event
1091 			 * since the units are different, but it's hard
1092 			 * to express this nicely in the interface.
1093 			 * Not clear yet what the cleanest solution is.
1094 			 */
1095 			for (i = 0; i < LS_MAX_EVENTS; i++)
1096 				if (g_event_info[i].ev_type != 'E')
1097 					g_min_duration[i] = duration;
1098 
1099 			break;
1100 
1101 		case 'i':
1102 			if (!isdigit(optarg[0]))
1103 				usage();
1104 			i = atoi(optarg);
1105 			if (i <= 0)
1106 				usage();
1107 			if (i > MAX_HZ)
1108 				fail(0, "max interrupt rate is %d Hz", MAX_HZ);
1109 
1110 			for (j = 0; j < LS_MAX_EVENTS; j++)
1111 				if (strcmp(g_event_info[j].ev_desc,
1112 				    "Profiling interrupt") == 0)
1113 					break;
1114 
1115 			(void) sprintf(g_event_info[j].ev_name,
1116 			    "profile:::profile-%d", i);
1117 			break;
1118 
1119 		case 'l':
1120 		case 'f':
1121 			addrp = strtok(optarg, ",");
1122 			sizep = strtok(NULL, ",");
1123 			addrp = strtok(optarg, ",+");
1124 			offp = strtok(NULL, ",");
1125 
1126 			size = sizep ? strtoul(sizep, NULL, 0) : 1;
1127 			off = offp ? strtoul(offp, NULL, 0) : 0;
1128 
1129 			if (addrp[0] == '0') {
1130 				addr = strtoul(addrp, NULL, 16) + off;
1131 			} else {
1132 				addr = sym_to_addr(addrp) + off;
1133 				if (sizep == NULL)
1134 					size = sym_size(addrp) - off;
1135 				if (addr - off == 0)
1136 					fail(0, "symbol '%s' not found", addrp);
1137 				if (size == 0)
1138 					size = 1;
1139 			}
1140 
1141 
1142 			if (c == 'l') {
1143 				filter_add(&filt, "arg0", addr, size);
1144 			} else {
1145 				filter_add(&filt, "caller", addr, size);
1146 				filter_add(&ifilt, "arg0", addr, size);
1147 			}
1148 			break;
1149 
1150 		case 'e':
1151 			evp = strtok_r(optarg, ",", &lastp);
1152 			while (evp) {
1153 				int ev1, ev2;
1154 				char *evp2;
1155 
1156 				(void) strtok(evp, "-");
1157 				evp2 = strtok(NULL, "-");
1158 				ev1 = atoi(evp);
1159 				ev2 = evp2 ? atoi(evp2) : ev1;
1160 				if ((uint_t)ev1 >= LS_MAX_EVENTS ||
1161 				    (uint_t)ev2 >= LS_MAX_EVENTS || ev1 > ev2)
1162 					fail(0, "-e events out of range");
1163 				for (i = ev1; i <= ev2; i++)
1164 					g_enabled[i] = 1;
1165 				evp = strtok_r(NULL, ",", &lastp);
1166 			}
1167 			events_specified = 1;
1168 			break;
1169 
1170 		case 'c':
1171 			g_cflag = 1;
1172 			break;
1173 
1174 		case 'k':
1175 			g_kflag = 1;
1176 			break;
1177 
1178 		case 'w':
1179 			g_wflag = 1;
1180 			break;
1181 
1182 		case 'W':
1183 			g_Wflag = 1;
1184 			break;
1185 
1186 		case 'g':
1187 			g_gflag = 1;
1188 			break;
1189 
1190 		case 'C':
1191 		case 'E':
1192 		case 'H':
1193 		case 'I':
1194 			for (i = 0; i < LS_MAX_EVENTS; i++)
1195 				if (g_event_info[i].ev_type == c)
1196 					g_enabled[i] = 1;
1197 			events_specified = 1;
1198 			break;
1199 
1200 		case 'A':
1201 			for (i = 0; i < LS_MAX_EVENTS; i++)
1202 				if (strchr("CH", g_event_info[i].ev_type))
1203 					g_enabled[i] = 1;
1204 			events_specified = 1;
1205 			break;
1206 
1207 		case 'T':
1208 			g_tracing = 1;
1209 			break;
1210 
1211 		case 'D':
1212 			if (!isdigit(optarg[0]))
1213 				usage();
1214 			g_topn = atoi(optarg);
1215 			break;
1216 
1217 		case 'R':
1218 			g_rates = 1;
1219 			break;
1220 
1221 		case 'p':
1222 			g_pflag = 1;
1223 			break;
1224 
1225 		case 'P':
1226 			g_Pflag = 1;
1227 			break;
1228 
1229 		case 'o':
1230 			if ((out = fopen(optarg, "w")) == NULL)
1231 				fail(1, "error opening file");
1232 			break;
1233 
1234 		case 'V':
1235 			g_Vflag = 1;
1236 			break;
1237 
1238 		default:
1239 			if (strchr(LOCKSTAT_OPTSTR, c) == NULL)
1240 				usage();
1241 		}
1242 	}
1243 
1244 	if (filt != NULL) {
1245 		predicate_add(&g_predicate, filt, NULL, 0);
1246 		filter_destroy(&filt);
1247 	}
1248 
1249 	if (ifilt != NULL) {
1250 		predicate_add(&g_ipredicate, ifilt, NULL, 0);
1251 		filter_destroy(&ifilt);
1252 	}
1253 
1254 	if (g_recsize == 0) {
1255 		if (g_gflag) {
1256 			g_stkdepth = LS_MAX_STACK_DEPTH;
1257 			g_recsize = LS_STACK(g_stkdepth);
1258 		} else {
1259 			g_recsize = LS_TIME;
1260 		}
1261 	}
1262 
1263 	if (g_gflag && g_recsize <= LS_STACK(0))
1264 		fail(0, "'-g' requires at least '-s 1' data gathering");
1265 
1266 	/*
1267 	 * Make sure the alignment is reasonable
1268 	 */
1269 	g_recsize = -(-g_recsize & -sizeof (uint64_t));
1270 
1271 	for (i = 0; i < LS_MAX_EVENTS; i++) {
1272 		/*
1273 		 * If no events were specified, enable -C.
1274 		 */
1275 		if (!events_specified && g_event_info[i].ev_type == 'C')
1276 			g_enabled[i] = 1;
1277 	}
1278 
1279 	for (i = 0; i < LS_MAX_EVENTS; i++) {
1280 		if (!g_enabled[i])
1281 			continue;
1282 
1283 		if (g_event_info[i].ev_acquire != NULL) {
1284 			/*
1285 			 * If we've enabled a hold event, we must explicitly
1286 			 * allocate dynamic variable space.
1287 			 */
1288 			dynvar = 1;
1289 		}
1290 
1291 		dprog_addevent(i);
1292 	}
1293 
1294 	/*
1295 	 * Make sure there are remaining arguments to specify a child command
1296 	 * to execute.
1297 	 */
1298 	if (argc <= optind)
1299 		usage();
1300 
1301 	if ((ncpus = sysconf(_SC_NPROCESSORS_ONLN)) == -1)
1302 		dfail("couldn't determine number of online CPUs");
1303 
1304 	/*
1305 	 * By default, we set our data buffer size to be the number of records
1306 	 * multiplied by the size of the record, doubled to account for some
1307 	 * DTrace slop and divided by the number of CPUs.  We silently clamp
1308 	 * the aggregation size at both a minimum and a maximum to prevent
1309 	 * absurdly low or high values.
1310 	 */
1311 	if ((aggsize = (g_nrecs * g_recsize * 2) / ncpus) < MIN_AGGSIZE)
1312 		aggsize = MIN_AGGSIZE;
1313 
1314 	if (aggsize > MAX_AGGSIZE)
1315 		aggsize = MAX_AGGSIZE;
1316 
1317 	(void) sprintf(aggstr, "%lld", (long long)aggsize);
1318 
1319 	if (!g_tracing) {
1320 		if (dtrace_setopt(g_dtp, "bufsize", "4k") == -1)
1321 			dfail("failed to set 'bufsize'");
1322 
1323 		if (dtrace_setopt(g_dtp, "aggsize", aggstr) == -1)
1324 			dfail("failed to set 'aggsize'");
1325 
1326 		if (dynvar) {
1327 			/*
1328 			 * If we're using dynamic variables, we set our
1329 			 * dynamic variable size to be one megabyte per CPU,
1330 			 * with a hard-limit of 32 megabytes.  This may still
1331 			 * be too small in some cases, but it can be tuned
1332 			 * manually via -x if need be.
1333 			 */
1334 			(void) sprintf(aggstr, "%ldm", ncpus < 32 ? ncpus : 32);
1335 
1336 			if (dtrace_setopt(g_dtp, "dynvarsize", aggstr) == -1)
1337 				dfail("failed to set 'dynvarsize'");
1338 		}
1339 	} else {
1340 		if (dtrace_setopt(g_dtp, "bufsize", aggstr) == -1)
1341 			dfail("failed to set 'bufsize'");
1342 	}
1343 
1344 	if (dtrace_setopt(g_dtp, "statusrate", "10sec") == -1)
1345 		dfail("failed to set 'statusrate'");
1346 
1347 	optind = 1;
1348 	while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != EOF) {
1349 		switch (c) {
1350 		case 'x':
1351 			if ((p = strchr(optarg, '=')) != NULL)
1352 				*p++ = '\0';
1353 
1354 			if (dtrace_setopt(g_dtp, optarg, p) != 0)
1355 				dfail("failed to set -x %s", optarg);
1356 			break;
1357 		}
1358 	}
1359 
1360 	argc -= optind;
1361 	argv += optind;
1362 
1363 	dprog_compile();
1364 	status_init();
1365 
1366 	g_elapsed = -gethrtime();
1367 
1368 	/*
1369 	 * Spawn the specified command and wait for it to complete.
1370 	 */
1371 	child = fork();
1372 	if (child == -1)
1373 		fail(1, "cannot fork");
1374 	if (child == 0) {
1375 		(void) dtrace_close(g_dtp);
1376 		(void) execvp(argv[0], &argv[0]);
1377 		exec_errno = errno;
1378 		exit(127);
1379 	}
1380 
1381 	while (waitpid(child, &status, WEXITED) != child)
1382 		status_check();
1383 
1384 	g_elapsed += gethrtime();
1385 
1386 	if (WIFEXITED(status)) {
1387 		if (WEXITSTATUS(status) != 0) {
1388 			if (exec_errno != 0) {
1389 				errno = exec_errno;
1390 				fail(1, "could not execute %s", argv[0]);
1391 			}
1392 			(void) fprintf(stderr,
1393 			    "lockstat: warning: %s exited with code %d\n",
1394 			    argv[0], WEXITSTATUS(status));
1395 		}
1396 	} else {
1397 		(void) fprintf(stderr,
1398 		    "lockstat: warning: %s died on signal %d\n",
1399 		    argv[0], WTERMSIG(status));
1400 	}
1401 
1402 	if (dtrace_stop(g_dtp) == -1)
1403 		dfail("failed to stop dtrace");
1404 
1405 	/*
1406 	 * Before we read out the results, we need to allocate our buffer.
1407 	 * If we're tracing, then we'll just use the precalculated size.  If
1408 	 * we're not, then we'll take a snapshot of the aggregate, and walk
1409 	 * it to count the number of records.
1410 	 */
1411 	if (!g_tracing) {
1412 		if (dtrace_aggregate_snap(g_dtp) != 0)
1413 			dfail("failed to snap aggregate");
1414 
1415 		g_nrecs = 0;
1416 
1417 		if (dtrace_aggregate_walk(g_dtp,
1418 		    count_aggregate, &g_nrecs) != 0)
1419 			dfail("failed to walk aggregate");
1420 	}
1421 
1422 	if ((data_buf = memalign(sizeof (uint64_t),
1423 	    (g_nrecs + 1) * g_recsize)) == NULL)
1424 		fail(1, "Memory allocation failed");
1425 
1426 	/*
1427 	 * Read out the DTrace data.
1428 	 */
1429 	g_nrecs_used = process_data(out, data_buf);
1430 
1431 	if (g_nrecs_used > g_nrecs || g_dropped)
1432 		(void) fprintf(stderr, "lockstat: warning: "
1433 		    "ran out of data records (use -n for more)\n");
1434 
1435 	/* LINTED - alignment */
1436 	for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1437 	    /* LINTED - alignment */
1438 	    lsp = (lsrec_t *)((char *)lsp + g_recsize)) {
1439 		ev_count[lsp->ls_event] += lsp->ls_count;
1440 		ev_time[lsp->ls_event] += lsp->ls_time;
1441 	}
1442 
1443 	/*
1444 	 * If -g was specified, convert stacks into individual records.
1445 	 */
1446 	if (g_gflag) {
1447 		lsrec_t *newlsp, *oldlsp;
1448 
1449 		newlsp = memalign(sizeof (uint64_t),
1450 		    g_nrecs_used * LS_TIME * (g_stkdepth + 1));
1451 		if (newlsp == NULL)
1452 			fail(1, "Cannot allocate space for -g processing");
1453 		lsp = newlsp;
1454 		/* LINTED - alignment */
1455 		for (i = 0, oldlsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1456 		    /* LINTED - alignment */
1457 		    oldlsp = (lsrec_t *)((char *)oldlsp + g_recsize)) {
1458 			int fr;
1459 			int caller_in_stack = 0;
1460 
1461 			if (oldlsp->ls_count == 0)
1462 				continue;
1463 
1464 			for (fr = 0; fr < g_stkdepth; fr++) {
1465 				if (oldlsp->ls_stack[fr] == 0)
1466 					break;
1467 				if (oldlsp->ls_stack[fr] == oldlsp->ls_caller)
1468 					caller_in_stack = 1;
1469 				bcopy(oldlsp, lsp, LS_TIME);
1470 				lsp->ls_caller = oldlsp->ls_stack[fr];
1471 				/* LINTED - alignment */
1472 				lsp = (lsrec_t *)((char *)lsp + LS_TIME);
1473 			}
1474 			if (!caller_in_stack) {
1475 				bcopy(oldlsp, lsp, LS_TIME);
1476 				/* LINTED - alignment */
1477 				lsp = (lsrec_t *)((char *)lsp + LS_TIME);
1478 			}
1479 		}
1480 		g_nrecs = g_nrecs_used =
1481 		    ((uintptr_t)lsp - (uintptr_t)newlsp) / LS_TIME;
1482 		g_recsize = LS_TIME;
1483 		g_stkdepth = 0;
1484 		free(data_buf);
1485 		data_buf = (char *)newlsp;
1486 	}
1487 
1488 	if ((sort_buf = calloc(2 * (g_nrecs + 1),
1489 	    sizeof (void *))) == NULL)
1490 		fail(1, "Sort buffer allocation failed");
1491 	merge_buf = sort_buf + (g_nrecs + 1);
1492 
1493 	/*
1494 	 * Build the sort buffer, discarding zero-count records along the way.
1495 	 */
1496 	/* LINTED - alignment */
1497 	for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1498 	    /* LINTED - alignment */
1499 	    lsp = (lsrec_t *)((char *)lsp + g_recsize)) {
1500 		if (lsp->ls_count == 0)
1501 			lsp->ls_event = LS_MAX_EVENTS;
1502 		sort_buf[i] = lsp;
1503 	}
1504 
1505 	if (g_nrecs_used == 0)
1506 		exit(0);
1507 
1508 	/*
1509 	 * Add a sentinel after the last record
1510 	 */
1511 	sort_buf[i] = lsp;
1512 	lsp->ls_event = LS_MAX_EVENTS;
1513 
1514 	if (g_tracing) {
1515 		report_trace(out, sort_buf);
1516 		return (0);
1517 	}
1518 
1519 	/*
1520 	 * Application of -g may have resulted in multiple records
1521 	 * with the same signature; coalesce them.
1522 	 */
1523 	if (g_gflag) {
1524 		mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used);
1525 		coalesce(lockcmp, sort_buf, g_nrecs_used);
1526 	}
1527 
1528 	/*
1529 	 * Coalesce locks within the same symbol if -c option specified.
1530 	 * Coalesce PCs within the same function if -k option specified.
1531 	 */
1532 	if (g_cflag || g_kflag) {
1533 		for (i = 0; i < g_nrecs_used; i++) {
1534 			int fr;
1535 			lsp = sort_buf[i];
1536 			if (g_cflag)
1537 				coalesce_symbol(&lsp->ls_lock);
1538 			if (g_kflag) {
1539 				for (fr = 0; fr < g_stkdepth; fr++)
1540 					coalesce_symbol(&lsp->ls_stack[fr]);
1541 				coalesce_symbol(&lsp->ls_caller);
1542 			}
1543 		}
1544 		mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used);
1545 		coalesce(lockcmp, sort_buf, g_nrecs_used);
1546 	}
1547 
1548 	/*
1549 	 * Coalesce callers if -w option specified
1550 	 */
1551 	if (g_wflag) {
1552 		mergesort(lock_and_count_cmp_anywhere,
1553 		    sort_buf, merge_buf, g_nrecs_used);
1554 		coalesce(lockcmp_anywhere, sort_buf, g_nrecs_used);
1555 	}
1556 
1557 	/*
1558 	 * Coalesce locks if -W option specified
1559 	 */
1560 	if (g_Wflag) {
1561 		mergesort(site_and_count_cmp_anylock,
1562 		    sort_buf, merge_buf, g_nrecs_used);
1563 		coalesce(sitecmp_anylock, sort_buf, g_nrecs_used);
1564 	}
1565 
1566 	/*
1567 	 * Sort data by contention count (ls_count) or total time (ls_time),
1568 	 * depending on g_Pflag.  Override g_Pflag if time wasn't measured.
1569 	 */
1570 	if (g_recsize < LS_TIME)
1571 		g_Pflag = 0;
1572 
1573 	if (g_Pflag)
1574 		mergesort(timecmp, sort_buf, merge_buf, g_nrecs_used);
1575 	else
1576 		mergesort(countcmp, sort_buf, merge_buf, g_nrecs_used);
1577 
1578 	/*
1579 	 * Display data by event type
1580 	 */
1581 	first = &sort_buf[0];
1582 	while ((event = (*first)->ls_event) < LS_MAX_EVENTS) {
1583 		current = first;
1584 		while ((lsp = *current)->ls_event == event)
1585 			current++;
1586 		report_stats(out, first, current - first, ev_count[event],
1587 		    ev_time[event]);
1588 		first = current;
1589 	}
1590 
1591 	return (0);
1592 }
1593 
/*
 * Render 'addr' into 'buf' in symbolic form and return 'buf'.
 *
 * The address is resolved with addr_to_sym(), which yields a symbol name,
 * the offset of addr from that symbol, and the symbol's size.  The output
 * form is chosen as follows:
 *   - offset 0:              "name", or "name[size]" when show_size is set
 *   - "cpu[" symbol, off<16: "name+off" in decimal (CPU+PIL)
 *   - offset within the symbol, or a small offset (< 256) that is not the
 *     address itself:        "name+0xoff"
 *   - otherwise:             the raw address in hex
 * The caller must supply a buffer large enough for the result.
 */
static char *
format_symbol(char *buf, uintptr_t addr, int show_size)
{
	uintptr_t delta;
	size_t extent;
	char *name = addr_to_sym(addr, &delta, &extent);

	if (delta == 0) {
		/* Exact hit on the start of a symbol. */
		if (show_size)
			(void) sprintf(buf, "%s[%ld]", name, (long)extent);
		else
			(void) sprintf(buf, "%s", name);
		return (buf);
	}

	if (delta < 16 && bcmp(name, "cpu[", 4) == 0) {
		/* CPU+PIL */
		(void) sprintf(buf, "%s+%ld", name, (long)delta);
	} else if (delta <= extent || (delta < 256 && addr != delta)) {
		(void) sprintf(buf, "%s+0x%llx", name,
		    (unsigned long long)delta);
	} else {
		(void) sprintf(buf, "0x%llx", (unsigned long long)addr);
	}

	return (buf);
}
1616 
1617 static void
1618 report_stats(FILE *out, lsrec_t **sort_buf, size_t nrecs, uint64_t total_count,
1619 	uint64_t total_time)
1620 {
1621 	uint32_t event = sort_buf[0]->ls_event;
1622 	lsrec_t *lsp;
1623 	double ptotal = 0.0;
1624 	double percent;
1625 	int i, j, fr;
1626 	int displayed;
1627 	int first_bin, last_bin, max_bin_count, total_bin_count;
1628 	int rectype;
1629 	char buf[256];
1630 	char lhdr[80], chdr[80];
1631 
1632 	rectype = g_recsize;
1633 
1634 	if (g_topn == 0) {
1635 		(void) fprintf(out, "%20llu %s\n",
1636 		    g_rates == 0 ? total_count :
1637 		    ((unsigned long long)total_count * NANOSEC) / g_elapsed,
1638 		    g_event_info[event].ev_desc);
1639 		return;
1640 	}
1641 
1642 	(void) sprintf(lhdr, "%s%s",
1643 	    g_Wflag ? "Hottest " : "", g_event_info[event].ev_lhdr);
1644 	(void) sprintf(chdr, "%s%s",
1645 	    g_wflag ? "Hottest " : "", "Caller");
1646 
1647 	if (!g_pflag)
1648 		(void) fprintf(out,
1649 		    "\n%s: %.0f events in %.3f seconds (%.0f events/sec)\n\n",
1650 		    g_event_info[event].ev_desc, (double)total_count,
1651 		    (double)g_elapsed / NANOSEC,
1652 		    (double)total_count * NANOSEC / g_elapsed);
1653 
1654 	if (!g_pflag && rectype < LS_HIST) {
1655 		(void) sprintf(buf, "%s", g_event_info[event].ev_units);
1656 		(void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n",
1657 		    g_rates ? "ops/s" : "Count",
1658 		    g_gflag ? "genr" : "indv",
1659 		    "cuml", "rcnt", rectype >= LS_TIME ? buf : "", lhdr, chdr);
1660 		(void) fprintf(out, "---------------------------------"
1661 		    "----------------------------------------------\n");
1662 	}
1663 
1664 	displayed = 0;
1665 	for (i = 0; i < nrecs; i++) {
1666 		lsp = sort_buf[i];
1667 
1668 		if (displayed++ >= g_topn)
1669 			break;
1670 
1671 		if (g_pflag) {
1672 			int j;
1673 
1674 			(void) fprintf(out, "%u %u",
1675 			    lsp->ls_event, lsp->ls_count);
1676 			(void) fprintf(out, " %s",
1677 			    format_symbol(buf, lsp->ls_lock, g_cflag));
1678 			(void) fprintf(out, " %s",
1679 			    format_symbol(buf, lsp->ls_caller, 0));
1680 			(void) fprintf(out, " %f",
1681 			    (double)lsp->ls_refcnt / lsp->ls_count);
1682 			if (rectype >= LS_TIME)
1683 				(void) fprintf(out, " %llu",
1684 				    (unsigned long long)lsp->ls_time);
1685 			if (rectype >= LS_HIST) {
1686 				for (j = 0; j < 64; j++)
1687 					(void) fprintf(out, " %u",
1688 					    lsp->ls_hist[j]);
1689 			}
1690 			for (j = 0; j < LS_MAX_STACK_DEPTH; j++) {
1691 				if (rectype <= LS_STACK(j) ||
1692 				    lsp->ls_stack[j] == 0)
1693 					break;
1694 				(void) fprintf(out, " %s",
1695 				    format_symbol(buf, lsp->ls_stack[j], 0));
1696 			}
1697 			(void) fprintf(out, "\n");
1698 			continue;
1699 		}
1700 
1701 		if (rectype >= LS_HIST) {
1702 			(void) fprintf(out, "---------------------------------"
1703 			    "----------------------------------------------\n");
1704 			(void) sprintf(buf, "%s",
1705 			    g_event_info[event].ev_units);
1706 			(void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n",
1707 			    g_rates ? "ops/s" : "Count",
1708 			    g_gflag ? "genr" : "indv",
1709 			    "cuml", "rcnt", buf, lhdr, chdr);
1710 		}
1711 
1712 		if (g_Pflag && total_time != 0)
1713 			percent = (lsp->ls_time * 100.00) / total_time;
1714 		else
1715 			percent = (lsp->ls_count * 100.00) / total_count;
1716 
1717 		ptotal += percent;
1718 
1719 		if (rectype >= LS_TIME)
1720 			(void) sprintf(buf, "%llu",
1721 			    (unsigned long long)(lsp->ls_time / lsp->ls_count));
1722 		else
1723 			buf[0] = '\0';
1724 
1725 		(void) fprintf(out, "%5llu ",
1726 		    g_rates == 0 ? lsp->ls_count :
1727 		    ((uint64_t)lsp->ls_count * NANOSEC) / g_elapsed);
1728 
1729 		(void) fprintf(out, "%3.0f%% ", percent);
1730 
1731 		if (g_gflag)
1732 			(void) fprintf(out, "---- ");
1733 		else
1734 			(void) fprintf(out, "%3.0f%% ", ptotal);
1735 
1736 		(void) fprintf(out, "%4.2f %8s ",
1737 		    (double)lsp->ls_refcnt / lsp->ls_count, buf);
1738 
1739 		(void) fprintf(out, "%-22s ",
1740 		    format_symbol(buf, lsp->ls_lock, g_cflag));
1741 
1742 		(void) fprintf(out, "%-24s\n",
1743 		    format_symbol(buf, lsp->ls_caller, 0));
1744 
1745 		if (rectype < LS_HIST)
1746 			continue;
1747 
1748 		(void) fprintf(out, "\n");
1749 		(void) fprintf(out, "%10s %31s %-9s %-24s\n",
1750 		    g_event_info[event].ev_units,
1751 		    "------ Time Distribution ------",
1752 		    g_rates ? "ops/s" : "count",
1753 		    rectype > LS_STACK(0) ? "Stack" : "");
1754 
1755 		first_bin = 0;
1756 		while (lsp->ls_hist[first_bin] == 0)
1757 			first_bin++;
1758 
1759 		last_bin = 63;
1760 		while (lsp->ls_hist[last_bin] == 0)
1761 			last_bin--;
1762 
1763 		max_bin_count = 0;
1764 		total_bin_count = 0;
1765 		for (j = first_bin; j <= last_bin; j++) {
1766 			total_bin_count += lsp->ls_hist[j];
1767 			if (lsp->ls_hist[j] > max_bin_count)
1768 				max_bin_count = lsp->ls_hist[j];
1769 		}
1770 
1771 		/*
1772 		 * If we went a few frames below the caller, ignore them
1773 		 */
1774 		for (fr = 3; fr > 0; fr--)
1775 			if (lsp->ls_stack[fr] == lsp->ls_caller)
1776 				break;
1777 
1778 		for (j = first_bin; j <= last_bin; j++) {
1779 			uint_t depth = (lsp->ls_hist[j] * 30) / total_bin_count;
1780 			(void) fprintf(out, "%10llu |%s%s %-9u ",
1781 			    1ULL << j,
1782 			    "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + 30 - depth,
1783 			    "                              " + depth,
1784 			    g_rates == 0 ? lsp->ls_hist[j] :
1785 			    (uint_t)(((uint64_t)lsp->ls_hist[j] * NANOSEC) /
1786 			    g_elapsed));
1787 			if (rectype <= LS_STACK(fr) || lsp->ls_stack[fr] == 0) {
1788 				(void) fprintf(out, "\n");
1789 				continue;
1790 			}
1791 			(void) fprintf(out, "%-24s\n",
1792 			    format_symbol(buf, lsp->ls_stack[fr], 0));
1793 			fr++;
1794 		}
1795 		while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) {
1796 			(void) fprintf(out, "%15s %-36s %-24s\n", "", "",
1797 			    format_symbol(buf, lsp->ls_stack[fr], 0));
1798 			fr++;
1799 		}
1800 	}
1801 
1802 	if (!g_pflag)
1803 		(void) fprintf(out, "---------------------------------"
1804 		    "----------------------------------------------\n");
1805 
1806 	(void) fflush(out);
1807 }
1808 
1809 static void
1810 report_trace(FILE *out, lsrec_t **sort_buf)
1811 {
1812 	lsrec_t *lsp;
1813 	int i, fr;
1814 	int rectype;
1815 	char buf[256], buf2[256];
1816 
1817 	rectype = g_recsize;
1818 
1819 	if (!g_pflag) {
1820 		(void) fprintf(out, "%5s  %7s  %11s  %-24s  %-24s\n",
1821 		    "Event", "Time", "Owner", "Lock", "Caller");
1822 		(void) fprintf(out, "---------------------------------"
1823 		    "----------------------------------------------\n");
1824 	}
1825 
1826 	for (i = 0; i < g_nrecs_used; i++) {
1827 
1828 		lsp = sort_buf[i];
1829 
1830 		if (lsp->ls_event >= LS_MAX_EVENTS || lsp->ls_count == 0)
1831 			continue;
1832 
1833 		(void) fprintf(out, "%2d  %10llu  %11p  %-24s  %-24s\n",
1834 		    lsp->ls_event, (unsigned long long)lsp->ls_time,
1835 		    (void *)lsp->ls_next,
1836 		    format_symbol(buf, lsp->ls_lock, 0),
1837 		    format_symbol(buf2, lsp->ls_caller, 0));
1838 
1839 		if (rectype <= LS_STACK(0))
1840 			continue;
1841 
1842 		/*
1843 		 * If we went a few frames below the caller, ignore them
1844 		 */
1845 		for (fr = 3; fr > 0; fr--)
1846 			if (lsp->ls_stack[fr] == lsp->ls_caller)
1847 				break;
1848 
1849 		while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) {
1850 			(void) fprintf(out, "%53s  %-24s\n", "",
1851 			    format_symbol(buf, lsp->ls_stack[fr], 0));
1852 			fr++;
1853 		}
1854 		(void) fprintf(out, "\n");
1855 	}
1856 
1857 	(void) fflush(out);
1858 }
1859