xref: /titanic_51/usr/src/cmd/lockstat/lockstat.c (revision c8343062f6e25afd9c2a31b65df357030e69fa55)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <stdio.h>
30 #include <stddef.h>
31 #include <stdlib.h>
32 #include <stdarg.h>
33 #include <string.h>
34 #include <strings.h>
35 #include <ctype.h>
36 #include <fcntl.h>
37 #include <unistd.h>
38 #include <errno.h>
39 #include <limits.h>
40 #include <sys/types.h>
41 #include <sys/modctl.h>
42 #include <sys/stat.h>
43 #include <sys/wait.h>
44 #include <dtrace.h>
45 #include <sys/lockstat.h>
46 #include <alloca.h>
47 #include <signal.h>
48 #include <assert.h>
49 
/* Option string handed to getopt() by main(). */
#define	LOCKSTAT_OPTSTR	"x:bths:n:d:i:l:f:e:ckwWgCHEATID:RpPo:V"

#define	LS_MAX_STACK_DEPTH	50	/* capacity of ls_stack[] below */
#define	LS_MAX_EVENTS		64	/* slots in the per-event tables */

/*
 * One lockstat data record.  The member order is significant: the LS_BASIC,
 * LS_TIME, LS_HIST and LS_STACK() record sizes are computed as offsets into
 * this structure, so a record of a given experiment type is simply a prefix
 * of it.
 */
struct lsrec {
	struct lsrec	*ls_next;	/* next in hash chain */
	uintptr_t	ls_lock;	/* lock address */
	uintptr_t	ls_caller;	/* caller address */
	uint32_t	ls_count;	/* cumulative event count */
	uint32_t	ls_event;	/* type of event */
	uintptr_t	ls_refcnt;	/* cumulative reference count */
	uint64_t	ls_time;	/* cumulative event duration */
	uint32_t	ls_hist[64];	/* log2(duration) histogram */
	uintptr_t	ls_stack[LS_MAX_STACK_DEPTH];	/* caller's stack */
};

typedef struct lsrec lsrec_t;
66 
/*
 * Fill cursor handed to the DTrace consumer callbacks: where the next record
 * is to be written, and how many records have been written so far.
 */
struct lsdata {
	struct lsrec	*lsd_next;	/* next available */
	int		lsd_count;	/* number of records */
};

typedef struct lsdata lsdata_t;
71 
/*
 * The kinds of experiment that can be run, cheapest first (in both memory
 * and processing time).  Each value is the number of bytes of lsrec_t that
 * a record of that kind occupies, i.e. the offset of the first member the
 * experiment does NOT use.
 */
#define	LS_BASIC	offsetof(lsrec_t, ls_time)
#define	LS_TIME		offsetof(lsrec_t, ls_hist)
#define	LS_HIST		offsetof(lsrec_t, ls_stack)
#define	LS_STACK(depth)	offsetof(lsrec_t, ls_stack[depth])
81 
82 static void report_stats(FILE *, lsrec_t **, size_t, uint64_t, uint64_t);
83 static void report_trace(FILE *, lsrec_t **);
84 
85 extern int symtab_init(void);
86 extern char *addr_to_sym(uintptr_t, uintptr_t *, size_t *);
87 extern uintptr_t sym_to_addr(char *name);
88 extern size_t sym_size(char *name);
89 extern char *strtok_r(char *, const char *, char **);
90 
/* Defaults and limits for the command-line tunables. */
#define	DEFAULT_NRECS	10000			/* -n default */
#define	DEFAULT_HZ	97			/* -i default, in Hz */
#define	MAX_HZ		1000			/* largest rate -i accepts */
#define	MIN_AGGSIZE	(16 * 1024)
#define	MAX_AGGSIZE	(32 * 1024 * 1024)
96 
97 static int g_stkdepth;
98 static int g_topn = INT_MAX;
99 static hrtime_t g_elapsed;
100 static int g_rates = 0;
101 static int g_pflag = 0;
102 static int g_Pflag = 0;
103 static int g_wflag = 0;
104 static int g_Wflag = 0;
105 static int g_cflag = 0;
106 static int g_kflag = 0;
107 static int g_gflag = 0;
108 static int g_Vflag = 0;
109 static int g_tracing = 0;
110 static size_t g_recsize;
111 static size_t g_nrecs;
112 static int g_nrecs_used;
113 static uchar_t g_enabled[LS_MAX_EVENTS];
114 static hrtime_t g_min_duration[LS_MAX_EVENTS];
115 static dtrace_hdl_t *g_dtp;
116 static char *g_predicate;
117 static char *g_ipredicate;
118 static char *g_prog;
119 static int g_proglen;
120 static int g_dropped;
121 
122 typedef struct ls_event_info {
123 	char	ev_type;
124 	char	ev_lhdr[20];
125 	char	ev_desc[80];
126 	char	ev_units[10];
127 	char	ev_name[DTRACE_NAMELEN];
128 	char	*ev_predicate;
129 	char	*ev_acquire;
130 } ls_event_info_t;
131 
132 static ls_event_info_t g_event_info[LS_MAX_EVENTS] = {
133 	{ 'C',	"Lock",	"Adaptive mutex spin",			"spin",
134 	    "lockstat:::adaptive-spin" },
135 	{ 'C',	"Lock",	"Adaptive mutex block",			"nsec",
136 	    "lockstat:::adaptive-block" },
137 	{ 'C',	"Lock",	"Spin lock spin",			"spin",
138 	    "lockstat:::spin-spin" },
139 	{ 'C',	"Lock",	"Thread lock spin",			"spin",
140 	    "lockstat:::thread-spin" },
141 	{ 'C',	"Lock",	"R/W writer blocked by writer",		"nsec",
142 	    "lockstat:::rw-block", "arg2 == 0 && arg3 == 1" },
143 	{ 'C',	"Lock",	"R/W writer blocked by readers",	"nsec",
144 	    "lockstat:::rw-block", "arg2 == 0 && arg3 == 0 && arg4" },
145 	{ 'C',	"Lock",	"R/W reader blocked by writer",		"nsec",
146 	    "lockstat:::rw-block", "arg2 != 0 && arg3 == 1" },
147 	{ 'C',	"Lock",	"R/W reader blocked by write wanted",	"nsec",
148 	    "lockstat:::rw-block", "arg2 != 0 && arg3 == 0 && arg4" },
149 	{ 'C',	"Lock",	"Unknown event (type 8)",		"units"	},
150 	{ 'C',	"Lock",	"Unknown event (type 9)",		"units"	},
151 	{ 'C',	"Lock",	"Unknown event (type 10)",		"units"	},
152 	{ 'C',	"Lock",	"Unknown event (type 11)",		"units"	},
153 	{ 'C',	"Lock",	"Unknown event (type 12)",		"units"	},
154 	{ 'C',	"Lock",	"Unknown event (type 13)",		"units"	},
155 	{ 'C',	"Lock",	"Unknown event (type 14)",		"units"	},
156 	{ 'C',	"Lock",	"Unknown event (type 15)",		"units"	},
157 	{ 'C',	"Lock",	"Unknown event (type 16)",		"units"	},
158 	{ 'C',	"Lock",	"Unknown event (type 17)",		"units"	},
159 	{ 'C',	"Lock",	"Unknown event (type 18)",		"units"	},
160 	{ 'C',	"Lock",	"Unknown event (type 19)",		"units"	},
161 	{ 'C',	"Lock",	"Unknown event (type 20)",		"units"	},
162 	{ 'C',	"Lock",	"Unknown event (type 21)",		"units"	},
163 	{ 'C',	"Lock",	"Unknown event (type 22)",		"units"	},
164 	{ 'C',	"Lock",	"Unknown event (type 23)",		"units"	},
165 	{ 'C',	"Lock",	"Unknown event (type 24)",		"units"	},
166 	{ 'C',	"Lock",	"Unknown event (type 25)",		"units"	},
167 	{ 'C',	"Lock",	"Unknown event (type 26)",		"units"	},
168 	{ 'C',	"Lock",	"Unknown event (type 27)",		"units"	},
169 	{ 'C',	"Lock",	"Unknown event (type 28)",		"units"	},
170 	{ 'C',	"Lock",	"Unknown event (type 29)",		"units"	},
171 	{ 'C',	"Lock",	"Unknown event (type 30)",		"units"	},
172 	{ 'C',	"Lock",	"Unknown event (type 31)",		"units"	},
173 	{ 'H',	"Lock",	"Adaptive mutex hold",			"nsec",
174 	    "lockstat:::adaptive-release", NULL,
175 	    "lockstat:::adaptive-acquire" },
176 	{ 'H',	"Lock",	"Spin lock hold",			"nsec",
177 	    "lockstat:::spin-release", NULL,
178 	    "lockstat:::spin-acquire" },
179 	{ 'H',	"Lock",	"R/W writer hold",			"nsec",
180 	    "lockstat:::rw-release", "arg1 == 0",
181 	    "lockstat:::rw-acquire" },
182 	{ 'H',	"Lock",	"R/W reader hold",			"nsec",
183 	    "lockstat:::rw-release", "arg1 != 0",
184 	    "lockstat:::rw-acquire" },
185 	{ 'H',	"Lock",	"Unknown event (type 36)",		"units"	},
186 	{ 'H',	"Lock",	"Unknown event (type 37)",		"units"	},
187 	{ 'H',	"Lock",	"Unknown event (type 38)",		"units"	},
188 	{ 'H',	"Lock",	"Unknown event (type 39)",		"units"	},
189 	{ 'H',	"Lock",	"Unknown event (type 40)",		"units"	},
190 	{ 'H',	"Lock",	"Unknown event (type 41)",		"units"	},
191 	{ 'H',	"Lock",	"Unknown event (type 42)",		"units"	},
192 	{ 'H',	"Lock",	"Unknown event (type 43)",		"units"	},
193 	{ 'H',	"Lock",	"Unknown event (type 44)",		"units"	},
194 	{ 'H',	"Lock",	"Unknown event (type 45)",		"units"	},
195 	{ 'H',	"Lock",	"Unknown event (type 46)",		"units"	},
196 	{ 'H',	"Lock",	"Unknown event (type 47)",		"units"	},
197 	{ 'H',	"Lock",	"Unknown event (type 48)",		"units"	},
198 	{ 'H',	"Lock",	"Unknown event (type 49)",		"units"	},
199 	{ 'H',	"Lock",	"Unknown event (type 50)",		"units"	},
200 	{ 'H',	"Lock",	"Unknown event (type 51)",		"units"	},
201 	{ 'H',	"Lock",	"Unknown event (type 52)",		"units"	},
202 	{ 'H',	"Lock",	"Unknown event (type 53)",		"units"	},
203 	{ 'H',	"Lock",	"Unknown event (type 54)",		"units"	},
204 	{ 'H',	"Lock",	"Unknown event (type 55)",		"units"	},
205 	{ 'I',	"CPU+PIL", "Profiling interrupt",		"nsec",
206 	    "profile:::profile-97", NULL },
207 	{ 'I',	"Lock",	"Unknown event (type 57)",		"units"	},
208 	{ 'I',	"Lock",	"Unknown event (type 58)",		"units"	},
209 	{ 'I',	"Lock",	"Unknown event (type 59)",		"units"	},
210 	{ 'E',	"Lock",	"Recursive lock entry detected",	"(N/A)",
211 	    "lockstat:::rw-release", NULL, "lockstat:::rw-acquire" },
212 	{ 'E',	"Lock",	"Lockstat enter failure",		"(N/A)"	},
213 	{ 'E',	"Lock",	"Lockstat exit failure",		"nsec"	},
214 	{ 'E',	"Lock",	"Lockstat record failure",		"(N/A)"	},
215 };
216 
/*
 * Report a fatal error and exit with status 2.
 *
 * Writes "lockstat: " followed by the printf-style message to stderr.  If
 * do_perror is non-zero, ": " and the strerror() text for the errno value
 * that was current on entry are appended.  errno is captured before any
 * output is produced so that the stdio calls cannot disturb it.
 */
static void
fail(int do_perror, const char *message, ...)
{
	const int err = errno;
	va_list ap;

	(void) fputs("lockstat: ", stderr);

	va_start(ap, message);
	(void) vfprintf(stderr, message, ap);
	va_end(ap);

	if (do_perror != 0)
		(void) fprintf(stderr, ": %s", strerror(err));

	(void) fputc('\n', stderr);
	exit(2);
}
232 
233 static void
234 dfail(const char *message, ...)
235 {
236 	va_list args;
237 
238 	va_start(args, message);
239 	(void) fprintf(stderr, "lockstat: ");
240 	(void) vfprintf(stderr, message, args);
241 	va_end(args);
242 	(void) fprintf(stderr, ": %s\n",
243 	    dtrace_errmsg(g_dtp, dtrace_errno(g_dtp)));
244 
245 	exit(2);
246 }
247 
248 static void
249 show_events(char event_type, char *desc)
250 {
251 	int i, first = -1, last;
252 
253 	for (i = 0; i < LS_MAX_EVENTS; i++) {
254 		ls_event_info_t *evp = &g_event_info[i];
255 		if (evp->ev_type != event_type ||
256 		    strncmp(evp->ev_desc, "Unknown event", 13) == 0)
257 			continue;
258 		if (first == -1)
259 			first = i;
260 		last = i;
261 	}
262 
263 	(void) fprintf(stderr,
264 	    "\n%s events (lockstat -%c or lockstat -e %d-%d):\n\n",
265 	    desc, event_type, first, last);
266 
267 	for (i = first; i <= last; i++)
268 		(void) fprintf(stderr,
269 		    "%4d = %s\n", i, g_event_info[i].ev_desc);
270 }
271 
272 static void
273 usage(void)
274 {
275 	(void) fprintf(stderr,
276 	    "Usage: lockstat [options] command [args]\n"
277 	    "\nEvent selection options:\n\n"
278 	    "  -C              watch contention events [on by default]\n"
279 	    "  -E              watch error events [off by default]\n"
280 	    "  -H              watch hold events [off by default]\n"
281 	    "  -I              watch interrupt events [off by default]\n"
282 	    "  -A              watch all lock events [equivalent to -CH]\n"
283 	    "  -e event_list   only watch the specified events (shown below);\n"
284 	    "                  <event_list> is a comma-separated list of\n"
285 	    "                  events or ranges of events, e.g. 1,4-7,35\n"
286 	    "  -i rate         interrupt rate for -I [default: %d Hz]\n"
287 	    "\nData gathering options:\n\n"
288 	    "  -b              basic statistics (lock, caller, event count)\n"
289 	    "  -t              timing for all events [default]\n"
290 	    "  -h              histograms for event times\n"
291 	    "  -s depth        stack traces <depth> deep\n"
292 	    "  -x opt[=val]    enable or modify DTrace options\n"
293 	    "\nData filtering options:\n\n"
294 	    "  -n nrecords     maximum number of data records [default: %d]\n"
295 	    "  -l lock[,size]  only watch <lock>, which can be specified as a\n"
296 	    "                  symbolic name or hex address; <size> defaults\n"
297 	    "                  to the ELF symbol size if available, 1 if not\n"
298 	    "  -f func[,size]  only watch events generated by <func>\n"
299 	    "  -d duration     only watch events longer than <duration>\n"
300 	    "  -T              trace (rather than sample) events\n"
301 	    "\nData reporting options:\n\n"
302 	    "  -c              coalesce lock data for arrays like pse_mutex[]\n"
303 	    "  -k              coalesce PCs within functions\n"
304 	    "  -g              show total events generated by function\n"
305 	    "  -w              wherever: don't distinguish events by caller\n"
306 	    "  -W              whichever: don't distinguish events by lock\n"
307 	    "  -R              display rates rather than counts\n"
308 	    "  -p              parsable output format (awk(1)-friendly)\n"
309 	    "  -P              sort lock data by (count * avg_time) product\n"
310 	    "  -D n            only display top <n> events of each type\n"
311 	    "  -o filename     send output to <filename>\n",
312 	    DEFAULT_HZ, DEFAULT_NRECS);
313 
314 	show_events('C', "Contention");
315 	show_events('H', "Hold-time");
316 	show_events('I', "Interrupt");
317 	show_events('E', "Error");
318 	(void) fprintf(stderr, "\n");
319 
320 	exit(1);
321 }
322 
323 static int
324 lockcmp(lsrec_t *a, lsrec_t *b)
325 {
326 	int i;
327 
328 	if (a->ls_event < b->ls_event)
329 		return (-1);
330 	if (a->ls_event > b->ls_event)
331 		return (1);
332 
333 	for (i = g_stkdepth - 1; i >= 0; i--) {
334 		if (a->ls_stack[i] < b->ls_stack[i])
335 			return (-1);
336 		if (a->ls_stack[i] > b->ls_stack[i])
337 			return (1);
338 	}
339 
340 	if (a->ls_caller < b->ls_caller)
341 		return (-1);
342 	if (a->ls_caller > b->ls_caller)
343 		return (1);
344 
345 	if (a->ls_lock < b->ls_lock)
346 		return (-1);
347 	if (a->ls_lock > b->ls_lock)
348 		return (1);
349 
350 	return (0);
351 }
352 
353 static int
354 countcmp(lsrec_t *a, lsrec_t *b)
355 {
356 	if (a->ls_event < b->ls_event)
357 		return (-1);
358 	if (a->ls_event > b->ls_event)
359 		return (1);
360 
361 	return (b->ls_count - a->ls_count);
362 }
363 
364 static int
365 timecmp(lsrec_t *a, lsrec_t *b)
366 {
367 	if (a->ls_event < b->ls_event)
368 		return (-1);
369 	if (a->ls_event > b->ls_event)
370 		return (1);
371 
372 	if (a->ls_time < b->ls_time)
373 		return (1);
374 	if (a->ls_time > b->ls_time)
375 		return (-1);
376 
377 	return (0);
378 }
379 
380 static int
381 lockcmp_anywhere(lsrec_t *a, lsrec_t *b)
382 {
383 	if (a->ls_event < b->ls_event)
384 		return (-1);
385 	if (a->ls_event > b->ls_event)
386 		return (1);
387 
388 	if (a->ls_lock < b->ls_lock)
389 		return (-1);
390 	if (a->ls_lock > b->ls_lock)
391 		return (1);
392 
393 	return (0);
394 }
395 
396 static int
397 lock_and_count_cmp_anywhere(lsrec_t *a, lsrec_t *b)
398 {
399 	if (a->ls_event < b->ls_event)
400 		return (-1);
401 	if (a->ls_event > b->ls_event)
402 		return (1);
403 
404 	if (a->ls_lock < b->ls_lock)
405 		return (-1);
406 	if (a->ls_lock > b->ls_lock)
407 		return (1);
408 
409 	return (b->ls_count - a->ls_count);
410 }
411 
412 static int
413 sitecmp_anylock(lsrec_t *a, lsrec_t *b)
414 {
415 	int i;
416 
417 	if (a->ls_event < b->ls_event)
418 		return (-1);
419 	if (a->ls_event > b->ls_event)
420 		return (1);
421 
422 	for (i = g_stkdepth - 1; i >= 0; i--) {
423 		if (a->ls_stack[i] < b->ls_stack[i])
424 			return (-1);
425 		if (a->ls_stack[i] > b->ls_stack[i])
426 			return (1);
427 	}
428 
429 	if (a->ls_caller < b->ls_caller)
430 		return (-1);
431 	if (a->ls_caller > b->ls_caller)
432 		return (1);
433 
434 	return (0);
435 }
436 
437 static int
438 site_and_count_cmp_anylock(lsrec_t *a, lsrec_t *b)
439 {
440 	int i;
441 
442 	if (a->ls_event < b->ls_event)
443 		return (-1);
444 	if (a->ls_event > b->ls_event)
445 		return (1);
446 
447 	for (i = g_stkdepth - 1; i >= 0; i--) {
448 		if (a->ls_stack[i] < b->ls_stack[i])
449 			return (-1);
450 		if (a->ls_stack[i] > b->ls_stack[i])
451 			return (1);
452 	}
453 
454 	if (a->ls_caller < b->ls_caller)
455 		return (-1);
456 	if (a->ls_caller > b->ls_caller)
457 		return (1);
458 
459 	return (b->ls_count - a->ls_count);
460 }
461 
462 static void
463 mergesort(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **a, lsrec_t **b, int n)
464 {
465 	int m = n / 2;
466 	int i, j;
467 
468 	if (m > 1)
469 		mergesort(cmp, a, b, m);
470 	if (n - m > 1)
471 		mergesort(cmp, a + m, b + m, n - m);
472 	for (i = m; i > 0; i--)
473 		b[i - 1] = a[i - 1];
474 	for (j = m - 1; j < n - 1; j++)
475 		b[n + m - j - 2] = a[j + 1];
476 	while (i < j)
477 		*a++ = cmp(b[i], b[j]) < 0 ? b[i++] : b[j--];
478 	*a = b[i];
479 }
480 
481 static void
482 coalesce(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **lock, int n)
483 {
484 	int i, j;
485 	lsrec_t *target, *current;
486 
487 	target = lock[0];
488 
489 	for (i = 1; i < n; i++) {
490 		current = lock[i];
491 		if (cmp(current, target) != 0) {
492 			target = current;
493 			continue;
494 		}
495 		current->ls_event = LS_MAX_EVENTS;
496 		target->ls_count += current->ls_count;
497 		target->ls_refcnt += current->ls_refcnt;
498 		if (g_recsize < LS_TIME)
499 			continue;
500 		target->ls_time += current->ls_time;
501 		if (g_recsize < LS_HIST)
502 			continue;
503 		for (j = 0; j < 64; j++)
504 			target->ls_hist[j] += current->ls_hist[j];
505 	}
506 }
507 
/*
 * If *addrp lies within a known symbol, replace it with the address at
 * which that symbol starts.  Addresses that do not resolve, or whose offset
 * is not smaller than the symbol's size, are left unchanged.
 */
static void
coalesce_symbol(uintptr_t *addrp)
{
	uintptr_t off;
	size_t len;

	if (addr_to_sym(*addrp, &off, &len) == NULL)
		return;

	if (off < len)
		*addrp -= off;
}
517 
/*
 * AND a new term onto the D predicate string *pred.
 *
 * pred   address of the malloc()ed predicate string; *pred may be NULL, in
 *        which case an empty predicate is created first.  On return *pred
 *        points at a newly allocated string and the old one has been freed.
 * what   the term to add; if NULL, the call does nothing.
 * cmp    optional comparison operator.  When non-NULL the term added is
 *        "<what> <cmp> 0x<value>"; when NULL the term is just <what>.
 * value  right-hand side of the comparison; formatted with %p.
 *
 * If the predicate was non-empty, the result has the form
 * "(<old>) && (<term>)"; otherwise it is just the term.  Allocation failure
 * is fatal.
 */
static void
predicate_add(char **pred, char *what, char *cmp, uintptr_t value)
{
	char *new;
	size_t newlen;

	if (what == NULL)
		return;

	if (*pred == NULL) {
		if ((*pred = malloc(1)) == NULL)
			fail(1, "failed to allocate predicate");
		(*pred)[0] = '\0';
	}

	/*
	 * Old predicate, new term, operator, the "( && )" decoration, and
	 * 32 bytes for the remaining punctuation, the formatted pointer and
	 * the terminating NUL.
	 */
	newlen = strlen(*pred) + strlen(what) +
	    (cmp != NULL ? strlen(cmp) : 0) + 32 + strlen("( && )");

	if ((new = malloc(newlen)) == NULL)
		fail(1, "failed to allocate predicate");

	if ((*pred)[0] != '\0') {
		if (cmp != NULL) {
			(void) snprintf(new, newlen, "(%s) && (%s %s 0x%p)",
			    *pred, what, cmp, (void *)value);
		} else {
			(void) snprintf(new, newlen, "(%s) && (%s)",
			    *pred, what);
		}
	} else {
		if (cmp != NULL) {
			(void) snprintf(new, newlen, "%s %s 0x%p",
			    what, cmp, (void *)value);
		} else {
			(void) snprintf(new, newlen, "%s", what);
		}
	}

	free(*pred);
	*pred = new;
}
555 
/*
 * Release the predicate string *pred and reset *pred to NULL.  A predicate
 * that is already NULL is fine.
 */
static void
predicate_destroy(char **pred)
{
	char *old = *pred;

	*pred = NULL;
	free(old);
}
562 
/*
 * OR an address-range test onto the D filter string *filt.
 *
 * filt  address of the malloc()ed filter string; *filt may be NULL, in
 *       which case an empty filter is created first.  On return *filt
 *       points at a newly allocated string and the old one has been freed.
 * what  the D expression to be range-checked.
 * base  lowest address accepted.
 * size  length of the accepted range; addresses in [base, base + size)
 *       match.
 *
 * The clause appended is "(<what> >= 0x<base> && <what> < 0x<base+size>)",
 * preceded by " || " if the filter already had content.  The new string is
 * sized from its inputs, so arbitrarily long expressions are handled.
 * Allocation failure is fatal.
 */
static void
filter_add(char **filt, char *what, uintptr_t base, uintptr_t size)
{
	const char *sep;
	char *new;
	size_t newlen;

	if (*filt == NULL) {
		if ((*filt = malloc(1)) == NULL)
			fail(1, "failed to allocate filter");
		(*filt)[0] = '\0';
	}

	sep = (*filt)[0] != '\0' ? " || " : "";

	/*
	 * Old filter, separator, both copies of the expression, and 64
	 * bytes for the fixed punctuation, two formatted pointers and the
	 * terminating NUL.
	 */
	newlen = strlen(*filt) + strlen(sep) + 2 * strlen(what) + 64;

	if ((new = malloc(newlen)) == NULL)
		fail(1, "failed to allocate filter");

	(void) snprintf(new, newlen, "%s%s(%s >= 0x%p && %s < 0x%p)",
	    *filt, sep, what, (void *)base, what, (void *)(base + size));

	free(*filt);
	*filt = new;
}
584 
/*
 * Release the filter string *filt and reset *filt to NULL.  A filter that
 * is already NULL is fine.
 */
static void
filter_destroy(char **filt)
{
	char *old = *filt;

	*filt = NULL;
	free(old);
}
591 
592 static void
593 dprog_add(const char *fmt, ...)
594 {
595 	va_list args;
596 	int size, offs;
597 	char c;
598 
599 	va_start(args, fmt);
600 	size = vsnprintf(&c, 1, fmt, args) + 1;
601 
602 	if (g_proglen == 0) {
603 		offs = 0;
604 	} else {
605 		offs = g_proglen - 1;
606 	}
607 
608 	g_proglen = offs + size;
609 
610 	if ((g_prog = realloc(g_prog, g_proglen)) == NULL)
611 		fail(1, "failed to reallocate program text");
612 
613 	(void) vsnprintf(&g_prog[offs], size, fmt, args);
614 }
615 
616 /*
617  * This function may read like an open sewer, but keep in mind that programs
618  * that generate other programs are rarely pretty.  If one has the unenviable
619  * task of maintaining or -- worse -- extending this code, use the -V option
620  * to examine the D program as generated by this function.
621  */
622 static void
623 dprog_addevent(int event)
624 {
625 	ls_event_info_t *info = &g_event_info[event];
626 	char *pred = NULL;
627 	char stack[20];
628 	const char *arg0, *caller;
629 	char *arg1 = "arg1";
630 	char buf[80];
631 	hrtime_t dur;
632 	int depth;
633 
634 	if (info->ev_name[0] == '\0')
635 		return;
636 
637 	if (info->ev_type == 'I') {
638 		/*
639 		 * For interrupt events, arg0 (normally the lock pointer) is
640 		 * the CPU address plus the current pil, and arg1 (normally
641 		 * the number of nanoseconds) is the number of nanoseconds
642 		 * late -- and it's stored in arg2.
643 		 */
644 		arg0 = "(uintptr_t)curthread->t_cpu + \n"
645 		    "\t    curthread->t_cpu->cpu_profile_pil";
646 		caller = "(uintptr_t)arg0";
647 		arg1 = "arg2";
648 	} else {
649 		arg0 = "(uintptr_t)arg0";
650 		caller = "caller";
651 	}
652 
653 	if (g_recsize > LS_HIST) {
654 		for (depth = 0; g_recsize > LS_STACK(depth); depth++)
655 			continue;
656 
657 		if (g_tracing) {
658 			(void) sprintf(stack, "\tstack(%d);\n", depth);
659 		} else {
660 			(void) sprintf(stack, ", stack(%d)", depth);
661 		}
662 	} else {
663 		(void) sprintf(stack, "");
664 	}
665 
666 	if (info->ev_acquire != NULL) {
667 		/*
668 		 * If this is a hold event, we need to generate an additional
669 		 * clause for the acquire; the clause for the release will be
670 		 * generated with the aggregating statement, below.
671 		 */
672 		dprog_add("%s\n", info->ev_acquire);
673 		predicate_add(&pred, info->ev_predicate, NULL, 0);
674 		predicate_add(&pred, g_predicate, NULL, 0);
675 		if (pred != NULL)
676 			dprog_add("/%s/\n", pred);
677 
678 		dprog_add("{\n");
679 		(void) sprintf(buf, "self->ev%d[(uintptr_t)arg0]", event);
680 
681 		if (info->ev_type == 'H') {
682 			dprog_add("\t%s = timestamp;\n", buf);
683 		} else {
684 			/*
685 			 * If this isn't a hold event, it's the recursive
686 			 * error event.  For this, we simply bump the
687 			 * thread-local, per-lock count.
688 			 */
689 			dprog_add("\t%s++;\n", buf);
690 		}
691 
692 		dprog_add("}\n\n");
693 		predicate_destroy(&pred);
694 		pred = NULL;
695 
696 		if (info->ev_type == 'E') {
697 			/*
698 			 * If this is the recursive lock error event, we need
699 			 * to generate an additional clause to decrement the
700 			 * thread-local, per-lock count.  This assures that we
701 			 * only execute the aggregating clause if we have
702 			 * recursive entry.
703 			 */
704 			dprog_add("%s\n", info->ev_name);
705 			dprog_add("/%s/\n{\n\t%s--;\n}\n\n", buf, buf);
706 		}
707 
708 		predicate_add(&pred, buf, NULL, 0);
709 
710 		if (info->ev_type == 'H') {
711 			(void) sprintf(buf, "timestamp -\n\t    "
712 			    "self->ev%d[(uintptr_t)arg0]", event);
713 		}
714 
715 		arg1 = buf;
716 	} else {
717 		predicate_add(&pred, info->ev_predicate, NULL, 0);
718 		if (info->ev_type != 'I')
719 			predicate_add(&pred, g_predicate, NULL, 0);
720 		else
721 			predicate_add(&pred, g_ipredicate, NULL, 0);
722 	}
723 
724 	if ((dur = g_min_duration[event]) != 0)
725 		predicate_add(&pred, arg1, ">=", dur);
726 
727 	dprog_add("%s\n", info->ev_name);
728 
729 	if (pred != NULL)
730 		dprog_add("/%s/\n", pred);
731 	predicate_destroy(&pred);
732 
733 	dprog_add("{\n");
734 
735 	if (g_tracing) {
736 		dprog_add("\ttrace(%dULL);\n", event);
737 		dprog_add("\ttrace(%s);\n", arg0);
738 		dprog_add("\ttrace(%s);\n", caller);
739 		dprog_add(stack);
740 	} else {
741 		dprog_add("\t@avg[%dULL, %s, %s%s] = avg(%s);\n",
742 		    event, arg0, caller, stack, arg1);
743 
744 		if (g_recsize >= LS_HIST) {
745 			dprog_add("\t@hist[%dULL, %s, %s%s] = quantize"
746 			    "(%s);\n", event, arg0, caller, stack, arg1);
747 		}
748 	}
749 
750 	if (info->ev_acquire != NULL)
751 		dprog_add("\tself->ev%d[arg0] = 0;\n", event);
752 
753 	dprog_add("}\n\n");
754 }
755 
756 static void
757 dprog_compile()
758 {
759 	dtrace_prog_t *prog;
760 	dtrace_proginfo_t info;
761 
762 	if (g_Vflag) {
763 		(void) fprintf(stderr, "lockstat: vvvv D program vvvv\n");
764 		(void) fputs(g_prog, stderr);
765 		(void) fprintf(stderr, "lockstat: ^^^^ D program ^^^^\n");
766 	}
767 
768 	if ((prog = dtrace_program_strcompile(g_dtp, g_prog,
769 	    DTRACE_PROBESPEC_NAME, 0, 0, NULL)) == NULL)
770 		dfail("failed to compile program");
771 
772 	if (dtrace_program_exec(g_dtp, prog, &info) == -1)
773 		dfail("failed to enable probes");
774 
775 	if (dtrace_go(g_dtp) != 0)
776 		dfail("couldn't start tracing");
777 }
778 
/*
 * Handler installed for SIGUSR1 by status_init().  It is intentionally
 * empty: all that is wanted is for the signal to be delivered.
 */
static void
status_fire(void)
{
	/* nothing to do */
}
782 
783 static void
784 status_init(void)
785 {
786 	dtrace_optval_t val, status, agg;
787 	struct sigaction act;
788 	struct itimerspec ts;
789 	struct sigevent ev;
790 	timer_t tid;
791 
792 	if (dtrace_getopt(g_dtp, "statusrate", &status) == -1)
793 		dfail("failed to get 'statusrate'");
794 
795 	if (dtrace_getopt(g_dtp, "aggrate", &agg) == -1)
796 		dfail("failed to get 'statusrate'");
797 
798 	/*
799 	 * We would want to awaken at a rate that is the GCD of the statusrate
800 	 * and the aggrate -- but that seems a bit absurd.  Instead, we'll
801 	 * simply awaken at a rate that is the more frequent of the two, which
802 	 * assures that we're never later than the interval implied by the
803 	 * more frequent rate.
804 	 */
805 	val = status < agg ? status : agg;
806 
807 	(void) sigemptyset(&act.sa_mask);
808 	act.sa_flags = 0;
809 	act.sa_handler = status_fire;
810 	(void) sigaction(SIGUSR1, &act, NULL);
811 
812 	ev.sigev_notify = SIGEV_SIGNAL;
813 	ev.sigev_signo = SIGUSR1;
814 
815 	if (timer_create(CLOCK_REALTIME, &ev, &tid) == -1)
816 		dfail("cannot create CLOCK_REALTIME timer");
817 
818 	ts.it_value.tv_sec = val / NANOSEC;
819 	ts.it_value.tv_nsec = val % NANOSEC;
820 	ts.it_interval = ts.it_value;
821 
822 	if (timer_settime(tid, TIMER_RELTIME, &ts, NULL) == -1)
823 		dfail("cannot set time on CLOCK_REALTIME timer");
824 }
825 
826 static void
827 status_check(void)
828 {
829 	if (!g_tracing && dtrace_aggregate_snap(g_dtp) != 0)
830 		dfail("failed to snap aggregate");
831 
832 	if (dtrace_status(g_dtp) == -1)
833 		dfail("dtrace_status()");
834 }
835 
836 static void
837 lsrec_fill(lsrec_t *lsrec, const dtrace_recdesc_t *rec, int nrecs, caddr_t data)
838 {
839 	bzero(lsrec, g_recsize);
840 	lsrec->ls_count = 1;
841 
842 	if ((g_recsize > LS_HIST && nrecs < 4) || (nrecs < 3))
843 		fail(0, "truncated DTrace record");
844 
845 	if (rec->dtrd_size != sizeof (uint64_t))
846 		fail(0, "bad event size in first record");
847 
848 	/* LINTED - alignment */
849 	lsrec->ls_event = (uint32_t)*((uint64_t *)(data + rec->dtrd_offset));
850 	rec++;
851 
852 	if (rec->dtrd_size != sizeof (uintptr_t))
853 		fail(0, "bad lock address size in second record");
854 
855 	/* LINTED - alignment */
856 	lsrec->ls_lock = *((uintptr_t *)(data + rec->dtrd_offset));
857 	rec++;
858 
859 	if (rec->dtrd_size != sizeof (uintptr_t))
860 		fail(0, "bad caller size in third record");
861 
862 	/* LINTED - alignment */
863 	lsrec->ls_caller = *((uintptr_t *)(data + rec->dtrd_offset));
864 	rec++;
865 
866 	if (g_recsize > LS_HIST) {
867 		int frames, i;
868 		pc_t *stack;
869 
870 		frames = rec->dtrd_size / sizeof (pc_t);
871 		/* LINTED - alignment */
872 		stack = (pc_t *)(data + rec->dtrd_offset);
873 
874 		for (i = 1; i < frames; i++)
875 			lsrec->ls_stack[i - 1] = stack[i];
876 	}
877 }
878 
879 /*ARGSUSED*/
880 static int
881 count_aggregate(const dtrace_aggdata_t *agg, void *arg)
882 {
883 	*((size_t *)arg) += 1;
884 
885 	return (DTRACE_AGGWALK_NEXT);
886 }
887 
888 static int
889 process_aggregate(const dtrace_aggdata_t *agg, void *arg)
890 {
891 	const dtrace_aggdesc_t *aggdesc = agg->dtada_desc;
892 	caddr_t data = agg->dtada_data;
893 	lsdata_t *lsdata = arg;
894 	lsrec_t *lsrec = lsdata->lsd_next;
895 	const dtrace_recdesc_t *rec;
896 	uint64_t *avg, *quantized;
897 	int i, j;
898 
899 	assert(lsdata->lsd_count < g_nrecs);
900 
901 	rec = &aggdesc->dtagd_rec[0];
902 
903 	if (rec->dtrd_size != sizeof (uint64_t))
904 		fail(0, "bad variable size in zeroth record");
905 
906 	/* LINTED - alignment */
907 	if (*((uint64_t *)(data + rec->dtrd_offset))) {
908 		/*
909 		 * If the variable is non-zero, this is the histogram entry.
910 		 * We'll copy the quantized data into lc_hist, and jump over
911 		 * the rest.
912 		 */
913 		rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
914 
915 		if (rec->dtrd_size !=
916 		    DTRACE_QUANTIZE_NBUCKETS * sizeof (uint64_t))
917 			fail(0, "bad quantize size in aggregation record");
918 
919 		/* LINTED - alignment */
920 		quantized = (uint64_t *)(data + rec->dtrd_offset);
921 
922 		for (i = DTRACE_QUANTIZE_ZEROBUCKET, j = 0;
923 		    i < DTRACE_QUANTIZE_NBUCKETS; i++, j++)
924 			lsrec->ls_hist[j] = quantized[i];
925 
926 		goto out;
927 	}
928 
929 	lsrec_fill(lsrec, &aggdesc->dtagd_rec[1],
930 	    aggdesc->dtagd_nrecs - 1, data);
931 
932 	rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
933 
934 	if (rec->dtrd_size != 2 * sizeof (uint64_t))
935 		fail(0, "bad avg size in aggregation record");
936 
937 	/* LINTED - alignment */
938 	avg = (uint64_t *)(data + rec->dtrd_offset);
939 	lsrec->ls_count = (uint32_t)avg[0];
940 	lsrec->ls_time = (uintptr_t)avg[1];
941 
942 	if (g_recsize >= LS_HIST)
943 		return (DTRACE_AGGWALK_NEXT);
944 
945 out:
946 	lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize);
947 	lsdata->lsd_count++;
948 
949 	return (DTRACE_AGGWALK_NEXT);
950 }
951 
952 static int
953 process_trace(const dtrace_probedata_t *pdata, void *arg)
954 {
955 	lsdata_t *lsdata = arg;
956 	lsrec_t *lsrec = lsdata->lsd_next;
957 	dtrace_eprobedesc_t *edesc = pdata->dtpda_edesc;
958 	caddr_t data = pdata->dtpda_data;
959 
960 	if (lsdata->lsd_count >= g_nrecs)
961 		return (DTRACE_CONSUME_NEXT);
962 
963 	lsrec_fill(lsrec, edesc->dtepd_rec, edesc->dtepd_nrecs, data);
964 
965 	lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize);
966 	lsdata->lsd_count++;
967 
968 	return (DTRACE_CONSUME_NEXT);
969 }
970 
971 static int
972 process_data(FILE *out, char *data)
973 {
974 	lsdata_t lsdata;
975 
976 	/* LINTED - alignment */
977 	lsdata.lsd_next = (lsrec_t *)data;
978 	lsdata.lsd_count = 0;
979 
980 	if (g_tracing) {
981 		if (dtrace_consume(g_dtp, out,
982 		    process_trace, NULL, &lsdata) != 0)
983 			dfail("failed to consume buffer");
984 
985 		return (lsdata.lsd_count);
986 	}
987 
988 	if (dtrace_aggregate_walk_keyvarsorted(g_dtp,
989 	    process_aggregate, &lsdata) != 0)
990 		dfail("failed to walk aggregate");
991 
992 	return (lsdata.lsd_count);
993 }
994 
995 /*ARGSUSED*/
996 static int
997 drophandler(const dtrace_dropdata_t *data, void *arg)
998 {
999 	g_dropped++;
1000 	(void) fprintf(stderr, "lockstat: warning: %s", data->dtdda_msg);
1001 	return (DTRACE_HANDLE_OK);
1002 }
1003 
1004 int
1005 main(int argc, char **argv)
1006 {
1007 	char *data_buf;
1008 	lsrec_t *lsp, **current, **first, **sort_buf, **merge_buf;
1009 	FILE *out = stdout;
1010 	char c;
1011 	pid_t child;
1012 	int status;
1013 	int i, j;
1014 	hrtime_t duration;
1015 	char *addrp, *offp, *sizep, *evp, *lastp, *p;
1016 	uintptr_t addr;
1017 	size_t size, off;
1018 	int events_specified = 0;
1019 	int exec_errno = 0;
1020 	uint32_t event;
1021 	char *filt = NULL, *ifilt = NULL;
1022 	static uint64_t ev_count[LS_MAX_EVENTS + 1];
1023 	static uint64_t ev_time[LS_MAX_EVENTS + 1];
1024 	dtrace_optval_t aggsize;
1025 	char aggstr[10];
1026 	long ncpus;
1027 	int dynvar = 0;
1028 	int err;
1029 
1030 	if ((g_dtp = dtrace_open(DTRACE_VERSION, 0, &err)) == NULL) {
1031 		fail(0, "cannot open dtrace library: %s",
1032 		    dtrace_errmsg(NULL, err));
1033 	}
1034 
1035 	if (dtrace_handle_drop(g_dtp, &drophandler, NULL) == -1)
1036 		dfail("couldn't establish drop handler");
1037 
1038 	if (symtab_init() == -1)
1039 		fail(1, "can't load kernel symbols");
1040 
1041 	g_nrecs = DEFAULT_NRECS;
1042 
1043 	while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != EOF) {
1044 		switch (c) {
1045 		case 'b':
1046 			g_recsize = LS_BASIC;
1047 			break;
1048 
1049 		case 't':
1050 			g_recsize = LS_TIME;
1051 			break;
1052 
1053 		case 'h':
1054 			g_recsize = LS_HIST;
1055 			break;
1056 
1057 		case 's':
1058 			if (!isdigit(optarg[0]))
1059 				usage();
1060 			g_stkdepth = atoi(optarg);
1061 			if (g_stkdepth > LS_MAX_STACK_DEPTH)
1062 				fail(0, "max stack depth is %d",
1063 				    LS_MAX_STACK_DEPTH);
1064 			g_recsize = LS_STACK(g_stkdepth);
1065 			break;
1066 
1067 		case 'n':
1068 			if (!isdigit(optarg[0]))
1069 				usage();
1070 			g_nrecs = atoi(optarg);
1071 			break;
1072 
1073 		case 'd':
1074 			if (!isdigit(optarg[0]))
1075 				usage();
1076 			duration = atoll(optarg);
1077 
1078 			/*
1079 			 * XXX -- durations really should be per event
1080 			 * since the units are different, but it's hard
1081 			 * to express this nicely in the interface.
1082 			 * Not clear yet what the cleanest solution is.
1083 			 */
1084 			for (i = 0; i < LS_MAX_EVENTS; i++)
1085 				if (g_event_info[i].ev_type != 'E')
1086 					g_min_duration[i] = duration;
1087 
1088 			break;
1089 
1090 		case 'i':
1091 			if (!isdigit(optarg[0]))
1092 				usage();
1093 			i = atoi(optarg);
1094 			if (i <= 0)
1095 				usage();
1096 			if (i > MAX_HZ)
1097 				fail(0, "max interrupt rate is %d Hz", MAX_HZ);
1098 
1099 			for (j = 0; j < LS_MAX_EVENTS; j++)
1100 				if (strcmp(g_event_info[j].ev_desc,
1101 				    "Profiling interrupt") == 0)
1102 					break;
1103 
1104 			(void) sprintf(g_event_info[j].ev_name,
1105 			    "profile:::profile-%d", i);
1106 			break;
1107 
1108 		case 'l':
1109 		case 'f':
1110 			addrp = strtok(optarg, ",");
1111 			sizep = strtok(NULL, ",");
1112 			addrp = strtok(optarg, ",+");
1113 			offp = strtok(NULL, ",");
1114 
1115 			size = sizep ? strtoul(sizep, NULL, 0) : 1;
1116 			off = offp ? strtoul(offp, NULL, 0) : 0;
1117 
1118 			if (addrp[0] == '0') {
1119 				addr = strtoul(addrp, NULL, 16) + off;
1120 			} else {
1121 				addr = sym_to_addr(addrp) + off;
1122 				if (sizep == NULL)
1123 					size = sym_size(addrp) - off;
1124 				if (addr - off == 0)
1125 					fail(0, "symbol '%s' not found", addrp);
1126 				if (size == 0)
1127 					size = 1;
1128 			}
1129 
1130 
1131 			if (c == 'l') {
1132 				filter_add(&filt, "arg0", addr, size);
1133 			} else {
1134 				filter_add(&filt, "caller", addr, size);
1135 				filter_add(&ifilt, "arg0", addr, size);
1136 			}
1137 			break;
1138 
1139 		case 'e':
1140 			evp = strtok_r(optarg, ",", &lastp);
1141 			while (evp) {
1142 				int ev1, ev2;
1143 				char *evp2;
1144 
1145 				(void) strtok(evp, "-");
1146 				evp2 = strtok(NULL, "-");
1147 				ev1 = atoi(evp);
1148 				ev2 = evp2 ? atoi(evp2) : ev1;
1149 				if ((uint_t)ev1 >= LS_MAX_EVENTS ||
1150 				    (uint_t)ev2 >= LS_MAX_EVENTS || ev1 > ev2)
1151 					fail(0, "-e events out of range");
1152 				for (i = ev1; i <= ev2; i++)
1153 					g_enabled[i] = 1;
1154 				evp = strtok_r(NULL, ",", &lastp);
1155 			}
1156 			events_specified = 1;
1157 			break;
1158 
1159 		case 'c':
1160 			g_cflag = 1;
1161 			break;
1162 
1163 		case 'k':
1164 			g_kflag = 1;
1165 			break;
1166 
1167 		case 'w':
1168 			g_wflag = 1;
1169 			break;
1170 
1171 		case 'W':
1172 			g_Wflag = 1;
1173 			break;
1174 
1175 		case 'g':
1176 			g_gflag = 1;
1177 			break;
1178 
1179 		case 'C':
1180 		case 'E':
1181 		case 'H':
1182 		case 'I':
1183 			for (i = 0; i < LS_MAX_EVENTS; i++)
1184 				if (g_event_info[i].ev_type == c)
1185 					g_enabled[i] = 1;
1186 			events_specified = 1;
1187 			break;
1188 
1189 		case 'A':
1190 			for (i = 0; i < LS_MAX_EVENTS; i++)
1191 				if (strchr("CH", g_event_info[i].ev_type))
1192 					g_enabled[i] = 1;
1193 			events_specified = 1;
1194 			break;
1195 
1196 		case 'T':
1197 			g_tracing = 1;
1198 			break;
1199 
1200 		case 'D':
1201 			if (!isdigit(optarg[0]))
1202 				usage();
1203 			g_topn = atoi(optarg);
1204 			break;
1205 
1206 		case 'R':
1207 			g_rates = 1;
1208 			break;
1209 
1210 		case 'p':
1211 			g_pflag = 1;
1212 			break;
1213 
1214 		case 'P':
1215 			g_Pflag = 1;
1216 			break;
1217 
1218 		case 'o':
1219 			if ((out = fopen(optarg, "w")) == NULL)
1220 				fail(1, "error opening file");
1221 			break;
1222 
1223 		case 'V':
1224 			g_Vflag = 1;
1225 			break;
1226 
1227 		default:
1228 			if (strchr(LOCKSTAT_OPTSTR, c) == NULL)
1229 				usage();
1230 		}
1231 	}
1232 
1233 	if (filt != NULL) {
1234 		predicate_add(&g_predicate, filt, NULL, 0);
1235 		filter_destroy(&filt);
1236 	}
1237 
1238 	if (ifilt != NULL) {
1239 		predicate_add(&g_ipredicate, ifilt, NULL, 0);
1240 		filter_destroy(&ifilt);
1241 	}
1242 
1243 	if (g_recsize == 0) {
1244 		if (g_gflag) {
1245 			g_stkdepth = LS_MAX_STACK_DEPTH;
1246 			g_recsize = LS_STACK(g_stkdepth);
1247 		} else {
1248 			g_recsize = LS_TIME;
1249 		}
1250 	}
1251 
1252 	if (g_gflag && g_recsize <= LS_STACK(0))
1253 		fail(0, "'-g' requires at least '-s 1' data gathering");
1254 
1255 	/*
1256 	 * Make sure the alignment is reasonable
1257 	 */
1258 	g_recsize = -(-g_recsize & -sizeof (uint64_t));
1259 
1260 	for (i = 0; i < LS_MAX_EVENTS; i++) {
1261 		/*
1262 		 * If no events were specified, enable -C.
1263 		 */
1264 		if (!events_specified && g_event_info[i].ev_type == 'C')
1265 			g_enabled[i] = 1;
1266 	}
1267 
1268 	for (i = 0; i < LS_MAX_EVENTS; i++) {
1269 		if (!g_enabled[i])
1270 			continue;
1271 
1272 		if (g_event_info[i].ev_acquire != NULL) {
1273 			/*
1274 			 * If we've enabled a hold event, we must explicitly
1275 			 * allocate dynamic variable space.
1276 			 */
1277 			dynvar = 1;
1278 		}
1279 
1280 		dprog_addevent(i);
1281 	}
1282 
1283 	/*
1284 	 * Make sure there are remaining arguments to specify a child command
1285 	 * to execute.
1286 	 */
1287 	if (argc <= optind)
1288 		usage();
1289 
1290 	if ((ncpus = sysconf(_SC_NPROCESSORS_ONLN)) == -1)
1291 		dfail("couldn't determine number of online CPUs");
1292 
1293 	/*
1294 	 * By default, we set our data buffer size to be the number of records
1295 	 * multiplied by the size of the record, doubled to account for some
1296 	 * DTrace slop and divided by the number of CPUs.  We silently clamp
1297 	 * the aggregation size at both a minimum and a maximum to prevent
1298 	 * absurdly low or high values.
1299 	 */
1300 	if ((aggsize = (g_nrecs * g_recsize * 2) / ncpus) < MIN_AGGSIZE)
1301 		aggsize = MIN_AGGSIZE;
1302 
1303 	if (aggsize > MAX_AGGSIZE)
1304 		aggsize = MAX_AGGSIZE;
1305 
1306 	(void) sprintf(aggstr, "%lld", (long long)aggsize);
1307 
1308 	if (!g_tracing) {
1309 		if (dtrace_setopt(g_dtp, "bufsize", "4k") == -1)
1310 			dfail("failed to set 'bufsize'");
1311 
1312 		if (dtrace_setopt(g_dtp, "aggsize", aggstr) == -1)
1313 			dfail("failed to set 'aggsize'");
1314 
1315 		if (dynvar) {
1316 			/*
1317 			 * If we're using dynamic variables, we set our
1318 			 * dynamic variable size to be one megabyte per CPU,
1319 			 * with a hard-limit of 32 megabytes.  This may still
1320 			 * be too small in some cases, but it can be tuned
1321 			 * manually via -x if need be.
1322 			 */
1323 			(void) sprintf(aggstr, "%ldm", ncpus < 32 ? ncpus : 32);
1324 
1325 			if (dtrace_setopt(g_dtp, "dynvarsize", aggstr) == -1)
1326 				dfail("failed to set 'dynvarsize'");
1327 		}
1328 	} else {
1329 		if (dtrace_setopt(g_dtp, "bufsize", aggstr) == -1)
1330 			dfail("failed to set 'bufsize'");
1331 	}
1332 
1333 	if (dtrace_setopt(g_dtp, "statusrate", "10sec") == -1)
1334 		dfail("failed to set 'statusrate'");
1335 
1336 	optind = 1;
1337 	while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != EOF) {
1338 		switch (c) {
1339 		case 'x':
1340 			if ((p = strchr(optarg, '=')) != NULL)
1341 				*p++ = '\0';
1342 
1343 			if (dtrace_setopt(g_dtp, optarg, p) != 0)
1344 				dfail("failed to set -x %s", optarg);
1345 			break;
1346 		}
1347 	}
1348 
1349 	argc -= optind;
1350 	argv += optind;
1351 
1352 	dprog_compile();
1353 	status_init();
1354 
1355 	g_elapsed = -gethrtime();
1356 
1357 	/*
1358 	 * Spawn the specified command and wait for it to complete.
1359 	 */
1360 	child = fork();
1361 	if (child == -1)
1362 		fail(1, "cannot fork");
1363 	if (child == 0) {
1364 		(void) dtrace_close(g_dtp);
1365 		(void) execvp(argv[0], &argv[0]);
1366 		exec_errno = errno;
1367 		exit(127);
1368 	}
1369 
1370 	while (waitpid(child, &status, WEXITED) != child)
1371 		status_check();
1372 
1373 	g_elapsed += gethrtime();
1374 
1375 	if (WIFEXITED(status)) {
1376 		if (WEXITSTATUS(status) != 0) {
1377 			if (exec_errno != 0) {
1378 				errno = exec_errno;
1379 				fail(1, "could not execute %s", argv[0]);
1380 			}
1381 			(void) fprintf(stderr,
1382 			    "lockstat: warning: %s exited with code %d\n",
1383 				argv[0], WEXITSTATUS(status));
1384 		}
1385 	} else {
1386 		(void) fprintf(stderr,
1387 		    "lockstat: warning: %s died on signal %d\n",
1388 			argv[0], WTERMSIG(status));
1389 	}
1390 
1391 	if (dtrace_stop(g_dtp) == -1)
1392 		dfail("failed to stop dtrace");
1393 
1394 	/*
1395 	 * Before we read out the results, we need to allocate our buffer.
1396 	 * If we're tracing, then we'll just use the precalculated size.  If
1397 	 * we're not, then we'll take a snapshot of the aggregate, and walk
1398 	 * it to count the number of records.
1399 	 */
1400 	if (!g_tracing) {
1401 		if (dtrace_aggregate_snap(g_dtp) != 0)
1402 			dfail("failed to snap aggregate");
1403 
1404 		g_nrecs = 0;
1405 
1406 		if (dtrace_aggregate_walk(g_dtp,
1407 		    count_aggregate, &g_nrecs) != 0)
1408 			dfail("failed to walk aggregate");
1409 	}
1410 
1411 	if ((data_buf = memalign(sizeof (uint64_t),
1412 	    (g_nrecs + 1) * g_recsize)) == NULL)
1413 		fail(1, "Memory allocation failed");
1414 
1415 	/*
1416 	 * Read out the DTrace data.
1417 	 */
1418 	g_nrecs_used = process_data(out, data_buf);
1419 
1420 	if (g_nrecs_used > g_nrecs || g_dropped)
1421 		(void) fprintf(stderr, "lockstat: warning: "
1422 		    "ran out of data records (use -n for more)\n");
1423 
1424 	/* LINTED - alignment */
1425 	for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1426 	    /* LINTED - alignment */
1427 	    lsp = (lsrec_t *)((char *)lsp + g_recsize)) {
1428 		ev_count[lsp->ls_event] += lsp->ls_count;
1429 		ev_time[lsp->ls_event] += lsp->ls_time;
1430 	}
1431 
1432 	/*
1433 	 * If -g was specified, convert stacks into individual records.
1434 	 */
1435 	if (g_gflag) {
1436 		lsrec_t *newlsp, *oldlsp;
1437 
1438 		newlsp = memalign(sizeof (uint64_t),
1439 		    g_nrecs_used * LS_TIME * (g_stkdepth + 1));
1440 		if (newlsp == NULL)
1441 			fail(1, "Cannot allocate space for -g processing");
1442 		lsp = newlsp;
1443 		/* LINTED - alignment */
1444 		for (i = 0, oldlsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1445 		    /* LINTED - alignment */
1446 		    oldlsp = (lsrec_t *)((char *)oldlsp + g_recsize)) {
1447 			int fr;
1448 			int caller_in_stack = 0;
1449 
1450 			if (oldlsp->ls_count == 0)
1451 				continue;
1452 
1453 			for (fr = 0; fr < g_stkdepth; fr++) {
1454 				if (oldlsp->ls_stack[fr] == 0)
1455 					break;
1456 				if (oldlsp->ls_stack[fr] == oldlsp->ls_caller)
1457 					caller_in_stack = 1;
1458 				bcopy(oldlsp, lsp, LS_TIME);
1459 				lsp->ls_caller = oldlsp->ls_stack[fr];
1460 				/* LINTED - alignment */
1461 				lsp = (lsrec_t *)((char *)lsp + LS_TIME);
1462 			}
1463 			if (!caller_in_stack) {
1464 				bcopy(oldlsp, lsp, LS_TIME);
1465 				/* LINTED - alignment */
1466 				lsp = (lsrec_t *)((char *)lsp + LS_TIME);
1467 			}
1468 		}
1469 		g_nrecs = g_nrecs_used =
1470 		    ((uintptr_t)lsp - (uintptr_t)newlsp) / LS_TIME;
1471 		g_recsize = LS_TIME;
1472 		g_stkdepth = 0;
1473 		free(data_buf);
1474 		data_buf = (char *)newlsp;
1475 	}
1476 
1477 	if ((sort_buf = calloc(2 * (g_nrecs + 1),
1478 	    sizeof (void *))) == NULL)
1479 		fail(1, "Sort buffer allocation failed");
1480 	merge_buf = sort_buf + (g_nrecs + 1);
1481 
1482 	/*
1483 	 * Build the sort buffer, discarding zero-count records along the way.
1484 	 */
1485 	/* LINTED - alignment */
1486 	for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1487 	    /* LINTED - alignment */
1488 	    lsp = (lsrec_t *)((char *)lsp + g_recsize)) {
1489 		if (lsp->ls_count == 0)
1490 			lsp->ls_event = LS_MAX_EVENTS;
1491 		sort_buf[i] = lsp;
1492 	}
1493 
1494 	if (g_nrecs_used == 0)
1495 		exit(0);
1496 
1497 	/*
1498 	 * Add a sentinel after the last record
1499 	 */
1500 	sort_buf[i] = lsp;
1501 	lsp->ls_event = LS_MAX_EVENTS;
1502 
1503 	if (g_tracing) {
1504 		report_trace(out, sort_buf);
1505 		return (0);
1506 	}
1507 
1508 	/*
1509 	 * Application of -g may have resulted in multiple records
1510 	 * with the same signature; coalesce them.
1511 	 */
1512 	if (g_gflag) {
1513 		mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used);
1514 		coalesce(lockcmp, sort_buf, g_nrecs_used);
1515 	}
1516 
1517 	/*
1518 	 * Coalesce locks within the same symbol if -c option specified.
1519 	 * Coalesce PCs within the same function if -k option specified.
1520 	 */
1521 	if (g_cflag || g_kflag) {
1522 		for (i = 0; i < g_nrecs_used; i++) {
1523 			int fr;
1524 			lsp = sort_buf[i];
1525 			if (g_cflag)
1526 				coalesce_symbol(&lsp->ls_lock);
1527 			if (g_kflag) {
1528 				for (fr = 0; fr < g_stkdepth; fr++)
1529 					coalesce_symbol(&lsp->ls_stack[fr]);
1530 				coalesce_symbol(&lsp->ls_caller);
1531 			}
1532 		}
1533 		mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used);
1534 		coalesce(lockcmp, sort_buf, g_nrecs_used);
1535 	}
1536 
1537 	/*
1538 	 * Coalesce callers if -w option specified
1539 	 */
1540 	if (g_wflag) {
1541 		mergesort(lock_and_count_cmp_anywhere,
1542 		    sort_buf, merge_buf, g_nrecs_used);
1543 		coalesce(lockcmp_anywhere, sort_buf, g_nrecs_used);
1544 	}
1545 
1546 	/*
1547 	 * Coalesce locks if -W option specified
1548 	 */
1549 	if (g_Wflag) {
1550 		mergesort(site_and_count_cmp_anylock,
1551 		    sort_buf, merge_buf, g_nrecs_used);
1552 		coalesce(sitecmp_anylock, sort_buf, g_nrecs_used);
1553 	}
1554 
1555 	/*
1556 	 * Sort data by contention count (ls_count) or total time (ls_time),
1557 	 * depending on g_Pflag.  Override g_Pflag if time wasn't measured.
1558 	 */
1559 	if (g_recsize < LS_TIME)
1560 		g_Pflag = 0;
1561 
1562 	if (g_Pflag)
1563 		mergesort(timecmp, sort_buf, merge_buf, g_nrecs_used);
1564 	else
1565 		mergesort(countcmp, sort_buf, merge_buf, g_nrecs_used);
1566 
1567 	/*
1568 	 * Display data by event type
1569 	 */
1570 	first = &sort_buf[0];
1571 	while ((event = (*first)->ls_event) < LS_MAX_EVENTS) {
1572 		current = first;
1573 		while ((lsp = *current)->ls_event == event)
1574 			current++;
1575 		report_stats(out, first, current - first, ev_count[event],
1576 		    ev_time[event]);
1577 		first = current;
1578 	}
1579 
1580 	return (0);
1581 }
1582 
/*
 * Render addr into buf in symbolic form and return buf.
 *
 * The address is resolved with addr_to_sym(), which yields a symbol name,
 * the offset of addr within that symbol and the symbol's size.  The output
 * takes one of these shapes:
 *
 *	name[size]	offset is zero and show_size is set
 *	name		offset is zero
 *	name+N		decimal offset below 16 on a name starting "cpu["
 *	name+0xN	offset within the symbol, or a small (< 256) offset
 *			that is not simply the address itself
 *	0xADDR		everything else
 *
 * buf must be large enough for the formatted result; no bound is checked.
 */
static char *
format_symbol(char *buf, uintptr_t addr, int show_size)
{
	uintptr_t offset;
	size_t length;
	char *name;

	name = addr_to_sym(addr, &offset, &length);

	/* Exact hit on the start of a symbol. */
	if (offset == 0) {
		if (show_size)
			(void) sprintf(buf, "%s[%ld]", name, (long)length);
		else
			(void) sprintf(buf, "%s", name);
		return (buf);
	}

	/* CPU+PIL */
	if (offset < 16 && bcmp(name, "cpu[", 4) == 0) {
		(void) sprintf(buf, "%s+%ld", name, (long)offset);
		return (buf);
	}

	if (offset <= length || (offset < 256 && addr != offset)) {
		(void) sprintf(buf, "%s+0x%llx", name,
		    (unsigned long long)offset);
		return (buf);
	}

	/* No usable symbol: fall back to the raw address. */
	(void) sprintf(buf, "0x%llx", (unsigned long long)addr);
	return (buf);
}
1606 static void
1607 report_stats(FILE *out, lsrec_t **sort_buf, size_t nrecs, uint64_t total_count,
1608 	uint64_t total_time)
1609 {
1610 	uint32_t event = sort_buf[0]->ls_event;
1611 	lsrec_t *lsp;
1612 	double ptotal = 0.0;
1613 	double percent;
1614 	int i, j, fr;
1615 	int displayed;
1616 	int first_bin, last_bin, max_bin_count, total_bin_count;
1617 	int rectype;
1618 	char buf[256];
1619 	char lhdr[80], chdr[80];
1620 
1621 	rectype = g_recsize;
1622 
1623 	if (g_topn == 0) {
1624 		(void) fprintf(out, "%20llu %s\n",
1625 		    g_rates == 0 ? total_count :
1626 		    ((unsigned long long)total_count * NANOSEC) / g_elapsed,
1627 		    g_event_info[event].ev_desc);
1628 		return;
1629 	}
1630 
1631 	(void) sprintf(lhdr, "%s%s",
1632 	    g_Wflag ? "Hottest " : "", g_event_info[event].ev_lhdr);
1633 	(void) sprintf(chdr, "%s%s",
1634 	    g_wflag ? "Hottest " : "", "Caller");
1635 
1636 	if (!g_pflag)
1637 		(void) fprintf(out,
1638 		    "\n%s: %.0f events in %.3f seconds (%.0f events/sec)\n\n",
1639 		    g_event_info[event].ev_desc, (double)total_count,
1640 		    (double)g_elapsed / NANOSEC,
1641 		    (double)total_count * NANOSEC / g_elapsed);
1642 
1643 	if (!g_pflag && rectype < LS_HIST) {
1644 		(void) sprintf(buf, "%s", g_event_info[event].ev_units);
1645 		(void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n",
1646 		    g_rates ? "ops/s" : "Count",
1647 		    g_gflag ? "genr" : "indv",
1648 		    "cuml", "rcnt", rectype >= LS_TIME ? buf : "", lhdr, chdr);
1649 		(void) fprintf(out, "---------------------------------"
1650 		    "----------------------------------------------\n");
1651 	}
1652 
1653 	displayed = 0;
1654 	for (i = 0; i < nrecs; i++) {
1655 		lsp = sort_buf[i];
1656 
1657 		if (displayed++ >= g_topn)
1658 			break;
1659 
1660 		if (g_pflag) {
1661 			int j;
1662 
1663 			(void) fprintf(out, "%u %u",
1664 			    lsp->ls_event, lsp->ls_count);
1665 			(void) fprintf(out, " %s",
1666 			    format_symbol(buf, lsp->ls_lock, g_cflag));
1667 			(void) fprintf(out, " %s",
1668 			    format_symbol(buf, lsp->ls_caller, 0));
1669 			(void) fprintf(out, " %f",
1670 			    (double)lsp->ls_refcnt / lsp->ls_count);
1671 			if (rectype >= LS_TIME)
1672 				(void) fprintf(out, " %llu",
1673 				    (unsigned long long)lsp->ls_time);
1674 			if (rectype >= LS_HIST) {
1675 				for (j = 0; j < 64; j++)
1676 					(void) fprintf(out, " %u",
1677 					    lsp->ls_hist[j]);
1678 			}
1679 			for (j = 0; j < LS_MAX_STACK_DEPTH; j++) {
1680 				if (rectype <= LS_STACK(j) ||
1681 				    lsp->ls_stack[j] == 0)
1682 					break;
1683 				(void) fprintf(out, " %s",
1684 				    format_symbol(buf, lsp->ls_stack[j], 0));
1685 			}
1686 			(void) fprintf(out, "\n");
1687 			continue;
1688 		}
1689 
1690 		if (rectype >= LS_HIST) {
1691 			(void) fprintf(out, "---------------------------------"
1692 			    "----------------------------------------------\n");
1693 			(void) sprintf(buf, "%s",
1694 			    g_event_info[event].ev_units);
1695 			(void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n",
1696 			    g_rates ? "ops/s" : "Count",
1697 			    g_gflag ? "genr" : "indv",
1698 			    "cuml", "rcnt", buf, lhdr, chdr);
1699 		}
1700 
1701 		if (g_Pflag && total_time != 0)
1702 			percent = (lsp->ls_time * 100.00) / total_time;
1703 		else
1704 			percent = (lsp->ls_count * 100.00) / total_count;
1705 
1706 		ptotal += percent;
1707 
1708 		if (rectype >= LS_TIME)
1709 			(void) sprintf(buf, "%llu",
1710 			    (unsigned long long)(lsp->ls_time / lsp->ls_count));
1711 		else
1712 			buf[0] = '\0';
1713 
1714 		(void) fprintf(out, "%5llu ",
1715 		    g_rates == 0 ? lsp->ls_count :
1716 		    ((uint64_t)lsp->ls_count * NANOSEC) / g_elapsed);
1717 
1718 		(void) fprintf(out, "%3.0f%% ", percent);
1719 
1720 		if (g_gflag)
1721 			(void) fprintf(out, "---- ");
1722 		else
1723 			(void) fprintf(out, "%3.0f%% ", ptotal);
1724 
1725 		(void) fprintf(out, "%4.2f %8s ",
1726 		    (double)lsp->ls_refcnt / lsp->ls_count, buf);
1727 
1728 		(void) fprintf(out, "%-22s ",
1729 		    format_symbol(buf, lsp->ls_lock, g_cflag));
1730 
1731 		(void) fprintf(out, "%-24s\n",
1732 		    format_symbol(buf, lsp->ls_caller, 0));
1733 
1734 		if (rectype < LS_HIST)
1735 			continue;
1736 
1737 		(void) fprintf(out, "\n");
1738 		(void) fprintf(out, "%10s %31s %-9s %-24s\n",
1739 			g_event_info[event].ev_units,
1740 			"------ Time Distribution ------",
1741 			g_rates ? "ops/s" : "count",
1742 			rectype > LS_STACK(0) ? "Stack" : "");
1743 
1744 		first_bin = 0;
1745 		while (lsp->ls_hist[first_bin] == 0)
1746 			first_bin++;
1747 
1748 		last_bin = 63;
1749 		while (lsp->ls_hist[last_bin] == 0)
1750 			last_bin--;
1751 
1752 		max_bin_count = 0;
1753 		total_bin_count = 0;
1754 		for (j = first_bin; j <= last_bin; j++) {
1755 			total_bin_count += lsp->ls_hist[j];
1756 			if (lsp->ls_hist[j] > max_bin_count)
1757 				max_bin_count = lsp->ls_hist[j];
1758 		}
1759 
1760 		/*
1761 		 * If we went a few frames below the caller, ignore them
1762 		 */
1763 		for (fr = 3; fr > 0; fr--)
1764 			if (lsp->ls_stack[fr] == lsp->ls_caller)
1765 				break;
1766 
1767 		for (j = first_bin; j <= last_bin; j++) {
1768 			uint_t depth = (lsp->ls_hist[j] * 30) / total_bin_count;
1769 			(void) fprintf(out, "%10llu |%s%s %-9u ",
1770 			    1ULL << j,
1771 			    "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + 30 - depth,
1772 			    "                              " + depth,
1773 			    g_rates == 0 ? lsp->ls_hist[j] :
1774 			    (uint_t)(((uint64_t)lsp->ls_hist[j] * NANOSEC) /
1775 			    g_elapsed));
1776 			if (rectype <= LS_STACK(fr) || lsp->ls_stack[fr] == 0) {
1777 				(void) fprintf(out, "\n");
1778 				continue;
1779 			}
1780 			(void) fprintf(out, "%-24s\n",
1781 			    format_symbol(buf, lsp->ls_stack[fr], 0));
1782 			fr++;
1783 		}
1784 		while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) {
1785 			(void) fprintf(out, "%15s %-36s %-24s\n", "", "",
1786 			    format_symbol(buf, lsp->ls_stack[fr], 0));
1787 			fr++;
1788 		}
1789 	}
1790 
1791 	if (!g_pflag)
1792 		(void) fprintf(out, "---------------------------------"
1793 		    "----------------------------------------------\n");
1794 
1795 	(void) fflush(out);
1796 }
1797 
1798 static void
1799 report_trace(FILE *out, lsrec_t **sort_buf)
1800 {
1801 	lsrec_t *lsp;
1802 	int i, fr;
1803 	int rectype;
1804 	char buf[256], buf2[256];
1805 
1806 	rectype = g_recsize;
1807 
1808 	if (!g_pflag) {
1809 		(void) fprintf(out, "%5s  %7s  %11s  %-24s  %-24s\n",
1810 		    "Event", "Time", "Owner", "Lock", "Caller");
1811 		(void) fprintf(out, "---------------------------------"
1812 		    "----------------------------------------------\n");
1813 	}
1814 
1815 	for (i = 0; i < g_nrecs_used; i++) {
1816 
1817 		lsp = sort_buf[i];
1818 
1819 		if (lsp->ls_event >= LS_MAX_EVENTS || lsp->ls_count == 0)
1820 			continue;
1821 
1822 		(void) fprintf(out, "%2d  %10llu  %11p  %-24s  %-24s\n",
1823 		    lsp->ls_event, (unsigned long long)lsp->ls_time,
1824 		    (void *)lsp->ls_next,
1825 		    format_symbol(buf, lsp->ls_lock, 0),
1826 		    format_symbol(buf2, lsp->ls_caller, 0));
1827 
1828 		if (rectype <= LS_STACK(0))
1829 			continue;
1830 
1831 		/*
1832 		 * If we went a few frames below the caller, ignore them
1833 		 */
1834 		for (fr = 3; fr > 0; fr--)
1835 			if (lsp->ls_stack[fr] == lsp->ls_caller)
1836 				break;
1837 
1838 		while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) {
1839 			(void) fprintf(out, "%53s  %-24s\n", "",
1840 			    format_symbol(buf, lsp->ls_stack[fr], 0));
1841 			fr++;
1842 		}
1843 		(void) fprintf(out, "\n");
1844 	}
1845 
1846 	(void) fflush(out);
1847 }
1848