xref: /titanic_41/usr/src/lib/libdtrace/common/dt_consume.c (revision c9a6ea2e938727c95af7108c5e00eee4c890c7ae)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <stdlib.h>
27 #include <strings.h>
28 #include <errno.h>
29 #include <unistd.h>
30 #include <limits.h>
31 #include <assert.h>
32 #include <ctype.h>
33 #include <alloca.h>
34 #include <dt_impl.h>
35 
36 #define	DT_MASK_LO 0x00000000FFFFFFFFULL
37 
/*
 * Absolute value of a long double.  This is defined locally so that
 * libdtrace does not pick up a dependency on libm merely for fabsl().
 */
static long double
dt_fabsl(long double x)
{
	return (x < 0 ? -x : x);
}
50 
/*
 * 128-bit arithmetic helpers used to implement the stddev() aggregating
 * action.  A 128-bit quantity is held in a two-element uint64_t array in
 * which element 0 is the low-order word and element 1 the high-order word.
 */

/*
 * Return non-zero if the 128-bit value a is strictly greater than b.
 */
static int
dt_gt_128(uint64_t *a, uint64_t *b)
{
	/* The high words decide unless they are equal. */
	if (a[1] != b[1])
		return (a[1] > b[1]);

	return (a[0] > b[0]);
}
60 
/*
 * Return non-zero if the 128-bit value a (a[0] low word, a[1] high word)
 * is greater than or equal to b.
 */
static int
dt_ge_128(uint64_t *a, uint64_t *b)
{
	/* The high words decide unless they are equal. */
	if (a[1] != b[1])
		return (a[1] > b[1]);

	return (a[0] >= b[0]);
}
66 
/*
 * Return non-zero if the 128-bit value a (a[0] low word, a[1] high word)
 * is less than or equal to b.
 */
static int
dt_le_128(uint64_t *a, uint64_t *b)
{
	/* The high words decide unless they are equal. */
	if (a[1] != b[1])
		return (a[1] < b[1]);

	return (a[0] <= b[0]);
}
72 
/*
 * Shift the 128-bit value in a (a[0] is the low word, a[1] the high word)
 * by b bits, in place.  If b is positive, shift left; if b is negative,
 * shift right.  Bits shifted out are discarded and zeroes are shifted in,
 * so a shift of 128 bits or more in either direction leaves zero.
 */
static void
dt_shift_128(uint64_t *a, int b)
{
	unsigned int n;

	if (b == 0)
		return;

	/*
	 * Take the magnitude in unsigned arithmetic so that negating the
	 * most negative int cannot overflow.
	 */
	n = (b < 0) ? 0U - (unsigned int)b : (unsigned int)b;

	if (n >= 128) {
		a[0] = 0;
		a[1] = 0;
		return;
	}

	if (b < 0) {
		if (n >= 64) {
			a[0] = a[1] >> (n - 64);
			a[1] = 0;
		} else {
			/*
			 * 1 <= n <= 63 here, so both shift counts are in
			 * range.  The low n bits of the high word become the
			 * top n bits of the low word.
			 */
			a[0] = (a[0] >> n) | (a[1] << (64 - n));
			a[1] >>= n;
		}
	} else {
		if (n >= 64) {
			a[1] = a[0] << (n - 64);
			a[0] = 0;
		} else {
			/*
			 * The top n bits of the low word become the low n
			 * bits of the high word.
			 */
			a[1] = (a[1] << n) | (a[0] >> (64 - n));
			a[0] <<= n;
		}
	}
}
109 
/*
 * Return the zero-based position of the most significant set bit of the
 * 128-bit value a, i.e. one less than its width in bits.  Both 0 and 1
 * yield 0.  The caller's value is not modified.
 */
static int
dt_nbits_128(uint64_t *a)
{
	uint64_t nil[2] = { 0, 0 };
	uint64_t work[2];
	int count;

	work[0] = a[0];
	work[1] = a[1];

	/*
	 * Discard the lowest bit up front, then count how many additional
	 * single-bit right shifts are needed to reach zero.
	 */
	dt_shift_128(work, -1);
	for (count = 0; dt_gt_128(work, nil); count++)
		dt_shift_128(work, -1);

	return (count);
}
128 
/*
 * Compute minuend - subtrahend on 128-bit values (element 0 is the low
 * word) and store the result in difference.  The arithmetic wraps modulo
 * 2^128.  Both inputs are read in full before the output is written, so
 * difference may alias either operand.
 */
static void
dt_subtract_128(uint64_t *minuend, uint64_t *subtrahend, uint64_t *difference)
{
	uint64_t lo, hi;

	lo = minuend[0] - subtrahend[0];
	hi = minuend[1] - subtrahend[1];

	/* Borrow from the high word if the low-word subtraction wrapped. */
	if (minuend[0] < subtrahend[0])
		hi--;

	difference[0] = lo;
	difference[1] = hi;
}
141 
/*
 * Compute addend1 + addend2 on 128-bit values (element 0 is the low word)
 * and store the result in sum.  The arithmetic wraps modulo 2^128.  Both
 * inputs are read in full before the output is written, so sum may alias
 * either operand.
 */
static void
dt_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
{
	uint64_t lo, hi;

	lo = addend1[0] + addend2[0];
	hi = addend1[1] + addend2[1];

	/*
	 * An unsigned sum is smaller than an operand exactly when it has
	 * wrapped, in which case a carry goes into the high word.
	 */
	if (lo < addend1[0])
		hi++;

	sum[0] = lo;
	sum[1] = hi;
}
154 
155 /*
156  * The basic idea is to break the 2 64-bit values into 4 32-bit values,
157  * use native multiplication on those, and then re-combine into the
158  * resulting 128-bit value.
159  *
160  * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
161  *     hi1 * hi2 << 64 +
162  *     hi1 * lo2 << 32 +
163  *     hi2 * lo1 << 32 +
164  *     lo1 * lo2
165  */
166 static void
167 dt_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
168 {
169 	uint64_t hi1, hi2, lo1, lo2;
170 	uint64_t tmp[2];
171 
172 	hi1 = factor1 >> 32;
173 	hi2 = factor2 >> 32;
174 
175 	lo1 = factor1 & DT_MASK_LO;
176 	lo2 = factor2 & DT_MASK_LO;
177 
178 	product[0] = lo1 * lo2;
179 	product[1] = hi1 * hi2;
180 
181 	tmp[0] = hi1 * lo2;
182 	tmp[1] = 0;
183 	dt_shift_128(tmp, 32);
184 	dt_add_128(product, tmp, product);
185 
186 	tmp[0] = hi2 * lo1;
187 	tmp[1] = 0;
188 	dt_shift_128(tmp, 32);
189 	dt_add_128(product, tmp, product);
190 }
191 
/*
 * Divide the 128-bit dividend by a non-zero 64-bit divisor using long-hand
 * (shift and subtract) division, storing the 128-bit quotient in quotient.
 * The remainder is discarded.
 *
 * The trial subtrahend starts out as the divisor shifted left until its
 * top bit is bit 127, with a one-bit marker shifted by the same amount.
 * On each pass, if the subtrahend fits into what is left of the dividend
 * it is subtracted and the marker bit is set in the quotient; then both
 * move right by one bit.  The loop ends once what is left is smaller than
 * the divisor itself.
 */
static void
dt_divide_128(uint64_t *dividend, uint64_t divisor, uint64_t *quotient)
{
	uint64_t left[2], trial[2], limit[2];
	uint64_t marker[2] = { 1, 0 };
	uint64_t q[2] = { 0, 0 };
	uint64_t d;
	int width = 0;

	assert(divisor != 0);

	left[0] = dividend[0];
	left[1] = dividend[1];

	trial[0] = limit[0] = divisor;
	trial[1] = limit[1] = 0;

	/* Count the significant bits in the divisor. */
	for (d = divisor; d > 0; d >>= 1)
		width++;

	dt_shift_128(trial, 128 - width);
	dt_shift_128(marker, 128 - width);

	while (dt_ge_128(left, limit)) {
		if (dt_ge_128(left, trial)) {
			dt_subtract_128(left, trial, left);
			q[0] |= marker[0];
			q[1] |= marker[1];
		}

		dt_shift_128(trial, -1);
		dt_shift_128(marker, -1);
	}

	quotient[0] = q[0];
	quotient[1] = q[1];
}
243 
244 /*
245  * This is the long-hand method of calculating a square root.
246  * The algorithm is as follows:
247  *
248  * 1. Group the digits by 2 from the right.
249  * 2. Over the leftmost group, find the largest single-digit number
250  *    whose square is less than that group.
251  * 3. Subtract the result of the previous step (2 or 4, depending) and
252  *    bring down the next two-digit group.
253  * 4. For the result R we have so far, find the largest single-digit number
254  *    x such that 2 * R * 10 * x + x^2 is less than the result from step 3.
255  *    (Note that this is doubling R and performing a decimal left-shift by 1
256  *    and searching for the appropriate decimal to fill the one's place.)
257  *    The value x is the next digit in the square root.
258  * Repeat steps 3 and 4 until the desired precision is reached.  (We're
259  * dealing with integers, so the above is sufficient.)
260  *
261  * In decimal, the square root of 582,734 would be calculated as so:
262  *
263  *     __7__6__3
264  *    | 58 27 34
265  *     -49       (7^2 == 49 => 7 is the first digit in the square root)
266  *      --
267  *       9 27    (Subtract and bring down the next group.)
268  * 146   8 76    (2 * 7 * 10 * 6 + 6^2 == 876 => 6 is the next digit in
269  *      -----     the square root)
270  *         51 34 (Subtract and bring down the next group.)
271  * 1523    45 69 (2 * 76 * 10 * 3 + 3^2 == 4569 => 3 is the next digit in
272  *         -----  the square root)
273  *          5 65 (remainder)
274  *
275  * The above algorithm applies similarly in binary, but note that the
276  * only possible non-zero value for x in step 4 is 1, so step 4 becomes a
277  * simple decision: is 2 * R * 2 * 1 + 1^2 (aka R << 2 + 1) less than the
278  * preceding difference?
279  *
280  * In binary, the square root of 11011011 would be calculated as so:
281  *
282  *     __1__1__1__0
283  *    | 11 01 10 11
284  *      01          (0 << 2 + 1 == 1 < 11 => this bit is 1)
285  *      --
286  *      10 01 10 11
287  * 101   1 01       (1 << 2 + 1 == 101 < 1001 => next bit is 1)
288  *      -----
289  *       1 00 10 11
290  * 1101    11 01    (11 << 2 + 1 == 1101 < 10010 => next bit is 1)
291  *       -------
292  *          1 01 11
293  * 11101    1 11 01 (111 << 2 + 1 == 11101 > 10111 => last bit is 0)
294  *
295  */
static uint64_t
dt_sqrt_128(uint64_t *square)
{
	uint64_t root[2] = { 0, 0 };
	uint64_t rem[2] = { 0, 0 };
	uint64_t unit[2] = { 1, 0 };
	uint64_t pair[2];
	uint64_t trial[2];
	uint64_t npairs, shift;
	int i;

	/*
	 * Work through the operand two bits at a time, starting with the
	 * pair that holds its most significant set bit.
	 */
	npairs = dt_nbits_128(square) / 2;
	shift = npairs * 2;

	for (i = 0; i <= npairs; i++, shift -= 2) {
		/*
		 * Isolate the next pair of bits and append it to the
		 * running remainder.
		 */
		pair[0] = square[0];
		pair[1] = square[1];
		dt_shift_128(pair, -shift);
		pair[0] &= 0x3;
		pair[1] = 0;

		dt_shift_128(rem, 2);
		dt_add_128(rem, pair, rem);

		/*
		 * The candidate to subtract is (root << 2) + 1, computed
		 * from the root as it stood before this round.
		 */
		trial[0] = root[0];
		trial[1] = root[1];
		dt_shift_128(trial, 2);
		dt_add_128(trial, unit, trial);

		/*
		 * The root always gains a bit; that bit is 1 only when the
		 * candidate fits into the remainder.
		 */
		dt_shift_128(root, 1);

		if (dt_le_128(trial, rem)) {
			dt_subtract_128(rem, trial, rem);
			dt_add_128(root, unit, root);
		}
	}

	/* The root of a 128-bit value must fit in 64 bits. */
	assert(root[1] == 0);

	return (root[0]);
}
346 
/*
 * Compute the (integer) standard deviation for a stddev() aggregation.
 *
 * As read by this function, data[0] is the number of samples, data[1] is
 * the sum of the samples (interpreted as signed), and data[2..3] is the
 * 128-bit sum of the squares of the samples (low word first).  normal is
 * the normalization factor to divide each sample by; it must be non-zero.
 *
 * Returns 0 if there are no samples.
 */
uint64_t
dt_stddev(uint64_t *data, uint64_t normal)
{
	uint64_t avg_of_squares[2];
	uint64_t square_of_avg[2];
	int64_t norm_avg;
	uint64_t diff[2];

	/* With no samples there is nothing to average (or to divide by). */
	if (data[0] == 0)
		return (0);

	/*
	 * The standard approximation for standard deviation is
	 * sqrt(average(x**2) - average(x)**2), i.e. the square root
	 * of the average of the squares minus the square of the average.
	 *
	 * Normalizing scales every sample x by 1/normal, which scales x**2
	 * by 1/normal**2.  The sum of squares is therefore divided by normal
	 * twice so that it is on the same scale as the squared normalized
	 * average that is subtracted from it below.
	 */
	dt_divide_128(data + 2, normal, avg_of_squares);
	dt_divide_128(avg_of_squares, normal, avg_of_squares);
	dt_divide_128(avg_of_squares, data[0], avg_of_squares);

	norm_avg = (int64_t)data[1] / (int64_t)normal / (int64_t)data[0];

	/* Only the square of the average is needed, so the sign is moot. */
	if (norm_avg < 0)
		norm_avg = -norm_avg;

	dt_multiply_128((uint64_t)norm_avg, (uint64_t)norm_avg, square_of_avg);

	dt_subtract_128(avg_of_squares, square_of_avg, diff);

	return (dt_sqrt_128(diff));
}
374 
375 static int
376 dt_flowindent(dtrace_hdl_t *dtp, dtrace_probedata_t *data, dtrace_epid_t last,
377     dtrace_bufdesc_t *buf, size_t offs)
378 {
379 	dtrace_probedesc_t *pd = data->dtpda_pdesc, *npd;
380 	dtrace_eprobedesc_t *epd = data->dtpda_edesc, *nepd;
381 	char *p = pd->dtpd_provider, *n = pd->dtpd_name, *sub;
382 	dtrace_flowkind_t flow = DTRACEFLOW_NONE;
383 	const char *str = NULL;
384 	static const char *e_str[2] = { " -> ", " => " };
385 	static const char *r_str[2] = { " <- ", " <= " };
386 	static const char *ent = "entry", *ret = "return";
387 	static int entlen = 0, retlen = 0;
388 	dtrace_epid_t next, id = epd->dtepd_epid;
389 	int rval;
390 
391 	if (entlen == 0) {
392 		assert(retlen == 0);
393 		entlen = strlen(ent);
394 		retlen = strlen(ret);
395 	}
396 
397 	/*
398 	 * If the name of the probe is "entry" or ends with "-entry", we
399 	 * treat it as an entry; if it is "return" or ends with "-return",
400 	 * we treat it as a return.  (This allows application-provided probes
401 	 * like "method-entry" or "function-entry" to participate in flow
402 	 * indentation -- without accidentally misinterpreting popular probe
403 	 * names like "carpentry", "gentry" or "Coventry".)
404 	 */
405 	if ((sub = strstr(n, ent)) != NULL && sub[entlen] == '\0' &&
406 	    (sub == n || sub[-1] == '-')) {
407 		flow = DTRACEFLOW_ENTRY;
408 		str = e_str[strcmp(p, "syscall") == 0];
409 	} else if ((sub = strstr(n, ret)) != NULL && sub[retlen] == '\0' &&
410 	    (sub == n || sub[-1] == '-')) {
411 		flow = DTRACEFLOW_RETURN;
412 		str = r_str[strcmp(p, "syscall") == 0];
413 	}
414 
415 	/*
416 	 * If we're going to indent this, we need to check the ID of our last
417 	 * call.  If we're looking at the same probe ID but a different EPID,
418 	 * we _don't_ want to indent.  (Yes, there are some minor holes in
419 	 * this scheme -- it's a heuristic.)
420 	 */
421 	if (flow == DTRACEFLOW_ENTRY) {
422 		if ((last != DTRACE_EPIDNONE && id != last &&
423 		    pd->dtpd_id == dtp->dt_pdesc[last]->dtpd_id))
424 			flow = DTRACEFLOW_NONE;
425 	}
426 
427 	/*
428 	 * If we're going to unindent this, it's more difficult to see if
429 	 * we don't actually want to unindent it -- we need to look at the
430 	 * _next_ EPID.
431 	 */
432 	if (flow == DTRACEFLOW_RETURN) {
433 		offs += epd->dtepd_size;
434 
435 		do {
436 			if (offs >= buf->dtbd_size) {
437 				/*
438 				 * We're at the end -- maybe.  If the oldest
439 				 * record is non-zero, we need to wrap.
440 				 */
441 				if (buf->dtbd_oldest != 0) {
442 					offs = 0;
443 				} else {
444 					goto out;
445 				}
446 			}
447 
448 			next = *(uint32_t *)((uintptr_t)buf->dtbd_data + offs);
449 
450 			if (next == DTRACE_EPIDNONE)
451 				offs += sizeof (id);
452 		} while (next == DTRACE_EPIDNONE);
453 
454 		if ((rval = dt_epid_lookup(dtp, next, &nepd, &npd)) != 0)
455 			return (rval);
456 
457 		if (next != id && npd->dtpd_id == pd->dtpd_id)
458 			flow = DTRACEFLOW_NONE;
459 	}
460 
461 out:
462 	if (flow == DTRACEFLOW_ENTRY || flow == DTRACEFLOW_RETURN) {
463 		data->dtpda_prefix = str;
464 	} else {
465 		data->dtpda_prefix = "| ";
466 	}
467 
468 	if (flow == DTRACEFLOW_RETURN && data->dtpda_indent > 0)
469 		data->dtpda_indent -= 2;
470 
471 	data->dtpda_flow = flow;
472 
473 	return (0);
474 }
475 
/*
 * Stub that takes no declared parameters and always returns
 * DTRACE_CONSUME_THIS.
 * NOTE(review): presumably the default per-probe callback used when the
 * consumer supplies none, which would be why the parameter list is left
 * unprototyped -- confirm against the callers later in this file.
 */
static int
dt_nullprobe()
{
	return (DTRACE_CONSUME_THIS);
}
481 
/*
 * Stub that takes no declared parameters and always returns
 * DTRACE_CONSUME_NEXT.
 * NOTE(review): presumably the default per-record callback used when the
 * consumer supplies none, which would be why the parameter list is left
 * unprototyped -- confirm against the callers later in this file.
 */
static int
dt_nullrec()
{
	return (DTRACE_CONSUME_NEXT);
}
487 
488 int
489 dt_print_quantline(dtrace_hdl_t *dtp, FILE *fp, int64_t val,
490     uint64_t normal, long double total, char positives, char negatives)
491 {
492 	long double f;
493 	uint_t depth, len = 40;
494 
495 	const char *ats = "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@";
496 	const char *spaces = "                                        ";
497 
498 	assert(strlen(ats) == len && strlen(spaces) == len);
499 	assert(!(total == 0 && (positives || negatives)));
500 	assert(!(val < 0 && !negatives));
501 	assert(!(val > 0 && !positives));
502 	assert(!(val != 0 && total == 0));
503 
504 	if (!negatives) {
505 		if (positives) {
506 			f = (dt_fabsl((long double)val) * len) / total;
507 			depth = (uint_t)(f + 0.5);
508 		} else {
509 			depth = 0;
510 		}
511 
512 		return (dt_printf(dtp, fp, "|%s%s %-9lld\n", ats + len - depth,
513 		    spaces + depth, (long long)val / normal));
514 	}
515 
516 	if (!positives) {
517 		f = (dt_fabsl((long double)val) * len) / total;
518 		depth = (uint_t)(f + 0.5);
519 
520 		return (dt_printf(dtp, fp, "%s%s| %-9lld\n", spaces + depth,
521 		    ats + len - depth, (long long)val / normal));
522 	}
523 
524 	/*
525 	 * If we're here, we have both positive and negative bucket values.
526 	 * To express this graphically, we're going to generate both positive
527 	 * and negative bars separated by a centerline.  These bars are half
528 	 * the size of normal quantize()/lquantize() bars, so we divide the
529 	 * length in half before calculating the bar length.
530 	 */
531 	len /= 2;
532 	ats = &ats[len];
533 	spaces = &spaces[len];
534 
535 	f = (dt_fabsl((long double)val) * len) / total;
536 	depth = (uint_t)(f + 0.5);
537 
538 	if (val <= 0) {
539 		return (dt_printf(dtp, fp, "%s%s|%*s %-9lld\n", spaces + depth,
540 		    ats + len - depth, len, "", (long long)val / normal));
541 	} else {
542 		return (dt_printf(dtp, fp, "%20s|%s%s %-9lld\n", "",
543 		    ats + len - depth, spaces + depth,
544 		    (long long)val / normal));
545 	}
546 }
547 
548 int
549 dt_print_quantize(dtrace_hdl_t *dtp, FILE *fp, const void *addr,
550     size_t size, uint64_t normal)
551 {
552 	const int64_t *data = addr;
553 	int i, first_bin = 0, last_bin = DTRACE_QUANTIZE_NBUCKETS - 1;
554 	long double total = 0;
555 	char positives = 0, negatives = 0;
556 
557 	if (size != DTRACE_QUANTIZE_NBUCKETS * sizeof (uint64_t))
558 		return (dt_set_errno(dtp, EDT_DMISMATCH));
559 
560 	while (first_bin < DTRACE_QUANTIZE_NBUCKETS - 1 && data[first_bin] == 0)
561 		first_bin++;
562 
563 	if (first_bin == DTRACE_QUANTIZE_NBUCKETS - 1) {
564 		/*
565 		 * There isn't any data.  This is possible if (and only if)
566 		 * negative increment values have been used.  In this case,
567 		 * we'll print the buckets around 0.
568 		 */
569 		first_bin = DTRACE_QUANTIZE_ZEROBUCKET - 1;
570 		last_bin = DTRACE_QUANTIZE_ZEROBUCKET + 1;
571 	} else {
572 		if (first_bin > 0)
573 			first_bin--;
574 
575 		while (last_bin > 0 && data[last_bin] == 0)
576 			last_bin--;
577 
578 		if (last_bin < DTRACE_QUANTIZE_NBUCKETS - 1)
579 			last_bin++;
580 	}
581 
582 	for (i = first_bin; i <= last_bin; i++) {
583 		positives |= (data[i] > 0);
584 		negatives |= (data[i] < 0);
585 		total += dt_fabsl((long double)data[i]);
586 	}
587 
588 	if (dt_printf(dtp, fp, "\n%16s %41s %-9s\n", "value",
589 	    "------------- Distribution -------------", "count") < 0)
590 		return (-1);
591 
592 	for (i = first_bin; i <= last_bin; i++) {
593 		if (dt_printf(dtp, fp, "%16lld ",
594 		    (long long)DTRACE_QUANTIZE_BUCKETVAL(i)) < 0)
595 			return (-1);
596 
597 		if (dt_print_quantline(dtp, fp, data[i], normal, total,
598 		    positives, negatives) < 0)
599 			return (-1);
600 	}
601 
602 	return (0);
603 }
604 
605 int
606 dt_print_lquantize(dtrace_hdl_t *dtp, FILE *fp, const void *addr,
607     size_t size, uint64_t normal)
608 {
609 	const int64_t *data = addr;
610 	int i, first_bin, last_bin, base;
611 	uint64_t arg;
612 	long double total = 0;
613 	uint16_t step, levels;
614 	char positives = 0, negatives = 0;
615 
616 	if (size < sizeof (uint64_t))
617 		return (dt_set_errno(dtp, EDT_DMISMATCH));
618 
619 	arg = *data++;
620 	size -= sizeof (uint64_t);
621 
622 	base = DTRACE_LQUANTIZE_BASE(arg);
623 	step = DTRACE_LQUANTIZE_STEP(arg);
624 	levels = DTRACE_LQUANTIZE_LEVELS(arg);
625 
626 	first_bin = 0;
627 	last_bin = levels + 1;
628 
629 	if (size != sizeof (uint64_t) * (levels + 2))
630 		return (dt_set_errno(dtp, EDT_DMISMATCH));
631 
632 	while (first_bin <= levels + 1 && data[first_bin] == 0)
633 		first_bin++;
634 
635 	if (first_bin > levels + 1) {
636 		first_bin = 0;
637 		last_bin = 2;
638 	} else {
639 		if (first_bin > 0)
640 			first_bin--;
641 
642 		while (last_bin > 0 && data[last_bin] == 0)
643 			last_bin--;
644 
645 		if (last_bin < levels + 1)
646 			last_bin++;
647 	}
648 
649 	for (i = first_bin; i <= last_bin; i++) {
650 		positives |= (data[i] > 0);
651 		negatives |= (data[i] < 0);
652 		total += dt_fabsl((long double)data[i]);
653 	}
654 
655 	if (dt_printf(dtp, fp, "\n%16s %41s %-9s\n", "value",
656 	    "------------- Distribution -------------", "count") < 0)
657 		return (-1);
658 
659 	for (i = first_bin; i <= last_bin; i++) {
660 		char c[32];
661 		int err;
662 
663 		if (i == 0) {
664 			(void) snprintf(c, sizeof (c), "< %d",
665 			    base / (uint32_t)normal);
666 			err = dt_printf(dtp, fp, "%16s ", c);
667 		} else if (i == levels + 1) {
668 			(void) snprintf(c, sizeof (c), ">= %d",
669 			    base + (levels * step));
670 			err = dt_printf(dtp, fp, "%16s ", c);
671 		} else {
672 			err = dt_printf(dtp, fp, "%16d ",
673 			    base + (i - 1) * step);
674 		}
675 
676 		if (err < 0 || dt_print_quantline(dtp, fp, data[i], normal,
677 		    total, positives, negatives) < 0)
678 			return (-1);
679 	}
680 
681 	return (0);
682 }
683 
684 /*ARGSUSED*/
685 static int
686 dt_print_average(dtrace_hdl_t *dtp, FILE *fp, caddr_t addr,
687     size_t size, uint64_t normal)
688 {
689 	/* LINTED - alignment */
690 	int64_t *data = (int64_t *)addr;
691 
692 	return (dt_printf(dtp, fp, " %16lld", data[0] ?
693 	    (long long)(data[1] / (int64_t)normal / data[0]) : 0));
694 }
695 
696 /*ARGSUSED*/
697 static int
698 dt_print_stddev(dtrace_hdl_t *dtp, FILE *fp, caddr_t addr,
699     size_t size, uint64_t normal)
700 {
701 	/* LINTED - alignment */
702 	uint64_t *data = (uint64_t *)addr;
703 
704 	return (dt_printf(dtp, fp, " %16llu", data[0] ?
705 	    (unsigned long long) dt_stddev(data, normal) : 0));
706 }
707 
708 /*ARGSUSED*/
709 int
710 dt_print_bytes(dtrace_hdl_t *dtp, FILE *fp, caddr_t addr,
711     size_t nbytes, int width, int quiet)
712 {
713 	/*
714 	 * If the byte stream is a series of printable characters, followed by
715 	 * a terminating byte, we print it out as a string.  Otherwise, we
716 	 * assume that it's something else and just print the bytes.
717 	 */
718 	int i, j, margin = 5;
719 	char *c = (char *)addr;
720 
721 	if (nbytes == 0)
722 		return (0);
723 
724 	if (dtp->dt_options[DTRACEOPT_RAWBYTES] != DTRACEOPT_UNSET)
725 		goto raw;
726 
727 	for (i = 0; i < nbytes; i++) {
728 		/*
729 		 * We define a "printable character" to be one for which
730 		 * isprint(3C) returns non-zero, isspace(3C) returns non-zero,
731 		 * or a character which is either backspace or the bell.
732 		 * Backspace and the bell are regrettably special because
733 		 * they fail the first two tests -- and yet they are entirely
734 		 * printable.  These are the only two control characters that
735 		 * have meaning for the terminal and for which isprint(3C) and
736 		 * isspace(3C) return 0.
737 		 */
738 		if (isprint(c[i]) || isspace(c[i]) ||
739 		    c[i] == '\b' || c[i] == '\a')
740 			continue;
741 
742 		if (c[i] == '\0' && i > 0) {
743 			/*
744 			 * This looks like it might be a string.  Before we
745 			 * assume that it is indeed a string, check the
746 			 * remainder of the byte range; if it contains
747 			 * additional non-nul characters, we'll assume that
748 			 * it's a binary stream that just happens to look like
749 			 * a string, and we'll print out the individual bytes.
750 			 */
751 			for (j = i + 1; j < nbytes; j++) {
752 				if (c[j] != '\0')
753 					break;
754 			}
755 
756 			if (j != nbytes)
757 				break;
758 
759 			if (quiet)
760 				return (dt_printf(dtp, fp, "%s", c));
761 			else
762 				return (dt_printf(dtp, fp, "  %-*s", width, c));
763 		}
764 
765 		break;
766 	}
767 
768 	if (i == nbytes) {
769 		/*
770 		 * The byte range is all printable characters, but there is
771 		 * no trailing nul byte.  We'll assume that it's a string and
772 		 * print it as such.
773 		 */
774 		char *s = alloca(nbytes + 1);
775 		bcopy(c, s, nbytes);
776 		s[nbytes] = '\0';
777 		return (dt_printf(dtp, fp, "  %-*s", width, s));
778 	}
779 
780 raw:
781 	if (dt_printf(dtp, fp, "\n%*s      ", margin, "") < 0)
782 		return (-1);
783 
784 	for (i = 0; i < 16; i++)
785 		if (dt_printf(dtp, fp, "  %c", "0123456789abcdef"[i]) < 0)
786 			return (-1);
787 
788 	if (dt_printf(dtp, fp, "  0123456789abcdef\n") < 0)
789 		return (-1);
790 
791 
792 	for (i = 0; i < nbytes; i += 16) {
793 		if (dt_printf(dtp, fp, "%*s%5x:", margin, "", i) < 0)
794 			return (-1);
795 
796 		for (j = i; j < i + 16 && j < nbytes; j++) {
797 			if (dt_printf(dtp, fp, " %02x", (uchar_t)c[j]) < 0)
798 				return (-1);
799 		}
800 
801 		while (j++ % 16) {
802 			if (dt_printf(dtp, fp, "   ") < 0)
803 				return (-1);
804 		}
805 
806 		if (dt_printf(dtp, fp, "  ") < 0)
807 			return (-1);
808 
809 		for (j = i; j < i + 16 && j < nbytes; j++) {
810 			if (dt_printf(dtp, fp, "%c",
811 			    c[j] < ' ' || c[j] > '~' ? '.' : c[j]) < 0)
812 				return (-1);
813 		}
814 
815 		if (dt_printf(dtp, fp, "\n") < 0)
816 			return (-1);
817 	}
818 
819 	return (0);
820 }
821 
822 int
823 dt_print_stack(dtrace_hdl_t *dtp, FILE *fp, const char *format,
824     caddr_t addr, int depth, int size)
825 {
826 	dtrace_syminfo_t dts;
827 	GElf_Sym sym;
828 	int i, indent;
829 	char c[PATH_MAX * 2];
830 	uint64_t pc;
831 
832 	if (dt_printf(dtp, fp, "\n") < 0)
833 		return (-1);
834 
835 	if (format == NULL)
836 		format = "%s";
837 
838 	if (dtp->dt_options[DTRACEOPT_STACKINDENT] != DTRACEOPT_UNSET)
839 		indent = (int)dtp->dt_options[DTRACEOPT_STACKINDENT];
840 	else
841 		indent = _dtrace_stkindent;
842 
843 	for (i = 0; i < depth; i++) {
844 		switch (size) {
845 		case sizeof (uint32_t):
846 			/* LINTED - alignment */
847 			pc = *((uint32_t *)addr);
848 			break;
849 
850 		case sizeof (uint64_t):
851 			/* LINTED - alignment */
852 			pc = *((uint64_t *)addr);
853 			break;
854 
855 		default:
856 			return (dt_set_errno(dtp, EDT_BADSTACKPC));
857 		}
858 
859 		if (pc == NULL)
860 			break;
861 
862 		addr += size;
863 
864 		if (dt_printf(dtp, fp, "%*s", indent, "") < 0)
865 			return (-1);
866 
867 		if (dtrace_lookup_by_addr(dtp, pc, &sym, &dts) == 0) {
868 			if (pc > sym.st_value) {
869 				(void) snprintf(c, sizeof (c), "%s`%s+0x%llx",
870 				    dts.dts_object, dts.dts_name,
871 				    pc - sym.st_value);
872 			} else {
873 				(void) snprintf(c, sizeof (c), "%s`%s",
874 				    dts.dts_object, dts.dts_name);
875 			}
876 		} else {
877 			/*
878 			 * We'll repeat the lookup, but this time we'll specify
879 			 * a NULL GElf_Sym -- indicating that we're only
880 			 * interested in the containing module.
881 			 */
882 			if (dtrace_lookup_by_addr(dtp, pc, NULL, &dts) == 0) {
883 				(void) snprintf(c, sizeof (c), "%s`0x%llx",
884 				    dts.dts_object, pc);
885 			} else {
886 				(void) snprintf(c, sizeof (c), "0x%llx", pc);
887 			}
888 		}
889 
890 		if (dt_printf(dtp, fp, format, c) < 0)
891 			return (-1);
892 
893 		if (dt_printf(dtp, fp, "\n") < 0)
894 			return (-1);
895 	}
896 
897 	return (0);
898 }
899 
900 int
901 dt_print_ustack(dtrace_hdl_t *dtp, FILE *fp, const char *format,
902     caddr_t addr, uint64_t arg)
903 {
904 	/* LINTED - alignment */
905 	uint64_t *pc = (uint64_t *)addr;
906 	uint32_t depth = DTRACE_USTACK_NFRAMES(arg);
907 	uint32_t strsize = DTRACE_USTACK_STRSIZE(arg);
908 	const char *strbase = addr + (depth + 1) * sizeof (uint64_t);
909 	const char *str = strsize ? strbase : NULL;
910 	int err = 0;
911 
912 	char name[PATH_MAX], objname[PATH_MAX], c[PATH_MAX * 2];
913 	struct ps_prochandle *P;
914 	GElf_Sym sym;
915 	int i, indent;
916 	pid_t pid;
917 
918 	if (depth == 0)
919 		return (0);
920 
921 	pid = (pid_t)*pc++;
922 
923 	if (dt_printf(dtp, fp, "\n") < 0)
924 		return (-1);
925 
926 	if (format == NULL)
927 		format = "%s";
928 
929 	if (dtp->dt_options[DTRACEOPT_STACKINDENT] != DTRACEOPT_UNSET)
930 		indent = (int)dtp->dt_options[DTRACEOPT_STACKINDENT];
931 	else
932 		indent = _dtrace_stkindent;
933 
934 	/*
935 	 * Ultimately, we need to add an entry point in the library vector for
936 	 * determining <symbol, offset> from <pid, address>.  For now, if
937 	 * this is a vector open, we just print the raw address or string.
938 	 */
939 	if (dtp->dt_vector == NULL)
940 		P = dt_proc_grab(dtp, pid, PGRAB_RDONLY | PGRAB_FORCE, 0);
941 	else
942 		P = NULL;
943 
944 	if (P != NULL)
945 		dt_proc_lock(dtp, P); /* lock handle while we perform lookups */
946 
947 	for (i = 0; i < depth && pc[i] != NULL; i++) {
948 		const prmap_t *map;
949 
950 		if ((err = dt_printf(dtp, fp, "%*s", indent, "")) < 0)
951 			break;
952 
953 		if (P != NULL && Plookup_by_addr(P, pc[i],
954 		    name, sizeof (name), &sym) == 0) {
955 			(void) Pobjname(P, pc[i], objname, sizeof (objname));
956 
957 			if (pc[i] > sym.st_value) {
958 				(void) snprintf(c, sizeof (c),
959 				    "%s`%s+0x%llx", dt_basename(objname), name,
960 				    (u_longlong_t)(pc[i] - sym.st_value));
961 			} else {
962 				(void) snprintf(c, sizeof (c),
963 				    "%s`%s", dt_basename(objname), name);
964 			}
965 		} else if (str != NULL && str[0] != '\0' && str[0] != '@' &&
966 		    (P != NULL && ((map = Paddr_to_map(P, pc[i])) == NULL ||
967 		    (map->pr_mflags & MA_WRITE)))) {
968 			/*
969 			 * If the current string pointer in the string table
970 			 * does not point to an empty string _and_ the program
971 			 * counter falls in a writable region, we'll use the
972 			 * string from the string table instead of the raw
973 			 * address.  This last condition is necessary because
974 			 * some (broken) ustack helpers will return a string
975 			 * even for a program counter that they can't
976 			 * identify.  If we have a string for a program
977 			 * counter that falls in a segment that isn't
978 			 * writable, we assume that we have fallen into this
979 			 * case and we refuse to use the string.
980 			 */
981 			(void) snprintf(c, sizeof (c), "%s", str);
982 		} else {
983 			if (P != NULL && Pobjname(P, pc[i], objname,
984 			    sizeof (objname)) != NULL) {
985 				(void) snprintf(c, sizeof (c), "%s`0x%llx",
986 				    dt_basename(objname), (u_longlong_t)pc[i]);
987 			} else {
988 				(void) snprintf(c, sizeof (c), "0x%llx",
989 				    (u_longlong_t)pc[i]);
990 			}
991 		}
992 
993 		if ((err = dt_printf(dtp, fp, format, c)) < 0)
994 			break;
995 
996 		if ((err = dt_printf(dtp, fp, "\n")) < 0)
997 			break;
998 
999 		if (str != NULL && str[0] == '@') {
1000 			/*
1001 			 * If the first character of the string is an "at" sign,
1002 			 * then the string is inferred to be an annotation --
1003 			 * and it is printed out beneath the frame and offset
1004 			 * with brackets.
1005 			 */
1006 			if ((err = dt_printf(dtp, fp, "%*s", indent, "")) < 0)
1007 				break;
1008 
1009 			(void) snprintf(c, sizeof (c), "  [ %s ]", &str[1]);
1010 
1011 			if ((err = dt_printf(dtp, fp, format, c)) < 0)
1012 				break;
1013 
1014 			if ((err = dt_printf(dtp, fp, "\n")) < 0)
1015 				break;
1016 		}
1017 
1018 		if (str != NULL) {
1019 			str += strlen(str) + 1;
1020 			if (str - strbase >= strsize)
1021 				str = NULL;
1022 		}
1023 	}
1024 
1025 	if (P != NULL) {
1026 		dt_proc_unlock(dtp, P);
1027 		dt_proc_release(dtp, P);
1028 	}
1029 
1030 	return (err);
1031 }
1032 
1033 static int
1034 dt_print_usym(dtrace_hdl_t *dtp, FILE *fp, caddr_t addr, dtrace_actkind_t act)
1035 {
1036 	/* LINTED - alignment */
1037 	uint64_t pid = ((uint64_t *)addr)[0];
1038 	/* LINTED - alignment */
1039 	uint64_t pc = ((uint64_t *)addr)[1];
1040 	const char *format = "  %-50s";
1041 	char *s;
1042 	int n, len = 256;
1043 
1044 	if (act == DTRACEACT_USYM && dtp->dt_vector == NULL) {
1045 		struct ps_prochandle *P;
1046 
1047 		if ((P = dt_proc_grab(dtp, pid,
1048 		    PGRAB_RDONLY | PGRAB_FORCE, 0)) != NULL) {
1049 			GElf_Sym sym;
1050 
1051 			dt_proc_lock(dtp, P);
1052 
1053 			if (Plookup_by_addr(P, pc, NULL, 0, &sym) == 0)
1054 				pc = sym.st_value;
1055 
1056 			dt_proc_unlock(dtp, P);
1057 			dt_proc_release(dtp, P);
1058 		}
1059 	}
1060 
1061 	do {
1062 		n = len;
1063 		s = alloca(n);
1064 	} while ((len = dtrace_uaddr2str(dtp, pid, pc, s, n)) > n);
1065 
1066 	return (dt_printf(dtp, fp, format, s));
1067 }
1068 
1069 int
1070 dt_print_umod(dtrace_hdl_t *dtp, FILE *fp, const char *format, caddr_t addr)
1071 {
1072 	/* LINTED - alignment */
1073 	uint64_t pid = ((uint64_t *)addr)[0];
1074 	/* LINTED - alignment */
1075 	uint64_t pc = ((uint64_t *)addr)[1];
1076 	int err = 0;
1077 
1078 	char objname[PATH_MAX], c[PATH_MAX * 2];
1079 	struct ps_prochandle *P;
1080 
1081 	if (format == NULL)
1082 		format = "  %-50s";
1083 
1084 	/*
1085 	 * See the comment in dt_print_ustack() for the rationale for
1086 	 * printing raw addresses in the vectored case.
1087 	 */
1088 	if (dtp->dt_vector == NULL)
1089 		P = dt_proc_grab(dtp, pid, PGRAB_RDONLY | PGRAB_FORCE, 0);
1090 	else
1091 		P = NULL;
1092 
1093 	if (P != NULL)
1094 		dt_proc_lock(dtp, P); /* lock handle while we perform lookups */
1095 
1096 	if (P != NULL && Pobjname(P, pc, objname, sizeof (objname)) != NULL) {
1097 		(void) snprintf(c, sizeof (c), "%s", dt_basename(objname));
1098 	} else {
1099 		(void) snprintf(c, sizeof (c), "0x%llx", (u_longlong_t)pc);
1100 	}
1101 
1102 	err = dt_printf(dtp, fp, format, c);
1103 
1104 	if (P != NULL) {
1105 		dt_proc_unlock(dtp, P);
1106 		dt_proc_release(dtp, P);
1107 	}
1108 
1109 	return (err);
1110 }
1111 
1112 static int
1113 dt_print_sym(dtrace_hdl_t *dtp, FILE *fp, const char *format, caddr_t addr)
1114 {
1115 	/* LINTED - alignment */
1116 	uint64_t pc = *((uint64_t *)addr);
1117 	dtrace_syminfo_t dts;
1118 	GElf_Sym sym;
1119 	char c[PATH_MAX * 2];
1120 
1121 	if (format == NULL)
1122 		format = "  %-50s";
1123 
1124 	if (dtrace_lookup_by_addr(dtp, pc, &sym, &dts) == 0) {
1125 		(void) snprintf(c, sizeof (c), "%s`%s",
1126 		    dts.dts_object, dts.dts_name);
1127 	} else {
1128 		/*
1129 		 * We'll repeat the lookup, but this time we'll specify a
1130 		 * NULL GElf_Sym -- indicating that we're only interested in
1131 		 * the containing module.
1132 		 */
1133 		if (dtrace_lookup_by_addr(dtp, pc, NULL, &dts) == 0) {
1134 			(void) snprintf(c, sizeof (c), "%s`0x%llx",
1135 			    dts.dts_object, (u_longlong_t)pc);
1136 		} else {
1137 			(void) snprintf(c, sizeof (c), "0x%llx",
1138 			    (u_longlong_t)pc);
1139 		}
1140 	}
1141 
1142 	if (dt_printf(dtp, fp, format, c) < 0)
1143 		return (-1);
1144 
1145 	return (0);
1146 }
1147 
1148 int
1149 dt_print_mod(dtrace_hdl_t *dtp, FILE *fp, const char *format, caddr_t addr)
1150 {
1151 	/* LINTED - alignment */
1152 	uint64_t pc = *((uint64_t *)addr);
1153 	dtrace_syminfo_t dts;
1154 	char c[PATH_MAX * 2];
1155 
1156 	if (format == NULL)
1157 		format = "  %-50s";
1158 
1159 	if (dtrace_lookup_by_addr(dtp, pc, NULL, &dts) == 0) {
1160 		(void) snprintf(c, sizeof (c), "%s", dts.dts_object);
1161 	} else {
1162 		(void) snprintf(c, sizeof (c), "0x%llx", (u_longlong_t)pc);
1163 	}
1164 
1165 	if (dt_printf(dtp, fp, format, c) < 0)
1166 		return (-1);
1167 
1168 	return (0);
1169 }
1170 
/*
 * Argument handed to dt_normalize_agg() by dt_normalize():  identifies the
 * aggregation to normalize and carries the normalization value to apply.
 */
typedef struct dt_normal {
	dtrace_aggvarid_t dtnd_id;	/* aggregation variable ID to match */
	uint64_t dtnd_normal;		/* value stored as dtada_normal */
} dt_normal_t;
1175 
1176 static int
1177 dt_normalize_agg(const dtrace_aggdata_t *aggdata, void *arg)
1178 {
1179 	dt_normal_t *normal = arg;
1180 	dtrace_aggdesc_t *agg = aggdata->dtada_desc;
1181 	dtrace_aggvarid_t id = normal->dtnd_id;
1182 
1183 	if (agg->dtagd_nrecs == 0)
1184 		return (DTRACE_AGGWALK_NEXT);
1185 
1186 	if (agg->dtagd_varid != id)
1187 		return (DTRACE_AGGWALK_NEXT);
1188 
1189 	((dtrace_aggdata_t *)aggdata)->dtada_normal = normal->dtnd_normal;
1190 	return (DTRACE_AGGWALK_NORMALIZE);
1191 }
1192 
1193 static int
1194 dt_normalize(dtrace_hdl_t *dtp, caddr_t base, dtrace_recdesc_t *rec)
1195 {
1196 	dt_normal_t normal;
1197 	caddr_t addr;
1198 
1199 	/*
1200 	 * We (should) have two records:  the aggregation ID followed by the
1201 	 * normalization value.
1202 	 */
1203 	addr = base + rec->dtrd_offset;
1204 
1205 	if (rec->dtrd_size != sizeof (dtrace_aggvarid_t))
1206 		return (dt_set_errno(dtp, EDT_BADNORMAL));
1207 
1208 	/* LINTED - alignment */
1209 	normal.dtnd_id = *((dtrace_aggvarid_t *)addr);
1210 	rec++;
1211 
1212 	if (rec->dtrd_action != DTRACEACT_LIBACT)
1213 		return (dt_set_errno(dtp, EDT_BADNORMAL));
1214 
1215 	if (rec->dtrd_arg != DT_ACT_NORMALIZE)
1216 		return (dt_set_errno(dtp, EDT_BADNORMAL));
1217 
1218 	addr = base + rec->dtrd_offset;
1219 
1220 	switch (rec->dtrd_size) {
1221 	case sizeof (uint64_t):
1222 		/* LINTED - alignment */
1223 		normal.dtnd_normal = *((uint64_t *)addr);
1224 		break;
1225 	case sizeof (uint32_t):
1226 		/* LINTED - alignment */
1227 		normal.dtnd_normal = *((uint32_t *)addr);
1228 		break;
1229 	case sizeof (uint16_t):
1230 		/* LINTED - alignment */
1231 		normal.dtnd_normal = *((uint16_t *)addr);
1232 		break;
1233 	case sizeof (uint8_t):
1234 		normal.dtnd_normal = *((uint8_t *)addr);
1235 		break;
1236 	default:
1237 		return (dt_set_errno(dtp, EDT_BADNORMAL));
1238 	}
1239 
1240 	(void) dtrace_aggregate_walk(dtp, dt_normalize_agg, &normal);
1241 
1242 	return (0);
1243 }
1244 
1245 static int
1246 dt_denormalize_agg(const dtrace_aggdata_t *aggdata, void *arg)
1247 {
1248 	dtrace_aggdesc_t *agg = aggdata->dtada_desc;
1249 	dtrace_aggvarid_t id = *((dtrace_aggvarid_t *)arg);
1250 
1251 	if (agg->dtagd_nrecs == 0)
1252 		return (DTRACE_AGGWALK_NEXT);
1253 
1254 	if (agg->dtagd_varid != id)
1255 		return (DTRACE_AGGWALK_NEXT);
1256 
1257 	return (DTRACE_AGGWALK_DENORMALIZE);
1258 }
1259 
1260 static int
1261 dt_clear_agg(const dtrace_aggdata_t *aggdata, void *arg)
1262 {
1263 	dtrace_aggdesc_t *agg = aggdata->dtada_desc;
1264 	dtrace_aggvarid_t id = *((dtrace_aggvarid_t *)arg);
1265 
1266 	if (agg->dtagd_nrecs == 0)
1267 		return (DTRACE_AGGWALK_NEXT);
1268 
1269 	if (agg->dtagd_varid != id)
1270 		return (DTRACE_AGGWALK_NEXT);
1271 
1272 	return (DTRACE_AGGWALK_CLEAR);
1273 }
1274 
/*
 * Argument handed to dt_trunc_agg() by dt_trunc():  identifies the
 * aggregation to truncate and counts how many more of its entries are to be
 * kept before the walker starts removing them.
 */
typedef struct dt_trunc {
	dtrace_aggvarid_t dttd_id;	/* aggregation variable ID to match */
	uint64_t dttd_remaining;	/* entries still to be retained */
} dt_trunc_t;
1279 
1280 static int
1281 dt_trunc_agg(const dtrace_aggdata_t *aggdata, void *arg)
1282 {
1283 	dt_trunc_t *trunc = arg;
1284 	dtrace_aggdesc_t *agg = aggdata->dtada_desc;
1285 	dtrace_aggvarid_t id = trunc->dttd_id;
1286 
1287 	if (agg->dtagd_nrecs == 0)
1288 		return (DTRACE_AGGWALK_NEXT);
1289 
1290 	if (agg->dtagd_varid != id)
1291 		return (DTRACE_AGGWALK_NEXT);
1292 
1293 	if (trunc->dttd_remaining == 0)
1294 		return (DTRACE_AGGWALK_REMOVE);
1295 
1296 	trunc->dttd_remaining--;
1297 	return (DTRACE_AGGWALK_NEXT);
1298 }
1299 
1300 static int
1301 dt_trunc(dtrace_hdl_t *dtp, caddr_t base, dtrace_recdesc_t *rec)
1302 {
1303 	dt_trunc_t trunc;
1304 	caddr_t addr;
1305 	int64_t remaining;
1306 	int (*func)(dtrace_hdl_t *, dtrace_aggregate_f *, void *);
1307 
1308 	/*
1309 	 * We (should) have two records:  the aggregation ID followed by the
1310 	 * number of aggregation entries after which the aggregation is to be
1311 	 * truncated.
1312 	 */
1313 	addr = base + rec->dtrd_offset;
1314 
1315 	if (rec->dtrd_size != sizeof (dtrace_aggvarid_t))
1316 		return (dt_set_errno(dtp, EDT_BADTRUNC));
1317 
1318 	/* LINTED - alignment */
1319 	trunc.dttd_id = *((dtrace_aggvarid_t *)addr);
1320 	rec++;
1321 
1322 	if (rec->dtrd_action != DTRACEACT_LIBACT)
1323 		return (dt_set_errno(dtp, EDT_BADTRUNC));
1324 
1325 	if (rec->dtrd_arg != DT_ACT_TRUNC)
1326 		return (dt_set_errno(dtp, EDT_BADTRUNC));
1327 
1328 	addr = base + rec->dtrd_offset;
1329 
1330 	switch (rec->dtrd_size) {
1331 	case sizeof (uint64_t):
1332 		/* LINTED - alignment */
1333 		remaining = *((int64_t *)addr);
1334 		break;
1335 	case sizeof (uint32_t):
1336 		/* LINTED - alignment */
1337 		remaining = *((int32_t *)addr);
1338 		break;
1339 	case sizeof (uint16_t):
1340 		/* LINTED - alignment */
1341 		remaining = *((int16_t *)addr);
1342 		break;
1343 	case sizeof (uint8_t):
1344 		remaining = *((int8_t *)addr);
1345 		break;
1346 	default:
1347 		return (dt_set_errno(dtp, EDT_BADNORMAL));
1348 	}
1349 
1350 	if (remaining < 0) {
1351 		func = dtrace_aggregate_walk_valsorted;
1352 		remaining = -remaining;
1353 	} else {
1354 		func = dtrace_aggregate_walk_valrevsorted;
1355 	}
1356 
1357 	assert(remaining >= 0);
1358 	trunc.dttd_remaining = remaining;
1359 
1360 	(void) func(dtp, dt_trunc_agg, &trunc);
1361 
1362 	return (0);
1363 }
1364 
1365 static int
1366 dt_print_datum(dtrace_hdl_t *dtp, FILE *fp, dtrace_recdesc_t *rec,
1367     caddr_t addr, size_t size, uint64_t normal)
1368 {
1369 	int err;
1370 	dtrace_actkind_t act = rec->dtrd_action;
1371 
1372 	switch (act) {
1373 	case DTRACEACT_STACK:
1374 		return (dt_print_stack(dtp, fp, NULL, addr,
1375 		    rec->dtrd_arg, rec->dtrd_size / rec->dtrd_arg));
1376 
1377 	case DTRACEACT_USTACK:
1378 	case DTRACEACT_JSTACK:
1379 		return (dt_print_ustack(dtp, fp, NULL, addr, rec->dtrd_arg));
1380 
1381 	case DTRACEACT_USYM:
1382 	case DTRACEACT_UADDR:
1383 		return (dt_print_usym(dtp, fp, addr, act));
1384 
1385 	case DTRACEACT_UMOD:
1386 		return (dt_print_umod(dtp, fp, NULL, addr));
1387 
1388 	case DTRACEACT_SYM:
1389 		return (dt_print_sym(dtp, fp, NULL, addr));
1390 
1391 	case DTRACEACT_MOD:
1392 		return (dt_print_mod(dtp, fp, NULL, addr));
1393 
1394 	case DTRACEAGG_QUANTIZE:
1395 		return (dt_print_quantize(dtp, fp, addr, size, normal));
1396 
1397 	case DTRACEAGG_LQUANTIZE:
1398 		return (dt_print_lquantize(dtp, fp, addr, size, normal));
1399 
1400 	case DTRACEAGG_AVG:
1401 		return (dt_print_average(dtp, fp, addr, size, normal));
1402 
1403 	case DTRACEAGG_STDDEV:
1404 		return (dt_print_stddev(dtp, fp, addr, size, normal));
1405 
1406 	default:
1407 		break;
1408 	}
1409 
1410 	switch (size) {
1411 	case sizeof (uint64_t):
1412 		err = dt_printf(dtp, fp, " %16lld",
1413 		    /* LINTED - alignment */
1414 		    (long long)*((uint64_t *)addr) / normal);
1415 		break;
1416 	case sizeof (uint32_t):
1417 		/* LINTED - alignment */
1418 		err = dt_printf(dtp, fp, " %8d", *((uint32_t *)addr) /
1419 		    (uint32_t)normal);
1420 		break;
1421 	case sizeof (uint16_t):
1422 		/* LINTED - alignment */
1423 		err = dt_printf(dtp, fp, " %5d", *((uint16_t *)addr) /
1424 		    (uint32_t)normal);
1425 		break;
1426 	case sizeof (uint8_t):
1427 		err = dt_printf(dtp, fp, " %3d", *((uint8_t *)addr) /
1428 		    (uint32_t)normal);
1429 		break;
1430 	default:
1431 		err = dt_print_bytes(dtp, fp, addr, size, 50, 0);
1432 		break;
1433 	}
1434 
1435 	return (err);
1436 }
1437 
1438 int
1439 dt_print_aggs(const dtrace_aggdata_t **aggsdata, int naggvars, void *arg)
1440 {
1441 	int i, aggact = 0;
1442 	dt_print_aggdata_t *pd = arg;
1443 	const dtrace_aggdata_t *aggdata = aggsdata[0];
1444 	dtrace_aggdesc_t *agg = aggdata->dtada_desc;
1445 	FILE *fp = pd->dtpa_fp;
1446 	dtrace_hdl_t *dtp = pd->dtpa_dtp;
1447 	dtrace_recdesc_t *rec;
1448 	dtrace_actkind_t act;
1449 	caddr_t addr;
1450 	size_t size;
1451 
1452 	/*
1453 	 * Iterate over each record description in the key, printing the traced
1454 	 * data, skipping the first datum (the tuple member created by the
1455 	 * compiler).
1456 	 */
1457 	for (i = 1; i < agg->dtagd_nrecs; i++) {
1458 		rec = &agg->dtagd_rec[i];
1459 		act = rec->dtrd_action;
1460 		addr = aggdata->dtada_data + rec->dtrd_offset;
1461 		size = rec->dtrd_size;
1462 
1463 		if (DTRACEACT_ISAGG(act)) {
1464 			aggact = i;
1465 			break;
1466 		}
1467 
1468 		if (dt_print_datum(dtp, fp, rec, addr, size, 1) < 0)
1469 			return (-1);
1470 
1471 		if (dt_buffered_flush(dtp, NULL, rec, aggdata,
1472 		    DTRACE_BUFDATA_AGGKEY) < 0)
1473 			return (-1);
1474 	}
1475 
1476 	assert(aggact != 0);
1477 
1478 	for (i = (naggvars == 1 ? 0 : 1); i < naggvars; i++) {
1479 		uint64_t normal;
1480 
1481 		aggdata = aggsdata[i];
1482 		agg = aggdata->dtada_desc;
1483 		rec = &agg->dtagd_rec[aggact];
1484 		act = rec->dtrd_action;
1485 		addr = aggdata->dtada_data + rec->dtrd_offset;
1486 		size = rec->dtrd_size;
1487 
1488 		assert(DTRACEACT_ISAGG(act));
1489 		normal = aggdata->dtada_normal;
1490 
1491 		if (dt_print_datum(dtp, fp, rec, addr, size, normal) < 0)
1492 			return (-1);
1493 
1494 		if (dt_buffered_flush(dtp, NULL, rec, aggdata,
1495 		    DTRACE_BUFDATA_AGGVAL) < 0)
1496 			return (-1);
1497 
1498 		if (!pd->dtpa_allunprint)
1499 			agg->dtagd_flags |= DTRACE_AGD_PRINTED;
1500 	}
1501 
1502 	if (dt_printf(dtp, fp, "\n") < 0)
1503 		return (-1);
1504 
1505 	if (dt_buffered_flush(dtp, NULL, NULL, aggdata,
1506 	    DTRACE_BUFDATA_AGGFORMAT | DTRACE_BUFDATA_AGGLAST) < 0)
1507 		return (-1);
1508 
1509 	return (0);
1510 }
1511 
1512 int
1513 dt_print_agg(const dtrace_aggdata_t *aggdata, void *arg)
1514 {
1515 	dt_print_aggdata_t *pd = arg;
1516 	dtrace_aggdesc_t *agg = aggdata->dtada_desc;
1517 	dtrace_aggvarid_t aggvarid = pd->dtpa_id;
1518 
1519 	if (pd->dtpa_allunprint) {
1520 		if (agg->dtagd_flags & DTRACE_AGD_PRINTED)
1521 			return (0);
1522 	} else {
1523 		/*
1524 		 * If we're not printing all unprinted aggregations, then the
1525 		 * aggregation variable ID denotes a specific aggregation
1526 		 * variable that we should print -- skip any other aggregations
1527 		 * that we encounter.
1528 		 */
1529 		if (agg->dtagd_nrecs == 0)
1530 			return (0);
1531 
1532 		if (aggvarid != agg->dtagd_varid)
1533 			return (0);
1534 	}
1535 
1536 	return (dt_print_aggs(&aggdata, 1, arg));
1537 }
1538 
1539 int
1540 dt_setopt(dtrace_hdl_t *dtp, const dtrace_probedata_t *data,
1541     const char *option, const char *value)
1542 {
1543 	int len, rval;
1544 	char *msg;
1545 	const char *errstr;
1546 	dtrace_setoptdata_t optdata;
1547 
1548 	bzero(&optdata, sizeof (optdata));
1549 	(void) dtrace_getopt(dtp, option, &optdata.dtsda_oldval);
1550 
1551 	if (dtrace_setopt(dtp, option, value) == 0) {
1552 		(void) dtrace_getopt(dtp, option, &optdata.dtsda_newval);
1553 		optdata.dtsda_probe = data;
1554 		optdata.dtsda_option = option;
1555 		optdata.dtsda_handle = dtp;
1556 
1557 		if ((rval = dt_handle_setopt(dtp, &optdata)) != 0)
1558 			return (rval);
1559 
1560 		return (0);
1561 	}
1562 
1563 	errstr = dtrace_errmsg(dtp, dtrace_errno(dtp));
1564 	len = strlen(option) + strlen(value) + strlen(errstr) + 80;
1565 	msg = alloca(len);
1566 
1567 	(void) snprintf(msg, len, "couldn't set option \"%s\" to \"%s\": %s\n",
1568 	    option, value, errstr);
1569 
1570 	if ((rval = dt_handle_liberr(dtp, data, msg)) == 0)
1571 		return (0);
1572 
1573 	return (rval);
1574 }
1575 
1576 static int
1577 dt_consume_cpu(dtrace_hdl_t *dtp, FILE *fp, int cpu, dtrace_bufdesc_t *buf,
1578     dtrace_consume_probe_f *efunc, dtrace_consume_rec_f *rfunc, void *arg)
1579 {
1580 	dtrace_epid_t id;
1581 	size_t offs, start = buf->dtbd_oldest, end = buf->dtbd_size;
1582 	int flow = (dtp->dt_options[DTRACEOPT_FLOWINDENT] != DTRACEOPT_UNSET);
1583 	int quiet = (dtp->dt_options[DTRACEOPT_QUIET] != DTRACEOPT_UNSET);
1584 	int rval, i, n;
1585 	dtrace_epid_t last = DTRACE_EPIDNONE;
1586 	dtrace_probedata_t data;
1587 	uint64_t drops;
1588 	caddr_t addr;
1589 
1590 	bzero(&data, sizeof (data));
1591 	data.dtpda_handle = dtp;
1592 	data.dtpda_cpu = cpu;
1593 
1594 again:
1595 	for (offs = start; offs < end; ) {
1596 		dtrace_eprobedesc_t *epd;
1597 
1598 		/*
1599 		 * We're guaranteed to have an ID.
1600 		 */
1601 		id = *(uint32_t *)((uintptr_t)buf->dtbd_data + offs);
1602 
1603 		if (id == DTRACE_EPIDNONE) {
1604 			/*
1605 			 * This is filler to assure proper alignment of the
1606 			 * next record; we simply ignore it.
1607 			 */
1608 			offs += sizeof (id);
1609 			continue;
1610 		}
1611 
1612 		if ((rval = dt_epid_lookup(dtp, id, &data.dtpda_edesc,
1613 		    &data.dtpda_pdesc)) != 0)
1614 			return (rval);
1615 
1616 		epd = data.dtpda_edesc;
1617 		data.dtpda_data = buf->dtbd_data + offs;
1618 
1619 		if (data.dtpda_edesc->dtepd_uarg != DT_ECB_DEFAULT) {
1620 			rval = dt_handle(dtp, &data);
1621 
1622 			if (rval == DTRACE_CONSUME_NEXT)
1623 				goto nextepid;
1624 
1625 			if (rval == DTRACE_CONSUME_ERROR)
1626 				return (-1);
1627 		}
1628 
1629 		if (flow)
1630 			(void) dt_flowindent(dtp, &data, last, buf, offs);
1631 
1632 		rval = (*efunc)(&data, arg);
1633 
1634 		if (flow) {
1635 			if (data.dtpda_flow == DTRACEFLOW_ENTRY)
1636 				data.dtpda_indent += 2;
1637 		}
1638 
1639 		if (rval == DTRACE_CONSUME_NEXT)
1640 			goto nextepid;
1641 
1642 		if (rval == DTRACE_CONSUME_ABORT)
1643 			return (dt_set_errno(dtp, EDT_DIRABORT));
1644 
1645 		if (rval != DTRACE_CONSUME_THIS)
1646 			return (dt_set_errno(dtp, EDT_BADRVAL));
1647 
1648 		for (i = 0; i < epd->dtepd_nrecs; i++) {
1649 			dtrace_recdesc_t *rec = &epd->dtepd_rec[i];
1650 			dtrace_actkind_t act = rec->dtrd_action;
1651 
1652 			data.dtpda_data = buf->dtbd_data + offs +
1653 			    rec->dtrd_offset;
1654 			addr = data.dtpda_data;
1655 
1656 			if (act == DTRACEACT_LIBACT) {
1657 				uint64_t arg = rec->dtrd_arg;
1658 				dtrace_aggvarid_t id;
1659 
1660 				switch (arg) {
1661 				case DT_ACT_CLEAR:
1662 					/* LINTED - alignment */
1663 					id = *((dtrace_aggvarid_t *)addr);
1664 					(void) dtrace_aggregate_walk(dtp,
1665 					    dt_clear_agg, &id);
1666 					continue;
1667 
1668 				case DT_ACT_DENORMALIZE:
1669 					/* LINTED - alignment */
1670 					id = *((dtrace_aggvarid_t *)addr);
1671 					(void) dtrace_aggregate_walk(dtp,
1672 					    dt_denormalize_agg, &id);
1673 					continue;
1674 
1675 				case DT_ACT_FTRUNCATE:
1676 					if (fp == NULL)
1677 						continue;
1678 
1679 					(void) fflush(fp);
1680 					(void) ftruncate(fileno(fp), 0);
1681 					(void) fseeko(fp, 0, SEEK_SET);
1682 					continue;
1683 
1684 				case DT_ACT_NORMALIZE:
1685 					if (i == epd->dtepd_nrecs - 1)
1686 						return (dt_set_errno(dtp,
1687 						    EDT_BADNORMAL));
1688 
1689 					if (dt_normalize(dtp,
1690 					    buf->dtbd_data + offs, rec) != 0)
1691 						return (-1);
1692 
1693 					i++;
1694 					continue;
1695 
1696 				case DT_ACT_SETOPT: {
1697 					uint64_t *opts = dtp->dt_options;
1698 					dtrace_recdesc_t *valrec;
1699 					uint32_t valsize;
1700 					caddr_t val;
1701 					int rv;
1702 
1703 					if (i == epd->dtepd_nrecs - 1) {
1704 						return (dt_set_errno(dtp,
1705 						    EDT_BADSETOPT));
1706 					}
1707 
1708 					valrec = &epd->dtepd_rec[++i];
1709 					valsize = valrec->dtrd_size;
1710 
1711 					if (valrec->dtrd_action != act ||
1712 					    valrec->dtrd_arg != arg) {
1713 						return (dt_set_errno(dtp,
1714 						    EDT_BADSETOPT));
1715 					}
1716 
1717 					if (valsize > sizeof (uint64_t)) {
1718 						val = buf->dtbd_data + offs +
1719 						    valrec->dtrd_offset;
1720 					} else {
1721 						val = "1";
1722 					}
1723 
1724 					rv = dt_setopt(dtp, &data, addr, val);
1725 
1726 					if (rv != 0)
1727 						return (-1);
1728 
1729 					flow = (opts[DTRACEOPT_FLOWINDENT] !=
1730 					    DTRACEOPT_UNSET);
1731 					quiet = (opts[DTRACEOPT_QUIET] !=
1732 					    DTRACEOPT_UNSET);
1733 
1734 					continue;
1735 				}
1736 
1737 				case DT_ACT_TRUNC:
1738 					if (i == epd->dtepd_nrecs - 1)
1739 						return (dt_set_errno(dtp,
1740 						    EDT_BADTRUNC));
1741 
1742 					if (dt_trunc(dtp,
1743 					    buf->dtbd_data + offs, rec) != 0)
1744 						return (-1);
1745 
1746 					i++;
1747 					continue;
1748 
1749 				default:
1750 					continue;
1751 				}
1752 			}
1753 
1754 			rval = (*rfunc)(&data, rec, arg);
1755 
1756 			if (rval == DTRACE_CONSUME_NEXT)
1757 				continue;
1758 
1759 			if (rval == DTRACE_CONSUME_ABORT)
1760 				return (dt_set_errno(dtp, EDT_DIRABORT));
1761 
1762 			if (rval != DTRACE_CONSUME_THIS)
1763 				return (dt_set_errno(dtp, EDT_BADRVAL));
1764 
1765 			if (act == DTRACEACT_STACK) {
1766 				int depth = rec->dtrd_arg;
1767 
1768 				if (dt_print_stack(dtp, fp, NULL, addr, depth,
1769 				    rec->dtrd_size / depth) < 0)
1770 					return (-1);
1771 				goto nextrec;
1772 			}
1773 
1774 			if (act == DTRACEACT_USTACK ||
1775 			    act == DTRACEACT_JSTACK) {
1776 				if (dt_print_ustack(dtp, fp, NULL,
1777 				    addr, rec->dtrd_arg) < 0)
1778 					return (-1);
1779 				goto nextrec;
1780 			}
1781 
1782 			if (act == DTRACEACT_SYM) {
1783 				if (dt_print_sym(dtp, fp, NULL, addr) < 0)
1784 					return (-1);
1785 				goto nextrec;
1786 			}
1787 
1788 			if (act == DTRACEACT_MOD) {
1789 				if (dt_print_mod(dtp, fp, NULL, addr) < 0)
1790 					return (-1);
1791 				goto nextrec;
1792 			}
1793 
1794 			if (act == DTRACEACT_USYM || act == DTRACEACT_UADDR) {
1795 				if (dt_print_usym(dtp, fp, addr, act) < 0)
1796 					return (-1);
1797 				goto nextrec;
1798 			}
1799 
1800 			if (act == DTRACEACT_UMOD) {
1801 				if (dt_print_umod(dtp, fp, NULL, addr) < 0)
1802 					return (-1);
1803 				goto nextrec;
1804 			}
1805 
1806 			if (DTRACEACT_ISPRINTFLIKE(act)) {
1807 				void *fmtdata;
1808 				int (*func)(dtrace_hdl_t *, FILE *, void *,
1809 				    const dtrace_probedata_t *,
1810 				    const dtrace_recdesc_t *, uint_t,
1811 				    const void *buf, size_t);
1812 
1813 				if ((fmtdata = dt_format_lookup(dtp,
1814 				    rec->dtrd_format)) == NULL)
1815 					goto nofmt;
1816 
1817 				switch (act) {
1818 				case DTRACEACT_PRINTF:
1819 					func = dtrace_fprintf;
1820 					break;
1821 				case DTRACEACT_PRINTA:
1822 					func = dtrace_fprinta;
1823 					break;
1824 				case DTRACEACT_SYSTEM:
1825 					func = dtrace_system;
1826 					break;
1827 				case DTRACEACT_FREOPEN:
1828 					func = dtrace_freopen;
1829 					break;
1830 				}
1831 
1832 				n = (*func)(dtp, fp, fmtdata, &data,
1833 				    rec, epd->dtepd_nrecs - i,
1834 				    (uchar_t *)buf->dtbd_data + offs,
1835 				    buf->dtbd_size - offs);
1836 
1837 				if (n < 0)
1838 					return (-1); /* errno is set for us */
1839 
1840 				if (n > 0)
1841 					i += n - 1;
1842 				goto nextrec;
1843 			}
1844 
1845 nofmt:
1846 			if (act == DTRACEACT_PRINTA) {
1847 				dt_print_aggdata_t pd;
1848 				dtrace_aggvarid_t *aggvars;
1849 				int j, naggvars = 0;
1850 				size_t size = ((epd->dtepd_nrecs - i) *
1851 				    sizeof (dtrace_aggvarid_t));
1852 
1853 				if ((aggvars = dt_alloc(dtp, size)) == NULL)
1854 					return (-1);
1855 
1856 				/*
1857 				 * This might be a printa() with multiple
1858 				 * aggregation variables.  We need to scan
1859 				 * forward through the records until we find
1860 				 * a record from a different statement.
1861 				 */
1862 				for (j = i; j < epd->dtepd_nrecs; j++) {
1863 					dtrace_recdesc_t *nrec;
1864 					caddr_t naddr;
1865 
1866 					nrec = &epd->dtepd_rec[j];
1867 
1868 					if (nrec->dtrd_uarg != rec->dtrd_uarg)
1869 						break;
1870 
1871 					if (nrec->dtrd_action != act) {
1872 						return (dt_set_errno(dtp,
1873 						    EDT_BADAGG));
1874 					}
1875 
1876 					naddr = buf->dtbd_data + offs +
1877 					    nrec->dtrd_offset;
1878 
1879 					aggvars[naggvars++] =
1880 					    /* LINTED - alignment */
1881 					    *((dtrace_aggvarid_t *)naddr);
1882 				}
1883 
1884 				i = j - 1;
1885 				bzero(&pd, sizeof (pd));
1886 				pd.dtpa_dtp = dtp;
1887 				pd.dtpa_fp = fp;
1888 
1889 				assert(naggvars >= 1);
1890 
1891 				if (naggvars == 1) {
1892 					pd.dtpa_id = aggvars[0];
1893 					dt_free(dtp, aggvars);
1894 
1895 					if (dt_printf(dtp, fp, "\n") < 0 ||
1896 					    dtrace_aggregate_walk_sorted(dtp,
1897 					    dt_print_agg, &pd) < 0)
1898 						return (-1);
1899 					goto nextrec;
1900 				}
1901 
1902 				if (dt_printf(dtp, fp, "\n") < 0 ||
1903 				    dtrace_aggregate_walk_joined(dtp, aggvars,
1904 				    naggvars, dt_print_aggs, &pd) < 0) {
1905 					dt_free(dtp, aggvars);
1906 					return (-1);
1907 				}
1908 
1909 				dt_free(dtp, aggvars);
1910 				goto nextrec;
1911 			}
1912 
1913 			switch (rec->dtrd_size) {
1914 			case sizeof (uint64_t):
1915 				n = dt_printf(dtp, fp,
1916 				    quiet ? "%lld" : " %16lld",
1917 				    /* LINTED - alignment */
1918 				    *((unsigned long long *)addr));
1919 				break;
1920 			case sizeof (uint32_t):
1921 				n = dt_printf(dtp, fp, quiet ? "%d" : " %8d",
1922 				    /* LINTED - alignment */
1923 				    *((uint32_t *)addr));
1924 				break;
1925 			case sizeof (uint16_t):
1926 				n = dt_printf(dtp, fp, quiet ? "%d" : " %5d",
1927 				    /* LINTED - alignment */
1928 				    *((uint16_t *)addr));
1929 				break;
1930 			case sizeof (uint8_t):
1931 				n = dt_printf(dtp, fp, quiet ? "%d" : " %3d",
1932 				    *((uint8_t *)addr));
1933 				break;
1934 			default:
1935 				n = dt_print_bytes(dtp, fp, addr,
1936 				    rec->dtrd_size, 33, quiet);
1937 				break;
1938 			}
1939 
1940 			if (n < 0)
1941 				return (-1); /* errno is set for us */
1942 
1943 nextrec:
1944 			if (dt_buffered_flush(dtp, &data, rec, NULL, 0) < 0)
1945 				return (-1); /* errno is set for us */
1946 		}
1947 
1948 		/*
1949 		 * Call the record callback with a NULL record to indicate
1950 		 * that we're done processing this EPID.
1951 		 */
1952 		rval = (*rfunc)(&data, NULL, arg);
1953 nextepid:
1954 		offs += epd->dtepd_size;
1955 		last = id;
1956 	}
1957 
1958 	if (buf->dtbd_oldest != 0 && start == buf->dtbd_oldest) {
1959 		end = buf->dtbd_oldest;
1960 		start = 0;
1961 		goto again;
1962 	}
1963 
1964 	if ((drops = buf->dtbd_drops) == 0)
1965 		return (0);
1966 
1967 	/*
1968 	 * Explicitly zero the drops to prevent us from processing them again.
1969 	 */
1970 	buf->dtbd_drops = 0;
1971 
1972 	return (dt_handle_cpudrop(dtp, cpu, DTRACEDROP_PRINCIPAL, drops));
1973 }
1974 
/*
 * State shared by the interposing callbacks (dt_consume_begin_probe(),
 * dt_consume_begin_record() and dt_consume_begin_error()) that
 * dt_consume_begin() installs while it consumes the BEGIN CPU's buffer.
 */
typedef struct dt_begin {
	dtrace_consume_probe_f *dtbgn_probefunc; /* caller's probe callback */
	dtrace_consume_rec_f *dtbgn_recfunc;	/* caller's record callback */
	void *dtbgn_arg;			/* argument for both of them */
	dtrace_handle_err_f *dtbgn_errhdlr;	/* interposed-upon handler */
	void *dtbgn_errarg;			/* ...and its argument */
	int dtbgn_beginonly;	/* non-zero: only BEGIN; zero: all but BEGIN */
} dt_begin_t;
1983 
1984 static int
1985 dt_consume_begin_probe(const dtrace_probedata_t *data, void *arg)
1986 {
1987 	dt_begin_t *begin = (dt_begin_t *)arg;
1988 	dtrace_probedesc_t *pd = data->dtpda_pdesc;
1989 
1990 	int r1 = (strcmp(pd->dtpd_provider, "dtrace") == 0);
1991 	int r2 = (strcmp(pd->dtpd_name, "BEGIN") == 0);
1992 
1993 	if (begin->dtbgn_beginonly) {
1994 		if (!(r1 && r2))
1995 			return (DTRACE_CONSUME_NEXT);
1996 	} else {
1997 		if (r1 && r2)
1998 			return (DTRACE_CONSUME_NEXT);
1999 	}
2000 
2001 	/*
2002 	 * We have a record that we're interested in.  Now call the underlying
2003 	 * probe function...
2004 	 */
2005 	return (begin->dtbgn_probefunc(data, begin->dtbgn_arg));
2006 }
2007 
2008 static int
2009 dt_consume_begin_record(const dtrace_probedata_t *data,
2010     const dtrace_recdesc_t *rec, void *arg)
2011 {
2012 	dt_begin_t *begin = (dt_begin_t *)arg;
2013 
2014 	return (begin->dtbgn_recfunc(data, rec, begin->dtbgn_arg));
2015 }
2016 
2017 static int
2018 dt_consume_begin_error(const dtrace_errdata_t *data, void *arg)
2019 {
2020 	dt_begin_t *begin = (dt_begin_t *)arg;
2021 	dtrace_probedesc_t *pd = data->dteda_pdesc;
2022 
2023 	int r1 = (strcmp(pd->dtpd_provider, "dtrace") == 0);
2024 	int r2 = (strcmp(pd->dtpd_name, "BEGIN") == 0);
2025 
2026 	if (begin->dtbgn_beginonly) {
2027 		if (!(r1 && r2))
2028 			return (DTRACE_HANDLE_OK);
2029 	} else {
2030 		if (r1 && r2)
2031 			return (DTRACE_HANDLE_OK);
2032 	}
2033 
2034 	return (begin->dtbgn_errhdlr(data, begin->dtbgn_errarg));
2035 }
2036 
2037 static int
2038 dt_consume_begin(dtrace_hdl_t *dtp, FILE *fp, dtrace_bufdesc_t *buf,
2039     dtrace_consume_probe_f *pf, dtrace_consume_rec_f *rf, void *arg)
2040 {
2041 	/*
2042 	 * There's this idea that the BEGIN probe should be processed before
2043 	 * everything else, and that the END probe should be processed after
2044 	 * anything else.  In the common case, this is pretty easy to deal
2045 	 * with.  However, a situation may arise where the BEGIN enabling and
2046 	 * END enabling are on the same CPU, and some enabling in the middle
2047 	 * occurred on a different CPU.  To deal with this (blech!) we need to
2048 	 * consume the BEGIN buffer up until the end of the BEGIN probe, and
2049 	 * then set it aside.  We will then process every other CPU, and then
2050 	 * we'll return to the BEGIN CPU and process the rest of the data
2051 	 * (which will inevitably include the END probe, if any).  Making this
2052 	 * even more complicated (!) is the library's ERROR enabling.  Because
2053 	 * this enabling is processed before we even get into the consume call
2054 	 * back, any ERROR firing would result in the library's ERROR enabling
2055 	 * being processed twice -- once in our first pass (for BEGIN probes),
2056 	 * and again in our second pass (for everything but BEGIN probes).  To
2057 	 * deal with this, we interpose on the ERROR handler to assure that we
2058 	 * only process ERROR enablings induced by BEGIN enablings in the
2059 	 * first pass, and that we only process ERROR enablings _not_ induced
2060 	 * by BEGIN enablings in the second pass.
2061 	 */
2062 	dt_begin_t begin;
2063 	processorid_t cpu = dtp->dt_beganon;
2064 	dtrace_bufdesc_t nbuf;
2065 	int rval, i;
2066 	static int max_ncpus;
2067 	dtrace_optval_t size;
2068 
2069 	dtp->dt_beganon = -1;
2070 
2071 	if (dt_ioctl(dtp, DTRACEIOC_BUFSNAP, buf) == -1) {
2072 		/*
2073 		 * We really don't expect this to fail, but it is at least
2074 		 * technically possible for this to fail with ENOENT.  In this
2075 		 * case, we just drive on...
2076 		 */
2077 		if (errno == ENOENT)
2078 			return (0);
2079 
2080 		return (dt_set_errno(dtp, errno));
2081 	}
2082 
2083 	if (!dtp->dt_stopped || buf->dtbd_cpu != dtp->dt_endedon) {
2084 		/*
2085 		 * This is the simple case.  We're either not stopped, or if
2086 		 * we are, we actually processed any END probes on another
2087 		 * CPU.  We can simply consume this buffer and return.
2088 		 */
2089 		return (dt_consume_cpu(dtp, fp, cpu, buf, pf, rf, arg));
2090 	}
2091 
2092 	begin.dtbgn_probefunc = pf;
2093 	begin.dtbgn_recfunc = rf;
2094 	begin.dtbgn_arg = arg;
2095 	begin.dtbgn_beginonly = 1;
2096 
2097 	/*
2098 	 * We need to interpose on the ERROR handler to be sure that we
2099 	 * only process ERRORs induced by BEGIN.
2100 	 */
2101 	begin.dtbgn_errhdlr = dtp->dt_errhdlr;
2102 	begin.dtbgn_errarg = dtp->dt_errarg;
2103 	dtp->dt_errhdlr = dt_consume_begin_error;
2104 	dtp->dt_errarg = &begin;
2105 
2106 	rval = dt_consume_cpu(dtp, fp, cpu, buf, dt_consume_begin_probe,
2107 	    dt_consume_begin_record, &begin);
2108 
2109 	dtp->dt_errhdlr = begin.dtbgn_errhdlr;
2110 	dtp->dt_errarg = begin.dtbgn_errarg;
2111 
2112 	if (rval != 0)
2113 		return (rval);
2114 
2115 	/*
2116 	 * Now allocate a new buffer.  We'll use this to deal with every other
2117 	 * CPU.
2118 	 */
2119 	bzero(&nbuf, sizeof (dtrace_bufdesc_t));
2120 	(void) dtrace_getopt(dtp, "bufsize", &size);
2121 	if ((nbuf.dtbd_data = malloc(size)) == NULL)
2122 		return (dt_set_errno(dtp, EDT_NOMEM));
2123 
2124 	if (max_ncpus == 0)
2125 		max_ncpus = dt_sysconf(dtp, _SC_CPUID_MAX) + 1;
2126 
2127 	for (i = 0; i < max_ncpus; i++) {
2128 		nbuf.dtbd_cpu = i;
2129 
2130 		if (i == cpu)
2131 			continue;
2132 
2133 		if (dt_ioctl(dtp, DTRACEIOC_BUFSNAP, &nbuf) == -1) {
2134 			/*
2135 			 * If we failed with ENOENT, it may be because the
2136 			 * CPU was unconfigured -- this is okay.  Any other
2137 			 * error, however, is unexpected.
2138 			 */
2139 			if (errno == ENOENT)
2140 				continue;
2141 
2142 			free(nbuf.dtbd_data);
2143 
2144 			return (dt_set_errno(dtp, errno));
2145 		}
2146 
2147 		if ((rval = dt_consume_cpu(dtp, fp,
2148 		    i, &nbuf, pf, rf, arg)) != 0) {
2149 			free(nbuf.dtbd_data);
2150 			return (rval);
2151 		}
2152 	}
2153 
2154 	free(nbuf.dtbd_data);
2155 
2156 	/*
2157 	 * Okay -- we're done with the other buffers.  Now we want to
2158 	 * reconsume the first buffer -- but this time we're looking for
2159 	 * everything _but_ BEGIN.  And of course, in order to only consume
2160 	 * those ERRORs _not_ associated with BEGIN, we need to reinstall our
2161 	 * ERROR interposition function...
2162 	 */
2163 	begin.dtbgn_beginonly = 0;
2164 
2165 	assert(begin.dtbgn_errhdlr == dtp->dt_errhdlr);
2166 	assert(begin.dtbgn_errarg == dtp->dt_errarg);
2167 	dtp->dt_errhdlr = dt_consume_begin_error;
2168 	dtp->dt_errarg = &begin;
2169 
2170 	rval = dt_consume_cpu(dtp, fp, cpu, buf, dt_consume_begin_probe,
2171 	    dt_consume_begin_record, &begin);
2172 
2173 	dtp->dt_errhdlr = begin.dtbgn_errhdlr;
2174 	dtp->dt_errarg = begin.dtbgn_errarg;
2175 
2176 	return (rval);
2177 }
2178 
2179 int
2180 dtrace_consume(dtrace_hdl_t *dtp, FILE *fp,
2181     dtrace_consume_probe_f *pf, dtrace_consume_rec_f *rf, void *arg)
2182 {
2183 	dtrace_bufdesc_t *buf = &dtp->dt_buf;
2184 	dtrace_optval_t size;
2185 	static int max_ncpus;
2186 	int i, rval;
2187 	dtrace_optval_t interval = dtp->dt_options[DTRACEOPT_SWITCHRATE];
2188 	hrtime_t now = gethrtime();
2189 
2190 	if (dtp->dt_lastswitch != 0) {
2191 		if (now - dtp->dt_lastswitch < interval)
2192 			return (0);
2193 
2194 		dtp->dt_lastswitch += interval;
2195 	} else {
2196 		dtp->dt_lastswitch = now;
2197 	}
2198 
2199 	if (!dtp->dt_active)
2200 		return (dt_set_errno(dtp, EINVAL));
2201 
2202 	if (max_ncpus == 0)
2203 		max_ncpus = dt_sysconf(dtp, _SC_CPUID_MAX) + 1;
2204 
2205 	if (pf == NULL)
2206 		pf = (dtrace_consume_probe_f *)dt_nullprobe;
2207 
2208 	if (rf == NULL)
2209 		rf = (dtrace_consume_rec_f *)dt_nullrec;
2210 
2211 	if (buf->dtbd_data == NULL) {
2212 		(void) dtrace_getopt(dtp, "bufsize", &size);
2213 		if ((buf->dtbd_data = malloc(size)) == NULL)
2214 			return (dt_set_errno(dtp, EDT_NOMEM));
2215 
2216 		buf->dtbd_size = size;
2217 	}
2218 
2219 	/*
2220 	 * If we have just begun, we want to first process the CPU that
2221 	 * executed the BEGIN probe (if any).
2222 	 */
2223 	if (dtp->dt_active && dtp->dt_beganon != -1) {
2224 		buf->dtbd_cpu = dtp->dt_beganon;
2225 		if ((rval = dt_consume_begin(dtp, fp, buf, pf, rf, arg)) != 0)
2226 			return (rval);
2227 	}
2228 
2229 	for (i = 0; i < max_ncpus; i++) {
2230 		buf->dtbd_cpu = i;
2231 
2232 		/*
2233 		 * If we have stopped, we want to process the CPU on which the
2234 		 * END probe was processed only _after_ we have processed
2235 		 * everything else.
2236 		 */
2237 		if (dtp->dt_stopped && (i == dtp->dt_endedon))
2238 			continue;
2239 
2240 		if (dt_ioctl(dtp, DTRACEIOC_BUFSNAP, buf) == -1) {
2241 			/*
2242 			 * If we failed with ENOENT, it may be because the
2243 			 * CPU was unconfigured -- this is okay.  Any other
2244 			 * error, however, is unexpected.
2245 			 */
2246 			if (errno == ENOENT)
2247 				continue;
2248 
2249 			return (dt_set_errno(dtp, errno));
2250 		}
2251 
2252 		if ((rval = dt_consume_cpu(dtp, fp, i, buf, pf, rf, arg)) != 0)
2253 			return (rval);
2254 	}
2255 
2256 	if (!dtp->dt_stopped)
2257 		return (0);
2258 
2259 	buf->dtbd_cpu = dtp->dt_endedon;
2260 
2261 	if (dt_ioctl(dtp, DTRACEIOC_BUFSNAP, buf) == -1) {
2262 		/*
2263 		 * This _really_ shouldn't fail, but it is strictly speaking
2264 		 * possible for this to return ENOENT if the CPU that called
2265 		 * the END enabling somehow managed to become unconfigured.
2266 		 * It's unclear how the user can possibly expect anything
2267 		 * rational to happen in this case -- the state has been thrown
2268 		 * out along with the unconfigured CPU -- so we'll just drive
2269 		 * on...
2270 		 */
2271 		if (errno == ENOENT)
2272 			return (0);
2273 
2274 		return (dt_set_errno(dtp, errno));
2275 	}
2276 
2277 	return (dt_consume_cpu(dtp, fp, dtp->dt_endedon, buf, pf, rf, arg));
2278 }
2279