xref: /titanic_50/usr/src/lib/libdtrace/common/dt_consume.c (revision 4c273cfa4ad8398f4157cd1d6fa54fc1cbc266ff)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright (c) 2011, Joyent, Inc. All rights reserved.
28  * Copyright (c) 2012 by Delphix. All rights reserved.
29  */
30 
31 #include <stdlib.h>
32 #include <strings.h>
33 #include <errno.h>
34 #include <unistd.h>
35 #include <limits.h>
36 #include <assert.h>
37 #include <ctype.h>
38 #include <alloca.h>
39 #include <dt_impl.h>
40 #include <dt_pq.h>
41 
42 #define	DT_MASK_LO 0x00000000FFFFFFFFULL
43 
/*
 * Absolute value of a long double.  This lives here, rather than coming
 * from fabsl(3M), so that libdtrace does not pick up a dependency on libm.
 */
static long double
dt_fabsl(long double x)
{
	return (x < 0 ? -x : x);
}
56 
57 /*
58  * 128-bit arithmetic functions needed to support the stddev() aggregating
59  * action.
60  */
/*
 * Return 1 if the 128-bit value a is strictly greater than b, 0 otherwise.
 * Element 1 of each array is compared first (the more significant word) and
 * element 0 breaks the tie.
 */
static int
dt_gt_128(uint64_t *a, uint64_t *b)
{
	if (a[1] != b[1])
		return (a[1] > b[1]);

	return (a[0] > b[0]);
}
66 
/*
 * Return 1 if the 128-bit value a is greater than or equal to b, 0
 * otherwise.  Element 1 of each array is the more significant word.
 */
static int
dt_ge_128(uint64_t *a, uint64_t *b)
{
	if (a[1] != b[1])
		return (a[1] > b[1]);

	return (a[0] >= b[0]);
}
72 
/*
 * Return 1 if the 128-bit value a is less than or equal to b, 0 otherwise.
 * Element 1 of each array is the more significant word.
 */
static int
dt_le_128(uint64_t *a, uint64_t *b)
{
	if (a[1] != b[1])
		return (a[1] < b[1]);

	return (a[0] <= b[0]);
}
78 
/*
 * Shift the 128-bit value in a by b bits, in place.  a[0] is the low word
 * and a[1] is the high word.  If b is positive, shift left; if b is
 * negative, shift right.  Vacated bits are filled with zeroes and bits
 * shifted out are discarded; shifting by 128 bits or more in either
 * direction therefore leaves zero.
 */
static void
dt_shift_128(uint64_t *a, int b)
{
	if (b == 0)
		return;

	/*
	 * A native shift by the word width or more is undefined, so a shift
	 * that empties the whole value is handled up front.  (This also means
	 * that the negation below can never be applied to INT_MIN.)
	 */
	if (b <= -128 || b >= 128) {
		a[0] = 0;
		a[1] = 0;
		return;
	}

	if (b < 0) {
		b = -b;
		if (b >= 64) {
			a[0] = a[1] >> (b - 64);
			a[1] = 0;
		} else {
			/*
			 * The low b bits of the high word become the top b
			 * bits of the low word.  The left shift discards
			 * everything above those bits on its own, so no mask
			 * is applied to the high word first.
			 */
			a[0] = (a[0] >> b) | (a[1] << (64 - b));
			a[1] >>= b;
		}
	} else {
		if (b >= 64) {
			a[1] = a[0] << (b - 64);
			a[0] = 0;
		} else {
			/*
			 * The top b bits of the low word carry into the
			 * bottom of the high word.
			 */
			a[1] = (a[1] << b) | (a[0] >> (64 - b));
			a[0] <<= b;
		}
	}
}
115 
/*
 * Return the zero-based position of the most significant set bit in the
 * 128-bit value a, i.e. one less than the number of significant bits.
 * Both zero and one yield 0.  The caller's value is left untouched.
 */
static int
dt_nbits_128(uint64_t *a)
{
	uint64_t scratch[2] = { a[0], a[1] };
	uint64_t zero[2] = { 0, 0 };
	int nbits;

	/*
	 * Drop the lowest bit up front, then count how many further one-bit
	 * right shifts it takes to empty the working copy.
	 */
	dt_shift_128(scratch, -1);

	for (nbits = 0; dt_gt_128(scratch, zero); nbits++)
		dt_shift_128(scratch, -1);

	return (nbits);
}
134 
/*
 * Compute minuend - subtrahend on 128-bit values (element 0 is the low
 * word) and store the result in difference.  The arithmetic wraps on
 * underflow.  Both inputs are fully read before difference is written, so
 * difference may refer to the same array as either operand.
 */
static void
dt_subtract_128(uint64_t *minuend, uint64_t *subtrahend, uint64_t *difference)
{
	/* a low word that is too small must borrow from the high word */
	uint64_t borrow = (minuend[0] < subtrahend[0]) ? 1 : 0;
	uint64_t lo = minuend[0] - subtrahend[0];
	uint64_t hi = minuend[1] - subtrahend[1] - borrow;

	difference[0] = lo;
	difference[1] = hi;
}
147 
/*
 * Compute addend1 + addend2 on 128-bit values (element 0 is the low word)
 * and store the result in sum.  The arithmetic wraps on overflow.  Both
 * inputs are fully read before sum is written, so sum may refer to the same
 * array as either operand.
 */
static void
dt_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
{
	uint64_t lo = addend1[0] + addend2[0];
	uint64_t hi = addend1[1] + addend2[1];

	/* a low word smaller than either input means the addition wrapped */
	if (lo < addend1[0] || lo < addend2[0])
		hi++;

	sum[0] = lo;
	sum[1] = hi;
}
160 
161 /*
162  * The basic idea is to break the 2 64-bit values into 4 32-bit values,
163  * use native multiplication on those, and then re-combine into the
164  * resulting 128-bit value.
165  *
166  * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
167  *     hi1 * hi2 << 64 +
168  *     hi1 * lo2 << 32 +
169  *     hi2 * lo1 << 32 +
170  *     lo1 * lo2
171  */
172 static void
173 dt_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
174 {
175 	uint64_t hi1, hi2, lo1, lo2;
176 	uint64_t tmp[2];
177 
178 	hi1 = factor1 >> 32;
179 	hi2 = factor2 >> 32;
180 
181 	lo1 = factor1 & DT_MASK_LO;
182 	lo2 = factor2 & DT_MASK_LO;
183 
184 	product[0] = lo1 * lo2;
185 	product[1] = hi1 * hi2;
186 
187 	tmp[0] = hi1 * lo2;
188 	tmp[1] = 0;
189 	dt_shift_128(tmp, 32);
190 	dt_add_128(product, tmp, product);
191 
192 	tmp[0] = hi2 * lo1;
193 	tmp[1] = 0;
194 	dt_shift_128(tmp, 32);
195 	dt_add_128(product, tmp, product);
196 }
197 
/*
 * Divide the 128-bit dividend by the 64-bit divisor and store the 128-bit
 * quotient in quotient; the remainder is discarded.  The divisor must be
 * non-zero (this is asserted).  dividend is not modified, and quotient is
 * written only at the end, so the two may refer to the same array.
 *
 * This is long-hand division.  The divisor is shifted left as far as it will
 * go to form the first subtrahend, and a one-bit marker is shifted by the
 * same amount.  On each pass the subtrahend is compared against what is left
 * of the dividend: if it fits, it is subtracted and the marker's bit is set
 * in the result.  Subtrahend and marker then both move right by one bit.
 * The loop ends once what is left is smaller than the divisor itself.
 */
static void
dt_divide_128(uint64_t *dividend, uint64_t divisor, uint64_t *quotient)
{
	uint64_t q[2] = { 0, 0 };
	uint64_t rem[2] = { dividend[0], dividend[1] };
	uint64_t sub[2] = { divisor, 0 };
	uint64_t limit[2] = { divisor, 0 };
	uint64_t bit[2] = { 1, 0 };
	uint64_t d;
	int width = 0;

	assert(divisor != 0);

	/* width is the number of significant bits in the divisor */
	for (d = divisor; d > 0; d >>= 1)
		width++;

	dt_shift_128(sub, 128 - width);
	dt_shift_128(bit, 128 - width);

	while (dt_ge_128(rem, limit)) {
		if (dt_ge_128(rem, sub)) {
			dt_subtract_128(rem, sub, rem);
			q[0] |= bit[0];
			q[1] |= bit[1];
		}

		dt_shift_128(sub, -1);
		dt_shift_128(bit, -1);
	}

	quotient[0] = q[0];
	quotient[1] = q[1];
}
249 
250 /*
251  * This is the long-hand method of calculating a square root.
252  * The algorithm is as follows:
253  *
 * 1. Group the digits by 2 from the right.
 * 2. Over the leftmost group, find the largest single-digit number
 *    whose square is less than or equal to that group.
 * 3. Subtract the result of the previous step (2 or 4, depending) and
 *    bring down the next two-digit group.
 * 4. For the result R we have so far, find the largest single-digit number
 *    x such that 2 * R * 10 * x + x^2 is less than or equal to the result
 *    from step 3.
 *    (Note that this is doubling R and performing a decimal left-shift by 1
 *    and searching for the appropriate decimal to fill the one's place.)
 *    The value x is the next digit in the square root.
 * Repeat steps 3 and 4 until the desired precision is reached.  (We're
 * dealing with integers, so the above is sufficient.)
266  *
267  * In decimal, the square root of 582,734 would be calculated as so:
268  *
269  *     __7__6__3
270  *    | 58 27 34
271  *     -49       (7^2 == 49 => 7 is the first digit in the square root)
272  *      --
273  *       9 27    (Subtract and bring down the next group.)
274  * 146   8 76    (2 * 7 * 10 * 6 + 6^2 == 876 => 6 is the next digit in
275  *      -----     the square root)
276  *         51 34 (Subtract and bring down the next group.)
277  * 1523    45 69 (2 * 76 * 10 * 3 + 3^2 == 4569 => 3 is the next digit in
278  *         -----  the square root)
279  *          5 65 (remainder)
280  *
281  * The above algorithm applies similarly in binary, but note that the
282  * only possible non-zero value for x in step 4 is 1, so step 4 becomes a
283  * simple decision: is 2 * R * 2 * 1 + 1^2 (aka R << 2 + 1) less than the
284  * preceding difference?
285  *
286  * In binary, the square root of 11011011 would be calculated as so:
287  *
288  *     __1__1__1__0
289  *    | 11 01 10 11
290  *      01          (0 << 2 + 1 == 1 < 11 => this bit is 1)
291  *      --
292  *      10 01 10 11
293  * 101   1 01       (1 << 2 + 1 == 101 < 1001 => next bit is 1)
294  *      -----
295  *       1 00 10 11
296  * 1101    11 01    (11 << 2 + 1 == 1101 < 10010 => next bit is 1)
297  *       -------
298  *          1 01 11
299  * 11101    1 11 01 (111 << 2 + 1 == 11101 > 10111 => last bit is 0)
300  *
301  */
/*
 * Return the integer square root of the 128-bit value square (element 0 is
 * the low word), computed two bits at a time from the most significant pair
 * downwards.  The result is asserted to fit in 64 bits.  The caller's value
 * is not modified.
 */
static uint64_t
dt_sqrt_128(uint64_t *square)
{
	uint64_t root[2] = { 0, 0 };
	uint64_t rem[2] = { 0, 0 };
	uint64_t one[2] = { 1, 0 };
	uint64_t pair[2];
	uint64_t trial[2];
	int shift;

	/*
	 * shift is the bit offset of the pair being brought down; it starts
	 * at the highest pair holding a set bit and walks down to pair 0.
	 */
	for (shift = (dt_nbits_128(square) / 2) * 2; shift >= 0; shift -= 2) {
		/*
		 * Bring down the next pair of bits.
		 */
		pair[0] = square[0];
		pair[1] = square[1];
		dt_shift_128(pair, -shift);
		pair[0] &= 0x3;
		pair[1] = 0;

		dt_shift_128(rem, 2);
		dt_add_128(rem, pair, rem);

		/*
		 * trial = R << 2 + 1
		 */
		trial[0] = root[0];
		trial[1] = root[1];
		dt_shift_128(trial, 2);
		dt_add_128(trial, one, trial);

		/*
		 * The root gains a bit on every pass; that bit is 1 only when
		 * the trial value fits into the running remainder.
		 */
		dt_shift_128(root, 1);

		if (dt_le_128(trial, rem)) {
			dt_subtract_128(rem, trial, rem);
			dt_add_128(root, one, root);
		}
	}

	assert(root[1] == 0);

	return (root[0]);
}
352 
/*
 * Compute the standard deviation for a stddev() aggregation buffer, scaled
 * by the normalization factor normal.
 *
 * As used below, data[0] is the divisor for both averages (the number of
 * values), data[1] is the signed sum of the values, and data[2] and data[3]
 * form the 128-bit sum of the squares.  data[0] and normal must both be
 * non-zero.
 */
uint64_t
dt_stddev(uint64_t *data, uint64_t normal)
{
	uint64_t mean_of_squares[2];
	uint64_t square_of_mean[2];
	uint64_t variance[2];
	uint64_t magnitude;
	int64_t mean;

	/*
	 * The standard approximation for standard deviation is
	 * sqrt(average(x**2) - average(x)**2), i.e. the square root
	 * of the average of the squares minus the square of the average.
	 */
	dt_divide_128(data + 2, normal, mean_of_squares);
	dt_divide_128(mean_of_squares, data[0], mean_of_squares);

	mean = (int64_t)data[1] / (int64_t)normal / (int64_t)data[0];

	/* only the magnitude matters, as the average is about to be squared */
	magnitude = (uint64_t)(mean < 0 ? -mean : mean);

	dt_multiply_128(magnitude, magnitude, square_of_mean);
	dt_subtract_128(mean_of_squares, square_of_mean, variance);

	return (dt_sqrt_128(variance));
}
380 
381 static int
382 dt_flowindent(dtrace_hdl_t *dtp, dtrace_probedata_t *data, dtrace_epid_t last,
383     dtrace_bufdesc_t *buf, size_t offs)
384 {
385 	dtrace_probedesc_t *pd = data->dtpda_pdesc, *npd;
386 	dtrace_eprobedesc_t *epd = data->dtpda_edesc, *nepd;
387 	char *p = pd->dtpd_provider, *n = pd->dtpd_name, *sub;
388 	dtrace_flowkind_t flow = DTRACEFLOW_NONE;
389 	const char *str = NULL;
390 	static const char *e_str[2] = { " -> ", " => " };
391 	static const char *r_str[2] = { " <- ", " <= " };
392 	static const char *ent = "entry", *ret = "return";
393 	static int entlen = 0, retlen = 0;
394 	dtrace_epid_t next, id = epd->dtepd_epid;
395 	int rval;
396 
397 	if (entlen == 0) {
398 		assert(retlen == 0);
399 		entlen = strlen(ent);
400 		retlen = strlen(ret);
401 	}
402 
403 	/*
404 	 * If the name of the probe is "entry" or ends with "-entry", we
405 	 * treat it as an entry; if it is "return" or ends with "-return",
406 	 * we treat it as a return.  (This allows application-provided probes
407 	 * like "method-entry" or "function-entry" to participate in flow
408 	 * indentation -- without accidentally misinterpreting popular probe
409 	 * names like "carpentry", "gentry" or "Coventry".)
410 	 */
411 	if ((sub = strstr(n, ent)) != NULL && sub[entlen] == '\0' &&
412 	    (sub == n || sub[-1] == '-')) {
413 		flow = DTRACEFLOW_ENTRY;
414 		str = e_str[strcmp(p, "syscall") == 0];
415 	} else if ((sub = strstr(n, ret)) != NULL && sub[retlen] == '\0' &&
416 	    (sub == n || sub[-1] == '-')) {
417 		flow = DTRACEFLOW_RETURN;
418 		str = r_str[strcmp(p, "syscall") == 0];
419 	}
420 
421 	/*
422 	 * If we're going to indent this, we need to check the ID of our last
423 	 * call.  If we're looking at the same probe ID but a different EPID,
424 	 * we _don't_ want to indent.  (Yes, there are some minor holes in
425 	 * this scheme -- it's a heuristic.)
426 	 */
427 	if (flow == DTRACEFLOW_ENTRY) {
428 		if ((last != DTRACE_EPIDNONE && id != last &&
429 		    pd->dtpd_id == dtp->dt_pdesc[last]->dtpd_id))
430 			flow = DTRACEFLOW_NONE;
431 	}
432 
433 	/*
434 	 * If we're going to unindent this, it's more difficult to see if
435 	 * we don't actually want to unindent it -- we need to look at the
436 	 * _next_ EPID.
437 	 */
438 	if (flow == DTRACEFLOW_RETURN) {
439 		offs += epd->dtepd_size;
440 
441 		do {
442 			if (offs >= buf->dtbd_size)
443 				goto out;
444 
445 			next = *(uint32_t *)((uintptr_t)buf->dtbd_data + offs);
446 
447 			if (next == DTRACE_EPIDNONE)
448 				offs += sizeof (id);
449 		} while (next == DTRACE_EPIDNONE);
450 
451 		if ((rval = dt_epid_lookup(dtp, next, &nepd, &npd)) != 0)
452 			return (rval);
453 
454 		if (next != id && npd->dtpd_id == pd->dtpd_id)
455 			flow = DTRACEFLOW_NONE;
456 	}
457 
458 out:
459 	if (flow == DTRACEFLOW_ENTRY || flow == DTRACEFLOW_RETURN) {
460 		data->dtpda_prefix = str;
461 	} else {
462 		data->dtpda_prefix = "| ";
463 	}
464 
465 	if (flow == DTRACEFLOW_RETURN && data->dtpda_indent > 0)
466 		data->dtpda_indent -= 2;
467 
468 	data->dtpda_flow = flow;
469 
470 	return (0);
471 }
472 
473 static int
474 dt_nullprobe()
475 {
476 	return (DTRACE_CONSUME_THIS);
477 }
478 
479 static int
480 dt_nullrec()
481 {
482 	return (DTRACE_CONSUME_NEXT);
483 }
484 
485 int
486 dt_print_quantline(dtrace_hdl_t *dtp, FILE *fp, int64_t val,
487     uint64_t normal, long double total, char positives, char negatives)
488 {
489 	long double f;
490 	uint_t depth, len = 40;
491 
492 	const char *ats = "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@";
493 	const char *spaces = "                                        ";
494 
495 	assert(strlen(ats) == len && strlen(spaces) == len);
496 	assert(!(total == 0 && (positives || negatives)));
497 	assert(!(val < 0 && !negatives));
498 	assert(!(val > 0 && !positives));
499 	assert(!(val != 0 && total == 0));
500 
501 	if (!negatives) {
502 		if (positives) {
503 			f = (dt_fabsl((long double)val) * len) / total;
504 			depth = (uint_t)(f + 0.5);
505 		} else {
506 			depth = 0;
507 		}
508 
509 		return (dt_printf(dtp, fp, "|%s%s %-9lld\n", ats + len - depth,
510 		    spaces + depth, (long long)val / normal));
511 	}
512 
513 	if (!positives) {
514 		f = (dt_fabsl((long double)val) * len) / total;
515 		depth = (uint_t)(f + 0.5);
516 
517 		return (dt_printf(dtp, fp, "%s%s| %-9lld\n", spaces + depth,
518 		    ats + len - depth, (long long)val / normal));
519 	}
520 
521 	/*
522 	 * If we're here, we have both positive and negative bucket values.
523 	 * To express this graphically, we're going to generate both positive
524 	 * and negative bars separated by a centerline.  These bars are half
525 	 * the size of normal quantize()/lquantize() bars, so we divide the
526 	 * length in half before calculating the bar length.
527 	 */
528 	len /= 2;
529 	ats = &ats[len];
530 	spaces = &spaces[len];
531 
532 	f = (dt_fabsl((long double)val) * len) / total;
533 	depth = (uint_t)(f + 0.5);
534 
535 	if (val <= 0) {
536 		return (dt_printf(dtp, fp, "%s%s|%*s %-9lld\n", spaces + depth,
537 		    ats + len - depth, len, "", (long long)val / normal));
538 	} else {
539 		return (dt_printf(dtp, fp, "%20s|%s%s %-9lld\n", "",
540 		    ats + len - depth, spaces + depth,
541 		    (long long)val / normal));
542 	}
543 }
544 
545 int
546 dt_print_quantize(dtrace_hdl_t *dtp, FILE *fp, const void *addr,
547     size_t size, uint64_t normal)
548 {
549 	const int64_t *data = addr;
550 	int i, first_bin = 0, last_bin = DTRACE_QUANTIZE_NBUCKETS - 1;
551 	long double total = 0;
552 	char positives = 0, negatives = 0;
553 
554 	if (size != DTRACE_QUANTIZE_NBUCKETS * sizeof (uint64_t))
555 		return (dt_set_errno(dtp, EDT_DMISMATCH));
556 
557 	while (first_bin < DTRACE_QUANTIZE_NBUCKETS - 1 && data[first_bin] == 0)
558 		first_bin++;
559 
560 	if (first_bin == DTRACE_QUANTIZE_NBUCKETS - 1) {
561 		/*
562 		 * There isn't any data.  This is possible if (and only if)
563 		 * negative increment values have been used.  In this case,
564 		 * we'll print the buckets around 0.
565 		 */
566 		first_bin = DTRACE_QUANTIZE_ZEROBUCKET - 1;
567 		last_bin = DTRACE_QUANTIZE_ZEROBUCKET + 1;
568 	} else {
569 		if (first_bin > 0)
570 			first_bin--;
571 
572 		while (last_bin > 0 && data[last_bin] == 0)
573 			last_bin--;
574 
575 		if (last_bin < DTRACE_QUANTIZE_NBUCKETS - 1)
576 			last_bin++;
577 	}
578 
579 	for (i = first_bin; i <= last_bin; i++) {
580 		positives |= (data[i] > 0);
581 		negatives |= (data[i] < 0);
582 		total += dt_fabsl((long double)data[i]);
583 	}
584 
585 	if (dt_printf(dtp, fp, "\n%16s %41s %-9s\n", "value",
586 	    "------------- Distribution -------------", "count") < 0)
587 		return (-1);
588 
589 	for (i = first_bin; i <= last_bin; i++) {
590 		if (dt_printf(dtp, fp, "%16lld ",
591 		    (long long)DTRACE_QUANTIZE_BUCKETVAL(i)) < 0)
592 			return (-1);
593 
594 		if (dt_print_quantline(dtp, fp, data[i], normal, total,
595 		    positives, negatives) < 0)
596 			return (-1);
597 	}
598 
599 	return (0);
600 }
601 
602 int
603 dt_print_lquantize(dtrace_hdl_t *dtp, FILE *fp, const void *addr,
604     size_t size, uint64_t normal)
605 {
606 	const int64_t *data = addr;
607 	int i, first_bin, last_bin, base;
608 	uint64_t arg;
609 	long double total = 0;
610 	uint16_t step, levels;
611 	char positives = 0, negatives = 0;
612 
613 	if (size < sizeof (uint64_t))
614 		return (dt_set_errno(dtp, EDT_DMISMATCH));
615 
616 	arg = *data++;
617 	size -= sizeof (uint64_t);
618 
619 	base = DTRACE_LQUANTIZE_BASE(arg);
620 	step = DTRACE_LQUANTIZE_STEP(arg);
621 	levels = DTRACE_LQUANTIZE_LEVELS(arg);
622 
623 	first_bin = 0;
624 	last_bin = levels + 1;
625 
626 	if (size != sizeof (uint64_t) * (levels + 2))
627 		return (dt_set_errno(dtp, EDT_DMISMATCH));
628 
629 	while (first_bin <= levels + 1 && data[first_bin] == 0)
630 		first_bin++;
631 
632 	if (first_bin > levels + 1) {
633 		first_bin = 0;
634 		last_bin = 2;
635 	} else {
636 		if (first_bin > 0)
637 			first_bin--;
638 
639 		while (last_bin > 0 && data[last_bin] == 0)
640 			last_bin--;
641 
642 		if (last_bin < levels + 1)
643 			last_bin++;
644 	}
645 
646 	for (i = first_bin; i <= last_bin; i++) {
647 		positives |= (data[i] > 0);
648 		negatives |= (data[i] < 0);
649 		total += dt_fabsl((long double)data[i]);
650 	}
651 
652 	if (dt_printf(dtp, fp, "\n%16s %41s %-9s\n", "value",
653 	    "------------- Distribution -------------", "count") < 0)
654 		return (-1);
655 
656 	for (i = first_bin; i <= last_bin; i++) {
657 		char c[32];
658 		int err;
659 
660 		if (i == 0) {
661 			(void) snprintf(c, sizeof (c), "< %d",
662 			    base / (uint32_t)normal);
663 			err = dt_printf(dtp, fp, "%16s ", c);
664 		} else if (i == levels + 1) {
665 			(void) snprintf(c, sizeof (c), ">= %d",
666 			    base + (levels * step));
667 			err = dt_printf(dtp, fp, "%16s ", c);
668 		} else {
669 			err = dt_printf(dtp, fp, "%16d ",
670 			    base + (i - 1) * step);
671 		}
672 
673 		if (err < 0 || dt_print_quantline(dtp, fp, data[i], normal,
674 		    total, positives, negatives) < 0)
675 			return (-1);
676 	}
677 
678 	return (0);
679 }
680 
681 int
682 dt_print_llquantize(dtrace_hdl_t *dtp, FILE *fp, const void *addr,
683     size_t size, uint64_t normal)
684 {
685 	int i, first_bin, last_bin, bin = 1, order, levels;
686 	uint16_t factor, low, high, nsteps;
687 	const int64_t *data = addr;
688 	int64_t value = 1, next, step;
689 	char positives = 0, negatives = 0;
690 	long double total = 0;
691 	uint64_t arg;
692 	char c[32];
693 
694 	if (size < sizeof (uint64_t))
695 		return (dt_set_errno(dtp, EDT_DMISMATCH));
696 
697 	arg = *data++;
698 	size -= sizeof (uint64_t);
699 
700 	factor = DTRACE_LLQUANTIZE_FACTOR(arg);
701 	low = DTRACE_LLQUANTIZE_LOW(arg);
702 	high = DTRACE_LLQUANTIZE_HIGH(arg);
703 	nsteps = DTRACE_LLQUANTIZE_NSTEP(arg);
704 
705 	/*
706 	 * We don't expect to be handed invalid llquantize() parameters here,
707 	 * but sanity check them (to a degree) nonetheless.
708 	 */
709 	if (size > INT32_MAX || factor < 2 || low >= high ||
710 	    nsteps == 0 || factor > nsteps)
711 		return (dt_set_errno(dtp, EDT_DMISMATCH));
712 
713 	levels = (int)size / sizeof (uint64_t);
714 
715 	first_bin = 0;
716 	last_bin = levels - 1;
717 
718 	while (first_bin < levels && data[first_bin] == 0)
719 		first_bin++;
720 
721 	if (first_bin == levels) {
722 		first_bin = 0;
723 		last_bin = 1;
724 	} else {
725 		if (first_bin > 0)
726 			first_bin--;
727 
728 		while (last_bin > 0 && data[last_bin] == 0)
729 			last_bin--;
730 
731 		if (last_bin < levels - 1)
732 			last_bin++;
733 	}
734 
735 	for (i = first_bin; i <= last_bin; i++) {
736 		positives |= (data[i] > 0);
737 		negatives |= (data[i] < 0);
738 		total += dt_fabsl((long double)data[i]);
739 	}
740 
741 	if (dt_printf(dtp, fp, "\n%16s %41s %-9s\n", "value",
742 	    "------------- Distribution -------------", "count") < 0)
743 		return (-1);
744 
745 	for (order = 0; order < low; order++)
746 		value *= factor;
747 
748 	next = value * factor;
749 	step = next > nsteps ? next / nsteps : 1;
750 
751 	if (first_bin == 0) {
752 		(void) snprintf(c, sizeof (c), "< %lld", value);
753 
754 		if (dt_printf(dtp, fp, "%16s ", c) < 0)
755 			return (-1);
756 
757 		if (dt_print_quantline(dtp, fp, data[0], normal,
758 		    total, positives, negatives) < 0)
759 			return (-1);
760 	}
761 
762 	while (order <= high) {
763 		if (bin >= first_bin && bin <= last_bin) {
764 			if (dt_printf(dtp, fp, "%16lld ", (long long)value) < 0)
765 				return (-1);
766 
767 			if (dt_print_quantline(dtp, fp, data[bin],
768 			    normal, total, positives, negatives) < 0)
769 				return (-1);
770 		}
771 
772 		assert(value < next);
773 		bin++;
774 
775 		if ((value += step) != next)
776 			continue;
777 
778 		next = value * factor;
779 		step = next > nsteps ? next / nsteps : 1;
780 		order++;
781 	}
782 
783 	if (last_bin < bin)
784 		return (0);
785 
786 	assert(last_bin == bin);
787 	(void) snprintf(c, sizeof (c), ">= %lld", value);
788 
789 	if (dt_printf(dtp, fp, "%16s ", c) < 0)
790 		return (-1);
791 
792 	return (dt_print_quantline(dtp, fp, data[bin], normal,
793 	    total, positives, negatives));
794 }
795 
796 /*ARGSUSED*/
797 static int
798 dt_print_average(dtrace_hdl_t *dtp, FILE *fp, caddr_t addr,
799     size_t size, uint64_t normal)
800 {
801 	/* LINTED - alignment */
802 	int64_t *data = (int64_t *)addr;
803 
804 	return (dt_printf(dtp, fp, " %16lld", data[0] ?
805 	    (long long)(data[1] / (int64_t)normal / data[0]) : 0));
806 }
807 
808 /*ARGSUSED*/
809 static int
810 dt_print_stddev(dtrace_hdl_t *dtp, FILE *fp, caddr_t addr,
811     size_t size, uint64_t normal)
812 {
813 	/* LINTED - alignment */
814 	uint64_t *data = (uint64_t *)addr;
815 
816 	return (dt_printf(dtp, fp, " %16llu", data[0] ?
817 	    (unsigned long long) dt_stddev(data, normal) : 0));
818 }
819 
820 /*ARGSUSED*/
821 int
822 dt_print_bytes(dtrace_hdl_t *dtp, FILE *fp, caddr_t addr,
823     size_t nbytes, int width, int quiet, int forceraw)
824 {
825 	/*
826 	 * If the byte stream is a series of printable characters, followed by
827 	 * a terminating byte, we print it out as a string.  Otherwise, we
828 	 * assume that it's something else and just print the bytes.
829 	 */
830 	int i, j, margin = 5;
831 	char *c = (char *)addr;
832 
833 	if (nbytes == 0)
834 		return (0);
835 
836 	if (forceraw)
837 		goto raw;
838 
839 	if (dtp->dt_options[DTRACEOPT_RAWBYTES] != DTRACEOPT_UNSET)
840 		goto raw;
841 
842 	for (i = 0; i < nbytes; i++) {
843 		/*
844 		 * We define a "printable character" to be one for which
845 		 * isprint(3C) returns non-zero, isspace(3C) returns non-zero,
846 		 * or a character which is either backspace or the bell.
847 		 * Backspace and the bell are regrettably special because
848 		 * they fail the first two tests -- and yet they are entirely
849 		 * printable.  These are the only two control characters that
850 		 * have meaning for the terminal and for which isprint(3C) and
851 		 * isspace(3C) return 0.
852 		 */
853 		if (isprint(c[i]) || isspace(c[i]) ||
854 		    c[i] == '\b' || c[i] == '\a')
855 			continue;
856 
857 		if (c[i] == '\0' && i > 0) {
858 			/*
859 			 * This looks like it might be a string.  Before we
860 			 * assume that it is indeed a string, check the
861 			 * remainder of the byte range; if it contains
862 			 * additional non-nul characters, we'll assume that
863 			 * it's a binary stream that just happens to look like
864 			 * a string, and we'll print out the individual bytes.
865 			 */
866 			for (j = i + 1; j < nbytes; j++) {
867 				if (c[j] != '\0')
868 					break;
869 			}
870 
871 			if (j != nbytes)
872 				break;
873 
874 			if (quiet)
875 				return (dt_printf(dtp, fp, "%s", c));
876 			else
877 				return (dt_printf(dtp, fp, "  %-*s", width, c));
878 		}
879 
880 		break;
881 	}
882 
883 	if (i == nbytes) {
884 		/*
885 		 * The byte range is all printable characters, but there is
886 		 * no trailing nul byte.  We'll assume that it's a string and
887 		 * print it as such.
888 		 */
889 		char *s = alloca(nbytes + 1);
890 		bcopy(c, s, nbytes);
891 		s[nbytes] = '\0';
892 		return (dt_printf(dtp, fp, "  %-*s", width, s));
893 	}
894 
895 raw:
896 	if (dt_printf(dtp, fp, "\n%*s      ", margin, "") < 0)
897 		return (-1);
898 
899 	for (i = 0; i < 16; i++)
900 		if (dt_printf(dtp, fp, "  %c", "0123456789abcdef"[i]) < 0)
901 			return (-1);
902 
903 	if (dt_printf(dtp, fp, "  0123456789abcdef\n") < 0)
904 		return (-1);
905 
906 
907 	for (i = 0; i < nbytes; i += 16) {
908 		if (dt_printf(dtp, fp, "%*s%5x:", margin, "", i) < 0)
909 			return (-1);
910 
911 		for (j = i; j < i + 16 && j < nbytes; j++) {
912 			if (dt_printf(dtp, fp, " %02x", (uchar_t)c[j]) < 0)
913 				return (-1);
914 		}
915 
916 		while (j++ % 16) {
917 			if (dt_printf(dtp, fp, "   ") < 0)
918 				return (-1);
919 		}
920 
921 		if (dt_printf(dtp, fp, "  ") < 0)
922 			return (-1);
923 
924 		for (j = i; j < i + 16 && j < nbytes; j++) {
925 			if (dt_printf(dtp, fp, "%c",
926 			    c[j] < ' ' || c[j] > '~' ? '.' : c[j]) < 0)
927 				return (-1);
928 		}
929 
930 		if (dt_printf(dtp, fp, "\n") < 0)
931 			return (-1);
932 	}
933 
934 	return (0);
935 }
936 
937 int
938 dt_print_stack(dtrace_hdl_t *dtp, FILE *fp, const char *format,
939     caddr_t addr, int depth, int size)
940 {
941 	dtrace_syminfo_t dts;
942 	GElf_Sym sym;
943 	int i, indent;
944 	char c[PATH_MAX * 2];
945 	uint64_t pc;
946 
947 	if (dt_printf(dtp, fp, "\n") < 0)
948 		return (-1);
949 
950 	if (format == NULL)
951 		format = "%s";
952 
953 	if (dtp->dt_options[DTRACEOPT_STACKINDENT] != DTRACEOPT_UNSET)
954 		indent = (int)dtp->dt_options[DTRACEOPT_STACKINDENT];
955 	else
956 		indent = _dtrace_stkindent;
957 
958 	for (i = 0; i < depth; i++) {
959 		switch (size) {
960 		case sizeof (uint32_t):
961 			/* LINTED - alignment */
962 			pc = *((uint32_t *)addr);
963 			break;
964 
965 		case sizeof (uint64_t):
966 			/* LINTED - alignment */
967 			pc = *((uint64_t *)addr);
968 			break;
969 
970 		default:
971 			return (dt_set_errno(dtp, EDT_BADSTACKPC));
972 		}
973 
974 		if (pc == NULL)
975 			break;
976 
977 		addr += size;
978 
979 		if (dt_printf(dtp, fp, "%*s", indent, "") < 0)
980 			return (-1);
981 
982 		if (dtrace_lookup_by_addr(dtp, pc, &sym, &dts) == 0) {
983 			if (pc > sym.st_value) {
984 				(void) snprintf(c, sizeof (c), "%s`%s+0x%llx",
985 				    dts.dts_object, dts.dts_name,
986 				    pc - sym.st_value);
987 			} else {
988 				(void) snprintf(c, sizeof (c), "%s`%s",
989 				    dts.dts_object, dts.dts_name);
990 			}
991 		} else {
992 			/*
993 			 * We'll repeat the lookup, but this time we'll specify
994 			 * a NULL GElf_Sym -- indicating that we're only
995 			 * interested in the containing module.
996 			 */
997 			if (dtrace_lookup_by_addr(dtp, pc, NULL, &dts) == 0) {
998 				(void) snprintf(c, sizeof (c), "%s`0x%llx",
999 				    dts.dts_object, pc);
1000 			} else {
1001 				(void) snprintf(c, sizeof (c), "0x%llx", pc);
1002 			}
1003 		}
1004 
1005 		if (dt_printf(dtp, fp, format, c) < 0)
1006 			return (-1);
1007 
1008 		if (dt_printf(dtp, fp, "\n") < 0)
1009 			return (-1);
1010 	}
1011 
1012 	return (0);
1013 }
1014 
1015 int
1016 dt_print_ustack(dtrace_hdl_t *dtp, FILE *fp, const char *format,
1017     caddr_t addr, uint64_t arg)
1018 {
1019 	/* LINTED - alignment */
1020 	uint64_t *pc = (uint64_t *)addr;
1021 	uint32_t depth = DTRACE_USTACK_NFRAMES(arg);
1022 	uint32_t strsize = DTRACE_USTACK_STRSIZE(arg);
1023 	const char *strbase = addr + (depth + 1) * sizeof (uint64_t);
1024 	const char *str = strsize ? strbase : NULL;
1025 	int err = 0;
1026 
1027 	char name[PATH_MAX], objname[PATH_MAX], c[PATH_MAX * 2];
1028 	struct ps_prochandle *P;
1029 	GElf_Sym sym;
1030 	int i, indent;
1031 	pid_t pid;
1032 
1033 	if (depth == 0)
1034 		return (0);
1035 
1036 	pid = (pid_t)*pc++;
1037 
1038 	if (dt_printf(dtp, fp, "\n") < 0)
1039 		return (-1);
1040 
1041 	if (format == NULL)
1042 		format = "%s";
1043 
1044 	if (dtp->dt_options[DTRACEOPT_STACKINDENT] != DTRACEOPT_UNSET)
1045 		indent = (int)dtp->dt_options[DTRACEOPT_STACKINDENT];
1046 	else
1047 		indent = _dtrace_stkindent;
1048 
1049 	/*
1050 	 * Ultimately, we need to add an entry point in the library vector for
1051 	 * determining <symbol, offset> from <pid, address>.  For now, if
1052 	 * this is a vector open, we just print the raw address or string.
1053 	 */
1054 	if (dtp->dt_vector == NULL)
1055 		P = dt_proc_grab(dtp, pid, PGRAB_RDONLY | PGRAB_FORCE, 0);
1056 	else
1057 		P = NULL;
1058 
1059 	if (P != NULL)
1060 		dt_proc_lock(dtp, P); /* lock handle while we perform lookups */
1061 
1062 	for (i = 0; i < depth && pc[i] != NULL; i++) {
1063 		const prmap_t *map;
1064 
1065 		if ((err = dt_printf(dtp, fp, "%*s", indent, "")) < 0)
1066 			break;
1067 
1068 		if (P != NULL && Plookup_by_addr(P, pc[i],
1069 		    name, sizeof (name), &sym) == 0) {
1070 			(void) Pobjname(P, pc[i], objname, sizeof (objname));
1071 
1072 			if (pc[i] > sym.st_value) {
1073 				(void) snprintf(c, sizeof (c),
1074 				    "%s`%s+0x%llx", dt_basename(objname), name,
1075 				    (u_longlong_t)(pc[i] - sym.st_value));
1076 			} else {
1077 				(void) snprintf(c, sizeof (c),
1078 				    "%s`%s", dt_basename(objname), name);
1079 			}
1080 		} else if (str != NULL && str[0] != '\0' && str[0] != '@' &&
1081 		    (P != NULL && ((map = Paddr_to_map(P, pc[i])) == NULL ||
1082 		    (map->pr_mflags & MA_WRITE)))) {
1083 			/*
1084 			 * If the current string pointer in the string table
1085 			 * does not point to an empty string _and_ the program
1086 			 * counter falls in a writable region, we'll use the
1087 			 * string from the string table instead of the raw
1088 			 * address.  This last condition is necessary because
1089 			 * some (broken) ustack helpers will return a string
1090 			 * even for a program counter that they can't
1091 			 * identify.  If we have a string for a program
1092 			 * counter that falls in a segment that isn't
1093 			 * writable, we assume that we have fallen into this
1094 			 * case and we refuse to use the string.
1095 			 */
1096 			(void) snprintf(c, sizeof (c), "%s", str);
1097 		} else {
1098 			if (P != NULL && Pobjname(P, pc[i], objname,
1099 			    sizeof (objname)) != NULL) {
1100 				(void) snprintf(c, sizeof (c), "%s`0x%llx",
1101 				    dt_basename(objname), (u_longlong_t)pc[i]);
1102 			} else {
1103 				(void) snprintf(c, sizeof (c), "0x%llx",
1104 				    (u_longlong_t)pc[i]);
1105 			}
1106 		}
1107 
1108 		if ((err = dt_printf(dtp, fp, format, c)) < 0)
1109 			break;
1110 
1111 		if ((err = dt_printf(dtp, fp, "\n")) < 0)
1112 			break;
1113 
1114 		if (str != NULL && str[0] == '@') {
1115 			/*
1116 			 * If the first character of the string is an "at" sign,
1117 			 * then the string is inferred to be an annotation --
1118 			 * and it is printed out beneath the frame and offset
1119 			 * with brackets.
1120 			 */
1121 			if ((err = dt_printf(dtp, fp, "%*s", indent, "")) < 0)
1122 				break;
1123 
1124 			(void) snprintf(c, sizeof (c), "  [ %s ]", &str[1]);
1125 
1126 			if ((err = dt_printf(dtp, fp, format, c)) < 0)
1127 				break;
1128 
1129 			if ((err = dt_printf(dtp, fp, "\n")) < 0)
1130 				break;
1131 		}
1132 
1133 		if (str != NULL) {
1134 			str += strlen(str) + 1;
1135 			if (str - strbase >= strsize)
1136 				str = NULL;
1137 		}
1138 	}
1139 
1140 	if (P != NULL) {
1141 		dt_proc_unlock(dtp, P);
1142 		dt_proc_release(dtp, P);
1143 	}
1144 
1145 	return (err);
1146 }
1147 
1148 static int
1149 dt_print_usym(dtrace_hdl_t *dtp, FILE *fp, caddr_t addr, dtrace_actkind_t act)
1150 {
1151 	/* LINTED - alignment */
1152 	uint64_t pid = ((uint64_t *)addr)[0];
1153 	/* LINTED - alignment */
1154 	uint64_t pc = ((uint64_t *)addr)[1];
1155 	const char *format = "  %-50s";
1156 	char *s;
1157 	int n, len = 256;
1158 
1159 	if (act == DTRACEACT_USYM && dtp->dt_vector == NULL) {
1160 		struct ps_prochandle *P;
1161 
1162 		if ((P = dt_proc_grab(dtp, pid,
1163 		    PGRAB_RDONLY | PGRAB_FORCE, 0)) != NULL) {
1164 			GElf_Sym sym;
1165 
1166 			dt_proc_lock(dtp, P);
1167 
1168 			if (Plookup_by_addr(P, pc, NULL, 0, &sym) == 0)
1169 				pc = sym.st_value;
1170 
1171 			dt_proc_unlock(dtp, P);
1172 			dt_proc_release(dtp, P);
1173 		}
1174 	}
1175 
1176 	do {
1177 		n = len;
1178 		s = alloca(n);
1179 	} while ((len = dtrace_uaddr2str(dtp, pid, pc, s, n)) > n);
1180 
1181 	return (dt_printf(dtp, fp, format, s));
1182 }
1183 
1184 int
1185 dt_print_umod(dtrace_hdl_t *dtp, FILE *fp, const char *format, caddr_t addr)
1186 {
1187 	/* LINTED - alignment */
1188 	uint64_t pid = ((uint64_t *)addr)[0];
1189 	/* LINTED - alignment */
1190 	uint64_t pc = ((uint64_t *)addr)[1];
1191 	int err = 0;
1192 
1193 	char objname[PATH_MAX], c[PATH_MAX * 2];
1194 	struct ps_prochandle *P;
1195 
1196 	if (format == NULL)
1197 		format = "  %-50s";
1198 
1199 	/*
1200 	 * See the comment in dt_print_ustack() for the rationale for
1201 	 * printing raw addresses in the vectored case.
1202 	 */
1203 	if (dtp->dt_vector == NULL)
1204 		P = dt_proc_grab(dtp, pid, PGRAB_RDONLY | PGRAB_FORCE, 0);
1205 	else
1206 		P = NULL;
1207 
1208 	if (P != NULL)
1209 		dt_proc_lock(dtp, P); /* lock handle while we perform lookups */
1210 
1211 	if (P != NULL && Pobjname(P, pc, objname, sizeof (objname)) != NULL) {
1212 		(void) snprintf(c, sizeof (c), "%s", dt_basename(objname));
1213 	} else {
1214 		(void) snprintf(c, sizeof (c), "0x%llx", (u_longlong_t)pc);
1215 	}
1216 
1217 	err = dt_printf(dtp, fp, format, c);
1218 
1219 	if (P != NULL) {
1220 		dt_proc_unlock(dtp, P);
1221 		dt_proc_release(dtp, P);
1222 	}
1223 
1224 	return (err);
1225 }
1226 
1227 static int
1228 dt_print_sym(dtrace_hdl_t *dtp, FILE *fp, const char *format, caddr_t addr)
1229 {
1230 	/* LINTED - alignment */
1231 	uint64_t pc = *((uint64_t *)addr);
1232 	dtrace_syminfo_t dts;
1233 	GElf_Sym sym;
1234 	char c[PATH_MAX * 2];
1235 
1236 	if (format == NULL)
1237 		format = "  %-50s";
1238 
1239 	if (dtrace_lookup_by_addr(dtp, pc, &sym, &dts) == 0) {
1240 		(void) snprintf(c, sizeof (c), "%s`%s",
1241 		    dts.dts_object, dts.dts_name);
1242 	} else {
1243 		/*
1244 		 * We'll repeat the lookup, but this time we'll specify a
1245 		 * NULL GElf_Sym -- indicating that we're only interested in
1246 		 * the containing module.
1247 		 */
1248 		if (dtrace_lookup_by_addr(dtp, pc, NULL, &dts) == 0) {
1249 			(void) snprintf(c, sizeof (c), "%s`0x%llx",
1250 			    dts.dts_object, (u_longlong_t)pc);
1251 		} else {
1252 			(void) snprintf(c, sizeof (c), "0x%llx",
1253 			    (u_longlong_t)pc);
1254 		}
1255 	}
1256 
1257 	if (dt_printf(dtp, fp, format, c) < 0)
1258 		return (-1);
1259 
1260 	return (0);
1261 }
1262 
1263 int
1264 dt_print_mod(dtrace_hdl_t *dtp, FILE *fp, const char *format, caddr_t addr)
1265 {
1266 	/* LINTED - alignment */
1267 	uint64_t pc = *((uint64_t *)addr);
1268 	dtrace_syminfo_t dts;
1269 	char c[PATH_MAX * 2];
1270 
1271 	if (format == NULL)
1272 		format = "  %-50s";
1273 
1274 	if (dtrace_lookup_by_addr(dtp, pc, NULL, &dts) == 0) {
1275 		(void) snprintf(c, sizeof (c), "%s", dts.dts_object);
1276 	} else {
1277 		(void) snprintf(c, sizeof (c), "0x%llx", (u_longlong_t)pc);
1278 	}
1279 
1280 	if (dt_printf(dtp, fp, format, c) < 0)
1281 		return (-1);
1282 
1283 	return (0);
1284 }
1285 
1286 typedef struct dt_normal {
1287 	dtrace_aggvarid_t dtnd_id;
1288 	uint64_t dtnd_normal;
1289 } dt_normal_t;
1290 
1291 static int
1292 dt_normalize_agg(const dtrace_aggdata_t *aggdata, void *arg)
1293 {
1294 	dt_normal_t *normal = arg;
1295 	dtrace_aggdesc_t *agg = aggdata->dtada_desc;
1296 	dtrace_aggvarid_t id = normal->dtnd_id;
1297 
1298 	if (agg->dtagd_nrecs == 0)
1299 		return (DTRACE_AGGWALK_NEXT);
1300 
1301 	if (agg->dtagd_varid != id)
1302 		return (DTRACE_AGGWALK_NEXT);
1303 
1304 	((dtrace_aggdata_t *)aggdata)->dtada_normal = normal->dtnd_normal;
1305 	return (DTRACE_AGGWALK_NORMALIZE);
1306 }
1307 
1308 static int
1309 dt_normalize(dtrace_hdl_t *dtp, caddr_t base, dtrace_recdesc_t *rec)
1310 {
1311 	dt_normal_t normal;
1312 	caddr_t addr;
1313 
1314 	/*
1315 	 * We (should) have two records:  the aggregation ID followed by the
1316 	 * normalization value.
1317 	 */
1318 	addr = base + rec->dtrd_offset;
1319 
1320 	if (rec->dtrd_size != sizeof (dtrace_aggvarid_t))
1321 		return (dt_set_errno(dtp, EDT_BADNORMAL));
1322 
1323 	/* LINTED - alignment */
1324 	normal.dtnd_id = *((dtrace_aggvarid_t *)addr);
1325 	rec++;
1326 
1327 	if (rec->dtrd_action != DTRACEACT_LIBACT)
1328 		return (dt_set_errno(dtp, EDT_BADNORMAL));
1329 
1330 	if (rec->dtrd_arg != DT_ACT_NORMALIZE)
1331 		return (dt_set_errno(dtp, EDT_BADNORMAL));
1332 
1333 	addr = base + rec->dtrd_offset;
1334 
1335 	switch (rec->dtrd_size) {
1336 	case sizeof (uint64_t):
1337 		/* LINTED - alignment */
1338 		normal.dtnd_normal = *((uint64_t *)addr);
1339 		break;
1340 	case sizeof (uint32_t):
1341 		/* LINTED - alignment */
1342 		normal.dtnd_normal = *((uint32_t *)addr);
1343 		break;
1344 	case sizeof (uint16_t):
1345 		/* LINTED - alignment */
1346 		normal.dtnd_normal = *((uint16_t *)addr);
1347 		break;
1348 	case sizeof (uint8_t):
1349 		normal.dtnd_normal = *((uint8_t *)addr);
1350 		break;
1351 	default:
1352 		return (dt_set_errno(dtp, EDT_BADNORMAL));
1353 	}
1354 
1355 	(void) dtrace_aggregate_walk(dtp, dt_normalize_agg, &normal);
1356 
1357 	return (0);
1358 }
1359 
1360 static int
1361 dt_denormalize_agg(const dtrace_aggdata_t *aggdata, void *arg)
1362 {
1363 	dtrace_aggdesc_t *agg = aggdata->dtada_desc;
1364 	dtrace_aggvarid_t id = *((dtrace_aggvarid_t *)arg);
1365 
1366 	if (agg->dtagd_nrecs == 0)
1367 		return (DTRACE_AGGWALK_NEXT);
1368 
1369 	if (agg->dtagd_varid != id)
1370 		return (DTRACE_AGGWALK_NEXT);
1371 
1372 	return (DTRACE_AGGWALK_DENORMALIZE);
1373 }
1374 
1375 static int
1376 dt_clear_agg(const dtrace_aggdata_t *aggdata, void *arg)
1377 {
1378 	dtrace_aggdesc_t *agg = aggdata->dtada_desc;
1379 	dtrace_aggvarid_t id = *((dtrace_aggvarid_t *)arg);
1380 
1381 	if (agg->dtagd_nrecs == 0)
1382 		return (DTRACE_AGGWALK_NEXT);
1383 
1384 	if (agg->dtagd_varid != id)
1385 		return (DTRACE_AGGWALK_NEXT);
1386 
1387 	return (DTRACE_AGGWALK_CLEAR);
1388 }
1389 
1390 typedef struct dt_trunc {
1391 	dtrace_aggvarid_t dttd_id;
1392 	uint64_t dttd_remaining;
1393 } dt_trunc_t;
1394 
1395 static int
1396 dt_trunc_agg(const dtrace_aggdata_t *aggdata, void *arg)
1397 {
1398 	dt_trunc_t *trunc = arg;
1399 	dtrace_aggdesc_t *agg = aggdata->dtada_desc;
1400 	dtrace_aggvarid_t id = trunc->dttd_id;
1401 
1402 	if (agg->dtagd_nrecs == 0)
1403 		return (DTRACE_AGGWALK_NEXT);
1404 
1405 	if (agg->dtagd_varid != id)
1406 		return (DTRACE_AGGWALK_NEXT);
1407 
1408 	if (trunc->dttd_remaining == 0)
1409 		return (DTRACE_AGGWALK_REMOVE);
1410 
1411 	trunc->dttd_remaining--;
1412 	return (DTRACE_AGGWALK_NEXT);
1413 }
1414 
1415 static int
1416 dt_trunc(dtrace_hdl_t *dtp, caddr_t base, dtrace_recdesc_t *rec)
1417 {
1418 	dt_trunc_t trunc;
1419 	caddr_t addr;
1420 	int64_t remaining;
1421 	int (*func)(dtrace_hdl_t *, dtrace_aggregate_f *, void *);
1422 
1423 	/*
1424 	 * We (should) have two records:  the aggregation ID followed by the
1425 	 * number of aggregation entries after which the aggregation is to be
1426 	 * truncated.
1427 	 */
1428 	addr = base + rec->dtrd_offset;
1429 
1430 	if (rec->dtrd_size != sizeof (dtrace_aggvarid_t))
1431 		return (dt_set_errno(dtp, EDT_BADTRUNC));
1432 
1433 	/* LINTED - alignment */
1434 	trunc.dttd_id = *((dtrace_aggvarid_t *)addr);
1435 	rec++;
1436 
1437 	if (rec->dtrd_action != DTRACEACT_LIBACT)
1438 		return (dt_set_errno(dtp, EDT_BADTRUNC));
1439 
1440 	if (rec->dtrd_arg != DT_ACT_TRUNC)
1441 		return (dt_set_errno(dtp, EDT_BADTRUNC));
1442 
1443 	addr = base + rec->dtrd_offset;
1444 
1445 	switch (rec->dtrd_size) {
1446 	case sizeof (uint64_t):
1447 		/* LINTED - alignment */
1448 		remaining = *((int64_t *)addr);
1449 		break;
1450 	case sizeof (uint32_t):
1451 		/* LINTED - alignment */
1452 		remaining = *((int32_t *)addr);
1453 		break;
1454 	case sizeof (uint16_t):
1455 		/* LINTED - alignment */
1456 		remaining = *((int16_t *)addr);
1457 		break;
1458 	case sizeof (uint8_t):
1459 		remaining = *((int8_t *)addr);
1460 		break;
1461 	default:
1462 		return (dt_set_errno(dtp, EDT_BADNORMAL));
1463 	}
1464 
1465 	if (remaining < 0) {
1466 		func = dtrace_aggregate_walk_valsorted;
1467 		remaining = -remaining;
1468 	} else {
1469 		func = dtrace_aggregate_walk_valrevsorted;
1470 	}
1471 
1472 	assert(remaining >= 0);
1473 	trunc.dttd_remaining = remaining;
1474 
1475 	(void) func(dtp, dt_trunc_agg, &trunc);
1476 
1477 	return (0);
1478 }
1479 
1480 static int
1481 dt_print_datum(dtrace_hdl_t *dtp, FILE *fp, dtrace_recdesc_t *rec,
1482     caddr_t addr, size_t size, uint64_t normal)
1483 {
1484 	int err;
1485 	dtrace_actkind_t act = rec->dtrd_action;
1486 
1487 	switch (act) {
1488 	case DTRACEACT_STACK:
1489 		return (dt_print_stack(dtp, fp, NULL, addr,
1490 		    rec->dtrd_arg, rec->dtrd_size / rec->dtrd_arg));
1491 
1492 	case DTRACEACT_USTACK:
1493 	case DTRACEACT_JSTACK:
1494 		return (dt_print_ustack(dtp, fp, NULL, addr, rec->dtrd_arg));
1495 
1496 	case DTRACEACT_USYM:
1497 	case DTRACEACT_UADDR:
1498 		return (dt_print_usym(dtp, fp, addr, act));
1499 
1500 	case DTRACEACT_UMOD:
1501 		return (dt_print_umod(dtp, fp, NULL, addr));
1502 
1503 	case DTRACEACT_SYM:
1504 		return (dt_print_sym(dtp, fp, NULL, addr));
1505 
1506 	case DTRACEACT_MOD:
1507 		return (dt_print_mod(dtp, fp, NULL, addr));
1508 
1509 	case DTRACEAGG_QUANTIZE:
1510 		return (dt_print_quantize(dtp, fp, addr, size, normal));
1511 
1512 	case DTRACEAGG_LQUANTIZE:
1513 		return (dt_print_lquantize(dtp, fp, addr, size, normal));
1514 
1515 	case DTRACEAGG_LLQUANTIZE:
1516 		return (dt_print_llquantize(dtp, fp, addr, size, normal));
1517 
1518 	case DTRACEAGG_AVG:
1519 		return (dt_print_average(dtp, fp, addr, size, normal));
1520 
1521 	case DTRACEAGG_STDDEV:
1522 		return (dt_print_stddev(dtp, fp, addr, size, normal));
1523 
1524 	default:
1525 		break;
1526 	}
1527 
1528 	switch (size) {
1529 	case sizeof (uint64_t):
1530 		err = dt_printf(dtp, fp, " %16lld",
1531 		    /* LINTED - alignment */
1532 		    (long long)*((uint64_t *)addr) / normal);
1533 		break;
1534 	case sizeof (uint32_t):
1535 		/* LINTED - alignment */
1536 		err = dt_printf(dtp, fp, " %8d", *((uint32_t *)addr) /
1537 		    (uint32_t)normal);
1538 		break;
1539 	case sizeof (uint16_t):
1540 		/* LINTED - alignment */
1541 		err = dt_printf(dtp, fp, " %5d", *((uint16_t *)addr) /
1542 		    (uint32_t)normal);
1543 		break;
1544 	case sizeof (uint8_t):
1545 		err = dt_printf(dtp, fp, " %3d", *((uint8_t *)addr) /
1546 		    (uint32_t)normal);
1547 		break;
1548 	default:
1549 		err = dt_print_bytes(dtp, fp, addr, size, 50, 0, 0);
1550 		break;
1551 	}
1552 
1553 	return (err);
1554 }
1555 
1556 int
1557 dt_print_aggs(const dtrace_aggdata_t **aggsdata, int naggvars, void *arg)
1558 {
1559 	int i, aggact = 0;
1560 	dt_print_aggdata_t *pd = arg;
1561 	const dtrace_aggdata_t *aggdata = aggsdata[0];
1562 	dtrace_aggdesc_t *agg = aggdata->dtada_desc;
1563 	FILE *fp = pd->dtpa_fp;
1564 	dtrace_hdl_t *dtp = pd->dtpa_dtp;
1565 	dtrace_recdesc_t *rec;
1566 	dtrace_actkind_t act;
1567 	caddr_t addr;
1568 	size_t size;
1569 
1570 	/*
1571 	 * Iterate over each record description in the key, printing the traced
1572 	 * data, skipping the first datum (the tuple member created by the
1573 	 * compiler).
1574 	 */
1575 	for (i = 1; i < agg->dtagd_nrecs; i++) {
1576 		rec = &agg->dtagd_rec[i];
1577 		act = rec->dtrd_action;
1578 		addr = aggdata->dtada_data + rec->dtrd_offset;
1579 		size = rec->dtrd_size;
1580 
1581 		if (DTRACEACT_ISAGG(act)) {
1582 			aggact = i;
1583 			break;
1584 		}
1585 
1586 		if (dt_print_datum(dtp, fp, rec, addr, size, 1) < 0)
1587 			return (-1);
1588 
1589 		if (dt_buffered_flush(dtp, NULL, rec, aggdata,
1590 		    DTRACE_BUFDATA_AGGKEY) < 0)
1591 			return (-1);
1592 	}
1593 
1594 	assert(aggact != 0);
1595 
1596 	for (i = (naggvars == 1 ? 0 : 1); i < naggvars; i++) {
1597 		uint64_t normal;
1598 
1599 		aggdata = aggsdata[i];
1600 		agg = aggdata->dtada_desc;
1601 		rec = &agg->dtagd_rec[aggact];
1602 		act = rec->dtrd_action;
1603 		addr = aggdata->dtada_data + rec->dtrd_offset;
1604 		size = rec->dtrd_size;
1605 
1606 		assert(DTRACEACT_ISAGG(act));
1607 		normal = aggdata->dtada_normal;
1608 
1609 		if (dt_print_datum(dtp, fp, rec, addr, size, normal) < 0)
1610 			return (-1);
1611 
1612 		if (dt_buffered_flush(dtp, NULL, rec, aggdata,
1613 		    DTRACE_BUFDATA_AGGVAL) < 0)
1614 			return (-1);
1615 
1616 		if (!pd->dtpa_allunprint)
1617 			agg->dtagd_flags |= DTRACE_AGD_PRINTED;
1618 	}
1619 
1620 	if (dt_printf(dtp, fp, "\n") < 0)
1621 		return (-1);
1622 
1623 	if (dt_buffered_flush(dtp, NULL, NULL, aggdata,
1624 	    DTRACE_BUFDATA_AGGFORMAT | DTRACE_BUFDATA_AGGLAST) < 0)
1625 		return (-1);
1626 
1627 	return (0);
1628 }
1629 
1630 int
1631 dt_print_agg(const dtrace_aggdata_t *aggdata, void *arg)
1632 {
1633 	dt_print_aggdata_t *pd = arg;
1634 	dtrace_aggdesc_t *agg = aggdata->dtada_desc;
1635 	dtrace_aggvarid_t aggvarid = pd->dtpa_id;
1636 
1637 	if (pd->dtpa_allunprint) {
1638 		if (agg->dtagd_flags & DTRACE_AGD_PRINTED)
1639 			return (0);
1640 	} else {
1641 		/*
1642 		 * If we're not printing all unprinted aggregations, then the
1643 		 * aggregation variable ID denotes a specific aggregation
1644 		 * variable that we should print -- skip any other aggregations
1645 		 * that we encounter.
1646 		 */
1647 		if (agg->dtagd_nrecs == 0)
1648 			return (0);
1649 
1650 		if (aggvarid != agg->dtagd_varid)
1651 			return (0);
1652 	}
1653 
1654 	return (dt_print_aggs(&aggdata, 1, arg));
1655 }
1656 
1657 int
1658 dt_setopt(dtrace_hdl_t *dtp, const dtrace_probedata_t *data,
1659     const char *option, const char *value)
1660 {
1661 	int len, rval;
1662 	char *msg;
1663 	const char *errstr;
1664 	dtrace_setoptdata_t optdata;
1665 
1666 	bzero(&optdata, sizeof (optdata));
1667 	(void) dtrace_getopt(dtp, option, &optdata.dtsda_oldval);
1668 
1669 	if (dtrace_setopt(dtp, option, value) == 0) {
1670 		(void) dtrace_getopt(dtp, option, &optdata.dtsda_newval);
1671 		optdata.dtsda_probe = data;
1672 		optdata.dtsda_option = option;
1673 		optdata.dtsda_handle = dtp;
1674 
1675 		if ((rval = dt_handle_setopt(dtp, &optdata)) != 0)
1676 			return (rval);
1677 
1678 		return (0);
1679 	}
1680 
1681 	errstr = dtrace_errmsg(dtp, dtrace_errno(dtp));
1682 	len = strlen(option) + strlen(value) + strlen(errstr) + 80;
1683 	msg = alloca(len);
1684 
1685 	(void) snprintf(msg, len, "couldn't set option \"%s\" to \"%s\": %s\n",
1686 	    option, value, errstr);
1687 
1688 	if ((rval = dt_handle_liberr(dtp, data, msg)) == 0)
1689 		return (0);
1690 
1691 	return (rval);
1692 }
1693 
1694 static int
1695 dt_consume_cpu(dtrace_hdl_t *dtp, FILE *fp, int cpu,
1696     dtrace_bufdesc_t *buf, boolean_t just_one,
1697     dtrace_consume_probe_f *efunc, dtrace_consume_rec_f *rfunc, void *arg)
1698 {
1699 	dtrace_epid_t id;
1700 	size_t offs;
1701 	int flow = (dtp->dt_options[DTRACEOPT_FLOWINDENT] != DTRACEOPT_UNSET);
1702 	int quiet = (dtp->dt_options[DTRACEOPT_QUIET] != DTRACEOPT_UNSET);
1703 	int rval, i, n;
1704 	uint64_t tracememsize = 0;
1705 	dtrace_probedata_t data;
1706 	uint64_t drops;
1707 
1708 	bzero(&data, sizeof (data));
1709 	data.dtpda_handle = dtp;
1710 	data.dtpda_cpu = cpu;
1711 	data.dtpda_flow = dtp->dt_flow;
1712 	data.dtpda_indent = dtp->dt_indent;
1713 	data.dtpda_prefix = dtp->dt_prefix;
1714 
1715 	for (offs = buf->dtbd_oldest; offs < buf->dtbd_size; ) {
1716 		dtrace_eprobedesc_t *epd;
1717 
1718 		/*
1719 		 * We're guaranteed to have an ID.
1720 		 */
1721 		id = *(uint32_t *)((uintptr_t)buf->dtbd_data + offs);
1722 
1723 		if (id == DTRACE_EPIDNONE) {
1724 			/*
1725 			 * This is filler to assure proper alignment of the
1726 			 * next record; we simply ignore it.
1727 			 */
1728 			offs += sizeof (id);
1729 			continue;
1730 		}
1731 
1732 		if ((rval = dt_epid_lookup(dtp, id, &data.dtpda_edesc,
1733 		    &data.dtpda_pdesc)) != 0)
1734 			return (rval);
1735 
1736 		epd = data.dtpda_edesc;
1737 		data.dtpda_data = buf->dtbd_data + offs;
1738 
1739 		if (data.dtpda_edesc->dtepd_uarg != DT_ECB_DEFAULT) {
1740 			rval = dt_handle(dtp, &data);
1741 
1742 			if (rval == DTRACE_CONSUME_NEXT)
1743 				goto nextepid;
1744 
1745 			if (rval == DTRACE_CONSUME_ERROR)
1746 				return (-1);
1747 		}
1748 
1749 		if (flow)
1750 			(void) dt_flowindent(dtp, &data, dtp->dt_last_epid,
1751 			    buf, offs);
1752 
1753 		rval = (*efunc)(&data, arg);
1754 
1755 		if (flow) {
1756 			if (data.dtpda_flow == DTRACEFLOW_ENTRY)
1757 				data.dtpda_indent += 2;
1758 		}
1759 
1760 		if (rval == DTRACE_CONSUME_NEXT)
1761 			goto nextepid;
1762 
1763 		if (rval == DTRACE_CONSUME_ABORT)
1764 			return (dt_set_errno(dtp, EDT_DIRABORT));
1765 
1766 		if (rval != DTRACE_CONSUME_THIS)
1767 			return (dt_set_errno(dtp, EDT_BADRVAL));
1768 
1769 		for (i = 0; i < epd->dtepd_nrecs; i++) {
1770 			caddr_t addr;
1771 			dtrace_recdesc_t *rec = &epd->dtepd_rec[i];
1772 			dtrace_actkind_t act = rec->dtrd_action;
1773 
1774 			data.dtpda_data = buf->dtbd_data + offs +
1775 			    rec->dtrd_offset;
1776 			addr = data.dtpda_data;
1777 
1778 			if (act == DTRACEACT_LIBACT) {
1779 				uint64_t arg = rec->dtrd_arg;
1780 				dtrace_aggvarid_t id;
1781 
1782 				switch (arg) {
1783 				case DT_ACT_CLEAR:
1784 					/* LINTED - alignment */
1785 					id = *((dtrace_aggvarid_t *)addr);
1786 					(void) dtrace_aggregate_walk(dtp,
1787 					    dt_clear_agg, &id);
1788 					continue;
1789 
1790 				case DT_ACT_DENORMALIZE:
1791 					/* LINTED - alignment */
1792 					id = *((dtrace_aggvarid_t *)addr);
1793 					(void) dtrace_aggregate_walk(dtp,
1794 					    dt_denormalize_agg, &id);
1795 					continue;
1796 
1797 				case DT_ACT_FTRUNCATE:
1798 					if (fp == NULL)
1799 						continue;
1800 
1801 					(void) fflush(fp);
1802 					(void) ftruncate(fileno(fp), 0);
1803 					(void) fseeko(fp, 0, SEEK_SET);
1804 					continue;
1805 
1806 				case DT_ACT_NORMALIZE:
1807 					if (i == epd->dtepd_nrecs - 1)
1808 						return (dt_set_errno(dtp,
1809 						    EDT_BADNORMAL));
1810 
1811 					if (dt_normalize(dtp,
1812 					    buf->dtbd_data + offs, rec) != 0)
1813 						return (-1);
1814 
1815 					i++;
1816 					continue;
1817 
1818 				case DT_ACT_SETOPT: {
1819 					uint64_t *opts = dtp->dt_options;
1820 					dtrace_recdesc_t *valrec;
1821 					uint32_t valsize;
1822 					caddr_t val;
1823 					int rv;
1824 
1825 					if (i == epd->dtepd_nrecs - 1) {
1826 						return (dt_set_errno(dtp,
1827 						    EDT_BADSETOPT));
1828 					}
1829 
1830 					valrec = &epd->dtepd_rec[++i];
1831 					valsize = valrec->dtrd_size;
1832 
1833 					if (valrec->dtrd_action != act ||
1834 					    valrec->dtrd_arg != arg) {
1835 						return (dt_set_errno(dtp,
1836 						    EDT_BADSETOPT));
1837 					}
1838 
1839 					if (valsize > sizeof (uint64_t)) {
1840 						val = buf->dtbd_data + offs +
1841 						    valrec->dtrd_offset;
1842 					} else {
1843 						val = "1";
1844 					}
1845 
1846 					rv = dt_setopt(dtp, &data, addr, val);
1847 
1848 					if (rv != 0)
1849 						return (-1);
1850 
1851 					flow = (opts[DTRACEOPT_FLOWINDENT] !=
1852 					    DTRACEOPT_UNSET);
1853 					quiet = (opts[DTRACEOPT_QUIET] !=
1854 					    DTRACEOPT_UNSET);
1855 
1856 					continue;
1857 				}
1858 
1859 				case DT_ACT_TRUNC:
1860 					if (i == epd->dtepd_nrecs - 1)
1861 						return (dt_set_errno(dtp,
1862 						    EDT_BADTRUNC));
1863 
1864 					if (dt_trunc(dtp,
1865 					    buf->dtbd_data + offs, rec) != 0)
1866 						return (-1);
1867 
1868 					i++;
1869 					continue;
1870 
1871 				default:
1872 					continue;
1873 				}
1874 			}
1875 
1876 			if (act == DTRACEACT_TRACEMEM_DYNSIZE &&
1877 			    rec->dtrd_size == sizeof (uint64_t)) {
1878 				/* LINTED - alignment */
1879 				tracememsize = *((unsigned long long *)addr);
1880 				continue;
1881 			}
1882 
1883 			rval = (*rfunc)(&data, rec, arg);
1884 
1885 			if (rval == DTRACE_CONSUME_NEXT)
1886 				continue;
1887 
1888 			if (rval == DTRACE_CONSUME_ABORT)
1889 				return (dt_set_errno(dtp, EDT_DIRABORT));
1890 
1891 			if (rval != DTRACE_CONSUME_THIS)
1892 				return (dt_set_errno(dtp, EDT_BADRVAL));
1893 
1894 			if (act == DTRACEACT_STACK) {
1895 				int depth = rec->dtrd_arg;
1896 
1897 				if (dt_print_stack(dtp, fp, NULL, addr, depth,
1898 				    rec->dtrd_size / depth) < 0)
1899 					return (-1);
1900 				goto nextrec;
1901 			}
1902 
1903 			if (act == DTRACEACT_USTACK ||
1904 			    act == DTRACEACT_JSTACK) {
1905 				if (dt_print_ustack(dtp, fp, NULL,
1906 				    addr, rec->dtrd_arg) < 0)
1907 					return (-1);
1908 				goto nextrec;
1909 			}
1910 
1911 			if (act == DTRACEACT_SYM) {
1912 				if (dt_print_sym(dtp, fp, NULL, addr) < 0)
1913 					return (-1);
1914 				goto nextrec;
1915 			}
1916 
1917 			if (act == DTRACEACT_MOD) {
1918 				if (dt_print_mod(dtp, fp, NULL, addr) < 0)
1919 					return (-1);
1920 				goto nextrec;
1921 			}
1922 
1923 			if (act == DTRACEACT_USYM || act == DTRACEACT_UADDR) {
1924 				if (dt_print_usym(dtp, fp, addr, act) < 0)
1925 					return (-1);
1926 				goto nextrec;
1927 			}
1928 
1929 			if (act == DTRACEACT_UMOD) {
1930 				if (dt_print_umod(dtp, fp, NULL, addr) < 0)
1931 					return (-1);
1932 				goto nextrec;
1933 			}
1934 
1935 			if (DTRACEACT_ISPRINTFLIKE(act)) {
1936 				void *fmtdata;
1937 				int (*func)(dtrace_hdl_t *, FILE *, void *,
1938 				    const dtrace_probedata_t *,
1939 				    const dtrace_recdesc_t *, uint_t,
1940 				    const void *buf, size_t);
1941 
1942 				if ((fmtdata = dt_format_lookup(dtp,
1943 				    rec->dtrd_format)) == NULL)
1944 					goto nofmt;
1945 
1946 				switch (act) {
1947 				case DTRACEACT_PRINTF:
1948 					func = dtrace_fprintf;
1949 					break;
1950 				case DTRACEACT_PRINTA:
1951 					func = dtrace_fprinta;
1952 					break;
1953 				case DTRACEACT_SYSTEM:
1954 					func = dtrace_system;
1955 					break;
1956 				case DTRACEACT_FREOPEN:
1957 					func = dtrace_freopen;
1958 					break;
1959 				}
1960 
1961 				n = (*func)(dtp, fp, fmtdata, &data,
1962 				    rec, epd->dtepd_nrecs - i,
1963 				    (uchar_t *)buf->dtbd_data + offs,
1964 				    buf->dtbd_size - offs);
1965 
1966 				if (n < 0)
1967 					return (-1); /* errno is set for us */
1968 
1969 				if (n > 0)
1970 					i += n - 1;
1971 				goto nextrec;
1972 			}
1973 
1974 			/*
1975 			 * If this is a DIF expression, and the record has a
1976 			 * format set, this indicates we have a CTF type name
1977 			 * associated with the data and we should try to print
1978 			 * it out by type.
1979 			 */
1980 			if (act == DTRACEACT_DIFEXPR) {
1981 				const char *strdata = dt_strdata_lookup(dtp,
1982 				    rec->dtrd_format);
1983 				if (strdata != NULL) {
1984 					n = dtrace_print(dtp, fp, strdata,
1985 					    addr, rec->dtrd_size);
1986 
1987 					/*
1988 					 * dtrace_print() will return -1 on
1989 					 * error, or return the number of bytes
1990 					 * consumed.  It will return 0 if the
1991 					 * type couldn't be determined, and we
1992 					 * should fall through to the normal
1993 					 * trace method.
1994 					 */
1995 					if (n < 0)
1996 						return (-1);
1997 
1998 					if (n > 0)
1999 						goto nextrec;
2000 				}
2001 			}
2002 
2003 nofmt:
2004 			if (act == DTRACEACT_PRINTA) {
2005 				dt_print_aggdata_t pd;
2006 				dtrace_aggvarid_t *aggvars;
2007 				int j, naggvars = 0;
2008 				size_t size = ((epd->dtepd_nrecs - i) *
2009 				    sizeof (dtrace_aggvarid_t));
2010 
2011 				if ((aggvars = dt_alloc(dtp, size)) == NULL)
2012 					return (-1);
2013 
2014 				/*
2015 				 * This might be a printa() with multiple
2016 				 * aggregation variables.  We need to scan
2017 				 * forward through the records until we find
2018 				 * a record from a different statement.
2019 				 */
2020 				for (j = i; j < epd->dtepd_nrecs; j++) {
2021 					dtrace_recdesc_t *nrec;
2022 					caddr_t naddr;
2023 
2024 					nrec = &epd->dtepd_rec[j];
2025 
2026 					if (nrec->dtrd_uarg != rec->dtrd_uarg)
2027 						break;
2028 
2029 					if (nrec->dtrd_action != act) {
2030 						return (dt_set_errno(dtp,
2031 						    EDT_BADAGG));
2032 					}
2033 
2034 					naddr = buf->dtbd_data + offs +
2035 					    nrec->dtrd_offset;
2036 
2037 					aggvars[naggvars++] =
2038 					    /* LINTED - alignment */
2039 					    *((dtrace_aggvarid_t *)naddr);
2040 				}
2041 
2042 				i = j - 1;
2043 				bzero(&pd, sizeof (pd));
2044 				pd.dtpa_dtp = dtp;
2045 				pd.dtpa_fp = fp;
2046 
2047 				assert(naggvars >= 1);
2048 
2049 				if (naggvars == 1) {
2050 					pd.dtpa_id = aggvars[0];
2051 					dt_free(dtp, aggvars);
2052 
2053 					if (dt_printf(dtp, fp, "\n") < 0 ||
2054 					    dtrace_aggregate_walk_sorted(dtp,
2055 					    dt_print_agg, &pd) < 0)
2056 						return (-1);
2057 					goto nextrec;
2058 				}
2059 
2060 				if (dt_printf(dtp, fp, "\n") < 0 ||
2061 				    dtrace_aggregate_walk_joined(dtp, aggvars,
2062 				    naggvars, dt_print_aggs, &pd) < 0) {
2063 					dt_free(dtp, aggvars);
2064 					return (-1);
2065 				}
2066 
2067 				dt_free(dtp, aggvars);
2068 				goto nextrec;
2069 			}
2070 
2071 			if (act == DTRACEACT_TRACEMEM) {
2072 				if (tracememsize == 0 ||
2073 				    tracememsize > rec->dtrd_size) {
2074 					tracememsize = rec->dtrd_size;
2075 				}
2076 
2077 				n = dt_print_bytes(dtp, fp, addr,
2078 				    tracememsize, 33, quiet, 1);
2079 
2080 				tracememsize = 0;
2081 
2082 				if (n < 0)
2083 					return (-1);
2084 
2085 				goto nextrec;
2086 			}
2087 
2088 			switch (rec->dtrd_size) {
2089 			case sizeof (uint64_t):
2090 				n = dt_printf(dtp, fp,
2091 				    quiet ? "%lld" : " %16lld",
2092 				    /* LINTED - alignment */
2093 				    *((unsigned long long *)addr));
2094 				break;
2095 			case sizeof (uint32_t):
2096 				n = dt_printf(dtp, fp, quiet ? "%d" : " %8d",
2097 				    /* LINTED - alignment */
2098 				    *((uint32_t *)addr));
2099 				break;
2100 			case sizeof (uint16_t):
2101 				n = dt_printf(dtp, fp, quiet ? "%d" : " %5d",
2102 				    /* LINTED - alignment */
2103 				    *((uint16_t *)addr));
2104 				break;
2105 			case sizeof (uint8_t):
2106 				n = dt_printf(dtp, fp, quiet ? "%d" : " %3d",
2107 				    *((uint8_t *)addr));
2108 				break;
2109 			default:
2110 				n = dt_print_bytes(dtp, fp, addr,
2111 				    rec->dtrd_size, 33, quiet, 0);
2112 				break;
2113 			}
2114 
2115 			if (n < 0)
2116 				return (-1); /* errno is set for us */
2117 
2118 nextrec:
2119 			if (dt_buffered_flush(dtp, &data, rec, NULL, 0) < 0)
2120 				return (-1); /* errno is set for us */
2121 		}
2122 
2123 		/*
2124 		 * Call the record callback with a NULL record to indicate
2125 		 * that we're done processing this EPID.
2126 		 */
2127 		rval = (*rfunc)(&data, NULL, arg);
2128 nextepid:
2129 		offs += epd->dtepd_size;
2130 		dtp->dt_last_epid = id;
2131 		if (just_one) {
2132 			buf->dtbd_oldest = offs;
2133 			break;
2134 		}
2135 	}
2136 
2137 	dtp->dt_flow = data.dtpda_flow;
2138 	dtp->dt_indent = data.dtpda_indent;
2139 	dtp->dt_prefix = data.dtpda_prefix;
2140 
2141 	if ((drops = buf->dtbd_drops) == 0)
2142 		return (0);
2143 
2144 	/*
2145 	 * Explicitly zero the drops to prevent us from processing them again.
2146 	 */
2147 	buf->dtbd_drops = 0;
2148 
2149 	return (dt_handle_cpudrop(dtp, cpu, DTRACEDROP_PRINCIPAL, drops));
2150 }
2151 
2152 /*
2153  * Reduce memory usage by shrinking the buffer if it's no more than half full.
2154  * Note, we need to preserve the alignment of the data at dtbd_oldest, which is
2155  * only 4-byte aligned.
2156  */
2157 static void
2158 dt_realloc_buf(dtrace_hdl_t *dtp, dtrace_bufdesc_t *buf, int cursize)
2159 {
2160 	uint64_t used = buf->dtbd_size - buf->dtbd_oldest;
2161 	if (used < cursize / 2) {
2162 		int misalign = buf->dtbd_oldest & (sizeof (uint64_t) - 1);
2163 		char *newdata = dt_alloc(dtp, used + misalign);
2164 		if (newdata == NULL)
2165 			return;
2166 		bzero(newdata, misalign);
2167 		bcopy(buf->dtbd_data + buf->dtbd_oldest,
2168 		    newdata + misalign, used);
2169 		dt_free(dtp, buf->dtbd_data);
2170 		buf->dtbd_oldest = misalign;
2171 		buf->dtbd_size = used + misalign;
2172 		buf->dtbd_data = newdata;
2173 	}
2174 }
2175 
2176 /*
2177  * If the ring buffer has wrapped, the data is not in order.  Rearrange it
2178  * so that it is.  Note, we need to preserve the alignment of the data at
2179  * dtbd_oldest, which is only 4-byte aligned.
2180  */
2181 static int
2182 dt_unring_buf(dtrace_hdl_t *dtp, dtrace_bufdesc_t *buf)
2183 {
2184 	int misalign;
2185 	char *newdata, *ndp;
2186 
2187 	if (buf->dtbd_oldest == 0)
2188 		return (0);
2189 
2190 	misalign = buf->dtbd_oldest & (sizeof (uint64_t) - 1);
2191 	newdata = ndp = dt_alloc(dtp, buf->dtbd_size + misalign);
2192 
2193 	if (newdata == NULL)
2194 		return (-1);
2195 
2196 	assert(0 == (buf->dtbd_size & (sizeof (uint64_t) - 1)));
2197 
2198 	bzero(ndp, misalign);
2199 	ndp += misalign;
2200 
2201 	bcopy(buf->dtbd_data + buf->dtbd_oldest, ndp,
2202 	    buf->dtbd_size - buf->dtbd_oldest);
2203 	ndp += buf->dtbd_size - buf->dtbd_oldest;
2204 
2205 	bcopy(buf->dtbd_data, ndp, buf->dtbd_oldest);
2206 
2207 	dt_free(dtp, buf->dtbd_data);
2208 	buf->dtbd_oldest = 0;
2209 	buf->dtbd_data = newdata;
2210 	buf->dtbd_size += misalign;
2211 
2212 	return (0);
2213 }
2214 
2215 static void
2216 dt_put_buf(dtrace_hdl_t *dtp, dtrace_bufdesc_t *buf)
2217 {
2218 	dt_free(dtp, buf->dtbd_data);
2219 	dt_free(dtp, buf);
2220 }
2221 
2222 /*
2223  * Returns 0 on success, in which case *cbp will be filled in if we retrieved
2224  * data, or NULL if there is no data for this CPU.
2225  * Returns -1 on failure and sets dt_errno.
2226  */
2227 static int
2228 dt_get_buf(dtrace_hdl_t *dtp, int cpu, dtrace_bufdesc_t **bufp)
2229 {
2230 	dtrace_optval_t size;
2231 	dtrace_bufdesc_t *buf = dt_zalloc(dtp, sizeof (*buf));
2232 	int error;
2233 
2234 	if (buf == NULL)
2235 		return (-1);
2236 
2237 	(void) dtrace_getopt(dtp, "bufsize", &size);
2238 	buf->dtbd_data = dt_alloc(dtp, size);
2239 	if (buf->dtbd_data == NULL) {
2240 		dt_free(dtp, buf);
2241 		return (-1);
2242 	}
2243 	buf->dtbd_size = size;
2244 	buf->dtbd_cpu = cpu;
2245 
2246 	if (dt_ioctl(dtp, DTRACEIOC_BUFSNAP, buf) == -1) {
2247 		dt_put_buf(dtp, buf);
2248 		/*
2249 		 * If we failed with ENOENT, it may be because the
2250 		 * CPU was unconfigured -- this is okay.  Any other
2251 		 * error, however, is unexpected.
2252 		 */
2253 		if (errno == ENOENT) {
2254 			*bufp = NULL;
2255 			return (0);
2256 		}
2257 
2258 		return (dt_set_errno(dtp, errno));
2259 	}
2260 
2261 	error = dt_unring_buf(dtp, buf);
2262 	if (error != 0) {
2263 		dt_put_buf(dtp, buf);
2264 		return (error);
2265 	}
2266 	dt_realloc_buf(dtp, buf, size);
2267 
2268 	*bufp = buf;
2269 	return (0);
2270 }
2271 
2272 typedef struct dt_begin {
2273 	dtrace_consume_probe_f *dtbgn_probefunc;
2274 	dtrace_consume_rec_f *dtbgn_recfunc;
2275 	void *dtbgn_arg;
2276 	dtrace_handle_err_f *dtbgn_errhdlr;
2277 	void *dtbgn_errarg;
2278 	int dtbgn_beginonly;
2279 } dt_begin_t;
2280 
2281 static int
2282 dt_consume_begin_probe(const dtrace_probedata_t *data, void *arg)
2283 {
2284 	dt_begin_t *begin = arg;
2285 	dtrace_probedesc_t *pd = data->dtpda_pdesc;
2286 
2287 	int r1 = (strcmp(pd->dtpd_provider, "dtrace") == 0);
2288 	int r2 = (strcmp(pd->dtpd_name, "BEGIN") == 0);
2289 
2290 	if (begin->dtbgn_beginonly) {
2291 		if (!(r1 && r2))
2292 			return (DTRACE_CONSUME_NEXT);
2293 	} else {
2294 		if (r1 && r2)
2295 			return (DTRACE_CONSUME_NEXT);
2296 	}
2297 
2298 	/*
2299 	 * We have a record that we're interested in.  Now call the underlying
2300 	 * probe function...
2301 	 */
2302 	return (begin->dtbgn_probefunc(data, begin->dtbgn_arg));
2303 }
2304 
2305 static int
2306 dt_consume_begin_record(const dtrace_probedata_t *data,
2307     const dtrace_recdesc_t *rec, void *arg)
2308 {
2309 	dt_begin_t *begin = arg;
2310 
2311 	return (begin->dtbgn_recfunc(data, rec, begin->dtbgn_arg));
2312 }
2313 
2314 static int
2315 dt_consume_begin_error(const dtrace_errdata_t *data, void *arg)
2316 {
2317 	dt_begin_t *begin = (dt_begin_t *)arg;
2318 	dtrace_probedesc_t *pd = data->dteda_pdesc;
2319 
2320 	int r1 = (strcmp(pd->dtpd_provider, "dtrace") == 0);
2321 	int r2 = (strcmp(pd->dtpd_name, "BEGIN") == 0);
2322 
2323 	if (begin->dtbgn_beginonly) {
2324 		if (!(r1 && r2))
2325 			return (DTRACE_HANDLE_OK);
2326 	} else {
2327 		if (r1 && r2)
2328 			return (DTRACE_HANDLE_OK);
2329 	}
2330 
2331 	return (begin->dtbgn_errhdlr(data, begin->dtbgn_errarg));
2332 }
2333 
2334 static int
2335 dt_consume_begin(dtrace_hdl_t *dtp, FILE *fp,
2336     dtrace_consume_probe_f *pf, dtrace_consume_rec_f *rf, void *arg)
2337 {
2338 	/*
2339 	 * There's this idea that the BEGIN probe should be processed before
2340 	 * everything else, and that the END probe should be processed after
2341 	 * anything else.  In the common case, this is pretty easy to deal
2342 	 * with.  However, a situation may arise where the BEGIN enabling and
2343 	 * END enabling are on the same CPU, and some enabling in the middle
2344 	 * occurred on a different CPU.  To deal with this (blech!) we need to
2345 	 * consume the BEGIN buffer up until the end of the BEGIN probe, and
2346 	 * then set it aside.  We will then process every other CPU, and then
2347 	 * we'll return to the BEGIN CPU and process the rest of the data
2348 	 * (which will inevitably include the END probe, if any).  Making this
2349 	 * even more complicated (!) is the library's ERROR enabling.  Because
2350 	 * this enabling is processed before we even get into the consume call
2351 	 * back, any ERROR firing would result in the library's ERROR enabling
2352 	 * being processed twice -- once in our first pass (for BEGIN probes),
2353 	 * and again in our second pass (for everything but BEGIN probes).  To
2354 	 * deal with this, we interpose on the ERROR handler to assure that we
2355 	 * only process ERROR enablings induced by BEGIN enablings in the
2356 	 * first pass, and that we only process ERROR enablings _not_ induced
2357 	 * by BEGIN enablings in the second pass.
2358 	 */
2359 
2360 	dt_begin_t begin;
2361 	processorid_t cpu = dtp->dt_beganon;
2362 	int rval, i;
2363 	static int max_ncpus;
2364 	dtrace_bufdesc_t *buf;
2365 
2366 	dtp->dt_beganon = -1;
2367 
2368 	if (dt_get_buf(dtp, cpu, &buf) != 0)
2369 		return (-1);
2370 	if (buf == NULL)
2371 		return (0);
2372 
2373 	if (!dtp->dt_stopped || buf->dtbd_cpu != dtp->dt_endedon) {
2374 		/*
2375 		 * This is the simple case.  We're either not stopped, or if
2376 		 * we are, we actually processed any END probes on another
2377 		 * CPU.  We can simply consume this buffer and return.
2378 		 */
2379 		rval = dt_consume_cpu(dtp, fp, cpu, buf, B_FALSE,
2380 		    pf, rf, arg);
2381 		dt_put_buf(dtp, buf);
2382 		return (rval);
2383 	}
2384 
2385 	begin.dtbgn_probefunc = pf;
2386 	begin.dtbgn_recfunc = rf;
2387 	begin.dtbgn_arg = arg;
2388 	begin.dtbgn_beginonly = 1;
2389 
2390 	/*
2391 	 * We need to interpose on the ERROR handler to be sure that we
2392 	 * only process ERRORs induced by BEGIN.
2393 	 */
2394 	begin.dtbgn_errhdlr = dtp->dt_errhdlr;
2395 	begin.dtbgn_errarg = dtp->dt_errarg;
2396 	dtp->dt_errhdlr = dt_consume_begin_error;
2397 	dtp->dt_errarg = &begin;
2398 
2399 	rval = dt_consume_cpu(dtp, fp, cpu, buf, B_FALSE,
2400 	    dt_consume_begin_probe, dt_consume_begin_record, &begin);
2401 
2402 	dtp->dt_errhdlr = begin.dtbgn_errhdlr;
2403 	dtp->dt_errarg = begin.dtbgn_errarg;
2404 
2405 	if (rval != 0) {
2406 		dt_put_buf(dtp, buf);
2407 		return (rval);
2408 	}
2409 
2410 	if (max_ncpus == 0)
2411 		max_ncpus = dt_sysconf(dtp, _SC_CPUID_MAX) + 1;
2412 
2413 	for (i = 0; i < max_ncpus; i++) {
2414 		dtrace_bufdesc_t *nbuf;
2415 		if (i == cpu)
2416 			continue;
2417 
2418 		if (dt_get_buf(dtp, i, &nbuf) != 0) {
2419 			dt_put_buf(dtp, buf);
2420 			return (-1);
2421 		}
2422 		if (nbuf == NULL)
2423 			continue;
2424 
2425 		rval = dt_consume_cpu(dtp, fp, i, nbuf, B_FALSE,
2426 		    pf, rf, arg);
2427 		dt_put_buf(dtp, nbuf);
2428 		if (rval != 0) {
2429 			dt_put_buf(dtp, buf);
2430 			return (rval);
2431 		}
2432 	}
2433 
2434 	/*
2435 	 * Okay -- we're done with the other buffers.  Now we want to
2436 	 * reconsume the first buffer -- but this time we're looking for
2437 	 * everything _but_ BEGIN.  And of course, in order to only consume
2438 	 * those ERRORs _not_ associated with BEGIN, we need to reinstall our
2439 	 * ERROR interposition function...
2440 	 */
2441 	begin.dtbgn_beginonly = 0;
2442 
2443 	assert(begin.dtbgn_errhdlr == dtp->dt_errhdlr);
2444 	assert(begin.dtbgn_errarg == dtp->dt_errarg);
2445 	dtp->dt_errhdlr = dt_consume_begin_error;
2446 	dtp->dt_errarg = &begin;
2447 
2448 	rval = dt_consume_cpu(dtp, fp, cpu, buf, B_FALSE,
2449 	    dt_consume_begin_probe, dt_consume_begin_record, &begin);
2450 
2451 	dtp->dt_errhdlr = begin.dtbgn_errhdlr;
2452 	dtp->dt_errarg = begin.dtbgn_errarg;
2453 
2454 	return (rval);
2455 }
2456 
2457 /* ARGSUSED */
2458 static uint64_t
2459 dt_buf_oldest(void *elem, void *arg)
2460 {
2461 	dtrace_bufdesc_t *buf = elem;
2462 	size_t offs = buf->dtbd_oldest;
2463 
2464 	while (offs < buf->dtbd_size) {
2465 		dtrace_rechdr_t *dtrh =
2466 		    (dtrace_rechdr_t *)(buf->dtbd_data + offs);
2467 		if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
2468 			offs += sizeof (dtrace_epid_t);
2469 		} else {
2470 			return (DTRACE_RECORD_LOAD_TIMESTAMP(dtrh));
2471 		}
2472 	}
2473 
2474 	/* There are no records left; use the time the buffer was retrieved. */
2475 	return (buf->dtbd_timestamp);
2476 }
2477 
2478 int
2479 dtrace_consume(dtrace_hdl_t *dtp, FILE *fp,
2480     dtrace_consume_probe_f *pf, dtrace_consume_rec_f *rf, void *arg)
2481 {
2482 	dtrace_optval_t size;
2483 	static int max_ncpus;
2484 	int i, rval;
2485 	dtrace_optval_t interval = dtp->dt_options[DTRACEOPT_SWITCHRATE];
2486 	hrtime_t now = gethrtime();
2487 
2488 	if (dtp->dt_lastswitch != 0) {
2489 		if (now - dtp->dt_lastswitch < interval)
2490 			return (0);
2491 
2492 		dtp->dt_lastswitch += interval;
2493 	} else {
2494 		dtp->dt_lastswitch = now;
2495 	}
2496 
2497 	if (!dtp->dt_active)
2498 		return (dt_set_errno(dtp, EINVAL));
2499 
2500 	if (max_ncpus == 0)
2501 		max_ncpus = dt_sysconf(dtp, _SC_CPUID_MAX) + 1;
2502 
2503 	if (pf == NULL)
2504 		pf = (dtrace_consume_probe_f *)dt_nullprobe;
2505 
2506 	if (rf == NULL)
2507 		rf = (dtrace_consume_rec_f *)dt_nullrec;
2508 
2509 	if (dtp->dt_options[DTRACEOPT_TEMPORAL] == DTRACEOPT_UNSET) {
2510 		/*
2511 		 * The output will not be in the order it was traced.  Rather,
2512 		 * we will consume all of the data from each CPU's buffer in
2513 		 * turn.  We apply special handling for the records from BEGIN
2514 		 * and END probes so that they are consumed first and last,
2515 		 * respectively.
2516 		 *
2517 		 * If we have just begun, we want to first process the CPU that
2518 		 * executed the BEGIN probe (if any).
2519 		 */
2520 		if (dtp->dt_active && dtp->dt_beganon != -1 &&
2521 		    (rval = dt_consume_begin(dtp, fp, pf, rf, arg)) != 0)
2522 			return (rval);
2523 
2524 		for (i = 0; i < max_ncpus; i++) {
2525 			dtrace_bufdesc_t *buf;
2526 
2527 			/*
2528 			 * If we have stopped, we want to process the CPU on
2529 			 * which the END probe was processed only _after_ we
2530 			 * have processed everything else.
2531 			 */
2532 			if (dtp->dt_stopped && (i == dtp->dt_endedon))
2533 				continue;
2534 
2535 			if (dt_get_buf(dtp, i, &buf) != 0)
2536 				return (-1);
2537 			if (buf == NULL)
2538 				continue;
2539 
2540 			dtp->dt_flow = 0;
2541 			dtp->dt_indent = 0;
2542 			dtp->dt_prefix = NULL;
2543 			rval = dt_consume_cpu(dtp, fp, i,
2544 			    buf, B_FALSE, pf, rf, arg);
2545 			dt_put_buf(dtp, buf);
2546 			if (rval != 0)
2547 				return (rval);
2548 		}
2549 		if (dtp->dt_stopped) {
2550 			dtrace_bufdesc_t *buf;
2551 
2552 			if (dt_get_buf(dtp, dtp->dt_endedon, &buf) != 0)
2553 				return (-1);
2554 			if (buf == NULL)
2555 				return (0);
2556 
2557 			rval = dt_consume_cpu(dtp, fp, dtp->dt_endedon,
2558 			    buf, B_FALSE, pf, rf, arg);
2559 			dt_put_buf(dtp, buf);
2560 			return (rval);
2561 		}
2562 	} else {
2563 		/*
2564 		 * The output will be in the order it was traced (or for
2565 		 * speculations, when it was committed).  We retrieve a buffer
2566 		 * from each CPU and put it into a priority queue, which sorts
2567 		 * based on the first entry in the buffer.  This is sufficient
2568 		 * because entries within a buffer are already sorted.
2569 		 *
2570 		 * We then consume records one at a time, always consuming the
2571 		 * oldest record, as determined by the priority queue.  When
2572 		 * we reach the end of the time covered by these buffers,
2573 		 * we need to stop and retrieve more records on the next pass.
2574 		 * The kernel tells us the time covered by each buffer, in
2575 		 * dtbd_timestamp.  The first buffer's timestamp tells us the
2576 		 * time covered by all buffers, as subsequently retrieved
2577 		 * buffers will cover to a more recent time.
2578 		 */
2579 
2580 		uint64_t *drops = alloca(max_ncpus * sizeof (uint64_t));
2581 		uint64_t first_timestamp = 0;
2582 		uint_t cookie = 0;
2583 		dtrace_bufdesc_t *buf;
2584 
2585 		bzero(drops, max_ncpus * sizeof (uint64_t));
2586 
2587 		if (dtp->dt_bufq == NULL) {
2588 			dtp->dt_bufq = dt_pq_init(dtp, max_ncpus * 2,
2589 			    dt_buf_oldest, NULL);
2590 			if (dtp->dt_bufq == NULL) /* ENOMEM */
2591 				return (-1);
2592 		}
2593 
2594 		/* Retrieve data from each CPU. */
2595 		(void) dtrace_getopt(dtp, "bufsize", &size);
2596 		for (i = 0; i < max_ncpus; i++) {
2597 			dtrace_bufdesc_t *buf;
2598 
2599 			if (dt_get_buf(dtp, i, &buf) != 0)
2600 				return (-1);
2601 			if (buf != NULL) {
2602 				if (first_timestamp == 0)
2603 					first_timestamp = buf->dtbd_timestamp;
2604 				assert(buf->dtbd_timestamp >= first_timestamp);
2605 
2606 				dt_pq_insert(dtp->dt_bufq, buf);
2607 				drops[i] = buf->dtbd_drops;
2608 				buf->dtbd_drops = 0;
2609 			}
2610 		}
2611 
2612 		/* Consume records. */
2613 		for (;;) {
2614 			dtrace_bufdesc_t *buf = dt_pq_pop(dtp->dt_bufq);
2615 			uint64_t timestamp;
2616 
2617 			if (buf == NULL)
2618 				break;
2619 
2620 			timestamp = dt_buf_oldest(buf, dtp);
2621 			assert(timestamp >= dtp->dt_last_timestamp);
2622 			dtp->dt_last_timestamp = timestamp;
2623 
2624 			if (timestamp == buf->dtbd_timestamp) {
2625 				/*
2626 				 * We've reached the end of the time covered
2627 				 * by this buffer.  If this is the oldest
2628 				 * buffer, we must do another pass
2629 				 * to retrieve more data.
2630 				 */
2631 				dt_put_buf(dtp, buf);
2632 				if (timestamp == first_timestamp &&
2633 				    !dtp->dt_stopped)
2634 					break;
2635 				continue;
2636 			}
2637 
2638 			if ((rval = dt_consume_cpu(dtp, fp,
2639 			    buf->dtbd_cpu, buf, B_TRUE, pf, rf, arg)) != 0)
2640 				return (rval);
2641 			dt_pq_insert(dtp->dt_bufq, buf);
2642 		}
2643 
2644 		/* Consume drops. */
2645 		for (i = 0; i < max_ncpus; i++) {
2646 			if (drops[i] != 0) {
2647 				int error = dt_handle_cpudrop(dtp, i,
2648 				    DTRACEDROP_PRINCIPAL, drops[i]);
2649 				if (error != 0)
2650 					return (error);
2651 			}
2652 		}
2653 
2654 		/*
2655 		 * Reduce memory usage by re-allocating smaller buffers
2656 		 * for the "remnants".
2657 		 */
2658 		while (buf = dt_pq_walk(dtp->dt_bufq, &cookie))
2659 			dt_realloc_buf(dtp, buf, buf->dtbd_size);
2660 	}
2661 
2662 	return (0);
2663 }
2664