xref: /titanic_44/usr/src/lib/libdtrace/common/dt_aggregate.c (revision 4cc1ac68c690efa70450ed478a37fe6d78f0f42e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <stdlib.h>
30 #include <strings.h>
31 #include <errno.h>
32 #include <unistd.h>
33 #include <dt_impl.h>
34 #include <assert.h>
35 
36 #define	DTRACE_AHASHSIZE	32779		/* big 'ol prime */
37 
/*
 * Additive aggregating action:  every 64-bit word of the new value is added
 * to the corresponding word of the existing value.  'size' is the length of
 * the value in bytes; only whole 64-bit words are processed.  This routine
 * is installed for count(), sum(), avg() and quantize() aggregations.
 */
static void
dt_aggregate_count(uint64_t *existing, uint64_t *new, size_t size)
{
	size_t nwords = size / sizeof (uint64_t);
	uint64_t *dst = existing;
	uint64_t *src = new;

	while (nwords-- != 0)
		*dst++ += *src++;
}
46 
/*
 * Compare two single-word aggregation values as unsigned 64-bit integers.
 * Returns 1 if *lhs is larger, -1 if it is smaller and 0 if they are equal.
 */
static int
dt_aggregate_countcmp(uint64_t *lhs, uint64_t *rhs)
{
	uint64_t l = *lhs;
	uint64_t r = *rhs;

	if (l == r)
		return (0);

	return (l > r ? 1 : -1);
}
61 
/*
 * Aggregating action for min():  keep the smaller of the existing and the
 * new value.  The comparison is performed on unsigned 64-bit quantities;
 * 'size' is not used.
 */
/*ARGSUSED*/
static void
dt_aggregate_min(uint64_t *existing, uint64_t *new, size_t size)
{
	uint64_t candidate = *new;

	if (*existing > candidate)
		*existing = candidate;
}
69 
/*
 * Aggregating action for max():  keep the larger of the existing and the
 * new value.  The comparison is performed on unsigned 64-bit quantities;
 * 'size' is not used.
 */
/*ARGSUSED*/
static void
dt_aggregate_max(uint64_t *existing, uint64_t *new, size_t size)
{
	uint64_t candidate = *new;

	if (*existing < candidate)
		*existing = candidate;
}
77 
/*
 * Compare two avg() values.  Each value is a pair of 64-bit words:  word 0
 * is the divisor and word 1 the dividend of the average.  A value whose
 * divisor is zero is treated as an average of zero.  Returns 1, -1 or 0 as
 * the left-hand average is greater than, less than or equal to the
 * right-hand average (using unsigned integer division).
 */
static int
dt_aggregate_averagecmp(uint64_t *lhs, uint64_t *rhs)
{
	uint64_t lavg = 0, ravg = 0;

	if (lhs[0] != 0)
		lavg = lhs[1] / lhs[0];

	if (rhs[0] != 0)
		ravg = rhs[1] / rhs[0];

	if (lavg == ravg)
		return (0);

	return (lavg > ravg ? 1 : -1);
}
92 
/*
 * Aggregating action for lquantize().  Word 0 of an lquantize() value holds
 * the encoded parameters; the bucket counts follow it.  The number of levels
 * is decoded from the existing value's parameter word, and levels + 2
 * buckets of the new value are added into the existing value.  The
 * parameter word itself is left untouched; 'size' is not used.
 */
/*ARGSUSED*/
static void
dt_aggregate_lquantize(uint64_t *existing, uint64_t *new, size_t size)
{
	uint64_t arg = existing[0];
	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
	uint64_t *dst = &existing[1];
	uint64_t *src = &new[1];
	int nbuckets = levels + 2;
	int b;

	for (b = 0; b < nbuckets; b++)
		dst[b] += src[b];
}
104 
/*
 * Compute a weighted sum for an lquantize() value so that two such values
 * can be ordered.  Word 0 holds the encoded base, step and level count; the
 * buckets follow.  The first bucket is weighted by (base - 1), each of the
 * next 'levels' buckets by successive multiples of the step starting at
 * base, and the final bucket by one more than the base reached after the
 * last level.
 */
static int64_t
dt_aggregate_lquantizedsum(uint64_t *lquanta)
{
	uint64_t arg = lquanta[0];
	uint64_t *bucket = &lquanta[1];
	int32_t base = DTRACE_LQUANTIZE_BASE(arg);
	uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
	uint16_t i = 0;
	int64_t total;

	total = bucket[0] * (base - 1);

	while (i < levels) {
		total += bucket[i + 1] * base;
		base += step;
		i++;
	}

	return (total + bucket[levels + 1] * (base + 1));
}
119 
/*
 * Order two lquantize() values by their weighted sums as computed by
 * dt_aggregate_lquantizedsum().  Returns 1, -1 or 0.
 */
static int
dt_aggregate_lquantizedcmp(uint64_t *lhs, uint64_t *rhs)
{
	int64_t lsum = dt_aggregate_lquantizedsum(lhs);
	int64_t rsum = dt_aggregate_lquantizedsum(rhs);

	if (lsum == rsum)
		return (0);

	return (lsum > rsum ? 1 : -1);
}
134 
135 static int
136 dt_aggregate_quantizedcmp(uint64_t *lhs, uint64_t *rhs)
137 {
138 	int nbuckets = DTRACE_QUANTIZE_NBUCKETS, i;
139 	int64_t ltotal = 0, rtotal = 0;
140 
141 	for (i = 0; i < nbuckets; i++) {
142 		int64_t bucketval = DTRACE_QUANTIZE_BUCKETVAL(i);
143 
144 		ltotal += bucketval * lhs[i];
145 		rtotal += bucketval * rhs[i];
146 	}
147 
148 	if (ltotal > rtotal)
149 		return (1);
150 
151 	if (ltotal < rtotal)
152 		return (-1);
153 
154 	return (0);
155 }
156 
157 static int
158 dt_aggregate_snap_cpu(dtrace_hdl_t *dtp, processorid_t cpu)
159 {
160 	dtrace_epid_t id;
161 	uint64_t hashval;
162 	size_t offs, roffs, size, ndx;
163 	int i, j, rval;
164 	caddr_t addr, data;
165 	dtrace_recdesc_t *rec;
166 	dt_aggregate_t *agp = &dtp->dt_aggregate;
167 	dtrace_aggdesc_t *agg;
168 	dt_ahash_t *hash = &agp->dtat_hash;
169 	dt_ahashent_t *h;
170 	dtrace_bufdesc_t b = agp->dtat_buf, *buf = &b;
171 	dtrace_aggdata_t *aggdata;
172 	int flags = agp->dtat_flags;
173 
174 	buf->dtbd_cpu = cpu;
175 
176 	if (dt_ioctl(dtp, DTRACEIOC_AGGSNAP, buf) == -1) {
177 		if (errno == ENOENT) {
178 			/*
179 			 * If that failed with ENOENT, it may be because the
180 			 * CPU was unconfigured.  This is okay; we'll just
181 			 * do nothing but return success.
182 			 */
183 			return (0);
184 		}
185 
186 		return (dt_set_errno(dtp, errno));
187 	}
188 
189 	if (buf->dtbd_drops != 0) {
190 		if (dt_handle_cpudrop(dtp, cpu,
191 		    DTRACEDROP_AGGREGATION, buf->dtbd_drops) == -1)
192 			return (-1);
193 	}
194 
195 	if (buf->dtbd_size == 0)
196 		return (0);
197 
198 	if (hash->dtah_hash == NULL) {
199 		size_t size;
200 
201 		hash->dtah_size = DTRACE_AHASHSIZE;
202 		size = hash->dtah_size * sizeof (dt_ahashent_t *);
203 
204 		if ((hash->dtah_hash = malloc(size)) == NULL)
205 			return (dt_set_errno(dtp, EDT_NOMEM));
206 
207 		bzero(hash->dtah_hash, size);
208 	}
209 
210 	for (offs = 0; offs < buf->dtbd_size; ) {
211 		/*
212 		 * We're guaranteed to have an ID.
213 		 */
214 		id = *((dtrace_epid_t *)((uintptr_t)buf->dtbd_data +
215 		    (uintptr_t)offs));
216 
217 		if (id == DTRACE_AGGIDNONE) {
218 			/*
219 			 * This is filler to assure proper alignment of the
220 			 * next record; we simply ignore it.
221 			 */
222 			offs += sizeof (id);
223 			continue;
224 		}
225 
226 		if ((rval = dt_aggid_lookup(dtp, id, &agg)) != 0)
227 			return (rval);
228 
229 		addr = buf->dtbd_data + offs;
230 		size = agg->dtagd_size;
231 		hashval = 0;
232 
233 		for (j = 0; j < agg->dtagd_nrecs - 1; j++) {
234 			rec = &agg->dtagd_rec[j];
235 			roffs = rec->dtrd_offset;
236 
237 			for (i = 0; i < rec->dtrd_size; i++)
238 				hashval += addr[roffs + i];
239 		}
240 
241 		ndx = hashval % hash->dtah_size;
242 
243 		for (h = hash->dtah_hash[ndx]; h != NULL; h = h->dtahe_next) {
244 			if (h->dtahe_hashval != hashval)
245 				continue;
246 
247 			if (h->dtahe_size != size)
248 				continue;
249 
250 			aggdata = &h->dtahe_data;
251 			data = aggdata->dtada_data;
252 
253 			for (j = 0; j < agg->dtagd_nrecs - 1; j++) {
254 				rec = &agg->dtagd_rec[j];
255 				roffs = rec->dtrd_offset;
256 
257 				for (i = 0; i < rec->dtrd_size; i++)
258 					if (addr[roffs + i] != data[roffs + i])
259 						goto hashnext;
260 			}
261 
262 			/*
263 			 * We found it.  Now we need to apply the aggregating
264 			 * action on the data here.
265 			 */
266 			rec = &agg->dtagd_rec[agg->dtagd_nrecs - 1];
267 			roffs = rec->dtrd_offset;
268 			/* LINTED - alignment */
269 			h->dtahe_aggregate((uint64_t *)&data[roffs],
270 			    /* LINTED - alignment */
271 			    (uint64_t *)&addr[roffs], rec->dtrd_size);
272 
273 			/*
274 			 * If we're keeping per CPU data, apply the aggregating
275 			 * action there as well.
276 			 */
277 			if (aggdata->dtada_percpu != NULL) {
278 				data = aggdata->dtada_percpu[cpu];
279 
280 				/* LINTED - alignment */
281 				h->dtahe_aggregate((uint64_t *)data,
282 				    /* LINTED - alignment */
283 				    (uint64_t *)&addr[roffs], rec->dtrd_size);
284 			}
285 
286 			goto bufnext;
287 hashnext:
288 			continue;
289 		}
290 
291 		/*
292 		 * If we're here, we couldn't find an entry for this record.
293 		 */
294 		if ((h = malloc(sizeof (dt_ahashent_t))) == NULL)
295 			return (dt_set_errno(dtp, EDT_NOMEM));
296 		bzero(h, sizeof (dt_ahashent_t));
297 		aggdata = &h->dtahe_data;
298 
299 		if ((aggdata->dtada_data = malloc(size)) == NULL) {
300 			free(h);
301 			return (dt_set_errno(dtp, EDT_NOMEM));
302 		}
303 
304 		bcopy(addr, aggdata->dtada_data, size);
305 		aggdata->dtada_size = size;
306 		aggdata->dtada_desc = agg;
307 		aggdata->dtada_handle = dtp;
308 		(void) dt_epid_lookup(dtp, agg->dtagd_epid,
309 		    &aggdata->dtada_edesc, &aggdata->dtada_pdesc);
310 		aggdata->dtada_normal = 1;
311 
312 		h->dtahe_hashval = hashval;
313 		h->dtahe_size = size;
314 
315 		rec = &agg->dtagd_rec[agg->dtagd_nrecs - 1];
316 
317 		if (flags & DTRACE_A_PERCPU) {
318 			int max_cpus = agp->dtat_maxcpu;
319 			caddr_t *percpu = malloc(max_cpus * sizeof (caddr_t));
320 
321 			if (percpu == NULL) {
322 				free(aggdata->dtada_data);
323 				free(h);
324 				return (dt_set_errno(dtp, EDT_NOMEM));
325 			}
326 
327 			for (j = 0; j < max_cpus; j++) {
328 				percpu[j] = malloc(rec->dtrd_size);
329 
330 				if (percpu[j] == NULL) {
331 					while (--j >= 0)
332 						free(percpu[j]);
333 
334 					free(aggdata->dtada_data);
335 					free(h);
336 					return (dt_set_errno(dtp, EDT_NOMEM));
337 				}
338 
339 				if (j == cpu) {
340 					bcopy(&addr[rec->dtrd_offset],
341 					    percpu[j], rec->dtrd_size);
342 				} else {
343 					bzero(percpu[j], rec->dtrd_size);
344 				}
345 			}
346 
347 			aggdata->dtada_percpu = percpu;
348 		}
349 
350 		switch (rec->dtrd_action) {
351 		case DTRACEAGG_MIN:
352 			h->dtahe_aggregate = dt_aggregate_min;
353 			break;
354 
355 		case DTRACEAGG_MAX:
356 			h->dtahe_aggregate = dt_aggregate_max;
357 			break;
358 
359 		case DTRACEAGG_LQUANTIZE:
360 			h->dtahe_aggregate = dt_aggregate_lquantize;
361 			break;
362 
363 		case DTRACEAGG_COUNT:
364 		case DTRACEAGG_SUM:
365 		case DTRACEAGG_AVG:
366 		case DTRACEAGG_QUANTIZE:
367 			h->dtahe_aggregate = dt_aggregate_count;
368 			break;
369 
370 		default:
371 			return (dt_set_errno(dtp, EDT_BADAGG));
372 		}
373 
374 		if (hash->dtah_hash[ndx] != NULL)
375 			hash->dtah_hash[ndx]->dtahe_prev = h;
376 
377 		h->dtahe_next = hash->dtah_hash[ndx];
378 		hash->dtah_hash[ndx] = h;
379 
380 		if (hash->dtah_all != NULL)
381 			hash->dtah_all->dtahe_prevall = h;
382 
383 		h->dtahe_nextall = hash->dtah_all;
384 		hash->dtah_all = h;
385 bufnext:
386 		offs += agg->dtagd_size;
387 	}
388 
389 	return (0);
390 }
391 
392 int
393 dtrace_aggregate_snap(dtrace_hdl_t *dtp)
394 {
395 	int i, rval;
396 	dt_aggregate_t *agp = &dtp->dt_aggregate;
397 	hrtime_t now = gethrtime();
398 	dtrace_optval_t interval = dtp->dt_options[DTRACEOPT_AGGRATE];
399 
400 	if (dtp->dt_lastagg != 0) {
401 		if (now - dtp->dt_lastagg < interval)
402 			return (0);
403 
404 		dtp->dt_lastagg += interval;
405 	} else {
406 		dtp->dt_lastagg = now;
407 	}
408 
409 	if (!dtp->dt_active)
410 		return (dt_set_errno(dtp, EINVAL));
411 
412 	if (agp->dtat_buf.dtbd_size == 0)
413 		return (0);
414 
415 	for (i = 0; i < agp->dtat_ncpus; i++) {
416 		if (rval = dt_aggregate_snap_cpu(dtp, agp->dtat_cpus[i]))
417 			return (rval);
418 	}
419 
420 	return (0);
421 }
422 
423 static int
424 dt_aggregate_hashcmp(const void *lhs, const void *rhs)
425 {
426 	dt_ahashent_t *lh = *((dt_ahashent_t **)lhs);
427 	dt_ahashent_t *rh = *((dt_ahashent_t **)rhs);
428 	dtrace_aggdesc_t *lagg = lh->dtahe_data.dtada_desc;
429 	dtrace_aggdesc_t *ragg = rh->dtahe_data.dtada_desc;
430 
431 	if (lagg->dtagd_nrecs < ragg->dtagd_nrecs)
432 		return (-1);
433 
434 	if (lagg->dtagd_nrecs > ragg->dtagd_nrecs)
435 		return (1);
436 
437 	return (0);
438 }
439 
440 static int
441 dt_aggregate_varcmp(const void *lhs, const void *rhs)
442 {
443 	dt_ahashent_t *lh = *((dt_ahashent_t **)lhs);
444 	dt_ahashent_t *rh = *((dt_ahashent_t **)rhs);
445 	dtrace_aggdesc_t *lagg = lh->dtahe_data.dtada_desc;
446 	dtrace_aggdesc_t *ragg = rh->dtahe_data.dtada_desc;
447 	caddr_t ldata = lh->dtahe_data.dtada_data;
448 	caddr_t rdata = rh->dtahe_data.dtada_data;
449 	dtrace_recdesc_t *lrec, *rrec;
450 	uint64_t lid, rid;
451 
452 	/*
453 	 * We know that we have a compiler-generated ID as the first record.
454 	 */
455 	lrec = lagg->dtagd_rec;
456 	rrec = ragg->dtagd_rec;
457 
458 	lid = *((uint64_t *)(uintptr_t)(ldata + lrec->dtrd_offset));
459 	rid = *((uint64_t *)(uintptr_t)(rdata + rrec->dtrd_offset));
460 
461 	if (lid < rid)
462 		return (-1);
463 
464 	if (lid > rid)
465 		return (1);
466 
467 	return (0);
468 }
469 
470 static int
471 dt_aggregate_keycmp(const void *lhs, const void *rhs)
472 {
473 	dt_ahashent_t *lh = *((dt_ahashent_t **)lhs);
474 	dt_ahashent_t *rh = *((dt_ahashent_t **)rhs);
475 	dtrace_aggdesc_t *lagg = lh->dtahe_data.dtada_desc;
476 	dtrace_aggdesc_t *ragg = rh->dtahe_data.dtada_desc;
477 	dtrace_recdesc_t *lrec, *rrec;
478 	char *ldata, *rdata;
479 	int rval, i, j;
480 
481 	if ((rval = dt_aggregate_hashcmp(lhs, rhs)) != 0)
482 		return (rval);
483 
484 	for (i = 1; i < lagg->dtagd_nrecs - 1; i++) {
485 		uint64_t lval, rval;
486 
487 		lrec = &lagg->dtagd_rec[i];
488 		rrec = &ragg->dtagd_rec[i];
489 
490 		ldata = lh->dtahe_data.dtada_data + lrec->dtrd_offset;
491 		rdata = rh->dtahe_data.dtada_data + rrec->dtrd_offset;
492 
493 		if (lrec->dtrd_size < rrec->dtrd_size)
494 			return (-1);
495 
496 		if (lrec->dtrd_size > rrec->dtrd_size)
497 			return (1);
498 
499 		switch (lrec->dtrd_size) {
500 		case sizeof (uint64_t):
501 			/* LINTED - alignment */
502 			lval = *((uint64_t *)ldata);
503 			/* LINTED - alignment */
504 			rval = *((uint64_t *)rdata);
505 			break;
506 
507 		case sizeof (uint32_t):
508 			/* LINTED - alignment */
509 			lval = *((uint32_t *)ldata);
510 			/* LINTED - alignment */
511 			rval = *((uint32_t *)rdata);
512 			break;
513 
514 		case sizeof (uint16_t):
515 			/* LINTED - alignment */
516 			lval = *((uint16_t *)ldata);
517 			/* LINTED - alignment */
518 			rval = *((uint16_t *)rdata);
519 			break;
520 
521 		case sizeof (uint8_t):
522 			lval = *((uint8_t *)ldata);
523 			rval = *((uint8_t *)rdata);
524 			break;
525 
526 		default:
527 			for (j = 0; j < lrec->dtrd_size; j++) {
528 				lval = ((uint8_t *)ldata)[j];
529 				rval = ((uint8_t *)rdata)[j];
530 
531 				if (lval < rval)
532 					return (-1);
533 
534 				if (lval > rval)
535 					return (1);
536 			}
537 
538 			continue;
539 		}
540 
541 		if (lval < rval)
542 			return (-1);
543 
544 		if (lval > rval)
545 			return (1);
546 	}
547 
548 	return (0);
549 }
550 
551 static int
552 dt_aggregate_valcmp(const void *lhs, const void *rhs)
553 {
554 	dt_ahashent_t *lh = *((dt_ahashent_t **)lhs);
555 	dt_ahashent_t *rh = *((dt_ahashent_t **)rhs);
556 	dtrace_aggdesc_t *lagg = lh->dtahe_data.dtada_desc;
557 	dtrace_aggdesc_t *ragg = rh->dtahe_data.dtada_desc;
558 	caddr_t ldata = lh->dtahe_data.dtada_data;
559 	caddr_t rdata = rh->dtahe_data.dtada_data;
560 	dtrace_recdesc_t *lrec, *rrec;
561 	uint64_t *laddr, *raddr;
562 	int rval, i;
563 
564 	if ((rval = dt_aggregate_hashcmp(lhs, rhs)) != 0)
565 		return (rval);
566 
567 	if (lagg->dtagd_nrecs < ragg->dtagd_nrecs)
568 		return (-1);
569 
570 	if (lagg->dtagd_nrecs > ragg->dtagd_nrecs)
571 		return (1);
572 
573 	for (i = 0; i < lagg->dtagd_nrecs; i++) {
574 		lrec = &lagg->dtagd_rec[i];
575 		rrec = &ragg->dtagd_rec[i];
576 
577 		if (lrec->dtrd_offset < rrec->dtrd_offset)
578 			return (-1);
579 
580 		if (lrec->dtrd_offset > rrec->dtrd_offset)
581 			return (1);
582 
583 		if (lrec->dtrd_action < rrec->dtrd_action)
584 			return (-1);
585 
586 		if (lrec->dtrd_action > rrec->dtrd_action)
587 			return (1);
588 	}
589 
590 	laddr = (uint64_t *)(uintptr_t)(ldata + lrec->dtrd_offset);
591 	raddr = (uint64_t *)(uintptr_t)(rdata + rrec->dtrd_offset);
592 
593 	switch (lrec->dtrd_action) {
594 	case DTRACEAGG_AVG:
595 		rval = dt_aggregate_averagecmp(laddr, raddr);
596 		break;
597 
598 	case DTRACEAGG_QUANTIZE:
599 		rval = dt_aggregate_quantizedcmp(laddr, raddr);
600 		break;
601 
602 	case DTRACEAGG_LQUANTIZE:
603 		rval = dt_aggregate_lquantizedcmp(laddr, raddr);
604 		break;
605 
606 	case DTRACEAGG_COUNT:
607 	case DTRACEAGG_SUM:
608 	case DTRACEAGG_MIN:
609 	case DTRACEAGG_MAX:
610 		rval = dt_aggregate_countcmp(laddr, raddr);
611 		break;
612 
613 	default:
614 		assert(0);
615 	}
616 
617 	if (rval != 0)
618 		return (rval);
619 
620 	/*
621 	 * If we're here, the values for the two aggregation elements are
622 	 * equal.  We already know that the key layout is the same for the two
623 	 * elements; we must now compare the keys themselves as a tie-breaker.
624 	 */
625 	return (dt_aggregate_keycmp(lhs, rhs));
626 }
627 
/*
 * Compound comparison:  order by key, breaking ties by aggregation
 * variable.
 */
static int
dt_aggregate_keyvarcmp(const void *lhs, const void *rhs)
{
	int cmp = dt_aggregate_keycmp(lhs, rhs);

	if (cmp == 0)
		cmp = dt_aggregate_varcmp(lhs, rhs);

	return (cmp);
}
638 
/*
 * Compound comparison:  order by aggregation variable, breaking ties by
 * key.
 */
static int
dt_aggregate_varkeycmp(const void *lhs, const void *rhs)
{
	int cmp = dt_aggregate_varcmp(lhs, rhs);

	if (cmp == 0)
		cmp = dt_aggregate_keycmp(lhs, rhs);

	return (cmp);
}
649 
/*
 * Compound comparison:  order by aggregated value, breaking ties by
 * aggregation variable.
 */
static int
dt_aggregate_valvarcmp(const void *lhs, const void *rhs)
{
	int cmp = dt_aggregate_valcmp(lhs, rhs);

	if (cmp == 0)
		cmp = dt_aggregate_varcmp(lhs, rhs);

	return (cmp);
}
660 
/*
 * Compound comparison:  order by aggregation variable, breaking ties by
 * aggregated value.
 */
static int
dt_aggregate_varvalcmp(const void *lhs, const void *rhs)
{
	int cmp = dt_aggregate_varcmp(lhs, rhs);

	if (cmp == 0)
		cmp = dt_aggregate_valcmp(lhs, rhs);

	return (cmp);
}
671 
/*
 * Reverse of dt_aggregate_keyvarcmp():  the same comparison with its
 * operands exchanged.
 */
static int
dt_aggregate_keyvarrevcmp(const void *lhs, const void *rhs)
{
	const void *first = rhs;
	const void *second = lhs;

	return (dt_aggregate_keyvarcmp(first, second));
}
677 
/*
 * Reverse of dt_aggregate_varkeycmp():  the same comparison with its
 * operands exchanged.
 */
static int
dt_aggregate_varkeyrevcmp(const void *lhs, const void *rhs)
{
	const void *first = rhs;
	const void *second = lhs;

	return (dt_aggregate_varkeycmp(first, second));
}
683 
/*
 * Reverse of dt_aggregate_valvarcmp():  the same comparison with its
 * operands exchanged.
 */
static int
dt_aggregate_valvarrevcmp(const void *lhs, const void *rhs)
{
	const void *first = rhs;
	const void *second = lhs;

	return (dt_aggregate_valvarcmp(first, second));
}
689 
/*
 * Reverse of dt_aggregate_varvalcmp():  the same comparison with its
 * operands exchanged.
 */
static int
dt_aggregate_varvalrevcmp(const void *lhs, const void *rhs)
{
	const void *first = rhs;
	const void *second = lhs;

	return (dt_aggregate_varvalcmp(first, second));
}
695 
696 int
697 dt_aggregate_go(dtrace_hdl_t *dtp)
698 {
699 	dt_aggregate_t *agp = &dtp->dt_aggregate;
700 	dtrace_optval_t size, cpu;
701 	dtrace_bufdesc_t *buf = &agp->dtat_buf;
702 	int rval, i;
703 
704 	assert(agp->dtat_maxcpu == 0);
705 	assert(agp->dtat_ncpu == 0);
706 	assert(agp->dtat_cpus == NULL);
707 
708 	agp->dtat_maxcpu = dt_sysconf(dtp, _SC_CPUID_MAX) + 1;
709 	agp->dtat_ncpu = dt_sysconf(dtp, _SC_NPROCESSORS_MAX);
710 	agp->dtat_cpus = malloc(agp->dtat_ncpu * sizeof (processorid_t));
711 
712 	if (agp->dtat_cpus == NULL)
713 		return (dt_set_errno(dtp, EDT_NOMEM));
714 
715 	/*
716 	 * Use the aggregation buffer size as reloaded from the kernel.
717 	 */
718 	size = dtp->dt_options[DTRACEOPT_AGGSIZE];
719 
720 	rval = dtrace_getopt(dtp, "aggsize", &size);
721 	assert(rval == 0);
722 
723 	if (size == 0 || size == DTRACEOPT_UNSET)
724 		return (0);
725 
726 	buf = &agp->dtat_buf;
727 	buf->dtbd_size = size;
728 
729 	if ((buf->dtbd_data = malloc(buf->dtbd_size)) == NULL)
730 		return (dt_set_errno(dtp, EDT_NOMEM));
731 
732 	/*
733 	 * Now query for the CPUs enabled.
734 	 */
735 	rval = dtrace_getopt(dtp, "cpu", &cpu);
736 	assert(rval == 0 && cpu != DTRACEOPT_UNSET);
737 
738 	if (cpu != DTRACE_CPUALL) {
739 		assert(cpu < agp->dtat_ncpu);
740 		agp->dtat_cpus[agp->dtat_ncpus++] = (processorid_t)cpu;
741 
742 		return (0);
743 	}
744 
745 	agp->dtat_ncpus = 0;
746 	for (i = 0; i < agp->dtat_maxcpu; i++) {
747 		if (dt_status(dtp, i) == -1)
748 			continue;
749 
750 		agp->dtat_cpus[agp->dtat_ncpus++] = i;
751 	}
752 
753 	return (0);
754 }
755 
756 static int
757 dt_aggwalk_rval(dtrace_hdl_t *dtp, dt_ahashent_t *h, int rval)
758 {
759 	dt_aggregate_t *agp = &dtp->dt_aggregate;
760 	dtrace_aggdata_t *data;
761 	dtrace_aggdesc_t *aggdesc;
762 	dtrace_recdesc_t *rec;
763 	int i;
764 
765 	switch (rval) {
766 	case DTRACE_AGGWALK_NEXT:
767 		break;
768 
769 	case DTRACE_AGGWALK_CLEAR: {
770 		uint32_t size, offs = 0;
771 
772 		aggdesc = h->dtahe_data.dtada_desc;
773 		rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
774 		size = rec->dtrd_size;
775 		data = &h->dtahe_data;
776 
777 		if (rec->dtrd_action == DTRACEAGG_LQUANTIZE) {
778 			offs = sizeof (uint64_t);
779 			size -= sizeof (uint64_t);
780 		}
781 
782 		bzero(&data->dtada_data[rec->dtrd_offset] + offs, size);
783 
784 		if (data->dtada_percpu == NULL)
785 			break;
786 
787 		for (i = 0; i < dtp->dt_aggregate.dtat_maxcpu; i++)
788 			bzero(data->dtada_percpu[i] + offs, size);
789 		break;
790 	}
791 
792 	case DTRACE_AGGWALK_ERROR:
793 		/*
794 		 * We assume that errno is already set in this case.
795 		 */
796 		return (dt_set_errno(dtp, errno));
797 
798 	case DTRACE_AGGWALK_ABORT:
799 		return (dt_set_errno(dtp, EDT_DIRABORT));
800 
801 	case DTRACE_AGGWALK_DENORMALIZE:
802 		h->dtahe_data.dtada_normal = 1;
803 		return (0);
804 
805 	case DTRACE_AGGWALK_NORMALIZE:
806 		if (h->dtahe_data.dtada_normal == 0) {
807 			h->dtahe_data.dtada_normal = 1;
808 			return (dt_set_errno(dtp, EDT_BADRVAL));
809 		}
810 
811 		return (0);
812 
813 	case DTRACE_AGGWALK_REMOVE: {
814 		dtrace_aggdata_t *aggdata = &h->dtahe_data;
815 		int i, max_cpus = agp->dtat_maxcpu;
816 
817 		/*
818 		 * First, remove this hash entry from its hash chain.
819 		 */
820 		if (h->dtahe_prev != NULL) {
821 			h->dtahe_prev->dtahe_next = h->dtahe_next;
822 		} else {
823 			dt_ahash_t *hash = &agp->dtat_hash;
824 			size_t ndx = h->dtahe_hashval % hash->dtah_size;
825 
826 			assert(hash->dtah_hash[ndx] == h);
827 			hash->dtah_hash[ndx] = h->dtahe_next;
828 		}
829 
830 		if (h->dtahe_next != NULL)
831 			h->dtahe_next->dtahe_prev = h->dtahe_prev;
832 
833 		/*
834 		 * Now remove it from the list of all hash entries.
835 		 */
836 		if (h->dtahe_prevall != NULL) {
837 			h->dtahe_prevall->dtahe_nextall = h->dtahe_nextall;
838 		} else {
839 			dt_ahash_t *hash = &agp->dtat_hash;
840 
841 			assert(hash->dtah_all == h);
842 			hash->dtah_all = h->dtahe_nextall;
843 		}
844 
845 		if (h->dtahe_nextall != NULL)
846 			h->dtahe_nextall->dtahe_prevall = h->dtahe_prevall;
847 
848 		/*
849 		 * We're unlinked.  We can safely destroy the data.
850 		 */
851 		if (aggdata->dtada_percpu != NULL) {
852 			for (i = 0; i < max_cpus; i++)
853 				free(aggdata->dtada_percpu[i]);
854 			free(aggdata->dtada_percpu);
855 		}
856 
857 		free(aggdata->dtada_data);
858 		free(h);
859 
860 		return (0);
861 	}
862 
863 	default:
864 		return (dt_set_errno(dtp, EDT_BADRVAL));
865 	}
866 
867 	return (0);
868 }
869 
870 int
871 dtrace_aggregate_walk(dtrace_hdl_t *dtp, dtrace_aggregate_f *func, void *arg)
872 {
873 	dt_ahashent_t *h, *next;
874 	dt_ahash_t *hash = &dtp->dt_aggregate.dtat_hash;
875 
876 	for (h = hash->dtah_all; h != NULL; h = next) {
877 		/*
878 		 * dt_aggwalk_rval() can potentially remove the current hash
879 		 * entry; we need to load the next hash entry before calling
880 		 * into it.
881 		 */
882 		next = h->dtahe_nextall;
883 
884 		if (dt_aggwalk_rval(dtp, h, func(&h->dtahe_data, arg)) == -1)
885 			return (-1);
886 	}
887 
888 	return (0);
889 }
890 
891 static int
892 dt_aggregate_walk_sorted(dtrace_hdl_t *dtp,
893     dtrace_aggregate_f *func, void *arg,
894     int (*sfunc)(const void *, const void *))
895 {
896 	dt_aggregate_t *agp = &dtp->dt_aggregate;
897 	dt_ahashent_t *h, **sorted;
898 	dt_ahash_t *hash = &agp->dtat_hash;
899 	size_t i, nentries = 0;
900 
901 	for (h = hash->dtah_all; h != NULL; h = h->dtahe_nextall)
902 		nentries++;
903 
904 	sorted = malloc(nentries * sizeof (dt_ahashent_t *));
905 
906 	if (sorted == NULL)
907 		return (dt_set_errno(dtp, EDT_NOMEM));
908 
909 	for (h = hash->dtah_all, i = 0; h != NULL; h = h->dtahe_nextall)
910 		sorted[i++] = h;
911 
912 	qsort(sorted, nentries, sizeof (dt_ahashent_t *), sfunc);
913 
914 	for (i = 0; i < nentries; i++) {
915 		h = sorted[i];
916 
917 		if (dt_aggwalk_rval(dtp, h, func(&h->dtahe_data, arg)) == -1)
918 			return (-1);
919 	}
920 
921 	free(sorted);
922 	return (0);
923 }
924 
925 int
926 dtrace_aggregate_walk_keysorted(dtrace_hdl_t *dtp,
927     dtrace_aggregate_f *func, void *arg)
928 {
929 	return (dt_aggregate_walk_sorted(dtp, func,
930 	    arg, dt_aggregate_varkeycmp));
931 }
932 
933 int
934 dtrace_aggregate_walk_valsorted(dtrace_hdl_t *dtp,
935     dtrace_aggregate_f *func, void *arg)
936 {
937 	return (dt_aggregate_walk_sorted(dtp, func,
938 	    arg, dt_aggregate_varvalcmp));
939 }
940 
941 int
942 dtrace_aggregate_walk_keyvarsorted(dtrace_hdl_t *dtp,
943     dtrace_aggregate_f *func, void *arg)
944 {
945 	return (dt_aggregate_walk_sorted(dtp, func,
946 	    arg, dt_aggregate_keyvarcmp));
947 }
948 
949 int
950 dtrace_aggregate_walk_valvarsorted(dtrace_hdl_t *dtp,
951     dtrace_aggregate_f *func, void *arg)
952 {
953 	return (dt_aggregate_walk_sorted(dtp, func,
954 	    arg, dt_aggregate_valvarcmp));
955 }
956 
957 int
958 dtrace_aggregate_walk_keyrevsorted(dtrace_hdl_t *dtp,
959     dtrace_aggregate_f *func, void *arg)
960 {
961 	return (dt_aggregate_walk_sorted(dtp, func,
962 	    arg, dt_aggregate_varkeyrevcmp));
963 }
964 
965 int
966 dtrace_aggregate_walk_valrevsorted(dtrace_hdl_t *dtp,
967     dtrace_aggregate_f *func, void *arg)
968 {
969 	return (dt_aggregate_walk_sorted(dtp, func,
970 	    arg, dt_aggregate_varvalrevcmp));
971 }
972 
973 int
974 dtrace_aggregate_walk_keyvarrevsorted(dtrace_hdl_t *dtp,
975     dtrace_aggregate_f *func, void *arg)
976 {
977 	return (dt_aggregate_walk_sorted(dtp, func,
978 	    arg, dt_aggregate_keyvarrevcmp));
979 }
980 
981 int
982 dtrace_aggregate_walk_valvarrevsorted(dtrace_hdl_t *dtp,
983     dtrace_aggregate_f *func, void *arg)
984 {
985 	return (dt_aggregate_walk_sorted(dtp, func,
986 	    arg, dt_aggregate_valvarrevcmp));
987 }
988 
989 int
990 dtrace_aggregate_print(dtrace_hdl_t *dtp, FILE *fp,
991     dtrace_aggregate_walk_f *func)
992 {
993 	dt_print_aggdata_t pd;
994 
995 	pd.dtpa_dtp = dtp;
996 	pd.dtpa_fp = fp;
997 	pd.dtpa_allunprint = 1;
998 
999 	if (func == NULL)
1000 		func = dtrace_aggregate_walk_valsorted;
1001 
1002 	if ((*func)(dtp, dt_print_agg, &pd) == -1)
1003 		return (dt_set_errno(dtp, dtp->dt_errno));
1004 
1005 	return (0);
1006 }
1007 
1008 void
1009 dtrace_aggregate_clear(dtrace_hdl_t *dtp)
1010 {
1011 	dt_aggregate_t *agp = &dtp->dt_aggregate;
1012 	dt_ahash_t *hash = &agp->dtat_hash;
1013 	dt_ahashent_t *h;
1014 	dtrace_aggdata_t *data;
1015 	dtrace_aggdesc_t *aggdesc;
1016 	dtrace_recdesc_t *rec;
1017 	int i, max_cpus = agp->dtat_maxcpu;
1018 
1019 	for (h = hash->dtah_all; h != NULL; h = h->dtahe_nextall) {
1020 		aggdesc = h->dtahe_data.dtada_desc;
1021 		rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
1022 		data = &h->dtahe_data;
1023 
1024 		bzero(&data->dtada_data[rec->dtrd_offset], rec->dtrd_size);
1025 
1026 		if (data->dtada_percpu == NULL)
1027 			continue;
1028 
1029 		for (i = 0; i < max_cpus; i++)
1030 			bzero(data->dtada_percpu[i], rec->dtrd_size);
1031 	}
1032 }
1033 
1034 void
1035 dt_aggregate_destroy(dtrace_hdl_t *dtp)
1036 {
1037 	dt_aggregate_t *agp = &dtp->dt_aggregate;
1038 	dt_ahash_t *hash = &agp->dtat_hash;
1039 	dt_ahashent_t *h, *next;
1040 	dtrace_aggdata_t *aggdata;
1041 	int i, max_cpus = agp->dtat_maxcpu;
1042 
1043 	if (hash->dtah_hash == NULL) {
1044 		assert(hash->dtah_all == NULL);
1045 	} else {
1046 		free(hash->dtah_hash);
1047 
1048 		for (h = hash->dtah_all; h != NULL; h = next) {
1049 			next = h->dtahe_nextall;
1050 
1051 			aggdata = &h->dtahe_data;
1052 
1053 			if (aggdata->dtada_percpu != NULL) {
1054 				for (i = 0; i < max_cpus; i++)
1055 					free(aggdata->dtada_percpu[i]);
1056 				free(aggdata->dtada_percpu);
1057 			}
1058 
1059 			free(aggdata->dtada_data);
1060 			free(h);
1061 		}
1062 
1063 		hash->dtah_hash = NULL;
1064 		hash->dtah_all = NULL;
1065 		hash->dtah_size = 0;
1066 	}
1067 
1068 	free(agp->dtat_buf.dtbd_data);
1069 	free(agp->dtat_cpus);
1070 }
1071