xref: /illumos-gate/usr/src/lib/libdtrace/common/dt_aggregate.c (revision 88f8b78a88cbdc6d8c1af5c3e54bc49d25095c98)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <stdlib.h>
30 #include <strings.h>
31 #include <errno.h>
32 #include <unistd.h>
33 #include <dt_impl.h>
34 #include <assert.h>
35 
36 #define	DTRACE_AHASHSIZE	32779		/* big 'ol prime */
37 
/*
 * Element-wise accumulating action: the value is treated as an array of
 * 64-bit integers occupying 'size' bytes, and every element of 'new' is added
 * to the corresponding element of 'existing'.  dt_aggregate_snap_cpu()
 * installs this for count(), sum(), avg() and quantize().
 *
 * The index is a size_t so that it is compared against the (unsigned) element
 * count without a signed/unsigned conversion.
 */
static void
dt_aggregate_count(int64_t *existing, int64_t *new, size_t size)
{
	size_t nelems = size / sizeof (int64_t);
	size_t i;

	for (i = 0; i < nelems; i++)
		existing[i] += new[i];
}
46 
/*
 * Three-way comparison of two single 64-bit values: returns 1 if *lhs is
 * greater than *rhs, -1 if it is less, and 0 if they are equal.
 */
static int
dt_aggregate_countcmp(int64_t *lhs, int64_t *rhs)
{
	int64_t l = lhs[0], r = rhs[0];

	return ((l > r) - (l < r));
}
61 
/*
 * Aggregating action for min(): keep the smaller of the stored value and the
 * incoming one.  'size' is unused.
 */
/*ARGSUSED*/
static void
dt_aggregate_min(int64_t *existing, int64_t *new, size_t size)
{
	int64_t candidate = *new;

	if (*existing > candidate)
		*existing = candidate;
}
69 
/*
 * Aggregating action for max(): keep the larger of the stored value and the
 * incoming one.  'size' is unused.
 */
/*ARGSUSED*/
static void
dt_aggregate_max(int64_t *existing, int64_t *new, size_t size)
{
	int64_t candidate = *new;

	if (*existing < candidate)
		*existing = candidate;
}
77 
/*
 * Three-way comparison of two avg() values.  Each value is a pair of 64-bit
 * integers; element [1] is divided by element [0] (integer division), and a
 * zero element [0] yields an average of 0.  Returns 1, -1 or 0.
 */
static int
dt_aggregate_averagecmp(int64_t *lhs, int64_t *rhs)
{
	int64_t lavg = 0, ravg = 0;

	if (lhs[0] != 0)
		lavg = lhs[1] / lhs[0];

	if (rhs[0] != 0)
		ravg = rhs[1] / rhs[0];

	if (lavg == ravg)
		return (0);

	return (lavg > ravg ? 1 : -1);
}
92 
/*
 * Aggregating action for lquantize().  The first 64-bit word of the existing
 * value is the lquantize() argument, from which the number of levels is
 * decoded; it is left untouched.  The levels + 2 words that follow it are
 * accumulated from the corresponding words of 'new'.  'size' is unused.
 */
/*ARGSUSED*/
static void
dt_aggregate_lquantize(int64_t *existing, int64_t *new, size_t size)
{
	int64_t arg = existing[0];
	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
	int64_t *dst = &existing[1];
	int64_t *src = &new[1];
	int nbuckets = levels + 2;
	int b;

	for (b = 0; b < nbuckets; b++)
		dst[b] += src[b];
}
104 
/*
 * Compute a weighted sum over an lquantize() value.  The first word is the
 * lquantize() argument (base, step and levels); the buckets follow it.  The
 * first bucket is weighted by base - 1, each of the next 'levels' buckets by
 * the running base (advanced by 'step' per bucket), and the final bucket by
 * one more than the base reached after the last level.
 */
static long double
dt_aggregate_lquantizedsum(int64_t *lquanta)
{
	int64_t arg = lquanta[0];
	int64_t *bucket = &lquanta[1];
	int32_t base = DTRACE_LQUANTIZE_BASE(arg);
	uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
	uint16_t n = 0;
	long double total;

	total = (long double)bucket[0] * (long double)(base - 1);

	while (n < levels) {
		total += (long double)bucket[n + 1] * (long double)base;
		base += step;
		n++;
	}

	total += (long double)bucket[levels + 1] * (long double)(base + 1);

	return (total);
}
120 
/*
 * Return the contents of the lquantize() bucket whose weight (as used by
 * dt_aggregate_lquantizedsum()) is zero, or 0 if no bucket has that weight.
 * The first bucket is checked against base - 1, the level buckets against
 * the running base, and the final bucket against the final base + 1.
 */
static int64_t
dt_aggregate_lquantizedzero(int64_t *lquanta)
{
	int64_t arg = lquanta[0];
	int64_t *bucket = &lquanta[1];
	int32_t base = DTRACE_LQUANTIZE_BASE(arg);
	uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
	uint16_t n;

	if (base - 1 == 0)
		return (bucket[0]);

	for (n = 0; n < levels; n++) {
		if (base == 0)
			return (bucket[n + 1]);

		base += step;
	}

	if (base + 1 == 0)
		return (bucket[levels + 1]);

	return (0);
}
144 
/*
 * Three-way comparison of two lquantize() values.  The weighted sums are
 * compared first.  If those are equal, the weights recorded at zero decide;
 * if those are equal too (or zero is outside the range of the linear
 * quantization), 0 is returned and the caller resolves the tie by key.
 */
static int
dt_aggregate_lquantizedcmp(int64_t *lhs, int64_t *rhs)
{
	long double lsum, rsum;
	int64_t lzero, rzero;

	lsum = dt_aggregate_lquantizedsum(lhs);
	rsum = dt_aggregate_lquantizedsum(rhs);

	if (lsum > rsum)
		return (1);

	if (lsum < rsum)
		return (-1);

	lzero = dt_aggregate_lquantizedzero(lhs);
	rzero = dt_aggregate_lquantizedzero(rhs);

	return ((lzero > rzero) - (lzero < rzero));
}
175 
176 static int
177 dt_aggregate_quantizedcmp(int64_t *lhs, int64_t *rhs)
178 {
179 	int nbuckets = DTRACE_QUANTIZE_NBUCKETS, i;
180 	long double ltotal = 0, rtotal = 0;
181 	int64_t lzero, rzero;
182 
183 	for (i = 0; i < nbuckets; i++) {
184 		int64_t bucketval = DTRACE_QUANTIZE_BUCKETVAL(i);
185 
186 		if (bucketval == 0) {
187 			lzero = lhs[i];
188 			rzero = rhs[i];
189 		}
190 
191 		ltotal += (long double)bucketval * (long double)lhs[i];
192 		rtotal += (long double)bucketval * (long double)rhs[i];
193 	}
194 
195 	if (ltotal > rtotal)
196 		return (1);
197 
198 	if (ltotal < rtotal)
199 		return (-1);
200 
201 	/*
202 	 * If they're both equal, then we will compare based on the weights at
203 	 * zero.  If the weights at zero are equal, then this will be judged a
204 	 * tie and will be resolved based on the key comparison.
205 	 */
206 	if (lzero > rzero)
207 		return (1);
208 
209 	if (lzero < rzero)
210 		return (-1);
211 
212 	return (0);
213 }
214 
215 static void
216 dt_aggregate_usym(dtrace_hdl_t *dtp, uint64_t *data)
217 {
218 	uint64_t pid = data[0];
219 	uint64_t *pc = &data[1];
220 	struct ps_prochandle *P;
221 	GElf_Sym sym;
222 
223 	if (dtp->dt_vector != NULL)
224 		return;
225 
226 	if ((P = dt_proc_grab(dtp, pid, PGRAB_RDONLY | PGRAB_FORCE, 0)) == NULL)
227 		return;
228 
229 	dt_proc_lock(dtp, P);
230 
231 	if (Plookup_by_addr(P, *pc, NULL, 0, &sym) == 0)
232 		*pc = sym.st_value;
233 
234 	dt_proc_unlock(dtp, P);
235 	dt_proc_release(dtp, P);
236 }
237 
238 static void
239 dt_aggregate_umod(dtrace_hdl_t *dtp, uint64_t *data)
240 {
241 	uint64_t pid = data[0];
242 	uint64_t *pc = &data[1];
243 	struct ps_prochandle *P;
244 	const prmap_t *map;
245 
246 	if (dtp->dt_vector != NULL)
247 		return;
248 
249 	if ((P = dt_proc_grab(dtp, pid, PGRAB_RDONLY | PGRAB_FORCE, 0)) == NULL)
250 		return;
251 
252 	dt_proc_lock(dtp, P);
253 
254 	if ((map = Paddr_to_map(P, *pc)) != NULL)
255 		*pc = map->pr_vaddr;
256 
257 	dt_proc_unlock(dtp, P);
258 	dt_proc_release(dtp, P);
259 }
260 
261 static void
262 dt_aggregate_sym(dtrace_hdl_t *dtp, uint64_t *data)
263 {
264 	GElf_Sym sym;
265 	uint64_t *pc = data;
266 
267 	if (dtrace_lookup_by_addr(dtp, *pc, &sym, NULL) == 0)
268 		*pc = sym.st_value;
269 }
270 
271 static void
272 dt_aggregate_mod(dtrace_hdl_t *dtp, uint64_t *data)
273 {
274 	uint64_t *pc = data;
275 	dt_module_t *dmp;
276 
277 	if (dtp->dt_vector != NULL) {
278 		/*
279 		 * We don't have a way of just getting the module for a
280 		 * vectored open, and it doesn't seem to be worth defining
281 		 * one.  This means that use of mod() won't get true
282 		 * aggregation in the postmortem case (some modules may
283 		 * appear more than once in aggregation output).  It seems
284 		 * unlikely that anyone will ever notice or care...
285 		 */
286 		return;
287 	}
288 
289 	for (dmp = dt_list_next(&dtp->dt_modlist); dmp != NULL;
290 	    dmp = dt_list_next(dmp)) {
291 		if (*pc - dmp->dm_text_va < dmp->dm_text_size) {
292 			*pc = dmp->dm_text_va;
293 			return;
294 		}
295 	}
296 }
297 
298 static int
299 dt_aggregate_snap_cpu(dtrace_hdl_t *dtp, processorid_t cpu)
300 {
301 	dtrace_epid_t id;
302 	uint64_t hashval;
303 	size_t offs, roffs, size, ndx;
304 	int i, j, rval;
305 	caddr_t addr, data;
306 	dtrace_recdesc_t *rec;
307 	dt_aggregate_t *agp = &dtp->dt_aggregate;
308 	dtrace_aggdesc_t *agg;
309 	dt_ahash_t *hash = &agp->dtat_hash;
310 	dt_ahashent_t *h;
311 	dtrace_bufdesc_t b = agp->dtat_buf, *buf = &b;
312 	dtrace_aggdata_t *aggdata;
313 	int flags = agp->dtat_flags;
314 
315 	buf->dtbd_cpu = cpu;
316 
317 	if (dt_ioctl(dtp, DTRACEIOC_AGGSNAP, buf) == -1) {
318 		if (errno == ENOENT) {
319 			/*
320 			 * If that failed with ENOENT, it may be because the
321 			 * CPU was unconfigured.  This is okay; we'll just
322 			 * do nothing but return success.
323 			 */
324 			return (0);
325 		}
326 
327 		return (dt_set_errno(dtp, errno));
328 	}
329 
330 	if (buf->dtbd_drops != 0) {
331 		if (dt_handle_cpudrop(dtp, cpu,
332 		    DTRACEDROP_AGGREGATION, buf->dtbd_drops) == -1)
333 			return (-1);
334 	}
335 
336 	if (buf->dtbd_size == 0)
337 		return (0);
338 
339 	if (hash->dtah_hash == NULL) {
340 		size_t size;
341 
342 		hash->dtah_size = DTRACE_AHASHSIZE;
343 		size = hash->dtah_size * sizeof (dt_ahashent_t *);
344 
345 		if ((hash->dtah_hash = malloc(size)) == NULL)
346 			return (dt_set_errno(dtp, EDT_NOMEM));
347 
348 		bzero(hash->dtah_hash, size);
349 	}
350 
351 	for (offs = 0; offs < buf->dtbd_size; ) {
352 		/*
353 		 * We're guaranteed to have an ID.
354 		 */
355 		id = *((dtrace_epid_t *)((uintptr_t)buf->dtbd_data +
356 		    (uintptr_t)offs));
357 
358 		if (id == DTRACE_AGGIDNONE) {
359 			/*
360 			 * This is filler to assure proper alignment of the
361 			 * next record; we simply ignore it.
362 			 */
363 			offs += sizeof (id);
364 			continue;
365 		}
366 
367 		if ((rval = dt_aggid_lookup(dtp, id, &agg)) != 0)
368 			return (rval);
369 
370 		addr = buf->dtbd_data + offs;
371 		size = agg->dtagd_size;
372 		hashval = 0;
373 
374 		for (j = 0; j < agg->dtagd_nrecs - 1; j++) {
375 			rec = &agg->dtagd_rec[j];
376 			roffs = rec->dtrd_offset;
377 
378 			switch (rec->dtrd_action) {
379 			case DTRACEACT_USYM:
380 				dt_aggregate_usym(dtp,
381 				    /* LINTED - alignment */
382 				    (uint64_t *)&addr[roffs]);
383 				break;
384 
385 			case DTRACEACT_UMOD:
386 				dt_aggregate_umod(dtp,
387 				    /* LINTED - alignment */
388 				    (uint64_t *)&addr[roffs]);
389 				break;
390 
391 			case DTRACEACT_SYM:
392 				/* LINTED - alignment */
393 				dt_aggregate_sym(dtp, (uint64_t *)&addr[roffs]);
394 				break;
395 
396 			case DTRACEACT_MOD:
397 				/* LINTED - alignment */
398 				dt_aggregate_mod(dtp, (uint64_t *)&addr[roffs]);
399 				break;
400 
401 			default:
402 				break;
403 			}
404 
405 			for (i = 0; i < rec->dtrd_size; i++)
406 				hashval += addr[roffs + i];
407 		}
408 
409 		ndx = hashval % hash->dtah_size;
410 
411 		for (h = hash->dtah_hash[ndx]; h != NULL; h = h->dtahe_next) {
412 			if (h->dtahe_hashval != hashval)
413 				continue;
414 
415 			if (h->dtahe_size != size)
416 				continue;
417 
418 			aggdata = &h->dtahe_data;
419 			data = aggdata->dtada_data;
420 
421 			for (j = 0; j < agg->dtagd_nrecs - 1; j++) {
422 				rec = &agg->dtagd_rec[j];
423 				roffs = rec->dtrd_offset;
424 
425 				for (i = 0; i < rec->dtrd_size; i++)
426 					if (addr[roffs + i] != data[roffs + i])
427 						goto hashnext;
428 			}
429 
430 			/*
431 			 * We found it.  Now we need to apply the aggregating
432 			 * action on the data here.
433 			 */
434 			rec = &agg->dtagd_rec[agg->dtagd_nrecs - 1];
435 			roffs = rec->dtrd_offset;
436 			/* LINTED - alignment */
437 			h->dtahe_aggregate((int64_t *)&data[roffs],
438 			    /* LINTED - alignment */
439 			    (int64_t *)&addr[roffs], rec->dtrd_size);
440 
441 			/*
442 			 * If we're keeping per CPU data, apply the aggregating
443 			 * action there as well.
444 			 */
445 			if (aggdata->dtada_percpu != NULL) {
446 				data = aggdata->dtada_percpu[cpu];
447 
448 				/* LINTED - alignment */
449 				h->dtahe_aggregate((int64_t *)data,
450 				    /* LINTED - alignment */
451 				    (int64_t *)&addr[roffs], rec->dtrd_size);
452 			}
453 
454 			goto bufnext;
455 hashnext:
456 			continue;
457 		}
458 
459 		/*
460 		 * If we're here, we couldn't find an entry for this record.
461 		 */
462 		if ((h = malloc(sizeof (dt_ahashent_t))) == NULL)
463 			return (dt_set_errno(dtp, EDT_NOMEM));
464 		bzero(h, sizeof (dt_ahashent_t));
465 		aggdata = &h->dtahe_data;
466 
467 		if ((aggdata->dtada_data = malloc(size)) == NULL) {
468 			free(h);
469 			return (dt_set_errno(dtp, EDT_NOMEM));
470 		}
471 
472 		bcopy(addr, aggdata->dtada_data, size);
473 		aggdata->dtada_size = size;
474 		aggdata->dtada_desc = agg;
475 		aggdata->dtada_handle = dtp;
476 		(void) dt_epid_lookup(dtp, agg->dtagd_epid,
477 		    &aggdata->dtada_edesc, &aggdata->dtada_pdesc);
478 		aggdata->dtada_normal = 1;
479 
480 		h->dtahe_hashval = hashval;
481 		h->dtahe_size = size;
482 
483 		rec = &agg->dtagd_rec[agg->dtagd_nrecs - 1];
484 
485 		if (flags & DTRACE_A_PERCPU) {
486 			int max_cpus = agp->dtat_maxcpu;
487 			caddr_t *percpu = malloc(max_cpus * sizeof (caddr_t));
488 
489 			if (percpu == NULL) {
490 				free(aggdata->dtada_data);
491 				free(h);
492 				return (dt_set_errno(dtp, EDT_NOMEM));
493 			}
494 
495 			for (j = 0; j < max_cpus; j++) {
496 				percpu[j] = malloc(rec->dtrd_size);
497 
498 				if (percpu[j] == NULL) {
499 					while (--j >= 0)
500 						free(percpu[j]);
501 
502 					free(aggdata->dtada_data);
503 					free(h);
504 					return (dt_set_errno(dtp, EDT_NOMEM));
505 				}
506 
507 				if (j == cpu) {
508 					bcopy(&addr[rec->dtrd_offset],
509 					    percpu[j], rec->dtrd_size);
510 				} else {
511 					bzero(percpu[j], rec->dtrd_size);
512 				}
513 			}
514 
515 			aggdata->dtada_percpu = percpu;
516 		}
517 
518 		switch (rec->dtrd_action) {
519 		case DTRACEAGG_MIN:
520 			h->dtahe_aggregate = dt_aggregate_min;
521 			break;
522 
523 		case DTRACEAGG_MAX:
524 			h->dtahe_aggregate = dt_aggregate_max;
525 			break;
526 
527 		case DTRACEAGG_LQUANTIZE:
528 			h->dtahe_aggregate = dt_aggregate_lquantize;
529 			break;
530 
531 		case DTRACEAGG_COUNT:
532 		case DTRACEAGG_SUM:
533 		case DTRACEAGG_AVG:
534 		case DTRACEAGG_QUANTIZE:
535 			h->dtahe_aggregate = dt_aggregate_count;
536 			break;
537 
538 		default:
539 			return (dt_set_errno(dtp, EDT_BADAGG));
540 		}
541 
542 		if (hash->dtah_hash[ndx] != NULL)
543 			hash->dtah_hash[ndx]->dtahe_prev = h;
544 
545 		h->dtahe_next = hash->dtah_hash[ndx];
546 		hash->dtah_hash[ndx] = h;
547 
548 		if (hash->dtah_all != NULL)
549 			hash->dtah_all->dtahe_prevall = h;
550 
551 		h->dtahe_nextall = hash->dtah_all;
552 		hash->dtah_all = h;
553 bufnext:
554 		offs += agg->dtagd_size;
555 	}
556 
557 	return (0);
558 }
559 
560 int
561 dtrace_aggregate_snap(dtrace_hdl_t *dtp)
562 {
563 	int i, rval;
564 	dt_aggregate_t *agp = &dtp->dt_aggregate;
565 	hrtime_t now = gethrtime();
566 	dtrace_optval_t interval = dtp->dt_options[DTRACEOPT_AGGRATE];
567 
568 	if (dtp->dt_lastagg != 0) {
569 		if (now - dtp->dt_lastagg < interval)
570 			return (0);
571 
572 		dtp->dt_lastagg += interval;
573 	} else {
574 		dtp->dt_lastagg = now;
575 	}
576 
577 	if (!dtp->dt_active)
578 		return (dt_set_errno(dtp, EINVAL));
579 
580 	if (agp->dtat_buf.dtbd_size == 0)
581 		return (0);
582 
583 	for (i = 0; i < agp->dtat_ncpus; i++) {
584 		if (rval = dt_aggregate_snap_cpu(dtp, agp->dtat_cpus[i]))
585 			return (rval);
586 	}
587 
588 	return (0);
589 }
590 
591 static int
592 dt_aggregate_hashcmp(const void *lhs, const void *rhs)
593 {
594 	dt_ahashent_t *lh = *((dt_ahashent_t **)lhs);
595 	dt_ahashent_t *rh = *((dt_ahashent_t **)rhs);
596 	dtrace_aggdesc_t *lagg = lh->dtahe_data.dtada_desc;
597 	dtrace_aggdesc_t *ragg = rh->dtahe_data.dtada_desc;
598 
599 	if (lagg->dtagd_nrecs < ragg->dtagd_nrecs)
600 		return (-1);
601 
602 	if (lagg->dtagd_nrecs > ragg->dtagd_nrecs)
603 		return (1);
604 
605 	return (0);
606 }
607 
608 static int
609 dt_aggregate_varcmp(const void *lhs, const void *rhs)
610 {
611 	dt_ahashent_t *lh = *((dt_ahashent_t **)lhs);
612 	dt_ahashent_t *rh = *((dt_ahashent_t **)rhs);
613 	dtrace_aggdesc_t *lagg = lh->dtahe_data.dtada_desc;
614 	dtrace_aggdesc_t *ragg = rh->dtahe_data.dtada_desc;
615 	caddr_t ldata = lh->dtahe_data.dtada_data;
616 	caddr_t rdata = rh->dtahe_data.dtada_data;
617 	dtrace_recdesc_t *lrec, *rrec;
618 	uint64_t lid, rid;
619 
620 	/*
621 	 * We know that we have a compiler-generated ID as the first record.
622 	 */
623 	lrec = lagg->dtagd_rec;
624 	rrec = ragg->dtagd_rec;
625 
626 	lid = *((uint64_t *)(uintptr_t)(ldata + lrec->dtrd_offset));
627 	rid = *((uint64_t *)(uintptr_t)(rdata + rrec->dtrd_offset));
628 
629 	if (lid < rid)
630 		return (-1);
631 
632 	if (lid > rid)
633 		return (1);
634 
635 	return (0);
636 }
637 
638 static int
639 dt_aggregate_keycmp(const void *lhs, const void *rhs)
640 {
641 	dt_ahashent_t *lh = *((dt_ahashent_t **)lhs);
642 	dt_ahashent_t *rh = *((dt_ahashent_t **)rhs);
643 	dtrace_aggdesc_t *lagg = lh->dtahe_data.dtada_desc;
644 	dtrace_aggdesc_t *ragg = rh->dtahe_data.dtada_desc;
645 	dtrace_recdesc_t *lrec, *rrec;
646 	char *ldata, *rdata;
647 	int rval, i, j;
648 
649 	if ((rval = dt_aggregate_hashcmp(lhs, rhs)) != 0)
650 		return (rval);
651 
652 	for (i = 1; i < lagg->dtagd_nrecs - 1; i++) {
653 		uint64_t lval, rval;
654 
655 		lrec = &lagg->dtagd_rec[i];
656 		rrec = &ragg->dtagd_rec[i];
657 
658 		ldata = lh->dtahe_data.dtada_data + lrec->dtrd_offset;
659 		rdata = rh->dtahe_data.dtada_data + rrec->dtrd_offset;
660 
661 		if (lrec->dtrd_size < rrec->dtrd_size)
662 			return (-1);
663 
664 		if (lrec->dtrd_size > rrec->dtrd_size)
665 			return (1);
666 
667 		switch (lrec->dtrd_size) {
668 		case sizeof (uint64_t):
669 			/* LINTED - alignment */
670 			lval = *((uint64_t *)ldata);
671 			/* LINTED - alignment */
672 			rval = *((uint64_t *)rdata);
673 			break;
674 
675 		case sizeof (uint32_t):
676 			/* LINTED - alignment */
677 			lval = *((uint32_t *)ldata);
678 			/* LINTED - alignment */
679 			rval = *((uint32_t *)rdata);
680 			break;
681 
682 		case sizeof (uint16_t):
683 			/* LINTED - alignment */
684 			lval = *((uint16_t *)ldata);
685 			/* LINTED - alignment */
686 			rval = *((uint16_t *)rdata);
687 			break;
688 
689 		case sizeof (uint8_t):
690 			lval = *((uint8_t *)ldata);
691 			rval = *((uint8_t *)rdata);
692 			break;
693 
694 		default:
695 			for (j = 0; j < lrec->dtrd_size; j++) {
696 				lval = ((uint8_t *)ldata)[j];
697 				rval = ((uint8_t *)rdata)[j];
698 
699 				if (lval < rval)
700 					return (-1);
701 
702 				if (lval > rval)
703 					return (1);
704 			}
705 
706 			continue;
707 		}
708 
709 		if (lval < rval)
710 			return (-1);
711 
712 		if (lval > rval)
713 			return (1);
714 	}
715 
716 	return (0);
717 }
718 
719 static int
720 dt_aggregate_valcmp(const void *lhs, const void *rhs)
721 {
722 	dt_ahashent_t *lh = *((dt_ahashent_t **)lhs);
723 	dt_ahashent_t *rh = *((dt_ahashent_t **)rhs);
724 	dtrace_aggdesc_t *lagg = lh->dtahe_data.dtada_desc;
725 	dtrace_aggdesc_t *ragg = rh->dtahe_data.dtada_desc;
726 	caddr_t ldata = lh->dtahe_data.dtada_data;
727 	caddr_t rdata = rh->dtahe_data.dtada_data;
728 	dtrace_recdesc_t *lrec, *rrec;
729 	int64_t *laddr, *raddr;
730 	int rval, i;
731 
732 	if ((rval = dt_aggregate_hashcmp(lhs, rhs)) != 0)
733 		return (rval);
734 
735 	if (lagg->dtagd_nrecs < ragg->dtagd_nrecs)
736 		return (-1);
737 
738 	if (lagg->dtagd_nrecs > ragg->dtagd_nrecs)
739 		return (1);
740 
741 	for (i = 0; i < lagg->dtagd_nrecs; i++) {
742 		lrec = &lagg->dtagd_rec[i];
743 		rrec = &ragg->dtagd_rec[i];
744 
745 		if (lrec->dtrd_offset < rrec->dtrd_offset)
746 			return (-1);
747 
748 		if (lrec->dtrd_offset > rrec->dtrd_offset)
749 			return (1);
750 
751 		if (lrec->dtrd_action < rrec->dtrd_action)
752 			return (-1);
753 
754 		if (lrec->dtrd_action > rrec->dtrd_action)
755 			return (1);
756 	}
757 
758 	laddr = (int64_t *)(uintptr_t)(ldata + lrec->dtrd_offset);
759 	raddr = (int64_t *)(uintptr_t)(rdata + rrec->dtrd_offset);
760 
761 	switch (lrec->dtrd_action) {
762 	case DTRACEAGG_AVG:
763 		rval = dt_aggregate_averagecmp(laddr, raddr);
764 		break;
765 
766 	case DTRACEAGG_QUANTIZE:
767 		rval = dt_aggregate_quantizedcmp(laddr, raddr);
768 		break;
769 
770 	case DTRACEAGG_LQUANTIZE:
771 		rval = dt_aggregate_lquantizedcmp(laddr, raddr);
772 		break;
773 
774 	case DTRACEAGG_COUNT:
775 	case DTRACEAGG_SUM:
776 	case DTRACEAGG_MIN:
777 	case DTRACEAGG_MAX:
778 		rval = dt_aggregate_countcmp(laddr, raddr);
779 		break;
780 
781 	default:
782 		assert(0);
783 	}
784 
785 	if (rval != 0)
786 		return (rval);
787 
788 	/*
789 	 * If we're here, the values for the two aggregation elements are
790 	 * equal.  We already know that the key layout is the same for the two
791 	 * elements; we must now compare the keys themselves as a tie-breaker.
792 	 */
793 	return (dt_aggregate_keycmp(lhs, rhs));
794 }
795 
/*
 * Comparator that orders by key, using the variable ID comparison only to
 * break ties.
 */
static int
dt_aggregate_keyvarcmp(const void *lhs, const void *rhs)
{
	int rval = dt_aggregate_keycmp(lhs, rhs);

	return (rval != 0 ? rval : dt_aggregate_varcmp(lhs, rhs));
}
806 
/*
 * Comparator that orders by variable ID, using the key comparison only to
 * break ties.
 */
static int
dt_aggregate_varkeycmp(const void *lhs, const void *rhs)
{
	int rval = dt_aggregate_varcmp(lhs, rhs);

	return (rval != 0 ? rval : dt_aggregate_keycmp(lhs, rhs));
}
817 
/*
 * Comparator that orders by aggregated value, using the variable ID
 * comparison only to break ties.
 */
static int
dt_aggregate_valvarcmp(const void *lhs, const void *rhs)
{
	int rval = dt_aggregate_valcmp(lhs, rhs);

	return (rval != 0 ? rval : dt_aggregate_varcmp(lhs, rhs));
}
828 
/*
 * Comparator that orders by variable ID, using the aggregated value
 * comparison only to break ties.
 */
static int
dt_aggregate_varvalcmp(const void *lhs, const void *rhs)
{
	int rval = dt_aggregate_varcmp(lhs, rhs);

	return (rval != 0 ? rval : dt_aggregate_valcmp(lhs, rhs));
}
839 
/*
 * Reverse-order form of dt_aggregate_keyvarcmp():  the operands are swapped
 * before delegating.
 */
static int
dt_aggregate_keyvarrevcmp(const void *lhs, const void *rhs)
{
	const void *first = rhs, *second = lhs;

	return (dt_aggregate_keyvarcmp(first, second));
}
845 
/*
 * Reverse-order form of dt_aggregate_varkeycmp():  the operands are swapped
 * before delegating.
 */
static int
dt_aggregate_varkeyrevcmp(const void *lhs, const void *rhs)
{
	const void *first = rhs, *second = lhs;

	return (dt_aggregate_varkeycmp(first, second));
}
851 
/*
 * Reverse-order form of dt_aggregate_valvarcmp():  the operands are swapped
 * before delegating.
 */
static int
dt_aggregate_valvarrevcmp(const void *lhs, const void *rhs)
{
	const void *first = rhs, *second = lhs;

	return (dt_aggregate_valvarcmp(first, second));
}
857 
/*
 * Reverse-order form of dt_aggregate_varvalcmp():  the operands are swapped
 * before delegating.
 */
static int
dt_aggregate_varvalrevcmp(const void *lhs, const void *rhs)
{
	const void *first = rhs, *second = lhs;

	return (dt_aggregate_varvalcmp(first, second));
}
863 
864 int
865 dt_aggregate_go(dtrace_hdl_t *dtp)
866 {
867 	dt_aggregate_t *agp = &dtp->dt_aggregate;
868 	dtrace_optval_t size, cpu;
869 	dtrace_bufdesc_t *buf = &agp->dtat_buf;
870 	int rval, i;
871 
872 	assert(agp->dtat_maxcpu == 0);
873 	assert(agp->dtat_ncpu == 0);
874 	assert(agp->dtat_cpus == NULL);
875 
876 	agp->dtat_maxcpu = dt_sysconf(dtp, _SC_CPUID_MAX) + 1;
877 	agp->dtat_ncpu = dt_sysconf(dtp, _SC_NPROCESSORS_MAX);
878 	agp->dtat_cpus = malloc(agp->dtat_ncpu * sizeof (processorid_t));
879 
880 	if (agp->dtat_cpus == NULL)
881 		return (dt_set_errno(dtp, EDT_NOMEM));
882 
883 	/*
884 	 * Use the aggregation buffer size as reloaded from the kernel.
885 	 */
886 	size = dtp->dt_options[DTRACEOPT_AGGSIZE];
887 
888 	rval = dtrace_getopt(dtp, "aggsize", &size);
889 	assert(rval == 0);
890 
891 	if (size == 0 || size == DTRACEOPT_UNSET)
892 		return (0);
893 
894 	buf = &agp->dtat_buf;
895 	buf->dtbd_size = size;
896 
897 	if ((buf->dtbd_data = malloc(buf->dtbd_size)) == NULL)
898 		return (dt_set_errno(dtp, EDT_NOMEM));
899 
900 	/*
901 	 * Now query for the CPUs enabled.
902 	 */
903 	rval = dtrace_getopt(dtp, "cpu", &cpu);
904 	assert(rval == 0 && cpu != DTRACEOPT_UNSET);
905 
906 	if (cpu != DTRACE_CPUALL) {
907 		assert(cpu < agp->dtat_ncpu);
908 		agp->dtat_cpus[agp->dtat_ncpus++] = (processorid_t)cpu;
909 
910 		return (0);
911 	}
912 
913 	agp->dtat_ncpus = 0;
914 	for (i = 0; i < agp->dtat_maxcpu; i++) {
915 		if (dt_status(dtp, i) == -1)
916 			continue;
917 
918 		agp->dtat_cpus[agp->dtat_ncpus++] = i;
919 	}
920 
921 	return (0);
922 }
923 
924 static int
925 dt_aggwalk_rval(dtrace_hdl_t *dtp, dt_ahashent_t *h, int rval)
926 {
927 	dt_aggregate_t *agp = &dtp->dt_aggregate;
928 	dtrace_aggdata_t *data;
929 	dtrace_aggdesc_t *aggdesc;
930 	dtrace_recdesc_t *rec;
931 	int i;
932 
933 	switch (rval) {
934 	case DTRACE_AGGWALK_NEXT:
935 		break;
936 
937 	case DTRACE_AGGWALK_CLEAR: {
938 		uint32_t size, offs = 0;
939 
940 		aggdesc = h->dtahe_data.dtada_desc;
941 		rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
942 		size = rec->dtrd_size;
943 		data = &h->dtahe_data;
944 
945 		if (rec->dtrd_action == DTRACEAGG_LQUANTIZE) {
946 			offs = sizeof (uint64_t);
947 			size -= sizeof (uint64_t);
948 		}
949 
950 		bzero(&data->dtada_data[rec->dtrd_offset] + offs, size);
951 
952 		if (data->dtada_percpu == NULL)
953 			break;
954 
955 		for (i = 0; i < dtp->dt_aggregate.dtat_maxcpu; i++)
956 			bzero(data->dtada_percpu[i] + offs, size);
957 		break;
958 	}
959 
960 	case DTRACE_AGGWALK_ERROR:
961 		/*
962 		 * We assume that errno is already set in this case.
963 		 */
964 		return (dt_set_errno(dtp, errno));
965 
966 	case DTRACE_AGGWALK_ABORT:
967 		return (dt_set_errno(dtp, EDT_DIRABORT));
968 
969 	case DTRACE_AGGWALK_DENORMALIZE:
970 		h->dtahe_data.dtada_normal = 1;
971 		return (0);
972 
973 	case DTRACE_AGGWALK_NORMALIZE:
974 		if (h->dtahe_data.dtada_normal == 0) {
975 			h->dtahe_data.dtada_normal = 1;
976 			return (dt_set_errno(dtp, EDT_BADRVAL));
977 		}
978 
979 		return (0);
980 
981 	case DTRACE_AGGWALK_REMOVE: {
982 		dtrace_aggdata_t *aggdata = &h->dtahe_data;
983 		int i, max_cpus = agp->dtat_maxcpu;
984 
985 		/*
986 		 * First, remove this hash entry from its hash chain.
987 		 */
988 		if (h->dtahe_prev != NULL) {
989 			h->dtahe_prev->dtahe_next = h->dtahe_next;
990 		} else {
991 			dt_ahash_t *hash = &agp->dtat_hash;
992 			size_t ndx = h->dtahe_hashval % hash->dtah_size;
993 
994 			assert(hash->dtah_hash[ndx] == h);
995 			hash->dtah_hash[ndx] = h->dtahe_next;
996 		}
997 
998 		if (h->dtahe_next != NULL)
999 			h->dtahe_next->dtahe_prev = h->dtahe_prev;
1000 
1001 		/*
1002 		 * Now remove it from the list of all hash entries.
1003 		 */
1004 		if (h->dtahe_prevall != NULL) {
1005 			h->dtahe_prevall->dtahe_nextall = h->dtahe_nextall;
1006 		} else {
1007 			dt_ahash_t *hash = &agp->dtat_hash;
1008 
1009 			assert(hash->dtah_all == h);
1010 			hash->dtah_all = h->dtahe_nextall;
1011 		}
1012 
1013 		if (h->dtahe_nextall != NULL)
1014 			h->dtahe_nextall->dtahe_prevall = h->dtahe_prevall;
1015 
1016 		/*
1017 		 * We're unlinked.  We can safely destroy the data.
1018 		 */
1019 		if (aggdata->dtada_percpu != NULL) {
1020 			for (i = 0; i < max_cpus; i++)
1021 				free(aggdata->dtada_percpu[i]);
1022 			free(aggdata->dtada_percpu);
1023 		}
1024 
1025 		free(aggdata->dtada_data);
1026 		free(h);
1027 
1028 		return (0);
1029 	}
1030 
1031 	default:
1032 		return (dt_set_errno(dtp, EDT_BADRVAL));
1033 	}
1034 
1035 	return (0);
1036 }
1037 
1038 int
1039 dtrace_aggregate_walk(dtrace_hdl_t *dtp, dtrace_aggregate_f *func, void *arg)
1040 {
1041 	dt_ahashent_t *h, *next;
1042 	dt_ahash_t *hash = &dtp->dt_aggregate.dtat_hash;
1043 
1044 	for (h = hash->dtah_all; h != NULL; h = next) {
1045 		/*
1046 		 * dt_aggwalk_rval() can potentially remove the current hash
1047 		 * entry; we need to load the next hash entry before calling
1048 		 * into it.
1049 		 */
1050 		next = h->dtahe_nextall;
1051 
1052 		if (dt_aggwalk_rval(dtp, h, func(&h->dtahe_data, arg)) == -1)
1053 			return (-1);
1054 	}
1055 
1056 	return (0);
1057 }
1058 
1059 static int
1060 dt_aggregate_walk_sorted(dtrace_hdl_t *dtp,
1061     dtrace_aggregate_f *func, void *arg,
1062     int (*sfunc)(const void *, const void *))
1063 {
1064 	dt_aggregate_t *agp = &dtp->dt_aggregate;
1065 	dt_ahashent_t *h, **sorted;
1066 	dt_ahash_t *hash = &agp->dtat_hash;
1067 	size_t i, nentries = 0;
1068 
1069 	for (h = hash->dtah_all; h != NULL; h = h->dtahe_nextall)
1070 		nentries++;
1071 
1072 	sorted = malloc(nentries * sizeof (dt_ahashent_t *));
1073 
1074 	if (sorted == NULL)
1075 		return (dt_set_errno(dtp, EDT_NOMEM));
1076 
1077 	for (h = hash->dtah_all, i = 0; h != NULL; h = h->dtahe_nextall)
1078 		sorted[i++] = h;
1079 
1080 	qsort(sorted, nentries, sizeof (dt_ahashent_t *), sfunc);
1081 
1082 	for (i = 0; i < nentries; i++) {
1083 		h = sorted[i];
1084 
1085 		if (dt_aggwalk_rval(dtp, h, func(&h->dtahe_data, arg)) == -1)
1086 			return (-1);
1087 	}
1088 
1089 	free(sorted);
1090 	return (0);
1091 }
1092 
1093 int
1094 dtrace_aggregate_walk_keysorted(dtrace_hdl_t *dtp,
1095     dtrace_aggregate_f *func, void *arg)
1096 {
1097 	return (dt_aggregate_walk_sorted(dtp, func,
1098 	    arg, dt_aggregate_varkeycmp));
1099 }
1100 
1101 int
1102 dtrace_aggregate_walk_valsorted(dtrace_hdl_t *dtp,
1103     dtrace_aggregate_f *func, void *arg)
1104 {
1105 	return (dt_aggregate_walk_sorted(dtp, func,
1106 	    arg, dt_aggregate_varvalcmp));
1107 }
1108 
1109 int
1110 dtrace_aggregate_walk_keyvarsorted(dtrace_hdl_t *dtp,
1111     dtrace_aggregate_f *func, void *arg)
1112 {
1113 	return (dt_aggregate_walk_sorted(dtp, func,
1114 	    arg, dt_aggregate_keyvarcmp));
1115 }
1116 
1117 int
1118 dtrace_aggregate_walk_valvarsorted(dtrace_hdl_t *dtp,
1119     dtrace_aggregate_f *func, void *arg)
1120 {
1121 	return (dt_aggregate_walk_sorted(dtp, func,
1122 	    arg, dt_aggregate_valvarcmp));
1123 }
1124 
1125 int
1126 dtrace_aggregate_walk_keyrevsorted(dtrace_hdl_t *dtp,
1127     dtrace_aggregate_f *func, void *arg)
1128 {
1129 	return (dt_aggregate_walk_sorted(dtp, func,
1130 	    arg, dt_aggregate_varkeyrevcmp));
1131 }
1132 
1133 int
1134 dtrace_aggregate_walk_valrevsorted(dtrace_hdl_t *dtp,
1135     dtrace_aggregate_f *func, void *arg)
1136 {
1137 	return (dt_aggregate_walk_sorted(dtp, func,
1138 	    arg, dt_aggregate_varvalrevcmp));
1139 }
1140 
1141 int
1142 dtrace_aggregate_walk_keyvarrevsorted(dtrace_hdl_t *dtp,
1143     dtrace_aggregate_f *func, void *arg)
1144 {
1145 	return (dt_aggregate_walk_sorted(dtp, func,
1146 	    arg, dt_aggregate_keyvarrevcmp));
1147 }
1148 
1149 int
1150 dtrace_aggregate_walk_valvarrevsorted(dtrace_hdl_t *dtp,
1151     dtrace_aggregate_f *func, void *arg)
1152 {
1153 	return (dt_aggregate_walk_sorted(dtp, func,
1154 	    arg, dt_aggregate_valvarrevcmp));
1155 }
1156 
1157 int
1158 dtrace_aggregate_print(dtrace_hdl_t *dtp, FILE *fp,
1159     dtrace_aggregate_walk_f *func)
1160 {
1161 	dt_print_aggdata_t pd;
1162 
1163 	pd.dtpa_dtp = dtp;
1164 	pd.dtpa_fp = fp;
1165 	pd.dtpa_allunprint = 1;
1166 
1167 	if (func == NULL)
1168 		func = dtrace_aggregate_walk_valsorted;
1169 
1170 	if ((*func)(dtp, dt_print_agg, &pd) == -1)
1171 		return (dt_set_errno(dtp, dtp->dt_errno));
1172 
1173 	return (0);
1174 }
1175 
1176 void
1177 dtrace_aggregate_clear(dtrace_hdl_t *dtp)
1178 {
1179 	dt_aggregate_t *agp = &dtp->dt_aggregate;
1180 	dt_ahash_t *hash = &agp->dtat_hash;
1181 	dt_ahashent_t *h;
1182 	dtrace_aggdata_t *data;
1183 	dtrace_aggdesc_t *aggdesc;
1184 	dtrace_recdesc_t *rec;
1185 	int i, max_cpus = agp->dtat_maxcpu;
1186 
1187 	for (h = hash->dtah_all; h != NULL; h = h->dtahe_nextall) {
1188 		aggdesc = h->dtahe_data.dtada_desc;
1189 		rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
1190 		data = &h->dtahe_data;
1191 
1192 		bzero(&data->dtada_data[rec->dtrd_offset], rec->dtrd_size);
1193 
1194 		if (data->dtada_percpu == NULL)
1195 			continue;
1196 
1197 		for (i = 0; i < max_cpus; i++)
1198 			bzero(data->dtada_percpu[i], rec->dtrd_size);
1199 	}
1200 }
1201 
1202 void
1203 dt_aggregate_destroy(dtrace_hdl_t *dtp)
1204 {
1205 	dt_aggregate_t *agp = &dtp->dt_aggregate;
1206 	dt_ahash_t *hash = &agp->dtat_hash;
1207 	dt_ahashent_t *h, *next;
1208 	dtrace_aggdata_t *aggdata;
1209 	int i, max_cpus = agp->dtat_maxcpu;
1210 
1211 	if (hash->dtah_hash == NULL) {
1212 		assert(hash->dtah_all == NULL);
1213 	} else {
1214 		free(hash->dtah_hash);
1215 
1216 		for (h = hash->dtah_all; h != NULL; h = next) {
1217 			next = h->dtahe_nextall;
1218 
1219 			aggdata = &h->dtahe_data;
1220 
1221 			if (aggdata->dtada_percpu != NULL) {
1222 				for (i = 0; i < max_cpus; i++)
1223 					free(aggdata->dtada_percpu[i]);
1224 				free(aggdata->dtada_percpu);
1225 			}
1226 
1227 			free(aggdata->dtada_data);
1228 			free(h);
1229 		}
1230 
1231 		hash->dtah_hash = NULL;
1232 		hash->dtah_all = NULL;
1233 		hash->dtah_size = 0;
1234 	}
1235 
1236 	free(agp->dtat_buf.dtbd_data);
1237 	free(agp->dtat_cpus);
1238 }
1239