xref: /titanic_50/usr/src/lib/libdtrace/common/dt_aggregate.c (revision 21ad40f5447a73ac8a7ed2b9b66dd73ff1b088c1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Copyright (c) 2011, Joyent, Inc. All rights reserved.
29  */
30 
31 #include <stdlib.h>
32 #include <strings.h>
33 #include <errno.h>
34 #include <unistd.h>
35 #include <dt_impl.h>
36 #include <assert.h>
37 #include <alloca.h>
38 #include <limits.h>
39 
40 #define	DTRACE_AHASHSIZE	32779		/* big 'ol prime */
41 
42 /*
43  * Because qsort(3C) does not allow an argument to be passed to a comparison
44  * function, the variables that affect comparison must regrettably be global;
45  * they are protected by a global static lock, dt_qsort_lock.
46  */
47 static pthread_mutex_t dt_qsort_lock = PTHREAD_MUTEX_INITIALIZER;
48 
49 static int dt_revsort;
50 static int dt_keysort;
51 static int dt_keypos;
52 
53 #define	DT_LESSTHAN	(dt_revsort == 0 ? -1 : 1)
54 #define	DT_GREATERTHAN	(dt_revsort == 0 ? 1 : -1)
55 
56 static void
57 dt_aggregate_count(int64_t *existing, int64_t *new, size_t size)
58 {
59 	int i;
60 
61 	for (i = 0; i < size / sizeof (int64_t); i++)
62 		existing[i] = existing[i] + new[i];
63 }
64 
65 static int
66 dt_aggregate_countcmp(int64_t *lhs, int64_t *rhs)
67 {
68 	int64_t lvar = *lhs;
69 	int64_t rvar = *rhs;
70 
71 	if (lvar < rvar)
72 		return (DT_LESSTHAN);
73 
74 	if (lvar > rvar)
75 		return (DT_GREATERTHAN);
76 
77 	return (0);
78 }
79 
80 /*ARGSUSED*/
81 static void
82 dt_aggregate_min(int64_t *existing, int64_t *new, size_t size)
83 {
84 	if (*new < *existing)
85 		*existing = *new;
86 }
87 
88 /*ARGSUSED*/
89 static void
90 dt_aggregate_max(int64_t *existing, int64_t *new, size_t size)
91 {
92 	if (*new > *existing)
93 		*existing = *new;
94 }
95 
96 static int
97 dt_aggregate_averagecmp(int64_t *lhs, int64_t *rhs)
98 {
99 	int64_t lavg = lhs[0] ? (lhs[1] / lhs[0]) : 0;
100 	int64_t ravg = rhs[0] ? (rhs[1] / rhs[0]) : 0;
101 
102 	if (lavg < ravg)
103 		return (DT_LESSTHAN);
104 
105 	if (lavg > ravg)
106 		return (DT_GREATERTHAN);
107 
108 	return (0);
109 }
110 
111 static int
112 dt_aggregate_stddevcmp(int64_t *lhs, int64_t *rhs)
113 {
114 	uint64_t lsd = dt_stddev((uint64_t *)lhs, 1);
115 	uint64_t rsd = dt_stddev((uint64_t *)rhs, 1);
116 
117 	if (lsd < rsd)
118 		return (DT_LESSTHAN);
119 
120 	if (lsd > rsd)
121 		return (DT_GREATERTHAN);
122 
123 	return (0);
124 }
125 
126 /*ARGSUSED*/
127 static void
128 dt_aggregate_lquantize(int64_t *existing, int64_t *new, size_t size)
129 {
130 	int64_t arg = *existing++;
131 	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
132 	int i;
133 
134 	for (i = 0; i <= levels + 1; i++)
135 		existing[i] = existing[i] + new[i + 1];
136 }
137 
138 static long double
139 dt_aggregate_lquantizedsum(int64_t *lquanta)
140 {
141 	int64_t arg = *lquanta++;
142 	int32_t base = DTRACE_LQUANTIZE_BASE(arg);
143 	uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
144 	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg), i;
145 	long double total = (long double)lquanta[0] * (long double)(base - 1);
146 
147 	for (i = 0; i < levels; base += step, i++)
148 		total += (long double)lquanta[i + 1] * (long double)base;
149 
150 	return (total + (long double)lquanta[levels + 1] *
151 	    (long double)(base + 1));
152 }
153 
154 static int64_t
155 dt_aggregate_lquantizedzero(int64_t *lquanta)
156 {
157 	int64_t arg = *lquanta++;
158 	int32_t base = DTRACE_LQUANTIZE_BASE(arg);
159 	uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
160 	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg), i;
161 
162 	if (base - 1 == 0)
163 		return (lquanta[0]);
164 
165 	for (i = 0; i < levels; base += step, i++) {
166 		if (base != 0)
167 			continue;
168 
169 		return (lquanta[i + 1]);
170 	}
171 
172 	if (base + 1 == 0)
173 		return (lquanta[levels + 1]);
174 
175 	return (0);
176 }
177 
178 static int
179 dt_aggregate_lquantizedcmp(int64_t *lhs, int64_t *rhs)
180 {
181 	long double lsum = dt_aggregate_lquantizedsum(lhs);
182 	long double rsum = dt_aggregate_lquantizedsum(rhs);
183 	int64_t lzero, rzero;
184 
185 	if (lsum < rsum)
186 		return (DT_LESSTHAN);
187 
188 	if (lsum > rsum)
189 		return (DT_GREATERTHAN);
190 
191 	/*
192 	 * If they're both equal, then we will compare based on the weights at
193 	 * zero.  If the weights at zero are equal (or if zero is not within
194 	 * the range of the linear quantization), then this will be judged a
195 	 * tie and will be resolved based on the key comparison.
196 	 */
197 	lzero = dt_aggregate_lquantizedzero(lhs);
198 	rzero = dt_aggregate_lquantizedzero(rhs);
199 
200 	if (lzero < rzero)
201 		return (DT_LESSTHAN);
202 
203 	if (lzero > rzero)
204 		return (DT_GREATERTHAN);
205 
206 	return (0);
207 }
208 
209 static void
210 dt_aggregate_llquantize(int64_t *existing, int64_t *new, size_t size)
211 {
212 	int i;
213 
214 	for (i = 1; i < size / sizeof (int64_t); i++)
215 		existing[i] = existing[i] + new[i];
216 }
217 
218 static long double
219 dt_aggregate_llquantizedsum(int64_t *llquanta)
220 {
221 	int64_t arg = *llquanta++;
222 	uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg);
223 	uint16_t low = DTRACE_LLQUANTIZE_LOW(arg);
224 	uint16_t high = DTRACE_LLQUANTIZE_HIGH(arg);
225 	uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg);
226 	int bin = 0, order;
227 	int64_t value = 1, next, step;
228 	long double total;
229 
230 	assert(nsteps >= factor);
231 	assert(nsteps % factor == 0);
232 
233 	for (order = 0; order < low; order++)
234 		value *= factor;
235 
236 	total = (long double)llquanta[bin++] * (long double)(value - 1);
237 
238 	next = value * factor;
239 	step = next > nsteps ? next / nsteps : 1;
240 
241 	while (order <= high) {
242 		assert(value < next);
243 		total += (long double)llquanta[bin++] * (long double)(value);
244 
245 		if ((value += step) != next)
246 			continue;
247 
248 		next = value * factor;
249 		step = next > nsteps ? next / nsteps : 1;
250 		order++;
251 	}
252 
253 	return (total + (long double)llquanta[bin] * (long double)value);
254 }
255 
256 static int
257 dt_aggregate_llquantizedcmp(int64_t *lhs, int64_t *rhs)
258 {
259 	long double lsum = dt_aggregate_llquantizedsum(lhs);
260 	long double rsum = dt_aggregate_llquantizedsum(rhs);
261 	int64_t lzero, rzero;
262 
263 	if (lsum < rsum)
264 		return (DT_LESSTHAN);
265 
266 	if (lsum > rsum)
267 		return (DT_GREATERTHAN);
268 
269 	/*
270 	 * If they're both equal, then we will compare based on the weights at
271 	 * zero.  If the weights at zero are equal, then this will be judged a
272 	 * tie and will be resolved based on the key comparison.
273 	 */
274 	lzero = lhs[1];
275 	rzero = rhs[1];
276 
277 	if (lzero < rzero)
278 		return (DT_LESSTHAN);
279 
280 	if (lzero > rzero)
281 		return (DT_GREATERTHAN);
282 
283 	return (0);
284 }
285 
286 static int
287 dt_aggregate_quantizedcmp(int64_t *lhs, int64_t *rhs)
288 {
289 	int nbuckets = DTRACE_QUANTIZE_NBUCKETS, i;
290 	long double ltotal = 0, rtotal = 0;
291 	int64_t lzero, rzero;
292 
293 	for (i = 0; i < nbuckets; i++) {
294 		int64_t bucketval = DTRACE_QUANTIZE_BUCKETVAL(i);
295 
296 		if (bucketval == 0) {
297 			lzero = lhs[i];
298 			rzero = rhs[i];
299 		}
300 
301 		ltotal += (long double)bucketval * (long double)lhs[i];
302 		rtotal += (long double)bucketval * (long double)rhs[i];
303 	}
304 
305 	if (ltotal < rtotal)
306 		return (DT_LESSTHAN);
307 
308 	if (ltotal > rtotal)
309 		return (DT_GREATERTHAN);
310 
311 	/*
312 	 * If they're both equal, then we will compare based on the weights at
313 	 * zero.  If the weights at zero are equal, then this will be judged a
314 	 * tie and will be resolved based on the key comparison.
315 	 */
316 	if (lzero < rzero)
317 		return (DT_LESSTHAN);
318 
319 	if (lzero > rzero)
320 		return (DT_GREATERTHAN);
321 
322 	return (0);
323 }
324 
325 static void
326 dt_aggregate_usym(dtrace_hdl_t *dtp, uint64_t *data)
327 {
328 	uint64_t pid = data[0];
329 	uint64_t *pc = &data[1];
330 	struct ps_prochandle *P;
331 	GElf_Sym sym;
332 
333 	if (dtp->dt_vector != NULL)
334 		return;
335 
336 	if ((P = dt_proc_grab(dtp, pid, PGRAB_RDONLY | PGRAB_FORCE, 0)) == NULL)
337 		return;
338 
339 	dt_proc_lock(dtp, P);
340 
341 	if (Plookup_by_addr(P, *pc, NULL, 0, &sym) == 0)
342 		*pc = sym.st_value;
343 
344 	dt_proc_unlock(dtp, P);
345 	dt_proc_release(dtp, P);
346 }
347 
348 static void
349 dt_aggregate_umod(dtrace_hdl_t *dtp, uint64_t *data)
350 {
351 	uint64_t pid = data[0];
352 	uint64_t *pc = &data[1];
353 	struct ps_prochandle *P;
354 	const prmap_t *map;
355 
356 	if (dtp->dt_vector != NULL)
357 		return;
358 
359 	if ((P = dt_proc_grab(dtp, pid, PGRAB_RDONLY | PGRAB_FORCE, 0)) == NULL)
360 		return;
361 
362 	dt_proc_lock(dtp, P);
363 
364 	if ((map = Paddr_to_map(P, *pc)) != NULL)
365 		*pc = map->pr_vaddr;
366 
367 	dt_proc_unlock(dtp, P);
368 	dt_proc_release(dtp, P);
369 }
370 
371 static void
372 dt_aggregate_sym(dtrace_hdl_t *dtp, uint64_t *data)
373 {
374 	GElf_Sym sym;
375 	uint64_t *pc = data;
376 
377 	if (dtrace_lookup_by_addr(dtp, *pc, &sym, NULL) == 0)
378 		*pc = sym.st_value;
379 }
380 
381 static void
382 dt_aggregate_mod(dtrace_hdl_t *dtp, uint64_t *data)
383 {
384 	uint64_t *pc = data;
385 	dt_module_t *dmp;
386 
387 	if (dtp->dt_vector != NULL) {
388 		/*
389 		 * We don't have a way of just getting the module for a
390 		 * vectored open, and it doesn't seem to be worth defining
391 		 * one.  This means that use of mod() won't get true
392 		 * aggregation in the postmortem case (some modules may
393 		 * appear more than once in aggregation output).  It seems
394 		 * unlikely that anyone will ever notice or care...
395 		 */
396 		return;
397 	}
398 
399 	for (dmp = dt_list_next(&dtp->dt_modlist); dmp != NULL;
400 	    dmp = dt_list_next(dmp)) {
401 		if (*pc - dmp->dm_text_va < dmp->dm_text_size) {
402 			*pc = dmp->dm_text_va;
403 			return;
404 		}
405 	}
406 }
407 
408 static dtrace_aggvarid_t
409 dt_aggregate_aggvarid(dt_ahashent_t *ent)
410 {
411 	dtrace_aggdesc_t *agg = ent->dtahe_data.dtada_desc;
412 	caddr_t data = ent->dtahe_data.dtada_data;
413 	dtrace_recdesc_t *rec = agg->dtagd_rec;
414 
415 	/*
416 	 * First, we'll check the variable ID in the aggdesc.  If it's valid,
417 	 * we'll return it.  If not, we'll use the compiler-generated ID
418 	 * present as the first record.
419 	 */
420 	if (agg->dtagd_varid != DTRACE_AGGVARIDNONE)
421 		return (agg->dtagd_varid);
422 
423 	agg->dtagd_varid = *((dtrace_aggvarid_t *)(uintptr_t)(data +
424 	    rec->dtrd_offset));
425 
426 	return (agg->dtagd_varid);
427 }
428 
429 
430 static int
431 dt_aggregate_snap_cpu(dtrace_hdl_t *dtp, processorid_t cpu)
432 {
433 	dtrace_epid_t id;
434 	uint64_t hashval;
435 	size_t offs, roffs, size, ndx;
436 	int i, j, rval;
437 	caddr_t addr, data;
438 	dtrace_recdesc_t *rec;
439 	dt_aggregate_t *agp = &dtp->dt_aggregate;
440 	dtrace_aggdesc_t *agg;
441 	dt_ahash_t *hash = &agp->dtat_hash;
442 	dt_ahashent_t *h;
443 	dtrace_bufdesc_t b = agp->dtat_buf, *buf = &b;
444 	dtrace_aggdata_t *aggdata;
445 	int flags = agp->dtat_flags;
446 
447 	buf->dtbd_cpu = cpu;
448 
449 	if (dt_ioctl(dtp, DTRACEIOC_AGGSNAP, buf) == -1) {
450 		if (errno == ENOENT) {
451 			/*
452 			 * If that failed with ENOENT, it may be because the
453 			 * CPU was unconfigured.  This is okay; we'll just
454 			 * do nothing but return success.
455 			 */
456 			return (0);
457 		}
458 
459 		return (dt_set_errno(dtp, errno));
460 	}
461 
462 	if (buf->dtbd_drops != 0) {
463 		if (dt_handle_cpudrop(dtp, cpu,
464 		    DTRACEDROP_AGGREGATION, buf->dtbd_drops) == -1)
465 			return (-1);
466 	}
467 
468 	if (buf->dtbd_size == 0)
469 		return (0);
470 
471 	if (hash->dtah_hash == NULL) {
472 		size_t size;
473 
474 		hash->dtah_size = DTRACE_AHASHSIZE;
475 		size = hash->dtah_size * sizeof (dt_ahashent_t *);
476 
477 		if ((hash->dtah_hash = malloc(size)) == NULL)
478 			return (dt_set_errno(dtp, EDT_NOMEM));
479 
480 		bzero(hash->dtah_hash, size);
481 	}
482 
483 	for (offs = 0; offs < buf->dtbd_size; ) {
484 		/*
485 		 * We're guaranteed to have an ID.
486 		 */
487 		id = *((dtrace_epid_t *)((uintptr_t)buf->dtbd_data +
488 		    (uintptr_t)offs));
489 
490 		if (id == DTRACE_AGGIDNONE) {
491 			/*
492 			 * This is filler to assure proper alignment of the
493 			 * next record; we simply ignore it.
494 			 */
495 			offs += sizeof (id);
496 			continue;
497 		}
498 
499 		if ((rval = dt_aggid_lookup(dtp, id, &agg)) != 0)
500 			return (rval);
501 
502 		addr = buf->dtbd_data + offs;
503 		size = agg->dtagd_size;
504 		hashval = 0;
505 
506 		for (j = 0; j < agg->dtagd_nrecs - 1; j++) {
507 			rec = &agg->dtagd_rec[j];
508 			roffs = rec->dtrd_offset;
509 
510 			switch (rec->dtrd_action) {
511 			case DTRACEACT_USYM:
512 				dt_aggregate_usym(dtp,
513 				    /* LINTED - alignment */
514 				    (uint64_t *)&addr[roffs]);
515 				break;
516 
517 			case DTRACEACT_UMOD:
518 				dt_aggregate_umod(dtp,
519 				    /* LINTED - alignment */
520 				    (uint64_t *)&addr[roffs]);
521 				break;
522 
523 			case DTRACEACT_SYM:
524 				/* LINTED - alignment */
525 				dt_aggregate_sym(dtp, (uint64_t *)&addr[roffs]);
526 				break;
527 
528 			case DTRACEACT_MOD:
529 				/* LINTED - alignment */
530 				dt_aggregate_mod(dtp, (uint64_t *)&addr[roffs]);
531 				break;
532 
533 			default:
534 				break;
535 			}
536 
537 			for (i = 0; i < rec->dtrd_size; i++)
538 				hashval += addr[roffs + i];
539 		}
540 
541 		ndx = hashval % hash->dtah_size;
542 
543 		for (h = hash->dtah_hash[ndx]; h != NULL; h = h->dtahe_next) {
544 			if (h->dtahe_hashval != hashval)
545 				continue;
546 
547 			if (h->dtahe_size != size)
548 				continue;
549 
550 			aggdata = &h->dtahe_data;
551 			data = aggdata->dtada_data;
552 
553 			for (j = 0; j < agg->dtagd_nrecs - 1; j++) {
554 				rec = &agg->dtagd_rec[j];
555 				roffs = rec->dtrd_offset;
556 
557 				for (i = 0; i < rec->dtrd_size; i++)
558 					if (addr[roffs + i] != data[roffs + i])
559 						goto hashnext;
560 			}
561 
562 			/*
563 			 * We found it.  Now we need to apply the aggregating
564 			 * action on the data here.
565 			 */
566 			rec = &agg->dtagd_rec[agg->dtagd_nrecs - 1];
567 			roffs = rec->dtrd_offset;
568 			/* LINTED - alignment */
569 			h->dtahe_aggregate((int64_t *)&data[roffs],
570 			    /* LINTED - alignment */
571 			    (int64_t *)&addr[roffs], rec->dtrd_size);
572 
573 			/*
574 			 * If we're keeping per CPU data, apply the aggregating
575 			 * action there as well.
576 			 */
577 			if (aggdata->dtada_percpu != NULL) {
578 				data = aggdata->dtada_percpu[cpu];
579 
580 				/* LINTED - alignment */
581 				h->dtahe_aggregate((int64_t *)data,
582 				    /* LINTED - alignment */
583 				    (int64_t *)&addr[roffs], rec->dtrd_size);
584 			}
585 
586 			goto bufnext;
587 hashnext:
588 			continue;
589 		}
590 
591 		/*
592 		 * If we're here, we couldn't find an entry for this record.
593 		 */
594 		if ((h = malloc(sizeof (dt_ahashent_t))) == NULL)
595 			return (dt_set_errno(dtp, EDT_NOMEM));
596 		bzero(h, sizeof (dt_ahashent_t));
597 		aggdata = &h->dtahe_data;
598 
599 		if ((aggdata->dtada_data = malloc(size)) == NULL) {
600 			free(h);
601 			return (dt_set_errno(dtp, EDT_NOMEM));
602 		}
603 
604 		bcopy(addr, aggdata->dtada_data, size);
605 		aggdata->dtada_size = size;
606 		aggdata->dtada_desc = agg;
607 		aggdata->dtada_handle = dtp;
608 		(void) dt_epid_lookup(dtp, agg->dtagd_epid,
609 		    &aggdata->dtada_edesc, &aggdata->dtada_pdesc);
610 		aggdata->dtada_normal = 1;
611 
612 		h->dtahe_hashval = hashval;
613 		h->dtahe_size = size;
614 		(void) dt_aggregate_aggvarid(h);
615 
616 		rec = &agg->dtagd_rec[agg->dtagd_nrecs - 1];
617 
618 		if (flags & DTRACE_A_PERCPU) {
619 			int max_cpus = agp->dtat_maxcpu;
620 			caddr_t *percpu = malloc(max_cpus * sizeof (caddr_t));
621 
622 			if (percpu == NULL) {
623 				free(aggdata->dtada_data);
624 				free(h);
625 				return (dt_set_errno(dtp, EDT_NOMEM));
626 			}
627 
628 			for (j = 0; j < max_cpus; j++) {
629 				percpu[j] = malloc(rec->dtrd_size);
630 
631 				if (percpu[j] == NULL) {
632 					while (--j >= 0)
633 						free(percpu[j]);
634 
635 					free(aggdata->dtada_data);
636 					free(h);
637 					return (dt_set_errno(dtp, EDT_NOMEM));
638 				}
639 
640 				if (j == cpu) {
641 					bcopy(&addr[rec->dtrd_offset],
642 					    percpu[j], rec->dtrd_size);
643 				} else {
644 					bzero(percpu[j], rec->dtrd_size);
645 				}
646 			}
647 
648 			aggdata->dtada_percpu = percpu;
649 		}
650 
651 		switch (rec->dtrd_action) {
652 		case DTRACEAGG_MIN:
653 			h->dtahe_aggregate = dt_aggregate_min;
654 			break;
655 
656 		case DTRACEAGG_MAX:
657 			h->dtahe_aggregate = dt_aggregate_max;
658 			break;
659 
660 		case DTRACEAGG_LQUANTIZE:
661 			h->dtahe_aggregate = dt_aggregate_lquantize;
662 			break;
663 
664 		case DTRACEAGG_LLQUANTIZE:
665 			h->dtahe_aggregate = dt_aggregate_llquantize;
666 			break;
667 
668 		case DTRACEAGG_COUNT:
669 		case DTRACEAGG_SUM:
670 		case DTRACEAGG_AVG:
671 		case DTRACEAGG_STDDEV:
672 		case DTRACEAGG_QUANTIZE:
673 			h->dtahe_aggregate = dt_aggregate_count;
674 			break;
675 
676 		default:
677 			return (dt_set_errno(dtp, EDT_BADAGG));
678 		}
679 
680 		if (hash->dtah_hash[ndx] != NULL)
681 			hash->dtah_hash[ndx]->dtahe_prev = h;
682 
683 		h->dtahe_next = hash->dtah_hash[ndx];
684 		hash->dtah_hash[ndx] = h;
685 
686 		if (hash->dtah_all != NULL)
687 			hash->dtah_all->dtahe_prevall = h;
688 
689 		h->dtahe_nextall = hash->dtah_all;
690 		hash->dtah_all = h;
691 bufnext:
692 		offs += agg->dtagd_size;
693 	}
694 
695 	return (0);
696 }
697 
698 int
699 dtrace_aggregate_snap(dtrace_hdl_t *dtp)
700 {
701 	int i, rval;
702 	dt_aggregate_t *agp = &dtp->dt_aggregate;
703 	hrtime_t now = gethrtime();
704 	dtrace_optval_t interval = dtp->dt_options[DTRACEOPT_AGGRATE];
705 
706 	if (dtp->dt_lastagg != 0) {
707 		if (now - dtp->dt_lastagg < interval)
708 			return (0);
709 
710 		dtp->dt_lastagg += interval;
711 	} else {
712 		dtp->dt_lastagg = now;
713 	}
714 
715 	if (!dtp->dt_active)
716 		return (dt_set_errno(dtp, EINVAL));
717 
718 	if (agp->dtat_buf.dtbd_size == 0)
719 		return (0);
720 
721 	for (i = 0; i < agp->dtat_ncpus; i++) {
722 		if (rval = dt_aggregate_snap_cpu(dtp, agp->dtat_cpus[i]))
723 			return (rval);
724 	}
725 
726 	return (0);
727 }
728 
729 static int
730 dt_aggregate_hashcmp(const void *lhs, const void *rhs)
731 {
732 	dt_ahashent_t *lh = *((dt_ahashent_t **)lhs);
733 	dt_ahashent_t *rh = *((dt_ahashent_t **)rhs);
734 	dtrace_aggdesc_t *lagg = lh->dtahe_data.dtada_desc;
735 	dtrace_aggdesc_t *ragg = rh->dtahe_data.dtada_desc;
736 
737 	if (lagg->dtagd_nrecs < ragg->dtagd_nrecs)
738 		return (DT_LESSTHAN);
739 
740 	if (lagg->dtagd_nrecs > ragg->dtagd_nrecs)
741 		return (DT_GREATERTHAN);
742 
743 	return (0);
744 }
745 
746 static int
747 dt_aggregate_varcmp(const void *lhs, const void *rhs)
748 {
749 	dt_ahashent_t *lh = *((dt_ahashent_t **)lhs);
750 	dt_ahashent_t *rh = *((dt_ahashent_t **)rhs);
751 	dtrace_aggvarid_t lid, rid;
752 
753 	lid = dt_aggregate_aggvarid(lh);
754 	rid = dt_aggregate_aggvarid(rh);
755 
756 	if (lid < rid)
757 		return (DT_LESSTHAN);
758 
759 	if (lid > rid)
760 		return (DT_GREATERTHAN);
761 
762 	return (0);
763 }
764 
765 static int
766 dt_aggregate_keycmp(const void *lhs, const void *rhs)
767 {
768 	dt_ahashent_t *lh = *((dt_ahashent_t **)lhs);
769 	dt_ahashent_t *rh = *((dt_ahashent_t **)rhs);
770 	dtrace_aggdesc_t *lagg = lh->dtahe_data.dtada_desc;
771 	dtrace_aggdesc_t *ragg = rh->dtahe_data.dtada_desc;
772 	dtrace_recdesc_t *lrec, *rrec;
773 	char *ldata, *rdata;
774 	int rval, i, j, keypos, nrecs;
775 
776 	if ((rval = dt_aggregate_hashcmp(lhs, rhs)) != 0)
777 		return (rval);
778 
779 	nrecs = lagg->dtagd_nrecs - 1;
780 	assert(nrecs == ragg->dtagd_nrecs - 1);
781 
782 	keypos = dt_keypos + 1 >= nrecs ? 0 : dt_keypos;
783 
784 	for (i = 1; i < nrecs; i++) {
785 		uint64_t lval, rval;
786 		int ndx = i + keypos;
787 
788 		if (ndx >= nrecs)
789 			ndx = ndx - nrecs + 1;
790 
791 		lrec = &lagg->dtagd_rec[ndx];
792 		rrec = &ragg->dtagd_rec[ndx];
793 
794 		ldata = lh->dtahe_data.dtada_data + lrec->dtrd_offset;
795 		rdata = rh->dtahe_data.dtada_data + rrec->dtrd_offset;
796 
797 		if (lrec->dtrd_size < rrec->dtrd_size)
798 			return (DT_LESSTHAN);
799 
800 		if (lrec->dtrd_size > rrec->dtrd_size)
801 			return (DT_GREATERTHAN);
802 
803 		switch (lrec->dtrd_size) {
804 		case sizeof (uint64_t):
805 			/* LINTED - alignment */
806 			lval = *((uint64_t *)ldata);
807 			/* LINTED - alignment */
808 			rval = *((uint64_t *)rdata);
809 			break;
810 
811 		case sizeof (uint32_t):
812 			/* LINTED - alignment */
813 			lval = *((uint32_t *)ldata);
814 			/* LINTED - alignment */
815 			rval = *((uint32_t *)rdata);
816 			break;
817 
818 		case sizeof (uint16_t):
819 			/* LINTED - alignment */
820 			lval = *((uint16_t *)ldata);
821 			/* LINTED - alignment */
822 			rval = *((uint16_t *)rdata);
823 			break;
824 
825 		case sizeof (uint8_t):
826 			lval = *((uint8_t *)ldata);
827 			rval = *((uint8_t *)rdata);
828 			break;
829 
830 		default:
831 			switch (lrec->dtrd_action) {
832 			case DTRACEACT_UMOD:
833 			case DTRACEACT_UADDR:
834 			case DTRACEACT_USYM:
835 				for (j = 0; j < 2; j++) {
836 					/* LINTED - alignment */
837 					lval = ((uint64_t *)ldata)[j];
838 					/* LINTED - alignment */
839 					rval = ((uint64_t *)rdata)[j];
840 
841 					if (lval < rval)
842 						return (DT_LESSTHAN);
843 
844 					if (lval > rval)
845 						return (DT_GREATERTHAN);
846 				}
847 
848 				break;
849 
850 			default:
851 				for (j = 0; j < lrec->dtrd_size; j++) {
852 					lval = ((uint8_t *)ldata)[j];
853 					rval = ((uint8_t *)rdata)[j];
854 
855 					if (lval < rval)
856 						return (DT_LESSTHAN);
857 
858 					if (lval > rval)
859 						return (DT_GREATERTHAN);
860 				}
861 			}
862 
863 			continue;
864 		}
865 
866 		if (lval < rval)
867 			return (DT_LESSTHAN);
868 
869 		if (lval > rval)
870 			return (DT_GREATERTHAN);
871 	}
872 
873 	return (0);
874 }
875 
876 static int
877 dt_aggregate_valcmp(const void *lhs, const void *rhs)
878 {
879 	dt_ahashent_t *lh = *((dt_ahashent_t **)lhs);
880 	dt_ahashent_t *rh = *((dt_ahashent_t **)rhs);
881 	dtrace_aggdesc_t *lagg = lh->dtahe_data.dtada_desc;
882 	dtrace_aggdesc_t *ragg = rh->dtahe_data.dtada_desc;
883 	caddr_t ldata = lh->dtahe_data.dtada_data;
884 	caddr_t rdata = rh->dtahe_data.dtada_data;
885 	dtrace_recdesc_t *lrec, *rrec;
886 	int64_t *laddr, *raddr;
887 	int rval, i;
888 
889 	if ((rval = dt_aggregate_hashcmp(lhs, rhs)) != 0)
890 		return (rval);
891 
892 	if (lagg->dtagd_nrecs > ragg->dtagd_nrecs)
893 		return (DT_GREATERTHAN);
894 
895 	if (lagg->dtagd_nrecs < ragg->dtagd_nrecs)
896 		return (DT_LESSTHAN);
897 
898 	for (i = 0; i < lagg->dtagd_nrecs; i++) {
899 		lrec = &lagg->dtagd_rec[i];
900 		rrec = &ragg->dtagd_rec[i];
901 
902 		if (lrec->dtrd_offset < rrec->dtrd_offset)
903 			return (DT_LESSTHAN);
904 
905 		if (lrec->dtrd_offset > rrec->dtrd_offset)
906 			return (DT_GREATERTHAN);
907 
908 		if (lrec->dtrd_action < rrec->dtrd_action)
909 			return (DT_LESSTHAN);
910 
911 		if (lrec->dtrd_action > rrec->dtrd_action)
912 			return (DT_GREATERTHAN);
913 	}
914 
915 	laddr = (int64_t *)(uintptr_t)(ldata + lrec->dtrd_offset);
916 	raddr = (int64_t *)(uintptr_t)(rdata + rrec->dtrd_offset);
917 
918 	switch (lrec->dtrd_action) {
919 	case DTRACEAGG_AVG:
920 		rval = dt_aggregate_averagecmp(laddr, raddr);
921 		break;
922 
923 	case DTRACEAGG_STDDEV:
924 		rval = dt_aggregate_stddevcmp(laddr, raddr);
925 		break;
926 
927 	case DTRACEAGG_QUANTIZE:
928 		rval = dt_aggregate_quantizedcmp(laddr, raddr);
929 		break;
930 
931 	case DTRACEAGG_LQUANTIZE:
932 		rval = dt_aggregate_lquantizedcmp(laddr, raddr);
933 		break;
934 
935 	case DTRACEAGG_LLQUANTIZE:
936 		rval = dt_aggregate_llquantizedcmp(laddr, raddr);
937 		break;
938 
939 	case DTRACEAGG_COUNT:
940 	case DTRACEAGG_SUM:
941 	case DTRACEAGG_MIN:
942 	case DTRACEAGG_MAX:
943 		rval = dt_aggregate_countcmp(laddr, raddr);
944 		break;
945 
946 	default:
947 		assert(0);
948 	}
949 
950 	return (rval);
951 }
952 
953 static int
954 dt_aggregate_valkeycmp(const void *lhs, const void *rhs)
955 {
956 	int rval;
957 
958 	if ((rval = dt_aggregate_valcmp(lhs, rhs)) != 0)
959 		return (rval);
960 
961 	/*
962 	 * If we're here, the values for the two aggregation elements are
963 	 * equal.  We already know that the key layout is the same for the two
964 	 * elements; we must now compare the keys themselves as a tie-breaker.
965 	 */
966 	return (dt_aggregate_keycmp(lhs, rhs));
967 }
968 
969 static int
970 dt_aggregate_keyvarcmp(const void *lhs, const void *rhs)
971 {
972 	int rval;
973 
974 	if ((rval = dt_aggregate_keycmp(lhs, rhs)) != 0)
975 		return (rval);
976 
977 	return (dt_aggregate_varcmp(lhs, rhs));
978 }
979 
980 static int
981 dt_aggregate_varkeycmp(const void *lhs, const void *rhs)
982 {
983 	int rval;
984 
985 	if ((rval = dt_aggregate_varcmp(lhs, rhs)) != 0)
986 		return (rval);
987 
988 	return (dt_aggregate_keycmp(lhs, rhs));
989 }
990 
991 static int
992 dt_aggregate_valvarcmp(const void *lhs, const void *rhs)
993 {
994 	int rval;
995 
996 	if ((rval = dt_aggregate_valkeycmp(lhs, rhs)) != 0)
997 		return (rval);
998 
999 	return (dt_aggregate_varcmp(lhs, rhs));
1000 }
1001 
1002 static int
1003 dt_aggregate_varvalcmp(const void *lhs, const void *rhs)
1004 {
1005 	int rval;
1006 
1007 	if ((rval = dt_aggregate_varcmp(lhs, rhs)) != 0)
1008 		return (rval);
1009 
1010 	return (dt_aggregate_valkeycmp(lhs, rhs));
1011 }
1012 
1013 static int
1014 dt_aggregate_keyvarrevcmp(const void *lhs, const void *rhs)
1015 {
1016 	return (dt_aggregate_keyvarcmp(rhs, lhs));
1017 }
1018 
1019 static int
1020 dt_aggregate_varkeyrevcmp(const void *lhs, const void *rhs)
1021 {
1022 	return (dt_aggregate_varkeycmp(rhs, lhs));
1023 }
1024 
1025 static int
1026 dt_aggregate_valvarrevcmp(const void *lhs, const void *rhs)
1027 {
1028 	return (dt_aggregate_valvarcmp(rhs, lhs));
1029 }
1030 
1031 static int
1032 dt_aggregate_varvalrevcmp(const void *lhs, const void *rhs)
1033 {
1034 	return (dt_aggregate_varvalcmp(rhs, lhs));
1035 }
1036 
1037 static int
1038 dt_aggregate_bundlecmp(const void *lhs, const void *rhs)
1039 {
1040 	dt_ahashent_t **lh = *((dt_ahashent_t ***)lhs);
1041 	dt_ahashent_t **rh = *((dt_ahashent_t ***)rhs);
1042 	int i, rval;
1043 
1044 	if (dt_keysort) {
1045 		/*
1046 		 * If we're sorting on keys, we need to scan until we find the
1047 		 * last entry -- that's the representative key.  (The order of
1048 		 * the bundle is values followed by key to accommodate the
1049 		 * default behavior of sorting by value.)  If the keys are
1050 		 * equal, we'll fall into the value comparison loop, below.
1051 		 */
1052 		for (i = 0; lh[i + 1] != NULL; i++)
1053 			continue;
1054 
1055 		assert(i != 0);
1056 		assert(rh[i + 1] == NULL);
1057 
1058 		if ((rval = dt_aggregate_keycmp(&lh[i], &rh[i])) != 0)
1059 			return (rval);
1060 	}
1061 
1062 	for (i = 0; ; i++) {
1063 		if (lh[i + 1] == NULL) {
1064 			/*
1065 			 * All of the values are equal; if we're sorting on
1066 			 * keys, then we're only here because the keys were
1067 			 * found to be equal and these records are therefore
1068 			 * equal.  If we're not sorting on keys, we'll use the
1069 			 * key comparison from the representative key as the
1070 			 * tie-breaker.
1071 			 */
1072 			if (dt_keysort)
1073 				return (0);
1074 
1075 			assert(i != 0);
1076 			assert(rh[i + 1] == NULL);
1077 			return (dt_aggregate_keycmp(&lh[i], &rh[i]));
1078 		} else {
1079 			if ((rval = dt_aggregate_valcmp(&lh[i], &rh[i])) != 0)
1080 				return (rval);
1081 		}
1082 	}
1083 }
1084 
1085 int
1086 dt_aggregate_go(dtrace_hdl_t *dtp)
1087 {
1088 	dt_aggregate_t *agp = &dtp->dt_aggregate;
1089 	dtrace_optval_t size, cpu;
1090 	dtrace_bufdesc_t *buf = &agp->dtat_buf;
1091 	int rval, i;
1092 
1093 	assert(agp->dtat_maxcpu == 0);
1094 	assert(agp->dtat_ncpu == 0);
1095 	assert(agp->dtat_cpus == NULL);
1096 
1097 	agp->dtat_maxcpu = dt_sysconf(dtp, _SC_CPUID_MAX) + 1;
1098 	agp->dtat_ncpu = dt_sysconf(dtp, _SC_NPROCESSORS_MAX);
1099 	agp->dtat_cpus = malloc(agp->dtat_ncpu * sizeof (processorid_t));
1100 
1101 	if (agp->dtat_cpus == NULL)
1102 		return (dt_set_errno(dtp, EDT_NOMEM));
1103 
1104 	/*
1105 	 * Use the aggregation buffer size as reloaded from the kernel.
1106 	 */
1107 	size = dtp->dt_options[DTRACEOPT_AGGSIZE];
1108 
1109 	rval = dtrace_getopt(dtp, "aggsize", &size);
1110 	assert(rval == 0);
1111 
1112 	if (size == 0 || size == DTRACEOPT_UNSET)
1113 		return (0);
1114 
1115 	buf = &agp->dtat_buf;
1116 	buf->dtbd_size = size;
1117 
1118 	if ((buf->dtbd_data = malloc(buf->dtbd_size)) == NULL)
1119 		return (dt_set_errno(dtp, EDT_NOMEM));
1120 
1121 	/*
1122 	 * Now query for the CPUs enabled.
1123 	 */
1124 	rval = dtrace_getopt(dtp, "cpu", &cpu);
1125 	assert(rval == 0 && cpu != DTRACEOPT_UNSET);
1126 
1127 	if (cpu != DTRACE_CPUALL) {
1128 		assert(cpu < agp->dtat_ncpu);
1129 		agp->dtat_cpus[agp->dtat_ncpus++] = (processorid_t)cpu;
1130 
1131 		return (0);
1132 	}
1133 
1134 	agp->dtat_ncpus = 0;
1135 	for (i = 0; i < agp->dtat_maxcpu; i++) {
1136 		if (dt_status(dtp, i) == -1)
1137 			continue;
1138 
1139 		agp->dtat_cpus[agp->dtat_ncpus++] = i;
1140 	}
1141 
1142 	return (0);
1143 }
1144 
1145 static int
1146 dt_aggwalk_rval(dtrace_hdl_t *dtp, dt_ahashent_t *h, int rval)
1147 {
1148 	dt_aggregate_t *agp = &dtp->dt_aggregate;
1149 	dtrace_aggdata_t *data;
1150 	dtrace_aggdesc_t *aggdesc;
1151 	dtrace_recdesc_t *rec;
1152 	int i;
1153 
1154 	switch (rval) {
1155 	case DTRACE_AGGWALK_NEXT:
1156 		break;
1157 
1158 	case DTRACE_AGGWALK_CLEAR: {
1159 		uint32_t size, offs = 0;
1160 
1161 		aggdesc = h->dtahe_data.dtada_desc;
1162 		rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
1163 		size = rec->dtrd_size;
1164 		data = &h->dtahe_data;
1165 
1166 		if (rec->dtrd_action == DTRACEAGG_LQUANTIZE) {
1167 			offs = sizeof (uint64_t);
1168 			size -= sizeof (uint64_t);
1169 		}
1170 
1171 		bzero(&data->dtada_data[rec->dtrd_offset] + offs, size);
1172 
1173 		if (data->dtada_percpu == NULL)
1174 			break;
1175 
1176 		for (i = 0; i < dtp->dt_aggregate.dtat_maxcpu; i++)
1177 			bzero(data->dtada_percpu[i] + offs, size);
1178 		break;
1179 	}
1180 
1181 	case DTRACE_AGGWALK_ERROR:
1182 		/*
1183 		 * We assume that errno is already set in this case.
1184 		 */
1185 		return (dt_set_errno(dtp, errno));
1186 
1187 	case DTRACE_AGGWALK_ABORT:
1188 		return (dt_set_errno(dtp, EDT_DIRABORT));
1189 
1190 	case DTRACE_AGGWALK_DENORMALIZE:
1191 		h->dtahe_data.dtada_normal = 1;
1192 		return (0);
1193 
1194 	case DTRACE_AGGWALK_NORMALIZE:
1195 		if (h->dtahe_data.dtada_normal == 0) {
1196 			h->dtahe_data.dtada_normal = 1;
1197 			return (dt_set_errno(dtp, EDT_BADRVAL));
1198 		}
1199 
1200 		return (0);
1201 
1202 	case DTRACE_AGGWALK_REMOVE: {
1203 		dtrace_aggdata_t *aggdata = &h->dtahe_data;
1204 		int i, max_cpus = agp->dtat_maxcpu;
1205 
1206 		/*
1207 		 * First, remove this hash entry from its hash chain.
1208 		 */
1209 		if (h->dtahe_prev != NULL) {
1210 			h->dtahe_prev->dtahe_next = h->dtahe_next;
1211 		} else {
1212 			dt_ahash_t *hash = &agp->dtat_hash;
1213 			size_t ndx = h->dtahe_hashval % hash->dtah_size;
1214 
1215 			assert(hash->dtah_hash[ndx] == h);
1216 			hash->dtah_hash[ndx] = h->dtahe_next;
1217 		}
1218 
1219 		if (h->dtahe_next != NULL)
1220 			h->dtahe_next->dtahe_prev = h->dtahe_prev;
1221 
1222 		/*
1223 		 * Now remove it from the list of all hash entries.
1224 		 */
1225 		if (h->dtahe_prevall != NULL) {
1226 			h->dtahe_prevall->dtahe_nextall = h->dtahe_nextall;
1227 		} else {
1228 			dt_ahash_t *hash = &agp->dtat_hash;
1229 
1230 			assert(hash->dtah_all == h);
1231 			hash->dtah_all = h->dtahe_nextall;
1232 		}
1233 
1234 		if (h->dtahe_nextall != NULL)
1235 			h->dtahe_nextall->dtahe_prevall = h->dtahe_prevall;
1236 
1237 		/*
1238 		 * We're unlinked.  We can safely destroy the data.
1239 		 */
1240 		if (aggdata->dtada_percpu != NULL) {
1241 			for (i = 0; i < max_cpus; i++)
1242 				free(aggdata->dtada_percpu[i]);
1243 			free(aggdata->dtada_percpu);
1244 		}
1245 
1246 		free(aggdata->dtada_data);
1247 		free(h);
1248 
1249 		return (0);
1250 	}
1251 
1252 	default:
1253 		return (dt_set_errno(dtp, EDT_BADRVAL));
1254 	}
1255 
1256 	return (0);
1257 }
1258 
1259 void
1260 dt_aggregate_qsort(dtrace_hdl_t *dtp, void *base, size_t nel, size_t width,
1261     int (*compar)(const void *, const void *))
1262 {
1263 	int rev = dt_revsort, key = dt_keysort, keypos = dt_keypos;
1264 	dtrace_optval_t keyposopt = dtp->dt_options[DTRACEOPT_AGGSORTKEYPOS];
1265 
1266 	dt_revsort = (dtp->dt_options[DTRACEOPT_AGGSORTREV] != DTRACEOPT_UNSET);
1267 	dt_keysort = (dtp->dt_options[DTRACEOPT_AGGSORTKEY] != DTRACEOPT_UNSET);
1268 
1269 	if (keyposopt != DTRACEOPT_UNSET && keyposopt <= INT_MAX) {
1270 		dt_keypos = (int)keyposopt;
1271 	} else {
1272 		dt_keypos = 0;
1273 	}
1274 
1275 	if (compar == NULL) {
1276 		if (!dt_keysort) {
1277 			compar = dt_aggregate_varvalcmp;
1278 		} else {
1279 			compar = dt_aggregate_varkeycmp;
1280 		}
1281 	}
1282 
1283 	qsort(base, nel, width, compar);
1284 
1285 	dt_revsort = rev;
1286 	dt_keysort = key;
1287 	dt_keypos = keypos;
1288 }
1289 
1290 int
1291 dtrace_aggregate_walk(dtrace_hdl_t *dtp, dtrace_aggregate_f *func, void *arg)
1292 {
1293 	dt_ahashent_t *h, *next;
1294 	dt_ahash_t *hash = &dtp->dt_aggregate.dtat_hash;
1295 
1296 	for (h = hash->dtah_all; h != NULL; h = next) {
1297 		/*
1298 		 * dt_aggwalk_rval() can potentially remove the current hash
1299 		 * entry; we need to load the next hash entry before calling
1300 		 * into it.
1301 		 */
1302 		next = h->dtahe_nextall;
1303 
1304 		if (dt_aggwalk_rval(dtp, h, func(&h->dtahe_data, arg)) == -1)
1305 			return (-1);
1306 	}
1307 
1308 	return (0);
1309 }
1310 
1311 static int
1312 dt_aggregate_walk_sorted(dtrace_hdl_t *dtp,
1313     dtrace_aggregate_f *func, void *arg,
1314     int (*sfunc)(const void *, const void *))
1315 {
1316 	dt_aggregate_t *agp = &dtp->dt_aggregate;
1317 	dt_ahashent_t *h, **sorted;
1318 	dt_ahash_t *hash = &agp->dtat_hash;
1319 	size_t i, nentries = 0;
1320 
1321 	for (h = hash->dtah_all; h != NULL; h = h->dtahe_nextall)
1322 		nentries++;
1323 
1324 	sorted = dt_alloc(dtp, nentries * sizeof (dt_ahashent_t *));
1325 
1326 	if (sorted == NULL)
1327 		return (-1);
1328 
1329 	for (h = hash->dtah_all, i = 0; h != NULL; h = h->dtahe_nextall)
1330 		sorted[i++] = h;
1331 
1332 	(void) pthread_mutex_lock(&dt_qsort_lock);
1333 
1334 	if (sfunc == NULL) {
1335 		dt_aggregate_qsort(dtp, sorted, nentries,
1336 		    sizeof (dt_ahashent_t *), NULL);
1337 	} else {
1338 		/*
1339 		 * If we've been explicitly passed a sorting function,
1340 		 * we'll use that -- ignoring the values of the "aggsortrev",
1341 		 * "aggsortkey" and "aggsortkeypos" options.
1342 		 */
1343 		qsort(sorted, nentries, sizeof (dt_ahashent_t *), sfunc);
1344 	}
1345 
1346 	(void) pthread_mutex_unlock(&dt_qsort_lock);
1347 
1348 	for (i = 0; i < nentries; i++) {
1349 		h = sorted[i];
1350 
1351 		if (dt_aggwalk_rval(dtp, h, func(&h->dtahe_data, arg)) == -1) {
1352 			dt_free(dtp, sorted);
1353 			return (-1);
1354 		}
1355 	}
1356 
1357 	dt_free(dtp, sorted);
1358 	return (0);
1359 }
1360 
1361 int
1362 dtrace_aggregate_walk_sorted(dtrace_hdl_t *dtp,
1363     dtrace_aggregate_f *func, void *arg)
1364 {
1365 	return (dt_aggregate_walk_sorted(dtp, func, arg, NULL));
1366 }
1367 
1368 int
1369 dtrace_aggregate_walk_keysorted(dtrace_hdl_t *dtp,
1370     dtrace_aggregate_f *func, void *arg)
1371 {
1372 	return (dt_aggregate_walk_sorted(dtp, func,
1373 	    arg, dt_aggregate_varkeycmp));
1374 }
1375 
1376 int
1377 dtrace_aggregate_walk_valsorted(dtrace_hdl_t *dtp,
1378     dtrace_aggregate_f *func, void *arg)
1379 {
1380 	return (dt_aggregate_walk_sorted(dtp, func,
1381 	    arg, dt_aggregate_varvalcmp));
1382 }
1383 
1384 int
1385 dtrace_aggregate_walk_keyvarsorted(dtrace_hdl_t *dtp,
1386     dtrace_aggregate_f *func, void *arg)
1387 {
1388 	return (dt_aggregate_walk_sorted(dtp, func,
1389 	    arg, dt_aggregate_keyvarcmp));
1390 }
1391 
1392 int
1393 dtrace_aggregate_walk_valvarsorted(dtrace_hdl_t *dtp,
1394     dtrace_aggregate_f *func, void *arg)
1395 {
1396 	return (dt_aggregate_walk_sorted(dtp, func,
1397 	    arg, dt_aggregate_valvarcmp));
1398 }
1399 
1400 int
1401 dtrace_aggregate_walk_keyrevsorted(dtrace_hdl_t *dtp,
1402     dtrace_aggregate_f *func, void *arg)
1403 {
1404 	return (dt_aggregate_walk_sorted(dtp, func,
1405 	    arg, dt_aggregate_varkeyrevcmp));
1406 }
1407 
1408 int
1409 dtrace_aggregate_walk_valrevsorted(dtrace_hdl_t *dtp,
1410     dtrace_aggregate_f *func, void *arg)
1411 {
1412 	return (dt_aggregate_walk_sorted(dtp, func,
1413 	    arg, dt_aggregate_varvalrevcmp));
1414 }
1415 
1416 int
1417 dtrace_aggregate_walk_keyvarrevsorted(dtrace_hdl_t *dtp,
1418     dtrace_aggregate_f *func, void *arg)
1419 {
1420 	return (dt_aggregate_walk_sorted(dtp, func,
1421 	    arg, dt_aggregate_keyvarrevcmp));
1422 }
1423 
1424 int
1425 dtrace_aggregate_walk_valvarrevsorted(dtrace_hdl_t *dtp,
1426     dtrace_aggregate_f *func, void *arg)
1427 {
1428 	return (dt_aggregate_walk_sorted(dtp, func,
1429 	    arg, dt_aggregate_valvarrevcmp));
1430 }
1431 
1432 int
1433 dtrace_aggregate_walk_joined(dtrace_hdl_t *dtp, dtrace_aggvarid_t *aggvars,
1434     int naggvars, dtrace_aggregate_walk_joined_f *func, void *arg)
1435 {
1436 	dt_aggregate_t *agp = &dtp->dt_aggregate;
1437 	dt_ahashent_t *h, **sorted = NULL, ***bundle, **nbundle;
1438 	const dtrace_aggdata_t **data;
1439 	dt_ahashent_t *zaggdata = NULL;
1440 	dt_ahash_t *hash = &agp->dtat_hash;
1441 	size_t nentries = 0, nbundles = 0, start, zsize = 0, bundlesize;
1442 	dtrace_aggvarid_t max = 0, aggvar;
1443 	int rval = -1, *map, *remap = NULL;
1444 	int i, j;
1445 	dtrace_optval_t sortpos = dtp->dt_options[DTRACEOPT_AGGSORTPOS];
1446 
1447 	/*
1448 	 * If the sorting position is greater than the number of aggregation
1449 	 * variable IDs, we silently set it to 0.
1450 	 */
1451 	if (sortpos == DTRACEOPT_UNSET || sortpos >= naggvars)
1452 		sortpos = 0;
1453 
1454 	/*
1455 	 * First we need to translate the specified aggregation variable IDs
1456 	 * into a linear map that will allow us to translate an aggregation
1457 	 * variable ID into its position in the specified aggvars.
1458 	 */
1459 	for (i = 0; i < naggvars; i++) {
1460 		if (aggvars[i] == DTRACE_AGGVARIDNONE || aggvars[i] < 0)
1461 			return (dt_set_errno(dtp, EDT_BADAGGVAR));
1462 
1463 		if (aggvars[i] > max)
1464 			max = aggvars[i];
1465 	}
1466 
1467 	if ((map = dt_zalloc(dtp, (max + 1) * sizeof (int))) == NULL)
1468 		return (-1);
1469 
1470 	zaggdata = dt_zalloc(dtp, naggvars * sizeof (dt_ahashent_t));
1471 
1472 	if (zaggdata == NULL)
1473 		goto out;
1474 
1475 	for (i = 0; i < naggvars; i++) {
1476 		int ndx = i + sortpos;
1477 
1478 		if (ndx >= naggvars)
1479 			ndx -= naggvars;
1480 
1481 		aggvar = aggvars[ndx];
1482 		assert(aggvar <= max);
1483 
1484 		if (map[aggvar]) {
1485 			/*
1486 			 * We have an aggregation variable that is present
1487 			 * more than once in the array of aggregation
1488 			 * variables.  While it's unclear why one might want
1489 			 * to do this, it's legal.  To support this construct,
1490 			 * we will allocate a remap that will indicate the
1491 			 * position from which this aggregation variable
1492 			 * should be pulled.  (That is, where the remap will
1493 			 * map from one position to another.)
1494 			 */
1495 			if (remap == NULL) {
1496 				remap = dt_zalloc(dtp, naggvars * sizeof (int));
1497 
1498 				if (remap == NULL)
1499 					goto out;
1500 			}
1501 
1502 			/*
1503 			 * Given that the variable is already present, assert
1504 			 * that following through the mapping and adjusting
1505 			 * for the sort position yields the same aggregation
1506 			 * variable ID.
1507 			 */
1508 			assert(aggvars[(map[aggvar] - 1 + sortpos) %
1509 			    naggvars] == aggvars[ndx]);
1510 
1511 			remap[i] = map[aggvar];
1512 			continue;
1513 		}
1514 
1515 		map[aggvar] = i + 1;
1516 	}
1517 
1518 	/*
1519 	 * We need to take two passes over the data to size our allocation, so
1520 	 * we'll use the first pass to also fill in the zero-filled data to be
1521 	 * used to properly format a zero-valued aggregation.
1522 	 */
1523 	for (h = hash->dtah_all; h != NULL; h = h->dtahe_nextall) {
1524 		dtrace_aggvarid_t id;
1525 		int ndx;
1526 
1527 		if ((id = dt_aggregate_aggvarid(h)) > max || !(ndx = map[id]))
1528 			continue;
1529 
1530 		if (zaggdata[ndx - 1].dtahe_size == 0) {
1531 			zaggdata[ndx - 1].dtahe_size = h->dtahe_size;
1532 			zaggdata[ndx - 1].dtahe_data = h->dtahe_data;
1533 		}
1534 
1535 		nentries++;
1536 	}
1537 
1538 	if (nentries == 0) {
1539 		/*
1540 		 * We couldn't find any entries; there is nothing else to do.
1541 		 */
1542 		rval = 0;
1543 		goto out;
1544 	}
1545 
1546 	/*
1547 	 * Before we sort the data, we're going to look for any holes in our
1548 	 * zero-filled data.  This will occur if an aggregation variable that
1549 	 * we are being asked to print has not yet been assigned the result of
1550 	 * any aggregating action for _any_ tuple.  The issue becomes that we
1551 	 * would like a zero value to be printed for all columns for this
1552 	 * aggregation, but without any record description, we don't know the
1553 	 * aggregating action that corresponds to the aggregation variable.  To
1554 	 * try to find a match, we're simply going to lookup aggregation IDs
1555 	 * (which are guaranteed to be contiguous and to start from 1), looking
1556 	 * for the specified aggregation variable ID.  If we find a match,
1557 	 * we'll use that.  If we iterate over all aggregation IDs and don't
1558 	 * find a match, then we must be an anonymous enabling.  (Anonymous
1559 	 * enablings can't currently derive either aggregation variable IDs or
1560 	 * aggregation variable names given only an aggregation ID.)  In this
1561 	 * obscure case (anonymous enabling, multiple aggregation printa() with
1562 	 * some aggregations not represented for any tuple), our defined
1563 	 * behavior is that the zero will be printed in the format of the first
1564 	 * aggregation variable that contains any non-zero value.
1565 	 */
1566 	for (i = 0; i < naggvars; i++) {
1567 		if (zaggdata[i].dtahe_size == 0) {
1568 			dtrace_aggvarid_t aggvar;
1569 
1570 			aggvar = aggvars[(i - sortpos + naggvars) % naggvars];
1571 			assert(zaggdata[i].dtahe_data.dtada_data == NULL);
1572 
1573 			for (j = DTRACE_AGGIDNONE + 1; ; j++) {
1574 				dtrace_aggdesc_t *agg;
1575 				dtrace_aggdata_t *aggdata;
1576 
1577 				if (dt_aggid_lookup(dtp, j, &agg) != 0)
1578 					break;
1579 
1580 				if (agg->dtagd_varid != aggvar)
1581 					continue;
1582 
1583 				/*
1584 				 * We have our description -- now we need to
1585 				 * cons up the zaggdata entry for it.
1586 				 */
1587 				aggdata = &zaggdata[i].dtahe_data;
1588 				aggdata->dtada_size = agg->dtagd_size;
1589 				aggdata->dtada_desc = agg;
1590 				aggdata->dtada_handle = dtp;
1591 				(void) dt_epid_lookup(dtp, agg->dtagd_epid,
1592 				    &aggdata->dtada_edesc,
1593 				    &aggdata->dtada_pdesc);
1594 				aggdata->dtada_normal = 1;
1595 				zaggdata[i].dtahe_hashval = 0;
1596 				zaggdata[i].dtahe_size = agg->dtagd_size;
1597 				break;
1598 			}
1599 
1600 			if (zaggdata[i].dtahe_size == 0) {
1601 				caddr_t data;
1602 
1603 				/*
1604 				 * We couldn't find this aggregation, meaning
1605 				 * that we have never seen it before for any
1606 				 * tuple _and_ this is an anonymous enabling.
1607 				 * That is, we're in the obscure case outlined
1608 				 * above.  In this case, our defined behavior
1609 				 * is to format the data in the format of the
1610 				 * first non-zero aggregation -- of which, of
1611 				 * course, we know there to be at least one
1612 				 * (or nentries would have been zero).
1613 				 */
1614 				for (j = 0; j < naggvars; j++) {
1615 					if (zaggdata[j].dtahe_size != 0)
1616 						break;
1617 				}
1618 
1619 				assert(j < naggvars);
1620 				zaggdata[i] = zaggdata[j];
1621 
1622 				data = zaggdata[i].dtahe_data.dtada_data;
1623 				assert(data != NULL);
1624 			}
1625 		}
1626 	}
1627 
1628 	/*
1629 	 * Now we need to allocate our zero-filled data for use for
1630 	 * aggregations that don't have a value corresponding to a given key.
1631 	 */
1632 	for (i = 0; i < naggvars; i++) {
1633 		dtrace_aggdata_t *aggdata = &zaggdata[i].dtahe_data;
1634 		dtrace_aggdesc_t *aggdesc = aggdata->dtada_desc;
1635 		dtrace_recdesc_t *rec;
1636 		uint64_t larg;
1637 		caddr_t zdata;
1638 
1639 		zsize = zaggdata[i].dtahe_size;
1640 		assert(zsize != 0);
1641 
1642 		if ((zdata = dt_zalloc(dtp, zsize)) == NULL) {
1643 			/*
1644 			 * If we failed to allocated some zero-filled data, we
1645 			 * need to zero out the remaining dtada_data pointers
1646 			 * to prevent the wrong data from being freed below.
1647 			 */
1648 			for (j = i; j < naggvars; j++)
1649 				zaggdata[j].dtahe_data.dtada_data = NULL;
1650 			goto out;
1651 		}
1652 
1653 		aggvar = aggvars[(i - sortpos + naggvars) % naggvars];
1654 
1655 		/*
1656 		 * First, the easy bit.  To maintain compatibility with
1657 		 * consumers that pull the compiler-generated ID out of the
1658 		 * data, we put that ID at the top of the zero-filled data.
1659 		 */
1660 		rec = &aggdesc->dtagd_rec[0];
1661 		/* LINTED - alignment */
1662 		*((dtrace_aggvarid_t *)(zdata + rec->dtrd_offset)) = aggvar;
1663 
1664 		rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
1665 
1666 		/*
1667 		 * Now for the more complicated part.  If (and only if) this
1668 		 * is an lquantize() aggregating action, zero-filled data is
1669 		 * not equivalent to an empty record:  we must also get the
1670 		 * parameters for the lquantize().
1671 		 */
1672 		if (rec->dtrd_action == DTRACEAGG_LQUANTIZE) {
1673 			if (aggdata->dtada_data != NULL) {
1674 				/*
1675 				 * The easier case here is if we actually have
1676 				 * some prototype data -- in which case we
1677 				 * manually dig it out of the aggregation
1678 				 * record.
1679 				 */
1680 				/* LINTED - alignment */
1681 				larg = *((uint64_t *)(aggdata->dtada_data +
1682 				    rec->dtrd_offset));
1683 			} else {
1684 				/*
1685 				 * We don't have any prototype data.  As a
1686 				 * result, we know that we _do_ have the
1687 				 * compiler-generated information.  (If this
1688 				 * were an anonymous enabling, all of our
1689 				 * zero-filled data would have prototype data
1690 				 * -- either directly or indirectly.) So as
1691 				 * gross as it is, we'll grovel around in the
1692 				 * compiler-generated information to find the
1693 				 * lquantize() parameters.
1694 				 */
1695 				dtrace_stmtdesc_t *sdp;
1696 				dt_ident_t *aid;
1697 				dt_idsig_t *isp;
1698 
1699 				sdp = (dtrace_stmtdesc_t *)(uintptr_t)
1700 				    aggdesc->dtagd_rec[0].dtrd_uarg;
1701 				aid = sdp->dtsd_aggdata;
1702 				isp = (dt_idsig_t *)aid->di_data;
1703 				assert(isp->dis_auxinfo != 0);
1704 				larg = isp->dis_auxinfo;
1705 			}
1706 
1707 			/* LINTED - alignment */
1708 			*((uint64_t *)(zdata + rec->dtrd_offset)) = larg;
1709 		}
1710 
1711 		aggdata->dtada_data = zdata;
1712 	}
1713 
1714 	/*
1715 	 * Now that we've dealt with setting up our zero-filled data, we can
1716 	 * allocate our sorted array, and take another pass over the data to
1717 	 * fill it.
1718 	 */
1719 	sorted = dt_alloc(dtp, nentries * sizeof (dt_ahashent_t *));
1720 
1721 	if (sorted == NULL)
1722 		goto out;
1723 
1724 	for (h = hash->dtah_all, i = 0; h != NULL; h = h->dtahe_nextall) {
1725 		dtrace_aggvarid_t id;
1726 
1727 		if ((id = dt_aggregate_aggvarid(h)) > max || !map[id])
1728 			continue;
1729 
1730 		sorted[i++] = h;
1731 	}
1732 
1733 	assert(i == nentries);
1734 
1735 	/*
1736 	 * We've loaded our array; now we need to sort by value to allow us
1737 	 * to create bundles of like value.  We're going to acquire the
1738 	 * dt_qsort_lock here, and hold it across all of our subsequent
1739 	 * comparison and sorting.
1740 	 */
1741 	(void) pthread_mutex_lock(&dt_qsort_lock);
1742 
1743 	qsort(sorted, nentries, sizeof (dt_ahashent_t *),
1744 	    dt_aggregate_keyvarcmp);
1745 
1746 	/*
1747 	 * Now we need to go through and create bundles.  Because the number
1748 	 * of bundles is bounded by the size of the sorted array, we're going
1749 	 * to reuse the underlying storage.  And note that "bundle" is an
1750 	 * array of pointers to arrays of pointers to dt_ahashent_t -- making
1751 	 * its type (regrettably) "dt_ahashent_t ***".  (Regrettable because
1752 	 * '*' -- like '_' and 'X' -- should never appear in triplicate in
1753 	 * an ideal world.)
1754 	 */
1755 	bundle = (dt_ahashent_t ***)sorted;
1756 
1757 	for (i = 1, start = 0; i <= nentries; i++) {
1758 		if (i < nentries &&
1759 		    dt_aggregate_keycmp(&sorted[i], &sorted[i - 1]) == 0)
1760 			continue;
1761 
1762 		/*
1763 		 * We have a bundle boundary.  Everything from start to
1764 		 * (i - 1) belongs in one bundle.
1765 		 */
1766 		assert(i - start <= naggvars);
1767 		bundlesize = (naggvars + 2) * sizeof (dt_ahashent_t *);
1768 
1769 		if ((nbundle = dt_zalloc(dtp, bundlesize)) == NULL) {
1770 			(void) pthread_mutex_unlock(&dt_qsort_lock);
1771 			goto out;
1772 		}
1773 
1774 		for (j = start; j < i; j++) {
1775 			dtrace_aggvarid_t id = dt_aggregate_aggvarid(sorted[j]);
1776 
1777 			assert(id <= max);
1778 			assert(map[id] != 0);
1779 			assert(map[id] - 1 < naggvars);
1780 			assert(nbundle[map[id] - 1] == NULL);
1781 			nbundle[map[id] - 1] = sorted[j];
1782 
1783 			if (nbundle[naggvars] == NULL)
1784 				nbundle[naggvars] = sorted[j];
1785 		}
1786 
1787 		for (j = 0; j < naggvars; j++) {
1788 			if (nbundle[j] != NULL)
1789 				continue;
1790 
1791 			/*
1792 			 * Before we assume that this aggregation variable
1793 			 * isn't present (and fall back to using the
1794 			 * zero-filled data allocated earlier), check the
1795 			 * remap.  If we have a remapping, we'll drop it in
1796 			 * here.  Note that we might be remapping an
1797 			 * aggregation variable that isn't present for this
1798 			 * key; in this case, the aggregation data that we
1799 			 * copy will point to the zeroed data.
1800 			 */
1801 			if (remap != NULL && remap[j]) {
1802 				assert(remap[j] - 1 < j);
1803 				assert(nbundle[remap[j] - 1] != NULL);
1804 				nbundle[j] = nbundle[remap[j] - 1];
1805 			} else {
1806 				nbundle[j] = &zaggdata[j];
1807 			}
1808 		}
1809 
1810 		bundle[nbundles++] = nbundle;
1811 		start = i;
1812 	}
1813 
1814 	/*
1815 	 * Now we need to re-sort based on the first value.
1816 	 */
1817 	dt_aggregate_qsort(dtp, bundle, nbundles, sizeof (dt_ahashent_t **),
1818 	    dt_aggregate_bundlecmp);
1819 
1820 	(void) pthread_mutex_unlock(&dt_qsort_lock);
1821 
1822 	/*
1823 	 * We're done!  Now we just need to go back over the sorted bundles,
1824 	 * calling the function.
1825 	 */
1826 	data = alloca((naggvars + 1) * sizeof (dtrace_aggdata_t *));
1827 
1828 	for (i = 0; i < nbundles; i++) {
1829 		for (j = 0; j < naggvars; j++)
1830 			data[j + 1] = NULL;
1831 
1832 		for (j = 0; j < naggvars; j++) {
1833 			int ndx = j - sortpos;
1834 
1835 			if (ndx < 0)
1836 				ndx += naggvars;
1837 
1838 			assert(bundle[i][ndx] != NULL);
1839 			data[j + 1] = &bundle[i][ndx]->dtahe_data;
1840 		}
1841 
1842 		for (j = 0; j < naggvars; j++)
1843 			assert(data[j + 1] != NULL);
1844 
1845 		/*
1846 		 * The representative key is the last element in the bundle.
1847 		 * Assert that we have one, and then set it to be the first
1848 		 * element of data.
1849 		 */
1850 		assert(bundle[i][j] != NULL);
1851 		data[0] = &bundle[i][j]->dtahe_data;
1852 
1853 		if ((rval = func(data, naggvars + 1, arg)) == -1)
1854 			goto out;
1855 	}
1856 
1857 	rval = 0;
1858 out:
1859 	for (i = 0; i < nbundles; i++)
1860 		dt_free(dtp, bundle[i]);
1861 
1862 	if (zaggdata != NULL) {
1863 		for (i = 0; i < naggvars; i++)
1864 			dt_free(dtp, zaggdata[i].dtahe_data.dtada_data);
1865 	}
1866 
1867 	dt_free(dtp, zaggdata);
1868 	dt_free(dtp, sorted);
1869 	dt_free(dtp, remap);
1870 	dt_free(dtp, map);
1871 
1872 	return (rval);
1873 }
1874 
1875 int
1876 dtrace_aggregate_print(dtrace_hdl_t *dtp, FILE *fp,
1877     dtrace_aggregate_walk_f *func)
1878 {
1879 	dt_print_aggdata_t pd;
1880 
1881 	pd.dtpa_dtp = dtp;
1882 	pd.dtpa_fp = fp;
1883 	pd.dtpa_allunprint = 1;
1884 
1885 	if (func == NULL)
1886 		func = dtrace_aggregate_walk_sorted;
1887 
1888 	if ((*func)(dtp, dt_print_agg, &pd) == -1)
1889 		return (dt_set_errno(dtp, dtp->dt_errno));
1890 
1891 	return (0);
1892 }
1893 
1894 void
1895 dtrace_aggregate_clear(dtrace_hdl_t *dtp)
1896 {
1897 	dt_aggregate_t *agp = &dtp->dt_aggregate;
1898 	dt_ahash_t *hash = &agp->dtat_hash;
1899 	dt_ahashent_t *h;
1900 	dtrace_aggdata_t *data;
1901 	dtrace_aggdesc_t *aggdesc;
1902 	dtrace_recdesc_t *rec;
1903 	int i, max_cpus = agp->dtat_maxcpu;
1904 
1905 	for (h = hash->dtah_all; h != NULL; h = h->dtahe_nextall) {
1906 		aggdesc = h->dtahe_data.dtada_desc;
1907 		rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
1908 		data = &h->dtahe_data;
1909 
1910 		bzero(&data->dtada_data[rec->dtrd_offset], rec->dtrd_size);
1911 
1912 		if (data->dtada_percpu == NULL)
1913 			continue;
1914 
1915 		for (i = 0; i < max_cpus; i++)
1916 			bzero(data->dtada_percpu[i], rec->dtrd_size);
1917 	}
1918 }
1919 
1920 void
1921 dt_aggregate_destroy(dtrace_hdl_t *dtp)
1922 {
1923 	dt_aggregate_t *agp = &dtp->dt_aggregate;
1924 	dt_ahash_t *hash = &agp->dtat_hash;
1925 	dt_ahashent_t *h, *next;
1926 	dtrace_aggdata_t *aggdata;
1927 	int i, max_cpus = agp->dtat_maxcpu;
1928 
1929 	if (hash->dtah_hash == NULL) {
1930 		assert(hash->dtah_all == NULL);
1931 	} else {
1932 		free(hash->dtah_hash);
1933 
1934 		for (h = hash->dtah_all; h != NULL; h = next) {
1935 			next = h->dtahe_nextall;
1936 
1937 			aggdata = &h->dtahe_data;
1938 
1939 			if (aggdata->dtada_percpu != NULL) {
1940 				for (i = 0; i < max_cpus; i++)
1941 					free(aggdata->dtada_percpu[i]);
1942 				free(aggdata->dtada_percpu);
1943 			}
1944 
1945 			free(aggdata->dtada_data);
1946 			free(h);
1947 		}
1948 
1949 		hash->dtah_hash = NULL;
1950 		hash->dtah_all = NULL;
1951 		hash->dtah_size = 0;
1952 	}
1953 
1954 	free(agp->dtat_buf.dtbd_data);
1955 	free(agp->dtat_cpus);
1956 }
1957