xref: /titanic_41/usr/src/uts/common/dtrace/profile.c (revision 5f8171005a0c33f3c67f7da52d41c2362c3fd891)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 
27 #include <sys/errno.h>
28 #include <sys/stat.h>
29 #include <sys/modctl.h>
30 #include <sys/conf.h>
31 #include <sys/systm.h>
32 #include <sys/ddi.h>
33 #include <sys/sunddi.h>
34 #include <sys/cpuvar.h>
35 #include <sys/kmem.h>
36 #include <sys/strsubr.h>
37 #include <sys/dtrace.h>
38 #include <sys/cyclic.h>
39 #include <sys/atomic.h>
40 
41 static dev_info_t *profile_devi;
42 static dtrace_provider_id_t profile_id;
43 
44 /*
45  * Regardless of platform, the stack frames look like this in the case of the
46  * profile provider:
47  *
48  *	profile_fire
49  *	cyclic_expire
50  *	cyclic_fire
51  *	[ cbe ]
52  *	[ interrupt code ]
53  *
54  * On x86, there are five frames from the generic interrupt code; further, the
55  * interrupted instruction appears as its own stack frame, giving us a total of
56  * 10.
57  *
58  * On SPARC, the picture is further complicated because the compiler
59  * optimizes away tail-calls -- so the following frames are optimized away:
60  *
61  * 	profile_fire
62  *	cyclic_expire
63  *
64  * This gives three frames.  However, on DEBUG kernels, the cyclic_expire
65  * frame cannot be tail-call eliminated, yielding four frames in this case.
66  *
67  * All of the above constraints lead to the mess below.  Yes, the profile
68  * provider should ideally figure this out on-the-fly by hitting one of its own
69  * probes and then walking its own stack trace.  This is complicated, however,
70  * and the static definition doesn't seem to be overly brittle.  Still, we
71  * allow for a manual override in case we get it completely wrong.
72  */
/*
 * Number of frames that sit between the probe site and the interrupted
 * code, chosen per platform.  Platforms other than x86 and SPARC get no
 * definition at all.
 */
#if defined(__x86)
#define	PROF_ARTIFICIAL_FRAMES	10
#elif defined(__sparc)
#if defined(DEBUG)
#define	PROF_ARTIFICIAL_FRAMES	4
#else
#define	PROF_ARTIFICIAL_FRAMES	3
#endif
#endif

/* Size, in bytes, of the buffers that hold a profile probe name. */
#define	PROF_NAMELEN		15

/* Values stored in prof_kind. */
#define	PROF_PROFILE		0
#define	PROF_TICK		1

/* Probe-name prefixes matching the two kinds above. */
#define	PROF_PREFIX_PROFILE	"profile-"
#define	PROF_PREFIX_TICK	"tick-"
91 
92 typedef struct profile_probe {
93 	char		prof_name[PROF_NAMELEN];
94 	dtrace_id_t	prof_id;
95 	int		prof_kind;
96 	hrtime_t	prof_interval;
97 	cyclic_id_t	prof_cyclic;
98 } profile_probe_t;
99 
100 typedef struct profile_probe_percpu {
101 	hrtime_t	profc_expected;
102 	hrtime_t	profc_interval;
103 	profile_probe_t	*profc_probe;
104 } profile_probe_percpu_t;
105 
106 hrtime_t	profile_interval_min = NANOSEC / 5000;		/* 5000 hz */
107 int		profile_aframes = 0;				/* override */
108 
/*
 * Rates (in hz) of the profile-* probes provided by default.  The table has
 * twenty slots; slots left at zero are skipped by profile_provide().
 */
static int profile_rates[20] = {
	97, 199, 499, 997, 1999, 4001, 4999
};
115 
/*
 * Rates (in hz) of the tick-* probes provided by default.  The table has
 * fifteen slots; slots left at zero are skipped by profile_provide().
 */
static int profile_ticks[15] = {
	1, 10, 100, 500, 1000, 5000
};
121 
/*
 * profile_max is the upper bound on the number of profile probes that may
 * exist at once; it keeps malicious or clumsy users from exhausting system
 * resources by creating a slew of profile probes.  It is set at attach time
 * from the profile-max-probes property in profile.conf, falling back to
 * PROFILE_MAX_DEFAULT when that property is absent.  profile_total counts
 * the probes that currently exist.
 */
#define	PROFILE_MAX_DEFAULT	1000	/* default max. number of probes */

static uint32_t profile_max;	/* maximum number of profile probes */
static uint32_t profile_total;	/* current number of profile probes */
132 
133 static void
134 profile_fire(void *arg)
135 {
136 	profile_probe_percpu_t *pcpu = arg;
137 	profile_probe_t *prof = pcpu->profc_probe;
138 	hrtime_t late;
139 
140 	late = dtrace_gethrtime() - pcpu->profc_expected;
141 	pcpu->profc_expected += pcpu->profc_interval;
142 
143 	dtrace_probe(prof->prof_id, CPU->cpu_profile_pc,
144 	    CPU->cpu_profile_upc, late, 0, 0);
145 }
146 
147 static void
148 profile_tick(void *arg)
149 {
150 	profile_probe_t *prof = arg;
151 
152 	dtrace_probe(prof->prof_id, CPU->cpu_profile_pc,
153 	    CPU->cpu_profile_upc, 0, 0, 0);
154 }
155 
156 static void
157 profile_create(hrtime_t interval, const char *name, int kind)
158 {
159 	profile_probe_t *prof;
160 	int nr_frames = PROF_ARTIFICIAL_FRAMES + dtrace_mach_aframes();
161 
162 	if (profile_aframes)
163 		nr_frames = profile_aframes;
164 
165 	if (interval < profile_interval_min)
166 		return;
167 
168 	if (dtrace_probe_lookup(profile_id, NULL, NULL, name) != 0)
169 		return;
170 
171 	atomic_add_32(&profile_total, 1);
172 	if (profile_total > profile_max) {
173 		atomic_add_32(&profile_total, -1);
174 		return;
175 	}
176 
177 	prof = kmem_zalloc(sizeof (profile_probe_t), KM_SLEEP);
178 	(void) strcpy(prof->prof_name, name);
179 	prof->prof_interval = interval;
180 	prof->prof_cyclic = CYCLIC_NONE;
181 	prof->prof_kind = kind;
182 	prof->prof_id = dtrace_probe_create(profile_id,
183 	    NULL, NULL, name, nr_frames, prof);
184 }
185 
186 /*ARGSUSED*/
187 static void
188 profile_provide(void *arg, const dtrace_probedesc_t *desc)
189 {
190 	int i, j, rate, kind;
191 	hrtime_t val = 0, mult = 1, len;
192 	const char *name, *suffix = NULL;
193 
194 	const struct {
195 		char *prefix;
196 		int kind;
197 	} types[] = {
198 		{ PROF_PREFIX_PROFILE, PROF_PROFILE },
199 		{ PROF_PREFIX_TICK, PROF_TICK },
200 		{ NULL, NULL }
201 	};
202 
203 	const struct {
204 		char *name;
205 		hrtime_t mult;
206 	} suffixes[] = {
207 		{ "ns", 	NANOSEC / NANOSEC },
208 		{ "nsec",	NANOSEC / NANOSEC },
209 		{ "us",		NANOSEC / MICROSEC },
210 		{ "usec",	NANOSEC / MICROSEC },
211 		{ "ms",		NANOSEC / MILLISEC },
212 		{ "msec",	NANOSEC / MILLISEC },
213 		{ "s",		NANOSEC / SEC },
214 		{ "sec",	NANOSEC / SEC },
215 		{ "m",		NANOSEC * (hrtime_t)60 },
216 		{ "min",	NANOSEC * (hrtime_t)60 },
217 		{ "h",		NANOSEC * (hrtime_t)(60 * 60) },
218 		{ "hour",	NANOSEC * (hrtime_t)(60 * 60) },
219 		{ "d",		NANOSEC * (hrtime_t)(24 * 60 * 60) },
220 		{ "day",	NANOSEC * (hrtime_t)(24 * 60 * 60) },
221 		{ "hz",		0 },
222 		{ NULL }
223 	};
224 
225 	if (desc == NULL) {
226 		char n[PROF_NAMELEN];
227 
228 		/*
229 		 * If no description was provided, provide all of our probes.
230 		 */
231 		for (i = 0; i < sizeof (profile_rates) / sizeof (int); i++) {
232 			if ((rate = profile_rates[i]) == 0)
233 				continue;
234 
235 			(void) snprintf(n, PROF_NAMELEN, "%s%d",
236 			    PROF_PREFIX_PROFILE, rate);
237 			profile_create(NANOSEC / rate, n, PROF_PROFILE);
238 		}
239 
240 		for (i = 0; i < sizeof (profile_ticks) / sizeof (int); i++) {
241 			if ((rate = profile_ticks[i]) == 0)
242 				continue;
243 
244 			(void) snprintf(n, PROF_NAMELEN, "%s%d",
245 			    PROF_PREFIX_TICK, rate);
246 			profile_create(NANOSEC / rate, n, PROF_TICK);
247 		}
248 
249 		return;
250 	}
251 
252 	name = desc->dtpd_name;
253 
254 	for (i = 0; types[i].prefix != NULL; i++) {
255 		len = strlen(types[i].prefix);
256 
257 		if (strncmp(name, types[i].prefix, len) != 0)
258 			continue;
259 		break;
260 	}
261 
262 	if (types[i].prefix == NULL)
263 		return;
264 
265 	kind = types[i].kind;
266 	j = strlen(name) - len;
267 
268 	/*
269 	 * We need to start before any time suffix.
270 	 */
271 	for (j = strlen(name); j >= len; j--) {
272 		if (name[j] >= '0' && name[j] <= '9')
273 			break;
274 		suffix = &name[j];
275 	}
276 
277 	ASSERT(suffix != NULL);
278 
279 	/*
280 	 * Now determine the numerical value present in the probe name.
281 	 */
282 	for (; j >= len; j--) {
283 		if (name[j] < '0' || name[j] > '9')
284 			return;
285 
286 		val += (name[j] - '0') * mult;
287 		mult *= (hrtime_t)10;
288 	}
289 
290 	if (val == 0)
291 		return;
292 
293 	/*
294 	 * Look-up the suffix to determine the multiplier.
295 	 */
296 	for (i = 0, mult = 0; suffixes[i].name != NULL; i++) {
297 		if (strcasecmp(suffixes[i].name, suffix) == 0) {
298 			mult = suffixes[i].mult;
299 			break;
300 		}
301 	}
302 
303 	if (suffixes[i].name == NULL && *suffix != '\0')
304 		return;
305 
306 	if (mult == 0) {
307 		/*
308 		 * The default is frequency-per-second.
309 		 */
310 		val = NANOSEC / val;
311 	} else {
312 		val *= mult;
313 	}
314 
315 	profile_create(val, name, kind);
316 }
317 
318 /*ARGSUSED*/
319 static void
320 profile_destroy(void *arg, dtrace_id_t id, void *parg)
321 {
322 	profile_probe_t *prof = parg;
323 
324 	ASSERT(prof->prof_cyclic == CYCLIC_NONE);
325 	kmem_free(prof, sizeof (profile_probe_t));
326 
327 	ASSERT(profile_total >= 1);
328 	atomic_add_32(&profile_total, -1);
329 }
330 
331 /*ARGSUSED*/
332 static void
333 profile_online(void *arg, cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when)
334 {
335 	profile_probe_t *prof = arg;
336 	profile_probe_percpu_t *pcpu;
337 
338 	pcpu = kmem_zalloc(sizeof (profile_probe_percpu_t), KM_SLEEP);
339 	pcpu->profc_probe = prof;
340 
341 	hdlr->cyh_func = profile_fire;
342 	hdlr->cyh_arg = pcpu;
343 	hdlr->cyh_level = CY_HIGH_LEVEL;
344 
345 	when->cyt_interval = prof->prof_interval;
346 	when->cyt_when = dtrace_gethrtime() + when->cyt_interval;
347 
348 	pcpu->profc_expected = when->cyt_when;
349 	pcpu->profc_interval = when->cyt_interval;
350 }
351 
352 /*ARGSUSED*/
353 static void
354 profile_offline(void *arg, cpu_t *cpu, void *oarg)
355 {
356 	profile_probe_percpu_t *pcpu = oarg;
357 
358 	ASSERT(pcpu->profc_probe == arg);
359 	kmem_free(pcpu, sizeof (profile_probe_percpu_t));
360 }
361 
362 /*ARGSUSED*/
363 static int
364 profile_enable(void *arg, dtrace_id_t id, void *parg)
365 {
366 	profile_probe_t *prof = parg;
367 	cyc_omni_handler_t omni;
368 	cyc_handler_t hdlr;
369 	cyc_time_t when;
370 
371 	ASSERT(prof->prof_interval != 0);
372 	ASSERT(MUTEX_HELD(&cpu_lock));
373 
374 	if (prof->prof_kind == PROF_TICK) {
375 		hdlr.cyh_func = profile_tick;
376 		hdlr.cyh_arg = prof;
377 		hdlr.cyh_level = CY_HIGH_LEVEL;
378 
379 		when.cyt_interval = prof->prof_interval;
380 		when.cyt_when = dtrace_gethrtime() + when.cyt_interval;
381 	} else {
382 		ASSERT(prof->prof_kind == PROF_PROFILE);
383 		omni.cyo_online = profile_online;
384 		omni.cyo_offline = profile_offline;
385 		omni.cyo_arg = prof;
386 	}
387 
388 	if (prof->prof_kind == PROF_TICK) {
389 		prof->prof_cyclic = cyclic_add(&hdlr, &when);
390 	} else {
391 		prof->prof_cyclic = cyclic_add_omni(&omni);
392 	}
393 	return (0);
394 }
395 
396 /*ARGSUSED*/
397 static void
398 profile_disable(void *arg, dtrace_id_t id, void *parg)
399 {
400 	profile_probe_t *prof = parg;
401 
402 	ASSERT(prof->prof_cyclic != CYCLIC_NONE);
403 	ASSERT(MUTEX_HELD(&cpu_lock));
404 
405 	cyclic_remove(prof->prof_cyclic);
406 	prof->prof_cyclic = CYCLIC_NONE;
407 }
408 
409 /*ARGSUSED*/
410 static int
411 profile_usermode(void *arg, dtrace_id_t id, void *parg)
412 {
413 	return (CPU->cpu_profile_pc == 0);
414 }
415 
416 static dtrace_pattr_t profile_attr = {
417 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
418 { DTRACE_STABILITY_UNSTABLE, DTRACE_STABILITY_UNSTABLE, DTRACE_CLASS_UNKNOWN },
419 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
420 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
421 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
422 };
423 
424 static dtrace_pops_t profile_pops = {
425 	profile_provide,
426 	NULL,
427 	profile_enable,
428 	profile_disable,
429 	NULL,
430 	NULL,
431 	NULL,
432 	NULL,
433 	profile_usermode,
434 	profile_destroy
435 };
436 
437 static int
438 profile_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
439 {
440 	switch (cmd) {
441 	case DDI_ATTACH:
442 		break;
443 	case DDI_RESUME:
444 		return (DDI_SUCCESS);
445 	default:
446 		return (DDI_FAILURE);
447 	}
448 
449 	if (ddi_create_minor_node(devi, "profile", S_IFCHR, 0,
450 	    DDI_PSEUDO, NULL) == DDI_FAILURE ||
451 	    dtrace_register("profile", &profile_attr,
452 	    DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER, NULL,
453 	    &profile_pops, NULL, &profile_id) != 0) {
454 		ddi_remove_minor_node(devi, NULL);
455 		return (DDI_FAILURE);
456 	}
457 
458 	profile_max = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
459 	    "profile-max-probes", PROFILE_MAX_DEFAULT);
460 
461 	ddi_report_dev(devi);
462 	profile_devi = devi;
463 	return (DDI_SUCCESS);
464 }
465 
466 static int
467 profile_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
468 {
469 	switch (cmd) {
470 	case DDI_DETACH:
471 		break;
472 	case DDI_SUSPEND:
473 		return (DDI_SUCCESS);
474 	default:
475 		return (DDI_FAILURE);
476 	}
477 
478 	if (dtrace_unregister(profile_id) != 0)
479 		return (DDI_FAILURE);
480 
481 	ddi_remove_minor_node(devi, NULL);
482 	return (DDI_SUCCESS);
483 }
484 
485 /*ARGSUSED*/
486 static int
487 profile_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
488 {
489 	int error;
490 
491 	switch (infocmd) {
492 	case DDI_INFO_DEVT2DEVINFO:
493 		*result = (void *)profile_devi;
494 		error = DDI_SUCCESS;
495 		break;
496 	case DDI_INFO_DEVT2INSTANCE:
497 		*result = (void *)0;
498 		error = DDI_SUCCESS;
499 		break;
500 	default:
501 		error = DDI_FAILURE;
502 	}
503 	return (error);
504 }
505 
506 /*ARGSUSED*/
507 static int
508 profile_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
509 {
510 	return (0);
511 }
512 
513 static struct cb_ops profile_cb_ops = {
514 	profile_open,		/* open */
515 	nodev,			/* close */
516 	nulldev,		/* strategy */
517 	nulldev,		/* print */
518 	nodev,			/* dump */
519 	nodev,			/* read */
520 	nodev,			/* write */
521 	nodev,			/* ioctl */
522 	nodev,			/* devmap */
523 	nodev,			/* mmap */
524 	nodev,			/* segmap */
525 	nochpoll,		/* poll */
526 	ddi_prop_op,		/* cb_prop_op */
527 	0,			/* streamtab  */
528 	D_NEW | D_MP		/* Driver compatibility flag */
529 };
530 
531 static struct dev_ops profile_ops = {
532 	DEVO_REV,		/* devo_rev, */
533 	0,			/* refcnt  */
534 	profile_info,		/* get_dev_info */
535 	nulldev,		/* identify */
536 	nulldev,		/* probe */
537 	profile_attach,		/* attach */
538 	profile_detach,		/* detach */
539 	nodev,			/* reset */
540 	&profile_cb_ops,	/* driver operations */
541 	NULL,			/* bus operations */
542 	nodev,			/* dev power */
543 	ddi_quiesce_not_needed,		/* quiesce */
544 };
545 
546 /*
547  * Module linkage information for the kernel.
548  */
549 static struct modldrv modldrv = {
550 	&mod_driverops,		/* module type (this is a pseudo driver) */
551 	"Profile Interrupt Tracing",	/* name of module */
552 	&profile_ops,		/* driver ops */
553 };
554 
555 static struct modlinkage modlinkage = {
556 	MODREV_1,
557 	(void *)&modldrv,
558 	NULL
559 };
560 
561 int
562 _init(void)
563 {
564 	return (mod_install(&modlinkage));
565 }
566 
567 int
568 _info(struct modinfo *modinfop)
569 {
570 	return (mod_info(&modlinkage, modinfop));
571 }
572 
573 int
574 _fini(void)
575 {
576 	return (mod_remove(&modlinkage));
577 }
578