xref: /freebsd/sys/cddl/dev/profile/profile.c (revision 2008043f386721d58158e37e0d7e50df8095942d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  *
21  * Portions Copyright 2006-2008 John Birrell jb@freebsd.org
22  *
23  */
24 
25 /*
26  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
27  * Use is subject to license terms.
28  */
29 
30 #include <sys/cdefs.h>
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/conf.h>
34 #include <sys/cpuvar.h>
35 #include <sys/endian.h>
36 #include <sys/fcntl.h>
37 #include <sys/filio.h>
38 #include <sys/kdb.h>
39 #include <sys/kernel.h>
40 #include <sys/kmem.h>
41 #include <sys/kthread.h>
42 #include <sys/limits.h>
43 #include <sys/linker.h>
44 #include <sys/lock.h>
45 #include <sys/malloc.h>
46 #include <sys/module.h>
47 #include <sys/mutex.h>
48 #include <sys/poll.h>
49 #include <sys/proc.h>
50 #include <sys/selinfo.h>
51 #include <sys/smp.h>
52 #include <sys/sysctl.h>
53 #include <sys/uio.h>
54 #include <sys/unistd.h>
55 #include <machine/cpu.h>
56 #include <machine/stdarg.h>
57 
58 #include <sys/dtrace.h>
59 #include <sys/dtrace_bsd.h>
60 
/*
 * Size of the probe-name buffers used in this file (prof_name in
 * profile_probe_t and the scratch buffer in profile_provide()).
 */
#define	PROF_NAMELEN		15

/*
 * Probe kinds stored in prof_kind, and the probe-name prefix that selects
 * each kind when a name is parsed in profile_provide().
 */
#define	PROF_PROFILE		0
#define	PROF_TICK		1
#define	PROF_PREFIX_PROFILE	"profile-"
#define	PROF_PREFIX_TICK	"tick-"
67 
68 /*
69  * Regardless of platform, there are five artificial frames in the case of the
70  * profile provider:
71  *
72  *	profile_fire
73  *	cyclic_expire
74  *	cyclic_fire
75  *	[ cbe ]
76  *	[ locore ]
77  *
78  * On amd64, there are two frames associated with locore:  one in locore, and
79  * another in common interrupt dispatch code.  (i386 has not been modified to
80  * use this common layer.)  Further, on i386, the interrupted instruction
81  * appears as its own stack frame.  All of this means that we need to add one
82  * frame for amd64, and then take one away for both amd64 and i386.
83  *
 * All of the above constraints lead to the mess below.  Yes, the profile
 * provider should ideally figure this out on-the-fly by hitting one of its own
 * probes and then walking its own stack trace.  This is complicated, however,
 * and the static definition doesn't seem to be overly brittle.  Still, we
 * allow for a manual override in case we get it completely wrong.
89  */
/*
 * Per-architecture default for the number of artificial frames.  This is
 * the initial value of profile_aframes.
 */
#if defined(__amd64)
#define	PROF_ARTIFICIAL_FRAMES	10
#elif defined(__i386)
#define	PROF_ARTIFICIAL_FRAMES	6
#elif defined(__powerpc__)
/*
 * This value is bogus just to make module compilable on powerpc
 */
#define	PROF_ARTIFICIAL_FRAMES	8
#elif defined(__arm__)
#define	PROF_ARTIFICIAL_FRAMES	3
#elif defined(__aarch64__)
#define	PROF_ARTIFICIAL_FRAMES	12
#elif defined(__riscv)
#define	PROF_ARTIFICIAL_FRAMES	12
#endif

/* Forward declaration so profile_probe_t can point at its per-CPU state. */
struct profile_probe_percpu;
118 
/*
 * Per-probe state.  profile_create() allocates one of these for every
 * profile-/tick- probe and passes it to dtrace_probe_create() as the probe's
 * private argument; profile_destroy() frees it.
 */
typedef struct profile_probe {
	char		prof_name[PROF_NAMELEN];	/* copy of the probe name */
	dtrace_id_t	prof_id;	/* id from dtrace_probe_create() */
	int		prof_kind;	/* PROF_PROFILE or PROF_TICK */
#ifdef illumos
	hrtime_t	prof_interval;	/* firing interval, nanoseconds */
	cyclic_id_t	prof_cyclic;	/* cyclic id, CYCLIC_NONE if disabled */
#else
	sbintime_t	prof_interval;	/* firing interval, via nsec_to_sbt() */
	struct callout	prof_cyclic;	/* callout used by tick- probes */
	sbintime_t	prof_expected;	/* absolute time of the next tick */
	/* Per-CPU state table for profile- probes; NULL while disabled. */
	struct profile_probe_percpu **prof_pcpus;
#endif
} profile_probe_t;
133 
/*
 * Per-CPU state of an enabled profile- probe; this is the argument handed to
 * profile_fire().  On FreeBSD the two time fields hold sbintime values (see
 * profile_enable_omni()) even though they are declared hrtime_t.
 */
typedef struct profile_probe_percpu {
	hrtime_t	profc_expected;	/* absolute time the next firing is due */
	hrtime_t	profc_interval;	/* copy of the probe's firing interval */
	profile_probe_t	*profc_probe;	/* probe this state belongs to */
#ifdef __FreeBSD__
	struct callout	profc_cyclic;	/* callout armed on this CPU */
#endif
} profile_probe_percpu_t;
142 
/*
 * Forward declarations: provider callbacks referenced by profile_pops and the
 * load/unload hooks referenced by SYSINIT/SYSUNINIT at the end of the file.
 */
static int	profile_unload(void);
static void	profile_create(hrtime_t, char *, int);
static void	profile_destroy(void *, dtrace_id_t, void *);
static void	profile_enable(void *, dtrace_id_t, void *);
static void	profile_disable(void *, dtrace_id_t, void *);
static void	profile_load(void *);
static void	profile_provide(void *, dtrace_probedesc_t *);
150 
/*
 * Rates, in firings per second, of the default profile-<rate> probes that
 * profile_provide() creates when called with a NULL description.  Zero
 * entries are skipped.
 */
static int profile_rates[] = {
    97, 199, 499, 997, 1999,
    4001, 4999, 0, 0, 0,
    0, 0, 0, 0, 0,
    0, 0, 0, 0, 0
};
157 
/*
 * Rates, in firings per second, of the default tick-<rate> probes that
 * profile_provide() creates when called with a NULL description.  Zero
 * entries are skipped.
 */
static int profile_ticks[] = {
    1, 10, 100, 500, 1000,
    5000, 0, 0, 0, 0,
    0, 0, 0, 0, 0
};
163 
/*
 * profile_max defines the upper bound on the number of profile probes that
 * can exist (this is to prevent malicious or clumsy users from exhausting
 * system resources by creating a slew of profile probes). At mod load time,
 * this gets its value from PROFILE_MAX_DEFAULT or profile-max-probes if it's
 * present in the profile.conf file.
 *
 * NOTE(review): no code visible in this file reads profile.conf; here
 * profile_max only ever holds its static initializer.  Confirm whether the
 * profile.conf wording still applies on this platform.
 */
#define	PROFILE_MAX_DEFAULT	1000	/* default max. number of probes */
static uint32_t profile_max = PROFILE_MAX_DEFAULT;
					/* maximum number of profile probes */
static uint32_t profile_total;		/* current number of profile probes */
175 
/*
 * Stability attributes handed to dtrace_register() in profile_load().
 * NOTE(review): the five rows presumably describe, in order, the provider,
 * module, function, name and argument attributes of dtrace_pattr_t --
 * confirm against the structure definition in <sys/dtrace.h>.
 */
static dtrace_pattr_t profile_attr = {
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
};
183 
/*
 * Provider operations vector handed to dtrace_register() in profile_load().
 * Only the provide, enable, disable and destroy callbacks are implemented;
 * every other slot is left NULL.
 */
static dtrace_pops_t profile_pops = {
	.dtps_provide =		profile_provide,
	.dtps_provide_module =	NULL,
	.dtps_enable =		profile_enable,
	.dtps_disable =		profile_disable,
	.dtps_suspend =		NULL,
	.dtps_resume =		NULL,
	.dtps_getargdesc =	NULL,
	.dtps_getargval =	NULL,
	.dtps_usermode =	NULL,
	.dtps_destroy =		profile_destroy
};
196 
/* Provider id filled in by dtrace_register() in profile_load(). */
static dtrace_provider_id_t	profile_id;
/* Shortest interval (in nanoseconds) profile_create() will accept. */
static hrtime_t			profile_interval_min = NANOSEC / 5000;	/* 5000 hz */
/* Artificial-frame count passed to dtrace_probe_create(). */
static int			profile_aframes = PROF_ARTIFICIAL_FRAMES;

/*
 * Expose profile_aframes as the read/write integer sysctl "aframes" under a
 * "profile" node below _kern_dtrace, providing the manual override for the
 * per-architecture default.
 */
SYSCTL_DECL(_kern_dtrace);
SYSCTL_NODE(_kern_dtrace, OID_AUTO, profile, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "DTrace profile parameters");
SYSCTL_INT(_kern_dtrace_profile, OID_AUTO, aframes, CTLFLAG_RW, &profile_aframes,
    0, "Skipped frames for profile provider");
206 
207 static sbintime_t
208 nsec_to_sbt(hrtime_t nsec)
209 {
210 	time_t sec;
211 
212 	/*
213 	 * We need to calculate nsec * 2^32 / 10^9
214 	 * Seconds and nanoseconds are split to avoid overflow.
215 	 */
216 	sec = nsec / NANOSEC;
217 	nsec = nsec % NANOSEC;
218 	return (((sbintime_t)sec << 32) | ((sbintime_t)nsec << 32) / NANOSEC);
219 }
220 
221 static hrtime_t
222 sbt_to_nsec(sbintime_t sbt)
223 {
224 
225 	return ((sbt >> 32) * NANOSEC +
226 	    (((uint32_t)sbt * (hrtime_t)NANOSEC) >> 32));
227 }
228 
229 static void
230 profile_probe(profile_probe_t *prof, hrtime_t late)
231 {
232 	struct thread *td;
233 	struct trapframe *frame;
234 	uintfptr_t pc, upc;
235 
236 	td = curthread;
237 	pc = upc = 0;
238 
239 	/*
240 	 * td_intr_frame can be unset if this is a catch-up event upon waking up
241 	 * from idle sleep. This can only happen on a CPU idle thread. Use a
242 	 * representative arg0 value in this case so that one of the probe
243 	 * arguments is non-zero.
244 	 */
245 	frame = td->td_intr_frame;
246 	if (frame != NULL) {
247 		if (TRAPF_USERMODE(frame))
248 			upc = TRAPF_PC(frame);
249 		else
250 			pc = TRAPF_PC(frame);
251 	} else if (TD_IS_IDLETHREAD(td))
252 		pc = (uintfptr_t)&cpu_idle;
253 
254 	dtrace_probe(prof->prof_id, pc, upc, late, 0, 0);
255 }
256 
257 static void
258 profile_fire(void *arg)
259 {
260 	profile_probe_percpu_t *pcpu = arg;
261 	profile_probe_t *prof = pcpu->profc_probe;
262 	hrtime_t late;
263 
264 	late = sbt_to_nsec(sbinuptime() - pcpu->profc_expected);
265 
266 	profile_probe(prof, late);
267 	pcpu->profc_expected += pcpu->profc_interval;
268 	callout_schedule_sbt_curcpu(&pcpu->profc_cyclic,
269 	    pcpu->profc_expected, 0, C_DIRECT_EXEC | C_ABSOLUTE);
270 }
271 
272 static void
273 profile_tick(void *arg)
274 {
275 	profile_probe_t *prof = arg;
276 
277 	profile_probe(prof, 0);
278 	prof->prof_expected += prof->prof_interval;
279 	callout_schedule_sbt(&prof->prof_cyclic,
280 	    prof->prof_expected, 0, C_DIRECT_EXEC | C_ABSOLUTE);
281 }
282 
283 static void
284 profile_create(hrtime_t interval, char *name, int kind)
285 {
286 	profile_probe_t *prof;
287 
288 	if (interval < profile_interval_min)
289 		return;
290 
291 	if (dtrace_probe_lookup(profile_id, NULL, NULL, name) != 0)
292 		return;
293 
294 	atomic_add_32(&profile_total, 1);
295 	if (profile_total > profile_max) {
296 		atomic_add_32(&profile_total, -1);
297 		return;
298 	}
299 
300 	prof = kmem_zalloc(sizeof (profile_probe_t), KM_SLEEP);
301 	(void) strcpy(prof->prof_name, name);
302 #ifdef illumos
303 	prof->prof_interval = interval;
304 	prof->prof_cyclic = CYCLIC_NONE;
305 #else
306 	prof->prof_interval = nsec_to_sbt(interval);
307 	callout_init(&prof->prof_cyclic, 1);
308 #endif
309 	prof->prof_kind = kind;
310 	prof->prof_id = dtrace_probe_create(profile_id,
311 	    NULL, NULL, name,
312 	    profile_aframes, prof);
313 }
314 
315 /*ARGSUSED*/
316 static void
317 profile_provide(void *arg, dtrace_probedesc_t *desc)
318 {
319 	int i, j, rate, kind;
320 	hrtime_t val = 0, mult = 1, len = 0;
321 	char *name, *suffix = NULL;
322 
323 	const struct {
324 		char *prefix;
325 		int kind;
326 	} types[] = {
327 		{ PROF_PREFIX_PROFILE, PROF_PROFILE },
328 		{ PROF_PREFIX_TICK, PROF_TICK },
329 		{ 0, 0 }
330 	};
331 
332 	const struct {
333 		char *name;
334 		hrtime_t mult;
335 	} suffixes[] = {
336 		{ "ns", 	NANOSEC / NANOSEC },
337 		{ "nsec",	NANOSEC / NANOSEC },
338 		{ "us",		NANOSEC / MICROSEC },
339 		{ "usec",	NANOSEC / MICROSEC },
340 		{ "ms",		NANOSEC / MILLISEC },
341 		{ "msec",	NANOSEC / MILLISEC },
342 		{ "s",		NANOSEC / SEC },
343 		{ "sec",	NANOSEC / SEC },
344 		{ "m",		NANOSEC * (hrtime_t)60 },
345 		{ "min",	NANOSEC * (hrtime_t)60 },
346 		{ "h",		NANOSEC * (hrtime_t)(60 * 60) },
347 		{ "hour",	NANOSEC * (hrtime_t)(60 * 60) },
348 		{ "d",		NANOSEC * (hrtime_t)(24 * 60 * 60) },
349 		{ "day",	NANOSEC * (hrtime_t)(24 * 60 * 60) },
350 		{ "hz",		0 },
351 		{ NULL }
352 	};
353 
354 	if (desc == NULL) {
355 		char n[PROF_NAMELEN];
356 
357 		/*
358 		 * If no description was provided, provide all of our probes.
359 		 */
360 		for (i = 0; i < sizeof (profile_rates) / sizeof (int); i++) {
361 			if ((rate = profile_rates[i]) == 0)
362 				continue;
363 
364 			(void) snprintf(n, PROF_NAMELEN, "%s%d",
365 			    PROF_PREFIX_PROFILE, rate);
366 			profile_create(NANOSEC / rate, n, PROF_PROFILE);
367 		}
368 
369 		for (i = 0; i < sizeof (profile_ticks) / sizeof (int); i++) {
370 			if ((rate = profile_ticks[i]) == 0)
371 				continue;
372 
373 			(void) snprintf(n, PROF_NAMELEN, "%s%d",
374 			    PROF_PREFIX_TICK, rate);
375 			profile_create(NANOSEC / rate, n, PROF_TICK);
376 		}
377 
378 		return;
379 	}
380 
381 	name = desc->dtpd_name;
382 
383 	for (i = 0; types[i].prefix != NULL; i++) {
384 		len = strlen(types[i].prefix);
385 
386 		if (strncmp(name, types[i].prefix, len) != 0)
387 			continue;
388 		break;
389 	}
390 
391 	if (types[i].prefix == NULL)
392 		return;
393 
394 	kind = types[i].kind;
395 	j = strlen(name) - len;
396 
397 	/*
398 	 * We need to start before any time suffix.
399 	 */
400 	for (j = strlen(name); j >= len; j--) {
401 		if (name[j] >= '0' && name[j] <= '9')
402 			break;
403 		suffix = &name[j];
404 	}
405 
406 	ASSERT(suffix != NULL);
407 
408 	/*
409 	 * Now determine the numerical value present in the probe name.
410 	 */
411 	for (; j >= len; j--) {
412 		if (name[j] < '0' || name[j] > '9')
413 			return;
414 
415 		val += (name[j] - '0') * mult;
416 		mult *= (hrtime_t)10;
417 	}
418 
419 	if (val == 0)
420 		return;
421 
422 	/*
423 	 * Look-up the suffix to determine the multiplier.
424 	 */
425 	for (i = 0, mult = 0; suffixes[i].name != NULL; i++) {
426 		if (strcasecmp(suffixes[i].name, suffix) == 0) {
427 			mult = suffixes[i].mult;
428 			break;
429 		}
430 	}
431 
432 	if (suffixes[i].name == NULL && *suffix != '\0')
433 		return;
434 
435 	if (mult == 0) {
436 		/*
437 		 * The default is frequency-per-second.
438 		 */
439 		val = NANOSEC / val;
440 	} else {
441 		val *= mult;
442 	}
443 
444 	profile_create(val, name, kind);
445 }
446 
447 /* ARGSUSED */
448 static void
449 profile_destroy(void *arg, dtrace_id_t id, void *parg)
450 {
451 	profile_probe_t *prof = parg;
452 
453 #ifdef illumos
454 	ASSERT(prof->prof_cyclic == CYCLIC_NONE);
455 #else
456 	ASSERT(!callout_active(&prof->prof_cyclic) && prof->prof_pcpus == NULL);
457 #endif
458 	kmem_free(prof, sizeof (profile_probe_t));
459 
460 	ASSERT(profile_total >= 1);
461 	atomic_add_32(&profile_total, -1);
462 }
463 
464 #ifdef illumos
465 /*ARGSUSED*/
466 static void
467 profile_online(void *arg, cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when)
468 {
469 	profile_probe_t *prof = arg;
470 	profile_probe_percpu_t *pcpu;
471 
472 	pcpu = kmem_zalloc(sizeof (profile_probe_percpu_t), KM_SLEEP);
473 	pcpu->profc_probe = prof;
474 
475 	hdlr->cyh_func = profile_fire;
476 	hdlr->cyh_arg = pcpu;
477 
478 	when->cyt_interval = prof->prof_interval;
479 	when->cyt_when = gethrtime() + when->cyt_interval;
480 
481 	pcpu->profc_expected = when->cyt_when;
482 	pcpu->profc_interval = when->cyt_interval;
483 }
484 
485 /*ARGSUSED*/
486 static void
487 profile_offline(void *arg, cpu_t *cpu, void *oarg)
488 {
489 	profile_probe_percpu_t *pcpu = oarg;
490 
491 	ASSERT(pcpu->profc_probe == arg);
492 	kmem_free(pcpu, sizeof (profile_probe_percpu_t));
493 }
494 
495 /* ARGSUSED */
496 static void
497 profile_enable(void *arg, dtrace_id_t id, void *parg)
498 {
499 	profile_probe_t *prof = parg;
500 	cyc_omni_handler_t omni;
501 	cyc_handler_t hdlr;
502 	cyc_time_t when;
503 
504 	ASSERT(prof->prof_interval != 0);
505 	ASSERT(MUTEX_HELD(&cpu_lock));
506 
507 	if (prof->prof_kind == PROF_TICK) {
508 		hdlr.cyh_func = profile_tick;
509 		hdlr.cyh_arg = prof;
510 
511 		when.cyt_interval = prof->prof_interval;
512 		when.cyt_when = gethrtime() + when.cyt_interval;
513 	} else {
514 		ASSERT(prof->prof_kind == PROF_PROFILE);
515 		omni.cyo_online = profile_online;
516 		omni.cyo_offline = profile_offline;
517 		omni.cyo_arg = prof;
518 	}
519 
520 	if (prof->prof_kind == PROF_TICK) {
521 		prof->prof_cyclic = cyclic_add(&hdlr, &when);
522 	} else {
523 		prof->prof_cyclic = cyclic_add_omni(&omni);
524 	}
525 }
526 
527 /* ARGSUSED */
528 static void
529 profile_disable(void *arg, dtrace_id_t id, void *parg)
530 {
531 	profile_probe_t *prof = parg;
532 
533 	ASSERT(prof->prof_cyclic != CYCLIC_NONE);
534 	ASSERT(MUTEX_HELD(&cpu_lock));
535 
536 	cyclic_remove(prof->prof_cyclic);
537 	prof->prof_cyclic = CYCLIC_NONE;
538 }
539 
540 #else
541 
542 static void
543 profile_enable_omni(profile_probe_t *prof)
544 {
545 	profile_probe_percpu_t *pcpu;
546 	int cpu;
547 
548 	prof->prof_pcpus = kmem_zalloc((mp_maxid + 1) * sizeof(pcpu), KM_SLEEP);
549 	CPU_FOREACH(cpu) {
550 		pcpu = kmem_zalloc(sizeof(profile_probe_percpu_t), KM_SLEEP);
551 		prof->prof_pcpus[cpu] = pcpu;
552 		pcpu->profc_probe = prof;
553 		pcpu->profc_expected = sbinuptime() + prof->prof_interval;
554 		pcpu->profc_interval = prof->prof_interval;
555 		callout_init(&pcpu->profc_cyclic, 1);
556 		callout_reset_sbt_on(&pcpu->profc_cyclic,
557 		    pcpu->profc_expected, 0, profile_fire, pcpu,
558 		    cpu, C_DIRECT_EXEC | C_ABSOLUTE);
559 	}
560 }
561 
562 static void
563 profile_disable_omni(profile_probe_t *prof)
564 {
565 	profile_probe_percpu_t *pcpu;
566 	int cpu;
567 
568 	ASSERT(prof->prof_pcpus != NULL);
569 	CPU_FOREACH(cpu) {
570 		pcpu = prof->prof_pcpus[cpu];
571 		ASSERT(pcpu->profc_probe == prof);
572 		ASSERT(callout_active(&pcpu->profc_cyclic));
573 		callout_stop(&pcpu->profc_cyclic);
574 		callout_drain(&pcpu->profc_cyclic);
575 		kmem_free(pcpu, sizeof(profile_probe_percpu_t));
576 	}
577 	kmem_free(prof->prof_pcpus, (mp_maxid + 1) * sizeof(pcpu));
578 	prof->prof_pcpus = NULL;
579 }
580 
581 /* ARGSUSED */
582 static void
583 profile_enable(void *arg, dtrace_id_t id, void *parg)
584 {
585 	profile_probe_t *prof = parg;
586 
587 	if (prof->prof_kind == PROF_TICK) {
588 		prof->prof_expected = sbinuptime() + prof->prof_interval;
589 		callout_reset_sbt(&prof->prof_cyclic,
590 		    prof->prof_expected, 0, profile_tick, prof,
591 		    C_DIRECT_EXEC | C_ABSOLUTE);
592 	} else {
593 		ASSERT(prof->prof_kind == PROF_PROFILE);
594 		profile_enable_omni(prof);
595 	}
596 }
597 
598 /* ARGSUSED */
599 static void
600 profile_disable(void *arg, dtrace_id_t id, void *parg)
601 {
602 	profile_probe_t *prof = parg;
603 
604 	if (prof->prof_kind == PROF_TICK) {
605 		ASSERT(callout_active(&prof->prof_cyclic));
606 		callout_stop(&prof->prof_cyclic);
607 		callout_drain(&prof->prof_cyclic);
608 	} else {
609 		ASSERT(prof->prof_kind == PROF_PROFILE);
610 		profile_disable_omni(prof);
611 	}
612 }
613 #endif
614 
615 static void
616 profile_load(void *dummy)
617 {
618 	if (dtrace_register("profile", &profile_attr, DTRACE_PRIV_USER,
619 	    NULL, &profile_pops, NULL, &profile_id) != 0)
620 		return;
621 }
622 
623 
624 static int
625 profile_unload(void)
626 {
627 	int error = 0;
628 
629 	if ((error = dtrace_unregister(profile_id)) != 0)
630 		return (error);
631 
632 	return (error);
633 }
634 
635 /* ARGSUSED */
636 static int
637 profile_modevent(module_t mod __unused, int type, void *data __unused)
638 {
639 	int error = 0;
640 
641 	switch (type) {
642 	case MOD_LOAD:
643 		break;
644 
645 	case MOD_UNLOAD:
646 		break;
647 
648 	case MOD_SHUTDOWN:
649 		break;
650 
651 	default:
652 		error = EOPNOTSUPP;
653 		break;
654 
655 	}
656 	return (error);
657 }
658 
/*
 * Run profile_load()/profile_unload() at the SI_SUB_DTRACE_PROVIDER stage.
 * NOTE(review): profile_unload() is declared as int (*)(void); confirm this
 * matches the function type SYSUNINIT expects.
 */
SYSINIT(profile_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, profile_load, NULL);
SYSUNINIT(profile_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, profile_unload, NULL);

/* Declare the module, its version, and the modules it depends on. */
DEV_MODULE(profile, profile_modevent, NULL);
MODULE_VERSION(profile, 1);
MODULE_DEPEND(profile, dtrace, 1, 1, 1);
MODULE_DEPEND(profile, opensolaris, 1, 1, 1);
666