1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 *
21 * Portions Copyright 2006-2008 John Birrell jb@freebsd.org
22 *
23 */
24
25 /*
26 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
27 * Use is subject to license terms.
28 */
29
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/conf.h>
33 #include <sys/cpuvar.h>
34 #include <sys/endian.h>
35 #include <sys/fcntl.h>
36 #include <sys/filio.h>
37 #include <sys/kdb.h>
38 #include <sys/kernel.h>
39 #include <sys/kmem.h>
40 #include <sys/kthread.h>
41 #include <sys/limits.h>
42 #include <sys/linker.h>
43 #include <sys/lock.h>
44 #include <sys/malloc.h>
45 #include <sys/module.h>
46 #include <sys/mutex.h>
47 #include <sys/poll.h>
48 #include <sys/proc.h>
49 #include <sys/selinfo.h>
50 #include <sys/smp.h>
51 #include <sys/sysctl.h>
52 #include <sys/uio.h>
53 #include <sys/unistd.h>
54 #include <machine/cpu.h>
55 #include <machine/stdarg.h>
56
57 #include <sys/dtrace.h>
58 #include <sys/dtrace_bsd.h>
59
60 #include <cddl/dev/dtrace/dtrace_cddl.h>
61
/* Size of the buffers that hold a probe name, terminator included. */
#define	PROF_NAMELEN		15

/* Probe kinds, and the probe-name prefix that selects each one. */
#define	PROF_PROFILE		0
#define	PROF_PREFIX_PROFILE	"profile-"

#define	PROF_TICK		1
#define	PROF_PREFIX_TICK	"tick-"
68
69 /*
70 * Regardless of platform, there are five artificial frames in the case of the
71 * profile provider:
72 *
73 * profile_fire
74 * cyclic_expire
75 * cyclic_fire
76 * [ cbe ]
77 * [ locore ]
78 *
79 * On amd64, there are two frames associated with locore: one in locore, and
80 * another in common interrupt dispatch code. (i386 has not been modified to
81 * use this common layer.) Further, on i386, the interrupted instruction
82 * appears as its own stack frame. All of this means that we need to add one
83 * frame for amd64, and then take one away for both amd64 and i386.
84 *
 * All of the above constraints lead to the mess below.  Yes, the profile
 * provider should ideally figure this out on-the-fly by hitting one of its own
 * probes and then walking its own stack trace.  This is complicated, however,
 * and the static definition doesn't seem to be overly brittle.  Still, we
 * allow for a manual override in case we get it completely wrong.
90 */
/*
 * Per-architecture count of artificial frames; it seeds profile_aframes.
 */
#if defined(__amd64)
#define	PROF_ARTIFICIAL_FRAMES	10
#elif defined(__i386)
#define	PROF_ARTIFICIAL_FRAMES	6
#endif

#if defined(__powerpc__)
/* Bogus value; it exists only so the module compiles on powerpc. */
#define	PROF_ARTIFICIAL_FRAMES	8
#endif

#if defined(__arm__)
#define	PROF_ARTIFICIAL_FRAMES	3
#endif

#if defined(__aarch64__)
#define	PROF_ARTIFICIAL_FRAMES	12
#endif

#if defined(__riscv)
#define	PROF_ARTIFICIAL_FRAMES	12
#endif

/* Forward declaration: profile_probe_t refers to the per-CPU state type. */
struct profile_probe_percpu;
119
120 typedef struct profile_probe {
121 char prof_name[PROF_NAMELEN];
122 dtrace_id_t prof_id;
123 int prof_kind;
124 #ifdef illumos
125 hrtime_t prof_interval;
126 cyclic_id_t prof_cyclic;
127 #else
128 sbintime_t prof_interval;
129 struct callout prof_cyclic;
130 sbintime_t prof_expected;
131 struct profile_probe_percpu **prof_pcpus;
132 #endif
133 } profile_probe_t;
134
135 typedef struct profile_probe_percpu {
136 hrtime_t profc_expected;
137 hrtime_t profc_interval;
138 profile_probe_t *profc_probe;
139 #ifdef __FreeBSD__
140 struct callout profc_cyclic;
141 #endif
142 } profile_probe_percpu_t;
143
144 static int profile_unload(void);
145 static void profile_create(hrtime_t, char *, int);
146 static void profile_destroy(void *, dtrace_id_t, void *);
147 static void profile_enable(void *, dtrace_id_t, void *);
148 static void profile_disable(void *, dtrace_id_t, void *);
149 static void profile_load(void *);
150 static void profile_provide(void *, dtrace_probedesc_t *);
151
/*
 * Rates (in hz) of the profile-N probes that are provided by default.
 * A zero entry is skipped.
 */
static int profile_rates[] = {
	97, 199, 499, 997,
	1999, 4001, 4999, 0,
	0, 0, 0, 0,
	0, 0, 0, 0,
	0, 0, 0, 0
};
158
/*
 * Rates (in hz) of the tick-N probes that are provided by default.
 * A zero entry is skipped.
 */
static int profile_ticks[] = {
	1, 10, 100,
	500, 1000, 5000,
	0, 0, 0,
	0, 0, 0,
	0, 0, 0
};
164
/*
 * profile_max defines the upper bound on the number of profile probes that
 * can exist (this is to prevent malicious or clumsy users from exhausting
 * system resources by creating a slew of profile probes). At mod load time,
 * this gets its value from PROFILE_MAX_DEFAULT or profile-max-probes if it's
 * present in the profile.conf file.
 */
/* Default value of profile_max. */
#define	PROFILE_MAX_DEFAULT	1000

/* Maximum number of profile probes that may exist. */
static uint32_t profile_max = PROFILE_MAX_DEFAULT;

/* Number of profile probes that currently exist. */
static uint32_t profile_total;
176
177 static dtrace_pattr_t profile_attr = {
178 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
179 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
180 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
181 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
182 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
183 };
184
185 static dtrace_pops_t profile_pops = {
186 .dtps_provide = profile_provide,
187 .dtps_provide_module = NULL,
188 .dtps_enable = profile_enable,
189 .dtps_disable = profile_disable,
190 .dtps_suspend = NULL,
191 .dtps_resume = NULL,
192 .dtps_getargdesc = NULL,
193 .dtps_getargval = NULL,
194 .dtps_usermode = NULL,
195 .dtps_destroy = profile_destroy
196 };
197
198 static dtrace_provider_id_t profile_id;
199 static hrtime_t profile_interval_min = NANOSEC / 5000; /* 5000 hz */
200 static int profile_aframes = PROF_ARTIFICIAL_FRAMES;
201
202 SYSCTL_DECL(_kern_dtrace);
203 SYSCTL_NODE(_kern_dtrace, OID_AUTO, profile, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
204 "DTrace profile parameters");
205 SYSCTL_INT(_kern_dtrace_profile, OID_AUTO, aframes, CTLFLAG_RW, &profile_aframes,
206 0, "Skipped frames for profile provider");
207
208 static sbintime_t
nsec_to_sbt(hrtime_t nsec)209 nsec_to_sbt(hrtime_t nsec)
210 {
211 time_t sec;
212
213 /*
214 * We need to calculate nsec * 2^32 / 10^9
215 * Seconds and nanoseconds are split to avoid overflow.
216 */
217 sec = nsec / NANOSEC;
218 nsec = nsec % NANOSEC;
219 return (((sbintime_t)sec << 32) | ((sbintime_t)nsec << 32) / NANOSEC);
220 }
221
222 static hrtime_t
sbt_to_nsec(sbintime_t sbt)223 sbt_to_nsec(sbintime_t sbt)
224 {
225
226 return ((sbt >> 32) * NANOSEC +
227 (((uint32_t)sbt * (hrtime_t)NANOSEC) >> 32));
228 }
229
230 static void
profile_probe(profile_probe_t * prof,hrtime_t late)231 profile_probe(profile_probe_t *prof, hrtime_t late)
232 {
233 struct thread *td;
234 struct trapframe *frame;
235 uintfptr_t pc, upc;
236
237 td = curthread;
238 pc = upc = 0;
239
240 /*
241 * td_intr_frame can be unset if this is a catch-up event upon waking up
242 * from idle sleep. This can only happen on a CPU idle thread. Use a
243 * representative arg0 value in this case so that one of the probe
244 * arguments is non-zero.
245 */
246 frame = td->td_intr_frame;
247 if (frame != NULL) {
248 if (TRAPF_USERMODE(frame))
249 upc = TRAPF_PC(frame);
250 else {
251 pc = TRAPF_PC(frame);
252 td->t_dtrace_trapframe = frame;
253 }
254 } else if (TD_IS_IDLETHREAD(td))
255 pc = (uintfptr_t)&cpu_idle;
256
257 dtrace_probe(prof->prof_id, pc, upc, late, 0, 0);
258 td->t_dtrace_trapframe = NULL;
259 }
260
261 static void
profile_fire(void * arg)262 profile_fire(void *arg)
263 {
264 profile_probe_percpu_t *pcpu = arg;
265 profile_probe_t *prof = pcpu->profc_probe;
266 hrtime_t late;
267
268 late = sbt_to_nsec(sbinuptime() - pcpu->profc_expected);
269
270 profile_probe(prof, late);
271 pcpu->profc_expected += pcpu->profc_interval;
272 callout_schedule_sbt_curcpu(&pcpu->profc_cyclic,
273 pcpu->profc_expected, 0, C_DIRECT_EXEC | C_ABSOLUTE);
274 }
275
276 static void
profile_tick(void * arg)277 profile_tick(void *arg)
278 {
279 profile_probe_t *prof = arg;
280
281 profile_probe(prof, 0);
282 prof->prof_expected += prof->prof_interval;
283 callout_schedule_sbt(&prof->prof_cyclic,
284 prof->prof_expected, 0, C_DIRECT_EXEC | C_ABSOLUTE);
285 }
286
287 static void
profile_create(hrtime_t interval,char * name,int kind)288 profile_create(hrtime_t interval, char *name, int kind)
289 {
290 profile_probe_t *prof;
291
292 if (interval < profile_interval_min)
293 return;
294
295 if (dtrace_probe_lookup(profile_id, NULL, NULL, name) != 0)
296 return;
297
298 atomic_add_32(&profile_total, 1);
299 if (profile_total > profile_max) {
300 atomic_add_32(&profile_total, -1);
301 return;
302 }
303
304 prof = kmem_zalloc(sizeof (profile_probe_t), KM_SLEEP);
305 (void) strcpy(prof->prof_name, name);
306 #ifdef illumos
307 prof->prof_interval = interval;
308 prof->prof_cyclic = CYCLIC_NONE;
309 #else
310 prof->prof_interval = nsec_to_sbt(interval);
311 callout_init(&prof->prof_cyclic, 1);
312 #endif
313 prof->prof_kind = kind;
314 prof->prof_id = dtrace_probe_create(profile_id,
315 NULL, NULL, name,
316 profile_aframes, prof);
317 }
318
319 /*ARGSUSED*/
320 static void
profile_provide(void * arg,dtrace_probedesc_t * desc)321 profile_provide(void *arg, dtrace_probedesc_t *desc)
322 {
323 int i, j, rate, kind;
324 hrtime_t val = 0, mult = 1, len = 0;
325 char *name, *suffix = NULL;
326
327 const struct {
328 char *prefix;
329 int kind;
330 } types[] = {
331 { PROF_PREFIX_PROFILE, PROF_PROFILE },
332 { PROF_PREFIX_TICK, PROF_TICK },
333 { 0, 0 }
334 };
335
336 const struct {
337 char *name;
338 hrtime_t mult;
339 } suffixes[] = {
340 { "ns", NANOSEC / NANOSEC },
341 { "nsec", NANOSEC / NANOSEC },
342 { "us", NANOSEC / MICROSEC },
343 { "usec", NANOSEC / MICROSEC },
344 { "ms", NANOSEC / MILLISEC },
345 { "msec", NANOSEC / MILLISEC },
346 { "s", NANOSEC / SEC },
347 { "sec", NANOSEC / SEC },
348 { "m", NANOSEC * (hrtime_t)60 },
349 { "min", NANOSEC * (hrtime_t)60 },
350 { "h", NANOSEC * (hrtime_t)(60 * 60) },
351 { "hour", NANOSEC * (hrtime_t)(60 * 60) },
352 { "d", NANOSEC * (hrtime_t)(24 * 60 * 60) },
353 { "day", NANOSEC * (hrtime_t)(24 * 60 * 60) },
354 { "hz", 0 },
355 { NULL }
356 };
357
358 if (desc == NULL) {
359 char n[PROF_NAMELEN];
360
361 /*
362 * If no description was provided, provide all of our probes.
363 */
364 for (i = 0; i < sizeof (profile_rates) / sizeof (int); i++) {
365 if ((rate = profile_rates[i]) == 0)
366 continue;
367
368 (void) snprintf(n, PROF_NAMELEN, "%s%d",
369 PROF_PREFIX_PROFILE, rate);
370 profile_create(NANOSEC / rate, n, PROF_PROFILE);
371 }
372
373 for (i = 0; i < sizeof (profile_ticks) / sizeof (int); i++) {
374 if ((rate = profile_ticks[i]) == 0)
375 continue;
376
377 (void) snprintf(n, PROF_NAMELEN, "%s%d",
378 PROF_PREFIX_TICK, rate);
379 profile_create(NANOSEC / rate, n, PROF_TICK);
380 }
381
382 return;
383 }
384
385 name = desc->dtpd_name;
386
387 for (i = 0; types[i].prefix != NULL; i++) {
388 len = strlen(types[i].prefix);
389
390 if (strncmp(name, types[i].prefix, len) != 0)
391 continue;
392 break;
393 }
394
395 if (types[i].prefix == NULL)
396 return;
397
398 kind = types[i].kind;
399 j = strlen(name) - len;
400
401 /*
402 * We need to start before any time suffix.
403 */
404 for (j = strlen(name); j >= len; j--) {
405 if (name[j] >= '0' && name[j] <= '9')
406 break;
407 suffix = &name[j];
408 }
409
410 ASSERT(suffix != NULL);
411
412 /*
413 * Now determine the numerical value present in the probe name.
414 */
415 for (; j >= len; j--) {
416 if (name[j] < '0' || name[j] > '9')
417 return;
418
419 val += (name[j] - '0') * mult;
420 mult *= (hrtime_t)10;
421 }
422
423 if (val == 0)
424 return;
425
426 /*
427 * Look-up the suffix to determine the multiplier.
428 */
429 for (i = 0, mult = 0; suffixes[i].name != NULL; i++) {
430 if (strcasecmp(suffixes[i].name, suffix) == 0) {
431 mult = suffixes[i].mult;
432 break;
433 }
434 }
435
436 if (suffixes[i].name == NULL && *suffix != '\0')
437 return;
438
439 if (mult == 0) {
440 /*
441 * The default is frequency-per-second.
442 */
443 val = NANOSEC / val;
444 } else {
445 val *= mult;
446 }
447
448 profile_create(val, name, kind);
449 }
450
451 /* ARGSUSED */
452 static void
profile_destroy(void * arg,dtrace_id_t id,void * parg)453 profile_destroy(void *arg, dtrace_id_t id, void *parg)
454 {
455 profile_probe_t *prof = parg;
456
457 #ifdef illumos
458 ASSERT(prof->prof_cyclic == CYCLIC_NONE);
459 #else
460 ASSERT(!callout_active(&prof->prof_cyclic) && prof->prof_pcpus == NULL);
461 #endif
462 kmem_free(prof, sizeof (profile_probe_t));
463
464 ASSERT(profile_total >= 1);
465 atomic_add_32(&profile_total, -1);
466 }
467
468 #ifdef illumos
469 /*ARGSUSED*/
470 static void
profile_online(void * arg,cpu_t * cpu,cyc_handler_t * hdlr,cyc_time_t * when)471 profile_online(void *arg, cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when)
472 {
473 profile_probe_t *prof = arg;
474 profile_probe_percpu_t *pcpu;
475
476 pcpu = kmem_zalloc(sizeof (profile_probe_percpu_t), KM_SLEEP);
477 pcpu->profc_probe = prof;
478
479 hdlr->cyh_func = profile_fire;
480 hdlr->cyh_arg = pcpu;
481
482 when->cyt_interval = prof->prof_interval;
483 when->cyt_when = gethrtime() + when->cyt_interval;
484
485 pcpu->profc_expected = when->cyt_when;
486 pcpu->profc_interval = when->cyt_interval;
487 }
488
489 /*ARGSUSED*/
490 static void
profile_offline(void * arg,cpu_t * cpu,void * oarg)491 profile_offline(void *arg, cpu_t *cpu, void *oarg)
492 {
493 profile_probe_percpu_t *pcpu = oarg;
494
495 ASSERT(pcpu->profc_probe == arg);
496 kmem_free(pcpu, sizeof (profile_probe_percpu_t));
497 }
498
499 /* ARGSUSED */
500 static void
profile_enable(void * arg,dtrace_id_t id,void * parg)501 profile_enable(void *arg, dtrace_id_t id, void *parg)
502 {
503 profile_probe_t *prof = parg;
504 cyc_omni_handler_t omni;
505 cyc_handler_t hdlr;
506 cyc_time_t when;
507
508 ASSERT(prof->prof_interval != 0);
509 ASSERT(MUTEX_HELD(&cpu_lock));
510
511 if (prof->prof_kind == PROF_TICK) {
512 hdlr.cyh_func = profile_tick;
513 hdlr.cyh_arg = prof;
514
515 when.cyt_interval = prof->prof_interval;
516 when.cyt_when = gethrtime() + when.cyt_interval;
517 } else {
518 ASSERT(prof->prof_kind == PROF_PROFILE);
519 omni.cyo_online = profile_online;
520 omni.cyo_offline = profile_offline;
521 omni.cyo_arg = prof;
522 }
523
524 if (prof->prof_kind == PROF_TICK) {
525 prof->prof_cyclic = cyclic_add(&hdlr, &when);
526 } else {
527 prof->prof_cyclic = cyclic_add_omni(&omni);
528 }
529 }
530
531 /* ARGSUSED */
532 static void
profile_disable(void * arg,dtrace_id_t id,void * parg)533 profile_disable(void *arg, dtrace_id_t id, void *parg)
534 {
535 profile_probe_t *prof = parg;
536
537 ASSERT(prof->prof_cyclic != CYCLIC_NONE);
538 ASSERT(MUTEX_HELD(&cpu_lock));
539
540 cyclic_remove(prof->prof_cyclic);
541 prof->prof_cyclic = CYCLIC_NONE;
542 }
543
544 #else
545
546 static void
profile_enable_omni(profile_probe_t * prof)547 profile_enable_omni(profile_probe_t *prof)
548 {
549 profile_probe_percpu_t *pcpu;
550 int cpu;
551
552 prof->prof_pcpus = kmem_zalloc((mp_maxid + 1) * sizeof(pcpu), KM_SLEEP);
553 CPU_FOREACH(cpu) {
554 pcpu = kmem_zalloc(sizeof(profile_probe_percpu_t), KM_SLEEP);
555 prof->prof_pcpus[cpu] = pcpu;
556 pcpu->profc_probe = prof;
557 pcpu->profc_expected = sbinuptime() + prof->prof_interval;
558 pcpu->profc_interval = prof->prof_interval;
559 callout_init(&pcpu->profc_cyclic, 1);
560 callout_reset_sbt_on(&pcpu->profc_cyclic,
561 pcpu->profc_expected, 0, profile_fire, pcpu,
562 cpu, C_DIRECT_EXEC | C_ABSOLUTE);
563 }
564 }
565
566 static void
profile_disable_omni(profile_probe_t * prof)567 profile_disable_omni(profile_probe_t *prof)
568 {
569 profile_probe_percpu_t *pcpu;
570 int cpu;
571
572 ASSERT(prof->prof_pcpus != NULL);
573 CPU_FOREACH(cpu) {
574 pcpu = prof->prof_pcpus[cpu];
575 ASSERT(pcpu->profc_probe == prof);
576 ASSERT(callout_active(&pcpu->profc_cyclic));
577 callout_stop(&pcpu->profc_cyclic);
578 callout_drain(&pcpu->profc_cyclic);
579 kmem_free(pcpu, sizeof(profile_probe_percpu_t));
580 }
581 kmem_free(prof->prof_pcpus, (mp_maxid + 1) * sizeof(pcpu));
582 prof->prof_pcpus = NULL;
583 }
584
585 /* ARGSUSED */
586 static void
profile_enable(void * arg,dtrace_id_t id,void * parg)587 profile_enable(void *arg, dtrace_id_t id, void *parg)
588 {
589 profile_probe_t *prof = parg;
590
591 if (prof->prof_kind == PROF_TICK) {
592 prof->prof_expected = sbinuptime() + prof->prof_interval;
593 callout_reset_sbt(&prof->prof_cyclic,
594 prof->prof_expected, 0, profile_tick, prof,
595 C_DIRECT_EXEC | C_ABSOLUTE);
596 } else {
597 ASSERT(prof->prof_kind == PROF_PROFILE);
598 profile_enable_omni(prof);
599 }
600 }
601
602 /* ARGSUSED */
603 static void
profile_disable(void * arg,dtrace_id_t id,void * parg)604 profile_disable(void *arg, dtrace_id_t id, void *parg)
605 {
606 profile_probe_t *prof = parg;
607
608 if (prof->prof_kind == PROF_TICK) {
609 ASSERT(callout_active(&prof->prof_cyclic));
610 callout_stop(&prof->prof_cyclic);
611 callout_drain(&prof->prof_cyclic);
612 } else {
613 ASSERT(prof->prof_kind == PROF_PROFILE);
614 profile_disable_omni(prof);
615 }
616 }
617 #endif
618
619 static void
profile_load(void * dummy)620 profile_load(void *dummy)
621 {
622 if (dtrace_register("profile", &profile_attr, DTRACE_PRIV_USER,
623 NULL, &profile_pops, NULL, &profile_id) != 0)
624 return;
625 }
626
627
628 static int
profile_unload(void)629 profile_unload(void)
630 {
631 int error = 0;
632
633 if ((error = dtrace_unregister(profile_id)) != 0)
634 return (error);
635
636 return (error);
637 }
638
639 /* ARGSUSED */
640 static int
profile_modevent(module_t mod __unused,int type,void * data __unused)641 profile_modevent(module_t mod __unused, int type, void *data __unused)
642 {
643 int error = 0;
644
645 switch (type) {
646 case MOD_LOAD:
647 break;
648
649 case MOD_UNLOAD:
650 break;
651
652 case MOD_SHUTDOWN:
653 break;
654
655 default:
656 error = EOPNOTSUPP;
657 break;
658
659 }
660 return (error);
661 }
662
663 SYSINIT(profile_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, profile_load, NULL);
664 SYSUNINIT(profile_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, profile_unload, NULL);
665
666 DEV_MODULE(profile, profile_modevent, NULL);
667 MODULE_VERSION(profile, 1);
668 MODULE_DEPEND(profile, dtrace, 1, 1, 1);
669 MODULE_DEPEND(profile, opensolaris, 1, 1, 1);
670