1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright (c) 2011, Joyent, Inc. All rights reserved. 28 */ 29 30 #include <sys/errno.h> 31 #include <sys/stat.h> 32 #include <sys/modctl.h> 33 #include <sys/conf.h> 34 #include <sys/systm.h> 35 #include <sys/ddi.h> 36 #include <sys/sunddi.h> 37 #include <sys/cpuvar.h> 38 #include <sys/kmem.h> 39 #include <sys/strsubr.h> 40 #include <sys/dtrace.h> 41 #include <sys/cyclic.h> 42 #include <sys/atomic.h> 43 44 static dev_info_t *profile_devi; 45 static dtrace_provider_id_t profile_id; 46 47 /* 48 * Regardless of platform, the stack frames look like this in the case of the 49 * profile provider: 50 * 51 * profile_fire 52 * cyclic_expire 53 * cyclic_fire 54 * [ cbe ] 55 * [ interrupt code ] 56 * 57 * On x86, there are five frames from the generic interrupt code; further, the 58 * interrupted instruction appears as its own stack frame, giving us a total of 59 * 10. 60 * 61 * On SPARC, the picture is further complicated because the compiler 62 * optimizes away tail-calls -- so the following frames are optimized away: 63 * 64 * profile_fire 65 * cyclic_expire 66 * 67 * This gives three frames. However, on DEBUG kernels, the cyclic_expire 68 * frame cannot be tail-call eliminated, yielding four frames in this case. 69 * 70 * All of the above constraints lead to the mess below. Yes, the profile 71 * provider should ideally figure this out on-the-fly by hitting one of its own 72 * probes and then walking its own stack trace. This is complicated, however, 73 * and the static definition doesn't seem to be overly brittle. Still, we 74 * allow for a manual override in case we get it completely wrong. 75 */ 76 #ifdef __x86 77 #define PROF_ARTIFICIAL_FRAMES 10 78 #else 79 #ifdef __sparc 80 #ifdef DEBUG 81 #define PROF_ARTIFICIAL_FRAMES 4 82 #else 83 #define PROF_ARTIFICIAL_FRAMES 3 84 #endif 85 #endif 86 #endif 87 88 #define PROF_NAMELEN 15 89 90 #define PROF_PROFILE 0 91 #define PROF_TICK 1 92 #define PROF_PREFIX_PROFILE "profile-" 93 #define PROF_PREFIX_TICK "tick-" 94 95 typedef struct profile_probe { 96 char prof_name[PROF_NAMELEN]; 97 dtrace_id_t prof_id; 98 int prof_kind; 99 hrtime_t prof_interval; 100 cyclic_id_t prof_cyclic; 101 } profile_probe_t; 102 103 typedef struct profile_probe_percpu { 104 hrtime_t profc_expected; 105 hrtime_t profc_interval; 106 profile_probe_t *profc_probe; 107 } profile_probe_percpu_t; 108 109 hrtime_t profile_interval_min = NANOSEC / 5000; /* 5000 hz */ 110 int profile_aframes = 0; /* override */ 111 112 static int profile_rates[] = { 113 97, 199, 499, 997, 1999, 114 4001, 4999, 0, 0, 0, 115 0, 0, 0, 0, 0, 116 0, 0, 0, 0, 0 117 }; 118 119 static int profile_ticks[] = { 120 1, 10, 100, 500, 1000, 121 5000, 0, 0, 0, 0, 122 0, 0, 0, 0, 0 123 }; 124 125 /* 126 * profile_max defines the upper bound on the number of profile probes that 127 * can exist (this is to prevent malicious or clumsy users from exhausing 128 * system resources by creating a slew of profile probes). At mod load time, 129 * this gets its value from PROFILE_MAX_DEFAULT or profile-max-probes if it's 130 * present in the profile.conf file. 131 */ 132 #define PROFILE_MAX_DEFAULT 1000 /* default max. number of probes */ 133 static uint32_t profile_max; /* maximum number of profile probes */ 134 static uint32_t profile_total; /* current number of profile probes */ 135 136 static void 137 profile_fire(void *arg) 138 { 139 profile_probe_percpu_t *pcpu = arg; 140 profile_probe_t *prof = pcpu->profc_probe; 141 hrtime_t late; 142 143 late = dtrace_gethrtime() - pcpu->profc_expected; 144 pcpu->profc_expected += pcpu->profc_interval; 145 146 dtrace_probe(prof->prof_id, CPU->cpu_profile_pc, 147 CPU->cpu_profile_upc, late, 0, 0); 148 } 149 150 static void 151 profile_tick(void *arg) 152 { 153 profile_probe_t *prof = arg; 154 155 dtrace_probe(prof->prof_id, CPU->cpu_profile_pc, 156 CPU->cpu_profile_upc, 0, 0, 0); 157 } 158 159 static void 160 profile_create(hrtime_t interval, const char *name, int kind) 161 { 162 profile_probe_t *prof; 163 int nr_frames = PROF_ARTIFICIAL_FRAMES + dtrace_mach_aframes(); 164 165 if (profile_aframes) 166 nr_frames = profile_aframes; 167 168 if (interval < profile_interval_min) 169 return; 170 171 if (dtrace_probe_lookup(profile_id, NULL, NULL, name) != 0) 172 return; 173 174 atomic_inc_32(&profile_total); 175 if (profile_total > profile_max) { 176 atomic_dec_32(&profile_total); 177 return; 178 } 179 180 prof = kmem_zalloc(sizeof (profile_probe_t), KM_SLEEP); 181 (void) strcpy(prof->prof_name, name); 182 prof->prof_interval = interval; 183 prof->prof_cyclic = CYCLIC_NONE; 184 prof->prof_kind = kind; 185 prof->prof_id = dtrace_probe_create(profile_id, 186 NULL, NULL, name, nr_frames, prof); 187 } 188 189 /*ARGSUSED*/ 190 static void 191 profile_provide(void *arg, const dtrace_probedesc_t *desc) 192 { 193 int i, j, rate, kind; 194 hrtime_t val = 0, mult = 1, len; 195 const char *name, *suffix = NULL; 196 197 const struct { 198 char *prefix; 199 int kind; 200 } types[] = { 201 { PROF_PREFIX_PROFILE, PROF_PROFILE }, 202 { PROF_PREFIX_TICK, PROF_TICK }, 203 { NULL, NULL } 204 }; 205 206 const struct { 207 char *name; 208 hrtime_t mult; 209 } suffixes[] = { 210 { "ns", NANOSEC / NANOSEC }, 211 { "nsec", NANOSEC / NANOSEC }, 212 { "us", NANOSEC / MICROSEC }, 213 { "usec", NANOSEC / MICROSEC }, 214 { "ms", NANOSEC / MILLISEC }, 215 { "msec", NANOSEC / MILLISEC }, 216 { "s", NANOSEC / SEC }, 217 { "sec", NANOSEC / SEC }, 218 { "m", NANOSEC * (hrtime_t)60 }, 219 { "min", NANOSEC * (hrtime_t)60 }, 220 { "h", NANOSEC * (hrtime_t)(60 * 60) }, 221 { "hour", NANOSEC * (hrtime_t)(60 * 60) }, 222 { "d", NANOSEC * (hrtime_t)(24 * 60 * 60) }, 223 { "day", NANOSEC * (hrtime_t)(24 * 60 * 60) }, 224 { "hz", 0 }, 225 { NULL } 226 }; 227 228 if (desc == NULL) { 229 char n[PROF_NAMELEN]; 230 231 /* 232 * If no description was provided, provide all of our probes. 233 */ 234 for (i = 0; i < sizeof (profile_rates) / sizeof (int); i++) { 235 if ((rate = profile_rates[i]) == 0) 236 continue; 237 238 (void) snprintf(n, PROF_NAMELEN, "%s%d", 239 PROF_PREFIX_PROFILE, rate); 240 profile_create(NANOSEC / rate, n, PROF_PROFILE); 241 } 242 243 for (i = 0; i < sizeof (profile_ticks) / sizeof (int); i++) { 244 if ((rate = profile_ticks[i]) == 0) 245 continue; 246 247 (void) snprintf(n, PROF_NAMELEN, "%s%d", 248 PROF_PREFIX_TICK, rate); 249 profile_create(NANOSEC / rate, n, PROF_TICK); 250 } 251 252 return; 253 } 254 255 name = desc->dtpd_name; 256 257 for (i = 0; types[i].prefix != NULL; i++) { 258 len = strlen(types[i].prefix); 259 260 if (strncmp(name, types[i].prefix, len) != 0) 261 continue; 262 break; 263 } 264 265 if (types[i].prefix == NULL) 266 return; 267 268 kind = types[i].kind; 269 j = strlen(name) - len; 270 271 /* 272 * We need to start before any time suffix. 273 */ 274 for (j = strlen(name); j >= len; j--) { 275 if (name[j] >= '0' && name[j] <= '9') 276 break; 277 suffix = &name[j]; 278 } 279 280 ASSERT(suffix != NULL); 281 282 /* 283 * Now determine the numerical value present in the probe name. 284 */ 285 for (; j >= len; j--) { 286 if (name[j] < '0' || name[j] > '9') 287 return; 288 289 val += (name[j] - '0') * mult; 290 mult *= (hrtime_t)10; 291 } 292 293 if (val == 0) 294 return; 295 296 /* 297 * Look-up the suffix to determine the multiplier. 298 */ 299 for (i = 0, mult = 0; suffixes[i].name != NULL; i++) { 300 if (strcasecmp(suffixes[i].name, suffix) == 0) { 301 mult = suffixes[i].mult; 302 break; 303 } 304 } 305 306 if (suffixes[i].name == NULL && *suffix != '\0') 307 return; 308 309 if (mult == 0) { 310 /* 311 * The default is frequency-per-second. 312 */ 313 val = NANOSEC / val; 314 } else { 315 val *= mult; 316 } 317 318 profile_create(val, name, kind); 319 } 320 321 /*ARGSUSED*/ 322 static void 323 profile_destroy(void *arg, dtrace_id_t id, void *parg) 324 { 325 profile_probe_t *prof = parg; 326 327 ASSERT(prof->prof_cyclic == CYCLIC_NONE); 328 kmem_free(prof, sizeof (profile_probe_t)); 329 330 ASSERT(profile_total >= 1); 331 atomic_dec_32(&profile_total); 332 } 333 334 /*ARGSUSED*/ 335 static void 336 profile_online(void *arg, cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when) 337 { 338 profile_probe_t *prof = arg; 339 profile_probe_percpu_t *pcpu; 340 341 pcpu = kmem_zalloc(sizeof (profile_probe_percpu_t), KM_SLEEP); 342 pcpu->profc_probe = prof; 343 344 hdlr->cyh_func = profile_fire; 345 hdlr->cyh_arg = pcpu; 346 hdlr->cyh_level = CY_HIGH_LEVEL; 347 348 when->cyt_interval = prof->prof_interval; 349 when->cyt_when = dtrace_gethrtime() + when->cyt_interval; 350 351 pcpu->profc_expected = when->cyt_when; 352 pcpu->profc_interval = when->cyt_interval; 353 } 354 355 /*ARGSUSED*/ 356 static void 357 profile_offline(void *arg, cpu_t *cpu, void *oarg) 358 { 359 profile_probe_percpu_t *pcpu = oarg; 360 361 ASSERT(pcpu->profc_probe == arg); 362 kmem_free(pcpu, sizeof (profile_probe_percpu_t)); 363 } 364 365 /*ARGSUSED*/ 366 static int 367 profile_enable(void *arg, dtrace_id_t id, void *parg) 368 { 369 profile_probe_t *prof = parg; 370 cyc_omni_handler_t omni; 371 cyc_handler_t hdlr; 372 cyc_time_t when; 373 374 ASSERT(prof->prof_interval != 0); 375 ASSERT(MUTEX_HELD(&cpu_lock)); 376 377 if (prof->prof_kind == PROF_TICK) { 378 hdlr.cyh_func = profile_tick; 379 hdlr.cyh_arg = prof; 380 hdlr.cyh_level = CY_HIGH_LEVEL; 381 382 when.cyt_interval = prof->prof_interval; 383 when.cyt_when = dtrace_gethrtime() + when.cyt_interval; 384 } else { 385 ASSERT(prof->prof_kind == PROF_PROFILE); 386 omni.cyo_online = profile_online; 387 omni.cyo_offline = profile_offline; 388 omni.cyo_arg = prof; 389 } 390 391 if (prof->prof_kind == PROF_TICK) { 392 prof->prof_cyclic = cyclic_add(&hdlr, &when); 393 } else { 394 prof->prof_cyclic = cyclic_add_omni(&omni); 395 } 396 return (0); 397 } 398 399 /*ARGSUSED*/ 400 static void 401 profile_disable(void *arg, dtrace_id_t id, void *parg) 402 { 403 profile_probe_t *prof = parg; 404 405 ASSERT(prof->prof_cyclic != CYCLIC_NONE); 406 ASSERT(MUTEX_HELD(&cpu_lock)); 407 408 cyclic_remove(prof->prof_cyclic); 409 prof->prof_cyclic = CYCLIC_NONE; 410 } 411 412 /*ARGSUSED*/ 413 static int 414 profile_mode(void *arg, dtrace_id_t id, void *parg) 415 { 416 profile_probe_t *prof = parg; 417 int mode; 418 419 if (CPU->cpu_profile_pc != 0) { 420 mode = DTRACE_MODE_KERNEL; 421 } else { 422 mode = DTRACE_MODE_USER; 423 } 424 425 if (prof->prof_kind == PROF_TICK) { 426 mode |= DTRACE_MODE_NOPRIV_RESTRICT; 427 } else { 428 ASSERT(prof->prof_kind == PROF_PROFILE); 429 mode |= DTRACE_MODE_NOPRIV_DROP; 430 } 431 432 return (mode); 433 } 434 435 static dtrace_pattr_t profile_attr = { 436 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON }, 437 { DTRACE_STABILITY_UNSTABLE, DTRACE_STABILITY_UNSTABLE, DTRACE_CLASS_UNKNOWN }, 438 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, 439 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON }, 440 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON }, 441 }; 442 443 static dtrace_pops_t profile_pops = { 444 profile_provide, 445 NULL, 446 profile_enable, 447 profile_disable, 448 NULL, 449 NULL, 450 NULL, 451 NULL, 452 profile_mode, 453 profile_destroy 454 }; 455 456 static int 457 profile_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) 458 { 459 switch (cmd) { 460 case DDI_ATTACH: 461 break; 462 case DDI_RESUME: 463 return (DDI_SUCCESS); 464 default: 465 return (DDI_FAILURE); 466 } 467 468 if (ddi_create_minor_node(devi, "profile", S_IFCHR, 0, 469 DDI_PSEUDO, NULL) == DDI_FAILURE || 470 dtrace_register("profile", &profile_attr, 471 DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER, NULL, 472 &profile_pops, NULL, &profile_id) != 0) { 473 ddi_remove_minor_node(devi, NULL); 474 return (DDI_FAILURE); 475 } 476 477 profile_max = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS, 478 "profile-max-probes", PROFILE_MAX_DEFAULT); 479 480 ddi_report_dev(devi); 481 profile_devi = devi; 482 return (DDI_SUCCESS); 483 } 484 485 static int 486 profile_detach(dev_info_t *devi, ddi_detach_cmd_t cmd) 487 { 488 switch (cmd) { 489 case DDI_DETACH: 490 break; 491 case DDI_SUSPEND: 492 return (DDI_SUCCESS); 493 default: 494 return (DDI_FAILURE); 495 } 496 497 if (dtrace_unregister(profile_id) != 0) 498 return (DDI_FAILURE); 499 500 ddi_remove_minor_node(devi, NULL); 501 return (DDI_SUCCESS); 502 } 503 504 /*ARGSUSED*/ 505 static int 506 profile_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) 507 { 508 int error; 509 510 switch (infocmd) { 511 case DDI_INFO_DEVT2DEVINFO: 512 *result = (void *)profile_devi; 513 error = DDI_SUCCESS; 514 break; 515 case DDI_INFO_DEVT2INSTANCE: 516 *result = (void *)0; 517 error = DDI_SUCCESS; 518 break; 519 default: 520 error = DDI_FAILURE; 521 } 522 return (error); 523 } 524 525 /*ARGSUSED*/ 526 static int 527 profile_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) 528 { 529 return (0); 530 } 531 532 static struct cb_ops profile_cb_ops = { 533 profile_open, /* open */ 534 nodev, /* close */ 535 nulldev, /* strategy */ 536 nulldev, /* print */ 537 nodev, /* dump */ 538 nodev, /* read */ 539 nodev, /* write */ 540 nodev, /* ioctl */ 541 nodev, /* devmap */ 542 nodev, /* mmap */ 543 nodev, /* segmap */ 544 nochpoll, /* poll */ 545 ddi_prop_op, /* cb_prop_op */ 546 0, /* streamtab */ 547 D_NEW | D_MP /* Driver compatibility flag */ 548 }; 549 550 static struct dev_ops profile_ops = { 551 DEVO_REV, /* devo_rev, */ 552 0, /* refcnt */ 553 profile_info, /* get_dev_info */ 554 nulldev, /* identify */ 555 nulldev, /* probe */ 556 profile_attach, /* attach */ 557 profile_detach, /* detach */ 558 nodev, /* reset */ 559 &profile_cb_ops, /* driver operations */ 560 NULL, /* bus operations */ 561 nodev, /* dev power */ 562 ddi_quiesce_not_needed, /* quiesce */ 563 }; 564 565 /* 566 * Module linkage information for the kernel. 567 */ 568 static struct modldrv modldrv = { 569 &mod_driverops, /* module type (this is a pseudo driver) */ 570 "Profile Interrupt Tracing", /* name of module */ 571 &profile_ops, /* driver ops */ 572 }; 573 574 static struct modlinkage modlinkage = { 575 MODREV_1, 576 (void *)&modldrv, 577 NULL 578 }; 579 580 int 581 _init(void) 582 { 583 return (mod_install(&modlinkage)); 584 } 585 586 int 587 _info(struct modinfo *modinfop) 588 { 589 return (mod_info(&modlinkage, modinfop)); 590 } 591 592 int 593 _fini(void) 594 { 595 return (mod_remove(&modlinkage)); 596 } 597