1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 #include <sys/errno.h>
28 #include <sys/cpuvar.h>
29 #include <sys/stat.h>
30 #include <sys/modctl.h>
31 #include <sys/cmn_err.h>
32 #include <sys/ddi.h>
33 #include <sys/sunddi.h>
34 #include <sys/ksynch.h>
35 #include <sys/conf.h>
36 #include <sys/kmem.h>
37 #include <sys/kcpc.h>
38 #include <sys/cap_util.h>
39 #include <sys/cpc_pcbe.h>
40 #include <sys/cpc_impl.h>
41 #include <sys/dtrace_impl.h>
42
43 /*
44 * DTrace CPU Performance Counter Provider
45 * ---------------------------------------
46 *
47 * The DTrace cpc provider allows DTrace consumers to access the CPU
48 * performance counter overflow mechanism of a CPU. The configuration
49 * presented in a probe specification is programmed into the performance
50 * counter hardware of all available CPUs on a system. Programming the
51 * hardware causes a counter on each CPU to begin counting events of the
52 * given type. When the specified number of events has occurred, an overflow
53 * interrupt is generated and the probe fires.
54 *
55 * The required configuration for the performance counter is encoded into
56 * the probe specification and this includes the performance counter event
57 * name, processor mode, overflow rate and an optional unit mask.
58 *
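 * For example, a consumer interested in user-mode cycles, sampled every
 * 10000 events, might enable a probe such as the following (an illustrative
 * specification; the event names actually available depend on the platform
 * and its PCBE back end):
 *
 *	cpc:::PAPI_tot_cyc-user-10000
 *	{
 *		@[execname] = count();
 *	}
 *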
59 * Most processors provide several counters (PICs) which can count all or a
60 * subset of the events available for a given CPU. However, when overflow
61 * profiling is being used, not all CPUs can detect which counter generated the
62 * overflow interrupt. In this case we cannot reliably determine which counter
63 * overflowed and we therefore only allow such CPUs to configure one event at
64 * a time. Processors that can determine the counter which overflowed are
65 * allowed to program as many events at one time as possible (in theory up to
66 * the number of instrumentation counters supported by that platform).
67 * Therefore, multiple consumers can enable multiple probes at the same time
68 * on such platforms. Platforms which cannot determine the source of an
69 * overflow interrupt are only allowed to program a single event at one time.
70 *
71 * The performance counter hardware is made available to consumers on a
72 * first-come, first-served basis. Only a finite amount of hardware resource
73 * is available and, while we make every attempt to accommodate requests from
74 * consumers, we must deny requests when hardware resources have been exhausted.
75 * A consumer will fail to enable probes when resources are currently in use.
76 *
77 * The cpc provider contends for shared hardware resources along with other
78 * consumers of the kernel CPU performance counter subsystem (e.g. cpustat(1M)).
79 * Only one such consumer can use the performance counters at any one time and
80 * counters are made available on a first-come, first-served basis. As with
81 * cpustat, the cpc provider has priority over per-LWP libcpc usage (e.g.
82 * cputrack(1)). Invoking the cpc provider will cause all existing per-LWP
83 * counter contexts to be invalidated.
84 */
85
86 typedef struct dcpc_probe {
87 char dcpc_event_name[CPC_MAX_EVENT_LEN];
88 int dcpc_flag; /* flags (USER/SYS) */
89 uint32_t dcpc_ovfval; /* overflow value */
90 int64_t dcpc_umask; /* umask/emask for this event */
91 int dcpc_picno; /* pic this event is programmed in */
92 int dcpc_enabled; /* probe is actually enabled? */
93 int dcpc_disabling; /* probe is currently being disabled */
94 dtrace_id_t dcpc_id; /* probeid this request is enabling */
95 int dcpc_actv_req_idx; /* idx into dcpc_actv_reqs[] */
96 } dcpc_probe_t;
97
98 static dev_info_t *dcpc_devi;			/* dcpc pseudo-device devinfo */
99 static dtrace_provider_id_t dcpc_pid;		/* cpc provider id */
100 static dcpc_probe_t **dcpc_actv_reqs;		/* currently active requests */
101 static uint32_t dcpc_enablings = 0;		/* number of active enablings */
102 static int dcpc_ovf_mask = 0;			/* mask of all available counters */
103 static int dcpc_mult_ovf_cap = 0;		/* h/w identifies overflowed pic */
104 static int dcpc_mask_type = 0;			/* mask attr type (umask/emask) */
105
106 /*
107 * When the dcpc provider is loaded, dcpc_min_overflow is set to either
108 * DCPC_MIN_OVF_DEFAULT or the value that dcpc-min-overflow is set to in
109 * the dcpc.conf file. Decrease this value to set probes with smaller
110 * overflow values. Remember that very small values could render a system
111 * unusable with frequently occurring events.
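 *
 * For example, a dcpc.conf entry such as the following (an illustrative
 * value, using standard driver.conf property syntax) would lower the minimum:
 *
 *	dcpc-min-overflow=2500;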
112 */
113 #define DCPC_MIN_OVF_DEFAULT 5000
114 static uint32_t dcpc_min_overflow;
115
116 static int dcpc_aframes = 0; /* override for artificial frame setting */
117 #if defined(__x86)
118 #define DCPC_ARTIFICIAL_FRAMES 8
119 #elif defined(__sparc)
120 #define DCPC_ARTIFICIAL_FRAMES 2
121 #endif
122
123 /*
124 * Called from the platform overflow interrupt handler. 'bitmap' is a mask
125 * which contains the pic(s) that have overflowed.
126 */
127 static void
128 dcpc_fire(uint64_t bitmap)
129 {
130 int i;
131
132 /*
133 * No counter was marked as overflowing. Shout about it and get out.
134 */
135 if ((bitmap & dcpc_ovf_mask) == 0) {
136 cmn_err(CE_NOTE, "dcpc_fire: no counter overflow found\n");
137 return;
138 }
139
140 /*
141 * This is the common case of a processor that doesn't support
142 * multiple overflow events. Such systems are only allowed a single
143 * enabling and therefore we just look for the first entry in
144 * the active request array.
145 */
146 if (!dcpc_mult_ovf_cap) {
147 for (i = 0; i < cpc_ncounters; i++) {
148 if (dcpc_actv_reqs[i] != NULL) {
149 dtrace_probe(dcpc_actv_reqs[i]->dcpc_id,
150 CPU->cpu_cpcprofile_pc,
151 CPU->cpu_cpcprofile_upc, 0, 0, 0);
152 return;
153 }
154 }
155 return;
156 }
157
158 /*
159 * This is a processor capable of handling multiple overflow events.
160 * Iterate over the array of active requests and locate the counters
161 * that overflowed (note: it is possible for more than one counter to
162 * have overflowed at the same time).
163 */
164 for (i = 0; i < cpc_ncounters; i++) {
165 if (dcpc_actv_reqs[i] != NULL &&
166 (bitmap & (1ULL << dcpc_actv_reqs[i]->dcpc_picno))) {
167 dtrace_probe(dcpc_actv_reqs[i]->dcpc_id,
168 CPU->cpu_cpcprofile_pc,
169 CPU->cpu_cpcprofile_upc, 0, 0, 0);
170 }
171 }
172 }
173
174 static void
175 dcpc_create_probe(dtrace_provider_id_t id, const char *probename,
176 char *eventname, int64_t umask, uint32_t ovfval, char flag)
177 {
178 dcpc_probe_t *pp;
179 int nr_frames = DCPC_ARTIFICIAL_FRAMES + dtrace_mach_aframes();
180
181 if (dcpc_aframes)
182 nr_frames = dcpc_aframes;
183
184 if (dtrace_probe_lookup(id, NULL, NULL, probename) != 0)
185 return;
186
187 pp = kmem_zalloc(sizeof (dcpc_probe_t), KM_SLEEP);
188 (void) strncpy(pp->dcpc_event_name, eventname,
189 sizeof (pp->dcpc_event_name) - 1);
190 pp->dcpc_event_name[sizeof (pp->dcpc_event_name) - 1] = '\0';
191 pp->dcpc_flag = flag | CPC_OVF_NOTIFY_EMT;
192 pp->dcpc_ovfval = ovfval;
193 pp->dcpc_umask = umask;
194 pp->dcpc_actv_req_idx = pp->dcpc_picno = pp->dcpc_disabling = -1;
195
196 pp->dcpc_id = dtrace_probe_create(id, NULL, NULL, probename,
197 nr_frames, pp);
198 }
199
200 /*ARGSUSED*/
201 static void
202 dcpc_provide(void *arg, const dtrace_probedesc_t *desc)
203 {
204 /*
205 * The format of a probe is:
206 *
207 * event_name-mode-{optional_umask}-overflow_rate
208 * e.g.
209 * DC_refill_from_system-user-0x1e-50000, or,
210 * DC_refill_from_system-all-10000
211 *
212 */
213 char *str, *end, *p;
214 int i, flag = 0;
215 char event[CPC_MAX_EVENT_LEN];
216 long umask = -1, val = 0;
217 size_t evlen, len;
218
219 /*
220 * The 'cpc' provider offers no probes by default.
221 */
222 if (desc == NULL)
223 return;
224
225 len = strlen(desc->dtpd_name);
226 p = str = kmem_alloc(len + 1, KM_SLEEP);
227 (void) strcpy(str, desc->dtpd_name);
228
229 /*
230 * We have a poor man's strtok() going on here. Replace any hyphens
231 * in the probe name with NULL characters in order to make it
232 * easy to parse the string with regular string functions.
233 */
234 for (i = 0; i < len; i++) {
235 if (str[i] == '-')
236 str[i] = '\0';
237 }
238
239 /*
240 * The first part of the string must be either a platform event
241 * name or a generic event name.
242 */
243 evlen = strlen(p);
244 (void) strncpy(event, p, CPC_MAX_EVENT_LEN - 1);
245 event[CPC_MAX_EVENT_LEN - 1] = '\0';
246
247 /*
248 * The next part of the name is the mode specification. Valid
249 * settings are "user", "kernel" or "all".
250 */
251 p += evlen + 1;
252
253 if (strcmp(p, "user") == 0)
254 flag |= CPC_COUNT_USER;
255 else if (strcmp(p, "kernel") == 0)
256 flag |= CPC_COUNT_SYSTEM;
257 else if (strcmp(p, "all") == 0)
258 flag |= CPC_COUNT_USER | CPC_COUNT_SYSTEM;
259 else
260 goto err;
261
262 /*
263 * Next we either have a mask specification followed by an overflow
264 * rate or just an overflow rate on its own.
265 */
266 p += strlen(p) + 1;
267 if (p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) {
268 /*
269 * A unit mask can only be specified if:
270 * 1) this performance counter back end supports masks.
271 * 2) the specified event is platform specific.
272 * 3) a valid hex number is converted.
273 * 4) no extraneous characters follow the mask specification.
274 */
275 if (dcpc_mask_type != 0 && strncmp(event, "PAPI", 4) != 0 &&
276 ddi_strtol(p, &end, 16, &umask) == 0 &&
277 end == p + strlen(p)) {
278 p += strlen(p) + 1;
279 } else {
280 goto err;
281 }
282 }
283
284 /*
285 * This final part must be an overflow value, which must be at least
286 * the minimum permissible overflow rate.
287 */
288 if ((ddi_strtol(p, &end, 10, &val) != 0) || end != p + strlen(p) ||
289 val < dcpc_min_overflow)
290 goto err;
291
292 /*
293 * Validate the event and create the probe.
294 */
295 for (i = 0; i < cpc_ncounters; i++) {
296 char *events, *cp, *p, *end;
297 int found = 0, j;
298 size_t llen;
299
300 if ((events = kcpc_list_events(i)) == NULL)
301 goto err;
302
303 llen = strlen(events);
304 p = cp = ddi_strdup(events, KM_NOSLEEP);
305 end = cp + llen;
306
307 for (j = 0; j < llen; j++) {
308 if (cp[j] == ',')
309 cp[j] = '\0';
310 }
311
312 while (p < end && found == 0) {
313 if (strcmp(p, event) == 0) {
314 dcpc_create_probe(dcpc_pid, desc->dtpd_name,
315 event, umask, (uint32_t)val, flag);
316 found = 1;
317 }
318 p += strlen(p) + 1;
319 }
320 kmem_free(cp, llen + 1);
321
322 if (found)
323 break;
324 }
325
326 err:
327 kmem_free(str, len + 1);
328 }
329
330 /*ARGSUSED*/
331 static void
332 dcpc_destroy(void *arg, dtrace_id_t id, void *parg)
333 {
334 dcpc_probe_t *pp = parg;
335
336 ASSERT(pp->dcpc_enabled == 0);
337 kmem_free(pp, sizeof (dcpc_probe_t));
338 }
339
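/*
 * Report whether a firing should be attributed to kernel or user context.
 * cpu_cpcprofile_pc holds the kernel PC captured when the overflow was
 * taken; if it is zero, the overflow occurred while running in user mode.
 */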
340 /*ARGSUSED*/
341 static int
342 dcpc_mode(void *arg, dtrace_id_t id, void *parg)
343 {
344 if (CPU->cpu_cpcprofile_pc == 0) {
345 return (DTRACE_MODE_NOPRIV_DROP | DTRACE_MODE_USER);
346 } else {
347 return (DTRACE_MODE_NOPRIV_DROP | DTRACE_MODE_KERNEL);
348 }
349 }
350
351 static void
352 dcpc_populate_set(cpu_t *c, dcpc_probe_t *pp, kcpc_set_t *set, int reqno)
353 {
354 kcpc_set_t *oset;
355 int i;
356
357 (void) strncpy(set->ks_req[reqno].kr_event, pp->dcpc_event_name,
358 CPC_MAX_EVENT_LEN);
359 set->ks_req[reqno].kr_config = NULL;
360 set->ks_req[reqno].kr_index = reqno;
361 set->ks_req[reqno].kr_picnum = -1;
362 set->ks_req[reqno].kr_flags = pp->dcpc_flag;
363
364 /*
365 * If a unit mask has been specified then detect which attribute
366 * the platform needs. For now, it's either "umask" or "emask".
367 */
368 if (pp->dcpc_umask >= 0) {
369 set->ks_req[reqno].kr_attr =
370 kmem_zalloc(sizeof (kcpc_attr_t), KM_SLEEP);
371 set->ks_req[reqno].kr_nattrs = 1;
372 if (dcpc_mask_type & DCPC_UMASK)
373 (void) strncpy(set->ks_req[reqno].kr_attr->ka_name,
374 "umask", 5);
375 else
376 (void) strncpy(set->ks_req[reqno].kr_attr->ka_name,
377 "emask", 5);
378 set->ks_req[reqno].kr_attr->ka_val = pp->dcpc_umask;
379 } else {
380 set->ks_req[reqno].kr_attr = NULL;
381 set->ks_req[reqno].kr_nattrs = 0;
382 }
383
384 /*
385 * If this probe is enabled, obtain its current countdown value
386 * and use that. The CPU's cpc context might not exist yet if we
387 * are dealing with a CPU that is just coming online.
388 */
389 if (pp->dcpc_enabled && (c->cpu_cpc_ctx != NULL)) {
390 oset = c->cpu_cpc_ctx->kc_set;
391
392 for (i = 0; i < oset->ks_nreqs; i++) {
393 if (strcmp(oset->ks_req[i].kr_event,
394 set->ks_req[reqno].kr_event) == 0) {
395 set->ks_req[reqno].kr_preset =
396 *(oset->ks_req[i].kr_data);
397 }
398 }
399 } else {
400 set->ks_req[reqno].kr_preset = UINT64_MAX - pp->dcpc_ovfval;
401 }
402
403 set->ks_nreqs++;
404 }
405
406
407 /*
408 * Create a fresh request set for the enablings represented in the
409 * 'dcpc_actv_reqs' array which contains the probes we want to be
410 * in the set. This can be called for several reasons:
411 *
412 * 1) We are on a single or multi overflow platform and we have no
413 * current events so we can just create the set and initialize it.
414 * 2) We are on a multi-overflow platform and we already have one or
415 * more existing events and we are adding a new enabling. Create a
416 * new set and copy old requests in and then add the new request.
417 * 3) We are on a multi-overflow platform and we have just removed an
418 * enabling but we still have enablings which are valid. Create a new
419 * set and copy in still valid requests.
420 */
421 static kcpc_set_t *
422 dcpc_create_set(cpu_t *c)
423 {
424 int i, reqno = 0;
425 int active_requests = 0;
426 kcpc_set_t *set;
427
428 /*
429 * First get a count of the number of currently active requests.
430 * Note that dcpc_actv_reqs[] should always reflect which requests
431 * we want to be in the set that is to be created. It is the
432 * responsibility of the caller of dcpc_create_set() to adjust that
433 * array accordingly beforehand.
434 */
435 for (i = 0; i < cpc_ncounters; i++) {
436 if (dcpc_actv_reqs[i] != NULL)
437 active_requests++;
438 }
439
440 set = kmem_zalloc(sizeof (kcpc_set_t), KM_SLEEP);
441
442 set->ks_req =
443 kmem_zalloc(sizeof (kcpc_request_t) * active_requests, KM_SLEEP);
444
445 set->ks_data =
446 kmem_zalloc(active_requests * sizeof (uint64_t), KM_SLEEP);
447
448 /*
449 * Look for valid entries in the active requests array and populate
450 * the request set for any entries found.
451 */
452 for (i = 0; i < cpc_ncounters; i++) {
453 if (dcpc_actv_reqs[i] != NULL) {
454 dcpc_populate_set(c, dcpc_actv_reqs[i], set, reqno);
455 reqno++;
456 }
457 }
458
459 return (set);
460 }
461
462 static int
463 dcpc_program_cpu_event(cpu_t *c)
464 {
465 int i, j, subcode;
466 kcpc_ctx_t *ctx, *octx;
467 kcpc_set_t *set;
468
469 set = dcpc_create_set(c);
470
471 set->ks_ctx = ctx = kcpc_ctx_alloc(KM_SLEEP);
472 ctx->kc_set = set;
473 ctx->kc_cpuid = c->cpu_id;
474
475 if (kcpc_assign_reqs(set, ctx) != 0)
476 goto err;
477
478 if (kcpc_configure_reqs(ctx, set, &subcode) != 0)
479 goto err;
480
481 for (i = 0; i < set->ks_nreqs; i++) {
482 for (j = 0; j < cpc_ncounters; j++) {
483 if (dcpc_actv_reqs[j] != NULL &&
484 strcmp(set->ks_req[i].kr_event,
485 dcpc_actv_reqs[j]->dcpc_event_name) == 0) {
486 dcpc_actv_reqs[j]->dcpc_picno =
487 set->ks_req[i].kr_picnum;
488 }
489 }
490 }
491
492 /*
493 * If we already have an active enabling then save the current cpc
494 * context away.
495 */
496 octx = c->cpu_cpc_ctx;
497
498 kcpc_cpu_program(c, ctx);
499
500 if (octx != NULL) {
501 kcpc_set_t *oset = octx->kc_set;
502 kmem_free(oset->ks_data, oset->ks_nreqs * sizeof (uint64_t));
503 kcpc_free_configs(oset);
504 kcpc_free_set(oset);
505 kcpc_ctx_free(octx);
506 }
507
508 return (0);
509
510 err:
511 /*
512 * We failed to configure this request, so free things up and
513 * get out.
514 */
515 kcpc_free_configs(set);
516 kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
517 kcpc_free_set(set);
518 kcpc_ctx_free(ctx);
519
520 return (-1);
521 }
522
523 static void
524 dcpc_disable_cpu(cpu_t *c)
525 {
526 kcpc_ctx_t *ctx;
527 kcpc_set_t *set;
528
529 /*
530 * Leave this CPU alone if it's already offline.
531 */
532 if (c->cpu_flags & CPU_OFFLINE)
533 return;
534
535 /*
536 * Grab the CPU's CPC context before kcpc_cpu_stop() stops counters and
537 * changes it.
538 */
539 ctx = c->cpu_cpc_ctx;
540
541 kcpc_cpu_stop(c, B_FALSE);
542
543 set = ctx->kc_set;
544
545 kcpc_free_configs(set);
546 kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
547 kcpc_free_set(set);
548 kcpc_ctx_free(ctx);
549 }
550
551 /*
552 * The dcpc_*_interrupts() routines are responsible for manipulating the
553 * per-CPU dcpc interrupt state byte. The purpose of the state byte is to
554 * synchronize processing of hardware overflow interrupts with configuration
555 * changes made to the CPU performance counter subsystem by the dcpc provider.
556 *
557 * The dcpc provider claims ownership of the overflow interrupt mechanism
558 * by transitioning the state byte from DCPC_INTR_INACTIVE (indicating the
559 * dcpc provider is not in use) to DCPC_INTR_FREE (the dcpc provider owns the
560 * overflow mechanism and interrupts may be processed). Before modifying
561 * a CPU's configuration state, the state byte is transitioned from
562 * DCPC_INTR_FREE to DCPC_INTR_CONFIG ("configuration in process" state).
563 * The hardware overflow handler, kcpc_hw_overflow_intr(), will only process
564 * an interrupt when a configuration is not in process (i.e. the state is
565 * marked as free). During interrupt processing the state is set to
566 * DCPC_INTR_PROCESSING by the overflow handler. When the last dcpc based
567 * enabling is removed, the state byte is set to DCPC_INTR_INACTIVE to indicate
568 * the dcpc provider is no longer interested in overflow interrupts.
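 *
 * A rough sketch of the state transitions described above:
 *
 *   DCPC_INTR_INACTIVE --claim--> DCPC_INTR_FREE
 *   DCPC_INTR_FREE --block--> DCPC_INTR_CONFIG --release--> DCPC_INTR_FREE
 *   DCPC_INTR_FREE --interrupt--> DCPC_INTR_PROCESSING --> DCPC_INTR_FREE
 *   DCPC_INTR_FREE --surrender--> DCPC_INTR_INACTIVE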
569 */
570 static void
571 dcpc_block_interrupts(void)
572 {
573 cpu_t *c = cpu_list;
574 uint8_t *state;
575
576 ASSERT(cpu_core[c->cpu_id].cpuc_dcpc_intr_state != DCPC_INTR_INACTIVE);
577
578 do {
579 state = &cpu_core[c->cpu_id].cpuc_dcpc_intr_state;
580
581 while (atomic_cas_8(state, DCPC_INTR_FREE,
582 DCPC_INTR_CONFIG) != DCPC_INTR_FREE)
583 continue;
584
585 } while ((c = c->cpu_next) != cpu_list);
586 }
587
588 /*
589 * Set every CPU's dcpc interrupt state to DCPC_INTR_FREE to indicate that
590 * overflow interrupts can be processed safely.
591 */
592 static void
593 dcpc_release_interrupts(void)
594 {
595 cpu_t *c = cpu_list;
596
597 ASSERT(cpu_core[c->cpu_id].cpuc_dcpc_intr_state != DCPC_INTR_INACTIVE);
598
599 do {
600 cpu_core[c->cpu_id].cpuc_dcpc_intr_state = DCPC_INTR_FREE;
601 membar_producer();
602 } while ((c = c->cpu_next) != cpu_list);
603 }
604
605 /*
606 * Transition every CPU's dcpc interrupt state from DCPC_INTR_INACTIVE
607 * to DCPC_INTR_FREE. This indicates that the dcpc provider is now
608 * responsible for handling all overflow interrupt activity. Should only be
609 * called before enabling the first dcpc based probe.
610 */
611 static void
612 dcpc_claim_interrupts(void)
613 {
614 cpu_t *c = cpu_list;
615
616 ASSERT(cpu_core[c->cpu_id].cpuc_dcpc_intr_state == DCPC_INTR_INACTIVE);
617
618 do {
619 cpu_core[c->cpu_id].cpuc_dcpc_intr_state = DCPC_INTR_FREE;
620 membar_producer();
621 } while ((c = c->cpu_next) != cpu_list);
622 }
623
624 /*
625 * Set every CPU's dcpc interrupt state to DCPC_INTR_INACTIVE to indicate that
626 * the dcpc provider is no longer processing overflow interrupts. Only called
627 * during removal of the last dcpc based enabling.
628 */
629 static void
630 dcpc_surrender_interrupts(void)
631 {
632 cpu_t *c = cpu_list;
633
634 ASSERT(cpu_core[c->cpu_id].cpuc_dcpc_intr_state != DCPC_INTR_INACTIVE);
635
636 do {
637 cpu_core[c->cpu_id].cpuc_dcpc_intr_state = DCPC_INTR_INACTIVE;
638 membar_producer();
639 } while ((c = c->cpu_next) != cpu_list);
640 }
641
642 /*
643 * dcpc_program_event() can be called owing to a new enabling or if a multi
644 * overflow platform has disabled a request but needs to program the requests
645 * that are still valid.
646 *
647 * Every invocation of dcpc_program_event() will create a new kcpc_ctx_t
648 * and a new request set which contains the new enabling and any old enablings
649 * which are still valid (possible with multi-overflow platforms).
650 */
651 static int
652 dcpc_program_event(dcpc_probe_t *pp)
653 {
654 cpu_t *c;
655 int ret = 0;
656
657 ASSERT(MUTEX_HELD(&cpu_lock));
658
659 kpreempt_disable();
660
661 dcpc_block_interrupts();
662
663 c = cpu_list;
664
665 do {
666 /*
667 * Skip CPUs that are currently offline.
668 */
669 if (c->cpu_flags & CPU_OFFLINE)
670 continue;
671
672 /*
673 * Stop counters but preserve existing DTrace CPC context
674 * if there is one.
675 *
676 * If we come here when the first event is programmed for a CPU,
677 * there should be no DTrace CPC context installed. In this
678 * case, kcpc_cpu_stop() will ensure that there is no other
679 * context on the CPU.
680 *
681 * If we are adding a new enabling to an existing one, the CPU should
682 * have the old DTrace CPC context which we need to keep around
683 * since dcpc_program_event() will add to it.
684 */
685 if (c->cpu_cpc_ctx != NULL)
686 kcpc_cpu_stop(c, B_TRUE);
687 } while ((c = c->cpu_next) != cpu_list);
688
689 dcpc_release_interrupts();
690
691 /*
692 * If this enabling is being removed (in the case of a multi event
693 * capable system with more than one active enabling), we can now
694 * update the active request array to reflect the enablings that need
695 * to be reprogrammed.
696 */
697 if (pp->dcpc_disabling == 1)
698 dcpc_actv_reqs[pp->dcpc_actv_req_idx] = NULL;
699
700 do {
701 /*
702 * Skip CPUs that are currently offline.
703 */
704 if (c->cpu_flags & CPU_OFFLINE)
705 continue;
706
707 ret = dcpc_program_cpu_event(c);
708 } while ((c = c->cpu_next) != cpu_list && ret == 0);
709
710 /*
711 * If dcpc_program_cpu_event() fails then it is because we couldn't
712 * configure the requests in the set for the CPU and not because of
713 * an error programming the hardware. If we have a failure here then
714 * we assume no CPUs have been programmed in the above step as they
715 * are all configured identically.
716 */
717 if (ret != 0) {
718 pp->dcpc_enabled = 0;
719 kpreempt_enable();
720 return (-1);
721 }
722
723 if (pp->dcpc_disabling != 1)
724 pp->dcpc_enabled = 1;
725
726 kpreempt_enable();
727
728 return (0);
729 }
730
731 /*ARGSUSED*/
732 static int
733 dcpc_enable(void *arg, dtrace_id_t id, void *parg)
734 {
735 dcpc_probe_t *pp = parg;
736 int i, found = 0;
737 cpu_t *c;
738
739 ASSERT(MUTEX_HELD(&cpu_lock));
740
741 /*
742 * Bail out if the counters are being used by a libcpc consumer.
743 */
744 rw_enter(&kcpc_cpuctx_lock, RW_READER);
745 if (kcpc_cpuctx > 0) {
746 rw_exit(&kcpc_cpuctx_lock);
747 return (-1);
748 }
749
750 dtrace_cpc_in_use++;
751 rw_exit(&kcpc_cpuctx_lock);
752
753 /*
754 * Place this enabling in the first free entry of the active
755 * request array.
756 */
757 for (i = 0; i < cpc_ncounters; i++) {
758 if (dcpc_actv_reqs[i] == NULL) {
759 dcpc_actv_reqs[i] = pp;
760 pp->dcpc_actv_req_idx = i;
761 found = 1;
762 break;
763 }
764 }
765
766 /*
767 * If we couldn't find a slot for this probe then there is no
768 * room at the inn.
769 */
770 if (!found) {
771 dtrace_cpc_in_use--;
772 return (-1);
773 }
774
775 ASSERT(pp->dcpc_actv_req_idx >= 0);
776
777 /*
778 * DTrace is taking over CPC contexts, so stop collecting
779 * capacity/utilization data for all CPUs.
780 */
781 if (dtrace_cpc_in_use == 1)
782 cu_disable();
783
784 /*
785 * The following must hold true if we are to (attempt to) enable
786 * this request:
787 *
788 * 1) No enablings currently exist. We allow all platforms to
789 * proceed if this is true.
790 *
791 * OR
792 *
793 * 2) If the platform is multi overflow capable and there are
794 * fewer valid enablings than there are counters. There is no
795 * guarantee that a platform can accommodate as many events as
796 * it has counters for but we will at least try to program
797 * up to that many requests.
798 *
799 * The 'dcpc_enablings' variable is implicitly protected by locking
800 * provided by the DTrace framework and the cpu management framework.
801 */
802 if (dcpc_enablings == 0 || (dcpc_mult_ovf_cap &&
803 dcpc_enablings < cpc_ncounters)) {
804 /*
805 * Before attempting to program the first enabling we need to
806 * invalidate any lwp-based contexts and lay claim to the
807 * overflow interrupt mechanism.
808 */
809 if (dcpc_enablings == 0) {
810 kcpc_invalidate_all();
811 dcpc_claim_interrupts();
812 }
813
814 if (dcpc_program_event(pp) == 0) {
815 dcpc_enablings++;
816 return (0);
817 }
818 }
819
820 /*
821 * If active enablings existed before we failed to enable this probe
822 * on a multi event capable platform then we need to restart counters
823 * as they will have been stopped in the attempted configuration. The
824 * context should now just contain the request prior to this failed
825 * enabling.
826 */
827 if (dcpc_enablings > 0 && dcpc_mult_ovf_cap) {
828 c = cpu_list;
829
830 ASSERT(dcpc_mult_ovf_cap == 1);
831 do {
832 /*
833 * Skip CPUs that are currently offline.
834 */
835 if (c->cpu_flags & CPU_OFFLINE)
836 continue;
837
838 kcpc_cpu_program(c, c->cpu_cpc_ctx);
839 } while ((c = c->cpu_next) != cpu_list);
840 }
841
842 /*
843 * Give up any claim to the overflow interrupt mechanism if no
844 * dcpc based enablings exist.
845 */
846 if (dcpc_enablings == 0)
847 dcpc_surrender_interrupts();
848
849 dtrace_cpc_in_use--;
850 dcpc_actv_reqs[pp->dcpc_actv_req_idx] = NULL;
851 pp->dcpc_actv_req_idx = pp->dcpc_picno = -1;
852
853 /*
854 * If all probes are removed, enable capacity/utilization data
855 * collection for every CPU.
856 */
857 if (dtrace_cpc_in_use == 0)
858 cu_enable();
859
860 return (-1);
861 }
862
863 /*
864 * If only one enabling is active then remove the context and free
865 * everything up. If there are multiple enablings active then remove this
866 * one, its associated meta-data and re-program the hardware.
867 */
868 /*ARGSUSED*/
869 static void
870 dcpc_disable(void *arg, dtrace_id_t id, void *parg)
871 {
872 cpu_t *c;
873 dcpc_probe_t *pp = parg;
874
875 ASSERT(MUTEX_HELD(&cpu_lock));
876
877 kpreempt_disable();
878
879 /*
880 * This probe didn't actually make it as far as being fully enabled
881 * so we needn't do anything with it.
882 */
883 if (pp->dcpc_enabled == 0) {
884 /*
885 * If we actually allocated this request a slot in the
886 * request array but failed to enable it, then remove the
887 * entry in the array.
888 */
889 if (pp->dcpc_actv_req_idx >= 0) {
890 dcpc_actv_reqs[pp->dcpc_actv_req_idx] = NULL;
891 pp->dcpc_actv_req_idx = pp->dcpc_picno =
892 pp->dcpc_disabling = -1;
893 }
894
895 kpreempt_enable();
896 return;
897 }
898
899 /*
900 * If this is the only enabling then stop all the counters and
901 * free up the meta-data.
902 */
903 if (dcpc_enablings == 1) {
904 ASSERT(dtrace_cpc_in_use == 1);
905
906 dcpc_block_interrupts();
907
908 c = cpu_list;
909
910 do {
911 dcpc_disable_cpu(c);
912 } while ((c = c->cpu_next) != cpu_list);
913
914 dcpc_actv_reqs[pp->dcpc_actv_req_idx] = NULL;
915 dcpc_surrender_interrupts();
916 } else {
917 /*
918 * This platform can support multiple overflow events and
919 * the enabling being disabled is not the last one. Remove this
920 * enabling and re-program the hardware with the new config.
921 */
922 ASSERT(dcpc_mult_ovf_cap);
923 ASSERT(dcpc_enablings > 1);
924
925 pp->dcpc_disabling = 1;
926 (void) dcpc_program_event(pp);
927 }
928
929 kpreempt_enable();
930
931 dcpc_enablings--;
932 dtrace_cpc_in_use--;
933 pp->dcpc_enabled = 0;
934 pp->dcpc_actv_req_idx = pp->dcpc_picno = pp->dcpc_disabling = -1;
935
936 /*
937 * If all probes are removed, enable capacity/utilization data
938 * collection for every CPU.
939 */
940 if (dtrace_cpc_in_use == 0)
941 cu_enable();
942 }
943
944 /*ARGSUSED*/
945 static int
946 dcpc_cpu_setup(cpu_setup_t what, processorid_t cpu, void *arg)
947 {
948 cpu_t *c;
949 uint8_t *state;
950
951 ASSERT(MUTEX_HELD(&cpu_lock));
952
953 switch (what) {
954 case CPU_OFF:
955 /*
956 * Offline CPUs are not allowed to take part so remove this
957 * CPU if we are actively tracing.
958 */
959 if (dtrace_cpc_in_use) {
960 c = cpu_get(cpu);
961 state = &cpu_core[c->cpu_id].cpuc_dcpc_intr_state;
962
963 /*
964 * Indicate that a configuration is in process in
965 * order to stop overflow interrupts being processed
966 * on this CPU while we disable it.
967 */
968 while (atomic_cas_8(state, DCPC_INTR_FREE,
969 DCPC_INTR_CONFIG) != DCPC_INTR_FREE)
970 continue;
971
972 dcpc_disable_cpu(c);
973
974 /*
975 * Reset this CPU's interrupt state as the configuration
976 * has ended.
977 */
978 cpu_core[c->cpu_id].cpuc_dcpc_intr_state =
979 DCPC_INTR_FREE;
980 membar_producer();
981 }
982 break;
983
984 case CPU_ON:
985 case CPU_SETUP:
986 /*
987 * This CPU is being initialized or brought online so program
988 * it with the current request set if we are actively tracing.
989 */
990 if (dtrace_cpc_in_use) {
991 c = cpu_get(cpu);
992 (void) dcpc_program_cpu_event(c);
993 }
994 break;
995
996 default:
997 break;
998 }
999
1000 return (0);
1001 }
1002
1003 static dtrace_pattr_t dcpc_attr = {
1004 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
1005 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
1006 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
1007 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_CPU },
1008 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
1009 };
1010
1011 static dtrace_pops_t dcpc_pops = {
1012 dcpc_provide,
1013 NULL,
1014 dcpc_enable,
1015 dcpc_disable,
1016 NULL,
1017 NULL,
1018 NULL,
1019 NULL,
1020 dcpc_mode,
1021 dcpc_destroy
1022 };
1023
1024 /*ARGSUSED*/
1025 static int
1026 dcpc_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
1027 {
1028 return (0);
1029 }
1030
1031 /*ARGSUSED*/
1032 static int
1033 dcpc_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
1034 {
1035 int error;
1036
1037 switch (infocmd) {
1038 case DDI_INFO_DEVT2DEVINFO:
1039 *result = (void *)dcpc_devi;
1040 error = DDI_SUCCESS;
1041 break;
1042 case DDI_INFO_DEVT2INSTANCE:
1043 *result = (void *)0;
1044 error = DDI_SUCCESS;
1045 break;
1046 default:
1047 error = DDI_FAILURE;
1048 }
1049 return (error);
1050 }
1051
1052 static int
1053 dcpc_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
1054 {
1055 switch (cmd) {
1056 case DDI_DETACH:
1057 break;
1058 case DDI_SUSPEND:
1059 return (DDI_SUCCESS);
1060 default:
1061 return (DDI_FAILURE);
1062 }
1063
1064 if (dtrace_unregister(dcpc_pid) != 0)
1065 return (DDI_FAILURE);
1066
1067 ddi_remove_minor_node(devi, NULL);
1068
1069 mutex_enter(&cpu_lock);
1070 unregister_cpu_setup_func(dcpc_cpu_setup, NULL);
1071 mutex_exit(&cpu_lock);
1072
1073 kmem_free(dcpc_actv_reqs, cpc_ncounters * sizeof (dcpc_probe_t *));
1074
1075 kcpc_unregister_dcpc();
1076
1077 return (DDI_SUCCESS);
1078 }
1079
1080 static int
1081 dcpc_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
1082 {
1083 uint_t caps;
1084 char *attrs;
1085
1086 switch (cmd) {
1087 case DDI_ATTACH:
1088 break;
1089 case DDI_RESUME:
1090 return (DDI_SUCCESS);
1091 default:
1092 return (DDI_FAILURE);
1093 }
1094
1095 if (kcpc_pcbe_loaded() == -1)
1096 return (DDI_FAILURE);
1097
1098 caps = kcpc_pcbe_capabilities();
1099
1100 if (!(caps & CPC_CAP_OVERFLOW_INTERRUPT)) {
1101 cmn_err(CE_NOTE, "!dcpc: Counter Overflow not supported"\
1102 " on this processor");
1103 return (DDI_FAILURE);
1104 }
1105
1106 if (ddi_create_minor_node(devi, "dcpc", S_IFCHR, 0,
1107 DDI_PSEUDO, NULL) == DDI_FAILURE ||
1108 dtrace_register("cpc", &dcpc_attr, DTRACE_PRIV_KERNEL,
1109 NULL, &dcpc_pops, NULL, &dcpc_pid) != 0) {
1110 ddi_remove_minor_node(devi, NULL);
1111 return (DDI_FAILURE);
1112 }
1113
1114 mutex_enter(&cpu_lock);
1115 register_cpu_setup_func(dcpc_cpu_setup, NULL);
1116 mutex_exit(&cpu_lock);
1117
1118 dcpc_ovf_mask = (1 << cpc_ncounters) - 1;
1119 ASSERT(dcpc_ovf_mask != 0);
1120
1121 if (caps & CPC_CAP_OVERFLOW_PRECISE)
1122 dcpc_mult_ovf_cap = 1;
1123
1124 /*
1125 * Determine which, if any, mask attribute the back-end can use.
1126 */
1127 attrs = kcpc_list_attrs();
1128 if (strstr(attrs, "umask") != NULL)
1129 dcpc_mask_type |= DCPC_UMASK;
1130 else if (strstr(attrs, "emask") != NULL)
1131 dcpc_mask_type |= DCPC_EMASK;
1132
1133 /*
1134 * The dcpc_actv_reqs array is used to store the requests that
1135 * we currently have programmed. The order of requests in this
1136 * array is not necessarily the order in which the events appear in
1137 * the kcpc_request_t array. Once entered into a slot in the array
1138 * the entry is not moved until it's removed.
1139 */
1140 dcpc_actv_reqs =
1141 kmem_zalloc(cpc_ncounters * sizeof (dcpc_probe_t *), KM_SLEEP);
1142
1143 dcpc_min_overflow = ddi_prop_get_int(DDI_DEV_T_ANY, devi,
1144 DDI_PROP_DONTPASS, "dcpc-min-overflow", DCPC_MIN_OVF_DEFAULT);
1145
1146 kcpc_register_dcpc(dcpc_fire);
1147
1148 ddi_report_dev(devi);
1149 dcpc_devi = devi;
1150
1151 return (DDI_SUCCESS);
1152 }
1153
1154 static struct cb_ops dcpc_cb_ops = {
1155 dcpc_open, /* open */
1156 nodev, /* close */
1157 nulldev, /* strategy */
1158 nulldev, /* print */
1159 nodev, /* dump */
1160 nodev, /* read */
1161 nodev, /* write */
1162 nodev, /* ioctl */
1163 nodev, /* devmap */
1164 nodev, /* mmap */
1165 nodev, /* segmap */
1166 nochpoll, /* poll */
1167 ddi_prop_op, /* cb_prop_op */
1168 0, /* streamtab */
1169 D_NEW | D_MP /* Driver compatibility flag */
1170 };
1171
1172 static struct dev_ops dcpc_ops = {
1173 DEVO_REV, /* devo_rev, */
1174 0, /* refcnt */
1175 dcpc_info, /* get_dev_info */
1176 nulldev, /* identify */
1177 nulldev, /* probe */
1178 dcpc_attach, /* attach */
1179 dcpc_detach, /* detach */
1180 nodev, /* reset */
1181 &dcpc_cb_ops, /* driver operations */
1182 NULL, /* bus operations */
1183 nodev, /* dev power */
1184 ddi_quiesce_not_needed /* quiesce */
1185 };
1186
1187 /*
1188 * Module linkage information for the kernel.
1189 */
1190 static struct modldrv modldrv = {
1191 &mod_driverops, /* module type */
1192 "DTrace CPC Module", /* name of module */
1193 &dcpc_ops, /* driver ops */
1194 };
1195
1196 static struct modlinkage modlinkage = {
1197 MODREV_1,
1198 (void *)&modldrv,
1199 NULL
1200 };
1201
1202 int
1203 _init(void)
1204 {
1205 return (mod_install(&modlinkage));
1206 }
1207
1208 int
1209 _info(struct modinfo *modinfop)
1210 {
1211 return (mod_info(&modlinkage, modinfop));
1212 }
1213
1214 int
1215 _fini(void)
1216 {
1217 return (mod_remove(&modlinkage));
1218 }
1219