1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 /*
27 * Support for determining capacity and utilization of performance relevant
28 * hardware components in a computer
29 *
30 * THEORY
31 * ------
32 * The capacity and utilization of the performance relevant hardware components
33 * is needed to be able to optimize performance while minimizing the amount of
34 * power used on a system. The idea is to use hardware performance counters
35 * and potentially other means to determine the capacity and utilization of
36 * performance relevant hardware components (eg. execution pipeline, cache,
37 * memory, etc.) and attribute the utilization to the responsible CPU and the
38 * thread running there.
39 *
40 * This will help characterize the utilization of performance relevant
41 * components and how much is used by each CPU and each thread. With
42 * that data, the utilization can be aggregated to all the CPUs sharing each
43 * performance relevant hardware component to calculate the total utilization
44 * of each component and compare that with the component's capacity to
45 * essentially determine the actual hardware load of the component. The
46 * hardware utilization attributed to each running thread can also be
47 * aggregated to determine the total hardware utilization of each component to
48 * a workload.
49 *
50 * Once that is done, one can determine how much of each performance relevant
51 * hardware component is needed by a given thread or set of threads (eg. a
52 * workload) and size up exactly what hardware is needed by the threads and how
53 * much. With this info, we can better place threads among CPUs to match their
54 * exact hardware resource needs and potentially lower or raise the power based
55 * on their utilization or pack threads onto the fewest hardware components
56 * needed and power off any remaining unused components to minimize power
57 * without sacrificing performance.
58 *
59 * IMPLEMENTATION
60 * --------------
61 * The code has been designed and implemented to make (un)programming and
62 * reading the counters for a given CPU as lightweight and fast as possible.
63 * This is very important because we need to read and potentially (un)program
64 * the counters very often and in performance sensitive code. Specifically,
65 * the counters may need to be (un)programmed during context switch and/or a
66 * cyclic handler when there are more counter events to count than existing
67 * counters.
68 *
69 * Consequently, the code has been split up to allow allocating and
70 * initializing everything needed to program and read the counters on a given
71 * CPU once and make (un)programming and reading the counters for a given CPU
72 * not have to allocate/free memory or grab any locks. To do this, all the
73 * state needed to (un)program and read the counters on a CPU is kept per CPU
74 * and is made lock free by forcing any code that reads or manipulates the
75 * counters or the state needed to (un)program or read the counters to run on
76 * the target CPU and disable preemption while running on the target CPU to
77 * protect any critical sections. All counter manipulation on the target CPU is
78 * happening either from a cross-call to the target CPU or at the same PIL as
79 * used by the cross-call subsystem. This guarantees that counter manipulation
80 * is not interrupted by cross-calls from other CPUs.
81 *
82 * The synchronization has been made lock free or as simple as possible for
83 * performance and to avoid getting the locking all tangled up when we interpose
84 * on the CPC routines that (un)program the counters to manage the counters
85 * between the kernel and user on each CPU. When the user starts using the
86 * counters on a given CPU, the kernel will unprogram the counters that it is
87 * using on that CPU just before they are programmed for the user. Then the
88 * kernel will program the counters on a given CPU for its own use when the user
89 * stops using them.
90 *
91 * There is a special interaction with DTrace cpc provider (dcpc). Before dcpc
92 * enables any probe, it requests to disable and unprogram all counters used for
 * capacity and utilization. These counters are never re-programmed until
94 * dcpc completes. When all DTrace cpc probes are removed, dcpc notifies CU
95 * framework and it re-programs the counters.
96 *
97 * When a CPU is going offline, its CU counters are unprogrammed and disabled,
98 * so that they would not be re-programmed again by some other activity on the
99 * CPU that is going offline.
100 *
101 * The counters are programmed during boot. However, a flag is available to
102 * disable this if necessary (see cu_flag below). A handler is provided to
103 * (un)program the counters during CPU on/offline. Basic routines are provided
104 * to initialize and tear down this module, initialize and tear down any state
105 * needed for a given CPU, and (un)program the counters for a given CPU.
106 * Lastly, a handler is provided to read the counters and attribute the
107 * utilization to the responsible CPU.
108 */
109 #include <sys/types.h>
110 #include <sys/cmn_err.h>
111 #include <sys/cpuvar.h>
112 #include <sys/ddi.h>
113 #include <sys/systm.h>
114 #include <sys/disp.h>
115 #include <sys/sdt.h>
116 #include <sys/sunddi.h>
117 #include <sys/thread.h>
118 #include <sys/pghw.h>
119 #include <sys/cmt.h>
120 #include <sys/policy.h>
121 #include <sys/x_call.h>
122 #include <sys/cap_util.h>
123
124 #include <sys/archsystm.h>
125 #include <sys/promif.h>
126
127 #if defined(__x86)
128 #include <sys/xc_levels.h>
129 #endif
130
131
132 /*
133 * Default CPU hardware performance counter flags to use for measuring capacity
134 * and utilization
135 */
136 #define CU_CPC_FLAGS_DEFAULT \
137 (CPC_COUNT_USER|CPC_COUNT_SYSTEM|CPC_OVF_NOTIFY_EMT)
138
139 /*
140 * Possible Flags for controlling this module.
141 */
142 #define CU_FLAG_ENABLE 1 /* Enable module */
143 #define CU_FLAG_READY 2 /* Ready to setup module */
144 #define CU_FLAG_ON 4 /* Module is on */
145
146 /*
147 * pg_cpu kstats calculate utilization rate and maximum utilization rate for
148 * some CPUs. The rate is calculated based on data from two subsequent
149 * snapshots. When the time between such two snapshots is too small, the
150 * resulting rate may have low accuracy, so we only consider snapshots which
151 * are separated by SAMPLE_INTERVAL nanoseconds from one another. We do not
152 * update the rate if the interval is smaller than that.
153 *
154 * Use one tenth of a second as the minimum interval for utilization rate
155 * calculation.
156 *
157 * NOTE: The CU_SAMPLE_INTERVAL_MIN should be higher than the scaling factor in
158 * the CU_RATE() macro below to guarantee that we never divide by zero.
159 *
160 * Rate is the number of events per second. The rate is the number of events
161 * divided by time and multiplied by the number of nanoseconds in a second. We
162 * do not want time to be too small since it will cause large errors in
163 * division.
164 *
165 * We do not want to multiply two large numbers (the instruction count and
166 * NANOSEC) either since it may cause integer overflow. So we divide both the
167 * numerator and the denominator by the same value.
168 *
169 * NOTE: The scaling factor below should be less than CU_SAMPLE_INTERVAL_MIN
170 * above to guarantee that time divided by this value is always non-zero.
171 */
172 #define CU_RATE(val, time) \
173 (((val) * (NANOSEC / CU_SCALE)) / ((time) / CU_SCALE))
174
175 #define CU_SAMPLE_INTERVAL_MIN (NANOSEC / 10)
176
177 #define CU_SCALE (CU_SAMPLE_INTERVAL_MIN / 10000)
178
179 /*
180 * When the time between two kstat reads for the same CPU is less than
181 * CU_UPDATE_THRESHOLD use the old counter data and skip updating counter values
182 * for the CPU. This helps reduce cross-calls when kstat consumers read data
183 * very often or when they read PG utilization data and then CPU utilization
184 * data quickly after that.
185 */
186 #define CU_UPDATE_THRESHOLD (NANOSEC / 10)
187
188 /*
189 * The IS_HIPIL() macro verifies that the code is executed either from a
190 * cross-call or from high-PIL interrupt
191 */
192 #ifdef DEBUG
193 #define IS_HIPIL() (getpil() >= XCALL_PIL)
194 #else
195 #define IS_HIPIL()
196 #endif /* DEBUG */
197
198
199 typedef void (*cu_cpu_func_t)(uintptr_t, int *);
200
201
202 /*
203 * Flags to use for programming CPU hardware performance counters to measure
204 * capacity and utilization
205 */
206 int cu_cpc_flags = CU_CPC_FLAGS_DEFAULT;
207
208 /*
209 * Initial value used for programming hardware counters
210 */
211 uint64_t cu_cpc_preset_value = 0;
212
213 /*
214 * List of CPC event requests for capacity and utilization.
215 */
216 static kcpc_request_list_t *cu_cpc_reqs = NULL;
217
218 /*
219 * When a CPU is a member of PG with a sharing relationship that is supported
220 * by the capacity/utilization framework, a kstat is created for that CPU and
221 * sharing relationship.
222 *
223 * These kstats are updated one at a time, so we can have a single scratch
224 * space to fill the data.
225 *
226 * CPU counter kstats fields:
227 *
228 * cu_cpu_id CPU ID for this kstat
229 *
230 * cu_pg_id PG ID for this kstat
231 *
232 * cu_generation Generation value that increases whenever any CPU goes
233 * offline or online. Two kstat snapshots for the same
234 * CPU may only be compared if they have the same
235 * generation.
236 *
237 * cu_pg_id PG ID for the relationship described by this kstat
238 *
239 * cu_cpu_util Running value of CPU utilization for the sharing
240 * relationship
241 *
242 * cu_cpu_time_running Total time spent collecting CU data. The time may be
243 * less than wall time if CU counters were stopped for
244 * some time.
245 *
246 * cu_cpu_time_stopped Total time the CU counters were stopped.
247 *
248 * cu_cpu_rate Utilization rate, expressed in operations per second.
249 *
250 * cu_cpu_rate_max Maximum observed value of utilization rate.
251 *
252 * cu_cpu_relationship Name of sharing relationship for the PG in this kstat
253 */
254 struct cu_cpu_kstat {
255 kstat_named_t cu_cpu_id;
256 kstat_named_t cu_pg_id;
257 kstat_named_t cu_generation;
258 kstat_named_t cu_cpu_util;
259 kstat_named_t cu_cpu_time_running;
260 kstat_named_t cu_cpu_time_stopped;
261 kstat_named_t cu_cpu_rate;
262 kstat_named_t cu_cpu_rate_max;
263 kstat_named_t cu_cpu_relationship;
264 } cu_cpu_kstat = {
265 { "cpu_id", KSTAT_DATA_UINT32 },
266 { "pg_id", KSTAT_DATA_INT32 },
267 { "generation", KSTAT_DATA_UINT32 },
268 { "hw_util", KSTAT_DATA_UINT64 },
269 { "hw_util_time_running", KSTAT_DATA_UINT64 },
270 { "hw_util_time_stopped", KSTAT_DATA_UINT64 },
271 { "hw_util_rate", KSTAT_DATA_UINT64 },
272 { "hw_util_rate_max", KSTAT_DATA_UINT64 },
273 { "relationship", KSTAT_DATA_STRING },
274 };
275
276 /*
277 * Flags for controlling this module
278 */
279 uint_t cu_flags = CU_FLAG_ENABLE;
280
281 /*
282 * Error return value for cu_init() since it can't return anything to be called
283 * from mp_init_tbl[] (:-(
284 */
285 static int cu_init_error = 0;
286
287 hrtime_t cu_sample_interval_min = CU_SAMPLE_INTERVAL_MIN;
288
289 hrtime_t cu_update_threshold = CU_UPDATE_THRESHOLD;
290
291 static kmutex_t pg_cpu_kstat_lock;
292
293
294 /*
295 * Forward declaration of interface routines
296 */
297 void cu_disable(void);
298 void cu_enable(void);
299 void cu_init(void);
300 void cu_cpc_program(cpu_t *cp, int *err);
301 void cu_cpc_unprogram(cpu_t *cp, int *err);
302 int cu_cpu_update(struct cpu *cp, boolean_t move_to);
303 void cu_pg_update(pghw_t *pg);
304
305
306 /*
307 * Forward declaration of private routines
308 */
309 static int cu_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs);
310 static void cu_cpc_program_xcall(uintptr_t arg, int *err);
311 static int cu_cpc_req_add(char *event, kcpc_request_list_t *reqs,
312 int nreqs, cu_cntr_stats_t *stats, int kmem_flags, int *nevents);
313 static int cu_cpu_callback(cpu_setup_t what, int id, void *arg);
314 static void cu_cpu_disable(cpu_t *cp);
315 static void cu_cpu_enable(cpu_t *cp);
316 static int cu_cpu_init(cpu_t *cp, kcpc_request_list_t *reqs);
317 static int cu_cpu_fini(cpu_t *cp);
318 static void cu_cpu_kstat_create(pghw_t *pg, cu_cntr_info_t *cntr_info);
319 static int cu_cpu_kstat_update(kstat_t *ksp, int rw);
320 static int cu_cpu_run(cpu_t *cp, cu_cpu_func_t func, uintptr_t arg);
321 static int cu_cpu_update_stats(cu_cntr_stats_t *stats,
322 uint64_t cntr_value);
323 static void cu_cpu_info_detach_xcall(void);
324
325 /*
326 * Disable or enable Capacity Utilization counters on all CPUs.
327 */
328 void
cu_disable(void)329 cu_disable(void)
330 {
331 cpu_t *cp;
332
333 ASSERT(MUTEX_HELD(&cpu_lock));
334
335 cp = cpu_active;
336 do {
337 if (!(cp->cpu_flags & CPU_OFFLINE))
338 cu_cpu_disable(cp);
339 } while ((cp = cp->cpu_next_onln) != cpu_active);
340 }
341
342
343 void
cu_enable(void)344 cu_enable(void)
345 {
346 cpu_t *cp;
347
348 ASSERT(MUTEX_HELD(&cpu_lock));
349
350 cp = cpu_active;
351 do {
352 if (!(cp->cpu_flags & CPU_OFFLINE))
353 cu_cpu_enable(cp);
354 } while ((cp = cp->cpu_next_onln) != cpu_active);
355 }
356
357
358 /*
359 * Setup capacity and utilization support
360 */
361 void
cu_init(void)362 cu_init(void)
363 {
364 cpu_t *cp;
365
366 cu_init_error = 0;
367 if (!(cu_flags & CU_FLAG_ENABLE) || (cu_flags & CU_FLAG_ON)) {
368 cu_init_error = -1;
369 return;
370 }
371
372 if (kcpc_init() != 0) {
373 cu_init_error = -2;
374 return;
375 }
376
377 /*
378 * Can't measure hardware capacity and utilization without CPU
379 * hardware performance counters
380 */
381 if (cpc_ncounters <= 0) {
382 cu_init_error = -3;
383 return;
384 }
385
386 /*
387 * Setup CPC event request queue
388 */
389 cu_cpc_reqs = kcpc_reqs_init(cpc_ncounters, KM_SLEEP);
390
391 mutex_enter(&cpu_lock);
392
393 /*
394 * Mark flags to say that module is ready to be setup
395 */
396 cu_flags |= CU_FLAG_READY;
397
398 cp = cpu_active;
399 do {
400 /*
401 * Allocate and setup state needed to measure capacity and
402 * utilization
403 */
404 if (cu_cpu_init(cp, cu_cpc_reqs) != 0)
405 cu_init_error = -5;
406
407 /*
408 * Reset list of counter event requests so its space can be
409 * reused for a different set of requests for next CPU
410 */
411 (void) kcpc_reqs_reset(cu_cpc_reqs);
412
413 cp = cp->cpu_next_onln;
414 } while (cp != cpu_active);
415
416 /*
417 * Mark flags to say that module is on now and counters are ready to be
418 * programmed on all active CPUs
419 */
420 cu_flags |= CU_FLAG_ON;
421
422 /*
423 * Program counters on currently active CPUs
424 */
425 cp = cpu_active;
426 do {
427 if (cu_cpu_run(cp, cu_cpc_program_xcall,
428 (uintptr_t)B_FALSE) != 0)
429 cu_init_error = -6;
430
431 cp = cp->cpu_next_onln;
432 } while (cp != cpu_active);
433
434 /*
435 * Register callback for CPU state changes to enable and disable
436 * CPC counters as CPUs come on and offline
437 */
438 register_cpu_setup_func(cu_cpu_callback, NULL);
439
440 mutex_exit(&cpu_lock);
441 }
442
443
444 /*
445 * Return number of counter events needed to measure capacity and utilization
 * for specified CPU and, if a list to which CPC requests should be added is
 * given, fill it in with each counter event needed
448 *
449 * NOTE: Use KM_NOSLEEP for kmem_{,z}alloc() since cpu_lock is held and free
450 * everything that has been successfully allocated if any memory
451 * allocation fails
452 */
453 static int
cu_cpc_init(cpu_t * cp,kcpc_request_list_t * reqs,int nreqs)454 cu_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs)
455 {
456 group_t *cmt_pgs;
457 cu_cntr_info_t **cntr_info_array;
458 cpu_pg_t *cpu_pgs;
459 cu_cpu_info_t *cu_cpu_info;
460 pg_cmt_t *pg_cmt;
461 pghw_t *pg_hw;
462 cu_cntr_stats_t *stats;
463 int nevents;
464 pghw_type_t pg_hw_type;
465 group_iter_t iter;
466
467 ASSERT(MUTEX_HELD(&cpu_lock));
468
469 /*
470 * There has to be a target CPU for this
471 */
472 if (cp == NULL)
473 return (-1);
474
475 /*
476 * Return 0 when CPU doesn't belong to any group
477 */
478 cpu_pgs = cp->cpu_pg;
479 if (cpu_pgs == NULL || GROUP_SIZE(&cpu_pgs->cmt_pgs) < 1)
480 return (0);
481
482 cmt_pgs = &cpu_pgs->cmt_pgs;
483 cu_cpu_info = cp->cpu_cu_info;
484
485 /*
486 * Grab counter statistics and info
487 */
488 if (reqs == NULL) {
489 stats = NULL;
490 cntr_info_array = NULL;
491 } else {
492 if (cu_cpu_info == NULL || cu_cpu_info->cu_cntr_stats == NULL)
493 return (-2);
494
495 stats = cu_cpu_info->cu_cntr_stats;
496 cntr_info_array = cu_cpu_info->cu_cntr_info;
497 }
498
499 /*
500 * See whether platform (or processor) specific code knows which CPC
501 * events to request, etc. are needed to measure hardware capacity and
502 * utilization on this machine
503 */
504 nevents = cu_plat_cpc_init(cp, reqs, nreqs);
505 if (nevents >= 0)
506 return (nevents);
507
508 /*
509 * Let common code decide which CPC events to request, etc. to measure
510 * capacity and utilization since platform (or processor) specific does
511 * not know....
512 *
513 * Walk CPU's PG lineage and do following:
514 *
515 * - Setup CPC request, counter info, and stats needed for each counter
516 * event to measure capacity and and utilization for each of CPU's PG
517 * hardware sharing relationships
518 *
519 * - Create PG CPU kstats to export capacity and utilization for each PG
520 */
521 nevents = 0;
522 group_iter_init(&iter);
523 while ((pg_cmt = group_iterate(cmt_pgs, &iter)) != NULL) {
524 cu_cntr_info_t *cntr_info;
525 int nevents_save;
526 int nstats;
527
528 pg_hw = (pghw_t *)pg_cmt;
529 pg_hw_type = pg_hw->pghw_hw;
530 nevents_save = nevents;
531 nstats = 0;
532
533 switch (pg_hw_type) {
534 case PGHW_IPIPE:
535 if (cu_cpc_req_add("PAPI_tot_ins", reqs, nreqs, stats,
536 KM_NOSLEEP, &nevents) != 0)
537 continue;
538 nstats = 1;
539 break;
540
541 case PGHW_FPU:
542 if (cu_cpc_req_add("PAPI_fp_ins", reqs, nreqs, stats,
543 KM_NOSLEEP, &nevents) != 0)
544 continue;
545 nstats = 1;
546 break;
547
548 default:
549 /*
550 * Don't measure capacity and utilization for this kind
551 * of PG hardware relationship so skip to next PG in
552 * CPU's PG lineage
553 */
554 continue;
555 }
556
557 cntr_info = cntr_info_array[pg_hw_type];
558
559 /*
560 * Nothing to measure for this hardware sharing relationship
561 */
562 if (nevents - nevents_save == 0) {
563 if (cntr_info != NULL) {
564 kmem_free(cntr_info, sizeof (cu_cntr_info_t));
565 cntr_info_array[pg_hw_type] = NULL;
566 }
567 continue;
568 }
569
570 /*
571 * Fill in counter info for this PG hardware relationship
572 */
573 if (cntr_info == NULL) {
574 cntr_info = kmem_zalloc(sizeof (cu_cntr_info_t),
575 KM_NOSLEEP);
576 if (cntr_info == NULL)
577 continue;
578 cntr_info_array[pg_hw_type] = cntr_info;
579 }
580 cntr_info->ci_cpu = cp;
581 cntr_info->ci_pg = pg_hw;
582 cntr_info->ci_stats = &stats[nevents_save];
583 cntr_info->ci_nstats = nstats;
584
585 /*
586 * Create PG CPU kstats for this hardware relationship
587 */
588 cu_cpu_kstat_create(pg_hw, cntr_info);
589 }
590
591 return (nevents);
592 }
593
594
595 /*
596 * Program counters for capacity and utilization on given CPU
597 *
598 * If any of the following conditions is true, the counters are not programmed:
599 *
600 * - CU framework is disabled
601 * - The cpu_cu_info field of the cpu structure is NULL
602 * - DTrace is active
603 * - Counters are programmed already
604 * - Counters are disabled (by calls to cu_cpu_disable())
605 */
606 void
cu_cpc_program(cpu_t * cp,int * err)607 cu_cpc_program(cpu_t *cp, int *err)
608 {
609 cu_cpc_ctx_t *cpu_ctx;
610 kcpc_ctx_t *ctx;
611 cu_cpu_info_t *cu_cpu_info;
612
613 ASSERT(IS_HIPIL());
614 /*
615 * Should be running on given CPU. We disable preemption to keep CPU
616 * from disappearing and make sure flags and CPC context don't change
617 * from underneath us
618 */
619 kpreempt_disable();
620 ASSERT(cp == CPU);
621
622 /*
623 * Module not ready to program counters
624 */
625 if (!(cu_flags & CU_FLAG_ON)) {
626 *err = -1;
627 kpreempt_enable();
628 return;
629 }
630
631 if (cp == NULL) {
632 *err = -2;
633 kpreempt_enable();
634 return;
635 }
636
637 cu_cpu_info = cp->cpu_cu_info;
638 if (cu_cpu_info == NULL) {
639 *err = -3;
640 kpreempt_enable();
641 return;
642 }
643
644 /*
645 * If DTrace CPC is active or counters turned on already or are
646 * disabled, just return.
647 */
648 if (dtrace_cpc_in_use || (cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON) ||
649 cu_cpu_info->cu_disabled) {
650 *err = 1;
651 kpreempt_enable();
652 return;
653 }
654
655 if ((CPU->cpu_cpc_ctx != NULL) &&
656 !(CPU->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) {
657 *err = -4;
658 kpreempt_enable();
659 return;
660 }
661
662 /*
663 * Get CPU's CPC context needed for capacity and utilization
664 */
665 cpu_ctx = &cu_cpu_info->cu_cpc_ctx;
666 ASSERT(cpu_ctx != NULL);
667 ASSERT(cpu_ctx->nctx >= 0);
668
669 ASSERT(cpu_ctx->ctx_ptr_array == NULL || cpu_ctx->ctx_ptr_array_sz > 0);
670 ASSERT(cpu_ctx->nctx <= cpu_ctx->ctx_ptr_array_sz);
671 if (cpu_ctx->nctx <= 0 || cpu_ctx->ctx_ptr_array == NULL ||
672 cpu_ctx->ctx_ptr_array_sz <= 0) {
673 *err = -5;
674 kpreempt_enable();
675 return;
676 }
677
678 /*
679 * Increment index in CPU's CPC context info to point at next context
680 * to program
681 *
682 * NOTE: Do this now instead of after programming counters to ensure
683 * that index will always point at *current* context so we will
684 * always be able to unprogram *current* context if necessary
685 */
686 cpu_ctx->cur_index = (cpu_ctx->cur_index + 1) % cpu_ctx->nctx;
687
688 ctx = cpu_ctx->ctx_ptr_array[cpu_ctx->cur_index];
689
690 /*
691 * Clear KCPC_CTX_INVALID and KCPC_CTX_INVALID_STOPPED from CPU's CPC
692 * context before programming counters
693 *
694 * Context is marked with KCPC_CTX_INVALID_STOPPED when context is
695 * unprogrammed and may be marked with KCPC_CTX_INVALID when
696 * kcpc_invalidate_all() is called by cpustat(8) and dtrace CPC to
697 * invalidate all CPC contexts before they take over all the counters.
698 *
699 * This isn't necessary since these flags are only used for thread bound
700 * CPC contexts not CPU bound CPC contexts like ones used for capacity
701 * and utilization.
702 *
703 * There is no need to protect the flag update since no one is using
704 * this context now.
705 */
706 ctx->kc_flags &= ~(KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED);
707
708 /*
709 * Program counters on this CPU
710 */
711 kcpc_program(ctx, B_FALSE, B_FALSE);
712
713 cp->cpu_cpc_ctx = ctx;
714
715 /*
716 * Set state in CPU structure to say that CPU's counters are programmed
717 * for capacity and utilization now and that they are transitioning from
718 * off to on state. This will cause cu_cpu_update to update stop times
719 * for all programmed counters.
720 */
721 cu_cpu_info->cu_flag |= CU_CPU_CNTRS_ON | CU_CPU_CNTRS_OFF_ON;
722
723 /*
724 * Update counter statistics
725 */
726 (void) cu_cpu_update(cp, B_FALSE);
727
728 cu_cpu_info->cu_flag &= ~CU_CPU_CNTRS_OFF_ON;
729
730 *err = 0;
731 kpreempt_enable();
732 }
733
734
735 /*
736 * Cross call wrapper routine for cu_cpc_program()
737 *
738 * Checks to make sure that counters on CPU aren't being used by someone else
739 * before calling cu_cpc_program() since cu_cpc_program() needs to assert that
740 * nobody else is using the counters to catch and prevent any broken code.
741 * Also, this check needs to happen on the target CPU since the CPU's CPC
742 * context can only be changed while running on the CPU.
743 *
744 * If the first argument is TRUE, cu_cpc_program_xcall also checks that there is
745 * no valid thread bound cpc context. This is important to check to prevent
746 * re-programming thread counters with CU counters when CPU is coming on-line.
747 */
748 static void
cu_cpc_program_xcall(uintptr_t arg,int * err)749 cu_cpc_program_xcall(uintptr_t arg, int *err)
750 {
751 boolean_t avoid_thread_context = (boolean_t)arg;
752
753 kpreempt_disable();
754
755 if (CPU->cpu_cpc_ctx != NULL &&
756 !(CPU->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) {
757 *err = -100;
758 kpreempt_enable();
759 return;
760 }
761
762 if (avoid_thread_context && (curthread->t_cpc_ctx != NULL) &&
763 !(curthread->t_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) {
764 *err = -200;
765 kpreempt_enable();
766 return;
767 }
768
769 cu_cpc_program(CPU, err);
770 kpreempt_enable();
771 }
772
773
774 /*
775 * Unprogram counters for capacity and utilization on given CPU
776 * This function should be always executed on the target CPU at high PIL
777 */
778 void
cu_cpc_unprogram(cpu_t * cp,int * err)779 cu_cpc_unprogram(cpu_t *cp, int *err)
780 {
781 cu_cpc_ctx_t *cpu_ctx;
782 kcpc_ctx_t *ctx;
783 cu_cpu_info_t *cu_cpu_info;
784
785 ASSERT(IS_HIPIL());
786 /*
787 * Should be running on given CPU with preemption disabled to keep CPU
788 * from disappearing and make sure flags and CPC context don't change
789 * from underneath us
790 */
791 kpreempt_disable();
792 ASSERT(cp == CPU);
793
794 /*
795 * Module not on
796 */
797 if (!(cu_flags & CU_FLAG_ON)) {
798 *err = -1;
799 kpreempt_enable();
800 return;
801 }
802
803 cu_cpu_info = cp->cpu_cu_info;
804 if (cu_cpu_info == NULL) {
805 *err = -3;
806 kpreempt_enable();
807 return;
808 }
809
810 /*
811 * Counters turned off already
812 */
813 if (!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON)) {
814 *err = 1;
815 kpreempt_enable();
816 return;
817 }
818
819 /*
820 * Update counter statistics
821 */
822 (void) cu_cpu_update(cp, B_FALSE);
823
824 /*
825 * Get CPU's CPC context needed for capacity and utilization
826 */
827 cpu_ctx = &cu_cpu_info->cu_cpc_ctx;
828 if (cpu_ctx->nctx <= 0 || cpu_ctx->ctx_ptr_array == NULL ||
829 cpu_ctx->ctx_ptr_array_sz <= 0) {
830 *err = -5;
831 kpreempt_enable();
832 return;
833 }
834 ctx = cpu_ctx->ctx_ptr_array[cpu_ctx->cur_index];
835
836 /*
837 * CPU's CPC context should be current capacity and utilization CPC
838 * context
839 */
840 ASSERT(cp->cpu_cpc_ctx == ctx);
841 if (cp->cpu_cpc_ctx != ctx) {
842 *err = -6;
843 kpreempt_enable();
844 return;
845 }
846
847 /*
848 * Unprogram counters on CPU.
849 */
850 kcpc_unprogram(ctx, B_FALSE);
851
852 ASSERT(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED);
853
854 /*
855 * Unset state in CPU structure saying that CPU's counters are
856 * programmed
857 */
858 cp->cpu_cpc_ctx = NULL;
859 cu_cpu_info->cu_flag &= ~CU_CPU_CNTRS_ON;
860
861 *err = 0;
862 kpreempt_enable();
863 }
864
865
866 /*
867 * Add given counter event to list of CPC requests
868 */
869 static int
cu_cpc_req_add(char * event,kcpc_request_list_t * reqs,int nreqs,cu_cntr_stats_t * stats,int kmem_flags,int * nevents)870 cu_cpc_req_add(char *event, kcpc_request_list_t *reqs, int nreqs,
871 cu_cntr_stats_t *stats, int kmem_flags, int *nevents)
872 {
873 int n;
874 int retval;
875 uint_t flags;
876
877 /*
878 * Return error when no counter event specified, counter event not
879 * supported by CPC's PCBE, or number of events not given
880 */
881 if (event == NULL || kcpc_event_supported(event) == B_FALSE ||
882 nevents == NULL)
883 return (-1);
884
885 n = *nevents;
886
887 /*
888 * Only count number of counter events needed if list
889 * where to add CPC requests not given
890 */
891 if (reqs == NULL) {
892 n++;
893 *nevents = n;
894 return (-3);
895 }
896
897 /*
898 * Return error when stats not given or not enough room on list of CPC
899 * requests for more counter events
900 */
901 if (stats == NULL || (nreqs <= 0 && n >= nreqs))
902 return (-4);
903
904 /*
905 * Use flags in cu_cpc_flags to program counters and enable overflow
906 * interrupts/traps (unless PCBE can't handle overflow interrupts) so
907 * PCBE can catch counters before they wrap to hopefully give us an
908 * accurate (64-bit) virtualized counter
909 */
910 flags = cu_cpc_flags;
911 if ((kcpc_pcbe_capabilities() & CPC_CAP_OVERFLOW_INTERRUPT) == 0)
912 flags &= ~CPC_OVF_NOTIFY_EMT;
913
914 /*
915 * Add CPC request to list
916 */
917 retval = kcpc_reqs_add(reqs, event, cu_cpc_preset_value,
918 flags, 0, NULL, &stats[n], kmem_flags);
919
920 if (retval != 0)
921 return (-5);
922
923 n++;
924 *nevents = n;
925 return (0);
926 }
927
928 static void
cu_cpu_info_detach_xcall(void)929 cu_cpu_info_detach_xcall(void)
930 {
931 ASSERT(IS_HIPIL());
932
933 CPU->cpu_cu_info = NULL;
934 }
935
936
937 /*
938 * Enable or disable collection of capacity/utilization data for a current CPU.
939 * Counters are enabled if 'on' argument is True and disabled if it is False.
940 * This function should be always executed at high PIL
941 */
942 static void
cu_cpc_trigger(uintptr_t arg1,uintptr_t arg2)943 cu_cpc_trigger(uintptr_t arg1, uintptr_t arg2)
944 {
945 cpu_t *cp = (cpu_t *)arg1;
946 boolean_t on = (boolean_t)arg2;
947 int error;
948 cu_cpu_info_t *cu_cpu_info;
949
950 ASSERT(IS_HIPIL());
951 kpreempt_disable();
952 ASSERT(cp == CPU);
953
954 if (!(cu_flags & CU_FLAG_ON)) {
955 kpreempt_enable();
956 return;
957 }
958
959 cu_cpu_info = cp->cpu_cu_info;
960 if (cu_cpu_info == NULL) {
961 kpreempt_enable();
962 return;
963 }
964
965 ASSERT(!cu_cpu_info->cu_disabled ||
966 !(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON));
967
968 if (on) {
969 /*
970 * Decrement the cu_disabled counter.
971 * Once it drops to zero, call cu_cpc_program.
972 */
973 if (cu_cpu_info->cu_disabled > 0)
974 cu_cpu_info->cu_disabled--;
975 if (cu_cpu_info->cu_disabled == 0)
976 cu_cpc_program(CPU, &error);
977 } else if (cu_cpu_info->cu_disabled++ == 0) {
978 /*
979 * This is the first attempt to disable CU, so turn it off
980 */
981 cu_cpc_unprogram(cp, &error);
982 ASSERT(!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON));
983 }
984
985 kpreempt_enable();
986 }
987
988
/*
 * Callback for changes in CPU states
 * Used to enable or disable hardware performance counters on CPUs that are
 * turned on or off
 *
 * Returns -1 if the CU framework is not enabled, -2 if the CPU id cannot be
 * resolved, and otherwise the result of the per-state operation (0 for
 * states that are ignored).
 *
 * NOTE: cpc should be programmed/unprogrammed while running on the target CPU.
 * We have to use thread_affinity_set to hop to the right CPU because these
 * routines expect cpu_lock held, so we can't cross-call other CPUs while
 * holding CPU lock.
 */
static int
/* LINTED E_FUNC_ARG_UNUSED */
cu_cpu_callback(cpu_setup_t what, int id, void *arg)
{
	cpu_t	*cp;
	int	retval = 0;

	/* Caller (cpu_setup framework) must hold cpu_lock */
	ASSERT(MUTEX_HELD(&cpu_lock));

	if (!(cu_flags & CU_FLAG_ON))
		return (-1);

	cp = cpu_get(id);
	if (cp == NULL)
		return (-2);

	switch (what) {
	case CPU_ON:
		/*
		 * Setup counters on CPU being turned on
		 */
		retval = cu_cpu_init(cp, cu_cpc_reqs);

		/*
		 * Reset list of counter event requests so its space can be
		 * reused for a different set of requests for next CPU
		 */
		(void) kcpc_reqs_reset(cu_cpc_reqs);
		break;
	case CPU_INTR_ON:
		/*
		 * Program the already-initialized counters on the CPU now
		 * that it can take interrupts; must run on the target CPU,
		 * hence the cu_cpu_run() cross-call.
		 */
		retval = cu_cpu_run(cp, cu_cpc_program_xcall,
		    (uintptr_t)B_TRUE);
		break;
	case CPU_OFF:
		/*
		 * Disable counters on CPU being turned off. Counters will not
		 * be re-enabled on this CPU until it comes back online.
		 */
		cu_cpu_disable(cp);
		ASSERT(!CU_CPC_ON(cp));
		retval = cu_cpu_fini(cp);
		break;
	default:
		/* Other CPU state transitions are of no interest to CU */
		break;
	}
	return (retval);
}
1049
1050
/*
 * Disable or enable Capacity Utilization counters on a given CPU. This function
 * can be called from any CPU to disable counters on the given CPU.
 */
static void
cu_cpu_disable(cpu_t *cp)
{
	/*
	 * Hop to the target CPU via cpu_call() since CPC hardware must be
	 * (un)programmed on the CPU that owns it; B_FALSE asks
	 * cu_cpc_trigger() to turn the counters off.
	 */
	cpu_call(cp, cu_cpc_trigger, (uintptr_t)cp, (uintptr_t)B_FALSE);
}
1060
1061
/*
 * Enable Capacity Utilization counters on a given CPU. Counterpart of
 * cu_cpu_disable() above; may be called from any CPU.
 */
static void
cu_cpu_enable(cpu_t *cp)
{
	/*
	 * B_TRUE asks cu_cpc_trigger(), run on the target CPU, to turn the
	 * counters back on.
	 */
	cpu_call(cp, cu_cpc_trigger, (uintptr_t)cp, (uintptr_t)B_TRUE);
}
1067
1068
/*
 * Setup capacity and utilization support for given CPU
 *
 * Returns 0 on success, 1 if the CPU's CPC contexts were already set up, and
 * a negative value on failure; on failure, everything allocated so far
 * (including cp->cpu_cu_info) is torn down via cu_cpu_fini() before return.
 *
 * NOTE: Use KM_NOSLEEP for kmem_{,z}alloc() since cpu_lock is held and free
 * everything that has been successfully allocated including cpu_cu_info
 * if any memory allocation fails
 */
static int
cu_cpu_init(cpu_t *cp, kcpc_request_list_t *reqs)
{
	kcpc_ctx_t	**ctx_ptr_array;
	size_t		ctx_ptr_array_sz;
	cu_cpc_ctx_t	*cpu_ctx;
	cu_cpu_info_t	*cu_cpu_info;
	int		n;

	/*
	 * cpu_lock should be held and protect against CPU going away and races
	 * with cu_{init,fini,cpu_fini}()
	 */
	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * Return if not ready to setup counters yet
	 */
	if (!(cu_flags & CU_FLAG_READY))
		return (-1);

	if (cp->cpu_cu_info == NULL) {
		cp->cpu_cu_info = kmem_zalloc(sizeof (cu_cpu_info_t),
		    KM_NOSLEEP);
		if (cp->cpu_cu_info == NULL)
			return (-2);
	}

	/*
	 * Get capacity and utilization CPC context for CPU and check to see
	 * whether it has been setup already
	 */
	cu_cpu_info = cp->cpu_cu_info;
	cu_cpu_info->cu_cpu = cp;
	/*
	 * Start out disabled if the DTrace cpc provider currently owns the
	 * counters; cu_cpc_trigger() will program them once it lets go.
	 */
	cu_cpu_info->cu_disabled = dtrace_cpc_in_use ? 1 : 0;

	cpu_ctx = &cu_cpu_info->cu_cpc_ctx;
	if (cpu_ctx->nctx > 0 && cpu_ctx->ctx_ptr_array != NULL &&
	    cpu_ctx->ctx_ptr_array_sz > 0) {
		return (1);
	}

	/*
	 * Should have no contexts since it hasn't been setup already
	 */
	ASSERT(cpu_ctx->nctx == 0 && cpu_ctx->ctx_ptr_array == NULL &&
	    cpu_ctx->ctx_ptr_array_sz == 0);

	/*
	 * Determine how many CPC events needed to measure capacity and
	 * utilization for this CPU, allocate space for counter statistics for
	 * each event, and fill in list of CPC event requests with corresponding
	 * counter stats for each request to make attributing counter data
	 * easier later....
	 *
	 * First pass (reqs == NULL) just counts the events needed.
	 */
	n = cu_cpc_init(cp, NULL, 0);
	if (n <= 0) {
		(void) cu_cpu_fini(cp);
		return (-3);
	}

	cu_cpu_info->cu_cntr_stats = kmem_zalloc(n * sizeof (cu_cntr_stats_t),
	    KM_NOSLEEP);
	if (cu_cpu_info->cu_cntr_stats == NULL) {
		(void) cu_cpu_fini(cp);
		return (-4);
	}

	cu_cpu_info->cu_ncntr_stats = n;

	/*
	 * Second pass fills in the request list and wires each request to
	 * its counter stats.
	 */
	n = cu_cpc_init(cp, reqs, n);
	if (n <= 0) {
		(void) cu_cpu_fini(cp);
		return (-5);
	}

	/*
	 * Create CPC context with given requests
	 */
	ctx_ptr_array = NULL;
	ctx_ptr_array_sz = 0;
	n = kcpc_cpu_ctx_create(cp, reqs, KM_NOSLEEP, &ctx_ptr_array,
	    &ctx_ptr_array_sz);
	if (n <= 0) {
		(void) cu_cpu_fini(cp);
		return (-6);
	}

	/*
	 * Should have contexts
	 */
	ASSERT(n > 0 && ctx_ptr_array != NULL && ctx_ptr_array_sz > 0);
	if (ctx_ptr_array == NULL || ctx_ptr_array_sz <= 0) {
		(void) cu_cpu_fini(cp);
		return (-7);
	}

	/*
	 * Fill in CPC context info for CPU needed for capacity and utilization
	 */
	cpu_ctx->cur_index = 0;
	cpu_ctx->nctx = n;
	cpu_ctx->ctx_ptr_array = ctx_ptr_array;
	cpu_ctx->ctx_ptr_array_sz = ctx_ptr_array_sz;
	return (0);
}
1182
/*
 * Tear down capacity and utilization support for given CPU
 *
 * Returns -1 if the CU framework never got ready (so nothing was allocated),
 * 1 if the CPU has no CU state, and 0 after a successful teardown. Safe to
 * call on partially initialized state (used by cu_cpu_init() error paths).
 */
static int
cu_cpu_fini(cpu_t *cp)
{
	kcpc_ctx_t	*ctx;
	cu_cpc_ctx_t	*cpu_ctx;
	cu_cpu_info_t	*cu_cpu_info;
	int		i;
	pghw_type_t	pg_hw_type;

	/*
	 * cpu_lock should be held and protect against CPU going away and races
	 * with cu_{init,fini,cpu_init}()
	 */
	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * Have to at least be ready to setup counters to have allocated
	 * anything that needs to be deallocated now
	 */
	if (!(cu_flags & CU_FLAG_READY))
		return (-1);

	/*
	 * Nothing to do if CPU's capacity and utilization info doesn't exist
	 */
	cu_cpu_info = cp->cpu_cu_info;
	if (cu_cpu_info == NULL)
		return (1);

	/*
	 * Tear down any existing kstats and counter info for each hardware
	 * sharing relationship
	 */
	for (pg_hw_type = PGHW_START; pg_hw_type < PGHW_NUM_COMPONENTS;
	    pg_hw_type++) {
		cu_cntr_info_t	*cntr_info;

		cntr_info = cu_cpu_info->cu_cntr_info[pg_hw_type];
		if (cntr_info == NULL)
			continue;

		if (cntr_info->ci_kstat != NULL) {
			kstat_delete(cntr_info->ci_kstat);
			cntr_info->ci_kstat = NULL;
		}
		kmem_free(cntr_info, sizeof (cu_cntr_info_t));
	}

	/*
	 * Free counter statistics for CPU
	 */
	ASSERT(cu_cpu_info->cu_cntr_stats == NULL ||
	    cu_cpu_info->cu_ncntr_stats > 0);
	if (cu_cpu_info->cu_cntr_stats != NULL &&
	    cu_cpu_info->cu_ncntr_stats > 0) {
		kmem_free(cu_cpu_info->cu_cntr_stats,
		    cu_cpu_info->cu_ncntr_stats * sizeof (cu_cntr_stats_t));
		cu_cpu_info->cu_cntr_stats = NULL;
		cu_cpu_info->cu_ncntr_stats = 0;
	}

	/*
	 * Get capacity and utilization CPC contexts for given CPU and check to
	 * see whether they have been freed already
	 */
	cpu_ctx = &cu_cpu_info->cu_cpc_ctx;
	/*
	 * NOTE(review): cpu_ctx is the address of an embedded struct member
	 * and so can never be NULL; the first clause of this check is
	 * redundant but harmless.
	 */
	if (cpu_ctx != NULL && cpu_ctx->ctx_ptr_array != NULL &&
	    cpu_ctx->ctx_ptr_array_sz > 0) {
		/*
		 * Free CPC contexts for given CPU
		 */
		for (i = 0; i < cpu_ctx->nctx; i++) {
			ctx = cpu_ctx->ctx_ptr_array[i];
			if (ctx == NULL)
				continue;
			kcpc_free_cpu(ctx);
		}

		/*
		 * Free CPC context pointer array
		 */
		kmem_free(cpu_ctx->ctx_ptr_array, cpu_ctx->ctx_ptr_array_sz);

		/*
		 * Zero CPC info for CPU
		 */
		bzero(cpu_ctx, sizeof (cu_cpc_ctx_t));
	}

	/*
	 * Set cp->cpu_cu_info pointer to NULL. Go through cross-call to ensure
	 * that no one is going to access the cpu_cu_info which we are going to
	 * free.
	 */
	if (cpu_is_online(cp))
		cpu_call(cp, (cpu_call_func_t)cu_cpu_info_detach_xcall, 0, 0);
	else
		cp->cpu_cu_info = NULL;

	/*
	 * Free CPU's capacity and utilization info
	 */
	kmem_free(cu_cpu_info, sizeof (cu_cpu_info_t));

	return (0);
}
1292
1293 /*
1294 * Create capacity & utilization kstats for given PG CPU hardware sharing
1295 * relationship
1296 */
1297 static void
cu_cpu_kstat_create(pghw_t * pg,cu_cntr_info_t * cntr_info)1298 cu_cpu_kstat_create(pghw_t *pg, cu_cntr_info_t *cntr_info)
1299 {
1300 kstat_t *ks;
1301 char *sharing = pghw_type_string(pg->pghw_hw);
1302 char name[KSTAT_STRLEN + 1];
1303
1304 /*
1305 * Just return when no counter info or CPU
1306 */
1307 if (cntr_info == NULL || cntr_info->ci_cpu == NULL)
1308 return;
1309
1310 /*
1311 * Canonify PG name to conform to kstat name rules
1312 */
1313 (void) strncpy(name, pghw_type_string(pg->pghw_hw), KSTAT_STRLEN + 1);
1314 strident_canon(name, TASKQ_NAMELEN + 1);
1315
1316 if ((ks = kstat_create_zone("pg_hw_perf_cpu",
1317 cntr_info->ci_cpu->cpu_id,
1318 name, "processor_group", KSTAT_TYPE_NAMED,
1319 sizeof (cu_cpu_kstat) / sizeof (kstat_named_t),
1320 KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID)) == NULL)
1321 return;
1322
1323 ks->ks_lock = &pg_cpu_kstat_lock;
1324 ks->ks_data = &cu_cpu_kstat;
1325 ks->ks_update = cu_cpu_kstat_update;
1326 ks->ks_data_size += strlen(sharing) + 1;
1327
1328 ks->ks_private = cntr_info;
1329 cntr_info->ci_kstat = ks;
1330 kstat_install(cntr_info->ci_kstat);
1331 }
1332
1333
/*
 * Propagate values from CPU capacity & utilization stats to kstats
 *
 * kstat ks_update(9E) routine for the "pg_hw_perf_cpu" kstats created by
 * cu_cpu_kstat_create(). Returns EACCES on write attempts, 0 otherwise.
 */
static int
cu_cpu_kstat_update(kstat_t *ksp, int rw)
{
	cpu_t		*cp;
	cu_cntr_info_t	*cntr_info = ksp->ks_private;
	struct cu_cpu_kstat	*kstat = &cu_cpu_kstat;
	pghw_t		*pg;
	cu_cntr_stats_t	*stats;

	/* kstats are read-only */
	if (rw == KSTAT_WRITE)
		return (EACCES);

	cp = cntr_info->ci_cpu;
	pg = cntr_info->ci_pg;
	kstat->cu_cpu_id.value.ui32 = cp->cpu_id;
	kstat->cu_pg_id.value.i32 = ((pg_t *)pg)->pg_id;

	/*
	 * The caller should have priv_cpc_cpu privilege to get utilization
	 * data. Callers who do not have the privilege will see zeroes as the
	 * values.
	 */
	if (secpolicy_cpc_cpu(crgetcred()) != 0) {
		/* Identification fields are still reported truthfully */
		kstat->cu_generation.value.ui32 = cp->cpu_generation;
		kstat_named_setstr(&kstat->cu_cpu_relationship,
		    pghw_type_string(pg->pghw_hw));

		kstat->cu_cpu_util.value.ui64 = 0;
		kstat->cu_cpu_rate.value.ui64 = 0;
		kstat->cu_cpu_rate_max.value.ui64 = 0;
		kstat->cu_cpu_time_running.value.ui64 = 0;
		kstat->cu_cpu_time_stopped.value.ui64 = 0;

		return (0);
	}

	/*
	 * Keep the CPU (and its cpu_cu_info) stable while sampling
	 */
	kpreempt_disable();

	/*
	 * Update capacity and utilization statistics needed for CPU's PG (CPU)
	 * kstats
	 */

	(void) cu_cpu_update(cp, B_TRUE);

	stats = cntr_info->ci_stats;
	kstat->cu_generation.value.ui32 = cp->cpu_generation;
	kstat_named_setstr(&kstat->cu_cpu_relationship,
	    pghw_type_string(pg->pghw_hw));

	kstat->cu_cpu_util.value.ui64 = stats->cs_value_total;
	kstat->cu_cpu_rate.value.ui64 = stats->cs_rate;
	kstat->cu_cpu_rate_max.value.ui64 = stats->cs_rate_max;
	kstat->cu_cpu_time_running.value.ui64 = stats->cs_time_running;
	kstat->cu_cpu_time_stopped.value.ui64 = stats->cs_time_stopped;

	/*
	 * Counters are stopped now, so the cs_time_stopped was last
	 * updated at cs_time_start time. Add the time passed since then
	 * to the stopped time.
	 */
	if (!(cp->cpu_cu_info->cu_flag & CU_CPU_CNTRS_ON))
		kstat->cu_cpu_time_stopped.value.ui64 +=
		    gethrtime() - stats->cs_time_start;

	kpreempt_enable();

	return (0);
}
1406
1407 /*
1408 * Run specified function with specified argument on a given CPU and return
1409 * whatever the function returns
1410 */
1411 static int
cu_cpu_run(cpu_t * cp,cu_cpu_func_t func,uintptr_t arg)1412 cu_cpu_run(cpu_t *cp, cu_cpu_func_t func, uintptr_t arg)
1413 {
1414 int error = 0;
1415
1416 /*
1417 * cpu_call() will call func on the CPU specified with given argument
1418 * and return func's return value in last argument
1419 */
1420 cpu_call(cp, (cpu_call_func_t)(uintptr_t)func, arg, (uintptr_t)&error);
1421 return (error);
1422 }
1423
1424
/*
 * Update counter statistics on a given CPU.
 *
 * If move_to argument is True, execute the function on the CPU specified
 * Otherwise, assume that it is already running on the right CPU
 *
 * If move_to is specified, the caller should hold cpu_lock or have preemption
 * disabled. Otherwise it is up to the caller to guarantee that things do not
 * change in the process.
 *
 * Returns 0 when there is nothing to do (counters off, no CU state, or the
 * last sample is too recent) or after a successful cross-call update;
 * otherwise returns the (offset) error from kcpc_read().
 */
int
cu_cpu_update(struct cpu *cp, boolean_t move_to)
{
	int	retval;
	cu_cpu_info_t	*cu_cpu_info = cp->cpu_cu_info;
	hrtime_t	time_snap;

	ASSERT(!move_to || MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0);

	/*
	 * Nothing to do if counters are not programmed
	 */
	if (!(cu_flags & CU_FLAG_ON) ||
	    (cu_cpu_info == NULL) ||
	    !(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON))
		return (0);

	/*
	 * Don't update CPU statistics if it was updated recently
	 * and provide old results instead
	 */
	time_snap = gethrtime();
	if ((time_snap - cu_cpu_info->cu_sample_time) < cu_update_threshold) {
		DTRACE_PROBE1(cu__drop__sample, cpu_t *, cp);
		return (0);
	}

	cu_cpu_info->cu_sample_time = time_snap;

	/*
	 * CPC counter should be read on the CPU that is running the counter. We
	 * either have to move ourselves to the target CPU or ensure that we
	 * already run there.
	 *
	 * We use cross-call to the target CPU to execute kcpc_read() and
	 * cu_cpu_update_stats() there.
	 */
	retval = 0;
	if (move_to)
		(void) cu_cpu_run(cp, (cu_cpu_func_t)(uintptr_t)kcpc_read,
		    (uintptr_t)cu_cpu_update_stats);
	else {
		retval = kcpc_read((kcpc_update_func_t)cu_cpu_update_stats);
		/*
		 * Offset negative return value by -10 so we can distinguish it
		 * from error return values of this routine vs kcpc_read()
		 */
		if (retval < 0)
			retval -= 10;
	}

	return (retval);
}
1488
1489
/*
 * Update CPU counter statistics for current CPU.
 * This function may be called from a cross-call
 *
 * stats points at the counter statistics to fold cntr_value into; the
 * current CPU's cu_cpu_info supplies the counter-state flags. Returns 0 on
 * success, -1 when stats is NULL, and -2 when counters are not programmed.
 */
static int
cu_cpu_update_stats(cu_cntr_stats_t *stats, uint64_t cntr_value)
{
	cu_cpu_info_t	*cu_cpu_info = CPU->cpu_cu_info;
	uint_t		flags;
	uint64_t	delta;
	hrtime_t	time_delta;
	hrtime_t	time_snap;

	if (stats == NULL)
		return (-1);

	/*
	 * Nothing to do if counters are not programmed. This should not happen,
	 * but we check just in case.
	 */
	ASSERT(cu_flags & CU_FLAG_ON);
	ASSERT(cu_cpu_info != NULL);
	if (!(cu_flags & CU_FLAG_ON) ||
	    (cu_cpu_info == NULL))
		return (-2);

	flags = cu_cpu_info->cu_flag;
	ASSERT(flags & CU_CPU_CNTRS_ON);
	if (!(flags & CU_CPU_CNTRS_ON))
		return (-2);

	/*
	 * Take snapshot of high resolution timer
	 */
	time_snap = gethrtime();

	/*
	 * CU counters have just been programmed. We cannot assume that the new
	 * cntr_value continues from where we left off, so use the cntr_value as
	 * the new initial value.
	 */
	if (flags & CU_CPU_CNTRS_OFF_ON)
		stats->cs_value_start = cntr_value;

	/*
	 * Calculate delta in counter values between start of sampling period
	 * and now
	 */
	delta = cntr_value - stats->cs_value_start;

	/*
	 * Calculate time between start of sampling period and now
	 * (cs_time_start == 0 means this is the very first sample)
	 */
	time_delta = stats->cs_time_start ?
	    time_snap - stats->cs_time_start :
	    0;
	stats->cs_time_start = time_snap;
	stats->cs_value_start = cntr_value;

	if (time_delta > 0) { /* wrap shouldn't happen */
		/*
		 * Update either running or stopped time based on the transition
		 * state
		 */
		if (flags & CU_CPU_CNTRS_OFF_ON)
			stats->cs_time_stopped += time_delta;
		else
			stats->cs_time_running += time_delta;
	}

	/*
	 * Update rest of counter statistics if counter value didn't wrap
	 */
	if (delta > 0) {
		/*
		 * Update utilization rate if the interval between samples is
		 * sufficient.
		 */
		ASSERT(cu_sample_interval_min > CU_SCALE);
		if (time_delta > cu_sample_interval_min)
			stats->cs_rate = CU_RATE(delta, time_delta);
		if (stats->cs_rate_max < stats->cs_rate)
			stats->cs_rate_max = stats->cs_rate;

		stats->cs_value_last = delta;
		stats->cs_value_total += delta;
	}

	return (0);
}
1580
1581 /*
1582 * Update CMT PG utilization data.
1583 *
1584 * This routine computes the running total utilization and times for the
1585 * specified PG by adding up the total utilization and counter running and
1586 * stopped times of all CPUs in the PG and calculates the utilization rate and
1587 * maximum rate for all CPUs in the PG.
1588 */
1589 void
cu_pg_update(pghw_t * pg)1590 cu_pg_update(pghw_t *pg)
1591 {
1592 pg_cpu_itr_t cpu_iter;
1593 pghw_type_t pg_hwtype;
1594 cpu_t *cpu;
1595 pghw_util_t *hw_util = &pg->pghw_stats;
1596 uint64_t old_utilization = hw_util->pghw_util;
1597 hrtime_t now;
1598 hrtime_t time_delta;
1599 uint64_t utilization_delta;
1600
1601 ASSERT(MUTEX_HELD(&cpu_lock));
1602
1603 now = gethrtime();
1604
1605 pg_hwtype = pg->pghw_hw;
1606
1607 /*
1608 * Initialize running total utilization and times for PG to 0
1609 */
1610 hw_util->pghw_util = 0;
1611 hw_util->pghw_time_running = 0;
1612 hw_util->pghw_time_stopped = 0;
1613
1614 /*
1615 * Iterate over all CPUs in the PG and aggregate utilization, running
1616 * time and stopped time.
1617 */
1618 PG_CPU_ITR_INIT(pg, cpu_iter);
1619 while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
1620 cu_cpu_info_t *cu_cpu_info = cpu->cpu_cu_info;
1621 cu_cntr_info_t *cntr_info;
1622 cu_cntr_stats_t *stats;
1623
1624 if (cu_cpu_info == NULL)
1625 continue;
1626
1627 /*
1628 * Update utilization data for the CPU and then
1629 * aggregate per CPU running totals for PG
1630 */
1631 (void) cu_cpu_update(cpu, B_TRUE);
1632 cntr_info = cu_cpu_info->cu_cntr_info[pg_hwtype];
1633
1634 if (cntr_info == NULL || (stats = cntr_info->ci_stats) == NULL)
1635 continue;
1636
1637 hw_util->pghw_util += stats->cs_value_total;
1638 hw_util->pghw_time_running += stats->cs_time_running;
1639 hw_util->pghw_time_stopped += stats->cs_time_stopped;
1640
1641 /*
1642 * If counters are stopped now, the pg_time_stopped was last
1643 * updated at cs_time_start time. Add the time passed since then
1644 * to the stopped time.
1645 */
1646 if (!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON))
1647 hw_util->pghw_time_stopped +=
1648 now - stats->cs_time_start;
1649 }
1650
1651 /*
1652 * Compute per PG instruction rate and maximum rate
1653 */
1654 time_delta = now - hw_util->pghw_time_stamp;
1655 hw_util->pghw_time_stamp = now;
1656
1657 if (old_utilization == 0)
1658 return;
1659
1660 /*
1661 * Calculate change in utilization over sampling period and set this to
1662 * 0 if the delta would be 0 or negative which may happen if any CPUs go
1663 * offline during the sampling period
1664 */
1665 if (hw_util->pghw_util > old_utilization)
1666 utilization_delta = hw_util->pghw_util - old_utilization;
1667 else
1668 utilization_delta = 0;
1669
1670 /*
1671 * Update utilization rate if the interval between samples is
1672 * sufficient.
1673 */
1674 ASSERT(cu_sample_interval_min > CU_SCALE);
1675 if (time_delta > CU_SAMPLE_INTERVAL_MIN)
1676 hw_util->pghw_rate = CU_RATE(utilization_delta, time_delta);
1677
1678 /*
1679 * Update the maximum observed rate
1680 */
1681 if (hw_util->pghw_rate_max < hw_util->pghw_rate)
1682 hw_util->pghw_rate_max = hw_util->pghw_rate;
1683 }
1684