xref: /illumos-gate/usr/src/uts/common/os/cap_util.c (revision 99ea293e719ac006d413e4fde6ac0d5cd4dd6c59)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /*
27  * Support for determining capacity and utilization of performance relevant
28  * hardware components in a computer
29  *
30  * THEORY
31  * ------
32  * The capacity and utilization of the performance relevant hardware components
33  * is needed to be able to optimize performance while minimizing the amount of
34  * power used on a system.  The idea is to use hardware performance counters
35  * and potentially other means to determine the capacity and utilization of
36  * performance relevant hardware components (eg. execution pipeline, cache,
37  * memory, etc.) and attribute the utilization to the responsible CPU and the
38  * thread running there.
39  *
40  * This will help characterize the utilization of performance relevant
41  * components and how much is used by each CPU and each thread.  With
42  * that data, the utilization can be aggregated to all the CPUs sharing each
43  * performance relevant hardware component to calculate the total utilization
44  * of each component and compare that with the component's capacity to
45  * essentially determine the actual hardware load of the component.  The
46  * hardware utilization attributed to each running thread can also be
47  * aggregated to determine the total hardware utilization of each component to
48  * a workload.
49  *
50  * Once that is done, one can determine how much of each performance relevant
51  * hardware component is needed by a given thread or set of threads (eg. a
52  * workload) and size up exactly what hardware is needed by the threads and how
53  * much.  With this info, we can better place threads among CPUs to match their
54  * exact hardware resource needs and potentially lower or raise the power based
55  * on their utilization or pack threads onto the fewest hardware components
56  * needed and power off any remaining unused components to minimize power
57  * without sacrificing performance.
58  *
59  * IMPLEMENTATION
60  * --------------
61  * The code has been designed and implemented to make (un)programming and
62  * reading the counters for a given CPU as lightweight and fast as possible.
63  * This is very important because we need to read and potentially (un)program
64  * the counters very often and in performance sensitive code.  Specifically,
65  * the counters may need to be (un)programmed during context switch and/or a
66  * cyclic handler when there are more counter events to count than existing
67  * counters.
68  *
69  * Consequently, the code has been split up to allow allocating and
70  * initializing everything needed to program and read the counters on a given
71  * CPU once and make (un)programming and reading the counters for a given CPU
72  * not have to allocate/free memory or grab any locks.  To do this, all the
73  * state needed to (un)program and read the counters on a CPU is kept per CPU
74  * and is made lock free by forcing any code that reads or manipulates the
75  * counters or the state needed to (un)program or read the counters to run on
76  * the target CPU and disable preemption while running on the target CPU to
77  * protect any critical sections. All counter manipulation on the target CPU is
78  * happening either from a cross-call to the target CPU or at the same PIL as
79  * used by the cross-call subsystem. This guarantees that counter manipulation
80  * is not interrupted by cross-calls from other CPUs.
81  *
82  * The synchronization has been made lock free or as simple as possible for
83  * performance and to avoid getting the locking all tangled up when we interpose
84  * on the CPC routines that (un)program the counters to manage the counters
85  * between the kernel and user on each CPU.  When the user starts using the
86  * counters on a given CPU, the kernel will unprogram the counters that it is
87  * using on that CPU just before they are programmed for the user.  Then the
88  * kernel will program the counters on a given CPU for its own use when the user
89  * stops using them.
90  *
 * There is a special interaction with the DTrace cpc provider (dcpc). Before
 * dcpc enables any probe, it requests that all counters used for capacity and
 * utilization be disabled and unprogrammed. These counters are not
 * re-programmed until dcpc completes. When all DTrace cpc probes are removed,
 * dcpc notifies the CU framework, which re-programs the counters.
96  *
97  * When a CPU is going offline, its CU counters are unprogrammed and disabled,
98  * so that they would not be re-programmed again by some other activity on the
99  * CPU that is going offline.
100  *
 * The counters are programmed during boot.  However, a flag is available to
 * disable this if necessary (see cu_flags below).  A handler is provided to
103  * (un)program the counters during CPU on/offline.  Basic routines are provided
104  * to initialize and tear down this module, initialize and tear down any state
105  * needed for a given CPU, and (un)program the counters for a given CPU.
106  * Lastly, a handler is provided to read the counters and attribute the
107  * utilization to the responsible CPU.
108  */
109 #include <sys/types.h>
110 #include <sys/cmn_err.h>
111 #include <sys/cpuvar.h>
112 #include <sys/ddi.h>
113 #include <sys/systm.h>
114 #include <sys/disp.h>
115 #include <sys/sdt.h>
116 #include <sys/sunddi.h>
117 #include <sys/thread.h>
118 #include <sys/pghw.h>
119 #include <sys/cmt.h>
120 #include <sys/policy.h>
121 #include <sys/x_call.h>
122 #include <sys/cap_util.h>
123 
124 #include <sys/archsystm.h>
125 #include <sys/promif.h>
126 
127 #if defined(__x86)
128 #include <sys/xc_levels.h>
129 #endif
130 
131 
/*
 * Default CPU hardware performance counter flags to use for measuring capacity
 * and utilization
 */
#define	CU_CPC_FLAGS_DEFAULT	\
	(CPC_COUNT_USER | CPC_COUNT_SYSTEM | CPC_OVF_NOTIFY_EMT)

/*
 * Possible flags for controlling this module.  These are independent bits
 * that are combined in cu_flags.
 */
#define	CU_FLAG_ENABLE		0x1	/* Enable module */
#define	CU_FLAG_READY		0x2	/* Ready to setup module */
#define	CU_FLAG_ON		0x4	/* Module is on */
145 
146 /*
147  * pg_cpu kstats calculate utilization rate and maximum utilization rate for
148  * some CPUs. The rate is calculated based on data from two subsequent
149  * snapshots. When the time between such two snapshots is too small, the
150  * resulting rate may have low accuracy, so we only consider snapshots which
151  * are separated by SAMPLE_INTERVAL nanoseconds from one another. We do not
152  * update the rate if the interval is smaller than that.
153  *
154  * Use one tenth of a second as the minimum interval for utilization rate
155  * calculation.
156  *
157  * NOTE: The CU_SAMPLE_INTERVAL_MIN should be higher than the scaling factor in
158  * the CU_RATE() macro below to guarantee that we never divide by zero.
159  *
160  * Rate is the number of events per second. The rate is the number of events
161  * divided by time and multiplied by the number of nanoseconds in a second. We
162  * do not want time to be too small since it will cause large errors in
163  * division.
164  *
165  * We do not want to multiply two large numbers (the instruction count and
166  * NANOSEC) either since it may cause integer overflow. So we divide both the
167  * numerator and the denominator by the same value.
168  *
169  * NOTE: The scaling factor below should be less than CU_SAMPLE_INTERVAL_MIN
170  * above to guarantee that time divided by this value is always non-zero.
171  */
/*
 * Minimum time (in nanoseconds) between two snapshots for a rate to be
 * computed from them: one tenth of a second.
 */
#define	CU_SAMPLE_INTERVAL_MIN	(NANOSEC / 10)

/*
 * Scaling factor by which both NANOSEC and the elapsed time are divided in
 * CU_RATE().
 */
#define	CU_SCALE		(CU_SAMPLE_INTERVAL_MIN / 10000)

/*
 * Rate, in events per second, for 'val' events counted over 'time'
 * nanoseconds.
 */
#define	CU_RATE(val, time)	\
	(((val) * (NANOSEC / CU_SCALE)) / ((time) / CU_SCALE))

/*
 * When the time between two kstat reads for the same CPU is less than
 * CU_UPDATE_THRESHOLD use the old counter data and skip updating counter values
 * for the CPU. This helps reduce cross-calls when kstat consumers read data
 * very often or when they read PG utilization data and then CPU utilization
 * data quickly after that.
 */
#define	CU_UPDATE_THRESHOLD	(NANOSEC / 10)

/*
 * The IS_HIPIL() macro verifies that the code is executed either from a
 * cross-call or from high-PIL interrupt.  It expands to nothing in non-DEBUG
 * builds, so it is only meant to be used as the argument of ASSERT().
 */
#ifndef DEBUG
#define	IS_HIPIL()
#else
#define	IS_HIPIL()	(getpil() >= XCALL_PIL)
#endif	/* !DEBUG */
197 
198 
/*
 * Type of the function argument taken by cu_cpu_run().  The function receives
 * a uintptr_t argument and an int pointer; cu_cpc_program_xcall(), which has
 * this signature, stores its status code through that pointer.
 */
typedef void (*cu_cpu_func_t)(uintptr_t, int *);


/*
 * Flags to use for programming CPU hardware performance counters to measure
 * capacity and utilization.  Read by cu_cpc_req_add() when it builds a CPC
 * request.
 */
int				cu_cpc_flags = CU_CPC_FLAGS_DEFAULT;

/*
 * Initial value used for programming hardware counters (passed as the preset
 * of every CPC request added by cu_cpc_req_add()).
 */
uint64_t			cu_cpc_preset_value = 0;

/*
 * List of CPC event requests for capacity and utilization.  Allocated once in
 * cu_init() and reset after each CPU has been set up so that it can be reused
 * for the next one.
 */
static kcpc_request_list_t	*cu_cpc_reqs = NULL;
217 
/*
 * When a CPU is a member of PG with a sharing relationship that is supported
 * by the capacity/utilization framework, a kstat is created for that CPU and
 * sharing relationship.
 *
 * These kstats are updated one at a time, so we can have a single scratch
 * space to fill the data.
 *
 * CPU counter kstats fields:
 *
 *   cu_cpu_id		CPU ID for this kstat
 *
 *   cu_pg_id		PG ID for the relationship described by this kstat
 *
 *   cu_generation	Generation value that increases whenever any CPU goes
 *			  offline or online. Two kstat snapshots for the same
 *			  CPU may only be compared if they have the same
 *			  generation.
 *
 *   cu_cpu_util	Running value of CPU utilization for the sharing
 *			  relationship
 *
 *   cu_cpu_time_running Total time spent collecting CU data. The time may be
 *			   less than wall time if CU counters were stopped for
 *			   some time.
 *
 *   cu_cpu_time_stopped Total time the CU counters were stopped.
 *
 *   cu_cpu_rate	Utilization rate, expressed in operations per second.
 *
 *   cu_cpu_rate_max	Maximum observed value of utilization rate.
 *
 *   cu_cpu_relationship Name of sharing relationship for the PG in this kstat
 *
 * The initializer below gives each field its exported kstat name and data
 * type, in the same order as the structure members.
 */
struct cu_cpu_kstat {
	kstat_named_t	cu_cpu_id;
	kstat_named_t	cu_pg_id;
	kstat_named_t	cu_generation;
	kstat_named_t	cu_cpu_util;
	kstat_named_t	cu_cpu_time_running;
	kstat_named_t	cu_cpu_time_stopped;
	kstat_named_t	cu_cpu_rate;
	kstat_named_t	cu_cpu_rate_max;
	kstat_named_t	cu_cpu_relationship;
} cu_cpu_kstat = {
	{ "cpu_id",			KSTAT_DATA_UINT32 },
	{ "pg_id",			KSTAT_DATA_INT32 },
	{ "generation",			KSTAT_DATA_UINT32 },
	{ "hw_util",			KSTAT_DATA_UINT64 },
	{ "hw_util_time_running",	KSTAT_DATA_UINT64 },
	{ "hw_util_time_stopped",	KSTAT_DATA_UINT64 },
	{ "hw_util_rate",		KSTAT_DATA_UINT64 },
	{ "hw_util_rate_max",		KSTAT_DATA_UINT64 },
	{ "relationship",		KSTAT_DATA_STRING },
};
275 
/*
 * Flags for controlling this module (a combination of the CU_FLAG_* bits).
 * The module starts out enabled; cu_init() adds CU_FLAG_READY and CU_FLAG_ON
 * as setup progresses.
 */
uint_t				cu_flags = CU_FLAG_ENABLE;

/*
 * Error return value for cu_init() since it can't return anything to be called
 * from mp_init_tbl[] (:-(
 */
static int			cu_init_error = 0;

/*
 * Variable holding the minimum sampling interval; initialized from
 * CU_SAMPLE_INTERVAL_MIN.
 */
hrtime_t			cu_sample_interval_min = CU_SAMPLE_INTERVAL_MIN;

/*
 * Variable holding the kstat update threshold; initialized from
 * CU_UPDATE_THRESHOLD.
 */
hrtime_t			cu_update_threshold = CU_UPDATE_THRESHOLD;

/*
 * NOTE(review): presumably protects the shared cu_cpu_kstat scratch space
 * while a PG CPU kstat is being updated; its users are outside this part of
 * the file -- confirm.
 */
static kmutex_t			pg_cpu_kstat_lock;
292 
293 
/*
 * Forward declaration of interface routines (non-static entry points of this
 * file)
 */
void		cu_disable(void);
void		cu_enable(void);
void		cu_init(void);
void		cu_cpc_program(cpu_t *cp, int *err);
void		cu_cpc_unprogram(cpu_t *cp, int *err);
int		cu_cpu_update(struct cpu *cp, boolean_t move_to);
void		cu_pg_update(pghw_t *pg);


/*
 * Forward declaration of private routines (file-local helpers)
 */
static int	cu_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs);
static void	cu_cpc_program_xcall(uintptr_t arg, int *err);
static int	cu_cpc_req_add(char *event, kcpc_request_list_t *reqs,
    int nreqs, cu_cntr_stats_t *stats, int kmem_flags, int *nevents);
static int	cu_cpu_callback(cpu_setup_t what, int id, void *arg);
static void	cu_cpu_disable(cpu_t *cp);
static void	cu_cpu_enable(cpu_t *cp);
static int	cu_cpu_init(cpu_t *cp, kcpc_request_list_t *reqs);
static int	cu_cpu_fini(cpu_t *cp);
static void	cu_cpu_kstat_create(pghw_t *pg, cu_cntr_info_t *cntr_info);
static int	cu_cpu_kstat_update(kstat_t *ksp, int rw);
static int	cu_cpu_run(cpu_t *cp, cu_cpu_func_t func, uintptr_t arg);
static int	cu_cpu_update_stats(cu_cntr_stats_t *stats,
    uint64_t cntr_value);
static void cu_cpu_info_detach_xcall(void);
324 
325 /*
326  * Disable or enable Capacity Utilization counters on all CPUs.
327  */
328 void
329 cu_disable(void)
330 {
331 	cpu_t *cp;
332 
333 	ASSERT(MUTEX_HELD(&cpu_lock));
334 
335 	cp = cpu_active;
336 	do {
337 		if (!(cp->cpu_flags & CPU_OFFLINE))
338 			cu_cpu_disable(cp);
339 	} while ((cp = cp->cpu_next_onln) != cpu_active);
340 }
341 
342 
343 void
344 cu_enable(void)
345 {
346 	cpu_t *cp;
347 
348 	ASSERT(MUTEX_HELD(&cpu_lock));
349 
350 	cp = cpu_active;
351 	do {
352 		if (!(cp->cpu_flags & CPU_OFFLINE))
353 			cu_cpu_enable(cp);
354 	} while ((cp = cp->cpu_next_onln) != cpu_active);
355 }
356 
357 
358 /*
359  * Setup capacity and utilization support
360  */
361 void
362 cu_init(void)
363 {
364 	cpu_t	*cp;
365 
366 	cu_init_error = 0;
367 	if (!(cu_flags & CU_FLAG_ENABLE) || (cu_flags & CU_FLAG_ON)) {
368 		cu_init_error = -1;
369 		return;
370 	}
371 
372 	if (kcpc_init() != 0) {
373 		cu_init_error = -2;
374 		return;
375 	}
376 
377 	/*
378 	 * Can't measure hardware capacity and utilization without CPU
379 	 * hardware performance counters
380 	 */
381 	if (cpc_ncounters <= 0) {
382 		cu_init_error = -3;
383 		return;
384 	}
385 
386 	/*
387 	 * Setup CPC event request queue
388 	 */
389 	cu_cpc_reqs = kcpc_reqs_init(cpc_ncounters, KM_SLEEP);
390 
391 	mutex_enter(&cpu_lock);
392 
393 	/*
394 	 * Mark flags to say that module is ready to be setup
395 	 */
396 	cu_flags |= CU_FLAG_READY;
397 
398 	cp = cpu_active;
399 	do {
400 		/*
401 		 * Allocate and setup state needed to measure capacity and
402 		 * utilization
403 		 */
404 		if (cu_cpu_init(cp, cu_cpc_reqs) != 0)
405 			cu_init_error = -5;
406 
407 		/*
408 		 * Reset list of counter event requests so its space can be
409 		 * reused for a different set of requests for next CPU
410 		 */
411 		(void) kcpc_reqs_reset(cu_cpc_reqs);
412 
413 		cp = cp->cpu_next_onln;
414 	} while (cp != cpu_active);
415 
416 	/*
417 	 * Mark flags to say that module is on now and counters are ready to be
418 	 * programmed on all active CPUs
419 	 */
420 	cu_flags |= CU_FLAG_ON;
421 
422 	/*
423 	 * Program counters on currently active CPUs
424 	 */
425 	cp = cpu_active;
426 	do {
427 		if (cu_cpu_run(cp, cu_cpc_program_xcall,
428 		    (uintptr_t)B_FALSE) != 0)
429 			cu_init_error = -6;
430 
431 		cp = cp->cpu_next_onln;
432 	} while (cp != cpu_active);
433 
434 	/*
435 	 * Register callback for CPU state changes to enable and disable
436 	 * CPC counters as CPUs come on and offline
437 	 */
438 	register_cpu_setup_func(cu_cpu_callback, NULL);
439 
440 	mutex_exit(&cpu_lock);
441 }
442 
443 
444 /*
445  * Return number of counter events needed to measure capacity and utilization
446  * for specified CPU and fill in list of CPC requests with each counter event
447  * needed if list where to add CPC requests is given
448  *
449  * NOTE: Use KM_NOSLEEP for kmem_{,z}alloc() since cpu_lock is held and free
450  *	 everything that has been successfully allocated if any memory
451  *	 allocation fails
452  */
453 static int
454 cu_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs)
455 {
456 	group_t		*cmt_pgs;
457 	cu_cntr_info_t	**cntr_info_array;
458 	cpu_pg_t	*cpu_pgs;
459 	cu_cpu_info_t	*cu_cpu_info;
460 	pg_cmt_t	*pg_cmt;
461 	pghw_t		*pg_hw;
462 	cu_cntr_stats_t	*stats;
463 	int		nevents;
464 	pghw_type_t	pg_hw_type;
465 	group_iter_t	iter;
466 
467 	ASSERT(MUTEX_HELD(&cpu_lock));
468 
469 	/*
470 	 * There has to be a target CPU for this
471 	 */
472 	if (cp == NULL)
473 		return (-1);
474 
475 	/*
476 	 * Return 0 when CPU doesn't belong to any group
477 	 */
478 	cpu_pgs = cp->cpu_pg;
479 	if (cpu_pgs == NULL || GROUP_SIZE(&cpu_pgs->cmt_pgs) < 1)
480 		return (0);
481 
482 	cmt_pgs = &cpu_pgs->cmt_pgs;
483 	cu_cpu_info = cp->cpu_cu_info;
484 
485 	/*
486 	 * Grab counter statistics and info
487 	 */
488 	if (reqs == NULL) {
489 		stats = NULL;
490 		cntr_info_array = NULL;
491 	} else {
492 		if (cu_cpu_info == NULL || cu_cpu_info->cu_cntr_stats == NULL)
493 			return (-2);
494 
495 		stats = cu_cpu_info->cu_cntr_stats;
496 		cntr_info_array = cu_cpu_info->cu_cntr_info;
497 	}
498 
499 	/*
500 	 * See whether platform (or processor) specific code knows which CPC
501 	 * events to request, etc. are needed to measure hardware capacity and
502 	 * utilization on this machine
503 	 */
504 	nevents = cu_plat_cpc_init(cp, reqs, nreqs);
505 	if (nevents >= 0)
506 		return (nevents);
507 
508 	/*
509 	 * Let common code decide which CPC events to request, etc. to measure
510 	 * capacity and utilization since platform (or processor) specific does
511 	 * not know....
512 	 *
513 	 * Walk CPU's PG lineage and do following:
514 	 *
515 	 * - Setup CPC request, counter info, and stats needed for each counter
516 	 *   event to measure capacity and and utilization for each of CPU's PG
517 	 *   hardware sharing relationships
518 	 *
519 	 * - Create PG CPU kstats to export capacity and utilization for each PG
520 	 */
521 	nevents = 0;
522 	group_iter_init(&iter);
523 	while ((pg_cmt = group_iterate(cmt_pgs, &iter)) != NULL) {
524 		cu_cntr_info_t	*cntr_info;
525 		int		nevents_save;
526 		int		nstats;
527 
528 		pg_hw = (pghw_t *)pg_cmt;
529 		pg_hw_type = pg_hw->pghw_hw;
530 		nevents_save = nevents;
531 		nstats = 0;
532 
533 		switch (pg_hw_type) {
534 		case PGHW_IPIPE:
535 			if (cu_cpc_req_add("PAPI_tot_ins", reqs, nreqs, stats,
536 			    KM_NOSLEEP, &nevents) != 0)
537 				continue;
538 			nstats = 1;
539 			break;
540 
541 		case PGHW_FPU:
542 			if (cu_cpc_req_add("PAPI_fp_ins", reqs, nreqs, stats,
543 			    KM_NOSLEEP, &nevents) != 0)
544 				continue;
545 			nstats = 1;
546 			break;
547 
548 		default:
549 			/*
550 			 * Don't measure capacity and utilization for this kind
551 			 * of PG hardware relationship so skip to next PG in
552 			 * CPU's PG lineage
553 			 */
554 			continue;
555 		}
556 
557 		cntr_info = cntr_info_array[pg_hw_type];
558 
559 		/*
560 		 * Nothing to measure for this hardware sharing relationship
561 		 */
562 		if (nevents - nevents_save == 0) {
563 			if (cntr_info != NULL) {
564 				kmem_free(cntr_info, sizeof (cu_cntr_info_t));
565 				cntr_info_array[pg_hw_type] = NULL;
566 			}
567 			continue;
568 		}
569 
570 		/*
571 		 * Fill in counter info for this PG hardware relationship
572 		 */
573 		if (cntr_info == NULL) {
574 			cntr_info = kmem_zalloc(sizeof (cu_cntr_info_t),
575 			    KM_NOSLEEP);
576 			if (cntr_info == NULL)
577 				continue;
578 			cntr_info_array[pg_hw_type] = cntr_info;
579 		}
580 		cntr_info->ci_cpu = cp;
581 		cntr_info->ci_pg = pg_hw;
582 		cntr_info->ci_stats = &stats[nevents_save];
583 		cntr_info->ci_nstats = nstats;
584 
585 		/*
586 		 * Create PG CPU kstats for this hardware relationship
587 		 */
588 		cu_cpu_kstat_create(pg_hw, cntr_info);
589 	}
590 
591 	return (nevents);
592 }
593 
594 
595 /*
596  * Program counters for capacity and utilization on given CPU
597  *
598  * If any of the following conditions is true, the counters are not programmed:
599  *
600  * - CU framework is disabled
601  * - The cpu_cu_info field of the cpu structure is NULL
602  * - DTrace is active
603  * - Counters are programmed already
604  * - Counters are disabled (by calls to cu_cpu_disable())
605  */
606 void
607 cu_cpc_program(cpu_t *cp, int *err)
608 {
609 	cu_cpc_ctx_t	*cpu_ctx;
610 	kcpc_ctx_t	*ctx;
611 	cu_cpu_info_t	*cu_cpu_info;
612 
613 	ASSERT(IS_HIPIL());
614 	/*
615 	 * Should be running on given CPU. We disable preemption to keep CPU
616 	 * from disappearing and make sure flags and CPC context don't change
617 	 * from underneath us
618 	 */
619 	kpreempt_disable();
620 	ASSERT(cp == CPU);
621 
622 	/*
623 	 * Module not ready to program counters
624 	 */
625 	if (!(cu_flags & CU_FLAG_ON)) {
626 		*err = -1;
627 		kpreempt_enable();
628 		return;
629 	}
630 
631 	if (cp == NULL) {
632 		*err = -2;
633 		kpreempt_enable();
634 		return;
635 	}
636 
637 	cu_cpu_info = cp->cpu_cu_info;
638 	if (cu_cpu_info == NULL) {
639 		*err = -3;
640 		kpreempt_enable();
641 		return;
642 	}
643 
644 	/*
645 	 * If DTrace CPC is active or counters turned on already or are
646 	 * disabled, just return.
647 	 */
648 	if (dtrace_cpc_in_use || (cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON) ||
649 	    cu_cpu_info->cu_disabled) {
650 		*err = 1;
651 		kpreempt_enable();
652 		return;
653 	}
654 
655 	if ((CPU->cpu_cpc_ctx != NULL) &&
656 	    !(CPU->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) {
657 		*err = -4;
658 		kpreempt_enable();
659 		return;
660 	}
661 
662 	/*
663 	 * Get CPU's CPC context needed for capacity and utilization
664 	 */
665 	cpu_ctx = &cu_cpu_info->cu_cpc_ctx;
666 	ASSERT(cpu_ctx != NULL);
667 	ASSERT(cpu_ctx->nctx >= 0);
668 
669 	ASSERT(cpu_ctx->ctx_ptr_array == NULL || cpu_ctx->ctx_ptr_array_sz > 0);
670 	ASSERT(cpu_ctx->nctx <= cpu_ctx->ctx_ptr_array_sz);
671 	if (cpu_ctx->nctx <= 0 || cpu_ctx->ctx_ptr_array == NULL ||
672 	    cpu_ctx->ctx_ptr_array_sz <= 0) {
673 		*err = -5;
674 		kpreempt_enable();
675 		return;
676 	}
677 
678 	/*
679 	 * Increment index in CPU's CPC context info to point at next context
680 	 * to program
681 	 *
682 	 * NOTE: Do this now instead of after programming counters to ensure
683 	 *	 that index will always point at *current* context so we will
684 	 *	 always be able to unprogram *current* context if necessary
685 	 */
686 	cpu_ctx->cur_index = (cpu_ctx->cur_index + 1) % cpu_ctx->nctx;
687 
688 	ctx = cpu_ctx->ctx_ptr_array[cpu_ctx->cur_index];
689 
690 	/*
691 	 * Clear KCPC_CTX_INVALID and KCPC_CTX_INVALID_STOPPED from CPU's CPC
692 	 * context before programming counters
693 	 *
694 	 * Context is marked with KCPC_CTX_INVALID_STOPPED when context is
695 	 * unprogrammed and may be marked with KCPC_CTX_INVALID when
696 	 * kcpc_invalidate_all() is called by cpustat(1M) and dtrace CPC to
697 	 * invalidate all CPC contexts before they take over all the counters.
698 	 *
699 	 * This isn't necessary since these flags are only used for thread bound
700 	 * CPC contexts not CPU bound CPC contexts like ones used for capacity
701 	 * and utilization.
702 	 *
703 	 * There is no need to protect the flag update since no one is using
704 	 * this context now.
705 	 */
706 	ctx->kc_flags &= ~(KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED);
707 
708 	/*
709 	 * Program counters on this CPU
710 	 */
711 	kcpc_program(ctx, B_FALSE, B_FALSE);
712 
713 	cp->cpu_cpc_ctx = ctx;
714 
715 	/*
716 	 * Set state in CPU structure to say that CPU's counters are programmed
717 	 * for capacity and utilization now and that they are transitioning from
718 	 * off to on state. This will cause cu_cpu_update to update stop times
719 	 * for all programmed counters.
720 	 */
721 	cu_cpu_info->cu_flag |= CU_CPU_CNTRS_ON | CU_CPU_CNTRS_OFF_ON;
722 
723 	/*
724 	 * Update counter statistics
725 	 */
726 	(void) cu_cpu_update(cp, B_FALSE);
727 
728 	cu_cpu_info->cu_flag &= ~CU_CPU_CNTRS_OFF_ON;
729 
730 	*err = 0;
731 	kpreempt_enable();
732 }
733 
734 
735 /*
736  * Cross call wrapper routine for cu_cpc_program()
737  *
738  * Checks to make sure that counters on CPU aren't being used by someone else
739  * before calling cu_cpc_program() since cu_cpc_program() needs to assert that
740  * nobody else is using the counters to catch and prevent any broken code.
741  * Also, this check needs to happen on the target CPU since the CPU's CPC
742  * context can only be changed while running on the CPU.
743  *
744  * If the first argument is TRUE, cu_cpc_program_xcall also checks that there is
745  * no valid thread bound cpc context. This is important to check to prevent
746  * re-programming thread counters with CU counters when CPU is coming on-line.
747  */
748 static void
749 cu_cpc_program_xcall(uintptr_t arg, int *err)
750 {
751 	boolean_t	avoid_thread_context = (boolean_t)arg;
752 
753 	kpreempt_disable();
754 
755 	if (CPU->cpu_cpc_ctx != NULL &&
756 	    !(CPU->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) {
757 		*err = -100;
758 		kpreempt_enable();
759 		return;
760 	}
761 
762 	if (avoid_thread_context && (curthread->t_cpc_ctx != NULL) &&
763 	    !(curthread->t_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) {
764 		*err = -200;
765 		kpreempt_enable();
766 		return;
767 	}
768 
769 	cu_cpc_program(CPU, err);
770 	kpreempt_enable();
771 }
772 
773 
774 /*
775  * Unprogram counters for capacity and utilization on given CPU
776  * This function should be always executed on the target CPU at high PIL
777  */
778 void
779 cu_cpc_unprogram(cpu_t *cp, int *err)
780 {
781 	cu_cpc_ctx_t	*cpu_ctx;
782 	kcpc_ctx_t	*ctx;
783 	cu_cpu_info_t	*cu_cpu_info;
784 
785 	ASSERT(IS_HIPIL());
786 	/*
787 	 * Should be running on given CPU with preemption disabled to keep CPU
788 	 * from disappearing and make sure flags and CPC context don't change
789 	 * from underneath us
790 	 */
791 	kpreempt_disable();
792 	ASSERT(cp == CPU);
793 
794 	/*
795 	 * Module not on
796 	 */
797 	if (!(cu_flags & CU_FLAG_ON)) {
798 		*err = -1;
799 		kpreempt_enable();
800 		return;
801 	}
802 
803 	cu_cpu_info = cp->cpu_cu_info;
804 	if (cu_cpu_info == NULL) {
805 		*err = -3;
806 		kpreempt_enable();
807 		return;
808 	}
809 
810 	/*
811 	 * Counters turned off already
812 	 */
813 	if (!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON)) {
814 		*err = 1;
815 		kpreempt_enable();
816 		return;
817 	}
818 
819 	/*
820 	 * Update counter statistics
821 	 */
822 	(void) cu_cpu_update(cp, B_FALSE);
823 
824 	/*
825 	 * Get CPU's CPC context needed for capacity and utilization
826 	 */
827 	cpu_ctx = &cu_cpu_info->cu_cpc_ctx;
828 	if (cpu_ctx->nctx <= 0 || cpu_ctx->ctx_ptr_array == NULL ||
829 	    cpu_ctx->ctx_ptr_array_sz <= 0) {
830 		*err = -5;
831 		kpreempt_enable();
832 		return;
833 	}
834 	ctx = cpu_ctx->ctx_ptr_array[cpu_ctx->cur_index];
835 
836 	/*
837 	 * CPU's CPC context should be current capacity and utilization CPC
838 	 * context
839 	 */
840 	ASSERT(cp->cpu_cpc_ctx == ctx);
841 	if (cp->cpu_cpc_ctx != ctx) {
842 		*err = -6;
843 		kpreempt_enable();
844 		return;
845 	}
846 
847 	/*
848 	 * Unprogram counters on CPU.
849 	 */
850 	kcpc_unprogram(ctx, B_FALSE);
851 
852 	ASSERT(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED);
853 
854 	/*
855 	 * Unset state in CPU structure saying that CPU's counters are
856 	 * programmed
857 	 */
858 	cp->cpu_cpc_ctx = NULL;
859 	cu_cpu_info->cu_flag &= ~CU_CPU_CNTRS_ON;
860 
861 	*err = 0;
862 	kpreempt_enable();
863 }
864 
865 
866 /*
867  * Add given counter event to list of CPC requests
868  */
869 static int
870 cu_cpc_req_add(char *event, kcpc_request_list_t *reqs, int nreqs,
871     cu_cntr_stats_t *stats, int kmem_flags, int *nevents)
872 {
873 	int	n;
874 	int	retval;
875 	uint_t  flags;
876 
877 	/*
878 	 * Return error when no counter event specified, counter event not
879 	 * supported by CPC's PCBE, or number of events not given
880 	 */
881 	if (event == NULL || kcpc_event_supported(event) == B_FALSE ||
882 	    nevents == NULL)
883 		return (-1);
884 
885 	n = *nevents;
886 
887 	/*
888 	 * Only count number of counter events needed if list
889 	 * where to add CPC requests not given
890 	 */
891 	if (reqs == NULL) {
892 		n++;
893 		*nevents = n;
894 		return (-3);
895 	}
896 
897 	/*
898 	 * Return error when stats not given or not enough room on list of CPC
899 	 * requests for more counter events
900 	 */
901 	if (stats == NULL || (nreqs <= 0 && n >= nreqs))
902 		return (-4);
903 
904 	/*
905 	 * Use flags in cu_cpc_flags to program counters and enable overflow
906 	 * interrupts/traps (unless PCBE can't handle overflow interrupts) so
907 	 * PCBE can catch counters before they wrap to hopefully give us an
908 	 * accurate (64-bit) virtualized counter
909 	 */
910 	flags = cu_cpc_flags;
911 	if ((kcpc_pcbe_capabilities() & CPC_CAP_OVERFLOW_INTERRUPT) == 0)
912 		flags &= ~CPC_OVF_NOTIFY_EMT;
913 
914 	/*
915 	 * Add CPC request to list
916 	 */
917 	retval = kcpc_reqs_add(reqs, event, cu_cpc_preset_value,
918 	    flags, 0, NULL, &stats[n], kmem_flags);
919 
920 	if (retval != 0)
921 		return (-5);
922 
923 	n++;
924 	*nevents = n;
925 	return (0);
926 }
927 
928 static void
929 cu_cpu_info_detach_xcall(void)
930 {
931 	ASSERT(IS_HIPIL());
932 
933 	CPU->cpu_cu_info = NULL;
934 }
935 
936 
937 /*
938  * Enable or disable collection of capacity/utilization data for a current CPU.
939  * Counters are enabled if 'on' argument is True and disabled if it is False.
940  * This function should be always executed at high PIL
941  */
942 static void
943 cu_cpc_trigger(uintptr_t arg1, uintptr_t arg2)
944 {
945 	cpu_t		*cp = (cpu_t *)arg1;
946 	boolean_t	on = (boolean_t)arg2;
947 	int		error;
948 	cu_cpu_info_t	*cu_cpu_info;
949 
950 	ASSERT(IS_HIPIL());
951 	kpreempt_disable();
952 	ASSERT(cp == CPU);
953 
954 	if (!(cu_flags & CU_FLAG_ON)) {
955 		kpreempt_enable();
956 		return;
957 	}
958 
959 	cu_cpu_info = cp->cpu_cu_info;
960 	if (cu_cpu_info == NULL) {
961 		kpreempt_enable();
962 		return;
963 	}
964 
965 	ASSERT(!cu_cpu_info->cu_disabled ||
966 	    !(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON));
967 
968 	if (on) {
969 		/*
970 		 * Decrement the cu_disabled counter.
971 		 * Once it drops to zero, call cu_cpc_program.
972 		 */
973 		if (cu_cpu_info->cu_disabled > 0)
974 			cu_cpu_info->cu_disabled--;
975 		if (cu_cpu_info->cu_disabled == 0)
976 			cu_cpc_program(CPU, &error);
977 	} else if (cu_cpu_info->cu_disabled++ == 0) {
978 		/*
979 		 * This is the first attempt to disable CU, so turn it off
980 		 */
981 		cu_cpc_unprogram(cp, &error);
982 		ASSERT(!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON));
983 	}
984 
985 	kpreempt_enable();
986 }
987 
988 
989 /*
990  * Callback for changes in CPU states
991  * Used to enable or disable hardware performance counters on CPUs that are
992  * turned on or off
993  *
994  * NOTE: cpc should be programmed/unprogrammed while running on the target CPU.
995  * We have to use thread_affinity_set to hop to the right CPU because these
996  * routines expect cpu_lock held, so we can't cross-call other CPUs while
997  * holding CPU lock.
998  */
999 static int
1000 /* LINTED E_FUNC_ARG_UNUSED */
1001 cu_cpu_callback(cpu_setup_t what, int id, void *arg)
1002 {
1003 	cpu_t	*cp;
1004 	int	retval = 0;
1005 
1006 	ASSERT(MUTEX_HELD(&cpu_lock));
1007 
1008 	if (!(cu_flags & CU_FLAG_ON))
1009 		return (-1);
1010 
1011 	cp = cpu_get(id);
1012 	if (cp == NULL)
1013 		return (-2);
1014 
1015 	switch (what) {
1016 	case CPU_ON:
1017 		/*
1018 		 * Setup counters on CPU being turned on
1019 		 */
1020 		retval = cu_cpu_init(cp, cu_cpc_reqs);
1021 
1022 		/*
1023 		 * Reset list of counter event requests so its space can be
1024 		 * reused for a different set of requests for next CPU
1025 		 */
1026 		(void) kcpc_reqs_reset(cu_cpc_reqs);
1027 		break;
1028 	case CPU_INTR_ON:
1029 		/*
1030 		 * Setup counters on CPU being turned on.
1031 		 */
1032 		retval = cu_cpu_run(cp, cu_cpc_program_xcall,
1033 		    (uintptr_t)B_TRUE);
1034 		break;
1035 	case CPU_OFF:
1036 		/*
1037 		 * Disable counters on CPU being turned off. Counters will not
1038 		 * be re-enabled on this CPU until it comes back online.
1039 		 */
1040 		cu_cpu_disable(cp);
1041 		ASSERT(!CU_CPC_ON(cp));
1042 		retval = cu_cpu_fini(cp);
1043 		break;
1044 	default:
1045 		break;
1046 	}
1047 	return (retval);
1048 }
1049 
1050 
1051 /*
1052  * Disable or enable Capacity Utilization counters on a given CPU. This function
1053  * can be called from any CPU to disable counters on the given CPU.
1054  */
1055 static void
1056 cu_cpu_disable(cpu_t *cp)
1057 {
1058 	cpu_call(cp, cu_cpc_trigger, (uintptr_t)cp, (uintptr_t)B_FALSE);
1059 }
1060 
1061 
1062 static void
1063 cu_cpu_enable(cpu_t *cp)
1064 {
1065 	cpu_call(cp, cu_cpc_trigger, (uintptr_t)cp, (uintptr_t)B_TRUE);
1066 }
1067 
1068 
1069 /*
1070  * Setup capacity and utilization support for given CPU
1071  *
1072  * NOTE: Use KM_NOSLEEP for kmem_{,z}alloc() since cpu_lock is held and free
1073  *	 everything that has been successfully allocated including cpu_cu_info
1074  *	if any memory allocation fails
1075  */
1076 static int
1077 cu_cpu_init(cpu_t *cp, kcpc_request_list_t *reqs)
1078 {
1079 	kcpc_ctx_t	**ctx_ptr_array;
1080 	size_t		ctx_ptr_array_sz;
1081 	cu_cpc_ctx_t	*cpu_ctx;
1082 	cu_cpu_info_t	*cu_cpu_info;
1083 	int		n;
1084 
1085 	/*
1086 	 * cpu_lock should be held and protect against CPU going away and races
1087 	 * with cu_{init,fini,cpu_fini}()
1088 	 */
1089 	ASSERT(MUTEX_HELD(&cpu_lock));
1090 
1091 	/*
1092 	 * Return if not ready to setup counters yet
1093 	 */
1094 	if (!(cu_flags & CU_FLAG_READY))
1095 		return (-1);
1096 
1097 	if (cp->cpu_cu_info == NULL) {
1098 		cp->cpu_cu_info = kmem_zalloc(sizeof (cu_cpu_info_t),
1099 		    KM_NOSLEEP);
1100 		if (cp->cpu_cu_info == NULL)
1101 			return (-2);
1102 	}
1103 
1104 	/*
1105 	 * Get capacity and utilization CPC context for CPU and check to see
1106 	 * whether it has been setup already
1107 	 */
1108 	cu_cpu_info = cp->cpu_cu_info;
1109 	cu_cpu_info->cu_cpu = cp;
1110 	cu_cpu_info->cu_disabled = dtrace_cpc_in_use ? 1 : 0;
1111 
1112 	cpu_ctx = &cu_cpu_info->cu_cpc_ctx;
1113 	if (cpu_ctx->nctx > 0 && cpu_ctx->ctx_ptr_array != NULL &&
1114 	    cpu_ctx->ctx_ptr_array_sz > 0) {
1115 		return (1);
1116 	}
1117 
1118 	/*
1119 	 * Should have no contexts since it hasn't been setup already
1120 	 */
1121 	ASSERT(cpu_ctx->nctx == 0 && cpu_ctx->ctx_ptr_array == NULL &&
1122 	    cpu_ctx->ctx_ptr_array_sz == 0);
1123 
1124 	/*
1125 	 * Determine how many CPC events needed to measure capacity and
1126 	 * utilization for this CPU, allocate space for counter statistics for
1127 	 * each event, and fill in list of CPC event requests with corresponding
1128 	 * counter stats for each request to make attributing counter data
1129 	 * easier later....
1130 	 */
1131 	n = cu_cpc_init(cp, NULL, 0);
1132 	if (n <= 0) {
1133 		(void) cu_cpu_fini(cp);
1134 		return (-3);
1135 	}
1136 
1137 	cu_cpu_info->cu_cntr_stats = kmem_zalloc(n * sizeof (cu_cntr_stats_t),
1138 	    KM_NOSLEEP);
1139 	if (cu_cpu_info->cu_cntr_stats == NULL) {
1140 		(void) cu_cpu_fini(cp);
1141 		return (-4);
1142 	}
1143 
1144 	cu_cpu_info->cu_ncntr_stats = n;
1145 
1146 	n = cu_cpc_init(cp, reqs, n);
1147 	if (n <= 0) {
1148 		(void) cu_cpu_fini(cp);
1149 		return (-5);
1150 	}
1151 
1152 	/*
1153 	 * Create CPC context with given requests
1154 	 */
1155 	ctx_ptr_array = NULL;
1156 	ctx_ptr_array_sz = 0;
1157 	n = kcpc_cpu_ctx_create(cp, reqs, KM_NOSLEEP, &ctx_ptr_array,
1158 	    &ctx_ptr_array_sz);
1159 	if (n <= 0) {
1160 		(void) cu_cpu_fini(cp);
1161 		return (-6);
1162 	}
1163 
1164 	/*
1165 	 * Should have contexts
1166 	 */
1167 	ASSERT(n > 0 && ctx_ptr_array != NULL && ctx_ptr_array_sz > 0);
1168 	if (ctx_ptr_array == NULL || ctx_ptr_array_sz <= 0) {
1169 		(void) cu_cpu_fini(cp);
1170 		return (-7);
1171 	}
1172 
1173 	/*
1174 	 * Fill in CPC context info for CPU needed for capacity and utilization
1175 	 */
1176 	cpu_ctx->cur_index = 0;
1177 	cpu_ctx->nctx = n;
1178 	cpu_ctx->ctx_ptr_array = ctx_ptr_array;
1179 	cpu_ctx->ctx_ptr_array_sz = ctx_ptr_array_sz;
1180 	return (0);
1181 }
1182 
1183 /*
1184  * Tear down capacity and utilization support for given CPU
1185  */
1186 static int
1187 cu_cpu_fini(cpu_t *cp)
1188 {
1189 	kcpc_ctx_t	*ctx;
1190 	cu_cpc_ctx_t	*cpu_ctx;
1191 	cu_cpu_info_t	*cu_cpu_info;
1192 	int		i;
1193 	pghw_type_t	pg_hw_type;
1194 
1195 	/*
1196 	 * cpu_lock should be held and protect against CPU going away and races
1197 	 * with cu_{init,fini,cpu_init}()
1198 	 */
1199 	ASSERT(MUTEX_HELD(&cpu_lock));
1200 
1201 	/*
1202 	 * Have to at least be ready to setup counters to have allocated
1203 	 * anything that needs to be deallocated now
1204 	 */
1205 	if (!(cu_flags & CU_FLAG_READY))
1206 		return (-1);
1207 
1208 	/*
1209 	 * Nothing to do if CPU's capacity and utilization info doesn't exist
1210 	 */
1211 	cu_cpu_info = cp->cpu_cu_info;
1212 	if (cu_cpu_info == NULL)
1213 		return (1);
1214 
1215 	/*
1216 	 * Tear down any existing kstats and counter info for each hardware
1217 	 * sharing relationship
1218 	 */
1219 	for (pg_hw_type = PGHW_START; pg_hw_type < PGHW_NUM_COMPONENTS;
1220 	    pg_hw_type++) {
1221 		cu_cntr_info_t	*cntr_info;
1222 
1223 		cntr_info = cu_cpu_info->cu_cntr_info[pg_hw_type];
1224 		if (cntr_info == NULL)
1225 			continue;
1226 
1227 		if (cntr_info->ci_kstat != NULL) {
1228 			kstat_delete(cntr_info->ci_kstat);
1229 			cntr_info->ci_kstat = NULL;
1230 		}
1231 		kmem_free(cntr_info, sizeof (cu_cntr_info_t));
1232 	}
1233 
1234 	/*
1235 	 * Free counter statistics for CPU
1236 	 */
1237 	ASSERT(cu_cpu_info->cu_cntr_stats == NULL ||
1238 	    cu_cpu_info->cu_ncntr_stats > 0);
1239 	if (cu_cpu_info->cu_cntr_stats != NULL &&
1240 	    cu_cpu_info->cu_ncntr_stats > 0) {
1241 		kmem_free(cu_cpu_info->cu_cntr_stats,
1242 		    cu_cpu_info->cu_ncntr_stats * sizeof (cu_cntr_stats_t));
1243 		cu_cpu_info->cu_cntr_stats = NULL;
1244 		cu_cpu_info->cu_ncntr_stats = 0;
1245 	}
1246 
1247 	/*
1248 	 * Get capacity and utilization CPC contexts for given CPU and check to
1249 	 * see whether they have been freed already
1250 	 */
1251 	cpu_ctx = &cu_cpu_info->cu_cpc_ctx;
1252 	if (cpu_ctx != NULL && cpu_ctx->ctx_ptr_array != NULL &&
1253 	    cpu_ctx->ctx_ptr_array_sz > 0) {
1254 		/*
1255 		 * Free CPC contexts for given CPU
1256 		 */
1257 		for (i = 0; i < cpu_ctx->nctx; i++) {
1258 			ctx = cpu_ctx->ctx_ptr_array[i];
1259 			if (ctx == NULL)
1260 				continue;
1261 			kcpc_free(ctx, 0);
1262 		}
1263 
1264 		/*
1265 		 * Free CPC context pointer array
1266 		 */
1267 		kmem_free(cpu_ctx->ctx_ptr_array, cpu_ctx->ctx_ptr_array_sz);
1268 
1269 		/*
1270 		 * Zero CPC info for CPU
1271 		 */
1272 		bzero(cpu_ctx, sizeof (cu_cpc_ctx_t));
1273 	}
1274 
1275 	/*
1276 	 * Set cp->cpu_cu_info pointer to NULL. Go through cross-call to ensure
1277 	 * that no one is going to access the cpu_cu_info whicch we are going to
1278 	 * free.
1279 	 */
1280 	if (cpu_is_online(cp))
1281 		cpu_call(cp, (cpu_call_func_t)cu_cpu_info_detach_xcall, 0, 0);
1282 	else
1283 		cp->cpu_cu_info = NULL;
1284 
1285 	/*
1286 	 * Free CPU's capacity and utilization info
1287 	 */
1288 	kmem_free(cu_cpu_info, sizeof (cu_cpu_info_t));
1289 
1290 	return (0);
1291 }
1292 
1293 /*
1294  * Create capacity & utilization kstats for given PG CPU hardware sharing
1295  * relationship
1296  */
1297 static void
1298 cu_cpu_kstat_create(pghw_t *pg, cu_cntr_info_t *cntr_info)
1299 {
1300 	kstat_t		*ks;
1301 	char		*sharing = pghw_type_string(pg->pghw_hw);
1302 	char		name[KSTAT_STRLEN + 1];
1303 
1304 	/*
1305 	 * Just return when no counter info or CPU
1306 	 */
1307 	if (cntr_info == NULL || cntr_info->ci_cpu == NULL)
1308 		return;
1309 
1310 	/*
1311 	 * Canonify PG name to conform to kstat name rules
1312 	 */
1313 	(void) strncpy(name, pghw_type_string(pg->pghw_hw), KSTAT_STRLEN + 1);
1314 	strident_canon(name, TASKQ_NAMELEN + 1);
1315 
1316 	if ((ks = kstat_create_zone("pg_hw_perf_cpu",
1317 	    cntr_info->ci_cpu->cpu_id,
1318 	    name, "processor_group", KSTAT_TYPE_NAMED,
1319 	    sizeof (cu_cpu_kstat) / sizeof (kstat_named_t),
1320 	    KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID)) == NULL)
1321 		return;
1322 
1323 	ks->ks_lock = &pg_cpu_kstat_lock;
1324 	ks->ks_data = &cu_cpu_kstat;
1325 	ks->ks_update = cu_cpu_kstat_update;
1326 	ks->ks_data_size += strlen(sharing) + 1;
1327 
1328 	ks->ks_private = cntr_info;
1329 	cntr_info->ci_kstat = ks;
1330 	kstat_install(cntr_info->ci_kstat);
1331 }
1332 
1333 
1334 /*
1335  * Propagate values from CPU capacity & utilization stats to kstats
1336  */
1337 static int
1338 cu_cpu_kstat_update(kstat_t *ksp, int rw)
1339 {
1340 	cpu_t		*cp;
1341 	cu_cntr_info_t	*cntr_info = ksp->ks_private;
1342 	struct cu_cpu_kstat	*kstat = &cu_cpu_kstat;
1343 	pghw_t		*pg;
1344 	cu_cntr_stats_t	*stats;
1345 
1346 	if (rw == KSTAT_WRITE)
1347 		return (EACCES);
1348 
1349 	cp = cntr_info->ci_cpu;
1350 	pg = cntr_info->ci_pg;
1351 	kstat->cu_cpu_id.value.ui32 = cp->cpu_id;
1352 	kstat->cu_pg_id.value.i32 = ((pg_t *)pg)->pg_id;
1353 
1354 	/*
1355 	 * The caller should have priv_cpc_cpu privilege to get utilization
1356 	 * data. Callers who do not have the privilege will see zeroes as the
1357 	 * values.
1358 	 */
1359 	if (secpolicy_cpc_cpu(crgetcred()) != 0) {
1360 		kstat->cu_generation.value.ui32 = cp->cpu_generation;
1361 		kstat_named_setstr(&kstat->cu_cpu_relationship,
1362 		    pghw_type_string(pg->pghw_hw));
1363 
1364 		kstat->cu_cpu_util.value.ui64 = 0;
1365 		kstat->cu_cpu_rate.value.ui64 = 0;
1366 		kstat->cu_cpu_rate_max.value.ui64 = 0;
1367 		kstat->cu_cpu_time_running.value.ui64 = 0;
1368 		kstat->cu_cpu_time_stopped.value.ui64 = 0;
1369 
1370 		return (0);
1371 	}
1372 
1373 	kpreempt_disable();
1374 
1375 	/*
1376 	 * Update capacity and utilization statistics needed for CPU's PG (CPU)
1377 	 * kstats
1378 	 */
1379 
1380 	(void) cu_cpu_update(cp, B_TRUE);
1381 
1382 	stats = cntr_info->ci_stats;
1383 	kstat->cu_generation.value.ui32 = cp->cpu_generation;
1384 	kstat_named_setstr(&kstat->cu_cpu_relationship,
1385 	    pghw_type_string(pg->pghw_hw));
1386 
1387 	kstat->cu_cpu_util.value.ui64 = stats->cs_value_total;
1388 	kstat->cu_cpu_rate.value.ui64 = stats->cs_rate;
1389 	kstat->cu_cpu_rate_max.value.ui64 = stats->cs_rate_max;
1390 	kstat->cu_cpu_time_running.value.ui64 = stats->cs_time_running;
1391 	kstat->cu_cpu_time_stopped.value.ui64 = stats->cs_time_stopped;
1392 
1393 	/*
1394 	 * Counters are stopped now, so the cs_time_stopped was last
1395 	 * updated at cs_time_start time. Add the time passed since then
1396 	 * to the stopped time.
1397 	 */
1398 	if (!(cp->cpu_cu_info->cu_flag & CU_CPU_CNTRS_ON))
1399 		kstat->cu_cpu_time_stopped.value.ui64 +=
1400 		    gethrtime() - stats->cs_time_start;
1401 
1402 	kpreempt_enable();
1403 
1404 	return (0);
1405 }
1406 
1407 /*
1408  * Run specified function with specified argument on a given CPU and return
1409  * whatever the function returns
1410  */
1411 static int
1412 cu_cpu_run(cpu_t *cp, cu_cpu_func_t func, uintptr_t arg)
1413 {
1414 	int error = 0;
1415 
1416 	/*
1417 	 * cpu_call() will call func on the CPU specified with given argument
1418 	 * and return func's return value in last argument
1419 	 */
1420 	cpu_call(cp, (cpu_call_func_t)(uintptr_t)func, arg, (uintptr_t)&error);
1421 	return (error);
1422 }
1423 
1424 
1425 /*
1426  * Update counter statistics on a given CPU.
1427  *
1428  * If move_to argument is True, execute the function on the CPU specified
1429  * Otherwise, assume that it is already runninng on the right CPU
1430  *
1431  * If move_to is specified, the caller should hold cpu_lock or have preemption
1432  * disabled. Otherwise it is up to the caller to guarantee that things do not
1433  * change in the process.
1434  */
1435 int
1436 cu_cpu_update(struct cpu *cp, boolean_t move_to)
1437 {
1438 	int	retval;
1439 	cu_cpu_info_t	*cu_cpu_info = cp->cpu_cu_info;
1440 	hrtime_t	time_snap;
1441 
1442 	ASSERT(!move_to || MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0);
1443 
1444 	/*
1445 	 * Nothing to do if counters are not programmed
1446 	 */
1447 	if (!(cu_flags & CU_FLAG_ON) ||
1448 	    (cu_cpu_info == NULL) ||
1449 	    !(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON))
1450 		return (0);
1451 
1452 	/*
1453 	 * Don't update CPU statistics if it was updated recently
1454 	 * and provide old results instead
1455 	 */
1456 	time_snap = gethrtime();
1457 	if ((time_snap - cu_cpu_info->cu_sample_time) < cu_update_threshold) {
1458 		DTRACE_PROBE1(cu__drop__sample, cpu_t *, cp);
1459 		return (0);
1460 	}
1461 
1462 	cu_cpu_info->cu_sample_time = time_snap;
1463 
1464 	/*
1465 	 * CPC counter should be read on the CPU that is running the counter. We
1466 	 * either have to move ourselves to the target CPU or insure that we
1467 	 * already run there.
1468 	 *
1469 	 * We use cross-call to the target CPU to execute kcpc_read() and
1470 	 * cu_cpu_update_stats() there.
1471 	 */
1472 	retval = 0;
1473 	if (move_to)
1474 		(void) cu_cpu_run(cp, (cu_cpu_func_t)(uintptr_t)kcpc_read,
1475 		    (uintptr_t)cu_cpu_update_stats);
1476 	else {
1477 		retval = kcpc_read((kcpc_update_func_t)cu_cpu_update_stats);
1478 		/*
1479 		 * Offset negative return value by -10 so we can distinguish it
1480 		 * from error return values of this routine vs kcpc_read()
1481 		 */
1482 		if (retval < 0)
1483 			retval -= 10;
1484 	}
1485 
1486 	return (retval);
1487 }
1488 
1489 
1490 /*
1491  * Update CPU counter statistics for current CPU.
1492  * This function may be called from a cross-call
1493  */
1494 static int
1495 cu_cpu_update_stats(cu_cntr_stats_t *stats, uint64_t cntr_value)
1496 {
1497 	cu_cpu_info_t	*cu_cpu_info = CPU->cpu_cu_info;
1498 	uint_t		flags;
1499 	uint64_t	delta;
1500 	hrtime_t	time_delta;
1501 	hrtime_t	time_snap;
1502 
1503 	if (stats == NULL)
1504 		return (-1);
1505 
1506 	/*
1507 	 * Nothing to do if counters are not programmed. This should not happen,
1508 	 * but we check just in case.
1509 	 */
1510 	ASSERT(cu_flags & CU_FLAG_ON);
1511 	ASSERT(cu_cpu_info != NULL);
1512 	if (!(cu_flags & CU_FLAG_ON) ||
1513 	    (cu_cpu_info == NULL))
1514 		return (-2);
1515 
1516 	flags = cu_cpu_info->cu_flag;
1517 	ASSERT(flags & CU_CPU_CNTRS_ON);
1518 	if (!(flags & CU_CPU_CNTRS_ON))
1519 		return (-2);
1520 
1521 	/*
1522 	 * Take snapshot of high resolution timer
1523 	 */
1524 	time_snap = gethrtime();
1525 
1526 	/*
1527 	 * CU counters have just been programmed. We cannot assume that the new
1528 	 * cntr_value continues from where we left off, so use the cntr_value as
1529 	 * the new initial value.
1530 	 */
1531 	if (flags & CU_CPU_CNTRS_OFF_ON)
1532 		stats->cs_value_start = cntr_value;
1533 
1534 	/*
1535 	 * Calculate delta in counter values between start of sampling period
1536 	 * and now
1537 	 */
1538 	delta = cntr_value - stats->cs_value_start;
1539 
1540 	/*
1541 	 * Calculate time between start of sampling period and now
1542 	 */
1543 	time_delta = stats->cs_time_start ?
1544 	    time_snap - stats->cs_time_start :
1545 	    0;
1546 	stats->cs_time_start = time_snap;
1547 	stats->cs_value_start = cntr_value;
1548 
1549 	if (time_delta > 0) { /* wrap shouldn't happen */
1550 		/*
1551 		 * Update either running or stopped time based on the transition
1552 		 * state
1553 		 */
1554 		if (flags & CU_CPU_CNTRS_OFF_ON)
1555 			stats->cs_time_stopped += time_delta;
1556 		else
1557 			stats->cs_time_running += time_delta;
1558 	}
1559 
1560 	/*
1561 	 * Update rest of counter statistics if counter value didn't wrap
1562 	 */
1563 	if (delta > 0) {
1564 		/*
1565 		 * Update utilization rate if the interval between samples is
1566 		 * sufficient.
1567 		 */
1568 		ASSERT(cu_sample_interval_min > CU_SCALE);
1569 		if (time_delta > cu_sample_interval_min)
1570 			stats->cs_rate = CU_RATE(delta, time_delta);
1571 		if (stats->cs_rate_max < stats->cs_rate)
1572 			stats->cs_rate_max = stats->cs_rate;
1573 
1574 		stats->cs_value_last = delta;
1575 		stats->cs_value_total += delta;
1576 	}
1577 
1578 	return (0);
1579 }
1580 
1581 /*
1582  * Update CMT PG utilization data.
1583  *
1584  * This routine computes the running total utilization and times for the
1585  * specified PG by adding up the total utilization and counter running and
1586  * stopped times of all CPUs in the PG and calculates the utilization rate and
1587  * maximum rate for all CPUs in the PG.
1588  */
1589 void
1590 cu_pg_update(pghw_t *pg)
1591 {
1592 	pg_cpu_itr_t	cpu_iter;
1593 	pghw_type_t	pg_hwtype;
1594 	cpu_t		*cpu;
1595 	pghw_util_t	*hw_util = &pg->pghw_stats;
1596 	uint64_t	old_utilization = hw_util->pghw_util;
1597 	hrtime_t	now;
1598 	hrtime_t	time_delta;
1599 	uint64_t	utilization_delta;
1600 
1601 	ASSERT(MUTEX_HELD(&cpu_lock));
1602 
1603 	now = gethrtime();
1604 
1605 	pg_hwtype = pg->pghw_hw;
1606 
1607 	/*
1608 	 * Initialize running total utilization and times for PG to 0
1609 	 */
1610 	hw_util->pghw_util = 0;
1611 	hw_util->pghw_time_running = 0;
1612 	hw_util->pghw_time_stopped = 0;
1613 
1614 	/*
1615 	 * Iterate over all CPUs in the PG and aggregate utilization, running
1616 	 * time and stopped time.
1617 	 */
1618 	PG_CPU_ITR_INIT(pg, cpu_iter);
1619 	while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
1620 		cu_cpu_info_t	*cu_cpu_info = cpu->cpu_cu_info;
1621 		cu_cntr_info_t	*cntr_info;
1622 		cu_cntr_stats_t	*stats;
1623 
1624 		if (cu_cpu_info == NULL)
1625 			continue;
1626 
1627 		/*
1628 		 * Update utilization data for the CPU and then
1629 		 * aggregate per CPU running totals for PG
1630 		 */
1631 		(void) cu_cpu_update(cpu, B_TRUE);
1632 		cntr_info = cu_cpu_info->cu_cntr_info[pg_hwtype];
1633 
1634 		if (cntr_info == NULL || (stats = cntr_info->ci_stats) == NULL)
1635 			continue;
1636 
1637 		hw_util->pghw_util += stats->cs_value_total;
1638 		hw_util->pghw_time_running += stats->cs_time_running;
1639 		hw_util->pghw_time_stopped += stats->cs_time_stopped;
1640 
1641 		/*
1642 		 * If counters are stopped now, the pg_time_stopped was last
1643 		 * updated at cs_time_start time. Add the time passed since then
1644 		 * to the stopped time.
1645 		 */
1646 		if (!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON))
1647 			hw_util->pghw_time_stopped +=
1648 			    now - stats->cs_time_start;
1649 	}
1650 
1651 	/*
1652 	 * Compute per PG instruction rate and maximum rate
1653 	 */
1654 	time_delta = now - hw_util->pghw_time_stamp;
1655 	hw_util->pghw_time_stamp = now;
1656 
1657 	if (old_utilization == 0)
1658 		return;
1659 
1660 	/*
1661 	 * Calculate change in utilization over sampling period and set this to
1662 	 * 0 if the delta would be 0 or negative which may happen if any CPUs go
1663 	 * offline during the sampling period
1664 	 */
1665 	if (hw_util->pghw_util > old_utilization)
1666 		utilization_delta = hw_util->pghw_util - old_utilization;
1667 	else
1668 		utilization_delta = 0;
1669 
1670 	/*
1671 	 * Update utilization rate if the interval between samples is
1672 	 * sufficient.
1673 	 */
1674 	ASSERT(cu_sample_interval_min > CU_SCALE);
1675 	if (time_delta > CU_SAMPLE_INTERVAL_MIN)
1676 		hw_util->pghw_rate = CU_RATE(utilization_delta, time_delta);
1677 
1678 	/*
1679 	 * Update the maximum observed rate
1680 	 */
1681 	if (hw_util->pghw_rate_max < hw_util->pghw_rate)
1682 		hw_util->pghw_rate_max = hw_util->pghw_rate;
1683 }
1684