xref: /illumos-gate/usr/src/uts/common/os/cap_util.c (revision d0fccfcda73f8b52d101bd2b0f7885a766f7e354)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /*
27  * Support for determining capacity and utilization of performance relevant
28  * hardware components in a computer
29  *
30  * THEORY
31  * ------
32  * The capacity and utilization of the performance relevant hardware components
33  * is needed to be able to optimize performance while minimizing the amount of
34  * power used on a system.  The idea is to use hardware performance counters
35  * and potentially other means to determine the capacity and utilization of
36  * performance relevant hardware components (eg. execution pipeline, cache,
37  * memory, etc.) and attribute the utilization to the responsible CPU and the
38  * thread running there.
39  *
40  * This will help characterize the utilization of performance relevant
41  * components and how much is used by each CPU and each thread.  With
42  * that data, the utilization can be aggregated to all the CPUs sharing each
43  * performance relevant hardware component to calculate the total utilization
44  * of each component and compare that with the component's capacity to
45  * essentially determine the actual hardware load of the component.  The
46  * hardware utilization attributed to each running thread can also be
47  * aggregated to determine the total hardware utilization of each component to
48  * a workload.
49  *
50  * Once that is done, one can determine how much of each performance relevant
51  * hardware component is needed by a given thread or set of threads (eg. a
52  * workload) and size up exactly what hardware is needed by the threads and how
53  * much.  With this info, we can better place threads among CPUs to match their
54  * exact hardware resource needs and potentially lower or raise the power based
55  * on their utilization or pack threads onto the fewest hardware components
56  * needed and power off any remaining unused components to minimize power
57  * without sacrificing performance.
58  *
59  * IMPLEMENTATION
60  * --------------
61  * The code has been designed and implemented to make (un)programming and
62  * reading the counters for a given CPU as lightweight and fast as possible.
63  * This is very important because we need to read and potentially (un)program
64  * the counters very often and in performance sensitive code.  Specifically,
65  * the counters may need to be (un)programmed during context switch and/or a
66  * cyclic handler when there are more counter events to count than existing
67  * counters.
68  *
69  * Consequently, the code has been split up to allow allocating and
70  * initializing everything needed to program and read the counters on a given
71  * CPU once and make (un)programming and reading the counters for a given CPU
72  * not have to allocate/free memory or grab any locks.  To do this, all the
73  * state needed to (un)program and read the counters on a CPU is kept per CPU
74  * and is made lock free by forcing any code that reads or manipulates the
75  * counters or the state needed to (un)program or read the counters to run on
76  * the target CPU and disable preemption while running on the target CPU to
77  * protect any critical sections. All counter manipulation on the target CPU is
78  * happening either from a cross-call to the target CPU or at the same PIL as
79  * used by the cross-call subsystem. This guarantees that counter manipulation
80  * is not interrupted by cross-calls from other CPUs.
81  *
82  * The synchronization has been made lock free or as simple as possible for
83  * performance and to avoid getting the locking all tangled up when we interpose
84  * on the CPC routines that (un)program the counters to manage the counters
85  * between the kernel and user on each CPU.  When the user starts using the
86  * counters on a given CPU, the kernel will unprogram the counters that it is
87  * using on that CPU just before they are programmed for the user.  Then the
88  * kernel will program the counters on a given CPU for its own use when the user
89  * stops using them.
90  *
91  * There is a special interaction with DTrace cpc provider (dcpc). Before dcpc
92  * enables any probe, it requests to disable and unprogram all counters used for
93  * capacity and utilizations. These counters are never re-programmed back until
94  * dcpc completes. When all DTrace cpc probes are removed, dcpc notifies CU
95  * framework and it re-programs the counters.
96  *
97  * When a CPU is going offline, its CU counters are unprogrammed and disabled,
98  * so that they would not be re-programmed again by some other activity on the
99  * CPU that is going offline.
100  *
101  * The counters are programmed during boot.  However, a flag is available to
 * disable this if necessary (see cu_flags below).  A handler is provided to
103  * (un)program the counters during CPU on/offline.  Basic routines are provided
104  * to initialize and tear down this module, initialize and tear down any state
105  * needed for a given CPU, and (un)program the counters for a given CPU.
106  * Lastly, a handler is provided to read the counters and attribute the
107  * utilization to the responsible CPU.
108  */
109 #include <sys/types.h>
110 #include <sys/cmn_err.h>
111 #include <sys/cpuvar.h>
112 #include <sys/ddi.h>
113 #include <sys/systm.h>
114 #include <sys/disp.h>
115 #include <sys/sdt.h>
116 #include <sys/sunddi.h>
117 #include <sys/thread.h>
118 #include <sys/pghw.h>
119 #include <sys/cmt.h>
120 #include <sys/policy.h>
121 #include <sys/x_call.h>
122 #include <sys/cap_util.h>
123 
124 #include <sys/archsystm.h>
125 #include <sys/promif.h>
126 
127 #if defined(__x86)
128 #include <sys/xc_levels.h>
129 #endif
130 
131 
/*
 * Default CPU hardware performance counter flags to use for measuring capacity
 * and utilization.  This is the initial value of cu_cpc_flags;
 * cu_cpc_req_add() strips CPC_OVF_NOTIFY_EMT from the flags it uses when the
 * PCBE does not report CPC_CAP_OVERFLOW_INTERRUPT.
 */
#define	CU_CPC_FLAGS_DEFAULT	\
	(CPC_COUNT_USER|CPC_COUNT_SYSTEM|CPC_OVF_NOTIFY_EMT)

/*
 * Possible flags for controlling this module (bits kept in cu_flags).
 *
 * CU_FLAG_ENABLE is the initial value of cu_flags; cu_init() does nothing
 * unless it is set.  cu_init() sets CU_FLAG_READY before it sets up per CPU
 * state and sets CU_FLAG_ON once that state has been set up, just before it
 * programs the counters.  The (un)program routines refuse to run while
 * CU_FLAG_ON is clear.
 */
#define	CU_FLAG_ENABLE		1	/* Enable module */
#define	CU_FLAG_READY		2	/* Ready to setup module */
#define	CU_FLAG_ON		4	/* Module is on */
145 
/*
 * pg_cpu kstats calculate utilization rate and maximum utilization rate for
 * some CPUs. The rate is calculated based on data from two subsequent
 * snapshots. When the time between such two snapshots is too small, the
 * resulting rate may have low accuracy, so we only consider snapshots which
 * are separated by at least CU_SAMPLE_INTERVAL_MIN nanoseconds from one
 * another. We do not update the rate if the interval is smaller than that.
 *
 * Use one tenth of a second (NANOSEC / 10) as the minimum interval for
 * utilization rate calculation.
 *
 * NOTE: CU_SAMPLE_INTERVAL_MIN should be higher than the scaling factor
 * (CU_SCALE) used by the CU_RATE() macro below to guarantee that we never
 * divide by zero.
 *
 * Rate is the number of events per second. The rate is the number of events
 * divided by time and multiplied by the number of nanoseconds in a second. We
 * do not want time to be too small since it will cause large errors in
 * division.
 *
 * We do not want to multiply two large numbers (the instruction count and
 * NANOSEC) either since it may cause integer overflow. So we divide both the
 * numerator and the denominator by the same value (CU_SCALE).
 *
 * NOTE: The scaling factor CU_SCALE should be less than CU_SAMPLE_INTERVAL_MIN
 * to guarantee that time divided by this value is always non-zero.  CU_RATE()
 * itself does not check for this; a 'time' smaller than CU_SCALE makes the
 * divisor zero.
 */
#define	CU_RATE(val, time) \
	(((val) * (NANOSEC / CU_SCALE)) / ((time) / CU_SCALE))

#define	CU_SAMPLE_INTERVAL_MIN	(NANOSEC / 10)

#define	CU_SCALE (CU_SAMPLE_INTERVAL_MIN / 10000)
178 
/*
 * When the time between two kstat reads for the same CPU is less than
 * CU_UPDATE_THRESHOLD use the old counter data and skip updating counter values
 * for the CPU. This helps reduce cross-calls when kstat consumers read data
 * very often or when they read PG utilization data and then CPU utilization
 * data quickly after that.
 *
 * The value is in nanoseconds (one tenth of NANOSEC) and is the initial value
 * of the cu_update_threshold variable.
 */
#define	CU_UPDATE_THRESHOLD (NANOSEC / 10)
187 
/*
 * The IS_HIPIL() macro verifies that the code is executed either from a
 * cross-call or from high-PIL interrupt.
 *
 * On DEBUG kernels it compares the current PIL against XCALL_PIL.  On
 * non-DEBUG kernels the check is skipped and the macro evaluates to a
 * constant true value, so that it is still a valid expression wherever it is
 * used and not only inside an ASSERT() whose argument is discarded.
 */
#ifdef DEBUG
#define	IS_HIPIL() (getpil() >= XCALL_PIL)
#else
#define	IS_HIPIL() (1)
#endif	/* DEBUG */
197 
198 
/*
 * Signature of a routine handed to cu_cpu_run(): it takes an opaque
 * uintptr_t argument and a pointer to an int through which it reports its
 * result (cu_cpc_program_xcall() is one such routine).
 */
typedef void (*cu_cpu_func_t)(uintptr_t, int *);
200 
201 
/*
 * Flags to use for programming CPU hardware performance counters to measure
 * capacity and utilization.  cu_cpc_req_add() starts from this value for each
 * CPC request it builds.
 */
int				cu_cpc_flags = CU_CPC_FLAGS_DEFAULT;

/*
 * Initial value used for programming hardware counters; passed as the preset
 * to kcpc_reqs_add() by cu_cpc_req_add().
 */
uint64_t			cu_cpc_preset_value = 0;

/*
 * List of CPC event requests for capacity and utilization.  It is allocated
 * once in cu_init() with room for cpc_ncounters requests and is reset with
 * kcpc_reqs_reset() after each CPU has been set up so that it can be reused
 * for the next CPU.
 */
static kcpc_request_list_t	*cu_cpc_reqs = NULL;
217 
/*
 * When a CPU is a member of PG with a sharing relationship that is supported
 * by the capacity/utilization framework, a kstat is created for that CPU and
 * sharing relationship.
 *
 * These kstats are updated one at a time, so we can have a single scratch
 * space to fill the data.
 *
 * CPU counter kstats fields (the exported kstat name and data type of each
 * field are given by the initializer below):
 *
 *   cu_cpu_id		CPU ID for this kstat
 *
 *   cu_pg_id		PG ID for the relationship described by this kstat
 *
 *   cu_generation	Generation value that increases whenever any CPU goes
 *			  offline or online. Two kstat snapshots for the same
 *			  CPU may only be compared if they have the same
 *			  generation.
 *
 *   cu_cpu_util	Running value of CPU utilization for the sharing
 *			  relationship
 *
 *   cu_cpu_time_running Total time spent collecting CU data. The time may be
 *			   less than wall time if CU counters were stopped for
 *			   some time.
 *
 *   cu_cpu_time_stopped Total time the CU counters were stopped.
 *
 *   cu_cpu_rate	Utilization rate, expressed in operations per second.
 *
 *   cu_cpu_rate_max	Maximum observed value of utilization rate.
 *
 *   cu_cpu_relationship Name of sharing relationship for the PG in this kstat
 */
struct cu_cpu_kstat {
	kstat_named_t	cu_cpu_id;
	kstat_named_t	cu_pg_id;
	kstat_named_t	cu_generation;
	kstat_named_t	cu_cpu_util;
	kstat_named_t	cu_cpu_time_running;
	kstat_named_t	cu_cpu_time_stopped;
	kstat_named_t	cu_cpu_rate;
	kstat_named_t	cu_cpu_rate_max;
	kstat_named_t	cu_cpu_relationship;
} cu_cpu_kstat = {
	/* Entries must stay in the same order as the structure members above */
	{ "cpu_id",			KSTAT_DATA_UINT32 },
	{ "pg_id",			KSTAT_DATA_INT32 },
	{ "generation",			KSTAT_DATA_UINT32 },
	{ "hw_util",			KSTAT_DATA_UINT64 },
	{ "hw_util_time_running",	KSTAT_DATA_UINT64 },
	{ "hw_util_time_stopped",	KSTAT_DATA_UINT64 },
	{ "hw_util_rate",		KSTAT_DATA_UINT64 },
	{ "hw_util_rate_max",		KSTAT_DATA_UINT64 },
	{ "relationship",		KSTAT_DATA_STRING },
};
275 
/*
 * Flags for controlling this module (a combination of the CU_FLAG_* bits).
 * The module starts out enabled but neither ready nor on.
 */
uint_t				cu_flags = CU_FLAG_ENABLE;

/*
 * Error return value for cu_init() since it can't return anything to be called
 * from mp_init_tbl[] (:-(
 *
 * cu_init() resets this to 0 on entry and stores a negative value when a
 * step fails.
 */
static int			cu_init_error = 0;

/* Minimum time between two snapshots used for a rate (see CU_RATE()) */
hrtime_t			cu_sample_interval_min = CU_SAMPLE_INTERVAL_MIN;

/* Minimum time between two counter updates triggered by kstat reads */
hrtime_t			cu_update_threshold = CU_UPDATE_THRESHOLD;

/*
 * NOTE(review): presumably serializes use of the single cu_cpu_kstat scratch
 * area; the users of this lock are not in this part of the file - confirm.
 */
static kmutex_t			pg_cpu_kstat_lock;
292 
293 
/*
 * Forward declaration of interface routines (the non-static entry points of
 * this file)
 */
void		cu_disable(void);
void		cu_enable(void);
void		cu_init(void);
void		cu_cpc_program(cpu_t *cp, int *err);
void		cu_cpc_unprogram(cpu_t *cp, int *err);
int		cu_cpu_update(struct cpu *cp, boolean_t move_to);
void		cu_pg_update(pghw_t *pg);


/*
 * Forward declaration of private routines
 *
 * cu_cpc_trigger() is not listed here; it is defined ahead of its first use.
 */
static int	cu_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs);
static void	cu_cpc_program_xcall(uintptr_t arg, int *err);
static int	cu_cpc_req_add(char *event, kcpc_request_list_t *reqs,
    int nreqs, cu_cntr_stats_t *stats, int kmem_flags, int *nevents);
static int	cu_cpu_callback(cpu_setup_t what, int id, void *arg);
static void	cu_cpu_disable(cpu_t *cp);
static void	cu_cpu_enable(cpu_t *cp);
static int	cu_cpu_init(cpu_t *cp, kcpc_request_list_t *reqs);
static int	cu_cpu_fini(cpu_t *cp);
static void	cu_cpu_kstat_create(pghw_t *pg, cu_cntr_info_t *cntr_info);
static int	cu_cpu_kstat_update(kstat_t *ksp, int rw);
static int	cu_cpu_run(cpu_t *cp, cu_cpu_func_t func, uintptr_t arg);
static int	cu_cpu_update_stats(cu_cntr_stats_t *stats,
    uint64_t cntr_value);
static void cu_cpu_info_detach_xcall(void);
324 
325 /*
326  * Disable or enable Capacity Utilization counters on all CPUs.
327  */
328 void
329 cu_disable(void)
330 {
331 	cpu_t *cp;
332 
333 	ASSERT(MUTEX_HELD(&cpu_lock));
334 
335 	cp = cpu_active;
336 	do {
337 		if (!(cp->cpu_flags & CPU_OFFLINE))
338 			cu_cpu_disable(cp);
339 	} while ((cp = cp->cpu_next_onln) != cpu_active);
340 }
341 
342 
343 void
344 cu_enable(void)
345 {
346 	cpu_t *cp;
347 
348 	ASSERT(MUTEX_HELD(&cpu_lock));
349 
350 	cp = cpu_active;
351 	do {
352 		if (!(cp->cpu_flags & CPU_OFFLINE))
353 			cu_cpu_enable(cp);
354 	} while ((cp = cp->cpu_next_onln) != cpu_active);
355 }
356 
357 
358 /*
359  * Setup capacity and utilization support
360  */
361 void
362 cu_init(void)
363 {
364 	cpu_t	*cp;
365 
366 	cu_init_error = 0;
367 	if (!(cu_flags & CU_FLAG_ENABLE) || (cu_flags & CU_FLAG_ON)) {
368 		cu_init_error = -1;
369 		return;
370 	}
371 
372 	if (kcpc_init() != 0) {
373 		cu_init_error = -2;
374 		return;
375 	}
376 
377 	/*
378 	 * Can't measure hardware capacity and utilization without CPU
379 	 * hardware performance counters
380 	 */
381 	if (cpc_ncounters <= 0) {
382 		cu_init_error = -3;
383 		return;
384 	}
385 
386 	/*
387 	 * Setup CPC event request queue
388 	 */
389 	cu_cpc_reqs = kcpc_reqs_init(cpc_ncounters, KM_SLEEP);
390 
391 	mutex_enter(&cpu_lock);
392 
393 	/*
394 	 * Mark flags to say that module is ready to be setup
395 	 */
396 	cu_flags |= CU_FLAG_READY;
397 
398 	cp = cpu_active;
399 	do {
400 		/*
401 		 * Allocate and setup state needed to measure capacity and
402 		 * utilization
403 		 */
404 		if (cu_cpu_init(cp, cu_cpc_reqs) != 0)
405 			cu_init_error = -5;
406 
407 		/*
408 		 * Reset list of counter event requests so its space can be
409 		 * reused for a different set of requests for next CPU
410 		 */
411 		(void) kcpc_reqs_reset(cu_cpc_reqs);
412 
413 		cp = cp->cpu_next_onln;
414 	} while (cp != cpu_active);
415 
416 	/*
417 	 * Mark flags to say that module is on now and counters are ready to be
418 	 * programmed on all active CPUs
419 	 */
420 	cu_flags |= CU_FLAG_ON;
421 
422 	/*
423 	 * Program counters on currently active CPUs
424 	 */
425 	cp = cpu_active;
426 	do {
427 		if (cu_cpu_run(cp, cu_cpc_program_xcall,
428 		    (uintptr_t)B_FALSE) != 0)
429 			cu_init_error = -6;
430 
431 		cp = cp->cpu_next_onln;
432 	} while (cp != cpu_active);
433 
434 	/*
435 	 * Register callback for CPU state changes to enable and disable
436 	 * CPC counters as CPUs come on and offline
437 	 */
438 	register_cpu_setup_func(cu_cpu_callback, NULL);
439 
440 	mutex_exit(&cpu_lock);
441 }
442 
443 
444 /*
445  * Return number of counter events needed to measure capacity and utilization
446  * for specified CPU and fill in list of CPC requests with each counter event
447  * needed if list where to add CPC requests is given
448  *
449  * NOTE: Use KM_NOSLEEP for kmem_{,z}alloc() since cpu_lock is held and free
450  *	 everything that has been successfully allocated if any memory
451  *	 allocation fails
452  */
453 static int
454 cu_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs)
455 {
456 	group_t		*cmt_pgs;
457 	cu_cntr_info_t	**cntr_info_array;
458 	cpu_pg_t	*cpu_pgs;
459 	cu_cpu_info_t	*cu_cpu_info;
460 	pg_cmt_t	*pg_cmt;
461 	pghw_t		*pg_hw;
462 	cu_cntr_stats_t	*stats;
463 	int		nevents;
464 	pghw_type_t	pg_hw_type;
465 	group_iter_t	iter;
466 
467 	ASSERT(MUTEX_HELD(&cpu_lock));
468 
469 	/*
470 	 * There has to be a target CPU for this
471 	 */
472 	if (cp == NULL)
473 		return (-1);
474 
475 	/*
476 	 * Return 0 when CPU doesn't belong to any group
477 	 */
478 	cpu_pgs = cp->cpu_pg;
479 	if (cpu_pgs == NULL || GROUP_SIZE(&cpu_pgs->cmt_pgs) < 1)
480 		return (0);
481 
482 	cmt_pgs = &cpu_pgs->cmt_pgs;
483 	cu_cpu_info = cp->cpu_cu_info;
484 
485 	/*
486 	 * Grab counter statistics and info
487 	 */
488 	if (reqs == NULL) {
489 		stats = NULL;
490 		cntr_info_array = NULL;
491 	} else {
492 		if (cu_cpu_info == NULL || cu_cpu_info->cu_cntr_stats == NULL)
493 			return (-2);
494 
495 		stats = cu_cpu_info->cu_cntr_stats;
496 		cntr_info_array = cu_cpu_info->cu_cntr_info;
497 	}
498 
499 	/*
500 	 * See whether platform (or processor) specific code knows which CPC
501 	 * events to request, etc. are needed to measure hardware capacity and
502 	 * utilization on this machine
503 	 */
504 	nevents = cu_plat_cpc_init(cp, reqs, nreqs);
505 	if (nevents >= 0)
506 		return (nevents);
507 
508 	/*
509 	 * Let common code decide which CPC events to request, etc. to measure
510 	 * capacity and utilization since platform (or processor) specific does
511 	 * not know....
512 	 *
513 	 * Walk CPU's PG lineage and do following:
514 	 *
515 	 * - Setup CPC request, counter info, and stats needed for each counter
516 	 *   event to measure capacity and and utilization for each of CPU's PG
517 	 *   hardware sharing relationships
518 	 *
519 	 * - Create PG CPU kstats to export capacity and utilization for each PG
520 	 */
521 	nevents = 0;
522 	group_iter_init(&iter);
523 	while ((pg_cmt = group_iterate(cmt_pgs, &iter)) != NULL) {
524 		cu_cntr_info_t	*cntr_info;
525 		int		nevents_save;
526 		int		nstats;
527 
528 		pg_hw = (pghw_t *)pg_cmt;
529 		pg_hw_type = pg_hw->pghw_hw;
530 		nevents_save = nevents;
531 		nstats = 0;
532 
533 		switch (pg_hw_type) {
534 		case PGHW_IPIPE:
535 			if (cu_cpc_req_add("PAPI_tot_ins", reqs, nreqs, stats,
536 			    KM_NOSLEEP, &nevents) != 0)
537 				continue;
538 			nstats = 1;
539 			break;
540 
541 		case PGHW_FPU:
542 			if (cu_cpc_req_add("PAPI_fp_ins", reqs, nreqs, stats,
543 			    KM_NOSLEEP, &nevents) != 0)
544 				continue;
545 			nstats = 1;
546 			break;
547 
548 		default:
549 			/*
550 			 * Don't measure capacity and utilization for this kind
551 			 * of PG hardware relationship so skip to next PG in
552 			 * CPU's PG lineage
553 			 */
554 			continue;
555 		}
556 
557 		cntr_info = cntr_info_array[pg_hw_type];
558 
559 		/*
560 		 * Nothing to measure for this hardware sharing relationship
561 		 */
562 		if (nevents - nevents_save == 0) {
563 			if (cntr_info != NULL)
564 				kmem_free(cntr_info, sizeof (cu_cntr_info_t));
565 				cntr_info_array[pg_hw_type] = NULL;
566 			continue;
567 		}
568 
569 		/*
570 		 * Fill in counter info for this PG hardware relationship
571 		 */
572 		if (cntr_info == NULL) {
573 			cntr_info = kmem_zalloc(sizeof (cu_cntr_info_t),
574 			    KM_NOSLEEP);
575 			if (cntr_info == NULL)
576 				continue;
577 			cntr_info_array[pg_hw_type] = cntr_info;
578 		}
579 		cntr_info->ci_cpu = cp;
580 		cntr_info->ci_pg = pg_hw;
581 		cntr_info->ci_stats = &stats[nevents_save];
582 		cntr_info->ci_nstats = nstats;
583 
584 		/*
585 		 * Create PG CPU kstats for this hardware relationship
586 		 */
587 		cu_cpu_kstat_create(pg_hw, cntr_info);
588 	}
589 
590 	return (nevents);
591 }
592 
593 
594 /*
595  * Program counters for capacity and utilization on given CPU
596  *
597  * If any of the following conditions is true, the counters are not programmed:
598  *
599  * - CU framework is disabled
600  * - The cpu_cu_info field of the cpu structure is NULL
601  * - DTrace is active
602  * - Counters are programmed already
603  * - Counters are disabled (by calls to cu_cpu_disable())
604  */
605 void
606 cu_cpc_program(cpu_t *cp, int *err)
607 {
608 	cu_cpc_ctx_t	*cpu_ctx;
609 	kcpc_ctx_t	*ctx;
610 	cu_cpu_info_t	*cu_cpu_info;
611 
612 	ASSERT(IS_HIPIL());
613 	/*
614 	 * Should be running on given CPU. We disable preemption to keep CPU
615 	 * from disappearing and make sure flags and CPC context don't change
616 	 * from underneath us
617 	 */
618 	kpreempt_disable();
619 	ASSERT(cp == CPU);
620 
621 	/*
622 	 * Module not ready to program counters
623 	 */
624 	if (!(cu_flags & CU_FLAG_ON)) {
625 		*err = -1;
626 		kpreempt_enable();
627 		return;
628 	}
629 
630 	if (cp == NULL) {
631 		*err = -2;
632 		kpreempt_enable();
633 		return;
634 	}
635 
636 	cu_cpu_info = cp->cpu_cu_info;
637 	if (cu_cpu_info == NULL) {
638 		*err = -3;
639 		kpreempt_enable();
640 		return;
641 	}
642 
643 	/*
644 	 * If DTrace CPC is active or counters turned on already or are
645 	 * disabled, just return.
646 	 */
647 	if (dtrace_cpc_in_use || (cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON) ||
648 	    cu_cpu_info->cu_disabled) {
649 		*err = 1;
650 		kpreempt_enable();
651 		return;
652 	}
653 
654 	if ((CPU->cpu_cpc_ctx != NULL) &&
655 	    !(CPU->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) {
656 		*err = -4;
657 		kpreempt_enable();
658 		return;
659 	}
660 
661 	/*
662 	 * Get CPU's CPC context needed for capacity and utilization
663 	 */
664 	cpu_ctx = &cu_cpu_info->cu_cpc_ctx;
665 	ASSERT(cpu_ctx != NULL);
666 	ASSERT(cpu_ctx->nctx >= 0);
667 
668 	ASSERT(cpu_ctx->ctx_ptr_array == NULL || cpu_ctx->ctx_ptr_array_sz > 0);
669 	ASSERT(cpu_ctx->nctx <= cpu_ctx->ctx_ptr_array_sz);
670 	if (cpu_ctx->nctx <= 0 || cpu_ctx->ctx_ptr_array == NULL ||
671 	    cpu_ctx->ctx_ptr_array_sz <= 0) {
672 		*err = -5;
673 		kpreempt_enable();
674 		return;
675 	}
676 
677 	/*
678 	 * Increment index in CPU's CPC context info to point at next context
679 	 * to program
680 	 *
681 	 * NOTE: Do this now instead of after programming counters to ensure
682 	 *	 that index will always point at *current* context so we will
683 	 *	 always be able to unprogram *current* context if necessary
684 	 */
685 	cpu_ctx->cur_index = (cpu_ctx->cur_index + 1) % cpu_ctx->nctx;
686 
687 	ctx = cpu_ctx->ctx_ptr_array[cpu_ctx->cur_index];
688 
689 	/*
690 	 * Clear KCPC_CTX_INVALID and KCPC_CTX_INVALID_STOPPED from CPU's CPC
691 	 * context before programming counters
692 	 *
693 	 * Context is marked with KCPC_CTX_INVALID_STOPPED when context is
694 	 * unprogrammed and may be marked with KCPC_CTX_INVALID when
695 	 * kcpc_invalidate_all() is called by cpustat(1M) and dtrace CPC to
696 	 * invalidate all CPC contexts before they take over all the counters.
697 	 *
698 	 * This isn't necessary since these flags are only used for thread bound
699 	 * CPC contexts not CPU bound CPC contexts like ones used for capacity
700 	 * and utilization.
701 	 *
702 	 * There is no need to protect the flag update since no one is using
703 	 * this context now.
704 	 */
705 	ctx->kc_flags &= ~(KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED);
706 
707 	/*
708 	 * Program counters on this CPU
709 	 */
710 	kcpc_program(ctx, B_FALSE, B_FALSE);
711 
712 	cp->cpu_cpc_ctx = ctx;
713 
714 	/*
715 	 * Set state in CPU structure to say that CPU's counters are programmed
716 	 * for capacity and utilization now and that they are transitioning from
717 	 * off to on state. This will cause cu_cpu_update to update stop times
718 	 * for all programmed counters.
719 	 */
720 	cu_cpu_info->cu_flag |= CU_CPU_CNTRS_ON | CU_CPU_CNTRS_OFF_ON;
721 
722 	/*
723 	 * Update counter statistics
724 	 */
725 	(void) cu_cpu_update(cp, B_FALSE);
726 
727 	cu_cpu_info->cu_flag &= ~CU_CPU_CNTRS_OFF_ON;
728 
729 	*err = 0;
730 	kpreempt_enable();
731 }
732 
733 
734 /*
735  * Cross call wrapper routine for cu_cpc_program()
736  *
737  * Checks to make sure that counters on CPU aren't being used by someone else
738  * before calling cu_cpc_program() since cu_cpc_program() needs to assert that
739  * nobody else is using the counters to catch and prevent any broken code.
740  * Also, this check needs to happen on the target CPU since the CPU's CPC
741  * context can only be changed while running on the CPU.
742  *
743  * If the first argument is TRUE, cu_cpc_program_xcall also checks that there is
744  * no valid thread bound cpc context. This is important to check to prevent
745  * re-programming thread counters with CU counters when CPU is coming on-line.
746  */
747 static void
748 cu_cpc_program_xcall(uintptr_t arg, int *err)
749 {
750 	boolean_t	avoid_thread_context = (boolean_t)arg;
751 
752 	kpreempt_disable();
753 
754 	if (CPU->cpu_cpc_ctx != NULL &&
755 	    !(CPU->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) {
756 		*err = -100;
757 		kpreempt_enable();
758 		return;
759 	}
760 
761 	if (avoid_thread_context && (curthread->t_cpc_ctx != NULL) &&
762 	    !(curthread->t_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) {
763 		*err = -200;
764 		kpreempt_enable();
765 		return;
766 	}
767 
768 	cu_cpc_program(CPU, err);
769 	kpreempt_enable();
770 }
771 
772 
773 /*
774  * Unprogram counters for capacity and utilization on given CPU
775  * This function should be always executed on the target CPU at high PIL
776  */
777 void
778 cu_cpc_unprogram(cpu_t *cp, int *err)
779 {
780 	cu_cpc_ctx_t	*cpu_ctx;
781 	kcpc_ctx_t	*ctx;
782 	cu_cpu_info_t	*cu_cpu_info;
783 
784 	ASSERT(IS_HIPIL());
785 	/*
786 	 * Should be running on given CPU with preemption disabled to keep CPU
787 	 * from disappearing and make sure flags and CPC context don't change
788 	 * from underneath us
789 	 */
790 	kpreempt_disable();
791 	ASSERT(cp == CPU);
792 
793 	/*
794 	 * Module not on
795 	 */
796 	if (!(cu_flags & CU_FLAG_ON)) {
797 		*err = -1;
798 		kpreempt_enable();
799 		return;
800 	}
801 
802 	cu_cpu_info = cp->cpu_cu_info;
803 	if (cu_cpu_info == NULL) {
804 		*err = -3;
805 		kpreempt_enable();
806 		return;
807 	}
808 
809 	/*
810 	 * Counters turned off already
811 	 */
812 	if (!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON)) {
813 		*err = 1;
814 		kpreempt_enable();
815 		return;
816 	}
817 
818 	/*
819 	 * Update counter statistics
820 	 */
821 	(void) cu_cpu_update(cp, B_FALSE);
822 
823 	/*
824 	 * Get CPU's CPC context needed for capacity and utilization
825 	 */
826 	cpu_ctx = &cu_cpu_info->cu_cpc_ctx;
827 	if (cpu_ctx->nctx <= 0 || cpu_ctx->ctx_ptr_array == NULL ||
828 	    cpu_ctx->ctx_ptr_array_sz <= 0) {
829 		*err = -5;
830 		kpreempt_enable();
831 		return;
832 	}
833 	ctx = cpu_ctx->ctx_ptr_array[cpu_ctx->cur_index];
834 
835 	/*
836 	 * CPU's CPC context should be current capacity and utilization CPC
837 	 * context
838 	 */
839 	ASSERT(cp->cpu_cpc_ctx == ctx);
840 	if (cp->cpu_cpc_ctx != ctx) {
841 		*err = -6;
842 		kpreempt_enable();
843 		return;
844 	}
845 
846 	/*
847 	 * Unprogram counters on CPU.
848 	 */
849 	kcpc_unprogram(ctx, B_FALSE);
850 
851 	ASSERT(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED);
852 
853 	/*
854 	 * Unset state in CPU structure saying that CPU's counters are
855 	 * programmed
856 	 */
857 	cp->cpu_cpc_ctx = NULL;
858 	cu_cpu_info->cu_flag &= ~CU_CPU_CNTRS_ON;
859 
860 	*err = 0;
861 	kpreempt_enable();
862 }
863 
864 
865 /*
866  * Add given counter event to list of CPC requests
867  */
868 static int
869 cu_cpc_req_add(char *event, kcpc_request_list_t *reqs, int nreqs,
870     cu_cntr_stats_t *stats, int kmem_flags, int *nevents)
871 {
872 	int	n;
873 	int	retval;
874 	uint_t  flags;
875 
876 	/*
877 	 * Return error when no counter event specified, counter event not
878 	 * supported by CPC's PCBE, or number of events not given
879 	 */
880 	if (event == NULL || kcpc_event_supported(event) == B_FALSE ||
881 	    nevents == NULL)
882 		return (-1);
883 
884 	n = *nevents;
885 
886 	/*
887 	 * Only count number of counter events needed if list
888 	 * where to add CPC requests not given
889 	 */
890 	if (reqs == NULL) {
891 		n++;
892 		*nevents = n;
893 		return (-3);
894 	}
895 
896 	/*
897 	 * Return error when stats not given or not enough room on list of CPC
898 	 * requests for more counter events
899 	 */
900 	if (stats == NULL || (nreqs <= 0 && n >= nreqs))
901 		return (-4);
902 
903 	/*
904 	 * Use flags in cu_cpc_flags to program counters and enable overflow
905 	 * interrupts/traps (unless PCBE can't handle overflow interrupts) so
906 	 * PCBE can catch counters before they wrap to hopefully give us an
907 	 * accurate (64-bit) virtualized counter
908 	 */
909 	flags = cu_cpc_flags;
910 	if ((kcpc_pcbe_capabilities() & CPC_CAP_OVERFLOW_INTERRUPT) == 0)
911 		flags &= ~CPC_OVF_NOTIFY_EMT;
912 
913 	/*
914 	 * Add CPC request to list
915 	 */
916 	retval = kcpc_reqs_add(reqs, event, cu_cpc_preset_value,
917 	    flags, 0, NULL, &stats[n], kmem_flags);
918 
919 	if (retval != 0)
920 		return (-5);
921 
922 	n++;
923 	*nevents = n;
924 	return (0);
925 }
926 
927 static void
928 cu_cpu_info_detach_xcall(void)
929 {
930 	ASSERT(IS_HIPIL());
931 
932 	CPU->cpu_cu_info = NULL;
933 }
934 
935 
936 /*
937  * Enable or disable collection of capacity/utilization data for a current CPU.
938  * Counters are enabled if 'on' argument is True and disabled if it is False.
939  * This function should be always executed at high PIL
940  */
941 static void
942 cu_cpc_trigger(uintptr_t arg1, uintptr_t arg2)
943 {
944 	cpu_t		*cp = (cpu_t *)arg1;
945 	boolean_t	on = (boolean_t)arg2;
946 	int		error;
947 	cu_cpu_info_t	*cu_cpu_info;
948 
949 	ASSERT(IS_HIPIL());
950 	kpreempt_disable();
951 	ASSERT(cp == CPU);
952 
953 	if (!(cu_flags & CU_FLAG_ON)) {
954 		kpreempt_enable();
955 		return;
956 	}
957 
958 	cu_cpu_info = cp->cpu_cu_info;
959 	if (cu_cpu_info == NULL) {
960 		kpreempt_enable();
961 		return;
962 	}
963 
964 	ASSERT(!cu_cpu_info->cu_disabled ||
965 	    !(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON));
966 
967 	if (on) {
968 		/*
969 		 * Decrement the cu_disabled counter.
970 		 * Once it drops to zero, call cu_cpc_program.
971 		 */
972 		if (cu_cpu_info->cu_disabled > 0)
973 			cu_cpu_info->cu_disabled--;
974 		if (cu_cpu_info->cu_disabled == 0)
975 			cu_cpc_program(CPU, &error);
976 	} else if (cu_cpu_info->cu_disabled++ == 0) {
977 		/*
978 		 * This is the first attempt to disable CU, so turn it off
979 		 */
980 		cu_cpc_unprogram(cp, &error);
981 		ASSERT(!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON));
982 	}
983 
984 	kpreempt_enable();
985 }
986 
987 
988 /*
989  * Callback for changes in CPU states
990  * Used to enable or disable hardware performance counters on CPUs that are
991  * turned on or off
992  *
993  * NOTE: cpc should be programmed/unprogrammed while running on the target CPU.
994  * We have to use thread_affinity_set to hop to the right CPU because these
995  * routines expect cpu_lock held, so we can't cross-call other CPUs while
996  * holding CPU lock.
997  */
998 static int
999 /* LINTED E_FUNC_ARG_UNUSED */
1000 cu_cpu_callback(cpu_setup_t what, int id, void *arg)
1001 {
1002 	cpu_t	*cp;
1003 	int	retval = 0;
1004 
1005 	ASSERT(MUTEX_HELD(&cpu_lock));
1006 
1007 	if (!(cu_flags & CU_FLAG_ON))
1008 		return (-1);
1009 
1010 	cp = cpu_get(id);
1011 	if (cp == NULL)
1012 		return (-2);
1013 
1014 	switch (what) {
1015 	case CPU_ON:
1016 		/*
1017 		 * Setup counters on CPU being turned on
1018 		 */
1019 		retval = cu_cpu_init(cp, cu_cpc_reqs);
1020 
1021 		/*
1022 		 * Reset list of counter event requests so its space can be
1023 		 * reused for a different set of requests for next CPU
1024 		 */
1025 		(void) kcpc_reqs_reset(cu_cpc_reqs);
1026 		break;
1027 	case CPU_INTR_ON:
1028 		/*
1029 		 * Setup counters on CPU being turned on.
1030 		 */
1031 		retval = cu_cpu_run(cp, cu_cpc_program_xcall,
1032 		    (uintptr_t)B_TRUE);
1033 		break;
1034 	case CPU_OFF:
1035 		/*
1036 		 * Disable counters on CPU being turned off. Counters will not
1037 		 * be re-enabled on this CPU until it comes back online.
1038 		 */
1039 		cu_cpu_disable(cp);
1040 		ASSERT(!CU_CPC_ON(cp));
1041 		retval = cu_cpu_fini(cp);
1042 		break;
1043 	default:
1044 		break;
1045 	}
1046 	return (retval);
1047 }
1048 
1049 
1050 /*
1051  * Disable or enable Capacity Utilization counters on a given CPU. This function
1052  * can be called from any CPU to disable counters on the given CPU.
1053  */
1054 static void
1055 cu_cpu_disable(cpu_t *cp)
1056 {
1057 	cpu_call(cp, cu_cpc_trigger, (uintptr_t)cp, (uintptr_t)B_FALSE);
1058 }
1059 
1060 
1061 static void
1062 cu_cpu_enable(cpu_t *cp)
1063 {
1064 	cpu_call(cp, cu_cpc_trigger, (uintptr_t)cp, (uintptr_t)B_TRUE);
1065 }
1066 
1067 
1068 /*
1069  * Setup capacity and utilization support for given CPU
1070  *
1071  * NOTE: Use KM_NOSLEEP for kmem_{,z}alloc() since cpu_lock is held and free
1072  *	 everything that has been successfully allocated including cpu_cu_info
1073  *	if any memory allocation fails
1074  */
1075 static int
1076 cu_cpu_init(cpu_t *cp, kcpc_request_list_t *reqs)
1077 {
1078 	kcpc_ctx_t	**ctx_ptr_array;
1079 	size_t		ctx_ptr_array_sz;
1080 	cu_cpc_ctx_t	*cpu_ctx;
1081 	cu_cpu_info_t	*cu_cpu_info;
1082 	int		n;
1083 
1084 	/*
1085 	 * cpu_lock should be held and protect against CPU going away and races
1086 	 * with cu_{init,fini,cpu_fini}()
1087 	 */
1088 	ASSERT(MUTEX_HELD(&cpu_lock));
1089 
1090 	/*
1091 	 * Return if not ready to setup counters yet
1092 	 */
1093 	if (!(cu_flags & CU_FLAG_READY))
1094 		return (-1);
1095 
1096 	if (cp->cpu_cu_info == NULL) {
1097 		cp->cpu_cu_info = kmem_zalloc(sizeof (cu_cpu_info_t),
1098 		    KM_NOSLEEP);
1099 		if (cp->cpu_cu_info == NULL)
1100 			return (-2);
1101 	}
1102 
1103 	/*
1104 	 * Get capacity and utilization CPC context for CPU and check to see
1105 	 * whether it has been setup already
1106 	 */
1107 	cu_cpu_info = cp->cpu_cu_info;
1108 	cu_cpu_info->cu_cpu = cp;
1109 	cu_cpu_info->cu_disabled = dtrace_cpc_in_use ? 1 : 0;
1110 
1111 	cpu_ctx = &cu_cpu_info->cu_cpc_ctx;
1112 	if (cpu_ctx->nctx > 0 && cpu_ctx->ctx_ptr_array != NULL &&
1113 	    cpu_ctx->ctx_ptr_array_sz > 0) {
1114 		return (1);
1115 	}
1116 
1117 	/*
1118 	 * Should have no contexts since it hasn't been setup already
1119 	 */
1120 	ASSERT(cpu_ctx->nctx == 0 && cpu_ctx->ctx_ptr_array == NULL &&
1121 	    cpu_ctx->ctx_ptr_array_sz == 0);
1122 
1123 	/*
1124 	 * Determine how many CPC events needed to measure capacity and
1125 	 * utilization for this CPU, allocate space for counter statistics for
1126 	 * each event, and fill in list of CPC event requests with corresponding
1127 	 * counter stats for each request to make attributing counter data
1128 	 * easier later....
1129 	 */
1130 	n = cu_cpc_init(cp, NULL, 0);
1131 	if (n <= 0) {
1132 		(void) cu_cpu_fini(cp);
1133 		return (-3);
1134 	}
1135 
1136 	cu_cpu_info->cu_cntr_stats = kmem_zalloc(n * sizeof (cu_cntr_stats_t),
1137 	    KM_NOSLEEP);
1138 	if (cu_cpu_info->cu_cntr_stats == NULL) {
1139 		(void) cu_cpu_fini(cp);
1140 		return (-4);
1141 	}
1142 
1143 	cu_cpu_info->cu_ncntr_stats = n;
1144 
1145 	n = cu_cpc_init(cp, reqs, n);
1146 	if (n <= 0) {
1147 		(void) cu_cpu_fini(cp);
1148 		return (-5);
1149 	}
1150 
1151 	/*
1152 	 * Create CPC context with given requests
1153 	 */
1154 	ctx_ptr_array = NULL;
1155 	ctx_ptr_array_sz = 0;
1156 	n = kcpc_cpu_ctx_create(cp, reqs, KM_NOSLEEP, &ctx_ptr_array,
1157 	    &ctx_ptr_array_sz);
1158 	if (n <= 0) {
1159 		(void) cu_cpu_fini(cp);
1160 		return (-6);
1161 	}
1162 
1163 	/*
1164 	 * Should have contexts
1165 	 */
1166 	ASSERT(n > 0 && ctx_ptr_array != NULL && ctx_ptr_array_sz > 0);
1167 	if (ctx_ptr_array == NULL || ctx_ptr_array_sz <= 0) {
1168 		(void) cu_cpu_fini(cp);
1169 		return (-7);
1170 	}
1171 
1172 	/*
1173 	 * Fill in CPC context info for CPU needed for capacity and utilization
1174 	 */
1175 	cpu_ctx->cur_index = 0;
1176 	cpu_ctx->nctx = n;
1177 	cpu_ctx->ctx_ptr_array = ctx_ptr_array;
1178 	cpu_ctx->ctx_ptr_array_sz = ctx_ptr_array_sz;
1179 	return (0);
1180 }
1181 
1182 /*
1183  * Tear down capacity and utilization support for given CPU
1184  */
1185 static int
1186 cu_cpu_fini(cpu_t *cp)
1187 {
1188 	kcpc_ctx_t	*ctx;
1189 	cu_cpc_ctx_t	*cpu_ctx;
1190 	cu_cpu_info_t	*cu_cpu_info;
1191 	int		i;
1192 	pghw_type_t	pg_hw_type;
1193 
1194 	/*
1195 	 * cpu_lock should be held and protect against CPU going away and races
1196 	 * with cu_{init,fini,cpu_init}()
1197 	 */
1198 	ASSERT(MUTEX_HELD(&cpu_lock));
1199 
1200 	/*
1201 	 * Have to at least be ready to setup counters to have allocated
1202 	 * anything that needs to be deallocated now
1203 	 */
1204 	if (!(cu_flags & CU_FLAG_READY))
1205 		return (-1);
1206 
1207 	/*
1208 	 * Nothing to do if CPU's capacity and utilization info doesn't exist
1209 	 */
1210 	cu_cpu_info = cp->cpu_cu_info;
1211 	if (cu_cpu_info == NULL)
1212 		return (1);
1213 
1214 	/*
1215 	 * Tear down any existing kstats and counter info for each hardware
1216 	 * sharing relationship
1217 	 */
1218 	for (pg_hw_type = PGHW_START; pg_hw_type < PGHW_NUM_COMPONENTS;
1219 	    pg_hw_type++) {
1220 		cu_cntr_info_t	*cntr_info;
1221 
1222 		cntr_info = cu_cpu_info->cu_cntr_info[pg_hw_type];
1223 		if (cntr_info == NULL)
1224 			continue;
1225 
1226 		if (cntr_info->ci_kstat != NULL) {
1227 			kstat_delete(cntr_info->ci_kstat);
1228 			cntr_info->ci_kstat = NULL;
1229 		}
1230 		kmem_free(cntr_info, sizeof (cu_cntr_info_t));
1231 	}
1232 
1233 	/*
1234 	 * Free counter statistics for CPU
1235 	 */
1236 	ASSERT(cu_cpu_info->cu_cntr_stats == NULL ||
1237 	    cu_cpu_info->cu_ncntr_stats > 0);
1238 	if (cu_cpu_info->cu_cntr_stats != NULL &&
1239 	    cu_cpu_info->cu_ncntr_stats > 0) {
1240 		kmem_free(cu_cpu_info->cu_cntr_stats,
1241 		    cu_cpu_info->cu_ncntr_stats * sizeof (cu_cntr_stats_t));
1242 		cu_cpu_info->cu_cntr_stats = NULL;
1243 		cu_cpu_info->cu_ncntr_stats = 0;
1244 	}
1245 
1246 	/*
1247 	 * Get capacity and utilization CPC contexts for given CPU and check to
1248 	 * see whether they have been freed already
1249 	 */
1250 	cpu_ctx = &cu_cpu_info->cu_cpc_ctx;
1251 	if (cpu_ctx != NULL && cpu_ctx->ctx_ptr_array != NULL &&
1252 	    cpu_ctx->ctx_ptr_array_sz > 0) {
1253 		/*
1254 		 * Free CPC contexts for given CPU
1255 		 */
1256 		for (i = 0; i < cpu_ctx->nctx; i++) {
1257 			ctx = cpu_ctx->ctx_ptr_array[i];
1258 			if (ctx == NULL)
1259 				continue;
1260 			kcpc_free(ctx, 0);
1261 		}
1262 
1263 		/*
1264 		 * Free CPC context pointer array
1265 		 */
1266 		kmem_free(cpu_ctx->ctx_ptr_array, cpu_ctx->ctx_ptr_array_sz);
1267 
1268 		/*
1269 		 * Zero CPC info for CPU
1270 		 */
1271 		bzero(cpu_ctx, sizeof (cu_cpc_ctx_t));
1272 	}
1273 
1274 	/*
1275 	 * Set cp->cpu_cu_info pointer to NULL. Go through cross-call to ensure
1276 	 * that no one is going to access the cpu_cu_info whicch we are going to
1277 	 * free.
1278 	 */
1279 	if (cpu_is_online(cp))
1280 		cpu_call(cp, (cpu_call_func_t)cu_cpu_info_detach_xcall, 0, 0);
1281 	else
1282 		cp->cpu_cu_info = NULL;
1283 
1284 	/*
1285 	 * Free CPU's capacity and utilization info
1286 	 */
1287 	kmem_free(cu_cpu_info, sizeof (cu_cpu_info_t));
1288 
1289 	return (0);
1290 }
1291 
1292 /*
1293  * Create capacity & utilization kstats for given PG CPU hardware sharing
1294  * relationship
1295  */
1296 static void
1297 cu_cpu_kstat_create(pghw_t *pg, cu_cntr_info_t *cntr_info)
1298 {
1299 	kstat_t		*ks;
1300 	char 		*sharing = pghw_type_string(pg->pghw_hw);
1301 	char		name[KSTAT_STRLEN + 1];
1302 
1303 	/*
1304 	 * Just return when no counter info or CPU
1305 	 */
1306 	if (cntr_info == NULL || cntr_info->ci_cpu == NULL)
1307 		return;
1308 
1309 	/*
1310 	 * Canonify PG name to conform to kstat name rules
1311 	 */
1312 	(void) strncpy(name, pghw_type_string(pg->pghw_hw), KSTAT_STRLEN + 1);
1313 	strident_canon(name, TASKQ_NAMELEN + 1);
1314 
1315 	if ((ks = kstat_create_zone("pg_hw_perf_cpu",
1316 	    cntr_info->ci_cpu->cpu_id,
1317 	    name, "processor_group", KSTAT_TYPE_NAMED,
1318 	    sizeof (cu_cpu_kstat) / sizeof (kstat_named_t),
1319 	    KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID)) == NULL)
1320 		return;
1321 
1322 	ks->ks_lock = &pg_cpu_kstat_lock;
1323 	ks->ks_data = &cu_cpu_kstat;
1324 	ks->ks_update = cu_cpu_kstat_update;
1325 	ks->ks_data_size += strlen(sharing) + 1;
1326 
1327 	ks->ks_private = cntr_info;
1328 	cntr_info->ci_kstat = ks;
1329 	kstat_install(cntr_info->ci_kstat);
1330 }
1331 
1332 
1333 /*
1334  * Propagate values from CPU capacity & utilization stats to kstats
1335  */
1336 static int
1337 cu_cpu_kstat_update(kstat_t *ksp, int rw)
1338 {
1339 	cpu_t		*cp;
1340 	cu_cntr_info_t	*cntr_info = ksp->ks_private;
1341 	struct cu_cpu_kstat	*kstat = &cu_cpu_kstat;
1342 	pghw_t		*pg;
1343 	cu_cntr_stats_t	*stats;
1344 
1345 	if (rw == KSTAT_WRITE)
1346 		return (EACCES);
1347 
1348 	cp = cntr_info->ci_cpu;
1349 	pg = cntr_info->ci_pg;
1350 	kstat->cu_cpu_id.value.ui32 = cp->cpu_id;
1351 	kstat->cu_pg_id.value.i32 = ((pg_t *)pg)->pg_id;
1352 
1353 	/*
1354 	 * The caller should have priv_cpc_cpu privilege to get utilization
1355 	 * data. Callers who do not have the privilege will see zeroes as the
1356 	 * values.
1357 	 */
1358 	if (secpolicy_cpc_cpu(crgetcred()) != 0) {
1359 		kstat->cu_generation.value.ui32 = cp->cpu_generation;
1360 		kstat_named_setstr(&kstat->cu_cpu_relationship,
1361 		    pghw_type_string(pg->pghw_hw));
1362 
1363 		kstat->cu_cpu_util.value.ui64 = 0;
1364 		kstat->cu_cpu_rate.value.ui64 = 0;
1365 		kstat->cu_cpu_rate_max.value.ui64 = 0;
1366 		kstat->cu_cpu_time_running.value.ui64 = 0;
1367 		kstat->cu_cpu_time_stopped.value.ui64 = 0;
1368 
1369 		return (0);
1370 	}
1371 
1372 	kpreempt_disable();
1373 
1374 	/*
1375 	 * Update capacity and utilization statistics needed for CPU's PG (CPU)
1376 	 * kstats
1377 	 */
1378 
1379 	(void) cu_cpu_update(cp, B_TRUE);
1380 
1381 	stats = cntr_info->ci_stats;
1382 	kstat->cu_generation.value.ui32 = cp->cpu_generation;
1383 	kstat_named_setstr(&kstat->cu_cpu_relationship,
1384 	    pghw_type_string(pg->pghw_hw));
1385 
1386 	kstat->cu_cpu_util.value.ui64 = stats->cs_value_total;
1387 	kstat->cu_cpu_rate.value.ui64 = stats->cs_rate;
1388 	kstat->cu_cpu_rate_max.value.ui64 = stats->cs_rate_max;
1389 	kstat->cu_cpu_time_running.value.ui64 = stats->cs_time_running;
1390 	kstat->cu_cpu_time_stopped.value.ui64 = stats->cs_time_stopped;
1391 
1392 	/*
1393 	 * Counters are stopped now, so the cs_time_stopped was last
1394 	 * updated at cs_time_start time. Add the time passed since then
1395 	 * to the stopped time.
1396 	 */
1397 	if (!(cp->cpu_cu_info->cu_flag & CU_CPU_CNTRS_ON))
1398 		kstat->cu_cpu_time_stopped.value.ui64 +=
1399 		    gethrtime() - stats->cs_time_start;
1400 
1401 	kpreempt_enable();
1402 
1403 	return (0);
1404 }
1405 
1406 /*
1407  * Run specified function with specified argument on a given CPU and return
1408  * whatever the function returns
1409  */
1410 static int
1411 cu_cpu_run(cpu_t *cp, cu_cpu_func_t func, uintptr_t arg)
1412 {
1413 	int error = 0;
1414 
1415 	/*
1416 	 * cpu_call() will call func on the CPU specified with given argument
1417 	 * and return func's return value in last argument
1418 	 */
1419 	cpu_call(cp, (cpu_call_func_t)func, arg, (uintptr_t)&error);
1420 	return (error);
1421 }
1422 
1423 
1424 /*
1425  * Update counter statistics on a given CPU.
1426  *
1427  * If move_to argument is True, execute the function on the CPU specified
1428  * Otherwise, assume that it is already runninng on the right CPU
1429  *
1430  * If move_to is specified, the caller should hold cpu_lock or have preemption
1431  * disabled. Otherwise it is up to the caller to guarantee that things do not
1432  * change in the process.
1433  */
1434 int
1435 cu_cpu_update(struct cpu *cp, boolean_t move_to)
1436 {
1437 	int	retval;
1438 	cu_cpu_info_t	*cu_cpu_info = cp->cpu_cu_info;
1439 	hrtime_t	time_snap;
1440 
1441 	ASSERT(!move_to || MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0);
1442 
1443 	/*
1444 	 * Nothing to do if counters are not programmed
1445 	 */
1446 	if (!(cu_flags & CU_FLAG_ON) ||
1447 	    (cu_cpu_info == NULL) ||
1448 	    !(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON))
1449 		return (0);
1450 
1451 	/*
1452 	 * Don't update CPU statistics if it was updated recently
1453 	 * and provide old results instead
1454 	 */
1455 	time_snap = gethrtime();
1456 	if ((time_snap - cu_cpu_info->cu_sample_time) < cu_update_threshold) {
1457 		DTRACE_PROBE1(cu__drop__sample, cpu_t *, cp);
1458 		return (0);
1459 	}
1460 
1461 	cu_cpu_info->cu_sample_time = time_snap;
1462 
1463 	/*
1464 	 * CPC counter should be read on the CPU that is running the counter. We
1465 	 * either have to move ourselves to the target CPU or insure that we
1466 	 * already run there.
1467 	 *
1468 	 * We use cross-call to the target CPU to execute kcpc_read() and
1469 	 * cu_cpu_update_stats() there.
1470 	 */
1471 	retval = 0;
1472 	if (move_to)
1473 		(void) cu_cpu_run(cp, (cu_cpu_func_t)kcpc_read,
1474 		    (uintptr_t)cu_cpu_update_stats);
1475 	else {
1476 		retval = kcpc_read((kcpc_update_func_t)cu_cpu_update_stats);
1477 		/*
1478 		 * Offset negative return value by -10 so we can distinguish it
1479 		 * from error return values of this routine vs kcpc_read()
1480 		 */
1481 		if (retval < 0)
1482 			retval -= 10;
1483 	}
1484 
1485 	return (retval);
1486 }
1487 
1488 
1489 /*
1490  * Update CPU counter statistics for current CPU.
1491  * This function may be called from a cross-call
1492  */
1493 static int
1494 cu_cpu_update_stats(cu_cntr_stats_t *stats, uint64_t cntr_value)
1495 {
1496 	cu_cpu_info_t	*cu_cpu_info = CPU->cpu_cu_info;
1497 	uint_t		flags;
1498 	uint64_t	delta;
1499 	hrtime_t	time_delta;
1500 	hrtime_t	time_snap;
1501 
1502 	if (stats == NULL)
1503 		return (-1);
1504 
1505 	/*
1506 	 * Nothing to do if counters are not programmed. This should not happen,
1507 	 * but we check just in case.
1508 	 */
1509 	ASSERT(cu_flags & CU_FLAG_ON);
1510 	ASSERT(cu_cpu_info != NULL);
1511 	if (!(cu_flags & CU_FLAG_ON) ||
1512 	    (cu_cpu_info == NULL))
1513 		return (-2);
1514 
1515 	flags = cu_cpu_info->cu_flag;
1516 	ASSERT(flags & CU_CPU_CNTRS_ON);
1517 	if (!(flags & CU_CPU_CNTRS_ON))
1518 		return (-2);
1519 
1520 	/*
1521 	 * Take snapshot of high resolution timer
1522 	 */
1523 	time_snap = gethrtime();
1524 
1525 	/*
1526 	 * CU counters have just been programmed. We cannot assume that the new
1527 	 * cntr_value continues from where we left off, so use the cntr_value as
1528 	 * the new initial value.
1529 	 */
1530 	if (flags & CU_CPU_CNTRS_OFF_ON)
1531 		stats->cs_value_start = cntr_value;
1532 
1533 	/*
1534 	 * Calculate delta in counter values between start of sampling period
1535 	 * and now
1536 	 */
1537 	delta = cntr_value - stats->cs_value_start;
1538 
1539 	/*
1540 	 * Calculate time between start of sampling period and now
1541 	 */
1542 	time_delta = stats->cs_time_start ?
1543 	    time_snap - stats->cs_time_start :
1544 	    0;
1545 	stats->cs_time_start = time_snap;
1546 	stats->cs_value_start = cntr_value;
1547 
1548 	if (time_delta > 0) { /* wrap shouldn't happen */
1549 		/*
1550 		 * Update either running or stopped time based on the transition
1551 		 * state
1552 		 */
1553 		if (flags & CU_CPU_CNTRS_OFF_ON)
1554 			stats->cs_time_stopped += time_delta;
1555 		else
1556 			stats->cs_time_running += time_delta;
1557 	}
1558 
1559 	/*
1560 	 * Update rest of counter statistics if counter value didn't wrap
1561 	 */
1562 	if (delta > 0) {
1563 		/*
1564 		 * Update utilization rate if the interval between samples is
1565 		 * sufficient.
1566 		 */
1567 		ASSERT(cu_sample_interval_min > CU_SCALE);
1568 		if (time_delta > cu_sample_interval_min)
1569 			stats->cs_rate = CU_RATE(delta, time_delta);
1570 		if (stats->cs_rate_max < stats->cs_rate)
1571 			stats->cs_rate_max = stats->cs_rate;
1572 
1573 		stats->cs_value_last = delta;
1574 		stats->cs_value_total += delta;
1575 	}
1576 
1577 	return (0);
1578 }
1579 
1580 /*
1581  * Update CMT PG utilization data.
1582  *
1583  * This routine computes the running total utilization and times for the
1584  * specified PG by adding up the total utilization and counter running and
1585  * stopped times of all CPUs in the PG and calculates the utilization rate and
1586  * maximum rate for all CPUs in the PG.
1587  */
1588 void
1589 cu_pg_update(pghw_t *pg)
1590 {
1591 	pg_cpu_itr_t	cpu_iter;
1592 	pghw_type_t	pg_hwtype;
1593 	cpu_t		*cpu;
1594 	pghw_util_t	*hw_util = &pg->pghw_stats;
1595 	uint64_t	old_utilization = hw_util->pghw_util;
1596 	hrtime_t	now;
1597 	hrtime_t	time_delta;
1598 	uint64_t	utilization_delta;
1599 
1600 	ASSERT(MUTEX_HELD(&cpu_lock));
1601 
1602 	now = gethrtime();
1603 
1604 	pg_hwtype = pg->pghw_hw;
1605 
1606 	/*
1607 	 * Initialize running total utilization and times for PG to 0
1608 	 */
1609 	hw_util->pghw_util = 0;
1610 	hw_util->pghw_time_running = 0;
1611 	hw_util->pghw_time_stopped = 0;
1612 
1613 	/*
1614 	 * Iterate over all CPUs in the PG and aggregate utilization, running
1615 	 * time and stopped time.
1616 	 */
1617 	PG_CPU_ITR_INIT(pg, cpu_iter);
1618 	while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
1619 		cu_cpu_info_t	*cu_cpu_info = cpu->cpu_cu_info;
1620 		cu_cntr_info_t	*cntr_info;
1621 		cu_cntr_stats_t	*stats;
1622 
1623 		if (cu_cpu_info == NULL)
1624 			continue;
1625 
1626 		/*
1627 		 * Update utilization data for the CPU and then
1628 		 * aggregate per CPU running totals for PG
1629 		 */
1630 		(void) cu_cpu_update(cpu, B_TRUE);
1631 		cntr_info = cu_cpu_info->cu_cntr_info[pg_hwtype];
1632 
1633 		if (cntr_info == NULL || (stats = cntr_info->ci_stats) == NULL)
1634 			continue;
1635 
1636 		hw_util->pghw_util += stats->cs_value_total;
1637 		hw_util->pghw_time_running += stats->cs_time_running;
1638 		hw_util->pghw_time_stopped += stats->cs_time_stopped;
1639 
1640 		/*
1641 		 * If counters are stopped now, the pg_time_stopped was last
1642 		 * updated at cs_time_start time. Add the time passed since then
1643 		 * to the stopped time.
1644 		 */
1645 		if (!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON))
1646 			hw_util->pghw_time_stopped +=
1647 			    now - stats->cs_time_start;
1648 	}
1649 
1650 	/*
1651 	 * Compute per PG instruction rate and maximum rate
1652 	 */
1653 	time_delta = now - hw_util->pghw_time_stamp;
1654 	hw_util->pghw_time_stamp = now;
1655 
1656 	if (old_utilization == 0)
1657 		return;
1658 
1659 	/*
1660 	 * Calculate change in utilization over sampling period and set this to
1661 	 * 0 if the delta would be 0 or negative which may happen if any CPUs go
1662 	 * offline during the sampling period
1663 	 */
1664 	if (hw_util->pghw_util > old_utilization)
1665 		utilization_delta = hw_util->pghw_util - old_utilization;
1666 	else
1667 		utilization_delta = 0;
1668 
1669 	/*
1670 	 * Update utilization rate if the interval between samples is
1671 	 * sufficient.
1672 	 */
1673 	ASSERT(cu_sample_interval_min > CU_SCALE);
1674 	if (time_delta > CU_SAMPLE_INTERVAL_MIN)
1675 		hw_util->pghw_rate = CU_RATE(utilization_delta, time_delta);
1676 
1677 	/*
1678 	 * Update the maximum observed rate
1679 	 */
1680 	if (hw_util->pghw_rate_max < hw_util->pghw_rate)
1681 		hw_util->pghw_rate_max = hw_util->pghw_rate;
1682 }
1683