xref: /titanic_41/usr/src/uts/common/os/cap_util.c (revision eb82ff87b34e625264561b2d267577cf9821dab0)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Support for determining capacity and utilization of performance relevant
29  * hardware components in a computer
30  *
31  * THEORY
32  * ------
33  * The capacity and utilization of the performance relevant hardware components
34  * is needed to be able to optimize performance while minimizing the amount of
35  * power used on a system.  The idea is to use hardware performance counters
36  * and potentially other means to determine the capacity and utilization of
37  * performance relevant hardware components (eg. execution pipeline, cache,
38  * memory, etc.) and attribute the utilization to the responsible CPU and the
39  * thread running there.
40  *
41  * This will help characterize the utilization of performance relevant
42  * components and how much is used by each CPU and each thread.  With
43  * that data, the utilization can be aggregated to all the CPUs sharing each
44  * performance relevant hardware component to calculate the total utilization
45  * of each component and compare that with the component's capacity to
46  * essentially determine the actual hardware load of the component.  The
47  * hardware utilization attributed to each running thread can also be
48  * aggregated to determine the total hardware utilization of each component to
49  * a workload.
50  *
51  * Once that is done, one can determine how much of each performance relevant
52  * hardware component is needed by a given thread or set of threads (eg. a
53  * workload) and size up exactly what hardware is needed by the threads and how
54  * much.  With this info, we can better place threads among CPUs to match their
55  * exact hardware resource needs and potentially lower or raise the power based
56  * on their utilization or pack threads onto the fewest hardware components
57  * needed and power off any remaining unused components to minimize power
58  * without sacrificing performance.
59  *
60  * IMPLEMENTATION
61  * --------------
62  * The code has been designed and implemented to make (un)programming and
63  * reading the counters for a given CPU as lightweight and fast as possible.
64  * This is very important because we need to read and potentially (un)program
65  * the counters very often and in performance sensitive code.  Specifically,
66  * the counters may need to be (un)programmed during context switch and/or a
67  * cyclic handler when there are more counter events to count than existing
68  * counters.
69  *
70  * Consequently, the code has been split up to allow allocating and
71  * initializing everything needed to program and read the counters on a given
72  * CPU once and make (un)programming and reading the counters for a given CPU
73  * not have to allocate/free memory or grab any locks.  To do this, all the
74  * state needed to (un)program and read the counters on a CPU is kept per CPU
75  * and is made lock free by forcing any code that reads or manipulates the
76  * counters or the state needed to (un)program or read the counters to run on
77  * the target CPU and disable preemption while running on the target CPU to
78  * protect any critical sections. All counter manipulation on the target CPU is
79  * happening either from a cross-call to the target CPU or at the same PIL as
80  * used by the cross-call subsystem. This guarantees that counter manipulation
81  * is not interrupted by cross-calls from other CPUs.
82  *
83  * The synchronization has been made lock free or as simple as possible for
84  * performance and to avoid getting the locking all tangled up when we interpose
85  * on the CPC routines that (un)program the counters to manage the counters
86  * between the kernel and user on each CPU.  When the user starts using the
87  * counters on a given CPU, the kernel will unprogram the counters that it is
88  * using on that CPU just before they are programmed for the user.  Then the
89  * kernel will program the counters on a given CPU for its own use when the user
90  * stops using them.
91  *
 * There is a special interaction with the DTrace cpc provider (dcpc). Before
 * dcpc enables any probe, it requests that all counters used for capacity and
 * utilization be disabled and unprogrammed. These counters are not
 * re-programmed until dcpc completes. When all DTrace cpc probes are removed,
 * dcpc notifies the CU framework, which then re-programs the counters.
97  *
98  * When a CPU is going offline, its CU counters are unprogrammed and disabled,
99  * so that they would not be re-programmed again by some other activity on the
100  * CPU that is going offline.
101  *
102  * The counters are programmed during boot.  However, a flag is available to
103  * disable this if necessary (see cu_flag below).  A handler is provided to
104  * (un)program the counters during CPU on/offline.  Basic routines are provided
105  * to initialize and tear down this module, initialize and tear down any state
106  * needed for a given CPU, and (un)program the counters for a given CPU.
107  * Lastly, a handler is provided to read the counters and attribute the
108  * utilization to the responsible CPU.
109  */
110 #include <sys/types.h>
111 #include <sys/cmn_err.h>
112 #include <sys/cpuvar.h>
113 #include <sys/ddi.h>
114 #include <sys/disp.h>
115 #include <sys/sdt.h>
116 #include <sys/sunddi.h>
117 #include <sys/thread.h>
118 #include <sys/pghw.h>
119 #include <sys/cmt.h>
120 #include <sys/x_call.h>
121 #include <sys/cap_util.h>
122 
123 #include <sys/archsystm.h>
124 #include <sys/promif.h>
125 
126 #if defined(__x86)
127 #include <sys/xc_levels.h>
128 #endif
129 
130 
/*
 * Default CPU hardware performance counter flags to use for measuring capacity
 * and utilization.  This is the initial value of cu_cpc_flags.  Note that
 * cu_cpc_req_add() strips CPC_OVF_NOTIFY_EMT from the flags it actually uses
 * when the PCBE does not report CPC_CAP_OVERFLOW_INTERRUPT.
 */
#define	CU_CPC_FLAGS_DEFAULT	\
	(CPC_COUNT_USER|CPC_COUNT_SYSTEM|CPC_OVF_NOTIFY_EMT)
137 
/*
 * Possible Flags for controlling this module (bits kept in cu_flags).
 *
 * CU_FLAG_ENABLE is the initial value of cu_flags; cu_init() refuses to run
 * when it is clear.  cu_init() sets CU_FLAG_READY before it sets up per-CPU
 * state and sets CU_FLAG_ON once that setup is done.  The routines that
 * (un)program the counters return early while CU_FLAG_ON is clear.
 */
#define	CU_FLAG_ENABLE		1	/* Enable module */
#define	CU_FLAG_READY		2	/* Ready to setup module */
#define	CU_FLAG_ON		4	/* Module is on */
144 
/*
 * pg_cpu kstats calculate utilization rate and maximum utilization rate for
 * some CPUs. The rate is calculated based on data from two subsequent
 * snapshots. When the time between such two snapshots is too small, the
 * resulting rate may have low accuracy, so we only consider snapshots which
 * are separated by at least CU_SAMPLE_INTERVAL_MIN nanoseconds from one
 * another. We do not update the rate if the interval is smaller than that.
 *
 * Use one tenth of a second as the minimum interval for utilization rate
 * calculation.  CU_SAMPLE_INTERVAL_MIN is also the initial value of the
 * cu_sample_interval_min variable.
 *
 * NOTE: The CU_SAMPLE_INTERVAL_MIN should be higher than the scaling factor in
 * the CU_RATE() macro below to guarantee that we never divide by zero.
 *
 * Rate is the number of events per second. The rate is the number of events
 * divided by time and multiplied by the number of nanoseconds in a second. We
 * do not want time to be too small since it will cause large errors in
 * division.
 *
 * We do not want to multiply two large numbers (the instruction count and
 * NANOSEC) either since it may cause integer overflow. So we divide both the
 * numerator and the denominator by the same value (CU_SCALE).
 *
 * NOTE: The scaling factor below should be less than CU_SAMPLE_INTERVAL_MIN
 * above to guarantee that time divided by this value is always non-zero.
 *
 * CU_RATE() names CU_SCALE before CU_SCALE is #defined; that is fine because
 * the name is only expanded at the point where CU_RATE() is used.
 *
 * NOTE(review): CU_RATE() does not check its arguments itself.  A time smaller
 * than CU_SCALE makes the divisor zero, so callers are expected to enforce the
 * minimum sample interval before using it -- confirm at the call sites.
 */
#define	CU_RATE(val, time) \
	(((val) * (NANOSEC / CU_SCALE)) / ((time) / CU_SCALE))

#define	CU_SAMPLE_INTERVAL_MIN	(NANOSEC / 10)

#define	CU_SCALE (CU_SAMPLE_INTERVAL_MIN / 10000)
177 
/*
 * When the time between two kstat reads for the same CPU is less than
 * CU_UPDATE_THRESHOLD use the old counter data and skip updating counter values
 * for the CPU. This helps reduce cross-calls when kstat consumers read data
 * very often or when they read PG utilization data and then CPU utilization
 * data quickly after that.
 *
 * The value is one tenth of NANOSEC and is the initial value of the
 * cu_update_threshold variable.
 */
#define	CU_UPDATE_THRESHOLD (NANOSEC / 10)
186 
/*
 * The IS_HIPIL() macro verifies that the code is executed either from a
 * cross-call or from high-PIL interrupt.
 *
 * The PIL is only inspected on DEBUG kernels.  On non-DEBUG kernels the macro
 * evaluates to a constant true value instead of expanding to nothing, so it
 * remains a valid expression wherever it appears and does not depend on the
 * enclosing macro (e.g. ASSERT()) discarding its argument.
 */
#ifdef DEBUG
#define	IS_HIPIL() (getpil() >= XCALL_PIL)
#else
#define	IS_HIPIL() (1)
#endif	/* DEBUG */
196 
197 
/*
 * Type of the routine handed to cu_cpu_run() together with a uintptr_t
 * argument (cu_cpc_program_xcall() is one such routine).  The int pointer is
 * where the routine stores its status.
 */
typedef void (*cu_cpu_func_t)(uintptr_t, int *);
199 
200 
/*
 * Flags to use for programming CPU hardware performance counters to measure
 * capacity and utilization.  Read by cu_cpc_req_add() each time it builds a
 * CPC request.
 */
int				cu_cpc_flags = CU_CPC_FLAGS_DEFAULT;

/*
 * Initial value used for programming hardware counters.  Passed as the preset
 * to kcpc_reqs_add() by cu_cpc_req_add().
 */
uint64_t			cu_cpc_preset_value = 0;

/*
 * List of CPC event requests for capacity and utilization.  Allocated once by
 * cu_init() with room for cpc_ncounters requests and reset with
 * kcpc_reqs_reset() after each CPU has been set up so that the same space can
 * be reused for the next CPU.
 */
static kcpc_request_list_t	*cu_cpc_reqs = NULL;
216 
/*
 * When a CPU is a member of PG with a sharing relationship that is supported
 * by the capacity/utilization framework, a kstat is created for that CPU and
 * sharing relationship.
 *
 * These kstats are updated one at a time, so we can have a single scratch
 * space to fill the data.  The cu_cpu_kstat variable defined below is that
 * scratch space; its initializer gives the exported name and data type of
 * each field.
 *
 * CPU counter kstats fields (exported kstat name in parentheses):
 *
 *   cu_cpu_id		("id") CPU ID for this kstat
 *
 *   cu_generation	("generation") Generation value that increases
 *			  whenever any CPU goes offline or online. Two kstat
 *			  snapshots for the same CPU may only be compared if
 *			  they have the same generation.
 *
 *   cu_pg_id		("pg_id") PG ID for the relationship described by this
 *			  kstat
 *
 *   cu_cpu_util	("hw_util") Running value of CPU utilization for the
 *			  sharing relationship
 *
 *   cu_cpu_time_running ("hw_util_time_running") Total time spent collecting
 *			   CU data. The time may be less than wall time if CU
 *			   counters were stopped for some time.
 *
 *   cu_cpu_time_stopped ("hw_util_time_stopped") Total time the CU counters
 *			   were stopped.
 *
 *   cu_cpu_rate	("hw_util_rate") Utilization rate, expressed in
 *			  operations per second.
 *
 *   cu_cpu_rate_max	("hw_util_rate_max") Maximum observed value of
 *			  utilization rate.
 */
struct cu_cpu_kstat {
	kstat_named_t	cu_cpu_id;
	kstat_named_t	cu_generation;
	kstat_named_t	cu_pg_id;
	kstat_named_t	cu_cpu_util;
	kstat_named_t	cu_cpu_time_running;
	kstat_named_t	cu_cpu_time_stopped;
	kstat_named_t	cu_cpu_rate;
	kstat_named_t	cu_cpu_rate_max;
} cu_cpu_kstat = {
	{ "id",				KSTAT_DATA_UINT32 },
	{ "generation",			KSTAT_DATA_UINT32 },
	{ "pg_id",			KSTAT_DATA_LONG },
	{ "hw_util",			KSTAT_DATA_UINT64 },
	{ "hw_util_time_running",	KSTAT_DATA_UINT64 },
	{ "hw_util_time_stopped",	KSTAT_DATA_UINT64 },
	{ "hw_util_rate",		KSTAT_DATA_UINT64 },
	{ "hw_util_rate_max",		KSTAT_DATA_UINT64 },
};
268 
/*
 * Flags for controlling this module (CU_FLAG_* bits).  The module starts out
 * enabled; cu_init() adds CU_FLAG_READY and CU_FLAG_ON.
 */
uint_t				cu_flags = CU_FLAG_ENABLE;

/*
 * Error return value for cu_init() since it can't return anything to be called
 * from mp_init_tbl[] (:-(
 *
 * Values stored by cu_init():
 *
 *    0	no error
 *   -1	module not enabled or already on
 *   -2	kcpc_init() failed
 *   -3	no CPU hardware performance counters available
 *   -5	cu_cpu_init() failed for at least one CPU
 *   -6	cu_cpu_run() of cu_cpc_program_xcall() failed for at least one CPU
 */
static int			cu_init_error = 0;

/*
 * Variable holding the minimum sample interval, initialized from
 * CU_SAMPLE_INTERVAL_MIN.
 */
hrtime_t			cu_sample_interval_min = CU_SAMPLE_INTERVAL_MIN;

/*
 * Variable holding the kstat update threshold, initialized from
 * CU_UPDATE_THRESHOLD.
 */
hrtime_t			cu_update_threshold = CU_UPDATE_THRESHOLD;

/*
 * NOTE(review): no user of this lock is visible in this part of the file; by
 * its name it presumably serializes use of the shared cu_cpu_kstat scratch
 * space -- confirm against the kstat update code.
 */
static kmutex_t			pg_cpu_kstat_lock;
285 
286 
/*
 * Forward declaration of interface routines (non-static, callable from
 * outside this file)
 */
void		cu_disable(void);
void		cu_enable(void);
void		cu_init(void);
void		cu_cpc_program(cpu_t *cp, int *err);
void		cu_cpc_unprogram(cpu_t *cp, int *err);
int		cu_cpu_update(struct cpu *cp, boolean_t move_to);
void		cu_pg_update(pghw_t *pg);


/*
 * Forward declaration of private routines
 *
 * cu_cpc_trigger() is not listed here; it is defined before its first use in
 * cu_cpu_disable() and cu_cpu_enable().
 */
static int	cu_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs);
static void	cu_cpc_program_xcall(uintptr_t arg, int *err);
static int	cu_cpc_req_add(char *event, kcpc_request_list_t *reqs,
    int nreqs, cu_cntr_stats_t *stats, int kmem_flags, int *nevents);
static int	cu_cpu_callback(cpu_setup_t what, int id, void *arg);
static void	cu_cpu_disable(cpu_t *cp);
static void	cu_cpu_enable(cpu_t *cp);
static int	cu_cpu_init(cpu_t *cp, kcpc_request_list_t *reqs);
static int	cu_cpu_fini(cpu_t *cp);
static void	cu_cpu_kstat_create(pghw_t *pg, cu_cntr_info_t *cntr_info);
static int	cu_cpu_kstat_update(kstat_t *ksp, int rw);
static int	cu_cpu_run(cpu_t *cp, cu_cpu_func_t func, uintptr_t arg);
static int	cu_cpu_update_stats(cu_cntr_stats_t *stats,
    uint64_t cntr_value);
static void cu_cpu_info_detach_xcall(void);
317 
318 /*
319  * Disable or enable Capacity Utilization counters on all CPUs.
320  */
321 void
322 cu_disable(void)
323 {
324 	cpu_t *cp;
325 
326 	ASSERT(MUTEX_HELD(&cpu_lock));
327 
328 	cp = cpu_active;
329 	do {
330 		if (!(cp->cpu_flags & CPU_OFFLINE))
331 			cu_cpu_disable(cp);
332 	} while ((cp = cp->cpu_next_onln) != cpu_active);
333 }
334 
335 
336 void
337 cu_enable(void)
338 {
339 	cpu_t *cp;
340 
341 	ASSERT(MUTEX_HELD(&cpu_lock));
342 
343 	cp = cpu_active;
344 	do {
345 		if (!(cp->cpu_flags & CPU_OFFLINE))
346 			cu_cpu_enable(cp);
347 	} while ((cp = cp->cpu_next_onln) != cpu_active);
348 }
349 
350 
351 /*
352  * Setup capacity and utilization support
353  */
354 void
355 cu_init(void)
356 {
357 	cpu_t	*cp;
358 
359 	cu_init_error = 0;
360 	if (!(cu_flags & CU_FLAG_ENABLE) || (cu_flags & CU_FLAG_ON)) {
361 		cu_init_error = -1;
362 		return;
363 	}
364 
365 	if (kcpc_init() != 0) {
366 		cu_init_error = -2;
367 		return;
368 	}
369 
370 	/*
371 	 * Can't measure hardware capacity and utilization without CPU
372 	 * hardware performance counters
373 	 */
374 	if (cpc_ncounters <= 0) {
375 		cu_init_error = -3;
376 		return;
377 	}
378 
379 	/*
380 	 * Setup CPC event request queue
381 	 */
382 	cu_cpc_reqs = kcpc_reqs_init(cpc_ncounters, KM_SLEEP);
383 
384 	mutex_enter(&cpu_lock);
385 
386 	/*
387 	 * Mark flags to say that module is ready to be setup
388 	 */
389 	cu_flags |= CU_FLAG_READY;
390 
391 	cp = cpu_active;
392 	do {
393 		/*
394 		 * Allocate and setup state needed to measure capacity and
395 		 * utilization
396 		 */
397 		if (cu_cpu_init(cp, cu_cpc_reqs) != 0)
398 			cu_init_error = -5;
399 
400 		/*
401 		 * Reset list of counter event requests so its space can be
402 		 * reused for a different set of requests for next CPU
403 		 */
404 		(void) kcpc_reqs_reset(cu_cpc_reqs);
405 
406 		cp = cp->cpu_next_onln;
407 	} while (cp != cpu_active);
408 
409 	/*
410 	 * Mark flags to say that module is on now and counters are ready to be
411 	 * programmed on all active CPUs
412 	 */
413 	cu_flags |= CU_FLAG_ON;
414 
415 	/*
416 	 * Program counters on currently active CPUs
417 	 */
418 	cp = cpu_active;
419 	do {
420 		if (cu_cpu_run(cp, cu_cpc_program_xcall,
421 		    (uintptr_t)B_FALSE) != 0)
422 			cu_init_error = -6;
423 
424 		cp = cp->cpu_next_onln;
425 	} while (cp != cpu_active);
426 
427 	/*
428 	 * Register callback for CPU state changes to enable and disable
429 	 * CPC counters as CPUs come on and offline
430 	 */
431 	register_cpu_setup_func(cu_cpu_callback, NULL);
432 
433 	mutex_exit(&cpu_lock);
434 }
435 
436 
437 /*
438  * Return number of counter events needed to measure capacity and utilization
439  * for specified CPU and fill in list of CPC requests with each counter event
440  * needed if list where to add CPC requests is given
441  *
442  * NOTE: Use KM_NOSLEEP for kmem_{,z}alloc() since cpu_lock is held and free
443  *	 everything that has been successfully allocated if any memory
444  *	 allocation fails
445  */
446 static int
447 cu_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs)
448 {
449 	group_t		*cmt_pgs;
450 	cu_cntr_info_t	**cntr_info_array;
451 	cpu_pg_t	*cpu_pgs;
452 	cu_cpu_info_t	*cu_cpu_info;
453 	pg_cmt_t	*pg_cmt;
454 	pghw_t		*pg_hw;
455 	cu_cntr_stats_t	*stats;
456 	int		nevents;
457 	pghw_type_t	pg_hw_type;
458 	group_iter_t	iter;
459 
460 	ASSERT(MUTEX_HELD(&cpu_lock));
461 
462 	/*
463 	 * There has to be a target CPU for this
464 	 */
465 	if (cp == NULL)
466 		return (-1);
467 
468 	/*
469 	 * Return 0 when CPU doesn't belong to any group
470 	 */
471 	cpu_pgs = cp->cpu_pg;
472 	if (cpu_pgs == NULL || GROUP_SIZE(&cpu_pgs->cmt_pgs) < 1)
473 		return (0);
474 
475 	cmt_pgs = &cpu_pgs->cmt_pgs;
476 	cu_cpu_info = cp->cpu_cu_info;
477 
478 	/*
479 	 * Grab counter statistics and info
480 	 */
481 	if (reqs == NULL) {
482 		stats = NULL;
483 		cntr_info_array = NULL;
484 	} else {
485 		if (cu_cpu_info == NULL || cu_cpu_info->cu_cntr_stats == NULL)
486 			return (-2);
487 
488 		stats = cu_cpu_info->cu_cntr_stats;
489 		cntr_info_array = cu_cpu_info->cu_cntr_info;
490 	}
491 
492 	/*
493 	 * See whether platform (or processor) specific code knows which CPC
494 	 * events to request, etc. are needed to measure hardware capacity and
495 	 * utilization on this machine
496 	 */
497 	nevents = cu_plat_cpc_init(cp, reqs, nreqs);
498 	if (nevents >= 0)
499 		return (nevents);
500 
501 	/*
502 	 * Let common code decide which CPC events to request, etc. to measure
503 	 * capacity and utilization since platform (or processor) specific does
504 	 * not know....
505 	 *
506 	 * Walk CPU's PG lineage and do following:
507 	 *
508 	 * - Setup CPC request, counter info, and stats needed for each counter
509 	 *   event to measure capacity and and utilization for each of CPU's PG
510 	 *   hardware sharing relationships
511 	 *
512 	 * - Create PG CPU kstats to export capacity and utilization for each PG
513 	 */
514 	nevents = 0;
515 	group_iter_init(&iter);
516 	while ((pg_cmt = group_iterate(cmt_pgs, &iter)) != NULL) {
517 		cu_cntr_info_t	*cntr_info;
518 		int		nevents_save;
519 		int		nstats;
520 
521 		pg_hw = (pghw_t *)pg_cmt;
522 		pg_hw_type = pg_hw->pghw_hw;
523 		nevents_save = nevents;
524 		nstats = 0;
525 
526 		switch (pg_hw_type) {
527 		case PGHW_IPIPE:
528 			if (cu_cpc_req_add("PAPI_tot_ins", reqs, nreqs, stats,
529 			    KM_NOSLEEP, &nevents) != 0)
530 				continue;
531 			nstats = 1;
532 			break;
533 
534 		case PGHW_FPU:
535 			if (cu_cpc_req_add("PAPI_fp_ins", reqs, nreqs, stats,
536 			    KM_NOSLEEP, &nevents) != 0)
537 				continue;
538 			nstats = 1;
539 			break;
540 
541 		default:
542 			/*
543 			 * Don't measure capacity and utilization for this kind
544 			 * of PG hardware relationship so skip to next PG in
545 			 * CPU's PG lineage
546 			 */
547 			continue;
548 		}
549 
550 		cntr_info = cntr_info_array[pg_hw_type];
551 
552 		/*
553 		 * Nothing to measure for this hardware sharing relationship
554 		 */
555 		if (nevents - nevents_save == 0) {
556 			if (cntr_info != NULL)
557 				kmem_free(cntr_info, sizeof (cu_cntr_info_t));
558 				cntr_info_array[pg_hw_type] = NULL;
559 			continue;
560 		}
561 
562 		/*
563 		 * Fill in counter info for this PG hardware relationship
564 		 */
565 		if (cntr_info == NULL) {
566 			cntr_info = kmem_zalloc(sizeof (cu_cntr_info_t),
567 			    KM_NOSLEEP);
568 			if (cntr_info == NULL)
569 				continue;
570 			cntr_info_array[pg_hw_type] = cntr_info;
571 		}
572 		cntr_info->ci_cpu = cp;
573 		cntr_info->ci_pg = pg_hw;
574 		cntr_info->ci_stats = &stats[nevents_save];
575 		cntr_info->ci_nstats = nstats;
576 
577 		/*
578 		 * Create PG CPU kstats for this hardware relationship
579 		 */
580 		cu_cpu_kstat_create(pg_hw, cntr_info);
581 	}
582 
583 	return (nevents);
584 }
585 
586 
587 /*
588  * Program counters for capacity and utilization on given CPU
589  *
590  * If any of the following conditions is true, the counters are not programmed:
591  *
592  * - CU framework is disabled
593  * - The cpu_cu_info field of the cpu structure is NULL
594  * - DTrace is active
595  * - Counters are programmed already
596  * - Counters are disabled (by calls to cu_cpu_disable())
597  */
598 void
599 cu_cpc_program(cpu_t *cp, int *err)
600 {
601 	cu_cpc_ctx_t	*cpu_ctx;
602 	kcpc_ctx_t	*ctx;
603 	cu_cpu_info_t	*cu_cpu_info;
604 
605 	ASSERT(IS_HIPIL());
606 	/*
607 	 * Should be running on given CPU. We disable preemption to keep CPU
608 	 * from disappearing and make sure flags and CPC context don't change
609 	 * from underneath us
610 	 */
611 	kpreempt_disable();
612 	ASSERT(cp == CPU);
613 
614 	/*
615 	 * Module not ready to program counters
616 	 */
617 	if (!(cu_flags & CU_FLAG_ON)) {
618 		*err = -1;
619 		kpreempt_enable();
620 		return;
621 	}
622 
623 	if (cp == NULL) {
624 		*err = -2;
625 		kpreempt_enable();
626 		return;
627 	}
628 
629 	cu_cpu_info = cp->cpu_cu_info;
630 	if (cu_cpu_info == NULL) {
631 		*err = -3;
632 		kpreempt_enable();
633 		return;
634 	}
635 
636 	/*
637 	 * If DTrace CPC is active or counters turned on already or are
638 	 * disabled, just return.
639 	 */
640 	if (dtrace_cpc_in_use || (cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON) ||
641 	    cu_cpu_info->cu_disabled) {
642 		*err = 1;
643 		kpreempt_enable();
644 		return;
645 	}
646 
647 	if ((CPU->cpu_cpc_ctx != NULL) &&
648 	    !(CPU->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) {
649 		*err = -4;
650 		kpreempt_enable();
651 		return;
652 	}
653 
654 	/*
655 	 * Get CPU's CPC context needed for capacity and utilization
656 	 */
657 	cpu_ctx = &cu_cpu_info->cu_cpc_ctx;
658 	ASSERT(cpu_ctx != NULL);
659 	ASSERT(cpu_ctx->nctx >= 0);
660 
661 	ASSERT(cpu_ctx->ctx_ptr_array == NULL || cpu_ctx->ctx_ptr_array_sz > 0);
662 	ASSERT(cpu_ctx->nctx <= cpu_ctx->ctx_ptr_array_sz);
663 	if (cpu_ctx->nctx <= 0 || cpu_ctx->ctx_ptr_array == NULL ||
664 	    cpu_ctx->ctx_ptr_array_sz <= 0) {
665 		*err = -5;
666 		kpreempt_enable();
667 		return;
668 	}
669 
670 	/*
671 	 * Increment index in CPU's CPC context info to point at next context
672 	 * to program
673 	 *
674 	 * NOTE: Do this now instead of after programming counters to ensure
675 	 *	 that index will always point at *current* context so we will
676 	 *	 always be able to unprogram *current* context if necessary
677 	 */
678 	cpu_ctx->cur_index = (cpu_ctx->cur_index + 1) % cpu_ctx->nctx;
679 
680 	ctx = cpu_ctx->ctx_ptr_array[cpu_ctx->cur_index];
681 
682 	/*
683 	 * Clear KCPC_CTX_INVALID and KCPC_CTX_INVALID_STOPPED from CPU's CPC
684 	 * context before programming counters
685 	 *
686 	 * Context is marked with KCPC_CTX_INVALID_STOPPED when context is
687 	 * unprogrammed and may be marked with KCPC_CTX_INVALID when
688 	 * kcpc_invalidate_all() is called by cpustat(1M) and dtrace CPC to
689 	 * invalidate all CPC contexts before they take over all the counters.
690 	 *
691 	 * This isn't necessary since these flags are only used for thread bound
692 	 * CPC contexts not CPU bound CPC contexts like ones used for capacity
693 	 * and utilization.
694 	 *
695 	 * There is no need to protect the flag update since no one is using
696 	 * this context now.
697 	 */
698 	ctx->kc_flags &= ~(KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED);
699 
700 	/*
701 	 * Program counters on this CPU
702 	 */
703 	kcpc_program(ctx, B_FALSE, B_FALSE);
704 
705 	cp->cpu_cpc_ctx = ctx;
706 
707 	/*
708 	 * Set state in CPU structure to say that CPU's counters are programmed
709 	 * for capacity and utilization now and that they are transitioning from
710 	 * off to on state. This will cause cu_cpu_update to update stop times
711 	 * for all programmed counters.
712 	 */
713 	cu_cpu_info->cu_flag |= CU_CPU_CNTRS_ON | CU_CPU_CNTRS_OFF_ON;
714 
715 	/*
716 	 * Update counter statistics
717 	 */
718 	(void) cu_cpu_update(cp, B_FALSE);
719 
720 	cu_cpu_info->cu_flag &= ~CU_CPU_CNTRS_OFF_ON;
721 
722 	*err = 0;
723 	kpreempt_enable();
724 }
725 
726 
727 /*
728  * Cross call wrapper routine for cu_cpc_program()
729  *
730  * Checks to make sure that counters on CPU aren't being used by someone else
731  * before calling cu_cpc_program() since cu_cpc_program() needs to assert that
732  * nobody else is using the counters to catch and prevent any broken code.
733  * Also, this check needs to happen on the target CPU since the CPU's CPC
734  * context can only be changed while running on the CPU.
735  *
736  * If the first argument is TRUE, cu_cpc_program_xcall also checks that there is
737  * no valid thread bound cpc context. This is important to check to prevent
738  * re-programming thread counters with CU counters when CPU is coming on-line.
739  */
740 static void
741 cu_cpc_program_xcall(uintptr_t arg, int *err)
742 {
743 	boolean_t	avoid_thread_context = (boolean_t)arg;
744 
745 	kpreempt_disable();
746 
747 	if (CPU->cpu_cpc_ctx != NULL &&
748 	    !(CPU->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) {
749 		*err = -100;
750 		kpreempt_enable();
751 		return;
752 	}
753 
754 	if (avoid_thread_context && (curthread->t_cpc_ctx != NULL) &&
755 	    !(curthread->t_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) {
756 		*err = -200;
757 		kpreempt_enable();
758 		return;
759 	}
760 
761 	cu_cpc_program(CPU, err);
762 	kpreempt_enable();
763 }
764 
765 
766 /*
767  * Unprogram counters for capacity and utilization on given CPU
768  * This function should be always executed on the target CPU at high PIL
769  */
770 void
771 cu_cpc_unprogram(cpu_t *cp, int *err)
772 {
773 	cu_cpc_ctx_t	*cpu_ctx;
774 	kcpc_ctx_t	*ctx;
775 	cu_cpu_info_t	*cu_cpu_info;
776 
777 	ASSERT(IS_HIPIL());
778 	/*
779 	 * Should be running on given CPU with preemption disabled to keep CPU
780 	 * from disappearing and make sure flags and CPC context don't change
781 	 * from underneath us
782 	 */
783 	kpreempt_disable();
784 	ASSERT(cp == CPU);
785 
786 	/*
787 	 * Module not on
788 	 */
789 	if (!(cu_flags & CU_FLAG_ON)) {
790 		*err = -1;
791 		kpreempt_enable();
792 		return;
793 	}
794 
795 	cu_cpu_info = cp->cpu_cu_info;
796 	if (cu_cpu_info == NULL) {
797 		*err = -3;
798 		kpreempt_enable();
799 		return;
800 	}
801 
802 	/*
803 	 * Counters turned off already
804 	 */
805 	if (!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON)) {
806 		*err = 1;
807 		kpreempt_enable();
808 		return;
809 	}
810 
811 	/*
812 	 * Update counter statistics
813 	 */
814 	(void) cu_cpu_update(cp, B_FALSE);
815 
816 	/*
817 	 * Get CPU's CPC context needed for capacity and utilization
818 	 */
819 	cpu_ctx = &cu_cpu_info->cu_cpc_ctx;
820 	if (cpu_ctx->nctx <= 0 || cpu_ctx->ctx_ptr_array == NULL ||
821 	    cpu_ctx->ctx_ptr_array_sz <= 0) {
822 		*err = -5;
823 		kpreempt_enable();
824 		return;
825 	}
826 	ctx = cpu_ctx->ctx_ptr_array[cpu_ctx->cur_index];
827 
828 	/*
829 	 * CPU's CPC context should be current capacity and utilization CPC
830 	 * context
831 	 */
832 	ASSERT(cp->cpu_cpc_ctx == ctx);
833 	if (cp->cpu_cpc_ctx != ctx) {
834 		*err = -6;
835 		kpreempt_enable();
836 		return;
837 	}
838 
839 	/*
840 	 * Unprogram counters on CPU.
841 	 */
842 	kcpc_unprogram(ctx, B_FALSE);
843 
844 	ASSERT(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED);
845 
846 	/*
847 	 * Unset state in CPU structure saying that CPU's counters are
848 	 * programmed
849 	 */
850 	cp->cpu_cpc_ctx = NULL;
851 	cu_cpu_info->cu_flag &= ~CU_CPU_CNTRS_ON;
852 
853 	*err = 0;
854 	kpreempt_enable();
855 }
856 
857 
858 /*
859  * Add given counter event to list of CPC requests
860  */
861 static int
862 cu_cpc_req_add(char *event, kcpc_request_list_t *reqs, int nreqs,
863     cu_cntr_stats_t *stats, int kmem_flags, int *nevents)
864 {
865 	int	n;
866 	int	retval;
867 	uint_t  flags;
868 
869 	/*
870 	 * Return error when no counter event specified, counter event not
871 	 * supported by CPC's PCBE, or number of events not given
872 	 */
873 	if (event == NULL || kcpc_event_supported(event) == B_FALSE ||
874 	    nevents == NULL)
875 		return (-1);
876 
877 	n = *nevents;
878 
879 	/*
880 	 * Only count number of counter events needed if list
881 	 * where to add CPC requests not given
882 	 */
883 	if (reqs == NULL) {
884 		n++;
885 		*nevents = n;
886 		return (-3);
887 	}
888 
889 	/*
890 	 * Return error when stats not given or not enough room on list of CPC
891 	 * requests for more counter events
892 	 */
893 	if (stats == NULL || (nreqs <= 0 && n >= nreqs))
894 		return (-4);
895 
896 	/*
897 	 * Use flags in cu_cpc_flags to program counters and enable overflow
898 	 * interrupts/traps (unless PCBE can't handle overflow interrupts) so
899 	 * PCBE can catch counters before they wrap to hopefully give us an
900 	 * accurate (64-bit) virtualized counter
901 	 */
902 	flags = cu_cpc_flags;
903 	if ((kcpc_pcbe_capabilities() & CPC_CAP_OVERFLOW_INTERRUPT) == 0)
904 		flags &= ~CPC_OVF_NOTIFY_EMT;
905 
906 	/*
907 	 * Add CPC request to list
908 	 */
909 	retval = kcpc_reqs_add(reqs, event, cu_cpc_preset_value,
910 	    flags, 0, NULL, &stats[n], kmem_flags);
911 
912 	if (retval != 0)
913 		return (-5);
914 
915 	n++;
916 	*nevents = n;
917 	return (0);
918 }
919 
920 static void
921 cu_cpu_info_detach_xcall(void)
922 {
923 	ASSERT(IS_HIPIL());
924 
925 	CPU->cpu_cu_info = NULL;
926 }
927 
928 
929 /*
930  * Enable or disable collection of capacity/utilization data for a current CPU.
931  * Counters are enabled if 'on' argument is True and disabled if it is False.
932  * This function should be always executed at high PIL
933  */
934 static void
935 cu_cpc_trigger(uintptr_t arg1, uintptr_t arg2)
936 {
937 	cpu_t		*cp = (cpu_t *)arg1;
938 	boolean_t	on = (boolean_t)arg2;
939 	int		error;
940 	cu_cpu_info_t	*cu_cpu_info;
941 
942 	ASSERT(IS_HIPIL());
943 	kpreempt_disable();
944 	ASSERT(cp == CPU);
945 
946 	if (!(cu_flags & CU_FLAG_ON)) {
947 		kpreempt_enable();
948 		return;
949 	}
950 
951 	cu_cpu_info = cp->cpu_cu_info;
952 	if (cu_cpu_info == NULL) {
953 		kpreempt_enable();
954 		return;
955 	}
956 
957 	ASSERT(!cu_cpu_info->cu_disabled ||
958 	    !(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON));
959 
960 	if (on) {
961 		/*
962 		 * Decrement the cu_disabled counter.
963 		 * Once it drops to zero, call cu_cpc_program.
964 		 */
965 		if (cu_cpu_info->cu_disabled > 0)
966 			cu_cpu_info->cu_disabled--;
967 		if (cu_cpu_info->cu_disabled == 0)
968 			cu_cpc_program(CPU, &error);
969 	} else if (cu_cpu_info->cu_disabled++ == 0) {
970 		/*
971 		 * This is the first attempt to disable CU, so turn it off
972 		 */
973 		cu_cpc_unprogram(cp, &error);
974 		ASSERT(!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON));
975 	}
976 
977 	kpreempt_enable();
978 }
979 
980 
981 /*
982  * Callback for changes in CPU states
983  * Used to enable or disable hardware performance counters on CPUs that are
984  * turned on or off
985  *
986  * NOTE: cpc should be programmed/unprogrammed while running on the target CPU.
987  * We have to use thread_affinity_set to hop to the right CPU because these
988  * routines expect cpu_lock held, so we can't cross-call other CPUs while
989  * holding CPU lock.
990  */
991 static int
992 /* LINTED E_FUNC_ARG_UNUSED */
993 cu_cpu_callback(cpu_setup_t what, int id, void *arg)
994 {
995 	cpu_t	*cp;
996 	int	retval = 0;
997 
998 	ASSERT(MUTEX_HELD(&cpu_lock));
999 
1000 	if (!(cu_flags & CU_FLAG_ON))
1001 		return (-1);
1002 
1003 	cp = cpu_get(id);
1004 	if (cp == NULL)
1005 		return (-2);
1006 
1007 	switch (what) {
1008 	case CPU_ON:
1009 		/*
1010 		 * Setup counters on CPU being turned on
1011 		 */
1012 		retval = cu_cpu_init(cp, cu_cpc_reqs);
1013 
1014 		/*
1015 		 * Reset list of counter event requests so its space can be
1016 		 * reused for a different set of requests for next CPU
1017 		 */
1018 		(void) kcpc_reqs_reset(cu_cpc_reqs);
1019 		break;
1020 	case CPU_INTR_ON:
1021 		/*
1022 		 * Setup counters on CPU being turned on.
1023 		 */
1024 		retval = cu_cpu_run(cp, cu_cpc_program_xcall,
1025 		    (uintptr_t)B_TRUE);
1026 		break;
1027 	case CPU_OFF:
1028 		/*
1029 		 * Disable counters on CPU being turned off. Counters will not
1030 		 * be re-enabled on this CPU until it comes back online.
1031 		 */
1032 		cu_cpu_disable(cp);
1033 		ASSERT(!CU_CPC_ON(cp));
1034 		retval = cu_cpu_fini(cp);
1035 		break;
1036 	default:
1037 		break;
1038 	}
1039 	return (retval);
1040 }
1041 
1042 
1043 /*
1044  * Disable or enable Capacity Utilization counters on a given CPU. This function
1045  * can be called from any CPU to disable counters on the given CPU.
1046  */
1047 static void
1048 cu_cpu_disable(cpu_t *cp)
1049 {
1050 	cpu_call(cp, cu_cpc_trigger, (uintptr_t)cp, (uintptr_t)B_FALSE);
1051 }
1052 
1053 
1054 static void
1055 cu_cpu_enable(cpu_t *cp)
1056 {
1057 	cpu_call(cp, cu_cpc_trigger, (uintptr_t)cp, (uintptr_t)B_TRUE);
1058 }
1059 
1060 
1061 /*
1062  * Setup capacity and utilization support for given CPU
1063  *
1064  * NOTE: Use KM_NOSLEEP for kmem_{,z}alloc() since cpu_lock is held and free
1065  *	 everything that has been successfully allocated including cpu_cu_info
1066  *	if any memory allocation fails
1067  */
1068 static int
1069 cu_cpu_init(cpu_t *cp, kcpc_request_list_t *reqs)
1070 {
1071 	kcpc_ctx_t	**ctx_ptr_array;
1072 	size_t		ctx_ptr_array_sz;
1073 	cu_cpc_ctx_t	*cpu_ctx;
1074 	cu_cpu_info_t	*cu_cpu_info;
1075 	int		n;
1076 
1077 	/*
1078 	 * cpu_lock should be held and protect against CPU going away and races
1079 	 * with cu_{init,fini,cpu_fini}()
1080 	 */
1081 	ASSERT(MUTEX_HELD(&cpu_lock));
1082 
1083 	/*
1084 	 * Return if not ready to setup counters yet
1085 	 */
1086 	if (!(cu_flags & CU_FLAG_READY))
1087 		return (-1);
1088 
1089 	if (cp->cpu_cu_info == NULL) {
1090 		cp->cpu_cu_info = kmem_zalloc(sizeof (cu_cpu_info_t),
1091 		    KM_NOSLEEP);
1092 		if (cp->cpu_cu_info == NULL)
1093 			return (-2);
1094 	}
1095 
1096 	/*
1097 	 * Get capacity and utilization CPC context for CPU and check to see
1098 	 * whether it has been setup already
1099 	 */
1100 	cu_cpu_info = cp->cpu_cu_info;
1101 	cu_cpu_info->cu_cpu = cp;
1102 	cu_cpu_info->cu_disabled = dtrace_cpc_in_use ? 1 : 0;
1103 
1104 	cpu_ctx = &cu_cpu_info->cu_cpc_ctx;
1105 	if (cpu_ctx->nctx > 0 && cpu_ctx->ctx_ptr_array != NULL &&
1106 	    cpu_ctx->ctx_ptr_array_sz > 0) {
1107 		return (1);
1108 	}
1109 
1110 	/*
1111 	 * Should have no contexts since it hasn't been setup already
1112 	 */
1113 	ASSERT(cpu_ctx->nctx == 0 && cpu_ctx->ctx_ptr_array == NULL &&
1114 	    cpu_ctx->ctx_ptr_array_sz == 0);
1115 
1116 	/*
1117 	 * Determine how many CPC events needed to measure capacity and
1118 	 * utilization for this CPU, allocate space for counter statistics for
1119 	 * each event, and fill in list of CPC event requests with corresponding
1120 	 * counter stats for each request to make attributing counter data
1121 	 * easier later....
1122 	 */
1123 	n = cu_cpc_init(cp, NULL, 0);
1124 	if (n <= 0) {
1125 		(void) cu_cpu_fini(cp);
1126 		return (-3);
1127 	}
1128 
1129 	cu_cpu_info->cu_cntr_stats = kmem_zalloc(n * sizeof (cu_cntr_stats_t),
1130 	    KM_NOSLEEP);
1131 	if (cu_cpu_info->cu_cntr_stats == NULL) {
1132 		(void) cu_cpu_fini(cp);
1133 		return (-4);
1134 	}
1135 
1136 	cu_cpu_info->cu_ncntr_stats = n;
1137 
1138 	n = cu_cpc_init(cp, reqs, n);
1139 	if (n <= 0) {
1140 		(void) cu_cpu_fini(cp);
1141 		return (-5);
1142 	}
1143 
1144 	/*
1145 	 * Create CPC context with given requests
1146 	 */
1147 	ctx_ptr_array = NULL;
1148 	ctx_ptr_array_sz = 0;
1149 	n = kcpc_cpu_ctx_create(cp, reqs, KM_NOSLEEP, &ctx_ptr_array,
1150 	    &ctx_ptr_array_sz);
1151 	if (n <= 0) {
1152 		(void) cu_cpu_fini(cp);
1153 		return (-6);
1154 	}
1155 
1156 	/*
1157 	 * Should have contexts
1158 	 */
1159 	ASSERT(n > 0 && ctx_ptr_array != NULL && ctx_ptr_array_sz > 0);
1160 	if (ctx_ptr_array == NULL || ctx_ptr_array_sz <= 0) {
1161 		(void) cu_cpu_fini(cp);
1162 		return (-7);
1163 	}
1164 
1165 	/*
1166 	 * Fill in CPC context info for CPU needed for capacity and utilization
1167 	 */
1168 	cpu_ctx->cur_index = 0;
1169 	cpu_ctx->nctx = n;
1170 	cpu_ctx->ctx_ptr_array = ctx_ptr_array;
1171 	cpu_ctx->ctx_ptr_array_sz = ctx_ptr_array_sz;
1172 	return (0);
1173 }
1174 
1175 /*
1176  * Tear down capacity and utilization support for given CPU
1177  */
1178 static int
1179 cu_cpu_fini(cpu_t *cp)
1180 {
1181 	kcpc_ctx_t	*ctx;
1182 	cu_cpc_ctx_t	*cpu_ctx;
1183 	cu_cpu_info_t	*cu_cpu_info;
1184 	int		i;
1185 	pghw_type_t	pg_hw_type;
1186 
1187 	/*
1188 	 * cpu_lock should be held and protect against CPU going away and races
1189 	 * with cu_{init,fini,cpu_init}()
1190 	 */
1191 	ASSERT(MUTEX_HELD(&cpu_lock));
1192 
1193 	/*
1194 	 * Have to at least be ready to setup counters to have allocated
1195 	 * anything that needs to be deallocated now
1196 	 */
1197 	if (!(cu_flags & CU_FLAG_READY))
1198 		return (-1);
1199 
1200 	/*
1201 	 * Nothing to do if CPU's capacity and utilization info doesn't exist
1202 	 */
1203 	cu_cpu_info = cp->cpu_cu_info;
1204 	if (cu_cpu_info == NULL)
1205 		return (1);
1206 
1207 	/*
1208 	 * Tear down any existing kstats and counter info for each hardware
1209 	 * sharing relationship
1210 	 */
1211 	for (pg_hw_type = PGHW_START; pg_hw_type < PGHW_NUM_COMPONENTS;
1212 	    pg_hw_type++) {
1213 		cu_cntr_info_t	*cntr_info;
1214 
1215 		cntr_info = cu_cpu_info->cu_cntr_info[pg_hw_type];
1216 		if (cntr_info == NULL)
1217 			continue;
1218 
1219 		if (cntr_info->ci_kstat != NULL) {
1220 			kstat_delete(cntr_info->ci_kstat);
1221 			cntr_info->ci_kstat = NULL;
1222 		}
1223 		kmem_free(cntr_info, sizeof (cu_cntr_info_t));
1224 	}
1225 
1226 	/*
1227 	 * Free counter statistics for CPU
1228 	 */
1229 	ASSERT(cu_cpu_info->cu_cntr_stats == NULL ||
1230 	    cu_cpu_info->cu_ncntr_stats > 0);
1231 	if (cu_cpu_info->cu_cntr_stats != NULL &&
1232 	    cu_cpu_info->cu_ncntr_stats > 0) {
1233 		kmem_free(cu_cpu_info->cu_cntr_stats,
1234 		    cu_cpu_info->cu_ncntr_stats * sizeof (cu_cntr_stats_t));
1235 		cu_cpu_info->cu_cntr_stats = NULL;
1236 		cu_cpu_info->cu_ncntr_stats = 0;
1237 	}
1238 
1239 	/*
1240 	 * Get capacity and utilization CPC contexts for given CPU and check to
1241 	 * see whether they have been freed already
1242 	 */
1243 	cpu_ctx = &cu_cpu_info->cu_cpc_ctx;
1244 	if (cpu_ctx != NULL && cpu_ctx->ctx_ptr_array != NULL &&
1245 	    cpu_ctx->ctx_ptr_array_sz > 0) {
1246 		/*
1247 		 * Free CPC contexts for given CPU
1248 		 */
1249 		for (i = 0; i < cpu_ctx->nctx; i++) {
1250 			ctx = cpu_ctx->ctx_ptr_array[i];
1251 			if (ctx == NULL)
1252 				continue;
1253 			kcpc_free(ctx, 0);
1254 		}
1255 
1256 		/*
1257 		 * Free CPC context pointer array
1258 		 */
1259 		kmem_free(cpu_ctx->ctx_ptr_array, cpu_ctx->ctx_ptr_array_sz);
1260 
1261 		/*
1262 		 * Zero CPC info for CPU
1263 		 */
1264 		bzero(cpu_ctx, sizeof (cu_cpc_ctx_t));
1265 	}
1266 
1267 	/*
1268 	 * Set cp->cpu_cu_info pointer to NULL. Go through cross-call to ensure
1269 	 * that no one is going to access the cpu_cu_info whicch we are going to
1270 	 * free.
1271 	 */
1272 	if (cpu_is_online(cp))
1273 		cpu_call(cp, (cpu_call_func_t)cu_cpu_info_detach_xcall, 0, 0);
1274 	else
1275 		cp->cpu_cu_info = NULL;
1276 
1277 	/*
1278 	 * Free CPU's capacity and utilization info
1279 	 */
1280 	kmem_free(cu_cpu_info, sizeof (cu_cpu_info_t));
1281 
1282 	return (0);
1283 }
1284 
1285 /*
1286  * Create capacity & utilization kstats for given PG CPU hardware sharing
1287  * relationship
1288  */
1289 static void
1290 cu_cpu_kstat_create(pghw_t *pg, cu_cntr_info_t *cntr_info)
1291 {
1292 	char		*class, *sh_name;
1293 	kstat_t		*ks;
1294 
1295 	/*
1296 	 * Just return when no counter info or CPU
1297 	 */
1298 	if (cntr_info == NULL || cntr_info->ci_cpu == NULL)
1299 		return;
1300 
1301 	/*
1302 	 * Get the class name from the leaf PG that this CPU belongs to.
1303 	 * If there are no PGs, just use the default class "cpu".
1304 	 */
1305 	class = pg ? pghw_type_string(pg->pghw_hw) : "cpu";
1306 	sh_name = pg ? pghw_type_shortstring(pg->pghw_hw) : "cpu";
1307 
1308 	if ((ks = kstat_create_zone("pg_cpu", cntr_info->ci_cpu->cpu_id,
1309 	    sh_name, class, KSTAT_TYPE_NAMED,
1310 	    sizeof (cu_cpu_kstat) / sizeof (kstat_named_t),
1311 	    KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID)) == NULL)
1312 		return;
1313 
1314 	ks->ks_lock = &pg_cpu_kstat_lock;
1315 	ks->ks_data = &cu_cpu_kstat;
1316 	ks->ks_update = cu_cpu_kstat_update;
1317 
1318 	ks->ks_private = cntr_info;
1319 	cntr_info->ci_kstat = ks;
1320 	kstat_install(cntr_info->ci_kstat);
1321 }
1322 
1323 
1324 /*
1325  * Propagate values from CPU capacity & utilization stats to kstats
1326  */
1327 static int
1328 cu_cpu_kstat_update(kstat_t *ksp, int rw)
1329 {
1330 	cpu_t		*cp;
1331 	cu_cntr_info_t	*cntr_info = ksp->ks_private;
1332 	struct cu_cpu_kstat	*kstat = &cu_cpu_kstat;
1333 	pghw_t		*pg;
1334 	cu_cntr_stats_t	*stats;
1335 
1336 	if (rw == KSTAT_WRITE)
1337 		return (EACCES);
1338 
1339 	kpreempt_disable();
1340 
1341 	/*
1342 	 * Update capacity and utilization statistics needed for CPU's PG (CPU)
1343 	 * kstats
1344 	 */
1345 	cp = cntr_info->ci_cpu;
1346 	(void) cu_cpu_update(cp, B_TRUE);
1347 
1348 	pg = cntr_info->ci_pg;
1349 	stats = cntr_info->ci_stats;
1350 	kstat->cu_cpu_id.value.ui32 = cp->cpu_id;
1351 	kstat->cu_generation.value.ui32 = cp->cpu_generation;
1352 	if (pg == NULL)
1353 		kstat->cu_pg_id.value.l = -1;
1354 	else
1355 		kstat->cu_pg_id.value.l = pg->pghw_pg.pg_id;
1356 
1357 	kstat->cu_cpu_util.value.ui64 = stats->cs_value_total;
1358 	kstat->cu_cpu_rate.value.ui64 = stats->cs_rate;
1359 	kstat->cu_cpu_rate_max.value.ui64 = stats->cs_rate_max;
1360 	kstat->cu_cpu_time_running.value.ui64 = stats->cs_time_running;
1361 	kstat->cu_cpu_time_stopped.value.ui64 = stats->cs_time_stopped;
1362 	/*
1363 	 * Counters are stopped now, so the cs_time_stopped was last
1364 	 * updated at cs_time_start time. Add the time passed since then
1365 	 * to the stopped time.
1366 	 */
1367 	if (!(cp->cpu_cu_info->cu_flag & CU_CPU_CNTRS_ON))
1368 		kstat->cu_cpu_time_stopped.value.ui64 +=
1369 		    gethrtime() - stats->cs_time_start;
1370 
1371 	kpreempt_enable();
1372 
1373 	return (0);
1374 }
1375 
1376 /*
1377  * Run specified function with specified argument on a given CPU and return
1378  * whatever the function returns
1379  */
1380 static int
1381 cu_cpu_run(cpu_t *cp, cu_cpu_func_t func, uintptr_t arg)
1382 {
1383 	int error = 0;
1384 
1385 	/*
1386 	 * cpu_call() will call func on the CPU specified with given argument
1387 	 * and return func's return value in last argument
1388 	 */
1389 	cpu_call(cp, (cpu_call_func_t)func, arg, (uintptr_t)&error);
1390 	return (error);
1391 }
1392 
1393 
1394 /*
1395  * Update counter statistics on a given CPU.
1396  *
1397  * If move_to argument is True, execute the function on the CPU specified
1398  * Otherwise, assume that it is already runninng on the right CPU
1399  *
1400  * If move_to is specified, the caller should hold cpu_lock or have preemption
1401  * disabled. Otherwise it is up to the caller to guarantee that things do not
1402  * change in the process.
1403  */
1404 int
1405 cu_cpu_update(struct cpu *cp, boolean_t move_to)
1406 {
1407 	int	retval;
1408 	cu_cpu_info_t	*cu_cpu_info = cp->cpu_cu_info;
1409 	hrtime_t	time_snap;
1410 
1411 	ASSERT(!move_to || MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0);
1412 
1413 	/*
1414 	 * Nothing to do if counters are not programmed
1415 	 */
1416 	if (!(cu_flags & CU_FLAG_ON) ||
1417 	    (cu_cpu_info == NULL) ||
1418 	    !(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON))
1419 		return (0);
1420 
1421 	/*
1422 	 * Don't update CPU statistics if it was updated recently
1423 	 * and provide old results instead
1424 	 */
1425 	time_snap = gethrtime();
1426 	if ((time_snap - cu_cpu_info->cu_sample_time) < cu_update_threshold) {
1427 		DTRACE_PROBE1(cu__drop__sample, cpu_t *, cp);
1428 		return (0);
1429 	}
1430 
1431 	cu_cpu_info->cu_sample_time = time_snap;
1432 
1433 	/*
1434 	 * CPC counter should be read on the CPU that is running the counter. We
1435 	 * either have to move ourselves to the target CPU or insure that we
1436 	 * already run there.
1437 	 *
1438 	 * We use cross-call to the target CPU to execute kcpc_read() and
1439 	 * cu_cpu_update_stats() there.
1440 	 */
1441 	retval = 0;
1442 	if (move_to)
1443 		(void) cu_cpu_run(cp, (cu_cpu_func_t)kcpc_read,
1444 		    (uintptr_t)cu_cpu_update_stats);
1445 	else {
1446 		retval = kcpc_read((kcpc_update_func_t)cu_cpu_update_stats);
1447 		/*
1448 		 * Offset negative return value by -10 so we can distinguish it
1449 		 * from error return values of this routine vs kcpc_read()
1450 		 */
1451 		if (retval < 0)
1452 			retval -= 10;
1453 	}
1454 
1455 	return (retval);
1456 }
1457 
1458 
1459 /*
1460  * Update CPU counter statistics for current CPU.
1461  * This function may be called from a cross-call
1462  */
1463 static int
1464 cu_cpu_update_stats(cu_cntr_stats_t *stats, uint64_t cntr_value)
1465 {
1466 	cu_cpu_info_t	*cu_cpu_info = CPU->cpu_cu_info;
1467 	uint_t		flags;
1468 	uint64_t	delta;
1469 	hrtime_t	time_delta;
1470 	hrtime_t	time_snap;
1471 
1472 	if (stats == NULL)
1473 		return (-1);
1474 
1475 	/*
1476 	 * Nothing to do if counters are not programmed. This should not happen,
1477 	 * but we check just in case.
1478 	 */
1479 	ASSERT(cu_flags & CU_FLAG_ON);
1480 	ASSERT(cu_cpu_info != NULL);
1481 	if (!(cu_flags & CU_FLAG_ON) ||
1482 	    (cu_cpu_info == NULL))
1483 		return (-2);
1484 
1485 	flags = cu_cpu_info->cu_flag;
1486 	ASSERT(flags & CU_CPU_CNTRS_ON);
1487 	if (!(flags & CU_CPU_CNTRS_ON))
1488 		return (-2);
1489 
1490 	/*
1491 	 * Take snapshot of high resolution timer
1492 	 */
1493 	time_snap = gethrtime();
1494 
1495 	/*
1496 	 * CU counters have just been programmed. We cannot assume that the new
1497 	 * cntr_value continues from where we left off, so use the cntr_value as
1498 	 * the new initial value.
1499 	 */
1500 	if (flags & CU_CPU_CNTRS_OFF_ON)
1501 		stats->cs_value_start = cntr_value;
1502 
1503 	/*
1504 	 * Calculate delta in counter values between start of sampling period
1505 	 * and now
1506 	 */
1507 	delta = cntr_value - stats->cs_value_start;
1508 
1509 	/*
1510 	 * Calculate time between start of sampling period and now
1511 	 */
1512 	time_delta = stats->cs_time_start ?
1513 	    time_snap - stats->cs_time_start :
1514 	    0;
1515 	stats->cs_time_start = time_snap;
1516 	stats->cs_value_start = cntr_value;
1517 
1518 	if (time_delta > 0) { /* wrap shouldn't happen */
1519 		/*
1520 		 * Update either running or stopped time based on the transition
1521 		 * state
1522 		 */
1523 		if (flags & CU_CPU_CNTRS_OFF_ON)
1524 			stats->cs_time_stopped += time_delta;
1525 		else
1526 			stats->cs_time_running += time_delta;
1527 	}
1528 
1529 	/*
1530 	 * Update rest of counter statistics if counter value didn't wrap
1531 	 */
1532 	if (delta > 0) {
1533 		/*
1534 		 * Update utilization rate if the interval between samples is
1535 		 * sufficient.
1536 		 */
1537 		ASSERT(cu_sample_interval_min > CU_SCALE);
1538 		if (time_delta > cu_sample_interval_min)
1539 			stats->cs_rate = CU_RATE(delta, time_delta);
1540 		if (stats->cs_rate_max < stats->cs_rate)
1541 			stats->cs_rate_max = stats->cs_rate;
1542 
1543 		stats->cs_value_last = delta;
1544 		stats->cs_value_total += delta;
1545 	}
1546 
1547 	return (0);
1548 }
1549 
1550 /*
1551  * Update CMT PG utilization data.
1552  *
1553  * This routine computes the running total utilization and times for the
1554  * specified PG by adding up the total utilization and counter running and
1555  * stopped times of all CPUs in the PG and calculates the utilization rate and
1556  * maximum rate for all CPUs in the PG.
1557  */
1558 void
1559 cu_pg_update(pghw_t *pg)
1560 {
1561 	pg_cpu_itr_t	cpu_iter;
1562 	pghw_type_t	pg_hwtype;
1563 	cpu_t		*cpu;
1564 	pghw_util_t	*hw_util = &pg->pghw_stats;
1565 	uint64_t	old_utilization = hw_util->pghw_util;
1566 	hrtime_t	now;
1567 	hrtime_t	time_delta;
1568 	uint64_t	utilization_delta;
1569 
1570 	ASSERT(MUTEX_HELD(&cpu_lock));
1571 
1572 	now = gethrtime();
1573 
1574 	pg_hwtype = pg->pghw_hw;
1575 
1576 	/*
1577 	 * Initialize running total utilization and times for PG to 0
1578 	 */
1579 	hw_util->pghw_util = 0;
1580 	hw_util->pghw_time_running = 0;
1581 	hw_util->pghw_time_stopped = 0;
1582 
1583 	/*
1584 	 * Iterate over all CPUs in the PG and aggregate utilization, running
1585 	 * time and stopped time.
1586 	 */
1587 	PG_CPU_ITR_INIT(pg, cpu_iter);
1588 	while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
1589 		cu_cpu_info_t	*cu_cpu_info = cpu->cpu_cu_info;
1590 		cu_cntr_info_t	*cntr_info;
1591 		cu_cntr_stats_t	*stats;
1592 
1593 		if (cu_cpu_info == NULL)
1594 			continue;
1595 
1596 		/*
1597 		 * Update utilization data for the CPU and then
1598 		 * aggregate per CPU running totals for PG
1599 		 */
1600 		(void) cu_cpu_update(cpu, B_TRUE);
1601 		cntr_info = cu_cpu_info->cu_cntr_info[pg_hwtype];
1602 
1603 		if (cntr_info == NULL || (stats = cntr_info->ci_stats) == NULL)
1604 			continue;
1605 
1606 		hw_util->pghw_util += stats->cs_value_total;
1607 		hw_util->pghw_time_running += stats->cs_time_running;
1608 		hw_util->pghw_time_stopped += stats->cs_time_stopped;
1609 
1610 		/*
1611 		 * If counters are stopped now, the pg_time_stopped was last
1612 		 * updated at cs_time_start time. Add the time passed since then
1613 		 * to the stopped time.
1614 		 */
1615 		if (!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON))
1616 			hw_util->pghw_time_stopped +=
1617 			    now - stats->cs_time_start;
1618 	}
1619 
1620 	/*
1621 	 * Compute per PG instruction rate and maximum rate
1622 	 */
1623 	time_delta = now - hw_util->pghw_time_stamp;
1624 	hw_util->pghw_time_stamp = now;
1625 
1626 	if (old_utilization == 0)
1627 		return;
1628 
1629 	/*
1630 	 * Calculate change in utilization over sampling period and set this to
1631 	 * 0 if the delta would be 0 or negative which may happen if any CPUs go
1632 	 * offline during the sampling period
1633 	 */
1634 	if (hw_util->pghw_util > old_utilization)
1635 		utilization_delta = hw_util->pghw_util - old_utilization;
1636 	else
1637 		utilization_delta = 0;
1638 
1639 	/*
1640 	 * Update utilization rate if the interval between samples is
1641 	 * sufficient.
1642 	 */
1643 	ASSERT(cu_sample_interval_min > CU_SCALE);
1644 	if (time_delta > CU_SAMPLE_INTERVAL_MIN)
1645 		hw_util->pghw_rate = CU_RATE(utilization_delta, time_delta);
1646 
1647 	/*
1648 	 * Update the maximum observed rate
1649 	 */
1650 	if (hw_util->pghw_rate_max < hw_util->pghw_rate)
1651 		hw_util->pghw_rate_max = hw_util->pghw_rate;
1652 }
1653