xref: /titanic_52/usr/src/uts/i86pc/os/cpupm/cpupm_mach.c (revision dcf1eb702aeeccc639446ab5c5e8d725ce20cd76)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2009, Intel Corporation.
27  * All rights reserved.
28  */
29 
30 #include <sys/cpu_pm.h>
31 #include <sys/x86_archext.h>
32 #include <sys/sdt.h>
33 #include <sys/spl.h>
34 #include <sys/machsystm.h>
35 #include <sys/hpet.h>
36 #include <sys/cpupm.h>
37 #include <sys/cpu_idle.h>
38 #include <sys/cpu_acpi.h>
39 #include <sys/cpupm_throttle.h>
40 #include <sys/dtrace.h>
41 
42 /*
43  * This callback is used to build the PPM CPU domains once
44  * all the CPU devices have been started. The callback is
45  * initialized by the PPM driver to point to a routine that
46  * will build the domains.
47  */
48 void (*cpupm_rebuild_cpu_domains)(void);
49 
50 /*
51  * This callback is used to reset the topspeed for all the
52  * CPU devices. The callback is initialized by the PPM driver to
53  * point to a routine that will reinitialize all the CPU devices
54  * once all the CPU devices have been started and the CPU domains
55  * built.
56  */
57 void (*cpupm_init_topspeed)(void);
58 
59 /*
60  * This callback is used to redefine the topspeed for a CPU device.
61  * Since all CPUs in a domain should have identical properties, this
62  * callback is initialized by the PPM driver to point to a routine
63  * that will redefine the topspeed for all devices in a CPU domain.
64  * This callback is exercised whenever an ACPI _PPC change notification
65  * is received by the CPU driver.
66  */
67 void (*cpupm_redefine_topspeed)(void *);
68 
69 /*
70  * This callback is used by the PPM driver to call into the CPU driver
71  * to set a new topspeed for a CPU.
72  */
73 void (*cpupm_set_topspeed_callb)(void *, int);
74 
75 /*
76  * This callback is used by the PPM driver to call into the CPU driver
77  * to find a CPU's current topspeed (i.e., its current ACPI _PPC value).
78  */
79 int (*cpupm_get_topspeed_callb)(void *);
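
/*
 * Illustrative sketch (not part of the original file): roughly how a
 * platform power-management (PPM) driver might wire up the callbacks
 * above during attach.  The example_* names are hypothetical and the
 * CPUPM_EXAMPLE_SKETCH guard keeps this illustration out of any build.
 */
#ifdef	CPUPM_EXAMPLE_SKETCH
extern void example_rebuild_domains(void);
extern void example_init_topspeed(void);
extern void example_redefine_topspeed(void *);
extern void example_set_topspeed(void *, int);
extern int example_get_topspeed(void *);

static void
example_ppm_register_callbacks(void)
{
	cpupm_rebuild_cpu_domains = example_rebuild_domains;
	cpupm_init_topspeed = example_init_topspeed;
	cpupm_redefine_topspeed = example_redefine_topspeed;
	cpupm_set_topspeed_callb = example_set_topspeed;
	cpupm_get_topspeed_callb = example_get_topspeed;
}
#endif	/* CPUPM_EXAMPLE_SKETCH */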
80 
81 static void cpupm_event_notify_handler(ACPI_HANDLE, UINT32, void *);
82 static void cpupm_free_notify_handlers(cpu_t *);
83 
84 /*
85  * Until proven otherwise, all power states are manageable.
86  */
87 static uint32_t cpupm_enabled = CPUPM_ALL_STATES;
88 
89 /*
90  * Until all CPUs have started, we do not allow
91  * power management.
92  */
93 static boolean_t cpupm_ready = B_FALSE;
94 
95 cpupm_state_domains_t *cpupm_pstate_domains = NULL;
96 cpupm_state_domains_t *cpupm_tstate_domains = NULL;
97 cpupm_state_domains_t *cpupm_cstate_domains = NULL;
98 
99 /*
100  * c-state tunables
101  *
102  * cpupm_cs_idle_cost_tunable is the ratio of time the CPU spends executing
103  * plus time spent idle, divided by the time spent in idle-state transitions.
104  * A value of 10 means the CPU will spend no more than 1/10 of its time
105  * in idle latency.  Worst case performance will be 90% of a kernel that
106  * never enters deep C-states.
107  *
108  * cpupm_cs_idle_save_tunable is how long the CPU must stay in a deeper
109  * C-state to make entering it worthwhile, expressed as a multiple of latency.
110  */
111 uint32_t cpupm_cs_sample_tunable = 5;		/* samples in decision period */
112 uint32_t cpupm_cs_idle_cost_tunable = 10;	/* work time / latency cost */
113 uint32_t cpupm_cs_idle_save_tunable = 2;	/* idle power savings */
114 uint16_t cpupm_C2_idle_pct_tunable = 70;
115 uint16_t cpupm_C3_idle_pct_tunable = 80;
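
/*
 * Illustrative sketch (not part of the original file): how the idle_cost
 * and idle_save tunables above gate entry into a deeper C-state, mirroring
 * the checks in cpupm_next_cstate() below.  With the defaults, a C-state
 * with a 100us latency is only attractive when the average idle stretch is
 * at least 200us and the average wakeup interval exceeds 1000us.  The
 * example_* name is hypothetical; the guard keeps this out of any build.
 */
#ifdef	CPUPM_EXAMPLE_SKETCH
static boolean_t
example_cstate_is_worthwhile(hrtime_t ave_idle_us, hrtime_t ave_interval_us,
    uint32_t latency_us)
{
	/* Idle stretches too short to amortize the entry/exit latency? */
	if (ave_idle_us < (hrtime_t)latency_us * cpupm_cs_idle_save_tunable)
		return (B_FALSE);

	/* Waking up so often that transition latency would dominate? */
	if (ave_interval_us <= (hrtime_t)latency_us *
	    cpupm_cs_idle_cost_tunable)
		return (B_FALSE);

	return (B_TRUE);
}
#endif	/* CPUPM_EXAMPLE_SKETCH */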
116 
117 #ifndef __xpv
118 extern boolean_t cpupm_intel_init(cpu_t *);
119 extern boolean_t cpupm_amd_init(cpu_t *);
120 
121 typedef struct cpupm_vendor {
122 	boolean_t	(*cpuv_init)(cpu_t *);
123 } cpupm_vendor_t;
124 
125 /*
126  * Table of supported vendors.
127  */
128 static cpupm_vendor_t cpupm_vendors[] = {
129 	cpupm_intel_init,
130 	cpupm_amd_init,
131 	NULL
132 };
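
/*
 * Illustrative sketch (not part of the original file): supporting an
 * additional vendor would mean declaring its init routine and listing it
 * ahead of the NULL terminator, as in this hypothetical copy of the table.
 */
#ifdef	CPUPM_EXAMPLE_SKETCH
extern boolean_t cpupm_example_vendor_init(cpu_t *);

static cpupm_vendor_t cpupm_example_vendors[] = {
	cpupm_intel_init,
	cpupm_amd_init,
	cpupm_example_vendor_init,	/* hypothetical third vendor */
	NULL
};
#endif	/* CPUPM_EXAMPLE_SKETCH */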
133 #endif
134 
135 /*
136  * Initialize the machine.
137  * See if a module exists for managing power for this CPU.
138  */
139 /*ARGSUSED*/
140 void
141 cpupm_init(cpu_t *cp)
142 {
143 #ifndef __xpv
144 	cpupm_vendor_t *vendors;
145 	cpupm_mach_state_t *mach_state;
146 	struct machcpu *mcpu = &(cp->cpu_m);
147 	int *speeds;
148 	uint_t nspeeds;
149 	int ret;
150 
151 	mach_state = cp->cpu_m.mcpu_pm_mach_state =
152 	    kmem_zalloc(sizeof (cpupm_mach_state_t), KM_SLEEP);
153 	mach_state->ms_caps = CPUPM_NO_STATES;
154 	mutex_init(&mach_state->ms_lock, NULL, MUTEX_DRIVER, NULL);
155 
156 	mach_state->ms_acpi_handle = cpu_acpi_init(cp);
157 	if (mach_state->ms_acpi_handle == NULL) {
158 		cpupm_free(cp);
159 		cmn_err(CE_WARN, "!cpupm_init: processor %d: "
160 		    "unable to get ACPI handle", cp->cpu_id);
161 		cmn_err(CE_NOTE, "!CPU power management will not function.");
162 		CPUPM_DISABLE();
163 		return;
164 	}
165 
166 	/*
167 	 * Loop through the CPU management module table and see if
168 	 * any of the modules implement CPU power management
169 	 * for this CPU.
170 	 */
171 	for (vendors = cpupm_vendors; vendors->cpuv_init != NULL; vendors++) {
172 		if (vendors->cpuv_init(cp))
173 			break;
174 	}
175 
176 	/*
177 	 * Nope, we can't power manage this CPU.
178 	 */
179 	if (vendors->cpuv_init == NULL) {
180 		cpupm_free(cp);
181 		CPUPM_DISABLE();
182 		return;
183 	}
184 
185 	/*
186 	 * If P-state support exists for this system, then initialize it.
187 	 */
188 	if (mach_state->ms_pstate.cma_ops != NULL) {
189 		ret = mach_state->ms_pstate.cma_ops->cpus_init(cp);
190 		if (ret != 0) {
191 			mach_state->ms_pstate.cma_ops = NULL;
192 			cpupm_disable(CPUPM_P_STATES);
193 		} else {
194 			nspeeds = cpupm_get_speeds(cp, &speeds);
195 			if (nspeeds == 0) {
196 				cmn_err(CE_NOTE, "!cpupm_init: processor %d:"
197 				    " no speeds to manage", cp->cpu_id);
198 			} else {
199 				cpupm_set_supp_freqs(cp, speeds, nspeeds);
200 				cpupm_free_speeds(speeds, nspeeds);
201 				mach_state->ms_caps |= CPUPM_P_STATES;
202 			}
203 		}
204 	}
205 
206 	if (mach_state->ms_tstate.cma_ops != NULL) {
207 		ret = mach_state->ms_tstate.cma_ops->cpus_init(cp);
208 		if (ret != 0) {
209 			mach_state->ms_tstate.cma_ops = NULL;
210 			cpupm_disable(CPUPM_T_STATES);
211 		} else {
212 			mach_state->ms_caps |= CPUPM_T_STATES;
213 		}
214 	}
215 
216 	/*
217 	 * If C-states support exists for this system, then initialize it.
218 	 */
219 	if (mach_state->ms_cstate.cma_ops != NULL) {
220 		ret = mach_state->ms_cstate.cma_ops->cpus_init(cp);
221 		if (ret != 0) {
222 			mach_state->ms_cstate.cma_ops = NULL;
223 			mcpu->max_cstates = CPU_ACPI_C1;
224 			cpupm_disable(CPUPM_C_STATES);
225 			idle_cpu = non_deep_idle_cpu;
226 			disp_enq_thread = non_deep_idle_disp_enq_thread;
227 		} else if (cpu_deep_cstates_supported()) {
228 			mcpu->max_cstates = cpu_acpi_get_max_cstates(
229 			    mach_state->ms_acpi_handle);
230 			if (mcpu->max_cstates > CPU_ACPI_C1) {
231 				(void) cstate_timer_callback(
232 				    CST_EVENT_MULTIPLE_CSTATES);
233 				CPU->cpu_m.mcpu_idle_cpu = cpu_acpi_idle;
234 				mcpu->mcpu_idle_type = CPU_ACPI_C1;
235 				disp_enq_thread = cstate_wakeup;
236 			} else {
237 				(void) cstate_timer_callback(
238 				    CST_EVENT_ONE_CSTATE);
239 			}
240 			mach_state->ms_caps |= CPUPM_C_STATES;
241 		} else {
242 			mcpu->max_cstates = CPU_ACPI_C1;
243 			idle_cpu = non_deep_idle_cpu;
244 			disp_enq_thread = non_deep_idle_disp_enq_thread;
245 		}
246 	}
247 
248 
249 	if (mach_state->ms_caps == CPUPM_NO_STATES) {
250 		cpupm_free(cp);
251 		CPUPM_DISABLE();
252 		return;
253 	}
254 
255 	if ((mach_state->ms_caps & CPUPM_T_STATES) ||
256 	    (mach_state->ms_caps & CPUPM_P_STATES) ||
257 	    (mach_state->ms_caps & CPUPM_C_STATES))
258 		cpupm_add_notify_handler(cp, cpupm_event_notify_handler, cp);
259 #endif
260 }
261 
262 /*
263  * Free any resources allocated by cpupm_init().
264  */
265 /*ARGSUSED*/
266 void
267 cpupm_free(cpu_t *cp)
268 {
269 #ifndef __xpv
270 	cpupm_mach_state_t *mach_state =
271 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
272 
273 	if (mach_state == NULL)
274 		return;
275 	if (mach_state->ms_pstate.cma_ops != NULL) {
276 		mach_state->ms_pstate.cma_ops->cpus_fini(cp);
277 		mach_state->ms_pstate.cma_ops = NULL;
278 	}
279 
280 	if (mach_state->ms_tstate.cma_ops != NULL) {
281 		mach_state->ms_tstate.cma_ops->cpus_fini(cp);
282 		mach_state->ms_tstate.cma_ops = NULL;
283 	}
284 
285 	if (mach_state->ms_cstate.cma_ops != NULL) {
286 		mach_state->ms_cstate.cma_ops->cpus_fini(cp);
287 		mach_state->ms_cstate.cma_ops = NULL;
288 	}
289 
290 	cpupm_free_notify_handlers(cp);
291 
292 	if (mach_state->ms_acpi_handle != NULL) {
293 		cpu_acpi_fini(mach_state->ms_acpi_handle);
294 		mach_state->ms_acpi_handle = NULL;
295 	}
296 
297 	mutex_destroy(&mach_state->ms_lock);
298 	kmem_free(mach_state, sizeof (cpupm_mach_state_t));
299 	cp->cpu_m.mcpu_pm_mach_state = NULL;
300 #endif
301 }
302 
303 /*
304  * If all CPUs have started and at least one power state is manageable,
305  * then the CPUs are ready for power management.
306  */
307 boolean_t
308 cpupm_is_ready()
309 {
310 #ifndef __xpv
311 	if (cpupm_enabled == CPUPM_NO_STATES)
312 		return (B_FALSE);
313 	return (cpupm_ready);
314 #else
315 	return (B_FALSE);
316 #endif
317 
318 }
319 
320 boolean_t
321 cpupm_is_enabled(uint32_t state)
322 {
323 	return ((cpupm_enabled & state) == state);
324 }
325 
326 /*
327  * By default, all states are enabled.
328  */
329 void
330 cpupm_disable(uint32_t state)
331 {
332 
333 	if (state & CPUPM_P_STATES) {
334 		cpupm_free_domains(&cpupm_pstate_domains);
335 	}
336 	if (state & CPUPM_T_STATES) {
337 		cpupm_free_domains(&cpupm_tstate_domains);
338 	}
339 	if (state & CPUPM_C_STATES) {
340 		cpupm_free_domains(&cpupm_cstate_domains);
341 	}
342 	cpupm_enabled &= ~state;
343 }
344 
345 /*
346  * Once all CPUs have been started, the PPM driver should build CPU
347  * domains and initialize the topspeed for all CPU devices.
348  */
349 void
350 cpupm_post_startup()
351 {
352 #ifndef __xpv
353 	/*
354 	 * The CPU domains built by the PPM driver while the CPUs were
355 	 * attaching should be rebuilt with the information retrieved
356 	 * from ACPI.
357 	 */
358 	if (cpupm_rebuild_cpu_domains != NULL)
359 		(*cpupm_rebuild_cpu_domains)();
360 
361 	/*
362 	 * Only initialize the topspeed if P-states are enabled.
363 	 */
364 	if ((cpupm_enabled & CPUPM_P_STATES) && cpupm_init_topspeed != NULL)
365 		(*cpupm_init_topspeed)();
366 #endif
367 	cpupm_ready = B_TRUE;
368 }
369 
370 /*
371  * Allocate power domains for C,P and T States
372  */
373 void
374 cpupm_alloc_domains(cpu_t *cp, int state)
375 {
376 	cpupm_mach_state_t *mach_state =
377 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
378 	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
379 	cpupm_state_domains_t **dom_ptr;
380 	cpupm_state_domains_t *dptr;
381 	cpupm_state_domains_t **mach_dom_state_ptr;
382 	uint32_t domain;
383 	uint32_t type;
384 
385 	switch (state) {
386 	case CPUPM_P_STATES:
387 		if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_PSD_CACHED)) {
388 			domain = CPU_ACPI_PSD(handle).sd_domain;
389 			type = CPU_ACPI_PSD(handle).sd_type;
390 		} else {
391 			mutex_enter(&cpu_lock);
392 			domain = cpuid_get_chipid(cp);
393 			mutex_exit(&cpu_lock);
394 			type = CPU_ACPI_HW_ALL;
395 		}
396 		dom_ptr = &cpupm_pstate_domains;
397 		mach_dom_state_ptr = &mach_state->ms_pstate.cma_domain;
398 		break;
399 	case CPUPM_T_STATES:
400 		if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_TSD_CACHED)) {
401 			domain = CPU_ACPI_TSD(handle).sd_domain;
402 			type = CPU_ACPI_TSD(handle).sd_type;
403 		} else {
404 			mutex_enter(&cpu_lock);
405 			domain = cpuid_get_chipid(cp);
406 			mutex_exit(&cpu_lock);
407 			type = CPU_ACPI_HW_ALL;
408 		}
409 		dom_ptr = &cpupm_tstate_domains;
410 		mach_dom_state_ptr = &mach_state->ms_tstate.cma_domain;
411 		break;
412 	case CPUPM_C_STATES:
413 		if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_CSD_CACHED)) {
414 			domain = CPU_ACPI_CSD(handle).sd_domain;
415 			type = CPU_ACPI_CSD(handle).sd_type;
416 		} else {
417 			mutex_enter(&cpu_lock);
418 			domain = cpuid_get_coreid(cp);
419 			mutex_exit(&cpu_lock);
420 			type = CPU_ACPI_HW_ALL;
421 		}
422 		dom_ptr = &cpupm_cstate_domains;
423 		mach_dom_state_ptr = &mach_state->ms_cstate.cma_domain;
424 		break;
425 	default:
426 		return;
427 	}
428 
429 	for (dptr = *dom_ptr; dptr != NULL; dptr = dptr->pm_next) {
430 		if (dptr->pm_domain == domain)
431 			break;
432 	}
433 
434 	/* new domain is created and linked at the head */
435 	if (dptr == NULL) {
436 		dptr = kmem_zalloc(sizeof (cpupm_state_domains_t), KM_SLEEP);
437 		dptr->pm_domain = domain;
438 		dptr->pm_type = type;
439 		dptr->pm_next = *dom_ptr;
440 		mutex_init(&dptr->pm_lock, NULL, MUTEX_SPIN,
441 		    (void *)ipltospl(DISP_LEVEL));
442 		CPUSET_ZERO(dptr->pm_cpus);
443 		*dom_ptr = dptr;
444 	}
445 	CPUSET_ADD(dptr->pm_cpus, cp->cpu_id);
446 	*mach_dom_state_ptr = dptr;
447 }
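
/*
 * Illustrative sketch (not part of the original file): a state driver's
 * cpus_init entry point would typically place the CPU in its power domain
 * with cpupm_alloc_domains() once its ACPI data has been cached.  The
 * example_* name is hypothetical; the guard keeps this out of any build.
 */
#ifdef	CPUPM_EXAMPLE_SKETCH
static int
example_pstate_cpus_init(cpu_t *cp)
{
	/* Group this CPU with the others sharing its P-state domain. */
	cpupm_alloc_domains(cp, CPUPM_P_STATES);
	return (0);
}
#endif	/* CPUPM_EXAMPLE_SKETCH */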
448 
449 /*
450  * Free C, P or T state power domains
451  */
452 void
453 cpupm_free_domains(cpupm_state_domains_t **dom_ptr)
454 {
455 	cpupm_state_domains_t *this_domain, *next_domain;
456 
457 	this_domain = *dom_ptr;
458 	while (this_domain != NULL) {
459 		next_domain = this_domain->pm_next;
460 		mutex_destroy(&this_domain->pm_lock);
461 		kmem_free((void *)this_domain,
462 		    sizeof (cpupm_state_domains_t));
463 		this_domain = next_domain;
464 	}
465 	*dom_ptr = NULL;
466 }
467 
468 void
469 cpupm_alloc_ms_cstate(cpu_t *cp)
470 {
471 	cpupm_mach_state_t *mach_state;
472 	cpupm_mach_acpi_state_t *ms_cstate;
473 
474 	mach_state = (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
475 	ms_cstate = &mach_state->ms_cstate;
476 	ASSERT(ms_cstate->cma_state.cstate == NULL);
477 	ms_cstate->cma_state.cstate = kmem_zalloc(sizeof (cma_c_state_t),
478 	    KM_SLEEP);
479 	ms_cstate->cma_state.cstate->cs_next_cstate = CPU_ACPI_C1;
480 }
481 
482 void
483 cpupm_free_ms_cstate(cpu_t *cp)
484 {
485 	cpupm_mach_state_t *mach_state =
486 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
487 	cpupm_mach_acpi_state_t *ms_cstate = &mach_state->ms_cstate;
488 
489 	if (ms_cstate->cma_state.cstate != NULL) {
490 		kmem_free(ms_cstate->cma_state.cstate, sizeof (cma_c_state_t));
491 		ms_cstate->cma_state.cstate = NULL;
492 	}
493 }
494 
495 void
496 cpupm_state_change(cpu_t *cp, int level, int state)
497 {
498 	cpupm_mach_state_t	*mach_state =
499 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
500 	cpupm_state_ops_t	*state_ops;
501 	cpupm_state_domains_t  	*state_domain;
502 	cpuset_t		set;
503 
504 	DTRACE_PROBE2(cpupm__state__change, cpu_t *, cp, int, level);
505 
506 	if (mach_state == NULL) {
507 		return;
508 	}
509 
510 	switch (state) {
511 	case CPUPM_P_STATES:
512 		state_ops = mach_state->ms_pstate.cma_ops;
513 		state_domain = mach_state->ms_pstate.cma_domain;
514 		break;
515 	case CPUPM_T_STATES:
516 		state_ops = mach_state->ms_tstate.cma_ops;
517 		state_domain = mach_state->ms_tstate.cma_domain;
518 		break;
519 	default:
520 		return;
521 	}
522 
523 	switch (state_domain->pm_type) {
524 	case CPU_ACPI_SW_ANY:
525 		/*
526 		 * A request on any CPU in the domain transitions the domain
527 		 */
528 		CPUSET_ONLY(set, cp->cpu_id);
529 		state_ops->cpus_change(set, level);
530 		break;
531 	case CPU_ACPI_SW_ALL:
532 		/*
533 		 * All CPUs in the domain must request the transition
534 		 */
535 	case CPU_ACPI_HW_ALL:
536 		/*
537 		 * P/T-state transitions are coordinated by the hardware
538 		 * For now, request the transition on all CPUs in the domain,
539 		 * but looking ahead we can probably be smarter about this.
540 		 */
541 		mutex_enter(&state_domain->pm_lock);
542 		state_ops->cpus_change(state_domain->pm_cpus, level);
543 		mutex_exit(&state_domain->pm_lock);
544 		break;
545 	default:
546 		cmn_err(CE_NOTE, "Unknown domain coordination type: %d",
547 		    state_domain->pm_type);
548 	}
549 }
550 
551 /*
552  * CPU PM interfaces exposed to the CPU power manager
553  */
554 /*ARGSUSED*/
555 id_t
556 cpupm_plat_domain_id(cpu_t *cp, cpupm_dtype_t type)
557 {
558 	cpupm_mach_state_t	*mach_state =
559 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
560 
561 	if ((mach_state == NULL) || (!cpupm_is_enabled(CPUPM_P_STATES) &&
562 	    !cpupm_is_enabled(CPUPM_C_STATES))) {
563 		return (CPUPM_NO_DOMAIN);
564 	}
565 	if (type == CPUPM_DTYPE_ACTIVE) {
566 		/*
567 		 * Return P-State domain for the specified CPU
568 		 */
569 		if (mach_state->ms_pstate.cma_domain) {
570 			return (mach_state->ms_pstate.cma_domain->pm_domain);
571 		}
572 	} else if (type == CPUPM_DTYPE_IDLE) {
573 		/*
574 		 * Return C-State domain for the specified CPU
575 		 */
576 		if (mach_state->ms_cstate.cma_domain) {
577 			return (mach_state->ms_cstate.cma_domain->pm_domain);
578 		}
579 	}
580 	return (CPUPM_NO_DOMAIN);
581 }
582 
583 /*ARGSUSED*/
584 uint_t
585 cpupm_plat_state_enumerate(cpu_t *cp, cpupm_dtype_t type,
586     cpupm_state_t *states)
587 {
588 	int	*speeds;
589 	uint_t	nspeeds, i;
590 
591 	/*
592 	 * Idle domain support unimplemented
593 	 */
594 	if (type != CPUPM_DTYPE_ACTIVE) {
595 		return (0);
596 	}
597 	nspeeds = cpupm_get_speeds(cp, &speeds);
598 
599 	/*
600 	 * If the caller passes NULL for states, just return the
601 	 * number of states.
602 	 */
603 	if (states != NULL) {
604 		for (i = 0; i < nspeeds; i++) {
605 			states[i].cps_speed = speeds[i];
606 			states[i].cps_handle = (cpupm_handle_t)i;
607 		}
608 	}
609 	cpupm_free_speeds(speeds, nspeeds);
610 	return (nspeeds);
611 }
612 
613 /*ARGSUSED*/
614 int
615 cpupm_plat_change_state(cpu_t *cp, cpupm_state_t *state)
616 {
617 	if (!cpupm_is_ready())
618 		return (-1);
619 
620 	cpupm_state_change(cp, (int)state->cps_handle, CPUPM_P_STATES);
621 
622 	return (0);
623 }
624 
625 /*ARGSUSED*/
626 /*
627  * Note: It is the responsibility of the users of
628  * cpupm_get_speeds() to free the memory allocated
629  * for speeds using cpupm_free_speeds()
630  */
631 uint_t
632 cpupm_get_speeds(cpu_t *cp, int **speeds)
633 {
634 #ifndef __xpv
635 	cpupm_mach_state_t *mach_state =
636 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
637 	return (cpu_acpi_get_speeds(mach_state->ms_acpi_handle, speeds));
638 #else
639 	return (0);
640 #endif
641 }
642 
643 /*ARGSUSED*/
644 void
645 cpupm_free_speeds(int *speeds, uint_t nspeeds)
646 {
647 #ifndef __xpv
648 	cpu_acpi_free_speeds(speeds, nspeeds);
649 #endif
650 }
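
/*
 * Illustrative sketch (not part of the original file): callers of
 * cpupm_get_speeds() own the returned array and must release it with
 * cpupm_free_speeds(), as cpupm_init() above does.  The example_* name
 * is hypothetical; the guard keeps this out of any build.
 */
#ifdef	CPUPM_EXAMPLE_SKETCH
static void
example_log_speeds(cpu_t *cp)
{
	int *speeds;
	uint_t nspeeds, i;

	nspeeds = cpupm_get_speeds(cp, &speeds);
	for (i = 0; i < nspeeds; i++)
		cmn_err(CE_NOTE, "!CPU %d speed %d: %d", cp->cpu_id, i,
		    speeds[i]);
	if (nspeeds != 0)
		cpupm_free_speeds(speeds, nspeeds);
}
#endif	/* CPUPM_EXAMPLE_SKETCH */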
651 
652 /*
653  * True if P-states are enabled and all CPU instances have been initialized.
654  */
655 boolean_t
656 cpupm_power_ready(void)
657 {
658 	return (cpupm_is_enabled(CPUPM_P_STATES) && cpupm_is_ready());
659 }
660 
661 /*
662  * True if T-states are enabled and all CPU instances have been initialized.
663  */
664 boolean_t
665 cpupm_throttle_ready(void)
666 {
667 	return (cpupm_is_enabled(CPUPM_T_STATES) && cpupm_is_ready());
668 }
669 
670 /*
671  * True if C-states are enabled and all CPU instances have been initialized.
672  */
673 boolean_t
674 cpupm_cstate_ready(void)
675 {
676 	return (cpupm_is_enabled(CPUPM_C_STATES) && cpupm_is_ready());
677 }
678 
679 void
680 cpupm_notify_handler(ACPI_HANDLE obj, UINT32 val, void *ctx)
681 {
682 	cpu_t *cp = ctx;
683 	cpupm_mach_state_t *mach_state =
684 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
685 	cpupm_notification_t *entry;
686 
687 	mutex_enter(&mach_state->ms_lock);
688 	for (entry =  mach_state->ms_handlers; entry != NULL;
689 	    entry = entry->nq_next) {
690 		entry->nq_handler(obj, val, entry->nq_ctx);
691 	}
692 	mutex_exit(&mach_state->ms_lock);
693 }
694 
695 /*ARGSUSED*/
696 void
697 cpupm_add_notify_handler(cpu_t *cp, CPUPM_NOTIFY_HANDLER handler, void *ctx)
698 {
699 #ifndef __xpv
700 	cpupm_mach_state_t *mach_state =
701 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
702 	cpupm_notification_t *entry;
703 
704 	entry = kmem_zalloc(sizeof (cpupm_notification_t), KM_SLEEP);
705 	entry->nq_handler = handler;
706 	entry->nq_ctx = ctx;
707 	mutex_enter(&mach_state->ms_lock);
708 	if (mach_state->ms_handlers == NULL) {
709 		entry->nq_next = NULL;
710 		mach_state->ms_handlers = entry;
711 		cpu_acpi_install_notify_handler(mach_state->ms_acpi_handle,
712 		    cpupm_notify_handler, cp);
713 
714 	} else {
715 		entry->nq_next = mach_state->ms_handlers;
716 		mach_state->ms_handlers = entry;
717 	}
718 	mutex_exit(&mach_state->ms_lock);
719 #endif
720 }
721 
722 /*ARGSUSED*/
723 static void
724 cpupm_free_notify_handlers(cpu_t *cp)
725 {
726 #ifndef __xpv
727 	cpupm_mach_state_t *mach_state =
728 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
729 	cpupm_notification_t *entry;
730 	cpupm_notification_t *next;
731 
732 	mutex_enter(&mach_state->ms_lock);
733 	if (mach_state->ms_handlers == NULL) {
734 		mutex_exit(&mach_state->ms_lock);
735 		return;
736 	}
737 	if (mach_state->ms_acpi_handle != NULL) {
738 		cpu_acpi_remove_notify_handler(mach_state->ms_acpi_handle,
739 		    cpupm_notify_handler);
740 	}
741 	entry = mach_state->ms_handlers;
742 	while (entry != NULL) {
743 		next = entry->nq_next;
744 		kmem_free(entry, sizeof (cpupm_notification_t));
745 		entry = next;
746 	}
747 	mach_state->ms_handlers = NULL;
748 	mutex_exit(&mach_state->ms_lock);
749 #endif
750 }
751 
752 /*
753  * Get the current max speed from the ACPI _PPC object
754  */
755 /*ARGSUSED*/
756 int
757 cpupm_get_top_speed(cpu_t *cp)
758 {
759 #ifndef __xpv
760 	cpupm_mach_state_t 	*mach_state;
761 	cpu_acpi_handle_t 	handle;
762 	int 			plat_level;
763 	uint_t			nspeeds;
764 	int			max_level;
765 
766 	mach_state =
767 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
768 	handle = mach_state->ms_acpi_handle;
769 
770 	cpu_acpi_cache_ppc(handle);
771 	plat_level = CPU_ACPI_PPC(handle);
772 
773 	nspeeds = CPU_ACPI_PSTATES_COUNT(handle);
774 
775 	max_level = nspeeds - 1;
776 	if ((plat_level < 0) || (plat_level > max_level)) {
777 		cmn_err(CE_NOTE, "!cpupm_get_top_speed: CPU %d: "
778 		    "_PPC out of range %d", cp->cpu_id, plat_level);
779 		plat_level = 0;
780 	}
781 
782 	return (plat_level);
783 #else
784 	return (0);
785 #endif
786 }
787 
788 /*
789  * This notification handler is called whenever the ACPI _PPC
790  * object changes. The _PPC is a sort of governor on power levels.
791  * It sets an upper threshold on which _PSS-defined power levels
792  * are usable. The _PPC value is dynamic and may change as properties
793  * of the system (e.g., thermal conditions or AC source) change.
794  */
795 
796 static void
797 cpupm_power_manage_notifications(void *ctx)
798 {
799 	cpu_t			*cp = ctx;
800 	int			top_speed;
801 
802 	top_speed = cpupm_get_top_speed(cp);
803 	cpupm_redefine_max_activepwr_state(cp, top_speed);
804 }
805 
806 /* ARGSUSED */
807 static void
808 cpupm_event_notify_handler(ACPI_HANDLE obj, UINT32 val, void *ctx)
809 {
810 #ifndef __xpv
811 
812 	cpu_t *cp = ctx;
813 	cpupm_mach_state_t *mach_state =
814 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
815 
816 	if (mach_state == NULL)
817 		return;
818 
819 	/*
820 	 * Currently, we handle _TPC, _CST and _PPC change notifications.
821 	 */
822 	if (val == CPUPM_TPC_CHANGE_NOTIFICATION &&
823 	    mach_state->ms_caps & CPUPM_T_STATES) {
824 		cpupm_throttle_manage_notification(ctx);
825 	} else if (val == CPUPM_CST_CHANGE_NOTIFICATION &&
826 	    mach_state->ms_caps & CPUPM_C_STATES) {
827 		cpuidle_manage_cstates(ctx);
828 	} else if (val == CPUPM_PPC_CHANGE_NOTIFICATION &&
829 	    mach_state->ms_caps & CPUPM_P_STATES) {
830 		cpupm_power_manage_notifications(ctx);
831 	}
832 #endif
833 }
834 
835 /*
836  * Update cpupm cstate data each time CPU exits idle.
837  */
838 void
839 cpupm_wakeup_cstate_data(cma_c_state_t *cs_data, hrtime_t end)
840 {
841 	cs_data->cs_idle_exit = end;
842 }
843 
844 /*
845  * Determine next cstate based on cpupm data.
846  * Update cpupm cstate data each time CPU goes idle.
847  * Do as much as possible in the idle state bookkeeping function because the
848  * performance impact while idle is minimal compared to in the wakeup function
849  * when there is real work to do.
850  */
851 uint32_t
852 cpupm_next_cstate(cma_c_state_t *cs_data, cpu_acpi_cstate_t *cstates,
853     uint32_t cs_count, hrtime_t start)
854 {
855 	hrtime_t duration;
856 	hrtime_t ave_interval;
857 	hrtime_t ave_idle_time;
858 	uint32_t i;
859 
860 	duration = cs_data->cs_idle_exit - cs_data->cs_idle_enter;
861 	scalehrtime(&duration);
862 	cs_data->cs_idle += duration;
863 	cs_data->cs_idle_enter = start;
864 
865 	++cs_data->cs_cnt;
866 	if (cs_data->cs_cnt > cpupm_cs_sample_tunable) {
867 		cs_data->cs_smpl_len = start - cs_data->cs_smpl_start;
868 		scalehrtime(&cs_data->cs_smpl_len);
869 		cs_data->cs_smpl_len |= 1;	/* protect from DIV 0 */
870 		cs_data->cs_smpl_idle = cs_data->cs_idle;
871 		cs_data->cs_idle = 0;
872 		cs_data->cs_smpl_idle_pct = ((100 * cs_data->cs_smpl_idle) /
873 		    cs_data->cs_smpl_len);
874 
875 		cs_data->cs_smpl_start = start;
876 		cs_data->cs_cnt = 0;
877 
878 		/*
879 		 * Strand level C-state policy
880 		 * The cpu_acpi_cstate_t *cstates array is not required to
881 		 * have an entry for both CPU_ACPI_C2 and CPU_ACPI_C3.
882 		 * There are cs_count entries in the cstates array.
883 		 * cs_data->cs_next_cstate contains the index of the next
884 		 * C-state this CPU should enter.
885 		 */
886 		ASSERT(cstates[0].cs_type == CPU_ACPI_C1);
887 
888 		/*
889 		 * Will CPU be idle long enough to save power?
890 		 */
891 		ave_idle_time = (cs_data->cs_smpl_idle /
892 		    cpupm_cs_sample_tunable) / 1000;
893 		for (i = 1; i < cs_count; ++i) {
894 			if (ave_idle_time < (cstates[i].cs_latency *
895 			    cpupm_cs_idle_save_tunable)) {
896 				cs_count = i;
897 				DTRACE_PROBE2(cpupm__next__cstate, cpu_t *,
898 				    CPU, int, i);
899 			}
900 		}
901 
902 		/*
903 		 * Wakeup often (even when non-idle time is very short)?
904 		 * Some producer/consumer type loads fall into this category.
905 		 */
906 		ave_interval = (cs_data->cs_smpl_len / cpupm_cs_sample_tunable)
907 		    / 1000;
908 		for (i = 1; i < cs_count; ++i) {
909 			if (ave_interval <= (cstates[i].cs_latency *
910 			    cpupm_cs_idle_cost_tunable)) {
911 				cs_count = i;
912 				DTRACE_PROBE2(cpupm__next__cstate, cpu_t *,
913 				    CPU, int, (CPU_MAX_CSTATES + i));
914 			}
915 		}
916 
917 		/*
918 		 * Idle percent
919 		 */
920 		for (i = 1; i < cs_count; ++i) {
921 			switch (cstates[i].cs_type) {
922 			case CPU_ACPI_C2:
923 				if (cs_data->cs_smpl_idle_pct <
924 				    cpupm_C2_idle_pct_tunable) {
925 					cs_count = i;
926 					DTRACE_PROBE2(cpupm__next__cstate,
927 					    cpu_t *, CPU, int,
928 					    ((2 * CPU_MAX_CSTATES) + i));
929 				}
930 				break;
931 
932 			case CPU_ACPI_C3:
933 				if (cs_data->cs_smpl_idle_pct <
934 				    cpupm_C3_idle_pct_tunable) {
935 					cs_count = i;
936 					DTRACE_PROBE2(cpupm__next__cstate,
937 					    cpu_t *, CPU, int,
938 					    ((2 * CPU_MAX_CSTATES) + i));
939 				}
940 				break;
941 			}
942 		}
943 
944 		cs_data->cs_next_cstate = cs_count - 1;
945 	}
946 
947 	return (cs_data->cs_next_cstate);
948 }
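
/*
 * Illustrative sketch (not part of the original file): a simplified view of
 * how an idle driver could drive the two routines above around one idle
 * period.  The real consumer does additional bookkeeping; the example_*
 * names are hypothetical and the guard keeps this out of any build.
 */
#ifdef	CPUPM_EXAMPLE_SKETCH
extern void example_enter_cstate(cpu_acpi_cstate_t *, uint32_t);

static void
example_idle_once(cpu_t *cp, cpu_acpi_cstate_t *cstates, uint32_t cs_count)
{
	cpupm_mach_state_t *mach_state =
	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
	cma_c_state_t *cs_data = mach_state->ms_cstate.cma_state.cstate;
	uint32_t next;

	/* Pick the deepest C-state that recent idle history justifies. */
	next = cpupm_next_cstate(cs_data, cstates, cs_count,
	    gethrtime_unscaled());

	example_enter_cstate(cstates, next);	/* hypothetical entry helper */

	/* Record the wakeup so the next decision sees this idle period. */
	cpupm_wakeup_cstate_data(cs_data, gethrtime_unscaled());
}
#endif	/* CPUPM_EXAMPLE_SKETCH */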
949