xref: /illumos-gate/usr/src/uts/i86pc/os/cpupm/cpupm_mach.c (revision 78d5422c60f052a980af8624e3b6c7f589577c8f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2009, Intel Corporation.
27  * All rights reserved.
28  */
29 
30 #include <sys/cpu_pm.h>
31 #include <sys/x86_archext.h>
32 #include <sys/sdt.h>
33 #include <sys/spl.h>
34 #include <sys/machsystm.h>
35 #include <sys/hpet.h>
36 #include <sys/acpi/acpi.h>
37 #include <sys/acpica.h>
38 #include <sys/cpupm.h>
39 #include <sys/cpu_idle.h>
40 #include <sys/cpu_acpi.h>
41 #include <sys/cpupm_throttle.h>
42 #include <sys/dtrace.h>
43 
44 /*
45  * This callback is used to build the PPM CPU domains once
46  * all the CPU devices have been started. The callback is
47  * initialized by the PPM driver to point to a routine that
48  * will build the domains. It is invoked, if non-NULL, from
 * cpupm_post_startup().
49  */
50 void (*cpupm_rebuild_cpu_domains)(void);
51 
52 /*
53  * This callback is used to reset the topspeed for all the
54  * CPU devices. The callback is initialized by the PPM driver to
55  * point to a routine that will reinitialize all the CPU devices
56  * once all the CPU devices have been started and the CPU domains
57  * built. It is invoked from cpupm_post_startup() only while P-states
 * are still enabled.
58  */
59 void (*cpupm_init_topspeed)(void);
60 
61 /*
62  * This callback is used to redefine the topspeed for a CPU device.
63  * Since all CPUs in a domain should have identical properties, this
64  * callback is initialized by the PPM driver to point to a routine
65  * that will redefine the topspeed for all devices in a CPU domain.
66  * This callback is exercised whenever an ACPI _PPC change notification
67  * is received by the CPU driver.
68  */
69 void (*cpupm_redefine_topspeed)(void *);
70 
71 /*
72  * This callback is used by the PPM driver to call into the CPU driver
73  * to set a new topspeed for a CPU.
74  */
75 void (*cpupm_set_topspeed_callb)(void *, int);
76 
77 /*
78  * This callback is used by the PPM driver to call into the CPU driver
79  * to find a CPU's current topspeed (i.e., its current ACPI _PPC value).
80  */
81 int (*cpupm_get_topspeed_callb)(void *);
82 
83 static void cpupm_event_notify_handler(ACPI_HANDLE, UINT32, void *);
84 static void cpupm_free_notify_handlers(cpu_t *);
85 
86 /*
87  * Until proven otherwise, all power states are manageable.
 * cpupm_disable() clears bits from this mask; nothing in this file
 * ever sets them again.
88  */
89 static uint32_t cpupm_enabled = CPUPM_ALL_STATES;
90 
91 /*
92  * Until all CPUs have started, we do not allow
93  * power management. Set to B_TRUE by cpupm_post_startup().
94  */
95 static boolean_t cpupm_ready = B_FALSE;
96 
/*
 * Heads of the singly linked (pm_next) lists of P-, T- and C-state
 * domains.  Entries are added at the head by cpupm_alloc_domains() and
 * a whole list is released by cpupm_free_domains().
 */
97 cpupm_state_domains_t *cpupm_pstate_domains = NULL;
98 cpupm_state_domains_t *cpupm_tstate_domains = NULL;
99 cpupm_state_domains_t *cpupm_cstate_domains = NULL;
100 
101 /*
102  * c-state tunables
103  *
 * cpupm_cs_sample_tunable is the number of idle entries that make up one
 * decision period in cpupm_next_cstate(); it is also used there as a
 * divisor when averaging, so it must not be set to zero.
 *
104  * cpupm_cs_idle_cost_tunable is the ratio of time CPU spends executing + idle
105  * divided by time spent in the idle state transitions.
106  * A value of 10 means the CPU will not spend more than 1/10 of its time
107  * in idle latency.  The worst case performance will be 90% of non Deep C-state
108  * kernel.
109  *
110  * cpupm_cs_idle_save_tunable is how long we must stay in a deeper C-state
111  * before it is worth going there.  Expressed as a multiple of latency.
 *
 * cpupm_C2_idle_pct_tunable and cpupm_C3_idle_pct_tunable are the minimum
 * idle percentage of the last sample period below which
 * cpupm_next_cstate() will not select a C2 (respectively C3) entry.
112  */
113 uint32_t cpupm_cs_sample_tunable = 5;		/* samples in decision period */
114 uint32_t cpupm_cs_idle_cost_tunable = 10;	/* work time / latency cost */
115 uint32_t cpupm_cs_idle_save_tunable = 2;	/* idle power savings */
116 uint16_t cpupm_C2_idle_pct_tunable = 70;	/* min idle % to allow C2 */
117 uint16_t cpupm_C3_idle_pct_tunable = 80;	/* min idle % to allow C3 */
118 
119 #ifndef __xpv
/*
 * Vendor specific initialization routines, defined outside this file.
 * cpupm_init() treats a non-zero return as "this module manages the CPU".
 */
120 extern boolean_t cpupm_intel_init(cpu_t *);
121 extern boolean_t cpupm_amd_init(cpu_t *);
122 
/*
 * One entry of the vendor table: the routine to ask whether the vendor
 * module can power manage a given CPU.
 */
123 typedef struct cpupm_vendor {
124 	boolean_t	(*cpuv_init)(cpu_t *);
125 } cpupm_vendor_t;
126 
127 /*
128  * Table of supported vendors.
 * cpupm_init() walks the table in order and stops at the first entry
 * whose cpuv_init routine accepts the CPU.  The entry with a NULL
 * cpuv_init terminates the table and must stay last.
129  */
130 static cpupm_vendor_t cpupm_vendors[] = {
131 	cpupm_intel_init,
132 	cpupm_amd_init,
133 	NULL
134 };
135 #endif
136 
137 /*
138  * Initialize the machine.
139  * See if a module exists for managing power for this CPU.
140  */
141 /*ARGSUSED*/
142 void
143 cpupm_init(cpu_t *cp)
144 {
145 #ifndef __xpv
146 	cpupm_vendor_t *vendors;
147 	cpupm_mach_state_t *mach_state;
148 	struct machcpu *mcpu = &(cp->cpu_m);
149 	static boolean_t first = B_TRUE;
150 	int *speeds;
151 	uint_t nspeeds;
152 	int ret;
153 
154 	mach_state = cp->cpu_m.mcpu_pm_mach_state =
155 	    kmem_zalloc(sizeof (cpupm_mach_state_t), KM_SLEEP);
156 	mach_state->ms_caps = CPUPM_NO_STATES;
157 	mutex_init(&mach_state->ms_lock, NULL, MUTEX_DRIVER, NULL);
158 
159 	mach_state->ms_acpi_handle = cpu_acpi_init(cp);
160 	if (mach_state->ms_acpi_handle == NULL) {
161 		cpupm_free(cp);
162 		cmn_err(CE_WARN, "!cpupm_init: processor %d: "
163 		    "unable to get ACPI handle", cp->cpu_id);
164 		cmn_err(CE_NOTE, "!CPU power management will not function.");
165 		CPUPM_DISABLE();
166 		first = B_FALSE;
167 		return;
168 	}
169 
170 	/*
171 	 * Loop through the CPU management module table and see if
172 	 * any of the modules implement CPU power management
173 	 * for this CPU.
174 	 */
175 	for (vendors = cpupm_vendors; vendors->cpuv_init != NULL; vendors++) {
176 		if (vendors->cpuv_init(cp))
177 			break;
178 	}
179 
180 	/*
181 	 * Nope, we can't power manage this CPU.
182 	 */
183 	if (vendors == NULL) {
184 		cpupm_free(cp);
185 		CPUPM_DISABLE();
186 		first = B_FALSE;
187 		return;
188 	}
189 
190 	/*
191 	 * If P-state support exists for this system, then initialize it.
192 	 */
193 	if (mach_state->ms_pstate.cma_ops != NULL) {
194 		ret = mach_state->ms_pstate.cma_ops->cpus_init(cp);
195 		if (ret != 0) {
196 			mach_state->ms_pstate.cma_ops = NULL;
197 			cpupm_disable(CPUPM_P_STATES);
198 		} else {
199 			nspeeds = cpupm_get_speeds(cp, &speeds);
200 			if (nspeeds == 0) {
201 				cmn_err(CE_NOTE, "!cpupm_init: processor %d:"
202 				    " no speeds to manage", cp->cpu_id);
203 			} else {
204 				cpupm_set_supp_freqs(cp, speeds, nspeeds);
205 				cpupm_free_speeds(speeds, nspeeds);
206 				mach_state->ms_caps |= CPUPM_P_STATES;
207 			}
208 		}
209 	}
210 
211 	if (mach_state->ms_tstate.cma_ops != NULL) {
212 		ret = mach_state->ms_tstate.cma_ops->cpus_init(cp);
213 		if (ret != 0) {
214 			mach_state->ms_tstate.cma_ops = NULL;
215 			cpupm_disable(CPUPM_T_STATES);
216 		} else {
217 			mach_state->ms_caps |= CPUPM_T_STATES;
218 		}
219 	}
220 
221 	/*
222 	 * If C-states support exists for this system, then initialize it.
223 	 */
224 	if (mach_state->ms_cstate.cma_ops != NULL) {
225 		ret = mach_state->ms_cstate.cma_ops->cpus_init(cp);
226 		if (ret != 0) {
227 			mach_state->ms_cstate.cma_ops = NULL;
228 			mcpu->max_cstates = CPU_ACPI_C1;
229 			cpupm_disable(CPUPM_C_STATES);
230 			idle_cpu = non_deep_idle_cpu;
231 			disp_enq_thread = non_deep_idle_disp_enq_thread;
232 		} else if (cpu_deep_cstates_supported()) {
233 			mcpu->max_cstates = cpu_acpi_get_max_cstates(
234 			    mach_state->ms_acpi_handle);
235 			if (mcpu->max_cstates > CPU_ACPI_C1) {
236 				(void) cstate_timer_callback(
237 				    CST_EVENT_MULTIPLE_CSTATES);
238 				CPU->cpu_m.mcpu_idle_cpu = cpu_acpi_idle;
239 				mcpu->mcpu_idle_type = CPU_ACPI_C1;
240 				disp_enq_thread = cstate_wakeup;
241 			} else {
242 				(void) cstate_timer_callback(
243 				    CST_EVENT_ONE_CSTATE);
244 			}
245 			mach_state->ms_caps |= CPUPM_C_STATES;
246 		} else {
247 			mcpu->max_cstates = CPU_ACPI_C1;
248 			idle_cpu = non_deep_idle_cpu;
249 			disp_enq_thread = non_deep_idle_disp_enq_thread;
250 		}
251 	}
252 
253 
254 	if (mach_state->ms_caps == CPUPM_NO_STATES) {
255 		cpupm_free(cp);
256 		CPUPM_DISABLE();
257 		first = B_FALSE;
258 		return;
259 	}
260 
261 	if ((mach_state->ms_caps & CPUPM_T_STATES) ||
262 	    (mach_state->ms_caps & CPUPM_P_STATES) ||
263 	    (mach_state->ms_caps & CPUPM_C_STATES)) {
264 		cpupm_add_notify_handler(cp, cpupm_event_notify_handler, cp);
265 		if (first) {
266 			acpica_write_cpupm_capabilities(
267 			    mach_state->ms_caps & CPUPM_P_STATES,
268 			    mach_state->ms_caps & CPUPM_C_STATES);
269 		}
270 	}
271 	first = B_FALSE;
272 #endif
273 }
274 
275 /*
276  * Free any resources allocated by cpupm_init().
277  */
278 /*ARGSUSED*/
279 void
280 cpupm_free(cpu_t *cp)
281 {
282 #ifndef __xpv
283 	cpupm_mach_state_t *mach_state =
284 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
285 
286 	if (mach_state == NULL)
287 		return;
288 	if (mach_state->ms_pstate.cma_ops != NULL) {
289 		mach_state->ms_pstate.cma_ops->cpus_fini(cp);
290 		mach_state->ms_pstate.cma_ops = NULL;
291 	}
292 
293 	if (mach_state->ms_tstate.cma_ops != NULL) {
294 		mach_state->ms_tstate.cma_ops->cpus_fini(cp);
295 		mach_state->ms_tstate.cma_ops = NULL;
296 	}
297 
298 	if (mach_state->ms_cstate.cma_ops != NULL) {
299 		mach_state->ms_cstate.cma_ops->cpus_fini(cp);
300 		mach_state->ms_cstate.cma_ops = NULL;
301 	}
302 
303 	cpupm_free_notify_handlers(cp);
304 
305 	if (mach_state->ms_acpi_handle != NULL) {
306 		cpu_acpi_fini(mach_state->ms_acpi_handle);
307 		mach_state->ms_acpi_handle = NULL;
308 	}
309 
310 	mutex_destroy(&mach_state->ms_lock);
311 	kmem_free(mach_state, sizeof (cpupm_mach_state_t));
312 	cp->cpu_m.mcpu_pm_mach_state = NULL;
313 #endif
314 }
315 
316 /*
317  * If all CPUs have started and at least one power state is manageable,
318  * then the CPUs are ready for power management.
319  */
320 boolean_t
321 cpupm_is_ready()
322 {
323 #ifndef __xpv
324 	if (cpupm_enabled == CPUPM_NO_STATES)
325 		return (B_FALSE);
326 	return (cpupm_ready);
327 #else
328 	return (B_FALSE);
329 #endif
330 
331 }
332 
333 boolean_t
334 cpupm_is_enabled(uint32_t state)
335 {
336 	return ((cpupm_enabled & state) == state);
337 }
338 
339 /*
340  * By default, all states are enabled.
341  */
342 void
343 cpupm_disable(uint32_t state)
344 {
345 
346 	if (state & CPUPM_P_STATES) {
347 		cpupm_free_domains(&cpupm_pstate_domains);
348 	}
349 	if (state & CPUPM_T_STATES) {
350 		cpupm_free_domains(&cpupm_tstate_domains);
351 	}
352 	if (state & CPUPM_C_STATES) {
353 		cpupm_free_domains(&cpupm_cstate_domains);
354 	}
355 	cpupm_enabled &= ~state;
356 }
357 
358 /*
359  * Once all CPUs have been started, the PPM driver should build CPU
360  * domains and initialize the topspeed for all CPU devices.
361  */
362 void
363 cpupm_post_startup()
364 {
365 #ifndef __xpv
366 	/*
367 	 * The CPU domain built by the PPM during CPUs attaching
368 	 * should be rebuilt with the information retrieved from
369 	 * ACPI.
370 	 */
371 	if (cpupm_rebuild_cpu_domains != NULL)
372 		(*cpupm_rebuild_cpu_domains)();
373 
374 	/*
375 	 * Only initialize the topspeed if P-states are enabled.
376 	 */
377 	if (cpupm_enabled & CPUPM_P_STATES && cpupm_init_topspeed != NULL)
378 		(*cpupm_init_topspeed)();
379 #endif
380 	cpupm_ready = B_TRUE;
381 }
382 
383 /*
384  * Allocate power domains for C,P and T States
385  */
386 void
387 cpupm_alloc_domains(cpu_t *cp, int state)
388 {
389 	cpupm_mach_state_t *mach_state =
390 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
391 	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
392 	cpupm_state_domains_t **dom_ptr;
393 	cpupm_state_domains_t *dptr;
394 	cpupm_state_domains_t **mach_dom_state_ptr;
395 	uint32_t domain;
396 	uint32_t type;
397 
398 	switch (state) {
399 	case CPUPM_P_STATES:
400 		if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_PSD_CACHED)) {
401 			domain = CPU_ACPI_PSD(handle).sd_domain;
402 			type = CPU_ACPI_PSD(handle).sd_type;
403 		} else {
404 			mutex_enter(&cpu_lock);
405 			domain = cpuid_get_chipid(cp);
406 			mutex_exit(&cpu_lock);
407 			type = CPU_ACPI_HW_ALL;
408 		}
409 		dom_ptr = &cpupm_pstate_domains;
410 		mach_dom_state_ptr = &mach_state->ms_pstate.cma_domain;
411 		break;
412 	case CPUPM_T_STATES:
413 		if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_TSD_CACHED)) {
414 			domain = CPU_ACPI_TSD(handle).sd_domain;
415 			type = CPU_ACPI_TSD(handle).sd_type;
416 		} else {
417 			mutex_enter(&cpu_lock);
418 			domain = cpuid_get_chipid(cp);
419 			mutex_exit(&cpu_lock);
420 			type = CPU_ACPI_HW_ALL;
421 		}
422 		dom_ptr = &cpupm_tstate_domains;
423 		mach_dom_state_ptr = &mach_state->ms_tstate.cma_domain;
424 		break;
425 	case CPUPM_C_STATES:
426 		if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_CSD_CACHED)) {
427 			domain = CPU_ACPI_CSD(handle).sd_domain;
428 			type = CPU_ACPI_CSD(handle).sd_type;
429 		} else {
430 			mutex_enter(&cpu_lock);
431 			domain = cpuid_get_coreid(cp);
432 			mutex_exit(&cpu_lock);
433 			type = CPU_ACPI_HW_ALL;
434 		}
435 		dom_ptr = &cpupm_cstate_domains;
436 		mach_dom_state_ptr = &mach_state->ms_cstate.cma_domain;
437 		break;
438 	default:
439 		return;
440 	}
441 
442 	for (dptr = *dom_ptr; dptr != NULL; dptr = dptr->pm_next) {
443 		if (dptr->pm_domain == domain)
444 			break;
445 	}
446 
447 	/* new domain is created and linked at the head */
448 	if (dptr == NULL) {
449 		dptr = kmem_zalloc(sizeof (cpupm_state_domains_t), KM_SLEEP);
450 		dptr->pm_domain = domain;
451 		dptr->pm_type = type;
452 		dptr->pm_next = *dom_ptr;
453 		mutex_init(&dptr->pm_lock, NULL, MUTEX_SPIN,
454 		    (void *)ipltospl(DISP_LEVEL));
455 		CPUSET_ZERO(dptr->pm_cpus);
456 		*dom_ptr = dptr;
457 	}
458 	CPUSET_ADD(dptr->pm_cpus, cp->cpu_id);
459 	*mach_dom_state_ptr = dptr;
460 }
461 
462 /*
463  * Free C, P or T state power domains
464  */
465 void
466 cpupm_free_domains(cpupm_state_domains_t **dom_ptr)
467 {
468 	cpupm_state_domains_t *this_domain, *next_domain;
469 
470 	this_domain = *dom_ptr;
471 	while (this_domain != NULL) {
472 		next_domain = this_domain->pm_next;
473 		mutex_destroy(&this_domain->pm_lock);
474 		kmem_free((void *)this_domain,
475 		    sizeof (cpupm_state_domains_t));
476 		this_domain = next_domain;
477 	}
478 	*dom_ptr = NULL;
479 }
480 
481 void
482 cpupm_alloc_ms_cstate(cpu_t *cp)
483 {
484 	cpupm_mach_state_t *mach_state;
485 	cpupm_mach_acpi_state_t *ms_cstate;
486 
487 	mach_state = (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
488 	ms_cstate = &mach_state->ms_cstate;
489 	ASSERT(ms_cstate->cma_state.cstate == NULL);
490 	ms_cstate->cma_state.cstate = kmem_zalloc(sizeof (cma_c_state_t),
491 	    KM_SLEEP);
492 	ms_cstate->cma_state.cstate->cs_next_cstate = CPU_ACPI_C1;
493 }
494 
495 void
496 cpupm_free_ms_cstate(cpu_t *cp)
497 {
498 	cpupm_mach_state_t *mach_state =
499 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
500 	cpupm_mach_acpi_state_t *ms_cstate = &mach_state->ms_cstate;
501 
502 	if (ms_cstate->cma_state.cstate != NULL) {
503 		kmem_free(ms_cstate->cma_state.cstate, sizeof (cma_c_state_t));
504 		ms_cstate->cma_state.cstate = NULL;
505 	}
506 }
507 
508 void
509 cpupm_state_change(cpu_t *cp, int level, int state)
510 {
511 	cpupm_mach_state_t	*mach_state =
512 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
513 	cpupm_state_ops_t	*state_ops;
514 	cpupm_state_domains_t  	*state_domain;
515 	cpuset_t		set;
516 
517 	DTRACE_PROBE2(cpupm__state__change, cpu_t *, cp, int, level);
518 
519 	if (mach_state == NULL) {
520 		return;
521 	}
522 
523 	switch (state) {
524 	case CPUPM_P_STATES:
525 		state_ops = mach_state->ms_pstate.cma_ops;
526 		state_domain = mach_state->ms_pstate.cma_domain;
527 		break;
528 	case CPUPM_T_STATES:
529 		state_ops = mach_state->ms_tstate.cma_ops;
530 		state_domain = mach_state->ms_tstate.cma_domain;
531 		break;
532 	default:
533 		break;
534 	}
535 
536 	switch (state_domain->pm_type) {
537 	case CPU_ACPI_SW_ANY:
538 		/*
539 		 * A request on any CPU in the domain transitions the domain
540 		 */
541 		CPUSET_ONLY(set, cp->cpu_id);
542 		state_ops->cpus_change(set, level);
543 		break;
544 	case CPU_ACPI_SW_ALL:
545 		/*
546 		 * All CPUs in the domain must request the transition
547 		 */
548 	case CPU_ACPI_HW_ALL:
549 		/*
550 		 * P/T-state transitions are coordinated by the hardware
551 		 * For now, request the transition on all CPUs in the domain,
552 		 * but looking ahead we can probably be smarter about this.
553 		 */
554 		mutex_enter(&state_domain->pm_lock);
555 		state_ops->cpus_change(state_domain->pm_cpus, level);
556 		mutex_exit(&state_domain->pm_lock);
557 		break;
558 	default:
559 		cmn_err(CE_NOTE, "Unknown domain coordination type: %d",
560 		    state_domain->pm_type);
561 	}
562 }
563 
564 /*
565  * CPU PM interfaces exposed to the CPU power manager
566  */
567 /*ARGSUSED*/
568 id_t
569 cpupm_plat_domain_id(cpu_t *cp, cpupm_dtype_t type)
570 {
571 	cpupm_mach_state_t	*mach_state =
572 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
573 
574 	if ((mach_state == NULL) || (!cpupm_is_enabled(CPUPM_P_STATES) &&
575 	    !cpupm_is_enabled(CPUPM_C_STATES))) {
576 		return (CPUPM_NO_DOMAIN);
577 	}
578 	if (type == CPUPM_DTYPE_ACTIVE) {
579 		/*
580 		 * Return P-State domain for the specified CPU
581 		 */
582 		if (mach_state->ms_pstate.cma_domain) {
583 			return (mach_state->ms_pstate.cma_domain->pm_domain);
584 		}
585 	} else if (type == CPUPM_DTYPE_IDLE) {
586 		/*
587 		 * Return C-State domain for the specified CPU
588 		 */
589 		if (mach_state->ms_cstate.cma_domain) {
590 			return (mach_state->ms_cstate.cma_domain->pm_domain);
591 		}
592 	}
593 	return (CPUPM_NO_DOMAIN);
594 }
595 
596 /*ARGSUSED*/
597 uint_t
598 cpupm_plat_state_enumerate(cpu_t *cp, cpupm_dtype_t type,
599     cpupm_state_t *states)
600 {
601 	int	*speeds;
602 	uint_t	nspeeds, i;
603 
604 	/*
605 	 * Idle domain support unimplemented
606 	 */
607 	if (type != CPUPM_DTYPE_ACTIVE) {
608 		return (0);
609 	}
610 	nspeeds = cpupm_get_speeds(cp, &speeds);
611 
612 	/*
613 	 * If the caller passes NULL for states, just return the
614 	 * number of states.
615 	 */
616 	if (states != NULL) {
617 		for (i = 0; i < nspeeds; i++) {
618 			states[i].cps_speed = speeds[i];
619 			states[i].cps_handle = (cpupm_handle_t)i;
620 		}
621 	}
622 	cpupm_free_speeds(speeds, nspeeds);
623 	return (nspeeds);
624 }
625 
626 /*ARGSUSED*/
627 int
628 cpupm_plat_change_state(cpu_t *cp, cpupm_state_t *state)
629 {
630 	if (!cpupm_is_ready())
631 		return (-1);
632 
633 	cpupm_state_change(cp, (int)state->cps_handle, CPUPM_P_STATES);
634 
635 	return (0);
636 }
637 
638 /*ARGSUSED*/
639 /*
640  * Note: It is the responsibility of the users of
641  * cpupm_get_speeds() to free the memory allocated
642  * for speeds using cpupm_free_speeds()
643  */
644 uint_t
645 cpupm_get_speeds(cpu_t *cp, int **speeds)
646 {
647 #ifndef __xpv
648 	cpupm_mach_state_t *mach_state =
649 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
650 	return (cpu_acpi_get_speeds(mach_state->ms_acpi_handle, speeds));
651 #else
652 	return (0);
653 #endif
654 }
655 
656 /*ARGSUSED*/
657 void
658 cpupm_free_speeds(int *speeds, uint_t nspeeds)
659 {
660 #ifndef __xpv
661 	cpu_acpi_free_speeds(speeds, nspeeds);
662 #endif
663 }
664 
665 /*
666  * All CPU instances have been initialized successfully.
667  */
668 boolean_t
669 cpupm_power_ready(void)
670 {
671 	return (cpupm_is_enabled(CPUPM_P_STATES) && cpupm_is_ready());
672 }
673 
674 /*
675  * All CPU instances have been initialized successfully.
676  */
677 boolean_t
678 cpupm_throttle_ready(void)
679 {
680 	return (cpupm_is_enabled(CPUPM_T_STATES) && cpupm_is_ready());
681 }
682 
683 /*
684  * All CPU instances have been initialized successfully.
685  */
686 boolean_t
687 cpupm_cstate_ready(void)
688 {
689 	return (cpupm_is_enabled(CPUPM_C_STATES) && cpupm_is_ready());
690 }
691 
692 void
693 cpupm_notify_handler(ACPI_HANDLE obj, UINT32 val, void *ctx)
694 {
695 	cpu_t *cp = ctx;
696 	cpupm_mach_state_t *mach_state =
697 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
698 	cpupm_notification_t *entry;
699 
700 	mutex_enter(&mach_state->ms_lock);
701 	for (entry =  mach_state->ms_handlers; entry != NULL;
702 	    entry = entry->nq_next) {
703 		entry->nq_handler(obj, val, entry->nq_ctx);
704 	}
705 	mutex_exit(&mach_state->ms_lock);
706 }
707 
708 /*ARGSUSED*/
709 void
710 cpupm_add_notify_handler(cpu_t *cp, CPUPM_NOTIFY_HANDLER handler, void *ctx)
711 {
712 #ifndef __xpv
713 	cpupm_mach_state_t *mach_state =
714 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
715 	cpupm_notification_t *entry;
716 
717 	entry = kmem_zalloc(sizeof (cpupm_notification_t), KM_SLEEP);
718 	entry->nq_handler = handler;
719 	entry->nq_ctx = ctx;
720 	mutex_enter(&mach_state->ms_lock);
721 	if (mach_state->ms_handlers == NULL) {
722 		entry->nq_next = NULL;
723 		mach_state->ms_handlers = entry;
724 		cpu_acpi_install_notify_handler(mach_state->ms_acpi_handle,
725 		    cpupm_notify_handler, cp);
726 
727 	} else {
728 		entry->nq_next = mach_state->ms_handlers;
729 		mach_state->ms_handlers = entry;
730 	}
731 	mutex_exit(&mach_state->ms_lock);
732 #endif
733 }
734 
735 /*ARGSUSED*/
736 static void
737 cpupm_free_notify_handlers(cpu_t *cp)
738 {
739 #ifndef __xpv
740 	cpupm_mach_state_t *mach_state =
741 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
742 	cpupm_notification_t *entry;
743 	cpupm_notification_t *next;
744 
745 	mutex_enter(&mach_state->ms_lock);
746 	if (mach_state->ms_handlers == NULL) {
747 		mutex_exit(&mach_state->ms_lock);
748 		return;
749 	}
750 	if (mach_state->ms_acpi_handle != NULL) {
751 		cpu_acpi_remove_notify_handler(mach_state->ms_acpi_handle,
752 		    cpupm_notify_handler);
753 	}
754 	entry = mach_state->ms_handlers;
755 	while (entry != NULL) {
756 		next = entry->nq_next;
757 		kmem_free(entry, sizeof (cpupm_notification_t));
758 		entry = next;
759 	}
760 	mach_state->ms_handlers = NULL;
761 	mutex_exit(&mach_state->ms_lock);
762 #endif
763 }
764 
765 /*
766  * Get the current max speed from the ACPI _PPC object
767  */
768 /*ARGSUSED*/
769 int
770 cpupm_get_top_speed(cpu_t *cp)
771 {
772 #ifndef __xpv
773 	cpupm_mach_state_t 	*mach_state;
774 	cpu_acpi_handle_t 	handle;
775 	int 			plat_level;
776 	uint_t			nspeeds;
777 	int			max_level;
778 
779 	mach_state =
780 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
781 	handle = mach_state->ms_acpi_handle;
782 
783 	cpu_acpi_cache_ppc(handle);
784 	plat_level = CPU_ACPI_PPC(handle);
785 
786 	nspeeds = CPU_ACPI_PSTATES_COUNT(handle);
787 
788 	max_level = nspeeds - 1;
789 	if ((plat_level < 0) || (plat_level > max_level)) {
790 		cmn_err(CE_NOTE, "!cpupm_get_top_speed: CPU %d: "
791 		    "_PPC out of range %d", cp->cpu_id, plat_level);
792 		plat_level = 0;
793 	}
794 
795 	return (plat_level);
796 #else
797 	return (0);
798 #endif
799 }
800 
801 /*
802  * This notification handler is called whenever the ACPI _PPC
803  * object changes. The _PPC is a sort of governor on power levels.
804  * It sets an upper threshold on which, _PSS defined, power levels
805  * are usuable. The _PPC value is dynamic and may change as properties
806  * (i.e., thermal or AC source) of the system change.
807  */
808 
809 static void
810 cpupm_power_manage_notifications(void *ctx)
811 {
812 	cpu_t			*cp = ctx;
813 	int			top_speed;
814 
815 	top_speed = cpupm_get_top_speed(cp);
816 	cpupm_redefine_max_activepwr_state(cp, top_speed);
817 }
818 
819 /* ARGSUSED */
820 static void
821 cpupm_event_notify_handler(ACPI_HANDLE obj, UINT32 val, void *ctx)
822 {
823 #ifndef __xpv
824 
825 	cpu_t *cp = ctx;
826 	cpupm_mach_state_t *mach_state =
827 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
828 
829 	if (mach_state == NULL)
830 		return;
831 
832 	/*
833 	 * Currently, we handle _TPC,_CST and _PPC change notifications.
834 	 */
835 	if (val == CPUPM_TPC_CHANGE_NOTIFICATION &&
836 	    mach_state->ms_caps & CPUPM_T_STATES) {
837 		cpupm_throttle_manage_notification(ctx);
838 	} else if (val == CPUPM_CST_CHANGE_NOTIFICATION &&
839 	    mach_state->ms_caps & CPUPM_C_STATES) {
840 		cpuidle_manage_cstates(ctx);
841 	} else if (val == CPUPM_PPC_CHANGE_NOTIFICATION &&
842 	    mach_state->ms_caps & CPUPM_P_STATES) {
843 		cpupm_power_manage_notifications(ctx);
844 	}
845 #endif
846 }
847 
848 /*
849  * Update cpupm cstate data each time CPU exits idle.
850  */
851 void
852 cpupm_wakeup_cstate_data(cma_c_state_t *cs_data, hrtime_t end)
853 {
854 	cs_data->cs_idle_exit = end;
855 }
856 
857 /*
858  * Determine next cstate based on cpupm data.
859  * Update cpupm cstate data each time CPU goes idle.
860  * Do as much as possible in the idle state bookkeeping function because the
861  * performance impact while idle is minimal compared to in the wakeup function
862  * when there is real work to do.
863  */
864 uint32_t
865 cpupm_next_cstate(cma_c_state_t *cs_data, cpu_acpi_cstate_t *cstates,
866     uint32_t cs_count, hrtime_t start)
867 {
868 	hrtime_t duration;
869 	hrtime_t ave_interval;
870 	hrtime_t ave_idle_time;
871 	uint32_t i;
872 
873 	duration = cs_data->cs_idle_exit - cs_data->cs_idle_enter;
874 	scalehrtime(&duration);
875 	cs_data->cs_idle += duration;
876 	cs_data->cs_idle_enter = start;
877 
878 	++cs_data->cs_cnt;
879 	if (cs_data->cs_cnt > cpupm_cs_sample_tunable) {
880 		cs_data->cs_smpl_len = start - cs_data->cs_smpl_start;
881 		scalehrtime(&cs_data->cs_smpl_len);
882 		cs_data->cs_smpl_len |= 1;	/* protect from DIV 0 */
883 		cs_data->cs_smpl_idle = cs_data->cs_idle;
884 		cs_data->cs_idle = 0;
885 		cs_data->cs_smpl_idle_pct = ((100 * cs_data->cs_smpl_idle) /
886 		    cs_data->cs_smpl_len);
887 
888 		cs_data->cs_smpl_start = start;
889 		cs_data->cs_cnt = 0;
890 
891 		/*
892 		 * Strand level C-state policy
893 		 * The cpu_acpi_cstate_t *cstates array is not required to
894 		 * have an entry for both CPU_ACPI_C2 and CPU_ACPI_C3.
895 		 * There are cs_count entries in the cstates array.
896 		 * cs_data->cs_next_cstate contains the index of the next
897 		 * C-state this CPU should enter.
898 		 */
899 		ASSERT(cstates[0].cs_type == CPU_ACPI_C1);
900 
901 		/*
902 		 * Will CPU be idle long enough to save power?
903 		 */
904 		ave_idle_time = (cs_data->cs_smpl_idle /
905 		    cpupm_cs_sample_tunable) / 1000;
906 		for (i = 1; i < cs_count; ++i) {
907 			if (ave_idle_time < (cstates[i].cs_latency *
908 			    cpupm_cs_idle_save_tunable)) {
909 				cs_count = i;
910 				DTRACE_PROBE2(cpupm__next__cstate, cpu_t *,
911 				    CPU, int, i);
912 			}
913 		}
914 
915 		/*
916 		 * Wakeup often (even when non-idle time is very short)?
917 		 * Some producer/consumer type loads fall into this category.
918 		 */
919 		ave_interval = (cs_data->cs_smpl_len / cpupm_cs_sample_tunable)
920 		    / 1000;
921 		for (i = 1; i < cs_count; ++i) {
922 			if (ave_interval <= (cstates[i].cs_latency *
923 			    cpupm_cs_idle_cost_tunable)) {
924 				cs_count = i;
925 				DTRACE_PROBE2(cpupm__next__cstate, cpu_t *,
926 				    CPU, int, (CPU_MAX_CSTATES + i));
927 			}
928 		}
929 
930 		/*
931 		 * Idle percent
932 		 */
933 		for (i = 1; i < cs_count; ++i) {
934 			switch (cstates[i].cs_type) {
935 			case CPU_ACPI_C2:
936 				if (cs_data->cs_smpl_idle_pct <
937 				    cpupm_C2_idle_pct_tunable) {
938 					cs_count = i;
939 					DTRACE_PROBE2(cpupm__next__cstate,
940 					    cpu_t *, CPU, int,
941 					    ((2 * CPU_MAX_CSTATES) + i));
942 				}
943 				break;
944 
945 			case CPU_ACPI_C3:
946 				if (cs_data->cs_smpl_idle_pct <
947 				    cpupm_C3_idle_pct_tunable) {
948 					cs_count = i;
949 					DTRACE_PROBE2(cpupm__next__cstate,
950 					    cpu_t *, CPU, int,
951 					    ((2 * CPU_MAX_CSTATES) + i));
952 				}
953 				break;
954 			}
955 		}
956 
957 		cs_data->cs_next_cstate = cs_count - 1;
958 	}
959 
960 	return (cs_data->cs_next_cstate);
961 }
962