xref: /titanic_50/usr/src/uts/i86pc/os/cpupm/cpupm_mach.c (revision e31df31051ab05e561eab5b23bb1c00627a10d64)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2009, Intel Corporation.
27  * All rights reserved.
28  */
29 
30 #include <sys/cpu_pm.h>
31 #include <sys/x86_archext.h>
32 #include <sys/sdt.h>
33 #include <sys/spl.h>
34 #include <sys/machsystm.h>
35 #include <sys/archsystm.h>
36 #include <sys/hpet.h>
37 #include <sys/acpi/acpi.h>
38 #include <sys/acpica.h>
39 #include <sys/cpupm.h>
40 #include <sys/cpu_idle.h>
41 #include <sys/cpu_acpi.h>
42 #include <sys/cpupm_throttle.h>
43 #include <sys/dtrace.h>
44 #include <sys/note.h>
45 
46 /*
47  * This callback is used to build the PPM CPU domains once
48  * a CPU device has been started. The callback is initialized
49  * by the PPM driver to point to a routine that will build the
50  * domains.
51  */
52 void (*cpupm_ppm_alloc_pstate_domains)(cpu_t *);
53 
54 /*
55  * This callback is used to remove CPU from the PPM CPU domains
56  * when the cpu driver is detached. The callback is initialized
57  * by the PPM driver to point to a routine that will remove CPU
58  * from the domains.
59  */
60 void (*cpupm_ppm_free_pstate_domains)(cpu_t *);
61 
62 /*
63  * This callback is used to redefine the topspeed for a CPU device.
64  * Since all CPUs in a domain should have identical properties, this
65  * callback is initialized by the PPM driver to point to a routine
66  * that will redefine the topspeed for all devices in a CPU domain.
67  * This callback is exercised whenever an ACPI _PPC change notification
68  * is received by the CPU driver.
69  */
70 void (*cpupm_redefine_topspeed)(void *);
71 
/*
 * This callback is used by the PPM driver to call into the CPU driver
 * to set a new topspeed for a CPU.
 */
76 void (*cpupm_set_topspeed_callb)(void *, int);
77 
/*
 * This callback is used by the PPM driver to call into the CPU driver
 * to find a CPU's current topspeed (i.e., its current ACPI _PPC value).
 */
82 int (*cpupm_get_topspeed_callb)(void *);
83 
84 static void cpupm_event_notify_handler(ACPI_HANDLE, UINT32, void *);
85 static void cpupm_free_notify_handlers(cpu_t *);
86 
87 /*
88  * Until proven otherwise, all power states are manageable.
89  */
90 static uint32_t cpupm_enabled = CPUPM_ALL_STATES;
91 
92 cpupm_state_domains_t *cpupm_pstate_domains = NULL;
93 cpupm_state_domains_t *cpupm_tstate_domains = NULL;
94 cpupm_state_domains_t *cpupm_cstate_domains = NULL;
95 
96 /*
97  * c-state tunables
98  *
 * cpupm_cs_sample_interval is the length of time we wait before
 * recalculating c-state statistics.  When a CPU goes idle it checks
 * to see if it has been longer than cpupm_cs_sample_interval since it last
 * calculated which C-state to go to.
103  *
104  * cpupm_cs_idle_cost_tunable is the ratio of time CPU spends executing + idle
105  * divided by time spent in the idle state transitions.
106  * A value of 10 means the CPU will not spend more than 1/10 of its time
107  * in idle latency.  The worst case performance will be 90% of non Deep C-state
108  * kernel.
109  *
110  * cpupm_cs_idle_save_tunable is how long we must stay in a deeper C-state
111  * before it is worth going there.  Expressed as a multiple of latency.
112  */
113 uint32_t cpupm_cs_sample_interval = 100*1000*1000;	/* 100 milliseconds */
114 uint32_t cpupm_cs_idle_cost_tunable = 10;	/* work time / latency cost */
115 uint32_t cpupm_cs_idle_save_tunable = 2;	/* idle power savings */
116 uint16_t cpupm_C2_idle_pct_tunable = 70;
117 uint16_t cpupm_C3_idle_pct_tunable = 80;
118 
119 #ifndef __xpv
120 extern boolean_t cpupm_intel_init(cpu_t *);
121 extern boolean_t cpupm_amd_init(cpu_t *);
122 
123 typedef struct cpupm_vendor {
124 	boolean_t	(*cpuv_init)(cpu_t *);
125 } cpupm_vendor_t;
126 
127 /*
128  * Table of supported vendors.
129  */
130 static cpupm_vendor_t cpupm_vendors[] = {
131 	cpupm_intel_init,
132 	cpupm_amd_init,
133 	NULL
134 };
135 #endif
136 
137 /*
138  * Initialize the machine.
139  * See if a module exists for managing power for this CPU.
140  */
141 /*ARGSUSED*/
142 void
143 cpupm_init(cpu_t *cp)
144 {
145 #ifndef __xpv
146 	cpupm_vendor_t *vendors;
147 	cpupm_mach_state_t *mach_state;
148 	struct machcpu *mcpu = &(cp->cpu_m);
149 	static boolean_t first = B_TRUE;
150 	int *speeds;
151 	uint_t nspeeds;
152 	int ret;
153 
154 	mach_state = cp->cpu_m.mcpu_pm_mach_state =
155 	    kmem_zalloc(sizeof (cpupm_mach_state_t), KM_SLEEP);
156 	mach_state->ms_caps = CPUPM_NO_STATES;
157 	mutex_init(&mach_state->ms_lock, NULL, MUTEX_DRIVER, NULL);
158 
159 	mach_state->ms_acpi_handle = cpu_acpi_init(cp);
160 	if (mach_state->ms_acpi_handle == NULL) {
161 		cpupm_fini(cp);
162 		cmn_err(CE_WARN, "!cpupm_init: processor %d: "
163 		    "unable to get ACPI handle", cp->cpu_id);
164 		cmn_err(CE_NOTE, "!CPU power management will not function.");
165 		CPUPM_DISABLE();
166 		first = B_FALSE;
167 		return;
168 	}
169 
170 	/*
171 	 * Loop through the CPU management module table and see if
172 	 * any of the modules implement CPU power management
173 	 * for this CPU.
174 	 */
175 	for (vendors = cpupm_vendors; vendors->cpuv_init != NULL; vendors++) {
176 		if (vendors->cpuv_init(cp))
177 			break;
178 	}
179 
180 	/*
181 	 * Nope, we can't power manage this CPU.
182 	 */
183 	if (vendors == NULL) {
184 		cpupm_fini(cp);
185 		CPUPM_DISABLE();
186 		first = B_FALSE;
187 		return;
188 	}
189 
190 	/*
191 	 * If P-state support exists for this system, then initialize it.
192 	 */
193 	if (mach_state->ms_pstate.cma_ops != NULL) {
194 		ret = mach_state->ms_pstate.cma_ops->cpus_init(cp);
195 		if (ret != 0) {
196 			mach_state->ms_pstate.cma_ops = NULL;
197 			cpupm_disable(CPUPM_P_STATES);
198 		} else {
199 			nspeeds = cpupm_get_speeds(cp, &speeds);
200 			if (nspeeds == 0) {
201 				cmn_err(CE_NOTE, "!cpupm_init: processor %d:"
202 				    " no speeds to manage", cp->cpu_id);
203 			} else {
204 				cpupm_set_supp_freqs(cp, speeds, nspeeds);
205 				cpupm_free_speeds(speeds, nspeeds);
206 				mach_state->ms_caps |= CPUPM_P_STATES;
207 			}
208 		}
209 	}
210 
211 	if (mach_state->ms_tstate.cma_ops != NULL) {
212 		ret = mach_state->ms_tstate.cma_ops->cpus_init(cp);
213 		if (ret != 0) {
214 			mach_state->ms_tstate.cma_ops = NULL;
215 			cpupm_disable(CPUPM_T_STATES);
216 		} else {
217 			mach_state->ms_caps |= CPUPM_T_STATES;
218 		}
219 	}
220 
221 	/*
222 	 * If C-states support exists for this system, then initialize it.
223 	 */
224 	if (mach_state->ms_cstate.cma_ops != NULL) {
225 		ret = mach_state->ms_cstate.cma_ops->cpus_init(cp);
226 		if (ret != 0) {
227 			mach_state->ms_cstate.cma_ops = NULL;
228 			mcpu->max_cstates = CPU_ACPI_C1;
229 			cpupm_disable(CPUPM_C_STATES);
230 			idle_cpu = non_deep_idle_cpu;
231 			disp_enq_thread = non_deep_idle_disp_enq_thread;
232 		} else if (cpu_deep_cstates_supported()) {
233 			mcpu->max_cstates = cpu_acpi_get_max_cstates(
234 			    mach_state->ms_acpi_handle);
235 			if (mcpu->max_cstates > CPU_ACPI_C1) {
236 				(void) cstate_timer_callback(
237 				    CST_EVENT_MULTIPLE_CSTATES);
238 				CPU->cpu_m.mcpu_idle_cpu = cpu_acpi_idle;
239 				mcpu->mcpu_idle_type = CPU_ACPI_C1;
240 				disp_enq_thread = cstate_wakeup;
241 			} else {
242 				(void) cstate_timer_callback(
243 				    CST_EVENT_ONE_CSTATE);
244 			}
245 			mach_state->ms_caps |= CPUPM_C_STATES;
246 		} else {
247 			mcpu->max_cstates = CPU_ACPI_C1;
248 			idle_cpu = non_deep_idle_cpu;
249 			disp_enq_thread = non_deep_idle_disp_enq_thread;
250 		}
251 	}
252 
253 
254 	if (mach_state->ms_caps == CPUPM_NO_STATES) {
255 		cpupm_fini(cp);
256 		CPUPM_DISABLE();
257 		first = B_FALSE;
258 		return;
259 	}
260 
261 	if ((mach_state->ms_caps & CPUPM_T_STATES) ||
262 	    (mach_state->ms_caps & CPUPM_P_STATES) ||
263 	    (mach_state->ms_caps & CPUPM_C_STATES)) {
264 		cpupm_add_notify_handler(cp, cpupm_event_notify_handler, cp);
265 		if (first) {
266 			acpica_write_cpupm_capabilities(
267 			    mach_state->ms_caps & CPUPM_P_STATES,
268 			    mach_state->ms_caps & CPUPM_C_STATES);
269 		}
270 	}
271 	first = B_FALSE;
272 #endif
273 }
274 
275 /*
276  * Free any resources allocated during cpupm initialization or cpupm start.
277  */
278 /*ARGSUSED*/
279 void
280 cpupm_free(cpu_t *cp, boolean_t cpupm_stop)
281 {
282 #ifndef __xpv
283 	cpupm_mach_state_t *mach_state =
284 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
285 
286 	if (mach_state == NULL)
287 		return;
288 
289 	if (mach_state->ms_pstate.cma_ops != NULL) {
290 		if (cpupm_stop)
291 			mach_state->ms_pstate.cma_ops->cpus_stop(cp);
292 		else
293 			mach_state->ms_pstate.cma_ops->cpus_fini(cp);
294 		mach_state->ms_pstate.cma_ops = NULL;
295 	}
296 
297 	if (mach_state->ms_tstate.cma_ops != NULL) {
298 		if (cpupm_stop)
299 			mach_state->ms_tstate.cma_ops->cpus_stop(cp);
300 		else
301 			mach_state->ms_tstate.cma_ops->cpus_fini(cp);
302 		mach_state->ms_tstate.cma_ops = NULL;
303 	}
304 
305 	if (mach_state->ms_cstate.cma_ops != NULL) {
306 		if (cpupm_stop)
307 			mach_state->ms_cstate.cma_ops->cpus_stop(cp);
308 		else
309 			mach_state->ms_cstate.cma_ops->cpus_fini(cp);
310 
311 		mach_state->ms_cstate.cma_ops = NULL;
312 	}
313 
314 	cpupm_free_notify_handlers(cp);
315 
316 	if (mach_state->ms_acpi_handle != NULL) {
317 		cpu_acpi_fini(mach_state->ms_acpi_handle);
318 		mach_state->ms_acpi_handle = NULL;
319 	}
320 
321 	mutex_destroy(&mach_state->ms_lock);
322 	kmem_free(mach_state, sizeof (cpupm_mach_state_t));
323 	cp->cpu_m.mcpu_pm_mach_state = NULL;
324 #endif
325 }
326 
327 void
328 cpupm_fini(cpu_t *cp)
329 {
330 	/*
331 	 * call (*cpus_fini)() ops to release the cpupm resource
332 	 * in the P/C/T-state driver
333 	 */
334 	cpupm_free(cp, B_FALSE);
335 }
336 
337 void
338 cpupm_start(cpu_t *cp)
339 {
340 	cpupm_init(cp);
341 }
342 
343 void
344 cpupm_stop(cpu_t *cp)
345 {
346 	/*
347 	 * call (*cpus_stop)() ops to reclaim the cpupm resource
348 	 * in the P/C/T-state driver
349 	 */
350 	cpupm_free(cp, B_TRUE);
351 }
352 
353 /*
354  * If A CPU has started and at least one power state is manageable,
355  * then the CPU is ready for power management.
356  */
357 boolean_t
358 cpupm_is_ready(cpu_t *cp)
359 {
360 #ifndef __xpv
361 	cpupm_mach_state_t *mach_state =
362 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
363 	uint32_t cpupm_caps = mach_state->ms_caps;
364 
365 	if (cpupm_enabled == CPUPM_NO_STATES)
366 		return (B_FALSE);
367 
368 	if ((cpupm_caps & CPUPM_T_STATES) ||
369 	    (cpupm_caps & CPUPM_P_STATES) ||
370 	    (cpupm_caps & CPUPM_C_STATES))
371 
372 		return (B_TRUE);
373 	return (B_FALSE);
374 #else
375 	_NOTE(ARGUNUSED(cp));
376 	return (B_FALSE);
377 #endif
378 }
379 
380 boolean_t
381 cpupm_is_enabled(uint32_t state)
382 {
383 	return ((cpupm_enabled & state) == state);
384 }
385 
386 /*
387  * By default, all states are enabled.
388  */
389 void
390 cpupm_disable(uint32_t state)
391 {
392 
393 	if (state & CPUPM_P_STATES) {
394 		cpupm_free_domains(&cpupm_pstate_domains);
395 	}
396 	if (state & CPUPM_T_STATES) {
397 		cpupm_free_domains(&cpupm_tstate_domains);
398 	}
399 	if (state & CPUPM_C_STATES) {
400 		cpupm_free_domains(&cpupm_cstate_domains);
401 	}
402 	cpupm_enabled &= ~state;
403 }
404 
405 /*
406  * Allocate power domains for C,P and T States
407  */
408 void
409 cpupm_alloc_domains(cpu_t *cp, int state)
410 {
411 	cpupm_mach_state_t *mach_state =
412 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
413 	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
414 	cpupm_state_domains_t **dom_ptr;
415 	cpupm_state_domains_t *dptr;
416 	cpupm_state_domains_t **mach_dom_state_ptr;
417 	uint32_t domain;
418 	uint32_t type;
419 
420 	switch (state) {
421 	case CPUPM_P_STATES:
422 		if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_PSD_CACHED)) {
423 			domain = CPU_ACPI_PSD(handle).sd_domain;
424 			type = CPU_ACPI_PSD(handle).sd_type;
425 		} else {
426 			mutex_enter(&cpu_lock);
427 			domain = cpuid_get_chipid(cp);
428 			mutex_exit(&cpu_lock);
429 			type = CPU_ACPI_HW_ALL;
430 		}
431 		dom_ptr = &cpupm_pstate_domains;
432 		mach_dom_state_ptr = &mach_state->ms_pstate.cma_domain;
433 		break;
434 	case CPUPM_T_STATES:
435 		if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_TSD_CACHED)) {
436 			domain = CPU_ACPI_TSD(handle).sd_domain;
437 			type = CPU_ACPI_TSD(handle).sd_type;
438 		} else {
439 			mutex_enter(&cpu_lock);
440 			domain = cpuid_get_chipid(cp);
441 			mutex_exit(&cpu_lock);
442 			type = CPU_ACPI_HW_ALL;
443 		}
444 		dom_ptr = &cpupm_tstate_domains;
445 		mach_dom_state_ptr = &mach_state->ms_tstate.cma_domain;
446 		break;
447 	case CPUPM_C_STATES:
448 		if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_CSD_CACHED)) {
449 			domain = CPU_ACPI_CSD(handle).sd_domain;
450 			type = CPU_ACPI_CSD(handle).sd_type;
451 		} else {
452 			mutex_enter(&cpu_lock);
453 			domain = cpuid_get_coreid(cp);
454 			mutex_exit(&cpu_lock);
455 			type = CPU_ACPI_HW_ALL;
456 		}
457 		dom_ptr = &cpupm_cstate_domains;
458 		mach_dom_state_ptr = &mach_state->ms_cstate.cma_domain;
459 		break;
460 	default:
461 		return;
462 	}
463 
464 	for (dptr = *dom_ptr; dptr != NULL; dptr = dptr->pm_next) {
465 		if (dptr->pm_domain == domain)
466 			break;
467 	}
468 
469 	/* new domain is created and linked at the head */
470 	if (dptr == NULL) {
471 		dptr = kmem_zalloc(sizeof (cpupm_state_domains_t), KM_SLEEP);
472 		dptr->pm_domain = domain;
473 		dptr->pm_type = type;
474 		dptr->pm_next = *dom_ptr;
475 		mutex_init(&dptr->pm_lock, NULL, MUTEX_SPIN,
476 		    (void *)ipltospl(DISP_LEVEL));
477 		CPUSET_ZERO(dptr->pm_cpus);
478 		*dom_ptr = dptr;
479 	}
480 	CPUSET_ADD(dptr->pm_cpus, cp->cpu_id);
481 	*mach_dom_state_ptr = dptr;
482 }
483 
484 /*
485  * Free C, P or T state power domains
486  */
487 void
488 cpupm_free_domains(cpupm_state_domains_t **dom_ptr)
489 {
490 	cpupm_state_domains_t *this_domain, *next_domain;
491 
492 	this_domain = *dom_ptr;
493 	while (this_domain != NULL) {
494 		next_domain = this_domain->pm_next;
495 		mutex_destroy(&this_domain->pm_lock);
496 		kmem_free((void *)this_domain,
497 		    sizeof (cpupm_state_domains_t));
498 		this_domain = next_domain;
499 	}
500 	*dom_ptr = NULL;
501 }
502 
503 /*
504  * Remove CPU from C, P or T state power domains
505  */
506 void
507 cpupm_remove_domains(cpu_t *cp, int state, cpupm_state_domains_t **dom_ptr)
508 {
509 	cpupm_mach_state_t *mach_state =
510 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
511 	cpupm_state_domains_t *dptr;
512 	uint32_t pm_domain;
513 	ulong_t iflag;
514 
515 	ASSERT(mach_state);
516 
517 	switch (state) {
518 	case CPUPM_P_STATES:
519 		pm_domain = mach_state->ms_pstate.cma_domain->pm_domain;
520 		break;
521 	case CPUPM_T_STATES:
522 		pm_domain = mach_state->ms_tstate.cma_domain->pm_domain;
523 		break;
524 	case CPUPM_C_STATES:
525 		pm_domain = mach_state->ms_cstate.cma_domain->pm_domain;
526 		break;
527 	default:
528 		return;
529 	}
530 
531 	/*
532 	 * Find the CPU C, P or T state power domain
533 	 */
534 	for (dptr = *dom_ptr; dptr != NULL; dptr = dptr->pm_next) {
535 		if (dptr->pm_domain == pm_domain)
536 			break;
537 	}
538 
539 	/*
540 	 * return if no matched domain found
541 	 */
542 	if (dptr == NULL)
543 		return;
544 
545 	/*
546 	 * We found one matched power domain, remove CPU from its cpuset.
547 	 * Interrupt is disabled here to avoid the race conditions between
548 	 * event change notification and cpu remove.
549 	 */
550 	iflag = intr_clear();
551 	mutex_enter(&dptr->pm_lock);
552 	if (CPU_IN_SET(dptr->pm_cpus, cp->cpu_id))
553 		CPUSET_DEL(dptr->pm_cpus, cp->cpu_id);
554 	mutex_exit(&dptr->pm_lock);
555 	intr_restore(iflag);
556 }
557 
558 void
559 cpupm_alloc_ms_cstate(cpu_t *cp)
560 {
561 	cpupm_mach_state_t *mach_state;
562 	cpupm_mach_acpi_state_t *ms_cstate;
563 
564 	mach_state = (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
565 	ms_cstate = &mach_state->ms_cstate;
566 	ASSERT(ms_cstate->cma_state.cstate == NULL);
567 	ms_cstate->cma_state.cstate = kmem_zalloc(sizeof (cma_c_state_t),
568 	    KM_SLEEP);
569 	ms_cstate->cma_state.cstate->cs_next_cstate = CPU_ACPI_C1;
570 }
571 
572 void
573 cpupm_free_ms_cstate(cpu_t *cp)
574 {
575 	cpupm_mach_state_t *mach_state =
576 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
577 	cpupm_mach_acpi_state_t *ms_cstate = &mach_state->ms_cstate;
578 
579 	if (ms_cstate->cma_state.cstate != NULL) {
580 		kmem_free(ms_cstate->cma_state.cstate, sizeof (cma_c_state_t));
581 		ms_cstate->cma_state.cstate = NULL;
582 	}
583 }
584 
585 void
586 cpupm_state_change(cpu_t *cp, int level, int state)
587 {
588 	cpupm_mach_state_t	*mach_state =
589 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
590 	cpupm_state_ops_t	*state_ops;
591 	cpupm_state_domains_t  	*state_domain;
592 	cpuset_t		set;
593 
594 	DTRACE_PROBE2(cpupm__state__change, cpu_t *, cp, int, level);
595 
596 	if (mach_state == NULL) {
597 		return;
598 	}
599 
600 	switch (state) {
601 	case CPUPM_P_STATES:
602 		state_ops = mach_state->ms_pstate.cma_ops;
603 		state_domain = mach_state->ms_pstate.cma_domain;
604 		break;
605 	case CPUPM_T_STATES:
606 		state_ops = mach_state->ms_tstate.cma_ops;
607 		state_domain = mach_state->ms_tstate.cma_domain;
608 		break;
609 	default:
610 		break;
611 	}
612 
613 	switch (state_domain->pm_type) {
614 	case CPU_ACPI_SW_ANY:
615 		/*
616 		 * A request on any CPU in the domain transitions the domain
617 		 */
618 		CPUSET_ONLY(set, cp->cpu_id);
619 		state_ops->cpus_change(set, level);
620 		break;
621 	case CPU_ACPI_SW_ALL:
622 		/*
623 		 * All CPUs in the domain must request the transition
624 		 */
625 	case CPU_ACPI_HW_ALL:
626 		/*
627 		 * P/T-state transitions are coordinated by the hardware
628 		 * For now, request the transition on all CPUs in the domain,
629 		 * but looking ahead we can probably be smarter about this.
630 		 */
631 		mutex_enter(&state_domain->pm_lock);
632 		state_ops->cpus_change(state_domain->pm_cpus, level);
633 		mutex_exit(&state_domain->pm_lock);
634 		break;
635 	default:
636 		cmn_err(CE_NOTE, "Unknown domain coordination type: %d",
637 		    state_domain->pm_type);
638 	}
639 }
640 
641 /*
642  * CPU PM interfaces exposed to the CPU power manager
643  */
644 /*ARGSUSED*/
645 id_t
646 cpupm_plat_domain_id(cpu_t *cp, cpupm_dtype_t type)
647 {
648 	cpupm_mach_state_t	*mach_state =
649 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
650 
651 	if ((mach_state == NULL) || (!cpupm_is_enabled(CPUPM_P_STATES) &&
652 	    !cpupm_is_enabled(CPUPM_C_STATES))) {
653 		return (CPUPM_NO_DOMAIN);
654 	}
655 	if (type == CPUPM_DTYPE_ACTIVE) {
656 		/*
657 		 * Return P-State domain for the specified CPU
658 		 */
659 		if (mach_state->ms_pstate.cma_domain) {
660 			return (mach_state->ms_pstate.cma_domain->pm_domain);
661 		}
662 	} else if (type == CPUPM_DTYPE_IDLE) {
663 		/*
664 		 * Return C-State domain for the specified CPU
665 		 */
666 		if (mach_state->ms_cstate.cma_domain) {
667 			return (mach_state->ms_cstate.cma_domain->pm_domain);
668 		}
669 	}
670 	return (CPUPM_NO_DOMAIN);
671 }
672 
673 /*ARGSUSED*/
674 uint_t
675 cpupm_plat_state_enumerate(cpu_t *cp, cpupm_dtype_t type,
676     cpupm_state_t *states)
677 {
678 	int	*speeds;
679 	uint_t	nspeeds, i;
680 
681 	/*
682 	 * Idle domain support unimplemented
683 	 */
684 	if (type != CPUPM_DTYPE_ACTIVE) {
685 		return (0);
686 	}
687 	nspeeds = cpupm_get_speeds(cp, &speeds);
688 
689 	/*
690 	 * If the caller passes NULL for states, just return the
691 	 * number of states.
692 	 */
693 	if (states != NULL) {
694 		for (i = 0; i < nspeeds; i++) {
695 			states[i].cps_speed = speeds[i];
696 			states[i].cps_handle = (cpupm_handle_t)i;
697 		}
698 	}
699 	cpupm_free_speeds(speeds, nspeeds);
700 	return (nspeeds);
701 }
702 
703 /*ARGSUSED*/
704 int
705 cpupm_plat_change_state(cpu_t *cp, cpupm_state_t *state)
706 {
707 	if (!cpupm_is_ready(cp))
708 		return (-1);
709 
710 	cpupm_state_change(cp, (int)state->cps_handle, CPUPM_P_STATES);
711 
712 	return (0);
713 }
714 
715 /*ARGSUSED*/
716 /*
717  * Note: It is the responsibility of the users of
718  * cpupm_get_speeds() to free the memory allocated
719  * for speeds using cpupm_free_speeds()
720  */
721 uint_t
722 cpupm_get_speeds(cpu_t *cp, int **speeds)
723 {
724 #ifndef __xpv
725 	cpupm_mach_state_t *mach_state =
726 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
727 	return (cpu_acpi_get_speeds(mach_state->ms_acpi_handle, speeds));
728 #else
729 	return (0);
730 #endif
731 }
732 
733 /*ARGSUSED*/
734 void
735 cpupm_free_speeds(int *speeds, uint_t nspeeds)
736 {
737 #ifndef __xpv
738 	cpu_acpi_free_speeds(speeds, nspeeds);
739 #endif
740 }
741 
742 /*
743  * All CPU instances have been initialized successfully.
744  */
745 boolean_t
746 cpupm_power_ready(cpu_t *cp)
747 {
748 	return (cpupm_is_enabled(CPUPM_P_STATES) && cpupm_is_ready(cp));
749 }
750 
751 /*
752  * All CPU instances have been initialized successfully.
753  */
754 boolean_t
755 cpupm_throttle_ready(cpu_t *cp)
756 {
757 	return (cpupm_is_enabled(CPUPM_T_STATES) && cpupm_is_ready(cp));
758 }
759 
760 /*
761  * All CPU instances have been initialized successfully.
762  */
763 boolean_t
764 cpupm_cstate_ready(cpu_t *cp)
765 {
766 	return (cpupm_is_enabled(CPUPM_C_STATES) && cpupm_is_ready(cp));
767 }
768 
769 void
770 cpupm_notify_handler(ACPI_HANDLE obj, UINT32 val, void *ctx)
771 {
772 	cpu_t *cp = ctx;
773 	cpupm_mach_state_t *mach_state =
774 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
775 	cpupm_notification_t *entry;
776 
777 	mutex_enter(&mach_state->ms_lock);
778 	for (entry =  mach_state->ms_handlers; entry != NULL;
779 	    entry = entry->nq_next) {
780 		entry->nq_handler(obj, val, entry->nq_ctx);
781 	}
782 	mutex_exit(&mach_state->ms_lock);
783 }
784 
785 /*ARGSUSED*/
786 void
787 cpupm_add_notify_handler(cpu_t *cp, CPUPM_NOTIFY_HANDLER handler, void *ctx)
788 {
789 #ifndef __xpv
790 	cpupm_mach_state_t *mach_state =
791 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
792 	cpupm_notification_t *entry;
793 
794 	entry = kmem_zalloc(sizeof (cpupm_notification_t), KM_SLEEP);
795 	entry->nq_handler = handler;
796 	entry->nq_ctx = ctx;
797 	mutex_enter(&mach_state->ms_lock);
798 	if (mach_state->ms_handlers == NULL) {
799 		entry->nq_next = NULL;
800 		mach_state->ms_handlers = entry;
801 		cpu_acpi_install_notify_handler(mach_state->ms_acpi_handle,
802 		    cpupm_notify_handler, cp);
803 
804 	} else {
805 		entry->nq_next = mach_state->ms_handlers;
806 		mach_state->ms_handlers = entry;
807 	}
808 	mutex_exit(&mach_state->ms_lock);
809 #endif
810 }
811 
812 /*ARGSUSED*/
813 static void
814 cpupm_free_notify_handlers(cpu_t *cp)
815 {
816 #ifndef __xpv
817 	cpupm_mach_state_t *mach_state =
818 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
819 	cpupm_notification_t *entry;
820 	cpupm_notification_t *next;
821 
822 	mutex_enter(&mach_state->ms_lock);
823 	if (mach_state->ms_handlers == NULL) {
824 		mutex_exit(&mach_state->ms_lock);
825 		return;
826 	}
827 	if (mach_state->ms_acpi_handle != NULL) {
828 		cpu_acpi_remove_notify_handler(mach_state->ms_acpi_handle,
829 		    cpupm_notify_handler);
830 	}
831 	entry = mach_state->ms_handlers;
832 	while (entry != NULL) {
833 		next = entry->nq_next;
834 		kmem_free(entry, sizeof (cpupm_notification_t));
835 		entry = next;
836 	}
837 	mach_state->ms_handlers = NULL;
838 	mutex_exit(&mach_state->ms_lock);
839 #endif
840 }
841 
842 /*
843  * Get the current max speed from the ACPI _PPC object
844  */
845 /*ARGSUSED*/
846 int
847 cpupm_get_top_speed(cpu_t *cp)
848 {
849 #ifndef __xpv
850 	cpupm_mach_state_t 	*mach_state;
851 	cpu_acpi_handle_t 	handle;
852 	int 			plat_level;
853 	uint_t			nspeeds;
854 	int			max_level;
855 
856 	mach_state =
857 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
858 	handle = mach_state->ms_acpi_handle;
859 
860 	cpu_acpi_cache_ppc(handle);
861 	plat_level = CPU_ACPI_PPC(handle);
862 
863 	nspeeds = CPU_ACPI_PSTATES_COUNT(handle);
864 
865 	max_level = nspeeds - 1;
866 	if ((plat_level < 0) || (plat_level > max_level)) {
867 		cmn_err(CE_NOTE, "!cpupm_get_top_speed: CPU %d: "
868 		    "_PPC out of range %d", cp->cpu_id, plat_level);
869 		plat_level = 0;
870 	}
871 
872 	return (plat_level);
873 #else
874 	return (0);
875 #endif
876 }
877 
878 /*
879  * This notification handler is called whenever the ACPI _PPC
880  * object changes. The _PPC is a sort of governor on power levels.
881  * It sets an upper threshold on which, _PSS defined, power levels
882  * are usuable. The _PPC value is dynamic and may change as properties
883  * (i.e., thermal or AC source) of the system change.
884  */
885 
886 static void
887 cpupm_power_manage_notifications(void *ctx)
888 {
889 	cpu_t			*cp = ctx;
890 	int			top_speed;
891 
892 	top_speed = cpupm_get_top_speed(cp);
893 	cpupm_redefine_max_activepwr_state(cp, top_speed);
894 }
895 
896 /* ARGSUSED */
897 static void
898 cpupm_event_notify_handler(ACPI_HANDLE obj, UINT32 val, void *ctx)
899 {
900 #ifndef __xpv
901 
902 	cpu_t *cp = ctx;
903 	cpupm_mach_state_t *mach_state =
904 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
905 
906 	if (mach_state == NULL)
907 		return;
908 
909 	/*
910 	 * Currently, we handle _TPC,_CST and _PPC change notifications.
911 	 */
912 	if (val == CPUPM_TPC_CHANGE_NOTIFICATION &&
913 	    mach_state->ms_caps & CPUPM_T_STATES) {
914 		cpupm_throttle_manage_notification(ctx);
915 	} else if (val == CPUPM_CST_CHANGE_NOTIFICATION &&
916 	    mach_state->ms_caps & CPUPM_C_STATES) {
917 		cpuidle_manage_cstates(ctx);
918 	} else if (val == CPUPM_PPC_CHANGE_NOTIFICATION &&
919 	    mach_state->ms_caps & CPUPM_P_STATES) {
920 		cpupm_power_manage_notifications(ctx);
921 	}
922 #endif
923 }
924 
925 /*
926  * Update cpupm cstate data each time CPU exits idle.
927  */
928 void
929 cpupm_wakeup_cstate_data(cma_c_state_t *cs_data, hrtime_t end)
930 {
931 	cs_data->cs_idle_exit = end;
932 }
933 
934 /*
935  * Determine next cstate based on cpupm data.
936  * Update cpupm cstate data each time CPU goes idle.
937  * Do as much as possible in the idle state bookkeeping function because the
938  * performance impact while idle is minimal compared to in the wakeup function
939  * when there is real work to do.
940  */
941 uint32_t
942 cpupm_next_cstate(cma_c_state_t *cs_data, cpu_acpi_cstate_t *cstates,
943     uint32_t cs_count, hrtime_t start)
944 {
945 	hrtime_t duration;
946 	hrtime_t ave_interval;
947 	hrtime_t ave_idle_time;
948 	uint32_t i, smpl_cnt;
949 
950 	duration = cs_data->cs_idle_exit - cs_data->cs_idle_enter;
951 	scalehrtime(&duration);
952 	cs_data->cs_idle += duration;
953 	cs_data->cs_idle_enter = start;
954 
955 	smpl_cnt = ++cs_data->cs_cnt;
956 	cs_data->cs_smpl_len = start - cs_data->cs_smpl_start;
957 	scalehrtime(&cs_data->cs_smpl_len);
958 	if (cs_data->cs_smpl_len > cpupm_cs_sample_interval) {
959 		cs_data->cs_smpl_idle = cs_data->cs_idle;
960 		cs_data->cs_idle = 0;
961 		cs_data->cs_smpl_idle_pct = ((100 * cs_data->cs_smpl_idle) /
962 		    cs_data->cs_smpl_len);
963 
964 		cs_data->cs_smpl_start = start;
965 		cs_data->cs_cnt = 0;
966 
967 		/*
968 		 * Strand level C-state policy
969 		 * The cpu_acpi_cstate_t *cstates array is not required to
970 		 * have an entry for both CPU_ACPI_C2 and CPU_ACPI_C3.
971 		 * There are cs_count entries in the cstates array.
972 		 * cs_data->cs_next_cstate contains the index of the next
973 		 * C-state this CPU should enter.
974 		 */
975 		ASSERT(cstates[0].cs_type == CPU_ACPI_C1);
976 
977 		/*
978 		 * Will CPU be idle long enough to save power?
979 		 */
980 		ave_idle_time = (cs_data->cs_smpl_idle / smpl_cnt) / 1000;
981 		for (i = 1; i < cs_count; ++i) {
982 			if (ave_idle_time < (cstates[i].cs_latency *
983 			    cpupm_cs_idle_save_tunable)) {
984 				cs_count = i;
985 				DTRACE_PROBE2(cpupm__next__cstate, cpu_t *,
986 				    CPU, int, i);
987 			}
988 		}
989 
990 		/*
991 		 * Wakeup often (even when non-idle time is very short)?
992 		 * Some producer/consumer type loads fall into this category.
993 		 */
994 		ave_interval = (cs_data->cs_smpl_len / smpl_cnt) / 1000;
995 		for (i = 1; i < cs_count; ++i) {
996 			if (ave_interval <= (cstates[i].cs_latency *
997 			    cpupm_cs_idle_cost_tunable)) {
998 				cs_count = i;
999 				DTRACE_PROBE2(cpupm__next__cstate, cpu_t *,
1000 				    CPU, int, (CPU_MAX_CSTATES + i));
1001 			}
1002 		}
1003 
1004 		/*
1005 		 * Idle percent
1006 		 */
1007 		for (i = 1; i < cs_count; ++i) {
1008 			switch (cstates[i].cs_type) {
1009 			case CPU_ACPI_C2:
1010 				if (cs_data->cs_smpl_idle_pct <
1011 				    cpupm_C2_idle_pct_tunable) {
1012 					cs_count = i;
1013 					DTRACE_PROBE2(cpupm__next__cstate,
1014 					    cpu_t *, CPU, int,
1015 					    ((2 * CPU_MAX_CSTATES) + i));
1016 				}
1017 				break;
1018 
1019 			case CPU_ACPI_C3:
1020 				if (cs_data->cs_smpl_idle_pct <
1021 				    cpupm_C3_idle_pct_tunable) {
1022 					cs_count = i;
1023 					DTRACE_PROBE2(cpupm__next__cstate,
1024 					    cpu_t *, CPU, int,
1025 					    ((2 * CPU_MAX_CSTATES) + i));
1026 				}
1027 				break;
1028 			}
1029 		}
1030 
1031 		cs_data->cs_next_cstate = cs_count - 1;
1032 	}
1033 
1034 	return (cs_data->cs_next_cstate);
1035 }
1036