xref: /illumos-gate/usr/src/uts/i86pc/os/cpupm/cpupm_mach.c (revision 3c573fcc51430b02603f62713f3f5d1b0b1aed1c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2009, Intel Corporation.
27  * All rights reserved.
28  */
29 
30 #include <sys/cpu_pm.h>
31 #include <sys/x86_archext.h>
32 #include <sys/sdt.h>
33 #include <sys/spl.h>
34 #include <sys/machsystm.h>
35 #include <sys/archsystm.h>
36 #include <sys/hpet.h>
37 #include <sys/acpi/acpi.h>
38 #include <sys/acpica.h>
39 #include <sys/cpupm.h>
40 #include <sys/cpu_idle.h>
41 #include <sys/cpu_acpi.h>
42 #include <sys/cpupm_throttle.h>
43 #include <sys/dtrace.h>
44 #include <sys/note.h>
45 
46 /*
47  * This callback is used to build the PPM CPU domains once
48  * a CPU device has been started. The callback is initialized
49  * by the PPM driver to point to a routine that will build the
50  * domains.
51  */
52 void (*cpupm_ppm_alloc_pstate_domains)(cpu_t *);
53 
54 /*
55  * This callback is used to remove CPU from the PPM CPU domains
56  * when the cpu driver is detached. The callback is initialized
57  * by the PPM driver to point to a routine that will remove CPU
58  * from the domains.
59  */
60 void (*cpupm_ppm_free_pstate_domains)(cpu_t *);
61 
62 /*
63  * This callback is used to redefine the topspeed for a CPU device.
64  * Since all CPUs in a domain should have identical properties, this
65  * callback is initialized by the PPM driver to point to a routine
66  * that will redefine the topspeed for all devices in a CPU domain.
67  * This callback is exercised whenever an ACPI _PPC change notification
68  * is received by the CPU driver.
69  */
70 void (*cpupm_redefine_topspeed)(void *);
71 
72 /*
73  * This callback is used by the PPM driver to call into the CPU driver
74  * to find a CPU's current topspeed (i.e., it's current ACPI _PPC value).
75  */
76 void (*cpupm_set_topspeed_callb)(void *, int);
77 
78 /*
79  * This callback is used by the PPM driver to call into the CPU driver
80  * to set a new topspeed for a CPU.
81  */
82 int (*cpupm_get_topspeed_callb)(void *);
83 
84 static void cpupm_event_notify_handler(ACPI_HANDLE, UINT32, void *);
85 static void cpupm_free_notify_handlers(cpu_t *);
86 static void cpupm_power_manage_notifications(void *);
87 
88 /*
89  * Until proven otherwise, all power states are manageable.
90  */
91 static uint32_t cpupm_enabled = CPUPM_ALL_STATES;
92 
93 cpupm_state_domains_t *cpupm_pstate_domains = NULL;
94 cpupm_state_domains_t *cpupm_tstate_domains = NULL;
95 cpupm_state_domains_t *cpupm_cstate_domains = NULL;
96 
97 /*
98  * c-state tunables
99  *
100  * cpupm_cs_sample_interval is the length of time we wait before
101  * recalculating c-state statistics.  When a CPU goes idle it checks
102  * to see if it has been longer than cpupm_cs_sample_interval since it last
103  * caculated which C-state to go to.
104  *
105  * cpupm_cs_idle_cost_tunable is the ratio of time CPU spends executing + idle
106  * divided by time spent in the idle state transitions.
107  * A value of 10 means the CPU will not spend more than 1/10 of its time
108  * in idle latency.  The worst case performance will be 90% of non Deep C-state
109  * kernel.
110  *
111  * cpupm_cs_idle_save_tunable is how long we must stay in a deeper C-state
112  * before it is worth going there.  Expressed as a multiple of latency.
113  */
114 uint32_t cpupm_cs_sample_interval = 100*1000*1000;	/* 100 milliseconds */
115 uint32_t cpupm_cs_idle_cost_tunable = 10;	/* work time / latency cost */
116 uint32_t cpupm_cs_idle_save_tunable = 2;	/* idle power savings */
117 uint16_t cpupm_C2_idle_pct_tunable = 70;
118 uint16_t cpupm_C3_idle_pct_tunable = 80;
119 
120 #ifndef __xpv
121 extern boolean_t cpupm_intel_init(cpu_t *);
122 extern boolean_t cpupm_amd_init(cpu_t *);
123 
124 typedef struct cpupm_vendor {
125 	boolean_t	(*cpuv_init)(cpu_t *);
126 } cpupm_vendor_t;
127 
128 /*
129  * Table of supported vendors.
130  */
131 static cpupm_vendor_t cpupm_vendors[] = {
132 	cpupm_intel_init,
133 	cpupm_amd_init,
134 	NULL
135 };
136 #endif
137 
138 /*
139  * Initialize the machine.
140  * See if a module exists for managing power for this CPU.
141  */
142 /*ARGSUSED*/
143 void
144 cpupm_init(cpu_t *cp)
145 {
146 #ifndef __xpv
147 	cpupm_vendor_t *vendors;
148 	cpupm_mach_state_t *mach_state;
149 	struct machcpu *mcpu = &(cp->cpu_m);
150 	static boolean_t first = B_TRUE;
151 	int *speeds;
152 	uint_t nspeeds;
153 	int ret;
154 
155 	mach_state = cp->cpu_m.mcpu_pm_mach_state =
156 	    kmem_zalloc(sizeof (cpupm_mach_state_t), KM_SLEEP);
157 	mach_state->ms_caps = CPUPM_NO_STATES;
158 	mutex_init(&mach_state->ms_lock, NULL, MUTEX_DRIVER, NULL);
159 
160 	mach_state->ms_acpi_handle = cpu_acpi_init(cp);
161 	if (mach_state->ms_acpi_handle == NULL) {
162 		cpupm_fini(cp);
163 		cmn_err(CE_WARN, "!cpupm_init: processor %d: "
164 		    "unable to get ACPI handle", cp->cpu_id);
165 		cmn_err(CE_NOTE, "!CPU power management will not function.");
166 		CPUPM_DISABLE();
167 		first = B_FALSE;
168 		return;
169 	}
170 
171 	/*
172 	 * Loop through the CPU management module table and see if
173 	 * any of the modules implement CPU power management
174 	 * for this CPU.
175 	 */
176 	for (vendors = cpupm_vendors; vendors->cpuv_init != NULL; vendors++) {
177 		if (vendors->cpuv_init(cp))
178 			break;
179 	}
180 
181 	/*
182 	 * Nope, we can't power manage this CPU.
183 	 */
184 	if (vendors == NULL) {
185 		cpupm_fini(cp);
186 		CPUPM_DISABLE();
187 		first = B_FALSE;
188 		return;
189 	}
190 
191 	/*
192 	 * If P-state support exists for this system, then initialize it.
193 	 */
194 	if (mach_state->ms_pstate.cma_ops != NULL) {
195 		ret = mach_state->ms_pstate.cma_ops->cpus_init(cp);
196 		if (ret != 0) {
197 			mach_state->ms_pstate.cma_ops = NULL;
198 			cpupm_disable(CPUPM_P_STATES);
199 		} else {
200 			nspeeds = cpupm_get_speeds(cp, &speeds);
201 			if (nspeeds == 0) {
202 				cmn_err(CE_NOTE, "!cpupm_init: processor %d:"
203 				    " no speeds to manage", cp->cpu_id);
204 			} else {
205 				cpupm_set_supp_freqs(cp, speeds, nspeeds);
206 				cpupm_free_speeds(speeds, nspeeds);
207 				mach_state->ms_caps |= CPUPM_P_STATES;
208 			}
209 		}
210 	}
211 
212 	if (mach_state->ms_tstate.cma_ops != NULL) {
213 		ret = mach_state->ms_tstate.cma_ops->cpus_init(cp);
214 		if (ret != 0) {
215 			mach_state->ms_tstate.cma_ops = NULL;
216 			cpupm_disable(CPUPM_T_STATES);
217 		} else {
218 			mach_state->ms_caps |= CPUPM_T_STATES;
219 		}
220 	}
221 
222 	/*
223 	 * If C-states support exists for this system, then initialize it.
224 	 */
225 	if (mach_state->ms_cstate.cma_ops != NULL) {
226 		ret = mach_state->ms_cstate.cma_ops->cpus_init(cp);
227 		if (ret != 0) {
228 			mach_state->ms_cstate.cma_ops = NULL;
229 			mcpu->max_cstates = CPU_ACPI_C1;
230 			cpupm_disable(CPUPM_C_STATES);
231 			idle_cpu = non_deep_idle_cpu;
232 			disp_enq_thread = non_deep_idle_disp_enq_thread;
233 		} else if (cpu_deep_cstates_supported()) {
234 			mcpu->max_cstates = cpu_acpi_get_max_cstates(
235 			    mach_state->ms_acpi_handle);
236 			if (mcpu->max_cstates > CPU_ACPI_C1) {
237 				(void) cstate_timer_callback(
238 				    CST_EVENT_MULTIPLE_CSTATES);
239 				CPU->cpu_m.mcpu_idle_cpu = cpu_acpi_idle;
240 				mcpu->mcpu_idle_type = CPU_ACPI_C1;
241 				disp_enq_thread = cstate_wakeup;
242 			} else {
243 				(void) cstate_timer_callback(
244 				    CST_EVENT_ONE_CSTATE);
245 			}
246 			mach_state->ms_caps |= CPUPM_C_STATES;
247 		} else {
248 			mcpu->max_cstates = CPU_ACPI_C1;
249 			idle_cpu = non_deep_idle_cpu;
250 			disp_enq_thread = non_deep_idle_disp_enq_thread;
251 		}
252 	}
253 
254 
255 	if (mach_state->ms_caps == CPUPM_NO_STATES) {
256 		cpupm_fini(cp);
257 		CPUPM_DISABLE();
258 		first = B_FALSE;
259 		return;
260 	}
261 
262 	if ((mach_state->ms_caps & CPUPM_T_STATES) ||
263 	    (mach_state->ms_caps & CPUPM_P_STATES) ||
264 	    (mach_state->ms_caps & CPUPM_C_STATES)) {
265 		if (first) {
266 			acpica_write_cpupm_capabilities(
267 			    mach_state->ms_caps & CPUPM_P_STATES,
268 			    mach_state->ms_caps & CPUPM_C_STATES);
269 		}
270 		cpupm_throttle_manage_notification(cp);
271 		cpuidle_manage_cstates(cp);
272 		cpupm_power_manage_notifications(cp);
273 		cpupm_add_notify_handler(cp, cpupm_event_notify_handler, cp);
274 	}
275 	first = B_FALSE;
276 #endif
277 }
278 
279 /*
280  * Free any resources allocated during cpupm initialization or cpupm start.
281  */
282 /*ARGSUSED*/
283 void
284 cpupm_free(cpu_t *cp, boolean_t cpupm_stop)
285 {
286 #ifndef __xpv
287 	cpupm_mach_state_t *mach_state =
288 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
289 
290 	if (mach_state == NULL)
291 		return;
292 
293 	if (mach_state->ms_pstate.cma_ops != NULL) {
294 		if (cpupm_stop)
295 			mach_state->ms_pstate.cma_ops->cpus_stop(cp);
296 		else
297 			mach_state->ms_pstate.cma_ops->cpus_fini(cp);
298 		mach_state->ms_pstate.cma_ops = NULL;
299 	}
300 
301 	if (mach_state->ms_tstate.cma_ops != NULL) {
302 		if (cpupm_stop)
303 			mach_state->ms_tstate.cma_ops->cpus_stop(cp);
304 		else
305 			mach_state->ms_tstate.cma_ops->cpus_fini(cp);
306 		mach_state->ms_tstate.cma_ops = NULL;
307 	}
308 
309 	if (mach_state->ms_cstate.cma_ops != NULL) {
310 		if (cpupm_stop)
311 			mach_state->ms_cstate.cma_ops->cpus_stop(cp);
312 		else
313 			mach_state->ms_cstate.cma_ops->cpus_fini(cp);
314 
315 		mach_state->ms_cstate.cma_ops = NULL;
316 	}
317 
318 	cpupm_free_notify_handlers(cp);
319 
320 	if (mach_state->ms_acpi_handle != NULL) {
321 		cpu_acpi_fini(mach_state->ms_acpi_handle);
322 		mach_state->ms_acpi_handle = NULL;
323 	}
324 
325 	mutex_destroy(&mach_state->ms_lock);
326 	kmem_free(mach_state, sizeof (cpupm_mach_state_t));
327 	cp->cpu_m.mcpu_pm_mach_state = NULL;
328 #endif
329 }
330 
331 void
332 cpupm_fini(cpu_t *cp)
333 {
334 	/*
335 	 * call (*cpus_fini)() ops to release the cpupm resource
336 	 * in the P/C/T-state driver
337 	 */
338 	cpupm_free(cp, B_FALSE);
339 }
340 
341 void
342 cpupm_start(cpu_t *cp)
343 {
344 	cpupm_init(cp);
345 }
346 
347 void
348 cpupm_stop(cpu_t *cp)
349 {
350 	/*
351 	 * call (*cpus_stop)() ops to reclaim the cpupm resource
352 	 * in the P/C/T-state driver
353 	 */
354 	cpupm_free(cp, B_TRUE);
355 }
356 
357 /*
358  * If A CPU has started and at least one power state is manageable,
359  * then the CPU is ready for power management.
360  */
361 boolean_t
362 cpupm_is_ready(cpu_t *cp)
363 {
364 #ifndef __xpv
365 	cpupm_mach_state_t *mach_state =
366 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
367 	uint32_t cpupm_caps = mach_state->ms_caps;
368 
369 	if (cpupm_enabled == CPUPM_NO_STATES)
370 		return (B_FALSE);
371 
372 	if ((cpupm_caps & CPUPM_T_STATES) ||
373 	    (cpupm_caps & CPUPM_P_STATES) ||
374 	    (cpupm_caps & CPUPM_C_STATES))
375 
376 		return (B_TRUE);
377 	return (B_FALSE);
378 #else
379 	_NOTE(ARGUNUSED(cp));
380 	return (B_FALSE);
381 #endif
382 }
383 
384 boolean_t
385 cpupm_is_enabled(uint32_t state)
386 {
387 	return ((cpupm_enabled & state) == state);
388 }
389 
390 /*
391  * By default, all states are enabled.
392  */
393 void
394 cpupm_disable(uint32_t state)
395 {
396 
397 	if (state & CPUPM_P_STATES) {
398 		cpupm_free_domains(&cpupm_pstate_domains);
399 	}
400 	if (state & CPUPM_T_STATES) {
401 		cpupm_free_domains(&cpupm_tstate_domains);
402 	}
403 	if (state & CPUPM_C_STATES) {
404 		cpupm_free_domains(&cpupm_cstate_domains);
405 	}
406 	cpupm_enabled &= ~state;
407 }
408 
409 /*
410  * Allocate power domains for C,P and T States
411  */
412 void
413 cpupm_alloc_domains(cpu_t *cp, int state)
414 {
415 	cpupm_mach_state_t *mach_state =
416 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
417 	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
418 	cpupm_state_domains_t **dom_ptr;
419 	cpupm_state_domains_t *dptr;
420 	cpupm_state_domains_t **mach_dom_state_ptr;
421 	uint32_t domain;
422 	uint32_t type;
423 
424 	switch (state) {
425 	case CPUPM_P_STATES:
426 		if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_PSD_CACHED)) {
427 			domain = CPU_ACPI_PSD(handle).sd_domain;
428 			type = CPU_ACPI_PSD(handle).sd_type;
429 		} else {
430 			mutex_enter(&cpu_lock);
431 			domain = cpuid_get_chipid(cp);
432 			mutex_exit(&cpu_lock);
433 			type = CPU_ACPI_HW_ALL;
434 		}
435 		dom_ptr = &cpupm_pstate_domains;
436 		mach_dom_state_ptr = &mach_state->ms_pstate.cma_domain;
437 		break;
438 	case CPUPM_T_STATES:
439 		if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_TSD_CACHED)) {
440 			domain = CPU_ACPI_TSD(handle).sd_domain;
441 			type = CPU_ACPI_TSD(handle).sd_type;
442 		} else {
443 			mutex_enter(&cpu_lock);
444 			domain = cpuid_get_chipid(cp);
445 			mutex_exit(&cpu_lock);
446 			type = CPU_ACPI_HW_ALL;
447 		}
448 		dom_ptr = &cpupm_tstate_domains;
449 		mach_dom_state_ptr = &mach_state->ms_tstate.cma_domain;
450 		break;
451 	case CPUPM_C_STATES:
452 		if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_CSD_CACHED)) {
453 			domain = CPU_ACPI_CSD(handle).sd_domain;
454 			type = CPU_ACPI_CSD(handle).sd_type;
455 		} else {
456 			mutex_enter(&cpu_lock);
457 			domain = cpuid_get_coreid(cp);
458 			mutex_exit(&cpu_lock);
459 			type = CPU_ACPI_HW_ALL;
460 		}
461 		dom_ptr = &cpupm_cstate_domains;
462 		mach_dom_state_ptr = &mach_state->ms_cstate.cma_domain;
463 		break;
464 	default:
465 		return;
466 	}
467 
468 	for (dptr = *dom_ptr; dptr != NULL; dptr = dptr->pm_next) {
469 		if (dptr->pm_domain == domain)
470 			break;
471 	}
472 
473 	/* new domain is created and linked at the head */
474 	if (dptr == NULL) {
475 		dptr = kmem_zalloc(sizeof (cpupm_state_domains_t), KM_SLEEP);
476 		dptr->pm_domain = domain;
477 		dptr->pm_type = type;
478 		dptr->pm_next = *dom_ptr;
479 		mutex_init(&dptr->pm_lock, NULL, MUTEX_SPIN,
480 		    (void *)ipltospl(DISP_LEVEL));
481 		CPUSET_ZERO(dptr->pm_cpus);
482 		*dom_ptr = dptr;
483 	}
484 	CPUSET_ADD(dptr->pm_cpus, cp->cpu_id);
485 	*mach_dom_state_ptr = dptr;
486 }
487 
488 /*
489  * Free C, P or T state power domains
490  */
491 void
492 cpupm_free_domains(cpupm_state_domains_t **dom_ptr)
493 {
494 	cpupm_state_domains_t *this_domain, *next_domain;
495 
496 	this_domain = *dom_ptr;
497 	while (this_domain != NULL) {
498 		next_domain = this_domain->pm_next;
499 		mutex_destroy(&this_domain->pm_lock);
500 		kmem_free((void *)this_domain,
501 		    sizeof (cpupm_state_domains_t));
502 		this_domain = next_domain;
503 	}
504 	*dom_ptr = NULL;
505 }
506 
507 /*
508  * Remove CPU from C, P or T state power domains
509  */
510 void
511 cpupm_remove_domains(cpu_t *cp, int state, cpupm_state_domains_t **dom_ptr)
512 {
513 	cpupm_mach_state_t *mach_state =
514 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
515 	cpupm_state_domains_t *dptr;
516 	uint32_t pm_domain;
517 	ulong_t iflag;
518 
519 	ASSERT(mach_state);
520 
521 	switch (state) {
522 	case CPUPM_P_STATES:
523 		pm_domain = mach_state->ms_pstate.cma_domain->pm_domain;
524 		break;
525 	case CPUPM_T_STATES:
526 		pm_domain = mach_state->ms_tstate.cma_domain->pm_domain;
527 		break;
528 	case CPUPM_C_STATES:
529 		pm_domain = mach_state->ms_cstate.cma_domain->pm_domain;
530 		break;
531 	default:
532 		return;
533 	}
534 
535 	/*
536 	 * Find the CPU C, P or T state power domain
537 	 */
538 	for (dptr = *dom_ptr; dptr != NULL; dptr = dptr->pm_next) {
539 		if (dptr->pm_domain == pm_domain)
540 			break;
541 	}
542 
543 	/*
544 	 * return if no matched domain found
545 	 */
546 	if (dptr == NULL)
547 		return;
548 
549 	/*
550 	 * We found one matched power domain, remove CPU from its cpuset.
551 	 * Interrupt is disabled here to avoid the race conditions between
552 	 * event change notification and cpu remove.
553 	 */
554 	iflag = intr_clear();
555 	mutex_enter(&dptr->pm_lock);
556 	if (CPU_IN_SET(dptr->pm_cpus, cp->cpu_id))
557 		CPUSET_DEL(dptr->pm_cpus, cp->cpu_id);
558 	mutex_exit(&dptr->pm_lock);
559 	intr_restore(iflag);
560 }
561 
562 void
563 cpupm_alloc_ms_cstate(cpu_t *cp)
564 {
565 	cpupm_mach_state_t *mach_state;
566 	cpupm_mach_acpi_state_t *ms_cstate;
567 
568 	mach_state = (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
569 	ms_cstate = &mach_state->ms_cstate;
570 	ASSERT(ms_cstate->cma_state.cstate == NULL);
571 	ms_cstate->cma_state.cstate = kmem_zalloc(sizeof (cma_c_state_t),
572 	    KM_SLEEP);
573 	ms_cstate->cma_state.cstate->cs_next_cstate = CPU_ACPI_C1;
574 }
575 
576 void
577 cpupm_free_ms_cstate(cpu_t *cp)
578 {
579 	cpupm_mach_state_t *mach_state =
580 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
581 	cpupm_mach_acpi_state_t *ms_cstate = &mach_state->ms_cstate;
582 
583 	if (ms_cstate->cma_state.cstate != NULL) {
584 		kmem_free(ms_cstate->cma_state.cstate, sizeof (cma_c_state_t));
585 		ms_cstate->cma_state.cstate = NULL;
586 	}
587 }
588 
589 void
590 cpupm_state_change(cpu_t *cp, int level, int state)
591 {
592 	cpupm_mach_state_t	*mach_state =
593 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
594 	cpupm_state_ops_t	*state_ops;
595 	cpupm_state_domains_t  	*state_domain;
596 	cpuset_t		set;
597 
598 	DTRACE_PROBE2(cpupm__state__change, cpu_t *, cp, int, level);
599 
600 	if (mach_state == NULL) {
601 		return;
602 	}
603 
604 	switch (state) {
605 	case CPUPM_P_STATES:
606 		state_ops = mach_state->ms_pstate.cma_ops;
607 		state_domain = mach_state->ms_pstate.cma_domain;
608 		break;
609 	case CPUPM_T_STATES:
610 		state_ops = mach_state->ms_tstate.cma_ops;
611 		state_domain = mach_state->ms_tstate.cma_domain;
612 		break;
613 	default:
614 		break;
615 	}
616 
617 	switch (state_domain->pm_type) {
618 	case CPU_ACPI_SW_ANY:
619 		/*
620 		 * A request on any CPU in the domain transitions the domain
621 		 */
622 		CPUSET_ONLY(set, cp->cpu_id);
623 		state_ops->cpus_change(set, level);
624 		break;
625 	case CPU_ACPI_SW_ALL:
626 		/*
627 		 * All CPUs in the domain must request the transition
628 		 */
629 	case CPU_ACPI_HW_ALL:
630 		/*
631 		 * P/T-state transitions are coordinated by the hardware
632 		 * For now, request the transition on all CPUs in the domain,
633 		 * but looking ahead we can probably be smarter about this.
634 		 */
635 		mutex_enter(&state_domain->pm_lock);
636 		state_ops->cpus_change(state_domain->pm_cpus, level);
637 		mutex_exit(&state_domain->pm_lock);
638 		break;
639 	default:
640 		cmn_err(CE_NOTE, "Unknown domain coordination type: %d",
641 		    state_domain->pm_type);
642 	}
643 }
644 
645 /*
646  * CPU PM interfaces exposed to the CPU power manager
647  */
648 /*ARGSUSED*/
649 id_t
650 cpupm_plat_domain_id(cpu_t *cp, cpupm_dtype_t type)
651 {
652 	cpupm_mach_state_t	*mach_state =
653 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
654 
655 	if ((mach_state == NULL) || (!cpupm_is_enabled(CPUPM_P_STATES) &&
656 	    !cpupm_is_enabled(CPUPM_C_STATES))) {
657 		return (CPUPM_NO_DOMAIN);
658 	}
659 	if (type == CPUPM_DTYPE_ACTIVE) {
660 		/*
661 		 * Return P-State domain for the specified CPU
662 		 */
663 		if (mach_state->ms_pstate.cma_domain) {
664 			return (mach_state->ms_pstate.cma_domain->pm_domain);
665 		}
666 	} else if (type == CPUPM_DTYPE_IDLE) {
667 		/*
668 		 * Return C-State domain for the specified CPU
669 		 */
670 		if (mach_state->ms_cstate.cma_domain) {
671 			return (mach_state->ms_cstate.cma_domain->pm_domain);
672 		}
673 	}
674 	return (CPUPM_NO_DOMAIN);
675 }
676 
677 /*ARGSUSED*/
678 uint_t
679 cpupm_plat_state_enumerate(cpu_t *cp, cpupm_dtype_t type,
680     cpupm_state_t *states)
681 {
682 	int	*speeds;
683 	uint_t	nspeeds, i;
684 
685 	/*
686 	 * Idle domain support unimplemented
687 	 */
688 	if (type != CPUPM_DTYPE_ACTIVE) {
689 		return (0);
690 	}
691 	nspeeds = cpupm_get_speeds(cp, &speeds);
692 
693 	/*
694 	 * If the caller passes NULL for states, just return the
695 	 * number of states.
696 	 */
697 	if (states != NULL) {
698 		for (i = 0; i < nspeeds; i++) {
699 			states[i].cps_speed = speeds[i];
700 			states[i].cps_handle = (cpupm_handle_t)i;
701 		}
702 	}
703 	cpupm_free_speeds(speeds, nspeeds);
704 	return (nspeeds);
705 }
706 
707 /*ARGSUSED*/
708 int
709 cpupm_plat_change_state(cpu_t *cp, cpupm_state_t *state)
710 {
711 	if (!cpupm_is_ready(cp))
712 		return (-1);
713 
714 	cpupm_state_change(cp, (int)state->cps_handle, CPUPM_P_STATES);
715 
716 	return (0);
717 }
718 
719 /*ARGSUSED*/
720 /*
721  * Note: It is the responsibility of the users of
722  * cpupm_get_speeds() to free the memory allocated
723  * for speeds using cpupm_free_speeds()
724  */
725 uint_t
726 cpupm_get_speeds(cpu_t *cp, int **speeds)
727 {
728 #ifndef __xpv
729 	cpupm_mach_state_t *mach_state =
730 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
731 	return (cpu_acpi_get_speeds(mach_state->ms_acpi_handle, speeds));
732 #else
733 	return (0);
734 #endif
735 }
736 
737 /*ARGSUSED*/
738 void
739 cpupm_free_speeds(int *speeds, uint_t nspeeds)
740 {
741 #ifndef __xpv
742 	cpu_acpi_free_speeds(speeds, nspeeds);
743 #endif
744 }
745 
746 /*
747  * All CPU instances have been initialized successfully.
748  */
749 boolean_t
750 cpupm_power_ready(cpu_t *cp)
751 {
752 	return (cpupm_is_enabled(CPUPM_P_STATES) && cpupm_is_ready(cp));
753 }
754 
755 /*
756  * All CPU instances have been initialized successfully.
757  */
758 boolean_t
759 cpupm_throttle_ready(cpu_t *cp)
760 {
761 	return (cpupm_is_enabled(CPUPM_T_STATES) && cpupm_is_ready(cp));
762 }
763 
764 /*
765  * All CPU instances have been initialized successfully.
766  */
767 boolean_t
768 cpupm_cstate_ready(cpu_t *cp)
769 {
770 	return (cpupm_is_enabled(CPUPM_C_STATES) && cpupm_is_ready(cp));
771 }
772 
773 void
774 cpupm_notify_handler(ACPI_HANDLE obj, UINT32 val, void *ctx)
775 {
776 	cpu_t *cp = ctx;
777 	cpupm_mach_state_t *mach_state =
778 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
779 	cpupm_notification_t *entry;
780 
781 	mutex_enter(&mach_state->ms_lock);
782 	for (entry =  mach_state->ms_handlers; entry != NULL;
783 	    entry = entry->nq_next) {
784 		entry->nq_handler(obj, val, entry->nq_ctx);
785 	}
786 	mutex_exit(&mach_state->ms_lock);
787 }
788 
789 /*ARGSUSED*/
790 void
791 cpupm_add_notify_handler(cpu_t *cp, CPUPM_NOTIFY_HANDLER handler, void *ctx)
792 {
793 #ifndef __xpv
794 	cpupm_mach_state_t *mach_state =
795 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
796 	cpupm_notification_t *entry;
797 
798 	entry = kmem_zalloc(sizeof (cpupm_notification_t), KM_SLEEP);
799 	entry->nq_handler = handler;
800 	entry->nq_ctx = ctx;
801 	mutex_enter(&mach_state->ms_lock);
802 	if (mach_state->ms_handlers == NULL) {
803 		entry->nq_next = NULL;
804 		mach_state->ms_handlers = entry;
805 		cpu_acpi_install_notify_handler(mach_state->ms_acpi_handle,
806 		    cpupm_notify_handler, cp);
807 
808 	} else {
809 		entry->nq_next = mach_state->ms_handlers;
810 		mach_state->ms_handlers = entry;
811 	}
812 	mutex_exit(&mach_state->ms_lock);
813 #endif
814 }
815 
816 /*ARGSUSED*/
817 static void
818 cpupm_free_notify_handlers(cpu_t *cp)
819 {
820 #ifndef __xpv
821 	cpupm_mach_state_t *mach_state =
822 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
823 	cpupm_notification_t *entry;
824 	cpupm_notification_t *next;
825 
826 	mutex_enter(&mach_state->ms_lock);
827 	if (mach_state->ms_handlers == NULL) {
828 		mutex_exit(&mach_state->ms_lock);
829 		return;
830 	}
831 	if (mach_state->ms_acpi_handle != NULL) {
832 		cpu_acpi_remove_notify_handler(mach_state->ms_acpi_handle,
833 		    cpupm_notify_handler);
834 	}
835 	entry = mach_state->ms_handlers;
836 	while (entry != NULL) {
837 		next = entry->nq_next;
838 		kmem_free(entry, sizeof (cpupm_notification_t));
839 		entry = next;
840 	}
841 	mach_state->ms_handlers = NULL;
842 	mutex_exit(&mach_state->ms_lock);
843 #endif
844 }
845 
846 /*
847  * Get the current max speed from the ACPI _PPC object
848  */
849 /*ARGSUSED*/
850 int
851 cpupm_get_top_speed(cpu_t *cp)
852 {
853 #ifndef __xpv
854 	cpupm_mach_state_t 	*mach_state;
855 	cpu_acpi_handle_t 	handle;
856 	int 			plat_level;
857 	uint_t			nspeeds;
858 	int			max_level;
859 
860 	mach_state =
861 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
862 	handle = mach_state->ms_acpi_handle;
863 
864 	cpu_acpi_cache_ppc(handle);
865 	plat_level = CPU_ACPI_PPC(handle);
866 
867 	nspeeds = CPU_ACPI_PSTATES_COUNT(handle);
868 
869 	max_level = nspeeds - 1;
870 	if ((plat_level < 0) || (plat_level > max_level)) {
871 		cmn_err(CE_NOTE, "!cpupm_get_top_speed: CPU %d: "
872 		    "_PPC out of range %d", cp->cpu_id, plat_level);
873 		plat_level = 0;
874 	}
875 
876 	return (plat_level);
877 #else
878 	return (0);
879 #endif
880 }
881 
882 /*
883  * This notification handler is called whenever the ACPI _PPC
884  * object changes. The _PPC is a sort of governor on power levels.
885  * It sets an upper threshold on which, _PSS defined, power levels
886  * are usuable. The _PPC value is dynamic and may change as properties
887  * (i.e., thermal or AC source) of the system change.
888  */
889 
890 static void
891 cpupm_power_manage_notifications(void *ctx)
892 {
893 	cpu_t			*cp = ctx;
894 	int			top_speed;
895 
896 	top_speed = cpupm_get_top_speed(cp);
897 	cpupm_redefine_max_activepwr_state(cp, top_speed);
898 }
899 
900 /* ARGSUSED */
901 static void
902 cpupm_event_notify_handler(ACPI_HANDLE obj, UINT32 val, void *ctx)
903 {
904 #ifndef __xpv
905 
906 	cpu_t *cp = ctx;
907 	cpupm_mach_state_t *mach_state =
908 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
909 
910 	if (mach_state == NULL)
911 		return;
912 
913 	/*
914 	 * Currently, we handle _TPC,_CST and _PPC change notifications.
915 	 */
916 	if (val == CPUPM_TPC_CHANGE_NOTIFICATION &&
917 	    mach_state->ms_caps & CPUPM_T_STATES) {
918 		cpupm_throttle_manage_notification(ctx);
919 	} else if (val == CPUPM_CST_CHANGE_NOTIFICATION &&
920 	    mach_state->ms_caps & CPUPM_C_STATES) {
921 		cpuidle_manage_cstates(ctx);
922 	} else if (val == CPUPM_PPC_CHANGE_NOTIFICATION &&
923 	    mach_state->ms_caps & CPUPM_P_STATES) {
924 		cpupm_power_manage_notifications(ctx);
925 	}
926 #endif
927 }
928 
929 /*
930  * Update cpupm cstate data each time CPU exits idle.
931  */
932 void
933 cpupm_wakeup_cstate_data(cma_c_state_t *cs_data, hrtime_t end)
934 {
935 	cs_data->cs_idle_exit = end;
936 }
937 
938 /*
939  * Determine next cstate based on cpupm data.
940  * Update cpupm cstate data each time CPU goes idle.
941  * Do as much as possible in the idle state bookkeeping function because the
942  * performance impact while idle is minimal compared to in the wakeup function
943  * when there is real work to do.
944  */
945 uint32_t
946 cpupm_next_cstate(cma_c_state_t *cs_data, cpu_acpi_cstate_t *cstates,
947     uint32_t cs_count, hrtime_t start)
948 {
949 	hrtime_t duration;
950 	hrtime_t ave_interval;
951 	hrtime_t ave_idle_time;
952 	uint32_t i, smpl_cnt;
953 
954 	duration = cs_data->cs_idle_exit - cs_data->cs_idle_enter;
955 	scalehrtime(&duration);
956 	cs_data->cs_idle += duration;
957 	cs_data->cs_idle_enter = start;
958 
959 	smpl_cnt = ++cs_data->cs_cnt;
960 	cs_data->cs_smpl_len = start - cs_data->cs_smpl_start;
961 	scalehrtime(&cs_data->cs_smpl_len);
962 	if (cs_data->cs_smpl_len > cpupm_cs_sample_interval) {
963 		cs_data->cs_smpl_idle = cs_data->cs_idle;
964 		cs_data->cs_idle = 0;
965 		cs_data->cs_smpl_idle_pct = ((100 * cs_data->cs_smpl_idle) /
966 		    cs_data->cs_smpl_len);
967 
968 		cs_data->cs_smpl_start = start;
969 		cs_data->cs_cnt = 0;
970 
971 		/*
972 		 * Strand level C-state policy
973 		 * The cpu_acpi_cstate_t *cstates array is not required to
974 		 * have an entry for both CPU_ACPI_C2 and CPU_ACPI_C3.
975 		 * There are cs_count entries in the cstates array.
976 		 * cs_data->cs_next_cstate contains the index of the next
977 		 * C-state this CPU should enter.
978 		 */
979 		ASSERT(cstates[0].cs_type == CPU_ACPI_C1);
980 
981 		/*
982 		 * Will CPU be idle long enough to save power?
983 		 */
984 		ave_idle_time = (cs_data->cs_smpl_idle / smpl_cnt) / 1000;
985 		for (i = 1; i < cs_count; ++i) {
986 			if (ave_idle_time < (cstates[i].cs_latency *
987 			    cpupm_cs_idle_save_tunable)) {
988 				cs_count = i;
989 				DTRACE_PROBE2(cpupm__next__cstate, cpu_t *,
990 				    CPU, int, i);
991 			}
992 		}
993 
994 		/*
995 		 * Wakeup often (even when non-idle time is very short)?
996 		 * Some producer/consumer type loads fall into this category.
997 		 */
998 		ave_interval = (cs_data->cs_smpl_len / smpl_cnt) / 1000;
999 		for (i = 1; i < cs_count; ++i) {
1000 			if (ave_interval <= (cstates[i].cs_latency *
1001 			    cpupm_cs_idle_cost_tunable)) {
1002 				cs_count = i;
1003 				DTRACE_PROBE2(cpupm__next__cstate, cpu_t *,
1004 				    CPU, int, (CPU_MAX_CSTATES + i));
1005 			}
1006 		}
1007 
1008 		/*
1009 		 * Idle percent
1010 		 */
1011 		for (i = 1; i < cs_count; ++i) {
1012 			switch (cstates[i].cs_type) {
1013 			case CPU_ACPI_C2:
1014 				if (cs_data->cs_smpl_idle_pct <
1015 				    cpupm_C2_idle_pct_tunable) {
1016 					cs_count = i;
1017 					DTRACE_PROBE2(cpupm__next__cstate,
1018 					    cpu_t *, CPU, int,
1019 					    ((2 * CPU_MAX_CSTATES) + i));
1020 				}
1021 				break;
1022 
1023 			case CPU_ACPI_C3:
1024 				if (cs_data->cs_smpl_idle_pct <
1025 				    cpupm_C3_idle_pct_tunable) {
1026 					cs_count = i;
1027 					DTRACE_PROBE2(cpupm__next__cstate,
1028 					    cpu_t *, CPU, int,
1029 					    ((2 * CPU_MAX_CSTATES) + i));
1030 				}
1031 				break;
1032 			}
1033 		}
1034 
1035 		cs_data->cs_next_cstate = cs_count - 1;
1036 	}
1037 
1038 	return (cs_data->cs_next_cstate);
1039 }
1040