xref: /illumos-gate/usr/src/uts/i86pc/os/cpupm/cpupm_mach.c (revision 29091f177aaaca472c062f5f3f5d66ac1799e1a7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2009, Intel Corporation.
27  * All rights reserved.
28  */
29 
30 #include <sys/cpu_pm.h>
31 #include <sys/x86_archext.h>
32 #include <sys/sdt.h>
33 #include <sys/spl.h>
34 #include <sys/machsystm.h>
35 #include <sys/archsystm.h>
36 #include <sys/hpet.h>
37 #include <sys/acpi/acpi.h>
38 #include <sys/acpica.h>
39 #include <sys/cpupm.h>
40 #include <sys/cpu_idle.h>
41 #include <sys/cpu_acpi.h>
42 #include <sys/cpupm_throttle.h>
43 #include <sys/dtrace.h>
44 #include <sys/note.h>
45 
/*
 * This callback is used to build the PPM CPU domains once
 * a CPU device has been started. The callback is initialized
 * by the PPM driver to point to a routine that will build the
 * domains.
 */
void (*cpupm_ppm_alloc_pstate_domains)(cpu_t *);

/*
 * This callback is used to remove CPU from the PPM CPU domains
 * when the cpu driver is detached. The callback is initialized
 * by the PPM driver to point to a routine that will remove CPU
 * from the domains.
 */
void (*cpupm_ppm_free_pstate_domains)(cpu_t *);

/*
 * This callback is used to redefine the topspeed for a CPU device.
 * Since all CPUs in a domain should have identical properties, this
 * callback is initialized by the PPM driver to point to a routine
 * that will redefine the topspeed for all devices in a CPU domain.
 * This callback is exercised whenever an ACPI _PPC change notification
 * is received by the CPU driver.
 */
void (*cpupm_redefine_topspeed)(void *);

/*
 * This callback is used by the PPM driver to call into the CPU driver
 * to set a new topspeed for a CPU.
 */
void (*cpupm_set_topspeed_callb)(void *, int);

/*
 * This callback is used by the PPM driver to call into the CPU driver
 * to find a CPU's current topspeed (i.e., its current ACPI _PPC value).
 */
int (*cpupm_get_topspeed_callb)(void *);
83 
/*
 * Forward declarations for the file-local notification helpers.
 */
static void cpupm_event_notify_handler(ACPI_HANDLE, UINT32, void *);
static void cpupm_free_notify_handlers(cpu_t *);
static void cpupm_power_manage_notifications(void *);

/*
 * Until proven otherwise, all power states are manageable.
 * Bits are cleared by cpupm_disable() and tested by cpupm_is_enabled().
 */
static uint32_t cpupm_enabled = CPUPM_ALL_STATES;

/*
 * Heads of the singly linked (pm_next) lists of P-, T- and C-state
 * power domains.  Entries are added by cpupm_alloc_domains() and the
 * lists are torn down by cpupm_free_domains().
 */
cpupm_state_domains_t *cpupm_pstate_domains = NULL;
cpupm_state_domains_t *cpupm_tstate_domains = NULL;
cpupm_state_domains_t *cpupm_cstate_domains = NULL;
96 
/*
 * c-state tunables
 *
 * cpupm_cs_sample_interval is the length of time we wait before
 * recalculating c-state statistics.  When a CPU goes idle it checks
 * to see if it has been longer than cpupm_cs_sample_interval since it last
 * calculated which C-state to go to.
 *
 * cpupm_cs_idle_cost_tunable is the ratio of time CPU spends executing + idle
 * divided by time spent in the idle state transitions.
 * A value of 10 means the CPU will not spend more than 1/10 of its time
 * in idle latency.  The worst case performance will be 90% of non Deep C-state
 * kernel.
 *
 * cpupm_cs_idle_save_tunable is how long we must stay in a deeper C-state
 * before it is worth going there.  Expressed as a multiple of latency.
 *
 * cpupm_C2_idle_pct_tunable and cpupm_C3_idle_pct_tunable are the sampled
 * idle percentages below which cpupm_next_cstate() will not select a C2 or
 * C3 entry, respectively.
 */
uint32_t cpupm_cs_sample_interval = 100*1000*1000;	/* 100 milliseconds */
uint32_t cpupm_cs_idle_cost_tunable = 10;	/* work time / latency cost */
uint32_t cpupm_cs_idle_save_tunable = 2;	/* idle power savings */
uint16_t cpupm_C2_idle_pct_tunable = 70;
uint16_t cpupm_C3_idle_pct_tunable = 80;
119 
120 #ifndef __xpv
121 extern boolean_t cpupm_intel_init(cpu_t *);
122 extern boolean_t cpupm_amd_init(cpu_t *);
123 
124 typedef struct cpupm_vendor {
125 	boolean_t	(*cpuv_init)(cpu_t *);
126 } cpupm_vendor_t;
127 
128 /*
129  * Table of supported vendors.
130  */
131 static cpupm_vendor_t cpupm_vendors[] = {
132 	cpupm_intel_init,
133 	cpupm_amd_init,
134 	NULL
135 };
136 #endif
137 
138 /*
139  * Initialize the machine.
140  * See if a module exists for managing power for this CPU.
141  */
142 /*ARGSUSED*/
143 void
144 cpupm_init(cpu_t *cp)
145 {
146 #ifndef __xpv
147 	cpupm_vendor_t *vendors;
148 	cpupm_mach_state_t *mach_state;
149 	struct machcpu *mcpu = &(cp->cpu_m);
150 	static boolean_t first = B_TRUE;
151 	int *speeds;
152 	uint_t nspeeds;
153 	int ret;
154 
155 	mach_state = cp->cpu_m.mcpu_pm_mach_state =
156 	    kmem_zalloc(sizeof (cpupm_mach_state_t), KM_SLEEP);
157 	mach_state->ms_caps = CPUPM_NO_STATES;
158 	mutex_init(&mach_state->ms_lock, NULL, MUTEX_DRIVER, NULL);
159 
160 	mach_state->ms_acpi_handle = cpu_acpi_init(cp);
161 	if (mach_state->ms_acpi_handle == NULL) {
162 		cpupm_fini(cp);
163 		cmn_err(CE_WARN, "!cpupm_init: processor %d: "
164 		    "unable to get ACPI handle", cp->cpu_id);
165 		cmn_err(CE_NOTE, "!CPU power management will not function.");
166 		CPUPM_DISABLE();
167 		first = B_FALSE;
168 		return;
169 	}
170 
171 	/*
172 	 * Loop through the CPU management module table and see if
173 	 * any of the modules implement CPU power management
174 	 * for this CPU.
175 	 */
176 	for (vendors = cpupm_vendors; vendors->cpuv_init != NULL; vendors++) {
177 		if (vendors->cpuv_init(cp))
178 			break;
179 	}
180 
181 	/*
182 	 * Nope, we can't power manage this CPU.
183 	 */
184 	if (vendors == NULL) {
185 		cpupm_fini(cp);
186 		CPUPM_DISABLE();
187 		first = B_FALSE;
188 		return;
189 	}
190 
191 	/*
192 	 * If P-state support exists for this system, then initialize it.
193 	 */
194 	if (mach_state->ms_pstate.cma_ops != NULL) {
195 		ret = mach_state->ms_pstate.cma_ops->cpus_init(cp);
196 		if (ret != 0) {
197 			mach_state->ms_pstate.cma_ops = NULL;
198 			cpupm_disable(CPUPM_P_STATES);
199 		} else {
200 			nspeeds = cpupm_get_speeds(cp, &speeds);
201 			if (nspeeds == 0) {
202 				cmn_err(CE_NOTE, "!cpupm_init: processor %d:"
203 				    " no speeds to manage", cp->cpu_id);
204 			} else {
205 				cpupm_set_supp_freqs(cp, speeds, nspeeds);
206 				cpupm_free_speeds(speeds, nspeeds);
207 				mach_state->ms_caps |= CPUPM_P_STATES;
208 			}
209 		}
210 	} else {
211 		cpupm_disable(CPUPM_P_STATES);
212 	}
213 
214 	if (mach_state->ms_tstate.cma_ops != NULL) {
215 		ret = mach_state->ms_tstate.cma_ops->cpus_init(cp);
216 		if (ret != 0) {
217 			mach_state->ms_tstate.cma_ops = NULL;
218 			cpupm_disable(CPUPM_T_STATES);
219 		} else {
220 			mach_state->ms_caps |= CPUPM_T_STATES;
221 		}
222 	} else {
223 		cpupm_disable(CPUPM_T_STATES);
224 	}
225 
226 	/*
227 	 * If C-states support exists for this system, then initialize it.
228 	 */
229 	if (mach_state->ms_cstate.cma_ops != NULL) {
230 		ret = mach_state->ms_cstate.cma_ops->cpus_init(cp);
231 		if (ret != 0) {
232 			mach_state->ms_cstate.cma_ops = NULL;
233 			mcpu->max_cstates = CPU_ACPI_C1;
234 			cpupm_disable(CPUPM_C_STATES);
235 			idle_cpu = non_deep_idle_cpu;
236 			disp_enq_thread = non_deep_idle_disp_enq_thread;
237 		} else if (cpu_deep_cstates_supported()) {
238 			mcpu->max_cstates = cpu_acpi_get_max_cstates(
239 			    mach_state->ms_acpi_handle);
240 			if (mcpu->max_cstates > CPU_ACPI_C1) {
241 				(void) cstate_timer_callback(
242 				    CST_EVENT_MULTIPLE_CSTATES);
243 				CPU->cpu_m.mcpu_idle_cpu = cpu_acpi_idle;
244 				mcpu->mcpu_idle_type = CPU_ACPI_C1;
245 				disp_enq_thread = cstate_wakeup;
246 			} else {
247 				(void) cstate_timer_callback(
248 				    CST_EVENT_ONE_CSTATE);
249 			}
250 			mach_state->ms_caps |= CPUPM_C_STATES;
251 		} else {
252 			mcpu->max_cstates = CPU_ACPI_C1;
253 			idle_cpu = non_deep_idle_cpu;
254 			disp_enq_thread = non_deep_idle_disp_enq_thread;
255 		}
256 	} else {
257 		cpupm_disable(CPUPM_C_STATES);
258 	}
259 
260 
261 	if (mach_state->ms_caps == CPUPM_NO_STATES) {
262 		cpupm_fini(cp);
263 		CPUPM_DISABLE();
264 		first = B_FALSE;
265 		return;
266 	}
267 
268 	if ((mach_state->ms_caps & CPUPM_T_STATES) ||
269 	    (mach_state->ms_caps & CPUPM_P_STATES) ||
270 	    (mach_state->ms_caps & CPUPM_C_STATES)) {
271 		if (first) {
272 			acpica_write_cpupm_capabilities(
273 			    mach_state->ms_caps & CPUPM_P_STATES,
274 			    mach_state->ms_caps & CPUPM_C_STATES);
275 		}
276 		if (mach_state->ms_caps & CPUPM_T_STATES) {
277 			cpupm_throttle_manage_notification(cp);
278 		}
279 		if (mach_state->ms_caps & CPUPM_C_STATES) {
280 			cpuidle_manage_cstates(cp);
281 		}
282 		if (mach_state->ms_caps & CPUPM_P_STATES) {
283 			cpupm_power_manage_notifications(cp);
284 		}
285 		cpupm_add_notify_handler(cp, cpupm_event_notify_handler, cp);
286 	}
287 	first = B_FALSE;
288 #endif
289 }
290 
291 /*
292  * Free any resources allocated during cpupm initialization or cpupm start.
293  */
294 /*ARGSUSED*/
295 void
296 cpupm_free(cpu_t *cp, boolean_t cpupm_stop)
297 {
298 #ifndef __xpv
299 	cpupm_mach_state_t *mach_state =
300 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
301 
302 	if (mach_state == NULL)
303 		return;
304 
305 	if (mach_state->ms_pstate.cma_ops != NULL) {
306 		if (cpupm_stop)
307 			mach_state->ms_pstate.cma_ops->cpus_stop(cp);
308 		else
309 			mach_state->ms_pstate.cma_ops->cpus_fini(cp);
310 		mach_state->ms_pstate.cma_ops = NULL;
311 	}
312 
313 	if (mach_state->ms_tstate.cma_ops != NULL) {
314 		if (cpupm_stop)
315 			mach_state->ms_tstate.cma_ops->cpus_stop(cp);
316 		else
317 			mach_state->ms_tstate.cma_ops->cpus_fini(cp);
318 		mach_state->ms_tstate.cma_ops = NULL;
319 	}
320 
321 	if (mach_state->ms_cstate.cma_ops != NULL) {
322 		if (cpupm_stop)
323 			mach_state->ms_cstate.cma_ops->cpus_stop(cp);
324 		else
325 			mach_state->ms_cstate.cma_ops->cpus_fini(cp);
326 
327 		mach_state->ms_cstate.cma_ops = NULL;
328 	}
329 
330 	cpupm_free_notify_handlers(cp);
331 
332 	if (mach_state->ms_acpi_handle != NULL) {
333 		cpu_acpi_fini(mach_state->ms_acpi_handle);
334 		mach_state->ms_acpi_handle = NULL;
335 	}
336 
337 	mutex_destroy(&mach_state->ms_lock);
338 	kmem_free(mach_state, sizeof (cpupm_mach_state_t));
339 	cp->cpu_m.mcpu_pm_mach_state = NULL;
340 #endif
341 }
342 
343 void
344 cpupm_fini(cpu_t *cp)
345 {
346 	/*
347 	 * call (*cpus_fini)() ops to release the cpupm resource
348 	 * in the P/C/T-state driver
349 	 */
350 	cpupm_free(cp, B_FALSE);
351 }
352 
353 void
354 cpupm_start(cpu_t *cp)
355 {
356 	cpupm_init(cp);
357 }
358 
359 void
360 cpupm_stop(cpu_t *cp)
361 {
362 	/*
363 	 * call (*cpus_stop)() ops to reclaim the cpupm resource
364 	 * in the P/C/T-state driver
365 	 */
366 	cpupm_free(cp, B_TRUE);
367 }
368 
369 /*
370  * If A CPU has started and at least one power state is manageable,
371  * then the CPU is ready for power management.
372  */
373 boolean_t
374 cpupm_is_ready(cpu_t *cp)
375 {
376 #ifndef __xpv
377 	cpupm_mach_state_t *mach_state =
378 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
379 	uint32_t cpupm_caps = mach_state->ms_caps;
380 
381 	if (cpupm_enabled == CPUPM_NO_STATES)
382 		return (B_FALSE);
383 
384 	if ((cpupm_caps & CPUPM_T_STATES) ||
385 	    (cpupm_caps & CPUPM_P_STATES) ||
386 	    (cpupm_caps & CPUPM_C_STATES))
387 
388 		return (B_TRUE);
389 	return (B_FALSE);
390 #else
391 	_NOTE(ARGUNUSED(cp));
392 	return (B_FALSE);
393 #endif
394 }
395 
396 boolean_t
397 cpupm_is_enabled(uint32_t state)
398 {
399 	return ((cpupm_enabled & state) == state);
400 }
401 
402 /*
403  * By default, all states are enabled.
404  */
405 void
406 cpupm_disable(uint32_t state)
407 {
408 
409 	if (state & CPUPM_P_STATES) {
410 		cpupm_free_domains(&cpupm_pstate_domains);
411 	}
412 	if (state & CPUPM_T_STATES) {
413 		cpupm_free_domains(&cpupm_tstate_domains);
414 	}
415 	if (state & CPUPM_C_STATES) {
416 		cpupm_free_domains(&cpupm_cstate_domains);
417 	}
418 	cpupm_enabled &= ~state;
419 }
420 
421 /*
422  * Allocate power domains for C,P and T States
423  */
424 void
425 cpupm_alloc_domains(cpu_t *cp, int state)
426 {
427 	cpupm_mach_state_t *mach_state =
428 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
429 	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
430 	cpupm_state_domains_t **dom_ptr;
431 	cpupm_state_domains_t *dptr;
432 	cpupm_state_domains_t **mach_dom_state_ptr;
433 	uint32_t domain;
434 	uint32_t type;
435 
436 	switch (state) {
437 	case CPUPM_P_STATES:
438 		if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_PSD_CACHED)) {
439 			domain = CPU_ACPI_PSD(handle).sd_domain;
440 			type = CPU_ACPI_PSD(handle).sd_type;
441 		} else {
442 			mutex_enter(&cpu_lock);
443 			domain = cpuid_get_chipid(cp);
444 			mutex_exit(&cpu_lock);
445 			type = CPU_ACPI_HW_ALL;
446 		}
447 		dom_ptr = &cpupm_pstate_domains;
448 		mach_dom_state_ptr = &mach_state->ms_pstate.cma_domain;
449 		break;
450 	case CPUPM_T_STATES:
451 		if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_TSD_CACHED)) {
452 			domain = CPU_ACPI_TSD(handle).sd_domain;
453 			type = CPU_ACPI_TSD(handle).sd_type;
454 		} else {
455 			mutex_enter(&cpu_lock);
456 			domain = cpuid_get_chipid(cp);
457 			mutex_exit(&cpu_lock);
458 			type = CPU_ACPI_HW_ALL;
459 		}
460 		dom_ptr = &cpupm_tstate_domains;
461 		mach_dom_state_ptr = &mach_state->ms_tstate.cma_domain;
462 		break;
463 	case CPUPM_C_STATES:
464 		if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_CSD_CACHED)) {
465 			domain = CPU_ACPI_CSD(handle).sd_domain;
466 			type = CPU_ACPI_CSD(handle).sd_type;
467 		} else {
468 			mutex_enter(&cpu_lock);
469 			domain = cpuid_get_coreid(cp);
470 			mutex_exit(&cpu_lock);
471 			type = CPU_ACPI_HW_ALL;
472 		}
473 		dom_ptr = &cpupm_cstate_domains;
474 		mach_dom_state_ptr = &mach_state->ms_cstate.cma_domain;
475 		break;
476 	default:
477 		return;
478 	}
479 
480 	for (dptr = *dom_ptr; dptr != NULL; dptr = dptr->pm_next) {
481 		if (dptr->pm_domain == domain)
482 			break;
483 	}
484 
485 	/* new domain is created and linked at the head */
486 	if (dptr == NULL) {
487 		dptr = kmem_zalloc(sizeof (cpupm_state_domains_t), KM_SLEEP);
488 		dptr->pm_domain = domain;
489 		dptr->pm_type = type;
490 		dptr->pm_next = *dom_ptr;
491 		mutex_init(&dptr->pm_lock, NULL, MUTEX_SPIN,
492 		    (void *)ipltospl(DISP_LEVEL));
493 		CPUSET_ZERO(dptr->pm_cpus);
494 		*dom_ptr = dptr;
495 	}
496 	CPUSET_ADD(dptr->pm_cpus, cp->cpu_id);
497 	*mach_dom_state_ptr = dptr;
498 }
499 
500 /*
501  * Free C, P or T state power domains
502  */
503 void
504 cpupm_free_domains(cpupm_state_domains_t **dom_ptr)
505 {
506 	cpupm_state_domains_t *this_domain, *next_domain;
507 
508 	this_domain = *dom_ptr;
509 	while (this_domain != NULL) {
510 		next_domain = this_domain->pm_next;
511 		mutex_destroy(&this_domain->pm_lock);
512 		kmem_free((void *)this_domain,
513 		    sizeof (cpupm_state_domains_t));
514 		this_domain = next_domain;
515 	}
516 	*dom_ptr = NULL;
517 }
518 
519 /*
520  * Remove CPU from C, P or T state power domains
521  */
522 void
523 cpupm_remove_domains(cpu_t *cp, int state, cpupm_state_domains_t **dom_ptr)
524 {
525 	cpupm_mach_state_t *mach_state =
526 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
527 	cpupm_state_domains_t *dptr;
528 	uint32_t pm_domain;
529 	ulong_t iflag;
530 
531 	ASSERT(mach_state);
532 
533 	switch (state) {
534 	case CPUPM_P_STATES:
535 		pm_domain = mach_state->ms_pstate.cma_domain->pm_domain;
536 		break;
537 	case CPUPM_T_STATES:
538 		pm_domain = mach_state->ms_tstate.cma_domain->pm_domain;
539 		break;
540 	case CPUPM_C_STATES:
541 		pm_domain = mach_state->ms_cstate.cma_domain->pm_domain;
542 		break;
543 	default:
544 		return;
545 	}
546 
547 	/*
548 	 * Find the CPU C, P or T state power domain
549 	 */
550 	for (dptr = *dom_ptr; dptr != NULL; dptr = dptr->pm_next) {
551 		if (dptr->pm_domain == pm_domain)
552 			break;
553 	}
554 
555 	/*
556 	 * return if no matched domain found
557 	 */
558 	if (dptr == NULL)
559 		return;
560 
561 	/*
562 	 * We found one matched power domain, remove CPU from its cpuset.
563 	 * Interrupt is disabled here to avoid the race conditions between
564 	 * event change notification and cpu remove.
565 	 */
566 	iflag = intr_clear();
567 	mutex_enter(&dptr->pm_lock);
568 	if (CPU_IN_SET(dptr->pm_cpus, cp->cpu_id))
569 		CPUSET_DEL(dptr->pm_cpus, cp->cpu_id);
570 	mutex_exit(&dptr->pm_lock);
571 	intr_restore(iflag);
572 }
573 
574 void
575 cpupm_alloc_ms_cstate(cpu_t *cp)
576 {
577 	cpupm_mach_state_t *mach_state;
578 	cpupm_mach_acpi_state_t *ms_cstate;
579 
580 	mach_state = (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
581 	ms_cstate = &mach_state->ms_cstate;
582 	ASSERT(ms_cstate->cma_state.cstate == NULL);
583 	ms_cstate->cma_state.cstate = kmem_zalloc(sizeof (cma_c_state_t),
584 	    KM_SLEEP);
585 	ms_cstate->cma_state.cstate->cs_next_cstate = CPU_ACPI_C1;
586 }
587 
588 void
589 cpupm_free_ms_cstate(cpu_t *cp)
590 {
591 	cpupm_mach_state_t *mach_state =
592 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
593 	cpupm_mach_acpi_state_t *ms_cstate = &mach_state->ms_cstate;
594 
595 	if (ms_cstate->cma_state.cstate != NULL) {
596 		kmem_free(ms_cstate->cma_state.cstate, sizeof (cma_c_state_t));
597 		ms_cstate->cma_state.cstate = NULL;
598 	}
599 }
600 
601 void
602 cpupm_state_change(cpu_t *cp, int level, int state)
603 {
604 	cpupm_mach_state_t	*mach_state =
605 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
606 	cpupm_state_ops_t	*state_ops;
607 	cpupm_state_domains_t  	*state_domain;
608 	cpuset_t		set;
609 
610 	DTRACE_PROBE2(cpupm__state__change, cpu_t *, cp, int, level);
611 
612 	if (mach_state == NULL) {
613 		return;
614 	}
615 
616 	switch (state) {
617 	case CPUPM_P_STATES:
618 		state_ops = mach_state->ms_pstate.cma_ops;
619 		state_domain = mach_state->ms_pstate.cma_domain;
620 		break;
621 	case CPUPM_T_STATES:
622 		state_ops = mach_state->ms_tstate.cma_ops;
623 		state_domain = mach_state->ms_tstate.cma_domain;
624 		break;
625 	default:
626 		break;
627 	}
628 
629 	switch (state_domain->pm_type) {
630 	case CPU_ACPI_SW_ANY:
631 		/*
632 		 * A request on any CPU in the domain transitions the domain
633 		 */
634 		CPUSET_ONLY(set, cp->cpu_id);
635 		state_ops->cpus_change(set, level);
636 		break;
637 	case CPU_ACPI_SW_ALL:
638 		/*
639 		 * All CPUs in the domain must request the transition
640 		 */
641 	case CPU_ACPI_HW_ALL:
642 		/*
643 		 * P/T-state transitions are coordinated by the hardware
644 		 * For now, request the transition on all CPUs in the domain,
645 		 * but looking ahead we can probably be smarter about this.
646 		 */
647 		mutex_enter(&state_domain->pm_lock);
648 		state_ops->cpus_change(state_domain->pm_cpus, level);
649 		mutex_exit(&state_domain->pm_lock);
650 		break;
651 	default:
652 		cmn_err(CE_NOTE, "Unknown domain coordination type: %d",
653 		    state_domain->pm_type);
654 	}
655 }
656 
657 /*
658  * CPU PM interfaces exposed to the CPU power manager
659  */
660 /*ARGSUSED*/
661 id_t
662 cpupm_plat_domain_id(cpu_t *cp, cpupm_dtype_t type)
663 {
664 	cpupm_mach_state_t	*mach_state =
665 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
666 
667 	if ((mach_state == NULL) || (!cpupm_is_enabled(CPUPM_P_STATES) &&
668 	    !cpupm_is_enabled(CPUPM_C_STATES))) {
669 		return (CPUPM_NO_DOMAIN);
670 	}
671 	if (type == CPUPM_DTYPE_ACTIVE) {
672 		/*
673 		 * Return P-State domain for the specified CPU
674 		 */
675 		if (mach_state->ms_pstate.cma_domain) {
676 			return (mach_state->ms_pstate.cma_domain->pm_domain);
677 		}
678 	} else if (type == CPUPM_DTYPE_IDLE) {
679 		/*
680 		 * Return C-State domain for the specified CPU
681 		 */
682 		if (mach_state->ms_cstate.cma_domain) {
683 			return (mach_state->ms_cstate.cma_domain->pm_domain);
684 		}
685 	}
686 	return (CPUPM_NO_DOMAIN);
687 }
688 
689 /*ARGSUSED*/
690 uint_t
691 cpupm_plat_state_enumerate(cpu_t *cp, cpupm_dtype_t type,
692     cpupm_state_t *states)
693 {
694 	int	*speeds;
695 	uint_t	nspeeds, i;
696 
697 	/*
698 	 * Idle domain support unimplemented
699 	 */
700 	if (type != CPUPM_DTYPE_ACTIVE) {
701 		return (0);
702 	}
703 	nspeeds = cpupm_get_speeds(cp, &speeds);
704 
705 	/*
706 	 * If the caller passes NULL for states, just return the
707 	 * number of states.
708 	 */
709 	if (states != NULL) {
710 		for (i = 0; i < nspeeds; i++) {
711 			states[i].cps_speed = speeds[i];
712 			states[i].cps_handle = (cpupm_handle_t)i;
713 		}
714 	}
715 	cpupm_free_speeds(speeds, nspeeds);
716 	return (nspeeds);
717 }
718 
719 /*ARGSUSED*/
720 int
721 cpupm_plat_change_state(cpu_t *cp, cpupm_state_t *state)
722 {
723 	if (!cpupm_is_ready(cp))
724 		return (-1);
725 
726 	cpupm_state_change(cp, (int)state->cps_handle, CPUPM_P_STATES);
727 
728 	return (0);
729 }
730 
731 /*ARGSUSED*/
732 /*
733  * Note: It is the responsibility of the users of
734  * cpupm_get_speeds() to free the memory allocated
735  * for speeds using cpupm_free_speeds()
736  */
737 uint_t
738 cpupm_get_speeds(cpu_t *cp, int **speeds)
739 {
740 #ifndef __xpv
741 	cpupm_mach_state_t *mach_state =
742 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
743 	return (cpu_acpi_get_speeds(mach_state->ms_acpi_handle, speeds));
744 #else
745 	return (0);
746 #endif
747 }
748 
749 /*ARGSUSED*/
750 void
751 cpupm_free_speeds(int *speeds, uint_t nspeeds)
752 {
753 #ifndef __xpv
754 	cpu_acpi_free_speeds(speeds, nspeeds);
755 #endif
756 }
757 
758 /*
759  * All CPU instances have been initialized successfully.
760  */
761 boolean_t
762 cpupm_power_ready(cpu_t *cp)
763 {
764 	return (cpupm_is_enabled(CPUPM_P_STATES) && cpupm_is_ready(cp));
765 }
766 
767 /*
768  * All CPU instances have been initialized successfully.
769  */
770 boolean_t
771 cpupm_throttle_ready(cpu_t *cp)
772 {
773 	return (cpupm_is_enabled(CPUPM_T_STATES) && cpupm_is_ready(cp));
774 }
775 
776 /*
777  * All CPU instances have been initialized successfully.
778  */
779 boolean_t
780 cpupm_cstate_ready(cpu_t *cp)
781 {
782 	return (cpupm_is_enabled(CPUPM_C_STATES) && cpupm_is_ready(cp));
783 }
784 
785 void
786 cpupm_notify_handler(ACPI_HANDLE obj, UINT32 val, void *ctx)
787 {
788 	cpu_t *cp = ctx;
789 	cpupm_mach_state_t *mach_state =
790 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
791 	cpupm_notification_t *entry;
792 
793 	mutex_enter(&mach_state->ms_lock);
794 	for (entry =  mach_state->ms_handlers; entry != NULL;
795 	    entry = entry->nq_next) {
796 		entry->nq_handler(obj, val, entry->nq_ctx);
797 	}
798 	mutex_exit(&mach_state->ms_lock);
799 }
800 
801 /*ARGSUSED*/
802 void
803 cpupm_add_notify_handler(cpu_t *cp, CPUPM_NOTIFY_HANDLER handler, void *ctx)
804 {
805 #ifndef __xpv
806 	cpupm_mach_state_t *mach_state =
807 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
808 	cpupm_notification_t *entry;
809 
810 	entry = kmem_zalloc(sizeof (cpupm_notification_t), KM_SLEEP);
811 	entry->nq_handler = handler;
812 	entry->nq_ctx = ctx;
813 	mutex_enter(&mach_state->ms_lock);
814 	if (mach_state->ms_handlers == NULL) {
815 		entry->nq_next = NULL;
816 		mach_state->ms_handlers = entry;
817 		cpu_acpi_install_notify_handler(mach_state->ms_acpi_handle,
818 		    cpupm_notify_handler, cp);
819 
820 	} else {
821 		entry->nq_next = mach_state->ms_handlers;
822 		mach_state->ms_handlers = entry;
823 	}
824 	mutex_exit(&mach_state->ms_lock);
825 #endif
826 }
827 
828 /*ARGSUSED*/
829 static void
830 cpupm_free_notify_handlers(cpu_t *cp)
831 {
832 #ifndef __xpv
833 	cpupm_mach_state_t *mach_state =
834 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
835 	cpupm_notification_t *entry;
836 	cpupm_notification_t *next;
837 
838 	mutex_enter(&mach_state->ms_lock);
839 	if (mach_state->ms_handlers == NULL) {
840 		mutex_exit(&mach_state->ms_lock);
841 		return;
842 	}
843 	if (mach_state->ms_acpi_handle != NULL) {
844 		cpu_acpi_remove_notify_handler(mach_state->ms_acpi_handle,
845 		    cpupm_notify_handler);
846 	}
847 	entry = mach_state->ms_handlers;
848 	while (entry != NULL) {
849 		next = entry->nq_next;
850 		kmem_free(entry, sizeof (cpupm_notification_t));
851 		entry = next;
852 	}
853 	mach_state->ms_handlers = NULL;
854 	mutex_exit(&mach_state->ms_lock);
855 #endif
856 }
857 
858 /*
859  * Get the current max speed from the ACPI _PPC object
860  */
861 /*ARGSUSED*/
862 int
863 cpupm_get_top_speed(cpu_t *cp)
864 {
865 #ifndef __xpv
866 	cpupm_mach_state_t 	*mach_state;
867 	cpu_acpi_handle_t 	handle;
868 	int 			plat_level;
869 	uint_t			nspeeds;
870 	int			max_level;
871 
872 	mach_state =
873 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
874 	handle = mach_state->ms_acpi_handle;
875 
876 	cpu_acpi_cache_ppc(handle);
877 	plat_level = CPU_ACPI_PPC(handle);
878 
879 	nspeeds = CPU_ACPI_PSTATES_COUNT(handle);
880 
881 	max_level = nspeeds - 1;
882 	if ((plat_level < 0) || (plat_level > max_level)) {
883 		cmn_err(CE_NOTE, "!cpupm_get_top_speed: CPU %d: "
884 		    "_PPC out of range %d", cp->cpu_id, plat_level);
885 		plat_level = 0;
886 	}
887 
888 	return (plat_level);
889 #else
890 	return (0);
891 #endif
892 }
893 
894 /*
895  * This notification handler is called whenever the ACPI _PPC
896  * object changes. The _PPC is a sort of governor on power levels.
897  * It sets an upper threshold on which, _PSS defined, power levels
898  * are usuable. The _PPC value is dynamic and may change as properties
899  * (i.e., thermal or AC source) of the system change.
900  */
901 
902 static void
903 cpupm_power_manage_notifications(void *ctx)
904 {
905 	cpu_t			*cp = ctx;
906 	int			top_speed;
907 
908 	top_speed = cpupm_get_top_speed(cp);
909 	cpupm_redefine_max_activepwr_state(cp, top_speed);
910 }
911 
912 /* ARGSUSED */
913 static void
914 cpupm_event_notify_handler(ACPI_HANDLE obj, UINT32 val, void *ctx)
915 {
916 #ifndef __xpv
917 
918 	cpu_t *cp = ctx;
919 	cpupm_mach_state_t *mach_state =
920 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
921 
922 	if (mach_state == NULL)
923 		return;
924 
925 	/*
926 	 * Currently, we handle _TPC,_CST and _PPC change notifications.
927 	 */
928 	if (val == CPUPM_TPC_CHANGE_NOTIFICATION &&
929 	    mach_state->ms_caps & CPUPM_T_STATES) {
930 		cpupm_throttle_manage_notification(ctx);
931 	} else if (val == CPUPM_CST_CHANGE_NOTIFICATION &&
932 	    mach_state->ms_caps & CPUPM_C_STATES) {
933 		cpuidle_manage_cstates(ctx);
934 	} else if (val == CPUPM_PPC_CHANGE_NOTIFICATION &&
935 	    mach_state->ms_caps & CPUPM_P_STATES) {
936 		cpupm_power_manage_notifications(ctx);
937 	}
938 #endif
939 }
940 
941 /*
942  * Update cpupm cstate data each time CPU exits idle.
943  */
944 void
945 cpupm_wakeup_cstate_data(cma_c_state_t *cs_data, hrtime_t end)
946 {
947 	cs_data->cs_idle_exit = end;
948 }
949 
950 /*
951  * Determine next cstate based on cpupm data.
952  * Update cpupm cstate data each time CPU goes idle.
953  * Do as much as possible in the idle state bookkeeping function because the
954  * performance impact while idle is minimal compared to in the wakeup function
955  * when there is real work to do.
956  */
957 uint32_t
958 cpupm_next_cstate(cma_c_state_t *cs_data, cpu_acpi_cstate_t *cstates,
959     uint32_t cs_count, hrtime_t start)
960 {
961 	hrtime_t duration;
962 	hrtime_t ave_interval;
963 	hrtime_t ave_idle_time;
964 	uint32_t i, smpl_cnt;
965 
966 	duration = cs_data->cs_idle_exit - cs_data->cs_idle_enter;
967 	scalehrtime(&duration);
968 	cs_data->cs_idle += duration;
969 	cs_data->cs_idle_enter = start;
970 
971 	smpl_cnt = ++cs_data->cs_cnt;
972 	cs_data->cs_smpl_len = start - cs_data->cs_smpl_start;
973 	scalehrtime(&cs_data->cs_smpl_len);
974 	if (cs_data->cs_smpl_len > cpupm_cs_sample_interval) {
975 		cs_data->cs_smpl_idle = cs_data->cs_idle;
976 		cs_data->cs_idle = 0;
977 		cs_data->cs_smpl_idle_pct = ((100 * cs_data->cs_smpl_idle) /
978 		    cs_data->cs_smpl_len);
979 
980 		cs_data->cs_smpl_start = start;
981 		cs_data->cs_cnt = 0;
982 
983 		/*
984 		 * Strand level C-state policy
985 		 * The cpu_acpi_cstate_t *cstates array is not required to
986 		 * have an entry for both CPU_ACPI_C2 and CPU_ACPI_C3.
987 		 * There are cs_count entries in the cstates array.
988 		 * cs_data->cs_next_cstate contains the index of the next
989 		 * C-state this CPU should enter.
990 		 */
991 		ASSERT(cstates[0].cs_type == CPU_ACPI_C1);
992 
993 		/*
994 		 * Will CPU be idle long enough to save power?
995 		 */
996 		ave_idle_time = (cs_data->cs_smpl_idle / smpl_cnt) / 1000;
997 		for (i = 1; i < cs_count; ++i) {
998 			if (ave_idle_time < (cstates[i].cs_latency *
999 			    cpupm_cs_idle_save_tunable)) {
1000 				cs_count = i;
1001 				DTRACE_PROBE2(cpupm__next__cstate, cpu_t *,
1002 				    CPU, int, i);
1003 			}
1004 		}
1005 
1006 		/*
1007 		 * Wakeup often (even when non-idle time is very short)?
1008 		 * Some producer/consumer type loads fall into this category.
1009 		 */
1010 		ave_interval = (cs_data->cs_smpl_len / smpl_cnt) / 1000;
1011 		for (i = 1; i < cs_count; ++i) {
1012 			if (ave_interval <= (cstates[i].cs_latency *
1013 			    cpupm_cs_idle_cost_tunable)) {
1014 				cs_count = i;
1015 				DTRACE_PROBE2(cpupm__next__cstate, cpu_t *,
1016 				    CPU, int, (CPU_MAX_CSTATES + i));
1017 			}
1018 		}
1019 
1020 		/*
1021 		 * Idle percent
1022 		 */
1023 		for (i = 1; i < cs_count; ++i) {
1024 			switch (cstates[i].cs_type) {
1025 			case CPU_ACPI_C2:
1026 				if (cs_data->cs_smpl_idle_pct <
1027 				    cpupm_C2_idle_pct_tunable) {
1028 					cs_count = i;
1029 					DTRACE_PROBE2(cpupm__next__cstate,
1030 					    cpu_t *, CPU, int,
1031 					    ((2 * CPU_MAX_CSTATES) + i));
1032 				}
1033 				break;
1034 
1035 			case CPU_ACPI_C3:
1036 				if (cs_data->cs_smpl_idle_pct <
1037 				    cpupm_C3_idle_pct_tunable) {
1038 					cs_count = i;
1039 					DTRACE_PROBE2(cpupm__next__cstate,
1040 					    cpu_t *, CPU, int,
1041 					    ((2 * CPU_MAX_CSTATES) + i));
1042 				}
1043 				break;
1044 			}
1045 		}
1046 
1047 		cs_data->cs_next_cstate = cs_count - 1;
1048 	}
1049 
1050 	return (cs_data->cs_next_cstate);
1051 }
1052