xref: /illumos-gate/usr/src/uts/i86pc/os/cpupm/cpupm_mach.c (revision 5ad1f010a7b934be6e0dd6c13198af62791824be)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2009, Intel Corporation.
27  * All rights reserved.
28  */
29 
30 #include <sys/cpu_pm.h>
31 #include <sys/x86_archext.h>
32 #include <sys/sdt.h>
33 #include <sys/spl.h>
34 #include <sys/machsystm.h>
35 #include <sys/hpet.h>
36 #include <sys/acpi/acpi.h>
37 #include <sys/acpica.h>
38 #include <sys/cpupm.h>
39 #include <sys/cpu_idle.h>
40 #include <sys/cpu_acpi.h>
41 #include <sys/cpupm_throttle.h>
42 #include <sys/dtrace.h>
43 
/*
 * This callback is used to build the PPM CPU domains once
 * all the CPU devices have been started. The callback is
 * initialized by the PPM driver to point to a routine that
 * will build the domains. It is invoked from cpupm_post_startup()
 * when it is non-NULL.
 */
void (*cpupm_rebuild_cpu_domains)(void);

/*
 * This callback is used to reset the topspeed for all the
 * CPU devices. The callback is initialized by the PPM driver to
 * point to a routine that will reinitialize all the CPU devices
 * once all the CPU devices have been started and the CPU domains
 * built. It is invoked from cpupm_post_startup() when it is non-NULL
 * and P-states are enabled.
 */
void (*cpupm_init_topspeed)(void);

/*
 * This callback is used to redefine the topspeed for a CPU device.
 * Since all CPUs in a domain should have identical properties, this
 * callback is initialized by the PPM driver to point to a routine
 * that will redefine the topspeed for all devices in a CPU domain.
 * This callback is exercised whenever an ACPI _PPC change notification
 * is received by the CPU driver.
 */
void (*cpupm_redefine_topspeed)(void *);

/*
 * This callback is used by the PPM driver to call into the CPU driver
 * to set a new topspeed for a CPU.
 */
void (*cpupm_set_topspeed_callb)(void *, int);

/*
 * This callback is used by the PPM driver to call into the CPU driver
 * to find a CPU's current topspeed (i.e., its current ACPI _PPC value).
 */
int (*cpupm_get_topspeed_callb)(void *);
82 
/*
 * Forward declarations for the file-local notification helpers
 * defined later in this file.
 */
static void cpupm_event_notify_handler(ACPI_HANDLE, UINT32, void *);
static void cpupm_free_notify_handlers(cpu_t *);

/*
 * Until proven otherwise, all power states are manageable.
 * Bits are cleared by cpupm_disable().
 */
static uint32_t cpupm_enabled = CPUPM_ALL_STATES;

/*
 * Until all CPUs have started, we do not allow
 * power management. Set to B_TRUE by cpupm_post_startup().
 */
static boolean_t cpupm_ready = B_FALSE;

/*
 * Heads of the singly-linked (pm_next) lists of P-, T- and C-state
 * domains. Entries are added at the head by cpupm_alloc_domains() and
 * the whole list is released by cpupm_free_domains().
 */
cpupm_state_domains_t *cpupm_pstate_domains = NULL;
cpupm_state_domains_t *cpupm_tstate_domains = NULL;
cpupm_state_domains_t *cpupm_cstate_domains = NULL;
100 
/*
 * c-state tunables
 *
 * cpupm_cs_sample_interval is the length of time we wait before
 * recalculating c-state statistics.  When a CPU goes idle it checks
 * to see if it has been longer than cpupm_cs_sample_interval since it last
 * calculated which C-state to go to.
 *
 * cpupm_cs_idle_cost_tunable is the ratio of time CPU spends executing + idle
 * divided by time spent in the idle state transitions.
 * A value of 10 means the CPU will not spend more than 1/10 of its time
 * in idle latency.  The worst case performance will be 90% of non Deep C-state
 * kernel.
 *
 * cpupm_cs_idle_save_tunable is how long we must stay in a deeper C-state
 * before it is worth going there.  Expressed as a multiple of latency.
 *
 * cpupm_C2_idle_pct_tunable and cpupm_C3_idle_pct_tunable are the sampled
 * idle percentages below which cpupm_next_cstate() will not select a
 * C2 or C3 entry, respectively.
 */
uint32_t cpupm_cs_sample_interval = 100*1000*1000;	/* 100 milliseconds */
uint32_t cpupm_cs_idle_cost_tunable = 10;	/* work time / latency cost */
uint32_t cpupm_cs_idle_save_tunable = 2;	/* idle power savings */
uint16_t cpupm_C2_idle_pct_tunable = 70;	/* min idle % to allow C2 */
uint16_t cpupm_C3_idle_pct_tunable = 80;	/* min idle % to allow C3 */
123 
124 #ifndef __xpv
125 extern boolean_t cpupm_intel_init(cpu_t *);
126 extern boolean_t cpupm_amd_init(cpu_t *);
127 
128 typedef struct cpupm_vendor {
129 	boolean_t	(*cpuv_init)(cpu_t *);
130 } cpupm_vendor_t;
131 
132 /*
133  * Table of supported vendors.
134  */
135 static cpupm_vendor_t cpupm_vendors[] = {
136 	cpupm_intel_init,
137 	cpupm_amd_init,
138 	NULL
139 };
140 #endif
141 
142 /*
143  * Initialize the machine.
144  * See if a module exists for managing power for this CPU.
145  */
146 /*ARGSUSED*/
147 void
148 cpupm_init(cpu_t *cp)
149 {
150 #ifndef __xpv
151 	cpupm_vendor_t *vendors;
152 	cpupm_mach_state_t *mach_state;
153 	struct machcpu *mcpu = &(cp->cpu_m);
154 	static boolean_t first = B_TRUE;
155 	int *speeds;
156 	uint_t nspeeds;
157 	int ret;
158 
159 	mach_state = cp->cpu_m.mcpu_pm_mach_state =
160 	    kmem_zalloc(sizeof (cpupm_mach_state_t), KM_SLEEP);
161 	mach_state->ms_caps = CPUPM_NO_STATES;
162 	mutex_init(&mach_state->ms_lock, NULL, MUTEX_DRIVER, NULL);
163 
164 	mach_state->ms_acpi_handle = cpu_acpi_init(cp);
165 	if (mach_state->ms_acpi_handle == NULL) {
166 		cpupm_free(cp);
167 		cmn_err(CE_WARN, "!cpupm_init: processor %d: "
168 		    "unable to get ACPI handle", cp->cpu_id);
169 		cmn_err(CE_NOTE, "!CPU power management will not function.");
170 		CPUPM_DISABLE();
171 		first = B_FALSE;
172 		return;
173 	}
174 
175 	/*
176 	 * Loop through the CPU management module table and see if
177 	 * any of the modules implement CPU power management
178 	 * for this CPU.
179 	 */
180 	for (vendors = cpupm_vendors; vendors->cpuv_init != NULL; vendors++) {
181 		if (vendors->cpuv_init(cp))
182 			break;
183 	}
184 
185 	/*
186 	 * Nope, we can't power manage this CPU.
187 	 */
188 	if (vendors == NULL) {
189 		cpupm_free(cp);
190 		CPUPM_DISABLE();
191 		first = B_FALSE;
192 		return;
193 	}
194 
195 	/*
196 	 * If P-state support exists for this system, then initialize it.
197 	 */
198 	if (mach_state->ms_pstate.cma_ops != NULL) {
199 		ret = mach_state->ms_pstate.cma_ops->cpus_init(cp);
200 		if (ret != 0) {
201 			mach_state->ms_pstate.cma_ops = NULL;
202 			cpupm_disable(CPUPM_P_STATES);
203 		} else {
204 			nspeeds = cpupm_get_speeds(cp, &speeds);
205 			if (nspeeds == 0) {
206 				cmn_err(CE_NOTE, "!cpupm_init: processor %d:"
207 				    " no speeds to manage", cp->cpu_id);
208 			} else {
209 				cpupm_set_supp_freqs(cp, speeds, nspeeds);
210 				cpupm_free_speeds(speeds, nspeeds);
211 				mach_state->ms_caps |= CPUPM_P_STATES;
212 			}
213 		}
214 	}
215 
216 	if (mach_state->ms_tstate.cma_ops != NULL) {
217 		ret = mach_state->ms_tstate.cma_ops->cpus_init(cp);
218 		if (ret != 0) {
219 			mach_state->ms_tstate.cma_ops = NULL;
220 			cpupm_disable(CPUPM_T_STATES);
221 		} else {
222 			mach_state->ms_caps |= CPUPM_T_STATES;
223 		}
224 	}
225 
226 	/*
227 	 * If C-states support exists for this system, then initialize it.
228 	 */
229 	if (mach_state->ms_cstate.cma_ops != NULL) {
230 		ret = mach_state->ms_cstate.cma_ops->cpus_init(cp);
231 		if (ret != 0) {
232 			mach_state->ms_cstate.cma_ops = NULL;
233 			mcpu->max_cstates = CPU_ACPI_C1;
234 			cpupm_disable(CPUPM_C_STATES);
235 			idle_cpu = non_deep_idle_cpu;
236 			disp_enq_thread = non_deep_idle_disp_enq_thread;
237 		} else if (cpu_deep_cstates_supported()) {
238 			mcpu->max_cstates = cpu_acpi_get_max_cstates(
239 			    mach_state->ms_acpi_handle);
240 			if (mcpu->max_cstates > CPU_ACPI_C1) {
241 				(void) cstate_timer_callback(
242 				    CST_EVENT_MULTIPLE_CSTATES);
243 				CPU->cpu_m.mcpu_idle_cpu = cpu_acpi_idle;
244 				mcpu->mcpu_idle_type = CPU_ACPI_C1;
245 				disp_enq_thread = cstate_wakeup;
246 			} else {
247 				(void) cstate_timer_callback(
248 				    CST_EVENT_ONE_CSTATE);
249 			}
250 			mach_state->ms_caps |= CPUPM_C_STATES;
251 		} else {
252 			mcpu->max_cstates = CPU_ACPI_C1;
253 			idle_cpu = non_deep_idle_cpu;
254 			disp_enq_thread = non_deep_idle_disp_enq_thread;
255 		}
256 	}
257 
258 
259 	if (mach_state->ms_caps == CPUPM_NO_STATES) {
260 		cpupm_free(cp);
261 		CPUPM_DISABLE();
262 		first = B_FALSE;
263 		return;
264 	}
265 
266 	if ((mach_state->ms_caps & CPUPM_T_STATES) ||
267 	    (mach_state->ms_caps & CPUPM_P_STATES) ||
268 	    (mach_state->ms_caps & CPUPM_C_STATES)) {
269 		cpupm_add_notify_handler(cp, cpupm_event_notify_handler, cp);
270 		if (first) {
271 			acpica_write_cpupm_capabilities(
272 			    mach_state->ms_caps & CPUPM_P_STATES,
273 			    mach_state->ms_caps & CPUPM_C_STATES);
274 		}
275 	}
276 	first = B_FALSE;
277 #endif
278 }
279 
280 /*
281  * Free any resources allocated by cpupm_init().
282  */
283 /*ARGSUSED*/
284 void
285 cpupm_free(cpu_t *cp)
286 {
287 #ifndef __xpv
288 	cpupm_mach_state_t *mach_state =
289 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
290 
291 	if (mach_state == NULL)
292 		return;
293 	if (mach_state->ms_pstate.cma_ops != NULL) {
294 		mach_state->ms_pstate.cma_ops->cpus_fini(cp);
295 		mach_state->ms_pstate.cma_ops = NULL;
296 	}
297 
298 	if (mach_state->ms_tstate.cma_ops != NULL) {
299 		mach_state->ms_tstate.cma_ops->cpus_fini(cp);
300 		mach_state->ms_tstate.cma_ops = NULL;
301 	}
302 
303 	if (mach_state->ms_cstate.cma_ops != NULL) {
304 		mach_state->ms_cstate.cma_ops->cpus_fini(cp);
305 		mach_state->ms_cstate.cma_ops = NULL;
306 	}
307 
308 	cpupm_free_notify_handlers(cp);
309 
310 	if (mach_state->ms_acpi_handle != NULL) {
311 		cpu_acpi_fini(mach_state->ms_acpi_handle);
312 		mach_state->ms_acpi_handle = NULL;
313 	}
314 
315 	mutex_destroy(&mach_state->ms_lock);
316 	kmem_free(mach_state, sizeof (cpupm_mach_state_t));
317 	cp->cpu_m.mcpu_pm_mach_state = NULL;
318 #endif
319 }
320 
321 /*
322  * If all CPUs have started and at least one power state is manageable,
323  * then the CPUs are ready for power management.
324  */
325 boolean_t
326 cpupm_is_ready()
327 {
328 #ifndef __xpv
329 	if (cpupm_enabled == CPUPM_NO_STATES)
330 		return (B_FALSE);
331 	return (cpupm_ready);
332 #else
333 	return (B_FALSE);
334 #endif
335 
336 }
337 
338 boolean_t
339 cpupm_is_enabled(uint32_t state)
340 {
341 	return ((cpupm_enabled & state) == state);
342 }
343 
344 /*
345  * By default, all states are enabled.
346  */
347 void
348 cpupm_disable(uint32_t state)
349 {
350 
351 	if (state & CPUPM_P_STATES) {
352 		cpupm_free_domains(&cpupm_pstate_domains);
353 	}
354 	if (state & CPUPM_T_STATES) {
355 		cpupm_free_domains(&cpupm_tstate_domains);
356 	}
357 	if (state & CPUPM_C_STATES) {
358 		cpupm_free_domains(&cpupm_cstate_domains);
359 	}
360 	cpupm_enabled &= ~state;
361 }
362 
363 /*
364  * Once all CPUs have been started, the PPM driver should build CPU
365  * domains and initialize the topspeed for all CPU devices.
366  */
367 void
368 cpupm_post_startup()
369 {
370 #ifndef __xpv
371 	/*
372 	 * The CPU domain built by the PPM during CPUs attaching
373 	 * should be rebuilt with the information retrieved from
374 	 * ACPI.
375 	 */
376 	if (cpupm_rebuild_cpu_domains != NULL)
377 		(*cpupm_rebuild_cpu_domains)();
378 
379 	/*
380 	 * Only initialize the topspeed if P-states are enabled.
381 	 */
382 	if (cpupm_enabled & CPUPM_P_STATES && cpupm_init_topspeed != NULL)
383 		(*cpupm_init_topspeed)();
384 #endif
385 	cpupm_ready = B_TRUE;
386 }
387 
388 /*
389  * Allocate power domains for C,P and T States
390  */
391 void
392 cpupm_alloc_domains(cpu_t *cp, int state)
393 {
394 	cpupm_mach_state_t *mach_state =
395 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
396 	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
397 	cpupm_state_domains_t **dom_ptr;
398 	cpupm_state_domains_t *dptr;
399 	cpupm_state_domains_t **mach_dom_state_ptr;
400 	uint32_t domain;
401 	uint32_t type;
402 
403 	switch (state) {
404 	case CPUPM_P_STATES:
405 		if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_PSD_CACHED)) {
406 			domain = CPU_ACPI_PSD(handle).sd_domain;
407 			type = CPU_ACPI_PSD(handle).sd_type;
408 		} else {
409 			mutex_enter(&cpu_lock);
410 			domain = cpuid_get_chipid(cp);
411 			mutex_exit(&cpu_lock);
412 			type = CPU_ACPI_HW_ALL;
413 		}
414 		dom_ptr = &cpupm_pstate_domains;
415 		mach_dom_state_ptr = &mach_state->ms_pstate.cma_domain;
416 		break;
417 	case CPUPM_T_STATES:
418 		if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_TSD_CACHED)) {
419 			domain = CPU_ACPI_TSD(handle).sd_domain;
420 			type = CPU_ACPI_TSD(handle).sd_type;
421 		} else {
422 			mutex_enter(&cpu_lock);
423 			domain = cpuid_get_chipid(cp);
424 			mutex_exit(&cpu_lock);
425 			type = CPU_ACPI_HW_ALL;
426 		}
427 		dom_ptr = &cpupm_tstate_domains;
428 		mach_dom_state_ptr = &mach_state->ms_tstate.cma_domain;
429 		break;
430 	case CPUPM_C_STATES:
431 		if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_CSD_CACHED)) {
432 			domain = CPU_ACPI_CSD(handle).sd_domain;
433 			type = CPU_ACPI_CSD(handle).sd_type;
434 		} else {
435 			mutex_enter(&cpu_lock);
436 			domain = cpuid_get_coreid(cp);
437 			mutex_exit(&cpu_lock);
438 			type = CPU_ACPI_HW_ALL;
439 		}
440 		dom_ptr = &cpupm_cstate_domains;
441 		mach_dom_state_ptr = &mach_state->ms_cstate.cma_domain;
442 		break;
443 	default:
444 		return;
445 	}
446 
447 	for (dptr = *dom_ptr; dptr != NULL; dptr = dptr->pm_next) {
448 		if (dptr->pm_domain == domain)
449 			break;
450 	}
451 
452 	/* new domain is created and linked at the head */
453 	if (dptr == NULL) {
454 		dptr = kmem_zalloc(sizeof (cpupm_state_domains_t), KM_SLEEP);
455 		dptr->pm_domain = domain;
456 		dptr->pm_type = type;
457 		dptr->pm_next = *dom_ptr;
458 		mutex_init(&dptr->pm_lock, NULL, MUTEX_SPIN,
459 		    (void *)ipltospl(DISP_LEVEL));
460 		CPUSET_ZERO(dptr->pm_cpus);
461 		*dom_ptr = dptr;
462 	}
463 	CPUSET_ADD(dptr->pm_cpus, cp->cpu_id);
464 	*mach_dom_state_ptr = dptr;
465 }
466 
467 /*
468  * Free C, P or T state power domains
469  */
470 void
471 cpupm_free_domains(cpupm_state_domains_t **dom_ptr)
472 {
473 	cpupm_state_domains_t *this_domain, *next_domain;
474 
475 	this_domain = *dom_ptr;
476 	while (this_domain != NULL) {
477 		next_domain = this_domain->pm_next;
478 		mutex_destroy(&this_domain->pm_lock);
479 		kmem_free((void *)this_domain,
480 		    sizeof (cpupm_state_domains_t));
481 		this_domain = next_domain;
482 	}
483 	*dom_ptr = NULL;
484 }
485 
486 void
487 cpupm_alloc_ms_cstate(cpu_t *cp)
488 {
489 	cpupm_mach_state_t *mach_state;
490 	cpupm_mach_acpi_state_t *ms_cstate;
491 
492 	mach_state = (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
493 	ms_cstate = &mach_state->ms_cstate;
494 	ASSERT(ms_cstate->cma_state.cstate == NULL);
495 	ms_cstate->cma_state.cstate = kmem_zalloc(sizeof (cma_c_state_t),
496 	    KM_SLEEP);
497 	ms_cstate->cma_state.cstate->cs_next_cstate = CPU_ACPI_C1;
498 }
499 
500 void
501 cpupm_free_ms_cstate(cpu_t *cp)
502 {
503 	cpupm_mach_state_t *mach_state =
504 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
505 	cpupm_mach_acpi_state_t *ms_cstate = &mach_state->ms_cstate;
506 
507 	if (ms_cstate->cma_state.cstate != NULL) {
508 		kmem_free(ms_cstate->cma_state.cstate, sizeof (cma_c_state_t));
509 		ms_cstate->cma_state.cstate = NULL;
510 	}
511 }
512 
513 void
514 cpupm_state_change(cpu_t *cp, int level, int state)
515 {
516 	cpupm_mach_state_t	*mach_state =
517 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
518 	cpupm_state_ops_t	*state_ops;
519 	cpupm_state_domains_t  	*state_domain;
520 	cpuset_t		set;
521 
522 	DTRACE_PROBE2(cpupm__state__change, cpu_t *, cp, int, level);
523 
524 	if (mach_state == NULL) {
525 		return;
526 	}
527 
528 	switch (state) {
529 	case CPUPM_P_STATES:
530 		state_ops = mach_state->ms_pstate.cma_ops;
531 		state_domain = mach_state->ms_pstate.cma_domain;
532 		break;
533 	case CPUPM_T_STATES:
534 		state_ops = mach_state->ms_tstate.cma_ops;
535 		state_domain = mach_state->ms_tstate.cma_domain;
536 		break;
537 	default:
538 		break;
539 	}
540 
541 	switch (state_domain->pm_type) {
542 	case CPU_ACPI_SW_ANY:
543 		/*
544 		 * A request on any CPU in the domain transitions the domain
545 		 */
546 		CPUSET_ONLY(set, cp->cpu_id);
547 		state_ops->cpus_change(set, level);
548 		break;
549 	case CPU_ACPI_SW_ALL:
550 		/*
551 		 * All CPUs in the domain must request the transition
552 		 */
553 	case CPU_ACPI_HW_ALL:
554 		/*
555 		 * P/T-state transitions are coordinated by the hardware
556 		 * For now, request the transition on all CPUs in the domain,
557 		 * but looking ahead we can probably be smarter about this.
558 		 */
559 		mutex_enter(&state_domain->pm_lock);
560 		state_ops->cpus_change(state_domain->pm_cpus, level);
561 		mutex_exit(&state_domain->pm_lock);
562 		break;
563 	default:
564 		cmn_err(CE_NOTE, "Unknown domain coordination type: %d",
565 		    state_domain->pm_type);
566 	}
567 }
568 
569 /*
570  * CPU PM interfaces exposed to the CPU power manager
571  */
572 /*ARGSUSED*/
573 id_t
574 cpupm_plat_domain_id(cpu_t *cp, cpupm_dtype_t type)
575 {
576 	cpupm_mach_state_t	*mach_state =
577 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
578 
579 	if ((mach_state == NULL) || (!cpupm_is_enabled(CPUPM_P_STATES) &&
580 	    !cpupm_is_enabled(CPUPM_C_STATES))) {
581 		return (CPUPM_NO_DOMAIN);
582 	}
583 	if (type == CPUPM_DTYPE_ACTIVE) {
584 		/*
585 		 * Return P-State domain for the specified CPU
586 		 */
587 		if (mach_state->ms_pstate.cma_domain) {
588 			return (mach_state->ms_pstate.cma_domain->pm_domain);
589 		}
590 	} else if (type == CPUPM_DTYPE_IDLE) {
591 		/*
592 		 * Return C-State domain for the specified CPU
593 		 */
594 		if (mach_state->ms_cstate.cma_domain) {
595 			return (mach_state->ms_cstate.cma_domain->pm_domain);
596 		}
597 	}
598 	return (CPUPM_NO_DOMAIN);
599 }
600 
601 /*ARGSUSED*/
602 uint_t
603 cpupm_plat_state_enumerate(cpu_t *cp, cpupm_dtype_t type,
604     cpupm_state_t *states)
605 {
606 	int	*speeds;
607 	uint_t	nspeeds, i;
608 
609 	/*
610 	 * Idle domain support unimplemented
611 	 */
612 	if (type != CPUPM_DTYPE_ACTIVE) {
613 		return (0);
614 	}
615 	nspeeds = cpupm_get_speeds(cp, &speeds);
616 
617 	/*
618 	 * If the caller passes NULL for states, just return the
619 	 * number of states.
620 	 */
621 	if (states != NULL) {
622 		for (i = 0; i < nspeeds; i++) {
623 			states[i].cps_speed = speeds[i];
624 			states[i].cps_handle = (cpupm_handle_t)i;
625 		}
626 	}
627 	cpupm_free_speeds(speeds, nspeeds);
628 	return (nspeeds);
629 }
630 
631 /*ARGSUSED*/
632 int
633 cpupm_plat_change_state(cpu_t *cp, cpupm_state_t *state)
634 {
635 	if (!cpupm_is_ready())
636 		return (-1);
637 
638 	cpupm_state_change(cp, (int)state->cps_handle, CPUPM_P_STATES);
639 
640 	return (0);
641 }
642 
643 /*ARGSUSED*/
644 /*
645  * Note: It is the responsibility of the users of
646  * cpupm_get_speeds() to free the memory allocated
647  * for speeds using cpupm_free_speeds()
648  */
649 uint_t
650 cpupm_get_speeds(cpu_t *cp, int **speeds)
651 {
652 #ifndef __xpv
653 	cpupm_mach_state_t *mach_state =
654 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
655 	return (cpu_acpi_get_speeds(mach_state->ms_acpi_handle, speeds));
656 #else
657 	return (0);
658 #endif
659 }
660 
661 /*ARGSUSED*/
662 void
663 cpupm_free_speeds(int *speeds, uint_t nspeeds)
664 {
665 #ifndef __xpv
666 	cpu_acpi_free_speeds(speeds, nspeeds);
667 #endif
668 }
669 
670 /*
671  * All CPU instances have been initialized successfully.
672  */
673 boolean_t
674 cpupm_power_ready(void)
675 {
676 	return (cpupm_is_enabled(CPUPM_P_STATES) && cpupm_is_ready());
677 }
678 
679 /*
680  * All CPU instances have been initialized successfully.
681  */
682 boolean_t
683 cpupm_throttle_ready(void)
684 {
685 	return (cpupm_is_enabled(CPUPM_T_STATES) && cpupm_is_ready());
686 }
687 
688 /*
689  * All CPU instances have been initialized successfully.
690  */
691 boolean_t
692 cpupm_cstate_ready(void)
693 {
694 	return (cpupm_is_enabled(CPUPM_C_STATES) && cpupm_is_ready());
695 }
696 
697 void
698 cpupm_notify_handler(ACPI_HANDLE obj, UINT32 val, void *ctx)
699 {
700 	cpu_t *cp = ctx;
701 	cpupm_mach_state_t *mach_state =
702 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
703 	cpupm_notification_t *entry;
704 
705 	mutex_enter(&mach_state->ms_lock);
706 	for (entry =  mach_state->ms_handlers; entry != NULL;
707 	    entry = entry->nq_next) {
708 		entry->nq_handler(obj, val, entry->nq_ctx);
709 	}
710 	mutex_exit(&mach_state->ms_lock);
711 }
712 
713 /*ARGSUSED*/
714 void
715 cpupm_add_notify_handler(cpu_t *cp, CPUPM_NOTIFY_HANDLER handler, void *ctx)
716 {
717 #ifndef __xpv
718 	cpupm_mach_state_t *mach_state =
719 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
720 	cpupm_notification_t *entry;
721 
722 	entry = kmem_zalloc(sizeof (cpupm_notification_t), KM_SLEEP);
723 	entry->nq_handler = handler;
724 	entry->nq_ctx = ctx;
725 	mutex_enter(&mach_state->ms_lock);
726 	if (mach_state->ms_handlers == NULL) {
727 		entry->nq_next = NULL;
728 		mach_state->ms_handlers = entry;
729 		cpu_acpi_install_notify_handler(mach_state->ms_acpi_handle,
730 		    cpupm_notify_handler, cp);
731 
732 	} else {
733 		entry->nq_next = mach_state->ms_handlers;
734 		mach_state->ms_handlers = entry;
735 	}
736 	mutex_exit(&mach_state->ms_lock);
737 #endif
738 }
739 
740 /*ARGSUSED*/
741 static void
742 cpupm_free_notify_handlers(cpu_t *cp)
743 {
744 #ifndef __xpv
745 	cpupm_mach_state_t *mach_state =
746 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
747 	cpupm_notification_t *entry;
748 	cpupm_notification_t *next;
749 
750 	mutex_enter(&mach_state->ms_lock);
751 	if (mach_state->ms_handlers == NULL) {
752 		mutex_exit(&mach_state->ms_lock);
753 		return;
754 	}
755 	if (mach_state->ms_acpi_handle != NULL) {
756 		cpu_acpi_remove_notify_handler(mach_state->ms_acpi_handle,
757 		    cpupm_notify_handler);
758 	}
759 	entry = mach_state->ms_handlers;
760 	while (entry != NULL) {
761 		next = entry->nq_next;
762 		kmem_free(entry, sizeof (cpupm_notification_t));
763 		entry = next;
764 	}
765 	mach_state->ms_handlers = NULL;
766 	mutex_exit(&mach_state->ms_lock);
767 #endif
768 }
769 
770 /*
771  * Get the current max speed from the ACPI _PPC object
772  */
773 /*ARGSUSED*/
774 int
775 cpupm_get_top_speed(cpu_t *cp)
776 {
777 #ifndef __xpv
778 	cpupm_mach_state_t 	*mach_state;
779 	cpu_acpi_handle_t 	handle;
780 	int 			plat_level;
781 	uint_t			nspeeds;
782 	int			max_level;
783 
784 	mach_state =
785 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
786 	handle = mach_state->ms_acpi_handle;
787 
788 	cpu_acpi_cache_ppc(handle);
789 	plat_level = CPU_ACPI_PPC(handle);
790 
791 	nspeeds = CPU_ACPI_PSTATES_COUNT(handle);
792 
793 	max_level = nspeeds - 1;
794 	if ((plat_level < 0) || (plat_level > max_level)) {
795 		cmn_err(CE_NOTE, "!cpupm_get_top_speed: CPU %d: "
796 		    "_PPC out of range %d", cp->cpu_id, plat_level);
797 		plat_level = 0;
798 	}
799 
800 	return (plat_level);
801 #else
802 	return (0);
803 #endif
804 }
805 
806 /*
807  * This notification handler is called whenever the ACPI _PPC
808  * object changes. The _PPC is a sort of governor on power levels.
809  * It sets an upper threshold on which, _PSS defined, power levels
810  * are usuable. The _PPC value is dynamic and may change as properties
811  * (i.e., thermal or AC source) of the system change.
812  */
813 
814 static void
815 cpupm_power_manage_notifications(void *ctx)
816 {
817 	cpu_t			*cp = ctx;
818 	int			top_speed;
819 
820 	top_speed = cpupm_get_top_speed(cp);
821 	cpupm_redefine_max_activepwr_state(cp, top_speed);
822 }
823 
824 /* ARGSUSED */
825 static void
826 cpupm_event_notify_handler(ACPI_HANDLE obj, UINT32 val, void *ctx)
827 {
828 #ifndef __xpv
829 
830 	cpu_t *cp = ctx;
831 	cpupm_mach_state_t *mach_state =
832 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
833 
834 	if (mach_state == NULL)
835 		return;
836 
837 	/*
838 	 * Currently, we handle _TPC,_CST and _PPC change notifications.
839 	 */
840 	if (val == CPUPM_TPC_CHANGE_NOTIFICATION &&
841 	    mach_state->ms_caps & CPUPM_T_STATES) {
842 		cpupm_throttle_manage_notification(ctx);
843 	} else if (val == CPUPM_CST_CHANGE_NOTIFICATION &&
844 	    mach_state->ms_caps & CPUPM_C_STATES) {
845 		cpuidle_manage_cstates(ctx);
846 	} else if (val == CPUPM_PPC_CHANGE_NOTIFICATION &&
847 	    mach_state->ms_caps & CPUPM_P_STATES) {
848 		cpupm_power_manage_notifications(ctx);
849 	}
850 #endif
851 }
852 
853 /*
854  * Update cpupm cstate data each time CPU exits idle.
855  */
856 void
857 cpupm_wakeup_cstate_data(cma_c_state_t *cs_data, hrtime_t end)
858 {
859 	cs_data->cs_idle_exit = end;
860 }
861 
862 /*
863  * Determine next cstate based on cpupm data.
864  * Update cpupm cstate data each time CPU goes idle.
865  * Do as much as possible in the idle state bookkeeping function because the
866  * performance impact while idle is minimal compared to in the wakeup function
867  * when there is real work to do.
868  */
869 uint32_t
870 cpupm_next_cstate(cma_c_state_t *cs_data, cpu_acpi_cstate_t *cstates,
871     uint32_t cs_count, hrtime_t start)
872 {
873 	hrtime_t duration;
874 	hrtime_t ave_interval;
875 	hrtime_t ave_idle_time;
876 	uint32_t i, smpl_cnt;
877 
878 	duration = cs_data->cs_idle_exit - cs_data->cs_idle_enter;
879 	scalehrtime(&duration);
880 	cs_data->cs_idle += duration;
881 	cs_data->cs_idle_enter = start;
882 
883 	smpl_cnt = ++cs_data->cs_cnt;
884 	cs_data->cs_smpl_len = start - cs_data->cs_smpl_start;
885 	scalehrtime(&cs_data->cs_smpl_len);
886 	if (cs_data->cs_smpl_len > cpupm_cs_sample_interval) {
887 		cs_data->cs_smpl_idle = cs_data->cs_idle;
888 		cs_data->cs_idle = 0;
889 		cs_data->cs_smpl_idle_pct = ((100 * cs_data->cs_smpl_idle) /
890 		    cs_data->cs_smpl_len);
891 
892 		cs_data->cs_smpl_start = start;
893 		cs_data->cs_cnt = 0;
894 
895 		/*
896 		 * Strand level C-state policy
897 		 * The cpu_acpi_cstate_t *cstates array is not required to
898 		 * have an entry for both CPU_ACPI_C2 and CPU_ACPI_C3.
899 		 * There are cs_count entries in the cstates array.
900 		 * cs_data->cs_next_cstate contains the index of the next
901 		 * C-state this CPU should enter.
902 		 */
903 		ASSERT(cstates[0].cs_type == CPU_ACPI_C1);
904 
905 		/*
906 		 * Will CPU be idle long enough to save power?
907 		 */
908 		ave_idle_time = (cs_data->cs_smpl_idle / smpl_cnt) / 1000;
909 		for (i = 1; i < cs_count; ++i) {
910 			if (ave_idle_time < (cstates[i].cs_latency *
911 			    cpupm_cs_idle_save_tunable)) {
912 				cs_count = i;
913 				DTRACE_PROBE2(cpupm__next__cstate, cpu_t *,
914 				    CPU, int, i);
915 			}
916 		}
917 
918 		/*
919 		 * Wakeup often (even when non-idle time is very short)?
920 		 * Some producer/consumer type loads fall into this category.
921 		 */
922 		ave_interval = (cs_data->cs_smpl_len / smpl_cnt) / 1000;
923 		for (i = 1; i < cs_count; ++i) {
924 			if (ave_interval <= (cstates[i].cs_latency *
925 			    cpupm_cs_idle_cost_tunable)) {
926 				cs_count = i;
927 				DTRACE_PROBE2(cpupm__next__cstate, cpu_t *,
928 				    CPU, int, (CPU_MAX_CSTATES + i));
929 			}
930 		}
931 
932 		/*
933 		 * Idle percent
934 		 */
935 		for (i = 1; i < cs_count; ++i) {
936 			switch (cstates[i].cs_type) {
937 			case CPU_ACPI_C2:
938 				if (cs_data->cs_smpl_idle_pct <
939 				    cpupm_C2_idle_pct_tunable) {
940 					cs_count = i;
941 					DTRACE_PROBE2(cpupm__next__cstate,
942 					    cpu_t *, CPU, int,
943 					    ((2 * CPU_MAX_CSTATES) + i));
944 				}
945 				break;
946 
947 			case CPU_ACPI_C3:
948 				if (cs_data->cs_smpl_idle_pct <
949 				    cpupm_C3_idle_pct_tunable) {
950 					cs_count = i;
951 					DTRACE_PROBE2(cpupm__next__cstate,
952 					    cpu_t *, CPU, int,
953 					    ((2 * CPU_MAX_CSTATES) + i));
954 				}
955 				break;
956 			}
957 		}
958 
959 		cs_data->cs_next_cstate = cs_count - 1;
960 	}
961 
962 	return (cs_data->cs_next_cstate);
963 }
964