xref: /illumos-gate/usr/src/uts/i86pc/os/cpupm/cpupm_mach.c (revision 0e7515250c8395f368aa45fb9acae7c4f8f8b786)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/cpu_pm.h>
27 #include <sys/x86_archext.h>
28 #include <sys/sdt.h>
29 #include <sys/spl.h>
30 #include <sys/machsystm.h>
31 #include <sys/hpet.h>
32 #include <sys/cpupm.h>
33 #include <sys/cpu_idle.h>
34 #include <sys/cpu_acpi.h>
35 #include <sys/cpupm_throttle.h>
36 
37 /*
38  * This callback is used to build the PPM CPU domains once
39  * all the CPU devices have been started. The callback is
40  * initialized by the PPM driver to point to a routine that
41  * will build the domains.
42  */
43 void (*cpupm_rebuild_cpu_domains)(void);
44 
45 /*
46  * This callback is used to reset the topspeed for all the
47  * CPU devices. The callback is initialized by the PPM driver to
48  * point to a routine that will reinitialize all the CPU devices
49  * once all the CPU devices have been started and the CPU domains
50  * built.
51  */
52 void (*cpupm_init_topspeed)(void);
53 
54 /*
55  * This callback is used to redefine the topspeed for a CPU device.
56  * Since all CPUs in a domain should have identical properties, this
57  * callback is initialized by the PPM driver to point to a routine
58  * that will redefine the topspeed for all devices in a CPU domain.
59  * This callback is exercised whenever an ACPI _PPC change notification
60  * is received by the CPU driver.
61  */
62 void (*cpupm_redefine_topspeed)(void *);
63 
64 /*
65  * This callback is used by the PPM driver to call into the CPU driver
66  * to set a new topspeed for a CPU.
67  */
68 void (*cpupm_set_topspeed_callb)(void *, int);
69 
70 /*
71  * This callback is used by the PPM driver to call into the CPU driver
72  * to find a CPU's current topspeed (i.e., its current ACPI _PPC value).
73  */
74 int (*cpupm_get_topspeed_callb)(void *);
75 
76 static void cpupm_event_notify_handler(ACPI_HANDLE, UINT32, void *);
77 static void cpupm_free_notify_handlers(cpu_t *);
78 
79 /*
80  * Until proven otherwise, all power states are manageable.
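 * cpupm_disable() clears the corresponding bit here once a state type
 * proves unmanageable.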
81  */
82 static uint32_t cpupm_enabled = CPUPM_ALL_STATES;
83 
84 /*
85  * Until all CPUs have started, we do not allow
86  * power management.
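 * cpupm_post_startup() marks the CPUs ready once they have all started.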
87  */
88 static boolean_t cpupm_ready = B_FALSE;
89 
90 cpupm_state_domains_t *cpupm_pstate_domains = NULL;
91 cpupm_state_domains_t *cpupm_tstate_domains = NULL;
92 cpupm_state_domains_t *cpupm_cstate_domains = NULL;
93 
94 /*
95  * c-state tunables
96  *
97  * cpupm_cs_idle_cost_tunable is the ratio of the time the CPU spends
98  * executing plus idle to the time spent in idle state transitions.
99  * A value of 10 means the CPU will not spend more than 1/10 of its time
100  * in idle transition latency.  The worst-case performance will be 90% of
101  * that of a kernel that does not use deep C-states.
102  *
103  * cpupm_cs_idle_save_tunable is how long we must stay in a deeper C-state to
104  * make the transition worthwhile, expressed as a multiple of its latency.
105  */
106 uint32_t cpupm_cs_sample_tunable = 5;		/* samples in decision period */
107 uint32_t cpupm_cs_idle_cost_tunable = 10;	/* work time / latency cost */
108 uint32_t cpupm_cs_idle_save_tunable = 2;	/* idle power savings */
109 uint16_t cpupm_C2_idle_pct_tunable = 70;
110 uint16_t cpupm_C3_idle_pct_tunable = 80;
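
/*
 * For example (see cpupm_next_cstate() below): with the default
 * cpupm_cs_idle_save_tunable of 2, a deeper C-state is only chosen when
 * the average idle time over the sample period is at least twice that
 * C-state's wakeup latency, and with the default
 * cpupm_cs_idle_cost_tunable of 10, only when the average interval
 * between wakeups exceeds ten times that latency.
 */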
111 
112 #ifndef __xpv
113 extern boolean_t cpupm_intel_init(cpu_t *);
114 extern boolean_t cpupm_amd_init(cpu_t *);
115 
116 typedef struct cpupm_vendor {
117 	boolean_t	(*cpuv_init)(cpu_t *);
118 } cpupm_vendor_t;
119 
120 /*
121  * Table of supported vendors.
122  */
123 static cpupm_vendor_t cpupm_vendors[] = {
124 	cpupm_intel_init,
125 	cpupm_amd_init,
126 	NULL
127 };
128 #endif
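
/*
 * Under the __xpv (xVM) build, most of the code in this file is compiled
 * out; the routines here either do nothing or report CPU power management
 * as unavailable.
 */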
129 
130 /*
131  * Initialize the machine-dependent CPU power management state for this CPU.
132  * See if a vendor module exists for managing power for this CPU.
133  */
134 /*ARGSUSED*/
135 void
136 cpupm_init(cpu_t *cp)
137 {
138 #ifndef __xpv
139 	cpupm_vendor_t *vendors;
140 	cpupm_mach_state_t *mach_state;
141 	struct machcpu *mcpu = &(cp->cpu_m);
142 	int *speeds;
143 	uint_t nspeeds;
144 	int ret;
145 
146 	cpupm_set_supp_freqs(cp, NULL, 1);
147 
148 	mach_state = cp->cpu_m.mcpu_pm_mach_state =
149 	    kmem_zalloc(sizeof (cpupm_mach_state_t), KM_SLEEP);
150 	mach_state->ms_caps = CPUPM_NO_STATES;
151 	mutex_init(&mach_state->ms_lock, NULL, MUTEX_DRIVER, NULL);
152 
153 	mach_state->ms_acpi_handle = cpu_acpi_init(cp);
154 	if (mach_state->ms_acpi_handle == NULL) {
155 		cpupm_free(cp);
156 		cmn_err(CE_WARN, "!cpupm_init: processor %d: "
157 		    "unable to get ACPI handle", cp->cpu_id);
158 		cmn_err(CE_NOTE, "!CPU power management will not function.");
159 		CPUPM_DISABLE();
160 		return;
161 	}
162 
163 	/*
164 	 * Loop through the CPU management module table and see if
165 	 * any of the modules implement CPU power management
166 	 * for this CPU.
167 	 */
168 	for (vendors = cpupm_vendors; vendors->cpuv_init != NULL; vendors++) {
169 		if (vendors->cpuv_init(cp))
170 			break;
171 	}
172 
173 	/*
174 	 * Nope, we can't power manage this CPU.
175 	 */
176 	if (vendors->cpuv_init == NULL) {
177 		cpupm_free(cp);
178 		CPUPM_DISABLE();
179 		return;
180 	}
181 
182 	/*
183 	 * If P-state support exists for this system, then initialize it.
184 	 */
185 	if (mach_state->ms_pstate.cma_ops != NULL) {
186 		ret = mach_state->ms_pstate.cma_ops->cpus_init(cp);
187 		if (ret != 0) {
188 			cmn_err(CE_WARN, "!cpupm_init: processor %d:"
189 			    " unable to initialize P-state support",
190 			    cp->cpu_id);
191 			mach_state->ms_pstate.cma_ops = NULL;
192 			cpupm_disable(CPUPM_P_STATES);
193 		} else {
194 			nspeeds = cpupm_get_speeds(cp, &speeds);
195 			if (nspeeds == 0) {
196 				cmn_err(CE_WARN, "!cpupm_init: processor %d:"
197 				    " no speeds to manage", cp->cpu_id);
198 			} else {
199 				cpupm_set_supp_freqs(cp, speeds, nspeeds);
200 				cpupm_free_speeds(speeds, nspeeds);
201 				mach_state->ms_caps |= CPUPM_P_STATES;
202 			}
203 		}
204 	}
205 
206 	if (mach_state->ms_tstate.cma_ops != NULL) {
207 		ret = mach_state->ms_tstate.cma_ops->cpus_init(cp);
208 		if (ret != 0) {
209 			cmn_err(CE_WARN, "!cpupm_init: processor %d:"
210 			    " unable to initialize T-state support",
211 			    cp->cpu_id);
212 			mach_state->ms_tstate.cma_ops = NULL;
213 			cpupm_disable(CPUPM_T_STATES);
214 		} else {
215 			mach_state->ms_caps |= CPUPM_T_STATES;
216 		}
217 	}
218 
219 	/*
220 	 * If C-state support exists for this system, then initialize it.
221 	 */
222 	if (mach_state->ms_cstate.cma_ops != NULL) {
223 		ret = mach_state->ms_cstate.cma_ops->cpus_init(cp);
224 		if (ret != 0) {
225 			cmn_err(CE_WARN, "!cpupm_init: processor %d:"
226 			    " unable to initialize C-state support",
227 			    cp->cpu_id);
228 			mach_state->ms_cstate.cma_ops = NULL;
229 			mcpu->max_cstates = CPU_ACPI_C1;
230 			cpupm_disable(CPUPM_C_STATES);
231 			idle_cpu = non_deep_idle_cpu;
232 			disp_enq_thread = non_deep_idle_disp_enq_thread;
233 		} else if (cpu_deep_cstates_supported()) {
234 			mcpu->max_cstates = cpu_acpi_get_max_cstates(
235 			    mach_state->ms_acpi_handle);
236 			if (mcpu->max_cstates > CPU_ACPI_C1) {
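				/*
				 * Deep C-states can stop the local APIC
				 * timer, so let the HPET code know that it
				 * may be needed as the wakeup timer.
				 */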
237 				hpet.callback(CST_EVENT_MULTIPLE_CSTATES);
238 				CPU->cpu_m.mcpu_idle_cpu = cpu_acpi_idle;
239 				mcpu->mcpu_idle_type = CPU_ACPI_C1;
240 				disp_enq_thread = cstate_wakeup;
241 			} else {
242 				hpet.callback(CST_EVENT_ONE_CSTATE);
243 			}
244 			mach_state->ms_caps |= CPUPM_C_STATES;
245 		} else {
246 			mcpu->max_cstates = CPU_ACPI_C1;
247 			idle_cpu = non_deep_idle_cpu;
248 			disp_enq_thread = non_deep_idle_disp_enq_thread;
249 		}
250 	}
251 
252 
253 	if (mach_state->ms_caps == CPUPM_NO_STATES) {
254 		cpupm_free(cp);
255 		CPUPM_DISABLE();
256 		return;
257 	}
258 
259 	if ((mach_state->ms_caps & CPUPM_T_STATES) ||
260 	    (mach_state->ms_caps & CPUPM_P_STATES) ||
261 	    (mach_state->ms_caps & CPUPM_C_STATES))
262 		cpupm_add_notify_handler(cp, cpupm_event_notify_handler, cp);
263 #endif
264 }
265 
266 /*
267  * Free any resources allocated by cpupm_init().
268  */
269 /*ARGSUSED*/
270 void
271 cpupm_free(cpu_t *cp)
272 {
273 #ifndef __xpv
274 	cpupm_mach_state_t *mach_state =
275 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
276 
277 	if (mach_state == NULL)
278 		return;
279 	if (mach_state->ms_pstate.cma_ops != NULL) {
280 		mach_state->ms_pstate.cma_ops->cpus_fini(cp);
281 		mach_state->ms_pstate.cma_ops = NULL;
282 	}
283 
284 	if (mach_state->ms_tstate.cma_ops != NULL) {
285 		mach_state->ms_tstate.cma_ops->cpus_fini(cp);
286 		mach_state->ms_tstate.cma_ops = NULL;
287 	}
288 
289 	if (mach_state->ms_cstate.cma_ops != NULL) {
290 		mach_state->ms_cstate.cma_ops->cpus_fini(cp);
291 		mach_state->ms_cstate.cma_ops = NULL;
292 	}
293 
294 	cpupm_free_notify_handlers(cp);
295 
296 	if (mach_state->ms_acpi_handle != NULL) {
297 		cpu_acpi_fini(mach_state->ms_acpi_handle);
298 		mach_state->ms_acpi_handle = NULL;
299 	}
300 
301 	mutex_destroy(&mach_state->ms_lock);
302 	kmem_free(mach_state, sizeof (cpupm_mach_state_t));
303 	cp->cpu_m.mcpu_pm_mach_state = NULL;
304 #endif
305 }
306 
307 /*
308  * If all CPUs have started and at least one power state is manageable,
309  * then the CPUs are ready for power management.
310  */
311 boolean_t
312 cpupm_is_ready()
313 {
314 #ifndef __xpv
315 	if (cpupm_enabled == CPUPM_NO_STATES)
316 		return (B_FALSE);
317 	return (cpupm_ready);
318 #else
319 	return (B_FALSE);
320 #endif
321 
322 }
323 
324 boolean_t
325 cpupm_is_enabled(uint32_t state)
326 {
327 	return ((cpupm_enabled & state) == state);
328 }
329 
330 /*
331  * Disable the given power states; by default, all states are enabled.
332  */
333 void
334 cpupm_disable(uint32_t state)
335 {
336 
337 	if (state & CPUPM_P_STATES) {
338 		cpupm_free_domains(&cpupm_pstate_domains);
339 	}
340 	if (state & CPUPM_T_STATES) {
341 		cpupm_free_domains(&cpupm_tstate_domains);
342 	}
343 	if (state & CPUPM_C_STATES) {
344 		cpupm_free_domains(&cpupm_cstate_domains);
345 	}
346 	cpupm_enabled &= ~state;
347 }
348 
349 /*
350  * Once all CPUs have been started, the PPM driver should build CPU
351  * domains and initialize the topspeed for all CPU devices.
352  */
353 void
354 cpupm_post_startup()
355 {
356 #ifndef __xpv
357 	/*
358 	 * The CPU domains built by the PPM driver while the CPUs were
359 	 * attaching should be rebuilt with the information retrieved
360 	 * from ACPI.
361 	 */
362 	if (cpupm_rebuild_cpu_domains != NULL)
363 		(*cpupm_rebuild_cpu_domains)();
364 
365 	/*
366 	 * Only initialize the topspeed if P-states are enabled.
367 	 */
368 	if (cpupm_enabled & CPUPM_P_STATES && cpupm_init_topspeed != NULL)
369 		(*cpupm_init_topspeed)();
370 #endif
371 	cpupm_ready = B_TRUE;
372 }
373 
374 /*
375  * Allocate power domains for C-, P- and T-states.
376  */
377 void
378 cpupm_alloc_domains(cpu_t *cp, int state)
379 {
380 	cpupm_mach_state_t *mach_state =
381 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
382 	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
383 	cpupm_state_domains_t **dom_ptr;
384 	cpupm_state_domains_t *dptr;
385 	cpupm_state_domains_t **mach_dom_state_ptr;
386 	uint32_t domain;
387 	uint32_t type;
388 
389 	switch (state) {
390 	case CPUPM_P_STATES:
391 		if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_PSD_CACHED)) {
392 			domain = CPU_ACPI_PSD(handle).sd_domain;
393 			type = CPU_ACPI_PSD(handle).sd_type;
394 		} else {
395 			mutex_enter(&cpu_lock);
396 			domain = cpuid_get_chipid(cp);
397 			mutex_exit(&cpu_lock);
398 			type = CPU_ACPI_HW_ALL;
399 		}
400 		dom_ptr = &cpupm_pstate_domains;
401 		mach_dom_state_ptr = &mach_state->ms_pstate.cma_domain;
402 		break;
403 	case CPUPM_T_STATES:
404 		if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_TSD_CACHED)) {
405 			domain = CPU_ACPI_TSD(handle).sd_domain;
406 			type = CPU_ACPI_TSD(handle).sd_type;
407 		} else {
408 			mutex_enter(&cpu_lock);
409 			domain = cpuid_get_chipid(cp);
410 			mutex_exit(&cpu_lock);
411 			type = CPU_ACPI_HW_ALL;
412 		}
413 		dom_ptr = &cpupm_tstate_domains;
414 		mach_dom_state_ptr = &mach_state->ms_tstate.cma_domain;
415 		break;
416 	case CPUPM_C_STATES:
417 		if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_CSD_CACHED)) {
418 			domain = CPU_ACPI_CSD(handle).sd_domain;
419 			type = CPU_ACPI_CSD(handle).sd_type;
420 		} else {
421 			mutex_enter(&cpu_lock);
422 			domain = cpuid_get_coreid(cp);
423 			mutex_exit(&cpu_lock);
424 			type = CPU_ACPI_HW_ALL;
425 		}
426 		dom_ptr = &cpupm_cstate_domains;
427 		mach_dom_state_ptr = &mach_state->ms_cstate.cma_domain;
428 		break;
429 	default:
430 		return;
431 	}
432 
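	/*
	 * Look for an existing domain with this id.  The id was taken from
	 * the cached ACPI _PSD/_TSD/_CSD object above when one was present,
	 * otherwise from the chip id (P/T-states) or core id (C-states).
	 */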
433 	for (dptr = *dom_ptr; dptr != NULL; dptr = dptr->pm_next) {
434 		if (dptr->pm_domain == domain)
435 			break;
436 	}
437 
438 	/* new domain is created and linked at the head */
439 	if (dptr == NULL) {
440 		dptr = kmem_zalloc(sizeof (cpupm_state_domains_t), KM_SLEEP);
441 		dptr->pm_domain = domain;
442 		dptr->pm_type = type;
443 		dptr->pm_next = *dom_ptr;
444 		mutex_init(&dptr->pm_lock, NULL, MUTEX_SPIN,
445 		    (void *)ipltospl(DISP_LEVEL));
446 		CPUSET_ZERO(dptr->pm_cpus);
447 		*dom_ptr = dptr;
448 	}
449 	CPUSET_ADD(dptr->pm_cpus, cp->cpu_id);
450 	*mach_dom_state_ptr = dptr;
451 }
452 
453 /*
454  * Free C, P or T state power domains
455  */
456 void
457 cpupm_free_domains(cpupm_state_domains_t **dom_ptr)
458 {
459 	cpupm_state_domains_t *this_domain, *next_domain;
460 
461 	this_domain = *dom_ptr;
462 	while (this_domain != NULL) {
463 		next_domain = this_domain->pm_next;
464 		mutex_destroy(&this_domain->pm_lock);
465 		kmem_free((void *)this_domain,
466 		    sizeof (cpupm_state_domains_t));
467 		this_domain = next_domain;
468 	}
469 	*dom_ptr = NULL;
470 }
471 
472 void
473 cpupm_alloc_ms_cstate(cpu_t *cp)
474 {
475 	cpupm_mach_state_t *mach_state;
476 	cpupm_mach_acpi_state_t *ms_cstate;
477 
478 	mach_state = (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
479 	ms_cstate = &mach_state->ms_cstate;
480 	ASSERT(ms_cstate->cma_state.cstate == NULL);
481 	ms_cstate->cma_state.cstate = kmem_zalloc(sizeof (cma_c_state_t),
482 	    KM_SLEEP);
483 	ms_cstate->cma_state.cstate->cs_next_cstate = CPU_ACPI_C1;
484 }
485 
486 void
487 cpupm_free_ms_cstate(cpu_t *cp)
488 {
489 	cpupm_mach_state_t *mach_state =
490 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
491 	cpupm_mach_acpi_state_t *ms_cstate = &mach_state->ms_cstate;
492 
493 	if (ms_cstate->cma_state.cstate != NULL) {
494 		kmem_free(ms_cstate->cma_state.cstate, sizeof (cma_c_state_t));
495 		ms_cstate->cma_state.cstate = NULL;
496 	}
497 }
498 
499 void
500 cpupm_state_change(cpu_t *cp, int level, int state)
501 {
502 	cpupm_mach_state_t	*mach_state =
503 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
504 	cpupm_state_ops_t	*state_ops;
505 	cpupm_state_domains_t  	*state_domain;
506 	cpuset_t		set;
507 
508 	DTRACE_PROBE2(cpupm__state__change, cpu_t *, cp, int, level);
509 
510 	if (mach_state == NULL) {
511 		return;
512 	}
513 
514 	switch (state) {
515 	case CPUPM_P_STATES:
516 		state_ops = mach_state->ms_pstate.cma_ops;
517 		state_domain = mach_state->ms_pstate.cma_domain;
518 		break;
519 	case CPUPM_T_STATES:
520 		state_ops = mach_state->ms_tstate.cma_ops;
521 		state_domain = mach_state->ms_tstate.cma_domain;
522 		break;
523 	default:
524 		return;
525 	}
526 
527 	switch (state_domain->pm_type) {
528 	case CPU_ACPI_SW_ANY:
529 		/*
530 		 * A request on any CPU in the domain transitions the domain
531 		 */
532 		CPUSET_ONLY(set, cp->cpu_id);
533 		state_ops->cpus_change(set, level);
534 		break;
535 	case CPU_ACPI_SW_ALL:
536 		/*
537 		 * All CPUs in the domain must request the transition
538 		 */
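		/* FALLTHROUGH -- handled the same as CPU_ACPI_HW_ALL below */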
539 	case CPU_ACPI_HW_ALL:
540 		/*
541 		 * P/T-state transitions are coordinated by the hardware
542 		 * For now, request the transition on all CPUs in the domain,
543 		 * but looking ahead we can probably be smarter about this.
544 		 */
545 		mutex_enter(&state_domain->pm_lock);
546 		state_ops->cpus_change(state_domain->pm_cpus, level);
547 		mutex_exit(&state_domain->pm_lock);
548 		break;
549 	default:
550 		cmn_err(CE_WARN, "Unknown domain coordination type: %d",
551 		    state_domain->pm_type);
552 	}
553 }
554 
555 /*
556  * CPU PM interfaces exposed to the CPU power manager
557  */
558 /*ARGSUSED*/
559 id_t
560 cpupm_plat_domain_id(cpu_t *cp, cpupm_dtype_t type)
561 {
562 	cpupm_mach_state_t	*mach_state =
563 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
564 
565 	if ((mach_state == NULL) || (!cpupm_is_enabled(CPUPM_P_STATES) &&
566 	    !cpupm_is_enabled(CPUPM_C_STATES))) {
567 		return (CPUPM_NO_DOMAIN);
568 	}
569 	if (type == CPUPM_DTYPE_ACTIVE) {
570 		/*
571 		 * Return P-State domain for the specified CPU
572 		 */
573 		if (mach_state->ms_pstate.cma_domain) {
574 			return (mach_state->ms_pstate.cma_domain->pm_domain);
575 		}
576 	} else if (type == CPUPM_DTYPE_IDLE) {
577 		/*
578 		 * Return C-State domain for the specified CPU
579 		 */
580 		if (mach_state->ms_cstate.cma_domain) {
581 			return (mach_state->ms_cstate.cma_domain->pm_domain);
582 		}
583 	}
584 	return (CPUPM_NO_DOMAIN);
585 }
586 
587 /*ARGSUSED*/
588 uint_t
589 cpupm_plat_state_enumerate(cpu_t *cp, cpupm_dtype_t type,
590     cpupm_state_t *states)
591 {
592 	int	*speeds;
593 	uint_t	nspeeds, i;
594 
595 	/*
596 	 * Idle domain support unimplemented
597 	 */
598 	if (type != CPUPM_DTYPE_ACTIVE) {
599 		return (0);
600 	}
601 	nspeeds = cpupm_get_speeds(cp, &speeds);
602 
603 	/*
604 	 * If the caller passes NULL for states, just return the
605 	 * number of states.
606 	 */
607 	if (states != NULL) {
608 		for (i = 0; i < nspeeds; i++) {
609 			states[i].cps_speed = speeds[i];
610 			states[i].cps_handle = (cpupm_handle_t)i;
611 		}
612 	}
613 	cpupm_free_speeds(speeds, nspeeds);
614 	return (nspeeds);
615 }
616 
617 /*ARGSUSED*/
618 int
619 cpupm_plat_change_state(cpu_t *cp, cpupm_state_t *state)
620 {
621 	if (!cpupm_is_ready())
622 		return (-1);
623 
624 	cpupm_state_change(cp, (int)state->cps_handle, CPUPM_P_STATES);
625 
626 	return (0);
627 }
628 
629 /*ARGSUSED*/
630 /*
631  * Note: It is the responsibility of the users of
632  * cpupm_get_speeds() to free the memory allocated
633  * for speeds using cpupm_free_speeds()
634  */
635 uint_t
636 cpupm_get_speeds(cpu_t *cp, int **speeds)
637 {
638 #ifndef __xpv
639 	cpupm_mach_state_t *mach_state =
640 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
641 	return (cpu_acpi_get_speeds(mach_state->ms_acpi_handle, speeds));
642 #else
643 	return (0);
644 #endif
645 }
646 
647 /*ARGSUSED*/
648 void
649 cpupm_free_speeds(int *speeds, uint_t nspeeds)
650 {
651 #ifndef __xpv
652 	cpu_acpi_free_speeds(speeds, nspeeds);
653 #endif
654 }
655 
656 /*
657  * Return B_TRUE if P-states are enabled and all CPUs have started.
658  */
659 boolean_t
660 cpupm_power_ready(void)
661 {
662 	return (cpupm_is_enabled(CPUPM_P_STATES) && cpupm_is_ready());
663 }
664 
665 /*
666  * Return B_TRUE if T-states are enabled and all CPUs have started.
667  */
668 boolean_t
669 cpupm_throttle_ready(void)
670 {
671 	return (cpupm_is_enabled(CPUPM_T_STATES) && cpupm_is_ready());
672 }
673 
674 /*
675  * Return B_TRUE if C-states are enabled and all CPUs have started.
676  */
677 boolean_t
678 cpupm_cstate_ready(void)
679 {
680 	return (cpupm_is_enabled(CPUPM_C_STATES) && cpupm_is_ready());
681 }
682 
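/*
 * ACPI notify handler installed on a CPU's ACPI handle by
 * cpupm_add_notify_handler() below; it fans each notification out to
 * every handler registered for that CPU.
 */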
683 void
684 cpupm_notify_handler(ACPI_HANDLE obj, UINT32 val, void *ctx)
685 {
686 	cpu_t *cp = ctx;
687 	cpupm_mach_state_t *mach_state =
688 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
689 	cpupm_notification_t *entry;
690 
691 	mutex_enter(&mach_state->ms_lock);
692 	for (entry = mach_state->ms_handlers; entry != NULL;
693 	    entry = entry->nq_next) {
694 		entry->nq_handler(obj, val, entry->nq_ctx);
695 	}
696 	mutex_exit(&mach_state->ms_lock);
697 }
698 
699 /*ARGSUSED*/
700 void
701 cpupm_add_notify_handler(cpu_t *cp, CPUPM_NOTIFY_HANDLER handler, void *ctx)
702 {
703 #ifndef __xpv
704 	cpupm_mach_state_t *mach_state =
705 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
706 	cpupm_notification_t *entry;
707 
708 	entry = kmem_zalloc(sizeof (cpupm_notification_t), KM_SLEEP);
709 	entry->nq_handler = handler;
710 	entry->nq_ctx = ctx;
711 	mutex_enter(&mach_state->ms_lock);
712 	if (mach_state->ms_handlers == NULL) {
713 		entry->nq_next = NULL;
714 		mach_state->ms_handlers = entry;
715 		cpu_acpi_install_notify_handler(mach_state->ms_acpi_handle,
716 		    cpupm_notify_handler, cp);
717 
718 	} else {
719 		entry->nq_next = mach_state->ms_handlers;
720 		mach_state->ms_handlers = entry;
721 	}
722 	mutex_exit(&mach_state->ms_lock);
723 #endif
724 }
725 
726 /*ARGSUSED*/
727 static void
728 cpupm_free_notify_handlers(cpu_t *cp)
729 {
730 #ifndef __xpv
731 	cpupm_mach_state_t *mach_state =
732 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
733 	cpupm_notification_t *entry;
734 	cpupm_notification_t *next;
735 
736 	mutex_enter(&mach_state->ms_lock);
737 	if (mach_state->ms_handlers == NULL) {
738 		mutex_exit(&mach_state->ms_lock);
739 		return;
740 	}
741 	if (mach_state->ms_acpi_handle != NULL) {
742 		cpu_acpi_remove_notify_handler(mach_state->ms_acpi_handle,
743 		    cpupm_notify_handler);
744 	}
745 	entry = mach_state->ms_handlers;
746 	while (entry != NULL) {
747 		next = entry->nq_next;
748 		kmem_free(entry, sizeof (cpupm_notification_t));
749 		entry = next;
750 	}
751 	mach_state->ms_handlers = NULL;
752 	mutex_exit(&mach_state->ms_lock);
753 #endif
754 }
755 
756 /*
757  * Get the current max speed from the ACPI _PPC object
758  */
759 /*ARGSUSED*/
760 int
761 cpupm_get_top_speed(cpu_t *cp)
762 {
763 #ifndef __xpv
764 	cpupm_mach_state_t 	*mach_state;
765 	cpu_acpi_handle_t 	handle;
766 	int 			plat_level;
767 	uint_t			nspeeds;
768 	int			max_level;
769 
770 	mach_state =
771 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
772 	handle = mach_state->ms_acpi_handle;
773 
774 	cpu_acpi_cache_ppc(handle);
775 	plat_level = CPU_ACPI_PPC(handle);
776 
777 	nspeeds = CPU_ACPI_PSTATES_COUNT(handle);
778 
779 	max_level = nspeeds - 1;
780 	if ((plat_level < 0) || (plat_level > max_level)) {
781 		cmn_err(CE_NOTE, "!cpupm_get_top_speed: CPU %d: "
782 		    "_PPC out of range %d", cp->cpu_id, plat_level);
783 		plat_level = 0;
784 	}
785 
786 	return (plat_level);
787 #else
788 	return (0);
789 #endif
790 }
791 
792 /*
793  * This notification handler is called whenever the ACPI _PPC
794  * object changes. The _PPC acts as a governor on power levels: it sets
795  * an upper threshold on which _PSS-defined power levels are usable.
796  * The _PPC value is dynamic and may change as properties (e.g., thermal
797  * or AC power source) of the system change.
798  */
799 
800 static void
801 cpupm_power_manage_notifications(void *ctx)
802 {
803 	cpu_t			*cp = ctx;
804 	int			top_speed;
805 
806 	top_speed = cpupm_get_top_speed(cp);
807 	cpupm_redefine_max_activepwr_state(cp, top_speed);
808 }
809 
810 /* ARGSUSED */
811 static void
812 cpupm_event_notify_handler(ACPI_HANDLE obj, UINT32 val, void *ctx)
813 {
814 #ifndef __xpv
815 	/*
816 	 * Currently, we handle _TPC, _CST and _PPC change notifications.
817 	 */
818 	if (val == CPUPM_TPC_CHANGE_NOTIFICATION) {
819 		cpupm_throttle_manage_notification(ctx);
820 	} else if (val == CPUPM_CST_CHANGE_NOTIFICATION) {
821 		cpuidle_manage_cstates(ctx);
822 	} else if (val == CPUPM_PPC_CHANGE_NOTIFICATION) {
823 		cpupm_power_manage_notifications(ctx);
824 	}
825 #endif
826 }
827 
828 /*
829  * Update cpupm cstate data each time CPU exits idle.
830  */
831 void
832 cpupm_wakeup_cstate_data(cma_c_state_t *cs_data, hrtime_t end)
833 {
834 	cs_data->cs_idle_exit = end;
835 }
836 
837 /*
838  * Determine next cstate based on cpupm data.
839  * Update cpupm cstate data each time CPU goes idle.
840  * Do as much as possible in the idle-state bookkeeping function because the
841  * performance impact while idle is minimal compared to that of the wakeup
842  * function, where there is real work to do.
843  */
844 uint32_t
845 cpupm_next_cstate(cma_c_state_t *cs_data, hrtime_t start)
846 {
847 	hrtime_t		duration;
848 	hrtime_t		ave_interval;
849 	hrtime_t		ave_idle_time;
850 
851 	duration = cs_data->cs_idle_exit - cs_data->cs_idle_enter;
852 	scalehrtime(&duration);
853 	cs_data->cs_idle += duration;
854 	cs_data->cs_idle_enter = start;
855 
856 	++cs_data->cs_cnt;
857 	if (cs_data->cs_cnt > cpupm_cs_sample_tunable) {
858 		cs_data->cs_smpl_len = start - cs_data->cs_smpl_start;
859 		scalehrtime(&cs_data->cs_smpl_len);
860 		cs_data->cs_smpl_len |= 1;	/* protect from DIV 0 */
861 		cs_data->cs_smpl_idle = cs_data->cs_idle;
862 		cs_data->cs_idle = 0;
863 		cs_data->cs_smpl_idle_pct = ((100 * cs_data->cs_smpl_idle) /
864 		    cs_data->cs_smpl_len);
865 
866 		cs_data->cs_smpl_start = start;
867 		cs_data->cs_cnt = 0;
868 
869 		/*
870 		 * Strand level C-state policy
871 		 */
872 		cs_data->cs_next_cstate = CPU_ACPI_C3;
873 
874 		/*
875 		 * Will CPU be idle long enough to save power?
876 		 */
877 		ave_idle_time = (cs_data->cs_smpl_idle /
878 		    cpupm_cs_sample_tunable) / 1000;
879 		if (ave_idle_time < (cs_data->cs_C2_latency *
880 		    cpupm_cs_idle_save_tunable)) {
881 			cs_data->cs_next_cstate = CPU_ACPI_C1;
882 			DTRACE_PROBE2(cpupm__next__cstate, cpu_t *, CPU,
883 			    int, 1);
884 			return (cs_data->cs_next_cstate);
885 		} else if (ave_idle_time < (cs_data->cs_C3_latency *
886 		    cpupm_cs_idle_save_tunable)) {
887 			cs_data->cs_next_cstate = CPU_ACPI_C2;
888 			DTRACE_PROBE2(cpupm__next__cstate, cpu_t *, CPU,
889 			    int, 2);
890 		}
891 
892 		/*
893 		 * Wakeup often (even when non-idle time is very short)?
894 		 * Some producer/consumer type loads fall into this category.
895 		 */
896 		ave_interval = (cs_data->cs_smpl_len / cpupm_cs_sample_tunable)
897 		    / 1000;
898 		if (ave_interval <=
899 		    (cs_data->cs_C2_latency * cpupm_cs_idle_cost_tunable)) {
900 			cs_data->cs_next_cstate = CPU_ACPI_C1;
901 			DTRACE_PROBE2(cpupm__next__cstate, cpu_t *, CPU,
902 			    int, 3);
903 			return (cs_data->cs_next_cstate);
904 		} else if (ave_interval <=
905 		    (cs_data->cs_C3_latency * cpupm_cs_idle_cost_tunable)) {
906 			cs_data->cs_next_cstate = CPU_ACPI_C2;
907 			DTRACE_PROBE2(cpupm__next__cstate, cpu_t *, CPU,
908 			    int, 4);
909 		}
910 
911 		/*
912 		 * Idle percent
913 		 */
914 		if (cs_data->cs_smpl_idle_pct < cpupm_C2_idle_pct_tunable) {
915 			cs_data->cs_next_cstate = CPU_ACPI_C1;
916 			DTRACE_PROBE2(cpupm__next__cstate, cpu_t *, CPU,
917 			    int, 5);
918 			return (cs_data->cs_next_cstate);
919 		} else if ((cs_data->cs_next_cstate > CPU_ACPI_C2) &&
920 		    (cs_data->cs_smpl_idle_pct < cpupm_C3_idle_pct_tunable)) {
921 			cs_data->cs_next_cstate = CPU_ACPI_C2;
922 			DTRACE_PROBE2(cpupm__next__cstate, cpu_t *, CPU,
923 			    int, 6);
924 		}
925 	}
926 
927 	return (cs_data->cs_next_cstate);
928 }
929