xref: /titanic_52/usr/src/uts/i86pc/os/cpupm/cpupm_mach.c (revision 3589c4f01c20349ca65899d209cdc0c17a641433)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/cpu_pm.h>
27 #include <sys/x86_archext.h>
28 #include <sys/sdt.h>
29 #include <sys/spl.h>
30 #include <sys/machsystm.h>
31 #include <sys/hpet.h>
32 #include <sys/cpupm.h>
33 #include <sys/cpu_idle.h>
34 #include <sys/cpu_acpi.h>
35 #include <sys/cpupm_throttle.h>
36 #include <sys/dtrace.h>
37 
/*
 * This callback is used to build the PPM CPU domains once
 * all the CPU devices have been started. The callback is
 * initialized by the PPM driver to point to a routine that
 * will build the domains.
 */
void (*cpupm_rebuild_cpu_domains)(void);

/*
 * This callback is used to reset the topspeed for all the
 * CPU devices. The callback is initialized by the PPM driver to
 * point to a routine that will reinitialize all the CPU devices
 * once all the CPU devices have been started and the CPU domains
 * built.
 */
void (*cpupm_init_topspeed)(void);

/*
 * This callback is used to redefine the topspeed for a CPU device.
 * Since all CPUs in a domain should have identical properties, this
 * callback is initialized by the PPM driver to point to a routine
 * that will redefine the topspeed for all devices in a CPU domain.
 * This callback is exercised whenever an ACPI _PPC change notification
 * is received by the CPU driver.
 */
void (*cpupm_redefine_topspeed)(void *);

/*
 * This callback is used by the PPM driver to call into the CPU driver
 * to set a new topspeed for a CPU.
 */
void (*cpupm_set_topspeed_callb)(void *, int);

/*
 * This callback is used by the PPM driver to call into the CPU driver
 * to find a CPU's current topspeed (i.e., its current ACPI _PPC value).
 */
int (*cpupm_get_topspeed_callb)(void *);
76 
/* Forward declarations for file-local notification helpers. */
static void cpupm_event_notify_handler(ACPI_HANDLE, UINT32, void *);
static void cpupm_free_notify_handlers(cpu_t *);

/*
 * Bitmask of the power states that may be managed.
 * Until proven otherwise, all power states are manageable;
 * cpupm_disable() clears bits from this mask.
 */
static uint32_t cpupm_enabled = CPUPM_ALL_STATES;

/*
 * Until all CPUs have started, we do not allow
 * power management. Set to B_TRUE by cpupm_post_startup().
 */
static boolean_t cpupm_ready = B_FALSE;

/*
 * Heads of the singly-linked (pm_next) lists of P-, T- and C-state
 * domains. Entries are added by cpupm_alloc_domains() and released by
 * cpupm_free_domains().
 */
cpupm_state_domains_t *cpupm_pstate_domains = NULL;
cpupm_state_domains_t *cpupm_tstate_domains = NULL;
cpupm_state_domains_t *cpupm_cstate_domains = NULL;

/*
 * c-state tunables
 *
 * cpupm_cs_sample_tunable is the number of idle entries that make up one
 * decision period in cpupm_next_cstate(); the C-state policy is only
 * re-evaluated once the per-CPU sample counter exceeds this value.
 *
 * cpupm_cs_idle_cost_tunable is the ratio of time CPU spends executing + idle
 * divided by time spent in the idle state transitions.
 * A value of 10 means the CPU will not spend more than 1/10 of its time
 * in idle latency.  The worst case performance will be 90% of non Deep C-state
 * kernel.
 *
 * cpupm_cs_idle_save_tunable is how long we must stay in a deeper C-state
 * before it is worth going there.  Expressed as a multiple of latency.
 *
 * cpupm_C2_idle_pct_tunable and cpupm_C3_idle_pct_tunable are the minimum
 * idle percentages of the last sample period below which
 * cpupm_next_cstate() will not select a C2 or C3 type state respectively.
 */
uint32_t cpupm_cs_sample_tunable = 5;		/* samples in decision period */
uint32_t cpupm_cs_idle_cost_tunable = 10;	/* work time / latency cost */
uint32_t cpupm_cs_idle_save_tunable = 2;	/* idle power savings */
uint16_t cpupm_C2_idle_pct_tunable = 70;
uint16_t cpupm_C3_idle_pct_tunable = 80;
112 
113 #ifndef __xpv
114 extern boolean_t cpupm_intel_init(cpu_t *);
115 extern boolean_t cpupm_amd_init(cpu_t *);
116 
117 typedef struct cpupm_vendor {
118 	boolean_t	(*cpuv_init)(cpu_t *);
119 } cpupm_vendor_t;
120 
121 /*
122  * Table of supported vendors.
123  */
124 static cpupm_vendor_t cpupm_vendors[] = {
125 	cpupm_intel_init,
126 	cpupm_amd_init,
127 	NULL
128 };
129 #endif
130 
131 /*
132  * Initialize the machine.
133  * See if a module exists for managing power for this CPU.
134  */
135 /*ARGSUSED*/
136 void
137 cpupm_init(cpu_t *cp)
138 {
139 #ifndef __xpv
140 	cpupm_vendor_t *vendors;
141 	cpupm_mach_state_t *mach_state;
142 	struct machcpu *mcpu = &(cp->cpu_m);
143 	int *speeds;
144 	uint_t nspeeds;
145 	int ret;
146 
147 	cpupm_set_supp_freqs(cp, NULL, 1);
148 
149 	mach_state = cp->cpu_m.mcpu_pm_mach_state =
150 	    kmem_zalloc(sizeof (cpupm_mach_state_t), KM_SLEEP);
151 	mach_state->ms_caps = CPUPM_NO_STATES;
152 	mutex_init(&mach_state->ms_lock, NULL, MUTEX_DRIVER, NULL);
153 
154 	mach_state->ms_acpi_handle = cpu_acpi_init(cp);
155 	if (mach_state->ms_acpi_handle == NULL) {
156 		cpupm_free(cp);
157 		cmn_err(CE_WARN, "!cpupm_init: processor %d: "
158 		    "unable to get ACPI handle", cp->cpu_id);
159 		cmn_err(CE_NOTE, "!CPU power management will not function.");
160 		CPUPM_DISABLE();
161 		return;
162 	}
163 
164 	/*
165 	 * Loop through the CPU management module table and see if
166 	 * any of the modules implement CPU power management
167 	 * for this CPU.
168 	 */
169 	for (vendors = cpupm_vendors; vendors->cpuv_init != NULL; vendors++) {
170 		if (vendors->cpuv_init(cp))
171 			break;
172 	}
173 
174 	/*
175 	 * Nope, we can't power manage this CPU.
176 	 */
177 	if (vendors == NULL) {
178 		cpupm_free(cp);
179 		CPUPM_DISABLE();
180 		return;
181 	}
182 
183 	/*
184 	 * If P-state support exists for this system, then initialize it.
185 	 */
186 	if (mach_state->ms_pstate.cma_ops != NULL) {
187 		ret = mach_state->ms_pstate.cma_ops->cpus_init(cp);
188 		if (ret != 0) {
189 			cmn_err(CE_WARN, "!cpupm_init: processor %d:"
190 			    " unable to initialize P-state support",
191 			    cp->cpu_id);
192 			mach_state->ms_pstate.cma_ops = NULL;
193 			cpupm_disable(CPUPM_P_STATES);
194 		} else {
195 			nspeeds = cpupm_get_speeds(cp, &speeds);
196 			if (nspeeds == 0) {
197 				cmn_err(CE_WARN, "!cpupm_init: processor %d:"
198 				    " no speeds to manage", cp->cpu_id);
199 			} else {
200 				cpupm_set_supp_freqs(cp, speeds, nspeeds);
201 				cpupm_free_speeds(speeds, nspeeds);
202 				mach_state->ms_caps |= CPUPM_P_STATES;
203 			}
204 		}
205 	}
206 
207 	if (mach_state->ms_tstate.cma_ops != NULL) {
208 		ret = mach_state->ms_tstate.cma_ops->cpus_init(cp);
209 		if (ret != 0) {
210 			cmn_err(CE_WARN, "!cpupm_init: processor %d:"
211 			    " unable to initialize T-state support",
212 			    cp->cpu_id);
213 			mach_state->ms_tstate.cma_ops = NULL;
214 			cpupm_disable(CPUPM_T_STATES);
215 		} else {
216 			mach_state->ms_caps |= CPUPM_T_STATES;
217 		}
218 	}
219 
220 	/*
221 	 * If C-states support exists for this system, then initialize it.
222 	 */
223 	if (mach_state->ms_cstate.cma_ops != NULL) {
224 		ret = mach_state->ms_cstate.cma_ops->cpus_init(cp);
225 		if (ret != 0) {
226 			cmn_err(CE_WARN, "!cpupm_init: processor %d:"
227 			    " unable to initialize C-state support",
228 			    cp->cpu_id);
229 			mach_state->ms_cstate.cma_ops = NULL;
230 			mcpu->max_cstates = CPU_ACPI_C1;
231 			cpupm_disable(CPUPM_C_STATES);
232 			idle_cpu = non_deep_idle_cpu;
233 			disp_enq_thread = non_deep_idle_disp_enq_thread;
234 		} else if (cpu_deep_cstates_supported()) {
235 			mcpu->max_cstates = cpu_acpi_get_max_cstates(
236 			    mach_state->ms_acpi_handle);
237 			if (mcpu->max_cstates > CPU_ACPI_C1) {
238 				hpet.callback(CST_EVENT_MULTIPLE_CSTATES);
239 				CPU->cpu_m.mcpu_idle_cpu = cpu_acpi_idle;
240 				mcpu->mcpu_idle_type = CPU_ACPI_C1;
241 				disp_enq_thread = cstate_wakeup;
242 			} else {
243 				hpet.callback(CST_EVENT_ONE_CSTATE);
244 			}
245 			mach_state->ms_caps |= CPUPM_C_STATES;
246 		} else {
247 			mcpu->max_cstates = CPU_ACPI_C1;
248 			idle_cpu = non_deep_idle_cpu;
249 			disp_enq_thread = non_deep_idle_disp_enq_thread;
250 		}
251 	}
252 
253 
254 	if (mach_state->ms_caps == CPUPM_NO_STATES) {
255 		cpupm_free(cp);
256 		CPUPM_DISABLE();
257 		return;
258 	}
259 
260 	if ((mach_state->ms_caps & CPUPM_T_STATES) ||
261 	    (mach_state->ms_caps & CPUPM_P_STATES) ||
262 	    (mach_state->ms_caps & CPUPM_C_STATES))
263 		cpupm_add_notify_handler(cp, cpupm_event_notify_handler, cp);
264 #endif
265 }
266 
267 /*
268  * Free any resources allocated by cpupm_init().
269  */
270 /*ARGSUSED*/
271 void
272 cpupm_free(cpu_t *cp)
273 {
274 #ifndef __xpv
275 	cpupm_mach_state_t *mach_state =
276 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
277 
278 	if (mach_state == NULL)
279 		return;
280 	if (mach_state->ms_pstate.cma_ops != NULL) {
281 		mach_state->ms_pstate.cma_ops->cpus_fini(cp);
282 		mach_state->ms_pstate.cma_ops = NULL;
283 	}
284 
285 	if (mach_state->ms_tstate.cma_ops != NULL) {
286 		mach_state->ms_tstate.cma_ops->cpus_fini(cp);
287 		mach_state->ms_tstate.cma_ops = NULL;
288 	}
289 
290 	if (mach_state->ms_cstate.cma_ops != NULL) {
291 		mach_state->ms_cstate.cma_ops->cpus_fini(cp);
292 		mach_state->ms_cstate.cma_ops = NULL;
293 	}
294 
295 	cpupm_free_notify_handlers(cp);
296 
297 	if (mach_state->ms_acpi_handle != NULL) {
298 		cpu_acpi_fini(mach_state->ms_acpi_handle);
299 		mach_state->ms_acpi_handle = NULL;
300 	}
301 
302 	mutex_destroy(&mach_state->ms_lock);
303 	kmem_free(mach_state, sizeof (cpupm_mach_state_t));
304 	cp->cpu_m.mcpu_pm_mach_state = NULL;
305 #endif
306 }
307 
308 /*
309  * If all CPUs have started and at least one power state is manageable,
310  * then the CPUs are ready for power management.
311  */
312 boolean_t
313 cpupm_is_ready()
314 {
315 #ifndef __xpv
316 	if (cpupm_enabled == CPUPM_NO_STATES)
317 		return (B_FALSE);
318 	return (cpupm_ready);
319 #else
320 	return (B_FALSE);
321 #endif
322 
323 }
324 
325 boolean_t
326 cpupm_is_enabled(uint32_t state)
327 {
328 	return ((cpupm_enabled & state) == state);
329 }
330 
331 /*
332  * By default, all states are enabled.
333  */
334 void
335 cpupm_disable(uint32_t state)
336 {
337 
338 	if (state & CPUPM_P_STATES) {
339 		cpupm_free_domains(&cpupm_pstate_domains);
340 	}
341 	if (state & CPUPM_T_STATES) {
342 		cpupm_free_domains(&cpupm_tstate_domains);
343 	}
344 	if (state & CPUPM_C_STATES) {
345 		cpupm_free_domains(&cpupm_cstate_domains);
346 	}
347 	cpupm_enabled &= ~state;
348 }
349 
350 /*
351  * Once all CPUs have been started, the PPM driver should build CPU
352  * domains and initialize the topspeed for all CPU devices.
353  */
354 void
355 cpupm_post_startup()
356 {
357 #ifndef __xpv
358 	/*
359 	 * The CPU domain built by the PPM during CPUs attaching
360 	 * should be rebuilt with the information retrieved from
361 	 * ACPI.
362 	 */
363 	if (cpupm_rebuild_cpu_domains != NULL)
364 		(*cpupm_rebuild_cpu_domains)();
365 
366 	/*
367 	 * Only initialize the topspeed if P-states are enabled.
368 	 */
369 	if (cpupm_enabled & CPUPM_P_STATES && cpupm_init_topspeed != NULL)
370 		(*cpupm_init_topspeed)();
371 #endif
372 	cpupm_ready = B_TRUE;
373 }
374 
375 /*
376  * Allocate power domains for C,P and T States
377  */
378 void
379 cpupm_alloc_domains(cpu_t *cp, int state)
380 {
381 	cpupm_mach_state_t *mach_state =
382 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
383 	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
384 	cpupm_state_domains_t **dom_ptr;
385 	cpupm_state_domains_t *dptr;
386 	cpupm_state_domains_t **mach_dom_state_ptr;
387 	uint32_t domain;
388 	uint32_t type;
389 
390 	switch (state) {
391 	case CPUPM_P_STATES:
392 		if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_PSD_CACHED)) {
393 			domain = CPU_ACPI_PSD(handle).sd_domain;
394 			type = CPU_ACPI_PSD(handle).sd_type;
395 		} else {
396 			mutex_enter(&cpu_lock);
397 			domain = cpuid_get_chipid(cp);
398 			mutex_exit(&cpu_lock);
399 			type = CPU_ACPI_HW_ALL;
400 		}
401 		dom_ptr = &cpupm_pstate_domains;
402 		mach_dom_state_ptr = &mach_state->ms_pstate.cma_domain;
403 		break;
404 	case CPUPM_T_STATES:
405 		if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_TSD_CACHED)) {
406 			domain = CPU_ACPI_TSD(handle).sd_domain;
407 			type = CPU_ACPI_TSD(handle).sd_type;
408 		} else {
409 			mutex_enter(&cpu_lock);
410 			domain = cpuid_get_chipid(cp);
411 			mutex_exit(&cpu_lock);
412 			type = CPU_ACPI_HW_ALL;
413 		}
414 		dom_ptr = &cpupm_tstate_domains;
415 		mach_dom_state_ptr = &mach_state->ms_tstate.cma_domain;
416 		break;
417 	case CPUPM_C_STATES:
418 		if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_CSD_CACHED)) {
419 			domain = CPU_ACPI_CSD(handle).sd_domain;
420 			type = CPU_ACPI_CSD(handle).sd_type;
421 		} else {
422 			mutex_enter(&cpu_lock);
423 			domain = cpuid_get_coreid(cp);
424 			mutex_exit(&cpu_lock);
425 			type = CPU_ACPI_HW_ALL;
426 		}
427 		dom_ptr = &cpupm_cstate_domains;
428 		mach_dom_state_ptr = &mach_state->ms_cstate.cma_domain;
429 		break;
430 	default:
431 		return;
432 	}
433 
434 	for (dptr = *dom_ptr; dptr != NULL; dptr = dptr->pm_next) {
435 		if (dptr->pm_domain == domain)
436 			break;
437 	}
438 
439 	/* new domain is created and linked at the head */
440 	if (dptr == NULL) {
441 		dptr = kmem_zalloc(sizeof (cpupm_state_domains_t), KM_SLEEP);
442 		dptr->pm_domain = domain;
443 		dptr->pm_type = type;
444 		dptr->pm_next = *dom_ptr;
445 		mutex_init(&dptr->pm_lock, NULL, MUTEX_SPIN,
446 		    (void *)ipltospl(DISP_LEVEL));
447 		CPUSET_ZERO(dptr->pm_cpus);
448 		*dom_ptr = dptr;
449 	}
450 	CPUSET_ADD(dptr->pm_cpus, cp->cpu_id);
451 	*mach_dom_state_ptr = dptr;
452 }
453 
454 /*
455  * Free C, P or T state power domains
456  */
457 void
458 cpupm_free_domains(cpupm_state_domains_t **dom_ptr)
459 {
460 	cpupm_state_domains_t *this_domain, *next_domain;
461 
462 	this_domain = *dom_ptr;
463 	while (this_domain != NULL) {
464 		next_domain = this_domain->pm_next;
465 		mutex_destroy(&this_domain->pm_lock);
466 		kmem_free((void *)this_domain,
467 		    sizeof (cpupm_state_domains_t));
468 		this_domain = next_domain;
469 	}
470 	*dom_ptr = NULL;
471 }
472 
473 void
474 cpupm_alloc_ms_cstate(cpu_t *cp)
475 {
476 	cpupm_mach_state_t *mach_state;
477 	cpupm_mach_acpi_state_t *ms_cstate;
478 
479 	mach_state = (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
480 	ms_cstate = &mach_state->ms_cstate;
481 	ASSERT(ms_cstate->cma_state.cstate == NULL);
482 	ms_cstate->cma_state.cstate = kmem_zalloc(sizeof (cma_c_state_t),
483 	    KM_SLEEP);
484 	ms_cstate->cma_state.cstate->cs_next_cstate = CPU_ACPI_C1;
485 }
486 
487 void
488 cpupm_free_ms_cstate(cpu_t *cp)
489 {
490 	cpupm_mach_state_t *mach_state =
491 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
492 	cpupm_mach_acpi_state_t *ms_cstate = &mach_state->ms_cstate;
493 
494 	if (ms_cstate->cma_state.cstate != NULL) {
495 		kmem_free(ms_cstate->cma_state.cstate, sizeof (cma_c_state_t));
496 		ms_cstate->cma_state.cstate = NULL;
497 	}
498 }
499 
500 void
501 cpupm_state_change(cpu_t *cp, int level, int state)
502 {
503 	cpupm_mach_state_t	*mach_state =
504 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
505 	cpupm_state_ops_t	*state_ops;
506 	cpupm_state_domains_t  	*state_domain;
507 	cpuset_t		set;
508 
509 	DTRACE_PROBE2(cpupm__state__change, cpu_t *, cp, int, level);
510 
511 	if (mach_state == NULL) {
512 		return;
513 	}
514 
515 	switch (state) {
516 	case CPUPM_P_STATES:
517 		state_ops = mach_state->ms_pstate.cma_ops;
518 		state_domain = mach_state->ms_pstate.cma_domain;
519 		break;
520 	case CPUPM_T_STATES:
521 		state_ops = mach_state->ms_tstate.cma_ops;
522 		state_domain = mach_state->ms_tstate.cma_domain;
523 		break;
524 	default:
525 		break;
526 	}
527 
528 	switch (state_domain->pm_type) {
529 	case CPU_ACPI_SW_ANY:
530 		/*
531 		 * A request on any CPU in the domain transitions the domain
532 		 */
533 		CPUSET_ONLY(set, cp->cpu_id);
534 		state_ops->cpus_change(set, level);
535 		break;
536 	case CPU_ACPI_SW_ALL:
537 		/*
538 		 * All CPUs in the domain must request the transition
539 		 */
540 	case CPU_ACPI_HW_ALL:
541 		/*
542 		 * P/T-state transitions are coordinated by the hardware
543 		 * For now, request the transition on all CPUs in the domain,
544 		 * but looking ahead we can probably be smarter about this.
545 		 */
546 		mutex_enter(&state_domain->pm_lock);
547 		state_ops->cpus_change(state_domain->pm_cpus, level);
548 		mutex_exit(&state_domain->pm_lock);
549 		break;
550 	default:
551 		cmn_err(CE_WARN, "Unknown domain coordination type: %d",
552 		    state_domain->pm_type);
553 	}
554 }
555 
556 /*
557  * CPU PM interfaces exposed to the CPU power manager
558  */
559 /*ARGSUSED*/
560 id_t
561 cpupm_plat_domain_id(cpu_t *cp, cpupm_dtype_t type)
562 {
563 	cpupm_mach_state_t	*mach_state =
564 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
565 
566 	if ((mach_state == NULL) || (!cpupm_is_enabled(CPUPM_P_STATES) &&
567 	    !cpupm_is_enabled(CPUPM_C_STATES))) {
568 		return (CPUPM_NO_DOMAIN);
569 	}
570 	if (type == CPUPM_DTYPE_ACTIVE) {
571 		/*
572 		 * Return P-State domain for the specified CPU
573 		 */
574 		if (mach_state->ms_pstate.cma_domain) {
575 			return (mach_state->ms_pstate.cma_domain->pm_domain);
576 		}
577 	} else if (type == CPUPM_DTYPE_IDLE) {
578 		/*
579 		 * Return C-State domain for the specified CPU
580 		 */
581 		if (mach_state->ms_cstate.cma_domain) {
582 			return (mach_state->ms_cstate.cma_domain->pm_domain);
583 		}
584 	}
585 	return (CPUPM_NO_DOMAIN);
586 }
587 
588 /*ARGSUSED*/
589 uint_t
590 cpupm_plat_state_enumerate(cpu_t *cp, cpupm_dtype_t type,
591     cpupm_state_t *states)
592 {
593 	int	*speeds;
594 	uint_t	nspeeds, i;
595 
596 	/*
597 	 * Idle domain support unimplemented
598 	 */
599 	if (type != CPUPM_DTYPE_ACTIVE) {
600 		return (0);
601 	}
602 	nspeeds = cpupm_get_speeds(cp, &speeds);
603 
604 	/*
605 	 * If the caller passes NULL for states, just return the
606 	 * number of states.
607 	 */
608 	if (states != NULL) {
609 		for (i = 0; i < nspeeds; i++) {
610 			states[i].cps_speed = speeds[i];
611 			states[i].cps_handle = (cpupm_handle_t)i;
612 		}
613 	}
614 	cpupm_free_speeds(speeds, nspeeds);
615 	return (nspeeds);
616 }
617 
618 /*ARGSUSED*/
619 int
620 cpupm_plat_change_state(cpu_t *cp, cpupm_state_t *state)
621 {
622 	if (!cpupm_is_ready())
623 		return (-1);
624 
625 	cpupm_state_change(cp, (int)state->cps_handle, CPUPM_P_STATES);
626 
627 	return (0);
628 }
629 
630 /*ARGSUSED*/
631 /*
632  * Note: It is the responsibility of the users of
633  * cpupm_get_speeds() to free the memory allocated
634  * for speeds using cpupm_free_speeds()
635  */
636 uint_t
637 cpupm_get_speeds(cpu_t *cp, int **speeds)
638 {
639 #ifndef __xpv
640 	cpupm_mach_state_t *mach_state =
641 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
642 	return (cpu_acpi_get_speeds(mach_state->ms_acpi_handle, speeds));
643 #else
644 	return (0);
645 #endif
646 }
647 
648 /*ARGSUSED*/
649 void
650 cpupm_free_speeds(int *speeds, uint_t nspeeds)
651 {
652 #ifndef __xpv
653 	cpu_acpi_free_speeds(speeds, nspeeds);
654 #endif
655 }
656 
657 /*
658  * All CPU instances have been initialized successfully.
659  */
660 boolean_t
661 cpupm_power_ready(void)
662 {
663 	return (cpupm_is_enabled(CPUPM_P_STATES) && cpupm_is_ready());
664 }
665 
666 /*
667  * All CPU instances have been initialized successfully.
668  */
669 boolean_t
670 cpupm_throttle_ready(void)
671 {
672 	return (cpupm_is_enabled(CPUPM_T_STATES) && cpupm_is_ready());
673 }
674 
675 /*
676  * All CPU instances have been initialized successfully.
677  */
678 boolean_t
679 cpupm_cstate_ready(void)
680 {
681 	return (cpupm_is_enabled(CPUPM_C_STATES) && cpupm_is_ready());
682 }
683 
684 void
685 cpupm_notify_handler(ACPI_HANDLE obj, UINT32 val, void *ctx)
686 {
687 	cpu_t *cp = ctx;
688 	cpupm_mach_state_t *mach_state =
689 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
690 	cpupm_notification_t *entry;
691 
692 	mutex_enter(&mach_state->ms_lock);
693 	for (entry =  mach_state->ms_handlers; entry != NULL;
694 	    entry = entry->nq_next) {
695 		entry->nq_handler(obj, val, entry->nq_ctx);
696 	}
697 	mutex_exit(&mach_state->ms_lock);
698 }
699 
700 /*ARGSUSED*/
701 void
702 cpupm_add_notify_handler(cpu_t *cp, CPUPM_NOTIFY_HANDLER handler, void *ctx)
703 {
704 #ifndef __xpv
705 	cpupm_mach_state_t *mach_state =
706 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
707 	cpupm_notification_t *entry;
708 
709 	entry = kmem_zalloc(sizeof (cpupm_notification_t), KM_SLEEP);
710 	entry->nq_handler = handler;
711 	entry->nq_ctx = ctx;
712 	mutex_enter(&mach_state->ms_lock);
713 	if (mach_state->ms_handlers == NULL) {
714 		entry->nq_next = NULL;
715 		mach_state->ms_handlers = entry;
716 		cpu_acpi_install_notify_handler(mach_state->ms_acpi_handle,
717 		    cpupm_notify_handler, cp);
718 
719 	} else {
720 		entry->nq_next = mach_state->ms_handlers;
721 		mach_state->ms_handlers = entry;
722 	}
723 	mutex_exit(&mach_state->ms_lock);
724 #endif
725 }
726 
727 /*ARGSUSED*/
728 static void
729 cpupm_free_notify_handlers(cpu_t *cp)
730 {
731 #ifndef __xpv
732 	cpupm_mach_state_t *mach_state =
733 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
734 	cpupm_notification_t *entry;
735 	cpupm_notification_t *next;
736 
737 	mutex_enter(&mach_state->ms_lock);
738 	if (mach_state->ms_handlers == NULL) {
739 		mutex_exit(&mach_state->ms_lock);
740 		return;
741 	}
742 	if (mach_state->ms_acpi_handle != NULL) {
743 		cpu_acpi_remove_notify_handler(mach_state->ms_acpi_handle,
744 		    cpupm_notify_handler);
745 	}
746 	entry = mach_state->ms_handlers;
747 	while (entry != NULL) {
748 		next = entry->nq_next;
749 		kmem_free(entry, sizeof (cpupm_notification_t));
750 		entry = next;
751 	}
752 	mach_state->ms_handlers = NULL;
753 	mutex_exit(&mach_state->ms_lock);
754 #endif
755 }
756 
757 /*
758  * Get the current max speed from the ACPI _PPC object
759  */
760 /*ARGSUSED*/
761 int
762 cpupm_get_top_speed(cpu_t *cp)
763 {
764 #ifndef __xpv
765 	cpupm_mach_state_t 	*mach_state;
766 	cpu_acpi_handle_t 	handle;
767 	int 			plat_level;
768 	uint_t			nspeeds;
769 	int			max_level;
770 
771 	mach_state =
772 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
773 	handle = mach_state->ms_acpi_handle;
774 
775 	cpu_acpi_cache_ppc(handle);
776 	plat_level = CPU_ACPI_PPC(handle);
777 
778 	nspeeds = CPU_ACPI_PSTATES_COUNT(handle);
779 
780 	max_level = nspeeds - 1;
781 	if ((plat_level < 0) || (plat_level > max_level)) {
782 		cmn_err(CE_NOTE, "!cpupm_get_top_speed: CPU %d: "
783 		    "_PPC out of range %d", cp->cpu_id, plat_level);
784 		plat_level = 0;
785 	}
786 
787 	return (plat_level);
788 #else
789 	return (0);
790 #endif
791 }
792 
793 /*
794  * This notification handler is called whenever the ACPI _PPC
795  * object changes. The _PPC is a sort of governor on power levels.
796  * It sets an upper threshold on which, _PSS defined, power levels
797  * are usuable. The _PPC value is dynamic and may change as properties
798  * (i.e., thermal or AC source) of the system change.
799  */
800 
801 static void
802 cpupm_power_manage_notifications(void *ctx)
803 {
804 	cpu_t			*cp = ctx;
805 	int			top_speed;
806 
807 	top_speed = cpupm_get_top_speed(cp);
808 	cpupm_redefine_max_activepwr_state(cp, top_speed);
809 }
810 
811 /* ARGSUSED */
812 static void
813 cpupm_event_notify_handler(ACPI_HANDLE obj, UINT32 val, void *ctx)
814 {
815 #ifndef __xpv
816 	/*
817 	 * Currently, we handle _TPC,_CST and _PPC change notifications.
818 	 */
819 	if (val == CPUPM_TPC_CHANGE_NOTIFICATION) {
820 		cpupm_throttle_manage_notification(ctx);
821 	} else if (val == CPUPM_CST_CHANGE_NOTIFICATION) {
822 		cpuidle_manage_cstates(ctx);
823 	} else if (val == CPUPM_PPC_CHANGE_NOTIFICATION) {
824 		cpupm_power_manage_notifications(ctx);
825 	}
826 #endif
827 }
828 
829 /*
830  * Update cpupm cstate data each time CPU exits idle.
831  */
832 void
833 cpupm_wakeup_cstate_data(cma_c_state_t *cs_data, hrtime_t end)
834 {
835 	cs_data->cs_idle_exit = end;
836 }
837 
/*
 * Determine next cstate based on cpupm data.
 * Update cpupm cstate data each time CPU goes idle.
 * Do as much as possible in the idle state bookkeeping function because the
 * performance impact while idle is minimal compared to in the wakeup function
 * when there is real work to do.
 *
 * cs_data	per-CPU C-state bookkeeping, updated in place
 * cstates	array of available C-states; entry 0 must be C1
 * cs_count	number of entries in cstates
 * start	time at which the CPU is entering idle
 *
 * Returns cs_data->cs_next_cstate, an index into cstates. The value is
 * only recomputed once per decision period (when the sample counter
 * exceeds cpupm_cs_sample_tunable); otherwise the previous choice is
 * returned unchanged.
 *
 * NOTE(review): the averages below divide by cpupm_cs_sample_tunable;
 * setting that tunable to 0 would divide by zero - confirm that it is
 * never tuned to 0.
 * NOTE(review): the counter test is '>' after a pre-increment, so a
 * period appears to cover cpupm_cs_sample_tunable + 1 idle entries while
 * the averages divide by cpupm_cs_sample_tunable - confirm intent.
 */
uint32_t
cpupm_next_cstate(cma_c_state_t *cs_data, cpu_acpi_cstate_t *cstates,
    uint32_t cs_count, hrtime_t start)
{
	hrtime_t duration;
	hrtime_t ave_interval;
	hrtime_t ave_idle_time;
	uint32_t i;

	/*
	 * Account the idle period that just ended (previous enter to
	 * previous exit) and note the start of the new one.
	 */
	duration = cs_data->cs_idle_exit - cs_data->cs_idle_enter;
	scalehrtime(&duration);
	cs_data->cs_idle += duration;
	cs_data->cs_idle_enter = start;

	++cs_data->cs_cnt;
	if (cs_data->cs_cnt > cpupm_cs_sample_tunable) {
		/*
		 * End of a decision period: snapshot its length, idle time
		 * and idle percentage, then reset the accumulators.
		 */
		cs_data->cs_smpl_len = start - cs_data->cs_smpl_start;
		scalehrtime(&cs_data->cs_smpl_len);
		cs_data->cs_smpl_len |= 1;	/* protect from DIV 0 */
		cs_data->cs_smpl_idle = cs_data->cs_idle;
		cs_data->cs_idle = 0;
		cs_data->cs_smpl_idle_pct = ((100 * cs_data->cs_smpl_idle) /
		    cs_data->cs_smpl_len);

		cs_data->cs_smpl_start = start;
		cs_data->cs_cnt = 0;

		/*
		 * Strand level C-state policy
		 * The cpu_acpi_cstate_t *cstates array is not required to
		 * have an entry for both CPU_ACPI_C2 and CPU_ACPI_C3.
		 * There are cs_count entries in the cstates array.
		 * cs_data->cs_next_cstate contains the index of the next
		 * C-state this CPU should enter.
		 *
		 * Each loop below lowers cs_count to the index of the first
		 * state that fails its test; that also ends the loop, and
		 * later loops then only consider the states that remain.
		 */
		ASSERT(cstates[0].cs_type == CPU_ACPI_C1);

		/*
		 * Will CPU be idle long enough to save power?
		 */
		ave_idle_time = (cs_data->cs_smpl_idle /
		    cpupm_cs_sample_tunable) / 1000;
		for (i = 1; i < cs_count; ++i) {
			if (ave_idle_time < (cstates[i].cs_latency *
			    cpupm_cs_idle_save_tunable)) {
				cs_count = i;
				DTRACE_PROBE2(cpupm__next__cstate, cpu_t *,
				    CPU, int, i);
			}
		}

		/*
		 * Wakeup often (even when non-idle time is very short)?
		 * Some producer/consumer type loads fall into this category.
		 */
		ave_interval = (cs_data->cs_smpl_len / cpupm_cs_sample_tunable)
		    / 1000;
		for (i = 1; i < cs_count; ++i) {
			if (ave_interval <= (cstates[i].cs_latency *
			    cpupm_cs_idle_cost_tunable)) {
				cs_count = i;
				DTRACE_PROBE2(cpupm__next__cstate, cpu_t *,
				    CPU, int, (CPU_MAX_CSTATES + i));
			}
		}

		/*
		 * Idle percent
		 * A C2 or C3 type state is excluded when the sampled idle
		 * percentage is below the tunable for that type.
		 */
		for (i = 1; i < cs_count; ++i) {
			switch (cstates[i].cs_type) {
			case CPU_ACPI_C2:
				if (cs_data->cs_smpl_idle_pct <
				    cpupm_C2_idle_pct_tunable) {
					cs_count = i;
					DTRACE_PROBE2(cpupm__next__cstate,
					    cpu_t *, CPU, int,
					    ((2 * CPU_MAX_CSTATES) + i));
				}
				break;

			case CPU_ACPI_C3:
				if (cs_data->cs_smpl_idle_pct <
				    cpupm_C3_idle_pct_tunable) {
					cs_count = i;
					DTRACE_PROBE2(cpupm__next__cstate,
					    cpu_t *, CPU, int,
					    ((2 * CPU_MAX_CSTATES) + i));
				}
				break;
			}
		}

		/* The deepest state that survived all three filters. */
		cs_data->cs_next_cstate = cs_count - 1;
	}

	return (cs_data->cs_next_cstate);
}
943