xref: /titanic_51/usr/src/uts/i86pc/os/cpupm/cpupm_mach.c (revision 4f3b09fdc1c2f924ddba94e505ee4c2bff8a18d4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/cpu_pm.h>
27 #include <sys/x86_archext.h>
28 #include <sys/sdt.h>
29 #include <sys/spl.h>
30 #include <sys/machsystm.h>
31 #include <sys/hpet.h>
32 #include <sys/cpupm.h>
33 #include <sys/cpu_idle.h>
34 #include <sys/cpu_acpi.h>
35 #include <sys/cpupm_throttle.h>
36 #include <sys/dtrace.h>
37 
/*
 * This callback is used to build the PPM CPU domains once
 * all the CPU devices have been started. The callback is
 * initialized by the PPM driver to point to a routine that
 * will build the domains. It is invoked, when non-NULL, from
 * cpupm_post_startup().
 */
void (*cpupm_rebuild_cpu_domains)(void);

/*
 * This callback is used to reset the topspeed for all the
 * CPU devices. The callback is initialized by the PPM driver to
 * point to a routine that will reinitialize all the CPU devices
 * once all the CPU devices have been started and the CPU domains
 * built. It is invoked from cpupm_post_startup() only while
 * P-states are still enabled.
 */
void (*cpupm_init_topspeed)(void);

/*
 * This callback is used to redefine the topspeed for a CPU device.
 * Since all CPUs in a domain should have identical properties, this
 * callback is initialized by the PPM driver to point to a routine
 * that will redefine the topspeed for all devices in a CPU domain.
 * This callback is exercised whenever an ACPI _PPC change notification
 * is received by the CPU driver.
 */
void (*cpupm_redefine_topspeed)(void *);

/*
 * This callback is used by the PPM driver to call into the CPU driver
 * to set a new topspeed for a CPU.
 */
void (*cpupm_set_topspeed_callb)(void *, int);

/*
 * This callback is used by the PPM driver to call into the CPU driver
 * to find a CPU's current topspeed (i.e., its current ACPI _PPC value).
 */
int (*cpupm_get_topspeed_callb)(void *);
76 
static void cpupm_event_notify_handler(ACPI_HANDLE, UINT32, void *);
static void cpupm_free_notify_handlers(cpu_t *);

/*
 * Until proven otherwise, all power states are manageable.
 * Bits are cleared by cpupm_disable() and tested by cpupm_is_enabled().
 */
static uint32_t cpupm_enabled = CPUPM_ALL_STATES;

/*
 * Until all CPUs have started, we do not allow
 * power management. Set to B_TRUE by cpupm_post_startup().
 */
static boolean_t cpupm_ready = B_FALSE;

/*
 * Heads of the P-, T- and C-state domain lists. Domains are linked in
 * at the head by cpupm_alloc_domains() and a whole list is released by
 * cpupm_free_domains().
 */
cpupm_state_domains_t *cpupm_pstate_domains = NULL;
cpupm_state_domains_t *cpupm_tstate_domains = NULL;
cpupm_state_domains_t *cpupm_cstate_domains = NULL;
94 
/*
 * c-state tunables (all consumed by cpupm_next_cstate())
 *
 * cpupm_cs_sample_tunable is the number of idle entries that make up one
 * decision period; the next C-state is re-evaluated once the per-CPU
 * counter exceeds this value.
 *
 * cpupm_cs_idle_cost_tunable is the ratio of time CPU spends executing + idle
 * divided by time spent in the idle state transitions.
 * A value of 10 means the CPU will not spend more than 1/10 of its time
 * in idle latency.  The worst case performance will be 90% of non Deep C-state
 * kernel.
 *
 * cpupm_cs_idle_save_tunable is how long we must stay in a deeper C-state
 * before it is worth going there.  Expressed as a multiple of latency.
 *
 * cpupm_C2_idle_pct_tunable and cpupm_C3_idle_pct_tunable are the minimum
 * idle percentages over the last sample period; a C2 (respectively C3)
 * entry is not selected when the measured idle percentage is below it.
 */
uint32_t cpupm_cs_sample_tunable = 5;		/* samples in decision period */
uint32_t cpupm_cs_idle_cost_tunable = 10;	/* work time / latency cost */
uint32_t cpupm_cs_idle_save_tunable = 2;	/* idle power savings */
uint16_t cpupm_C2_idle_pct_tunable = 70;	/* min idle % to pick C2 */
uint16_t cpupm_C3_idle_pct_tunable = 80;	/* min idle % to pick C3 */
112 
113 #ifndef __xpv
114 extern boolean_t cpupm_intel_init(cpu_t *);
115 extern boolean_t cpupm_amd_init(cpu_t *);
116 
117 typedef struct cpupm_vendor {
118 	boolean_t	(*cpuv_init)(cpu_t *);
119 } cpupm_vendor_t;
120 
121 /*
122  * Table of supported vendors.
123  */
124 static cpupm_vendor_t cpupm_vendors[] = {
125 	cpupm_intel_init,
126 	cpupm_amd_init,
127 	NULL
128 };
129 #endif
130 
131 /*
132  * Initialize the machine.
133  * See if a module exists for managing power for this CPU.
134  */
135 /*ARGSUSED*/
136 void
137 cpupm_init(cpu_t *cp)
138 {
139 #ifndef __xpv
140 	cpupm_vendor_t *vendors;
141 	cpupm_mach_state_t *mach_state;
142 	struct machcpu *mcpu = &(cp->cpu_m);
143 	int *speeds;
144 	uint_t nspeeds;
145 	int ret;
146 
147 	mach_state = cp->cpu_m.mcpu_pm_mach_state =
148 	    kmem_zalloc(sizeof (cpupm_mach_state_t), KM_SLEEP);
149 	mach_state->ms_caps = CPUPM_NO_STATES;
150 	mutex_init(&mach_state->ms_lock, NULL, MUTEX_DRIVER, NULL);
151 
152 	mach_state->ms_acpi_handle = cpu_acpi_init(cp);
153 	if (mach_state->ms_acpi_handle == NULL) {
154 		cpupm_free(cp);
155 		cmn_err(CE_WARN, "!cpupm_init: processor %d: "
156 		    "unable to get ACPI handle", cp->cpu_id);
157 		cmn_err(CE_NOTE, "!CPU power management will not function.");
158 		CPUPM_DISABLE();
159 		return;
160 	}
161 
162 	/*
163 	 * Loop through the CPU management module table and see if
164 	 * any of the modules implement CPU power management
165 	 * for this CPU.
166 	 */
167 	for (vendors = cpupm_vendors; vendors->cpuv_init != NULL; vendors++) {
168 		if (vendors->cpuv_init(cp))
169 			break;
170 	}
171 
172 	/*
173 	 * Nope, we can't power manage this CPU.
174 	 */
175 	if (vendors == NULL) {
176 		cpupm_free(cp);
177 		CPUPM_DISABLE();
178 		return;
179 	}
180 
181 	/*
182 	 * If P-state support exists for this system, then initialize it.
183 	 */
184 	if (mach_state->ms_pstate.cma_ops != NULL) {
185 		ret = mach_state->ms_pstate.cma_ops->cpus_init(cp);
186 		if (ret != 0) {
187 			cmn_err(CE_WARN, "!cpupm_init: processor %d:"
188 			    " unable to initialize P-state support",
189 			    cp->cpu_id);
190 			mach_state->ms_pstate.cma_ops = NULL;
191 			cpupm_disable(CPUPM_P_STATES);
192 		} else {
193 			nspeeds = cpupm_get_speeds(cp, &speeds);
194 			if (nspeeds == 0) {
195 				cmn_err(CE_WARN, "!cpupm_init: processor %d:"
196 				    " no speeds to manage", cp->cpu_id);
197 			} else {
198 				cpupm_set_supp_freqs(cp, speeds, nspeeds);
199 				cpupm_free_speeds(speeds, nspeeds);
200 				mach_state->ms_caps |= CPUPM_P_STATES;
201 			}
202 		}
203 	}
204 
205 	if (mach_state->ms_tstate.cma_ops != NULL) {
206 		ret = mach_state->ms_tstate.cma_ops->cpus_init(cp);
207 		if (ret != 0) {
208 			char err_msg[128];
209 			int p_res;
210 			p_res =	snprintf(err_msg, sizeof (err_msg),
211 			    "!cpupm_init: processor %d: unable to initialize "
212 			    "T-state support", cp->cpu_id);
213 			if (p_res >= 0)
214 				DTRACE_PROBE1(cpu_ts_err_msg, char *, err_msg);
215 			mach_state->ms_tstate.cma_ops = NULL;
216 			cpupm_disable(CPUPM_T_STATES);
217 		} else {
218 			mach_state->ms_caps |= CPUPM_T_STATES;
219 		}
220 	}
221 
222 	/*
223 	 * If C-states support exists for this system, then initialize it.
224 	 */
225 	if (mach_state->ms_cstate.cma_ops != NULL) {
226 		ret = mach_state->ms_cstate.cma_ops->cpus_init(cp);
227 		if (ret != 0) {
228 			cmn_err(CE_WARN, "!cpupm_init: processor %d:"
229 			    " unable to initialize C-state support",
230 			    cp->cpu_id);
231 			mach_state->ms_cstate.cma_ops = NULL;
232 			mcpu->max_cstates = CPU_ACPI_C1;
233 			cpupm_disable(CPUPM_C_STATES);
234 			idle_cpu = non_deep_idle_cpu;
235 			disp_enq_thread = non_deep_idle_disp_enq_thread;
236 		} else if (cpu_deep_cstates_supported()) {
237 			mcpu->max_cstates = cpu_acpi_get_max_cstates(
238 			    mach_state->ms_acpi_handle);
239 			if (mcpu->max_cstates > CPU_ACPI_C1) {
240 				hpet.callback(CST_EVENT_MULTIPLE_CSTATES);
241 				CPU->cpu_m.mcpu_idle_cpu = cpu_acpi_idle;
242 				mcpu->mcpu_idle_type = CPU_ACPI_C1;
243 				disp_enq_thread = cstate_wakeup;
244 			} else {
245 				hpet.callback(CST_EVENT_ONE_CSTATE);
246 			}
247 			mach_state->ms_caps |= CPUPM_C_STATES;
248 		} else {
249 			mcpu->max_cstates = CPU_ACPI_C1;
250 			idle_cpu = non_deep_idle_cpu;
251 			disp_enq_thread = non_deep_idle_disp_enq_thread;
252 		}
253 	}
254 
255 
256 	if (mach_state->ms_caps == CPUPM_NO_STATES) {
257 		cpupm_free(cp);
258 		CPUPM_DISABLE();
259 		return;
260 	}
261 
262 	if ((mach_state->ms_caps & CPUPM_T_STATES) ||
263 	    (mach_state->ms_caps & CPUPM_P_STATES) ||
264 	    (mach_state->ms_caps & CPUPM_C_STATES))
265 		cpupm_add_notify_handler(cp, cpupm_event_notify_handler, cp);
266 #endif
267 }
268 
269 /*
270  * Free any resources allocated by cpupm_init().
271  */
272 /*ARGSUSED*/
273 void
274 cpupm_free(cpu_t *cp)
275 {
276 #ifndef __xpv
277 	cpupm_mach_state_t *mach_state =
278 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
279 
280 	if (mach_state == NULL)
281 		return;
282 	if (mach_state->ms_pstate.cma_ops != NULL) {
283 		mach_state->ms_pstate.cma_ops->cpus_fini(cp);
284 		mach_state->ms_pstate.cma_ops = NULL;
285 	}
286 
287 	if (mach_state->ms_tstate.cma_ops != NULL) {
288 		mach_state->ms_tstate.cma_ops->cpus_fini(cp);
289 		mach_state->ms_tstate.cma_ops = NULL;
290 	}
291 
292 	if (mach_state->ms_cstate.cma_ops != NULL) {
293 		mach_state->ms_cstate.cma_ops->cpus_fini(cp);
294 		mach_state->ms_cstate.cma_ops = NULL;
295 	}
296 
297 	cpupm_free_notify_handlers(cp);
298 
299 	if (mach_state->ms_acpi_handle != NULL) {
300 		cpu_acpi_fini(mach_state->ms_acpi_handle);
301 		mach_state->ms_acpi_handle = NULL;
302 	}
303 
304 	mutex_destroy(&mach_state->ms_lock);
305 	kmem_free(mach_state, sizeof (cpupm_mach_state_t));
306 	cp->cpu_m.mcpu_pm_mach_state = NULL;
307 #endif
308 }
309 
310 /*
311  * If all CPUs have started and at least one power state is manageable,
312  * then the CPUs are ready for power management.
313  */
314 boolean_t
315 cpupm_is_ready()
316 {
317 #ifndef __xpv
318 	if (cpupm_enabled == CPUPM_NO_STATES)
319 		return (B_FALSE);
320 	return (cpupm_ready);
321 #else
322 	return (B_FALSE);
323 #endif
324 
325 }
326 
327 boolean_t
328 cpupm_is_enabled(uint32_t state)
329 {
330 	return ((cpupm_enabled & state) == state);
331 }
332 
333 /*
334  * By default, all states are enabled.
335  */
336 void
337 cpupm_disable(uint32_t state)
338 {
339 
340 	if (state & CPUPM_P_STATES) {
341 		cpupm_free_domains(&cpupm_pstate_domains);
342 	}
343 	if (state & CPUPM_T_STATES) {
344 		cpupm_free_domains(&cpupm_tstate_domains);
345 	}
346 	if (state & CPUPM_C_STATES) {
347 		cpupm_free_domains(&cpupm_cstate_domains);
348 	}
349 	cpupm_enabled &= ~state;
350 }
351 
352 /*
353  * Once all CPUs have been started, the PPM driver should build CPU
354  * domains and initialize the topspeed for all CPU devices.
355  */
356 void
357 cpupm_post_startup()
358 {
359 #ifndef __xpv
360 	/*
361 	 * The CPU domain built by the PPM during CPUs attaching
362 	 * should be rebuilt with the information retrieved from
363 	 * ACPI.
364 	 */
365 	if (cpupm_rebuild_cpu_domains != NULL)
366 		(*cpupm_rebuild_cpu_domains)();
367 
368 	/*
369 	 * Only initialize the topspeed if P-states are enabled.
370 	 */
371 	if (cpupm_enabled & CPUPM_P_STATES && cpupm_init_topspeed != NULL)
372 		(*cpupm_init_topspeed)();
373 #endif
374 	cpupm_ready = B_TRUE;
375 }
376 
377 /*
378  * Allocate power domains for C,P and T States
379  */
380 void
381 cpupm_alloc_domains(cpu_t *cp, int state)
382 {
383 	cpupm_mach_state_t *mach_state =
384 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
385 	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
386 	cpupm_state_domains_t **dom_ptr;
387 	cpupm_state_domains_t *dptr;
388 	cpupm_state_domains_t **mach_dom_state_ptr;
389 	uint32_t domain;
390 	uint32_t type;
391 
392 	switch (state) {
393 	case CPUPM_P_STATES:
394 		if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_PSD_CACHED)) {
395 			domain = CPU_ACPI_PSD(handle).sd_domain;
396 			type = CPU_ACPI_PSD(handle).sd_type;
397 		} else {
398 			mutex_enter(&cpu_lock);
399 			domain = cpuid_get_chipid(cp);
400 			mutex_exit(&cpu_lock);
401 			type = CPU_ACPI_HW_ALL;
402 		}
403 		dom_ptr = &cpupm_pstate_domains;
404 		mach_dom_state_ptr = &mach_state->ms_pstate.cma_domain;
405 		break;
406 	case CPUPM_T_STATES:
407 		if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_TSD_CACHED)) {
408 			domain = CPU_ACPI_TSD(handle).sd_domain;
409 			type = CPU_ACPI_TSD(handle).sd_type;
410 		} else {
411 			mutex_enter(&cpu_lock);
412 			domain = cpuid_get_chipid(cp);
413 			mutex_exit(&cpu_lock);
414 			type = CPU_ACPI_HW_ALL;
415 		}
416 		dom_ptr = &cpupm_tstate_domains;
417 		mach_dom_state_ptr = &mach_state->ms_tstate.cma_domain;
418 		break;
419 	case CPUPM_C_STATES:
420 		if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_CSD_CACHED)) {
421 			domain = CPU_ACPI_CSD(handle).sd_domain;
422 			type = CPU_ACPI_CSD(handle).sd_type;
423 		} else {
424 			mutex_enter(&cpu_lock);
425 			domain = cpuid_get_coreid(cp);
426 			mutex_exit(&cpu_lock);
427 			type = CPU_ACPI_HW_ALL;
428 		}
429 		dom_ptr = &cpupm_cstate_domains;
430 		mach_dom_state_ptr = &mach_state->ms_cstate.cma_domain;
431 		break;
432 	default:
433 		return;
434 	}
435 
436 	for (dptr = *dom_ptr; dptr != NULL; dptr = dptr->pm_next) {
437 		if (dptr->pm_domain == domain)
438 			break;
439 	}
440 
441 	/* new domain is created and linked at the head */
442 	if (dptr == NULL) {
443 		dptr = kmem_zalloc(sizeof (cpupm_state_domains_t), KM_SLEEP);
444 		dptr->pm_domain = domain;
445 		dptr->pm_type = type;
446 		dptr->pm_next = *dom_ptr;
447 		mutex_init(&dptr->pm_lock, NULL, MUTEX_SPIN,
448 		    (void *)ipltospl(DISP_LEVEL));
449 		CPUSET_ZERO(dptr->pm_cpus);
450 		*dom_ptr = dptr;
451 	}
452 	CPUSET_ADD(dptr->pm_cpus, cp->cpu_id);
453 	*mach_dom_state_ptr = dptr;
454 }
455 
456 /*
457  * Free C, P or T state power domains
458  */
459 void
460 cpupm_free_domains(cpupm_state_domains_t **dom_ptr)
461 {
462 	cpupm_state_domains_t *this_domain, *next_domain;
463 
464 	this_domain = *dom_ptr;
465 	while (this_domain != NULL) {
466 		next_domain = this_domain->pm_next;
467 		mutex_destroy(&this_domain->pm_lock);
468 		kmem_free((void *)this_domain,
469 		    sizeof (cpupm_state_domains_t));
470 		this_domain = next_domain;
471 	}
472 	*dom_ptr = NULL;
473 }
474 
475 void
476 cpupm_alloc_ms_cstate(cpu_t *cp)
477 {
478 	cpupm_mach_state_t *mach_state;
479 	cpupm_mach_acpi_state_t *ms_cstate;
480 
481 	mach_state = (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
482 	ms_cstate = &mach_state->ms_cstate;
483 	ASSERT(ms_cstate->cma_state.cstate == NULL);
484 	ms_cstate->cma_state.cstate = kmem_zalloc(sizeof (cma_c_state_t),
485 	    KM_SLEEP);
486 	ms_cstate->cma_state.cstate->cs_next_cstate = CPU_ACPI_C1;
487 }
488 
489 void
490 cpupm_free_ms_cstate(cpu_t *cp)
491 {
492 	cpupm_mach_state_t *mach_state =
493 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
494 	cpupm_mach_acpi_state_t *ms_cstate = &mach_state->ms_cstate;
495 
496 	if (ms_cstate->cma_state.cstate != NULL) {
497 		kmem_free(ms_cstate->cma_state.cstate, sizeof (cma_c_state_t));
498 		ms_cstate->cma_state.cstate = NULL;
499 	}
500 }
501 
502 void
503 cpupm_state_change(cpu_t *cp, int level, int state)
504 {
505 	cpupm_mach_state_t	*mach_state =
506 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
507 	cpupm_state_ops_t	*state_ops;
508 	cpupm_state_domains_t  	*state_domain;
509 	cpuset_t		set;
510 
511 	DTRACE_PROBE2(cpupm__state__change, cpu_t *, cp, int, level);
512 
513 	if (mach_state == NULL) {
514 		return;
515 	}
516 
517 	switch (state) {
518 	case CPUPM_P_STATES:
519 		state_ops = mach_state->ms_pstate.cma_ops;
520 		state_domain = mach_state->ms_pstate.cma_domain;
521 		break;
522 	case CPUPM_T_STATES:
523 		state_ops = mach_state->ms_tstate.cma_ops;
524 		state_domain = mach_state->ms_tstate.cma_domain;
525 		break;
526 	default:
527 		break;
528 	}
529 
530 	switch (state_domain->pm_type) {
531 	case CPU_ACPI_SW_ANY:
532 		/*
533 		 * A request on any CPU in the domain transitions the domain
534 		 */
535 		CPUSET_ONLY(set, cp->cpu_id);
536 		state_ops->cpus_change(set, level);
537 		break;
538 	case CPU_ACPI_SW_ALL:
539 		/*
540 		 * All CPUs in the domain must request the transition
541 		 */
542 	case CPU_ACPI_HW_ALL:
543 		/*
544 		 * P/T-state transitions are coordinated by the hardware
545 		 * For now, request the transition on all CPUs in the domain,
546 		 * but looking ahead we can probably be smarter about this.
547 		 */
548 		mutex_enter(&state_domain->pm_lock);
549 		state_ops->cpus_change(state_domain->pm_cpus, level);
550 		mutex_exit(&state_domain->pm_lock);
551 		break;
552 	default:
553 		cmn_err(CE_WARN, "Unknown domain coordination type: %d",
554 		    state_domain->pm_type);
555 	}
556 }
557 
558 /*
559  * CPU PM interfaces exposed to the CPU power manager
560  */
561 /*ARGSUSED*/
562 id_t
563 cpupm_plat_domain_id(cpu_t *cp, cpupm_dtype_t type)
564 {
565 	cpupm_mach_state_t	*mach_state =
566 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
567 
568 	if ((mach_state == NULL) || (!cpupm_is_enabled(CPUPM_P_STATES) &&
569 	    !cpupm_is_enabled(CPUPM_C_STATES))) {
570 		return (CPUPM_NO_DOMAIN);
571 	}
572 	if (type == CPUPM_DTYPE_ACTIVE) {
573 		/*
574 		 * Return P-State domain for the specified CPU
575 		 */
576 		if (mach_state->ms_pstate.cma_domain) {
577 			return (mach_state->ms_pstate.cma_domain->pm_domain);
578 		}
579 	} else if (type == CPUPM_DTYPE_IDLE) {
580 		/*
581 		 * Return C-State domain for the specified CPU
582 		 */
583 		if (mach_state->ms_cstate.cma_domain) {
584 			return (mach_state->ms_cstate.cma_domain->pm_domain);
585 		}
586 	}
587 	return (CPUPM_NO_DOMAIN);
588 }
589 
590 /*ARGSUSED*/
591 uint_t
592 cpupm_plat_state_enumerate(cpu_t *cp, cpupm_dtype_t type,
593     cpupm_state_t *states)
594 {
595 	int	*speeds;
596 	uint_t	nspeeds, i;
597 
598 	/*
599 	 * Idle domain support unimplemented
600 	 */
601 	if (type != CPUPM_DTYPE_ACTIVE) {
602 		return (0);
603 	}
604 	nspeeds = cpupm_get_speeds(cp, &speeds);
605 
606 	/*
607 	 * If the caller passes NULL for states, just return the
608 	 * number of states.
609 	 */
610 	if (states != NULL) {
611 		for (i = 0; i < nspeeds; i++) {
612 			states[i].cps_speed = speeds[i];
613 			states[i].cps_handle = (cpupm_handle_t)i;
614 		}
615 	}
616 	cpupm_free_speeds(speeds, nspeeds);
617 	return (nspeeds);
618 }
619 
620 /*ARGSUSED*/
621 int
622 cpupm_plat_change_state(cpu_t *cp, cpupm_state_t *state)
623 {
624 	if (!cpupm_is_ready())
625 		return (-1);
626 
627 	cpupm_state_change(cp, (int)state->cps_handle, CPUPM_P_STATES);
628 
629 	return (0);
630 }
631 
632 /*ARGSUSED*/
633 /*
634  * Note: It is the responsibility of the users of
635  * cpupm_get_speeds() to free the memory allocated
636  * for speeds using cpupm_free_speeds()
637  */
638 uint_t
639 cpupm_get_speeds(cpu_t *cp, int **speeds)
640 {
641 #ifndef __xpv
642 	cpupm_mach_state_t *mach_state =
643 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
644 	return (cpu_acpi_get_speeds(mach_state->ms_acpi_handle, speeds));
645 #else
646 	return (0);
647 #endif
648 }
649 
650 /*ARGSUSED*/
651 void
652 cpupm_free_speeds(int *speeds, uint_t nspeeds)
653 {
654 #ifndef __xpv
655 	cpu_acpi_free_speeds(speeds, nspeeds);
656 #endif
657 }
658 
659 /*
660  * All CPU instances have been initialized successfully.
661  */
662 boolean_t
663 cpupm_power_ready(void)
664 {
665 	return (cpupm_is_enabled(CPUPM_P_STATES) && cpupm_is_ready());
666 }
667 
668 /*
669  * All CPU instances have been initialized successfully.
670  */
671 boolean_t
672 cpupm_throttle_ready(void)
673 {
674 	return (cpupm_is_enabled(CPUPM_T_STATES) && cpupm_is_ready());
675 }
676 
677 /*
678  * All CPU instances have been initialized successfully.
679  */
680 boolean_t
681 cpupm_cstate_ready(void)
682 {
683 	return (cpupm_is_enabled(CPUPM_C_STATES) && cpupm_is_ready());
684 }
685 
686 void
687 cpupm_notify_handler(ACPI_HANDLE obj, UINT32 val, void *ctx)
688 {
689 	cpu_t *cp = ctx;
690 	cpupm_mach_state_t *mach_state =
691 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
692 	cpupm_notification_t *entry;
693 
694 	mutex_enter(&mach_state->ms_lock);
695 	for (entry =  mach_state->ms_handlers; entry != NULL;
696 	    entry = entry->nq_next) {
697 		entry->nq_handler(obj, val, entry->nq_ctx);
698 	}
699 	mutex_exit(&mach_state->ms_lock);
700 }
701 
702 /*ARGSUSED*/
703 void
704 cpupm_add_notify_handler(cpu_t *cp, CPUPM_NOTIFY_HANDLER handler, void *ctx)
705 {
706 #ifndef __xpv
707 	cpupm_mach_state_t *mach_state =
708 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
709 	cpupm_notification_t *entry;
710 
711 	entry = kmem_zalloc(sizeof (cpupm_notification_t), KM_SLEEP);
712 	entry->nq_handler = handler;
713 	entry->nq_ctx = ctx;
714 	mutex_enter(&mach_state->ms_lock);
715 	if (mach_state->ms_handlers == NULL) {
716 		entry->nq_next = NULL;
717 		mach_state->ms_handlers = entry;
718 		cpu_acpi_install_notify_handler(mach_state->ms_acpi_handle,
719 		    cpupm_notify_handler, cp);
720 
721 	} else {
722 		entry->nq_next = mach_state->ms_handlers;
723 		mach_state->ms_handlers = entry;
724 	}
725 	mutex_exit(&mach_state->ms_lock);
726 #endif
727 }
728 
729 /*ARGSUSED*/
730 static void
731 cpupm_free_notify_handlers(cpu_t *cp)
732 {
733 #ifndef __xpv
734 	cpupm_mach_state_t *mach_state =
735 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
736 	cpupm_notification_t *entry;
737 	cpupm_notification_t *next;
738 
739 	mutex_enter(&mach_state->ms_lock);
740 	if (mach_state->ms_handlers == NULL) {
741 		mutex_exit(&mach_state->ms_lock);
742 		return;
743 	}
744 	if (mach_state->ms_acpi_handle != NULL) {
745 		cpu_acpi_remove_notify_handler(mach_state->ms_acpi_handle,
746 		    cpupm_notify_handler);
747 	}
748 	entry = mach_state->ms_handlers;
749 	while (entry != NULL) {
750 		next = entry->nq_next;
751 		kmem_free(entry, sizeof (cpupm_notification_t));
752 		entry = next;
753 	}
754 	mach_state->ms_handlers = NULL;
755 	mutex_exit(&mach_state->ms_lock);
756 #endif
757 }
758 
759 /*
760  * Get the current max speed from the ACPI _PPC object
761  */
762 /*ARGSUSED*/
763 int
764 cpupm_get_top_speed(cpu_t *cp)
765 {
766 #ifndef __xpv
767 	cpupm_mach_state_t 	*mach_state;
768 	cpu_acpi_handle_t 	handle;
769 	int 			plat_level;
770 	uint_t			nspeeds;
771 	int			max_level;
772 
773 	mach_state =
774 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
775 	handle = mach_state->ms_acpi_handle;
776 
777 	cpu_acpi_cache_ppc(handle);
778 	plat_level = CPU_ACPI_PPC(handle);
779 
780 	nspeeds = CPU_ACPI_PSTATES_COUNT(handle);
781 
782 	max_level = nspeeds - 1;
783 	if ((plat_level < 0) || (plat_level > max_level)) {
784 		cmn_err(CE_NOTE, "!cpupm_get_top_speed: CPU %d: "
785 		    "_PPC out of range %d", cp->cpu_id, plat_level);
786 		plat_level = 0;
787 	}
788 
789 	return (plat_level);
790 #else
791 	return (0);
792 #endif
793 }
794 
795 /*
796  * This notification handler is called whenever the ACPI _PPC
797  * object changes. The _PPC is a sort of governor on power levels.
798  * It sets an upper threshold on which, _PSS defined, power levels
799  * are usuable. The _PPC value is dynamic and may change as properties
800  * (i.e., thermal or AC source) of the system change.
801  */
802 
803 static void
804 cpupm_power_manage_notifications(void *ctx)
805 {
806 	cpu_t			*cp = ctx;
807 	int			top_speed;
808 
809 	top_speed = cpupm_get_top_speed(cp);
810 	cpupm_redefine_max_activepwr_state(cp, top_speed);
811 }
812 
813 /* ARGSUSED */
814 static void
815 cpupm_event_notify_handler(ACPI_HANDLE obj, UINT32 val, void *ctx)
816 {
817 #ifndef __xpv
818 	/*
819 	 * Currently, we handle _TPC,_CST and _PPC change notifications.
820 	 */
821 	if (val == CPUPM_TPC_CHANGE_NOTIFICATION) {
822 		cpupm_throttle_manage_notification(ctx);
823 	} else if (val == CPUPM_CST_CHANGE_NOTIFICATION) {
824 		cpuidle_manage_cstates(ctx);
825 	} else if (val == CPUPM_PPC_CHANGE_NOTIFICATION) {
826 		cpupm_power_manage_notifications(ctx);
827 	}
828 #endif
829 }
830 
831 /*
832  * Update cpupm cstate data each time CPU exits idle.
833  */
834 void
835 cpupm_wakeup_cstate_data(cma_c_state_t *cs_data, hrtime_t end)
836 {
837 	cs_data->cs_idle_exit = end;
838 }
839 
840 /*
841  * Determine next cstate based on cpupm data.
842  * Update cpupm cstate data each time CPU goes idle.
843  * Do as much as possible in the idle state bookkeeping function because the
844  * performance impact while idle is minimal compared to in the wakeup function
845  * when there is real work to do.
846  */
847 uint32_t
848 cpupm_next_cstate(cma_c_state_t *cs_data, cpu_acpi_cstate_t *cstates,
849     uint32_t cs_count, hrtime_t start)
850 {
851 	hrtime_t duration;
852 	hrtime_t ave_interval;
853 	hrtime_t ave_idle_time;
854 	uint32_t i;
855 
856 	duration = cs_data->cs_idle_exit - cs_data->cs_idle_enter;
857 	scalehrtime(&duration);
858 	cs_data->cs_idle += duration;
859 	cs_data->cs_idle_enter = start;
860 
861 	++cs_data->cs_cnt;
862 	if (cs_data->cs_cnt > cpupm_cs_sample_tunable) {
863 		cs_data->cs_smpl_len = start - cs_data->cs_smpl_start;
864 		scalehrtime(&cs_data->cs_smpl_len);
865 		cs_data->cs_smpl_len |= 1;	/* protect from DIV 0 */
866 		cs_data->cs_smpl_idle = cs_data->cs_idle;
867 		cs_data->cs_idle = 0;
868 		cs_data->cs_smpl_idle_pct = ((100 * cs_data->cs_smpl_idle) /
869 		    cs_data->cs_smpl_len);
870 
871 		cs_data->cs_smpl_start = start;
872 		cs_data->cs_cnt = 0;
873 
874 		/*
875 		 * Strand level C-state policy
876 		 * The cpu_acpi_cstate_t *cstates array is not required to
877 		 * have an entry for both CPU_ACPI_C2 and CPU_ACPI_C3.
878 		 * There are cs_count entries in the cstates array.
879 		 * cs_data->cs_next_cstate contains the index of the next
880 		 * C-state this CPU should enter.
881 		 */
882 		ASSERT(cstates[0].cs_type == CPU_ACPI_C1);
883 
884 		/*
885 		 * Will CPU be idle long enough to save power?
886 		 */
887 		ave_idle_time = (cs_data->cs_smpl_idle /
888 		    cpupm_cs_sample_tunable) / 1000;
889 		for (i = 1; i < cs_count; ++i) {
890 			if (ave_idle_time < (cstates[i].cs_latency *
891 			    cpupm_cs_idle_save_tunable)) {
892 				cs_count = i;
893 				DTRACE_PROBE2(cpupm__next__cstate, cpu_t *,
894 				    CPU, int, i);
895 			}
896 		}
897 
898 		/*
899 		 * Wakeup often (even when non-idle time is very short)?
900 		 * Some producer/consumer type loads fall into this category.
901 		 */
902 		ave_interval = (cs_data->cs_smpl_len / cpupm_cs_sample_tunable)
903 		    / 1000;
904 		for (i = 1; i < cs_count; ++i) {
905 			if (ave_interval <= (cstates[i].cs_latency *
906 			    cpupm_cs_idle_cost_tunable)) {
907 				cs_count = i;
908 				DTRACE_PROBE2(cpupm__next__cstate, cpu_t *,
909 				    CPU, int, (CPU_MAX_CSTATES + i));
910 			}
911 		}
912 
913 		/*
914 		 * Idle percent
915 		 */
916 		for (i = 1; i < cs_count; ++i) {
917 			switch (cstates[i].cs_type) {
918 			case CPU_ACPI_C2:
919 				if (cs_data->cs_smpl_idle_pct <
920 				    cpupm_C2_idle_pct_tunable) {
921 					cs_count = i;
922 					DTRACE_PROBE2(cpupm__next__cstate,
923 					    cpu_t *, CPU, int,
924 					    ((2 * CPU_MAX_CSTATES) + i));
925 				}
926 				break;
927 
928 			case CPU_ACPI_C3:
929 				if (cs_data->cs_smpl_idle_pct <
930 				    cpupm_C3_idle_pct_tunable) {
931 					cs_count = i;
932 					DTRACE_PROBE2(cpupm__next__cstate,
933 					    cpu_t *, CPU, int,
934 					    ((2 * CPU_MAX_CSTATES) + i));
935 				}
936 				break;
937 			}
938 		}
939 
940 		cs_data->cs_next_cstate = cs_count - 1;
941 	}
942 
943 	return (cs_data->cs_next_cstate);
944 }
945