1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25 /*
26 * Copyright (c) 2009, Intel Corporation.
27 * All rights reserved.
28 */
29
30 #include <sys/cpu_pm.h>
31 #include <sys/x86_archext.h>
32 #include <sys/sdt.h>
33 #include <sys/spl.h>
34 #include <sys/machsystm.h>
35 #include <sys/archsystm.h>
36 #include <sys/hpet.h>
37 #include <sys/acpi/acpi.h>
38 #include <sys/acpica.h>
39 #include <sys/cpupm.h>
40 #include <sys/cpu_idle.h>
41 #include <sys/cpu_acpi.h>
42 #include <sys/cpupm_throttle.h>
43 #include <sys/dtrace.h>
44 #include <sys/note.h>
45
/*
 * This callback is used to build the PPM CPU domains once
 * a CPU device has been started. The callback is initialized
 * by the PPM driver to point to a routine that will build the
 * domains.
 */
void (*cpupm_ppm_alloc_pstate_domains)(cpu_t *);

/*
 * This callback is used to remove a CPU from the PPM CPU domains
 * when the cpu driver is detached. The callback is initialized
 * by the PPM driver to point to a routine that will remove the CPU
 * from the domains.
 */
void (*cpupm_ppm_free_pstate_domains)(cpu_t *);

/*
 * This callback is used to redefine the topspeed for a CPU device.
 * Since all CPUs in a domain should have identical properties, this
 * callback is initialized by the PPM driver to point to a routine
 * that will redefine the topspeed for all devices in a CPU domain.
 * This callback is exercised whenever an ACPI _PPC change notification
 * is received by the CPU driver.
 */
void (*cpupm_redefine_topspeed)(void *);

/*
 * This callback is used by the PPM driver to call into the CPU driver
 * to set a new topspeed for a CPU.
 */
void (*cpupm_set_topspeed_callb)(void *, int);

/*
 * This callback is used by the PPM driver to call into the CPU driver
 * to find a CPU's current topspeed (i.e., its current ACPI _PPC value).
 */
int (*cpupm_get_topspeed_callb)(void *);
83
/*
 * Forward declarations of the file-local notification helpers.
 */
static void cpupm_event_notify_handler(ACPI_HANDLE, UINT32, void *);
static void cpupm_free_notify_handlers(cpu_t *);
static void cpupm_power_manage_notifications(void *);

/*
 * Until proven otherwise, all power states are manageable.
 * Bits are cleared by cpupm_disable() and tested by cpupm_is_enabled().
 */
static uint32_t cpupm_enabled = CPUPM_ALL_STATES;

/*
 * Heads of the singly linked (pm_next) lists of P-, T- and C-state
 * power domains. Entries are linked in by cpupm_alloc_domains() and
 * the whole list is released by cpupm_free_domains().
 */
cpupm_state_domains_t *cpupm_pstate_domains = NULL;
cpupm_state_domains_t *cpupm_tstate_domains = NULL;
cpupm_state_domains_t *cpupm_cstate_domains = NULL;
96
/*
 * c-state tunables
 *
 * cpupm_cs_sample_interval is the length of time we wait before
 * recalculating c-state statistics.  When a CPU goes idle it checks
 * to see if it has been longer than cpupm_cs_sample_interval since it last
 * calculated which C-state to go to.
 *
 * cpupm_cs_idle_cost_tunable is the ratio of time CPU spends executing + idle
 * divided by time spent in the idle state transitions.
 * A value of 10 means the CPU will not spend more than 1/10 of its time
 * in idle latency.  The worst case performance will be 90% of non Deep C-state
 * kernel.
 *
 * cpupm_cs_idle_save_tunable is how long we must stay in a deeper C-state
 * before it is worth going there.  Expressed as a multiple of latency.
 *
 * cpupm_C2_idle_pct_tunable and cpupm_C3_idle_pct_tunable are the sampled
 * idle percentages below which cpupm_next_cstate() will not select a C2 or
 * C3 entry, respectively.
 */
uint32_t cpupm_cs_sample_interval = 100*1000*1000;	/* 100 milliseconds */
uint32_t cpupm_cs_idle_cost_tunable = 10;	/* work time / latency cost */
uint32_t cpupm_cs_idle_save_tunable = 2;	/* idle power savings */
uint16_t cpupm_C2_idle_pct_tunable = 70;	/* min idle % to allow C2 */
uint16_t cpupm_C3_idle_pct_tunable = 80;	/* min idle % to allow C3 */
119
120 #ifndef __xpv
121 extern boolean_t cpupm_intel_init(cpu_t *);
122 extern boolean_t cpupm_amd_init(cpu_t *);
123
124 typedef struct cpupm_vendor {
125 boolean_t (*cpuv_init)(cpu_t *);
126 } cpupm_vendor_t;
127
128 /*
129 * Table of supported vendors.
130 */
131 static cpupm_vendor_t cpupm_vendors[] = {
132 cpupm_intel_init,
133 cpupm_amd_init,
134 NULL
135 };
136 #endif
137
138 /*
139 * Initialize the machine.
140 * See if a module exists for managing power for this CPU.
141 */
142 /*ARGSUSED*/
143 void
cpupm_init(cpu_t * cp)144 cpupm_init(cpu_t *cp)
145 {
146 #ifndef __xpv
147 cpupm_vendor_t *vendors;
148 cpupm_mach_state_t *mach_state;
149 struct machcpu *mcpu = &(cp->cpu_m);
150 static boolean_t first = B_TRUE;
151 int *speeds;
152 uint_t nspeeds;
153 int ret;
154
155 mach_state = cp->cpu_m.mcpu_pm_mach_state =
156 kmem_zalloc(sizeof (cpupm_mach_state_t), KM_SLEEP);
157 mach_state->ms_caps = CPUPM_NO_STATES;
158 mutex_init(&mach_state->ms_lock, NULL, MUTEX_DRIVER, NULL);
159
160 mach_state->ms_acpi_handle = cpu_acpi_init(cp);
161 if (mach_state->ms_acpi_handle == NULL) {
162 cpupm_fini(cp);
163 cmn_err(CE_WARN, "!cpupm_init: processor %d: "
164 "unable to get ACPI handle", cp->cpu_id);
165 cmn_err(CE_NOTE, "!CPU power management will not function.");
166 CPUPM_DISABLE();
167 first = B_FALSE;
168 return;
169 }
170
171 /*
172 * Loop through the CPU management module table and see if
173 * any of the modules implement CPU power management
174 * for this CPU.
175 */
176 for (vendors = cpupm_vendors; vendors->cpuv_init != NULL; vendors++) {
177 if (vendors->cpuv_init(cp))
178 break;
179 }
180
181 /*
182 * Nope, we can't power manage this CPU.
183 */
184 if (vendors == NULL) {
185 cpupm_fini(cp);
186 CPUPM_DISABLE();
187 first = B_FALSE;
188 return;
189 }
190
191 /*
192 * If P-state support exists for this system, then initialize it.
193 */
194 if (mach_state->ms_pstate.cma_ops != NULL) {
195 ret = mach_state->ms_pstate.cma_ops->cpus_init(cp);
196 if (ret != 0) {
197 mach_state->ms_pstate.cma_ops = NULL;
198 cpupm_disable(CPUPM_P_STATES);
199 } else {
200 nspeeds = cpupm_get_speeds(cp, &speeds);
201 if (nspeeds == 0) {
202 cmn_err(CE_NOTE, "!cpupm_init: processor %d:"
203 " no speeds to manage", cp->cpu_id);
204 } else {
205 cpupm_set_supp_freqs(cp, speeds, nspeeds);
206 cpupm_free_speeds(speeds, nspeeds);
207 mach_state->ms_caps |= CPUPM_P_STATES;
208 }
209 }
210 } else {
211 cpupm_disable(CPUPM_P_STATES);
212 }
213
214 if (mach_state->ms_tstate.cma_ops != NULL) {
215 ret = mach_state->ms_tstate.cma_ops->cpus_init(cp);
216 if (ret != 0) {
217 mach_state->ms_tstate.cma_ops = NULL;
218 cpupm_disable(CPUPM_T_STATES);
219 } else {
220 mach_state->ms_caps |= CPUPM_T_STATES;
221 }
222 } else {
223 cpupm_disable(CPUPM_T_STATES);
224 }
225
226 /*
227 * If C-states support exists for this system, then initialize it.
228 */
229 if (mach_state->ms_cstate.cma_ops != NULL) {
230 ret = mach_state->ms_cstate.cma_ops->cpus_init(cp);
231 if (ret != 0) {
232 mach_state->ms_cstate.cma_ops = NULL;
233 mcpu->max_cstates = CPU_ACPI_C1;
234 cpupm_disable(CPUPM_C_STATES);
235 idle_cpu = non_deep_idle_cpu;
236 disp_enq_thread = non_deep_idle_disp_enq_thread;
237 } else if (cpu_deep_cstates_supported()) {
238 mcpu->max_cstates = cpu_acpi_get_max_cstates(
239 mach_state->ms_acpi_handle);
240 if (mcpu->max_cstates > CPU_ACPI_C1) {
241 (void) cstate_timer_callback(
242 CST_EVENT_MULTIPLE_CSTATES);
243 cp->cpu_m.mcpu_idle_cpu = cpu_acpi_idle;
244 mcpu->mcpu_idle_type = CPU_ACPI_C1;
245 disp_enq_thread = cstate_wakeup;
246 } else {
247 (void) cstate_timer_callback(
248 CST_EVENT_ONE_CSTATE);
249 }
250 mach_state->ms_caps |= CPUPM_C_STATES;
251 } else {
252 mcpu->max_cstates = CPU_ACPI_C1;
253 idle_cpu = non_deep_idle_cpu;
254 disp_enq_thread = non_deep_idle_disp_enq_thread;
255 }
256 } else {
257 cpupm_disable(CPUPM_C_STATES);
258 }
259
260
261 if (mach_state->ms_caps == CPUPM_NO_STATES) {
262 cpupm_fini(cp);
263 CPUPM_DISABLE();
264 first = B_FALSE;
265 return;
266 }
267
268 if ((mach_state->ms_caps & CPUPM_T_STATES) ||
269 (mach_state->ms_caps & CPUPM_P_STATES) ||
270 (mach_state->ms_caps & CPUPM_C_STATES)) {
271 if (first) {
272 acpica_write_cpupm_capabilities(
273 mach_state->ms_caps & CPUPM_P_STATES,
274 mach_state->ms_caps & CPUPM_C_STATES);
275 }
276 if (mach_state->ms_caps & CPUPM_T_STATES) {
277 cpupm_throttle_manage_notification(cp);
278 }
279 if (mach_state->ms_caps & CPUPM_C_STATES) {
280 cpuidle_manage_cstates(cp);
281 }
282 if (mach_state->ms_caps & CPUPM_P_STATES) {
283 cpupm_power_manage_notifications(cp);
284 }
285 cpupm_add_notify_handler(cp, cpupm_event_notify_handler, cp);
286 }
287 first = B_FALSE;
288 #endif
289 }
290
291 /*
292 * Free any resources allocated during cpupm initialization or cpupm start.
293 */
294 /*ARGSUSED*/
295 void
cpupm_free(cpu_t * cp,boolean_t cpupm_stop)296 cpupm_free(cpu_t *cp, boolean_t cpupm_stop)
297 {
298 #ifndef __xpv
299 cpupm_mach_state_t *mach_state =
300 (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
301
302 if (mach_state == NULL)
303 return;
304
305 if (mach_state->ms_pstate.cma_ops != NULL) {
306 if (cpupm_stop)
307 mach_state->ms_pstate.cma_ops->cpus_stop(cp);
308 else
309 mach_state->ms_pstate.cma_ops->cpus_fini(cp);
310 mach_state->ms_pstate.cma_ops = NULL;
311 }
312
313 if (mach_state->ms_tstate.cma_ops != NULL) {
314 if (cpupm_stop)
315 mach_state->ms_tstate.cma_ops->cpus_stop(cp);
316 else
317 mach_state->ms_tstate.cma_ops->cpus_fini(cp);
318 mach_state->ms_tstate.cma_ops = NULL;
319 }
320
321 if (mach_state->ms_cstate.cma_ops != NULL) {
322 if (cpupm_stop)
323 mach_state->ms_cstate.cma_ops->cpus_stop(cp);
324 else
325 mach_state->ms_cstate.cma_ops->cpus_fini(cp);
326
327 mach_state->ms_cstate.cma_ops = NULL;
328 }
329
330 cpupm_free_notify_handlers(cp);
331
332 if (mach_state->ms_acpi_handle != NULL) {
333 cpu_acpi_fini(mach_state->ms_acpi_handle);
334 mach_state->ms_acpi_handle = NULL;
335 }
336
337 mutex_destroy(&mach_state->ms_lock);
338 kmem_free(mach_state, sizeof (cpupm_mach_state_t));
339 cp->cpu_m.mcpu_pm_mach_state = NULL;
340 #endif
341 }
342
/*
 * Tear down the cpupm state of a CPU via cpupm_free() with
 * cpupm_stop == B_FALSE.
 */
void
cpupm_fini(cpu_t *cp)
{
	/*
	 * call (*cpus_fini)() ops to release the cpupm resource
	 * in the P/C/T-state driver
	 */
	cpupm_free(cp, B_FALSE);
}
352
/*
 * Start power management for a CPU; this simply runs cpupm_init().
 */
void
cpupm_start(cpu_t *cp)
{
	cpupm_init(cp);
}
358
/*
 * Stop power management for a CPU via cpupm_free() with
 * cpupm_stop == B_TRUE.
 */
void
cpupm_stop(cpu_t *cp)
{
	/*
	 * call (*cpus_stop)() ops to reclaim the cpupm resource
	 * in the P/C/T-state driver
	 */
	cpupm_free(cp, B_TRUE);
}
368
369 /*
370 * If A CPU has started and at least one power state is manageable,
371 * then the CPU is ready for power management.
372 */
373 boolean_t
cpupm_is_ready(cpu_t * cp)374 cpupm_is_ready(cpu_t *cp)
375 {
376 #ifndef __xpv
377 cpupm_mach_state_t *mach_state =
378 (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
379 uint32_t cpupm_caps = mach_state->ms_caps;
380
381 if (cpupm_enabled == CPUPM_NO_STATES)
382 return (B_FALSE);
383
384 if ((cpupm_caps & CPUPM_T_STATES) ||
385 (cpupm_caps & CPUPM_P_STATES) ||
386 (cpupm_caps & CPUPM_C_STATES))
387
388 return (B_TRUE);
389 return (B_FALSE);
390 #else
391 _NOTE(ARGUNUSED(cp));
392 return (B_FALSE);
393 #endif
394 }
395
/*
 * Return whether every bit set in state is still set in cpupm_enabled,
 * i.e. none of the requested state classes has been disabled.
 */
boolean_t
cpupm_is_enabled(uint32_t state)
{
	return ((cpupm_enabled & state) == state);
}
401
402 /*
403 * By default, all states are enabled.
404 */
405 void
cpupm_disable(uint32_t state)406 cpupm_disable(uint32_t state)
407 {
408
409 if (state & CPUPM_P_STATES) {
410 cpupm_free_domains(&cpupm_pstate_domains);
411 }
412 if (state & CPUPM_T_STATES) {
413 cpupm_free_domains(&cpupm_tstate_domains);
414 }
415 if (state & CPUPM_C_STATES) {
416 cpupm_free_domains(&cpupm_cstate_domains);
417 }
418 cpupm_enabled &= ~state;
419 }
420
421 /*
422 * Allocate power domains for C,P and T States
423 */
424 void
cpupm_alloc_domains(cpu_t * cp,int state)425 cpupm_alloc_domains(cpu_t *cp, int state)
426 {
427 cpupm_mach_state_t *mach_state =
428 (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
429 cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
430 cpupm_state_domains_t **dom_ptr;
431 cpupm_state_domains_t *dptr;
432 cpupm_state_domains_t **mach_dom_state_ptr;
433 uint32_t domain;
434 uint32_t type;
435
436 switch (state) {
437 case CPUPM_P_STATES:
438 if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_PSD_CACHED)) {
439 domain = CPU_ACPI_PSD(handle).sd_domain;
440 type = CPU_ACPI_PSD(handle).sd_type;
441 } else {
442 if (MUTEX_HELD(&cpu_lock)) {
443 domain = cpuid_get_chipid(cp);
444 } else {
445 mutex_enter(&cpu_lock);
446 domain = cpuid_get_chipid(cp);
447 mutex_exit(&cpu_lock);
448 }
449 type = CPU_ACPI_HW_ALL;
450 }
451 dom_ptr = &cpupm_pstate_domains;
452 mach_dom_state_ptr = &mach_state->ms_pstate.cma_domain;
453 break;
454 case CPUPM_T_STATES:
455 if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_TSD_CACHED)) {
456 domain = CPU_ACPI_TSD(handle).sd_domain;
457 type = CPU_ACPI_TSD(handle).sd_type;
458 } else {
459 if (MUTEX_HELD(&cpu_lock)) {
460 domain = cpuid_get_chipid(cp);
461 } else {
462 mutex_enter(&cpu_lock);
463 domain = cpuid_get_chipid(cp);
464 mutex_exit(&cpu_lock);
465 }
466 type = CPU_ACPI_HW_ALL;
467 }
468 dom_ptr = &cpupm_tstate_domains;
469 mach_dom_state_ptr = &mach_state->ms_tstate.cma_domain;
470 break;
471 case CPUPM_C_STATES:
472 if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_CSD_CACHED)) {
473 domain = CPU_ACPI_CSD(handle).sd_domain;
474 type = CPU_ACPI_CSD(handle).sd_type;
475 } else {
476 if (MUTEX_HELD(&cpu_lock)) {
477 domain = cpuid_get_coreid(cp);
478 } else {
479 mutex_enter(&cpu_lock);
480 domain = cpuid_get_coreid(cp);
481 mutex_exit(&cpu_lock);
482 }
483 type = CPU_ACPI_HW_ALL;
484 }
485 dom_ptr = &cpupm_cstate_domains;
486 mach_dom_state_ptr = &mach_state->ms_cstate.cma_domain;
487 break;
488 default:
489 return;
490 }
491
492 for (dptr = *dom_ptr; dptr != NULL; dptr = dptr->pm_next) {
493 if (dptr->pm_domain == domain)
494 break;
495 }
496
497 /* new domain is created and linked at the head */
498 if (dptr == NULL) {
499 dptr = kmem_zalloc(sizeof (cpupm_state_domains_t), KM_SLEEP);
500 dptr->pm_domain = domain;
501 dptr->pm_type = type;
502 dptr->pm_next = *dom_ptr;
503 mutex_init(&dptr->pm_lock, NULL, MUTEX_SPIN,
504 (void *)ipltospl(DISP_LEVEL));
505 CPUSET_ZERO(dptr->pm_cpus);
506 *dom_ptr = dptr;
507 }
508 CPUSET_ADD(dptr->pm_cpus, cp->cpu_id);
509 *mach_dom_state_ptr = dptr;
510 }
511
512 /*
513 * Free C, P or T state power domains
514 */
515 void
cpupm_free_domains(cpupm_state_domains_t ** dom_ptr)516 cpupm_free_domains(cpupm_state_domains_t **dom_ptr)
517 {
518 cpupm_state_domains_t *this_domain, *next_domain;
519
520 this_domain = *dom_ptr;
521 while (this_domain != NULL) {
522 next_domain = this_domain->pm_next;
523 mutex_destroy(&this_domain->pm_lock);
524 kmem_free((void *)this_domain,
525 sizeof (cpupm_state_domains_t));
526 this_domain = next_domain;
527 }
528 *dom_ptr = NULL;
529 }
530
531 /*
532 * Remove CPU from C, P or T state power domains
533 */
534 void
cpupm_remove_domains(cpu_t * cp,int state,cpupm_state_domains_t ** dom_ptr)535 cpupm_remove_domains(cpu_t *cp, int state, cpupm_state_domains_t **dom_ptr)
536 {
537 cpupm_mach_state_t *mach_state =
538 (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
539 cpupm_state_domains_t *dptr;
540 uint32_t pm_domain;
541
542 ASSERT(mach_state);
543
544 switch (state) {
545 case CPUPM_P_STATES:
546 pm_domain = mach_state->ms_pstate.cma_domain->pm_domain;
547 break;
548 case CPUPM_T_STATES:
549 pm_domain = mach_state->ms_tstate.cma_domain->pm_domain;
550 break;
551 case CPUPM_C_STATES:
552 pm_domain = mach_state->ms_cstate.cma_domain->pm_domain;
553 break;
554 default:
555 return;
556 }
557
558 /*
559 * Find the CPU C, P or T state power domain
560 */
561 for (dptr = *dom_ptr; dptr != NULL; dptr = dptr->pm_next) {
562 if (dptr->pm_domain == pm_domain)
563 break;
564 }
565
566 /*
567 * return if no matched domain found
568 */
569 if (dptr == NULL)
570 return;
571
572 /*
573 * We found one matched power domain, remove CPU from its cpuset.
574 * pm_lock(spin lock) here to avoid the race conditions between
575 * event change notification and cpu remove.
576 */
577 mutex_enter(&dptr->pm_lock);
578 if (CPU_IN_SET(dptr->pm_cpus, cp->cpu_id))
579 CPUSET_DEL(dptr->pm_cpus, cp->cpu_id);
580 mutex_exit(&dptr->pm_lock);
581 }
582
583 void
cpupm_alloc_ms_cstate(cpu_t * cp)584 cpupm_alloc_ms_cstate(cpu_t *cp)
585 {
586 cpupm_mach_state_t *mach_state;
587 cpupm_mach_acpi_state_t *ms_cstate;
588
589 mach_state = (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
590 ms_cstate = &mach_state->ms_cstate;
591 ASSERT(ms_cstate->cma_state.cstate == NULL);
592 ms_cstate->cma_state.cstate = kmem_zalloc(sizeof (cma_c_state_t),
593 KM_SLEEP);
594 ms_cstate->cma_state.cstate->cs_next_cstate = CPU_ACPI_C1;
595 }
596
597 void
cpupm_free_ms_cstate(cpu_t * cp)598 cpupm_free_ms_cstate(cpu_t *cp)
599 {
600 cpupm_mach_state_t *mach_state =
601 (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
602 cpupm_mach_acpi_state_t *ms_cstate = &mach_state->ms_cstate;
603
604 if (ms_cstate->cma_state.cstate != NULL) {
605 kmem_free(ms_cstate->cma_state.cstate, sizeof (cma_c_state_t));
606 ms_cstate->cma_state.cstate = NULL;
607 }
608 }
609
610 void
cpupm_state_change(cpu_t * cp,int level,int state)611 cpupm_state_change(cpu_t *cp, int level, int state)
612 {
613 cpupm_mach_state_t *mach_state =
614 (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
615 cpupm_state_ops_t *state_ops;
616 cpupm_state_domains_t *state_domain;
617 cpuset_t set;
618
619 DTRACE_PROBE2(cpupm__state__change, cpu_t *, cp, int, level);
620
621 if (mach_state == NULL) {
622 return;
623 }
624
625 switch (state) {
626 case CPUPM_P_STATES:
627 state_ops = mach_state->ms_pstate.cma_ops;
628 state_domain = mach_state->ms_pstate.cma_domain;
629 break;
630 case CPUPM_T_STATES:
631 state_ops = mach_state->ms_tstate.cma_ops;
632 state_domain = mach_state->ms_tstate.cma_domain;
633 break;
634 default:
635 return;
636 }
637
638 switch (state_domain->pm_type) {
639 case CPU_ACPI_SW_ANY:
640 /*
641 * A request on any CPU in the domain transitions the domain
642 */
643 CPUSET_ONLY(set, cp->cpu_id);
644 state_ops->cpus_change(set, level);
645 break;
646 case CPU_ACPI_SW_ALL:
647 /*
648 * All CPUs in the domain must request the transition
649 */
650 case CPU_ACPI_HW_ALL:
651 /*
652 * P/T-state transitions are coordinated by the hardware
653 * For now, request the transition on all CPUs in the domain,
654 * but looking ahead we can probably be smarter about this.
655 */
656 mutex_enter(&state_domain->pm_lock);
657 state_ops->cpus_change(state_domain->pm_cpus, level);
658 mutex_exit(&state_domain->pm_lock);
659 break;
660 default:
661 cmn_err(CE_NOTE, "Unknown domain coordination type: %d",
662 state_domain->pm_type);
663 }
664 }
665
666 /*
667 * CPU PM interfaces exposed to the CPU power manager
668 */
669 /*ARGSUSED*/
670 id_t
cpupm_plat_domain_id(cpu_t * cp,cpupm_dtype_t type)671 cpupm_plat_domain_id(cpu_t *cp, cpupm_dtype_t type)
672 {
673 cpupm_mach_state_t *mach_state =
674 (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
675
676 if ((mach_state == NULL) || (!cpupm_is_enabled(CPUPM_P_STATES) &&
677 !cpupm_is_enabled(CPUPM_C_STATES))) {
678 return (CPUPM_NO_DOMAIN);
679 }
680 if (type == CPUPM_DTYPE_ACTIVE) {
681 /*
682 * Return P-State domain for the specified CPU
683 */
684 if (mach_state->ms_pstate.cma_domain) {
685 return (mach_state->ms_pstate.cma_domain->pm_domain);
686 }
687 } else if (type == CPUPM_DTYPE_IDLE) {
688 /*
689 * Return C-State domain for the specified CPU
690 */
691 if (mach_state->ms_cstate.cma_domain) {
692 return (mach_state->ms_cstate.cma_domain->pm_domain);
693 }
694 }
695 return (CPUPM_NO_DOMAIN);
696 }
697
698 uint_t
cpupm_plat_state_enumerate(cpu_t * cp,cpupm_dtype_t type,cpupm_state_t * states)699 cpupm_plat_state_enumerate(cpu_t *cp, cpupm_dtype_t type,
700 cpupm_state_t *states)
701 {
702 int *speeds = NULL;
703 uint_t nspeeds, i;
704
705 /*
706 * Idle domain support unimplemented
707 */
708 if (type != CPUPM_DTYPE_ACTIVE) {
709 return (0);
710 }
711 nspeeds = cpupm_get_speeds(cp, &speeds);
712
713 /*
714 * If the caller passes NULL for states, just return the
715 * number of states.
716 */
717 if (states != NULL) {
718 for (i = 0; i < nspeeds; i++) {
719 states[i].cps_speed = speeds[i];
720 states[i].cps_handle = (cpupm_handle_t)i;
721 }
722 }
723 cpupm_free_speeds(speeds, nspeeds);
724 return (nspeeds);
725 }
726
727 /*ARGSUSED*/
728 int
cpupm_plat_change_state(cpu_t * cp,cpupm_state_t * state)729 cpupm_plat_change_state(cpu_t *cp, cpupm_state_t *state)
730 {
731 if (!cpupm_is_ready(cp))
732 return (-1);
733
734 cpupm_state_change(cp, (int)state->cps_handle, CPUPM_P_STATES);
735
736 return (0);
737 }
738
739 /*ARGSUSED*/
740 /*
741 * Note: It is the responsibility of the users of
742 * cpupm_get_speeds() to free the memory allocated
743 * for speeds using cpupm_free_speeds()
744 */
745 uint_t
cpupm_get_speeds(cpu_t * cp,int ** speeds)746 cpupm_get_speeds(cpu_t *cp, int **speeds)
747 {
748 #ifndef __xpv
749 cpupm_mach_state_t *mach_state =
750 (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
751 return (cpu_acpi_get_speeds(mach_state->ms_acpi_handle, speeds));
752 #else
753 return (0);
754 #endif
755 }
756
/*
 * Release a speed table obtained from cpupm_get_speeds() by handing it to
 * cpu_acpi_free_speeds(). Does nothing on __xpv builds.
 */
/*ARGSUSED*/
void
cpupm_free_speeds(int *speeds, uint_t nspeeds)
{
#ifndef __xpv
	cpu_acpi_free_speeds(speeds, nspeeds);
#endif
}
765
/*
 * Return B_TRUE when P-states have not been disabled and the CPU is
 * ready for power management (see cpupm_is_ready()).
 */
boolean_t
cpupm_power_ready(cpu_t *cp)
{
	return (cpupm_is_enabled(CPUPM_P_STATES) && cpupm_is_ready(cp));
}
774
/*
 * Return B_TRUE when T-states have not been disabled and the CPU is
 * ready for power management (see cpupm_is_ready()).
 */
boolean_t
cpupm_throttle_ready(cpu_t *cp)
{
	return (cpupm_is_enabled(CPUPM_T_STATES) && cpupm_is_ready(cp));
}
783
/*
 * Return B_TRUE when C-states have not been disabled and the CPU is
 * ready for power management (see cpupm_is_ready()).
 */
boolean_t
cpupm_cstate_ready(cpu_t *cp)
{
	return (cpupm_is_enabled(CPUPM_C_STATES) && cpupm_is_ready(cp));
}
792
793 void
cpupm_notify_handler(ACPI_HANDLE obj,UINT32 val,void * ctx)794 cpupm_notify_handler(ACPI_HANDLE obj, UINT32 val, void *ctx)
795 {
796 cpu_t *cp = ctx;
797 cpupm_mach_state_t *mach_state =
798 (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
799 cpupm_notification_t *entry;
800
801 mutex_enter(&mach_state->ms_lock);
802 for (entry = mach_state->ms_handlers; entry != NULL;
803 entry = entry->nq_next) {
804 entry->nq_handler(obj, val, entry->nq_ctx);
805 }
806 mutex_exit(&mach_state->ms_lock);
807 }
808
809 /*ARGSUSED*/
810 void
cpupm_add_notify_handler(cpu_t * cp,CPUPM_NOTIFY_HANDLER handler,void * ctx)811 cpupm_add_notify_handler(cpu_t *cp, CPUPM_NOTIFY_HANDLER handler, void *ctx)
812 {
813 #ifndef __xpv
814 cpupm_mach_state_t *mach_state =
815 (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
816 cpupm_notification_t *entry;
817
818 entry = kmem_zalloc(sizeof (cpupm_notification_t), KM_SLEEP);
819 entry->nq_handler = handler;
820 entry->nq_ctx = ctx;
821 mutex_enter(&mach_state->ms_lock);
822 if (mach_state->ms_handlers == NULL) {
823 entry->nq_next = NULL;
824 mach_state->ms_handlers = entry;
825 cpu_acpi_install_notify_handler(mach_state->ms_acpi_handle,
826 cpupm_notify_handler, cp);
827
828 } else {
829 entry->nq_next = mach_state->ms_handlers;
830 mach_state->ms_handlers = entry;
831 }
832 mutex_exit(&mach_state->ms_lock);
833 #endif
834 }
835
836 /*ARGSUSED*/
837 static void
cpupm_free_notify_handlers(cpu_t * cp)838 cpupm_free_notify_handlers(cpu_t *cp)
839 {
840 #ifndef __xpv
841 cpupm_mach_state_t *mach_state =
842 (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
843 cpupm_notification_t *entry;
844 cpupm_notification_t *next;
845
846 mutex_enter(&mach_state->ms_lock);
847 if (mach_state->ms_handlers == NULL) {
848 mutex_exit(&mach_state->ms_lock);
849 return;
850 }
851 if (mach_state->ms_acpi_handle != NULL) {
852 cpu_acpi_remove_notify_handler(mach_state->ms_acpi_handle,
853 cpupm_notify_handler);
854 }
855 entry = mach_state->ms_handlers;
856 while (entry != NULL) {
857 next = entry->nq_next;
858 kmem_free(entry, sizeof (cpupm_notification_t));
859 entry = next;
860 }
861 mach_state->ms_handlers = NULL;
862 mutex_exit(&mach_state->ms_lock);
863 #endif
864 }
865
866 /*
867 * Get the current max speed from the ACPI _PPC object
868 */
869 /*ARGSUSED*/
870 int
cpupm_get_top_speed(cpu_t * cp)871 cpupm_get_top_speed(cpu_t *cp)
872 {
873 #ifndef __xpv
874 cpupm_mach_state_t *mach_state;
875 cpu_acpi_handle_t handle;
876 int plat_level;
877 uint_t nspeeds;
878 int max_level;
879
880 mach_state =
881 (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
882 handle = mach_state->ms_acpi_handle;
883
884 cpu_acpi_cache_ppc(handle);
885 plat_level = CPU_ACPI_PPC(handle);
886
887 nspeeds = CPU_ACPI_PSTATES_COUNT(handle);
888
889 max_level = nspeeds - 1;
890 if ((plat_level < 0) || (plat_level > max_level)) {
891 cmn_err(CE_NOTE, "!cpupm_get_top_speed: CPU %d: "
892 "_PPC out of range %d", cp->cpu_id, plat_level);
893 plat_level = 0;
894 }
895
896 return (plat_level);
897 #else
898 return (0);
899 #endif
900 }
901
902 /*
903 * This notification handler is called whenever the ACPI _PPC
904 * object changes. The _PPC is a sort of governor on power levels.
905 * It sets an upper threshold on which, _PSS defined, power levels
906 * are usuable. The _PPC value is dynamic and may change as properties
907 * (i.e., thermal or AC source) of the system change.
908 */
909
910 static void
cpupm_power_manage_notifications(void * ctx)911 cpupm_power_manage_notifications(void *ctx)
912 {
913 cpu_t *cp = ctx;
914 int top_speed;
915
916 top_speed = cpupm_get_top_speed(cp);
917 cpupm_redefine_max_activepwr_state(cp, top_speed);
918 }
919
920 /* ARGSUSED */
921 static void
cpupm_event_notify_handler(ACPI_HANDLE obj,UINT32 val,void * ctx)922 cpupm_event_notify_handler(ACPI_HANDLE obj, UINT32 val, void *ctx)
923 {
924 #ifndef __xpv
925
926 cpu_t *cp = ctx;
927 cpupm_mach_state_t *mach_state =
928 (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
929
930 if (mach_state == NULL)
931 return;
932
933 /*
934 * Currently, we handle _TPC,_CST and _PPC change notifications.
935 */
936 if (val == CPUPM_TPC_CHANGE_NOTIFICATION &&
937 mach_state->ms_caps & CPUPM_T_STATES) {
938 cpupm_throttle_manage_notification(ctx);
939 } else if (val == CPUPM_CST_CHANGE_NOTIFICATION &&
940 mach_state->ms_caps & CPUPM_C_STATES) {
941 cpuidle_manage_cstates(ctx);
942 } else if (val == CPUPM_PPC_CHANGE_NOTIFICATION &&
943 mach_state->ms_caps & CPUPM_P_STATES) {
944 cpupm_power_manage_notifications(ctx);
945 }
946 #endif
947 }
948
/*
 * Update cpupm cstate data each time CPU exits idle.
 *
 * Records end as the idle exit time; cpupm_next_cstate() later uses it to
 * compute how long the idle period lasted.
 */
void
cpupm_wakeup_cstate_data(cma_c_state_t *cs_data, hrtime_t end)
{
	cs_data->cs_idle_exit = end;
}
957
/*
 * Determine next cstate based on cpupm data.
 * Update cpupm cstate data each time CPU goes idle.
 * Do as much as possible in the idle state bookkeeping function because the
 * performance impact while idle is minimal compared to in the wakeup function
 * when there is real work to do.
 *
 * cs_data	per-CPU C-state statistics, updated in place
 * cstates	array of available C-states; entry 0 must be C1
 * cs_count	number of entries in cstates
 * start	time at which the CPU is going idle now
 *
 * Returns the index into cstates of the C-state to enter. The index is only
 * recomputed once more than cpupm_cs_sample_interval has elapsed since the
 * current sample began; otherwise the previously chosen index is returned.
 */
uint32_t
cpupm_next_cstate(cma_c_state_t *cs_data, cpu_acpi_cstate_t *cstates,
    uint32_t cs_count, hrtime_t start)
{
	hrtime_t duration;
	hrtime_t ave_interval;
	hrtime_t ave_idle_time;
	uint32_t i, smpl_cnt;

	/*
	 * Account for the idle period that just ended (previous idle enter
	 * to previous idle exit), then note the start of the new one.
	 */
	duration = cs_data->cs_idle_exit - cs_data->cs_idle_enter;
	scalehrtime(&duration);
	cs_data->cs_idle += duration;
	cs_data->cs_idle_enter = start;

	/* one more idle entry within the current sample */
	smpl_cnt = ++cs_data->cs_cnt;
	cs_data->cs_smpl_len = start - cs_data->cs_smpl_start;
	scalehrtime(&cs_data->cs_smpl_len);
	if (cs_data->cs_smpl_len > cpupm_cs_sample_interval) {
		/* close the sample: latch its idle total and percentage */
		cs_data->cs_smpl_idle = cs_data->cs_idle;
		cs_data->cs_idle = 0;
		cs_data->cs_smpl_idle_pct = ((100 * cs_data->cs_smpl_idle) /
		    cs_data->cs_smpl_len);

		cs_data->cs_smpl_start = start;
		cs_data->cs_cnt = 0;

		/*
		 * Strand level C-state policy
		 * The cpu_acpi_cstate_t *cstates array is not required to
		 * have an entry for both CPU_ACPI_C2 and CPU_ACPI_C3.
		 * There are cs_count entries in the cstates array.
		 * cs_data->cs_next_cstate contains the index of the next
		 * C-state this CPU should enter.
		 *
		 * Each policy loop below lowers cs_count to the index of the
		 * first entry that fails its test. That both excludes the
		 * entry (and all deeper ones) from the later loops and ends
		 * the current loop, since i < cs_count no longer holds.
		 */
		ASSERT(cstates[0].cs_type == CPU_ACPI_C1);

		/*
		 * Will CPU be idle long enough to save power?
		 * NOTE(review): the division by 1000 presumably converts
		 * to the units of cs_latency -- confirm.
		 */
		ave_idle_time = (cs_data->cs_smpl_idle / smpl_cnt) / 1000;
		for (i = 1; i < cs_count; ++i) {
			if (ave_idle_time < (cstates[i].cs_latency *
			    cpupm_cs_idle_save_tunable)) {
				cs_count = i;
				DTRACE_PROBE2(cpupm__next__cstate, cpu_t *,
				    CPU, int, i);
			}
		}

		/*
		 * Wakeup often (even when non-idle time is very short)?
		 * Some producer/consumer type loads fall into this category.
		 */
		ave_interval = (cs_data->cs_smpl_len / smpl_cnt) / 1000;
		for (i = 1; i < cs_count; ++i) {
			if (ave_interval <= (cstates[i].cs_latency *
			    cpupm_cs_idle_cost_tunable)) {
				cs_count = i;
				/* probe value offset marks the policy */
				DTRACE_PROBE2(cpupm__next__cstate, cpu_t *,
				    CPU, int, (CPU_MAX_CSTATES + i));
			}
		}

		/*
		 * Idle percent
		 * C2 and C3 entries are only allowed when the sampled idle
		 * percentage reaches the respective tunable.
		 */
		for (i = 1; i < cs_count; ++i) {
			switch (cstates[i].cs_type) {
			case CPU_ACPI_C2:
				if (cs_data->cs_smpl_idle_pct <
				    cpupm_C2_idle_pct_tunable) {
					cs_count = i;
					DTRACE_PROBE2(cpupm__next__cstate,
					    cpu_t *, CPU, int,
					    ((2 * CPU_MAX_CSTATES) + i));
				}
				break;

			case CPU_ACPI_C3:
				if (cs_data->cs_smpl_idle_pct <
				    cpupm_C3_idle_pct_tunable) {
					cs_count = i;
					DTRACE_PROBE2(cpupm__next__cstate,
					    cpu_t *, CPU, int,
					    ((2 * CPU_MAX_CSTATES) + i));
				}
				break;
			}
		}

		/* deepest entry that survived all three policies */
		cs_data->cs_next_cstate = cs_count - 1;
	}

	return (cs_data->cs_next_cstate);
}
1060