/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2009-2010, Intel Corporation.
 * All rights reserved.
 */
/*
 * Copyright 2019 Joyent, Inc.
 */

#include <sys/x86_archext.h>
#include <sys/machsystm.h>
#include <sys/x_call.h>
#include <sys/stat.h>
#include <sys/acpi/acpi.h>
#include <sys/acpica.h>
#include <sys/cpu_acpi.h>
#include <sys/cpu_idle.h>
#include <sys/cpupm.h>
#include <sys/cpu_event.h>
#include <sys/hpet.h>
#include <sys/archsystm.h>
#include <vm/hat_i86.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>
#include <sys/callb.h>

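/*
 * Timer selectors passed to cstate_use_timer(): use the HPET as a proxy
 * for this CPU's LAPIC timer on the way into a deep C-state, and switch
 * back to the LAPIC timer on the way out.
 */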
#define	CSTATE_USING_HPET	1
#define	CSTATE_USING_LAT	2

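/*
 * Number of 10us waits cpu_idle_stop() performs between "idle stop
 * timeout" warnings while waiting for a CPU to switch back to the
 * non-deep idle function.
 */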
#define	CPU_IDLE_STOP_TIMEOUT	1000

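/*
 * Provided by the cpupm code: the adaptive idle entry point and the
 * policy routine that selects the next C-state to enter.
 */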
extern void cpu_idle_adaptive(void);
extern uint32_t cpupm_next_cstate(cma_c_state_t *cs_data,
    cpu_acpi_cstate_t *cstates, uint32_t cs_count, hrtime_t start);

static int cpu_idle_init(cpu_t *);
static void cpu_idle_fini(cpu_t *);
static void cpu_idle_stop(cpu_t *);
static boolean_t cpu_deep_idle_callb(void *arg, int code);
static boolean_t cpu_idle_cpr_callb(void *arg, int code);
static void acpi_cpu_cstate(cpu_acpi_cstate_t *cstate);

static boolean_t cstate_use_timer(hrtime_t *lapic_expire, int timer);

/*
 * Flags noting whether the platform has an always-running local APIC
 * timer (ARAT) and whether the HPET timer is used in deep C-states.
 */
static boolean_t cpu_cstate_arat = B_FALSE;
static boolean_t cpu_cstate_hpet = B_FALSE;

/*
 * Interfaces for modules implementing Intel's deep c-state.
 */
cpupm_state_ops_t cpu_idle_ops = {
	"Generic ACPI C-state Support",
	cpu_idle_init,
	cpu_idle_fini,
	NULL,
	cpu_idle_stop
};

static kmutex_t cpu_idle_callb_mutex;
static callb_id_t cpu_deep_idle_callb_id;
static callb_id_t cpu_idle_cpr_callb_id;
static uint_t cpu_idle_cfg_state;

static kmutex_t cpu_idle_mutex;

cpu_idle_kstat_t cpu_idle_kstat = {
	{ "address_space_id", KSTAT_DATA_STRING },
	{ "latency", KSTAT_DATA_UINT32 },
	{ "power", KSTAT_DATA_UINT32 },
};
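/*
 * A single static kstat data buffer is shared by every C-state kstat;
 * it is protected by cpu_idle_mutex (installed as ks_lock) and filled
 * in on demand by cpu_idle_kstat_update().
 */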

/*
 * kstat update function of the c-state info
 */
static int
cpu_idle_kstat_update(kstat_t *ksp, int flag)
{
	cpu_acpi_cstate_t *cstate = ksp->ks_private;

	if (flag == KSTAT_WRITE) {
		return (EACCES);
	}

	if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_FIXED_HARDWARE) {
		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
		    "FFixedHW");
	} else if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_SYSTEM_IO) {
		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
		    "SystemIO");
	} else {
		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
		    "Unsupported");
	}

	cpu_idle_kstat.cs_latency.value.ui32 = cstate->cs_latency;
	cpu_idle_kstat.cs_power.value.ui32 = cstate->cs_power;

	return (0);
}

/*
 * Used during configuration callbacks to manage implementation specific
 * details of the hardware timer used during Deep C-state.
 */
boolean_t
cstate_timer_callback(int code)
{
	if (cpu_cstate_arat) {
		return (B_TRUE);
	} else if (cpu_cstate_hpet) {
		return (hpet.callback(code));
	}
	return (B_FALSE);
}

/*
 * Some Local APIC Timers do not work during Deep C-states.
 * The Deep C-state idle function uses this function to ensure it is using a
 * hardware timer that works during Deep C-states. This function also
 * switches the timer back to the LAPIC Timer after Deep C-state.
 */
static boolean_t
cstate_use_timer(hrtime_t *lapic_expire, int timer)
{
	if (cpu_cstate_arat)
		return (B_TRUE);

	/*
	 * We have to return B_FALSE if no arat or hpet support
	 */
	if (!cpu_cstate_hpet)
		return (B_FALSE);

	switch (timer) {
	case CSTATE_USING_HPET:
		return (hpet.use_hpet_timer(lapic_expire));
	case CSTATE_USING_LAT:
		hpet.use_lapic_timer(*lapic_expire);
		return (B_TRUE);
	default:
		return (B_FALSE);
	}
}

/*
 * c-state wakeup function.
 * Similar to cpu_wakeup and cpu_wakeup_mwait except this function deals
 * with CPUs asleep in MWAIT, HLT, or ACPI Deep C-State.
 */
void
cstate_wakeup(cpu_t *cp, int bound)
{
	struct machcpu *mcpu = &(cp->cpu_m);
	volatile uint32_t *mcpu_mwait = mcpu->mcpu_mwait;
	cpupart_t *cpu_part;
	uint_t cpu_found;
	processorid_t cpu_sid;

	cpu_part = cp->cpu_part;
	cpu_sid = cp->cpu_seqid;
	/*
	 * Clear the halted bit for that CPU since it will be woken up
	 * in a moment.
	 */
	if (bitset_in_set(&cpu_part->cp_haltset, cpu_sid)) {
		/*
		 * Clear the halted bit for that CPU since it will be
		 * poked in a moment.
		 */
		bitset_atomic_del(&cpu_part->cp_haltset, cpu_sid);

		/*
		 * We may find the current CPU present in the halted cpuset
		 * if we're in the context of an interrupt that occurred
		 * before we had a chance to clear our bit in cpu_idle().
		 * Waking ourself is obviously unnecessary, since if
		 * we're here, we're not halted.
		 */
		if (cp != CPU) {
			/*
			 * Use correct wakeup mechanism
			 */
			if ((mcpu_mwait != NULL) &&
			    (*mcpu_mwait == MWAIT_HALTED))
				MWAIT_WAKEUP(cp);
			else
				poke_cpu(cp->cpu_id);
		}
		return;
	} else {
		/*
		 * This cpu isn't halted, but it's idle or undergoing a
		 * context switch. No need to awaken anyone else.
		 */
		if (cp->cpu_thread == cp->cpu_idle_thread ||
		    cp->cpu_disp_flags & CPU_DISP_DONTSTEAL)
			return;
	}

	/*
	 * No need to wake up other CPUs if the thread we just enqueued
	 * is bound.
	 */
	if (bound)
		return;


	/*
	 * See if there are any other halted CPUs. If there are, then
	 * select one, and awaken it.
	 * It's possible that after we find a CPU, somebody else
	 * will awaken it before we get the chance.
	 * In that case, look again.
	 */
	do {
		cpu_found = bitset_find(&cpu_part->cp_haltset);
		if (cpu_found == (uint_t)-1)
			return;

	} while (bitset_atomic_test_and_del(&cpu_part->cp_haltset,
	    cpu_found) < 0);

	/*
	 * Must use correct wakeup mechanism to avoid lost wakeup of
	 * alternate cpu.
	 */
	if (cpu_found != CPU->cpu_seqid) {
		mcpu_mwait = cpu_seq[cpu_found]->cpu_m.mcpu_mwait;
		if ((mcpu_mwait != NULL) && (*mcpu_mwait == MWAIT_HALTED))
			MWAIT_WAKEUP(cpu_seq[cpu_found]);
		else
			poke_cpu(cpu_seq[cpu_found]->cpu_id);
	}
}

/*
 * Function called by CPU idle notification framework to check whether CPU
 * has been awakened. It will be called with interrupt disabled.
 * If CPU has been awakened, call cpu_idle_exit() to notify CPU idle
 * notification framework.
 */
static void
acpi_cpu_mwait_check_wakeup(void *arg)
{
	volatile uint32_t *mcpu_mwait = (volatile uint32_t *)arg;

	ASSERT(arg != NULL);
	if (*mcpu_mwait != MWAIT_HALTED) {
		/*
		 * CPU has been awakened, notify CPU idle notification system.
		 */
		cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
	} else {
		/*
		 * Toggle interrupt flag to detect pending interrupts.
		 * If interrupt happened, do_interrupt() will notify CPU idle
		 * notification framework so no need to call cpu_idle_exit()
		 * here.
		 */
		sti();
		SMT_PAUSE();
		cli();
	}
}

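/*
 * As above, but for CPUs that idle with mcpu_mwait set to
 * MWAIT_WAKEUP_IPI (SystemIO C-states) rather than MWAIT_HALTED.
 */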
static void
acpi_cpu_mwait_ipi_check_wakeup(void *arg)
{
	volatile uint32_t *mcpu_mwait = (volatile uint32_t *)arg;

	ASSERT(arg != NULL);
	if (*mcpu_mwait != MWAIT_WAKEUP_IPI) {
		/*
		 * CPU has been awakened, notify CPU idle notification system.
		 */
		cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
	} else {
		/*
		 * Toggle interrupt flag to detect pending interrupts.
		 * If interrupt happened, do_interrupt() will notify CPU idle
		 * notification framework so no need to call cpu_idle_exit()
		 * here.
		 */
		sti();
		SMT_PAUSE();
		cli();
	}
}

/*ARGSUSED*/
static void
acpi_cpu_check_wakeup(void *arg)
{
	/*
	 * Toggle interrupt flag to detect pending interrupts.
	 * If interrupt happened, do_interrupt() will notify CPU idle
	 * notification framework so no need to call cpu_idle_exit() here.
	 */
	sti();
	SMT_PAUSE();
	cli();
}

/*
 * enter deep c-state handler
 */
static void
acpi_cpu_cstate(cpu_acpi_cstate_t *cstate)
{
	volatile uint32_t *mcpu_mwait = CPU->cpu_m.mcpu_mwait;
	cpu_t *cpup = CPU;
	processorid_t cpu_sid = cpup->cpu_seqid;
	cpupart_t *cp = cpup->cpu_part;
	hrtime_t lapic_expire;
	uint8_t type = cstate->cs_addrspace_id;
	uint32_t cs_type = cstate->cs_type;
	int hset_update = 1;
	boolean_t using_timer;
	cpu_idle_check_wakeup_t check_func = &acpi_cpu_check_wakeup;

	/*
	 * Set our mcpu_mwait here, so we can tell if anyone tries to
	 * wake us between now and when we call mwait. No other cpu will
	 * attempt to set our mcpu_mwait until we add ourself to the haltset.
	 */
	if (mcpu_mwait) {
		if (type == ACPI_ADR_SPACE_SYSTEM_IO) {
			*mcpu_mwait = MWAIT_WAKEUP_IPI;
			check_func = &acpi_cpu_mwait_ipi_check_wakeup;
		} else {
			*mcpu_mwait = MWAIT_HALTED;
			check_func = &acpi_cpu_mwait_check_wakeup;
		}
	}

	/*
	 * If this CPU is online, and there are multiple CPUs
	 * in the system, then we should note our halting
	 * by adding ourselves to the partition's halted CPU
	 * bitmap. This allows other CPUs to find/awaken us when
	 * work becomes available.
	 */
	if (cpup->cpu_flags & CPU_OFFLINE || ncpus == 1)
		hset_update = 0;

	/*
	 * Add ourselves to the partition's halted CPUs bitmask
	 * and set our HALTED flag, if necessary.
	 *
	 * When a thread becomes runnable, it is placed on the queue
	 * and then the halted cpuset is checked to determine who
	 * (if anyone) should be awakened. We therefore need to first
	 * add ourselves to the halted cpuset, and then check if there
	 * is any work available.
	 *
	 * Note that memory barriers after updating the HALTED flag
	 * are not necessary since an atomic operation (updating the bitmap)
	 * immediately follows. On x86 the atomic operation acts as a
	 * memory barrier for the update of cpu_disp_flags.
	 */
	if (hset_update) {
		cpup->cpu_disp_flags |= CPU_DISP_HALTED;
		bitset_atomic_add(&cp->cp_haltset, cpu_sid);
	}

	/*
	 * Check to make sure there's really nothing to do.
	 * Work destined for this CPU may become available after
	 * this check. We'll be notified through the clearing of our
	 * bit in the halted CPU bitmask, and a write to our mcpu_mwait.
	 *
	 * disp_anywork() checks disp_nrunnable, so we do not have to later.
	 */
	if (disp_anywork()) {
		if (hset_update) {
			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
		}
		return;
	}

	/*
	 * We're on our way to being halted.
	 *
	 * The local APIC timer can stop in ACPI C2 and deeper c-states.
	 * Try to program the HPET hardware to substitute for this CPU's
	 * LAPIC timer.
	 * cstate_use_timer() could disable the LAPIC Timer. Make sure
	 * to start the LAPIC Timer again before leaving this function.
	 *
	 * Disable interrupts here so we will awaken immediately after halting
	 * if someone tries to poke us between now and the time we actually
	 * halt.
	 */
	cli();
	using_timer = cstate_use_timer(&lapic_expire, CSTATE_USING_HPET);

	/*
	 * We check for the presence of our bit after disabling interrupts.
	 * If it's cleared, we'll return. If the bit is cleared after
	 * we check then the cstate_wakeup() will pop us out of the halted
	 * state.
	 *
	 * This means that the ordering of the bit-clearing and the wakeup
	 * in cstate_wakeup() is important.
	 * cstate_wakeup() must clear our cp_haltset bit, and then wake us
	 * (via the MWAIT store or poke_cpu()).
	 * acpi_cpu_cstate() must disable interrupts, then check for the bit.
	 */
	if (hset_update && bitset_in_set(&cp->cp_haltset, cpu_sid) == 0) {
		(void) cstate_use_timer(&lapic_expire,
		    CSTATE_USING_LAT);
		sti();
		cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
		return;
	}

	/*
	 * The check for anything locally runnable is here for performance
	 * and isn't needed for correctness. disp_nrunnable ought to be
	 * in our cache still, so it's inexpensive to check, and if there
	 * is anything runnable we won't have to wait for the poke.
	 */
	if (cpup->cpu_disp->disp_nrunnable != 0) {
		(void) cstate_use_timer(&lapic_expire,
		    CSTATE_USING_LAT);
		sti();
		if (hset_update) {
			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
		}
		return;
	}

	if (using_timer == B_FALSE) {

		(void) cstate_use_timer(&lapic_expire,
		    CSTATE_USING_LAT);
		sti();

		/*
		 * We are currently unable to program the HPET to act as this
		 * CPU's proxy LAPIC timer. This CPU cannot enter C2 or deeper
		 * because no timer is set to wake it up while its LAPIC timer
		 * stalls in deep C-States.
		 * Enter C1 instead.
		 *
		 * cstate_wakeup() will wake this CPU with an IPI which
		 * works with MWAIT.
		 */
		i86_monitor(mcpu_mwait, 0, 0);
		if ((*mcpu_mwait & ~MWAIT_WAKEUP_IPI) == MWAIT_HALTED) {
			if (cpu_idle_enter(IDLE_STATE_C1, 0,
			    check_func, (void *)mcpu_mwait) == 0) {
				if ((*mcpu_mwait & ~MWAIT_WAKEUP_IPI) ==
				    MWAIT_HALTED) {
					i86_mwait(0, 0);
				}
				cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
			}
		}

		/*
		 * We're no longer halted
		 */
		if (hset_update) {
			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
		}
		return;
	}

	if (type == ACPI_ADR_SPACE_FIXED_HARDWARE) {
		/*
		 * We're on our way to being halted.
		 * To avoid a lost wakeup, arm the monitor before checking
		 * if another cpu wrote to mcpu_mwait to wake us up.
		 */
		i86_monitor(mcpu_mwait, 0, 0);
		if (*mcpu_mwait == MWAIT_HALTED) {
			if (cpu_idle_enter((uint_t)cs_type, 0,
			    check_func, (void *)mcpu_mwait) == 0) {
				if (*mcpu_mwait == MWAIT_HALTED) {
					i86_mwait(cstate->cs_address, 1);
				}
				cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
			}
		}
	} else if (type == ACPI_ADR_SPACE_SYSTEM_IO) {
		uint32_t value;
		ACPI_TABLE_FADT *gbl_FADT;

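		/*
		 * I/O port based C-state entry: reading the C-state register
		 * (cs_address) initiates the transition, and the follow-up
		 * dummy read of the ACPI PM timer is the conventional way to
		 * wait for the chipset to complete it.
		 */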
		if (*mcpu_mwait == MWAIT_WAKEUP_IPI) {
			if (cpu_idle_enter((uint_t)cs_type, 0,
			    check_func, (void *)mcpu_mwait) == 0) {
				if (*mcpu_mwait == MWAIT_WAKEUP_IPI) {
					/*
					 * The following calls will cause us to
					 * halt which will cause the store
					 * buffer to be repartitioned,
					 * potentially exposing us to the Intel
					 * CPU vulnerability MDS. As such, we
					 * need to explicitly call that here.
					 * The other idle methods in this
					 * function do this automatically as
					 * part of the implementation of
					 * i86_mwait().
					 */
					x86_md_clear();
					(void) cpu_acpi_read_port(
					    cstate->cs_address, &value, 8);
					acpica_get_global_FADT(&gbl_FADT);
					(void) cpu_acpi_read_port(
					    gbl_FADT->XPmTimerBlock.Address,
					    &value, 32);
				}
				cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
			}
		}
	}

	/*
	 * The LAPIC timer may have stopped in deep c-state.
	 * Reprogram this CPU's LAPIC here before enabling interrupts.
	 */
	(void) cstate_use_timer(&lapic_expire, CSTATE_USING_LAT);
	sti();

	/*
	 * We're no longer halted
	 */
	if (hset_update) {
		cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
		bitset_atomic_del(&cp->cp_haltset, cpu_sid);
	}
}

/*
 * Idle the current CPU, using deep C-states when supported.
 */
void
cpu_acpi_idle(void)
{
	cpu_t *cp = CPU;
	cpu_acpi_handle_t handle;
	cma_c_state_t *cs_data;
	cpu_acpi_cstate_t *cstates;
	hrtime_t start, end;
	int cpu_max_cstates;
	uint32_t cs_indx;
	uint16_t cs_type;

	cpupm_mach_state_t *mach_state =
	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
	handle = mach_state->ms_acpi_handle;
	ASSERT(CPU_ACPI_CSTATES(handle) != NULL);

	cs_data = mach_state->ms_cstate.cma_state.cstate;
	cstates = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
	ASSERT(cstates != NULL);
	cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
	if (cpu_max_cstates > CPU_MAX_CSTATES)
		cpu_max_cstates = CPU_MAX_CSTATES;
	if (cpu_max_cstates == 1) {	/* no ACPI c-state data */
		(*non_deep_idle_cpu)();
		return;
	}

	start = gethrtime_unscaled();

	cs_indx = cpupm_next_cstate(cs_data, cstates, cpu_max_cstates, start);

	cs_type = cstates[cs_indx].cs_type;

	switch (cs_type) {
	default:
		/* FALLTHROUGH */
	case CPU_ACPI_C1:
		(*non_deep_idle_cpu)();
		break;

	case CPU_ACPI_C2:
		acpi_cpu_cstate(&cstates[cs_indx]);
		break;

	case CPU_ACPI_C3:
		/*
		 * All supported Intel processors maintain cache coherency
		 * during C3. Currently when entering C3 processors flush
		 * core caches to higher level shared cache. The shared cache
		 * maintains state and supports probes during C3.
		 * Consequently there is no need to handle cache coherency
		 * and Bus Master activity here with the cache flush, BM_RLD
		 * bit, BM_STS bit, nor PM2_CNT.ARB_DIS mechanisms described
		 * in section 8.1.4 of the ACPI Specification 4.0.
		 */
		acpi_cpu_cstate(&cstates[cs_indx]);
		break;
	}

	end = gethrtime_unscaled();

	/*
	 * Update statistics
	 */
	cpupm_wakeup_cstate_data(cs_data, end);
}

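/*
 * Deep C-states are usable only if the CPU either has an always-running
 * APIC timer (ARAT) or the HPET can fully proxy for the LAPIC timer that
 * stalls in deep C-states; record which mechanism is available.
 */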
boolean_t
cpu_deep_cstates_supported(void)
{
	extern int idle_cpu_no_deep_c;

	if (idle_cpu_no_deep_c)
		return (B_FALSE);

	if (!cpuid_deep_cstates_supported())
		return (B_FALSE);

	if (cpuid_arat_supported()) {
		cpu_cstate_arat = B_TRUE;
		return (B_TRUE);
	}

	if ((hpet.supported == HPET_FULL_SUPPORT) &&
	    hpet.install_proxy()) {
		cpu_cstate_hpet = B_TRUE;
		return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Validate that this processor supports deep cstate and if so,
 * get the c-state data from ACPI and cache it.
 */
static int
cpu_idle_init(cpu_t *cp)
{
	cpupm_mach_state_t *mach_state =
	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
	cpu_acpi_cstate_t *cstate;
	char name[KSTAT_STRLEN];
	int cpu_max_cstates, i;
	int ret;

	/*
	 * Cache the C-state specific ACPI data.
	 */
	if ((ret = cpu_acpi_cache_cstate_data(handle)) != 0) {
		if (ret < 0)
			cmn_err(CE_NOTE,
			    "!Support for CPU deep idle states is being "
			    "disabled due to errors parsing ACPI C-state "
			    "objects exported by BIOS.");
		cpu_idle_fini(cp);
		return (-1);
	}

	cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);

	cpu_max_cstates = cpu_acpi_get_max_cstates(handle);

	for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
		(void) snprintf(name, KSTAT_STRLEN - 1, "c%d", cstate->cs_type);
		/*
		 * Allocate, initialize and install cstate kstat
		 */
		cstate->cs_ksp = kstat_create("cstate", cp->cpu_id,
		    name, "misc",
		    KSTAT_TYPE_NAMED,
		    sizeof (cpu_idle_kstat) / sizeof (kstat_named_t),
		    KSTAT_FLAG_VIRTUAL);

		if (cstate->cs_ksp == NULL) {
			cmn_err(CE_NOTE, "kstat_create(c_state) fail");
		} else {
			cstate->cs_ksp->ks_data = &cpu_idle_kstat;
			cstate->cs_ksp->ks_lock = &cpu_idle_mutex;
			cstate->cs_ksp->ks_update = cpu_idle_kstat_update;
			cstate->cs_ksp->ks_data_size += MAXNAMELEN;
			cstate->cs_ksp->ks_private = cstate;
			kstat_install(cstate->cs_ksp);
		}
		cstate++;
	}

	cpupm_alloc_domains(cp, CPUPM_C_STATES);
	cpupm_alloc_ms_cstate(cp);

	if (cpu_deep_cstates_supported()) {
		uint32_t value;

		mutex_enter(&cpu_idle_callb_mutex);
		if (cpu_deep_idle_callb_id == (callb_id_t)0)
			cpu_deep_idle_callb_id = callb_add(&cpu_deep_idle_callb,
			    (void *)NULL, CB_CL_CPU_DEEP_IDLE, "cpu_deep_idle");
		if (cpu_idle_cpr_callb_id == (callb_id_t)0)
			cpu_idle_cpr_callb_id = callb_add(&cpu_idle_cpr_callb,
			    (void *)NULL, CB_CL_CPR_PM, "cpu_idle_cpr");
		mutex_exit(&cpu_idle_callb_mutex);


		/*
		 * All supported CPUs (Nehalem and later) will remain in C3
		 * during Bus Master activity.
		 * Clear ACPI_BITREG_BUS_MASTER_RLD here, if it is not
		 * already 0, before enabling deeper C-states.
		 */
		cpu_acpi_get_register(ACPI_BITREG_BUS_MASTER_RLD, &value);
		if (value & 1)
			cpu_acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
	}

	return (0);
}

/*
 * Free resources allocated by cpu_idle_init().
 */
static void
cpu_idle_fini(cpu_t *cp)
{
	cpupm_mach_state_t *mach_state =
	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
	cpu_acpi_cstate_t *cstate;
	uint_t cpu_max_cstates, i;

	/*
	 * idle cpu points back to the generic one
	 */
	idle_cpu = cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
	disp_enq_thread = non_deep_idle_disp_enq_thread;

	cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
	if (cstate) {
		cpu_max_cstates = cpu_acpi_get_max_cstates(handle);

		for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
			if (cstate->cs_ksp != NULL)
				kstat_delete(cstate->cs_ksp);
			cstate++;
		}
	}

	cpupm_free_ms_cstate(cp);
	cpupm_free_domains(&cpupm_cstate_domains);
	cpu_acpi_free_cstate_data(handle);

	mutex_enter(&cpu_idle_callb_mutex);
	if (cpu_deep_idle_callb_id != (callb_id_t)0) {
		(void) callb_delete(cpu_deep_idle_callb_id);
		cpu_deep_idle_callb_id = (callb_id_t)0;
	}
	if (cpu_idle_cpr_callb_id != (callb_id_t)0) {
		(void) callb_delete(cpu_idle_cpr_callb_id);
		cpu_idle_cpr_callb_id = (callb_id_t)0;
	}
	mutex_exit(&cpu_idle_callb_mutex);
}

/*
 * This function is introduced here to solve a race condition
 * between the master and the slave touching the c-state data structures.
 * After the slave calls this idle function to switch to the non-deep
 * idle function, the master can go on to reclaim the resources.
 */
static void
cpu_idle_stop_sync(void)
{
	/* switch to the non deep idle function */
	CPU->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
}

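/*
 * Tear down C-state support on a CPU: force it onto the non-deep idle
 * function, then free its kstats, C-state domain linkage and cached
 * ACPI C-state data.
 */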
static void
cpu_idle_stop(cpu_t *cp)
{
	cpupm_mach_state_t *mach_state =
	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
	cpu_acpi_cstate_t *cstate;
	uint_t cpu_max_cstates, i = 0;

	mutex_enter(&cpu_idle_callb_mutex);
	if (idle_cpu == cpu_idle_adaptive) {
		/*
		 * Make the slave call the synchronous idle function.
		 */
		cp->cpu_m.mcpu_idle_cpu = cpu_idle_stop_sync;
		poke_cpu(cp->cpu_id);

		/*
		 * Wait until the slave switches to the non-deep idle
		 * function, so that the master can safely go on to reclaim
		 * the resources.
		 */
		while (cp->cpu_m.mcpu_idle_cpu != non_deep_idle_cpu) {
			drv_usecwait(10);
			if ((++i % CPU_IDLE_STOP_TIMEOUT) == 0)
				cmn_err(CE_NOTE, "!cpu_idle_stop: the slave"
				    " idle stop timeout");
		}
	}
	mutex_exit(&cpu_idle_callb_mutex);

	cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
	if (cstate) {
		cpu_max_cstates = cpu_acpi_get_max_cstates(handle);

		for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
			if (cstate->cs_ksp != NULL)
				kstat_delete(cstate->cs_ksp);
			cstate++;
		}
	}
	cpupm_free_ms_cstate(cp);
	cpupm_remove_domains(cp, CPUPM_C_STATES, &cpupm_cstate_domains);
	cpu_acpi_free_cstate_data(handle);
}

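/*
 * Callback for the CPU deep-idle callback class: enable or disable the
 * deep-idle dispatcher hook and idle function according to the power
 * management request code.
 */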
/*ARGSUSED*/
static boolean_t
cpu_deep_idle_callb(void *arg, int code)
{
	boolean_t rslt = B_TRUE;

	mutex_enter(&cpu_idle_callb_mutex);
	switch (code) {
	case PM_DEFAULT_CPU_DEEP_IDLE:
		/*
		 * Default policy is the same as enable.
		 */
		/*FALLTHROUGH*/
	case PM_ENABLE_CPU_DEEP_IDLE:
		if ((cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG) == 0)
			break;

		if (cstate_timer_callback(PM_ENABLE_CPU_DEEP_IDLE)) {
			disp_enq_thread = cstate_wakeup;
			idle_cpu = cpu_idle_adaptive;
			cpu_idle_cfg_state &= ~CPU_IDLE_DEEP_CFG;
		} else {
			rslt = B_FALSE;
		}
		break;

	case PM_DISABLE_CPU_DEEP_IDLE:
		if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
			break;

		idle_cpu = non_deep_idle_cpu;
		if (cstate_timer_callback(PM_DISABLE_CPU_DEEP_IDLE)) {
			disp_enq_thread = non_deep_idle_disp_enq_thread;
			cpu_idle_cfg_state |= CPU_IDLE_DEEP_CFG;
		}
		break;

	default:
		cmn_err(CE_NOTE, "!cpu deep_idle_callb: invalid code %d\n",
		    code);
		break;
	}
	mutex_exit(&cpu_idle_callb_mutex);
	return (rslt);
}

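/*
 * CPR (suspend/resume) callback: fall back to the non-deep idle function
 * around a checkpoint and restore the deep-idle hooks on resume, unless
 * deep idle was disabled by the user.
 */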
/*ARGSUSED*/
static boolean_t
cpu_idle_cpr_callb(void *arg, int code)
{
	boolean_t rslt = B_TRUE;

	mutex_enter(&cpu_idle_callb_mutex);
	switch (code) {
	case CB_CODE_CPR_RESUME:
		if (cstate_timer_callback(CB_CODE_CPR_RESUME)) {
			/*
			 * Do not enable dispatcher hooks if disabled by user.
			 */
			if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
				break;

			disp_enq_thread = cstate_wakeup;
			idle_cpu = cpu_idle_adaptive;
		} else {
			rslt = B_FALSE;
		}
		break;

	case CB_CODE_CPR_CHKPT:
		idle_cpu = non_deep_idle_cpu;
		disp_enq_thread = non_deep_idle_disp_enq_thread;
		(void) cstate_timer_callback(CB_CODE_CPR_CHKPT);
		break;

	default:
		cmn_err(CE_NOTE, "!cpudvr cpr_callb: invalid code %d\n", code);
		break;
	}
	mutex_exit(&cpu_idle_callb_mutex);
	return (rslt);
}

/*
 * handle _CST notification
 */
void
cpuidle_cstate_instance(cpu_t *cp)
{
#ifndef __xpv
	cpupm_mach_state_t *mach_state =
	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
	cpu_acpi_handle_t handle;
	struct machcpu *mcpu;
	cpuset_t dom_cpu_set;
	kmutex_t *pm_lock;
	int result = 0;
	processorid_t cpu_id;

	if (mach_state == NULL) {
		return;
	}

	ASSERT(mach_state->ms_cstate.cma_domain != NULL);
	dom_cpu_set = mach_state->ms_cstate.cma_domain->pm_cpus;
	pm_lock = &mach_state->ms_cstate.cma_domain->pm_lock;

	/*
	 * Do this for all the CPUs in the domain
	 */
	mutex_enter(pm_lock);
	do {
		CPUSET_FIND(dom_cpu_set, cpu_id);
		if (cpu_id == CPUSET_NOTINSET)
			break;

		ASSERT(cpu_id >= 0 && cpu_id < NCPU);
		cp = cpu[cpu_id];
		mach_state = (cpupm_mach_state_t *)
		    cp->cpu_m.mcpu_pm_mach_state;
		if (!(mach_state->ms_caps & CPUPM_C_STATES)) {
			mutex_exit(pm_lock);
			return;
		}
		handle = mach_state->ms_acpi_handle;
		ASSERT(handle != NULL);

		/*
		 * re-evaluate cstate object
		 */
		if (cpu_acpi_cache_cstate_data(handle) != 0) {
			cmn_err(CE_WARN, "Cannot re-evaluate the cpu c-state"
			    " object Instance: %d", cpu_id);
		}
		mcpu = &(cp->cpu_m);
		mcpu->max_cstates = cpu_acpi_get_max_cstates(handle);
		if (mcpu->max_cstates > CPU_ACPI_C1) {
			(void) cstate_timer_callback(
			    CST_EVENT_MULTIPLE_CSTATES);
			disp_enq_thread = cstate_wakeup;
			cp->cpu_m.mcpu_idle_cpu = cpu_acpi_idle;
		} else if (mcpu->max_cstates == CPU_ACPI_C1) {
			disp_enq_thread = non_deep_idle_disp_enq_thread;
			cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
			(void) cstate_timer_callback(CST_EVENT_ONE_CSTATE);
		}

		CPUSET_ATOMIC_XDEL(dom_cpu_set, cpu_id, result);
	} while (result < 0);
	mutex_exit(pm_lock);
#endif
}

/*
 * Handle a change in the number or type of the available processor
 * power states.
 */
void
cpuidle_manage_cstates(void *ctx)
{
	cpu_t *cp = ctx;
	cpupm_mach_state_t *mach_state =
	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
	boolean_t is_ready;

	if (mach_state == NULL) {
		return;
	}

	/*
	 * We currently refuse to power manage if the CPU is not ready to
	 * take cross calls (cross calls fail silently if CPU is not ready
	 * for it).
	 *
	 * Additionally, for x86 platforms we cannot power manage an instance,
	 * until it has been initialized.
	 */
	is_ready = (cp->cpu_flags & CPU_READY) && cpupm_cstate_ready(cp);
	if (!is_ready)
		return;

	cpuidle_cstate_instance(cp);
}