/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2009-2010, Intel Corporation.
 * All rights reserved.
 */
/*
 * Copyright 2019 Joyent, Inc.
 * Copyright 2025 Oxide Computer Company
 */

#include <sys/x86_archext.h>
#include <sys/machsystm.h>
#include <sys/x_call.h>
#include <sys/stat.h>
#include <sys/acpi/acpi.h>
#include <sys/acpica.h>
#include <sys/cpu_acpi.h>
#include <sys/cpu_idle.h>
#include <sys/cpupm.h>
#include <sys/cpu_event.h>
#include <sys/hpet.h>
#include <sys/archsystm.h>
#include <vm/hat_i86.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>
#include <sys/callb.h>

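/*
 * Values for the "timer" argument to cstate_use_timer(): switch this CPU's
 * wakeup source to the HPET proxy before halting, or back to the local APIC
 * timer after waking.
 */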
#define	CSTATE_USING_HPET		1
#define	CSTATE_USING_LAT		2

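/*
 * Number of 10-microsecond waits cpu_idle_stop() performs between warnings
 * that a CPU has not yet switched away from the deep idle function.
 */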
#define	CPU_IDLE_STOP_TIMEOUT		1000

extern void cpu_idle_adaptive(void);
extern uint32_t cpupm_next_cstate(cma_c_state_t *cs_data,
    cpu_acpi_cstate_t *cstates, uint32_t cs_count, hrtime_t start);

static int cpu_idle_init(cpu_t *);
static void cpu_idle_fini(cpu_t *);
static void cpu_idle_stop(cpu_t *);
static boolean_t cpu_deep_idle_callb(void *arg, int code);
static boolean_t cpu_idle_cpr_callb(void *arg, int code);
static void acpi_cpu_cstate(cpu_acpi_cstate_t *cstate);

static boolean_t cstate_use_timer(hrtime_t *lapic_expire, int timer);

/*
 * cpu_cstate_arat: the local APIC timer is always running (ARAT), even in
 * deep C-states.
 * cpu_cstate_hpet: the HPET is used as a proxy wakeup timer in deep C-states.
 */
static boolean_t cpu_cstate_arat = B_FALSE;
static boolean_t cpu_cstate_hpet = B_FALSE;

/*
 * Interfaces for modules implementing Intel's deep c-state.
 */
cpupm_state_ops_t cpu_idle_ops = {
	"Generic ACPI C-state Support",
	cpu_idle_init,
	cpu_idle_fini,
	NULL,
	cpu_idle_stop
};

static kmutex_t cpu_idle_callb_mutex;
static callb_id_t cpu_deep_idle_callb_id;
static callb_id_t cpu_idle_cpr_callb_id;
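/*
 * Configuration flags for deep C-state support; CPU_IDLE_DEEP_CFG is set
 * while deep idle has been disabled via the PM framework.
 */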
static uint_t cpu_idle_cfg_state;

static kmutex_t cpu_idle_mutex;

cpu_idle_kstat_t cpu_idle_kstat = {
	{ "address_space_id", KSTAT_DATA_STRING },
	{ "latency", KSTAT_DATA_UINT32 },
	{ "power", KSTAT_DATA_UINT32 },
};

/*
 * kstat update function of the c-state info
 */
static int
cpu_idle_kstat_update(kstat_t *ksp, int flag)
{
	cpu_acpi_cstate_t *cstate = ksp->ks_private;

	if (flag == KSTAT_WRITE) {
		return (EACCES);
	}

	if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_FIXED_HARDWARE) {
		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
		    "FFixedHW");
	} else if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_SYSTEM_IO) {
		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
		    "SystemIO");
	} else {
		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
		    "Unsupported");
	}

	cpu_idle_kstat.cs_latency.value.ui32 = cstate->cs_latency;
	cpu_idle_kstat.cs_power.value.ui32 = cstate->cs_power;

	return (0);
}

/*
 * Used during configuration callbacks to manage implementation specific
 * details of the hardware timer used during Deep C-state.
 */
boolean_t
cstate_timer_callback(int code)
{
	if (cpu_cstate_arat) {
		return (B_TRUE);
	} else if (cpu_cstate_hpet) {
		return (hpet.callback(code));
	}
	return (B_FALSE);
}

/*
 * Some Local APIC Timers do not work during Deep C-states.
 * The Deep C-state idle function uses this function to ensure it is using a
 * hardware timer that works during Deep C-states. This function also
 * switches the timer back to the LAPIC Timer after Deep C-state.
 */
static boolean_t
cstate_use_timer(hrtime_t *lapic_expire, int timer)
{
	if (cpu_cstate_arat)
		return (B_TRUE);

	/*
	 * We have to return B_FALSE if no arat or hpet support
	 */
	if (!cpu_cstate_hpet)
		return (B_FALSE);

	switch (timer) {
	case CSTATE_USING_HPET:
		return (hpet.use_hpet_timer(lapic_expire));
	case CSTATE_USING_LAT:
		hpet.use_lapic_timer(*lapic_expire);
		return (B_TRUE);
	default:
		return (B_FALSE);
	}
}

/*
 * c-state wakeup function.
 * Similar to cpu_wakeup and cpu_wakeup_mwait except this function deals
 * with CPUs asleep in MWAIT, HLT, or ACPI Deep C-State.
 */
void
cstate_wakeup(cpu_t *cp, int bound)
{
	struct machcpu *mcpu = &(cp->cpu_m);
	volatile uint32_t *mcpu_mwait = mcpu->mcpu_mwait;
	cpupart_t *cpu_part;
	uint_t cpu_found;
	processorid_t cpu_sid;

	cpu_part = cp->cpu_part;
	cpu_sid = cp->cpu_seqid;
	/*
	 * Clear the halted bit for that CPU since it will be woken up
	 * in a moment.
	 */
	if (bitset_in_set(&cpu_part->cp_haltset, cpu_sid)) {
		/*
		 * Clear the halted bit for that CPU since it will be
		 * poked in a moment.
		 */
		bitset_atomic_del(&cpu_part->cp_haltset, cpu_sid);

		/*
		 * We may find the current CPU present in the halted cpuset
		 * if we're in the context of an interrupt that occurred
		 * before we had a chance to clear our bit in cpu_idle().
		 * Waking ourself is obviously unnecessary, since if
		 * we're here, we're not halted.
		 */
		if (cp != CPU) {
			/*
			 * Use correct wakeup mechanism
			 */
			if ((mcpu_mwait != NULL) &&
			    (*mcpu_mwait == MWAIT_HALTED))
				MWAIT_WAKEUP(cp);
			else
				poke_cpu(cp->cpu_id);
		}
		return;
	} else {
		/*
		 * This cpu isn't halted, but it's idle or undergoing a
		 * context switch. No need to awaken anyone else.
		 */
		if (cp->cpu_thread == cp->cpu_idle_thread ||
		    cp->cpu_disp_flags & CPU_DISP_DONTSTEAL)
			return;
	}

	/*
	 * No need to wake up other CPUs if the thread we just enqueued
	 * is bound.
	 */
	if (bound)
		return;


	/*
	 * See if there's any other halted CPUs. If there are, then
	 * select one, and awaken it.
	 * It's possible that after we find a CPU, somebody else
	 * will awaken it before we get the chance.
	 * In that case, look again.
	 */
	do {
		cpu_found = bitset_find(&cpu_part->cp_haltset);
		if (cpu_found == (uint_t)-1)
			return;

	} while (bitset_atomic_test_and_del(&cpu_part->cp_haltset,
	    cpu_found) < 0);

	/*
	 * Must use correct wakeup mechanism to avoid lost wakeup of
	 * alternate cpu.
	 */
	if (cpu_found != CPU->cpu_seqid) {
		mcpu_mwait = cpu_seq[cpu_found]->cpu_m.mcpu_mwait;
		if ((mcpu_mwait != NULL) && (*mcpu_mwait == MWAIT_HALTED))
			MWAIT_WAKEUP(cpu_seq[cpu_found]);
		else
			poke_cpu(cpu_seq[cpu_found]->cpu_id);
	}
}

/*
 * Function called by the CPU idle notification framework to check whether
 * the CPU has been awakened. It will be called with interrupts disabled.
 * If the CPU has been awakened, call cpu_idle_exit() to notify the CPU idle
 * notification framework.
 */
static void
acpi_cpu_mwait_check_wakeup(void *arg)
{
	volatile uint32_t *mcpu_mwait = (volatile uint32_t *)arg;

	ASSERT(arg != NULL);
	if (*mcpu_mwait != MWAIT_HALTED) {
		/*
		 * CPU has been awakened, notify CPU idle notification system.
		 */
		cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
	} else {
		/*
		 * Toggle interrupt flag to detect pending interrupts.
		 * If an interrupt happened, do_interrupt() will notify the
		 * CPU idle notification framework, so there is no need to
		 * call cpu_idle_exit() here.
		 */
		sti();
		SMT_PAUSE();
		cli();
	}
}

static void
acpi_cpu_mwait_ipi_check_wakeup(void *arg)
{
	volatile uint32_t *mcpu_mwait = (volatile uint32_t *)arg;

	ASSERT(arg != NULL);
	if (*mcpu_mwait != MWAIT_WAKEUP_IPI) {
		/*
		 * CPU has been awakened, notify CPU idle notification system.
		 */
		cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
	} else {
		/*
		 * Toggle interrupt flag to detect pending interrupts.
		 * If an interrupt happened, do_interrupt() will notify the
		 * CPU idle notification framework, so there is no need to
		 * call cpu_idle_exit() here.
		 */
		sti();
		SMT_PAUSE();
		cli();
	}
}

/*ARGSUSED*/
static void
acpi_cpu_check_wakeup(void *arg)
{
	/*
	 * Toggle interrupt flag to detect pending interrupts.
	 * If an interrupt happened, do_interrupt() will notify the CPU idle
	 * notification framework, so there is no need to call
	 * cpu_idle_exit() here.
	 */
	sti();
	SMT_PAUSE();
	cli();
}

/*
 * Idle the current CPU via ACPI-defined System I/O read to an ACPI-specified
 * address.
 */
static void
acpi_io_idle(uint32_t address)
{
	uint32_t value;
	ACPI_TABLE_FADT *gbl_FADT;

	/*
	 * Do we need to work around an ancient chipset bug in early ACPI
	 * implementations that would result in a late STPCLK# assertion?
	 *
	 * Must be true when running on systems where the ACPI-indicated I/O
	 * read to enter low-power states may resolve before actually stopping
	 * the processor that initiated a low-power transition. On such
	 * systems, it is possible the processor would proceed past the idle
	 * point and *then* be stopped.
	 *
	 * An early workaround that has been carried forward is to read the
	 * ACPI PM Timer after requesting a low-power transition. The timer
	 * read will take long enough that we are certain the processor is
	 * safe to be stopped.
	 *
	 * From some investigation, this was only ever necessary on older
	 * Intel chipsets. Additionally, the timer read can take upwards of a
	 * thousand CPU clocks, so for systems that work correctly, it's just
	 * a tarpit for the CPU as it is woken back up.
	 */
	boolean_t need_stpclk_workaround =
	    cpuid_getvendor(CPU) == X86_VENDOR_Intel;

	/*
	 * The following call will cause us to halt which will cause the store
	 * buffer to be repartitioned, potentially exposing us to the Intel CPU
	 * vulnerability MDS. As such, we need to explicitly call that here.
	 * The other idle methods do this automatically as part of the
	 * implementation of i86_mwait().
	 */
	x86_md_clear();
	(void) cpu_acpi_read_port(address, &value, 8);
	if (need_stpclk_workaround) {
		acpica_get_global_FADT(&gbl_FADT);
		(void) cpu_acpi_read_port(
		    gbl_FADT->XPmTimerBlock.Address,
		    &value, 32);
	}
}

/*
 * enter deep c-state handler
 */
static void
acpi_cpu_cstate(cpu_acpi_cstate_t *cstate)
{
	volatile uint32_t *mcpu_mwait = CPU->cpu_m.mcpu_mwait;
	uint32_t mwait_idle_state;
	cpu_t *cpup = CPU;
	processorid_t cpu_sid = cpup->cpu_seqid;
	cpupart_t *cp = cpup->cpu_part;
	hrtime_t lapic_expire;
	uint8_t type = cstate->cs_addrspace_id;
	uint32_t cs_type = cstate->cs_type;
	int hset_update = 1;
	boolean_t using_timer;
	cpu_idle_check_wakeup_t check_func = &acpi_cpu_check_wakeup;

	/*
	 * Set our mcpu_mwait here, so we can tell if anyone tries to
	 * wake us between now and when we call mwait. No other cpu will
	 * attempt to set our mcpu_mwait until we add ourself to the haltset.
	 */
	if (mcpu_mwait != NULL) {
		if (type == ACPI_ADR_SPACE_SYSTEM_IO) {
			mwait_idle_state = MWAIT_WAKEUP_IPI;
			check_func = &acpi_cpu_mwait_ipi_check_wakeup;
		} else {
			mwait_idle_state = MWAIT_HALTED;
			check_func = &acpi_cpu_mwait_check_wakeup;
		}
		*mcpu_mwait = mwait_idle_state;
	} else {
		/*
		 * Initialize mwait_idle_state, but with mcpu_mwait NULL we'll
		 * never actually use it here. "MWAIT_RUNNING" just
		 * distinguishes from the "WAKEUP_IPI" and "HALTED" cases above.
		 */
		mwait_idle_state = MWAIT_RUNNING;
	}

	/*
	 * If this CPU is online, and there are multiple CPUs
	 * in the system, then we should note our halting
	 * by adding ourselves to the partition's halted CPU
	 * bitmap. This allows other CPUs to find/awaken us when
	 * work becomes available.
	 */
	if (cpup->cpu_flags & CPU_OFFLINE || ncpus == 1)
		hset_update = 0;

	/*
	 * Add ourselves to the partition's halted CPUs bitmask
	 * and set our HALTED flag, if necessary.
	 *
	 * When a thread becomes runnable, it is placed on the queue
	 * and then the halted cpuset is checked to determine who
	 * (if anyone) should be awakened. We therefore need to first
	 * add ourselves to the halted cpuset, and then check if there
	 * is any work available.
	 *
	 * Note that memory barriers after updating the HALTED flag
	 * are not necessary since an atomic operation (updating the bitmap)
	 * immediately follows. On x86 the atomic operation acts as a
	 * memory barrier for the update of cpu_disp_flags.
	 */
	if (hset_update) {
		cpup->cpu_disp_flags |= CPU_DISP_HALTED;
		bitset_atomic_add(&cp->cp_haltset, cpu_sid);
	}

	/*
	 * Check to make sure there's really nothing to do. Work destined for
	 * this CPU may become available after this check. If we're
	 * mwait-halting we'll be notified through the clearing of our bit in
	 * the halted CPU bitmask, and a write to our mcpu_mwait. Otherwise,
	 * we're hlt-based halting, and we'll be immediately woken by the
	 * pending interrupt.
	 *
	 * disp_anywork() checks disp_nrunnable, so we do not have to later.
	 */
	if (disp_anywork()) {
		if (hset_update) {
			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
		}
		return;
	}

	/*
	 * We're on our way to being halted.
	 *
	 * The local APIC timer can stop in ACPI C2 and deeper c-states.
	 * Try to program the HPET hardware to substitute for this CPU's
	 * LAPIC timer.
	 * cstate_use_timer() could disable the LAPIC Timer. Make sure
	 * to start the LAPIC Timer again before leaving this function.
	 *
	 * Disable interrupts here so we will awaken immediately after halting
	 * if someone tries to poke us between now and the time we actually
	 * halt.
	 */
	cli();
	using_timer = cstate_use_timer(&lapic_expire, CSTATE_USING_HPET);

	/*
	 * We check for the presence of our bit after disabling interrupts.
	 * If it's cleared, we'll return. If the bit is cleared after
	 * we check then the cstate_wakeup() will pop us out of the halted
	 * state.
	 *
	 * This means that the ordering of the cstate_wakeup() and the clearing
	 * of the bit by cpu_wakeup is important.
	 * cpu_wakeup() must clear our mc_haltset bit, and then call
	 * cstate_wakeup().
	 * acpi_cpu_cstate() must disable interrupts, then check for the bit.
	 */
	if (hset_update && bitset_in_set(&cp->cp_haltset, cpu_sid) == 0) {
		(void) cstate_use_timer(&lapic_expire,
		    CSTATE_USING_LAT);
		sti();
		cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
		return;
	}

	/*
	 * The check for anything locally runnable is here for performance
	 * and isn't needed for correctness. disp_nrunnable ought to be
	 * in our cache still, so it's inexpensive to check, and if there
	 * is anything runnable we won't have to wait for the poke.
	 */
	if (cpup->cpu_disp->disp_nrunnable != 0) {
		(void) cstate_use_timer(&lapic_expire,
		    CSTATE_USING_LAT);
		sti();
		if (hset_update) {
			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
		}
		return;
	}

	if (using_timer == B_FALSE) {

		(void) cstate_use_timer(&lapic_expire,
		    CSTATE_USING_LAT);
		sti();

		/*
		 * We are currently unable to program the HPET to act as this
		 * CPU's proxy LAPIC timer. This CPU cannot enter C2 or deeper
		 * because no timer is set to wake it up while its LAPIC timer
		 * stalls in deep C-States.
		 * Enter C1 instead.
		 *
		 * cstate_wakeup() will wake this CPU with an IPI, which works
		 * with either MWAIT or HLT.
		 */
		if (mcpu_mwait != NULL) {
			i86_monitor(mcpu_mwait, 0, 0);
			if (*mcpu_mwait == MWAIT_HALTED) {
				if (cpu_idle_enter(IDLE_STATE_C1, 0,
				    check_func, (void *)mcpu_mwait) == 0) {
					if (*mcpu_mwait == MWAIT_HALTED) {
						i86_mwait(0, 0);
					}
					cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
				}
			}
		} else {
			if (cpu_idle_enter(cs_type, 0, check_func, NULL) == 0) {
				mach_cpu_idle();
				cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
			}
		}

		/*
		 * We're no longer halted
		 */
		if (hset_update) {
			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
		}
		return;
	}

	/*
	 * Tell the cpu idle framework we're going to try idling.
	 *
	 * If cpu_idle_enter returns nonzero, we've found out at the last minute
	 * that we don't actually want to idle.
	 */
	boolean_t idle_ok = cpu_idle_enter(cs_type, 0, check_func,
	    (void *)mcpu_mwait) == 0;

	if (idle_ok) {
		if (type == ACPI_ADR_SPACE_FIXED_HARDWARE) {
			if (mcpu_mwait != NULL) {
				/*
				 * We're on our way to being halted.
				 * To avoid a lost wakeup, arm the monitor
				 * before checking if another cpu wrote to
				 * mcpu_mwait to wake us up.
				 */
				i86_monitor(mcpu_mwait, 0, 0);
				if (*mcpu_mwait == mwait_idle_state) {
					i86_mwait(cstate->cs_address, 1);
				}
			} else {
				mach_cpu_idle();
			}
		} else if (type == ACPI_ADR_SPACE_SYSTEM_IO) {
			/*
			 * mcpu_mwait is not directly part of idling or wakeup
			 * in the ACPI System I/O case, but if available it can
			 * hint that we shouldn't actually try to idle because
			 * we're about to be woken up anyway.
			 *
			 * A trip through idle/wakeup can be upwards of a few
			 * microseconds, so avoiding that makes this a helpful
			 * optimization, but consulting mcpu_mwait is still not
			 * necessary for correctness here.
			 */
			if (!mcpu_mwait || *mcpu_mwait == mwait_idle_state) {
				acpi_io_idle(cstate->cs_address);
			}
		}

		/*
		 * We've either idled and woken up, or decided not to idle.
		 * Either way, tell the cpu idle framework that we're not trying
		 * to idle anymore.
		 */
		cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
	}

	/*
	 * The LAPIC timer may have stopped in deep c-state.
	 * Reprogram this CPU's LAPIC here before enabling interrupts.
	 */
	(void) cstate_use_timer(&lapic_expire, CSTATE_USING_LAT);
	sti();

	/*
	 * We're no longer halted
	 */
	if (hset_update) {
		cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
		bitset_atomic_del(&cp->cp_haltset, cpu_sid);
	}
}

/*
 * Idle the current CPU when deep c-states are supported.
 */
void
cpu_acpi_idle(void)
{
	cpu_t *cp = CPU;
	cpu_acpi_handle_t handle;
	cma_c_state_t *cs_data;
	cpu_acpi_cstate_t *cstates;
	hrtime_t start, end;
	int cpu_max_cstates;
	uint32_t cs_indx;
	uint16_t cs_type;

	cpupm_mach_state_t *mach_state =
	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
	handle = mach_state->ms_acpi_handle;
	ASSERT(CPU_ACPI_CSTATES(handle) != NULL);

	cs_data = mach_state->ms_cstate.cma_state.cstate;
	cstates = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
	ASSERT(cstates != NULL);
	cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
	if (cpu_max_cstates > CPU_MAX_CSTATES)
		cpu_max_cstates = CPU_MAX_CSTATES;
	if (cpu_max_cstates == 1) {	/* no ACPI c-state data */
		(*non_deep_idle_cpu)();
		return;
	}

	start = gethrtime_unscaled();

	cs_indx = cpupm_next_cstate(cs_data, cstates, cpu_max_cstates, start);

	cs_type = cstates[cs_indx].cs_type;

	switch (cs_type) {
	default:
		/* FALLTHROUGH */
	case CPU_ACPI_C1:
		(*non_deep_idle_cpu)();
		break;

	case CPU_ACPI_C2:
		acpi_cpu_cstate(&cstates[cs_indx]);
		break;

	case CPU_ACPI_C3:
		/*
		 * All supported Intel processors maintain cache coherency
		 * during C3. Currently when entering C3 processors flush
		 * core caches to higher level shared cache. The shared cache
		 * maintains state and supports probes during C3.
		 * Consequently there is no need to handle cache coherency
		 * and Bus Master activity here with the cache flush, BM_RLD
		 * bit, BM_STS bit, nor PM2_CNT.ARB_DIS mechanisms described
		 * in section 8.1.4 of the ACPI Specification 4.0.
		 */
		acpi_cpu_cstate(&cstates[cs_indx]);
		break;
	}

	end = gethrtime_unscaled();

	/*
	 * Update statistics
	 */
	cpupm_wakeup_cstate_data(cs_data, end);
}

boolean_t
cpu_deep_cstates_supported(void)
{
	extern int idle_cpu_no_deep_c;

	if (idle_cpu_no_deep_c)
		return (B_FALSE);

	if (!cpuid_deep_cstates_supported())
		return (B_FALSE);

	if (cpuid_arat_supported()) {
		cpu_cstate_arat = B_TRUE;
		return (B_TRUE);
	}

	/*
	 * In theory we can use the HPET as a proxy timer in case we can't rely
	 * on the LAPIC in deep C-states. In practice on AMD it seems something
	 * isn't quite right and we just don't get woken up, so the proxy timer
	 * approach doesn't work. Only set up the HPET as proxy timer on Intel
	 * systems for now.
	 */
	if (cpuid_getvendor(CPU) == X86_VENDOR_Intel &&
	    (hpet.supported == HPET_FULL_SUPPORT) &&
	    hpet.install_proxy()) {
		cpu_cstate_hpet = B_TRUE;
		return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Validate that this processor supports deep cstate and if so,
 * get the c-state data from ACPI and cache it.
 */
static int
cpu_idle_init(cpu_t *cp)
{
	cpupm_mach_state_t *mach_state =
	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
	cpu_acpi_cstate_t *cstate;
	char name[KSTAT_STRLEN];
	int cpu_max_cstates, i;
	int ret;

	/*
	 * Cache the C-state specific ACPI data.
	 */
	if ((ret = cpu_acpi_cache_cstate_data(handle)) != 0) {
		if (ret < 0)
			cmn_err(CE_NOTE,
			    "!Support for CPU deep idle states is being "
			    "disabled due to errors parsing ACPI C-state "
			    "objects exported by BIOS.");
		cpu_idle_fini(cp);
		return (-1);
	}

	cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);

	cpu_max_cstates = cpu_acpi_get_max_cstates(handle);

	for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
		(void) snprintf(name, KSTAT_STRLEN - 1, "c%d", cstate->cs_type);
		/*
		 * Allocate, initialize and install cstate kstat
		 */
		cstate->cs_ksp = kstat_create("cstate", cp->cpu_id,
		    name, "misc",
		    KSTAT_TYPE_NAMED,
		    sizeof (cpu_idle_kstat) / sizeof (kstat_named_t),
		    KSTAT_FLAG_VIRTUAL);

		if (cstate->cs_ksp == NULL) {
			cmn_err(CE_NOTE, "kstat_create(c_state) fail");
		} else {
			cstate->cs_ksp->ks_data = &cpu_idle_kstat;
			cstate->cs_ksp->ks_lock = &cpu_idle_mutex;
			cstate->cs_ksp->ks_update = cpu_idle_kstat_update;
			cstate->cs_ksp->ks_data_size += MAXNAMELEN;
			cstate->cs_ksp->ks_private = cstate;
			kstat_install(cstate->cs_ksp);
		}
		cstate++;
	}

	cpupm_alloc_domains(cp, CPUPM_C_STATES);
	cpupm_alloc_ms_cstate(cp);

	if (cpu_deep_cstates_supported()) {
		uint32_t value;

		mutex_enter(&cpu_idle_callb_mutex);
		if (cpu_deep_idle_callb_id == (callb_id_t)0)
			cpu_deep_idle_callb_id = callb_add(&cpu_deep_idle_callb,
			    (void *)NULL, CB_CL_CPU_DEEP_IDLE, "cpu_deep_idle");
		if (cpu_idle_cpr_callb_id == (callb_id_t)0)
			cpu_idle_cpr_callb_id = callb_add(&cpu_idle_cpr_callb,
			    (void *)NULL, CB_CL_CPR_PM, "cpu_idle_cpr");
		mutex_exit(&cpu_idle_callb_mutex);


		/*
		 * All supported CPUs (Nehalem and later) will remain in C3
		 * during Bus Master activity.
		 * Clear ACPI_BITREG_BUS_MASTER_RLD here, if it is not already
		 * 0, before enabling deeper C-states.
		 */
		cpu_acpi_get_register(ACPI_BITREG_BUS_MASTER_RLD, &value);
		if (value & 1)
			cpu_acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
	}

	return (0);
}

/*
 * Free resources allocated by cpu_idle_init().
 */
static void
cpu_idle_fini(cpu_t *cp)
{
	cpupm_mach_state_t *mach_state =
	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
	cpu_acpi_cstate_t *cstate;
	uint_t cpu_max_cstates, i;

	/*
	 * idle cpu points back to the generic one
	 */
	idle_cpu = cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
	disp_enq_thread = non_deep_idle_disp_enq_thread;

	cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
	if (cstate) {
		cpu_max_cstates = cpu_acpi_get_max_cstates(handle);

		for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
			if (cstate->cs_ksp != NULL)
				kstat_delete(cstate->cs_ksp);
			cstate++;
		}
	}

	cpupm_free_ms_cstate(cp);
	cpupm_free_domains(&cpupm_cstate_domains);
	cpu_acpi_free_cstate_data(handle);

	mutex_enter(&cpu_idle_callb_mutex);
	if (cpu_deep_idle_callb_id != (callb_id_t)0) {
		(void) callb_delete(cpu_deep_idle_callb_id);
		cpu_deep_idle_callb_id = (callb_id_t)0;
	}
	if (cpu_idle_cpr_callb_id != (callb_id_t)0) {
		(void) callb_delete(cpu_idle_cpr_callb_id);
		cpu_idle_cpr_callb_id = (callb_id_t)0;
	}
	mutex_exit(&cpu_idle_callb_mutex);
}

/*
 * This function is introduced to resolve a race between the master and the
 * slave CPU over the c-state data structures. Once the slave has called this
 * idle function and switched to the non-deep idle function, the master can
 * safely go on to reclaim the resources.
 */
static void
cpu_idle_stop_sync(void)
{
	/* switch to the non deep idle function */
	CPU->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
}

static void
cpu_idle_stop(cpu_t *cp)
{
	cpupm_mach_state_t *mach_state =
	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
	cpu_acpi_cstate_t *cstate;
	uint_t cpu_max_cstates, i = 0;

	mutex_enter(&cpu_idle_callb_mutex);
	if (idle_cpu == cpu_idle_adaptive) {
		/*
		 * Have the slave switch to the synchronous idle function.
		 */
		cp->cpu_m.mcpu_idle_cpu = cpu_idle_stop_sync;
		poke_cpu(cp->cpu_id);

		/*
		 * Wait until the slave switches to the non-deep idle
		 * function, so that the master can safely go on to reclaim
		 * the resources.
		 */
		while (cp->cpu_m.mcpu_idle_cpu != non_deep_idle_cpu) {
			drv_usecwait(10);
			if ((++i % CPU_IDLE_STOP_TIMEOUT) == 0)
				cmn_err(CE_NOTE, "!cpu_idle_stop: the slave"
				    " idle stop timeout");
		}
	}
	mutex_exit(&cpu_idle_callb_mutex);

	cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
	if (cstate) {
		cpu_max_cstates = cpu_acpi_get_max_cstates(handle);

		for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
			if (cstate->cs_ksp != NULL)
				kstat_delete(cstate->cs_ksp);
			cstate++;
		}
	}
	cpupm_free_ms_cstate(cp);
	cpupm_remove_domains(cp, CPUPM_C_STATES, &cpupm_cstate_domains);
	cpu_acpi_free_cstate_data(handle);
}

/*ARGSUSED*/
static boolean_t
cpu_deep_idle_callb(void *arg, int code)
{
	boolean_t rslt = B_TRUE;

	mutex_enter(&cpu_idle_callb_mutex);
	switch (code) {
	case PM_DEFAULT_CPU_DEEP_IDLE:
		/*
		 * Default policy is same as enable
		 */
		/*FALLTHROUGH*/
	case PM_ENABLE_CPU_DEEP_IDLE:
		if ((cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG) == 0)
			break;

		if (cstate_timer_callback(PM_ENABLE_CPU_DEEP_IDLE)) {
			disp_enq_thread = cstate_wakeup;
			idle_cpu = cpu_idle_adaptive;
			cpu_idle_cfg_state &= ~CPU_IDLE_DEEP_CFG;
		} else {
			rslt = B_FALSE;
		}
		break;

	case PM_DISABLE_CPU_DEEP_IDLE:
		if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
			break;

		idle_cpu = non_deep_idle_cpu;
		if (cstate_timer_callback(PM_DISABLE_CPU_DEEP_IDLE)) {
			disp_enq_thread = non_deep_idle_disp_enq_thread;
			cpu_idle_cfg_state |= CPU_IDLE_DEEP_CFG;
		}
		break;

	default:
		cmn_err(CE_NOTE, "!cpu deep_idle_callb: invalid code %d\n",
		    code);
		break;
	}
	mutex_exit(&cpu_idle_callb_mutex);
	return (rslt);
}

/*ARGSUSED*/
static boolean_t
cpu_idle_cpr_callb(void *arg, int code)
{
	boolean_t rslt = B_TRUE;

	mutex_enter(&cpu_idle_callb_mutex);
	switch (code) {
	case CB_CODE_CPR_RESUME:
		if (cstate_timer_callback(CB_CODE_CPR_RESUME)) {
			/*
			 * Do not enable dispatcher hooks if disabled by user.
			 */
			if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
				break;

			disp_enq_thread = cstate_wakeup;
			idle_cpu = cpu_idle_adaptive;
		} else {
			rslt = B_FALSE;
		}
		break;

	case CB_CODE_CPR_CHKPT:
		idle_cpu = non_deep_idle_cpu;
		disp_enq_thread = non_deep_idle_disp_enq_thread;
		(void) cstate_timer_callback(CB_CODE_CPR_CHKPT);
		break;

	default:
		cmn_err(CE_NOTE, "!cpudvr cpr_callb: invalid code %d\n", code);
		break;
	}
	mutex_exit(&cpu_idle_callb_mutex);
	return (rslt);
}

/*
 * Handle a _CST notification.
 */
void
cpuidle_cstate_instance(cpu_t *cp)
{
#ifndef	__xpv
	cpupm_mach_state_t *mach_state =
	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
	cpu_acpi_handle_t handle;
	struct machcpu *mcpu;
	cpuset_t dom_cpu_set;
	kmutex_t *pm_lock;
	int result = 0;
	processorid_t cpu_id;

	if (mach_state == NULL) {
		return;
	}

	ASSERT(mach_state->ms_cstate.cma_domain != NULL);
	dom_cpu_set = mach_state->ms_cstate.cma_domain->pm_cpus;
	pm_lock = &mach_state->ms_cstate.cma_domain->pm_lock;

	/*
	 * Do this for all the CPUs in the domain.
	 */
	mutex_enter(pm_lock);
	do {
		CPUSET_FIND(dom_cpu_set, cpu_id);
		if (cpu_id == CPUSET_NOTINSET)
			break;

		ASSERT(cpu_id >= 0 && cpu_id < NCPU);
		cp = cpu[cpu_id];
		mach_state = (cpupm_mach_state_t *)
		    cp->cpu_m.mcpu_pm_mach_state;
		if (!(mach_state->ms_caps & CPUPM_C_STATES)) {
			mutex_exit(pm_lock);
			return;
		}
		handle = mach_state->ms_acpi_handle;
		ASSERT(handle != NULL);

		/*
		 * re-evaluate cstate object
		 */
		if (cpu_acpi_cache_cstate_data(handle) != 0) {
			cmn_err(CE_WARN, "Cannot re-evaluate the cpu c-state"
			    " object Instance: %d", cpu_id);
		}
		mcpu = &(cp->cpu_m);
		mcpu->max_cstates = cpu_acpi_get_max_cstates(handle);
		if (mcpu->max_cstates > CPU_ACPI_C1) {
			(void) cstate_timer_callback(
			    CST_EVENT_MULTIPLE_CSTATES);
			disp_enq_thread = cstate_wakeup;
			cp->cpu_m.mcpu_idle_cpu = cpu_acpi_idle;
		} else if (mcpu->max_cstates == CPU_ACPI_C1) {
			disp_enq_thread = non_deep_idle_disp_enq_thread;
			cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
			(void) cstate_timer_callback(CST_EVENT_ONE_CSTATE);
		}

		CPUSET_ATOMIC_XDEL(dom_cpu_set, cpu_id, result);
	} while (result < 0);
	mutex_exit(pm_lock);
#endif
}

/*
 * Handle a change in the number or type of available processor power states.
 */
void
cpuidle_manage_cstates(void *ctx)
{
	cpu_t *cp = ctx;
	cpupm_mach_state_t *mach_state =
	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
	boolean_t is_ready;

	if (mach_state == NULL) {
		return;
	}

	/*
	 * We currently refuse to power manage if the CPU is not ready to
	 * take cross calls (cross calls fail silently if CPU is not ready
	 * for it).
	 *
	 * Additionally, for x86 platforms we cannot power manage an instance,
	 * until it has been initialized.
	 */
	is_ready = (cp->cpu_flags & CPU_READY) && cpupm_cstate_ready(cp);
	if (!is_ready)
		return;

	cpuidle_cstate_instance(cp);
}