xref: /illumos-gate/usr/src/uts/i86pc/os/cpupm/cpu_idle.c (revision 3fe455549728ac525df3be56130ad8e075d645d7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2009-2010, Intel Corporation.
27  * All rights reserved.
28  */
29 /*
30  * Copyright 2019 Joyent, Inc.
31  * Copyright 2024 Oxide Computer Company
32  */
33 
34 #include <sys/x86_archext.h>
35 #include <sys/machsystm.h>
36 #include <sys/x_call.h>
37 #include <sys/stat.h>
38 #include <sys/acpi/acpi.h>
39 #include <sys/acpica.h>
40 #include <sys/cpu_acpi.h>
41 #include <sys/cpu_idle.h>
42 #include <sys/cpupm.h>
43 #include <sys/cpu_event.h>
44 #include <sys/hpet.h>
45 #include <sys/archsystm.h>
46 #include <vm/hat_i86.h>
47 #include <sys/dtrace.h>
48 #include <sys/sdt.h>
49 #include <sys/callb.h>
50 
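/*
 * Timer selections passed to cstate_use_timer() when entering and leaving
 * a deep C-state.
 */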
51 #define	CSTATE_USING_HPET		1
52 #define	CSTATE_USING_LAT		2
53 
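/*
 * Number of 10us waits (see cpu_idle_stop()) between warnings while waiting
 * for a CPU to switch away from the deep-idle routine, i.e. roughly 10ms.
 */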
54 #define	CPU_IDLE_STOP_TIMEOUT		1000
55 
56 extern void cpu_idle_adaptive(void);
57 extern uint32_t cpupm_next_cstate(cma_c_state_t *cs_data,
58     cpu_acpi_cstate_t *cstates, uint32_t cs_count, hrtime_t start);
59 
60 static int cpu_idle_init(cpu_t *);
61 static void cpu_idle_fini(cpu_t *);
62 static void cpu_idle_stop(cpu_t *);
63 static boolean_t cpu_deep_idle_callb(void *arg, int code);
64 static boolean_t cpu_idle_cpr_callb(void *arg, int code);
65 static void acpi_cpu_cstate(cpu_acpi_cstate_t *cstate);
66 
67 static boolean_t cstate_use_timer(hrtime_t *lapic_expire, int timer);
68 
69 /*
70  * Flags indicating whether the CPU has an always-running local APIC
71  * timer (ARAT) and whether the HPET is used as the timer in deep C-states.
72  */
73 static boolean_t cpu_cstate_arat = B_FALSE;
74 static boolean_t cpu_cstate_hpet = B_FALSE;
75 
76 /*
77  * Interfaces for modules implementing Intel's deep c-state.
78  */
79 cpupm_state_ops_t cpu_idle_ops = {
80 	"Generic ACPI C-state Support",
81 	cpu_idle_init,
82 	cpu_idle_fini,
83 	NULL,
84 	cpu_idle_stop
85 };
86 
87 static kmutex_t		cpu_idle_callb_mutex;
88 static callb_id_t	cpu_deep_idle_callb_id;
89 static callb_id_t	cpu_idle_cpr_callb_id;
90 static uint_t		cpu_idle_cfg_state;
91 
92 static kmutex_t cpu_idle_mutex;
93 
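/*
 * Template data for the per-C-state kstats.  cpu_idle_init() installs one
 * "cstate:<cpu_id>:c<N>" kstat per C-state; they all share this data and
 * cpu_idle_mutex, and cpu_idle_kstat_update() refreshes it on each read.
 * The values can be inspected from userland with, for example,
 * "kstat -m cstate".
 */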
94 cpu_idle_kstat_t cpu_idle_kstat = {
95 	{ "address_space_id",	KSTAT_DATA_STRING },
96 	{ "latency",		KSTAT_DATA_UINT32 },
97 	{ "power",		KSTAT_DATA_UINT32 },
98 };
99 
100 /*
101  * kstat update function for the C-state information
102  */
103 static int
104 cpu_idle_kstat_update(kstat_t *ksp, int flag)
105 {
106 	cpu_acpi_cstate_t *cstate = ksp->ks_private;
107 
108 	if (flag == KSTAT_WRITE) {
109 		return (EACCES);
110 	}
111 
112 	if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_FIXED_HARDWARE) {
113 		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
114 		"FFixedHW");
115 	} else if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_SYSTEM_IO) {
116 		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
117 		"SystemIO");
118 	} else {
119 		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
120 		"Unsupported");
121 	}
122 
123 	cpu_idle_kstat.cs_latency.value.ui32 = cstate->cs_latency;
124 	cpu_idle_kstat.cs_power.value.ui32 = cstate->cs_power;
125 
126 	return (0);
127 }
128 
129 /*
130  * Used during configuration callbacks to manage implementation specific
131  * details of the hardware timer used during Deep C-state.
132  */
133 boolean_t
134 cstate_timer_callback(int code)
135 {
136 	if (cpu_cstate_arat) {
137 		return (B_TRUE);
138 	} else if (cpu_cstate_hpet) {
139 		return (hpet.callback(code));
140 	}
141 	return (B_FALSE);
142 }
143 
144 /*
145  * Some Local APIC Timers do not work during Deep C-states.
146  * The Deep C-state idle function uses this function to ensure it is using a
147  * hardware timer that works during Deep C-states.  This function also
148  * switches the timer back to the LAPIC Timer after Deep C-state.
149  */
150 static boolean_t
151 cstate_use_timer(hrtime_t *lapic_expire, int timer)
152 {
153 	if (cpu_cstate_arat)
154 		return (B_TRUE);
155 
156 	/*
157 	 * We have to return B_FALSE if no arat or hpet support
158 	 */
159 	if (!cpu_cstate_hpet)
160 		return (B_FALSE);
161 
162 	switch (timer) {
163 	case CSTATE_USING_HPET:
164 		return (hpet.use_hpet_timer(lapic_expire));
165 	case CSTATE_USING_LAT:
166 		hpet.use_lapic_timer(*lapic_expire);
167 		return (B_TRUE);
168 	default:
169 		return (B_FALSE);
170 	}
171 }
172 
173 /*
174  * c-state wakeup function.
175  * Similar to cpu_wakeup and cpu_wakeup_mwait except this function deals
176  * with CPUs asleep in MWAIT, HLT, or ACPI Deep C-State.
177  */
178 void
179 cstate_wakeup(cpu_t *cp, int bound)
180 {
181 	struct machcpu	*mcpu = &(cp->cpu_m);
182 	volatile uint32_t *mcpu_mwait = mcpu->mcpu_mwait;
183 	cpupart_t	*cpu_part;
184 	uint_t		cpu_found;
185 	processorid_t	cpu_sid;
186 
187 	cpu_part = cp->cpu_part;
188 	cpu_sid = cp->cpu_seqid;
189 	/*
190 	 * If this CPU is in the partition's halted set, it needs to be
191 	 * woken explicitly.
192 	 */
193 	if (bitset_in_set(&cpu_part->cp_haltset, cpu_sid)) {
194 		/*
195 		 * Clear the halted bit for that CPU since it will be
196 		 * poked in a moment.
197 		 */
198 		bitset_atomic_del(&cpu_part->cp_haltset, cpu_sid);
199 
200 		/*
201 		 * We may find the current CPU present in the halted cpuset
202 		 * if we're in the context of an interrupt that occurred
203 		 * before we had a chance to clear our bit in cpu_idle().
204 		 * Waking ourself is obviously unnecessary, since if
205 		 * we're here, we're not halted.
206 		 */
207 		if (cp != CPU) {
208 			/*
209 			 * Use correct wakeup mechanism
210 			 */
211 			if ((mcpu_mwait != NULL) &&
212 			    (*mcpu_mwait == MWAIT_HALTED))
213 				MWAIT_WAKEUP(cp);
214 			else
215 				poke_cpu(cp->cpu_id);
216 		}
217 		return;
218 	} else {
219 		/*
220 		 * This cpu isn't halted, but it's idle or undergoing a
221 		 * context switch. No need to awaken anyone else.
222 		 */
223 		if (cp->cpu_thread == cp->cpu_idle_thread ||
224 		    cp->cpu_disp_flags & CPU_DISP_DONTSTEAL)
225 			return;
226 	}
227 
228 	/*
229 	 * No need to wake up other CPUs if the thread we just enqueued
230 	 * is bound.
231 	 */
232 	if (bound)
233 		return;
234 
235 
236 	/*
237 	 * See if there are any other halted CPUs. If there are, then
238 	 * select one, and awaken it.
239 	 * It's possible that after we find a CPU, somebody else
240 	 * will awaken it before we get the chance.
241 	 * In that case, look again.
242 	 */
243 	do {
244 		cpu_found = bitset_find(&cpu_part->cp_haltset);
245 		if (cpu_found == (uint_t)-1)
246 			return;
247 
248 	} while (bitset_atomic_test_and_del(&cpu_part->cp_haltset,
249 	    cpu_found) < 0);
250 
251 	/*
252 	 * Must use correct wakeup mechanism to avoid lost wakeup of
253 	 * alternate cpu.
254 	 */
255 	if (cpu_found != CPU->cpu_seqid) {
256 		mcpu_mwait = cpu_seq[cpu_found]->cpu_m.mcpu_mwait;
257 		if ((mcpu_mwait != NULL) && (*mcpu_mwait == MWAIT_HALTED))
258 			MWAIT_WAKEUP(cpu_seq[cpu_found]);
259 		else
260 			poke_cpu(cpu_seq[cpu_found]->cpu_id);
261 	}
262 }
263 
264 /*
265  * Function called by the CPU idle notification framework to check whether
266  * the CPU has been awakened. It is called with interrupts disabled.
267  * If the CPU has been awakened, call cpu_idle_exit() to notify the CPU
268  * idle notification framework.
269  */
270 static void
271 acpi_cpu_mwait_check_wakeup(void *arg)
272 {
273 	volatile uint32_t *mcpu_mwait = (volatile uint32_t *)arg;
274 
275 	ASSERT(arg != NULL);
276 	if (*mcpu_mwait != MWAIT_HALTED) {
277 		/*
278 		 * CPU has been awakened, notify CPU idle notification system.
279 		 */
280 		cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
281 	} else {
282 		/*
283 		 * Toggle interrupt flag to detect pending interrupts.
284 		 * Toggle the interrupt flag to detect pending interrupts.
285 		 * If an interrupt was taken, do_interrupt() will have notified
286 		 * the CPU idle notification framework, so there is no need to
287 		 * call cpu_idle_exit() here.
288 		sti();
289 		SMT_PAUSE();
290 		cli();
291 	}
292 }
293 
294 static void
295 acpi_cpu_mwait_ipi_check_wakeup(void *arg)
296 {
297 	volatile uint32_t *mcpu_mwait = (volatile uint32_t *)arg;
298 
299 	ASSERT(arg != NULL);
300 	if (*mcpu_mwait != MWAIT_WAKEUP_IPI) {
301 		/*
302 		 * CPU has been awakened, notify CPU idle notification system.
303 		 */
304 		cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
305 	} else {
306 		/*
307 		 * Toggle the interrupt flag to detect pending interrupts.
308 		 * If an interrupt was taken, do_interrupt() will have notified
309 		 * the CPU idle notification framework, so there is no need to
310 		 * call cpu_idle_exit() here.
311 		 */
312 		sti();
313 		SMT_PAUSE();
314 		cli();
315 	}
316 }
317 
318 /*ARGSUSED*/
319 static void
320 acpi_cpu_check_wakeup(void *arg)
321 {
322 	/*
323 	 * Toggle the interrupt flag to detect pending interrupts.  If one was
324 	 * taken, do_interrupt() will have notified the CPU idle notification
325 	 * framework, so there is no need to call cpu_idle_exit() here.
326 	 */
327 	sti();
328 	SMT_PAUSE();
329 	cli();
330 }
331 
332 /*
333  * Idle the current CPU via ACPI-defined System I/O read to an ACPI-specified
334  * address.
335  */
336 static void
337 acpi_io_idle(uint32_t address)
338 {
339 	uint32_t value;
340 	ACPI_TABLE_FADT *gbl_FADT;
341 
342 	/*
343 	 * Do we need to work around an ancient chipset bug in early ACPI
344 	 * implementations that would result in a late STPCLK# assertion?
345 	 *
346 	 * Must be true when running on systems where the ACPI-indicated I/O
347 	 * read to enter low-power states may resolve before actually stopping
348 	 * the processor that initiated a low-power transition. On such systems,
349 	 * it is possible the processor would proceed past the idle point and
350 	 * *then* be stopped.
351 	 *
352 	 * An early workaround that has been carried forward is to read the ACPI
353 	 * PM Timer after requesting a low-power transition. The timer read will
354 	 * take long enough that we are certain the processor is safe to be
355 	 * stopped.
356 	 *
357 	 * From some investigation, this was only ever necessary on older Intel
358 	 * chipsets. Additionally, the timer read can take upwards of a thousand
359 	 * CPU clocks, so for systems that work correctly, it's just a tarpit
360 	 * for the CPU as it is woken back up.
361 	 */
362 	boolean_t need_stpclk_workaround =
363 	    cpuid_getvendor(CPU) == X86_VENDOR_Intel;
364 
365 	/*
366 	 * The following call will cause us to halt which will cause the store
367 	 * buffer to be repartitioned, potentially exposing us to the Intel CPU
368  * vulnerability MDS. As such, we call x86_md_clear() explicitly here.
369 	 * The other idle methods do this automatically as part of the
370 	 * implementation of i86_mwait().
371 	 */
372 	x86_md_clear();
373 	(void) cpu_acpi_read_port(address, &value, 8);
374 	if (need_stpclk_workaround) {
375 		acpica_get_global_FADT(&gbl_FADT);
376 		(void) cpu_acpi_read_port(
377 		    gbl_FADT->XPmTimerBlock.Address,
378 		    &value, 32);
379 	}
380 }
381 
382 /*
383  * Enter an ACPI deep C-state on the current CPU.
384  */
385 static void
386 acpi_cpu_cstate(cpu_acpi_cstate_t *cstate)
387 {
388 	volatile uint32_t	*mcpu_mwait = CPU->cpu_m.mcpu_mwait;
389 	cpu_t			*cpup = CPU;
390 	processorid_t		cpu_sid = cpup->cpu_seqid;
391 	cpupart_t		*cp = cpup->cpu_part;
392 	hrtime_t		lapic_expire;
393 	uint8_t			type = cstate->cs_addrspace_id;
394 	uint32_t		cs_type = cstate->cs_type;
395 	int			hset_update = 1;
396 	boolean_t		using_timer;
397 	cpu_idle_check_wakeup_t check_func = &acpi_cpu_check_wakeup;
398 
399 	/*
400 	 * Set our mcpu_mwait here, so we can tell if anyone tries to
401 	 * wake us between now and when we call mwait.  No other cpu will
402 	 * attempt to set our mcpu_mwait until we add ourselves to the haltset.
403 	 */
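	/*
	 * The wake protocol differs by C-state type: a SystemIO C-state is
	 * exited via an IPI (poke_cpu()), so MWAIT_WAKEUP_IPI is advertised
	 * here; a FixedHW (MWAIT) C-state is exited by a store to the
	 * monitored line, so MWAIT_HALTED is used.  cstate_wakeup() picks
	 * the matching mechanism based on this value.
	 */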
404 	if (mcpu_mwait) {
405 		if (type == ACPI_ADR_SPACE_SYSTEM_IO) {
406 			*mcpu_mwait = MWAIT_WAKEUP_IPI;
407 			check_func = &acpi_cpu_mwait_ipi_check_wakeup;
408 		} else {
409 			*mcpu_mwait = MWAIT_HALTED;
410 			check_func = &acpi_cpu_mwait_check_wakeup;
411 		}
412 	}
413 
414 	/*
415 	 * If this CPU is online, and there are multiple CPUs
416 	 * in the system, then we should note our halting
417 	 * by adding ourselves to the partition's halted CPU
418 	 * bitmap. This allows other CPUs to find/awaken us when
419 	 * work becomes available.
420 	 */
421 	if (cpup->cpu_flags & CPU_OFFLINE || ncpus == 1)
422 		hset_update = 0;
423 
424 	/*
425 	 * Add ourselves to the partition's halted CPUs bitmask
426 	 * and set our HALTED flag, if necessary.
427 	 *
428 	 * When a thread becomes runnable, it is placed on the queue
429 	 * and then the halted cpuset is checked to determine who
430 	 * (if anyone) should be awakened. We therefore need to first
431 	 * add ourselves to the halted cpuset, and then check if there
432 	 * is any work available.
433 	 *
434 	 * Note that memory barriers after updating the HALTED flag
435 	 * are not necessary since an atomic operation (updating the bitmap)
436 	 * immediately follows. On x86 the atomic operation acts as a
437 	 * memory barrier for the update of cpu_disp_flags.
438 	 */
439 	if (hset_update) {
440 		cpup->cpu_disp_flags |= CPU_DISP_HALTED;
441 		bitset_atomic_add(&cp->cp_haltset, cpu_sid);
442 	}
443 
444 	/*
445 	 * Check to make sure there's really nothing to do.
446 	 * Work destined for this CPU may become available after
447 	 * this check. We'll be notified through the clearing of our
448 	 * bit in the halted CPU bitmask, and a write to our mcpu_mwait.
449 	 *
450 	 * disp_anywork() checks disp_nrunnable, so we do not have to later.
451 	 */
452 	if (disp_anywork()) {
453 		if (hset_update) {
454 			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
455 			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
456 		}
457 		return;
458 	}
459 
460 	/*
461 	 * We're on our way to being halted.
462 	 *
463 	 * The local APIC timer can stop in ACPI C2 and deeper c-states.
464 	 * Try to program the HPET hardware to substitute for this CPU's
465 	 * LAPIC timer.
466 	 * cstate_use_timer() could disable the LAPIC Timer.  Make sure
467 	 * to start the LAPIC Timer again before leaving this function.
468 	 *
469 	 * Disable interrupts here so we will awaken immediately after halting
470 	 * if someone tries to poke us between now and the time we actually
471 	 * halt.
472 	 */
473 	cli();
474 	using_timer = cstate_use_timer(&lapic_expire, CSTATE_USING_HPET);
475 
476 	/*
477 	 * We check for the presence of our bit after disabling interrupts.
478 	 * If it's cleared, we'll return. If the bit is cleared after
479 	 * we check then the cstate_wakeup() will pop us out of the halted
480 	 * state.
481 	 *
482 	 * This means that the ordering of the bit-clearing and the wakeup
483 	 * performed by cstate_wakeup() is important.
484 	 * cstate_wakeup() must clear our cp_haltset bit, and then issue the
485 	 * wakeup (MWAIT store or IPI).
486 	 * acpi_cpu_cstate() must disable interrupts, then check for the bit.
487 	 */
488 	if (hset_update && bitset_in_set(&cp->cp_haltset, cpu_sid) == 0) {
489 		(void) cstate_use_timer(&lapic_expire,
490 		    CSTATE_USING_LAT);
491 		sti();
492 		cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
493 		return;
494 	}
495 
496 	/*
497 	 * The check for anything locally runnable is here for performance
498 	 * and isn't needed for correctness. disp_nrunnable ought to be
499 	 * in our cache still, so it's inexpensive to check, and if there
500 	 * is anything runnable we won't have to wait for the poke.
501 	 */
502 	if (cpup->cpu_disp->disp_nrunnable != 0) {
503 		(void) cstate_use_timer(&lapic_expire,
504 		    CSTATE_USING_LAT);
505 		sti();
506 		if (hset_update) {
507 			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
508 			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
509 		}
510 		return;
511 	}
512 
513 	if (using_timer == B_FALSE) {
514 
515 		(void) cstate_use_timer(&lapic_expire,
516 		    CSTATE_USING_LAT);
517 		sti();
518 
519 		/*
520 		 * We are currently unable to program the HPET to act as this
521 		 * CPU's proxy LAPIC timer.  This CPU cannot enter C2 or deeper
522 		 * because no timer is set to wake it up while its LAPIC timer
523 		 * stalls in deep C-States.
524 		 * Enter C1 instead.
525 		 *
526 		 * cstate_wakeup() will wake this CPU with an IPI, which
527 		 * works with MWAIT.
528 		 */
529 		i86_monitor(mcpu_mwait, 0, 0);
530 		if ((*mcpu_mwait & ~MWAIT_WAKEUP_IPI) == MWAIT_HALTED) {
531 			if (cpu_idle_enter(IDLE_STATE_C1, 0,
532 			    check_func, (void *)mcpu_mwait) == 0) {
533 				if ((*mcpu_mwait & ~MWAIT_WAKEUP_IPI) ==
534 				    MWAIT_HALTED) {
535 					i86_mwait(0, 0);
536 				}
537 				cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
538 			}
539 		}
540 
541 		/*
542 		 * We're no longer halted
543 		 */
544 		if (hset_update) {
545 			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
546 			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
547 		}
548 		return;
549 	}
550 
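	/*
	 * A timer that keeps running during deep C-state is in place.  Enter
	 * the C-state using the method ACPI describes for it: MWAIT for a
	 * FixedHW entry, or an I/O port read for a SystemIO entry.
	 */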
551 	if (type == ACPI_ADR_SPACE_FIXED_HARDWARE) {
552 		/*
553 		 * We're on our way to being halted.
554 		 * To avoid a lost wakeup, arm the monitor before checking
555 		 * if another cpu wrote to mcpu_mwait to wake us up.
556 		 */
557 		i86_monitor(mcpu_mwait, 0, 0);
558 		if (*mcpu_mwait == MWAIT_HALTED) {
559 			if (cpu_idle_enter((uint_t)cs_type, 0,
560 			    check_func, (void *)mcpu_mwait) == 0) {
561 				if (*mcpu_mwait == MWAIT_HALTED) {
562 					i86_mwait(cstate->cs_address, 1);
563 				}
564 				cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
565 			}
566 		}
567 	} else if (type == ACPI_ADR_SPACE_SYSTEM_IO) {
568 		if (*mcpu_mwait == MWAIT_WAKEUP_IPI) {
569 			if (cpu_idle_enter((uint_t)cs_type, 0,
570 			    check_func, (void *)mcpu_mwait) == 0) {
571 				if (*mcpu_mwait == MWAIT_WAKEUP_IPI) {
572 					acpi_io_idle(cstate->cs_address);
573 				}
574 				cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
575 			}
576 		}
577 	}
578 
579 	/*
580 	 * The LAPIC timer may have stopped in deep c-state.
581 	 * Reprogram this CPU's LAPIC here before enabling interrupts.
582 	 */
583 	(void) cstate_use_timer(&lapic_expire, CSTATE_USING_LAT);
584 	sti();
585 
586 	/*
587 	 * We're no longer halted
588 	 */
589 	if (hset_update) {
590 		cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
591 		bitset_atomic_del(&cp->cp_haltset, cpu_sid);
592 	}
593 }
594 
595 /*
596  * Idle the current CPU, using deep C-states where supported.
597  */
598 void
599 cpu_acpi_idle(void)
600 {
601 	cpu_t *cp = CPU;
602 	cpu_acpi_handle_t handle;
603 	cma_c_state_t *cs_data;
604 	cpu_acpi_cstate_t *cstates;
605 	hrtime_t start, end;
606 	int cpu_max_cstates;
607 	uint32_t cs_indx;
608 	uint16_t cs_type;
609 
610 	cpupm_mach_state_t *mach_state =
611 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
612 	handle = mach_state->ms_acpi_handle;
613 	ASSERT(CPU_ACPI_CSTATES(handle) != NULL);
614 
615 	cs_data = mach_state->ms_cstate.cma_state.cstate;
616 	cstates = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
617 	ASSERT(cstates != NULL);
618 	cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
619 	if (cpu_max_cstates > CPU_MAX_CSTATES)
620 		cpu_max_cstates = CPU_MAX_CSTATES;
621 	if (cpu_max_cstates == 1) {	/* no ACPI c-state data */
622 		(*non_deep_idle_cpu)();
623 		return;
624 	}
625 
626 	start = gethrtime_unscaled();
627 
628 	cs_indx = cpupm_next_cstate(cs_data, cstates, cpu_max_cstates, start);
629 
630 	cs_type = cstates[cs_indx].cs_type;
631 
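	/*
	 * C1 (and any unrecognized type) is handled by the regular non-deep
	 * idle routine; C2 and C3 enter the deep C-state path via
	 * acpi_cpu_cstate().
	 */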
632 	switch (cs_type) {
633 	default:
634 		/* FALLTHROUGH */
635 	case CPU_ACPI_C1:
636 		(*non_deep_idle_cpu)();
637 		break;
638 
639 	case CPU_ACPI_C2:
640 		acpi_cpu_cstate(&cstates[cs_indx]);
641 		break;
642 
643 	case CPU_ACPI_C3:
644 		/*
645 		 * All supported Intel processors maintain cache coherency
646 		 * during C3.  Currently, when entering C3, processors flush
647 		 * core caches to a higher-level shared cache. The shared cache
648 		 * maintains state and supports probes during C3.
649 		 * Consequently there is no need to handle cache coherency
650 		 * and Bus Master activity here with the cache flush, BM_RLD
651 		 * bit, BM_STS bit, nor PM2_CNT.ARB_DIS mechanisms described
652 		 * in section 8.1.4 of the ACPI Specification 4.0.
653 		 */
654 		acpi_cpu_cstate(&cstates[cs_indx]);
655 		break;
656 	}
657 
658 	end = gethrtime_unscaled();
659 
660 	/*
661 	 * Update statistics
662 	 */
663 	cpupm_wakeup_cstate_data(cs_data, end);
664 }
665 
666 boolean_t
667 cpu_deep_cstates_supported(void)
668 {
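	/*
	 * idle_cpu_no_deep_c is a global override (e.g. settable via
	 * /etc/system) that disables deep C-state support entirely.
	 */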
669 	extern int	idle_cpu_no_deep_c;
670 
671 	if (idle_cpu_no_deep_c)
672 		return (B_FALSE);
673 
674 	if (!cpuid_deep_cstates_supported())
675 		return (B_FALSE);
676 
677 	if (cpuid_arat_supported()) {
678 		cpu_cstate_arat = B_TRUE;
679 		return (B_TRUE);
680 	}
681 
682 	/*
683 	 * In theory we can use the HPET as a proxy timer in case we can't rely
684 	 * on the LAPIC in deep C-states. In practice on AMD it seems something
685 	 * isn't quite right and we just don't get woken up, so the proxy timer
686 	 * approach doesn't work. Only set up the HPET as proxy timer on Intel
687 	 * systems for now.
688 	 */
689 	if (cpuid_getvendor(CPU) == X86_VENDOR_Intel &&
690 	    (hpet.supported == HPET_FULL_SUPPORT) &&
691 	    hpet.install_proxy()) {
692 		cpu_cstate_hpet = B_TRUE;
693 		return (B_TRUE);
694 	}
695 
696 	return (B_FALSE);
697 }
698 
699 /*
700  * Validate that this processor supports deep C-states and, if so,
701  * get the c-state data from ACPI and cache it.
702  */
703 static int
704 cpu_idle_init(cpu_t *cp)
705 {
706 	cpupm_mach_state_t *mach_state =
707 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
708 	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
709 	cpu_acpi_cstate_t *cstate;
710 	char name[KSTAT_STRLEN];
711 	int cpu_max_cstates, i;
712 	int ret;
713 
714 	/*
715 	 * Cache the C-state specific ACPI data.
716 	 */
717 	if ((ret = cpu_acpi_cache_cstate_data(handle)) != 0) {
718 		if (ret < 0)
719 			cmn_err(CE_NOTE,
720 			    "!Support for CPU deep idle states is being "
721 			    "disabled due to errors parsing ACPI C-state "
722 			    "objects exported by BIOS.");
723 		cpu_idle_fini(cp);
724 		return (-1);
725 	}
726 
727 	cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
728 
729 	cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
730 
731 	for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
732 		(void) snprintf(name, KSTAT_STRLEN - 1, "c%d", cstate->cs_type);
733 		/*
734 		 * Allocate, initialize and install cstate kstat
735 		 */
736 		cstate->cs_ksp = kstat_create("cstate", cp->cpu_id,
737 		    name, "misc",
738 		    KSTAT_TYPE_NAMED,
739 		    sizeof (cpu_idle_kstat) / sizeof (kstat_named_t),
740 		    KSTAT_FLAG_VIRTUAL);
741 
742 		if (cstate->cs_ksp == NULL) {
743 			cmn_err(CE_NOTE, "kstat_create(c_state) fail");
744 		} else {
745 			cstate->cs_ksp->ks_data = &cpu_idle_kstat;
746 			cstate->cs_ksp->ks_lock = &cpu_idle_mutex;
747 			cstate->cs_ksp->ks_update = cpu_idle_kstat_update;
748 			cstate->cs_ksp->ks_data_size += MAXNAMELEN;
749 			cstate->cs_ksp->ks_private = cstate;
750 			kstat_install(cstate->cs_ksp);
751 		}
752 		cstate++;
753 	}
754 
755 	cpupm_alloc_domains(cp, CPUPM_C_STATES);
756 	cpupm_alloc_ms_cstate(cp);
757 
758 	if (cpu_deep_cstates_supported()) {
759 		uint32_t value;
760 
761 		mutex_enter(&cpu_idle_callb_mutex);
762 		if (cpu_deep_idle_callb_id == (callb_id_t)0)
763 			cpu_deep_idle_callb_id = callb_add(&cpu_deep_idle_callb,
764 			    (void *)NULL, CB_CL_CPU_DEEP_IDLE, "cpu_deep_idle");
765 		if (cpu_idle_cpr_callb_id == (callb_id_t)0)
766 			cpu_idle_cpr_callb_id = callb_add(&cpu_idle_cpr_callb,
767 			    (void *)NULL, CB_CL_CPR_PM, "cpu_idle_cpr");
768 		mutex_exit(&cpu_idle_callb_mutex);
769 
770 
771 		/*
772 		 * All supported CPUs (Nehalem and later) will remain in C3
773 		 * during Bus Master activity.
774 		 * Clear ACPI_BITREG_BUS_MASTER_RLD here, if it is not already
775 		 * 0, before enabling deeper C-states.
776 		 */
777 		cpu_acpi_get_register(ACPI_BITREG_BUS_MASTER_RLD, &value);
778 		if (value & 1)
779 			cpu_acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
780 	}
781 
782 	return (0);
783 }
784 
785 /*
786  * Free resources allocated by cpu_idle_init().
787  */
788 static void
789 cpu_idle_fini(cpu_t *cp)
790 {
791 	cpupm_mach_state_t *mach_state =
792 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
793 	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
794 	cpu_acpi_cstate_t *cstate;
795 	uint_t	cpu_max_cstates, i;
796 
797 	/*
798 	 * Point the idle routine back at the generic (non-deep) one.
799 	 */
800 	idle_cpu = cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
801 	disp_enq_thread = non_deep_idle_disp_enq_thread;
802 
803 	cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
804 	if (cstate) {
805 		cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
806 
807 		for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
808 			if (cstate->cs_ksp != NULL)
809 				kstat_delete(cstate->cs_ksp);
810 			cstate++;
811 		}
812 	}
813 
814 	cpupm_free_ms_cstate(cp);
815 	cpupm_free_domains(&cpupm_cstate_domains);
816 	cpu_acpi_free_cstate_data(handle);
817 
818 	mutex_enter(&cpu_idle_callb_mutex);
819 	if (cpu_deep_idle_callb_id != (callb_id_t)0) {
820 		(void) callb_delete(cpu_deep_idle_callb_id);
821 		cpu_deep_idle_callb_id = (callb_id_t)0;
822 	}
823 	if (cpu_idle_cpr_callb_id != (callb_id_t)0) {
824 		(void) callb_delete(cpu_idle_cpr_callb_id);
825 		cpu_idle_cpr_callb_id = (callb_id_t)0;
826 	}
827 	mutex_exit(&cpu_idle_callb_mutex);
828 }
829 
830 /*
831  * This function resolves a race between the master and the slave CPU
832  * over access to the C-state data structures.  Once the slave has called
833  * this idle function and switched to the non-deep idle function, the
834  * master can safely go on to reclaim the resources.
835  */
836 static void
837 cpu_idle_stop_sync(void)
838 {
839 	/* switch to the non deep idle function */
840 	CPU->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
841 }
842 
843 static void
844 cpu_idle_stop(cpu_t *cp)
845 {
846 	cpupm_mach_state_t *mach_state =
847 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
848 	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
849 	cpu_acpi_cstate_t *cstate;
850 	uint_t cpu_max_cstates, i = 0;
851 
852 	mutex_enter(&cpu_idle_callb_mutex);
853 	if (idle_cpu == cpu_idle_adaptive) {
854 		/*
855 		 * Make the slave switch to the synchronizing idle function.
856 		 */
857 		cp->cpu_m.mcpu_idle_cpu = cpu_idle_stop_sync;
858 		poke_cpu(cp->cpu_id);
859 
860 		/*
861 		 * wait until the slave switches to the non-deep idle function,
862 		 * so that the master can safely go on to reclaim the resources.
863 		 */
864 		while (cp->cpu_m.mcpu_idle_cpu != non_deep_idle_cpu) {
865 			drv_usecwait(10);
866 			if ((++i % CPU_IDLE_STOP_TIMEOUT) == 0)
867 				cmn_err(CE_NOTE, "!cpu_idle_stop: the slave"
868 				    " idle stop timeout");
869 		}
870 	}
871 	mutex_exit(&cpu_idle_callb_mutex);
872 
873 	cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
874 	if (cstate) {
875 		cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
876 
877 		for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
878 			if (cstate->cs_ksp != NULL)
879 				kstat_delete(cstate->cs_ksp);
880 			cstate++;
881 		}
882 	}
883 	cpupm_free_ms_cstate(cp);
884 	cpupm_remove_domains(cp, CPUPM_C_STATES, &cpupm_cstate_domains);
885 	cpu_acpi_free_cstate_data(handle);
886 }
887 
888 /*ARGSUSED*/
889 static boolean_t
890 cpu_deep_idle_callb(void *arg, int code)
891 {
892 	boolean_t rslt = B_TRUE;
893 
894 	mutex_enter(&cpu_idle_callb_mutex);
895 	switch (code) {
896 	case PM_DEFAULT_CPU_DEEP_IDLE:
897 		/*
898 		 * The default policy is the same as enable.
899 		 */
900 		/*FALLTHROUGH*/
901 	case PM_ENABLE_CPU_DEEP_IDLE:
902 		if ((cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG) == 0)
903 			break;
904 
905 		if (cstate_timer_callback(PM_ENABLE_CPU_DEEP_IDLE)) {
906 			disp_enq_thread = cstate_wakeup;
907 			idle_cpu = cpu_idle_adaptive;
908 			cpu_idle_cfg_state &= ~CPU_IDLE_DEEP_CFG;
909 		} else {
910 			rslt = B_FALSE;
911 		}
912 		break;
913 
914 	case PM_DISABLE_CPU_DEEP_IDLE:
915 		if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
916 			break;
917 
918 		idle_cpu = non_deep_idle_cpu;
919 		if (cstate_timer_callback(PM_DISABLE_CPU_DEEP_IDLE)) {
920 			disp_enq_thread = non_deep_idle_disp_enq_thread;
921 			cpu_idle_cfg_state |= CPU_IDLE_DEEP_CFG;
922 		}
923 		break;
924 
925 	default:
926 		cmn_err(CE_NOTE, "!cpu deep_idle_callb: invalid code %d\n",
927 		    code);
928 		break;
929 	}
930 	mutex_exit(&cpu_idle_callb_mutex);
931 	return (rslt);
932 }
933 
934 /*ARGSUSED*/
935 static boolean_t
936 cpu_idle_cpr_callb(void *arg, int code)
937 {
938 	boolean_t rslt = B_TRUE;
939 
940 	mutex_enter(&cpu_idle_callb_mutex);
941 	switch (code) {
942 	case CB_CODE_CPR_RESUME:
943 		if (cstate_timer_callback(CB_CODE_CPR_RESUME)) {
944 			/*
945 			 * Do not enable dispatcher hooks if disabled by user.
946 			 */
947 			if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
948 				break;
949 
950 			disp_enq_thread = cstate_wakeup;
951 			idle_cpu = cpu_idle_adaptive;
952 		} else {
953 			rslt = B_FALSE;
954 		}
955 		break;
956 
957 	case CB_CODE_CPR_CHKPT:
958 		idle_cpu = non_deep_idle_cpu;
959 		disp_enq_thread = non_deep_idle_disp_enq_thread;
960 		(void) cstate_timer_callback(CB_CODE_CPR_CHKPT);
961 		break;
962 
963 	default:
964 		cmn_err(CE_NOTE, "!cpudvr cpr_callb: invalid code %d\n", code);
965 		break;
966 	}
967 	mutex_exit(&cpu_idle_callb_mutex);
968 	return (rslt);
969 }
970 
971 /*
972  * handle _CST notification
973  */
974 void
975 cpuidle_cstate_instance(cpu_t *cp)
976 {
977 #ifndef	__xpv
978 	cpupm_mach_state_t	*mach_state =
979 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
980 	cpu_acpi_handle_t	handle;
981 	struct machcpu		*mcpu;
982 	cpuset_t		dom_cpu_set;
983 	kmutex_t		*pm_lock;
984 	int			result = 0;
985 	processorid_t		cpu_id;
986 
987 	if (mach_state == NULL) {
988 		return;
989 	}
990 
991 	ASSERT(mach_state->ms_cstate.cma_domain != NULL);
992 	dom_cpu_set = mach_state->ms_cstate.cma_domain->pm_cpus;
993 	pm_lock = &mach_state->ms_cstate.cma_domain->pm_lock;
994 
995 	/*
996 	 * Do this for all the CPUs in the domain.
997 	 */
998 	mutex_enter(pm_lock);
999 	do {
1000 		CPUSET_FIND(dom_cpu_set, cpu_id);
1001 		if (cpu_id == CPUSET_NOTINSET)
1002 			break;
1003 
1004 		ASSERT(cpu_id >= 0 && cpu_id < NCPU);
1005 		cp = cpu[cpu_id];
1006 		mach_state = (cpupm_mach_state_t *)
1007 		    cp->cpu_m.mcpu_pm_mach_state;
1008 		if (!(mach_state->ms_caps & CPUPM_C_STATES)) {
1009 			mutex_exit(pm_lock);
1010 			return;
1011 		}
1012 		handle = mach_state->ms_acpi_handle;
1013 		ASSERT(handle != NULL);
1014 
1015 		/*
1016 		 * re-evaluate cstate object
1017 		 */
1018 		if (cpu_acpi_cache_cstate_data(handle) != 0) {
1019 			cmn_err(CE_WARN, "Cannot re-evaluate the cpu c-state"
1020 			    " object Instance: %d", cpu_id);
1021 		}
1022 		mcpu = &(cp->cpu_m);
1023 		mcpu->max_cstates = cpu_acpi_get_max_cstates(handle);
1024 		if (mcpu->max_cstates > CPU_ACPI_C1) {
1025 			(void) cstate_timer_callback(
1026 			    CST_EVENT_MULTIPLE_CSTATES);
1027 			disp_enq_thread = cstate_wakeup;
1028 			cp->cpu_m.mcpu_idle_cpu = cpu_acpi_idle;
1029 		} else if (mcpu->max_cstates == CPU_ACPI_C1) {
1030 			disp_enq_thread = non_deep_idle_disp_enq_thread;
1031 			cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
1032 			(void) cstate_timer_callback(CST_EVENT_ONE_CSTATE);
1033 		}
1034 
1035 		CPUSET_ATOMIC_XDEL(dom_cpu_set, cpu_id, result);
1036 	} while (result < 0);
1037 	mutex_exit(pm_lock);
1038 #endif
1039 }
1040 
1041 /*
1042  * Handle a change in the number or type of available processor power states.
1043  */
1044 void
1045 cpuidle_manage_cstates(void *ctx)
1046 {
1047 	cpu_t			*cp = ctx;
1048 	cpupm_mach_state_t	*mach_state =
1049 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
1050 	boolean_t		is_ready;
1051 
1052 	if (mach_state == NULL) {
1053 		return;
1054 	}
1055 
1056 	/*
1057 	 * We currently refuse to power manage if the CPU is not ready to
1058 	 * take cross calls (cross calls fail silently if the CPU is not
1059 	 * ready for them).
1060 	 *
1061 	 * Additionally, for x86 platforms we cannot power manage an instance
1062 	 * until it has been initialized.
1063 	 */
1064 	is_ready = (cp->cpu_flags & CPU_READY) && cpupm_cstate_ready(cp);
1065 	if (!is_ready)
1066 		return;
1067 
1068 	cpuidle_cstate_instance(cp);
1069 }
1070