xref: /titanic_41/usr/src/uts/i86pc/os/cpupm/cpu_idle.c (revision efd31e1d839d4665462b5c267a1c654548082663)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2009-2010, Intel Corporation.
27  * All rights reserved.
28  */
29 
30 #include <sys/x86_archext.h>
31 #include <sys/machsystm.h>
32 #include <sys/x_call.h>
33 #include <sys/stat.h>
34 #include <sys/acpi/acpi.h>
35 #include <sys/acpica.h>
36 #include <sys/cpu_acpi.h>
37 #include <sys/cpu_idle.h>
38 #include <sys/cpupm.h>
39 #include <sys/cpu_event.h>
40 #include <sys/hpet.h>
41 #include <sys/archsystm.h>
42 #include <vm/hat_i86.h>
43 #include <sys/dtrace.h>
44 #include <sys/sdt.h>
45 #include <sys/callb.h>
46 
/*
 * Timer selectors passed as the `timer' argument of cstate_use_timer():
 * CSTATE_USING_HPET asks for the HPET proxy timer, CSTATE_USING_LAT asks to
 * switch back to the local APIC timer.
 */
#define	CSTATE_USING_HPET		1
#define	CSTATE_USING_LAT		2

/*
 * cpu_idle_stop() polls the target CPU in 10 microsecond steps and logs a
 * notice each time this many polls have elapsed without the CPU responding.
 */
#define	CPU_IDLE_STOP_TIMEOUT		1000

/*
 * Defined outside this file.  cpu_idle_adaptive() is installed as idle_cpu
 * when deep idle is enabled; cpupm_next_cstate() is called by cpu_acpi_idle()
 * to pick the index of the C-state to enter.
 */
extern void cpu_idle_adaptive(void);
extern uint32_t cpupm_next_cstate(cma_c_state_t *cs_data,
    cpu_acpi_cstate_t *cstates, uint32_t cs_count, hrtime_t start);

/*
 * Forward declarations: the cpu_idle_ops handlers, the callb handlers
 * registered by cpu_idle_init(), and the deep C-state entry routine.
 */
static int cpu_idle_init(cpu_t *);
static void cpu_idle_fini(cpu_t *);
static void cpu_idle_stop(cpu_t *);
static boolean_t cpu_deep_idle_callb(void *arg, int code);
static boolean_t cpu_idle_cpr_callb(void *arg, int code);
static void acpi_cpu_cstate(cpu_acpi_cstate_t *cstate);

static boolean_t cstate_use_timer(hrtime_t *lapic_expire, int timer);
64 
/*
 * Which timer is relied upon while a CPU is in a deep C-state.  Both flags
 * start out B_FALSE and are set by cpu_deep_cstates_supported():
 *
 *   cpu_cstate_arat	cpuid_arat_supported() reported an always-running
 *			local APIC timer.
 *   cpu_cstate_hpet	no ARAT, but the HPET is fully supported and its
 *			proxy was installed for use in deep C-states.
 */
static boolean_t cpu_cstate_arat = B_FALSE;
static boolean_t cpu_cstate_hpet = B_FALSE;
71 
/*
 * Interfaces for modules implementing Intel's deep c-state.
 *
 * The initializer is positional: a descriptive label, the init handler,
 * the fini handler, a NULL entry (no handler supplied for that slot), and
 * the stop handler.
 */
cpupm_state_ops_t cpu_idle_ops = {
	"Generic ACPI C-state Support",
	cpu_idle_init,
	cpu_idle_fini,
	NULL,
	cpu_idle_stop
};
82 
/*
 * cpu_idle_callb_mutex is held while the two callback ids below are
 * registered or deleted, while cpu_idle_stop() hands a CPU back to the
 * non-deep idle routine, and for the duration of both callback handlers
 * (which read and update cpu_idle_cfg_state).
 *
 * cpu_idle_cfg_state carries the CPU_IDLE_DEEP_CFG bit; the callbacks treat
 * the bit being set as "deep idle currently disabled".
 */
static kmutex_t		cpu_idle_callb_mutex;
static callb_id_t	cpu_deep_idle_callb_id;
static callb_id_t	cpu_idle_cpr_callb_id;
static uint_t		cpu_idle_cfg_state;

/* Installed as ks_lock on every C-state kstat created by cpu_idle_init(). */
static kmutex_t cpu_idle_mutex;
89 
/*
 * Named-kstat template shared by every C-state kstat: cpu_idle_init() points
 * each kstat's ks_data at this one structure and cpu_idle_kstat_update()
 * refills it from the C-state being read.
 */
cpu_idle_kstat_t cpu_idle_kstat = {
	{ "address_space_id",	KSTAT_DATA_STRING },
	{ "latency",		KSTAT_DATA_UINT32 },
	{ "power",		KSTAT_DATA_UINT32 },
};
95 
96 /*
97  * kstat update function of the c-state info
98  */
99 static int
100 cpu_idle_kstat_update(kstat_t *ksp, int flag)
101 {
102 	cpu_acpi_cstate_t *cstate = ksp->ks_private;
103 
104 	if (flag == KSTAT_WRITE) {
105 		return (EACCES);
106 	}
107 
108 	if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_FIXED_HARDWARE) {
109 		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
110 		"FFixedHW");
111 	} else if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_SYSTEM_IO) {
112 		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
113 		"SystemIO");
114 	} else {
115 		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
116 		"Unsupported");
117 	}
118 
119 	cpu_idle_kstat.cs_latency.value.ui32 = cstate->cs_latency;
120 	cpu_idle_kstat.cs_power.value.ui32 = cstate->cs_power;
121 
122 	return (0);
123 }
124 
125 /*
126  * Used during configuration callbacks to manage implementation specific
127  * details of the hardware timer used during Deep C-state.
128  */
129 boolean_t
130 cstate_timer_callback(int code)
131 {
132 	if (cpu_cstate_arat) {
133 		return (B_TRUE);
134 	} else if (cpu_cstate_hpet) {
135 		return (hpet.callback(code));
136 	}
137 	return (B_FALSE);
138 }
139 
140 /*
141  * Some Local APIC Timers do not work during Deep C-states.
142  * The Deep C-state idle function uses this function to ensure it is using a
143  * hardware timer that works during Deep C-states.  This function also
144  * switches the timer back to the LACPI Timer after Deep C-state.
145  */
146 static boolean_t
147 cstate_use_timer(hrtime_t *lapic_expire, int timer)
148 {
149 	if (cpu_cstate_arat)
150 		return (B_TRUE);
151 
152 	/*
153 	 * We have to return B_FALSE if no arat or hpet support
154 	 */
155 	if (!cpu_cstate_hpet)
156 		return (B_FALSE);
157 
158 	switch (timer) {
159 	case CSTATE_USING_HPET:
160 		return (hpet.use_hpet_timer(lapic_expire));
161 	case CSTATE_USING_LAT:
162 		hpet.use_lapic_timer(*lapic_expire);
163 		return (B_TRUE);
164 	default:
165 		return (B_FALSE);
166 	}
167 }
168 
169 /*
170  * c-state wakeup function.
171  * Similar to cpu_wakeup and cpu_wakeup_mwait except this function deals
172  * with CPUs asleep in MWAIT, HLT, or ACPI Deep C-State.
173  */
174 void
175 cstate_wakeup(cpu_t *cp, int bound)
176 {
177 	struct machcpu	*mcpu = &(cp->cpu_m);
178 	volatile uint32_t *mcpu_mwait = mcpu->mcpu_mwait;
179 	cpupart_t	*cpu_part;
180 	uint_t		cpu_found;
181 	processorid_t	cpu_sid;
182 
183 	cpu_part = cp->cpu_part;
184 	cpu_sid = cp->cpu_seqid;
185 	/*
186 	 * Clear the halted bit for that CPU since it will be woken up
187 	 * in a moment.
188 	 */
189 	if (bitset_in_set(&cpu_part->cp_haltset, cpu_sid)) {
190 		/*
191 		 * Clear the halted bit for that CPU since it will be
192 		 * poked in a moment.
193 		 */
194 		bitset_atomic_del(&cpu_part->cp_haltset, cpu_sid);
195 
196 		/*
197 		 * We may find the current CPU present in the halted cpuset
198 		 * if we're in the context of an interrupt that occurred
199 		 * before we had a chance to clear our bit in cpu_idle().
200 		 * Waking ourself is obviously unnecessary, since if
201 		 * we're here, we're not halted.
202 		 */
203 		if (cp != CPU) {
204 			/*
205 			 * Use correct wakeup mechanism
206 			 */
207 			if ((mcpu_mwait != NULL) &&
208 			    (*mcpu_mwait == MWAIT_HALTED))
209 				MWAIT_WAKEUP(cp);
210 			else
211 				poke_cpu(cp->cpu_id);
212 		}
213 		return;
214 	} else {
215 		/*
216 		 * This cpu isn't halted, but it's idle or undergoing a
217 		 * context switch. No need to awaken anyone else.
218 		 */
219 		if (cp->cpu_thread == cp->cpu_idle_thread ||
220 		    cp->cpu_disp_flags & CPU_DISP_DONTSTEAL)
221 			return;
222 	}
223 
224 	/*
225 	 * No need to wake up other CPUs if the thread we just enqueued
226 	 * is bound.
227 	 */
228 	if (bound)
229 		return;
230 
231 
232 	/*
233 	 * See if there's any other halted CPUs. If there are, then
234 	 * select one, and awaken it.
235 	 * It's possible that after we find a CPU, somebody else
236 	 * will awaken it before we get the chance.
237 	 * In that case, look again.
238 	 */
239 	do {
240 		cpu_found = bitset_find(&cpu_part->cp_haltset);
241 		if (cpu_found == (uint_t)-1)
242 			return;
243 
244 	} while (bitset_atomic_test_and_del(&cpu_part->cp_haltset,
245 	    cpu_found) < 0);
246 
247 	/*
248 	 * Must use correct wakeup mechanism to avoid lost wakeup of
249 	 * alternate cpu.
250 	 */
251 	if (cpu_found != CPU->cpu_seqid) {
252 		mcpu_mwait = cpu_seq[cpu_found]->cpu_m.mcpu_mwait;
253 		if ((mcpu_mwait != NULL) && (*mcpu_mwait == MWAIT_HALTED))
254 			MWAIT_WAKEUP(cpu_seq[cpu_found]);
255 		else
256 			poke_cpu(cpu_seq[cpu_found]->cpu_id);
257 	}
258 }
259 
260 /*
261  * Function called by CPU idle notification framework to check whether CPU
262  * has been awakened. It will be called with interrupt disabled.
263  * If CPU has been awakened, call cpu_idle_exit() to notify CPU idle
264  * notification framework.
265  */
266 static void
267 acpi_cpu_mwait_check_wakeup(void *arg)
268 {
269 	volatile uint32_t *mcpu_mwait = (volatile uint32_t *)arg;
270 
271 	ASSERT(arg != NULL);
272 	if (*mcpu_mwait != MWAIT_HALTED) {
273 		/*
274 		 * CPU has been awakened, notify CPU idle notification system.
275 		 */
276 		cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
277 	} else {
278 		/*
279 		 * Toggle interrupt flag to detect pending interrupts.
280 		 * If interrupt happened, do_interrupt() will notify CPU idle
281 		 * notification framework so no need to call cpu_idle_exit()
282 		 * here.
283 		 */
284 		sti();
285 		SMT_PAUSE();
286 		cli();
287 	}
288 }
289 
290 static void
291 acpi_cpu_mwait_ipi_check_wakeup(void *arg)
292 {
293 	volatile uint32_t *mcpu_mwait = (volatile uint32_t *)arg;
294 
295 	ASSERT(arg != NULL);
296 	if (*mcpu_mwait != MWAIT_WAKEUP_IPI) {
297 		/*
298 		 * CPU has been awakened, notify CPU idle notification system.
299 		 */
300 		cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
301 	} else {
302 		/*
303 		 * Toggle interrupt flag to detect pending interrupts.
304 		 * If interrupt happened, do_interrupt() will notify CPU idle
305 		 * notification framework so no need to call cpu_idle_exit()
306 		 * here.
307 		 */
308 		sti();
309 		SMT_PAUSE();
310 		cli();
311 	}
312 }
313 
/*
 * Default wakeup check, used when the CPU has no mcpu_mwait word to
 * inspect.  All it can do is open the interrupt window for a moment:
 * if an interrupt is pending, do_interrupt() notifies the CPU idle
 * notification framework, so cpu_idle_exit() is never called from here.
 */
/*ARGSUSED*/
static void
acpi_cpu_check_wakeup(void *arg)
{
	sti();
	SMT_PAUSE();
	cli();
}
327 
/*
 * enter deep c-state handler
 *
 * Idles the calling CPU in the ACPI C-state described by `cstate'.
 * In order, the function:
 *   - marks mcpu_mwait so a waker can signal this CPU;
 *   - advertises the CPU in its partition's haltset (unless it is offline
 *     or the only CPU);
 *   - bails out if the dispatcher already has work;
 *   - disables interrupts and asks cstate_use_timer() for a timer that
 *     survives the C-state;
 *   - enters the C-state with MWAIT (fixed hardware) or an I/O port read
 *     (system I/O), or falls back to C1 when no usable timer is available;
 *   - switches back to the LAPIC timer, re-enables interrupts and leaves
 *     the haltset.
 *
 * The relative order of the haltset update, the work checks, cli()/sti()
 * and the timer switches matters; see the comments below.
 */
static void
acpi_cpu_cstate(cpu_acpi_cstate_t *cstate)
{
	volatile uint32_t	*mcpu_mwait = CPU->cpu_m.mcpu_mwait;
	cpu_t			*cpup = CPU;
	processorid_t		cpu_sid = cpup->cpu_seqid;
	cpupart_t		*cp = cpup->cpu_part;
	hrtime_t		lapic_expire;
	uint8_t			type = cstate->cs_addrspace_id;
	uint32_t		cs_type = cstate->cs_type;
	int			hset_update = 1;
	boolean_t		using_timer;
	cpu_idle_check_wakeup_t check_func = &acpi_cpu_check_wakeup;

	/*
	 * Set our mcpu_mwait here, so we can tell if anyone tries to
	 * wake us between now and when we call mwait.  No other cpu will
	 * attempt to set our mcpu_mwait until we add ourself to the haltset.
	 *
	 * The value written and the wakeup-check callback are chosen
	 * together: MWAIT_WAKEUP_IPI for system I/O C-states, MWAIT_HALTED
	 * otherwise.
	 *
	 * NOTE(review): mcpu_mwait is tested for NULL here but is passed to
	 * i86_monitor() and dereferenced unconditionally further down --
	 * presumably this function is only reached on CPUs that have an
	 * mcpu_mwait word; confirm.
	 */
	if (mcpu_mwait) {
		if (type == ACPI_ADR_SPACE_SYSTEM_IO) {
			*mcpu_mwait = MWAIT_WAKEUP_IPI;
			check_func = &acpi_cpu_mwait_ipi_check_wakeup;
		} else {
			*mcpu_mwait = MWAIT_HALTED;
			check_func = &acpi_cpu_mwait_check_wakeup;
		}
	}

	/*
	 * If this CPU is online, and there are multiple CPUs
	 * in the system, then we should note our halting
	 * by adding ourselves to the partition's halted CPU
	 * bitmap. This allows other CPUs to find/awaken us when
	 * work becomes available.
	 */
	if (cpup->cpu_flags & CPU_OFFLINE || ncpus == 1)
		hset_update = 0;

	/*
	 * Add ourselves to the partition's halted CPUs bitmask
	 * and set our HALTED flag, if necessary.
	 *
	 * When a thread becomes runnable, it is placed on the queue
	 * and then the halted cpuset is checked to determine who
	 * (if anyone) should be awakened. We therefore need to first
	 * add ourselves to the halted cpuset, and then check if there
	 * is any work available.
	 *
	 * Note that memory barriers after updating the HALTED flag
	 * are not necessary since an atomic operation (updating the bitmap)
	 * immediately follows. On x86 the atomic operation acts as a
	 * memory barrier for the update of cpu_disp_flags.
	 */
	if (hset_update) {
		cpup->cpu_disp_flags |= CPU_DISP_HALTED;
		bitset_atomic_add(&cp->cp_haltset, cpu_sid);
	}

	/*
	 * Check to make sure there's really nothing to do.
	 * Work destined for this CPU may become available after
	 * this check. We'll be notified through the clearing of our
	 * bit in the halted CPU bitmask, and a write to our mcpu_mwait.
	 *
	 * disp_anywork() checks disp_nrunnable, so we do not have to later.
	 */
	if (disp_anywork()) {
		/* Interrupts were never disabled on this path: no sti(). */
		if (hset_update) {
			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
		}
		return;
	}

	/*
	 * We're on our way to being halted.
	 *
	 * The local APIC timer can stop in ACPI C2 and deeper c-states.
	 * Try to program the HPET hardware to substitute for this CPU's
	 * LAPIC timer.
	 * cstate_use_timer() could disable the LAPIC Timer.  Make sure
	 * to start the LAPIC Timer again before leaving this function.
	 *
	 * Disable interrupts here so we will awaken immediately after halting
	 * if someone tries to poke us between now and the time we actually
	 * halt.
	 */
	cli();
	using_timer = cstate_use_timer(&lapic_expire, CSTATE_USING_HPET);

	/*
	 * We check for the presence of our bit after disabling interrupts.
	 * If it's cleared, we'll return. If the bit is cleared after
	 * we check then the cstate_wakeup() will pop us out of the halted
	 * state.
	 *
	 * This means that the ordering of the wakeup itself and the clearing
	 * of the bit by the waker is important: cstate_wakeup() clears our
	 * cp_haltset bit first, and only then writes mcpu_mwait or pokes us.
	 * acpi_cpu_cstate() must disable interrupts, then check for the bit.
	 *
	 * The bit is already gone on this path, so only the HALTED flag
	 * needs to be cleared.
	 */
	if (hset_update && bitset_in_set(&cp->cp_haltset, cpu_sid) == 0) {
		(void) cstate_use_timer(&lapic_expire,
		    CSTATE_USING_LAT);
		sti();
		cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
		return;
	}

	/*
	 * The check for anything locally runnable is here for performance
	 * and isn't needed for correctness. disp_nrunnable ought to be
	 * in our cache still, so it's inexpensive to check, and if there
	 * is anything runnable we won't have to wait for the poke.
	 */
	if (cpup->cpu_disp->disp_nrunnable != 0) {
		(void) cstate_use_timer(&lapic_expire,
		    CSTATE_USING_LAT);
		sti();
		if (hset_update) {
			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
		}
		return;
	}

	if (using_timer == B_FALSE) {

		(void) cstate_use_timer(&lapic_expire,
		    CSTATE_USING_LAT);
		sti();

		/*
		 * We are currently unable to program the HPET to act as this
		 * CPU's proxy LAPIC timer.  This CPU cannot enter C2 or deeper
		 * because no timer is set to wake it up while its LAPIC timer
		 * stalls in deep C-States.
		 * Enter C1 instead.
		 *
		 * The waker (see cstate_wakeup()) either writes mcpu_mwait
		 * or pokes this CPU; both work with MWAIT.
		 *
		 * The MWAIT_WAKEUP_IPI bits are masked off in the tests below
		 * so that either value written at the top of this function
		 * is treated as "still halted".
		 */
		i86_monitor(mcpu_mwait, 0, 0);
		if ((*mcpu_mwait & ~MWAIT_WAKEUP_IPI) == MWAIT_HALTED) {
			if (cpu_idle_enter(IDLE_STATE_C1, 0,
			    check_func, (void *)mcpu_mwait) == 0) {
				if ((*mcpu_mwait & ~MWAIT_WAKEUP_IPI) ==
				    MWAIT_HALTED) {
					i86_mwait(0, 0);
				}
				cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
			}
		}

		/*
		 * We're no longer halted
		 */
		if (hset_update) {
			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
		}
		return;
	}

	if (type == ACPI_ADR_SPACE_FIXED_HARDWARE) {
		/*
		 * We're on our way to being halted.
		 * To avoid a lost wakeup, arm the monitor before checking
		 * if another cpu wrote to mcpu_mwait to wake us up.
		 */
		i86_monitor(mcpu_mwait, 0, 0);
		if (*mcpu_mwait == MWAIT_HALTED) {
			if (cpu_idle_enter((uint_t)cs_type, 0,
			    check_func, (void *)mcpu_mwait) == 0) {
				/*
				 * Re-check after notifying the idle
				 * framework; cs_address is passed to
				 * i86_mwait() to select the C-state.
				 */
				if (*mcpu_mwait == MWAIT_HALTED) {
					i86_mwait(cstate->cs_address, 1);
				}
				cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
			}
		}
	} else if (type == ACPI_ADR_SPACE_SYSTEM_IO) {
		uint32_t value;
		ACPI_TABLE_FADT *gbl_FADT;

		if (*mcpu_mwait == MWAIT_WAKEUP_IPI) {
			if (cpu_idle_enter((uint_t)cs_type, 0,
			    check_func, (void *)mcpu_mwait) == 0) {
				/*
				 * An 8-bit read of the C-state's I/O port
				 * followed by a 32-bit read of the PM timer
				 * block.  NOTE(review): presumably the port
				 * read triggers the transition and the PM
				 * timer read is the customary dummy read
				 * after it -- confirm against the ACPI
				 * specification.
				 */
				if (*mcpu_mwait == MWAIT_WAKEUP_IPI) {
					(void) cpu_acpi_read_port(
					    cstate->cs_address, &value, 8);
					acpica_get_global_FADT(&gbl_FADT);
					(void) cpu_acpi_read_port(
					    gbl_FADT->XPmTimerBlock.Address,
					    &value, 32);
				}
				cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
			}
		}
	}

	/*
	 * The LAPIC timer may have stopped in deep c-state.
	 * Reprogram this CPU's LAPIC here before enabling interrupts.
	 */
	(void) cstate_use_timer(&lapic_expire, CSTATE_USING_LAT);
	sti();

	/*
	 * We're no longer halted
	 */
	if (hset_update) {
		cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
		bitset_atomic_del(&cp->cp_haltset, cpu_sid);
	}
}
548 
549 /*
550  * Idle the present CPU, deep c-state is supported
551  */
552 void
553 cpu_acpi_idle(void)
554 {
555 	cpu_t *cp = CPU;
556 	cpu_acpi_handle_t handle;
557 	cma_c_state_t *cs_data;
558 	cpu_acpi_cstate_t *cstates;
559 	hrtime_t start, end;
560 	int cpu_max_cstates;
561 	uint32_t cs_indx;
562 	uint16_t cs_type;
563 
564 	cpupm_mach_state_t *mach_state =
565 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
566 	handle = mach_state->ms_acpi_handle;
567 	ASSERT(CPU_ACPI_CSTATES(handle) != NULL);
568 
569 	cs_data = mach_state->ms_cstate.cma_state.cstate;
570 	cstates = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
571 	ASSERT(cstates != NULL);
572 	cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
573 	if (cpu_max_cstates > CPU_MAX_CSTATES)
574 		cpu_max_cstates = CPU_MAX_CSTATES;
575 	if (cpu_max_cstates == 1) {	/* no ACPI c-state data */
576 		(*non_deep_idle_cpu)();
577 		return;
578 	}
579 
580 	start = gethrtime_unscaled();
581 
582 	cs_indx = cpupm_next_cstate(cs_data, cstates, cpu_max_cstates, start);
583 
584 	cs_type = cstates[cs_indx].cs_type;
585 
586 	switch (cs_type) {
587 	default:
588 		/* FALLTHROUGH */
589 	case CPU_ACPI_C1:
590 		(*non_deep_idle_cpu)();
591 		break;
592 
593 	case CPU_ACPI_C2:
594 		acpi_cpu_cstate(&cstates[cs_indx]);
595 		break;
596 
597 	case CPU_ACPI_C3:
598 		/*
599 		 * All supported Intel processors maintain cache coherency
600 		 * during C3.  Currently when entering C3 processors flush
601 		 * core caches to higher level shared cache. The shared cache
602 		 * maintains state and supports probes during C3.
603 		 * Consequently there is no need to handle cache coherency
604 		 * and Bus Master activity here with the cache flush, BM_RLD
605 		 * bit, BM_STS bit, nor PM2_CNT.ARB_DIS mechanisms described
606 		 * in section 8.1.4 of the ACPI Specification 4.0.
607 		 */
608 		acpi_cpu_cstate(&cstates[cs_indx]);
609 		break;
610 	}
611 
612 	end = gethrtime_unscaled();
613 
614 	/*
615 	 * Update statistics
616 	 */
617 	cpupm_wakeup_cstate_data(cs_data, end);
618 }
619 
620 boolean_t
621 cpu_deep_cstates_supported(void)
622 {
623 	extern int	idle_cpu_no_deep_c;
624 
625 	if (idle_cpu_no_deep_c)
626 		return (B_FALSE);
627 
628 	if (!cpuid_deep_cstates_supported())
629 		return (B_FALSE);
630 
631 	if (cpuid_arat_supported()) {
632 		cpu_cstate_arat = B_TRUE;
633 		return (B_TRUE);
634 	}
635 
636 	if ((hpet.supported == HPET_FULL_SUPPORT) &&
637 	    hpet.install_proxy()) {
638 		cpu_cstate_hpet = B_TRUE;
639 		return (B_TRUE);
640 	}
641 
642 	return (B_FALSE);
643 }
644 
645 /*
646  * Validate that this processor supports deep cstate and if so,
647  * get the c-state data from ACPI and cache it.
648  */
649 static int
650 cpu_idle_init(cpu_t *cp)
651 {
652 	cpupm_mach_state_t *mach_state =
653 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
654 	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
655 	cpu_acpi_cstate_t *cstate;
656 	char name[KSTAT_STRLEN];
657 	int cpu_max_cstates, i;
658 	int ret;
659 
660 	/*
661 	 * Cache the C-state specific ACPI data.
662 	 */
663 	if ((ret = cpu_acpi_cache_cstate_data(handle)) != 0) {
664 		if (ret < 0)
665 			cmn_err(CE_NOTE,
666 			    "!Support for CPU deep idle states is being "
667 			    "disabled due to errors parsing ACPI C-state "
668 			    "objects exported by BIOS.");
669 		cpu_idle_fini(cp);
670 		return (-1);
671 	}
672 
673 	cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
674 
675 	cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
676 
677 	for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
678 		(void) snprintf(name, KSTAT_STRLEN - 1, "c%d", cstate->cs_type);
679 		/*
680 		 * Allocate, initialize and install cstate kstat
681 		 */
682 		cstate->cs_ksp = kstat_create("cstate", cp->cpu_id,
683 		    name, "misc",
684 		    KSTAT_TYPE_NAMED,
685 		    sizeof (cpu_idle_kstat) / sizeof (kstat_named_t),
686 		    KSTAT_FLAG_VIRTUAL);
687 
688 		if (cstate->cs_ksp == NULL) {
689 			cmn_err(CE_NOTE, "kstat_create(c_state) fail");
690 		} else {
691 			cstate->cs_ksp->ks_data = &cpu_idle_kstat;
692 			cstate->cs_ksp->ks_lock = &cpu_idle_mutex;
693 			cstate->cs_ksp->ks_update = cpu_idle_kstat_update;
694 			cstate->cs_ksp->ks_data_size += MAXNAMELEN;
695 			cstate->cs_ksp->ks_private = cstate;
696 			kstat_install(cstate->cs_ksp);
697 		}
698 		cstate++;
699 	}
700 
701 	cpupm_alloc_domains(cp, CPUPM_C_STATES);
702 	cpupm_alloc_ms_cstate(cp);
703 
704 	if (cpu_deep_cstates_supported()) {
705 		uint32_t value;
706 
707 		mutex_enter(&cpu_idle_callb_mutex);
708 		if (cpu_deep_idle_callb_id == (callb_id_t)0)
709 			cpu_deep_idle_callb_id = callb_add(&cpu_deep_idle_callb,
710 			    (void *)NULL, CB_CL_CPU_DEEP_IDLE, "cpu_deep_idle");
711 		if (cpu_idle_cpr_callb_id == (callb_id_t)0)
712 			cpu_idle_cpr_callb_id = callb_add(&cpu_idle_cpr_callb,
713 			    (void *)NULL, CB_CL_CPR_PM, "cpu_idle_cpr");
714 		mutex_exit(&cpu_idle_callb_mutex);
715 
716 
717 		/*
718 		 * All supported CPUs (Nehalem and later) will remain in C3
719 		 * during Bus Master activity.
720 		 * All CPUs set ACPI_BITREG_BUS_MASTER_RLD to 0 here if it
721 		 * is not already 0 before enabling Deeper C-states.
722 		 */
723 		cpu_acpi_get_register(ACPI_BITREG_BUS_MASTER_RLD, &value);
724 		if (value & 1)
725 			cpu_acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
726 	}
727 
728 	return (0);
729 }
730 
731 /*
732  * Free resources allocated by cpu_idle_init().
733  */
734 static void
735 cpu_idle_fini(cpu_t *cp)
736 {
737 	cpupm_mach_state_t *mach_state =
738 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
739 	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
740 	cpu_acpi_cstate_t *cstate;
741 	uint_t	cpu_max_cstates, i;
742 
743 	/*
744 	 * idle cpu points back to the generic one
745 	 */
746 	idle_cpu = cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
747 	disp_enq_thread = non_deep_idle_disp_enq_thread;
748 
749 	cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
750 	if (cstate) {
751 		cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
752 
753 		for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
754 			if (cstate->cs_ksp != NULL)
755 				kstat_delete(cstate->cs_ksp);
756 			cstate++;
757 		}
758 	}
759 
760 	cpupm_free_ms_cstate(cp);
761 	cpupm_free_domains(&cpupm_cstate_domains);
762 	cpu_acpi_free_cstate_data(handle);
763 
764 	mutex_enter(&cpu_idle_callb_mutex);
765 	if (cpu_deep_idle_callb_id != (callb_id_t)0) {
766 		(void) callb_delete(cpu_deep_idle_callb_id);
767 		cpu_deep_idle_callb_id = (callb_id_t)0;
768 	}
769 	if (cpu_idle_cpr_callb_id != (callb_id_t)0) {
770 		(void) callb_delete(cpu_idle_cpr_callb_id);
771 		cpu_idle_cpr_callb_id = (callb_id_t)0;
772 	}
773 	mutex_exit(&cpu_idle_callb_mutex);
774 }
775 
776 /*
777  * This function is introduced here to solve a race condition
778  * between the master and the slave to touch c-state data structure.
779  * After the slave calls this idle function to switch to the non
780  * deep idle function, the master can go on to reclaim the resource.
781  */
782 static void
783 cpu_idle_stop_sync(void)
784 {
785 	/* switch to the non deep idle function */
786 	CPU->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
787 }
788 
789 static void
790 cpu_idle_stop(cpu_t *cp)
791 {
792 	cpupm_mach_state_t *mach_state =
793 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
794 	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
795 	cpu_acpi_cstate_t *cstate;
796 	uint_t cpu_max_cstates, i = 0;
797 
798 	mutex_enter(&cpu_idle_callb_mutex);
799 	if (idle_cpu == cpu_idle_adaptive) {
800 		/*
801 		 * invoke the slave to call synchronous idle function.
802 		 */
803 		cp->cpu_m.mcpu_idle_cpu = cpu_idle_stop_sync;
804 		poke_cpu(cp->cpu_id);
805 
806 		/*
807 		 * wait until the slave switchs to non deep idle function,
808 		 * so that the master is safe to go on to reclaim the resource.
809 		 */
810 		while (cp->cpu_m.mcpu_idle_cpu != non_deep_idle_cpu) {
811 			drv_usecwait(10);
812 			if ((++i % CPU_IDLE_STOP_TIMEOUT) == 0)
813 				cmn_err(CE_NOTE, "!cpu_idle_stop: the slave"
814 				    " idle stop timeout");
815 		}
816 	}
817 	mutex_exit(&cpu_idle_callb_mutex);
818 
819 	cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
820 	if (cstate) {
821 		cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
822 
823 		for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
824 			if (cstate->cs_ksp != NULL)
825 				kstat_delete(cstate->cs_ksp);
826 			cstate++;
827 		}
828 	}
829 	cpupm_free_ms_cstate(cp);
830 	cpupm_remove_domains(cp, CPUPM_C_STATES, &cpupm_cstate_domains);
831 	cpu_acpi_free_cstate_data(handle);
832 }
833 
834 /*ARGSUSED*/
835 static boolean_t
836 cpu_deep_idle_callb(void *arg, int code)
837 {
838 	boolean_t rslt = B_TRUE;
839 
840 	mutex_enter(&cpu_idle_callb_mutex);
841 	switch (code) {
842 	case PM_DEFAULT_CPU_DEEP_IDLE:
843 		/*
844 		 * Default policy is same as enable
845 		 */
846 		/*FALLTHROUGH*/
847 	case PM_ENABLE_CPU_DEEP_IDLE:
848 		if ((cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG) == 0)
849 			break;
850 
851 		if (cstate_timer_callback(PM_ENABLE_CPU_DEEP_IDLE)) {
852 			disp_enq_thread = cstate_wakeup;
853 			idle_cpu = cpu_idle_adaptive;
854 			cpu_idle_cfg_state &= ~CPU_IDLE_DEEP_CFG;
855 		} else {
856 			rslt = B_FALSE;
857 		}
858 		break;
859 
860 	case PM_DISABLE_CPU_DEEP_IDLE:
861 		if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
862 			break;
863 
864 		idle_cpu = non_deep_idle_cpu;
865 		if (cstate_timer_callback(PM_DISABLE_CPU_DEEP_IDLE)) {
866 			disp_enq_thread = non_deep_idle_disp_enq_thread;
867 			cpu_idle_cfg_state |= CPU_IDLE_DEEP_CFG;
868 		}
869 		break;
870 
871 	default:
872 		cmn_err(CE_NOTE, "!cpu deep_idle_callb: invalid code %d\n",
873 		    code);
874 		break;
875 	}
876 	mutex_exit(&cpu_idle_callb_mutex);
877 	return (rslt);
878 }
879 
880 /*ARGSUSED*/
881 static boolean_t
882 cpu_idle_cpr_callb(void *arg, int code)
883 {
884 	boolean_t rslt = B_TRUE;
885 
886 	mutex_enter(&cpu_idle_callb_mutex);
887 	switch (code) {
888 	case CB_CODE_CPR_RESUME:
889 		if (cstate_timer_callback(CB_CODE_CPR_RESUME)) {
890 			/*
891 			 * Do not enable dispatcher hooks if disabled by user.
892 			 */
893 			if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
894 				break;
895 
896 			disp_enq_thread = cstate_wakeup;
897 			idle_cpu = cpu_idle_adaptive;
898 		} else {
899 			rslt = B_FALSE;
900 		}
901 		break;
902 
903 	case CB_CODE_CPR_CHKPT:
904 		idle_cpu = non_deep_idle_cpu;
905 		disp_enq_thread = non_deep_idle_disp_enq_thread;
906 		(void) cstate_timer_callback(CB_CODE_CPR_CHKPT);
907 		break;
908 
909 	default:
910 		cmn_err(CE_NOTE, "!cpudvr cpr_callb: invalid code %d\n", code);
911 		break;
912 	}
913 	mutex_exit(&cpu_idle_callb_mutex);
914 	return (rslt);
915 }
916 
917 /*
918  * handle _CST notification
919  */
920 void
921 cpuidle_cstate_instance(cpu_t *cp)
922 {
923 #ifndef	__xpv
924 	cpupm_mach_state_t	*mach_state =
925 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
926 	cpu_acpi_handle_t	handle;
927 	struct machcpu		*mcpu;
928 	cpuset_t 		dom_cpu_set;
929 	kmutex_t		*pm_lock;
930 	int			result = 0;
931 	processorid_t		cpu_id;
932 
933 	if (mach_state == NULL) {
934 		return;
935 	}
936 
937 	ASSERT(mach_state->ms_cstate.cma_domain != NULL);
938 	dom_cpu_set = mach_state->ms_cstate.cma_domain->pm_cpus;
939 	pm_lock = &mach_state->ms_cstate.cma_domain->pm_lock;
940 
941 	/*
942 	 * Do for all the CPU's in the domain
943 	 */
944 	mutex_enter(pm_lock);
945 	do {
946 		CPUSET_FIND(dom_cpu_set, cpu_id);
947 		if (cpu_id == CPUSET_NOTINSET)
948 			break;
949 
950 		ASSERT(cpu_id >= 0 && cpu_id < NCPU);
951 		cp = cpu[cpu_id];
952 		mach_state = (cpupm_mach_state_t *)
953 		    cp->cpu_m.mcpu_pm_mach_state;
954 		if (!(mach_state->ms_caps & CPUPM_C_STATES)) {
955 			mutex_exit(pm_lock);
956 			return;
957 		}
958 		handle = mach_state->ms_acpi_handle;
959 		ASSERT(handle != NULL);
960 
961 		/*
962 		 * re-evaluate cstate object
963 		 */
964 		if (cpu_acpi_cache_cstate_data(handle) != 0) {
965 			cmn_err(CE_WARN, "Cannot re-evaluate the cpu c-state"
966 			    " object Instance: %d", cpu_id);
967 		}
968 		mcpu = &(cp->cpu_m);
969 		mcpu->max_cstates = cpu_acpi_get_max_cstates(handle);
970 		if (mcpu->max_cstates > CPU_ACPI_C1) {
971 			(void) cstate_timer_callback(
972 			    CST_EVENT_MULTIPLE_CSTATES);
973 			disp_enq_thread = cstate_wakeup;
974 			cp->cpu_m.mcpu_idle_cpu = cpu_acpi_idle;
975 		} else if (mcpu->max_cstates == CPU_ACPI_C1) {
976 			disp_enq_thread = non_deep_idle_disp_enq_thread;
977 			cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
978 			(void) cstate_timer_callback(CST_EVENT_ONE_CSTATE);
979 		}
980 
981 		CPUSET_ATOMIC_XDEL(dom_cpu_set, cpu_id, result);
982 	} while (result < 0);
983 	mutex_exit(pm_lock);
984 #endif
985 }
986 
987 /*
988  * handle the number or the type of available processor power states change
989  */
990 void
991 cpuidle_manage_cstates(void *ctx)
992 {
993 	cpu_t			*cp = ctx;
994 	cpupm_mach_state_t	*mach_state =
995 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
996 	boolean_t		is_ready;
997 
998 	if (mach_state == NULL) {
999 		return;
1000 	}
1001 
1002 	/*
1003 	 * We currently refuse to power manage if the CPU is not ready to
1004 	 * take cross calls (cross calls fail silently if CPU is not ready
1005 	 * for it).
1006 	 *
1007 	 * Additionally, for x86 platforms we cannot power manage an instance,
1008 	 * until it has been initialized.
1009 	 */
1010 	is_ready = (cp->cpu_flags & CPU_READY) && cpupm_cstate_ready(cp);
1011 	if (!is_ready)
1012 		return;
1013 
1014 	cpuidle_cstate_instance(cp);
1015 }
1016