xref: /titanic_44/usr/src/uts/i86pc/os/cpupm/cpu_idle.c (revision 6876da76f91687fee15a706830b990a2c0d55157)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2009, Intel Corporation.
27  * All rights reserved.
28  */
29 
30 #include <sys/x86_archext.h>
31 #include <sys/machsystm.h>
32 #include <sys/x_call.h>
33 #include <sys/stat.h>
34 #include <sys/acpi/acpi.h>
35 #include <sys/acpica.h>
36 #include <sys/cpu_acpi.h>
37 #include <sys/cpu_idle.h>
38 #include <sys/cpupm.h>
39 #include <sys/cpu_event.h>
40 #include <sys/hpet.h>
41 #include <sys/archsystm.h>
42 #include <vm/hat_i86.h>
43 #include <sys/dtrace.h>
44 #include <sys/sdt.h>
45 #include <sys/callb.h>
46 
/*
 * Values for the "timer" argument of cstate_use_timer():
 * CSTATE_USING_HPET asks for the HPET timer to be used,
 * CSTATE_USING_LAT asks to switch back to the local APIC timer.
 */
#define	CSTATE_USING_HPET		1
#define	CSTATE_USING_LAT		2
49 
50 extern void cpu_idle_adaptive(void);
51 extern uint32_t cpupm_next_cstate(cma_c_state_t *cs_data,
52     cpu_acpi_cstate_t *cstates, uint32_t cs_count, hrtime_t start);
53 
54 static int cpu_idle_init(cpu_t *);
55 static void cpu_idle_fini(cpu_t *);
56 static void cpu_idle_stop(cpu_t *);
57 static boolean_t cpu_deep_idle_callb(void *arg, int code);
58 static boolean_t cpu_idle_cpr_callb(void *arg, int code);
59 static void acpi_cpu_cstate(cpu_acpi_cstate_t *cstate);
60 
61 static boolean_t cstate_use_timer(hrtime_t *lapic_expire, int timer);
62 
/*
 * Timer capability flags, set by cpu_deep_cstates_supported():
 *   cpu_cstate_arat - cpuid reports an always-running local APIC timer.
 *   cpu_cstate_hpet - the HPET proxy was installed for use during deep
 *                     c-states.
 */
static boolean_t cpu_cstate_arat = B_FALSE;
static boolean_t cpu_cstate_hpet = B_FALSE;
69 
/*
 * Interfaces for modules implementing Intel's deep c-state.
 *
 * The initializer is positional: a descriptive label followed by the
 * init, fini and stop handlers defined in this file.  The NULL entry
 * leaves one handler slot unset (its field name is not visible in this
 * file -- see the cpupm_state_ops_t declaration).
 */
cpupm_state_ops_t cpu_idle_ops = {
	"Generic ACPI C-state Support",
	cpu_idle_init,
	cpu_idle_fini,
	NULL,
	cpu_idle_stop
};
80 
/*
 * cpu_idle_callb_mutex is held while the two callback ids below are
 * registered or deleted, and for the duration of both callback handlers
 * (which read and update cpu_idle_cfg_state).
 */
static kmutex_t		cpu_idle_callb_mutex;
static callb_id_t	cpu_deep_idle_callb_id;
static callb_id_t	cpu_idle_cpr_callb_id;
/* CPU_IDLE_DEEP_CFG is set here while deep idle is administratively off */
static uint_t		cpu_idle_cfg_state;

/* Installed as ks_lock on every c-state kstat created by cpu_idle_init() */
static kmutex_t cpu_idle_mutex;
87 
/*
 * Named-kstat template shared by all c-state kstats (each kstat's ks_data
 * points at this single object).  cpu_idle_kstat_update() refreshes the
 * three entries from the c-state attached to the kstat being read.
 */
cpu_idle_kstat_t cpu_idle_kstat = {
	{ "address_space_id",	KSTAT_DATA_STRING },
	{ "latency",		KSTAT_DATA_UINT32 },
	{ "power",		KSTAT_DATA_UINT32 },
};
93 
94 /*
95  * kstat update function of the c-state info
96  */
97 static int
98 cpu_idle_kstat_update(kstat_t *ksp, int flag)
99 {
100 	cpu_acpi_cstate_t *cstate = ksp->ks_private;
101 
102 	if (flag == KSTAT_WRITE) {
103 		return (EACCES);
104 	}
105 
106 	if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_FIXED_HARDWARE) {
107 		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
108 		"FFixedHW");
109 	} else if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_SYSTEM_IO) {
110 		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
111 		"SystemIO");
112 	} else {
113 		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
114 		"Unsupported");
115 	}
116 
117 	cpu_idle_kstat.cs_latency.value.ui32 = cstate->cs_latency;
118 	cpu_idle_kstat.cs_power.value.ui32 = cstate->cs_power;
119 
120 	return (0);
121 }
122 
123 /*
124  * Used during configuration callbacks to manage implementation specific
125  * details of the hardware timer used during Deep C-state.
126  */
127 boolean_t
128 cstate_timer_callback(int code)
129 {
130 	if (cpu_cstate_arat) {
131 		return (B_TRUE);
132 	} else if (cpu_cstate_hpet) {
133 		return (hpet.callback(code));
134 	}
135 	return (B_FALSE);
136 }
137 
138 /*
139  * Some Local APIC Timers do not work during Deep C-states.
140  * The Deep C-state idle function uses this function to ensure it is using a
141  * hardware timer that works during Deep C-states.  This function also
142  * switches the timer back to the LACPI Timer after Deep C-state.
143  */
144 static boolean_t
145 cstate_use_timer(hrtime_t *lapic_expire, int timer)
146 {
147 	if (cpu_cstate_arat)
148 		return (B_TRUE);
149 
150 	/*
151 	 * We have to return B_FALSE if no arat or hpet support
152 	 */
153 	if (!cpu_cstate_hpet)
154 		return (B_FALSE);
155 
156 	switch (timer) {
157 	case CSTATE_USING_HPET:
158 		return (hpet.use_hpet_timer(lapic_expire));
159 	case CSTATE_USING_LAT:
160 		hpet.use_lapic_timer(*lapic_expire);
161 		return (B_TRUE);
162 	default:
163 		return (B_FALSE);
164 	}
165 }
166 
/*
 * c-state wakeup function.
 * Similar to cpu_wakeup and cpu_wakeup_mwait except this function deals
 * with CPUs asleep in MWAIT, HLT, or ACPI Deep C-State.
 *
 * cp    - the CPU to wake if it is recorded in its partition's haltset.
 * bound - non-zero when the thread just enqueued is bound; in that case
 *         no other halted CPU is woken on its behalf.
 *
 * This file installs the function as disp_enq_thread whenever deep
 * C-states are enabled.
 */
void
cstate_wakeup(cpu_t *cp, int bound)
{
	struct machcpu	*mcpu = &(cp->cpu_m);
	volatile uint32_t *mcpu_mwait = mcpu->mcpu_mwait;
	cpupart_t	*cpu_part;
	uint_t		cpu_found;
	processorid_t	cpu_sid;

	cpu_part = cp->cpu_part;
	cpu_sid = cp->cpu_seqid;
	/*
	 * Case 1: the target CPU is recorded as halted.
	 */
	if (bitset_in_set(&cpu_part->cp_haltset, cpu_sid)) {
		/*
		 * Clear the halted bit for that CPU since it will be
		 * poked in a moment.
		 */
		bitset_atomic_del(&cpu_part->cp_haltset, cpu_sid);

		/*
		 * We may find the current CPU present in the halted cpuset
		 * if we're in the context of an interrupt that occurred
		 * before we had a chance to clear our bit in cpu_idle().
		 * Waking ourself is obviously unnecessary, since if
		 * we're here, we're not halted.
		 */
		if (cp != CPU) {
			/*
			 * Use correct wakeup mechanism: a CPU whose
			 * mcpu_mwait word reads MWAIT_HALTED is woken via
			 * MWAIT_WAKEUP(); anything else gets poke_cpu().
			 */
			if ((mcpu_mwait != NULL) &&
			    (*mcpu_mwait == MWAIT_HALTED))
				MWAIT_WAKEUP(cp);
			else
				poke_cpu(cp->cpu_id);
		}
		return;
	} else {
		/*
		 * This cpu isn't halted, but it's idle or undergoing a
		 * context switch. No need to awaken anyone else.
		 */
		if (cp->cpu_thread == cp->cpu_idle_thread ||
		    cp->cpu_disp_flags & CPU_DISP_DONTSTEAL)
			return;
	}

	/*
	 * No need to wake up other CPUs if the thread we just enqueued
	 * is bound.
	 */
	if (bound)
		return;


	/*
	 * See if there's any other halted CPUs. If there are, then
	 * select one, and awaken it.
	 * It's possible that after we find a CPU, somebody else
	 * will awaken it before we get the chance.
	 * In that case, look again.
	 */
	do {
		cpu_found = bitset_find(&cpu_part->cp_haltset);
		if (cpu_found == (uint_t)-1)
			return;

	} while (bitset_atomic_test_and_del(&cpu_part->cp_haltset,
	    cpu_found) < 0);

	/*
	 * Must use correct wakeup mechanism to avoid lost wakeup of
	 * alternate cpu.  cpu_found is a sequence id, hence the lookup
	 * through cpu_seq[]; nothing is done if it names the current CPU.
	 */
	if (cpu_found != CPU->cpu_seqid) {
		mcpu_mwait = cpu_seq[cpu_found]->cpu_m.mcpu_mwait;
		if ((mcpu_mwait != NULL) && (*mcpu_mwait == MWAIT_HALTED))
			MWAIT_WAKEUP(cpu_seq[cpu_found]);
		else
			poke_cpu(cpu_seq[cpu_found]->cpu_id);
	}
}
257 
258 /*
259  * Function called by CPU idle notification framework to check whether CPU
260  * has been awakened. It will be called with interrupt disabled.
261  * If CPU has been awakened, call cpu_idle_exit() to notify CPU idle
262  * notification framework.
263  */
264 static void
265 acpi_cpu_mwait_check_wakeup(void *arg)
266 {
267 	volatile uint32_t *mcpu_mwait = (volatile uint32_t *)arg;
268 
269 	ASSERT(arg != NULL);
270 	if (*mcpu_mwait != MWAIT_HALTED) {
271 		/*
272 		 * CPU has been awakened, notify CPU idle notification system.
273 		 */
274 		cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
275 	} else {
276 		/*
277 		 * Toggle interrupt flag to detect pending interrupts.
278 		 * If interrupt happened, do_interrupt() will notify CPU idle
279 		 * notification framework so no need to call cpu_idle_exit()
280 		 * here.
281 		 */
282 		sti();
283 		SMT_PAUSE();
284 		cli();
285 	}
286 }
287 
288 static void
289 acpi_cpu_mwait_ipi_check_wakeup(void *arg)
290 {
291 	volatile uint32_t *mcpu_mwait = (volatile uint32_t *)arg;
292 
293 	ASSERT(arg != NULL);
294 	if (*mcpu_mwait != MWAIT_WAKEUP_IPI) {
295 		/*
296 		 * CPU has been awakened, notify CPU idle notification system.
297 		 */
298 		cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
299 	} else {
300 		/*
301 		 * Toggle interrupt flag to detect pending interrupts.
302 		 * If interrupt happened, do_interrupt() will notify CPU idle
303 		 * notification framework so no need to call cpu_idle_exit()
304 		 * here.
305 		 */
306 		sti();
307 		SMT_PAUSE();
308 		cli();
309 	}
310 }
311 
/*
 * Default wakeup check, used when the CPU has no mcpu_mwait word to
 * inspect.  It only opens a short interrupt window: if an interrupt is
 * pending, do_interrupt() notifies the CPU idle notification framework,
 * so cpu_idle_exit() is never called from here.
 */
/*ARGSUSED*/
static void
acpi_cpu_check_wakeup(void *arg)
{
	sti();		/* let a pending interrupt in ... */
	SMT_PAUSE();
	cli();		/* ... and close the window again */
}
325 
/*
 * enter deep c-state handler
 *
 * cstate - the ACPI c-state to enter; its address space id selects the
 *          entry method (MWAIT for fixed hardware, port read for system
 *          I/O) and its type is reported to cpu_idle_enter().
 *
 * The function advertises this CPU in the partition's haltset, re-checks
 * for work, switches to a timer that survives deep c-states, idles, and
 * then undoes all of that in reverse.  If no such timer can be set up it
 * idles in C1 via MWAIT instead.
 *
 * NOTE(review): mcpu_mwait is only NULL-checked in the first block below
 * but is dereferenced unconditionally in the C1 fallback and in both
 * entry methods -- presumably it is always non-NULL when deep c-states
 * are in use; confirm against the callers.
 */
static void
acpi_cpu_cstate(cpu_acpi_cstate_t *cstate)
{
	volatile uint32_t	*mcpu_mwait = CPU->cpu_m.mcpu_mwait;
	cpu_t			*cpup = CPU;
	processorid_t		cpu_sid = cpup->cpu_seqid;
	cpupart_t		*cp = cpup->cpu_part;
	hrtime_t		lapic_expire;
	uint8_t			type = cstate->cs_addrspace_id;
	uint32_t		cs_type = cstate->cs_type;
	int			hset_update = 1;
	boolean_t		using_timer;
	cpu_idle_check_wakeup_t check_func = &acpi_cpu_check_wakeup;

	/*
	 * Set our mcpu_mwait here, so we can tell if anyone tries to
	 * wake us between now and when we call mwait.  No other cpu will
	 * attempt to set our mcpu_mwait until we add ourself to the haltset.
	 *
	 * The value written (and the matching wakeup check) depends on how
	 * the c-state is entered: MWAIT_WAKEUP_IPI for system I/O,
	 * MWAIT_HALTED otherwise.
	 */
	if (mcpu_mwait) {
		if (type == ACPI_ADR_SPACE_SYSTEM_IO) {
			*mcpu_mwait = MWAIT_WAKEUP_IPI;
			check_func = &acpi_cpu_mwait_ipi_check_wakeup;
		} else {
			*mcpu_mwait = MWAIT_HALTED;
			check_func = &acpi_cpu_mwait_check_wakeup;
		}
	}

	/*
	 * If this CPU is online, and there are multiple CPUs
	 * in the system, then we should note our halting
	 * by adding ourselves to the partition's halted CPU
	 * bitmap. This allows other CPUs to find/awaken us when
	 * work becomes available.
	 */
	if (cpup->cpu_flags & CPU_OFFLINE || ncpus == 1)
		hset_update = 0;

	/*
	 * Add ourselves to the partition's halted CPUs bitmask
	 * and set our HALTED flag, if necessary.
	 *
	 * When a thread becomes runnable, it is placed on the queue
	 * and then the halted cpuset is checked to determine who
	 * (if anyone) should be awakened. We therefore need to first
	 * add ourselves to the halted cpuset, and then check if there
	 * is any work available.
	 *
	 * Note that memory barriers after updating the HALTED flag
	 * are not necessary since an atomic operation (updating the bitmap)
	 * immediately follows. On x86 the atomic operation acts as a
	 * memory barrier for the update of cpu_disp_flags.
	 */
	if (hset_update) {
		cpup->cpu_disp_flags |= CPU_DISP_HALTED;
		bitset_atomic_add(&cp->cp_haltset, cpu_sid);
	}

	/*
	 * Check to make sure there's really nothing to do.
	 * Work destined for this CPU may become available after
	 * this check. We'll be notified through the clearing of our
	 * bit in the halted CPU bitmask, and a write to our mcpu_mwait.
	 *
	 * disp_anywork() checks disp_nrunnable, so we do not have to later.
	 */
	if (disp_anywork()) {
		if (hset_update) {
			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
		}
		return;
	}

	/*
	 * We're on our way to being halted.
	 *
	 * The local APIC timer can stop in ACPI C2 and deeper c-states.
	 * Try to program the HPET hardware to substitute for this CPU's
	 * LAPIC timer.
	 * cstate_use_timer() could disable the LAPIC Timer.  Make sure
	 * to start the LAPIC Timer again before leaving this function.
	 *
	 * Disable interrupts here so we will awaken immediately after halting
	 * if someone tries to poke us between now and the time we actually
	 * halt.
	 */
	cli();
	using_timer = cstate_use_timer(&lapic_expire, CSTATE_USING_HPET);

	/*
	 * We check for the presence of our bit after disabling interrupts.
	 * If it's cleared, we'll return. If the bit is cleared after
	 * we check then the cstate_wakeup() will pop us out of the halted
	 * state.
	 *
	 * This means that the ordering inside cstate_wakeup() is important:
	 * it must clear our cp_haltset bit first, and only then deliver the
	 * wakeup.
	 * acpi_cpu_cstate() must disable interrupts, then check for the bit.
	 *
	 * The bit is already clear on this path, so only the HALTED flag
	 * needs to be reset.
	 */
	if (hset_update && bitset_in_set(&cp->cp_haltset, cpu_sid) == 0) {
		(void) cstate_use_timer(&lapic_expire,
		    CSTATE_USING_LAT);
		sti();
		cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
		return;
	}

	/*
	 * The check for anything locally runnable is here for performance
	 * and isn't needed for correctness. disp_nrunnable ought to be
	 * in our cache still, so it's inexpensive to check, and if there
	 * is anything runnable we won't have to wait for the poke.
	 */
	if (cpup->cpu_disp->disp_nrunnable != 0) {
		(void) cstate_use_timer(&lapic_expire,
		    CSTATE_USING_LAT);
		sti();
		if (hset_update) {
			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
		}
		return;
	}

	if (using_timer == B_FALSE) {

		(void) cstate_use_timer(&lapic_expire,
		    CSTATE_USING_LAT);
		sti();

		/*
		 * We are currently unable to program the HPET to act as this
		 * CPU's proxy LAPIC timer.  This CPU cannot enter C2 or deeper
		 * because no timer is set to wake it up while its LAPIC timer
		 * stalls in deep C-States.
		 * Enter C1 instead.
		 *
		 * cstate_wakeup() will wake this CPU either by writing to
		 * mcpu_mwait or with a poke, both of which work with MWAIT.
		 *
		 * The MWAIT_WAKEUP_IPI bit is masked off in the comparisons
		 * below, so the word counts as "still halted" whether the
		 * MWAIT_HALTED or the MWAIT_WAKEUP_IPI value was stored
		 * above (presumably; depends on the two constants' values).
		 */
		i86_monitor(mcpu_mwait, 0, 0);
		if ((*mcpu_mwait & ~MWAIT_WAKEUP_IPI) == MWAIT_HALTED) {
			if (cpu_idle_enter(IDLE_STATE_C1, 0,
			    check_func, (void *)mcpu_mwait) == 0) {
				if ((*mcpu_mwait & ~MWAIT_WAKEUP_IPI) ==
				    MWAIT_HALTED) {
					i86_mwait(0, 0);
				}
				cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
			}
		}

		/*
		 * We're no longer halted
		 */
		if (hset_update) {
			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
		}
		return;
	}

	if (type == ACPI_ADR_SPACE_FIXED_HARDWARE) {
		/*
		 * We're on our way to being halted.
		 * To avoid a lost wakeup, arm the monitor before checking
		 * if another cpu wrote to mcpu_mwait to wake us up.
		 */
		i86_monitor(mcpu_mwait, 0, 0);
		if (*mcpu_mwait == MWAIT_HALTED) {
			if (cpu_idle_enter((uint_t)cs_type, 0,
			    check_func, (void *)mcpu_mwait) == 0) {
				/* Re-check: the callbacks may have run long */
				if (*mcpu_mwait == MWAIT_HALTED) {
					i86_mwait(cstate->cs_address, 1);
				}
				cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
			}
		}
	} else if (type == ACPI_ADR_SPACE_SYSTEM_IO) {
		uint32_t value;
		ACPI_TABLE_FADT *gbl_FADT;

		if (*mcpu_mwait == MWAIT_WAKEUP_IPI) {
			if (cpu_idle_enter((uint_t)cs_type, 0,
			    check_func, (void *)mcpu_mwait) == 0) {
				if (*mcpu_mwait == MWAIT_WAKEUP_IPI) {
					/*
					 * Enter the c-state by reading the
					 * c-state's I/O port, then read the
					 * ACPI PM timer port from the FADT.
					 * Both values read are discarded.
					 */
					(void) cpu_acpi_read_port(
					    cstate->cs_address, &value, 8);
					acpica_get_global_FADT(&gbl_FADT);
					(void) cpu_acpi_read_port(
					    gbl_FADT->XPmTimerBlock.Address,
					    &value, 32);
				}
				cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
			}
		}
	}

	/*
	 * The LAPIC timer may have stopped in deep c-state.
	 * Reprogram this CPU's LAPIC here before enabling interrupts.
	 */
	(void) cstate_use_timer(&lapic_expire, CSTATE_USING_LAT);
	sti();

	/*
	 * We're no longer halted
	 */
	if (hset_update) {
		cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
		bitset_atomic_del(&cp->cp_haltset, cpu_sid);
	}
}
546 
547 /*
548  * Idle the present CPU, deep c-state is supported
549  */
550 void
551 cpu_acpi_idle(void)
552 {
553 	cpu_t *cp = CPU;
554 	cpu_acpi_handle_t handle;
555 	cma_c_state_t *cs_data;
556 	cpu_acpi_cstate_t *cstates;
557 	hrtime_t start, end;
558 	int cpu_max_cstates;
559 	uint32_t cs_indx;
560 	uint16_t cs_type;
561 
562 	cpupm_mach_state_t *mach_state =
563 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
564 	handle = mach_state->ms_acpi_handle;
565 	ASSERT(CPU_ACPI_CSTATES(handle) != NULL);
566 
567 	cs_data = mach_state->ms_cstate.cma_state.cstate;
568 	cstates = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
569 	ASSERT(cstates != NULL);
570 	cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
571 	if (cpu_max_cstates > CPU_MAX_CSTATES)
572 		cpu_max_cstates = CPU_MAX_CSTATES;
573 	if (cpu_max_cstates == 1) {	/* no ACPI c-state data */
574 		(*non_deep_idle_cpu)();
575 		return;
576 	}
577 
578 	start = gethrtime_unscaled();
579 
580 	cs_indx = cpupm_next_cstate(cs_data, cstates, cpu_max_cstates, start);
581 
582 	cs_type = cstates[cs_indx].cs_type;
583 
584 	switch (cs_type) {
585 	default:
586 		/* FALLTHROUGH */
587 	case CPU_ACPI_C1:
588 		(*non_deep_idle_cpu)();
589 		break;
590 
591 	case CPU_ACPI_C2:
592 		acpi_cpu_cstate(&cstates[cs_indx]);
593 		break;
594 
595 	case CPU_ACPI_C3:
596 		/*
597 		 * All supported Intel processors maintain cache coherency
598 		 * during C3.  Currently when entering C3 processors flush
599 		 * core caches to higher level shared cache. The shared cache
600 		 * maintains state and supports probes during C3.
601 		 * Consequently there is no need to handle cache coherency
602 		 * and Bus Master activity here with the cache flush, BM_RLD
603 		 * bit, BM_STS bit, nor PM2_CNT.ARB_DIS mechanisms described
604 		 * in section 8.1.4 of the ACPI Specification 4.0.
605 		 */
606 		acpi_cpu_cstate(&cstates[cs_indx]);
607 		break;
608 	}
609 
610 	end = gethrtime_unscaled();
611 
612 	/*
613 	 * Update statistics
614 	 */
615 	cpupm_wakeup_cstate_data(cs_data, end);
616 }
617 
618 boolean_t
619 cpu_deep_cstates_supported(void)
620 {
621 	extern int	idle_cpu_no_deep_c;
622 
623 	if (idle_cpu_no_deep_c)
624 		return (B_FALSE);
625 
626 	if (!cpuid_deep_cstates_supported())
627 		return (B_FALSE);
628 
629 	if (cpuid_arat_supported()) {
630 		cpu_cstate_arat = B_TRUE;
631 		return (B_TRUE);
632 	}
633 
634 	if ((hpet.supported == HPET_FULL_SUPPORT) &&
635 	    hpet.install_proxy()) {
636 		cpu_cstate_hpet = B_TRUE;
637 		return (B_TRUE);
638 	}
639 
640 	return (B_FALSE);
641 }
642 
643 /*
644  * Validate that this processor supports deep cstate and if so,
645  * get the c-state data from ACPI and cache it.
646  */
647 static int
648 cpu_idle_init(cpu_t *cp)
649 {
650 	cpupm_mach_state_t *mach_state =
651 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
652 	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
653 	cpu_acpi_cstate_t *cstate;
654 	char name[KSTAT_STRLEN];
655 	int cpu_max_cstates, i;
656 	int ret;
657 
658 	/*
659 	 * Cache the C-state specific ACPI data.
660 	 */
661 	if ((ret = cpu_acpi_cache_cstate_data(handle)) != 0) {
662 		if (ret < 0)
663 			cmn_err(CE_NOTE,
664 			    "!Support for CPU deep idle states is being "
665 			    "disabled due to errors parsing ACPI C-state "
666 			    "objects exported by BIOS.");
667 		cpu_idle_fini(cp);
668 		return (-1);
669 	}
670 
671 	cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
672 
673 	cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
674 
675 	for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
676 		(void) snprintf(name, KSTAT_STRLEN - 1, "c%d", cstate->cs_type);
677 		/*
678 		 * Allocate, initialize and install cstate kstat
679 		 */
680 		cstate->cs_ksp = kstat_create("cstate", CPU->cpu_id,
681 		    name, "misc",
682 		    KSTAT_TYPE_NAMED,
683 		    sizeof (cpu_idle_kstat) / sizeof (kstat_named_t),
684 		    KSTAT_FLAG_VIRTUAL);
685 
686 		if (cstate->cs_ksp == NULL) {
687 			cmn_err(CE_NOTE, "kstat_create(c_state) fail");
688 		} else {
689 			cstate->cs_ksp->ks_data = &cpu_idle_kstat;
690 			cstate->cs_ksp->ks_lock = &cpu_idle_mutex;
691 			cstate->cs_ksp->ks_update = cpu_idle_kstat_update;
692 			cstate->cs_ksp->ks_data_size += MAXNAMELEN;
693 			cstate->cs_ksp->ks_private = cstate;
694 			kstat_install(cstate->cs_ksp);
695 			cstate++;
696 		}
697 	}
698 
699 	cpupm_alloc_domains(cp, CPUPM_C_STATES);
700 	cpupm_alloc_ms_cstate(cp);
701 
702 	if (cpu_deep_cstates_supported()) {
703 		uint32_t value;
704 
705 		mutex_enter(&cpu_idle_callb_mutex);
706 		if (cpu_deep_idle_callb_id == (callb_id_t)0)
707 			cpu_deep_idle_callb_id = callb_add(&cpu_deep_idle_callb,
708 			    (void *)NULL, CB_CL_CPU_DEEP_IDLE, "cpu_deep_idle");
709 		if (cpu_idle_cpr_callb_id == (callb_id_t)0)
710 			cpu_idle_cpr_callb_id = callb_add(&cpu_idle_cpr_callb,
711 			    (void *)NULL, CB_CL_CPR_PM, "cpu_idle_cpr");
712 		mutex_exit(&cpu_idle_callb_mutex);
713 
714 
715 		/*
716 		 * All supported CPUs (Nehalem and later) will remain in C3
717 		 * during Bus Master activity.
718 		 * All CPUs set ACPI_BITREG_BUS_MASTER_RLD to 0 here if it
719 		 * is not already 0 before enabling Deeper C-states.
720 		 */
721 		cpu_acpi_get_register(ACPI_BITREG_BUS_MASTER_RLD, &value);
722 		if (value & 1)
723 			cpu_acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
724 	}
725 
726 	return (0);
727 }
728 
729 /*
730  * Free resources allocated by cpu_idle_init().
731  */
732 static void
733 cpu_idle_fini(cpu_t *cp)
734 {
735 	cpupm_mach_state_t *mach_state =
736 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
737 	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
738 	cpu_acpi_cstate_t *cstate;
739 	uint_t	cpu_max_cstates, i;
740 
741 	/*
742 	 * idle cpu points back to the generic one
743 	 */
744 	idle_cpu = cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
745 	disp_enq_thread = non_deep_idle_disp_enq_thread;
746 
747 	cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
748 	if (cstate) {
749 		cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
750 
751 		for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
752 			if (cstate->cs_ksp != NULL)
753 				kstat_delete(cstate->cs_ksp);
754 			cstate++;
755 		}
756 	}
757 
758 	cpupm_free_ms_cstate(cp);
759 	cpupm_free_domains(&cpupm_cstate_domains);
760 	cpu_acpi_free_cstate_data(handle);
761 
762 	mutex_enter(&cpu_idle_callb_mutex);
763 	if (cpu_deep_idle_callb_id != (callb_id_t)0) {
764 		(void) callb_delete(cpu_deep_idle_callb_id);
765 		cpu_deep_idle_callb_id = (callb_id_t)0;
766 	}
767 	if (cpu_idle_cpr_callb_id != (callb_id_t)0) {
768 		(void) callb_delete(cpu_idle_cpr_callb_id);
769 		cpu_idle_cpr_callb_id = (callb_id_t)0;
770 	}
771 	mutex_exit(&cpu_idle_callb_mutex);
772 }
773 
774 static void
775 cpu_idle_stop(cpu_t *cp)
776 {
777 	cpupm_mach_state_t *mach_state =
778 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
779 	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
780 	cpu_acpi_cstate_t *cstate;
781 	uint_t cpu_max_cstates, i;
782 
783 	/*
784 	 * place the CPUs in a safe place so that we can disable
785 	 * deep c-state on them.
786 	 */
787 	pause_cpus(NULL);
788 	cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
789 	start_cpus();
790 
791 	cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
792 	if (cstate) {
793 		cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
794 
795 		for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
796 			if (cstate->cs_ksp != NULL)
797 				kstat_delete(cstate->cs_ksp);
798 			cstate++;
799 		}
800 	}
801 	cpupm_free_ms_cstate(cp);
802 	cpupm_remove_domains(cp, CPUPM_C_STATES, &cpupm_cstate_domains);
803 	cpu_acpi_free_cstate_data(handle);
804 }
805 
806 /*ARGSUSED*/
807 static boolean_t
808 cpu_deep_idle_callb(void *arg, int code)
809 {
810 	boolean_t rslt = B_TRUE;
811 
812 	mutex_enter(&cpu_idle_callb_mutex);
813 	switch (code) {
814 	case PM_DEFAULT_CPU_DEEP_IDLE:
815 		/*
816 		 * Default policy is same as enable
817 		 */
818 		/*FALLTHROUGH*/
819 	case PM_ENABLE_CPU_DEEP_IDLE:
820 		if ((cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG) == 0)
821 			break;
822 
823 		if (cstate_timer_callback(PM_ENABLE_CPU_DEEP_IDLE)) {
824 			disp_enq_thread = cstate_wakeup;
825 			idle_cpu = cpu_idle_adaptive;
826 			cpu_idle_cfg_state &= ~CPU_IDLE_DEEP_CFG;
827 		} else {
828 			rslt = B_FALSE;
829 		}
830 		break;
831 
832 	case PM_DISABLE_CPU_DEEP_IDLE:
833 		if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
834 			break;
835 
836 		idle_cpu = non_deep_idle_cpu;
837 		if (cstate_timer_callback(PM_DISABLE_CPU_DEEP_IDLE)) {
838 			disp_enq_thread = non_deep_idle_disp_enq_thread;
839 			cpu_idle_cfg_state |= CPU_IDLE_DEEP_CFG;
840 		}
841 		break;
842 
843 	default:
844 		cmn_err(CE_NOTE, "!cpu deep_idle_callb: invalid code %d\n",
845 		    code);
846 		break;
847 	}
848 	mutex_exit(&cpu_idle_callb_mutex);
849 	return (rslt);
850 }
851 
852 /*ARGSUSED*/
853 static boolean_t
854 cpu_idle_cpr_callb(void *arg, int code)
855 {
856 	boolean_t rslt = B_TRUE;
857 
858 	mutex_enter(&cpu_idle_callb_mutex);
859 	switch (code) {
860 	case CB_CODE_CPR_RESUME:
861 		if (cstate_timer_callback(CB_CODE_CPR_RESUME)) {
862 			/*
863 			 * Do not enable dispatcher hooks if disabled by user.
864 			 */
865 			if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
866 				break;
867 
868 			disp_enq_thread = cstate_wakeup;
869 			idle_cpu = cpu_idle_adaptive;
870 		} else {
871 			rslt = B_FALSE;
872 		}
873 		break;
874 
875 	case CB_CODE_CPR_CHKPT:
876 		idle_cpu = non_deep_idle_cpu;
877 		disp_enq_thread = non_deep_idle_disp_enq_thread;
878 		(void) cstate_timer_callback(CB_CODE_CPR_CHKPT);
879 		break;
880 
881 	default:
882 		cmn_err(CE_NOTE, "!cpudvr cpr_callb: invalid code %d\n", code);
883 		break;
884 	}
885 	mutex_exit(&cpu_idle_callb_mutex);
886 	return (rslt);
887 }
888 
/*
 * handle _CST notification
 *
 * cp - a CPU whose c-state domain should be re-evaluated.  Nothing is
 *      done if it has no cpupm machine state.
 *
 * Working from a local copy of the domain's CPU set and holding the
 * domain's pm_lock, the loop re-caches the ACPI c-state data of a CPU
 * taken from that set, records the new number of c-states in its machcpu
 * and selects the matching idle routine and dispatcher hook.
 *
 * The whole body is compiled out under __xpv.
 */
void
cpuidle_cstate_instance(cpu_t *cp)
{
#ifndef	__xpv
	cpupm_mach_state_t	*mach_state =
	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
	cpu_acpi_handle_t	handle;
	struct machcpu		*mcpu;
	cpuset_t 		dom_cpu_set;
	kmutex_t		*pm_lock;
	int			result = 0;
	processorid_t		cpu_id;

	if (mach_state == NULL) {
		return;
	}

	ASSERT(mach_state->ms_cstate.cma_domain != NULL);
	/* Local copy: CPUs are removed from it as they are processed. */
	dom_cpu_set = mach_state->ms_cstate.cma_domain->pm_cpus;
	pm_lock = &mach_state->ms_cstate.cma_domain->pm_lock;

	/*
	 * Do for all the CPU's in the domain
	 *
	 * NOTE(review): the loop continues only while CPUSET_ATOMIC_XDEL
	 * yields a negative result.  If that macro reports success as a
	 * non-negative value, the loop ends after the first CPU -- confirm
	 * the macro's semantics.
	 */
	mutex_enter(pm_lock);
	do {
		CPUSET_FIND(dom_cpu_set, cpu_id);
		if (cpu_id == CPUSET_NOTINSET)
			break;

		ASSERT(cpu_id >= 0 && cpu_id < NCPU);
		cp = cpu[cpu_id];
		mach_state = (cpupm_mach_state_t *)
		    cp->cpu_m.mcpu_pm_mach_state;
		/* Give up entirely if this CPU is not c-state capable. */
		if (!(mach_state->ms_caps & CPUPM_C_STATES)) {
			mutex_exit(pm_lock);
			return;
		}
		handle = mach_state->ms_acpi_handle;
		ASSERT(handle != NULL);

		/*
		 * re-evaluate cstate object
		 * (a failure is only warned about; processing continues)
		 */
		if (cpu_acpi_cache_cstate_data(handle) != 0) {
			cmn_err(CE_WARN, "Cannot re-evaluate the cpu c-state"
			    " object Instance: %d", cpu_id);
		}
		mutex_enter(&cpu_lock);
		mcpu = &(cp->cpu_m);
		mcpu->max_cstates = cpu_acpi_get_max_cstates(handle);
		if (mcpu->max_cstates > CPU_ACPI_C1) {
			/* More than C1 available: use the ACPI idle path. */
			(void) cstate_timer_callback(
			    CST_EVENT_MULTIPLE_CSTATES);
			disp_enq_thread = cstate_wakeup;
			cp->cpu_m.mcpu_idle_cpu = cpu_acpi_idle;
		} else if (mcpu->max_cstates == CPU_ACPI_C1) {
			/* Only C1 left: back to the generic idle path. */
			disp_enq_thread = non_deep_idle_disp_enq_thread;
			cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
			(void) cstate_timer_callback(CST_EVENT_ONE_CSTATE);
		}
		mutex_exit(&cpu_lock);

		CPUSET_ATOMIC_XDEL(dom_cpu_set, cpu_id, result);
	} while (result < 0);
	mutex_exit(pm_lock);
#endif
}
960 
961 /*
962  * handle the number or the type of available processor power states change
963  */
964 void
965 cpuidle_manage_cstates(void *ctx)
966 {
967 	cpu_t			*cp = ctx;
968 	cpupm_mach_state_t	*mach_state =
969 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
970 	boolean_t		is_ready;
971 
972 	if (mach_state == NULL) {
973 		return;
974 	}
975 
976 	/*
977 	 * We currently refuse to power manage if the CPU is not ready to
978 	 * take cross calls (cross calls fail silently if CPU is not ready
979 	 * for it).
980 	 *
981 	 * Additionally, for x86 platforms we cannot power manage an instance,
982 	 * until it has been initialized.
983 	 */
984 	is_ready = (cp->cpu_flags & CPU_READY) && cpupm_cstate_ready(cp);
985 	if (!is_ready)
986 		return;
987 
988 	cpuidle_cstate_instance(cp);
989 }
990